9
9
// This file defines the machine model for Znver4 to support instruction
10
10
// scheduling and other instruction cost heuristics.
11
11
// Based on:
12
- // * AMD Software Optimization Guide for AMD Family 19h Processors.
13
- // https://www.amd.com/system/files/TechDocs/56665.zip
12
+ // * AMD Software Optimization Guide for the AMD Family 19h (Zen4)
13
+ // Microarchitecture
14
+ // https://www.amd.com/system/files/TechDocs/57647.zip
14
15
//===----------------------------------------------------------------------===//
15
16
16
17
def Znver4Model : SchedMachineModel {
17
- // AMD SOG 19h , 2.9.6 Dispatch
18
+ // AMD SOG Zen4 , 2.9.6 Dispatch
18
19
// The processor may dispatch up to 6 macro ops per cycle
19
20
// into the execution engine.
20
21
let IssueWidth = 6;
21
- // AMD SOG 19h , 2.10.3
22
+ // AMD SOG Zen4 , 2.10.3
22
23
// The retire control unit (RCU) tracks the completion status of all
23
24
// outstanding operations (integer, load/store, and floating-point) and is
24
25
// the final arbiter for exception processing and recovery.
25
26
// The unit can receive up to 6 macro ops dispatched per cycle and track up
26
27
// to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
27
28
let MicroOpBufferSize = 320;
28
- // AMD SOG 19h , 2.9.1 Op Cache
29
+ // AMD SOG Zen4 , 2.9.1 Op Cache
29
30
// The op cache is organized as an associative cache with 64 sets and 8 ways.
30
31
// At each set-way intersection is an entry containing up to 8 macro ops.
31
32
// The maximum capacity of the op cache is 6.75K ops.
32
33
// Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
33
34
// the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
34
35
// unrolling leading to excessive filling of the op-cache from frontend.
35
36
let LoopMicroOpBufferSize = 108;
36
- // AMD SOG 19h , 2.6.2 L1 Data Cache
37
+ // AMD SOG Zen4 , 2.6.2 L1 Data Cache
37
38
// The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
38
- // AMD SOG 19h , 2.12 L1 Data Cache
39
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
39
40
// The AGU and LS pipelines are optimized for simple address generation modes.
40
41
// <...> and can achieve 4-cycle load-to-use integer load latency.
41
42
let LoadLatency = 4;
42
- // AMD SOG 19h , 2.12 L1 Data Cache
43
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
43
44
// The AGU and LS pipelines are optimized for simple address generation modes.
44
45
// <...> and can achieve <...> 7-cycle load-to-use FP load latency.
45
46
int VecLoadLatency = 7;
46
47
// Latency of a simple store operation.
47
48
int StoreLatency = 1;
48
49
// FIXME:
49
50
let HighLatency = 25; // FIXME: any better choice?
50
- // AMD SOG 19h , 2.8 Optimizing Branching
51
+ // AMD SOG Zen4 , 2.8 Optimizing Branching
51
52
// The branch misprediction penalty is in the range from 11 to 18 cycles,
52
53
// <...>. The common case penalty is 13 cycles.
53
54
let MispredictPenalty = 13;
@@ -64,7 +65,7 @@ let SchedModel = Znver4Model in {
64
65
// RCU
65
66
//===----------------------------------------------------------------------===//
66
67
67
- // AMD SOG 19h , 2.10.3 Retire Control Unit
68
+ // AMD SOG Zen4 , 2.10.3 Retire Control Unit
68
69
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
69
70
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
70
71
// The retire unit handles in-order commit of up to nine macro ops per cycle.
@@ -74,27 +75,27 @@ def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
74
75
// Integer Execution Unit
75
76
//
76
77
77
- // AMD SOG 19h , 2.4 Superscalar Organization
78
+ // AMD SOG Zen4 , 2.4 Superscalar Organization
78
79
// The processor uses four decoupled independent integer scheduler queues,
79
80
// each one servicing one ALU pipeline and one or two other pipelines
80
81
81
82
//
82
83
// Execution pipes
83
84
//===----------------------------------------------------------------------===//
84
85
85
- // AMD SOG 19h , 2.10.2 Execution Units
86
+ // AMD SOG Zen4 , 2.10.2 Execution Units
86
87
// The processor contains 4 general purpose integer execution pipes.
87
88
// Each pipe has an ALU capable of general purpose integer operations.
88
89
def Zn4ALU0 : ProcResource<1>;
89
90
def Zn4ALU1 : ProcResource<1>;
90
91
def Zn4ALU2 : ProcResource<1>;
91
92
def Zn4ALU3 : ProcResource<1>;
92
93
93
- // AMD SOG 19h , 2.10.2 Execution Units
94
+ // AMD SOG Zen4 , 2.10.2 Execution Units
94
95
// There is also a separate branch execution unit.
95
96
def Zn4BRU1 : ProcResource<1>;
96
97
97
- // AMD SOG 19h , 2.10.2 Execution Units
98
+ // AMD SOG Zen4 , 2.10.2 Execution Units
98
99
// There are three Address Generation Units (AGUs) for all load and store
99
100
// address generation. There are also 3 store data movement units
100
101
// associated with the same schedulers as the AGUs.
@@ -106,11 +107,11 @@ def Zn4AGU2 : ProcResource<1>;
106
107
// Execution Units
107
108
//===----------------------------------------------------------------------===//
108
109
109
- // AMD SOG 19h , 2.10.2 Execution Units
110
+ // AMD SOG Zen4 , 2.10.2 Execution Units
110
111
// ALU0 additionally has divide <...> execution capability.
111
112
defvar Zn4Divider = Zn4ALU0;
112
113
113
- // AMD SOG 19h , 2.10.2 Execution Units
114
+ // AMD SOG Zen4 , 2.10.2 Execution Units
114
115
// ALU0 additionally has <...> branch execution capability.
115
116
defvar Zn4BRU0 = Zn4ALU0;
116
117
@@ -143,14 +144,14 @@ def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
143
144
// Scheduling
144
145
//===----------------------------------------------------------------------===//
145
146
146
- // AMD SOG 19h , 2.10.3 Retire Control Unit
147
+ // AMD SOG Zen4 , 2.10.3 Retire Control Unit
147
148
// The integer physical register file (PRF) consists of 224 registers.
148
149
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
149
150
6, // Max moves that can be eliminated per cycle.
150
151
0>; // Restrict move elimination to zero regs.
151
152
152
153
// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
153
- // AMD SOG 19h , 2.10.1 Schedulers
154
+ // AMD SOG Zen4 , 2.10.1 Schedulers
154
155
// The schedulers can receive up to six macro ops per cycle, with a limit of
155
156
// two per scheduler. Each scheduler can issue one micro op per cycle into
156
157
// each of its associated pipelines
@@ -167,15 +168,15 @@ def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
167
168
// Floating-Point Unit
168
169
//
169
170
170
- // AMD SOG 19h , 2.4 Superscalar Organization
171
+ // AMD SOG Zen4 , 2.4 Superscalar Organization
171
172
// The processor uses <...> two decoupled independent floating point schedulers
172
173
// each servicing two FP pipelines and one store or FP-to-integer pipeline.
173
174
174
175
//
175
176
// Execution pipes
176
177
//===----------------------------------------------------------------------===//
177
178
178
- // AMD SOG 19h , 2.10.1 Schedulers
179
+ // AMD SOG Zen4 , 2.10.1 Schedulers
179
180
// <...>, and six FPU pipes.
180
181
// Agner, 22.10 Floating point execution pipes
181
182
// There are six floating point/vector execution pipes,
@@ -188,7 +189,7 @@ def Zn4FP45 : ProcResource<2>;
188
189
//
189
190
// Execution Units
190
191
//===----------------------------------------------------------------------===//
191
- // AMD SOG 19h , 2.11.1 Floating Point Execution Resources
192
+ // AMD SOG Zen4 , 2.11.1 Floating Point Execution Resources
192
193
193
194
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
194
195
defvar Zn4FPFMul0 = Zn4FP0;
@@ -203,7 +204,7 @@ defvar Zn4FPFCvt0 = Zn4FP2;
203
204
defvar Zn4FPFCvt1 = Zn4FP3;
204
205
205
206
// All Divide and Square Root except Reciprocal Approximation
206
- // AMD SOG 19h , 2.11.1 Floating Point Execution Resources
207
+ // AMD SOG Zen4 , 2.11.1 Floating Point Execution Resources
207
208
// FDIV unit can support 2 simultaneous operations in flight
208
209
// even though it occupies a single pipe.
209
210
// FIXME: BufferSize=2 ?
@@ -252,7 +253,7 @@ defvar Zn4FPCLM1 = Zn4FP1;
252
253
// Execution pipeline grouping
253
254
//===----------------------------------------------------------------------===//
254
255
255
- // AMD SOG 19h , 2.11 Floating-Point Unit
256
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
256
257
// Stores and floating point to general purpose register transfer
257
258
// have 2 dedicated pipelines (pipe 5 and 6).
258
259
def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
@@ -281,12 +282,12 @@ def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
281
282
def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
282
283
283
284
// Loads, Stores and Move to General Register (EX) Operations
284
- // AMD SOG 19h , 2.11 Floating-Point Unit
285
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
285
286
// Stores and floating point to general purpose register transfer
286
287
// have 2 dedicated pipelines (pipe 5 and 6).
287
288
defvar Zn4FPLd01 = Zn4FP45;
288
289
289
- // AMD SOG 19h , 2.11 Floating-Point Unit
290
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
290
291
// Note that FP stores are supported on two pipelines,
291
292
// but throughput is limited to one per cycle.
292
293
let Super = Zn4FP45 in
@@ -334,9 +335,9 @@ def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0,
334
335
6, // Max moves that can be eliminated per cycle.
335
336
0>; // Restrict move elimination to zero regs.
336
337
337
- // AMD SOG 19h , 2.11 Floating-Point Unit
338
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
338
339
// The floating-point scheduler has a 2*32 entry macro op capacity.
339
- // AMD SOG 19h , 2.11 Floating-Point Unit
340
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
340
341
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
341
342
// FIXME: those are two separate schedulers, not a single big one.
342
343
def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
@@ -345,7 +346,7 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
345
346
let BufferSize = !mul(2, 32);
346
347
}
347
348
348
- // AMD SOG 19h , 2.11 Floating-Point Unit
349
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
349
350
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
350
351
// even if floating-point scheduler is full.
351
352
// FIXME: how to model this properly?
@@ -355,27 +356,27 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
355
356
// Load-Store Unit
356
357
//
357
358
358
- // AMD SOG 19h , 2.12 Load-Store Unit
359
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
359
360
// The LS unit contains three largely independent pipe-lines
360
361
// enabling the execution of three 256-bit memory operations per cycle.
361
362
def Zn4LSU : ProcResource<3>;
362
363
363
- // AMD SOG 19h , 2.12 Load-Store Unit
364
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
364
365
// All three memory operations can be loads.
365
366
let Super = Zn4LSU in
366
367
def Zn4Load : ProcResource<3> {
367
- // AMD SOG 19h , 2.12 Load-Store Unit
368
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
368
369
// The LS unit can process up to 72 out-of-order loads.
369
370
let BufferSize = 72;
370
371
}
371
372
372
373
def Zn4LoadQueue : LoadQueue<Zn4Load>;
373
374
374
- // AMD SOG 19h , 2.12 Load-Store Unit
375
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
375
376
// A maximum of two of the memory operations can be stores.
376
377
let Super = Zn4LSU in
377
378
def Zn4Store : ProcResource<2> {
378
- // AMD SOG 19h , 2.12 Load-Store Unit
379
+ // AMD SOG Zen4 , 2.12 Load-Store Unit
379
380
// The LS unit utilizes a 64-entry store queue (STQ).
380
381
let BufferSize = 64;
381
382
}
@@ -491,7 +492,7 @@ def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
491
492
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
492
493
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
493
494
494
- // AMD SOG 19h , 2.11 Floating-Point Unit
495
+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
495
496
// There is 1 cycle of added latency for a result to cross
496
497
// from F to I or I to F domain.
497
498
def : ReadAdvance<ReadInt2Fpu, -1>;
0 commit comments