Skip to content

Commit 3efe832

Browse files
committed
[AArch64] Fix chain for calls from agnostic-ZA functions.
The lowering code passed the wrong chain value (`Chain` instead of `Result`) when emitting the SME state restore after a call. As a result, for calls from streaming agnostic-ZA functions to non-streaming private-ZA functions, the 'smstart' that must follow the call was incorrectly removed from the DAG.
1 parent 5609724 commit 3efe832

File tree

2 files changed

+119
-1
lines changed

2 files changed

+119
-1
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9664,7 +9664,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
96649664
DAG.getConstant(0, DL, MVT::i64));
96659665
TPIDR2.Uses++;
96669666
} else if (RequiresSaveAllZA) {
9667-
Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9667+
Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
96689668
/*IsSave=*/false);
96699669
}
96709670

llvm/test/CodeGen/AArch64/sme-agnostic-za.ll

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,121 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a
8282
%res = call i64 @agnostic_decl(i64 %v)
8383
ret i64 %res
8484
}
85+
86+
; agnostic-ZA + streaming -> private-ZA + non-streaming
87+
define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" {
88+
; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
89+
; CHECK: // %bb.0:
90+
; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
91+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
92+
; CHECK-NEXT: mov x9, x0
93+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
94+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
95+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
96+
; CHECK-NEXT: bl __arm_get_current_vg
97+
; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill
98+
; CHECK-NEXT: mov x0, x9
99+
; CHECK-NEXT: add x29, sp, #64
100+
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
101+
; CHECK-NEXT: mov x8, x0
102+
; CHECK-NEXT: bl __arm_sme_state_size
103+
; CHECK-NEXT: sub sp, sp, x0
104+
; CHECK-NEXT: mov x20, sp
105+
; CHECK-NEXT: mov x0, x20
106+
; CHECK-NEXT: bl __arm_sme_save
107+
; CHECK-NEXT: smstop sm
108+
; CHECK-NEXT: mov x0, x8
109+
; CHECK-NEXT: bl private_za_decl
110+
; CHECK-NEXT: mov x1, x0
111+
; CHECK-NEXT: smstart sm
112+
; CHECK-NEXT: mov x0, x20
113+
; CHECK-NEXT: bl __arm_sme_restore
114+
; CHECK-NEXT: mov x0, x20
115+
; CHECK-NEXT: bl __arm_sme_save
116+
; CHECK-NEXT: smstop sm
117+
; CHECK-NEXT: mov x0, x1
118+
; CHECK-NEXT: bl private_za_decl
119+
; CHECK-NEXT: mov x1, x0
120+
; CHECK-NEXT: smstart sm
121+
; CHECK-NEXT: mov x0, x20
122+
; CHECK-NEXT: bl __arm_sme_restore
123+
; CHECK-NEXT: mov x0, x1
124+
; CHECK-NEXT: sub sp, x29, #64
125+
; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
126+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
127+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
128+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
129+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
130+
; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
131+
; CHECK-NEXT: ret
132+
%res = call i64 @private_za_decl(i64 %v)
133+
%res2 = call i64 @private_za_decl(i64 %res)
134+
ret i64 %res2
135+
}
136+
137+
; agnostic-ZA + streaming-compatible -> private-ZA + non-streaming
138+
define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" {
139+
; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
140+
; CHECK: // %bb.0:
141+
; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
142+
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
143+
; CHECK-NEXT: mov x9, x0
144+
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
145+
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
146+
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
147+
; CHECK-NEXT: bl __arm_get_current_vg
148+
; CHECK-NEXT: str x0, [sp, #80] // 8-byte Folded Spill
149+
; CHECK-NEXT: mov x0, x9
150+
; CHECK-NEXT: add x29, sp, #64
151+
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
152+
; CHECK-NEXT: mov x8, x0
153+
; CHECK-NEXT: bl __arm_sme_state_size
154+
; CHECK-NEXT: sub sp, sp, x0
155+
; CHECK-NEXT: mov x19, sp
156+
; CHECK-NEXT: mov x0, x19
157+
; CHECK-NEXT: bl __arm_sme_save
158+
; CHECK-NEXT: bl __arm_sme_state
159+
; CHECK-NEXT: and x20, x0, #0x1
160+
; CHECK-NEXT: tbz w20, #0, .LBB5_2
161+
; CHECK-NEXT: // %bb.1:
162+
; CHECK-NEXT: smstop sm
163+
; CHECK-NEXT: .LBB5_2:
164+
; CHECK-NEXT: mov x0, x8
165+
; CHECK-NEXT: bl private_za_decl
166+
; CHECK-NEXT: mov x2, x0
167+
; CHECK-NEXT: tbz w20, #0, .LBB5_4
168+
; CHECK-NEXT: // %bb.3:
169+
; CHECK-NEXT: smstart sm
170+
; CHECK-NEXT: .LBB5_4:
171+
; CHECK-NEXT: mov x0, x19
172+
; CHECK-NEXT: bl __arm_sme_restore
173+
; CHECK-NEXT: mov x0, x19
174+
; CHECK-NEXT: bl __arm_sme_save
175+
; CHECK-NEXT: bl __arm_sme_state
176+
; CHECK-NEXT: and x20, x0, #0x1
177+
; CHECK-NEXT: tbz w20, #0, .LBB5_6
178+
; CHECK-NEXT: // %bb.5:
179+
; CHECK-NEXT: smstop sm
180+
; CHECK-NEXT: .LBB5_6:
181+
; CHECK-NEXT: mov x0, x2
182+
; CHECK-NEXT: bl private_za_decl
183+
; CHECK-NEXT: mov x1, x0
184+
; CHECK-NEXT: tbz w20, #0, .LBB5_8
185+
; CHECK-NEXT: // %bb.7:
186+
; CHECK-NEXT: smstart sm
187+
; CHECK-NEXT: .LBB5_8:
188+
; CHECK-NEXT: mov x0, x19
189+
; CHECK-NEXT: bl __arm_sme_restore
190+
; CHECK-NEXT: mov x0, x1
191+
; CHECK-NEXT: sub sp, x29, #64
192+
; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
193+
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
194+
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
195+
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
196+
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
197+
; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
198+
; CHECK-NEXT: ret
199+
%res = call i64 @private_za_decl(i64 %v)
200+
%res2 = call i64 @private_za_decl(i64 %res)
201+
ret i64 %res2
202+
}

0 commit comments

Comments
 (0)