Skip to content

Commit b544217

Browse files
mbrkusanin authored and jayfoad committed
[AMDGPU] Fix setting nontemporal in memory legalizer (llvm#83815)
The iterator MI can be advanced by insertWait(), but the original instruction is needed to set the temporal hint. Move the non-temporal handling so that it runs before the volatile handling.
1 parent 78b99c7 commit b544217

5 files changed

710 additions and 5 deletions

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2358,6 +2358,11 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23582358

23592359
bool Changed = false;
23602360

2361+
if (IsNonTemporal) {
2362+
// Set non-temporal hint for all cache levels.
2363+
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2364+
}
2365+
23612366
if (IsVolatile) {
23622367
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
23632368

@@ -2370,11 +2375,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23702375
Position::AFTER);
23712376
}
23722377

2373-
if (IsNonTemporal) {
2374-
// Set non-temporal hint for all cache levels.
2375-
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2376-
}
2377-
23782378
return Changed;
23792379
}
23802380

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

Lines changed: 165 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -582,5 +582,170 @@ entry:
582582
ret void
583583
}
584584

585+
define amdgpu_kernel void @flat_nontemporal_volatile_load(
586+
; GFX7-LABEL: flat_nontemporal_volatile_load:
587+
; GFX7: ; %bb.0: ; %entry
588+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
589+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
590+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
591+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
592+
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
593+
; GFX7-NEXT: s_waitcnt vmcnt(0)
594+
; GFX7-NEXT: v_mov_b32_e32 v0, s2
595+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
596+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
597+
; GFX7-NEXT: flat_store_dword v[0:1], v2
598+
; GFX7-NEXT: s_endpgm
599+
;
600+
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
601+
; GFX10-WGP: ; %bb.0: ; %entry
602+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
603+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
604+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
605+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
606+
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
607+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
608+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
609+
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
610+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
611+
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
612+
; GFX10-WGP-NEXT: s_endpgm
613+
;
614+
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
615+
; GFX10-CU: ; %bb.0: ; %entry
616+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
617+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
618+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
619+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
620+
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
621+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
622+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
623+
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
624+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
625+
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
626+
; GFX10-CU-NEXT: s_endpgm
627+
;
628+
; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
629+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
630+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
631+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
632+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
633+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
634+
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
635+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
636+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
637+
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
638+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
639+
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
640+
; SKIP-CACHE-INV-NEXT: s_endpgm
641+
;
642+
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
643+
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
644+
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
645+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
646+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
647+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
648+
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
649+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
650+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
651+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
652+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
653+
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
654+
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
655+
;
656+
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
657+
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
658+
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
659+
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
660+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
661+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
662+
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
663+
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
664+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
665+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
666+
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
667+
; GFX90A-TGSPLIT-NEXT: s_endpgm
668+
;
669+
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
670+
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
671+
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
672+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
673+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
674+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
675+
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
676+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
677+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
678+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
679+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
680+
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
681+
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
682+
;
683+
; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
684+
; GFX940-TGSPLIT: ; %bb.0: ; %entry
685+
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
686+
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
687+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
688+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
689+
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
690+
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
691+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
692+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
693+
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
694+
; GFX940-TGSPLIT-NEXT: s_endpgm
695+
;
696+
; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
697+
; GFX11-WGP: ; %bb.0: ; %entry
698+
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
699+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
700+
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
701+
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
702+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
703+
; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
704+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
705+
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
706+
; GFX11-WGP-NEXT: s_endpgm
707+
;
708+
; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
709+
; GFX11-CU: ; %bb.0: ; %entry
710+
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
711+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
712+
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
713+
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
714+
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
715+
; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
716+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
717+
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
718+
; GFX11-CU-NEXT: s_endpgm
719+
;
720+
; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
721+
; GFX12-WGP: ; %bb.0: ; %entry
722+
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
723+
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
724+
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
725+
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
726+
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
727+
; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
728+
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
729+
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
730+
; GFX12-WGP-NEXT: s_endpgm
731+
;
732+
; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
733+
; GFX12-CU: ; %bb.0: ; %entry
734+
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
735+
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
736+
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
737+
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
738+
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
739+
; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
740+
; GFX12-CU-NEXT: s_wait_dscnt 0x0
741+
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
742+
; GFX12-CU-NEXT: s_endpgm
743+
ptr %in, ptr %out) {
744+
entry:
745+
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
746+
store i32 %val, ptr %out
747+
ret void
748+
}
749+
585750
!0 = !{i32 1}
586751
declare i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

Lines changed: 158 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -576,5 +576,163 @@ entry:
576576
ret void
577577
}
578578

579+
define amdgpu_kernel void @global_nontemporal_volatile_load(
580+
; GFX6-LABEL: global_nontemporal_volatile_load:
581+
; GFX6: ; %bb.0: ; %entry
582+
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
583+
; GFX6-NEXT: s_mov_b32 s7, 0x100f000
584+
; GFX6-NEXT: s_mov_b32 s6, -1
585+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
586+
; GFX6-NEXT: s_mov_b32 s4, s0
587+
; GFX6-NEXT: s_mov_b32 s5, s1
588+
; GFX6-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
589+
; GFX6-NEXT: s_waitcnt vmcnt(0)
590+
; GFX6-NEXT: s_mov_b32 s4, s2
591+
; GFX6-NEXT: s_mov_b32 s5, s3
592+
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
593+
; GFX6-NEXT: s_endpgm
594+
;
595+
; GFX7-LABEL: global_nontemporal_volatile_load:
596+
; GFX7: ; %bb.0: ; %entry
597+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
598+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
599+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
600+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
601+
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
602+
; GFX7-NEXT: s_waitcnt vmcnt(0)
603+
; GFX7-NEXT: v_mov_b32_e32 v0, s2
604+
; GFX7-NEXT: v_mov_b32_e32 v1, s3
605+
; GFX7-NEXT: flat_store_dword v[0:1], v2
606+
; GFX7-NEXT: s_endpgm
607+
;
608+
; GFX10-WGP-LABEL: global_nontemporal_volatile_load:
609+
; GFX10-WGP: ; %bb.0: ; %entry
610+
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
611+
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
612+
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
613+
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
614+
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
615+
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
616+
; GFX10-WGP-NEXT: s_endpgm
617+
;
618+
; GFX10-CU-LABEL: global_nontemporal_volatile_load:
619+
; GFX10-CU: ; %bb.0: ; %entry
620+
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
621+
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
622+
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
623+
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
624+
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
625+
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
626+
; GFX10-CU-NEXT: s_endpgm
627+
;
628+
; SKIP-CACHE-INV-LABEL: global_nontemporal_volatile_load:
629+
; SKIP-CACHE-INV: ; %bb.0: ; %entry
630+
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
631+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
632+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
633+
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
634+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
635+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
636+
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
637+
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
638+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
639+
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
640+
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
641+
; SKIP-CACHE-INV-NEXT: s_endpgm
642+
;
643+
; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
644+
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
645+
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
646+
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
647+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
648+
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
649+
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
650+
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
651+
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
652+
;
653+
; GFX90A-TGSPLIT-LABEL: global_nontemporal_volatile_load:
654+
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
655+
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
656+
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
657+
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
658+
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
659+
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
660+
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
661+
; GFX90A-TGSPLIT-NEXT: s_endpgm
662+
;
663+
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_volatile_load:
664+
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
665+
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
666+
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
667+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
668+
; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
669+
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
670+
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
671+
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
672+
;
673+
; GFX940-TGSPLIT-LABEL: global_nontemporal_volatile_load:
674+
; GFX940-TGSPLIT: ; %bb.0: ; %entry
675+
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
676+
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
677+
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
678+
; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
679+
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
680+
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
681+
; GFX940-TGSPLIT-NEXT: s_endpgm
682+
;
683+
; GFX11-WGP-LABEL: global_nontemporal_volatile_load:
684+
; GFX11-WGP: ; %bb.0: ; %entry
685+
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
686+
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
687+
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
688+
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
689+
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
690+
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
691+
; GFX11-WGP-NEXT: s_nop 0
692+
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
693+
; GFX11-WGP-NEXT: s_endpgm
694+
;
695+
; GFX11-CU-LABEL: global_nontemporal_volatile_load:
696+
; GFX11-CU: ; %bb.0: ; %entry
697+
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
698+
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
699+
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
700+
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
701+
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
702+
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
703+
; GFX11-CU-NEXT: s_nop 0
704+
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
705+
; GFX11-CU-NEXT: s_endpgm
706+
;
707+
; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
708+
; GFX12-WGP: ; %bb.0: ; %entry
709+
; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
710+
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
711+
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
712+
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
713+
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
714+
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
715+
; GFX12-WGP-NEXT: s_nop 0
716+
; GFX12-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
717+
; GFX12-WGP-NEXT: s_endpgm
718+
;
719+
; GFX12-CU-LABEL: global_nontemporal_volatile_load:
720+
; GFX12-CU: ; %bb.0: ; %entry
721+
; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
722+
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
723+
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
724+
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
725+
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
726+
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[2:3]
727+
; GFX12-CU-NEXT: s_nop 0
728+
; GFX12-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
729+
; GFX12-CU-NEXT: s_endpgm
730+
ptr addrspace(1) %in, ptr addrspace(1) %out) {
731+
entry:
732+
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
733+
store i32 %val, ptr addrspace(1) %out
734+
ret void
735+
}
736+
579737
!0 = !{i32 1}
580738
declare i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)