@@ -582,5 +582,170 @@ entry:
582
582
ret void
583
583
}
584
584
585
; Kernel under test: an i32 is read from %in by a load that is both `volatile`
; and tagged `!nontemporal !0`, then written to %out with a plain store.
; The check blocks that sit between the opening paren of the signature and the
; parameter list give the expected assembly for each check prefix. In every
; block the load is followed directly by a wait on the load counter
; (s_waitcnt vmcnt(0), or s_wait_loadcnt 0x0 under the GFX12 prefixes).
; Cache-control bits expected on the load, as written in the checks below:
;   GFX7, SKIP-CACHE-INV and GFX90A prefixes -> glc
;   GFX10 and GFX11 prefixes                 -> glc dlc
;   GFX940 prefixes                          -> sc0 sc1 (also on the store)
;   GFX12 prefixes                           -> th:TH_LOAD_NT scope:SCOPE_SYS
; NOTE(review): the check lines look machine-generated; presumably they should
; be regenerated with the project's update script, not edited by hand - confirm.
+ define amdgpu_kernel void @flat_nontemporal_volatile_load (
586
+ ; GFX7-LABEL: flat_nontemporal_volatile_load:
587
+ ; GFX7: ; %bb.0: ; %entry
588
+ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
589
+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
590
+ ; GFX7-NEXT: v_mov_b32_e32 v0, s0
591
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s1
592
+ ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
593
+ ; GFX7-NEXT: s_waitcnt vmcnt(0)
594
+ ; GFX7-NEXT: v_mov_b32_e32 v0, s2
595
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s3
596
+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
597
+ ; GFX7-NEXT: flat_store_dword v[0:1], v2
598
+ ; GFX7-NEXT: s_endpgm
599
+ ;
600
+ ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
601
+ ; GFX10-WGP: ; %bb.0: ; %entry
602
+ ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
603
+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
604
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
605
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
606
+ ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
607
+ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
608
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
609
+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
610
+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
611
+ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
612
+ ; GFX10-WGP-NEXT: s_endpgm
613
+ ;
614
+ ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
615
+ ; GFX10-CU: ; %bb.0: ; %entry
616
+ ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
617
+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
618
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
619
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
620
+ ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
621
+ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
622
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
623
+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
624
+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
625
+ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
626
+ ; GFX10-CU-NEXT: s_endpgm
627
+ ;
628
+ ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
629
+ ; SKIP-CACHE-INV: ; %bb.0: ; %entry
630
+ ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
631
+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
632
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
633
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
634
+ ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
635
+ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
636
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
637
+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
638
+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
639
+ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
640
+ ; SKIP-CACHE-INV-NEXT: s_endpgm
641
+ ;
642
+ ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
643
+ ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
644
+ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
645
+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
646
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
647
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
648
+ ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
649
+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
650
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
651
+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
652
+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
653
+ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
654
+ ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
655
+ ;
656
+ ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
657
+ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
658
+ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
659
+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
660
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
661
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
662
+ ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
663
+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
664
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
665
+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
666
+ ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
667
+ ; GFX90A-TGSPLIT-NEXT: s_endpgm
668
+ ;
669
+ ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
670
+ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
671
+ ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
672
+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
673
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
674
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
675
+ ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
676
+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
677
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
678
+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
679
+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
680
+ ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
681
+ ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
682
+ ;
683
+ ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
684
+ ; GFX940-TGSPLIT: ; %bb.0: ; %entry
685
+ ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
686
+ ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
687
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
688
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
689
+ ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
690
+ ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
691
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
692
+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
693
+ ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
694
+ ; GFX940-TGSPLIT-NEXT: s_endpgm
695
+ ;
696
+ ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
697
+ ; GFX11-WGP: ; %bb.0: ; %entry
698
+ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
699
+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
700
+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
701
+ ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
702
+ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
703
+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
704
+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
705
+ ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
706
+ ; GFX11-WGP-NEXT: s_endpgm
707
+ ;
708
+ ; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
709
+ ; GFX11-CU: ; %bb.0: ; %entry
710
+ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
711
+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
712
+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
713
+ ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
714
+ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
715
+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
716
+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
717
+ ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
718
+ ; GFX11-CU-NEXT: s_endpgm
719
+ ;
720
+ ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
721
+ ; GFX12-WGP: ; %bb.0: ; %entry
722
+ ; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
723
+ ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
724
+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
725
+ ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
726
+ ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
727
+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
728
+ ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
729
+ ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
730
+ ; GFX12-WGP-NEXT: s_endpgm
731
+ ;
732
+ ; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
733
+ ; GFX12-CU: ; %bb.0: ; %entry
734
+ ; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
735
+ ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
736
+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
737
+ ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
738
+ ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
739
+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
740
+ ; GFX12-CU-NEXT: s_wait_dscnt 0x0
741
+ ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
742
+ ; GFX12-CU-NEXT: s_endpgm
743
+ ptr %in , ptr %out ) {
744
+ entry:
745
; The combination under test: `volatile` and `!nontemporal` on the same load.
; Metadata node !0 is defined after this function.
+ %val = load volatile i32 , ptr %in , align 4 , !nontemporal !0
746
; The store carries neither `volatile` nor `!nontemporal`.
+ store i32 %val , ptr %out
747
+ ret void
748
+ }
749
+
585
750
; Metadata node referenced as `!nontemporal !0` by the load in
; @flat_nontemporal_volatile_load; it holds the single operand i32 1.
!0 = !{i32 1 }
586
751
; Declaration of the workitem-id intrinsic; @flat_nontemporal_volatile_load does
; not call it. NOTE(review): presumably used by kernels outside this view.
declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments