@@ -692,148 +692,148 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
692
692
;
693
693
; GFX9-LABEL: sdivrem_v2i32:
694
694
; GFX9: ; %bb.0:
695
- ; GFX9-NEXT: s_load_dwordx8 s[0:7 ], s[4:5], 0x0
695
+ ; GFX9-NEXT: s_load_dwordx8 s[8:15 ], s[4:5], 0x0
696
696
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
697
- ; GFX9-NEXT: s_ashr_i32 s8, s6 , 31
698
- ; GFX9-NEXT: s_add_i32 s6, s6, s8
699
- ; GFX9-NEXT: s_xor_b32 s6, s6, s8
700
- ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
701
- ; GFX9-NEXT: s_ashr_i32 s9, s7 , 31
702
- ; GFX9-NEXT: s_add_i32 s7, s7, s9
703
- ; GFX9-NEXT: s_xor_b32 s7, s7, s9
697
+ ; GFX9-NEXT: s_ashr_i32 s0, s14 , 31
698
+ ; GFX9-NEXT: s_add_i32 s1, s14, s0
699
+ ; GFX9-NEXT: s_xor_b32 s1, s1, s0
700
+ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
701
+ ; GFX9-NEXT: s_ashr_i32 s2, s15 , 31
702
+ ; GFX9-NEXT: s_add_i32 s3, s15, s2
703
+ ; GFX9-NEXT: s_xor_b32 s3, s3, s2
704
704
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
705
- ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
706
- ; GFX9-NEXT: s_sub_i32 s12 , 0, s6
707
- ; GFX9-NEXT: s_ashr_i32 s10, s4 , 31
705
+ ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
706
+ ; GFX9-NEXT: s_sub_i32 s6 , 0, s1
707
+ ; GFX9-NEXT: s_ashr_i32 s4, s12 , 31
708
708
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
709
709
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
710
710
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
711
- ; GFX9-NEXT: s_add_i32 s4, s4, s10
712
- ; GFX9-NEXT: s_xor_b32 s4, s4, s10
713
- ; GFX9-NEXT: v_mul_lo_u32 v2, s12 , v0
711
+ ; GFX9-NEXT: s_sub_i32 s7, 0, s3
712
+ ; GFX9-NEXT: s_ashr_i32 s5, s13, 31
713
+ ; GFX9-NEXT: v_mul_lo_u32 v2, s6 , v0
714
714
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
715
715
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
716
- ; GFX9-NEXT: s_sub_i32 s12, 0, s7
716
+ ; GFX9-NEXT: s_add_i32 s6, s12, s4
717
717
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
718
- ; GFX9-NEXT: s_ashr_i32 s11, s5, 31
719
- ; GFX9-NEXT: v_mul_lo_u32 v3, s12 , v1
720
- ; GFX9-NEXT: s_add_i32 s5, s5, s11
718
+ ; GFX9-NEXT: s_xor_b32 s6, s6, s4
719
+ ; GFX9-NEXT: v_mul_lo_u32 v3, s7 , v1
720
+ ; GFX9-NEXT: s_add_i32 s7, s13, s5
721
721
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
722
- ; GFX9-NEXT: v_mul_hi_u32 v0, s4 , v0
722
+ ; GFX9-NEXT: v_mul_hi_u32 v0, s6 , v0
723
723
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
724
- ; GFX9-NEXT: s_xor_b32 s5, s5, s11
725
- ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
724
+ ; GFX9-NEXT: s_xor_b32 s7, s7, s5
725
+ ; GFX9-NEXT: s_xor_b32 s0, s4, s0
726
+ ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1
726
727
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
727
728
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
728
- ; GFX9-NEXT: v_mul_hi_u32 v1, s5 , v1
729
- ; GFX9-NEXT: v_sub_u32_e32 v3, s4 , v3
730
- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6 , v3
729
+ ; GFX9-NEXT: v_mul_hi_u32 v1, s7 , v1
730
+ ; GFX9-NEXT: v_sub_u32_e32 v3, s6 , v3
731
+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1 , v3
731
732
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
732
- ; GFX9-NEXT: v_subrev_u32_e32 v2, s6 , v3
733
+ ; GFX9-NEXT: v_subrev_u32_e32 v2, s1 , v3
733
734
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
734
735
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
735
- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6 , v2
736
+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1 , v2
736
737
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
737
- ; GFX9-NEXT: v_subrev_u32_e32 v3, s6 , v2
738
+ ; GFX9-NEXT: v_subrev_u32_e32 v3, s1 , v2
738
739
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
739
- ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
740
+ ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
740
741
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
741
- ; GFX9-NEXT: s_xor_b32 s4, s10, s8
742
- ; GFX9-NEXT: v_xor_b32_e32 v0, s4 , v0
743
- ; GFX9-NEXT: v_sub_u32_e32 v3, s5 , v3
744
- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7 , v3
742
+ ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
743
+ ; GFX9-NEXT: v_subrev_u32_e32 v0, s0 , v0
744
+ ; GFX9-NEXT: v_sub_u32_e32 v3, s7 , v3
745
+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3 , v3
745
746
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
746
- ; GFX9-NEXT: v_subrev_u32_e32 v4, s7 , v3
747
+ ; GFX9-NEXT: v_subrev_u32_e32 v4, s3 , v3
747
748
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
748
749
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
749
- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
750
- ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
750
+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
751
751
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
752
- ; GFX9-NEXT: v_subrev_u32_e32 v4, s7 , v3
753
- ; GFX9-NEXT: s_xor_b32 s4, s11, s9
752
+ ; GFX9-NEXT: v_subrev_u32_e32 v4, s3 , v3
753
+ ; GFX9-NEXT: s_xor_b32 s0, s5, s2
754
754
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
755
- ; GFX9-NEXT: v_xor_b32_e32 v1, s4 , v1
756
- ; GFX9-NEXT: v_xor_b32_e32 v2, s10 , v2
757
- ; GFX9-NEXT: v_subrev_u32_e32 v1, s4 , v1
758
- ; GFX9-NEXT: v_xor_b32_e32 v3, s11 , v3
755
+ ; GFX9-NEXT: v_xor_b32_e32 v1, s0 , v1
756
+ ; GFX9-NEXT: v_xor_b32_e32 v2, s4 , v2
757
+ ; GFX9-NEXT: v_subrev_u32_e32 v1, s0 , v1
758
+ ; GFX9-NEXT: v_xor_b32_e32 v3, s5 , v3
759
759
; GFX9-NEXT: v_mov_b32_e32 v4, 0
760
- ; GFX9-NEXT: v_subrev_u32_e32 v2, s10 , v2
761
- ; GFX9-NEXT: v_subrev_u32_e32 v3, s11 , v3
762
- ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1 ]
763
- ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3 ]
760
+ ; GFX9-NEXT: v_subrev_u32_e32 v2, s4 , v2
761
+ ; GFX9-NEXT: v_subrev_u32_e32 v3, s5 , v3
762
+ ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9 ]
763
+ ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11 ]
764
764
; GFX9-NEXT: s_endpgm
765
765
;
766
766
; GFX10-LABEL: sdivrem_v2i32:
767
767
; GFX10: ; %bb.0:
768
- ; GFX10-NEXT: s_load_dwordx8 s[4:11 ], s[4:5], 0x0
768
+ ; GFX10-NEXT: s_load_dwordx8 s[8:15 ], s[4:5], 0x0
769
769
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
770
- ; GFX10-NEXT: s_ashr_i32 s1, s10 , 31
771
- ; GFX10-NEXT: s_ashr_i32 s2, s11 , 31
772
- ; GFX10-NEXT: s_add_i32 s0, s10 , s1
773
- ; GFX10-NEXT: s_add_i32 s3, s11 , s2
774
- ; GFX10-NEXT: s_xor_b32 s10 , s0, s1
770
+ ; GFX10-NEXT: s_ashr_i32 s1, s14 , 31
771
+ ; GFX10-NEXT: s_ashr_i32 s2, s15 , 31
772
+ ; GFX10-NEXT: s_add_i32 s0, s14 , s1
773
+ ; GFX10-NEXT: s_add_i32 s3, s15 , s2
774
+ ; GFX10-NEXT: s_xor_b32 s4 , s0, s1
775
775
; GFX10-NEXT: s_xor_b32 s3, s3, s2
776
- ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
776
+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
777
777
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
778
- ; GFX10-NEXT: s_sub_i32 s0, 0, s10
779
- ; GFX10-NEXT: s_sub_i32 s11 , 0, s3
780
- ; GFX10-NEXT: s_ashr_i32 s12, s9 , 31
778
+ ; GFX10-NEXT: s_sub_i32 s0, 0, s4
779
+ ; GFX10-NEXT: s_sub_i32 s5 , 0, s3
780
+ ; GFX10-NEXT: s_ashr_i32 s6, s13 , 31
781
781
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
782
782
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
783
+ ; GFX10-NEXT: s_add_i32 s7, s13, s6
784
+ ; GFX10-NEXT: s_xor_b32 s7, s7, s6
783
785
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
784
786
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
785
787
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
786
788
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
787
789
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
788
- ; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
789
- ; GFX10-NEXT: s_ashr_i32 s11, s8, 31
790
- ; GFX10-NEXT: s_add_i32 s0, s8, s11
791
- ; GFX10-NEXT: s_add_i32 s8, s9, s12
792
- ; GFX10-NEXT: s_xor_b32 s0, s0, s11
793
- ; GFX10-NEXT: s_xor_b32 s8, s8, s12
790
+ ; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1
791
+ ; GFX10-NEXT: s_ashr_i32 s5, s12, 31
792
+ ; GFX10-NEXT: s_add_i32 s0, s12, s5
793
+ ; GFX10-NEXT: s_xor_b32 s1, s5, s1
794
+ ; GFX10-NEXT: s_xor_b32 s0, s0, s5
794
795
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
795
796
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
796
- ; GFX10-NEXT: s_xor_b32 s1, s11, s1
797
797
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
798
798
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
799
799
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
800
- ; GFX10-NEXT: v_mul_hi_u32 v1, s8 , v1
801
- ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
800
+ ; GFX10-NEXT: v_mul_hi_u32 v1, s7 , v1
801
+ ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4
802
802
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
803
803
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
804
804
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
805
805
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
806
- ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8 , v3
807
- ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10 , v2
806
+ ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7 , v3
807
+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4 , v2
808
808
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
809
- ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10 , v2
809
+ ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4 , v2
810
810
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
811
811
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
812
812
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
813
813
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
814
814
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
815
815
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
816
816
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
817
- ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10 , v2
817
+ ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4 , v2
818
818
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
819
- ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10 , v2
819
+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4 , v2
820
820
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
821
821
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
822
822
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
823
823
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
824
824
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
825
- ; GFX10-NEXT: s_xor_b32 s0, s12 , s2
825
+ ; GFX10-NEXT: s_xor_b32 s0, s6 , s2
826
826
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
827
827
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
828
- ; GFX10-NEXT: v_xor_b32_e32 v2, s11 , v2
829
- ; GFX10-NEXT: v_xor_b32_e32 v3, s12 , v3
828
+ ; GFX10-NEXT: v_xor_b32_e32 v2, s5 , v2
829
+ ; GFX10-NEXT: v_xor_b32_e32 v3, s6 , v3
830
830
; GFX10-NEXT: v_mov_b32_e32 v4, 0
831
831
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
832
832
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
833
- ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11 , v2
834
- ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12 , v3
835
- ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5 ]
836
- ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7 ]
833
+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5 , v2
834
+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6 , v3
835
+ ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9 ]
836
+ ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11 ]
837
837
; GFX10-NEXT: s_endpgm
838
838
%div = sdiv <2 x i32 > %x , %y
839
839
store <2 x i32 > %div , ptr addrspace (1 ) %out0
0 commit comments