Skip to content

[AMDGPU][True16] Support V_FLOOR_F16. #78446

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into llvm:main from t16
Jan 18, 2024
Merged

[AMDGPU][True16] Support V_FLOOR_F16. #78446

merged 2 commits into llvm:main from t16
Jan 18, 2024

Conversation

kosarev
Copy link
Collaborator

@kosarev kosarev commented Jan 17, 2024

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Jan 17, 2024

@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-mc

Author: Ivan Kosarev (kosarev)

Changes

Patch is 35.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78446.diff

16 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir (+64-6)
  • (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir (+21)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll (+55-5)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s (+48)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1.s (+24-18)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s (+42)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s (+17-17)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8-fake16.s (+9)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s (+8-5)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err-fake16.s (+21)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s (+31-10)
  • (modified) llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa-fake16.s (+3)
  • (modified) llvm/test/MC/AMDGPU/gfx11_unsupported_sdwa.s (+1-1)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt (+24-18)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index aa98a4b860dda9..58fdb9b724931a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5280,7 +5280,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
     return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
                                    : AMDGPU::V_CEIL_F16_fake16_e64;
   case AMDGPU::S_FLOOR_F16:
-    return AMDGPU::V_FLOOR_F16_fake16_e64;
+    return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
+                                   : AMDGPU::V_FLOOR_F16_fake16_e64;
   case AMDGPU::S_TRUNC_F16:
     return AMDGPU::V_TRUNC_F16_fake16_e64;
   case AMDGPU::S_RNDNE_F16:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index d604990dc88c20..b0dd92af4a027a 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -881,6 +881,7 @@ defm V_LOG_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16"
 defm V_EXP_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
 defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
 defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
+defm V_FLOOR_F16_t16         : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
 defm V_FLOOR_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
 defm V_CEIL_F16_t16          : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
 defm V_CEIL_F16_fake16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index 52c37ec6246c96..30975a8937db62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -1,5 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s
 
 ---
 name: ffloor_s16_ss
@@ -19,6 +21,15 @@ body: |
     ; VI-NEXT: [[FFLOOR:%[0-9]+]]:sreg_32(s16) = G_FFLOOR [[TRUNC]]
     ; VI-NEXT: [[COPY1:%[0-9]+]]:sreg_32(s32) = COPY [[FFLOOR]](s16)
     ; VI-NEXT: $sgpr0 = COPY [[COPY1]](s32)
+    ;
+    ; GCN-LABEL: name: ffloor_s16_ss
+    ; GCN: liveins: $sgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; GCN-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: [[FFLOOR:%[0-9]+]]:sreg_32(s16) = G_FFLOOR [[TRUNC]]
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32(s32) = COPY [[FFLOOR]](s16)
+    ; GCN-NEXT: $sgpr0 = COPY [[COPY1]](s32)
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
     %2:sgpr(s16) = G_FFLOOR %1
@@ -40,8 +51,24 @@ body: |
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
     ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; VI-NEXT: %2:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; VI-NEXT: $vgpr0 = COPY %2
+    ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
+    ;
+    ; GFX11-LABEL: name: ffloor_s16_vv
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
+    ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
+    ;
+    ; GFX11-FAKE16-LABEL: name: ffloor_s16_vv
+    ; GFX11-FAKE16: liveins: $vgpr0
+    ; GFX11-FAKE16-NEXT: {{  $}}
+    ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-FAKE16-NEXT: [[V_FLOOR_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_fake16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
     %2:vgpr(s16) = G_FFLOOR %1
@@ -63,8 +90,23 @@ body: |
     ; VI: liveins: $sgpr0
     ; VI-NEXT: {{  $}}
     ; VI-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; VI-NEXT: %2:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; VI-NEXT: $vgpr0 = COPY %2
+    ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
+    ;
+    ; GFX11-LABEL: name: ffloor_s16_vs
+    ; GFX11: liveins: $sgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
+    ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
+    ;
+    ; GFX11-FAKE16-LABEL: name: ffloor_s16_vs
+    ; GFX11-FAKE16: liveins: $sgpr0
+    ; GFX11-FAKE16-NEXT: {{  $}}
+    ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX11-FAKE16-NEXT: [[V_FLOOR_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_fake16_e64_]]
     %0:sgpr(s32) = COPY $sgpr0
     %1:sgpr(s16) = G_TRUNC %0
     %2:vgpr(s16) = G_FFLOOR %1
@@ -86,8 +128,24 @@ body: |
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
     ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; VI-NEXT: %3:vgpr_32 = nofpexcept V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; VI-NEXT: $vgpr0 = COPY %3
+    ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]]
+    ;
+    ; GFX11-LABEL: name: ffloor_fneg_s16_vv
+    ; GFX11: liveins: $vgpr0
+    ; GFX11-NEXT: {{  $}}
+    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
+    ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
+    ;
+    ; GFX11-FAKE16-LABEL: name: ffloor_fneg_s16_vv
+    ; GFX11-FAKE16: liveins: $vgpr0
+    ; GFX11-FAKE16-NEXT: {{  $}}
+    ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-FAKE16-NEXT: [[V_FLOOR_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_fake16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_fake16_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s16) = G_TRUNC %0
     %2:vgpr(s16) = G_FNEG %1
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index d4eab5b797e66c..7767aa54c81519 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -78,3 +78,24 @@ body:             |
     %2:sreg_32 = COPY %1:vgpr_32
     %3:sreg_32 = nofpexcept S_CEIL_F16 killed %2:sreg_32, implicit $mode
 ...
+
+---
+name:            floor_f16
+body:             |
+  bb.0:
+    ; REAL16-LABEL: name: floor_f16
+    ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+    ;
+    ; FAKE16-LABEL: name: floor_f16
+    ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; FAKE16-NEXT: [[V_FLOOR_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_fake16_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_32
+    %3:sreg_32 = nofpexcept S_FLOOR_F16 killed %2:sreg_32, implicit $mode
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 00bb32c768dca3..e8d037c5ff53e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
 
 declare half @llvm.floor.f16(half %a)
 declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
@@ -59,11 +60,31 @@ define amdgpu_kernel void @floor_f16(
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_floor_f16_e32 v0, v0
+; GFX11-NEXT:    v_floor_f16_e32 v0.l, v0.l
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: floor_f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    s_nop 0
+; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -143,14 +164,43 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_floor_f16_e32 v0, v0
+; GFX11-NEXT:    v_floor_f16_e32 v0.l, v0.l
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_floor_f16_e32 v1, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    v_floor_f16_e32 v0.h, v0.h
+; GFX11-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: floor_v2f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    s_nop 0
+; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s
index 668085cffbf004..a155b74046babc 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1-fake16.s
@@ -1,6 +1,54 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
 
+v_floor_f16 v5, v1
+// GFX11: encoding: [0x01,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5, v127
+// GFX11: encoding: [0x7f,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5, s1
+// GFX11: encoding: [0x01,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, s105
+// GFX11: encoding: [0x69,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_lo
+// GFX11: encoding: [0x6a,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_hi
+// GFX11: encoding: [0x6b,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, ttmp15
+// GFX11: encoding: [0x7b,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, m0
+// GFX11: encoding: [0x7d,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, exec_lo
+// GFX11: encoding: [0x7e,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, exec_hi
+// GFX11: encoding: [0x7f,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, null
+// GFX11: encoding: [0x7c,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, -1
+// GFX11: encoding: [0xc1,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, 0.5
+// GFX11: encoding: [0xf0,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, src_scc
+// GFX11: encoding: [0xfd,0xb6,0x0a,0x7e]
+
+v_floor_f16 v127, 0xfe0b
+// GFX11: encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_floor_f32 v5, v1
+// GFX11: encoding: [0x01,0x49,0x0a,0x7e]
+
 v_ceil_f16 v5, v1
 // GFX11: encoding: [0x01,0xb9,0x0a,0x7e]
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 6b19a5c94a64e4..86c2375c89496a 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -1906,50 +1906,56 @@ v_ffbl_b32 v5, src_scc
 v_ffbl_b32 v255, 0xaf123456
 // GFX11: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
 
-v_floor_f16 v5, v1
+v_floor_f16 v5.l, v1.l
 // GFX11: encoding: [0x01,0xb7,0x0a,0x7e]
 
-v_floor_f16 v5, v127
+v_floor_f16 v5.l, v127.l
 // GFX11: encoding: [0x7f,0xb7,0x0a,0x7e]
 
-v_floor_f16 v5, s1
+v_floor_f16 v5.l, v1.h
+// GFX11: encoding: [0x81,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5.l, v127.h
+// GFX11: encoding: [0xff,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5.l, s1
 // GFX11: encoding: [0x01,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, s105
+v_floor_f16 v5.l, s105
 // GFX11: encoding: [0x69,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, vcc_lo
+v_floor_f16 v5.l, vcc_lo
 // GFX11: encoding: [0x6a,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, vcc_hi
+v_floor_f16 v5.l, vcc_hi
 // GFX11: encoding: [0x6b,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, ttmp15
+v_floor_f16 v5.l, ttmp15
 // GFX11: encoding: [0x7b,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, m0
+v_floor_f16 v5.l, m0
 // GFX11: encoding: [0x7d,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, exec_lo
+v_floor_f16 v5.l, exec_lo
 // GFX11: encoding: [0x7e,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, exec_hi
+v_floor_f16 v5.l, exec_hi
 // GFX11: encoding: [0x7f,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, null
+v_floor_f16 v5.l, null
 // GFX11: encoding: [0x7c,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, -1
+v_floor_f16 v5.l, -1
 // GFX11: encoding: [0xc1,0xb6,0x0a,0x7e]
 
-v_floor_f16 v5, 0.5
-// GFX11: encoding: [0xf0,0xb6,0x0a,0x7e]
+v_floor_f16 v127.l, 0.5
+// GFX11: encoding: [0xf0,0xb6,0xfe,0x7e]
 
-v_floor_f16 v5, src_scc
-// GFX11: encoding: [0xfd,0xb6,0x0a,0x7e]
+v_floor_f16 v5.h, src_scc
+// GFX11: encoding: [0xfd,0xb6,0x0a,0x7f]
 
-v_floor_f16 v127, 0xfe0b
-// GFX11: encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+v_floor_f16 v127.h, 0xfe0b
+// GFX11: encoding: [0xff,0xb6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
 
 v_floor_f32 v5, v1
 // GFX11: encoding: [0x01,0x49,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s
index e3679b9321f439..038a9d4c9e1895 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16-fake16.s
@@ -1,6 +1,48 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
 
+v_floor_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_floor_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_floor_f16 v5, v1 row_mirror
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_floor_f16 v5, v1 row_half_mirror
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shl:1
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shl:15
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shr:1
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shr:15
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_ror:1
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_floor_f16 v5, v1 row_ror:15
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_floor_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_floor_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_floor_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
 v_ceil_f16 v5, v1 quad_perm:[3,2,1,0]
 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
index cd9aa9273f1d86..fa6df6affeb1e7 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s
@@ -1513,47 +1513,47 @@ v_ffbl_b32 v5, v1 row_xmask...
[truncated]

Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Copy link
Contributor

@Sisyph Sisyph left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@kosarev kosarev merged commit 2a869ce into llvm:main Jan 18, 2024
@kosarev kosarev deleted the t16 branch January 18, 2024 08:43
ampandey-1995 pushed a commit to ampandey-1995/llvm-project that referenced this pull request Jan 19, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants