Skip to content

Commit 88962ce

Browse files
committed
[WebAssembly] Restore builtins and intrinsics for pmin/pmax
Partially reverts 85157c0, which had removed these builtins and intrinsics in favor of normal codegen patterns. It turns out, however, that the patterns can be split across multiple basic blocks. Because DAG ISel performs instruction selection one basic block at a time, it cannot select such split patterns to the pmin/pmax instructions. To make sure the SIMD intrinsics generate the correct instructions in these cases, reintroduce the clang builtins and the corresponding LLVM intrinsics, while keeping the normal pattern matching as well. Differential Revision: https://reviews.llvm.org/D108387
1 parent 24ea94a commit 88962ce

File tree

8 files changed

+123
-22
lines changed

8 files changed

+123
-22
lines changed

clang/include/clang/Basic/BuiltinsWebAssembly.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,12 @@ TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128")
129129

130130
TARGET_BUILTIN(__builtin_wasm_min_f32x4, "V4fV4fV4f", "nc", "simd128")
131131
TARGET_BUILTIN(__builtin_wasm_max_f32x4, "V4fV4fV4f", "nc", "simd128")
132+
TARGET_BUILTIN(__builtin_wasm_pmin_f32x4, "V4fV4fV4f", "nc", "simd128")
133+
TARGET_BUILTIN(__builtin_wasm_pmax_f32x4, "V4fV4fV4f", "nc", "simd128")
132134
TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128")
133135
TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128")
136+
TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128")
137+
TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128")
134138

135139
TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
136140
TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17822,6 +17822,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
1782217822
CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
1782317823
return Builder.CreateCall(Callee, {LHS, RHS});
1782417824
}
17825+
case WebAssembly::BI__builtin_wasm_pmin_f32x4:
17826+
case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
17827+
Value *LHS = EmitScalarExpr(E->getArg(0));
17828+
Value *RHS = EmitScalarExpr(E->getArg(1));
17829+
Function *Callee =
17830+
CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
17831+
return Builder.CreateCall(Callee, {LHS, RHS});
17832+
}
17833+
case WebAssembly::BI__builtin_wasm_pmax_f32x4:
17834+
case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
17835+
Value *LHS = EmitScalarExpr(E->getArg(0));
17836+
Value *RHS = EmitScalarExpr(E->getArg(1));
17837+
Function *Callee =
17838+
CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
17839+
return Builder.CreateCall(Callee, {LHS, RHS});
17840+
}
1782517841
case WebAssembly::BI__builtin_wasm_ceil_f32x4:
1782617842
case WebAssembly::BI__builtin_wasm_floor_f32x4:
1782717843
case WebAssembly::BI__builtin_wasm_trunc_f32x4:

clang/lib/Headers/wasm_simd128.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,14 +1297,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_max(v128_t __a,
12971297

12981298
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a,
12991299
v128_t __b) {
1300-
__i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a);
1301-
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
1300+
return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b);
13021301
}
13031302

13041303
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a,
13051304
v128_t __b) {
1306-
__i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b);
1307-
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
1305+
return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b);
13081306
}
13091307

13101308
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) {
@@ -1367,14 +1365,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_max(v128_t __a,
13671365

13681366
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a,
13691367
v128_t __b) {
1370-
__i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a);
1371-
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
1368+
return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b);
13721369
}
13731370

13741371
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a,
13751372
v128_t __b) {
1376-
__i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b);
1377-
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
1373+
return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b);
13781374
}
13791375

13801376
static __inline__ v128_t __DEFAULT_FN_ATTRS

clang/test/CodeGen/builtins-wasm.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,20 @@ f32x4 max_f32x4(f32x4 x, f32x4 y) {
506506
// WEBASSEMBLY-NEXT: ret
507507
}
508508

509+
f32x4 pmin_f32x4(f32x4 x, f32x4 y) {
510+
return __builtin_wasm_pmin_f32x4(x, y);
511+
// WEBASSEMBLY: call <4 x float> @llvm.wasm.pmin.v4f32(
512+
// WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y)
513+
// WEBASSEMBLY-NEXT: ret
514+
}
515+
516+
f32x4 pmax_f32x4(f32x4 x, f32x4 y) {
517+
return __builtin_wasm_pmax_f32x4(x, y);
518+
// WEBASSEMBLY: call <4 x float> @llvm.wasm.pmax.v4f32(
519+
// WEBASSEMBLY-SAME: <4 x float> %x, <4 x float> %y)
520+
// WEBASSEMBLY-NEXT: ret
521+
}
522+
509523
f64x2 min_f64x2(f64x2 x, f64x2 y) {
510524
return __builtin_wasm_min_f64x2(x, y);
511525
// WEBASSEMBLY: call <2 x double> @llvm.minimum.v2f64(
@@ -520,6 +534,20 @@ f64x2 max_f64x2(f64x2 x, f64x2 y) {
520534
// WEBASSEMBLY-NEXT: ret
521535
}
522536

537+
f64x2 pmin_f64x2(f64x2 x, f64x2 y) {
538+
return __builtin_wasm_pmin_f64x2(x, y);
539+
// WEBASSEMBLY: call <2 x double> @llvm.wasm.pmin.v2f64(
540+
// WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y)
541+
// WEBASSEMBLY-NEXT: ret
542+
}
543+
544+
f64x2 pmax_f64x2(f64x2 x, f64x2 y) {
545+
return __builtin_wasm_pmax_f64x2(x, y);
546+
// WEBASSEMBLY: call <2 x double> @llvm.wasm.pmax.v2f64(
547+
// WEBASSEMBLY-SAME: <2 x double> %x, <2 x double> %y)
548+
// WEBASSEMBLY-NEXT: ret
549+
}
550+
523551
f32x4 ceil_f32x4(f32x4 x) {
524552
return __builtin_wasm_ceil_f32x4(x);
525553
// WEBASSEMBLY: call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)

clang/test/Headers/wasm.c

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2424,11 +2424,11 @@ v128_t test_f32x4_max(v128_t a, v128_t b) {
24242424

24252425
// CHECK-LABEL: @test_f32x4_pmin(
24262426
// CHECK-NEXT: entry:
2427-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
2428-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
2429-
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]]
2430-
// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]]
2431-
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
2427+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
2428+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
2429+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]]
2430+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
2431+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
24322432
//
24332433
v128_t test_f32x4_pmin(v128_t a, v128_t b) {
24342434
return wasm_f32x4_pmin(a, b);
@@ -2438,9 +2438,9 @@ v128_t test_f32x4_pmin(v128_t a, v128_t b) {
24382438
// CHECK-NEXT: entry:
24392439
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
24402440
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
2441-
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]]
2442-
// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP_I]], <4 x i32> [[B]], <4 x i32> [[A]]
2443-
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
2441+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]]
2442+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
2443+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
24442444
//
24452445
v128_t test_f32x4_pmax(v128_t a, v128_t b) {
24462446
return wasm_f32x4_pmax(a, b);
@@ -2597,10 +2597,9 @@ v128_t test_f64x2_max(v128_t a, v128_t b) {
25972597

25982598
// CHECK-LABEL: @test_f64x2_pmin(
25992599
// CHECK-NEXT: entry:
2600-
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
2601-
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
2602-
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]]
2603-
// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP0]], <2 x double> [[TMP1]]
2600+
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
2601+
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
2602+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]]
26042603
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
26052604
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
26062605
//
@@ -2612,8 +2611,7 @@ v128_t test_f64x2_pmin(v128_t a, v128_t b) {
26122611
// CHECK-NEXT: entry:
26132612
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
26142613
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
2615-
// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]]
2616-
// CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[CMP_I]], <2 x double> [[TMP1]], <2 x double> [[TMP0]]
2614+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]]
26172615
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
26182616
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
26192617
//

llvm/include/llvm/IR/IntrinsicsWebAssembly.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,15 @@ def int_wasm_q15mulr_sat_signed :
164164
[llvm_v8i16_ty, llvm_v8i16_ty],
165165
[IntrNoMem, IntrSpeculatable]>;
166166

167+
def int_wasm_pmin :
168+
Intrinsic<[llvm_anyvector_ty],
169+
[LLVMMatchType<0>, LLVMMatchType<0>],
170+
[IntrNoMem, IntrSpeculatable]>;
171+
def int_wasm_pmax :
172+
Intrinsic<[llvm_anyvector_ty],
173+
[LLVMMatchType<0>, LLVMMatchType<0>],
174+
[IntrNoMem, IntrSpeculatable]>;
175+
167176
def int_wasm_extadd_pairwise_signed :
168177
Intrinsic<[llvm_anyvector_ty],
169178
[LLVMSubdivide2VectorType<0>],

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1175,6 +1175,16 @@ def : Pat<(vec.int_vt (vselect
11751175
(pmax $lhs, $rhs)>;
11761176
}
11771177

1178+
// And match the pmin/pmax LLVM intrinsics as well
1179+
def : Pat<(v4f32 (int_wasm_pmin (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
1180+
(PMIN_F32x4 V128:$lhs, V128:$rhs)>;
1181+
def : Pat<(v4f32 (int_wasm_pmax (v4f32 V128:$lhs), (v4f32 V128:$rhs))),
1182+
(PMAX_F32x4 V128:$lhs, V128:$rhs)>;
1183+
def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
1184+
(PMIN_F64x2 V128:$lhs, V128:$rhs)>;
1185+
def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
1186+
(PMAX_F64x2 V128:$lhs, V128:$rhs)>;
1187+
11781188
//===----------------------------------------------------------------------===//
11791189
// Conversions
11801190
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,26 @@ define <4 x float> @bitselect_v4f32(<4 x float> %v1, <4 x float> %v2, <4 x float
540540
ret <4 x float> %a
541541
}
542542

543+
; CHECK-LABEL: pmin_v4f32:
544+
; CHECK-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}}
545+
; CHECK-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
546+
; CHECK-NEXT: return $pop[[R]]{{$}}
547+
declare <4 x float> @llvm.wasm.pmin.v4f32(<4 x float>, <4 x float>)
548+
define <4 x float> @pmin_v4f32(<4 x float> %a, <4 x float> %b) {
549+
%v = call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> %a, <4 x float> %b)
550+
ret <4 x float> %v
551+
}
552+
553+
; CHECK-LABEL: pmax_v4f32:
554+
; CHECK-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}}
555+
; CHECK-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
556+
; CHECK-NEXT: return $pop[[R]]{{$}}
557+
declare <4 x float> @llvm.wasm.pmax.v4f32(<4 x float>, <4 x float>)
558+
define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) {
559+
%v = call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> %a, <4 x float> %b)
560+
ret <4 x float> %v
561+
}
562+
543563
; CHECK-LABEL: ceil_v4f32:
544564
; CHECK-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}}
545565
; CHECK-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}}
@@ -595,6 +615,26 @@ define <2 x double> @bitselect_v2f64(<2 x double> %v1, <2 x double> %v2, <2 x do
595615
ret <2 x double> %a
596616
}
597617

618+
; CHECK-LABEL: pmin_v2f64:
619+
; CHECK-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}}
620+
; CHECK-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
621+
; CHECK-NEXT: return $pop[[R]]{{$}}
622+
declare <2 x double> @llvm.wasm.pmin.v2f64(<2 x double>, <2 x double>)
623+
define <2 x double> @pmin_v2f64(<2 x double> %a, <2 x double> %b) {
624+
%v = call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> %a, <2 x double> %b)
625+
ret <2 x double> %v
626+
}
627+
628+
; CHECK-LABEL: pmax_v2f64:
629+
; CHECK-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}}
630+
; CHECK-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
631+
; CHECK-NEXT: return $pop[[R]]{{$}}
632+
declare <2 x double> @llvm.wasm.pmax.v2f64(<2 x double>, <2 x double>)
633+
define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) {
634+
%v = call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> %a, <2 x double> %b)
635+
ret <2 x double> %v
636+
}
637+
598638
; CHECK-LABEL: ceil_v2f64:
599639
; CHECK-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}}
600640
; CHECK-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}}

0 commit comments

Comments
 (0)