Skip to content

Commit 0a7100d

Browse files
committed
[MLIR][NVVM] Update dot.accumulate NVVM Ops
This change: - Adds the dot.accumulate.2way Op to the NVVM dialect for 16-bit to 8-bit dot-product accumulate operation. - Refactors the recently added dot.accumulate.4way and adds a verifier.
1 parent f0ab64b commit 0a7100d

File tree

4 files changed

+174
-43
lines changed

4 files changed

+174
-43
lines changed

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3533,35 +3533,35 @@ def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st", [NVVMRequiresSMa<[100, 101]>]> {
35333533
}
35343534

35353535
//===----------------------------------------------------------------------===//
3536-
// NVVM dot.accumulate.4way Op
3536+
// NVVM dot.accumulate Ops
35373537
//===----------------------------------------------------------------------===//
35383538

3539-
def DotAccumulate4WayS8 : I32EnumAttrCase<"S8", 1, "s8">;
3540-
def DotAccumulate4WayU8 : I32EnumAttrCase<"U8", 0, "u8">;
3539+
def DotAccumulateSigned : I32EnumAttrCase<"SIGNED", 1, "signed">;
3540+
def DotAccumulateUnsigned : I32EnumAttrCase<"UNSIGNED", 0, "unsigned">;
35413541

3542-
def DotAccumulate4WayType : I32EnumAttr<"DotAccumulate4WayType",
3543-
"NVVM DotAccumulate4WayType",
3544-
[DotAccumulate4WayS8, DotAccumulate4WayU8]> {
3542+
def DotAccumulateType : I32EnumAttr<"DotAccumulateType",
3543+
"NVVM DotAccumulateType",
3544+
[DotAccumulateSigned, DotAccumulateUnsigned]> {
35453545
let cppNamespace = "::mlir::NVVM";
35463546
let genSpecializedAttr = 0;
35473547
}
35483548

3549-
def DotAccumulate4WayTypeAttr : EnumAttr<NVVM_Dialect, DotAccumulate4WayType, "dot_accumulate_4way_type"> {
3549+
def DotAccumulateTypeAttr : EnumAttr<NVVM_Dialect, DotAccumulateType, "dot_accumulate_type"> {
35503550
let assemblyFormat = "`<` $value `>`";
35513551
}
35523552

35533553
def NVVM_DotAccumulate4WayOp : NVVM_Op<"dot.accumulate.4way"> {
3554-
let summary = "Four-way byte dot product-accumulate instruction.";
3554+
let summary = "Four-way byte dot product-accumulate instruction";
35553555
let description = [{
35563556
Performs a four-way byte dot-product which is accumulated in a 32-bit
35573557
result.
35583558
Operand `a` and `b` are vectors of 4 bytes between which the dot product is
35593559
computed.
35603560
The `a_type` and `b_type` attributes specify the type of the elements in `a`
35613561
and `b` respectively.
3562-
If `a_type` or `b_type` is `s8`, then the elements in the corresponding
3562+
If `a_type` or `b_type` is `signed`, then the elements in the corresponding
35633563
vector are sign-extended to 32-bit before the dot product is computed.
3564-
If `a_type` or `b_type` is `u8`, then the elements in the corresponding
3564+
If `a_type` or `b_type` is `unsigned`, then the elements in the corresponding
35653565
vector are zero-extended to 32-bit instead.
35663566
Operand `c` is a 32-bit integer to which the result is accumulated. It is
35673567
treated as holding a signed integer if any of `a_type` or `b_type` is `signed`.
@@ -3571,9 +3571,9 @@ def NVVM_DotAccumulate4WayOp : NVVM_Op<"dot.accumulate.4way"> {
35713571

35723572
let arguments = (ins
35733573
VectorOfLengthAndType<[4], [I8]>:$a,
3574-
DotAccumulate4WayTypeAttr:$a_type,
3574+
DotAccumulateTypeAttr:$a_type,
35753575
VectorOfLengthAndType<[4], [I8]>:$b,
3576-
DotAccumulate4WayTypeAttr:$b_type,
3576+
DotAccumulateTypeAttr:$b_type,
35773577
I32:$c
35783578
);
35793579

@@ -3582,17 +3582,69 @@ def NVVM_DotAccumulate4WayOp : NVVM_Op<"dot.accumulate.4way"> {
35823582
let assemblyFormat = "$a $a_type `,` $b $b_type `,` $c attr-dict `:` type($a) `,` type($b)";
35833583

35843584
let extraClassDeclaration = [{
3585-
static llvm::Intrinsic::ID
3586-
getIntrinsicID(NVVM::DotAccumulate4WayType a_type,
3587-
NVVM::DotAccumulate4WayType b_type);
3588-
llvm::Value* getPackedArg(llvm::Value* arg, llvm::IRBuilderBase& builder);
3585+
static mlir::NVVM::IDArgPair
3586+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
3587+
llvm::IRBuilderBase &builder);
3588+
}];
3589+
3590+
string llvmBuilder = [{
3591+
auto [id, args] = NVVM::DotAccumulate4WayOp::getIntrinsicIDAndArgs(
3592+
*op, moduleTranslation, builder);
3593+
$res = createIntrinsicCall(builder, id, args);
3594+
}];
3595+
}
3596+
3597+
def NVVM_DotAccumulate2WayOp : NVVM_Op<"dot.accumulate.2way"> {
3598+
let summary = "Two-way 16-bit to 8-bit dot product-accumulate instruction";
3599+
let description = [{
3600+
Performs a two-way 16-bit to 8-bit dot-product which is accumulated in a
3601+
32-bit result.
3602+
Operand `a` is a vector of two 16-bit elements and operand `b` a vector
3603+
of four 8-bit elements between which the dot product is computed.
3604+
3605+
The `a_type` and `b_type` attributes specify the type of the elements in `a`
3606+
and `b` respectively.
3607+
If `a_type` or `b_type` is `signed`, then the elements in the corresponding
3608+
vector are sign-extended to 32-bit before the dot product is computed.
3609+
If `a_type` or `b_type` is `unsigned`, then the elements in the corresponding
3610+
vector are zero-extended to 32-bit instead.
3611+
3612+
The `b_hi` boolean attribute specifies which two bytes of `b` are used for
3613+
the dot product. If `b_hi` is true, then the dot product is computed
3614+
between `a` and elements at indices 2 and 3 of `b`. If `b_hi` is false,
3615+
then the dot product is computed between `a` and elements at indices 0 and
3616+
1 of `b`.
3617+
3618+
Operand `c` is a 32-bit integer to which the result is accumulated. It is
3619+
treated as holding a signed integer if any of `a_type` or `b_type` is
3620+
signed.
3621+
3622+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-dp2a)
35893623
}];
35903624

3625+
let arguments = (ins
3626+
VectorOfLengthAndType<[2], [I16]>:$a,
3627+
DotAccumulateTypeAttr:$a_type,
3628+
VectorOfLengthAndType<[4], [I8]>:$b,
3629+
DotAccumulateTypeAttr:$b_type,
3630+
I32:$c,
3631+
BoolAttr:$b_hi
3632+
);
3633+
3634+
let results = (outs I32:$res);
3635+
3636+
let assemblyFormat = "$a $a_type `,` $b $b_type `,` $c attr-dict `:` type($a) `,` type($b)";
3637+
3638+
let extraClassDeclaration = [{
3639+
static mlir::NVVM::IDArgPair
3640+
getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
3641+
llvm::IRBuilderBase &builder);
3642+
}];
3643+
35913644
string llvmBuilder = [{
3592-
llvm::Intrinsic::ID id = NVVM::DotAccumulate4WayOp::getIntrinsicID($a_type, $b_type);
3593-
llvm::Value* argA = op.getPackedArg($a, builder);
3594-
llvm::Value* argB = op.getPackedArg($b, builder);
3595-
$res = createIntrinsicCall(builder, id, {argA, argB, $c});
3645+
auto [id, args] = NVVM::DotAccumulate2WayOp::getIntrinsicIDAndArgs(
3646+
*op, moduleTranslation, builder);
3647+
$res = createIntrinsicCall(builder, id, args);
35963648
}];
35973649
}
35983650

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,13 +1205,6 @@ LogicalResult NVVM::VoteSyncOp::verify() {
12051205
return success();
12061206
}
12071207

1208-
llvm::Value *
1209-
NVVM::DotAccumulate4WayOp::getPackedArg(llvm::Value *arg,
1210-
llvm::IRBuilderBase &builder) {
1211-
return builder.CreateBitCast(arg,
1212-
llvm::Type::getInt32Ty(builder.getContext()));
1213-
}
1214-
12151208
//===----------------------------------------------------------------------===//
12161209
// getIntrinsicID/getIntrinsicIDAndArgs methods
12171210
//===----------------------------------------------------------------------===//
@@ -1627,26 +1620,65 @@ static void nvvmInferResultRanges(Operation *op, Value result,
16271620
}
16281621
}
16291622

1630-
llvm::Intrinsic::ID
1631-
DotAccumulate4WayOp::getIntrinsicID(NVVM::DotAccumulate4WayType a_type,
1632-
NVVM::DotAccumulate4WayType b_type) {
1633-
bool is_a_siext = a_type == NVVM::DotAccumulate4WayType::S8;
1634-
bool is_b_siext = b_type == NVVM::DotAccumulate4WayType::S8;
1623+
static llvm::Value *getAsPackedI32(llvm::Value *arg,
1624+
llvm::IRBuilderBase &builder) {
1625+
return builder.CreateBitCast(arg,
1626+
llvm::Type::getInt32Ty(builder.getContext()));
1627+
}
1628+
1629+
NVVM::IDArgPair DotAccumulate4WayOp::getIntrinsicIDAndArgs(
1630+
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
1631+
auto curOp = cast<NVVM::DotAccumulate4WayOp>(op);
1632+
1633+
llvm::SmallVector<llvm::Value *> args;
1634+
args.push_back(getAsPackedI32(mt.lookupValue(curOp.getA()), builder));
1635+
args.push_back(getAsPackedI32(mt.lookupValue(curOp.getB()), builder));
1636+
args.push_back(mt.lookupValue(curOp.getC()));
1637+
1638+
bool is_a_siext = curOp.getAType() == NVVM::DotAccumulateType::SIGNED;
1639+
bool is_b_siext = curOp.getBType() == NVVM::DotAccumulateType::SIGNED;
16351640
unsigned type = (is_a_siext << 1) | is_b_siext;
16361641
switch (type) {
16371642
case 0:
1638-
return llvm::Intrinsic::nvvm_idp4a_u_u;
1643+
return {llvm::Intrinsic::nvvm_idp4a_u_u, args};
16391644
case 1:
1640-
return llvm::Intrinsic::nvvm_idp4a_u_s;
1645+
return {llvm::Intrinsic::nvvm_idp4a_u_s, args};
16411646
case 2:
1642-
return llvm::Intrinsic::nvvm_idp4a_s_u;
1647+
return {llvm::Intrinsic::nvvm_idp4a_s_u, args};
16431648
case 3:
1644-
return llvm::Intrinsic::nvvm_idp4a_s_s;
1649+
return {llvm::Intrinsic::nvvm_idp4a_s_s, args};
16451650
default:
16461651
llvm_unreachable("Invalid DP4a type");
16471652
}
16481653
}
16491654

1655+
NVVM::IDArgPair DotAccumulate2WayOp::getIntrinsicIDAndArgs(
1656+
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
1657+
auto curOp = cast<NVVM::DotAccumulate2WayOp>(op);
1658+
1659+
llvm::SmallVector<llvm::Value *> args;
1660+
args.push_back(getAsPackedI32(mt.lookupValue(curOp.getA()), builder));
1661+
args.push_back(getAsPackedI32(mt.lookupValue(curOp.getB()), builder));
1662+
args.push_back(builder.getInt1(curOp.getBHi()));
1663+
args.push_back(mt.lookupValue(curOp.getC()));
1664+
1665+
bool is_a_siext = curOp.getAType() == NVVM::DotAccumulateType::SIGNED;
1666+
bool is_b_siext = curOp.getBType() == NVVM::DotAccumulateType::SIGNED;
1667+
unsigned type = (is_a_siext << 1) | is_b_siext;
1668+
switch (type) {
1669+
case 0:
1670+
return {llvm::Intrinsic::nvvm_idp2a_u_u, args};
1671+
case 1:
1672+
return {llvm::Intrinsic::nvvm_idp2a_u_s, args};
1673+
case 2:
1674+
return {llvm::Intrinsic::nvvm_idp2a_s_u, args};
1675+
case 3:
1676+
return {llvm::Intrinsic::nvvm_idp2a_s_s, args};
1677+
default:
1678+
llvm_unreachable("Invalid DP2a type");
1679+
}
1680+
}
1681+
16501682
//===----------------------------------------------------------------------===//
16511683
// NVVMDialect initialization, type parsing, and registration.
16521684
//===----------------------------------------------------------------------===//

mlir/test/Dialect/LLVMIR/nvvm.mlir

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -579,11 +579,20 @@ func.func @st_bulk(%addr_gen: !llvm.ptr, %addr_shared: !llvm.ptr<3>, %size: i64)
579579
}
580580

581581
// CHECK-LABEL: @dot_accumulate_4way
582-
func.func @dot_accumulate_4way(%a: i32, %a_vec: vector<4xi8>, %b: i32, %b_vec: vector<4xi8>, %c: i32) {
582+
func.func @dot_accumulate_4way(%a_vec: vector<4xi8>, %b_vec: vector<4xi8>, %c: i32) {
583583
// CHECK: nvvm.dot.accumulate.4way %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi8>, vector<4xi8>
584-
%1 = nvvm.dot.accumulate.4way %a_vec <u8>, %b_vec <u8>, %c: vector<4xi8>, vector<4xi8>
584+
%1 = nvvm.dot.accumulate.4way %a_vec <unsigned>, %b_vec <unsigned>, %c: vector<4xi8>, vector<4xi8>
585585
// CHECK: nvvm.dot.accumulate.4way %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi8>, vector<4xi8>
586-
%3 = nvvm.dot.accumulate.4way %a_vec <s8>, %b_vec <s8>, %c: vector<4xi8>, vector<4xi8>
586+
%3 = nvvm.dot.accumulate.4way %a_vec <signed>, %b_vec <signed>, %c: vector<4xi8>, vector<4xi8>
587+
return
588+
}
589+
590+
// CHECK-LABEL: @dot_accumulate_2way
591+
func.func @dot_accumulate_2way(%a_vec: vector<2xi16>, %b_vec: vector<4xi8>, %c: i32) {
592+
// CHECK: nvvm.dot.accumulate.2way %{{.*}}, %{{.*}}, %{{.*}} {b_hi = false} : vector<2xi16>, vector<4xi8>
593+
%1 = nvvm.dot.accumulate.2way %a_vec <unsigned>, %b_vec <unsigned>, %c {b_hi = false}: vector<2xi16>, vector<4xi8>
594+
// CHECK: nvvm.dot.accumulate.2way %{{.*}}, %{{.*}}, %{{.*}} {b_hi = true} : vector<2xi16>, vector<4xi8>
595+
%3 = nvvm.dot.accumulate.2way %a_vec <signed>, %b_vec <signed>, %c {b_hi = true}: vector<2xi16>, vector<4xi8>
587596
return
588597
}
589598

mlir/test/Target/LLVMIR/nvvmir.mlir

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -851,18 +851,56 @@ llvm.func @nvvm_dot_accumulate_4way(%a: vector<4xi8>, %b: vector<4xi8>, %c: i32)
851851
// CHECK: %[[a_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
852852
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
853853
// CHECK: call i32 @llvm.nvvm.idp4a.u.u(i32 %[[a_cast]], i32 %[[b_cast]], i32 %{{.*}})
854-
%0 = nvvm.dot.accumulate.4way %a <u8>, %b <u8>, %c: vector<4xi8>, vector<4xi8>
854+
%0 = nvvm.dot.accumulate.4way %a <unsigned>, %b <unsigned>, %c: vector<4xi8>, vector<4xi8>
855855
// CHECK: %[[a_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
856856
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
857857
// CHECK: call i32 @llvm.nvvm.idp4a.s.u(i32 %[[a_cast]], i32 %[[b_cast]], i32 %{{.*}})
858-
%1 = nvvm.dot.accumulate.4way %a <s8>, %b <u8>, %c: vector<4xi8>, vector<4xi8>
858+
%1 = nvvm.dot.accumulate.4way %a <signed>, %b <unsigned>, %c: vector<4xi8>, vector<4xi8>
859859
// CHECK: %[[a_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
860860
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
861861
// CHECK: call i32 @llvm.nvvm.idp4a.u.s(i32 %[[a_cast]], i32 %[[b_cast]], i32 %{{.*}})
862-
%2 = nvvm.dot.accumulate.4way %a <u8>, %b <s8>, %c: vector<4xi8>, vector<4xi8>
862+
%2 = nvvm.dot.accumulate.4way %a <unsigned>, %b <signed>, %c: vector<4xi8>, vector<4xi8>
863863
// CHECK: %[[a_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
864864
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
865865
// CHECK: call i32 @llvm.nvvm.idp4a.s.s(i32 %[[a_cast]], i32 %[[b_cast]], i32 %{{.*}})
866-
%3 = nvvm.dot.accumulate.4way %a <s8>, %b <s8>, %c: vector<4xi8>, vector<4xi8>
866+
%3 = nvvm.dot.accumulate.4way %a <signed>, %b <signed>, %c: vector<4xi8>, vector<4xi8>
867+
llvm.return
868+
}
869+
870+
// -----
871+
// CHECK-LABEL: @nvvm_dot_accumulate_2way
872+
llvm.func @nvvm_dot_accumulate_2way(%a: vector<2xi16>, %b: vector<4xi8>, %c: i32) {
873+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
874+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
875+
// CHECK: call i32 @llvm.nvvm.idp2a.u.u(i32 %[[a_cast]], i32 %[[b_cast]], i1 false, i32 %{{.*}})
876+
%0 = nvvm.dot.accumulate.2way %a <unsigned>, %b <unsigned>, %c {b_hi = false} : vector<2xi16>, vector<4xi8>
877+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
878+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
879+
// CHECK: call i32 @llvm.nvvm.idp2a.u.u(i32 %[[a_cast]], i32 %[[b_cast]], i1 true, i32 %{{.*}})
880+
%1 = nvvm.dot.accumulate.2way %a <unsigned>, %b <unsigned>, %c {b_hi = true}: vector<2xi16>, vector<4xi8>
881+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
882+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
883+
// CHECK: call i32 @llvm.nvvm.idp2a.s.u(i32 %[[a_cast]], i32 %[[b_cast]], i1 false, i32 %{{.*}})
884+
%2 = nvvm.dot.accumulate.2way %a <signed>, %b <unsigned>, %c {b_hi = false}: vector<2xi16>, vector<4xi8>
885+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
886+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
887+
// CHECK: call i32 @llvm.nvvm.idp2a.s.u(i32 %[[a_cast]], i32 %[[b_cast]], i1 true, i32 %{{.*}})
888+
%3 = nvvm.dot.accumulate.2way %a <signed>, %b <unsigned>, %c {b_hi = true}: vector<2xi16>, vector<4xi8>
889+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
890+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
891+
// CHECK: call i32 @llvm.nvvm.idp2a.u.s(i32 %[[a_cast]], i32 %[[b_cast]], i1 false, i32 %{{.*}})
892+
%4 = nvvm.dot.accumulate.2way %a <unsigned>, %b <signed>, %c {b_hi = false}: vector<2xi16>, vector<4xi8>
893+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
894+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
895+
// CHECK: call i32 @llvm.nvvm.idp2a.u.s(i32 %[[a_cast]], i32 %[[b_cast]], i1 true, i32 %{{.*}})
896+
%5 = nvvm.dot.accumulate.2way %a <unsigned>, %b <signed>, %c {b_hi = true}: vector<2xi16>, vector<4xi8>
897+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
898+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
899+
// CHECK: call i32 @llvm.nvvm.idp2a.s.s(i32 %[[a_cast]], i32 %[[b_cast]], i1 false, i32 %{{.*}})
900+
%6 = nvvm.dot.accumulate.2way %a <signed>, %b <signed>, %c {b_hi = false}: vector<2xi16>, vector<4xi8>
901+
// CHECK: %[[a_cast:.*]] = bitcast <2 x i16> %{{.*}} to i32
902+
// CHECK: %[[b_cast:.*]] = bitcast <4 x i8> %{{.*}} to i32
903+
// CHECK: call i32 @llvm.nvvm.idp2a.s.s(i32 %[[a_cast]], i32 %[[b_cast]], i1 true, i32 %{{.*}})
904+
%7 = nvvm.dot.accumulate.2way %a <signed>, %b <signed>, %c {b_hi = true}: vector<2xi16>, vector<4xi8>
867905
llvm.return
868906
}

0 commit comments

Comments
 (0)