Commit 6f9f446

committed
add dpas test

1 parent b656a80 commit 6f9f446

5 files changed: +337 -22 lines changed
include/gc/Dialect/LLVMIR/XeVMOps.td

Lines changed: 67 additions & 0 deletions
```diff
@@ -221,6 +221,73 @@ def XeVM_BlockPrefetch2dOp : XeVM_Op<"blockprefetch2d">,
   let hasVerifier = 1;
 }
 
+def XeVM_MatrixElemType : AnyTypeOf<[AnyI8, AnyI16, AnyI32, F32, F16, BF16]>;
+
+/// Enum attribute of the different precision types.
+def XeVM_PrecisionTypeAttr : I32EnumAttr<"PrecisionType",
+    "XeVM precision type",
+    [
+      I32EnumAttrCase<"UNUSED", 0, "unused">,
+      I32EnumAttrCase<"U8", 1, "u8">,
+      I32EnumAttrCase<"U4", 2, "u4">,
+      I32EnumAttrCase<"U2", 3, "u2">,
+      I32EnumAttrCase<"S8", 4, "i8">,
+      I32EnumAttrCase<"S4", 5, "i4">,
+      I32EnumAttrCase<"S2", 6, "i2">,
+      I32EnumAttrCase<"BF8", 7, "bf8">,
+      I32EnumAttrCase<"TF32", 8, "tf32">,
+      I32EnumAttrCase<"BF16", 9, "bf16">,
+      I32EnumAttrCase<"FP16", 10, "f16">
+    ]> {
+  let cppNamespace = "::mlir::xevm";
+}
+
+def XeVM_DPASOp : XeVM_Op<"dpas">,
+  Results<(outs FixedVectorOf<[XeVM_MatrixElemType]>:$d)>,
+  Arguments<(ins
+    FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$c,
+    FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$a,
+    FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$b,
+    XeVM_PrecisionTypeAttr:$pa,
+    XeVM_PrecisionTypeAttr:$pb,
+    I32Attr:$rc
+  )> {
+
+  let summary = "Matrix multiply-add";
+
+  let description = [{
+    The `xevm.dpas` operation is a matrix multiplication plus accumulation:
+
+        D = C + A x B
+
+    where the A, B, C input matrices and the result D have shapes:
+        D : MxN
+        C : MxN
+        A : MxK
+        B : KxN
+
+        M : repeat count, must be 1, 2, 4, or 8
+        N : fixed execution size, must be 16
+        K : depth * OPS_PER_CHAN
+        OPS_PER_CHAN
+            1 : for TF32
+            2 : for 16-bit precision (BF, HF)
+            4 : for 8-bit precision (FP8, UB, B)
+            8 : for less-than-8-bit precision (U4/S4, U2/S2)
+
+    If depth is 8, K would be 8, 16, 32, or 64 (based on OPS_PER_CHAN).
+
+    $a, $b, $c, $d - matrix A, B, C, D, respectively
+    $pa, $pb - precision of matrix A and B, respectively
+    $rc - repeat count
+  }];
+
+  let assemblyFormat = [{
+    operands ` ` `{` `pa` `=` $pa `,` `pb` `=` $pb `,` `rc` `=` $rc `}` attr-dict `:` functional-type(operands, results)
+  }];
+
+  // let hasVerifier = 1;
+}
 
 def XeVM_TargetAttr : XeVM_Attr<"XeVMTarget", "target"> {
   let description = [{
```
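
The OPS_PER_CHAN table is what ties a precision pair to a concrete K value: K = systolic depth * OPS_PER_CHAN. A standalone sketch of that arithmetic (the enum and helper below are illustrative stand-ins, not code from this commit); for the f16 test added below it yields k16, matching `intel_sub_group_f16_f16_matrix_mad_k16`:

```cpp
// Minimal sketch of K = systolic_depth * OPS_PER_CHAN from the description
// above. Precision and opsPerChannel are hypothetical names for illustration.
#include <cassert>

enum class Precision { TF32, BF16, FP16, U8, S8, U4, S4, U2, S2 };

constexpr int opsPerChannel(Precision p) {
  switch (p) {
  case Precision::TF32:                       return 1; // 32-bit input
  case Precision::BF16: case Precision::FP16: return 2; // 16-bit input
  case Precision::U8:   case Precision::S8:   return 4; // 8-bit input
  default:                                    return 8; // sub-8-bit input
  }
}

int main() {
  constexpr int systolicDepth = 8;
  assert(systolicDepth * opsPerChannel(Precision::FP16) == 16); // k16
  assert(systolicDepth * opsPerChannel(Precision::TF32) == 8);  // k8
  assert(systolicDepth * opsPerChannel(Precision::U8) == 32);   // k32
}
```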

lib/gc/Conversion/XeVMToLLVM/XeVMToLLVM.cpp

Lines changed: 122 additions & 4 deletions
```diff
@@ -54,6 +54,8 @@ static constexpr LLVMFuncAttributeOptions noUnwindAttrs = {
     false, true, false, {}};
 static constexpr LLVMFuncAttributeOptions noUnwindWillReturnAttrs = {
     false, true, true, {}};
+static constexpr LLVMFuncAttributeOptions convergentNoUnwindWillReturnAttrs = {
+    true, true, true, {}};
 
 std::string getTypeMangling(Type ty, bool isUnsigned = false) {
   return TypeSwitch<Type, std::string>(ty)
```
```diff
@@ -80,6 +82,31 @@ std::string getTypeMangling(Type ty, bool isUnsigned = false) {
       });
 }
 
+std::string mangle(StringRef baseName, ArrayRef<Type> types,
+                   ArrayRef<bool> isUnsigned = {}) {
+  assert((isUnsigned.empty() || isUnsigned.size() == types.size()) &&
+         "Signedness info doesn't match");
+  std::string s;
+  llvm::raw_string_ostream os(s);
+  llvm::SmallDenseMap<Type, unsigned> substitutions;
+  os << "_Z" << baseName.size() << baseName;
+  for (auto [idx, type] : llvm::enumerate(types)) {
+    auto it = substitutions.find(type);
+    if (it != substitutions.end()) {
+      os << "S";
+      // First substitution is `S_`, second is `S0_`, and so on.
+      if (unsigned firstIdx = it->getSecond(); firstIdx > 0)
+        os << firstIdx - 1;
+      os << "_";
+    } else {
+      if (!type.isIntOrFloat())
+        substitutions[type] = substitutions.size();
+      os << getTypeMangling(type, isUnsigned.empty() ? false : isUnsigned[idx]);
+    }
+  }
+  return os.str();
+}
+
 template <typename OpType>
 static std::optional<ArrayAttr>
 getCacheControlMetadata(ConversionPatternRewriter &rewriter, OpType op,
```
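
To see the substitution rule in action: for the argument types the new test exercises (vector<8xi16>, vector<8xi32>, vector<8xf32>) no substitution fires, and, assuming `getTypeMangling` emits the Itanium vector forms `Dv8_s`, `Dv8_i`, and `Dv8_f` for them, `mangle` would return `_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f`. A string-keyed sketch of the same logic (not the commit's code; MLIR types and the int/float exemption are elided):

```cpp
// Standalone sketch of the Itanium substitution rule in mangle() above,
// with strings standing in for MLIR types. Unlike the real code, every
// type is treated as substitutable here.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

static std::string mangleDemo(const std::string &base,
                              const std::vector<std::string> &types) {
  std::string out = "_Z" + std::to_string(base.size()) + base;
  std::map<std::string, unsigned> subs; // type -> first-seen index
  for (const auto &ty : types) {
    auto it = subs.find(ty);
    if (it != subs.end()) {
      out += "S"; // back-reference: S_, S0_, S1_, ...
      if (it->second > 0)
        out += std::to_string(it->second - 1);
      out += "_";
    } else {
      subs.emplace(ty, subs.size());
      out += ty;
    }
  }
  return out;
}

int main() {
  // Three distinct types: no substitution fires.
  printf("%s\n", mangleDemo("intel_sub_group_f16_f16_matrix_mad_k16",
                            {"Dv8_s", "Dv8_i", "Dv8_f"}));
  // A repeated type becomes S_ on its second occurrence.
  printf("%s\n", mangleDemo("foo", {"Dv8_i", "Dv8_i"}));
  // prints: _Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f
  //         _Z3fooDv8_iS_
}
```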
```diff
@@ -145,6 +172,96 @@ static LLVM::CallOp createDeviceFunctionCall(
   return callOp;
 }
 
+class DPASToOCLPattern : public OpConversionPattern<xevm::DPASOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(xevm::DPASOp op, xevm::DPASOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    constexpr uint32_t bitWidthPackedA{16};
+    constexpr uint32_t bitWidthPackedB{32};
+    auto loc = op.getLoc();
+
+    // Bitcast `val` to a vector of `packedType` with the same total bit
+    // width, if it is not already in that form.
+    auto castIfNeeded = [&](Value val, Type packedType) -> Value {
+      VectorType origTy = cast<VectorType>(val.getType());
+      const uint32_t vecBitSize =
+          origTy.getNumElements() *
+          origTy.getElementType().getIntOrFloatBitWidth();
+      VectorType newTy = VectorType::get(
+          vecBitSize / packedType.getIntOrFloatBitWidth(), packedType);
+      if (origTy != newTy)
+        val = rewriter.create<LLVM::BitcastOp>(loc, newTy, val);
+      return val;
+    };
+
+    Value a = op.getA();
+    Type packedAType = (op.getPa() == xevm::PrecisionType::TF32)
+                           ? cast<Type>(rewriter.getF32Type())
+                           : rewriter.getIntegerType(bitWidthPackedA);
+    a = castIfNeeded(a, packedAType);
+
+    Value b = op.getB();
+    Type packedBType = (op.getPb() == xevm::PrecisionType::TF32)
+                           ? cast<Type>(rewriter.getF32Type())
+                           : rewriter.getIntegerType(bitWidthPackedB);
+    b = castIfNeeded(b, packedBType);
+
+    Value c = op.getC();
+    VectorType cOrigTy = cast<VectorType>(c.getType());
+    assert(cOrigTy == op->getResultTypes()[0] &&
+           "Accumulator and result type mismatch");
+    // OCL builtins encode bfloat16 as int16
+    VectorType cTy =
+        cOrigTy.getElementType().isBF16()
+            ? VectorType::get(cOrigTy.getShape(), rewriter.getIntegerType(16))
+            : cOrigTy;
+    if (cOrigTy != cTy)
+      c = rewriter.create<LLVM::BitcastOp>(loc, cTy, c);
+
+    constexpr int32_t systolicDepth{8};
+    std::string fnName =
+        llvm::formatv("intel_sub_group_{0}_{1}_matrix_mad_k{2}",
+                      stringifyPrecisionType(op.getPa()).str(),
+                      stringifyPrecisionType(op.getPb()).str(),
+                      systolicDepth * getNumOperandsPerDword(op.getPa()))
+            .str();
+    SmallVector<Type> argTypes{a.getType(), b.getType(), cTy};
+    fnName = mangle(fnName, argTypes);
+    SmallVector<Value> args{a, b, c};
+
+    auto memAttr = rewriter.getAttr<LLVM::MemoryEffectsAttr>(
+        /*other=*/LLVM::ModRefInfo::NoModRef,
+        /*argMem=*/LLVM::ModRefInfo::NoModRef,
+        /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef);
+    auto funcAttrs = convergentNoUnwindWillReturnAttrs;
+    funcAttrs.memEffectsAttr = memAttr;
+    Value result = createDeviceFunctionCall(rewriter, fnName, cTy, argTypes,
+                                            args, {}, funcAttrs)
+                       ->getResult(0);
+
+    if (cOrigTy != cTy)
+      result = rewriter.create<LLVM::BitcastOp>(loc, cOrigTy, result);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+private:
+  static unsigned getNumOperandsPerDword(xevm::PrecisionType pTy) {
+    switch (pTy) {
+    case xevm::PrecisionType::TF32:
+      return 1;
+    case xevm::PrecisionType::BF16:
+    case xevm::PrecisionType::FP16:
+      return 2;
+    case xevm::PrecisionType::U8:
+    case xevm::PrecisionType::S8:
+      return 4;
+    default:
+      llvm_unreachable("unsupported xevm::PrecisionType");
+    }
+  }
+};
+
 template <typename OpType>
 class LoadStorePrefetchToOCLPattern : public OpConversionPattern<OpType> {
   using OpConversionPattern<OpType>::OpConversionPattern;
```
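
For the shapes in the new test, `castIfNeeded` is a no-op on A (vector<8xi16> is already i16-packed) and repacks B from vector<16xi16> to vector<8xi32>. A tiny sketch of the width-preserving arithmetic (hypothetical helper, not the commit's code):

```cpp
// Width-preserving repack: new element count = total bits / packed bit width,
// mirroring castIfNeeded's computation of `newTy` above.
#include <cassert>

constexpr int repackedElems(int elems, int elemBits, int packedBits) {
  return elems * elemBits / packedBits;
}

int main() {
  assert(repackedElems(8, 16, 16) == 8);  // A: vector<8xi16>  stays as-is
  assert(repackedElems(16, 16, 32) == 8); // B: vector<16xi16> -> vector<8xi32>
  assert(repackedElems(8, 32, 32) == 8);  // C: vector<8xf32>  stays 8 wide
}
```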
```diff
@@ -292,10 +409,11 @@ struct ConvertXeVMToLLVMPass
 //===----------------------------------------------------------------------===//
 
 void mlir::populateXeVMToLLVMConversionPatterns(RewritePatternSet &patterns) {
-  patterns.add<LoadStorePrefetchToOCLPattern<BlockLoad2dOp>,
-               LoadStorePrefetchToOCLPattern<BlockStore2dOp>,
-               LoadStorePrefetchToOCLPattern<BlockPrefetch2dOp>>(
-      patterns.getContext());
+  patterns
+      .add<LoadStorePrefetchToOCLPattern<BlockLoad2dOp>,
+           LoadStorePrefetchToOCLPattern<BlockStore2dOp>,
+           LoadStorePrefetchToOCLPattern<BlockPrefetch2dOp>, DPASToOCLPattern>(
+          patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
```

lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp

Lines changed: 14 additions & 1 deletion
```diff
@@ -358,7 +358,14 @@ static cl_program loadModule(GPUCLQUEUE *queue, const unsigned char *data,
           "-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer "
           "'-printregusage -enableBCR' -cl-kernel-arg-info -x spir";
   }
-  CL_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL));
+  err = clBuildProgram(program, 1, &queue->device_, build_flags, NULL, NULL);
+  if (err != CL_SUCCESS) {
+    char log[10240];
+    clGetProgramBuildInfo(program, queue->device_, CL_PROGRAM_BUILD_LOG,
+                          sizeof(log), log, nullptr);
+    fprintf(stderr, "Build failed: %s\n", log);
+    abort();
+  }
   if (takeOwnership)
     queue->programs_.push_back(program);
   return program;
```
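
A fixed 10 KiB buffer can truncate long IGC build logs. A sketch of a size-query variant of the error branch, using the same OpenCL entry points (a possible follow-up, not what the commit does):

```cpp
// Sketch: ask for the build-log size first so long logs are not truncated.
// Drop-in replacement for the fixed-size buffer above; requires <string>.
size_t logSize = 0;
clGetProgramBuildInfo(program, queue->device_, CL_PROGRAM_BUILD_LOG, 0,
                      nullptr, &logSize);
std::string log(logSize, '\0');
clGetProgramBuildInfo(program, queue->device_, CL_PROGRAM_BUILD_LOG, logSize,
                      log.data(), nullptr);
fprintf(stderr, "Build failed: %s\n", log.c_str());
abort();
```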
```diff
@@ -414,6 +421,12 @@ static void launchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX,
   }
   size_t globalSize[3] = {gridX * blockX, gridY * blockY, gridZ * blockZ};
   size_t localSize[3] = {blockX, blockY, blockZ};
+  size_t sgSize;
+  CL_SAFE_CALL(clGetKernelSubGroupInfo(
+      kernel, queue->device_, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+      sizeof(globalSize), &globalSize, sizeof(sgSize), &sgSize, nullptr));
+  // printf("Kernel's sub-group size: %zu\n", sgSize);
+
   CL_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL,
                                       globalSize, localSize, 0, NULL, NULL));
 }
```
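
The queried sgSize is only inspected under a debugger here, since the printf is commented out. Note that the spec defines the input of CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE as a local work size, so passing globalSize relies on the two coinciding, as they do in this test's launch. To enumerate every sub-group size a device supports, the cl_intel_required_subgroup_size extension can be queried instead; a sketch (the 0x4108 constant is quoted from that extension from memory, double-check it against the header):

```cpp
// Sketch: list the sub-group sizes a device supports, per the
// cl_intel_required_subgroup_size extension.
#include <CL/cl.h>
#include <cstdio>
#include <vector>

#ifndef CL_DEVICE_SUB_GROUP_SIZES_INTEL
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 // from the Intel extension
#endif

void printSubGroupSizes(cl_device_id device) {
  size_t bytes = 0;
  clGetDeviceInfo(device, CL_DEVICE_SUB_GROUP_SIZES_INTEL, 0, nullptr, &bytes);
  std::vector<size_t> sizes(bytes / sizeof(size_t));
  clGetDeviceInfo(device, CL_DEVICE_SUB_GROUP_SIZES_INTEL, bytes, sizes.data(),
                  nullptr);
  for (size_t s : sizes)
    printf("supported sub-group size: %zu\n", s);
}
```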
Lines changed: 117 additions & 0 deletions
```mlir
// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s

module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
    // The set of available `matrix_mad` intrinsics can differ based on the
    // device's *minimal* supported sub-group size; that minimum sub-group
    // size should be used when calling them.
    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html

    gpu.func @block_dpas(%a: !llvm.ptr<1>, %b: !llvm.ptr<1>, %c: !llvm.ptr<1>) kernel attributes {intel_reqd_sub_group_size = 16 : i32} {
      %base_width_a = arith.constant 32 : i32
      %base_height_a = arith.constant 8 : i32
      %base_pitch_a = arith.constant 32 : i32
      %x = arith.constant 0 : i32
      %y = arith.constant 0 : i32
      %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16>

      %base_width_b = arith.constant 32 : i32
      %base_height_b = arith.constant 16 : i32
      %base_pitch_b = arith.constant 32 : i32
      %loaded_b1 = xevm.blockload2d %b, %base_width_b, %base_height_b, %base_pitch_b, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
      %loaded_b_casted = vector.bitcast %loaded_b1 : vector<16xi16> to vector<8xi32>

      %base_width_c = arith.constant 64 : i32
      %base_height_c = arith.constant 8 : i32
      %base_pitch_c = arith.constant 64 : i32
      %loaded_c = xevm.blockload2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>

      %loaded_c_casted = vector.bitcast %loaded_c : vector<8xi32> to vector<8xf32>
      %c_result = xevm.dpas %loaded_c_casted, %loaded_a, %loaded_b_casted {pa = f16, pb = f16, rc = 8} : (vector<8xf32>, vector<8xi16>, vector<8xi32>) -> vector<8xf32>
      %c_result_casted = vector.bitcast %c_result : vector<8xf32> to vector<8xi32>

      xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
      gpu.return
    }
  }

  func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index

    %memref_a = gpu.alloc host_shared () : memref<8x16xf16>
    memref.copy %a, %memref_a : memref<8x16xf16> to memref<8x16xf16>
    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_a : memref<8x16xf16> -> index
    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_b = gpu.alloc host_shared () : memref<16x16xf16>
    memref.copy %b, %memref_b : memref<16x16xf16> to memref<16x16xf16>
    %b_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_b : memref<16x16xf16> -> index
    %b_ptr_as_i64 = arith.index_cast %b_ptr_as_idx : index to i64
    %b_ptr = llvm.inttoptr %b_ptr_as_i64 : i64 to !llvm.ptr
    %b_ptr_casted = llvm.addrspacecast %b_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_c = gpu.alloc host_shared () : memref<8x16xf32>
    memref.copy %c, %memref_c : memref<8x16xf32> to memref<8x16xf32>
    %c_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_c : memref<8x16xf32> -> index
    %c_ptr_as_i64 = arith.index_cast %c_ptr_as_idx : index to i64
    %c_ptr = llvm.inttoptr %c_ptr_as_i64 : i64 to !llvm.ptr
    %c_ptr_casted = llvm.addrspacecast %c_ptr : !llvm.ptr to !llvm.ptr<1>

    gpu.launch_func @kernel::@block_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_ptr_casted : !llvm.ptr<1>, %b_ptr_casted : !llvm.ptr<1>, %c_ptr_casted : !llvm.ptr<1>)
    return %memref_c : memref<8x16xf32>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %A = memref.alloc() : memref<8x16xf16>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index

    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        %row_idx = arith.index_cast %i : index to i32
        %row = arith.sitofp %row_idx : i32 to f16
        memref.store %row, %A[%i, %j] : memref<8x16xf16>
      }
    }
    %B = memref.alloc() : memref<16x16xf16>
    scf.for %i = %c0 to %c16 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        %col_idx = arith.index_cast %j : index to i32
        %col = arith.sitofp %col_idx : i32 to f16
        memref.store %col, %B[%i, %j] : memref<16x16xf16>
      }
    }

    %C = memref.alloc() : memref<8x16xf32>
    %c0_f32 = arith.constant 0.0 : f32
    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        memref.store %c0_f32, %C[%i, %j] : memref<8x16xf32>
      }
    }

    %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32>
    %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32>
    %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16>
    call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()

    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    // CHECK-NEXT: [0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]
    // CHECK-NEXT: [0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480]
    // CHECK-NEXT: [0, 48, 96, 144, 192, 240, 288, 336, 384, 432, 480, 528, 576, 624, 672, 720]
    // CHECK-NEXT: [0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960]
    // CHECK-NEXT: [0, 80, 160, 240, 320, 400, 480, 560, 640, 720, 800, 880, 960, 1040, 1120, 1200]
    // CHECK-NEXT: [0, 96, 192, 288, 384, 480, 576, 672, 768, 864, 960, 1056, 1152, 1248, 1344, 1440]
    // CHECK-NEXT: [0, 112, 224, 336, 448, 560, 672, 784, 896, 1008, 1120, 1232, 1344, 1456, 1568, 1680]

    return
  }
  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
}
```
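
The CHECK matrix follows from the initialization in @main: A[i][k] = i, B[k][j] = j, and C starts at zero, so each output element is the sum over K = 16 of i*j, i.e. 16*i*j. A standalone host-side cross-check (not part of the commit):

```cpp
// Reproduce the FileCheck values: C[i][j] = sum_k A[i][k] * B[k][j]
// with A[i][k] = i, B[k][j] = j, K = 16, and a zero accumulator.
#include <cstdio>

int main() {
  const int M = 8, N = 16, K = 16;
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      int acc = 0;
      for (int k = 0; k < K; ++k)
        acc += i * j; // A[i][k] * B[k][j]
      printf("%d%s", acc, j + 1 < N ? ", " : "\n"); // one CHECK-NEXT row
    }
  }
}
```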
