address @qcolombet comments

grypp · grypp · commit 96da27460fbd · 2023-10-05T10:38:54.000+02:00
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -731,7 +731,7 @@ def NVGPU_WarpgroupMmaOp : NVGPU_Op<"warpgroup.mma"> {
 def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
   let description = [{
     The `nvgpu.warpgroup.mma.store` op performs the store of fragmented result 
-    in $matrixD to give memref. 
+    in $matrixD to the given memref.
 
     [See the details of register fragment layout for accumulator matrix D]
     (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d) 
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -54,26 +54,6 @@ static Value truncToI32(ImplicitLocOpBuilder &b, Value value) {
   return b.create<LLVM::TruncOp>(b.getI32Type(), value);
 }
 
-/// Returns warp-size as a value.
-static Value getWarpSizeValue(ImplicitLocOpBuilder &b) {
-  static std::optional<Value> warpSize = std::nullopt;
-  if (!warpSize.has_value()) {
-    warpSize = b.create<LLVM::ConstantOp>(IntegerType::get(b.getContext(), 32),
-                                          b.getI32IntegerAttr(kWarpSize));
-  }
-  return warpSize.value();
-}
-
-/// Returns warp-size as a value.
-static Value getWarpSizeValue(ImplicitLocOpBuilder &b) {
-  static std::optional<Value> warpSize = std::nullopt;
-  if (!warpSize.has_value()) {
-    warpSize = b.create<LLVM::ConstantOp>(IntegerType::get(b.getContext(), 32),
-                                          b.getI32IntegerAttr(kWarpSize));
-  }
-  return warpSize.value();
-}
-
 /// Returns the type for the intrinsic given the vectorResultType of the
 /// `gpu.mma.sync` operation.
 static Type inferIntrinsicResultType(Type vectorResultType) {
@@ -1467,7 +1447,7 @@ struct NVGPUWarpgroupMmaStoreOpLowering
   /// Here is what each threads (T) holds, each `d` is struct value with a
   /// number.
   ///
-  /// Threads in warp-group (128 threads) and what they owns in the matriD:
+  /// Threads in warp-group (128 threads) and what they own in the matrixD:
   /// 0-31 	  Warp-0  -> MatrixD[0:15 ][0:N]
   /// 32-63 	Warp-1  -> MatrixD[16:31][0:N]
   /// 64-95 	Warp-2  -> MatrixD[32:47][0:N]
@@ -1510,7 +1490,7 @@ struct NVGPUWarpgroupMmaStoreOpLowering
     Value c4 = makeConst(4);
     Value c8 = makeConst(8);
     Value c16 = makeConst(16);
-    Value warpSize = getWarpSizeValue(b);
+    Value warpSize = makeConst(kWarpSize);
 
     auto makeMul = [&](Value lhs, Value rhs) -> Value {
       return b.create<LLVM::MulOp>(lhs.getType(), lhs, rhs);
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -539,7 +539,7 @@ LogicalResult WarpgroupMmaStoreOp::verify() {
                               .getFragmented();
 
   int64_t totalFirstDimension = 0;
-  for (auto result : getMatrixD()) {
+  for (Value result : getMatrixD()) {
     VectorType vtype =
         result.getType().cast<WarpgroupAccumulatorType>().getFragmented();
     if (vtype != firstVtype)
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -866,9 +866,10 @@ func.func @warpgroup_mma_store(
 // CHECK: %[[S311:.+]] = llvm.mlir.constant(4 : i32) : i32
 // CHECK: %[[S313:.+]] = llvm.mlir.constant(8 : i32) : i32
 // CHECK: %[[S316:.+]] = llvm.mlir.constant(16 : i32) : i32
+// CHECK: %[[WS2:.+]] = llvm.mlir.constant(32 : i32) : i32
 // CHECK: %[[S317:.+]] = nvvm.read.ptx.sreg.tid.x : i32
-// CHECK: %[[S318:.+]] = llvm.urem %[[S317]], %[[WarpSize]]  : i32
-// CHECK: %[[S319:.+]] = llvm.udiv %[[S317]], %[[WarpSize]]  : i32
+// CHECK: %[[S318:.+]] = llvm.urem %[[S317]], %[[WS2]]  : i32
+// CHECK: %[[S319:.+]] = llvm.udiv %[[S317]], %[[WS2]]  : i32
 // CHECK: %[[S320:.+]] = llvm.udiv %[[S318]], %[[S311]]  : i32
 // CHECK: %[[S321:.+]] = llvm.urem %[[S318]], %[[S311]]  : i32
 // CHECK: %[[S322:.+]] = llvm.mul %[[S321]], %[[S312]]  : i32