Commit 9a62f7a

Author: Ferdinand Lemaire
Merge pull request #8 from Xilinx/ferdinand.FXML-1303_linearRelu
Add linearRelu op to linalg structured ops
2 parents 9eafef6 + 0fa6b0b commit 9a62f7a

4 files changed, +207 -10 lines changed

mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml

Lines changed: 117 additions & 0 deletions
@@ -5766,3 +5766,120 @@ structured_op: !LinalgStructuredOpConfig
               scalar_const: '2.3283063999999999E-10 : f64'
         - !ScalarExpression
           scalar_arg: min
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: linear_relu
+  cpp_class_name: LinearReluOp
+  doc: |-
+    Performs a linear/fully-connected + relu operation
+
+    This is a long description that I'll fill later
+
+    Layout:
+      * I: WH (Input)
+      * W: WH (Weights)
+      * B: H (Bias)
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)>
+  - !LinalgOperandDefConfig
+    name: W
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s2, s1)>
+  - !LinalgOperandDefConfig
+    name: B
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s2)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)>
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)>
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2)>
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)>
+  iterator_types:
+  - parallel
+  - reduction
+  - parallel
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: add
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: mul
+                operands:
+                - !ScalarExpression
+                  scalar_arg: I
+                - !ScalarExpression
+                  scalar_arg: W
+            - !ScalarExpression
+              scalar_arg: B
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: relu_nc
+  cpp_class_name: ReluNcOp
+  doc: |-
+    Applies the ReLU activation function to every value in the tensor.
+
+    Layout:
+      * Input: NC
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: IFM
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1] -> (s0, s1)>
+  - !LinalgOperandDefConfig
+    name: OFM
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1] -> (s0, s1)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1)[s0, s1] -> (d0, d1)>
+    - affine_map<(d0, d1)[s0, s1] -> (d0, d1)>
+  iterator_types:
+  - parallel
+  - parallel
+  assignments:
+  - !ScalarAssign
+    arg: OFM
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: max_signed
+        operands:
+        - !ScalarExpression
+          scalar_arg: IFM
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: T1
+            operands:
+            - !ScalarExpression
+              scalar_const: '0.000000e+00 : f64'
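
For orientation, the computation these two new definitions describe can be sketched in plain NumPy (a minimal illustration, not part of the commit; the function names are made up). Note that the assignment above folds B into the reduction, so the bias is accumulated once per reduction step; the sketch applies it once per output element, which is the intended linear/fully-connected semantics (the Python definition further down carries the author's own note on this).

import numpy as np

def linear_relu_reference(I, W, B):
    # I: (w, h), W: (k, h), B: (k,) -- matching the WH/KH/K shape_maps above.
    # Linear part: O[w, k] = sum_h I[w, h] * W[k, h] + B[k], bias applied once.
    O = I @ W.T + B
    # ReLU part, i.e. what relu_nc computes element-wise.
    return np.maximum(O, 0)

def relu_nc_reference(IFM):
    # Element-wise max(x, 0) over an NC tensor, as in the relu_nc assignment.
    return np.maximum(IFM, 0)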

mlir/lib/Dialect/Linalg/Transforms/Unfuse.cpp

Lines changed: 34 additions & 10 deletions
@@ -24,6 +24,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"

@@ -650,12 +651,9 @@ struct GlobalAveragePool2DLowering : OpRewritePattern<GlobalAveragePool2DOp> {
   }
 };
 
-/// Torch MLIR does a similar lowering for their Linear operator to linalg;
-/// here we implement the same so we can run tests using the unfused version.
-struct LinearLowering : OpRewritePattern<LinearOp> {
-  using OpRewritePattern<LinearOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(LinearOp op,
-                                PatternRewriter &rewriter) const override {
+template <class Linear>
+static Value unfuseLinear(Linear &op, PatternRewriter &rewriter) {
+
     Location loc = op.getLoc();
     Value input = op.getOperand(0);
     Value weights = op.getOperand(1);

@@ -690,10 +688,35 @@ struct LinearLowering : OpRewritePattern<LinearOp> {
        ->getResult(0);
 
   // Create the matmul operation that does the multiplication and addition
-  rewriter.replaceOpWithNewOp<MatmulOp>(op, output.getType(),
-                                        ValueRange{input, transposeWeightsOp},
-                                        broadcastBiasOp);
+  auto newOp = rewriter.create<MatmulOp>(loc, outputType, ValueRange{op.getOperand(0), transposeWeightsOp},
+                                         broadcastBiasOp).getResult(0);
+  return newOp;
+}
+/// Torch MLIR does a similar lowering for their Linear operator to linalg;
+/// here we implement the same so we can run tests using the unfused version.
+struct LinearLowering : OpRewritePattern<LinearOp> {
+  using OpRewritePattern<LinearOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(LinearOp op,
+                                PatternRewriter &rewriter) const override {
+    Value matmul = unfuseLinear<LinearOp>(op, rewriter);
+    rewriter.replaceOp(op, matmul);
+    return success();
+  }
+};
 
+
+struct LinearReluLowering : OpRewritePattern<LinearReluOp> {
+  using OpRewritePattern<LinearReluOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(LinearReluOp op,
+                                PatternRewriter &rewriter) const override {
+
+    Value linearResult = unfuseLinear<LinearReluOp>(op, rewriter);
+
+    rewriter.replaceOpWithNewOp<ReluNcOp>(
+        op,
+        /*resultTensorTypes=*/linearResult.getType(),
+        /*inputs=*/linearResult,
+        /*outputs=*/linearResult);
     return success();
   }
 };

@@ -711,7 +734,8 @@ struct LinalgUnfusePass : public impl::LinalgUnfuseBase<LinalgUnfusePass> {
            Conv2DTensorAddLreluAveragePoolLowering,
            Conv2DActivationMaxpoolOpLowering<Conv2DLreluMaxpoolOp>,
            Conv2DActivationMaxpoolOpLowering<Conv2DReluMaxpoolOp>,
-           SoftmaxLowering, GlobalAveragePool2DLowering, LinearLowering>(
+           SoftmaxLowering, GlobalAveragePool2DLowering, LinearLowering,
+           LinearReluLowering>(
       &getContext());
 
   (void)applyPatternsAndFoldGreedily(getOperation().getBody(),
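
The net effect of the new LinearReluLowering pattern is easiest to see as a sequence of array operations: it reuses unfuseLinear to emit the transpose2d / broadcast_1d_to_2d / matmul chain and then applies relu_nc to the matmul result. A rough NumPy model of that sequence, using the shapes from the unfuse.mlir test below (variable names are illustrative, not part of the commit):

import numpy as np

x = np.zeros((1, 2048), dtype=np.float32)     # input
w = np.zeros((1000, 2048), dtype=np.float32)  # weights
b = np.zeros((1000,), dtype=np.float32)       # bias

tweights = w.T                          # linalg.transpose2d: 1000x2048 -> 2048x1000
bias2d = np.broadcast_to(b, (1, 1000))  # linalg.broadcast_1d_to_2d: 1000 -> 1x1000
matmul = x @ tweights + bias2d          # linalg.matmul accumulating onto the bias init
out = np.maximum(matmul, 0)             # linalg.relu_nc applied to the matmul result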

mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py

Lines changed: 36 additions & 0 deletions
@@ -1377,3 +1377,39 @@ def fill_rng_2d(min=ScalarDef(F64),
   scaling = (max - min) * inv_range
   O[D.m, D.n] = TypeFn.cast_signed(
       T, (offset + TypeFn.cast_signed(F64, rand2)) * scaling + min)
+
+@linalg_structured_op
+def linear_relu(
+    I=TensorDef(T1, S.W, S.H),
+    W=TensorDef(T1, S.K, S.H),
+    B=TensorDef(T1, S.K),
+    O=TensorDef(T1, S.W, S.K, output=True)):
+  """Performs a linear/fully-connected + relu operation.
+
+  Performs a linear operation followed by a ReLU.
+
+  Layout:
+    * I: WH (Input)
+    * W: WH (Weights)
+    * B: H (Bias)
+  """
+  domain(D.W, D.H, D.K)
+  # Implementation is incorrect: the addition of the bias should happen after
+  # the multiplication, not on each element.
+  O[D.W, D.K] += I[D.W, D.H] * W[D.K, D.H] + B[D.K]
+
+
+@linalg_structured_op
+def relu_nc(
+    IFM=TensorDef(T1, Batch, S.C),
+    OFM=TensorDef(T1, Batch, S.C, output=True)):
+  """Applies the ReLU activation function to every value in the tensor.
+
+  Layout:
+    * Input: NC
+  """
+  domain(D.b, D.c)
+  OFM[D.b, D.c] = BinaryFn.max_signed(
+      IFM[D.b, D.c], TypeFn.cast_signed(T1, const(0.0))
+  )
+
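
To make the caveat in the linear_relu comment concrete, here is a tiny worked example (values chosen purely for illustration): with H reduction steps, accumulating I*W + B per step counts the bias H times instead of once.

import numpy as np

I = np.array([[1.0, 2.0]])   # shape (W=1, H=2)
W = np.array([[3.0, 4.0]])   # shape (K=1, H=2)
B = np.array([10.0])         # shape (K=1,)

# What the opdsl assignment accumulates: bias added inside the h-reduction.
per_step = sum(I[0, h] * W[0, h] + B[0] for h in range(2))   # (3 + 10) + (8 + 10) = 31
# Intended fully-connected semantics: bias added once after the reduction.
intended = (I @ W.T + B)[0, 0]                               # 11 + 10 = 21
print(per_step, intended)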

mlir/test/Dialect/Linalg/unfuse.mlir

Lines changed: 20 additions & 0 deletions
@@ -448,6 +448,26 @@ func.func @unfuse_linear(%input: tensor<1x2048xf32>, %weights: tensor<1000x2048x
   // CHECK: %[[bias2dshape:.+]] = tensor.empty() : tensor<1x1000xf32>
   // CHECK: %[[bias2d:.+]] = linalg.broadcast_1d_to_2d ins(%arg2 : tensor<1000xf32>) outs(%2 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
   // CHECK: %[[out:.+]] = linalg.matmul ins(%[[input]], %[[tweights]] : tensor<1x2048xf32>, tensor<2048x1000xf32>) outs(%[[bias2d]] : tensor<1x1000xf32>) -> tensor<1x1000xf32
+  // CHECK: return %[[out]]
+
+  return %result : tensor<1x1000xf32>
+}
+
+// -----
+
+// CHECK: func.func @unfuse_linearRelu
+// CHECK-SAME: %[[input:.+]]: tensor<1x2048xf32>, %[[weights:.+]]: tensor<1000x2048xf32>, %[[bias:.+]]: tensor<1000xf32>
+func.func @unfuse_linearRelu(%input: tensor<1x2048xf32>, %weights: tensor<1000x2048xf32>, %bias: tensor<1000xf32>) -> tensor<1x1000xf32> {
+  %zero = arith.constant 0.0 : f32
+  %init = tensor.splat %zero : tensor<1x1000xf32>
+  %result = linalg.linear_relu ins(%input, %weights, %bias: tensor<1x2048xf32>, tensor<1000x2048xf32>, tensor<1000xf32>) outs(%init: tensor<1x1000xf32>) -> tensor<1x1000xf32>
+
+  // CHECK: %[[tweightshape:.+]] = tensor.empty() : tensor<2048x1000xf32>
+  // CHECK: %[[tweights:.+]] = linalg.transpose2d ins(%arg1 : tensor<1000x2048xf32>) outs(%0 : tensor<2048x1000xf32>) -> tensor<2048x1000xf32>
+  // CHECK: %[[bias2dshape:.+]] = tensor.empty() : tensor<1x1000xf32>
+  // CHECK: %[[bias2d:.+]] = linalg.broadcast_1d_to_2d ins(%arg2 : tensor<1000xf32>) outs(%2 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
+  // CHECK: %[[matmul:.+]] = linalg.matmul ins(%[[input]], %[[tweights]] : tensor<1x2048xf32>, tensor<2048x1000xf32>) outs(%[[bias2d]] : tensor<1x1000xf32>) -> tensor<1x1000xf32
+  // CHECK: %[[out:.*]] = linalg.relu_nc ins(%[[matmul]] : tensor<1x1000xf32>) outs(%[[matmul]] : tensor<1x1000xf32>) -> tensor<1x1000xf32>
   // CHECK: return %[[out]]
 
   return %result : tensor<1x1000xf32>
