Xilinx · flemairen6 · Dec 7, 2022
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -2478,7 +2478,7 @@ metadata: !LinalgOpMetadata
     The partial multiplication results are reduced into a 2D output.
 
     Numeric casting is performed on the operands to the inner multiply, promoting
-    them to the same data type as the accumulator/output."
+    them to the same data type as the accumulator/output.
   implements:
   - LinalgContractionOpInterface
 structured_op: !LinalgStructuredOpConfig
@@ -4096,38 +4096,39 @@ structured_op: !LinalgStructuredOpConfig
     name: I
     kind: input_tensor
     type_var: T1
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s9, s1 *
-      s2 + s3 * s4, s5 * s6 + s7 * s8)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s1, s2
+      * s3 + s4 * s5, s6 * s7 + s8 * s9)>
   - !LinalgOperandDefConfig
     name: K
     kind: input_tensor
     type_var: T2
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s9, s3, s7)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s1, s4, s8)>
   - !LinalgOperandDefConfig
     name: O
     kind: output_tensor
     type_var: U
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s9, s1, s5)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s1, s2,
+      s6)>
   - !LinalgOperandDefConfig
     name: strides
     kind: index_attr
-    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s2,
-      s6)>
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s3,
+      s7)>
     default_indices:
     - 1
     - 1
   - !LinalgOperandDefConfig
     name: dilations
     kind: index_attr
-    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s4,
-      s8)>
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s5,
+      s9)>
     default_indices:
     - 1
     - 1
   indexing_maps: !LinalgIndexingMapsConfig
     static_indexing_maps:
     - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9]
-      -> (d0, d3, d1 * s2 + d4 * s4, d2 * s6 + d5 * s8)>
+      -> (d0, d3, d1 * s3 + d4 * s5, d2 * s7 + d5 * s9)>
     - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9]
       -> (d3, d4, d5)>
     - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9]
@@ -5766,3 +5767,74 @@ structured_op: !LinalgStructuredOpConfig
                           scalar_const: '2.3283063999999999E-10 : f64'
             - !ScalarExpression
               scalar_arg: min
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: linear_relu
+  cpp_class_name: LinearReluOp
+  doc: |-
+    Performs a linear (fully-connected) operation followed by a ReLU.
+
+    Intended result: O = relu(I x transpose(W) + B), with O laid out as WK.
+
+    Layout:
+      * I: WH (Input)
+      * W: KH (Weights)
+      * B: K  (Bias)
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)>
+  - !LinalgOperandDefConfig
+    name: W
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s2, s1)>
+  - !LinalgOperandDefConfig
+    name: B
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s2)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)>
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)>
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)>
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2)>
+    - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)>
+  iterator_types:
+  - parallel
+  - reduction
+  - parallel
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: add
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: mul
+                operands:
+                - !ScalarExpression
+                  scalar_arg: I
+                - !ScalarExpression
+                  scalar_arg: W
+            - !ScalarExpression
+              scalar_arg: B
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Unfuse.cpp b/mlir/lib/Dialect/Linalg/Transforms/Unfuse.cpp
@@ -24,6 +24,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 
@@ -698,6 +699,50 @@ struct LinearLowering : OpRewritePattern<LinearOp> {
   }
 };
 
+struct LinearReluLowering : OpRewritePattern<LinearReluOp> {
+  using OpRewritePattern<LinearReluOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(LinearReluOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    Value weights = op.getOperand(1);
+    Value bias = op.getOperand(2);
+
+    auto weightsType = weights.getType().cast<RankedTensorType>();
+    auto biasType = bias.getType().cast<RankedTensorType>();
+    auto outputType = op->getResult(0).getType().cast<RankedTensorType>();
+
+    // Transpose the weights so the matmul below multiplies the input by W^T.
+    // The tensor.empty value only describes the transposed result shape.
+    llvm::ArrayRef<int64_t> weightsShape = weightsType.getShape();
+    Value transposedWeights = rewriter.create<tensor::EmptyOp>(
+        loc,
+        ArrayRef<int64_t>{weightsShape[1], weightsShape[0]},
+        weightsType.getElementType());
+    Value transposeWeightsOp =
+        rewriter.create<Transpose2DOp>(loc, weights, transposedWeights)
+            ->getResult(0);
+
+    // Broadcast the 1D bias to the 2D output shape; it is then passed as the
+    // matmul's output operand, presumably to act as the accumulator's init.
+    Value broadcastedBias = rewriter.create<tensor::EmptyOp>(
+        loc, outputType.getShape(), biasType.getElementType());
+    Value broadcastBiasOp =
+        rewriter.create<Broadcast1DTo2DOp>(loc, bias, broadcastedBias)
+            ->getResult(0);
+
+    auto linearResult = rewriter.create<MatmulOp>(loc,
+                  outputType, ValueRange{op.getOperand(0), transposeWeightsOp},
+                  broadcastBiasOp).getResult(0);
+
+    rewriter.replaceOpWithNewOp<Relu2DNchwOp>(
+        op,
+        /*resultTensorTypes=*/linearResult.getType(),
+        /*inputs=*/linearResult,
+        /*outputs=*/linearResult);
+    return success();
+  }
+};
+
 struct LinalgUnfusePass : public impl::LinalgUnfuseBase<LinalgUnfusePass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());

diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -1377,3 +1377,23 @@ def fill_rng_2d(min=ScalarDef(F64),
   scaling = (max - min) * inv_range
   O[D.m, D.n] = TypeFn.cast_signed(
       T, (offset + TypeFn.cast_signed(F64, rand2)) * scaling + min)
+
+@linalg_structured_op
+def linear_relu(
+    I=TensorDef(T1, S.W, S.H),
+    W=TensorDef(T1,  S.K, S.H),
+    B=TensorDef(T1,  S.K),
+    O=TensorDef(T1,  S.W, S.K, output=True)):
+  """Performs a linear (fully-connected) operation followed by a ReLU.
+
+  Intended result: O = relu(I x transpose(W) + B), with O laid out as WK.
+
+  Layout:
+    * I: WH (Input)
+    * W: KH (Weights)
+    * B: K  (Bias)
+  """
+  domain(D.W, D.H, D.K)
+  # FIXME: this body is incorrect: the bias is added on every step of the H
+  # reduction instead of once after it, and no ReLU is applied at all.
+  O[D.W, D.K] += I[D.W, D.H]*W[D.K, D.H] + B[D.K]