Skip to content

Commit 20abf34

Browse files
authored
Qualcomm AI Engine Direct - LPBQ enablement (#9313)

### Summary
- QC backend changes for adopting LPBQ (blockwise quantization)
- test case: conv2d 16a4w
- minor refactoring

### Test plan
```bash
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_conv2d_block -s $SERIAL_NO -m SM8650 -b build-android
```
1 parent 76ae537 commit 20abf34

26 files changed

+650
-130
lines changed

backends/qualcomm/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ PRs are always welcome to help improve the codebase in a comprehensive manner. B
124124
- [shewu-quic](https://github.com/shewu-quic)
125125
- [chunit-quic](https://github.com/chunit-quic)
126126
- [winskuo-quic](https://github.com/winskuo-quic)
127+
- [DannyYuyang-quic](https://github.com/DannyYuyang-quic)
127128
- [haowhsu-quic](https://github.com/haowhsu-quic)
128129

129130
Thanks again for your contribution!

backends/qualcomm/_passes/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from .annotate_quant_attrs import AnnotateQuantAttrs
33
from .constant_i64_to_i32 import ConstantI64toI32
44
from .convert_bmm_to_matmul import ConvertBmmToMatmul
5-
from .convert_interpolate_with_upsample2d import ConvertInterpolateWithUpsample2D
65
from .convert_to_linear import ConvertToLinear
76
from .decompose_any import DecomposeAny
87
from .decompose_einsum import DecomposeEinsum
@@ -30,7 +29,6 @@
3029
AnnotateQuantAttrs,
3130
ConstantI64toI32,
3231
ConvertBmmToMatmul,
33-
ConvertInterpolateWithUpsample2D,
3432
RecomposePReLU,
3533
ConvertToLinear,
3634
DecomposeAny,

backends/qualcomm/_passes/annotate_quant_attrs.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from executorch.backends.qualcomm.builders.utils import get_parameter, set_parameter
1111
from executorch.backends.qualcomm.utils.constants import (
1212
QCOM_AXIS,
13+
QCOM_BLOCK_SIZE,
1314
QCOM_DTYPE,
1415
QCOM_ENCODING,
1516
QCOM_QUANT_ATTRS,
@@ -122,13 +123,25 @@ def _dequant_fold_params(self, n, quant_attrs, param):
122123
scales = self._expand(quant_attrs[QCOM_SCALES], dim, axis)
123124
offsets = self._expand(quant_attrs[QCOM_ZERO_POINTS], dim, axis)
124125
param = param.sub(offsets).mul(scales).to(torch.float32).contiguous()
125-
set_parameter(param, n.args[0], self.edge_program)
126+
elif quant_attrs[QCOM_ENCODING] in [
127+
exir_ops.edge.pt2e_quant.dequantize_affine.default
128+
]:
129+
param = torch.ops.pt2e_quant.dequantize_affine(
130+
param,
131+
block_size=quant_attrs[QCOM_BLOCK_SIZE],
132+
scale=quant_attrs[QCOM_SCALE],
133+
zero_point=quant_attrs[QCOM_ZERO_POINT],
134+
input_dtype=quant_attrs[QCOM_DTYPE],
135+
quant_min=quant_attrs[QCOM_QUANT_MIN],
136+
quant_max=quant_attrs[QCOM_QUANT_MAX],
137+
output_dtype=torch.float32,
138+
)
126139
else:
127140
scale = quant_attrs[QCOM_SCALE]
128141
offset = quant_attrs[QCOM_ZERO_POINT]
129142
param = param.sub(offset).mul(scale).to(torch.float32).contiguous()
130-
set_parameter(param, n.args[0], self.edge_program)
131143

144+
set_parameter(param, n.args[0], self.edge_program)
132145
n.args[0].meta["val"] = param
133146

134147
def _annotate_quant_attrs(

backends/qualcomm/_passes/convert_interpolate_with_upsample2d.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

backends/qualcomm/_passes/layout_transform.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class LayoutTransform(ExportPass):
3939
exir_ops.edge.aten.pixel_shuffle.default,
4040
exir_ops.edge.aten.pixel_unshuffle.default,
4141
exir_ops.edge.aten.upsample_bilinear2d.default,
42+
exir_ops.edge.aten.upsample_bilinear2d.vec,
4243
exir_ops.edge.aten.upsample_nearest2d.default,
4344
exir_ops.edge.aten.upsample_nearest2d.vec,
4445
}

backends/qualcomm/_passes/utils.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import torch
88
from executorch.backends.qualcomm.builders.utils import get_parameter
9-
from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING
9+
from executorch.backends.qualcomm.utils.constants import QCOM_DTYPE, QCOM_ENCODING
1010
from executorch.exir.dialects._ops import ops as exir_ops
1111
from torch._subclasses import FakeTensor
1212

@@ -42,6 +42,10 @@ def get_quant_attrs(
4242
value = get_parameter(attr_n, edge_program)
4343
quant_attrs[quant_attr_keys[i - 1]] = value
4444

45+
# remap key for compatibility - block quantization only
46+
if dtype := quant_attrs.get("input_dtype", None):
47+
quant_attrs[QCOM_DTYPE] = dtype
48+
4549
quant_attrs[QCOM_ENCODING] = quant_node.target
4650
return quant_attrs
4751

@@ -62,7 +66,6 @@ def get_passes_dependency_for_capture_program():
6266
AnnotateQuantAttrs,
6367
ConstantI64toI32,
6468
ConvertBmmToMatmul,
65-
ConvertInterpolateWithUpsample2D,
6669
ConvertToLinear,
6770
DecomposeAny,
6871
DecomposeLinalgVectorNorm,
@@ -85,11 +88,9 @@ def get_passes_dependency_for_capture_program():
8588
ConvertToLinear,
8689
RecomposePReLU,
8790
ConvertBmmToMatmul,
88-
ConvertInterpolateWithUpsample2D,
8991
],
90-
ConstantI64toI32: [ConvertInterpolateWithUpsample2D],
92+
ConstantI64toI32: [RemoveRedundancy],
9193
ConvertBmmToMatmul: [ConvertToLinear],
92-
ConvertInterpolateWithUpsample2D: [RemoveRedundancy],
9394
ConvertToLinear: [RecomposePixelUnshuffle],
9495
DecomposeAny: [RemoveRedundancy],
9596
DecomposeLinalgVectorNorm: [RemoveRedundancy],

backends/qualcomm/aot/ir/qcir.fbs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ enum QuantizeType : byte {
5454
AXIS_SCALE_OFFSET,
5555
BW_SCALE_OFFSET,
5656
BW_AXIS_SCALE_OFFSET,
57+
BLOCKWISE_EXPANSION,
58+
UNDEFINED,
59+
}
60+
61+
enum BlockScaleStorageType: byte {
62+
BITWIDTH_SCALE_STORAGE_8 = 0,
63+
BITWIDTH_SCALE_STORAGE_16,
5764
UNDEFINED,
5865
}
5966

@@ -72,6 +79,10 @@ table QuantizeParam {
7279
offsets: [int];
7380
// used by general quantization
7481
data: [ScaleOffset];
82+
// used by block quantization
83+
num_blocks_per_axis: uint;
84+
block_scale_storage_type: BlockScaleStorageType;
85+
block_scale: [ubyte];
7586
}
7687

7788
table Tensor {

backends/qualcomm/aot/ir/qcir_utils.cpp

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,17 +118,22 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
118118
qcir::QuantizeType::BW_SCALE_OFFSET},
119119
{QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET,
120120
qcir::QuantizeType::BW_AXIS_SCALE_OFFSET},
121+
{QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION,
122+
qcir::QuantizeType::BLOCKWISE_EXPANSION},
121123
{QNN_QUANTIZATION_ENCODING_UNDEFINED,
122124
qcir::QuantizeType::UNDEFINED},
123125
};
124126

125127
int32_t axis = 0;
126-
uint32_t bitwidth = 0;
128+
uint32_t bitwidth = 0, num_blocks_per_axis = 0;
127129
auto param = QNN_TENSOR_VER_PTR(tensor)->quantizeParams;
128130
auto quant_type = type_map.at(param.quantizationEncoding);
129131
std::vector<qcir::ScaleOffset> data;
132+
std::vector<uint8_t> block_scale;
130133
std::vector<float> scales;
131134
std::vector<int32_t> offsets;
135+
qcir::BlockScaleStorageType block_scale_storage_type =
136+
qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_8;
132137
switch (quant_type) {
133138
case qcir::QuantizeType::SCALE_OFFSET: {
134139
data.emplace_back(qcir::ScaleOffset(
@@ -160,6 +165,28 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
160165
offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]);
161166
}
162167
} break;
168+
case qcir::QuantizeType::BLOCKWISE_EXPANSION: {
169+
bitwidth = param.blockwiseExpansion->blockScaleBitwidth;
170+
axis = param.blockwiseExpansion->axis;
171+
uint num_channels = QNN_TENSOR_VER_PTR(tensor)->dimensions[axis];
172+
for (uint i = 0; i < num_channels; ++i) {
173+
data.emplace_back(qcir::ScaleOffset(
174+
param.blockwiseExpansion->scaleOffsets[i].scale,
175+
param.blockwiseExpansion->scaleOffsets[i].offset));
176+
}
177+
num_blocks_per_axis = param.blockwiseExpansion->numBlocksPerAxis;
178+
uint multiplier = 1;
179+
if (param.blockwiseExpansion->blockScaleStorageType ==
180+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16) {
181+
multiplier = 2;
182+
block_scale_storage_type =
183+
qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_16;
184+
}
185+
uint total_bytes = num_channels * num_blocks_per_axis * multiplier;
186+
block_scale = std::vector<uint8_t>(
187+
param.blockwiseExpansion->blocksScale8,
188+
param.blockwiseExpansion->blocksScale8 + total_bytes);
189+
} break;
163190
default:
164191
// encodings are not required if lowering with floating point precision
165192
break;
@@ -172,7 +199,10 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
172199
axis,
173200
&scales,
174201
&offsets,
175-
&data);
202+
&data,
203+
num_blocks_per_axis,
204+
block_scale_storage_type,
205+
&block_scale);
176206
}
177207

178208
Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) {
@@ -192,9 +222,14 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) {
192222
QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET},
193223
{qcir::QuantizeType::BW_AXIS_SCALE_OFFSET,
194224
QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET},
225+
{qcir::QuantizeType::BLOCKWISE_EXPANSION,
226+
QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION},
195227
{qcir::QuantizeType::UNDEFINED,
196228
QNN_QUANTIZATION_ENCODING_UNDEFINED},
197229
};
230+
// Qnn_BlockwiseExpansion_t is a pointer type in Qnn_QuantizeParams_t
231+
// need a bookkeeper for guarding life cycle
232+
static std::vector<std::unique_ptr<Qnn_BlockwiseExpansion_t>> block_param;
198233

199234
Qnn_QuantizeParams_t p = QNN_QUANTIZE_PARAMS_INIT;
200235
auto param = tensor->qparam();
@@ -226,6 +261,30 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) {
226261
p.bwAxisScaleOffsetEncoding.offsets =
227262
const_cast<int32_t*>(param->offsets()->data());
228263
} break;
264+
case QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION: {
265+
block_param.emplace_back(std::make_unique<Qnn_BlockwiseExpansion_t>());
266+
p.blockwiseExpansion = block_param.back().get();
267+
p.blockwiseExpansion->axis = param->axis();
268+
p.blockwiseExpansion->scaleOffsets = reinterpret_cast<Qnn_ScaleOffset_t*>(
269+
const_cast<uint8_t*>(param->data()->Data()));
270+
p.blockwiseExpansion->numBlocksPerAxis = param->num_blocks_per_axis();
271+
switch (param->block_scale_storage_type()) {
272+
case qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_8:
273+
p.blockwiseExpansion->blockScaleStorageType =
274+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8;
275+
break;
276+
case qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_16:
277+
p.blockwiseExpansion->blockScaleStorageType =
278+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16;
279+
break;
280+
default:
281+
p.blockwiseExpansion->blockScaleStorageType =
282+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_UNDEFINED;
283+
break;
284+
}
285+
p.blockwiseExpansion->blocksScale8 =
286+
const_cast<uint8_t*>(param->block_scale()->Data());
287+
} break;
229288
default:
230289
// encodings are not required if lowering with floating point precision
231290
break;

backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,28 @@ std::unique_ptr<QuantizeParamsWrapper> CreateQuantizationParamWrapper(
5959
int32_t offset = quant_info["offset"].cast<int32_t>();
6060
quantize_param_wrapper =
6161
std::make_unique<ScaleOffsetQuantizeParamsWrapper>(scale, offset);
62+
} else if (encoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) {
63+
int32_t axis = quant_info["axis"].cast<int32_t>();
64+
std::vector<Qnn_ScaleOffset_t> scale_offset =
65+
quant_info["block_scale_offset"].cast<std::vector<Qnn_ScaleOffset_t>>();
66+
uint32_t num_blocks_per_axis =
67+
quant_info["num_blocks_per_axis"].cast<uint32_t>();
68+
uint32_t block_scale_bitwidth =
69+
quant_info["block_scale_bitwidth"].cast<uint32_t>();
70+
Qnn_BlockwiseExpansionBlockScaleStorageType_t block_storage_type =
71+
quant_info["block_storage_type"]
72+
.cast<Qnn_BlockwiseExpansionBlockScaleStorageType_t>();
73+
std::vector<uint8_t> buf =
74+
quant_info["block_scales"].cast<std::vector<uint8_t>>();
75+
quantize_param_wrapper =
76+
std::make_unique<BlockwiseExpansionQuantizeParamsWrapper>(
77+
axis,
78+
scale_offset,
79+
num_blocks_per_axis,
80+
block_scale_bitwidth,
81+
block_storage_type,
82+
buf.data(),
83+
buf.size());
6284
} else {
6385
QNN_EXECUTORCH_LOG_ERROR(
6486
"Unknown the encoding of quantization: %d", encoding);
@@ -179,9 +201,6 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
179201
.export_values();
180202

181203
py::enum_<Qnn_QuantizationEncoding_t>(m, "Qnn_QuantizationEncoding_t")
182-
.value(
183-
"QNN_QUANTIZATION_ENCODING_UNDEFINED",
184-
Qnn_QuantizationEncoding_t::QNN_QUANTIZATION_ENCODING_UNDEFINED)
185204
.value(
186205
"QNN_QUANTIZATION_ENCODING_SCALE_OFFSET",
187206
Qnn_QuantizationEncoding_t::QNN_QUANTIZATION_ENCODING_SCALE_OFFSET)
@@ -196,6 +215,29 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
196215
"QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET",
197216
Qnn_QuantizationEncoding_t::
198217
QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET)
218+
.value(
219+
"QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION",
220+
Qnn_QuantizationEncoding_t::
221+
QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION)
222+
.value(
223+
"QNN_QUANTIZATION_ENCODING_UNDEFINED",
224+
Qnn_QuantizationEncoding_t::QNN_QUANTIZATION_ENCODING_UNDEFINED)
225+
.export_values();
226+
227+
py::enum_<Qnn_BlockwiseExpansionBlockScaleStorageType_t>(
228+
m, "Qnn_BlockwiseExpansionBlockScaleStorageType_t")
229+
.value(
230+
"QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8",
231+
Qnn_BlockwiseExpansionBlockScaleStorageType_t::
232+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8)
233+
.value(
234+
"QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16",
235+
Qnn_BlockwiseExpansionBlockScaleStorageType_t::
236+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16)
237+
.value(
238+
"QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_UNDEFINED",
239+
Qnn_BlockwiseExpansionBlockScaleStorageType_t::
240+
QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_UNDEFINED)
199241
.export_values();
200242

201243
py::class_<OpWrapper, std::shared_ptr<OpWrapper>>(m, "OpWrapper")
@@ -476,7 +518,6 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
476518
return std::vector<Qnn_ScaleOffset_t>(
477519
aso.scaleOffset, aso.scaleOffset + aso.numScaleOffsets);
478520
});
479-
// op_wrapper.GetParams() get std::vector<ParamWrapper*>
480521
}
481522
} // namespace qnn
482523
} // namespace backends

0 commit comments

Comments
 (0)