diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py
index 10c555ae929..f1dfb5f1323 100644
--- a/backends/arm/quantizer/arm_quantizer.py
+++ b/backends/arm/quantizer/arm_quantizer.py
@@ -1109,6 +1109,23 @@ def _remove_annotations(self, model: GraphModule) -> GraphModule:
 
         return model
 
+    def _log_nonquantized_nodes(self, model: GraphModule) -> None:
+        non_quantized_nodes = [
+            n
+            for n in model.graph.nodes
+            if n.meta.get(DISALLOW_TFA_META_KEY, True) and n.op != "get_attr"
+        ]
+        if len(non_quantized_nodes) > 0:
+            msg = """
+----------------------------------------------------------------------------------------------------
+                          PRE-TRANSFORM FOR ANNOTATION QUANTIZATION REPORT
+----------------------------------------------------------------------------------------------------
+The following nodes are not marked for quantization and will not be decomposed in the transform for annotation pipeline:\n"""
+            for node in non_quantized_nodes:
+                msg += f"    {node.name}\n"
+
+            logger.debug(msg)
+
     def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         # Transform_for_annotation should only decompose ops if quantized, which is
         # indicated either by node.meta['DISALLOW_TFA_META_KEY']==False or no such key
@@ -1121,15 +1138,13 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         # run to set DISALLOW_TFA_META_KEY for quantized nodes and all nodes missing
         # this key afterwards are set to DISALLOW_TFA_META_KEY=True.
 
-        reporter = QuantizerReporter(
-            self.quantizers, "PRE-TRANSFORM_FOR_ANNOTATION QUANTIZATION REPORT"  # type: ignore[arg-type]
-        )
         model = super().annotate(model)
-        reporter.log_quantizer_report(model)
         for node in model.graph.nodes:
             if DISALLOW_TFA_META_KEY not in node.meta:
                 node.meta[DISALLOW_TFA_META_KEY] = True
 
+        self._log_nonquantized_nodes(model)
+
         pass_manager = ArmPassManager(self.compile_spec)
         transformed_model = pass_manager.transform_for_annotation_pipeline(model)
 
diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py
index 453c0d3f4cc..190e8a57cd8 100644
--- a/backends/arm/quantizer/arm_quantizer_utils.py
+++ b/backends/arm/quantizer/arm_quantizer_utils.py
@@ -592,46 +592,41 @@ def _annotate_shared_cluster(self, root_node: Node) -> None:
         node_order = {node: index for index, node in enumerate(root_node.graph.nodes)}
         ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))
 
-        if len(adjacent_qspecs) > 0:
-            if len(adjacent_qspecs) > 1:
-                logger.warning(
-                    f"Multiple adjacent quantization specs found for {', '.join([n.name for n in ordered_nodes])}, all nodes will share the input quantization spec of {root_node.name}."
-                )
+        # Ensure the root node is the first one in the graph.
+        root_node = ordered_nodes[0]
+        if len(adjacent_qspecs) > 0:
 
             root_node_float_inputs = self._get_input_nodes_with_float_output(root_node)
-            if len(root_node_float_inputs) == 0:
-                self.report_reject(
-                    ordered_nodes,
-                    "Couldn't find any floating point input to base shared quantization spec on.",
-                )
-                return
-            root_node_first_input = root_node_float_inputs[0]
-
-            shared_qspec = SharedQuantizationSpec((root_node_first_input, root_node))
-            for node in shared_nodes:
-                input_qspec_map: dict[Node, Optional[QuantizationSpec]] = {
-                    n: shared_qspec  # type: ignore[misc]
-                    for n in self._get_input_nodes_with_float_output(node)
-                }
-                if len(self._get_user_nodes_with_float_input(node)) == 0:
-                    output_qspec = None
-                else:
-                    output_qspec = shared_qspec
-                _mark_node_as_quantized(
-                    node, input_qspec_map, output_qspec, is_quantized=True
+            if len(root_node_float_inputs) > 0:
+
+                root_node_first_input = root_node_float_inputs[0]
+                shared_qspec = SharedQuantizationSpec(
+                    (root_node_first_input, root_node)
                 )
+                for node in shared_nodes:
+                    input_qspec_map: dict[Node, Optional[QuantizationSpec]] = {
+                        n: shared_qspec  # type: ignore[misc]
+                        for n in self._get_input_nodes_with_float_output(node)
+                    }
+                    if len(self._get_user_nodes_with_float_input(node)) == 0:
+                        output_qspec = None
+                    else:
+                        output_qspec = shared_qspec
+                    _mark_node_as_quantized(
+                        node, input_qspec_map, output_qspec, is_quantized=True
+                    )
 
-            root_node.meta[Q_ANNOTATION_KEY].input_qspec_map[root_node_first_input] = (
-                adjacent_qspecs[0]
-            )
-            self.report_accept(ordered_nodes)
+                root_node.meta[Q_ANNOTATION_KEY].input_qspec_map[
+                    root_node_first_input
+                ] = adjacent_qspecs[0]
+                self.report_accept(ordered_nodes)
+                return
 
-        else:
-            self.report_reject(
-                ordered_nodes,
-                "Couldn't find any adjacent quantization spec to base shared quantization spec on. You may however quantize these nodes manually if required.",
-            )
-            return
+        self.report_reject(
+            ordered_nodes,
+            "All inputs and outputs to these nodes are non-quantized.",
+        )
+        return
 
     def annotate(self, model: torch.fx.GraphModule) -> None:  # type: ignore[override]
         for node in model.graph.nodes:
diff --git a/backends/cortex_m/quantizer_reporter.py b/backends/cortex_m/quantizer_reporter.py
index 5e423672cd1..72bee90cf8d 100644
--- a/backends/cortex_m/quantizer_reporter.py
+++ b/backends/cortex_m/quantizer_reporter.py
@@ -383,7 +383,9 @@ def unannotated_nodes_report(
         non_quantized_nodes: list[Node] = []
     else:
         non_quantized_nodes = [
-            node for node in model.graph.nodes if Q_ANNOTATION_KEY not in node.meta
+            node
+            for node in model.graph.nodes
+            if Q_ANNOTATION_KEY not in node.meta and node.op != "get_attr"
         ]
 
     rows = []
diff --git a/examples/arm/quantizer_tutorial.ipynb b/examples/arm/quantizer_tutorial.ipynb
index 6c96a939e72..76979316002 100644
--- a/examples/arm/quantizer_tutorial.ipynb
+++ b/examples/arm/quantizer_tutorial.ipynb
@@ -167,9 +167,25 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### The quantization report\n",
+    "### Pre-transform for annotation quantization report\n",
+    "Note that two quantization reports are printed in this case. This is because the quantization\n",
+    "annotator is run twice during quantization, for two different purposes:\n",
     "\n",
-    "In the logged quantization report each quantizer has added one header describing targeted nodes, the used quantization config, and the supported operators / operator patterns. \n",
+    "1. Mark nodes to be decomposed by the `transform_for_annotation` pipeline (PRE-TRANSFORM FOR ANNOTATION QUANTIZATION REPORT)\n",
+    "2. Perform the actual quantization after the `transform_for_annotation` pipeline (FINAL QUANTIZATION REPORT)\n",
+    "\n",
+    "Consider for example the `torch.div` operator, which is decomposed into one multiplication and one\n",
+    "reciprocal operator when quantized. Since the two ops in the decomposition require different quantization parameters, the decomposition needs to happen before quantization, in the `transform_for_annotation` pipeline. On the other hand, the\n",
+    "decomposition must not happen if the operator is to be kept in float.\n",
+    "\n",
+    "**This is important to be aware of when doing mixed quantization: for an operator to be fully quantized,\n",
+    "both the original operator and its decomposition need to be targeted.**\n",
+    "\n",
+    "The pre-transform for annotation report lists all nodes that are not marked for decomposition (if any) to make this\n",
+    "easy to get right. If regular full-graph quantization is done, the report is simply skipped.\n",
+    "\n",
+    "### The final quantization report\n",
+    "In the second quantization report, each quantizer has added a header describing the targeted nodes, the quantization config used, and the supported operators / operator patterns. \n",
     "```\n",
     "PatternQuantizer using NodeNameNodeFinder targeting names: conv2d, relu\n",
     "Annotating with executorch.backends.arm.quantizer.arm_quantizer.get_symmetric_quantization_config(is_per_channel=True)\n",
@@ -187,7 +203,7 @@
     "```\n",
     "       NODE NAME    INPUT QSPEC MAP                             OUTPUT QSPEC MAP\n",
     "    -- -----------  ----------------------------------------    ---------------------\n",
-    "    ╒ conv2d        x: INT8_PER_TENSOR_QSPEC                    NO_QSPEC\n",
+    "    ╒ conv2d        x: INT8_PER_TENSOR_QSPEC                    None\n",
     "    |               _param_constant0: INT8_PER_CHANNEL_QSPEC\n",
     "    |               _param_constant1: DERIVED_QSPEC\n",
     "    ╘ relu \n",
@@ -198,16 +214,6 @@
     "\n",
     "many different quantization annotations for different types of tensors; per tensor for\n",
     "activations, per channel for weights, and a special quantization spec for the int32 bias. \n",
     "\n",
-    "### Pre-transform for annotation vs. final quantization report\n",
-    "One important detail is that there are two reports printed, one named PRE-TRANSFORM_FOR_ANNOTATION QUANTIZATION REPORT,\n",
-    "and one named FINAL QUANTIZATION REPORT. This is related to the fact that some operators has to be decomposed before quantization to ensure\n",
-    "that all \"sub operators\" gets quantized properly. As an example, the division operator in the first report\n",
-    "has decomposed into a reciprocal and multiplication operator in the second. Had it not been marked for quantization\n",
-    "in the first step, it would have remained a single division operator.\n",
-    "\n",
-    "**This is important to be aware of when doing mixed quantization since this means that for an operator to be fully quantized,\n",
-    "both the original operator and the decomposition needs to be targeted.**\n",
-    "\n",
     "### SharedQspecQuantizer\n",
     "Last in the report there is always an additional quantizer applied which is not specified by the user, the SharedQspecQuantizer.\n",
    "It handles data shuffling operators without numerical behaviour such as copies and reshapes to ensure that they are quantized with the same qspec as\n",
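
Below is a minimal standalone sketch (plain PyTorch only, not part of the patch) illustrating the `torch.div` decomposition the notebook text describes: div is numerically a reciprocal followed by a multiplication, and the intermediate reciprocal has its own value range, which is why each op in the decomposition needs its own quantization parameters.

```python
# Minimal sketch, plain PyTorch: why div must be decomposed before quantization.
# torch.div(x, y) is numerically x * reciprocal(y); the intermediate
# reciprocal has a different value range than the final product, so each op
# needs its own (scale, zero_point) when quantized.
import torch

x = torch.randn(8)
y = torch.rand(8) + 0.5  # keep the divisor away from zero

recip = torch.reciprocal(y)  # would be quantized with its own qparams
mul_out = x * recip          # would be quantized with different qparams

assert torch.allclose(torch.div(x, y), mul_out)
```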
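Similarly, here is a rough sketch of the `SharedQuantizationSpec` mechanism the SharedQspecQuantizer builds on, assuming the `torch.ao.quantization.quantizer` export of the class (the patched code may import it from a different location): a data-shuffling op such as reshape is tied to the quantization parameters of its input edge, so no separate observer or requantization is inserted around it.

```python
# Rough sketch of the SharedQuantizationSpec idea behind SharedQspecQuantizer.
# Assumes the torch.ao.quantization.quantizer export; the patch above may
# import the class from elsewhere.
import torch
from torch.ao.quantization.quantizer import SharedQuantizationSpec


class M(torch.nn.Module):
    def forward(self, x):
        return x.reshape(-1)  # data shuffling, no numerical behaviour


gm = torch.fx.symbolic_trace(M())
inp = next(n for n in gm.graph.nodes if n.op == "placeholder")
reshape = next(n for n in gm.graph.nodes if n.op == "call_method")

# The reshape output shares the qparams of its input edge (inp -> reshape),
# analogous to what _annotate_shared_cluster sets up for whole clusters.
shared_qspec = SharedQuantizationSpec((inp, reshape))
```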