Commit 4805efd

create staticmethod for quantizing weights of QATLinear and QATEmbedding
Differential Revision: D73201409
Pull Request resolved: #2079
1 parent 7b05105

File tree

2 files changed (+64, -39 lines)


torchao/quantization/qat/embedding.py (+30, -17)
```diff
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, Optional
+from typing import Any, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -196,15 +196,40 @@ def convert(
         """
         self._convert_helper(model)
         return model
+
+    @staticmethod
+    def quantize_weights(
+        weight: torch.Tensor,
+        bit_width: int,
+        group_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Helper function to quantize weights
+        """
+        (qmin, qmax) = _get_qmin_qmax(bit_width)
+        (s, zp) = get_group_qparams_symmetric(
+            weight, bit_width, group_size
+        )
+        from torchao._executorch_ops import (
+            _quantized_decomposed_quantize_per_channel_group_wrapper,
+        )
+        q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
+            weight,
+            s,
+            zp,
+            qmin,
+            qmax,
+            torch.int8,
+            group_size,
+        )
+        return (q_weight, s, zp)
+
 
     def _convert_helper(self, module: torch.nn.Module):
         """
         Helper function to recursively swap `Int4WeightOnlyQATEmbedding`
         modules with `Int4WeightOnlyEmbedding`
         """
-        from torchao._executorch_ops import (
-            _quantized_decomposed_quantize_per_channel_group_wrapper,
-        )
 
         for name, child in module.named_children():
             if isinstance(child, Int4WeightOnlyQATEmbedding):
@@ -230,20 +255,8 @@ def _convert_helper(self, module: torch.nn.Module):
                 )
                 setattr(module, name, quantized_embedding)
 
+                q_weight, s, zp = self.quantize_weights(child.weight, self.bit_width, group_size)
                 # Load weights and qparams into quantized embedding
-                (qmin, qmax) = _get_qmin_qmax(self.bit_width)
-                (s, zp) = get_group_qparams_symmetric(
-                    child.weight, self.bit_width, group_size
-                )
-                q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
-                    child.weight,
-                    s,
-                    zp,
-                    qmin,
-                    qmax,
-                    torch.int8,
-                    group_size,
-                )
                 quantized_embedding.weight = q_weight
                 quantized_embedding.scale = s.to(scale_precision)
                 quantized_embedding.zero_point = zp.to(zero_point_precision)
```
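
With `quantize_weights` exposed as a staticmethod, an embedding weight tensor can be quantized without constructing the quantizer or running the full module swap. A minimal sketch of such a call follows; the quantizer class name (`Int4WeightOnlyEmbeddingQATQuantizer`) and the tensor shapes are assumptions for illustration, not taken from this diff.

```python
# Hedged sketch: assumes the staticmethod is reachable from the embedding QAT
# quantizer class (named Int4WeightOnlyEmbeddingQATQuantizer here); verify the
# exact class name in torchao/quantization/qat/embedding.py.
import torch

from torchao.quantization.qat.embedding import Int4WeightOnlyEmbeddingQATQuantizer

weight = torch.randn(128, 256)  # example embedding table: 128 rows, dim 256

# No quantizer instance needed: the helper depends only on the tensor,
# the bit width, and the group size.
q_weight, scales, zero_points = Int4WeightOnlyEmbeddingQATQuantizer.quantize_weights(
    weight,
    4,   # bit_width
    32,  # group_size (embedding dim must be divisible by this)
)
print(q_weight.dtype)  # torch.int8 container holding the 4-bit values
```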

torchao/quantization/qat/linear.py (+34, -22)
```diff
@@ -4,7 +4,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, Optional
+from typing import Any, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -197,6 +197,36 @@ def convert(
     ) -> torch.nn.Module:
         self._convert_qat_linear_8da4w(model)
         return model
+
+    @staticmethod
+    def quantize_weights(
+        weight: torch.Tensor,
+        group_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Helper function to quantize weights
+        """
+        # Load weights and qparams into quantized linear
+        n_bit = 4
+        (qmin, qmax) = _get_qmin_qmax(n_bit)
+        (s, zp) = get_group_qparams_symmetric(
+            weight, n_bit, group_size
+        )
+        from torchao._executorch_ops import (
+            _quantized_decomposed_quantize_per_channel_group_wrapper,
+        )
+
+        q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
+            weight,
+            s,
+            zp,
+            qmin,
+            qmax,
+            torch.int8,
+            group_size,
+        )
+        return (q_weight, s, zp)
+
 
     def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
         """
@@ -215,28 +245,10 @@ def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
                 )
                 setattr(module, name, quantized_linear)
 
-                # Load weights and qparams into quantized linear
-                n_bit = 4
-                (qmin, qmax) = _get_qmin_qmax(n_bit)
-                (s, zp) = get_group_qparams_symmetric(
-                    child.weight, n_bit, config.group_size
-                )
-                from torchao._executorch_ops import (
-                    _quantized_decomposed_quantize_per_channel_group_wrapper,
-                )
-
-                q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
-                    child.weight,
-                    s,
-                    zp,
-                    qmin,
-                    qmax,
-                    torch.int8,
-                    config.group_size,
-                )
+                q_weight, scales, zeros = self.quantize_weights(child.weight, config.group_size)
                 quantized_linear.weight = q_weight
-                quantized_linear.scales = s
-                quantized_linear.zeros = zp
+                quantized_linear.scales = scales
+                quantized_linear.zeros = zeros
                 if child.bias is not None:
                     quantized_linear.bias = child.bias
                 else:
```
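
The same pattern applies on the linear side, where the helper fixes the weight bit width at 4 (`n_bit = 4`) and only takes the group size. A minimal sketch follows; the quantizer class name is an assumption inferred from the `_convert_qat_linear_8da4w` method name, not stated in this diff.

```python
# Hedged sketch: Int8DynActInt4WeightQATQuantizer is assumed to be the 8da4w QAT
# quantizer class; verify the exact name in torchao/quantization/qat/linear.py.
import torch

from torchao.quantization.qat.linear import Int8DynActInt4WeightQATQuantizer

weight = torch.randn(64, 256)  # example linear weight: out_features=64, in_features=256

# Only the group size is passed; the helper hard-codes n_bit = 4 internally.
q_weight, scales, zeros = Int8DynActInt4WeightQATQuantizer.quantize_weights(weight, 32)
```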
