Add run_tutorials github action and fix existing errors (#1546)

jerryzh168 · web-flow · commit 24a78fea5ee2 · 2025-01-10T15:52:28.000-08:00
* Add run_tutorials github action and fix existing errors

Summary:
Added a GHA button for the release oncall to check that the tutorial code is runnable;
it can also be enabled by adding the tag `ciflow/tutorials`.

Test Plan:
CI github action

Reviewers:

Subscribers:

Tasks:

Tags:

* add yml

* add script

* revert profile changes
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
@@ -1,3 +1,4 @@
 mergebot: True
 ciflow_push_tags:
 - ciflow/benchmark
+- ciflow/tutorials
diff --git a/.github/workflows/run_tutorials.yml b/.github/workflows/run_tutorials.yml
@@ -0,0 +1,34 @@
+# Installs a nightly torch build plus this package, then runs tutorials/run_all.sh.
+# Triggered by pushing a tag that matches ciflow/tutorials/*.
+name: Run tutorials
+
+on:
+  push:
+    tags:
+      - ciflow/tutorials/*
+jobs:
+  run_tutorials:
+    runs-on: linux.aws.a100
+    strategy:
+      matrix:
+        torch-spec:
+          - '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: "3.9"
+
+      - name: Run tutorials
+        shell: bash
+        run: |
+          set -eux
+          ${CONDA_RUN} python -m pip install --upgrade pip
+          ${CONDA_RUN} pip install ${{ matrix.torch-spec }}
+          ${CONDA_RUN} pip install -r dev-requirements.txt
+          ${CONDA_RUN} pip install .
+          cd tutorials
+          # run_all.sh declares #!/bin/bash; invoke it with bash (not sh) so it runs under the shell it targets
+          ${CONDA_RUN} bash run_all.sh
diff --git a/torchao/quantization/linear_activation_quantized_tensor.py b/torchao/quantization/linear_activation_quantized_tensor.py
@@ -80,8 +80,10 @@ def _quantized_linear_op(
         input_quant_func = weight_tensor.input_quant_func
         original_weight_tensor = weight_tensor.original_weight_tensor
         quant_kwargs = weight_tensor.quant_kwargs
-        aqt = input_quant_func(input_tensor, **quant_kwargs)
-        return torch.nn.functional.linear(aqt, original_weight_tensor, bias)
+        quantized_tensor = input_quant_func(input_tensor, **quant_kwargs)
+        return torch.nn.functional.linear(
+            quantized_tensor, original_weight_tensor, bias
+        )
 
     @classmethod
     def from_float(
diff --git a/tutorials/calibration_flow/awq_like.py b/tutorials/calibration_flow/awq_like.py
@@ -176,13 +176,13 @@ def test_awq(target_dtype: torch.dtype, mapping_type: MappingType):
     act_obs = AffineQuantizedMinMaxObserver(
         mapping_type,
         target_dtype,
-        granularity_type=PerTensor(),
+        granularity=PerTensor(),
         eps=torch.finfo(torch.float32).eps,
     )
     weight_obs = AffineQuantizedMinMaxObserver(
         mapping_type,
         target_dtype,
-        granularity_type=PerAxis(axis=0),
+        granularity=PerAxis(axis=0),
         eps=torch.finfo(torch.float32).eps,
     )
 
diff --git a/tutorials/calibration_flow/gptq_like.py b/tutorials/calibration_flow/gptq_like.py
@@ -33,21 +33,20 @@
 import torch
 from torch.utils._pytree import tree_flatten, tree_unflatten
 
-from torchao.dtypes import to_affine_quantized_intx_static
+from torchao.dtypes import (
+    to_affine_quantized_intx,
+    to_affine_quantized_intx_static,
+)
 from torchao.quantization import (
+    AffineQuantizedMinMaxObserver,
     LinearActivationQuantizedTensor,
+    MappingType,
+    PerTensor,
+    fake_quantize_affine,
     quantize_,
     to_linear_activation_quantized,
 )
-from torchao.quantization.granularity import PerTensor
-from torchao.quantization.observer import (
-    AffineQuantizedMinMaxObserver,
-)
 from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
-from torchao.quantization.quant_primitives import (
-    MappingType,
-    fake_quantize_affine,
-)
 from torchao.quantization.utils import compute_error
 
 torch.manual_seed(0)
@@ -211,7 +210,7 @@ def forward_pre_hook(
         act_obs = AffineQuantizedMinMaxObserver(
             MappingType.ASYMMETRIC,
             torch.uint8,
-            granularity_type=PerTensor(),
+            granularity=PerTensor(),
             eps=torch.finfo(torch.float32).eps,
             scale_dtype=torch.float32,
             zero_point_dtype=torch.int32,
@@ -254,8 +253,8 @@ def _register_forward_pre_hook(module: torch.nn.Module):
 
 
 # using a function to align with the API in quant_api
-def apply_activation_static_quant():
-    def _apply_activation_static_quant(observed_linear):
+def apply_activation_static_weight_quant():
+    def _apply_activation_static_weight_quant(observed_linear):
         target_dtype = torch.uint8
 
         # we can quantize the weight here as well
@@ -268,16 +267,21 @@ def _apply_activation_static_quant(observed_linear):
         input_quant_func = lambda x: to_affine_quantized_intx_static(
             x, act_scale, act_zero_point, x.shape, target_dtype
         )
+        # for demo purpose only, we quantize the weight here
+        weight = observed_linear.weight
+        weight = to_affine_quantized_intx(
+            weight, MappingType.SYMMETRIC, (1, weight.shape[-1]), torch.int8
+        )
         observed_linear.weight = torch.nn.Parameter(
-            to_linear_activation_quantized(observed_linear.weight, input_quant_func),
+            to_linear_activation_quantized(weight, input_quant_func),
             requires_grad=False,
         )
 
         del observed_linear.input_scale
         del observed_linear.input_zp
         return observed_linear
 
-    return _apply_activation_static_quant
+    return _apply_activation_static_weight_quant
 
 
 example_inputs = (torch.randn(32, 64),)
@@ -294,7 +298,7 @@ def _apply_activation_static_quant(observed_linear):
 
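-# just quantizing activation since we only observed quantization, this could be extended to support
-# quantizing weight as well
-quantize_(m, apply_activation_static_quant(), _is_linear)
+# quantize the activation statically (to_affine_quantized_intx_static) and, for demo purposes,
+# quantize the weight as well
+quantize_(m, apply_activation_static_weight_quant(), _is_linear)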
 for l in m.modules():
     if isinstance(l, torch.nn.Linear):
         assert isinstance(l.weight, LinearActivationQuantizedTensor)
diff --git a/tutorials/calibration_flow/static_quant.py b/tutorials/calibration_flow/static_quant.py
@@ -13,6 +13,7 @@
     to_affine_quantized_floatx_static,
     to_affine_quantized_intx_static,
 )
+from torchao.float8.inference import Float8MMConfig
 from torchao.quantization import quantize_, to_linear_activation_quantized
 from torchao.quantization.granularity import (
     PerAxis,
@@ -26,6 +27,7 @@
     MappingType,
 )
 from torchao.quantization.utils import compute_error
+from torchao.utils import is_sm_at_least_90
 
 
 class ObservedLinear(torch.nn.Linear):
@@ -90,12 +92,13 @@ def weight_quant_func(weight):
                     weight, weight_scale, weight_zero_point, block_size, target_dtype
                 )
             elif target_dtype == torch.float8_e4m3fn:
+                mm_config = Float8MMConfig(use_fast_accum=True)
                 return to_affine_quantized_floatx_static(
                     weight,
                     weight_scale,
                     block_size,
                     target_dtype,
-                    Float8Layout(mm_config=None),
+                    Float8Layout(mm_config=mm_config),
                 )
             else:
                 raise ValueError(f"Unsupported target dtype {target_dtype}")
@@ -248,15 +251,15 @@ def test_static_quant(target_dtype: torch.dtype, mapping_type: MappingType):
     act_obs = AffineQuantizedMinMaxObserver(
         mapping_type,
         target_dtype,
-        granularity_type=PerTensor(),
+        granularity=PerTensor(),
         eps=torch.finfo(torch.float32).eps,
         scale_dtype=torch.float32,
         zero_point_dtype=torch.float32,
     )
     weight_obs = AffineQuantizedMinMaxObserver(
         mapping_type,
         target_dtype,
-        granularity_type=PerAxis(axis=0),
+        granularity=PerAxis(axis=0),
         eps=torch.finfo(torch.float32).eps,
         scale_dtype=torch.float32,
         zero_point_dtype=torch.float32,
@@ -293,4 +296,6 @@ def test_static_quant(target_dtype: torch.dtype, mapping_type: MappingType):
 
 if __name__ == "__main__":
     test_static_quant(torch.uint8, MappingType.ASYMMETRIC)
-    test_static_quant(torch.float8_e4m3fn, MappingType.SYMMETRIC)
+    if is_sm_at_least_90():
+        # this is testing per row float8 quant
+        test_static_quant(torch.float8_e4m3fn, MappingType.SYMMETRIC)
diff --git a/tutorials/developer_api_guide/my_trainable_tensor_subclass.py b/tutorials/developer_api_guide/my_trainable_tensor_subclass.py
@@ -11,7 +11,7 @@
 """
 
 import torch
-from my_dtype_tensor_subclass import MyDTypeLayout, MyDTypeTensor
+from my_dtype_tensor_subclass import MyDTypeTensor, MyDTypeTensorImpl
 from torch.utils._python_dispatch import return_and_correct_aliasing
 
 from torchao.dtypes.utils import Layout, PlainLayout
@@ -35,7 +35,7 @@ def _quantize(
         cls,
         input_float: torch.Tensor,
         _layout: Layout,
-    ) -> MyDTypeLayout:
+    ) -> MyDTypeTensorImpl:
         """
         Convert from a floating point tensor (fp32/fp16/bf16) to the desired dtype.
         """
diff --git a/tutorials/huggingface_24sparse_example.py b/tutorials/huggingface_24sparse_example.py
diff --git a/tutorials/run_all.sh b/tutorials/run_all.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Run every tutorial found under the current directory.
+#
+# A directory containing run.sh is run through that script (from inside the
+# directory); otherwise each *.py file directly inside it is run, using
+# torchrun for paths containing "tensor_parallel" and python for the rest.
+# Exits non-zero if any tutorial fails, so CI cannot pass with a broken tutorial.
+#
+# Each `while` loop sits on the right-hand side of a pipe and runs in an
+# explicit subshell; failures are reported through the subshell's exit status
+# because variable assignments made there are not visible outside it.
+find . -type d | (
+  failed=0
+  while IFS= read -r dir; do
+    if [ -f "$dir/run.sh" ]; then
+      echo "Running: $dir/run.sh"
+      # subshell keeps the cd local to this tutorial
+      (cd "$dir" && bash run.sh) || { echo "FAILED: $dir/run.sh"; failed=1; }
+    else
+      find "$dir" -maxdepth 1 -name "*.py" | (
+        dir_failed=0
+        while IFS= read -r file; do
+          # build the command once so the logged line always matches what runs
+          case "$file" in
+            *tensor_parallel*)
+              set -- torchrun --standalone --nnodes=1 --nproc-per-node=4 "$file"
+              ;;
+            *)
+              set -- python "$file"
+              ;;
+          esac
+          echo "Running: $*"
+          "$@" || { echo "FAILED: $*"; dir_failed=1; }
+        done
+        exit "$dir_failed"
+      ) || failed=1
+    fi
+  done
+  exit "$failed"
+)