[Conformance][TorchFX] GPU quantization support (#3010)
### Changes

* `--torch-compile-validation` CLI option is added
* `CUDA_FX_TORCH` backend is added to the conformance test
* `FXSQMultiply` is updated to work on both CPU and GPU

### Tests

Local run:

CLI: `python -m pytest test_quantize_conformance.py -k CUDA_FX --data path/to/imagenet`

| Model | Backend | Metric name | Metric value | Metric diff | Num FQ | Num int4 | Num int8 | Compr. time | Total time | RAM MiB | Status | Build url |
|-------|---------|-------------|--------------|-------------|--------|----------|----------|-------------|------------|---------|--------|-----------|
| torchvision/resnet18 | CUDA_FX_TORCH | Acc@1 | 0.6942 | -0.0036 | 30 | 0 | 21 | 0:00:02 | 0:04:14 | 1560 | | |
| torchvision/swin_v2_s | CUDA_FX_TORCH | Acc@1 | 0.83572 | -0.0014 | 149 | 0 | 101 | 0:00:55 | 0:17:24 | 3161 | | |
| torchvision/vit_b_16 | CUDA_FX_TORCH | Acc@1 | 0.80962 | -0.00108 | 62 | 0 | 50 | 0:00:19 | 0:13:39 | 2876 | | |
| torchvision/mobilenet_v3_small_BC | CUDA_FX_TORCH | Acc@1 | 0.66642 | -0.01018 | 61 | 0 | 36 | 0:00:05 | 0:04:09 | 1653 | | |
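With the options added in this PR, the same run can also validate the quantized models through `torch.compile` instead of the converted OpenVINO IR (illustrative invocation; the dataset path is a placeholder):

`python -m pytest test_quantize_conformance.py -k CUDA_FX --data path/to/imagenet --torch-compile-validation`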
daniil-lyakhov authored Feb 3, 2025
1 parent 1fe479c commit cf36f3f
Showing 12 changed files with 208 additions and 46 deletions.
**nncf/experimental/torch/fx/constant_folding.py** (27 additions, 24 deletions)

```diff
@@ -15,6 +15,8 @@
 import torch.fx
 import torch.utils._pytree as pytree
 
+from nncf.torch.utils import get_model_device
+
 aten = torch.ops.aten
@@ -246,28 +248,29 @@ def constant_fold(
     :param constraint_fn: Constraint function which takes a node and returns the constraint:
         should the node be constant folded or not.
     """
-    with torch.no_grad():
-        with torch.utils._python_dispatch._disable_current_modes():
-            cf = ConstantFolder(gm)
-            cf.run()
+    with torch.no_grad(), torch.utils._python_dispatch._disable_current_modes():
+        cf = ConstantFolder(gm)
+        cf.run()
 
-            for node, constant in cf.node_replacements.items():
-                if constraint_fn is not None and not constraint_fn(node):
-                    continue
-                _replace_node_with_constant(gm, node, constant)
-
-            erased_params = []
-            for node in gm.graph.find_nodes(op="get_attr"):
-                if len(node.users) == 0:
-                    if hasattr(gm, node.target):
-                        delattr(gm, node.target)
-                    erased_params.append(node)
-
-            for node in erased_params:
-                gm.graph.erase_node(node)
-
-            # Custom _is_impure function allows to eliminate all layers with zero
-            # users including inplace ops like relu_ besides output and placeholders.
-            gm.graph.eliminate_dead_code(_is_impure)
-            gm.graph.lint()
-            gm.recompile()
+        device = get_model_device(gm)
+        for node, constant in cf.node_replacements.items():
+            if constraint_fn is not None and not constraint_fn(node):
+                continue
+            constant = constant.to(device)
+            _replace_node_with_constant(gm, node, constant)
+
+        erased_params = []
+        for node in gm.graph.find_nodes(op="get_attr"):
+            if len(node.users) == 0:
+                if hasattr(gm, node.target):
+                    delattr(gm, node.target)
+                erased_params.append(node)
+
+        for node in erased_params:
+            gm.graph.erase_node(node)
+
+        # Custom _is_impure function allows to eliminate all layers with zero
+        # users including inplace ops like relu_ besides output and placeholders.
+        gm.graph.eliminate_dead_code(_is_impure)
+        gm.graph.lint()
+        gm.recompile()
```
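The substantive change here is that each folded constant is moved to the model's device before being written back into the graph, so folding a CUDA model no longer produces CPU constants. A minimal sketch of the helper's behavior (assuming `get_model_device` reports the device of the module's parameters):

```python
import torch

from nncf.torch.utils import get_model_device

model = torch.nn.Linear(4, 4)
assert get_model_device(model).type == "cpu"

if torch.cuda.is_available():
    # After moving the model, folded constants follow its parameters to CUDA.
    assert get_model_device(model.cuda()).type == "cuda"
```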
**FXSQMultiply (SmoothQuant multiply module)**

```diff
@@ -42,7 +42,8 @@
 class FXSQMultiply(torch.nn.Module):
     def __init__(self, scale: torch.Tensor):
         super().__init__()
-        self._scale_value = scale
+        self.register_buffer("_scale_value", scale)
+        self._scale_value: torch.Tensor
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return torch.mul(x, self._scale_value)
```
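Registering the scale as a buffer is what makes `FXSQMultiply` work on both CPU and GPU: buffers travel with `Module.to()` / `.cuda()`, while plain tensor attributes stay where they were created. A standalone sketch of the difference (not NNCF code):

```python
import torch


class PlainAttr(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self._scale_value = torch.ones(1)  # plain attribute: invisible to .to()/.cuda()


class Buffered(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.register_buffer("_scale_value", torch.ones(1))  # tracked by the module


if torch.cuda.is_available():
    assert PlainAttr().cuda()._scale_value.device.type == "cpu"   # left behind
    assert Buffered().cuda()._scale_value.device.type == "cuda"   # follows the module
```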
**tests/post_training/conftest.py** (5 additions, 0 deletions)

```diff
@@ -19,6 +19,11 @@ def pytest_addoption(parser):
     parser.addoption("--fp32", action="store_true", help="Test original model")
     parser.addoption("--cuda", action="store_true", help="Enable CUDA_TORCH backend")
     parser.addoption("--benchmark", action="store_true", help="Run benchmark_app")
+    parser.addoption(
+        "--torch-compile-validation",
+        action="store_true",
+        help='Validate TorchFX quantized models via torch.compile(..., backend="openvino")',
+    )
     parser.addoption(
         "--extra-columns",
         action="store_true",
```
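Note that the dashed flag is read back under an underscored name: argparse derives the option destination by replacing `-` with `_`, which is what the session fixture added in `test_quantize_conformance.py` relies on. A minimal illustration (hypothetical standalone conftest):

```python
# conftest.py (sketch)
def pytest_addoption(parser):
    parser.addoption("--torch-compile-validation", action="store_true")


# In a fixture or test, the value is available under the underscored dest:
#     pytestconfig.getoption("torch_compile_validation")
```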
**tests/post_training/data/ptq_reference_data.yaml** (8 additions, 0 deletions)

```diff
@@ -38,6 +38,8 @@ torchvision/resnet18_backend_CUDA_TORCH:
   metric_value: 0.69152
 torchvision/resnet18_backend_FX_TORCH:
   metric_value: 0.6946
+torchvision/resnet18_backend_CUDA_FX_TORCH:
+  metric_value: 0.6946
 torchvision/mobilenet_v3_small_BC_backend_FP32:
   metric_value: 0.6766
 torchvision/mobilenet_v3_small_BC_backend_OV:
@@ -46,18 +48,24 @@ torchvision/mobilenet_v3_small_BC_backend_ONNX:
   metric_value: 0.6679
 torchvision/mobilenet_v3_small_BC_backend_FX_TORCH:
   metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_CUDA_FX_TORCH:
+  metric_value: 0.6664
 torchvision/vit_b_16_backend_FP32:
   metric_value: 0.8107
 torchvision/vit_b_16_backend_OV:
   metric_value: 0.80948
 torchvision/vit_b_16_backend_FX_TORCH:
   metric_value: 0.80922
+torchvision/vit_b_16_backend_CUDA_FX_TORCH:
+  metric_value: 0.80922
 torchvision/swin_v2_s_backend_FP32:
   metric_value: 0.83712
 torchvision/swin_v2_s_backend_OV:
   metric_value: 0.83638
 torchvision/swin_v2_s_backend_FX_TORCH:
   metric_value: 0.8360
+torchvision/swin_v2_s_backend_CUDA_FX_TORCH:
+  metric_value: 0.8360
 timm/crossvit_9_240_backend_CUDA_TORCH:
   metric_value: 0.7275
 timm/crossvit_9_240_backend_FP32:
```
**tests/post_training/model_scope.py** (11 additions, 4 deletions)

```diff
@@ -87,7 +87,14 @@
         "model_id": "resnet18",
         "pipeline_cls": ImageClassificationTorchvision,
         "compression_params": {},
-        "backends": [BackendType.FX_TORCH, BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": [
+            BackendType.FX_TORCH,
+            BackendType.CUDA_FX_TORCH,
+            BackendType.TORCH,
+            BackendType.CUDA_TORCH,
+            BackendType.OV,
+            BackendType.ONNX,
+        ],
         "batch_size": 128,
     },
     {
@@ -98,7 +105,7 @@
             "fast_bias_correction": False,
             "preset": QuantizationPreset.MIXED,
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV, BackendType.ONNX],
         "batch_size": 128,
     },
     {
@@ -109,7 +116,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.15),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV],
+        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV],
         "batch_size": 1,
     },
     {
@@ -120,7 +127,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.5),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV],
+        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV],
         "batch_size": 1,
     },
     # Timm models
```
**tests/post_training/pipelines/base.py** (11 additions, 2 deletions)

```diff
@@ -55,6 +55,7 @@ class BackendType(Enum):
     TORCH = "TORCH"
     CUDA_TORCH = "CUDA_TORCH"
     FX_TORCH = "FX_TORCH"
+    CUDA_FX_TORCH = "CUDA_FX_TORCH"
     ONNX = "ONNX"
     OV = "OV"
     OPTIMUM = "OPTIMUM"
@@ -63,6 +64,7 @@
 NNCF_PTQ_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.ONNX, BackendType.OV]
 ALL_PTQ_BACKENDS = NNCF_PTQ_BACKENDS
 PT_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH]
+FX_BACKENDS = [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH]
 OV_BACKENDS = [BackendType.OV, BackendType.OPTIMUM]
 
 LIMIT_LENGTH_OF_STATUS = 120
@@ -222,6 +224,7 @@ def __init__(
         reference_data: dict,
         no_eval: bool,
         run_benchmark_app: bool,
+        torch_compile_validation: bool = False,
         params: dict = None,
         batch_size: int = 1,
         memory_monitor: bool = False,
@@ -238,6 +241,7 @@
         self.memory_monitor = memory_monitor
         self.no_eval = no_eval
         self.run_benchmark_app = run_benchmark_app
+        self.torch_compile_validation = torch_compile_validation
         self.output_model_dir: Path = self.output_dir / self.reported_name / self.backend.value
         self.output_model_dir.mkdir(parents=True, exist_ok=True)
         self.model_name = f"{self.reported_name}_{self.backend.value}"
@@ -436,12 +440,17 @@ def save_compressed_model(self) -> None:
             )
             self.path_compressed_ir = self.output_model_dir / "model.xml"
             ov.serialize(ov_model, self.path_compressed_ir)
-        elif self.backend == BackendType.FX_TORCH:
-            exported_model = torch.export.export(self.compressed_model, (self.dummy_tensor,))
+        elif self.backend in FX_BACKENDS:
+            exported_model = torch.export.export(self.compressed_model.cpu(), (self.dummy_tensor.cpu(),))
             ov_model = ov.convert_model(exported_model, example_input=self.dummy_tensor.cpu(), input=self.input_size)
             ov_model.reshape(self.input_size)
             self.path_compressed_ir = self.output_model_dir / "model.xml"
             ov.serialize(ov_model, self.path_compressed_ir)
+
+            if self.backend is BackendType.CUDA_FX_TORCH:
+                self.model = self.model.cuda()
+                self.dummy_tensor = self.dummy_tensor.cuda()
+
         elif self.backend == BackendType.ONNX:
             onnx_path = self.output_model_dir / "model.onnx"
             onnx.save(self.compressed_model, str(onnx_path))
```
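The FX branch exports on CPU, as the `.cpu()` calls suggest the conversion path expects, and only afterwards does the CUDA backend move the model back to the GPU. A self-contained sketch of that export, convert, serialize sequence (toy model; the output path is an assumption):

```python
import openvino as ov
import torch

model = torch.nn.Linear(8, 8).eval()
example = torch.randn(1, 8)

# Export on CPU: the resulting torch.export program is what
# ov.convert_model consumes.
exported = torch.export.export(model.cpu(), (example.cpu(),))
ov_model = ov.convert_model(exported, example_input=example)
ov.serialize(ov_model, "model.xml")  # output path is illustrative
```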
**tests/post_training/pipelines/image_classification_base.py** (36 additions, 10 deletions)

```diff
@@ -22,6 +22,7 @@
 import nncf
 from nncf.common.logging.track_progress import track
 from tests.post_training.pipelines.base import DEFAULT_VAL_THREADS
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import ErrorReport
 from tests.post_training.pipelines.base import PTQTestPipeline
@@ -35,18 +36,15 @@ def prepare_calibration_dataset(self):
 
         self.calibration_dataset = nncf.Dataset(loader, self.get_transform_calibration_fn())
 
-    def _validate(self) -> List[ErrorReport]:
-        val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
-        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)
-
-        dataset_size = len(val_loader)
-
-        # Initialize result tensors for async inference support.
-        predictions = np.zeros(dataset_size)
-        references = -1 * np.ones(dataset_size)
+    def _validate_ov(
+        self,
+        val_loader: torch.utils.data.DataLoader,
+        predictions: np.ndarray,
+        references: np.ndarray,
+        dataset_size: int,
+    ):
 
         core = ov.Core()
-
         if os.environ.get("INFERENCE_NUM_THREADS"):
             # Set CPU_THREADS_NUM for OpenVINO inference
             inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS")
@@ -75,6 +73,34 @@ def process_result(request, userdata):
             references[i] = target
 
         infer_queue.wait_all()
+        return predictions, references
+
+    def _validate_torch_compile(
+        self, val_loader: torch.utils.data.DataLoader, predictions: np.ndarray, references: np.ndarray
+    ):
+        compiled_model = torch.compile(self.compressed_model.cpu(), backend="openvino")
+        for i, (images, target) in enumerate(val_loader):
+            # W/A for memory leaks when using torch DataLoader and OpenVINO
+            pred = compiled_model(images)
+            pred = torch.argmax(pred, dim=1)
+            predictions[i] = pred.numpy()
+            references[i] = target.numpy()
+        return predictions, references
+
+    def _validate(self) -> List[ErrorReport]:
+        val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
+        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)
+
+        dataset_size = len(val_loader)
+
+        # Initialize result tensors for async inference support.
+        predictions = np.zeros(dataset_size)
+        references = -1 * np.ones(dataset_size)
+
+        if self.backend in FX_BACKENDS and self.torch_compile_validation:
+            predictions, references = self._validate_torch_compile(val_loader, predictions, references)
+        else:
+            predictions, references = self._validate_ov(val_loader, predictions, references, dataset_size)
 
         acc_top1 = accuracy_score(predictions, references)
```
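The new path validates the quantized model directly through PyTorch. A minimal standalone sketch of the same mechanism (assumes the `openvino` package is installed, whose `openvino.torch` module registers the `torch.compile` backend):

```python
import openvino.torch  # noqa: F401  # registers the "openvino" backend for torch.compile
import torch

# A toy module stands in for the quantized FX GraphModule used by the pipeline.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()

# Same calling convention as eager PyTorch, but inference runs through OpenVINO.
compiled = torch.compile(model.cpu(), backend="openvino")

with torch.no_grad():
    pred = compiled(torch.randn(1, 3, 224, 224)).argmax(dim=1)
```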
**tests/post_training/pipelines/image_classification_torchvision.py** (15 additions, 5 deletions)

```diff
@@ -19,6 +19,7 @@
 from torchvision import models
 
 from nncf.torch import disable_patching
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import PT_BACKENDS
 from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.image_classification_base import ImageClassificationBase
@@ -74,9 +75,12 @@ def prepare_model(self) -> None:
         if self.batch_size > 1:  # Dynamic batch_size shape export
             self.input_size[0] = -1
 
-        if self.backend == BackendType.FX_TORCH:
+        if self.backend in FX_BACKENDS:
             with torch.no_grad():
                 with disable_patching():
+                    if self.backend is BackendType.CUDA_FX_TORCH:
+                        model = model.cuda()
+                        self.dummy_tensor = self.dummy_tensor.cuda()
                     self.model = self.model_params.export_fn(model, (self.dummy_tensor,))
 
         elif self.backend in PT_BACKENDS:
@@ -120,20 +124,26 @@ def _dump_model_fp32(self) -> None:
             )
             ov.serialize(ov_model, self.fp32_model_dir / "model_fp32.xml")
 
-        if self.backend == BackendType.FX_TORCH:
-            exported_model = torch.export.export(self.model, (self.dummy_tensor,))
+        if self.backend in FX_BACKENDS:
+            exported_model = torch.export.export(self.model.cpu(), (self.dummy_tensor.cpu(),))
             ov_model = ov.convert_model(exported_model, example_input=self.dummy_tensor, input=self.input_size)
             ov.serialize(ov_model, self.fp32_model_dir / "fx_model_fp32.xml")
 
+            if self.backend is BackendType.CUDA_FX_TORCH:
+                self.model = self.model.cuda()
+                self.dummy_tensor = self.dummy_tensor.cuda()
+
         if self.backend in [BackendType.FP32, BackendType.OV]:
             ov.serialize(self.model, self.fp32_model_dir / "model_fp32.xml")
 
     def prepare_preprocessor(self) -> None:
         self.transform = self.model_params.weights.transforms()
 
     def get_transform_calibration_fn(self):
-        if self.backend in [BackendType.FX_TORCH] + PT_BACKENDS:
-            device = torch.device("cuda" if self.backend == BackendType.CUDA_TORCH else "cpu")
+        if self.backend in FX_BACKENDS + PT_BACKENDS:
+            device = torch.device(
+                "cuda" if self.backend in [BackendType.CUDA_TORCH, BackendType.CUDA_FX_TORCH] else "cpu"
+            )
 
         def transform_fn(data_item):
             images, _ = data_item
```
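For the CUDA backends, calibration batches have to be moved to the captured model's device before inference. A minimal sketch of such a transform for `nncf.Dataset` (standalone; assumes `(images, labels)` batches):

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def transform_fn(data_item):
    # The loader yields (images, labels); calibration needs only the inputs,
    # placed on the same device as the exported model.
    images, _ = data_item
    return images.to(device)
```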
**tests/post_training/test_quantize_conformance.py** (7 additions, 0 deletions)

```diff
@@ -90,6 +90,11 @@ def fixture_run_benchmark_app(pytestconfig):
     return pytestconfig.getoption("benchmark")
 
 
+@pytest.fixture(scope="session", name="torch_compile_validation")
+def fixture_torch_compile_validation(pytestconfig):
+    return pytestconfig.getoption("torch_compile_validation")
+
+
 @pytest.fixture(scope="session", name="extra_columns")
 def fixture_extra_columns(pytestconfig):
     return pytestconfig.getoption("extra_columns")
@@ -281,6 +286,7 @@ def test_ptq_quantization(
     run_torch_cuda_backend: bool,
     subset_size: Optional[int],
     run_benchmark_app: bool,
+    torch_compile_validation: bool,
     capsys: pytest.CaptureFixture,
     extra_columns: bool,
     memory_monitor: bool,
@@ -309,6 +315,7 @@
         "data_dir": data_dir,
        "no_eval": no_eval,
         "run_benchmark_app": run_benchmark_app,
+        "torch_compile_validation": torch_compile_validation,
         "batch_size": batch_size,
         "memory_monitor": memory_monitor,
     }
```