Fix (sdxl): avoid suppressing checkpoint errors #1034

Merged 2 commits on Oct 1, 2024
src/brevitas/graph/calibrate.py (0 additions & 1 deletion)

```diff
@@ -149,7 +149,6 @@ def __exit__(self, *args, **kwargs):
         for module in self.model.modules():
             if issubclass(type(module), QuantWBIOL):
                 module._quant_load_model_mode = False
-        return True
 
 
 class ClipFloatWeights(Transform):
```
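The deleted `return True` is the entire fix. In Python, a `with` statement suppresses an in-block exception whenever the context manager's `__exit__` returns a truthy value, so the old code silently swallowed any error raised while a checkpoint was being loaded under `load_quant_model_mode` (see the `main.py` usage below). A minimal standalone sketch of the semantics, in plain Python rather than Brevitas code:

```python
class SuppressingCM:
    """Mimics the pre-fix behavior: __exit__ returns True."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return True  # truthy: any exception from the with-block is swallowed


class PropagatingCM:
    """Mimics the fixed behavior: __exit__ implicitly returns None."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass  # falsy (None): exceptions propagate normally


with SuppressingCM():
    raise RuntimeError("size mismatch in state_dict")  # silently swallowed
print("still running: the error above was hidden")

try:
    with PropagatingCM():
        raise RuntimeError("size mismatch in state_dict")
except RuntimeError as e:
    print(f"surfaced as expected: {e}")
```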
src/brevitas_examples/stable_diffusion/main.py (43 additions & 44 deletions)

```diff
@@ -463,18 +463,17 @@ def sdpa_zp_stats_type():
 
     pipe.set_progress_bar_config(disable=True)
 
-    if args.dry_run:
-        with torch.no_grad():
-            run_val_inference(
-                pipe,
-                args.resolution, [calibration_prompts[0]],
-                test_seeds,
-                args.device,
-                dtype,
-                total_steps=1,
-                use_negative_prompts=args.use_negative_prompts,
-                test_latents=latents,
-                guidance_scale=args.guidance_scale)
+    with torch.no_grad():
+        run_val_inference(
+            pipe,
+            args.resolution, [calibration_prompts[0]],
+            test_seeds,
+            args.device,
+            dtype,
+            total_steps=1,
+            use_negative_prompts=args.use_negative_prompts,
+            test_latents=latents,
+            guidance_scale=args.guidance_scale)
 
     if args.load_checkpoint is not None:
         with load_quant_model_mode(pipe.unet):
```
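The hunk above removes the `args.dry_run` gate: the single-step warm-up pass over the first calibration prompt now always runs, still under `torch.no_grad()`. The trailing context also shows `load_quant_model_mode`, the context manager fixed in `calibrate.py` above; before this PR, errors raised while loading a checkpoint inside that `with` block were suppressed. For reference, a minimal sketch of the `torch.no_grad()` behavior the warm-up pass relies on (plain PyTorch, not repository code):

```python
import torch

x = torch.ones(3, requires_grad=True)
y = x * 2
print(y.requires_grad)  # True: autograd recorded this op

with torch.no_grad():
    z = x * 2
print(z.requires_grad)  # False: no graph is built, so no activation memory is held
```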
```diff
@@ -574,6 +573,38 @@ def sdpa_zp_stats_type():
         torch.save(
             pipe.vae.state_dict(), os.path.join(output_dir, f"vae_{args.checkpoint_name}"))
 
+    if args.export_target:
+        # Move to cpu and to float32 to enable CPU export
+        if args.export_cpu_float32:
+            pipe.unet.to('cpu').to(torch.float32)
+        pipe.unet.eval()
+        device = next(iter(pipe.unet.parameters())).device
+        dtype = next(iter(pipe.unet.parameters())).dtype
+
+        # Define tracing input
+        if is_sd_xl:
+            generate_fn = generate_unet_xl_rand_inputs
+            shape = SD_XL_EMBEDDINGS_SHAPE
+        else:
+            generate_fn = generate_unet_21_rand_inputs
+            shape = SD_2_1_EMBEDDINGS_SHAPE
+        trace_inputs = generate_fn(
+            embedding_shape=shape,
+            unet_input_shape=unet_input_shape(args.resolution),
+            device=device,
+            dtype=dtype)
+
+        if args.export_target == 'onnx':
+            if args.weight_quant_granularity == 'per_group':
+                export_manager = BlockQuantProxyLevelManager
+            else:
+                export_manager = StdQCDQONNXManager
+            export_manager.change_weight_export(export_weight_q_node=args.export_weight_q_node)
+            export_onnx(pipe, trace_inputs, output_dir, export_manager)
+        if args.export_target == 'params_only':
+            pipe.to('cpu')
+            export_quant_params(pipe, output_dir, export_vae=args.vae_fp16_fix)
+
     # Perform inference
     if args.prompt > 0 and not args.dry_run:
         # with brevitas_proxy_inference_mode(pipe.unet):
```
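The export block added above (moved up from after the FID evaluation, see the next hunk) probes the UNet's device and dtype from its first parameter before building tracing inputs, since an `nn.Module` itself carries no `.device` or `.dtype` attribute. A minimal sketch of that idiom (plain PyTorch, not Brevitas code):

```python
import torch
from torch import nn

module = nn.Linear(4, 4).to(torch.float16)
# Modules have no .device/.dtype; read them off the first parameter instead.
param = next(iter(module.parameters()))
print(param.device, param.dtype)  # cpu torch.float16
```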
```diff
@@ -619,38 +650,6 @@ def sdpa_zp_stats_type():
         fid.update(quant_images_values, real=False)
         print(f"FID: {float(fid.compute())}")
 
-    if args.export_target:
-        # Move to cpu and to float32 to enable CPU export
-        if args.export_cpu_float32:
-            pipe.unet.to('cpu').to(torch.float32)
-        pipe.unet.eval()
-        device = next(iter(pipe.unet.parameters())).device
-        dtype = next(iter(pipe.unet.parameters())).dtype
-
-        # Define tracing input
-        if is_sd_xl:
-            generate_fn = generate_unet_xl_rand_inputs
-            shape = SD_XL_EMBEDDINGS_SHAPE
-        else:
-            generate_fn = generate_unet_21_rand_inputs
-            shape = SD_2_1_EMBEDDINGS_SHAPE
-        trace_inputs = generate_fn(
-            embedding_shape=shape,
-            unet_input_shape=unet_input_shape(args.resolution),
-            device=device,
-            dtype=dtype)
-
-        if args.export_target == 'onnx':
-            if args.weight_quant_granularity == 'per_group':
-                export_manager = BlockQuantProxyLevelManager
-            else:
-                export_manager = StdQCDQONNXManager
-            export_manager.change_weight_export(export_weight_q_node=args.export_weight_q_node)
-            export_onnx(pipe, trace_inputs, output_dir, export_manager)
-        if args.export_target == 'params_only':
-            pipe.to('cpu')
-            export_quant_params(pipe, output_dir, export_vae=args.vae_fp16_fix)
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Stable Diffusion quantization')
```
src/brevitas_examples/stable_diffusion/sd_quant/export.py (6 additions & 6 deletions)

```diff
@@ -36,9 +36,9 @@ def handle_quant_param(layer, layer_dict):
     output_scale = layer.output_quant.export_handler.symbolic_kwargs[
         'dequantize_symbolic_kwargs']['scale'].data
 
-    layer_dict['output_scale'] = output_scale.numpy().tolist()
+    layer_dict['output_scale'] = output_scale.cpu().numpy().tolist()
     layer_dict['output_scale_shape'] = output_scale.shape
-    layer_dict['input_scale'] = input_scale.numpy().tolist()
+    layer_dict['input_scale'] = input_scale.cpu().numpy().tolist()
     layer_dict['input_scale_shape'] = input_scale.shape
     layer_dict['input_zp'] = input_zp.to(torch.float32).cpu().numpy().tolist()
     layer_dict['input_zp_shape'] = input_zp.shape
```
```diff
@@ -83,7 +83,7 @@ def export_quant_params(pipe, output_dir, export_vae=False):
             full_name = name
             smoothquant_param = module.scale.weight
 
-            layer_dict['smoothquant_mul'] = smoothquant_param.data.numpy().tolist()
+            layer_dict['smoothquant_mul'] = smoothquant_param.data.cpu().numpy().tolist()
             layer_dict['smoothquant_mul_shape'] = module.scale.runtime_shape
             layer_dict = handle_quant_param(module.layer, layer_dict)
 
```
```diff
@@ -94,7 +94,7 @@ def export_quant_params(pipe, output_dir, export_vae=False):
             full_name = name
             smoothquant_param = module.scale.weight
 
-            layer_dict['smoothquant_mul'] = smoothquant_param.data.numpy().tolist()
+            layer_dict['smoothquant_mul'] = smoothquant_param.data.cpu().numpy().tolist()
             layer_dict['smoothquant_mul_shape'] = module.scale.runtime_shape
             quant_params[full_name] = layer_dict
             handled_quant_layers.add(id(module.layer))
```
```diff
@@ -113,9 +113,9 @@ def export_quant_params(pipe, output_dir, export_vae=False):
                 'dequantize_symbolic_kwargs']['scale'].data
             act_zp = module.act_quant.export_handler.symbolic_kwargs[
                 'dequantize_symbolic_kwargs']['zero_point'].data
-            layer_dict['act_scale'] = act_scale.numpy().tolist()
+            layer_dict['act_scale'] = act_scale.cpu().numpy().tolist()
             layer_dict['act_scale_shape'] = act_scale.shape
-            layer_dict['act_zp'] = act_zp.to(torch.float32).numpy().tolist()
+            layer_dict['act_zp'] = act_zp.to(torch.float32).cpu().numpy().tolist()
             layer_dict['act_zp_shape'] = act_zp.shape
             layer_dict['act_zp_dtype'] = str(act_zp.dtype)
             quant_params[full_name] = layer_dict
```
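All six changes in this file apply the same pattern: copy a tensor to host memory with `.cpu()` before calling `.numpy()`, because NumPy cannot view CUDA memory, so without the copy the params-only export fails whenever the pipeline lives on a GPU. A minimal sketch of the failure mode and the fix (plain PyTorch; the CUDA branch only runs when a GPU is present):

```python
import torch

t = torch.tensor([0.5, 1.5])
if torch.cuda.is_available():
    t = t.cuda()
    # t.numpy() here would raise:
    # TypeError: can't convert cuda:0 device type tensor to numpy.
    # Use Tensor.cpu() to copy the tensor to host memory first.
print(t.cpu().numpy().tolist())  # [0.5, 1.5]; .cpu() is a no-op for CPU tensors
```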