add config file for 2K models;

Signed-off-by: lawrence-cj <[email protected]>
NVlabs · Dec 20, 2024 · 374447b · 374447b
1 parent d367838
commit 374447b
Showing 1 changed file with 109 additions and 0 deletions.
diff --git a/configs/sana_config/2048ms/Sana_1600M_img2048.yaml b/configs/sana_config/2048ms/Sana_1600M_img2048.yaml
@@ -0,0 +1,109 @@
+data:
+  data_dir: [data/data_public/dir1]
+  image_size: 2048
+  caption_proportion:
+    prompt: 1
+  external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
+  external_clipscore_suffixes:
+    - _InternVL2-26B_clip_score
+    - _VILA1-5-13B_clip_score
+    - _prompt_clip_score
+  clip_thr_temperature: 0.1
+  clip_thr: 25.0
+  load_text_feat: false
+  load_vae_feat: false
+  transform: default_train
+  type: SanaWebDatasetMS
+  sort_dataset: false
+# model config
+model:
+  model: SanaMS_1600M_P1_D20
+  image_size: 2048
+  mixed_precision: bf16 # ['fp16', 'fp32', 'bf16']
+  fp32_attention: true
+  load_from:
+  resume_from:
+  aspect_ratio_type: ASPECT_RATIO_2048
+  multi_scale: true
+  attn_type: linear
+  ffn_type: glumbconv
+  mlp_acts:
+    - silu
+    - silu
+    -
+  mlp_ratio: 2.5
+  use_pe: true
+  pe_interpolation: 1.
+  qk_norm: false
+  class_dropout_prob: 0.1
+  # PAG
+  pag_applied_layers:
+    - 8
+# VAE setting
+vae:
+  vae_type: dc-ae
+  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
+  scale_factor: 0.41407
+  vae_latent_dim: 32
+  vae_downsample_rate: 32
+  sample_posterior: true
+# text encoder
+text_encoder:
+  text_encoder_name: gemma-2-2b-it
+  y_norm: true
+  y_norm_scale_factor: 0.01
+  model_max_length: 300
+  # CHI
+  chi_prompt:
+    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
+    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
+    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
+    - 'Here are examples of how to transform or refine prompts:'
+    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
+    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
+    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
+    - 'User Prompt: '
+# Sana schedule Flow
+scheduler:
+  predict_v: true
+  noise_schedule: linear_flow
+  pred_sigma: false
+  flow_shift: 3.0
+  # logit-normal timestep
+  weighting_scheme: logit_normal
+  logit_mean: 0.0
+  logit_std: 1.0
+  vis_sampler: flow_dpm-solver
+# training setting
+train:
+  num_workers: 10
+  seed: 1
+  train_batch_size: 64
+  num_epochs: 100
+  gradient_accumulation_steps: 1
+  grad_checkpointing: true
+  gradient_clip: 0.1
+  optimizer:
+    betas:
+      - 0.9
+      - 0.999
+      - 0.9999
+    eps:
+      - 1.0e-30
+      - 1.0e-16
+    lr: 0.0001
+    type: CAMEWrapper
+    weight_decay: 0.0
+  lr_schedule: constant
+  lr_schedule_args:
+    num_warmup_steps: 2000
+  local_save_vis: true # if save log image locally
+  visualize: true
+  eval_sampling_steps: 500
+  log_interval: 20
+  save_model_epochs: 5
+  save_model_steps: 500
+  work_dir: output/debug
+  online_metric: false
+  eval_metric_step: 2000
+  online_metric_dir: metric_helper