diff --git a/configs/sana_config/2048ms/Sana_1600M_img2048.yaml b/configs/sana_config/2048ms/Sana_1600M_img2048.yaml
new file mode 100644
index 0000000..341ff71
--- /dev/null
+++ b/configs/sana_config/2048ms/Sana_1600M_img2048.yaml
@@ -0,0 +1,109 @@
+data: # Dataset settings
+  data_dir: [data/data_public/dir1]
+  image_size: 2048
+  caption_proportion:
+    prompt: 1
+  external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B]
+  external_clipscore_suffixes:
+    - _InternVL2-26B_clip_score
+    - _VILA1-5-13B_clip_score
+    - _prompt_clip_score
+  clip_thr_temperature: 0.1
+  clip_thr: 25.0
+  load_text_feat: false
+  load_vae_feat: false
+  transform: default_train
+  type: SanaWebDatasetMS
+  sort_dataset: false
+# Model configuration
+model:
+  model: SanaMS_1600M_P1_D20
+  image_size: 2048
+  mixed_precision: bf16 # one of 'fp16', 'fp32', 'bf16'
+  fp32_attention: true
+  load_from: # empty value (parses as null)
+  resume_from: # empty value (parses as null)
+  aspect_ratio_type: ASPECT_RATIO_2048
+  multi_scale: true
+  attn_type: linear
+  ffn_type: glumbconv
+  mlp_acts:
+    - silu
+    - silu
+    - # empty third entry (parses as null)
+  mlp_ratio: 2.5
+  use_pe: true
+  pe_interpolation: 1.
+  qk_norm: false
+  class_dropout_prob: 0.1
+  # PAG settings (presumably perturbed-attention guidance - TODO confirm)
+  pag_applied_layers:
+    - 8
+# VAE settings
+vae:
+  vae_type: dc-ae
+  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
+  scale_factor: 0.41407
+  vae_latent_dim: 32
+  vae_downsample_rate: 32
+  sample_posterior: true
+# Text encoder settings
+text_encoder:
+  text_encoder_name: gemma-2-2b-it
+  y_norm: true
+  y_norm_scale_factor: 0.01
+  model_max_length: 300
+  # CHI prompt lines (presumably "complex human instruction" - TODO confirm); the last entry ends with 'User Prompt: '
+  chi_prompt:
+    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
+    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
+    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
+    - 'Here are examples of how to transform or refine prompts:'
+    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
+    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
+    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
+    - 'User Prompt: '
+# Scheduler settings (noise_schedule is linear_flow)
+scheduler:
+  predict_v: true
+  noise_schedule: linear_flow
+  pred_sigma: false
+  flow_shift: 3.0
+  # Logit-normal timestep weighting: scheme name followed by its mean and std
+  weighting_scheme: logit_normal
+  logit_mean: 0.0
+  logit_std: 1.0
+  vis_sampler: flow_dpm-solver
+# Training settings
+train:
+  num_workers: 10
+  seed: 1
+  train_batch_size: 64
+  num_epochs: 100
+  gradient_accumulation_steps: 1
+  grad_checkpointing: true
+  gradient_clip: 0.1
+  optimizer:
+    betas:
+      - 0.9
+      - 0.999
+      - 0.9999
+    eps:
+      - 1.0e-30
+      - 1.0e-16
+    lr: 0.0001
+    type: CAMEWrapper
+    weight_decay: 0.0
+  lr_schedule: constant
+  lr_schedule_args:
+    num_warmup_steps: 2000
+  local_save_vis: true # whether to save logged images locally
+  visualize: true
+  eval_sampling_steps: 500
+  log_interval: 20
+  save_model_epochs: 5
+  save_model_steps: 500
+  work_dir: output/debug # NOTE(review): points at a debug output directory - confirm before a real run
+  online_metric: false
+  eval_metric_step: 2000
+  online_metric_dir: metric_helper