{
  "data_dir": "dataset/actor/",
  "only_actor": "all",
  "base_model": "Qwen/Qwen2.5-7B-Instruct",
  "output_dir": "out/",
  "gamma": 0.99,
  "tau": 0.005,
  "beta_dpo": 1,
  "beta_q": 1.0,
  "beta_sft": 0.5,
  "num_candidates": 3,
  "k_next": 3,
  "actor_gen_max_new": 30,
  "next_gen_max_new": 30,
  "gen_temperature": 1.5,
  "beta_format_penalty": 1.0,
  "format_penalty_value": -4.0,
  "total_steps": 200000,
  "sft_warmup_steps": 2000,
  "q_pretrain_steps": 2000,
  "valid_every_steps": 2000,
  "valid_n_batches": 20,
  "eval_every_steps": 245645,
  "save_every_steps": 2000,
  "batch_size": 8,
  "grad_accum_steps": 1,
  "lr": 3e-05,
  "lr_q": 0.0003,
  "weight_decay": 0.01,
  "warmup_ratio": 0.001,
  "max_grad_norm": 1.0,
  "max_length": 2048,
  "use_chat_template": false,
  "valid_ratio": 0.0,
  "log_every": 5,
  "color_log": true,
  "seed": 42,
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_r_q": 16,
  "lora_alpha_q": 32,
  "lora_dropout": 0.05,
  "target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
  ],
  "fp16": false,
  "bf16": true,
  "gradient_checkpointing": false,
  "layout": "new_env",
  "eval_verbose": false,
  "gpu_id": 0,
  "test_prompt_dir": "prompts/test/",
  "wandb": true,
  "wandb_project": "overcooked-qdpo",
  "wandb_run_name": "3phase-sft-q-dpo"
}