{
  "data_dir": "dataset/actor/",
  "only_actor": "all",
  "base_model": "Qwen/Qwen2.5-7B-Instruct",
  "output_dir": "out/",
  "gamma": 0.99,
  "tau": 0.005,
  "beta_dpo": 1,
  "beta_q": 1.0,
  "beta_sft": 0.5,
  "num_candidates": 3,
  "k_next": 3,
  "actor_gen_max_new": 30,
  "next_gen_max_new": 30,
  "gen_temperature": 1.5,
  "beta_format_penalty": 1.0,
  "format_penalty_value": -4.0,
  "total_steps": 200000,
  "sft_warmup_steps": 2000,
  "q_pretrain_steps": 2000,
  "valid_every_steps": 2000,
  "valid_n_batches": 20,
  "eval_every_steps": 245645,
  "save_every_steps": 2000,
  "batch_size": 8,
  "grad_accum_steps": 1,
  "lr": 3e-05,
  "lr_q": 0.0003,
  "weight_decay": 0.01,
  "warmup_ratio": 0.001,
  "max_grad_norm": 1.0,
  "max_length": 2048,
  "use_chat_template": false,
  "valid_ratio": 0.0,
  "log_every": 5,
  "color_log": true,
  "seed": 42,
  "lora_r": 16,
  "lora_alpha": 32,
  "lora_r_q": 16,
  "lora_alpha_q": 32,
  "lora_dropout": 0.05,
  "target_modules": [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
  ],
  "fp16": false,
  "bf16": true,
  "gradient_checkpointing": false,
  "layout": "new_env",
  "eval_verbose": false,
  "gpu_id": 0,
  "test_prompt_dir": "prompts/test/",
  "wandb": true,
  "wandb_project": "overcooked-qdpo",
  "wandb_run_name": "3phase-sft-q-dpo"
}