# RL algorithms
rl_algorithm: "reinforce_plus_plus" # The reinforcement learning algorithm to use; supported: "ppo", "grpo", "reinforce_plus_plus"

# models
actor_model_name_or_path: "Qwen/Qwen2.5-7B-Instruct-1M" # The name or path of the actor model
reward_model_name_or_path: "" # The name or path of the reward model
use_rm_server: true # Whether to use the reward model server
reward_server: "http://127.0.0.1:8731" # The address of the reward model server
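# Note (assumption, not verified against the trainer code): when use_rm_server is true, rewards are presumably
# fetched from the HTTP service at reward_server, which is why reward_model_name_or_path can stay empty here.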

# logging
logging_dir: grpo-logs # Directory for logging
logging_steps: 1 # Number of steps between logging
output_dir: "qwen2.5-7b-kk-dataset-grpo/checkpoints" # Directory for output checkpoints
report_to: "visualdl" # Supported reporting options: "all", "wandb", "tensorboard", "visualdl" (default), "none"
wandb_http_proxy: "http://127.0.0.1:8962" # HTTP proxy for wandb
run_name: "qwen2.5-7b-kk-dataset-grpo" # Name of the run

# data
train_datasets: "ppo-kk/34567ppl/train.jsonl" # Path to the training dataset
eval_datasets: "ppo-kk/5ppl/test.jsonl" # Path to the evaluation dataset
prompt_key: "src" # Key for the prompt in the dataset
response_key: "tgt" # Key for the response in the dataset
dataloader_drop_last: true # Whether to drop the last incomplete batch in the DataLoader
dataloader_shuffle: false # Whether to shuffle the train dataset
balance_batch: true # Whether to balance batch size across dataset_world_size
use_remove_padding: true # Whether to remove padding tokens in the input
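# Illustrative JSONL record shape implied by prompt_key/response_key above (placeholder values,
# not taken from the actual ppo-kk data):
#   {"src": "<prompt text>", "tgt": "<reference answer text>"}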

# distributed training args
tensor_parallel_degree: 2 # Degree of tensor parallelism
sequence_parallel: true # Whether to enable sequence parallelism
sharding_parallel_degree: -1 # Degree of sharding parallelism
sharding: "stage1" # Sharding strategy, e.g., "stage1" or "stage2"
sharding_parallel_config: "enable_release_grads" # Configuration for sharding parallelism
pipeline_parallel_degree: 1 # Degree of pipeline parallelism
virtual_pp_degree: 1 # Degree of virtual pipeline parallelism
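# Note (assumed convention): sharding_parallel_degree = -1 usually means "derive from the remaining devices",
# e.g. on 8 GPUs with tensor_parallel_degree=2 and pipeline_parallel_degree=1 the sharding/data-parallel
# degree would come out to 8 / (2 * 1) = 4.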

# rollout args
max_prompt_len: 512 # Maximum length of the prompt; longer prompts are automatically truncated
max_dec_len: 4096 # Maximum length of the response
min_dec_len: 32 # Minimum length of the response
top_p: 1.0 # Top-p sampling parameter
temperature: 0.7 # Temperature parameter for sampling
repetition_penalty: 1.0 # Repetition penalty parameter
rollout_max_num_seqs: 32 # Maximum number of sequences that can be processed in a single inference pass
rollout_quant_type: "" # Quantization type, e.g., "weight_only_int8"

# training args
do_train: true # Whether to perform training
seed: 42 # Random seed for reproducibility
global_batch_size: 8 # Global batch size for training
global_gen_batch_size: -1 # Global generation batch size for dynamic sampling
global_mini_batch_size: -1 # Mini-batch size for training
rollout_n: 8 # Number of rollouts per prompt
update_iters: 1 # Number of training iterations over the rollout samples
per_device_logprob_batch_size: 8 # Log-probability batch size per device
per_device_reward_batch_size: 8 # Reward batch size per device
per_device_value_batch_size: 8 # Value batch size per device
per_device_train_batch_size: 8 # Training batch size per device
# gradient_accumulation_steps: 1 # Gradient accumulation steps (auto-calculated)
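# Rough sketch of the auto-calculation (assumption only; check the trainer for the exact formula):
#   gradient_accumulation_steps ~ global_batch_size * rollout_n / (per_device_train_batch_size * dp_world_size)
#   e.g. 8 * 8 / (8 * 4) = 2 on an assumed 4-way data-parallel setup.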
num_train_epochs: 6 # Number of training epochs
max_length: 4608 # Maximum length for training; should be at least max_prompt_len + max_dec_len
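# With the rollout settings above: max_prompt_len + max_dec_len = 512 + 4096 = 4608, which this value exactly covers.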
learning_rate: 5e-7 # Learning rate for training
lr_scheduler_type: "constant" # Learning rate scheduler type
weight_decay: 1e-2 # Weight decay for the AdamW optimizer
adam_beta1: 0.9 # AdamW optimizer beta1
adam_beta2: 0.999 # AdamW optimizer beta2
adam_epsilon: 1e-8 # AdamW optimizer epsilon
max_grad_norm: 1.0 # Maximum gradient norm for clipping
max_steps: 3600 # Maximum number of training steps
save_steps: 300 # Number of steps between model saves
save_strategy: "steps" # Strategy for saving models
ignore_save_lr_and_optim: true # Whether to skip saving the learning-rate scheduler and optimizer state
disable_tqdm: true # Whether to disable the tqdm progress bar

# RL args
kl_coeff: 0.0 # KL coefficient
kl_loss_coeff: 0.000 # KL loss coefficient
pg_loss_coeff: 1.0 # Policy gradient loss coefficient
entropy_coeff: 0.0 # Entropy coefficient
clip_range_ratio: 0.2 # The clipping range for the ratio between the old and new policy (PPO algorithm)
clip_range_ratio_low: 0.2 # Lower bound of the clipping range for the ratio between the old and new policy (PPO algorithm)
clip_range_ratio_high: 0.2 # Upper bound of the clipping range for the ratio between the old and new policy (PPO algorithm)
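# Note (assumed semantics, in the spirit of DAPO's "clip higher"): the importance ratio is presumably clipped
# to [1 - clip_range_ratio_low, 1 + clip_range_ratio_high]; with both set to 0.2 this matches standard PPO clipping.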
clip_range_score: 10.0 # The clipping range for the output of the score model. The reward is clipped into [-clip_range_score, clip_range_score].
enable_overlong_reward_buffer: false # Whether to enable the overlong reward buffer
overlong_reward_buffer: 256 # The length of the overlong reward buffer
overlong_penalty_factor: 1.0 # The penalty factor for the overlong reward buffer
clip_range_value: 5.0 # The clipping range for the output of the value model. The value is clipped into [-clip_range_value, clip_range_value].
normalize_reward: false # Whether to normalize rewards
normalize_advantage: false # Whether to normalize advantages
dynamic_sampling: false # Whether to use dynamic sampling, introduced in the DAPO algorithm (https://arxiv.org/abs/2503.14476)
max_gen_batches: 2 # Maximum number of generation batches for dynamic sampling
use_fp32_compute: true # Whether to use fp32 to compute log probabilities, rewards, advantages, and the loss

# eval args
do_eval: true # Whether to perform evaluation
per_device_eval_batch_size: 32 # Evaluation batch size per device
evaluation_strategy: "steps" # Evaluation strategy, e.g., "steps"
eval_steps: 20 # Number of steps between evaluations

# device memory optimization args
use_flash_attention: true # Whether to use fused attention operations
use_fused_rms_norm: false # Whether to use fused RMSNorm operations, which requires installing fused_ln from slm/model_zoo/gpt-3/external_ops
use_fused_rope: false # Whether to use fused RoPE operations
use_fused_head_and_loss_fn: true # Whether to use the fused head and loss function
use_fused_linear: true # Whether to use fused linear operations
recompute: true # Whether to enable gradient checkpointing for memory optimization
recompute_use_reentrant: true # Whether to use reentrant recompute
recompute_granularity: "full" # Granularity of recompute
bf16: true # Whether to use mixed precision with bfloat16
fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
amp_master_grad: false # Whether to use float32 weight gradients for master weights when amp opt level is 'O2'
amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
offload_level: "freeze_model" # Level of model offloading to pinned memory; supported values: freeze_model, train_model, optimizer
release_grads: true # Whether to release gradients
offload_optim: false # Whether to offload the optimizer to pinned memory

# benchmark args
skip_profile_timer: false # Whether to skip the profiling timer