Commit 9b3ef71

Add dynamic auto mode cross_entropy CI (#10515)
* add cross_entropy ci
* fix
* fix
1 parent d0044ab commit 9b3ef71

File tree

1 file changed (+118, -91 lines)

scripts/distribute/ci_case_auto.sh

@@ -1002,98 +1002,125 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
         "--tensor_parallel_config replace_with_c_embedding"
         "--tensor_parallel_config replace_with_parallel_cross_entropy"
     )
-    for tp_config in "${tp_configs[@]}"; do
-        rm -rf $auto_case_out_dir
-        rm -rf $auto_case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1" \
-            --log_dir $auto_case_log_dir \
-            run_pretrain_auto.py \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $auto_case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --warmup_steps 30 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 1 \
-            --eval_steps 1000 \
-            --save_steps 3 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --distributed_dataloader 0 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --recompute_use_reentrant true \
-            --recompute_granularity full \
-            --pp_recompute_interval 0 \
-            --bf16 0 \
-            --fp16_opt_level "O2" \
-            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-            --amp_master_grad false \
-            --fuse_attention_ffn false \
-            --fuse_attention_qkv false \
-            --use_flash_attention false \
-            --use_fused_rope true \
-            --use_fused_rms_norm true \
-            --max_seq_length 4096 \
-            --sequence_parallel false \
-            --pipeline_parallel_degree 1 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 2 \
-            ${tp_config} \
-            --virtual_pp_degree 1 \
-            --pipeline_schedule_mode "VPP" \
-            --sharding "" \
-            --to_static 1 \
-            --num_hidden_layers 2 \
-            >>${log_path}/$FUNCNAME 2>&1
+    for to_static in "0" "1"; do
+        for tp_config in "${tp_configs[@]}"; do
+            rm -rf $auto_case_out_dir
+            rm -rf $auto_case_log_dir
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1" \
+                --log_dir $auto_case_log_dir \
+                run_pretrain_auto.py \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $auto_case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --warmup_steps 30 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps 10 \
+                --logging_steps 1 \
+                --eval_steps 1000 \
+                --save_steps 3 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --distributed_dataloader 0 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 1 \
+                --per_device_eval_batch_size 2 \
+                --recompute false \
+                --recompute_use_reentrant true \
+                --recompute_granularity full \
+                --pp_recompute_interval 0 \
+                --bf16 0 \
+                --fp16_opt_level "O2" \
+                --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+                --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+                --amp_master_grad false \
+                --fuse_attention_ffn false \
+                --fuse_attention_qkv false \
+                --use_flash_attention false \
+                --use_fused_rope true \
+                --use_fused_rms_norm true \
+                --max_seq_length 4096 \
+                --sequence_parallel false \
+                --pipeline_parallel_degree 1 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 2 \
+                ${tp_config} \
+                --virtual_pp_degree 1 \
+                --pipeline_schedule_mode "VPP" \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 2 \
+                >>${log_path}/$FUNCNAME 2>&1

-        auto_loss_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_ips_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_mem_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
-        echo "auto result: step 2 loss=$auto_loss_2 ips=$auto_ips_2 mem=$auto_mem_2"
-        auto_loss_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_ips_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_mem_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
-        echo "auto result: step 10 loss=$auto_loss_10 ips=$auto_ips_10 mem=$auto_mem_10"
-        if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]];then
-            # This optimization may result in a discrepancy in accuracy.
-            loss_base_2=10.53477287
-            loss_base_10=9.4961338
-        else
-            loss_base_2=10.53477192
-            loss_base_10=9.4961338
-        fi
-        auto_ips=-1
-        auto_mem=-1
-        ips_base=-1
-        mem_base=-1
-        if [ $IS_A100 -ne 0 ];then
-            loss_base_2=10.58283806
-            loss_base_10=9.43873405
-        fi
-        check_result $FUNCNAME ${loss_base_2} ${auto_loss_2} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
-        check_result $FUNCNAME ${loss_base_10} ${auto_loss_10} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+            auto_loss_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+            loss_md5_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_ips_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_mem_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+            echo "auto result: step 2 loss=$auto_loss_2 ips=$auto_ips_2 mem=$auto_mem_2"
+            auto_loss_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+            loss_md5_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_ips_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_mem_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+            echo "auto result: step 10 loss=$auto_loss_10 ips=$auto_ips_10 mem=$auto_mem_10"
+            if [ $to_static -ne 0 ];then
+                if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]];then
+                    # This optimization may result in a discrepancy in accuracy.
+                    loss_base_2=10.53477287
+                    loss_base_10=9.4961338
+                else
+                    loss_base_2=10.53477192
+                    loss_base_10=9.4961338
+                fi
+                auto_ips=-1
+                auto_mem=-1
+                ips_base=-1
+                mem_base=-1
+                if [ $IS_A100 -ne 0 ];then
+                    loss_base_2=10.58283806
+                    loss_base_10=9.43873405
+                fi
+                check_result $FUNCNAME ${loss_base_2} ${auto_loss_2} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+                check_result $FUNCNAME ${loss_base_10} ${auto_loss_10} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+            else
+                if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]];then
+                    loss_base_2=10.53477287
+                    loss_base_10=9.4961319
+                else
+                    loss_base_2=10.53477287
+                    loss_base_10=9.49613285
+                fi
+                auto_ips=-1
+                auto_mem=-1
+                ips_base=-1
+                mem_base=-1
+                if [ $IS_A100 -ne 0 ];then
+                    if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]];then
+                        loss_base_2=10.58283806
+                        loss_base_10=9.43873215
+                    else
+                        loss_base_2=10.58283806
+                        loss_base_10=9.4387331
+                    fi
+                fi
+                check_result $FUNCNAME ${loss_base_2} ${auto_loss_2} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+                check_result $FUNCNAME ${loss_base_10} ${auto_loss_10} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+            fi
+        done
     done
     export FLAGS_enable_fused_ffn_qkv_pass=0
     echo "=========== $FUNCNAME run end ==========="
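Note on the change: the new outer loop runs every tensor-parallel config twice, once in dynamic mode (--to_static 0) and once in static mode (--to_static 1), and the result check branches on $to_static because the two modes carry slightly different loss baselines.

Each metric is scraped from the trainer log by splitting the matching line first on the metric's label and then on the next comma. A minimal sketch of that extraction, assuming a workerlog.0 line shaped like the fields the script greps for (the sample values below are hypothetical):

# Hypothetical sample line; a real workerlog.0 entry may carry more fields.
line='global_step: 2, loss: 10.53477287, loss_md5: 0f3a9c, interval_tokens_per_second_per_device: 1500.0, max_memory_reserved: 30000'
echo "$line" | grep 'global_step: 2' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'
# prints 10.53477287
echo "$line" | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'
# prints 1500.0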
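In both branches ips_base and mem_base are pinned to -1 alongside auto_ips=-1 and auto_mem=-1, which suggests only the loss values are meaningfully asserted for this case. check_result itself is defined elsewhere in the CI scripts; the stand-in below is an assumption about its contract, sketched only to make the call sites above readable, not the actual implementation:

# Hypothetical stand-in for illustration; the real check_result may compare
# with a tolerance and also validate ips/mem when their baselines are not -1.
function check_result_sketch() {
    local name=$1 loss_base=$2 loss=$3 ips_base=$4 ips=$5 mem_base=$6 mem=$7
    if [ "$loss_base" != "$loss" ]; then
        echo "$name run failed: loss expected $loss_base, got $loss"
        exit 1   # assumption: CI cases fail hard on a baseline mismatch
    fi
    echo "$name loss check passed"
}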
