@@ -1002,98 +1002,125 @@ function llama_pir_auto_fuse_ffn_attention_qkv_MP2() {
         " --tensor_parallel_config replace_with_c_embedding"
         " --tensor_parallel_config replace_with_parallel_cross_entropy"
     )
-    for tp_config in "${tp_configs[@]}"; do
-        rm -rf $auto_case_out_dir
-        rm -rf $auto_case_log_dir
-        python -u -m paddle.distributed.launch \
-            --gpus "0,1" \
-            --log_dir $auto_case_log_dir \
-            run_pretrain_auto.py \
-            --model_name_or_path "facebook/llama-7b" \
-            --tokenizer_name_or_path "facebook/llama-7b" \
-            --input_dir "./data" \
-            --output_dir $auto_case_out_dir \
-            --split 949,50,1 \
-            --weight_decay 0.01 \
-            --warmup_ratio 0.01 \
-            --warmup_steps 30 \
-            --max_grad_norm 0.0 \
-            --learning_rate 3e-05 \
-            --min_learning_rate 3e-06 \
-            --max_steps 10 \
-            --logging_steps 1 \
-            --eval_steps 1000 \
-            --save_steps 3 \
-            --continue_training 0 \
-            --do_train true \
-            --do_eval false \
-            --do_predict false \
-            --disable_tqdm true \
-            --skip_profile_timer true \
-            --save_total_limit 2 \
-            --device gpu \
-            --disable_tqdm true \
-            --dataloader_num_workers 1 \
-            --distributed_dataloader 0 \
-            --enable_auto_parallel 1 \
-            --per_device_train_batch_size 1 \
-            --gradient_accumulation_steps 1 \
-            --per_device_eval_batch_size 2 \
-            --recompute false \
-            --recompute_use_reentrant true \
-            --recompute_granularity full \
-            --pp_recompute_interval 0 \
-            --bf16 0 \
-            --fp16_opt_level "O2" \
-            --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
-            --amp_custom_white_list "lookup_table" "lookup_table_v2" \
-            --amp_master_grad false \
-            --fuse_attention_ffn false \
-            --fuse_attention_qkv false \
-            --use_flash_attention false \
-            --use_fused_rope true \
-            --use_fused_rms_norm true \
-            --max_seq_length 4096 \
-            --sequence_parallel false \
-            --pipeline_parallel_degree 1 \
-            --sharding_parallel_degree 1 \
-            --tensor_parallel_degree 2 \
-            ${tp_config} \
-            --virtual_pp_degree 1 \
-            --pipeline_schedule_mode "VPP" \
-            --sharding "" \
-            --to_static 1 \
-            --num_hidden_layers 2 \
-            >>${log_path}/$FUNCNAME 2>&1
+    for to_static in "0" "1"; do
+        for tp_config in "${tp_configs[@]}"; do
+            rm -rf $auto_case_out_dir
+            rm -rf $auto_case_log_dir
+            python -u -m paddle.distributed.launch \
+                --gpus "0,1" \
+                --log_dir $auto_case_log_dir \
+                run_pretrain_auto.py \
+                --model_name_or_path "facebook/llama-7b" \
+                --tokenizer_name_or_path "facebook/llama-7b" \
+                --input_dir "./data" \
+                --output_dir $auto_case_out_dir \
+                --split 949,50,1 \
+                --weight_decay 0.01 \
+                --warmup_ratio 0.01 \
+                --warmup_steps 30 \
+                --max_grad_norm 0.0 \
+                --learning_rate 3e-05 \
+                --min_learning_rate 3e-06 \
+                --max_steps 10 \
+                --logging_steps 1 \
+                --eval_steps 1000 \
+                --save_steps 3 \
+                --continue_training 0 \
+                --do_train true \
+                --do_eval false \
+                --do_predict false \
+                --disable_tqdm true \
+                --skip_profile_timer true \
+                --save_total_limit 2 \
+                --device gpu \
+                --disable_tqdm true \
+                --dataloader_num_workers 1 \
+                --distributed_dataloader 0 \
+                --enable_auto_parallel 1 \
+                --per_device_train_batch_size 1 \
+                --gradient_accumulation_steps 1 \
+                --per_device_eval_batch_size 2 \
+                --recompute false \
+                --recompute_use_reentrant true \
+                --recompute_granularity full \
+                --pp_recompute_interval 0 \
+                --bf16 0 \
+                --fp16_opt_level "O2" \
+                --amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
+                --amp_custom_white_list "lookup_table" "lookup_table_v2" \
+                --amp_master_grad false \
+                --fuse_attention_ffn false \
+                --fuse_attention_qkv false \
+                --use_flash_attention false \
+                --use_fused_rope true \
+                --use_fused_rms_norm true \
+                --max_seq_length 4096 \
+                --sequence_parallel false \
+                --pipeline_parallel_degree 1 \
+                --sharding_parallel_degree 1 \
+                --tensor_parallel_degree 2 \
+                ${tp_config} \
+                --virtual_pp_degree 1 \
+                --pipeline_schedule_mode "VPP" \
+                --sharding "" \
+                --to_static ${to_static} \
+                --num_hidden_layers 2 \
+                >>${log_path}/$FUNCNAME 2>&1
 
-        auto_loss_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_ips_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_mem_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
-        echo "auto result: step 2 loss=$auto_loss_2 ips=$auto_ips_2 mem=$auto_mem_2"
-        auto_loss_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
-        loss_md5_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_ips_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
-        auto_mem_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
-        echo "auto result: step 10 loss=$auto_loss_10 ips=$auto_ips_10 mem=$auto_mem_10"
-        if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]]; then
-            # This optimization may result in a discrepancy in accuracy.
-            loss_base_2=10.53477287
-            loss_base_10=9.4961338
-        else
-            loss_base_2=10.53477192
-            loss_base_10=9.4961338
-        fi
-        auto_ips=-1
-        auto_mem=-1
-        ips_base=-1
-        mem_base=-1
-        if [ $IS_A100 -ne 0 ]; then
-            loss_base_2=10.58283806
-            loss_base_10=9.43873405
-        fi
-        check_result $FUNCNAME ${loss_base_2} ${auto_loss_2} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
-        check_result $FUNCNAME ${loss_base_10} ${auto_loss_10} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+            auto_loss_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+            loss_md5_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_ips_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_mem_2=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 2' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+            echo "auto result: step 2 loss=$auto_loss_2 ips=$auto_ips_2 mem=$auto_mem_2"
+            auto_loss_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+            loss_md5_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'loss_md5: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_ips_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'interval_tokens_per_second_per_device: ' '{print $2}' | awk -F ',' '{print $1}'`
+            auto_mem_10=`cat $auto_case_log_dir/workerlog.0 | grep 'global_step: 10' | awk -F 'max_memory_reserved: ' '{print $2}' | awk -F ',' '{print $1}'`
+            echo "auto result: step 10 loss=$auto_loss_10 ips=$auto_ips_10 mem=$auto_mem_10"
+            if [ $to_static -ne 0 ]; then
+                if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]]; then
+                    # This optimization may result in a discrepancy in accuracy.
+                    loss_base_2=10.53477287
+                    loss_base_10=9.4961338
+                else
+                    loss_base_2=10.53477192
+                    loss_base_10=9.4961338
+                fi
+                auto_ips=-1
+                auto_mem=-1
+                ips_base=-1
+                mem_base=-1
+                if [ $IS_A100 -ne 0 ]; then
+                    loss_base_2=10.58283806
+                    loss_base_10=9.43873405
+                fi
+                check_result $FUNCNAME ${loss_base_2} ${auto_loss_2} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+                check_result $FUNCNAME ${loss_base_10} ${auto_loss_10} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+            else
+                if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]]; then
+                    loss_base_2=10.53477287
+                    loss_base_10=9.4961319
+                else
+                    loss_base_2=10.53477287
+                    loss_base_10=9.49613285
+                fi
+                auto_ips=-1
+                auto_mem=-1
+                ips_base=-1
+                mem_base=-1
+                if [ $IS_A100 -ne 0 ]; then
+                    if [[ $tp_config =~ "replace_with_parallel_cross_entropy" ]]; then
+                        loss_base_2=10.58283806
+                        loss_base_10=9.43873215
+                    else
+                        loss_base_2=10.58283806
+                        loss_base_10=9.4387331
+                    fi
+                fi
+                check_result $FUNCNAME ${loss_base_2} ${auto_loss_2} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+                check_result $FUNCNAME ${loss_base_10} ${auto_loss_10} ${ips_base} ${auto_ips} ${mem_base} ${auto_mem}
+            fi
+        done
     done
     export FLAGS_enable_fused_ffn_qkv_pass=0
     echo "=========== $FUNCNAME run end ==========="
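Note: the `grep | awk` pipelines above scrape per-step metrics from the trainer logs. A minimal sketch of how one such pipeline behaves, assuming a workerlog.0 line of the shape implied by the awk field separators (the values below are invented for illustration):

    # Hypothetical trainer log line (field labels taken from the awk -F separators in the script):
    #   global_step: 2, loss: 10.53477287, loss_md5: d41d8cd9, interval_tokens_per_second_per_device: 1234.5, max_memory_reserved: 9876
    # Split on the field label, then trim everything after the next comma:
    echo "global_step: 2, loss: 10.53477287, loss_md5: d41d8cd9, interval_tokens_per_second_per_device: 1234.5, max_memory_reserved: 9876" |
        awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'
    # Prints: 10.53477287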