|
| 1 | +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import os |
| 16 | +from dataclasses import dataclass, field |
| 17 | +from functools import partial |
| 18 | + |
| 19 | +import paddle |
| 20 | +from data import custom_instruction_convert_example |
| 21 | +from utils import GLMTrainer |
| 22 | + |
| 23 | +from paddlenlp.data import DefaultDataCollator |
| 24 | +from paddlenlp.datasets import load_dataset |
| 25 | +from paddlenlp.layers import LoRAConfig, LoRAModel |
| 26 | +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint |
| 27 | +from paddlenlp.transformers import AutoModelForConditionalGeneration, AutoTokenizer |
| 28 | +from paddlenlp.utils.log import logger |
| 29 | + |
| 30 | + |
@dataclass
class DataArgument:
    """Command-line options selecting the dataset and configuring text generation."""

    # Dataset selection.
    task_name: str = field(
        metadata={"help": "The name of task."},
        default="school_math_0.25M",
    )
    data_name: str = field(
        metadata={"help": "The name of data."},
        default="bellegroup",
    )

    # Sequence length limits.
    src_length: int = field(
        metadata={"help": "The max length of source text."},
        default=608,
    )
    tgt_length: int = field(
        metadata={"help": "The max length of target text."},
        default=160,
    )
    min_tgt_length: int = field(
        metadata={"help": "The min length of target text."},
        default=55,
    )

    # Decoding options.
    length_penalty: float = field(
        metadata={"help": "The length penalty."},
        default=0.7,
    )
    no_repeat_ngram_size: int = field(
        metadata={"help": "The no repeat ngram size."},
        default=3,
    )
    num_beams: int = field(
        metadata={"help": "The number of beams."},
        default=5,
    )
    select_topk: bool = field(
        metadata={"help": "Whether to select top k tokens for generation."},
        default=True,
    )
    top_p: float = field(
        metadata={"help": "The cumulative probability for top-p-filtering in the 'sampling' strategy."},
        default=0.0,
    )
    top_k: int = field(
        metadata={
            "help": "The number of highest probability tokens to keep for top-k-filtering in the 'sampling' strategy."
        },
        default=0,
    )

    # No help text is defined for this flag; it defaults to off.
    no_block_position: bool = field(default=False)
| 52 | + |
| 53 | + |
@dataclass
class ModelArgument:
    """Command-line options selecting the pretrained model and its fine-tuning knobs."""

    # Which checkpoint to load.
    model_name_or_path: str = field(
        metadata={"help": "Build-in pretrained model name or the path to local model."},
        default="THUDM/glm-2b",
    )

    # Optimisation-related settings.
    label_smoothing: float = field(
        metadata={"help": "The label smoothing parameter."},
        default=0.1,
    )
    lr_decay_ratio: float = field(
        metadata={"help": "The ratio for learning rate decrease"},
        default=0.1,
    )

    # Parameter-efficient fine-tuning switch.
    lora: bool = field(
        metadata={"help": "Whether to use LoRA technique"},
        default=False,
    )
| 62 | + |
| 63 | + |
def main():
    """Fine-tune and/or evaluate a GLM conditional-generation model.

    Parses ``ModelArgument``, ``DataArgument`` and ``TrainingArguments`` from the
    command line, loads the pretrained model and tokenizer (optionally wrapping the
    model with LoRA), converts the ``train``/``dev`` splits into model inputs and
    drives ``GLMTrainer`` for training (``--do_train``) and evaluation (``--do_eval``).

    Raises:
        ValueError: If training is requested, ``output_dir`` exists with more than one
            entry, contains no checkpoint, and ``--overwrite_output_dir`` is not set.
    """
    parser = PdArgumentParser((ModelArgument, DataArgument, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")
    # Copy the model-level hyper-parameters onto the arguments object handed to the
    # trainer (presumably read by GLMTrainer; confirm against its implementation).
    training_args.label_smoothing = model_args.label_smoothing
    training_args.lr_decay_ratio = model_args.lr_decay_ratio

    paddle.set_device(training_args.device)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        # NOTE(review): the threshold is "> 1", so a directory holding a single entry is
        # still accepted even though the message says "not empty" - confirm this is intended.
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 1:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Only pick an explicit parameter dtype for O2 mixed precision; otherwise leave it
    # to the loader's default. If both flags are set, bf16 takes precedence.
    dtype = None
    if training_args.fp16_opt_level == "O2":
        if training_args.fp16:
            dtype = "float16"
        if training_args.bf16:
            dtype = "bfloat16"

    # Load the pretrained language model.
    model = AutoModelForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        output_predict=True,
        parallel_output=True,
        load_state_as_np=True,
        dtype=dtype,  # todo enable set dtype to avoid additional mem usage
        tensor_parallel_degree=training_args.tensor_parallel_degree,
        tensor_parallel_rank=training_args.tensor_parallel_rank,
    )
    if model_args.lora:
        # TODO: hardcode parameters for now. Change after MergedLoRA is introduced
        lora_config = LoRAConfig(
            target_modules=[".*query_key_value.*"],
            r=4,
            lora_alpha=8,
            merge_weights=True,
            enable_lora_list=[[True, False, True]],
            tensor_parallel_degree=training_args.tensor_parallel_degree,
        )
        model = LoRAModel(model, lora_config)
        model.mark_only_lora_as_trainable()
        model.print_trainable_parameters()

    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

    # Load the dataset.
    train_ds, dev_ds = load_dataset(data_args.data_name, data_args.task_name, splits=["train", "dev"])

    trans_func = partial(custom_instruction_convert_example, tokenizer=tokenizer, data_args=data_args)
    train_ds = train_ds.map(partial(trans_func, is_test=False, is_do_generation=False))
    # Rebind the mapped dev split so the trainer and the explicit evaluate() call below
    # are guaranteed to see the converted examples, regardless of whether map() mutates
    # the dataset in place or returns a new object.
    dev_ds = dev_ds.map(partial(trans_func, is_do_generation=False))
    collate_fn = DefaultDataCollator()

    trainer = GLMTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        tokenizer=tokenizer,
        do_generation=False,
        data_collator=collate_fn,
    )
    if training_args.fp16_opt_level == "O2":
        trainer.disable_autocast_context_manager()

    if training_args.do_train:
        # An explicit --resume_from_checkpoint wins over the checkpoint auto-detected in
        # output_dir; previously the explicit value was silently ignored.
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model(merge_tensor_parallel=training_args.tensor_parallel_degree > 1)
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    if training_args.do_eval:
        # Metrics are reported under the "test" prefix although they come from the dev split.
        eval_result = trainer.evaluate(dev_ds)
        trainer.log_metrics("test", eval_result)
| 160 | + |
# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments