tmp #7

Merged
merged 3 commits into from May 19, 2023

3 changes: 2 additions & 1 deletion megatron/arguments.py
@@ -664,7 +664,8 @@ def _add_learning_rate_args(parser):
                        'from checkpoint and ignore input arguments.')
     group.add_argument('--universal-checkpoint', action='store_true',
                        help='Loading a universal format checkpoint.')
-
+    group.add_argument('--reset-progress', action='store_true', default=None,
+                       help='Reset iteration to 0 & do not load args.')
     return parser


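Note that `--reset-progress` only changes behavior at checkpoint-load time. A minimal, self-contained sketch of the intended semantics follows; the parser lines mirror the diff above, while `resolve_start_iteration` is a hypothetical stand-in for the guard patched in megatron/checkpointing.py below.

import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--finetune', action='store_true')
    # The flag added in this PR.
    parser.add_argument('--reset-progress', action='store_true', default=None,
                        help='Reset iteration to 0 & do not load args.')
    return parser

def resolve_start_iteration(args, state_dict, release=False):
    # Start from iteration 0 when finetuning, when loading a release
    # checkpoint, or when --reset-progress is given; otherwise resume
    # from the iteration stored in the checkpoint.
    if args.finetune or release or args.reset_progress:
        return 0
    return state_dict['iteration']

args = build_parser().parse_args(['--reset-progress'])
print(resolve_start_iteration(args, {'iteration': 5000}))  # -> 0
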
6 changes: 3 additions & 3 deletions megatron/checkpointing.py
@@ -342,7 +342,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True
     set_checkpoint_version(state_dict.get('checkpoint_version', 0))
 
     # Set iteration.
-    if args.finetune or release:
+    if args.finetune or release or args.reset_progress:
         iteration = 0
     else:
         try:
@@ -361,7 +361,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True
     # Check arguments.
     assert args.consumed_train_samples == 0
     assert args.consumed_valid_samples == 0
-    if 'args' in state_dict:
+    if 'args' in state_dict and not args.reset_progress:
         checkpoint_args = state_dict['args']
         if not args.universal_checkpoint:
             check_checkpoint_args(checkpoint_args)
@@ -480,4 +480,4 @@ def _checkpoint_info():
     return {
         "padded_vocab_size": args.padded_vocab_size,
         "original_vocab_size": tokenizer.vocab_size,
-    }
+    }
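
The second hunk is subtler than the iteration reset: under `--reset-progress` the checkpoint's stored `args` are never consulted, so the consistency checks and consumed-sample bookkeeping that normally follow are skipped and the run keeps its command-line values. A sketch of the resulting control flow; both functions here are illustrative stand-ins, not the real Megatron helpers.

def check_checkpoint_args(checkpoint_args):
    # Stand-in for Megatron's real check, which asserts that key settings
    # (num_layers, hidden_size, parallelism degrees, ...) match the run.
    pass

def maybe_apply_checkpoint_args(args, state_dict):
    # With --reset-progress the stored training args are ignored entirely,
    # which is what lets a run reuse the weights while restarting progress.
    if 'args' in state_dict and not args.reset_progress:
        checkpoint_args = state_dict['args']
        if not args.universal_checkpoint:
            check_checkpoint_args(checkpoint_args)
        return checkpoint_args
    return None  # caller falls back to the freshly parsed CLI arguments
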
72 changes: 3 additions & 69 deletions megatron/tokenizer/tokenizer.py
@@ -40,9 +40,7 @@ def build_tokenizer(args):
                                   vocab_extra_ids=args.vocab_extra_ids)
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
-        tokenizer = _GPT2BPETokenizer(
-            args.vocab_file, args.merge_file, vocab_extra_ids=args.vocab_extra_ids
-        )
+        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == "PretrainedFromHF":
         assert args.tokenizer_name_or_path is not None
 
@@ -288,36 +286,13 @@ def additional_special_tokens(self, value):
 class _GPT2BPETokenizer(AbstractTokenizer):
     """Original GPT2 BPE tokenizer."""
 
-    def __init__(self, vocab_file, merge_file, vocab_extra_ids=0):
+    def __init__(self, vocab_file, merge_file):
         name = 'GPT2 BPE'
         super().__init__(name)
 
         self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
                                        special_tokens=[], max_len=None)
-        self.eod_id = self.eos_token_id = self.tokenizer.encoder['<|endoftext|>']
-
-        self.bod_id = self.bos_token_id = self.tokenizer.encoder['[EOS]']
-        self.sep_id = self.tokenizer.encoder['[SEP]']
-        self.mask_id = self.tokenizer.encoder['[MASK]']
-        self.pad_id = self.tokenizer.encoder['[PAD]']
-
-        additional_special_tokens = []
-        self._additional_special_tokens = []
-        additional_special_tokens.extend(
-            ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)])
-        self.add_additional_special_tokens(additional_special_tokens)
-
-    def add_additional_special_tokens(self, tokens_list):
-        setattr(self, "additional_special_tokens", tokens_list)
-        for value in tokens_list:
-            self.add_token(value)
-
-    def add_token(self, token):
-        if token not in self.vocab:
-            self.inv_vocab[self.vocab_size] = token
-            # self.vocab_size comes from len(vocab)
-            # and it will increase as we add elements
-            self.vocab[token] = self.vocab_size
+        self.eod_id = self.tokenizer.encoder['<|endoftext|>']
 
     @property
     def vocab_size(self):
@@ -341,35 +316,6 @@ def detokenize(self, token_ids):
     def eod(self):
         return self.eod_id
 
-    @property
-    def bod(self):
-        return self.bod_id
-
-    @property
-    def sep(self):
-        return self.sep_id
-
-    @property
-    def mask(self):
-        return self.mask_id
-
-    @property
-    def pad(self):
-        return self.pad_id
-
-    @property
-    def additional_special_tokens(self):
-        """ All the additional special tokens you may want to use (list of strings)."""
-        return self._additional_special_tokens
-
-    @property
-    def additional_special_tokens_ids(self):
-        """ Ids of all the additional special tokens in the vocabulary (list of integers)."""
-        return [self.vocab.get(token) for token in self._additional_special_tokens]
-
-    @additional_special_tokens.setter
-    def additional_special_tokens(self, value):
-        self._additional_special_tokens = value
 
 class _AutoTokenizer(AbstractTokenizer):
     """AutoTokenizer for Hf Pretrained model loading."""
@@ -442,18 +388,6 @@ def eos(self):
         candidate = self.tokenizer.eos_token_id
         return self._check_token_candidate(candidate)
 
-    @property
-    def bos_token_id(self):
-        """Id of the beginning of sentence token in the vocabulary."""
-        candidate = self.tokenizer.bos_token_id
-        return self._check_token_candidate(candidate)
-
-    @property
-    def eos_token_id(self):
-        """Id of the end of sentence token in the vocabulary."""
-        candidate = self.tokenizer.eos_token_id
-        return self._check_token_candidate(candidate)
-
     @property
     def additional_special_tokens_ids(self):
         """ All the additional special tokens you may want to use (list of strings)."""
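
Net effect of this file's changes: `_GPT2BPETokenizer` reverts to the plain upstream GPT-2 BPE wrapper, dropping the `[EOS]`/`[SEP]`/`[MASK]`/`[PAD]` lookups, the `<extra_id_*>` tokens, and the dynamically grown vocabulary. For orientation, a condensed view of the class as it stands after the diff; the `vocab_size`/`tokenize`/`detokenize` bodies are reconstructed from this file's unchanged context and are not part of the PR.

class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        super().__init__('GPT2 BPE')
        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
                                       special_tokens=[], max_len=None)
        # The only special token the stock GPT-2 vocabulary ships with.
        self.eod_id = self.tokenizer.encoder['<|endoftext|>']

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id
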
14 changes: 10 additions & 4 deletions tasks/eval_harness/evaluate.py
@@ -398,6 +398,8 @@ def tasks_args(parser):
     group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task')
     group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation')
     group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel')
+    group.add_argument('--fewshots', type=int, default=0, help='Num fewshots')
+    group.add_argument('--limit', type=int, default=None, help='Limit samples')
     group.add_argument('--add_denoiser', default = False, action='store_true', help='Whether to add a denoiser to the model')
     return parser

@@ -407,6 +409,10 @@ def main():
     # parse the megatron args. But wait with initalizing megatron.
     # avoid printing the arguments, since they will later be overridden.
     args = _parse_args(tasks_args)
+    if os.path.exists(args.results_path):
+        print("Exists ", args.results_path)
+        exit()
+
     load_path = args.load
     model = load_ds_checkpoint_and_setup_megatron(args)

@@ -431,11 +437,11 @@
         global_results = {"results": {}, "versions": {}}
         timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
         iteration_id = load_path.split("/")[-1].replace("/", "")
-        results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json")
+        results_path = args.results_path#.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots.json")
         # Backup file in case of interruption during writing
-        results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json")
+        results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots_backup.json")
         for task_name, task in task_dict.items():
-            results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters)
+            results = evaluator.evaluate(adaptor, {task_name: task}, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit)
             global_results["results"] = {**global_results["results"], **results["results"]}
             global_results["versions"] = {**global_results["versions"], **results["versions"]}
             if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
@@ -445,7 +451,7 @@ def main():
                 with open(results_path_backup, 'w') as outfile:
                     json.dump(global_results, outfile, indent=4)
     else:
-        global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters)
+        global_results = evaluator.evaluate(adaptor, task_dict, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit)
         if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
             print(json.dumps(global_results, indent=2))
             with open(args.results_path, 'w') as outfile:
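
Two behavioral notes on the evaluate.py changes: the `--fewshots`/`--limit` values are threaded into every `evaluator.evaluate` call, and the new guard at the top of `main()` makes re-submitted jobs idempotent by exiting as soon as `args.results_path` exists. A self-contained sketch of that guard pattern; the path below is hypothetical, and `sys.exit` stands in for the bare `exit()` used in the diff.

import json
import os
import sys

def run_eval(results_path):
    # Guard added by this PR: skip work that is already done, so
    # re-submitted array jobs neither redo nor clobber finished shards.
    if os.path.exists(results_path):
        print("Exists ", results_path)
        sys.exit(0)

    # ... expensive model loading and evaluation would happen here ...
    global_results = {"results": {}, "versions": {}}

    with open(results_path, 'w') as outfile:
        json.dump(global_results, outfile, indent=4)

run_eval("results/task_0shots.json")  # hypothetical output path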