From 65275d571c4fa27d5b3f7fcd90186b0bc3db77c6 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Thu, 15 May 2025 11:37:38 +0000
Subject: [PATCH 1/6] adds RULE

---
 src/lighteval/metrics/metrics.py        |   22 +-
 src/lighteval/models/vllm/vllm_model.py |    3 +-
 src/lighteval/tasks/default_prompts.py  |   17 +-
 src/lighteval/tasks/default_tasks.py    | 1335 ++++++++++++++++++++++-
 src/lighteval/tasks/lighteval_task.py   |    5 +-
 5 files changed, 1365 insertions(+), 17 deletions(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 0aede953d..e355c4e92 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -133,7 +133,27 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-
+    ruler_match_any = SampleLevelMetric(
+        metric_name="ruler_match_any",
+        sample_level_fn=lambda predictions, golds, formatted_doc: max(
+            [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds]
+        ),
+        category=MetricCategory.GENERATIVE,
+        use_case=MetricUseCase.SUMMARIZATION,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    ruler_match_all = SampleLevelMetric(
+        metric_name="ruler_match_all",
+        sample_level_fn=lambda predictions, golds, formatted_doc: sum(
+            [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds]
+        )
+        / len(golds),
+        category=MetricCategory.GENERATIVE,
+        use_case=MetricUseCase.SUMMARIZATION,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     bleurt = SampleLevelMetric(
         metric_name="bleurt",
         sample_level_fn=BLEURT().compute,
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 40352b4da..6992261e5 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -266,8 +266,9 @@ def greedy_until(
             if max_new_tokens is not None:
                 if context_size + max_new_tokens > self.max_length:
                     logger.warning(
-                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
+                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length=} - {max_new_tokens=} = {self.max_length - max_new_tokens} tokens."
                     )
+                    breakpoint()
                     context_size = self.max_length - max_new_tokens
                     if context_size < 0:
                         logger.critical(
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 2745b63c5..3bd0bd844 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -43,13 +43,28 @@
 # fmt: on


+def ruler(line, task_name: str = None):
+    query = line["input"]
+    choices = line["outputs"]
+    gold_index = 0
+
+    return Doc(query=query, choices=choices, gold_index=gold_index, task_name=task_name)
+
+
 def simpleqa(line, task_name: str = None):
     query = line["problem"]
     choices = [line["answer"]]
     gold_index = 0
+    instruction = "Only answer the question to complete the prompt, without any additional text.\n"
+    query = f"{instruction}{query}"

     return Doc(
-        task_name=task_name, query=query, choices=choices, gold_index=gold_index, specific={**eval(line["metadata"])}
+        task_name=task_name,
+        query=query,
+        choices=choices,
+        instruction=instruction,
+        gold_index=gold_index,
+        specific={**eval(line["metadata"])},
     )
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index 3960e6f5c..2d885532a 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -24,22 +24,1333 @@ from lighteval.tasks.lighteval_task import LightevalTaskConfig
-abstract_narrative_understanding_bigbench = LightevalTaskConfig(
-    name="abstract_narrative_understanding",
-    suite=["bigbench", "bigbench_json"],
-    prompt_function=prompt.bigbench,
-    hf_repo="bigbench",
-    hf_subset="abstract_narrative_understanding",
-    hf_avail_splits=["default", "train", "validation"],
-    evaluation_splits=["default"],
+ruler_niah_single_1_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_single_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_1"],
+    evaluation_splits=["niah_single_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+
+ruler_niah_single_3_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_single_3",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_3"],
+    evaluation_splits=["niah_single_3"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+
+ruler_niah_single_2_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_single_2",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_2"],
+    evaluation_splits=["niah_single_2"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+
+ruler_niah_multikey_1_131072 = LightevalTaskConfig(
+    name="ruler_131072:niah_multikey_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template",
+    hf_subset="default",
+    hf_avail_splits=["niah_multikey_1"],
+    evaluation_splits=["niah_multikey_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_131072 = LightevalTaskConfig( + name="ruler_131072:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_131072 = LightevalTaskConfig( + name="ruler_131072:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_131072 = LightevalTaskConfig( + name="ruler_131072:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_131072 = LightevalTaskConfig( + name="ruler_131072:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + 
hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_131072 = LightevalTaskConfig( + name="ruler_131072:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_65536 = LightevalTaskConfig( + name="ruler_65536:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_65536 = LightevalTaskConfig( + name="ruler_65536:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_65536 = LightevalTaskConfig( + name="ruler_65536:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_65536 = LightevalTaskConfig( + name="ruler_65536:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_65536 = LightevalTaskConfig( + name="ruler_65536:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + 
hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_32768 = LightevalTaskConfig( + name="ruler_32768:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_32768 = LightevalTaskConfig( + name="ruler_32768:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_32768 = LightevalTaskConfig( + name="ruler_32768:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_32768 = LightevalTaskConfig( + name="ruler_32768:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_32768 = LightevalTaskConfig( + name="ruler_32768:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", 
+ hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_16384 = LightevalTaskConfig( + name="ruler_16384:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_16384 = LightevalTaskConfig( + name="ruler_16384:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + 
trust_dataset=False, + version=0, +) + +ruler_fwe_16384 = LightevalTaskConfig( + name="ruler_16384:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_16384 = LightevalTaskConfig( + name="ruler_16384:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_16384 = LightevalTaskConfig( + name="ruler_16384:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], few_shots_split=None, few_shots_select=None, - generation_size=1, - metric=[Metrics.loglikelihood_acc], - stop_sequence=["\n"], - trust_dataset=True, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + 
hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_8192 = LightevalTaskConfig( + name="ruler_8192:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_8192 = LightevalTaskConfig( + name="ruler_8192:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_8192 = LightevalTaskConfig( + name="ruler_8192:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_8192 = LightevalTaskConfig( + name="ruler_8192:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) 
+ +ruler_qa_2_8192 = LightevalTaskConfig( + name="ruler_8192:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_1_4096 = LightevalTaskConfig( + name="ruler_4096:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_3_4096 = LightevalTaskConfig( + name="ruler_4096:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_single_2_4096 = LightevalTaskConfig( + name="ruler_4096:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_1_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_2_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multiquery_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multikey_3_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", 
+ hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_niah_multivalue_4096 = LightevalTaskConfig( + name="ruler_4096:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_vt_4096 = LightevalTaskConfig( + name="ruler_4096:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_cwe_4096 = LightevalTaskConfig( + name="ruler_4096:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_fwe_4096 = LightevalTaskConfig( + name="ruler_4096:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_1_4096 = LightevalTaskConfig( + name="ruler_4096:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) + +ruler_qa_2_4096 = LightevalTaskConfig( + name="ruler_4096:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, version=0, ) + + agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index da09ec000..e34f73eb5 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -31,6 +31,7 @@ from huggingface_hub import TextGenerationInputGrammarType from multiprocess import Pool from pytablewriter import MarkdownTableWriter +from tqdm import tqdm from lighteval.metrics import ( apply_generative_metric, @@ -551,7 +552,7 @@ def load_datasets(tasks: 
list["LightevalTask"], dataset_loading_processes: int = task.dataset_filter, task.dataset_revision, ) - for task in tasks + for task in tqdm(tasks) ] else: with Pool(processes=dataset_loading_processes) as pool: @@ -618,7 +619,7 @@ def create_requests_from_tasks( # noqa: C901 task_dict_items = [(name, task) for name, task in task_dict.items() if len(task.eval_docs()) > 0] # Get lists of each type of request - for task_name, task in task_dict_items: + for task_name, task in tqdm(task_dict_items): task_docs = list(task.eval_docs()) n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs) evaluation_tracker.task_config_logger.log_num_docs(task_name, len(task_docs), n_samples) From 0ef15e3f920d76ee1b79f0d5dc32d86780672df1 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 15 May 2025 11:42:26 +0000 Subject: [PATCH 2/6] adds RULE --- src/lighteval/tasks/default_prompts.py | 7 +- src/lighteval/tasks/default_tasks.py | 95 -------------------------- 2 files changed, 3 insertions(+), 99 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 3bd0bd844..073fa8c15 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -47,22 +47,21 @@ def ruler(line, task_name: str = None): query = line["input"] choices = line["outputs"] gold_index = 0 + instruction = "Only answer the question to complete the prompt, without any additional text.\n" + query = f"{instruction}{query}" - return Doc(query=query, choices=choices, gold_index=gold_index, task_name=task_name) + return Doc(query=query, instruction=instruction, choices=choices, gold_index=gold_index, task_name=task_name) def simpleqa(line, task_name: str = None): query = line["problem"] choices = [line["answer"]] gold_index = 0 - instruction = "Only answer the question to complete the prompt, without any additional text.\n" - query = f"{instruction}{query}" return Doc( task_name=task_name, query=query, choices=choices, - instruction=instruction, gold_index=gold_index, specific={**eval(line["metadata"])}, ) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 2d885532a..4cb1fa416 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -40,7 +40,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_131072 = LightevalTaskConfig( name="ruler_131072:niah_single_3", suite=["lighteval"], @@ -57,7 +56,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_131072 = LightevalTaskConfig( name="ruler_131072:niah_single_2", suite=["lighteval"], @@ -74,7 +72,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_131072 = LightevalTaskConfig( name="ruler_131072:niah_multikey_1", suite=["lighteval"], @@ -91,7 +88,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_131072 = LightevalTaskConfig( name="ruler_131072:niah_multikey_2", suite=["lighteval"], @@ -108,7 +104,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_131072 = LightevalTaskConfig( name="ruler_131072:niah_multiquery", suite=["lighteval"], @@ -125,7 +120,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_131072 = LightevalTaskConfig( name="ruler_131072:niah_multikey_3", suite=["lighteval"], @@ -142,7 +136,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_131072 = LightevalTaskConfig( name="ruler_131072:niah_multivalue", suite=["lighteval"], @@ -159,7 +152,6 @@ trust_dataset=False, version=0, ) - ruler_vt_131072 = LightevalTaskConfig( 
name="ruler_131072:vt", suite=["lighteval"], @@ -176,7 +168,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_131072 = LightevalTaskConfig( name="ruler_131072:cwe", suite=["lighteval"], @@ -193,7 +184,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_131072 = LightevalTaskConfig( name="ruler_131072:fwe", suite=["lighteval"], @@ -210,7 +200,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_131072 = LightevalTaskConfig( name="ruler_131072:qa_1", suite=["lighteval"], @@ -227,7 +216,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_131072 = LightevalTaskConfig( name="ruler_131072:qa_2", suite=["lighteval"], @@ -244,24 +232,6 @@ trust_dataset=False, version=0, ) - -ruler_niah_single_1_65536 = LightevalTaskConfig( - name="ruler_65536:niah_single_1", - suite=["lighteval"], - prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", - hf_subset="default", - hf_avail_splits=["niah_single_1"], - evaluation_splits=["niah_single_1"], - few_shots_split=None, - few_shots_select=None, - generation_size=128, - metric=[Metrics.ruler_match_all], - stop_sequence=None, - trust_dataset=False, - version=0, -) - ruler_niah_single_3_65536 = LightevalTaskConfig( name="ruler_65536:niah_single_3", suite=["lighteval"], @@ -278,7 +248,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_65536 = LightevalTaskConfig( name="ruler_65536:niah_single_2", suite=["lighteval"], @@ -295,7 +264,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_65536 = LightevalTaskConfig( name="ruler_65536:niah_multikey_1", suite=["lighteval"], @@ -312,7 +280,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_65536 = LightevalTaskConfig( name="ruler_65536:niah_multikey_2", suite=["lighteval"], @@ -329,7 +296,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_65536 = LightevalTaskConfig( name="ruler_65536:niah_multiquery", suite=["lighteval"], @@ -346,7 +312,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_65536 = LightevalTaskConfig( name="ruler_65536:niah_multikey_3", suite=["lighteval"], @@ -363,7 +328,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_65536 = LightevalTaskConfig( name="ruler_65536:niah_multivalue", suite=["lighteval"], @@ -380,7 +344,6 @@ trust_dataset=False, version=0, ) - ruler_vt_65536 = LightevalTaskConfig( name="ruler_65536:vt", suite=["lighteval"], @@ -397,7 +360,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_65536 = LightevalTaskConfig( name="ruler_65536:cwe", suite=["lighteval"], @@ -414,7 +376,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_65536 = LightevalTaskConfig( name="ruler_65536:fwe", suite=["lighteval"], @@ -431,7 +392,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_65536 = LightevalTaskConfig( name="ruler_65536:qa_1", suite=["lighteval"], @@ -448,7 +408,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_65536 = LightevalTaskConfig( name="ruler_65536:qa_2", suite=["lighteval"], @@ -465,7 +424,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_32768 = LightevalTaskConfig( name="ruler_32768:niah_single_1", suite=["lighteval"], @@ -482,7 +440,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_32768 = LightevalTaskConfig( name="ruler_32768:niah_single_3", suite=["lighteval"], @@ -499,7 +456,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_32768 = LightevalTaskConfig( name="ruler_32768:niah_single_2", suite=["lighteval"], @@ -516,7 +472,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_32768 = LightevalTaskConfig( 
name="ruler_32768:niah_multikey_1", suite=["lighteval"], @@ -533,7 +488,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_32768 = LightevalTaskConfig( name="ruler_32768:niah_multikey_2", suite=["lighteval"], @@ -550,7 +504,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_32768 = LightevalTaskConfig( name="ruler_32768:niah_multiquery", suite=["lighteval"], @@ -567,7 +520,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_32768 = LightevalTaskConfig( name="ruler_32768:niah_multikey_3", suite=["lighteval"], @@ -584,7 +536,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_32768 = LightevalTaskConfig( name="ruler_32768:niah_multivalue", suite=["lighteval"], @@ -601,7 +552,6 @@ trust_dataset=False, version=0, ) - ruler_vt_32768 = LightevalTaskConfig( name="ruler_32768:vt", suite=["lighteval"], @@ -618,7 +568,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_32768 = LightevalTaskConfig( name="ruler_32768:cwe", suite=["lighteval"], @@ -635,7 +584,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_32768 = LightevalTaskConfig( name="ruler_32768:fwe", suite=["lighteval"], @@ -652,7 +600,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_32768 = LightevalTaskConfig( name="ruler_32768:qa_1", suite=["lighteval"], @@ -669,7 +616,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_32768 = LightevalTaskConfig( name="ruler_32768:qa_2", suite=["lighteval"], @@ -686,7 +632,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_16384 = LightevalTaskConfig( name="ruler_16384:niah_single_1", suite=["lighteval"], @@ -703,7 +648,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_16384 = LightevalTaskConfig( name="ruler_16384:niah_single_3", suite=["lighteval"], @@ -720,7 +664,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_16384 = LightevalTaskConfig( name="ruler_16384:niah_single_2", suite=["lighteval"], @@ -737,7 +680,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_16384 = LightevalTaskConfig( name="ruler_16384:niah_multikey_1", suite=["lighteval"], @@ -754,7 +696,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_16384 = LightevalTaskConfig( name="ruler_16384:niah_multikey_2", suite=["lighteval"], @@ -771,7 +712,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_16384 = LightevalTaskConfig( name="ruler_16384:niah_multiquery", suite=["lighteval"], @@ -788,7 +728,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_16384 = LightevalTaskConfig( name="ruler_16384:niah_multikey_3", suite=["lighteval"], @@ -805,7 +744,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_16384 = LightevalTaskConfig( name="ruler_16384:niah_multivalue", suite=["lighteval"], @@ -822,7 +760,6 @@ trust_dataset=False, version=0, ) - ruler_vt_16384 = LightevalTaskConfig( name="ruler_16384:vt", suite=["lighteval"], @@ -839,7 +776,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_16384 = LightevalTaskConfig( name="ruler_16384:cwe", suite=["lighteval"], @@ -856,7 +792,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_16384 = LightevalTaskConfig( name="ruler_16384:fwe", suite=["lighteval"], @@ -873,7 +808,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_16384 = LightevalTaskConfig( name="ruler_16384:qa_1", suite=["lighteval"], @@ -890,7 +824,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_16384 = LightevalTaskConfig( name="ruler_16384:qa_2", suite=["lighteval"], @@ -907,7 +840,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_8192 = LightevalTaskConfig( name="ruler_8192:niah_single_1", 
suite=["lighteval"], @@ -924,7 +856,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_8192 = LightevalTaskConfig( name="ruler_8192:niah_single_3", suite=["lighteval"], @@ -941,7 +872,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_8192 = LightevalTaskConfig( name="ruler_8192:niah_single_2", suite=["lighteval"], @@ -958,7 +888,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_8192 = LightevalTaskConfig( name="ruler_8192:niah_multikey_1", suite=["lighteval"], @@ -975,7 +904,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_8192 = LightevalTaskConfig( name="ruler_8192:niah_multikey_2", suite=["lighteval"], @@ -992,7 +920,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_8192 = LightevalTaskConfig( name="ruler_8192:niah_multiquery", suite=["lighteval"], @@ -1009,7 +936,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_8192 = LightevalTaskConfig( name="ruler_8192:niah_multikey_3", suite=["lighteval"], @@ -1026,7 +952,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_8192 = LightevalTaskConfig( name="ruler_8192:niah_multivalue", suite=["lighteval"], @@ -1043,7 +968,6 @@ trust_dataset=False, version=0, ) - ruler_vt_8192 = LightevalTaskConfig( name="ruler_8192:vt", suite=["lighteval"], @@ -1060,7 +984,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_8192 = LightevalTaskConfig( name="ruler_8192:cwe", suite=["lighteval"], @@ -1077,7 +1000,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_8192 = LightevalTaskConfig( name="ruler_8192:fwe", suite=["lighteval"], @@ -1094,7 +1016,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_8192 = LightevalTaskConfig( name="ruler_8192:qa_1", suite=["lighteval"], @@ -1111,7 +1032,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_8192 = LightevalTaskConfig( name="ruler_8192:qa_2", suite=["lighteval"], @@ -1128,7 +1048,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_1_4096 = LightevalTaskConfig( name="ruler_4096:niah_single_1", suite=["lighteval"], @@ -1145,7 +1064,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_3_4096 = LightevalTaskConfig( name="ruler_4096:niah_single_3", suite=["lighteval"], @@ -1162,7 +1080,6 @@ trust_dataset=False, version=0, ) - ruler_niah_single_2_4096 = LightevalTaskConfig( name="ruler_4096:niah_single_2", suite=["lighteval"], @@ -1179,7 +1096,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_1_4096 = LightevalTaskConfig( name="ruler_4096:niah_multikey_1", suite=["lighteval"], @@ -1196,7 +1112,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_2_4096 = LightevalTaskConfig( name="ruler_4096:niah_multikey_2", suite=["lighteval"], @@ -1213,7 +1128,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multiquery_4096 = LightevalTaskConfig( name="ruler_4096:niah_multiquery", suite=["lighteval"], @@ -1230,7 +1144,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multikey_3_4096 = LightevalTaskConfig( name="ruler_4096:niah_multikey_3", suite=["lighteval"], @@ -1247,7 +1160,6 @@ trust_dataset=False, version=0, ) - ruler_niah_multivalue_4096 = LightevalTaskConfig( name="ruler_4096:niah_multivalue", suite=["lighteval"], @@ -1264,7 +1176,6 @@ trust_dataset=False, version=0, ) - ruler_vt_4096 = LightevalTaskConfig( name="ruler_4096:vt", suite=["lighteval"], @@ -1281,7 +1192,6 @@ trust_dataset=False, version=0, ) - ruler_cwe_4096 = LightevalTaskConfig( name="ruler_4096:cwe", suite=["lighteval"], @@ -1298,7 +1208,6 @@ trust_dataset=False, version=0, ) - ruler_fwe_4096 = LightevalTaskConfig( name="ruler_4096:fwe", 
suite=["lighteval"], @@ -1315,7 +1224,6 @@ trust_dataset=False, version=0, ) - ruler_qa_1_4096 = LightevalTaskConfig( name="ruler_4096:qa_1", suite=["lighteval"], @@ -1332,7 +1240,6 @@ trust_dataset=False, version=0, ) - ruler_qa_2_4096 = LightevalTaskConfig( name="ruler_4096:qa_2", suite=["lighteval"], @@ -1349,8 +1256,6 @@ trust_dataset=False, version=0, ) - - agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], From 9cafd755dd48a4d059977c026087721ce1698e28 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 15 May 2025 11:44:37 +0000 Subject: [PATCH 3/6] adds RULE --- src/lighteval/tasks/default_tasks.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index 4cb1fa416..d488b6e53 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -1256,6 +1256,22 @@ trust_dataset=False, version=0, ) +abstract_narrative_understanding_bigbench = LightevalTaskConfig( + name="abstract_narrative_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function=prompt.bigbench, + hf_repo="bigbench", + hf_subset="abstract_narrative_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=[Metrics.loglikelihood_acc], + stop_sequence=["\n"], + trust_dataset=True, + version=0, +) agieval_aqua_rat_lighteval = LightevalTaskConfig( name="agieval:aqua-rat", suite=["lighteval"], From ed3d9076f343c733fc9d54837b42d9592185bf75 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 19 May 2025 08:59:44 +0000 Subject: [PATCH 4/6] use llama 3.2 no chat template --- src/lighteval/tasks/default_tasks.py | 154 +++++++++++++-------------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index d488b6e53..abd45ee1f 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -28,7 +28,7 @@ name="ruler_131072:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -44,7 +44,7 @@ name="ruler_131072:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -60,7 +60,7 @@ name="ruler_131072:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -76,7 +76,7 @@ name="ruler_131072:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -92,7 +92,7 @@ name="ruler_131072:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - 
hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -108,7 +108,7 @@ name="ruler_131072:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -124,7 +124,7 @@ name="ruler_131072:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -140,7 +140,7 @@ name="ruler_131072:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -156,7 +156,7 @@ name="ruler_131072:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -172,7 +172,7 @@ name="ruler_131072:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -188,7 +188,7 @@ name="ruler_131072:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -204,7 +204,7 @@ name="ruler_131072:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -220,7 +220,7 @@ name="ruler_131072:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-131072-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -236,7 +236,7 @@ name="ruler_65536:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -252,7 +252,7 @@ name="ruler_65536:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -268,7 +268,7 @@ name="ruler_65536:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - 
hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -284,7 +284,7 @@ name="ruler_65536:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -300,7 +300,7 @@ name="ruler_65536:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -316,7 +316,7 @@ name="ruler_65536:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -332,7 +332,7 @@ name="ruler_65536:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -348,7 +348,7 @@ name="ruler_65536:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -364,7 +364,7 @@ name="ruler_65536:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -380,7 +380,7 @@ name="ruler_65536:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -396,7 +396,7 @@ name="ruler_65536:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -412,7 +412,7 @@ name="ruler_65536:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-65536-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -428,7 +428,7 @@ name="ruler_32768:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -444,7 +444,7 @@ name="ruler_32768:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + 
hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -460,7 +460,7 @@ name="ruler_32768:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -476,7 +476,7 @@ name="ruler_32768:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -492,7 +492,7 @@ name="ruler_32768:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -508,7 +508,7 @@ name="ruler_32768:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -524,7 +524,7 @@ name="ruler_32768:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -540,7 +540,7 @@ name="ruler_32768:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -556,7 +556,7 @@ name="ruler_32768:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -572,7 +572,7 @@ name="ruler_32768:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -588,7 +588,7 @@ name="ruler_32768:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -604,7 +604,7 @@ name="ruler_32768:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -620,7 +620,7 @@ name="ruler_32768:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-32768-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", 
hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -636,7 +636,7 @@ name="ruler_16384:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -652,7 +652,7 @@ name="ruler_16384:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -668,7 +668,7 @@ name="ruler_16384:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -684,7 +684,7 @@ name="ruler_16384:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -700,7 +700,7 @@ name="ruler_16384:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -716,7 +716,7 @@ name="ruler_16384:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -732,7 +732,7 @@ name="ruler_16384:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -748,7 +748,7 @@ name="ruler_16384:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -764,7 +764,7 @@ name="ruler_16384:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -780,7 +780,7 @@ name="ruler_16384:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -796,7 +796,7 @@ name="ruler_16384:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", 
hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -812,7 +812,7 @@ name="ruler_16384:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -828,7 +828,7 @@ name="ruler_16384:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-16384-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -844,7 +844,7 @@ name="ruler_8192:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -860,7 +860,7 @@ name="ruler_8192:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -876,7 +876,7 @@ name="ruler_8192:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -892,7 +892,7 @@ name="ruler_8192:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -908,7 +908,7 @@ name="ruler_8192:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -924,7 +924,7 @@ name="ruler_8192:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -940,7 +940,7 @@ name="ruler_8192:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -956,7 +956,7 @@ name="ruler_8192:niah_multivalue", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -972,7 +972,7 @@ name="ruler_8192:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], 
@@ -988,7 +988,7 @@ name="ruler_8192:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -1004,7 +1004,7 @@ name="ruler_8192:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -1020,7 +1020,7 @@ name="ruler_8192:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -1036,7 +1036,7 @@ name="ruler_8192:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-8192-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], @@ -1052,7 +1052,7 @@ name="ruler_4096:niah_single_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_1"], evaluation_splits=["niah_single_1"], @@ -1068,7 +1068,7 @@ name="ruler_4096:niah_single_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_3"], evaluation_splits=["niah_single_3"], @@ -1084,7 +1084,7 @@ name="ruler_4096:niah_single_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_single_2"], evaluation_splits=["niah_single_2"], @@ -1100,7 +1100,7 @@ name="ruler_4096:niah_multikey_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_1"], evaluation_splits=["niah_multikey_1"], @@ -1116,7 +1116,7 @@ name="ruler_4096:niah_multikey_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_2"], evaluation_splits=["niah_multikey_2"], @@ -1132,7 +1132,7 @@ name="ruler_4096:niah_multiquery", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multiquery"], evaluation_splits=["niah_multiquery"], @@ -1148,7 +1148,7 @@ name="ruler_4096:niah_multikey_3", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multikey_3"], evaluation_splits=["niah_multikey_3"], @@ -1164,7 +1164,7 @@ name="ruler_4096:niah_multivalue", 
suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["niah_multivalue"], evaluation_splits=["niah_multivalue"], @@ -1180,7 +1180,7 @@ name="ruler_4096:vt", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["vt"], evaluation_splits=["vt"], @@ -1196,7 +1196,7 @@ name="ruler_4096:cwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["cwe"], evaluation_splits=["cwe"], @@ -1212,7 +1212,7 @@ name="ruler_4096:fwe", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["fwe"], evaluation_splits=["fwe"], @@ -1228,7 +1228,7 @@ name="ruler_4096:qa_1", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_1"], evaluation_splits=["qa_1"], @@ -1244,7 +1244,7 @@ name="ruler_4096:qa_2", suite=["lighteval"], prompt_function=prompt.ruler, - hf_repo="SaylorTwift/RULER-4096-llama-3.1-tokenizer-chat-template", + hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer", hf_subset="default", hf_avail_splits=["qa_2"], evaluation_splits=["qa_2"], From 248bb678519f0046ec3096b2cac7ca56d00c179d Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 21 May 2025 16:53:03 +0200 Subject: [PATCH 5/6] Update src/lighteval/tasks/default_prompts.py --- src/lighteval/tasks/default_prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 72671a607..cc842628e 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -265,7 +265,7 @@ def arc_with_options(line, task_name: str = None): query += "".join([f"\n{key}. {choice}" for key, choice in zip(LETTER_INDICES, line["choices"]["text"])]) query += "\nAnswer:" return Doc( -mm task_name=task_name, + task_name=task_name, query=query, choices=line["choices"]["text"], gold_index=line["choices"]["label"].index(line["answerKey"]), From a1aee68187a3d280d3e821bad12c855abc7760f7 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 21 May 2025 16:53:38 +0200 Subject: [PATCH 6/6] Update src/lighteval/models/vllm/vllm_model.py --- src/lighteval/models/vllm/vllm_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 1e4dc92dc..57d424c0c 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -268,7 +268,6 @@ def greedy_until( logger.warning( f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length=} - {max_new_tokens=} = {self.max_length - max_new_tokens} tokens." ) - breakpoint() context_size = self.max_length - max_new_tokens if context_size < 0: logger.critical(