diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index efc762dec..c4cab9b74 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -133,7 +133,27 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - + ruler_match_any = SampleLevelMetric( + metric_name="ruler_match_any", + sample_level_fn=lambda predictions, golds, formatted_doc: max( + [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds] + ), + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.SUMMARIZATION, + corpus_level_fn=np.mean, + higher_is_better=True, + ) + ruler_match_all = SampleLevelMetric( + metric_name="ruler_match_all", + sample_level_fn=lambda predictions, golds, formatted_doc: sum( + [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds] + ) + / len(golds), + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.SUMMARIZATION, + corpus_level_fn=np.mean, + higher_is_better=True, + ) bleurt = SampleLevelMetric( metric_name="bleurt", sample_level_fn=BLEURT().compute, diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 0b4892a20..32301aa55 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -276,7 +276,7 @@ def greedy_until( if max_new_tokens is not None: if context_size + max_new_tokens > self.max_length: logger.warning( - f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length=} - {max_new_tokens=} = {self.max_length - max_new_tokens} tokens." 
) context_size = self.max_length - max_new_tokens if context_size < 0: diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 786c4a0b1..28625e813 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -44,6 +44,15 @@ # fmt: on +def ruler(line, task_name: str = None): + query = line["input"] + choices = line["outputs"] + gold_index = 0 + instruction = "Only answer the question to complete the prompt, without any additional text.\n" + query = f"{instruction}{query}" + + return Doc(query=query, instruction=instruction, choices=choices, gold_index=gold_index, task_name=task_name) + def mmmu_pro(line, task_name: Optional[str] = None): # fmt: off question = line["question"] # "What is the capital of France?" @@ -87,7 +96,6 @@ def mmmu_pro(line, task_name: Optional[str] = None): instruction=instructions, ) - def mmmu_pro_vision(line, task_name: str = None): instruction = ( "Answer with the option letter from the given choices directly." 
@@ -119,14 +127,17 @@ def mmmu_pro_vision(line, task_name: str = None): instruction=instruction, ) - def simpleqa(line, task_name: str = None): query = line["problem"] choices = [line["answer"]] gold_index = 0 return Doc( - task_name=task_name, query=query, choices=choices, gold_index=gold_index, specific={**eval(line["metadata"])} + task_name=task_name, + query=query, + choices=choices, + gold_index=gold_index, + specific={**eval(line["metadata"])}, ) diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py index b77b27d52..cd2961b28 100644 --- a/src/lighteval/tasks/default_tasks.py +++ b/src/lighteval/tasks/default_tasks.py @@ -14909,6 +14909,1236 @@ trust_dataset=True, version=0, ) +ruler_niah_single_1_131072 = LightevalTaskConfig( + name="ruler_131072:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_3_131072 = LightevalTaskConfig( + name="ruler_131072:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_2_131072 = LightevalTaskConfig( + name="ruler_131072:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_1_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_2_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multiquery_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_3_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + 
stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multivalue_131072 = LightevalTaskConfig( + name="ruler_131072:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_vt_131072 = LightevalTaskConfig( + name="ruler_131072:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_cwe_131072 = LightevalTaskConfig( + name="ruler_131072:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_fwe_131072 = LightevalTaskConfig( + name="ruler_131072:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_1_131072 = LightevalTaskConfig( + name="ruler_131072:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", 
+ hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_2_131072 = LightevalTaskConfig( + name="ruler_131072:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-131072-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_3_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_2_65536 = LightevalTaskConfig( + name="ruler_65536:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_1_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + 
metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_2_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multiquery_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_3_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multivalue_65536 = LightevalTaskConfig( + name="ruler_65536:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) 
+ruler_vt_65536 = LightevalTaskConfig( + name="ruler_65536:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_cwe_65536 = LightevalTaskConfig( + name="ruler_65536:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_fwe_65536 = LightevalTaskConfig( + name="ruler_65536:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_1_65536 = LightevalTaskConfig( + name="ruler_65536:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_2_65536 = LightevalTaskConfig( + name="ruler_65536:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-65536-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_1_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_3_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_2_32768 = LightevalTaskConfig( + name="ruler_32768:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_1_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + 
version=0, +) +ruler_niah_multikey_2_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multiquery_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_3_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multivalue_32768 = LightevalTaskConfig( + name="ruler_32768:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_vt_32768 = LightevalTaskConfig( + name="ruler_32768:vt", + suite=["lighteval"], + 
prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_cwe_32768 = LightevalTaskConfig( + name="ruler_32768:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_fwe_32768 = LightevalTaskConfig( + name="ruler_32768:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_1_32768 = LightevalTaskConfig( + name="ruler_32768:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_2_32768 = LightevalTaskConfig( + name="ruler_32768:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-32768-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + 
stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_1_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_3_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_2_16384 = LightevalTaskConfig( + name="ruler_16384:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_1_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_2_16384 = LightevalTaskConfig( + 
name="ruler_16384:niah_multikey_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multiquery_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_3_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multivalue_16384 = LightevalTaskConfig( + name="ruler_16384:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_vt_16384 = LightevalTaskConfig( + name="ruler_16384:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + 
hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["vt"], + evaluation_splits=["vt"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_cwe_16384 = LightevalTaskConfig( + name="ruler_16384:cwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["cwe"], + evaluation_splits=["cwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=120, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_fwe_16384 = LightevalTaskConfig( + name="ruler_16384:fwe", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["fwe"], + evaluation_splits=["fwe"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_1_16384 = LightevalTaskConfig( + name="ruler_16384:qa_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_1"], + evaluation_splits=["qa_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_qa_2_16384 = LightevalTaskConfig( + name="ruler_16384:qa_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-16384-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["qa_2"], + evaluation_splits=["qa_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=32, + metric=[Metrics.ruler_match_any], + stop_sequence=None, + trust_dataset=False, + 
version=0, +) +ruler_niah_single_1_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_1"], + evaluation_splits=["niah_single_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_3_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_3"], + evaluation_splits=["niah_single_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_single_2_8192 = LightevalTaskConfig( + name="ruler_8192:niah_single_2", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_single_2"], + evaluation_splits=["niah_single_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_1_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_1", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_1"], + evaluation_splits=["niah_multikey_1"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_2_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_2", + suite=["lighteval"], + 
prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_2"], + evaluation_splits=["niah_multikey_2"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multiquery_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multiquery", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multiquery"], + evaluation_splits=["niah_multiquery"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multikey_3_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multikey_3", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multikey_3"], + evaluation_splits=["niah_multikey_3"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_niah_multivalue_8192 = LightevalTaskConfig( + name="ruler_8192:niah_multivalue", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["niah_multivalue"], + evaluation_splits=["niah_multivalue"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=[Metrics.ruler_match_all], + stop_sequence=None, + trust_dataset=False, + version=0, +) +ruler_vt_8192 = LightevalTaskConfig( + name="ruler_8192:vt", + suite=["lighteval"], + prompt_function=prompt.ruler, + hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer", + hf_subset="default", + hf_avail_splits=["vt"], + 
evaluation_splits=["vt"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=30,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_cwe_8192 = LightevalTaskConfig(
+    name="ruler_8192:cwe",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["cwe"],
+    evaluation_splits=["cwe"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=120,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_fwe_8192 = LightevalTaskConfig(
+    name="ruler_8192:fwe",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["fwe"],
+    evaluation_splits=["fwe"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=50,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_qa_1_8192 = LightevalTaskConfig(
+    name="ruler_8192:qa_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["qa_1"],
+    evaluation_splits=["qa_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32,
+    metric=[Metrics.ruler_match_any],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_qa_2_8192 = LightevalTaskConfig(
+    name="ruler_8192:qa_2",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-8192-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["qa_2"],
+    evaluation_splits=["qa_2"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32,
+    metric=[Metrics.ruler_match_any],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_single_1_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_single_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_1"],
+    evaluation_splits=["niah_single_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_single_3_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_single_3",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_3"],
+    evaluation_splits=["niah_single_3"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_single_2_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_single_2",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_single_2"],
+    evaluation_splits=["niah_single_2"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_multikey_1_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_multikey_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_multikey_1"],
+    evaluation_splits=["niah_multikey_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_multikey_2_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_multikey_2",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_multikey_2"],
+    evaluation_splits=["niah_multikey_2"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_multiquery_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_multiquery",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_multiquery"],
+    evaluation_splits=["niah_multiquery"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_multikey_3_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_multikey_3",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_multikey_3"],
+    evaluation_splits=["niah_multikey_3"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_niah_multivalue_4096 = LightevalTaskConfig(
+    name="ruler_4096:niah_multivalue",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["niah_multivalue"],
+    evaluation_splits=["niah_multivalue"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=128,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_vt_4096 = LightevalTaskConfig(
+    name="ruler_4096:vt",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["vt"],
+    evaluation_splits=["vt"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=30,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_cwe_4096 = LightevalTaskConfig(
+    name="ruler_4096:cwe",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["cwe"],
+    evaluation_splits=["cwe"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=120,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_fwe_4096 = LightevalTaskConfig(
+    name="ruler_4096:fwe",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["fwe"],
+    evaluation_splits=["fwe"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=50,
+    metric=[Metrics.ruler_match_all],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_qa_1_4096 = LightevalTaskConfig(
+    name="ruler_4096:qa_1",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["qa_1"],
+    evaluation_splits=["qa_1"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32,
+    metric=[Metrics.ruler_match_any],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
+ruler_qa_2_4096 = LightevalTaskConfig(
+    name="ruler_4096:qa_2",
+    suite=["lighteval"],
+    prompt_function=prompt.ruler,
+    hf_repo="SaylorTwift/RULER-4096-llama-3.2-tokenizer",
+    hf_subset="default",
+    hf_avail_splits=["qa_2"],
+    evaluation_splits=["qa_2"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32,
+    metric=[Metrics.ruler_match_any],
+    stop_sequence=None,
+    trust_dataset=False,
+    version=0,
+)
 ruin_names_bigbench = LightevalTaskConfig(
     name="ruin_names",
     suite=["bigbench", "bigbench_json"],
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index c9a31904b..505c670fc 100644
--- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -31,6 +31,7 @@ from huggingface_hub import TextGenerationInputGrammarType from multiprocess import Pool from pytablewriter import MarkdownTableWriter +from tqdm import tqdm from lighteval.metrics import ( apply_generative_metric, @@ -560,7 +561,7 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = task.dataset_filter, task.dataset_revision, ) - for task in tasks + for task in tqdm(tasks) ] else: with Pool(processes=dataset_loading_processes) as pool: @@ -627,7 +628,7 @@ def create_requests_from_tasks( # noqa: C901 task_dict_items = [(name, task) for name, task in task_dict.items() if len(task.eval_docs()) > 0] # Get lists of each type of request - for task_name, task in task_dict_items: + for task_name, task in tqdm(task_dict_items): task_docs = list(task.eval_docs()) n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs) evaluation_tracker.task_config_logger.log_num_docs(task_name, len(task_docs), n_samples)