Merged
26 commits
d9de61d
init
clefourrier Jul 25, 2025
a2b1bad
Update src/lighteval/tasks/default_prompts.py
clefourrier Jul 25, 2025
0e00937
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 1, 2025
8429f5f
small fixes
clefourrier Aug 1, 2025
1cfa2f3
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 11, 2025
f600c40
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 20, 2025
541c89f
Merge branch 'main' into test_mmlu_redux_2
clefourrier Aug 25, 2025
1e139ab
Apply suggestion from @NathanHB
NathanHB Aug 25, 2025
b597546
fix metrics kwargs passing
clefourrier Sep 4, 2025
b0e5584
add default metric for mmlu_redux
clefourrier Sep 4, 2025
951cbc0
fix
clefourrier Sep 4, 2025
96df0e7
update caching
clefourrier Sep 8, 2025
971e082
Merge branch 'main' into test_mmlu_redux_2
clefourrier Sep 8, 2025
8b28aba
better str for classes, which allows correct hashing
clefourrier Sep 8, 2025
a3eeebd
last fix is to possibly push to configs
clefourrier Sep 8, 2025
c7d1eb0
removed token system + added an actual separation between tasks with …
clefourrier Sep 9, 2025
42ec1ce
fix
clefourrier Sep 9, 2025
68688b4
update caching tests
clefourrier Sep 9, 2025
b463eff
simplified system with cleaner task_id
clefourrier Sep 10, 2025
27be046
adapted to new functions
clefourrier Sep 10, 2025
f7c62bb
update vllm test
clefourrier Sep 11, 2025
0c4a429
fixing the metric changed res by 1 point
clefourrier Sep 11, 2025
dbac859
byteorder arg
clefourrier Sep 11, 2025
668e2f4
this makes little sense
clefourrier Sep 11, 2025
ef5ffe7
Update src/lighteval/utils/cache_management.py
clefourrier Sep 11, 2025
085f59c
comments
clefourrier Sep 11, 2025
3 changes: 2 additions & 1 deletion src/lighteval/models/vllm/vllm_model.py
@@ -487,7 +487,8 @@ def _loglikelihood_tokens(
tokenized_contexts_batch.append(tokenized_context)

# Left truncate the inputs to the maximum length
-inputs = [input[-self.max_length :] for input in inputs]
+if self.max_length: # can be None if the model is initialized with ray
+    inputs = [input[-self.max_length :] for input in inputs]
outputs = self._generate(inputs, generate=False)

flat_index = 0
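A quick standalone sketch of the failure mode the new guard avoids (the values below are made up; per the comment in the diff, `self.max_length` can legitimately be None when the model is initialized with ray):

# Illustrative only: mimics the truncation logic above with hypothetical values.
inputs = [[101, 2023, 2003, 1037, 3231], [101, 2178, 5164]]
max_length = None  # unset, as can happen with ray initialization

# inputs = [input[-max_length:] for input in inputs]
# -> TypeError: bad operand type for unary -: 'NoneType'

if max_length:  # left-truncate only when a limit is actually known
    inputs = [input[-max_length:] for input in inputs]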
21 changes: 21 additions & 0 deletions src/lighteval/tasks/default_prompts.py
@@ -1862,6 +1862,27 @@ def mmlu_helm(line, task_name: str = None):
)


def mmlu_redux_2(line, topic, task_name: str = None):
    """
    Ref: https://arxiv.org/abs/2406.04127
    """
    query = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n"
    query += line["question"] + "\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
    query += "Answer: "

    # Handle answer format - MMLU-Redux-2 uses integer indices directly
    gold_ix = line["answer"] if isinstance(line["answer"], int) else int(line["answer"])

    return Doc(
        task_name=task_name,
        query=query,
        choices=LETTER_INDICES[: len(line["choices"])],
        gold_index=gold_ix,
        instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
    )


def mmlu_qa_abstract_algebra(line, task_name: str = None):
return mmlu_qa(line, "abstract_algebra", task_name)

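For reference, a minimal usage sketch of the new prompt function. The example row is invented; only the field names (question, choices, answer) and the call signature come from the diff above, and the import path assumes the src/lighteval package layout:

# Hypothetical usage sketch; the dataset row below is made up.
from lighteval.tasks.default_prompts import mmlu_redux_2

line = {
    "question": "What is the order of the group Z_4 x Z_6?",
    "choices": ["12", "24", "10", "4"],
    "answer": 1,  # integer index into choices
}

doc = mmlu_redux_2(line, topic="abstract_algebra", task_name="lighteval|mmlu_redux_2:abstract_algebra")
print(doc.query)       # few-shot style prompt ending in "Answer: "
print(doc.choices)     # ["A", "B", "C", "D"]
print(doc.gold_index)  # 1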
139 changes: 139 additions & 0 deletions src/lighteval/tasks/default_tasks.py
@@ -22727,3 +22727,142 @@
stop_sequence=["\n"],
version=0,
)

# MMLU-Redux-2 Tasks
_MMLU_REDUX_2_SUBSETS = [
"abstract_algebra",
"anatomy",
"astronomy",
"business_ethics",
"clinical_knowledge",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_medicine",
"college_physics",
"computer_security",
"conceptual_physics",
"econometrics",
"electrical_engineering",
"elementary_mathematics",
"formal_logic",
"global_facts",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_european_history",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_mathematics",
"high_school_microeconomics",
"high_school_physics",
"high_school_psychology",
"high_school_statistics",
"high_school_us_history",
"high_school_world_history",
"human_aging",
"human_sexuality",
"international_law",
"jurisprudence",
"logical_fallacies",
"machine_learning",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"moral_disputes",
"moral_scenarios",
"nutrition",
"philosophy",
"prehistory",
"professional_accounting",
"professional_law",
"professional_medicine",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
"virology",
"world_religions",
]

_mmlu_redux_2_tasks = {
    subset: LightevalTaskConfig(
        name=f"mmlu_redux_2:{subset}",
        suite=["lighteval"],
        prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name),
        hf_repo="edinburgh-dawg/mmlu-redux-2.0",
        hf_subset=subset,
        hf_avail_splits=["test"],
        evaluation_splits=["test"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=1,
        metrics=[Metrics.loglikelihood_acc],
        stop_sequence=["\n"],
        trust_dataset=True,
        version=0,
    )
    for subset in _MMLU_REDUX_2_SUBSETS
}

mmlu_redux_2_abstract_algebra = _mmlu_redux_2_tasks["abstract_algebra"]
mmlu_redux_2_anatomy = _mmlu_redux_2_tasks["anatomy"]
mmlu_redux_2_astronomy = _mmlu_redux_2_tasks["astronomy"]
mmlu_redux_2_business_ethics = _mmlu_redux_2_tasks["business_ethics"]
mmlu_redux_2_clinical_knowledge = _mmlu_redux_2_tasks["clinical_knowledge"]
mmlu_redux_2_college_biology = _mmlu_redux_2_tasks["college_biology"]
mmlu_redux_2_college_chemistry = _mmlu_redux_2_tasks["college_chemistry"]
mmlu_redux_2_college_computer_science = _mmlu_redux_2_tasks["college_computer_science"]
mmlu_redux_2_college_mathematics = _mmlu_redux_2_tasks["college_mathematics"]
mmlu_redux_2_college_medicine = _mmlu_redux_2_tasks["college_medicine"]
mmlu_redux_2_college_physics = _mmlu_redux_2_tasks["college_physics"]
mmlu_redux_2_computer_security = _mmlu_redux_2_tasks["computer_security"]
mmlu_redux_2_conceptual_physics = _mmlu_redux_2_tasks["conceptual_physics"]
mmlu_redux_2_econometrics = _mmlu_redux_2_tasks["econometrics"]
mmlu_redux_2_electrical_engineering = _mmlu_redux_2_tasks["electrical_engineering"]
mmlu_redux_2_elementary_mathematics = _mmlu_redux_2_tasks["elementary_mathematics"]
mmlu_redux_2_formal_logic = _mmlu_redux_2_tasks["formal_logic"]
mmlu_redux_2_global_facts = _mmlu_redux_2_tasks["global_facts"]
mmlu_redux_2_high_school_biology = _mmlu_redux_2_tasks["high_school_biology"]
mmlu_redux_2_high_school_chemistry = _mmlu_redux_2_tasks["high_school_chemistry"]
mmlu_redux_2_high_school_computer_science = _mmlu_redux_2_tasks["high_school_computer_science"]
mmlu_redux_2_high_school_european_history = _mmlu_redux_2_tasks["high_school_european_history"]
mmlu_redux_2_high_school_geography = _mmlu_redux_2_tasks["high_school_geography"]
mmlu_redux_2_high_school_government_and_politics = _mmlu_redux_2_tasks["high_school_government_and_politics"]
mmlu_redux_2_high_school_macroeconomics = _mmlu_redux_2_tasks["high_school_macroeconomics"]
mmlu_redux_2_high_school_mathematics = _mmlu_redux_2_tasks["high_school_mathematics"]
mmlu_redux_2_high_school_microeconomics = _mmlu_redux_2_tasks["high_school_microeconomics"]
mmlu_redux_2_high_school_physics = _mmlu_redux_2_tasks["high_school_physics"]
mmlu_redux_2_high_school_psychology = _mmlu_redux_2_tasks["high_school_psychology"]
mmlu_redux_2_high_school_statistics = _mmlu_redux_2_tasks["high_school_statistics"]
mmlu_redux_2_high_school_us_history = _mmlu_redux_2_tasks["high_school_us_history"]
mmlu_redux_2_high_school_world_history = _mmlu_redux_2_tasks["high_school_world_history"]
mmlu_redux_2_human_aging = _mmlu_redux_2_tasks["human_aging"]
mmlu_redux_2_human_sexuality = _mmlu_redux_2_tasks["human_sexuality"]
mmlu_redux_2_international_law = _mmlu_redux_2_tasks["international_law"]
mmlu_redux_2_jurisprudence = _mmlu_redux_2_tasks["jurisprudence"]
mmlu_redux_2_logical_fallacies = _mmlu_redux_2_tasks["logical_fallacies"]
mmlu_redux_2_machine_learning = _mmlu_redux_2_tasks["machine_learning"]
mmlu_redux_2_management = _mmlu_redux_2_tasks["management"]
mmlu_redux_2_marketing = _mmlu_redux_2_tasks["marketing"]
mmlu_redux_2_medical_genetics = _mmlu_redux_2_tasks["medical_genetics"]
mmlu_redux_2_miscellaneous = _mmlu_redux_2_tasks["miscellaneous"]
mmlu_redux_2_moral_disputes = _mmlu_redux_2_tasks["moral_disputes"]
mmlu_redux_2_moral_scenarios = _mmlu_redux_2_tasks["moral_scenarios"]
mmlu_redux_2_nutrition = _mmlu_redux_2_tasks["nutrition"]
mmlu_redux_2_philosophy = _mmlu_redux_2_tasks["philosophy"]
mmlu_redux_2_prehistory = _mmlu_redux_2_tasks["prehistory"]
mmlu_redux_2_professional_accounting = _mmlu_redux_2_tasks["professional_accounting"]
mmlu_redux_2_professional_law = _mmlu_redux_2_tasks["professional_law"]
mmlu_redux_2_professional_medicine = _mmlu_redux_2_tasks["professional_medicine"]
mmlu_redux_2_professional_psychology = _mmlu_redux_2_tasks["professional_psychology"]
mmlu_redux_2_public_relations = _mmlu_redux_2_tasks["public_relations"]
mmlu_redux_2_security_studies = _mmlu_redux_2_tasks["security_studies"]
mmlu_redux_2_sociology = _mmlu_redux_2_tasks["sociology"]
mmlu_redux_2_us_foreign_policy = _mmlu_redux_2_tasks["us_foreign_policy"]
mmlu_redux_2_virology = _mmlu_redux_2_tasks["virology"]
mmlu_redux_2_world_religions = _mmlu_redux_2_tasks["world_religions"]
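One detail worth flagging in the task definitions above: the lambda freezes the loop variable through a default argument (s=subset) instead of closing over it directly. A standalone sketch of why that matters (the names below are illustrative, not from the codebase):

# Late binding: every closure sees the final value of the comprehension variable.
late = {subset: (lambda: subset) for subset in ["anatomy", "virology"]}
print(late["anatomy"]())   # "virology" -- every entry points at the last subset

# Default-argument binding, as in the config above: the value is captured per iteration.
early = {subset: (lambda s=subset: s) for subset in ["anatomy", "virology"]}
print(early["anatomy"]())  # "anatomy"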