Add InfiniteBench En.MC scenario by yifanmai · Pull Request #3687 · stanford-crfm/helm · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add InfiniteBench En.MC scenario #3687

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ entries: [
{description: "ruler_hotpotqa:max_num_words=131072", priority: 1},
{description: "ruler_squad:max_num_words=131072", priority: 1},
{description: "infinite_bench_en_qa:max_num_words=131072", priority: 1},
{description: "infinite_bench_en_mc:max_num_words=131072", priority: 1},
{description: "infinite_bench_en_sum:max_num_words=131072", priority: 1},
{description: "openai_mrcr:needles=8,max_num_words=131072", priority: 1},
]
49 changes: 48 additions & 1 deletion src/helm/benchmark/run_specs/long_context_run_specs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_CHAT,
ADAPT_GENERATION,
ADAPT_MULTIPLE_CHOICE_JOINT,
AdapterSpec,
)
from helm.benchmark.metrics.common_metric_specs import (
get_exact_match_metric_specs,
get_open_ended_generation_metric_specs,
Expand Down Expand Up @@ -29,6 +34,27 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
)


def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
    """Build a zero-shot joint multiple-choice adapter spec for long-context scenarios.

    All prompt prefixes/suffixes are kept empty so the long passage dominates the
    context window; the model is instructed to reply with a single choice letter.

    :param max_tokens: Maximum number of tokens the model may generate.
    """
    spec_kwargs = {
        "method": ADAPT_MULTIPLE_CHOICE_JOINT,
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Read the passage and answer the following question. Respond with only a single letter corresponding to your choice.",  # noqa: E501
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "",
        # Zero-shot: no in-context training examples.
        "max_train_instances": 0,
        "num_outputs": 1,
        # Deterministic decoding.
        "temperature": 0.0,
        "max_tokens": max_tokens,
        "stop_sequences": [],
    }
    return AdapterSpec(**spec_kwargs)


@run_spec_function("ruler_hotpotqa")
def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
scenario_spec = ScenarioSpec(
Expand Down Expand Up @@ -96,6 +122,27 @@ def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
)


@run_spec_function("infinite_bench_en_mc")
def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:
    """Run spec for ∞Bench En.MC: long-context multiple-choice QA over novels.

    :param max_num_words: Word budget used by the scenario to filter instances.
    """
    run_group = "infinite_bench_en_mc"
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.infinite_bench_en_mc_scenario.InfiniteBenchEnMCScenario",
        args={"max_num_words": max_num_words},
    )
    return RunSpec(
        name=f"{run_group}:max_num_words={max_num_words}",
        scenario_spec=scenario_spec,
        # Answers are single letters, so a small generation budget suffices.
        adapter_spec=_get_long_context_multiple_choice_adapter_spec(max_tokens=40),
        metric_specs=get_exact_match_metric_specs(),
        groups=[run_group],
    )


@run_spec_function("infinite_bench_en_sum")
def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

Expand Down
90 changes: 90 additions & 0 deletions src/helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import re
from typing import List

from datasets import load_dataset, Features, Value, Sequence, Dataset

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Input,
Reference,
Output,
CORRECT_TAG,
TEST_SPLIT,
)
from helm.common.general import ensure_directory_exists


class InfiniteBenchEnMCScenario(Scenario):
    """InfiniteBench En.MC

    InfiniteBench benchmarks language models on processing, understanding, and
    reasoning over very long contexts (100k+ tokens). The En.MC subset asks
    multiple-choice questions about novels that require long-range dependency
    and reasoning rather than simple short-passage retrieval.
    """

    name = "infinite_bench_en_mc"
    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
    tags = ["question_answering"]

    def __init__(self, max_num_words: int):
        # Maximum combined word count (context + question + options) per instance.
        self.max_num_words = max_num_words
        super().__init__()

    @staticmethod
    def _num_words(text: str) -> int:
        # Whitespace-delimited word count used for the length budget.
        return len(re.split(r"\s+", text.strip()))

    def _within_word_budget(self, example) -> bool:
        # Keep only examples whose full prompt material fits the word budget.
        total = (
            self._num_words(example["context"])
            + self._num_words(example["input"])
            + sum(self._num_words(option) for option in example["options"])
        )
        return total <= self.max_num_words

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download InfiniteBench En.MC from HuggingFace and convert rows to Instances."""
        data_dir = os.path.join(output_path, "data")
        ensure_directory_exists(data_dir)

        # Explicit schema: the hub dataset mixes subsets with differing columns.
        features = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )

        # Pin the revision for reproducibility.
        dataset = load_dataset(
            "xinrongzhang2022/InfiniteBench",
            split="longbook_choice_eng",
            features=features,
            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
        )
        assert isinstance(dataset, Dataset)

        dataset = dataset.filter(self._within_word_budget)

        instances: List[Instance] = []
        for row in dataset:
            # Multiple-choice rows are expected to carry exactly one gold answer.
            assert len(row["answer"]) == 1
            correct_answer = row["answer"][0]
            references = [
                Reference(Output(text=option), tags=[CORRECT_TAG] if option == correct_answer else [])
                for option in row["options"]
            ]
            instances.append(
                Instance(
                    # NOTE(review): dataset ids are int64 — confirm Instance.id accepts non-str ids.
                    id=row["id"],
                    input=Input(text=row["context"] + "\n\n" + row["input"]),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class InfiniteBenchEnQAScenario(Scenario):
"""

name = "infinite_bench_en_qa"
description = "∞Bench En.QA is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
tags = ["question_answering"]

def __init__(self, max_num_words: int):
Expand Down
20 changes: 19 additions & 1 deletion src/helm/benchmark/static/schema_long_context.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ run_groups:
- ruler_squad
- infinite_bench_en_sum
- infinite_bench_en_qa
- infinite_bench_en_mc
- openai_mrcr

- name: ruler_hotpotqa
Expand Down Expand Up @@ -234,7 +235,7 @@ run_groups:

- name: infinite_bench_en_qa
display_name: ∞Bench En.QA
description: ∞Bench En.QA is a question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
description: ∞Bench En.QA is an open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
metric_groups:
- accuracy
- general_information
Expand All @@ -249,6 +250,23 @@ run_groups:
when: Before 2024
language: English

- name: infinite_bench_en_mc
display_name: ∞Bench En.MC
description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
metric_groups:
- accuracy
- general_information
- annotation_metrics
environment:
main_name: exact_match
main_split: test
taxonomy:
task: multiple-choice question answering
what: Novels
who: Novel authors
when: Before 2024
language: English

- name: infinite_bench_en_sum
display_name: ∞Bench En.Sum
description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
Expand Down
0