Add InfiniteBench En.MC scenario by yifanmai · Pull Request #3687 · stanford-crfm/helm · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add InfiniteBench En.MC scenario #3687

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ entries: [
{description: "ruler_hotpotqa:max_num_words=131072", priority: 1},
{description: "ruler_squad:max_num_words=131072", priority: 1},
{description: "infinite_bench_en_qa:max_num_words=131072", priority: 1},
{description: "infinite_bench_en_mc:max_num_words=131072", priority: 1},
{description: "infinite_bench_en_sum:max_num_words=131072", priority: 1},
{description: "openai_mrcr:needles=8,max_num_words=131072", priority: 1},
]
49 changes: 48 additions & 1 deletion src/helm/benchmark/run_specs/long_context_run_specs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_CHAT,
ADAPT_GENERATION,
ADAPT_MULTIPLE_CHOICE_JOINT,
AdapterSpec,
)
from helm.benchmark.metrics.common_metric_specs import (
get_exact_match_metric_specs,
get_open_ended_generation_metric_specs,
Expand Down Expand Up @@ -29,6 +34,27 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
)


def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSpec:
    """Build a zero-shot joint multiple-choice adapter spec for long-context scenarios.

    All prompt prefixes/suffixes are kept empty so the long passage dominates the
    context window; the model is instructed to reply with a single choice letter.

    :param max_tokens: Maximum number of tokens the model may generate.
    """
    spec_kwargs = {
        "method": ADAPT_MULTIPLE_CHOICE_JOINT,
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Read the passage and answer the following question. Respond with only a single letter corresponding to your choice.",  # noqa: E501
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "",
        # Zero-shot: no in-context training examples.
        "max_train_instances": 0,
        "num_outputs": 1,
        # Deterministic decoding.
        "temperature": 0.0,
        "max_tokens": max_tokens,
        "stop_sequences": [],
    }
    return AdapterSpec(**spec_kwargs)


@run_spec_function("ruler_hotpotqa")
def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
scenario_spec = ScenarioSpec(
Expand Down Expand Up @@ -96,6 +122,27 @@ def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
)


@run_spec_function("infinite_bench_en_mc")
def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:
    """Run spec for ∞Bench En.MC: long-context multiple-choice QA over novels.

    :param max_num_words: Word budget used by the scenario to filter instances.
    """
    run_group = "infinite_bench_en_mc"
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.infinite_bench_en_mc_scenario.InfiniteBenchEnMCScenario",
        args={"max_num_words": max_num_words},
    )
    return RunSpec(
        name=f"{run_group}:max_num_words={max_num_words}",
        scenario_spec=scenario_spec,
        # Answers are single letters, so a small generation budget suffices.
        adapter_spec=_get_long_context_multiple_choice_adapter_spec(max_tokens=40),
        metric_specs=get_exact_match_metric_specs(),
        groups=[run_group],
    )


@run_spec_function("infinite_bench_en_sum")
def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

Expand Down
90 changes: 90 additions & 0 deletions src/helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import re
from typing import List

from datasets import load_dataset, Features, Value, Sequence, Dataset

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Input,
Reference,
Output,
CORRECT_TAG,
TEST_SPLIT,
)
from helm.common.general import ensure_directory_exists


class InfiniteBenchEnMCScenario(Scenario):
    """InfiniteBench En.MC

    InfiniteBench benchmarks language models on processing, understanding, and
    reasoning over very long contexts (100k+ tokens). The En.MC subset asks
    multiple-choice questions about novels that require long-range dependency
    and reasoning rather than simple short-passage retrieval.
    """

    name = "infinite_bench_en_mc"
    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
    tags = ["question_answering"]

    def __init__(self, max_num_words: int):
        # Maximum combined word count (context + question + options) per instance.
        self.max_num_words = max_num_words
        super().__init__()

    @staticmethod
    def _num_words(text: str) -> int:
        # Whitespace-delimited word count used for the length budget.
        return len(re.split(r"\s+", text.strip()))

    def _within_word_budget(self, example) -> bool:
        # Keep only examples whose full prompt material fits the word budget.
        total = (
            self._num_words(example["context"])
            + self._num_words(example["input"])
            + sum(self._num_words(option) for option in example["options"])
        )
        return total <= self.max_num_words

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download InfiniteBench En.MC from HuggingFace and convert rows to Instances."""
        data_dir = os.path.join(output_path, "data")
        ensure_directory_exists(data_dir)

        # Explicit schema: the hub dataset mixes subsets with differing columns.
        features = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )

        # Pin the revision for reproducibility.
        dataset = load_dataset(
            "xinrongzhang2022/InfiniteBench",
            split="longbook_choice_eng",
            features=features,
            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
        )
        assert isinstance(dataset, Dataset)

        dataset = dataset.filter(self._within_word_budget)

        instances: List[Instance] = []
        for row in dataset:
            # Multiple-choice rows are expected to carry exactly one gold answer.
            assert len(row["answer"]) == 1
            correct_answer = row["answer"][0]
            references = [
                Reference(Output(text=option), tags=[CORRECT_TAG] if option == correct_answer else [])
                for option in row["options"]
            ]
            instances.append(
                Instance(
                    # NOTE(review): dataset ids are int64 — confirm Instance.id accepts non-str ids.
                    id=row["id"],
                    input=Input(text=row["context"] + "\n\n" + row["input"]),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class InfiniteBenchEnQAScenario(Scenario):
"""

name = "infinite_bench_en_qa"
description = "∞Bench En.QA is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
tags = ["question_answering"]

def __init__(self, max_num_words: int):
Expand Down
20 changes: 19 additions & 1 deletion src/helm/benchmark/static/schema_long_context.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ run_groups:
- ruler_squad
- infinite_bench_en_sum
- infinite_bench_en_qa
- infinite_bench_en_mc
- openai_mrcr

- name: ruler_hotpotqa
Expand Down Expand Up @@ -234,7 +235,7 @@ run_groups:

- name: infinite_bench_en_qa
display_name: ∞Bench En.QA
description: ∞Bench En.QA is a question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
description: ∞Bench En.QA is an open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
metric_groups:
- accuracy
- general_information
Expand All @@ -249,6 +250,23 @@ run_groups:
when: Before 2024
language: English

- name: infinite_bench_en_mc
display_name: ∞Bench En.MC
description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
metric_groups:
- accuracy
- general_information
- annotation_metrics
environment:
main_name: exact_match
main_split: test
taxonomy:
task: multiple-choice question answering
what: Novels
who: Novel authors
when: Before 2024
language: English

- name: infinite_bench_en_sum
display_name: ∞Bench En.Sum
description: ∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
Expand Down
0