@inproceedings{bowen-etal-2024-comprehensive,
title = "A Comprehensive Evaluation of Inductive Reasoning Capabilities and Problem Solving in Large Language Models",
author = "Bowen, Chen and
S{\ae}tre, Rune and
Miyao, Yusuke",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.22",
pages = "323--339",
abstract = "Inductive reasoning is fundamental to both human and artificial intelligence. The inductive reasoning abilities of current Large Language Models (LLMs) are evaluated in this research.We argue that only considering induction of rules is too narrow and unrealistic, since inductive reasoning is usually mixed with other abilities, like rules application, results/rules validation, and updated information integration.We probed the LLMs with a set of designed symbolic tasks and found that even state-of-the-art (SotA) LLMs fail significantly, showing the inability of LLMs to perform these intuitively simple tasks.Furthermore, we found that perfect accuracy in a small-size problem does not guarantee the same accuracy in a larger-size version of the same problem, provoking the question of how we can assess the LLMs{'} actual problem-solving capabilities.We also argue that Chain-of-Thought prompts help the LLMs by decomposing the problem-solving process, but the LLMs still learn limitedly.Furthermore, we reveal that few-shot examples assist LLM generalization in out-of-domain (OOD) cases, albeit limited. The LLM starts to fail when the problem deviates from the provided few-shot examples.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bowen-etal-2024-comprehensive">
<titleInfo>
<title>A Comprehensive Evaluation of Inductive Reasoning Capabilities and Problem Solving in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Bowen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rune</namePart>
<namePart type="family">Sætre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Miyao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Inductive reasoning is fundamental to both human and artificial intelligence. The inductive reasoning abilities of current Large Language Models (LLMs) are evaluated in this research.We argue that only considering induction of rules is too narrow and unrealistic, since inductive reasoning is usually mixed with other abilities, like rules application, results/rules validation, and updated information integration.We probed the LLMs with a set of designed symbolic tasks and found that even state-of-the-art (SotA) LLMs fail significantly, showing the inability of LLMs to perform these intuitively simple tasks.Furthermore, we found that perfect accuracy in a small-size problem does not guarantee the same accuracy in a larger-size version of the same problem, provoking the question of how we can assess the LLMs’ actual problem-solving capabilities.We also argue that Chain-of-Thought prompts help the LLMs by decomposing the problem-solving process, but the LLMs still learn limitedly.Furthermore, we reveal that few-shot examples assist LLM generalization in out-of-domain (OOD) cases, albeit limited. The LLM starts to fail when the problem deviates from the provided few-shot examples.</abstract>
<identifier type="citekey">bowen-etal-2024-comprehensive</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.22</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>323</start>
<end>339</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Comprehensive Evaluation of Inductive Reasoning Capabilities and Problem Solving in Large Language Models
%A Bowen, Chen
%A Sætre, Rune
%A Miyao, Yusuke
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F bowen-etal-2024-comprehensive
%X Inductive reasoning is fundamental to both human and artificial intelligence. The inductive reasoning abilities of current Large Language Models (LLMs) are evaluated in this research.We argue that only considering induction of rules is too narrow and unrealistic, since inductive reasoning is usually mixed with other abilities, like rules application, results/rules validation, and updated information integration.We probed the LLMs with a set of designed symbolic tasks and found that even state-of-the-art (SotA) LLMs fail significantly, showing the inability of LLMs to perform these intuitively simple tasks.Furthermore, we found that perfect accuracy in a small-size problem does not guarantee the same accuracy in a larger-size version of the same problem, provoking the question of how we can assess the LLMs’ actual problem-solving capabilities.We also argue that Chain-of-Thought prompts help the LLMs by decomposing the problem-solving process, but the LLMs still learn limitedly.Furthermore, we reveal that few-shot examples assist LLM generalization in out-of-domain (OOD) cases, albeit limited. The LLM starts to fail when the problem deviates from the provided few-shot examples.
%U https://aclanthology.org/2024.findings-eacl.22
%P 323-339
Markdown (Informal)
[A Comprehensive Evaluation of Inductive Reasoning Capabilities and Problem Solving in Large Language Models](https://aclanthology.org/2024.findings-eacl.22) (Bowen et al., Findings 2024)
ACL