@inproceedings{chung-etal-2024-selective,
title = "Selective Vision is the Challenge for Visual Reasoning: A Benchmark for Visual Argument Understanding",
author = "Chung, Jiwan and
Lee, Sungjae and
Kim, Minseo and
Han, Seungju and
Yousefpour, Ashkan and
Hessel, Jack and
Yu, Youngjae",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.143/",
doi = "10.18653/v1/2024.emnlp-main.143",
pages = "2423--2451",
abstract = "Visual arguments, often used in advertising or social causes, rely on images to persuade viewers to do or believe something. Understanding these arguments requires selective vision: only specific visual stimuli within an image are relevant to the argument, and relevance can only be understood within the context of a broader argumentative structure. While visual arguments are readily appreciated by human audiences, we ask: are today`s AI capable of similar understanding?We present VisArgs, a dataset of 1,611 images annotated with 5,112 visual premises (with regions), 5,574 commonsense premises, and reasoning trees connecting them into structured arguments. We propose three tasks for evaluating visual argument understanding: premise localization, premise identification, and conclusion deduction.Experiments show that 1) machines struggle to capture visual cues: GPT-4-O achieved 78.5{\%} accuracy, while humans reached 98.0{\%}. Models also performed 19.5{\%} worse when distinguishing between irrelevant objects within the image compared to external objects. 2) Providing relevant visual premises improved model performance significantly."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chung-etal-2024-selective">
<titleInfo>
<title>Selective Vision is the Challenge for Visual Reasoning: A Benchmark for Visual Argument Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiwan</namePart>
<namePart type="family">Chung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungjae</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minseo</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seungju</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashkan</namePart>
<namePart type="family">Yousefpour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Hessel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youngjae</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Visual arguments, often used in advertising or social causes, rely on images to persuade viewers to do or believe something. Understanding these arguments requires selective vision: only specific visual stimuli within an image are relevant to the argument, and relevance can only be understood within the context of a broader argumentative structure. While visual arguments are readily appreciated by human audiences, we ask: are today's AI capable of similar understanding? We present VisArgs, a dataset of 1,611 images annotated with 5,112 visual premises (with regions), 5,574 commonsense premises, and reasoning trees connecting them into structured arguments. We propose three tasks for evaluating visual argument understanding: premise localization, premise identification, and conclusion deduction. Experiments show that 1) machines struggle to capture visual cues: GPT-4-O achieved 78.5% accuracy, while humans reached 98.0%. Models also performed 19.5% worse when distinguishing between irrelevant objects within the image compared to external objects. 2) Providing relevant visual premises improved model performance significantly.</abstract>
<identifier type="citekey">chung-etal-2024-selective</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.143</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.143/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>2423</start>
<end>2451</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Selective Vision is the Challenge for Visual Reasoning: A Benchmark for Visual Argument Understanding
%A Chung, Jiwan
%A Lee, Sungjae
%A Kim, Minseo
%A Han, Seungju
%A Yousefpour, Ashkan
%A Hessel, Jack
%A Yu, Youngjae
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F chung-etal-2024-selective
%X Visual arguments, often used in advertising or social causes, rely on images to persuade viewers to do or believe something. Understanding these arguments requires selective vision: only specific visual stimuli within an image are relevant to the argument, and relevance can only be understood within the context of a broader argumentative structure. While visual arguments are readily appreciated by human audiences, we ask: are today's AI capable of similar understanding? We present VisArgs, a dataset of 1,611 images annotated with 5,112 visual premises (with regions), 5,574 commonsense premises, and reasoning trees connecting them into structured arguments. We propose three tasks for evaluating visual argument understanding: premise localization, premise identification, and conclusion deduction. Experiments show that 1) machines struggle to capture visual cues: GPT-4-O achieved 78.5% accuracy, while humans reached 98.0%. Models also performed 19.5% worse when distinguishing between irrelevant objects within the image compared to external objects. 2) Providing relevant visual premises improved model performance significantly.
%R 10.18653/v1/2024.emnlp-main.143
%U https://aclanthology.org/2024.emnlp-main.143/
%U https://doi.org/10.18653/v1/2024.emnlp-main.143
%P 2423-2451
Markdown (Informal)
[Selective Vision is the Challenge for Visual Reasoning: A Benchmark for Visual Argument Understanding](https://aclanthology.org/2024.emnlp-main.143/) (Chung et al., EMNLP 2024)
ACL
Jiwan Chung, Sungjae Lee, Minseo Kim, Seungju Han, Ashkan Yousefpour, Jack Hessel, and Youngjae Yu. 2024. Selective Vision is the Challenge for Visual Reasoning: A Benchmark for Visual Argument Understanding. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 2423–2451, Miami, Florida, USA. Association for Computational Linguistics.