@inproceedings{lin-etal-2024-preserve,
title = "To Preserve or To Compress: An In-Depth Study of Connector Selection in Multimodal Large Language Models",
author = "Lin, Junyan and
Chen, Haoran and
Zhu, Dawei and
Shen, Xiaoyu",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.325",
doi = "10.18653/v1/2024.emnlp-main.325",
pages = "5666--5680",
abstract = "In recent years, multimodal large language models (MLLMs) have attracted widespread attention from both industry and academia. Based on the integration position, MLLMs can be categorized into external and internal fusion architectures, with the former being more predominant. However, there remains considerable debate on how to construct the optimal external fusion MLLM architecture, especially regarding the performance of different connectors on tasks with varying granularities. This paper systematically investigates the impact of connectors on MLLM performance. Specifically, we classify connectors into feature-preserving and feature-compressing types. Utilizing a unified classification standard, we categorize sub-tasks from three comprehensive benchmarks, MMBench, MME, and SEED-Bench, into three task types: coarse-grained perception, fine-grained perception, and reasoning, and evaluate the performance from this perspective. Our findings reveal significant performance differences between different types of connectors across various tasks, offering essential guidance for MLLM architecture design and advancing the understanding of MLLM architecture optimization.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lin-etal-2024-preserve">
<titleInfo>
<title>To Preserve or To Compress: An In-Depth Study of Connector Selection in Multimodal Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junyan</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoran</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dawei</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoyu</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, multimodal large language models (MLLMs) have attracted widespread attention from both industry and academia. Based on the integration position, MLLMs can be categorized into external and internal fusion architectures, with the former being more predominant. However, there remains considerable debate on how to construct the optimal external fusion MLLM architecture, especially regarding the performance of different connectors on tasks with varying granularities. This paper systematically investigates the impact of connectors on MLLM performance. Specifically, we classify connectors into feature-preserving and feature-compressing types. Utilizing a unified classification standard, we categorize sub-tasks from three comprehensive benchmarks, MMBench, MME, and SEED-Bench, into three task types: coarse-grained perception, fine-grained perception, and reasoning, and evaluate the performance from this perspective. Our findings reveal significant performance differences between different types of connectors across various tasks, offering essential guidance for MLLM architecture design and advancing the understanding of MLLM architecture optimization.</abstract>
<identifier type="citekey">lin-etal-2024-preserve</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.325</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.325</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>5666</start>
<end>5680</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T To Preserve or To Compress: An In-Depth Study of Connector Selection in Multimodal Large Language Models
%A Lin, Junyan
%A Chen, Haoran
%A Zhu, Dawei
%A Shen, Xiaoyu
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F lin-etal-2024-preserve
%X In recent years, multimodal large language models (MLLMs) have attracted widespread attention from both industry and academia. Based on the integration position, MLLMs can be categorized into external and internal fusion architectures, with the former being more predominant. However, there remains considerable debate on how to construct the optimal external fusion MLLM architecture, especially regarding the performance of different connectors on tasks with varying granularities. This paper systematically investigates the impact of connectors on MLLM performance. Specifically, we classify connectors into feature-preserving and feature-compressing types. Utilizing a unified classification standard, we categorize sub-tasks from three comprehensive benchmarks, MMBench, MME, and SEED-Bench, into three task types: coarse-grained perception, fine-grained perception, and reasoning, and evaluate the performance from this perspective. Our findings reveal significant performance differences between different types of connectors across various tasks, offering essential guidance for MLLM architecture design and advancing the understanding of MLLM architecture optimization.
%R 10.18653/v1/2024.emnlp-main.325
%U https://aclanthology.org/2024.emnlp-main.325
%U https://doi.org/10.18653/v1/2024.emnlp-main.325
%P 5666-5680
Markdown (Informal)
[To Preserve or To Compress: An In-Depth Study of Connector Selection in Multimodal Large Language Models](https://aclanthology.org/2024.emnlp-main.325) (Lin et al., EMNLP 2024)
ACL