@inproceedings{kottur-etal-2021-analysis,
title = "An Analysis of State-of-the-Art Models for Situated Interactive {M}ulti{M}odal Conversations ({SIMMC})",
author = "Kottur, Satwik and
Crook, Paul and
Moon, Seungwhan and
Beirami, Ahmad and
Cho, Eunjoon and
Subba, Rajen and
Geramifard, Alborz",
editor = "Li, Haizhou and
Levow, Gina-Anne and
Yu, Zhou and
Gupta, Chitralekha and
Sisman, Berrak and
Cai, Siqi and
Vandyke, David and
Dethlefs, Nina and
Wu, Yan and
Li, Junyi Jessy",
booktitle = "Proceedings of the 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = jul,
year = "2021",
address = "Singapore and Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.sigdial-1.15/",
doi = "10.18653/v1/2021.sigdial-1.15",
pages = "144--153",
abstract = "There is a growing interest in virtual assistants with multimodal capabilities, e.g., inferring the context of a conversation through scene understanding. The recently released situated and interactive multimodal conversations (SIMMC) dataset addresses this trend by enabling research to create virtual assistants, which are capable of taking into account the scene that user sees when conversing with the user and also interacting with items in the scene. The SIMMC dataset is novel in that it contains fully annotated user-assistant, task-orientated dialogs where the user and an assistant co-observe the same visual elements and the latter can take actions to update the scene. The SIMMC challenge, held as part of theNinth Dialog System Technology Challenge(DSTC9), propelled the development of various models which together set a new state-of-the-art on the SIMMC dataset. In this work, we compare and analyze these models to identify{\textquoteleft}what worked?', and the remaining gaps;{\textquoteleft}whatnext?'. Our analysis shows that even though pretrained language models adapted to this set-ting show great promise, there are indications that multimodal context isn`t fully utilised, and there is a need for better and scalable knowledge base integration. We hope this first-of-its-kind analysis for SIMMC models provides useful insights and opportunities for further research in multimodal conversational agents"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kottur-etal-2021-analysis">
<titleInfo>
<title>An Analysis of State-of-the-Art Models for Situated Interactive MultiModal Conversations (SIMMC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Satwik</namePart>
<namePart type="family">Kottur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Crook</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seungwhan</namePart>
<namePart type="family">Moon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmad</namePart>
<namePart type="family">Beirami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunjoon</namePart>
<namePart type="family">Cho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajen</namePart>
<namePart type="family">Subba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alborz</namePart>
<namePart type="family">Geramifard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haizhou</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gina-Anne</namePart>
<namePart type="family">Levow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chitralekha</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Berrak</namePart>
<namePart type="family">Sisman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siqi</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Vandyke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nina</namePart>
<namePart type="family">Dethlefs</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyi</namePart>
<namePart type="given">Jessy</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore and Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>There is a growing interest in virtual assistants with multimodal capabilities, e.g., inferring the context of a conversation through scene understanding. The recently released situated and interactive multimodal conversations (SIMMC) dataset addresses this trend by enabling research to create virtual assistants, which are capable of taking into account the scene that user sees when conversing with the user and also interacting with items in the scene. The SIMMC dataset is novel in that it contains fully annotated user-assistant, task-orientated dialogs where the user and an assistant co-observe the same visual elements and the latter can take actions to update the scene. The SIMMC challenge, held as part of theNinth Dialog System Technology Challenge(DSTC9), propelled the development of various models which together set a new state-of-the-art on the SIMMC dataset. In this work, we compare and analyze these models to identify‘what worked?’, and the remaining gaps;‘whatnext?’. Our analysis shows that even though pretrained language models adapted to this set-ting show great promise, there are indications that multimodal context isn‘t fully utilised, and there is a need for better and scalable knowledge base integration. We hope this first-of-its-kind analysis for SIMMC models provides useful insights and opportunities for further research in multimodal conversational agents</abstract>
<identifier type="citekey">kottur-etal-2021-analysis</identifier>
<identifier type="doi">10.18653/v1/2021.sigdial-1.15</identifier>
<location>
<url>https://aclanthology.org/2021.sigdial-1.15/</url>
</location>
<part>
<date>2021-07</date>
<extent unit="page">
<start>144</start>
<end>153</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Analysis of State-of-the-Art Models for Situated Interactive MultiModal Conversations (SIMMC)
%A Kottur, Satwik
%A Crook, Paul
%A Moon, Seungwhan
%A Beirami, Ahmad
%A Cho, Eunjoon
%A Subba, Rajen
%A Geramifard, Alborz
%Y Li, Haizhou
%Y Levow, Gina-Anne
%Y Yu, Zhou
%Y Gupta, Chitralekha
%Y Sisman, Berrak
%Y Cai, Siqi
%Y Vandyke, David
%Y Dethlefs, Nina
%Y Wu, Yan
%Y Li, Junyi Jessy
%S Proceedings of the 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2021
%8 July
%I Association for Computational Linguistics
%C Singapore and Online
%F kottur-etal-2021-analysis
%X There is a growing interest in virtual assistants with multimodal capabilities, e.g., inferring the context of a conversation through scene understanding. The recently released situated and interactive multimodal conversations (SIMMC) dataset addresses this trend by enabling research to create virtual assistants, which are capable of taking into account the scene that user sees when conversing with the user and also interacting with items in the scene. The SIMMC dataset is novel in that it contains fully annotated user-assistant, task-orientated dialogs where the user and an assistant co-observe the same visual elements and the latter can take actions to update the scene. The SIMMC challenge, held as part of theNinth Dialog System Technology Challenge(DSTC9), propelled the development of various models which together set a new state-of-the-art on the SIMMC dataset. In this work, we compare and analyze these models to identify‘what worked?’, and the remaining gaps;‘whatnext?’. Our analysis shows that even though pretrained language models adapted to this set-ting show great promise, there are indications that multimodal context isn‘t fully utilised, and there is a need for better and scalable knowledge base integration. We hope this first-of-its-kind analysis for SIMMC models provides useful insights and opportunities for further research in multimodal conversational agents
%R 10.18653/v1/2021.sigdial-1.15
%U https://aclanthology.org/2021.sigdial-1.15/
%U https://doi.org/10.18653/v1/2021.sigdial-1.15
%P 144-153
Markdown (Informal)
[An Analysis of State-of-the-Art Models for Situated Interactive MultiModal Conversations (SIMMC)](https://aclanthology.org/2021.sigdial-1.15/) (Kottur et al., SIGDIAL 2021)
ACL