@inproceedings{post-junczys-dowmunt-2024-evaluation,
title = "Evaluation and Large-scale Training for Contextual Machine Translation",
author = "Post, Matt and
Junczys-Dowmunt, Marcin",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.112",
doi = "10.18653/v1/2024.wmt-1.112",
pages = "1125--1139",
abstract = "Despite the fact that context is known to be vital for resolving a range of translation ambiguities, most traditional machine translation systems continue to be trained and to operate at the sentence level. A common explanation is the lack of document-level annotations for existing training data. This work investigates whether having such annotations would be helpful for training traditional MT systems at scale. We build large-scale, state-of-the-art contextual MT systems into German, French, and Russian, fixing the datasets while comparing the effect of sourcing contextual training samples from both parallel and back-translated data. We then evaluate these contextual models across a range of contextual test sets from the literature, where we find that (a) document annotations from both mined parallel and back-translated monolingual data are helpful, but that the best contextual MT systems do not draw contextual samples from the parallel data. We also make two points related to evaluation: (b) contrastive score-based metrics on challenge sets are not discriminative; instead, models must be tested directly on their ability to generate correct outputs, and (c) standard corpus-level metrics such as COMET work best in settings that are dense in contextual phenomena.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="post-junczys-dowmunt-2024-evaluation">
<titleInfo>
<title>Evaluation and Large-scale Training for Contextual Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matt</namePart>
<namePart type="family">Post</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcin</namePart>
<namePart type="family">Junczys-Dowmunt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite the fact that context is known to be vital for resolving a range of translation ambiguities, most traditional machine translation systems continue to be trained and to operate at the sentence level. A common explanation is the lack of document-level annotations for existing training data. This work investigates whether having such annotations would be helpful for training traditional MT systems at scale. We build large-scale, state-of-the-art contextual MT systems into German, French, and Russian, fixing the datasets while comparing the effect of sourcing contextual training samples from both parallel and back-translated data. We then evaluate these contextual models across a range of contextual test sets from the literature, where we find that (a) document annotations from both mined parallel and back-translated monolingual data are helpful, but that the best contextual MT systems do not draw contextual samples from the parallel data. We also make two points related to evaluation: (b) contrastive score-based metrics on challenge sets are not discriminative; instead, models must be tested directly on their ability to generate correct outputs, and (c) standard corpus-level metrics such as COMET work best in settings that are dense in contextual phenomena.</abstract>
<identifier type="citekey">post-junczys-dowmunt-2024-evaluation</identifier>
<identifier type="doi">10.18653/v1/2024.wmt-1.112</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.112</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1125</start>
<end>1139</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluation and Large-scale Training for Contextual Machine Translation
%A Post, Matt
%A Junczys-Dowmunt, Marcin
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F post-junczys-dowmunt-2024-evaluation
%X Despite the fact that context is known to be vital for resolving a range of translation ambiguities, most traditional machine translation systems continue to be trained and to operate at the sentence level. A common explanation is the lack of document-level annotations for existing training data. This work investigates whether having such annotations would be helpful for training traditional MT systems at scale. We build large-scale, state-of-the-art contextual MT systems into German, French, and Russian, fixing the datasets while comparing the effect of sourcing contextual training samples from both parallel and back-translated data. We then evaluate these contextual models across a range of contextual test sets from the literature, where we find that (a) document annotations from both mined parallel and back-translated monolingual data are helpful, but that the best contextual MT systems do not draw contextual samples from the parallel data. We also make two points related to evaluation: (b) contrastive score-based metrics on challenge sets are not discriminative; instead, models must be tested directly on their ability to generate correct outputs, and (c) standard corpus-level metrics such as COMET work best in settings that are dense in contextual phenomena.
%R 10.18653/v1/2024.wmt-1.112
%U https://aclanthology.org/2024.wmt-1.112
%U https://doi.org/10.18653/v1/2024.wmt-1.112
%P 1125-1139
Markdown (Informal)
[Evaluation and Large-scale Training for Contextual Machine Translation](https://aclanthology.org/2024.wmt-1.112) (Post & Junczys-Dowmunt, WMT 2024)
ACL