@inproceedings{babych-etal-2005-estimating,
title = "Estimating the predictive Power of N-gram {MT} Evaluation Metrics across Language and Text Types",
author = "Babych, Bogdan and
Hartley, Anthony and
Elliott, Debbie",
booktitle = "Proceedings of Machine Translation Summit X: Posters",
month = sep # " 13-15",
year = "2005",
address = "Phuket, Thailand",
url = "https://aclanthology.org/2005.mtsummit-posters.13/",
pages = "412--418",
abstract = "The use of n-gram metrics to evaluate the output of MT systems is widespread. Typically, they are used in system development, where an increase in the score is taken to represent an improvement in the output of the system. However, purchasers of MT systems or services are more concerned to know how well a score predicts the acceptability of the output to a reader-user. Moreover, they usually want to know if these predictions will hold across a range of target languages and text types. We describe an experiment involving human and automated evaluations of four MT systems across two text types and 23 language directions. It establishes that the correlation between human and automated scores is high, but that the predictive power of these scores depends crucially on target language and text type."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="babych-etal-2005-estimating">
<titleInfo>
<title>Estimating the predictive Power of N-gram MT Evaluation Metrics across Language and Text Types</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bogdan</namePart>
<namePart type="family">Babych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anthony</namePart>
<namePart type="family">Hartley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debbie</namePart>
<namePart type="family">Elliott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2005-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit X: Posters</title>
</titleInfo>
<originInfo>
<place>
<placeTerm type="text">Phuket, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The use of n-gram metrics to evaluate the output of MT systems is widespread. Typically, they are used in system development, where an increase in the score is taken to represent an improvement in the output of the system. However, purchasers of MT systems or services are more concerned to know how well a score predicts the acceptability of the output to a reader-user. Moreover, they usually want to know if these predictions will hold across a range of target languages and text types. We describe an experiment involving human and automated evaluations of four MT systems across two text types and 23 language directions. It establishes that the correlation between human and automated scores is high, but that the predictive power of these scores depends crucially on target language and text type.</abstract>
<identifier type="citekey">babych-etal-2005-estimating</identifier>
<location>
<url>https://aclanthology.org/2005.mtsummit-posters.13/</url>
</location>
<part>
<date>2005-09</date>
<extent unit="page">
<start>412</start>
<end>418</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Estimating the predictive Power of N-gram MT Evaluation Metrics across Language and Text Types
%A Babych, Bogdan
%A Hartley, Anthony
%A Elliott, Debbie
%S Proceedings of Machine Translation Summit X: Posters
%D 2005
%8 September 13-15
%C Phuket, Thailand
%F babych-etal-2005-estimating
%X The use of n-gram metrics to evaluate the output of MT systems is widespread. Typically, they are used in system development, where an increase in the score is taken to represent an improvement in the output of the system. However, purchasers of MT systems or services are more concerned to know how well a score predicts the acceptability of the output to a reader-user. Moreover, they usually want to know if these predictions will hold across a range of target languages and text types. We describe an experiment involving human and automated evaluations of four MT systems across two text types and 23 language directions. It establishes that the correlation between human and automated scores is high, but that the predictive power of these scores depends crucially on target language and text type.
%U https://aclanthology.org/2005.mtsummit-posters.13/
%P 412-418
Markdown (Informal)
[Estimating the predictive Power of N-gram MT Evaluation Metrics across Language and Text Types](https://aclanthology.org/2005.mtsummit-posters.13/) (Babych et al., MTSummit 2005)
ACL