@inproceedings{simard-2014-clean,
title = "Clean data for training statistical {MT}: the case of {MT} contamination",
author = "Simard, Michel",
editor = "Al-Onaizan, Yaser and
Simard, Michel",
booktitle = "Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track",
month = oct # " 22-26",
year = "2014",
address = "Vancouver, Canada",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2014.amta-researchers.6/",
pages = "69--82",
abstract = "Users of Statistical Machine Translation (SMT) sometimes turn to the Web to obtain data to train their systems. One problem with this approach is the potential for {\textquotedblleft}MT contamination{\textquotedblright}: when large amounts of parallel data are collected automatically, there is a risk that a non-negligible portion consists of machine-translated text. Theoretically, using this kind of data to train SMT systems is likely to reinforce the errors committed by other systems, or even by an earlier version of the same system. In this paper, we study the effect of MT-contaminated training data on SMT quality, by performing controlled simulations under a wide range of conditions. Our experiments highlight situations in which MT contamination can be harmful, and assess the potential of decontamination techniques."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="simard-2014-clean">
<titleInfo>
<title>Clean data for training statistical MT: the case of MT contamination</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michel</namePart>
<namePart type="family">Simard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-oct 22-26</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michel</namePart>
<namePart type="family">Simard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Vancouver, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Users of Statistical Machine Translation (SMT) sometimes turn to the Web to obtain data to train their systems. One problem with this approach is the potential for “MT contamination”: when large amounts of parallel data are collected automatically, there is a risk that a non-negligible portion consists of machine-translated text. Theoretically, using this kind of data to train SMT systems is likely to reinforce the errors committed by other systems, or even by an earlier version of the same system. In this paper, we study the effect of MT-contaminated training data on SMT quality, by performing controlled simulations under a wide range of conditions. Our experiments highlight situations in which MT contamination can be harmful, and assess the potential of decontamination techniques.</abstract>
<identifier type="citekey">simard-2014-clean</identifier>
<location>
<url>https://aclanthology.org/2014.amta-researchers.6/</url>
</location>
<part>
<date>2014-oct 22-26</date>
<extent unit="page">
<start>69</start>
<end>82</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Clean data for training statistical MT: the case of MT contamination
%A Simard, Michel
%Y Al-Onaizan, Yaser
%Y Simard, Michel
%S Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track
%D 2014
%8 oct 22-26
%I Association for Machine Translation in the Americas
%C Vancouver, Canada
%F simard-2014-clean
%X Users of Statistical Machine Translation (SMT) sometimes turn to the Web to obtain data to train their systems. One problem with this approach is the potential for “MT contamination”: when large amounts of parallel data are collected automatically, there is a risk that a non-negligible portion consists of machine-translated text. Theoretically, using this kind of data to train SMT systems is likely to reinforce the errors committed by other systems, or even by an earlier version of the same system. In this paper, we study the effect of MT-contaminated training data on SMT quality, by performing controlled simulations under a wide range of conditions. Our experiments highlight situations in which MT contamination can be harmful, and assess the potential of decontamination techniques.
%U https://aclanthology.org/2014.amta-researchers.6/
%P 69-82
Markdown (Informal)
[Clean data for training statistical MT: the case of MT contamination](https://aclanthology.org/2014.amta-researchers.6/) (Simard, AMTA 2014)
ACL