@inproceedings{wang-etal-2022-foiling,
title = "Foiling Training-Time Attacks on Neural Machine Translation Systems",
author = "Wang, Jun and
He, Xuanli and
Rubinstein, Benjamin and
Cohn, Trevor",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.435",
doi = "10.18653/v1/2022.findings-emnlp.435",
pages = "5906--5913",
abstract = "Neural machine translation (NMT) systems are vulnerable to backdoor attacks, whereby an attacker injects poisoned samples into training such that a trained model produces malicious translations. Nevertheless, there is little research on defending against such backdoor attacks in NMT. In this paper, we first show that backdoor attacks that have been successful in text classification are also effective against machine translation tasks. We then present a novel defence method that exploits a key property of most backdoor attacks: namely the asymmetry between the source and target language sentences, which is used to facilitate malicious text insertions, substitutions and suchlike. Our technique uses word alignment coupled with language model scoring to detect outlier tokens, and thus can find and filter out training instances which may contain backdoors. Experimental results demonstrate that our technique can significantly reduce the success of various attacks by up to 89.0{\%}, while not affecting predictive accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2022-foiling">
<titleInfo>
<title>Foiling Training-Time Attacks on Neural Machine Translation Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanli</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Rubinstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural machine translation (NMT) systems are vulnerable to backdoor attacks, whereby an attacker injects poisoned samples into training such that a trained model produces malicious translations. Nevertheless, there is little research on defending against such backdoor attacks in NMT. In this paper, we first show that backdoor attacks that have been successful in text classification are also effective against machine translation tasks. We then present a novel defence method that exploits a key property of most backdoor attacks: namely the asymmetry between the source and target language sentences, which is used to facilitate malicious text insertions, substitutions and suchlike. Our technique uses word alignment coupled with language model scoring to detect outlier tokens, and thus can find and filter out training instances which may contain backdoors. Experimental results demonstrate that our technique can significantly reduce the success of various attacks by up to 89.0%, while not affecting predictive accuracy.</abstract>
<identifier type="citekey">wang-etal-2022-foiling</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.435</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.435</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>5906</start>
<end>5913</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Foiling Training-Time Attacks on Neural Machine Translation Systems
%A Wang, Jun
%A He, Xuanli
%A Rubinstein, Benjamin
%A Cohn, Trevor
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F wang-etal-2022-foiling
%X Neural machine translation (NMT) systems are vulnerable to backdoor attacks, whereby an attacker injects poisoned samples into training such that a trained model produces malicious translations. Nevertheless, there is little research on defending against such backdoor attacks in NMT. In this paper, we first show that backdoor attacks that have been successful in text classification are also effective against machine translation tasks. We then present a novel defence method that exploits a key property of most backdoor attacks: namely the asymmetry between the source and target language sentences, which is used to facilitate malicious text insertions, substitutions and suchlike. Our technique uses word alignment coupled with language model scoring to detect outlier tokens, and thus can find and filter out training instances which may contain backdoors. Experimental results demonstrate that our technique can significantly reduce the success of various attacks by up to 89.0%, while not affecting predictive accuracy.
%R 10.18653/v1/2022.findings-emnlp.435
%U https://aclanthology.org/2022.findings-emnlp.435
%U https://doi.org/10.18653/v1/2022.findings-emnlp.435
%P 5906-5913
Markdown (Informal)
[Foiling Training-Time Attacks on Neural Machine Translation Systems](https://aclanthology.org/2022.findings-emnlp.435) (Wang et al., Findings 2022)
ACL