@inproceedings{goot-2024-still,
title = "Where are we Still Split on Tokenization?",
author = "van der Goot, Rob",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.9/",
pages = "118--137",
abstract = "Many Natural Language Processing (NLP) tasks are labeled on the token level, forthese tasks, the first step is to identify the tokens (tokenization). Becausethis step is often considered to be a solved problem, gold tokenization iscommonly assumed. In this paper, we propose an efficient method fortokenization with subword-based language models, and reflect on the status ofperformance on the tokenization task by evaluating on 122 languages in 20different scripts. We show that our proposed model performs on par with thestate-of-the-art, and that tokenization performance is mainly dependent on theamount and consistency of annotated data. We conclude that besidesinconsistencies in the data and exceptional cases the task can be consideredsolved for Latin languages for in-dataset settings ({\ensuremath{>}}99.5 F1). However,performance is 0.75 F1 point lower on average for datasets in other scripts andperformance deteriorates in cross-dataset setups."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="goot-2024-still">
<titleInfo>
<title>Where are we Still Split on Tokenization?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Many Natural Language Processing (NLP) tasks are labeled on the token level; for these tasks, the first step is to identify the tokens (tokenization). Because this step is often considered to be a solved problem, gold tokenization is commonly assumed. In this paper, we propose an efficient method for tokenization with subword-based language models, and reflect on the status of performance on the tokenization task by evaluating on 122 languages in 20 different scripts. We show that our proposed model performs on par with the state-of-the-art, and that tokenization performance is mainly dependent on the amount and consistency of annotated data. We conclude that, besides inconsistencies in the data and exceptional cases, the task can be considered solved for Latin-script languages in in-dataset settings (&gt;99.5 F1). However, performance is 0.75 F1 points lower on average for datasets in other scripts, and performance deteriorates in cross-dataset setups.</abstract>
<identifier type="citekey">goot-2024-still</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.9/</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>118</start>
<end>137</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Where are we Still Split on Tokenization?
%A van der Goot, Rob
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F goot-2024-still
%X Many Natural Language Processing (NLP) tasks are labeled on the token level; for these tasks, the first step is to identify the tokens (tokenization). Because this step is often considered to be a solved problem, gold tokenization is commonly assumed. In this paper, we propose an efficient method for tokenization with subword-based language models, and reflect on the status of performance on the tokenization task by evaluating on 122 languages in 20 different scripts. We show that our proposed model performs on par with the state-of-the-art, and that tokenization performance is mainly dependent on the amount and consistency of annotated data. We conclude that, besides inconsistencies in the data and exceptional cases, the task can be considered solved for Latin-script languages in in-dataset settings (>99.5 F1). However, performance is 0.75 F1 points lower on average for datasets in other scripts, and performance deteriorates in cross-dataset setups.
%U https://aclanthology.org/2024.findings-eacl.9/
%P 118-137
Markdown (Informal)
[Where are we Still Split on Tokenization?](https://aclanthology.org/2024.findings-eacl.9/) (van der Goot, Findings 2024)
ACL
- Rob van der Goot. 2024. Where are we Still Split on Tokenization? In Findings of the Association for Computational Linguistics: EACL 2024, pages 118–137, St. Julian’s, Malta. Association for Computational Linguistics.