@inproceedings{schwobel-etal-2023-geographical,
title = "Geographical Erasure in Language Generation",
author = {Schw{\"o}bel, Pola and
Golebiowski, Jacek and
Donini, Michele and
Archambeau, Cedric and
Pruthi, Danish},
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.823",
doi = "10.18653/v1/2023.findings-emnlp.823",
pages = "12310--12324",
abstract = "Large language models (LLMs) encode vast amounts of world knowledge. However, since these models are trained on large swaths of internet data, they are at risk of inordinately capturing information about dominant groups. This imbalance can propagate into generated language. In this work, we study and operationalise a form of geographical erasure wherein language models underpredict certain countries. We demonstrate consistent instances of erasure across a range of LLMs. We discover that erasure strongly correlates with low frequencies of country mentions in the training corpus. Lastly, we mitigate erasure by finetuning using a custom objective.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schwobel-etal-2023-geographical">
<titleInfo>
<title>Geographical Erasure in Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pola</namePart>
<namePart type="family">Schwöbel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacek</namePart>
<namePart type="family">Golebiowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michele</namePart>
<namePart type="family">Donini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cedric</namePart>
<namePart type="family">Archambeau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danish</namePart>
<namePart type="family">Pruthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) encode vast amounts of world knowledge. However, since these models are trained on large swaths of internet data, they are at risk of inordinately capturing information about dominant groups. This imbalance can propagate into generated language. In this work, we study and operationalise a form of geographical erasure wherein language models underpredict certain countries. We demonstrate consistent instances of erasure across a range of LLMs. We discover that erasure strongly correlates with low frequencies of country mentions in the training corpus. Lastly, we mitigate erasure by finetuning using a custom objective.</abstract>
<identifier type="citekey">schwobel-etal-2023-geographical</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.823</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.823</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>12310</start>
<end>12324</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Geographical Erasure in Language Generation
%A Schwöbel, Pola
%A Golebiowski, Jacek
%A Donini, Michele
%A Archambeau, Cedric
%A Pruthi, Danish
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F schwobel-etal-2023-geographical
%X Large language models (LLMs) encode vast amounts of world knowledge. However, since these models are trained on large swaths of internet data, they are at risk of inordinately capturing information about dominant groups. This imbalance can propagate into generated language. In this work, we study and operationalise a form of geographical erasure wherein language models underpredict certain countries. We demonstrate consistent instances of erasure across a range of LLMs. We discover that erasure strongly correlates with low frequencies of country mentions in the training corpus. Lastly, we mitigate erasure by finetuning using a custom objective.
%R 10.18653/v1/2023.findings-emnlp.823
%U https://aclanthology.org/2023.findings-emnlp.823
%U https://doi.org/10.18653/v1/2023.findings-emnlp.823
%P 12310-12324
Markdown (Informal)
[Geographical Erasure in Language Generation](https://aclanthology.org/2023.findings-emnlp.823) (Schwöbel et al., Findings 2023)
ACL
- Pola Schwöbel, Jacek Golebiowski, Michele Donini, Cedric Archambeau, and Danish Pruthi. 2023. Geographical Erasure in Language Generation. In Findings of the Association for Computational Linguistics: EMNLP 2023, pages 12310–12324, Singapore. Association for Computational Linguistics.