@inproceedings{fekih-etal-2022-humset,
title = "{H}um{S}et: Dataset of Multilingual Information Extraction and Classification for Humanitarian Crises Response",
author = "Fekih, Selim and
Tamagnone, Nicol{\`o} and
Minixhofer, Benjamin and
Shrestha, Ranjan and
Contla, Ximena and
Oglethorpe, Ewan and
Rekabsaz, Navid",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.321/",
doi = "10.18653/v1/2022.findings-emnlp.321",
pages = "4379--4389",
abstract = "Timely and effective response to humanitarian crises requires quick and accurate analysis of large amounts of text data {--} a process that can highly benefit from expert-assisted NLP systems trained on validated and annotated data in the humanitarian response domain. To enable creation of such NLP systems, we introduce and release HumSet, a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. The dataset provides documents in three languages (English, French, Spanish) and covers a variety of humanitarian crises from 2018 to 2021 across the globe. For each document, HUMSET provides selected snippets (entries) as well as assigned classes to each entry annotated using common humanitarian information analysis frameworks. HUMSET also provides novel and challenging entry extraction and multi-label entry classification tasks. In this paper, we take a first step towards approaching these tasks and conduct a set of experiments on Pre-trained Language Models (PLM) to establish strong baselines for future research in this domain. The dataset is available at https://blog.thedeep.io/humset/."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fekih-etal-2022-humset">
<titleInfo>
<title>HumSet: Dataset of Multilingual Information Extraction and Classification for Humanitarian Crises Response</title>
</titleInfo>
<name type="personal">
<namePart type="given">Selim</namePart>
<namePart type="family">Fekih</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolò</namePart>
<namePart type="family">Tamagnone</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Minixhofer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ranjan</namePart>
<namePart type="family">Shrestha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ximena</namePart>
<namePart type="family">Contla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ewan</namePart>
<namePart type="family">Oglethorpe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Navid</namePart>
<namePart type="family">Rekabsaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Timely and effective response to humanitarian crises requires quick and accurate analysis of large amounts of text data – a process that can highly benefit from expert-assisted NLP systems trained on validated and annotated data in the humanitarian response domain. To enable creation of such NLP systems, we introduce and release HumSet, a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. The dataset provides documents in three languages (English, French, Spanish) and covers a variety of humanitarian crises from 2018 to 2021 across the globe. For each document, HUMSET provides selected snippets (entries) as well as assigned classes to each entry annotated using common humanitarian information analysis frameworks. HUMSET also provides novel and challenging entry extraction and multi-label entry classification tasks. In this paper, we take a first step towards approaching these tasks and conduct a set of experiments on Pre-trained Language Models (PLM) to establish strong baselines for future research in this domain. The dataset is available at https://blog.thedeep.io/humset/.</abstract>
<identifier type="citekey">fekih-etal-2022-humset</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.321</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.321/</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>4379</start>
<end>4389</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HumSet: Dataset of Multilingual Information Extraction and Classification for Humanitarian Crises Response
%A Fekih, Selim
%A Tamagnone, Nicolò
%A Minixhofer, Benjamin
%A Shrestha, Ranjan
%A Contla, Ximena
%A Oglethorpe, Ewan
%A Rekabsaz, Navid
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F fekih-etal-2022-humset
%X Timely and effective response to humanitarian crises requires quick and accurate analysis of large amounts of text data – a process that can highly benefit from expert-assisted NLP systems trained on validated and annotated data in the humanitarian response domain. To enable creation of such NLP systems, we introduce and release HumSet, a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. The dataset provides documents in three languages (English, French, Spanish) and covers a variety of humanitarian crises from 2018 to 2021 across the globe. For each document, HUMSET provides selected snippets (entries) as well as assigned classes to each entry annotated using common humanitarian information analysis frameworks. HUMSET also provides novel and challenging entry extraction and multi-label entry classification tasks. In this paper, we take a first step towards approaching these tasks and conduct a set of experiments on Pre-trained Language Models (PLM) to establish strong baselines for future research in this domain. The dataset is available at https://blog.thedeep.io/humset/.
%R 10.18653/v1/2022.findings-emnlp.321
%U https://aclanthology.org/2022.findings-emnlp.321/
%U https://doi.org/10.18653/v1/2022.findings-emnlp.321
%P 4379-4389
Markdown (Informal)
[HumSet: Dataset of Multilingual Information Extraction and Classification for Humanitarian Crises Response](https://aclanthology.org/2022.findings-emnlp.321/) (Fekih et al., Findings 2022)
ACL