@inproceedings{ludwig-etal-2024-unraveling,
title = "Unraveling the Dynamics of Semi-Supervised Hate Speech Detection: The Impact of Unlabeled Data Characteristics and Pseudo-Labeling Strategies",
author = "Ludwig, Florian and
Dolos, Klara and
Alves-Pinto, Ana and
Zesch, Torsten",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.133",
pages = "1974--1986",
abstract = "Despite advances in machine learning based hate speech detection, the need for larges amounts of labeled training data for state-of-the-art approaches remains a challenge for their application. Semi-supervised learning addresses this problem by leveraging unlabeled data and thus reducing the amount of annotated data required. Underlying this approach is the assumption that labeled and unlabeled data follow similar distributions. This assumption however may not always hold, with consequences for real world applications. We address this problem by investigating the dynamics of pseudo-labeling, a commonly employed form of semi-supervised learning, in the context of hate speech detection. Concretely we analysed the influence of data characteristics and of two strategies for selecting pseudo-labeled samples: threshold- and ratio-based. The results show that the influence of data characteristics on the pseudo-labeling performances depends on other factors, such as pseudo-label selection strategies or model biases. Furthermore, the effectiveness of pseudo-labeling in classification performance is determined by the interaction between the number, hate ratio and accuracy of the selected pseudo-labels. Analysis of the results suggests an advantage of the threshold-based approach when labeled and unlabeled data arise from the same domain, whilst the ratio-based approach may be recommended in the opposite situation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ludwig-etal-2024-unraveling">
<titleInfo>
<title>Unraveling the Dynamics of Semi-Supervised Hate Speech Detection: The Impact of Unlabeled Data Characteristics and Pseudo-Labeling Strategies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Florian</namePart>
<namePart type="family">Ludwig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Klara</namePart>
<namePart type="family">Dolos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ana</namePart>
<namePart type="family">Alves-Pinto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Torsten</namePart>
<namePart type="family">Zesch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite advances in machine learning based hate speech detection, the need for larges amounts of labeled training data for state-of-the-art approaches remains a challenge for their application. Semi-supervised learning addresses this problem by leveraging unlabeled data and thus reducing the amount of annotated data required. Underlying this approach is the assumption that labeled and unlabeled data follow similar distributions. This assumption however may not always hold, with consequences for real world applications. We address this problem by investigating the dynamics of pseudo-labeling, a commonly employed form of semi-supervised learning, in the context of hate speech detection. Concretely we analysed the influence of data characteristics and of two strategies for selecting pseudo-labeled samples: threshold- and ratio-based. The results show that the influence of data characteristics on the pseudo-labeling performances depends on other factors, such as pseudo-label selection strategies or model biases. Furthermore, the effectiveness of pseudo-labeling in classification performance is determined by the interaction between the number, hate ratio and accuracy of the selected pseudo-labels. Analysis of the results suggests an advantage of the threshold-based approach when labeled and unlabeled data arise from the same domain, whilst the ratio-based approach may be recommended in the opposite situation.</abstract>
<identifier type="citekey">ludwig-etal-2024-unraveling</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.133</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>1974</start>
<end>1986</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unraveling the Dynamics of Semi-Supervised Hate Speech Detection: The Impact of Unlabeled Data Characteristics and Pseudo-Labeling Strategies
%A Ludwig, Florian
%A Dolos, Klara
%A Alves-Pinto, Ana
%A Zesch, Torsten
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F ludwig-etal-2024-unraveling
%X Despite advances in machine learning based hate speech detection, the need for larges amounts of labeled training data for state-of-the-art approaches remains a challenge for their application. Semi-supervised learning addresses this problem by leveraging unlabeled data and thus reducing the amount of annotated data required. Underlying this approach is the assumption that labeled and unlabeled data follow similar distributions. This assumption however may not always hold, with consequences for real world applications. We address this problem by investigating the dynamics of pseudo-labeling, a commonly employed form of semi-supervised learning, in the context of hate speech detection. Concretely we analysed the influence of data characteristics and of two strategies for selecting pseudo-labeled samples: threshold- and ratio-based. The results show that the influence of data characteristics on the pseudo-labeling performances depends on other factors, such as pseudo-label selection strategies or model biases. Furthermore, the effectiveness of pseudo-labeling in classification performance is determined by the interaction between the number, hate ratio and accuracy of the selected pseudo-labels. Analysis of the results suggests an advantage of the threshold-based approach when labeled and unlabeled data arise from the same domain, whilst the ratio-based approach may be recommended in the opposite situation.
%U https://aclanthology.org/2024.findings-eacl.133
%P 1974-1986
Markdown (Informal)
[Unraveling the Dynamics of Semi-Supervised Hate Speech Detection: The Impact of Unlabeled Data Characteristics and Pseudo-Labeling Strategies](https://aclanthology.org/2024.findings-eacl.133) (Ludwig et al., Findings 2024)
ACL