BibTeX
@inproceedings{an-etal-2024-capturing,
    title = "Capturing the Relationship Between Sentence Triplets for {LLM} and Human-Generated Texts to Enhance Sentence Embeddings",
    author = "An, Na Min and
      Waheed, Sania and
      Thorne, James",
    editor = "Graham, Yvette and
      Purver, Matthew",
    booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
    month = mar,
    year = "2024",
    address = "St. Julian{'}s, Malta",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.findings-eacl.43",
    pages = "624--638",
    abstract = "Deriving meaningful sentence embeddings is crucial in capturing the semantic relationship between texts. Recent advances in building sentence embedding models have centered on replacing traditional human-generated text datasets with those generated by LLMs. However, the properties of these widely used LLM-generated texts remain largely unexplored. Here, we evaluate the quality of the LLM-generated texts from four perspectives (Positive Text Repetition, Length Difference Penalty, Positive Score Compactness, and Negative Text Implausibility) and find that there exists an inherent difference between human and LLM-generated datasets. To further enhance sentence embeddings using both human and LLM-generated datasets, we propose a novel loss function that incorporates Positive-Negative sample Augmentation (PNA) within the contrastive learning objective. Our results demonstrate that PNA effectively mitigates the sentence anisotropy problem in Wikipedia corpus (-7{\%} compared to CLHAIF) and simultaneously improves the Spearman{'}s correlation in standard Semantic Textual Similarity (STS) tasks (+1.47{\%} compared to CLHAIF).",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="an-etal-2024-capturing">
    <titleInfo>
      <title>Capturing the Relationship Between Sentence Triplets for LLM and Human-Generated Texts to Enhance Sentence Embeddings</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Na</namePart>
      <namePart type="given">Min</namePart>
      <namePart type="family">An</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sania</namePart>
      <namePart type="family">Waheed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">James</namePart>
      <namePart type="family">Thorne</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yvette</namePart>
        <namePart type="family">Graham</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Matthew</namePart>
        <namePart type="family">Purver</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">St. Julian’s, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Deriving meaningful sentence embeddings is crucial in capturing the semantic relationship between texts. Recent advances in building sentence embedding models have centered on replacing traditional human-generated text datasets with those generated by LLMs. However, the properties of these widely used LLM-generated texts remain largely unexplored. Here, we evaluate the quality of the LLM-generated texts from four perspectives (Positive Text Repetition, Length Difference Penalty, Positive Score Compactness, and Negative Text Implausibility) and find that there exists an inherent difference between human and LLM-generated datasets. To further enhance sentence embeddings using both human and LLM-generated datasets, we propose a novel loss function that incorporates Positive-Negative sample Augmentation (PNA) within the contrastive learning objective. Our results demonstrate that PNA effectively mitigates the sentence anisotropy problem in Wikipedia corpus (-7% compared to CLHAIF) and simultaneously improves the Spearman’s correlation in standard Semantic Textual Similarity (STS) tasks (+1.47% compared to CLHAIF).</abstract>
    <identifier type="citekey">an-etal-2024-capturing</identifier>
    <location>
      <url>https://aclanthology.org/2024.findings-eacl.43</url>
    </location>
    <part>
      <date>2024-03</date>
      <extent unit="page">
        <start>624</start>
        <end>638</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Capturing the Relationship Between Sentence Triplets for LLM and Human-Generated Texts to Enhance Sentence Embeddings
%A An, Na Min
%A Waheed, Sania
%A Thorne, James
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F an-etal-2024-capturing
%X Deriving meaningful sentence embeddings is crucial in capturing the semantic relationship between texts. Recent advances in building sentence embedding models have centered on replacing traditional human-generated text datasets with those generated by LLMs. However, the properties of these widely used LLM-generated texts remain largely unexplored. Here, we evaluate the quality of the LLM-generated texts from four perspectives (Positive Text Repetition, Length Difference Penalty, Positive Score Compactness, and Negative Text Implausibility) and find that there exists an inherent difference between human and LLM-generated datasets. To further enhance sentence embeddings using both human and LLM-generated datasets, we propose a novel loss function that incorporates Positive-Negative sample Augmentation (PNA) within the contrastive learning objective. Our results demonstrate that PNA effectively mitigates the sentence anisotropy problem in Wikipedia corpus (-7% compared to CLHAIF) and simultaneously improves the Spearman’s correlation in standard Semantic Textual Similarity (STS) tasks (+1.47% compared to CLHAIF).
%U https://aclanthology.org/2024.findings-eacl.43
%P 624-638
Markdown (Informal)
[Capturing the Relationship Between Sentence Triplets for LLM and Human-Generated Texts to Enhance Sentence Embeddings](https://aclanthology.org/2024.findings-eacl.43) (An et al., Findings 2024)
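The abstract above describes training sentence embeddings with a contrastive objective over (anchor, positive, negative) triplets, augmented with additional positive and negative samples. For orientation only, the sketch below shows a generic supervised contrastive (InfoNCE-style) loss with in-batch positives and an explicit hard negative, in the SimCSE style; the function name, temperature value, and embedding shapes are illustrative assumptions, and this is not the paper's PNA formulation, whose exact augmentation terms are defined in the paper itself.

```python
# Minimal sketch, assuming PyTorch and pre-computed sentence embeddings.
# Generic supervised contrastive (InfoNCE-style) loss over
# (anchor, positive, hard-negative) triplets; NOT the paper's PNA loss.
import torch
import torch.nn.functional as F


def triplet_contrastive_loss(anchor: torch.Tensor,
                             positive: torch.Tensor,
                             hard_negative: torch.Tensor,
                             temperature: float = 0.05) -> torch.Tensor:
    """anchor, positive, hard_negative: (batch, dim) sentence embeddings."""
    anchor = F.normalize(anchor, dim=-1)
    positive = F.normalize(positive, dim=-1)
    hard_negative = F.normalize(hard_negative, dim=-1)

    # Cosine similarities of each anchor to every in-batch positive and to
    # every hard negative, concatenated column-wise into one logit matrix.
    sim_pos = anchor @ positive.T / temperature       # (batch, batch)
    sim_neg = anchor @ hard_negative.T / temperature  # (batch, batch)
    logits = torch.cat([sim_pos, sim_neg], dim=1)     # (batch, 2 * batch)

    # The i-th anchor's own positive sits in column i; all other positives
    # and every hard negative act as negatives for that anchor.
    labels = torch.arange(anchor.size(0), device=anchor.device)
    return F.cross_entropy(logits, labels)


if __name__ == "__main__":
    # Toy usage with random tensors standing in for encoder outputs.
    a, p, n = (torch.randn(8, 768) for _ in range(3))
    print(triplet_contrastive_loss(a, p, n))
```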