@inproceedings{chen-etal-2024-learning,
title = "Learning High-Quality and General-Purpose Phrase Representations",
author = "Chen, Lihu and
Varoquaux, Gael and
Suchanek, Fabian",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.66",
pages = "983--994",
abstract = "Phrase representations play an important role in data science and natural language processing, benefiting various tasks like Entity Alignment, Record Linkage, Fuzzy Joins, and Paraphrase Classification.The current state-of-the-art method involves fine-tuning pre-trained language models for phrasal embeddings using contrastive learning. However, we have identified areas for improvement. First, these pre-trained models tend to be unnecessarily complex and require to be pre-trained on a corpus with context sentences.Second, leveraging the phrase type and morphology gives phrase representations that are both more precise and more flexible.We propose an improved framework to learn phrase representations in a context-free fashion.The framework employs phrase type classification as an auxiliary task and incorporates character-level information more effectively into the phrase representation.Furthermore, we design three granularities of data augmentation to increase the diversity of training samples.Our experiments across a wide range of tasks reveal that our approach generates superior phrase embeddings compared to previous methods while requiring a smaller model size.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2024-learning">
<titleInfo>
<title>Learning High-Quality and General-Purpose Phrase Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lihu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gael</namePart>
<namePart type="family">Varoquaux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabian</namePart>
<namePart type="family">Suchanek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Phrase representations play an important role in data science and natural language processing, benefiting various tasks like Entity Alignment, Record Linkage, Fuzzy Joins, and Paraphrase Classification.The current state-of-the-art method involves fine-tuning pre-trained language models for phrasal embeddings using contrastive learning. However, we have identified areas for improvement. First, these pre-trained models tend to be unnecessarily complex and require to be pre-trained on a corpus with context sentences.Second, leveraging the phrase type and morphology gives phrase representations that are both more precise and more flexible.We propose an improved framework to learn phrase representations in a context-free fashion.The framework employs phrase type classification as an auxiliary task and incorporates character-level information more effectively into the phrase representation.Furthermore, we design three granularities of data augmentation to increase the diversity of training samples.Our experiments across a wide range of tasks reveal that our approach generates superior phrase embeddings compared to previous methods while requiring a smaller model size.</abstract>
<identifier type="citekey">chen-etal-2024-learning</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.66</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>983</start>
<end>994</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning High-Quality and General-Purpose Phrase Representations
%A Chen, Lihu
%A Varoquaux, Gael
%A Suchanek, Fabian
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F chen-etal-2024-learning
%X Phrase representations play an important role in data science and natural language processing, benefiting various tasks like Entity Alignment, Record Linkage, Fuzzy Joins, and Paraphrase Classification.The current state-of-the-art method involves fine-tuning pre-trained language models for phrasal embeddings using contrastive learning. However, we have identified areas for improvement. First, these pre-trained models tend to be unnecessarily complex and require to be pre-trained on a corpus with context sentences.Second, leveraging the phrase type and morphology gives phrase representations that are both more precise and more flexible.We propose an improved framework to learn phrase representations in a context-free fashion.The framework employs phrase type classification as an auxiliary task and incorporates character-level information more effectively into the phrase representation.Furthermore, we design three granularities of data augmentation to increase the diversity of training samples.Our experiments across a wide range of tasks reveal that our approach generates superior phrase embeddings compared to previous methods while requiring a smaller model size.
%U https://aclanthology.org/2024.findings-eacl.66
%P 983-994
Markdown (Informal)
[Learning High-Quality and General-Purpose Phrase Representations](https://aclanthology.org/2024.findings-eacl.66) (Chen et al., Findings 2024)
ACL