@inproceedings{karlinska-etal-2024-using,
title = "Using Bibliodata {LOD}ification to Create Metadata-Enriched Literary Corpora in Line with {FAIR} Principles",
author = "Karlinska, Agnieszka and
Rosi{\'n}ski, Cezary and
Kubis, Marek and
Hubar, Patryk and
Wieczorek, Jan",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1500/",
pages = "17271--17284",
abstract = "This paper discusses the design principles and procedures for creating a balanced corpus for research in computational literary studies, building on the experience of computational linguistics but adapting it to the specificities of the digital humanities. It showcases the development of the Metadata-enriched Polish Novel Corpus from the 19th and 20th centuries (19/20MetaPNC), consisting of 1,000 novels from 1854{--}1939, as an illustrative case and proposes a comprehensive workflow for the creation and reuse of literary corpora. What sets 19/20MetaPNC apart is its approach to balance, which considers the spatial dimension, the inclusion of non-canonical texts previously overlooked by other corpora, and the use of a complex, multi-stage metadata enrichment and verification process. Emphasis is placed on research-oriented metadata design, efficient data collection and data sharing according to the FAIR principles as well as 5- and 7-star data standards to increase the visibility and reusability of the corpus. A knowledge graph-based solution for the creation of exchangeable and machine-readable metadata describing corpora has been developed. For this purpose, metadata from bibliographic catalogs and other sources were transformed into Linked Data following the bibliodata LODification approach."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="karlinska-etal-2024-using">
<titleInfo>
<title>Using Bibliodata LODification to Create Metadata-Enriched Literary Corpora in Line with FAIR Principles</title>
</titleInfo>
<name type="personal">
<namePart type="given">Agnieszka</namePart>
<namePart type="family">Karlinska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cezary</namePart>
<namePart type="family">Rosiński</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marek</namePart>
<namePart type="family">Kubis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patryk</namePart>
<namePart type="family">Hubar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Wieczorek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper discusses the design principles and procedures for creating a balanced corpus for research in computational literary studies, building on the experience of computational linguistics but adapting it to the specificities of the digital humanities. It showcases the development of the Metadata-enriched Polish Novel Corpus from the 19th and 20th centuries (19/20MetaPNC), consisting of 1,000 novels from 1854–1939, as an illustrative case and proposes a comprehensive workflow for the creation and reuse of literary corpora. What sets 19/20MetaPNC apart is its approach to balance, which considers the spatial dimension, the inclusion of non-canonical texts previously overlooked by other corpora, and the use of a complex, multi-stage metadata enrichment and verification process. Emphasis is placed on research-oriented metadata design, efficient data collection and data sharing according to the FAIR principles as well as 5- and 7-star data standards to increase the visibility and reusability of the corpus. A knowledge graph-based solution for the creation of exchangeable and machine-readable metadata describing corpora has been developed. For this purpose, metadata from bibliographic catalogs and other sources were transformed into Linked Data following the bibliodata LODification approach.</abstract>
<identifier type="citekey">karlinska-etal-2024-using</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1500/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>17271</start>
<end>17284</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using Bibliodata LODification to Create Metadata-Enriched Literary Corpora in Line with FAIR Principles
%A Karlinska, Agnieszka
%A Rosiński, Cezary
%A Kubis, Marek
%A Hubar, Patryk
%A Wieczorek, Jan
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F karlinska-etal-2024-using
%X This paper discusses the design principles and procedures for creating a balanced corpus for research in computational literary studies, building on the experience of computational linguistics but adapting it to the specificities of the digital humanities. It showcases the development of the Metadata-enriched Polish Novel Corpus from the 19th and 20th centuries (19/20MetaPNC), consisting of 1,000 novels from 1854–1939, as an illustrative case and proposes a comprehensive workflow for the creation and reuse of literary corpora. What sets 19/20MetaPNC apart is its approach to balance, which considers the spatial dimension, the inclusion of non-canonical texts previously overlooked by other corpora, and the use of a complex, multi-stage metadata enrichment and verification process. Emphasis is placed on research-oriented metadata design, efficient data collection and data sharing according to the FAIR principles as well as 5- and 7-star data standards to increase the visibility and reusability of the corpus. A knowledge graph-based solution for the creation of exchangeable and machine-readable metadata describing corpora has been developed. For this purpose, metadata from bibliographic catalogs and other sources were transformed into Linked Data following the bibliodata LODification approach.
%U https://aclanthology.org/2024.lrec-main.1500/
%P 17271-17284
Markdown (Informal)
[Using Bibliodata LODification to Create Metadata-Enriched Literary Corpora in Line with FAIR Principles](https://aclanthology.org/2024.lrec-main.1500/) (Karlinska et al., LREC-COLING 2024)
ACL