@inproceedings{silva-moro-2024-pportal,
title = "{PPORTAL}\_ner: An Annotated Corpus of {P}ortuguese Literary Entities",
author = "Silva, Mariana O. and
Moro, Mirella M.",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1132/",
pages = "12927--12937",
abstract = "The intersection of natural language processing (NLP) and literary analysis has yielded valuable insights and applications across various languages. However, the scarcity of labeled data tailored for Portuguese literary texts poses a notable challenge. To address this gap, we present the PPORTAL\_ner corpus, an annotated dataset that simplifies the development of Named Entity Recognition (NER) models specifically adapted for Portuguese literary works. Our corpus includes annotations of PER, LOC, GPE, ORG, and DATE entities within a diverse set of 25 literary texts. Annotation of the corpus involved a two-step process: initial pre-annotation using a pre-trained spaCy model followed by correction and refinement using the Prodigy annotation tool. With a total of 125,059 tokens and 5,266 annotated entities, PPORTAL\_ner corpus significantly enriches the landscape of resources available for computational literary analysis in Portuguese. This paper details the annotation methodology, guidelines, and dataset statistics while also evaluating four NER models over the PPORTAL\_ner corpus. Our evaluation analysis reveals that fine-tuning on domain-specific data significantly improves NER model performance, demonstrating the value of the PPORTAL\_ner corpus for developing domain-specific language models."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="silva-moro-2024-pportal">
<titleInfo>
<title>PPORTAL_ner: An Annotated Corpus of Portuguese Literary Entities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="given">O</namePart>
<namePart type="family">Silva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mirella</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Moro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The intersection of natural language processing (NLP) and literary analysis has yielded valuable insights and applications across various languages. However, the scarcity of labeled data tailored for Portuguese literary texts poses a notable challenge. To address this gap, we present the PPORTAL_ner corpus, an annotated dataset that simplifies the development of Named Entity Recognition (NER) models specifically adapted for Portuguese literary works. Our corpus includes annotations of PER, LOC, GPE, ORG, and DATE entities within a diverse set of 25 literary texts. Annotation of the corpus involved a two-step process: initial pre-annotation using a pre-trained spaCy model followed by correction and refinement using the Prodigy annotation tool. With a total of 125,059 tokens and 5,266 annotated entities, PPORTAL_ner corpus significantly enriches the landscape of resources available for computational literary analysis in Portuguese. This paper details the annotation methodology, guidelines, and dataset statistics while also evaluating four NER models over the PPORTAL_ner corpus. Our evaluation analysis reveals that fine-tuning on domain-specific data significantly improves NER model performance, demonstrating the value of the PPORTAL_ner corpus for developing domain-specific language models.</abstract>
<identifier type="citekey">silva-moro-2024-pportal</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1132/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>12927</start>
<end>12937</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PPORTAL_ner: An Annotated Corpus of Portuguese Literary Entities
%A Silva, Mariana O.
%A Moro, Mirella M.
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F silva-moro-2024-pportal
%X The intersection of natural language processing (NLP) and literary analysis has yielded valuable insights and applications across various languages. However, the scarcity of labeled data tailored for Portuguese literary texts poses a notable challenge. To address this gap, we present the PPORTAL_ner corpus, an annotated dataset that simplifies the development of Named Entity Recognition (NER) models specifically adapted for Portuguese literary works. Our corpus includes annotations of PER, LOC, GPE, ORG, and DATE entities within a diverse set of 25 literary texts. Annotation of the corpus involved a two-step process: initial pre-annotation using a pre-trained spaCy model followed by correction and refinement using the Prodigy annotation tool. With a total of 125,059 tokens and 5,266 annotated entities, PPORTAL_ner corpus significantly enriches the landscape of resources available for computational literary analysis in Portuguese. This paper details the annotation methodology, guidelines, and dataset statistics while also evaluating four NER models over the PPORTAL_ner corpus. Our evaluation analysis reveals that fine-tuning on domain-specific data significantly improves NER model performance, demonstrating the value of the PPORTAL_ner corpus for developing domain-specific language models.
%U https://aclanthology.org/2024.lrec-main.1132/
%P 12927-12937
Markdown (Informal)
[PPORTAL_ner: An Annotated Corpus of Portuguese Literary Entities](https://aclanthology.org/2024.lrec-main.1132/) (Silva & Moro, LREC-COLING 2024)
ACL