@inproceedings{xuanfan-piji-2023-systematic,
title = "A Systematic Evaluation of Large Language Models for Natural Language Generation Tasks",
author = "Xuanfan, Ni and
Piji, Li",
editor = "Zhang, Jiajun",
booktitle = "Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum)",
month = aug,
year = "2023",
address = "Harbin, China",
publisher = "Chinese Information Processing Society of China",
url = "https://aclanthology.org/2023.ccl-2.4/",
pages = "40--56",
language = "eng",
abstract = "{\textquotedblleft}Recent efforts have evaluated large language models (LLMs) in areas such as commonsense reasoning, mathematical reasoning, and code generation. However, to the best of our knowledge, no work has specifically investigated the performance of LLMs in natural language generation (NLG) tasks, a pivotal criterion for determining model excellence. Thus, this paper conducts a comprehensive evaluation of well-known and high-performing LLMs, namely ChatGPT, ChatGLM, T5-based models, LLaMA-based models, and Pythia-based models, in the context of NLG tasks. We select English and Chinese datasets encompassing Dialogue Generation and Text Summarization. Moreover, we propose a common evaluation setting that incorporates input templates and post-processing strategies. Our study reports both automatic results, accompanied by a detailed analysis.{\textquotedblright}"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xuanfan-piji-2023-systematic">
<titleInfo>
<title>A Systematic Evaluation of Large Language Models for Natural Language Generation Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ni</namePart>
<namePart type="family">Xuanfan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Piji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Harbin, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“Recent efforts have evaluated large language models (LLMs) in areas such as commonsense reasoning, mathematical reasoning, and code generation. However, to the best of our knowledge, no work has specifically investigated the performance of LLMs in natural language generation (NLG) tasks, a pivotal criterion for determining model excellence. Thus, this paper conducts a comprehensive evaluation of well-known and high-performing LLMs, namely ChatGPT, ChatGLM, T5-based models, LLaMA-based models, and Pythia-based models, in the context of NLG tasks. We select English and Chinese datasets encompassing Dialogue Generation and Text Summarization. Moreover, we propose a common evaluation setting that incorporates input templates and post-processing strategies. Our study reports both automatic results, accompanied by a detailed analysis.”</abstract>
<identifier type="citekey">xuanfan-piji-2023-systematic</identifier>
<location>
<url>https://aclanthology.org/2023.ccl-2.4/</url>
</location>
<part>
<date>2023-08</date>
<extent unit="page">
<start>40</start>
<end>56</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Systematic Evaluation of Large Language Models for Natural Language Generation Tasks
%A Xuanfan, Ni
%A Piji, Li
%Y Zhang, Jiajun
%S Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum)
%D 2023
%8 August
%I Chinese Information Processing Society of China
%C Harbin, China
%G eng
%F xuanfan-piji-2023-systematic
%X “Recent efforts have evaluated large language models (LLMs) in areas such as commonsense reasoning, mathematical reasoning, and code generation. However, to the best of our knowledge, no work has specifically investigated the performance of LLMs in natural language generation (NLG) tasks, a pivotal criterion for determining model excellence. Thus, this paper conducts a comprehensive evaluation of well-known and high-performing LLMs, namely ChatGPT, ChatGLM, T5-based models, LLaMA-based models, and Pythia-based models, in the context of NLG tasks. We select English and Chinese datasets encompassing Dialogue Generation and Text Summarization. Moreover, we propose a common evaluation setting that incorporates input templates and post-processing strategies. Our study reports both automatic results, accompanied by a detailed analysis.”
%U https://aclanthology.org/2023.ccl-2.4/
%P 40-56
Markdown (Informal)
[A Systematic Evaluation of Large Language Models for Natural Language Generation Tasks](https://aclanthology.org/2023.ccl-2.4/) (Xuanfan & Piji, CCL 2023)
ACL