@inproceedings{nan-2023-frontier,
  title     = {Frontier Review of Multimodal {AI}},
  author    = {Nan, Duan},
  editor    = {Zhang, Jiajun},
  booktitle = {Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum)},
  month     = aug,
  year      = {2023},
  address   = {Harbin, China},
  publisher = {Chinese Information Processing Society of China},
  url       = {https://aclanthology.org/2023.ccl-2.9/},
  pages     = {110--118},
  language  = {eng},
  abstract  = {{\textquotedblleft}Pre-training techniques have enabled foundation models (such as BERT, T5, GPT) to achieve remarkable success in natural language processing (NLP) and multimodal tasks that involve text, audio and visual contents. Some of the latest multimodal generative models, such as DALL{\textperiodcentered}E and Stable Diffusion, can synthesize novel visual content from text or video inputs, which greatly enhances the creativity and productivity of content creators. However, multimodal AI also faces some challenges, such as adding new modalities or handling diverse tasks that require signals beyond their understanding. Therefore, a new trend in multimodal AI is to build a compositional AI system that connects existing foundation models with external modules and tools. This way, the system can perform more varied tasks by leveraging different modalities and signals. In this paper, we will give a brief overview of the state-of-the-art multimodal AI techniques and the direction of building compositional AI systems. We will also discuss the potential future research topics in multimodal AI.{\textquotedblright}},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nan-2023-frontier">
<titleInfo>
<title>Frontier Review of Multimodal AI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Duan</namePart>
<namePart type="family">Nan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Chinese Information Processing Society of China</publisher>
<place>
<placeTerm type="text">Harbin, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>“Pre-training techniques have enabled foundation models (such as BERT, T5, GPT) to achieve remarkable success in natural language processing (NLP) and multimodal tasks that involve text, audio and visual contents. Some of the latest multimodal generative models, such as DALL·E and Stable Diffusion, can synthesize novel visual content from text or video inputs, which greatly enhances the creativity and productivity of content creators. However, multimodal AI also faces some challenges, such as adding new modalities or handling diverse tasks that require signals beyond their understanding. Therefore, a new trend in multimodal AI is to build a compositional AI system that connects existing foundation models with external modules and tools. This way, the system can perform more varied tasks by leveraging different modalities and signals. In this paper, we will give a brief overview of the state-of-the-art multimodal AI techniques and the direction of building compositional AI systems. We will also discuss the potential future research topics in multimodal AI.”</abstract>
<identifier type="citekey">nan-2023-frontier</identifier>
<location>
<url>https://aclanthology.org/2023.ccl-2.9/</url>
</location>
<part>
<date>2023-08</date>
<extent unit="page">
<start>110</start>
<end>118</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Frontier Review of Multimodal AI
%A Nan, Duan
%Y Zhang, Jiajun
%S Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum)
%D 2023
%8 August
%I Chinese Information Processing Society of China
%C Harbin, China
%G eng
%F nan-2023-frontier
%X “Pre-training techniques have enabled foundation models (such as BERT, T5, GPT) to achieve remarkable success in natural language processing (NLP) and multimodal tasks that involve text, audio and visual contents. Some of the latest multimodal generative models, such as DALL·E and Stable Diffusion, can synthesize novel visual content from text or video inputs, which greatly enhances the creativity and productivity of content creators. However, multimodal AI also faces some challenges, such as adding new modalities or handling diverse tasks that require signals beyond their understanding. Therefore, a new trend in multimodal AI is to build a compositional AI system that connects existing foundation models with external modules and tools. This way, the system can perform more varied tasks by leveraging different modalities and signals. In this paper, we will give a brief overview of the state-of-the-art multimodal AI techniques and the direction of building compositional AI systems. We will also discuss the potential future research topics in multimodal AI.”
%U https://aclanthology.org/2023.ccl-2.9/
%P 110-118
Markdown (Informal)
[Frontier Review of Multimodal AI](https://aclanthology.org/2023.ccl-2.9/) (Nan, CCL 2023)
ACL
- Duan Nan. 2023. Frontier Review of Multimodal AI. In Proceedings of the 22nd Chinese National Conference on Computational Linguistics (Volume 2: Frontier Forum), pages 110–118, Harbin, China. Chinese Information Processing Society of China.