@inproceedings{li-etal-2023-adaptive-gating,
title = "Adaptive Gating in Mixture-of-Experts based Language Models",
author = "Li, Jiamin and
Su, Qiang and
Yang, Yitao and
Jiang, Yimin and
Wang, Cong and
Xu, Hong",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.217/",
doi = "10.18653/v1/2023.emnlp-main.217",
pages = "3577--3587",
abstract = "Large language models have demonstrated exceptional language understanding capabilities in many NLP tasks. Sparsely activated mixture-of-experts (MoE) has emerged as a promising solution for scaling models while maintaining a constant number of computational operations. Existing MoE models adopt a fixed gating network where each token is computed by the same number of experts. This contradicts our intuition that the tokens in each sequence vary in terms of their linguistic complexity and, consequently, require different computational costs. Little is discussed in prior research on the trade-off between computation per token and model performance. This paper introduces adaptive gating in MoE, a flexible training strategy that allows tokens to be processed by a variable number of experts based on expert probability distribution. Adaptive gating preserves sparsity while improving training efficiency. We further draw upon curriculum learning to better align the order of training samples and maximize the training time savings. Extensive experiments on diverse NLP tasks show that adaptive gating reduces at most 22.5{\%} training time while maintaining inference quality. Moreover, we conduct a comprehensive analysis of the gating decisions and present our insights on which tokens are inherently difficult to process, depending on the specific language task."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2023-adaptive-gating">
<titleInfo>
<title>Adaptive Gating in Mixture-of-Experts based Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiamin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiang</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yitao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yimin</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models have demonstrated exceptional language understanding capabilities in many NLP tasks. Sparsely activated mixture-of-experts (MoE) has emerged as a promising solution for scaling models while maintaining a constant number of computational operations. Existing MoE models adopt a fixed gating network where each token is computed by the same number of experts. This contradicts our intuition that the tokens in each sequence vary in terms of their linguistic complexity and, consequently, require different computational costs. Little is discussed in prior research on the trade-off between computation per token and model performance. This paper introduces adaptive gating in MoE, a flexible training strategy that allows tokens to be processed by a variable number of experts based on expert probability distribution. Adaptive gating preserves sparsity while improving training efficiency. We further draw upon curriculum learning to better align the order of training samples and maximize the training time savings. Extensive experiments on diverse NLP tasks show that adaptive gating reduces at most 22.5% training time while maintaining inference quality. Moreover, we conduct a comprehensive analysis of the gating decisions and present our insights on which tokens are inherently difficult to process, depending on the specific language task.</abstract>
<identifier type="citekey">li-etal-2023-adaptive-gating</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.217</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.217/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>3577</start>
<end>3587</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adaptive Gating in Mixture-of-Experts based Language Models
%A Li, Jiamin
%A Su, Qiang
%A Yang, Yitao
%A Jiang, Yimin
%A Wang, Cong
%A Xu, Hong
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F li-etal-2023-adaptive-gating
%X Large language models have demonstrated exceptional language understanding capabilities in many NLP tasks. Sparsely activated mixture-of-experts (MoE) has emerged as a promising solution for scaling models while maintaining a constant number of computational operations. Existing MoE models adopt a fixed gating network where each token is computed by the same number of experts. This contradicts our intuition that the tokens in each sequence vary in terms of their linguistic complexity and, consequently, require different computational costs. Little is discussed in prior research on the trade-off between computation per token and model performance. This paper introduces adaptive gating in MoE, a flexible training strategy that allows tokens to be processed by a variable number of experts based on expert probability distribution. Adaptive gating preserves sparsity while improving training efficiency. We further draw upon curriculum learning to better align the order of training samples and maximize the training time savings. Extensive experiments on diverse NLP tasks show that adaptive gating reduces at most 22.5% training time while maintaining inference quality. Moreover, we conduct a comprehensive analysis of the gating decisions and present our insights on which tokens are inherently difficult to process, depending on the specific language task.
%R 10.18653/v1/2023.emnlp-main.217
%U https://aclanthology.org/2023.emnlp-main.217/
%U https://doi.org/10.18653/v1/2023.emnlp-main.217
%P 3577-3587
Markdown (Informal)
[Adaptive Gating in Mixture-of-Experts based Language Models](https://aclanthology.org/2023.emnlp-main.217/) (Li et al., EMNLP 2023)
ACL
Jiamin Li, Qiang Su, Yitao Yang, Yimin Jiang, Cong Wang, and Hong Xu. 2023. Adaptive Gating in Mixture-of-Experts based Language Models. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 3577–3587, Singapore. Association for Computational Linguistics.
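
The abstract describes adaptive gating as routing each token to a variable number of experts based on the gating probability distribution. The sketch below is a minimal, hypothetical illustration of that idea, not the authors' implementation: the module name `AdaptiveGatingMoE`, the confidence threshold `tau`, and the top-1 vs. top-2 switch are assumptions made here for illustration only.

```python
# Minimal sketch of adaptive gating in an MoE layer (assumed PyTorch setup).
# Idea from the abstract: tokens with a confident top-1 gate probability are
# processed by a single expert; less certain tokens are sent to their top-2
# experts. This is an illustrative sketch, not the paper's implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class AdaptiveGatingMoE(nn.Module):
    def __init__(self, d_model: int, d_ff: int, num_experts: int, tau: float = 0.75):
        super().__init__()
        self.gate = nn.Linear(d_model, num_experts)
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
             for _ in range(num_experts)]
        )
        self.tau = tau  # assumed threshold: top-1 prob >= tau -> one expert only

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (num_tokens, d_model); probs: (num_tokens, num_experts)
        probs = F.softmax(self.gate(x), dim=-1)
        top2_p, top2_i = probs.topk(2, dim=-1)

        # Confident tokens use one expert; the rest use top-2 experts with
        # weights renormalized over the two selected experts.
        use_one = top2_p[:, 0] >= self.tau
        weights = top2_p / top2_p.sum(dim=-1, keepdim=True)
        weights = torch.where(
            use_one.unsqueeze(-1),
            torch.stack([torch.ones_like(weights[:, 0]),
                         torch.zeros_like(weights[:, 1])], dim=-1),
            weights,
        )

        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):
            for k in range(2):
                mask = top2_i[:, k] == e
                if k == 1:
                    # Skip the second slot for tokens routed to a single expert.
                    mask = mask & ~use_one
                if mask.any():
                    out[mask] += weights[mask, k].unsqueeze(-1) * expert(x[mask])
        return out


# Usage (shapes only): a batch of 10 token representations of width 16.
layer = AdaptiveGatingMoE(d_model=16, d_ff=32, num_experts=4)
y = layer(torch.randn(10, 16))  # -> (10, 16)
```

Because confident tokens skip the second expert, per-token compute varies with routing confidence while the layer stays sparse, which is the trade-off between computation per token and quality that the abstract highlights.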