@inproceedings{wilkinson-etal-2020-semi,
title = "Semi-supervised Acoustic Modelling for Five-lingual Code-switched {ASR} using Automatically-segmented Soap Opera Speech",
author = "Wilkinson, Nick and
Biswas, Astik and
Yilmaz, Emre and
De Wet, Febe and
Van der westhuizen, Ewald and
Niesler, Thomas",
editor = "Beermann, Dorothee and
Besacier, Laurent and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources association",
url = "https://aclanthology.org/2020.sltu-1.10/",
pages = "70--78",
language = "eng",
ISBN = "979-10-95546-35-1",
abstract = "This paper considers the impact of automatic segmentation on the fully-automatic, semi-supervised training of automatic speech recog-nition (ASR) systems for five-lingual code-switched (CS) speech. Four automatic segmentation techniques were evaluated in terms ofthe recognition performance of an ASR system trained on the resulting segments in a semi-supervised manner. For comparative purposesa semi-supervised syste Three of these use a newly proposed convolutional neural network (CNN) model for framewise classification,and include a novel form of HMM smoothing of the CNN outputs. Automatic segmentation was applied in combination with automaticspeaker diarization. The best-performing segmentation technique was also evaluated without speaker diarization. An evaluation basedon 248 unsegmented soap opera episodes indicated that voice activity detection (VAD) based on a CNN followed by Gaussian mixturemodel-hidden Markov model smoothing (CNN-GMM-HMM) yields the best ASR performance. The semi-supervised system trainedwith the best automatic segmentation achieved an overall WER improvement of 1.1{\%} absolute over a semi-supervised system trainedwith manually created segments. Furthermore, we found that recognition rates improved even further when the automatic segmentationwas used in conjunction with speaker diarization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wilkinson-etal-2020-semi">
<titleInfo>
<title>Semi-supervised Acoustic Modelling for Five-lingual Code-switched ASR using Automatically-segmented Soap Opera Speech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nick</namePart>
<namePart type="family">Wilkinson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Astik</namePart>
<namePart type="family">Biswas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emre</namePart>
<namePart type="family">Yilmaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Febe</namePart>
<namePart type="family">De Wet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ewald</namePart>
<namePart type="family">Van der westhuizen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Niesler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>This paper considers the impact of automatic segmentation on the fully-automatic, semi-supervised training of automatic speech recog-nition (ASR) systems for five-lingual code-switched (CS) speech. Four automatic segmentation techniques were evaluated in terms ofthe recognition performance of an ASR system trained on the resulting segments in a semi-supervised manner. For comparative purposesa semi-supervised syste Three of these use a newly proposed convolutional neural network (CNN) model for framewise classification,and include a novel form of HMM smoothing of the CNN outputs. Automatic segmentation was applied in combination with automaticspeaker diarization. The best-performing segmentation technique was also evaluated without speaker diarization. An evaluation basedon 248 unsegmented soap opera episodes indicated that voice activity detection (VAD) based on a CNN followed by Gaussian mixturemodel-hidden Markov model smoothing (CNN-GMM-HMM) yields the best ASR performance. The semi-supervised system trainedwith the best automatic segmentation achieved an overall WER improvement of 1.1% absolute over a semi-supervised system trainedwith manually created segments. Furthermore, we found that recognition rates improved even further when the automatic segmentationwas used in conjunction with speaker diarization.</abstract>
<identifier type="citekey">wilkinson-etal-2020-semi</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.10/</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>70</start>
<end>78</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semi-supervised Acoustic Modelling for Five-lingual Code-switched ASR using Automatically-segmented Soap Opera Speech
%A Wilkinson, Nick
%A Biswas, Astik
%A Yilmaz, Emre
%A De Wet, Febe
%A Van der westhuizen, Ewald
%A Niesler, Thomas
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources association
%C Marseille, France
%@ 979-10-95546-35-1
%G eng
%F wilkinson-etal-2020-semi
%X This paper considers the impact of automatic segmentation on the fully-automatic, semi-supervised training of automatic speech recog-nition (ASR) systems for five-lingual code-switched (CS) speech. Four automatic segmentation techniques were evaluated in terms ofthe recognition performance of an ASR system trained on the resulting segments in a semi-supervised manner. For comparative purposesa semi-supervised syste Three of these use a newly proposed convolutional neural network (CNN) model for framewise classification,and include a novel form of HMM smoothing of the CNN outputs. Automatic segmentation was applied in combination with automaticspeaker diarization. The best-performing segmentation technique was also evaluated without speaker diarization. An evaluation basedon 248 unsegmented soap opera episodes indicated that voice activity detection (VAD) based on a CNN followed by Gaussian mixturemodel-hidden Markov model smoothing (CNN-GMM-HMM) yields the best ASR performance. The semi-supervised system trainedwith the best automatic segmentation achieved an overall WER improvement of 1.1% absolute over a semi-supervised system trainedwith manually created segments. Furthermore, we found that recognition rates improved even further when the automatic segmentationwas used in conjunction with speaker diarization.
%U https://aclanthology.org/2020.sltu-1.10/
%P 70-78
Markdown (Informal)
[Semi-supervised Acoustic Modelling for Five-lingual Code-switched ASR using Automatically-segmented Soap Opera Speech](https://aclanthology.org/2020.sltu-1.10/) (Wilkinson et al., SLTU 2020)
ACL