{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T16:40:26Z","timestamp":1730047226837,"version":"3.28.0"},"reference-count":58,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271342","12004275"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1016\/j.knosys.2024.112123","type":"journal-article","created":{"date-parts":[[2024,6,13]],"date-time":"2024-06-13T19:29:09Z","timestamp":1718306949000},"page":"112123","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Speech emotion recognition based on bi-directional acoustic\u2013articulatory conversion"],"prefix":"10.1016","volume":"299","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-7203-3894","authenticated-orcid":false,"given":"Haifeng","family":"Li","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-2035-0329","authenticated-orcid":false,"given":"Xueying","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Shufei","family":"Duan","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0003-4408-4528","authenticated-orcid":false,"given":"Huizhi","family":"Liang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"1","key":"10.1016\/j.knosys.2024.112123_b1","doi-asserted-by":"crossref","first-page":"17","DOI":"10.1109\/TAFFC.2022.3205919","article-title":"The acoustically emotion-aware conversational agent with speech emotion recognition and empathetic responses","volume":"14","author":"Hu","year":"2022","journal-title":"IEEE Trans. Affect. Comput."},{"journal-title":"IEEE Trans. Intell. 
Veh.","article-title":"Global-local-feature-fused driver speech emotion detection for intelligent cockpit in automated driving","year":"2023","author":"Li","key":"10.1016\/j.knosys.2024.112123_b2"},{"issue":"2","key":"10.1016\/j.knosys.2024.112123_b3","doi-asserted-by":"crossref","first-page":"186","DOI":"10.1111\/acps.13388","article-title":"A generalizable speech emotion recognition model reveals depression and remission","volume":"145","author":"Hansen","year":"2022","journal-title":"Acta Psychiatr. Scand."},{"key":"10.1016\/j.knosys.2024.112123_b4","doi-asserted-by":"crossref","first-page":"1184","DOI":"10.1016\/j.procs.2018.05.033","article-title":"An automated psychometric analyzer based on sentiment analysis and emotion recognition for healthcare","volume":"132","author":"Vij","year":"2018","journal-title":"Proc. Comput. Sci."},{"issue":"3","key":"10.1016\/j.knosys.2024.112123_b5","doi-asserted-by":"crossref","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","article-title":"Survey on speech emotion recognition: Features, classification schemes, and databases","volume":"44","author":"El Ayadi","year":"2011","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2024.112123_b6","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2024.112123_b7","doi-asserted-by":"crossref","unstructured":"D. Hu, X. Hu, X. Xu, Multiple Enhancements to LSTM for Learning Emotion-Salient Features in Speech Emotion Recognition, in: Proc. Interspeech 2022, 2022, pp. 4720\u20134724.","DOI":"10.21437\/Interspeech.2022-985"},{"issue":"3","key":"10.1016\/j.knosys.2024.112123_b8","doi-asserted-by":"crossref","first-page":"532","DOI":"10.1109\/TAFFC.2018.2817622","article-title":"EEG emotion recognition using dynamical graph convolutional neural networks","volume":"11","author":"Song","year":"2018","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.knosys.2024.112123_b9","doi-asserted-by":"crossref","first-page":"2617","DOI":"10.1109\/TASLP.2021.3096037","article-title":"Information fusion in attention networks using adaptive and multi-level factorized bilinear pooling for audio-visual emotion recognition","volume":"29","author":"Zhou","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.knosys.2024.112123_b10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1155\/2021\/8876005","article-title":"Articulatory-to-acoustic conversion of mandarin emotional speech based on PSO-LSSVM","volume":"2021","author":"Ren","year":"2021","journal-title":"Complexity"},{"key":"10.1016\/j.knosys.2024.112123_b11","doi-asserted-by":"crossref","unstructured":"S. Lee, S. Yildirim, A. Kazemzadeh, S. 
Narayanan, An articulatory study of emotional speech production, in: Ninth European Conference on Speech Communication and Technology, 2005.","DOI":"10.21437\/Interspeech.2005-325"},{"key":"10.1016\/j.knosys.2024.112123_b12","doi-asserted-by":"crossref","first-page":"82","DOI":"10.1016\/j.specom.2023.01.005","article-title":"A study of correlation between physiological process of articulation and emotions on Mandarin Chinese","volume":"147","author":"Zhang","year":"2023","journal-title":"Speech Commun."},{"issue":"3","key":"10.1016\/j.knosys.2024.112123_b13","doi-asserted-by":"crossref","first-page":"1819","DOI":"10.1121\/1.416001","article-title":"Accurate recovery of articulator positions from acoustics: New conclusions based on human data","volume":"100","author":"Hogden","year":"1996","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.knosys.2024.112123_b14","series-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4450","article-title":"A deep recurrent approach for acoustic-to-articulatory inversion","author":"Liu","year":"2015"},{"key":"10.1016\/j.knosys.2024.112123_b15","doi-asserted-by":"crossref","first-page":"177995","DOI":"10.1109\/ACCESS.2020.3026579","article-title":"Silent speech interfaces for speech restoration: A review","volume":"8","author":"Gonzalez-Lopez","year":"2020","journal-title":"IEEE Access"},{"issue":"1","key":"10.1016\/j.knosys.2024.112123_b16","doi-asserted-by":"crossref","first-page":"433","DOI":"10.1121\/1.4904701","article-title":"Reduction of non-native accents through statistical parametric articulatory synthesis","volume":"137","author":"Aryal","year":"2015","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.knosys.2024.112123_b17","doi-asserted-by":"crossref","unstructured":"J. Kim, S. Lee, S. Narayanan, An exploratory study of the relations between perceived emotion strength and articulatory kinematics, in: Twelfth Annual Conference of the International Speech Communication Association, 2011.","DOI":"10.21437\/Interspeech.2011-741"},{"key":"10.1016\/j.knosys.2024.112123_b18","doi-asserted-by":"crossref","first-page":"135","DOI":"10.1109\/TASLP.2021.3133218","article-title":"Acoustic-to-articulatory mapping with joint optimization of deep speech enhancement and articulatory inversion models","volume":"30","author":"Shahrebabaki","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.knosys.2024.112123_b19","doi-asserted-by":"crossref","first-page":"196","DOI":"10.1016\/j.csl.2015.05.003","article-title":"Speaker verification based on the fusion of speech acoustics and inverted articulatory signals","volume":"36","author":"Li","year":"2016","journal-title":"Comput. 
Speech Lang."},{"key":"10.1016\/j.knosys.2024.112123_b20","doi-asserted-by":"crossref","first-page":"68","DOI":"10.1016\/j.knosys.2014.03.019","article-title":"Speech emotion recognition using amplitude modulation parameters and a combined feature selection procedure","volume":"63","author":"Mencattini","year":"2014","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2024.112123_b21","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2019.104886","article-title":"Bagged support vector machines for emotion recognition from speech","volume":"184","author":"Bhavan","year":"2019","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2024.112123_b22","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2022.108580","article-title":"Deep learning based multimodal emotion recognition using model-level fusion of audio\u2013visual modalities","volume":"244","author":"Middya","year":"2022","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2024.112123_b23","doi-asserted-by":"crossref","first-page":"68","DOI":"10.1016\/j.knosys.2014.03.019","article-title":"Speech emotion recognition using amplitude modulation parameters and a combined feature selection procedure","volume":"63","author":"Mencattini","year":"2014","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2024.112123_b24","series-title":"2020 Ieee Region 10 Conference","first-page":"968","article-title":"On the differences between song and speech emotion recognition: Effect of feature sets, feature types, and classifiers","author":"Atmaja","year":"2020"},{"key":"10.1016\/j.knosys.2024.112123_b25","article-title":"MLT-dnet: Speech emotion recognition using 1D dilated CNN based on multi-learning trick approach","volume":"167","author":"Kwon","year":"2021","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2024.112123_b26","series-title":"2019 IEEE 38th International Performance Computing and Communications Conference","first-page":"1","article-title":"HS-TCN: A semi-supervised hierarchical stacking temporal convolutional network for anomaly detection in IoT","author":"Cheng","year":"2019"},{"key":"10.1016\/j.knosys.2024.112123_b27","series-title":"2013 IEEE Workshop on Automatic Speech Recognition and Understanding","first-page":"216","article-title":"Emotion recognition from spontaneous speech using hidden markov models with deep belief networks","author":"Le","year":"2013"},{"key":"10.1016\/j.knosys.2024.112123_b28","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2022.108472","article-title":"ATDA: Attentional temporal dynamic activation for speech emotion recognition","volume":"243","author":"Liu","year":"2022","journal-title":"Knowl.-Based Syst."},{"issue":"6","key":"10.1016\/j.knosys.2024.112123_b29","doi-asserted-by":"crossref","first-page":"1171","DOI":"10.1109\/TASL.2009.2014796","article-title":"Integrating articulatory features into HMM-based parametric speech synthesis","volume":"17","author":"Ling","year":"2009","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"issue":"5","key":"10.1016\/j.knosys.2024.112123_b30","doi-asserted-by":"crossref","first-page":"695","DOI":"10.1109\/LSP.2018.2819886","article-title":"Statistical parametric speech synthesis using generalized distillation framework","volume":"25","author":"Liu","year":"2018","journal-title":"IEEE Signal Process. 
Lett."},{"year":"2015","series-title":"Statistical Parametric Methods for Articulatory-Based Foreign Accent Conversion","author":"Aryal","key":"10.1016\/j.knosys.2024.112123_b31"},{"year":"2021","series-title":"Learning robust speech representation with an articulatory-regularized variational autoencoder","author":"Georges","key":"10.1016\/j.knosys.2024.112123_b32"},{"issue":"2","key":"10.1016\/j.knosys.2024.112123_b33","doi-asserted-by":"crossref","first-page":"680","DOI":"10.1109\/TAFFC.2019.2947464","article-title":"Spontaneous speech emotion recognition using multiscale deep convolutional LSTM","volume":"13","author":"Zhang","year":"2019","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.knosys.2024.112123_b34","series-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4450","article-title":"A deep recurrent approach for acoustic-to-articulatory inversion","author":"Liu","year":"2015"},{"key":"10.1016\/j.knosys.2024.112123_b35","doi-asserted-by":"crossref","unstructured":"C. Qin, M.\u00c1. Carreira-Perpi\u00f1\u00e1n, An empirical investigation of the nonuniqueness in the acoustic-to-articulatory mapping, in: Eighth Annual Conference of the International Speech Communication Association, 2007.","DOI":"10.21437\/Interspeech.2007-16"},{"key":"10.1016\/j.knosys.2024.112123_b36","series-title":"Proceedings of the 2012 Asia Pacific Signal and Information Processing Association Annual Summit and Conference","first-page":"1","article-title":"A study of emotional information present in articulatory movements estimated using acoustic-to-articulatory inversion","author":"Kim","year":"2012"},{"issue":"1","key":"10.1016\/j.knosys.2024.112123_b37","doi-asserted-by":"crossref","DOI":"10.1515\/opli-2016-0034","article-title":"Articulation, acoustics and perception of mandarin Chinese emotional speech","volume":"2","author":"Erickson","year":"2016","journal-title":"Open Linguist."},{"journal-title":"IEEE Trans. Circuits Syst. Video Technol.","article-title":"Beyond single reference for training: underwater image enhancement via comparative learning","year":"2022","author":"Li","key":"10.1016\/j.knosys.2024.112123_b38"},{"key":"10.1016\/j.knosys.2024.112123_b39","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.118943","article-title":"Learning multi-scale features for speech emotion recognition with connection attention mechanism","volume":"214","author":"Chen","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2024.112123_b40","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6437","article-title":"Speech emotion recognition with global-aware fusion on multi-scale feature representation","author":"Zhu","year":"2022"},{"key":"10.1016\/j.knosys.2024.112123_b41","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"7367","article-title":"Speech emotion recognition with co-attention based multi-level acoustic information","author":"Zou","year":"2022"},{"key":"10.1016\/j.knosys.2024.112123_b42","doi-asserted-by":"crossref","first-page":"312","DOI":"10.1016\/j.bspc.2018.08.035","article-title":"Speech emotion recognition using deep 1D & 2D CNN LSTM networks","volume":"47","author":"Zhao","year":"2019","journal-title":"Biomed. Signal Process. 
Control"},{"issue":"18","key":"10.1016\/j.knosys.2024.112123_b43","doi-asserted-by":"crossref","first-page":"5212","DOI":"10.3390\/s20185212","article-title":"Deep-net: A lightweight CNN-based speech emotion recognition system using deep frequency features","volume":"20","author":"Anvarjon","year":"2020","journal-title":"Sensors"},{"issue":"6","key":"10.1016\/j.knosys.2024.112123_b44","doi-asserted-by":"crossref","first-page":"1576","DOI":"10.1109\/TMM.2017.2766843","article-title":"Speech emotion recognition using deep convolutional neural network and discriminant temporal pyramid matching","volume":"20","author":"Zhang","year":"2017","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2024.112123_b45","series-title":"2019 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","first-page":"878","article-title":"Cyclegan-based speech enhancement for the unpaired training data","author":"Yuan","year":"2019"},{"journal-title":"IEEE Trans. Affect. Comput.","article-title":"Unsupervised cross-corpus speech emotion recognition using a multi-source cycle-gan","year":"2022","author":"Su","key":"10.1016\/j.knosys.2024.112123_b46"},{"key":"10.1016\/j.knosys.2024.112123_b47","series-title":"Interspeech","first-page":"1517","article-title":"A database of german emotional speech","volume":"Vol. 5","author":"Burkhardt","year":"2005"},{"year":"2005","series-title":"CAISA mandarin emotional speech corpus","key":"10.1016\/j.knosys.2024.112123_b48"},{"issue":"5","key":"10.1016\/j.knosys.2024.112123_b49","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0196391","article-title":"The ryerson audio-visual database of emotional speech and song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American english","volume":"13","author":"Livingstone","year":"2018","journal-title":"PLoS One"},{"key":"10.1016\/j.knosys.2024.112123_b50","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.specom.2019.04.004","article-title":"Improving multilingual speech emotion recognition by combining acoustic features in a three-layer model","volume":"110","author":"Li","year":"2019","journal-title":"Speech Commun."},{"issue":"5","key":"10.1016\/j.knosys.2024.112123_b51","doi-asserted-by":"crossref","first-page":"1181","DOI":"10.1109\/TIFS.2018.2871749","article-title":"Deep residual network for steganalysis of digital images","volume":"14","author":"Boroumand","year":"2018","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"year":"2018","series-title":"Transfer learning for improving speech emotion classification accuracy","author":"Latif","key":"10.1016\/j.knosys.2024.112123_b52"},{"key":"10.1016\/j.knosys.2024.112123_b53","doi-asserted-by":"crossref","first-page":"309","DOI":"10.1016\/j.ins.2021.02.016","article-title":"Speech emotion recognition based on formant characteristics feature extraction and phoneme type convergence","volume":"563","author":"Liu","year":"2021","journal-title":"Inform. 
Sci."},{"key":"10.1016\/j.knosys.2024.112123_b54","doi-asserted-by":"crossref","first-page":"53","DOI":"10.1016\/j.specom.2022.11.005","article-title":"Modulation spectral features for speech emotion recognition using deep neural networks","volume":"146","author":"Singh","year":"2023","journal-title":"Speech Commun."},{"key":"10.1016\/j.knosys.2024.112123_b55","doi-asserted-by":"crossref","first-page":"3705","DOI":"10.1007\/s11042-017-5539-3","article-title":"Spectrogram based multi-task audio classification","volume":"78","author":"Zeng","year":"2019","journal-title":"Multimedia Tools Appl."},{"issue":"4","key":"10.1016\/j.knosys.2024.112123_b56","article-title":"Speech emotion recognition using deep convolutional neural network and simple recurrent unit","volume":"27","author":"Jiang","year":"2019","journal-title":"Eng. Lett."},{"issue":"21","key":"10.1016\/j.knosys.2024.112123_b57","doi-asserted-by":"crossref","first-page":"9897","DOI":"10.3390\/app11219897","article-title":"A novel heterogeneous parallel convolution bi-LSTM for speech emotion recognition","volume":"11","author":"Zhang","year":"2021","journal-title":"Appl. Sci."},{"key":"10.1016\/j.knosys.2024.112123_b58","series-title":"Interspeech","first-page":"1686","article-title":"Deep learning of segment-level feature representation with multiple instance learning for utterance-level speech emotion recognition","author":"Mao","year":"2019"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705124007573?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705124007573?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T16:03:09Z","timestamp":1730044989000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705124007573"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9]]},"references-count":58,"alternative-id":["S0950705124007573"],"URL":"http:\/\/dx.doi.org\/10.1016\/j.knosys.2024.112123","relation":{},"ISSN":["0950-7051"],"issn-type":[{"type":"print","value":"0950-7051"}],"subject":[],"published":{"date-parts":[[2024,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Speech emotion recognition based on bi-directional acoustic\u2013articulatory conversion","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2024.112123","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"112123"}}