{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T10:57:20Z","timestamp":1730199440889,"version":"3.28.0"},"reference-count":66,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389765","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T13:38:40Z","timestamp":1705671520000},"page":"1-8","source":"Crossref","is-referenced-by-count":1,"title":["WaveNeXt: ConvNeXt-Based Fast Neural Vocoder Without ISTFT layer"],"prefix":"10.1109","author":[{"given":"Takuma","family":"Okamoto","sequence":"first","affiliation":[{"name":"National Institute of Information and Communications Technology,Japan"}]},{"given":"Haruki","family":"Yamashita","sequence":"additional","affiliation":[{"name":"Kobe University,Japan"}]},{"given":"Yamato","family":"Ohtani","sequence":"additional","affiliation":[{"name":"National Institute of Information and Communications Technology,Japan"}]},{"given":"Tomoki","family":"Toda","sequence":"additional","affiliation":[{"name":"Nagoya University,Japan"}]},{"given":"Hisashi","family":"Kawai","sequence":"additional","affiliation":[{"name":"National Institute of Information and Communications Technology,Japan"}]}],"member":"263","reference":[{"first-page":"125","article-title":"WaveNet: A generative model for raw audio","volume-title":"Proc. SSW9","author":"van den Oord","key":"ref1"},{"first-page":"2415","article-title":"Efficient neural audio synthesis","volume-title":"Proc. ICML","author":"Kalchbrenner","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682804"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3089565"},{"article-title":"WaveGrad: Estimating gradients for waveform generation","volume-title":"Proc. ICLR","author":"Chen","key":"ref5"},{"article-title":"DiffWave: A versatile diffusion model for audio synthesis","volume-title":"Proc. ICLR","author":"Kong","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415087"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022496"},{"first-page":"14910","article-title":"MelGAN: Generative adversarial networks for conditional waveform synthesis","volume-title":"Proc. NeurIPS","author":"Kumar","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/SLT48900.2021.9383551"},{"first-page":"17022","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proc. NeurIPS","author":"Kong","key":"ref11"},{"first-page":"2672","article-title":"Generative adversarial nets","volume-title":"Proc. NIPS","author":"Goodfellow","key":"ref12"},{"first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. ICML","author":"Kim","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-831"},{"first-page":"2709","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. ICML","author":"Casanova","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10294"},{"article-title":"AutoTTS: Endto-end text-to-speech synthesis through differentiable duration modeling","volume-title":"Proc. ICASSP","author":"Nguyen","key":"ref17","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP49357.2023.10095431"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747020"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2518"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095102"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097255"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096442"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096509"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096250"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095298"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3275032"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3164361"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1121\/10.0016896"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-845"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1016"},{"article-title":"Chunked autoregressive GAN for conditional waveform synthesis","volume-title":"Proc. ICLR","author":"Morrison","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10096288"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688194"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746713"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095296"},{"issue":"88","key":"ref36","first-page":"73","article-title":"Fast neural waveform generation model with fully connected upsampling","volume-title":"IEICE Tech. Rep","volume":"123","author":"Yamashita","year":"2023"},{"journal-title":"arXiv:2306.00814","article-title":"Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis","year":"2023","author":"Siuzdak","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"article-title":"CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit (version 0.92)","year":"2019","author":"Yamagishi","key":"ref39"},{"article-title":"Hi-Fi-CAPTAIN: High-fidelity and high-capacity conversational speech synthesis corpus developed by NICT","year":"2023","author":"Okamoto","key":"ref40"},{"first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. NeurIPS","author":"Paszke","key":"ref41"},{"journal-title":"arXiv:2110.07840","article-title":"ESPnet2-TTS: Extending the edge of TTS research","year":"2021","author":"Hayashi","key":"ref42"},{"journal-title":"arXiv:1606.08415","article-title":"Gaussian Error Linear Units (GELUs)","year":"2016","author":"Hendrycks","key":"ref43"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/78.258122"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269005"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462237"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639687"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/5.52200"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.207"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"journal-title":"arXiv:1607.06450","article-title":"Layer normalization","year":"2016","author":"Ba","key":"ref51"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.195"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00109"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. ICLR","author":"Ren","key":"ref55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747707"},{"first-page":"8067","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","volume-title":"Proc. NeurIPS","author":"Kim","key":"ref58"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-68"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2020EDP7104"},{"first-page":"7","article-title":"Corpus of spontaneous Japanese: Its design and evaluation","volume-title":"Proc. SSPR","author":"Maekawa","key":"ref62"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414858"},{"journal-title":"Methods for subjective determination of transmission quality","year":"1996","key":"ref64"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-677"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2023,12,16]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389765.pdf?arnumber=10389765","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T11:42:50Z","timestamp":1706010170000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389765\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":66,"URL":"http:\/\/dx.doi.org\/10.1109\/asru57964.2023.10389765","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}