{"id":"https://openalex.org/W4387995154","doi":"https://doi.org/10.48550/arxiv.2310.17101","title":"Multi-Speaker Expressive Speech Synthesis via Semi-supervised Contrastive Learning","display_name":"Multi-Speaker Expressive Speech Synthesis via Semi-supervised Contrastive Learning","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4387995154","doi":"https://doi.org/10.48550/arxiv.2310.17101"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2310.17101","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2310.17101","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101883211","display_name":"Xinfa Zhu","orcid":"https://orcid.org/0000-0001-9275-523X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xinfa","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100697204","display_name":"Yuke Li","orcid":"https://orcid.org/0009-0000-7282-8964"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013928267","display_name":"Yi Lei","orcid":"https://orcid.org/0000-0002-9256-9311"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lei, Yi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052756710","display_name":"Ning Jiang","orcid":"https://orcid.org/0000-0003-1579-3114"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Ning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100628448","display_name":"Guoqing Zhao","orcid":"https://orcid.org/0000-0001-7130-7147"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Guoqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Lei","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":69},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition Technology","score":0.9969,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition Technology","score":0.9969,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech Enhancement Techniques","score":0.9911,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Audio Signal Classification and Analysis","score":0.9654,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.78310454},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.69227636},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech Enhancement","score":0.538797},{"id":"https://openalex.org/keywords/end-to-end-speech-recognition","display_name":"End-to-End Speech Recognition","score":0.536596},{"id":"https://openalex.org/keywords/audio-visual-speech-recognition","display_name":"Audio-Visual Speech Recognition","score":0.534938},{"id":"https://openalex.org/keywords/statistical-language-modeling","display_name":"Statistical Language Modeling","score":0.528105},{"id":"https://openalex.org/keywords/speaker-verification","display_name":"Speaker Verification","score":0.513697},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.44004765}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.78310454},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.69227636},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6401608},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.49614915},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.48896348},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4670996},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.44004765},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.41885337},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37265325},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2310.17101","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2310.17101","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2310.17101","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.56,"id":"https://metadata.un.org/sdg/4","display_name":"Quality education"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3177678247","https://openalex.org/W2944572343","https://openalex.org/W2529301793","https://openalex.org/W2384121599","https://openalex.org/W2366107444","https://openalex.org/W2351687372","https://openalex.org/W2333799855","https://openalex.org/W2038083449","https://openalex.org/W2004087835","https://openalex.org/W1999617572"],"abstract_inverted_index":{"This":[0],"paper":[1],"aims":[2],"to":[3,32,63,81,85,110],"build":[4],"an":[5,104],"expressive":[6,112],"TTS":[7,30],"system":[8],"for":[9,119],"multi-speakers,":[10],"synthesizing":[11],"a":[12,26,77,120],"target":[13,121],"speaker's":[14],"speech":[15,113],"with":[16,114],"multiple":[17],"styles":[18,116],"and":[19,35,48,59,69,95,117],"emotions.":[20],"To":[21],"this":[22],"end,":[23],"we":[24,40,75],"propose":[25],"novel":[27],"contrastive":[28,61],"learning-based":[29],"approach":[31,84],"transfer":[33],"style":[34],"emotion":[36],"across":[37],"speakers.":[38],"Specifically,":[39],"construct":[41],"positive-negative":[42],"sample":[43],"pairs":[44],"at":[45],"both":[46],"utterance":[47],"category":[49],"(such":[50],"as":[51],"emotion-happy":[52],"or":[53,55],"style-poet":[54],"speaker":[56,70],"A)":[57],"levels":[58],"leverage":[60,87],"learning":[62],"better":[64],"extract":[65],"disentangled":[66],"style,":[67],"emotion,":[68],"representations":[71,102],"from":[72],"speech.":[73],"Furthermore,":[74],"introduce":[76],"semi-supervised":[78],"training":[79],"strategy":[80],"the":[82,100,128],"proposed":[83],"effectively":[86],"multi-domain":[88,125],"data,":[89,92,94],"including":[90],"style-labeled":[91],"emotion-labeled":[93],"unlabeled":[96],"data.":[97],"We":[98],"integrate":[99],"learned":[101],"into":[103],"improved":[105],"VITS":[106],"model,":[107],"enabling":[108],"it":[109],"synthesize":[111],"diverse":[115],"emotions":[118],"speaker.":[122],"Experiments":[123],"on":[124],"data":[126],"demonstrate":[127],"good":[129],"design":[130],"of":[131],"our":[132],"model.":[133]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4387995154","counts_by_year":[],"updated_date":"2024-11-22T09:36:10.458114","created_date":"2023-10-28"}