iBet uBet web content aggregator. Adding the entire web to your favor.
iBet uBet web content aggregator. Adding the entire web to your favor.



Link to original content: https://api.openalex.org/works/doi:10.48550/ARXIV.2406.07198
{"id":"https://openalex.org/W4399597893","doi":"https://doi.org/10.48550/arxiv.2406.07198","title":"Target Speech Diarization with Multimodal Prompts","display_name":"Target Speech Diarization with Multimodal Prompts","publication_year":2024,"publication_date":"2024-06-11","ids":{"openalex":"https://openalex.org/W4399597893","doi":"https://doi.org/10.48550/arxiv.2406.07198"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.07198","pdf_url":"https://arxiv.org/pdf/2406.07198","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.07198","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016187230","display_name":"Yidi Jiang","orcid":"https://orcid.org/0000-0001-9013-0869"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Yidi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026034735","display_name":"Ruijie Tao","orcid":"https://orcid.org/0000-0003-0021-5661"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Ruijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101416769","display_name":"Zhengyang Chen","orcid":"https://orcid.org/0000-0003-1293-8146"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zhengyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100341993","display_name":"Yanmin Qian","orcid":"https://orcid.org/0000-0002-0314-3790"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Yanmin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5032690182","display_name":"Haizhou Li","orcid":"https://orcid.org/0000-0001-9158-9401"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haizhou","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9806,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9806,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.57212055}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.57212055},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.55459416},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.49910212},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.32221663},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.15666667}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.07198","pdf_url":"https://arxiv.org/pdf/2406.07198","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.07198","pdf_url":"https://arxiv.org/pdf/2406.07198","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4389984014","https://openalex.org/W3120512183","https://openalex.org/W2748952813","https://openalex.org/W2144208207","https://openalex.org/W2118860825","https://openalex.org/W2111874347","https://openalex.org/W2096510939","https://openalex.org/W2041797852","https://openalex.org/W1509309911"],"abstract_inverted_index":{"Traditional":[0],"speaker":[1,11,137],"diarization":[2,138],"seeks":[3],"to":[4,10,14,25,47,77,114,157,164],"detect":[5,19],"``who":[6],"spoke":[7],"when''":[8],"according":[9,24],"characteristics.":[12],"Extending":[13],"target":[15,21,49],"speech":[16,141],"diarization,":[17],"we":[18,102],"``when":[20],"event":[22],"occurs''":[23],"the":[26,116,122],"semantic":[27,58],"characteristics":[28],"of":[29,112,118],"speech.":[30],"We":[31,70,88],"propose":[32,72],"a":[33,52,73,85,90,153],"novel":[34],"Multimodal":[35],"Target":[36],"Speech":[37],"Diarization":[38],"(MM-TSD)":[39],"framework,":[40],"which":[41],"accommodates":[42],"diverse":[43],"and":[44,54,66,81,99,106,139,149],"multi-modal":[45,91],"prompts":[46,113],"specify":[48],"events":[50],"in":[51,121,130],"flexible":[53],"user-friendly":[55],"manner,":[56],"including":[57,136],"language":[59],"description,":[60],"pre-enrolled":[61],"speech,":[62],"pre-registered":[63],"face":[64,82],"image,":[65],"audio-language":[67],"logical":[68],"prompts.":[69,145],"further":[71],"voice-face":[74],"aligner":[75],"module":[76],"project":[78],"human":[79],"voice":[80],"representation":[83],"into":[84],"shared":[86],"space.":[87],"develop":[89],"dataset":[92],"based":[93],"on":[94],"VoxCeleb2":[95],"for":[96,109,168],"MM-TSD":[97,146,161],"training":[98],"evaluation.":[100],"Additionally,":[101],"conduct":[103],"comparative":[104],"analysis":[105],"ablation":[107],"studies":[108],"each":[110,119],"category":[111],"validate":[115],"efficacy":[117],"component":[120],"proposed":[123],"framework.":[124],"Furthermore,":[125],"our":[126],"framework":[127],"demonstrates":[128],"versatility":[129],"performing":[131],"various":[132],"signal":[133],"processing":[134],"tasks,":[135],"overlap":[140],"detection,":[142],"using":[143],"task-specific":[144],"achieves":[147],"robust":[148],"comparable":[150],"performance":[151],"as":[152],"unified":[154],"system":[155],"compared":[156],"specialized":[158],"models.":[159],"Moreover,":[160],"shows":[162],"capability":[163],"handle":[165],"complex":[166],"conversations":[167],"real-world":[169],"dataset.":[170]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4399597893","counts_by_year":[],"updated_date":"2024-12-07T18:02:42.535447","created_date":"2024-06-13"}