iBet uBet web content aggregator. Adding the entire web to your favor.
iBet uBet web content aggregator. Adding the entire web to your favor.



Link to original content: https://api.openalex.org/works/doi:10.48550/ARXIV.2408.09027
{"id":"https://openalex.org/W4402502977","doi":"https://doi.org/10.48550/arxiv.2408.09027","title":"Efficient Autoregressive Audio Modeling via Next-Scale Prediction","display_name":"Efficient Autoregressive Audio Modeling via Next-Scale Prediction","publication_year":2024,"publication_date":"2024-08-16","ids":{"openalex":"https://openalex.org/W4402502977","doi":"https://doi.org/10.48550/arxiv.2408.09027"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.09027","pdf_url":"http://arxiv.org/pdf/2408.09027","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2408.09027","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102742709","display_name":"Kai Qiu","orcid":"https://orcid.org/0000-0003-2071-4425"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100394772","display_name":"Xiang Li","orcid":"https://orcid.org/0000-0002-4962-002X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017505188","display_name":"Hao Chen","orcid":"https://orcid.org/0000-0002-0615-9261"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101909170","display_name":"Sun Jie","orcid":"https://orcid.org/0000-0002-4544-1913"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Jie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070492669","display_name":"Jinglu Wang","orcid":"https://orcid.org/0000-0002-3222-6579"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jinglu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100600815","display_name":"Zhe Lin","orcid":"https://orcid.org/0000-0002-4593-3897"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Zhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057959136","display_name":"Marios Savvides","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Savvides, Marios","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5113017615","display_name":"Bhiksha Raj","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raj, Bhiksha","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":85},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Audio Signal Classification and Analysis","score":0.9972,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Audio Signal Classification and Analysis","score":0.9972,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition Technology","score":0.9806,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech Enhancement Techniques","score":0.9781,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/acoustic-modeling","display_name":"Acoustic Modeling","score":0.692042},{"id":"https://openalex.org/keywords/environmental-sound-recognition","display_name":"Environmental Sound Recognition","score":0.658729},{"id":"https://openalex.org/keywords/audio-visual-speech-recognition","display_name":"Audio-Visual Speech Recognition","score":0.63874},{"id":"https://openalex.org/keywords/audio-event-detection","display_name":"Audio Event Detection","score":0.632037},{"id":"https://openalex.org/keywords/automatic-speech-recognition","display_name":"Automatic Speech Recognition","score":0.628014}],"concepts":[{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.8684336},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.6584812},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.55938804},{"id":"https://openalex.org/C194657046","wikidata":"https://www.wikidata.org/wiki/Q7394685","display_name":"STAR model","level":4,"score":0.45171455},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.39655852},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.38469067},{"id":"https://openalex.org/C24338571","wikidata":"https://www.wikidata.org/wiki/Q2566298","display_name":"Autoregressive integrated moving average","level":3,"score":0.26948684},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.18203625},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.17834607},{"id":"https://openalex.org/C151406439","wikidata":"https://www.wikidata.org/wiki/Q186588","display_name":"Time series","level":2,"score":0.15792575},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.110235035},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.09943828}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.09027","pdf_url":"http://arxiv.org/pdf/2408.09027","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.09027","pdf_url":"http://arxiv.org/pdf/2408.09027","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3120578569","https://openalex.org/W2439807930","https://openalex.org/W2168175994","https://openalex.org/W2120434453","https://openalex.org/W2024529895","https://openalex.org/W2019155478","https://openalex.org/W2009692134","https://openalex.org/W1972271943","https://openalex.org/W1902630399","https://openalex.org/W1487412319"],"abstract_inverted_index":{"Audio":[0,139],"generation":[1,36],"has":[2],"achieved":[3],"remarkable":[4,131],"progress":[5],"with":[6,75],"the":[7,25,32,61,94,104,112,115,125,145],"advance":[8],"of":[9,30,34,64,114],"sophisticated":[10],"generative":[11],"models,":[12],"such":[13],"as":[14],"diffusion":[15],"models":[16,47,54],"(DMs)":[17],"and":[18,67,107,123,136],"autoregressive":[19],"(AR)":[20],"models.":[21],"However,":[22],"due":[23],"to":[24,41,98],"naturally":[26],"significant":[27],"sequence":[28],"length":[29,63],"audio,":[31],"efficiency":[33],"audio":[35,65],"remains":[37],"an":[38],"essential":[39],"issue":[40],"be":[42],"addressed,":[43],"especially":[44],"for":[45],"AR":[46,96,100],"that":[48],"are":[49],"incorporated":[50],"in":[51],"large":[52],"language":[53],"(LLMs).":[55],"In":[56],"this":[57],"paper,":[58],"we":[59,118],"analyze":[60,120],"token":[62],"tokenization":[66],"propose":[68],"a":[69,82,130],"novel":[70],"\\textbf{S}cale-level":[71],"\\textbf{A}udio":[72],"\\textbf{T}okenizer":[73],"(SAT),":[74],"improved":[76],"residual":[77],"quantization.":[78],"Based":[79],"on":[80,144],"SAT,":[81],"scale-level":[83],"\\textbf{A}coustic":[84],"\\textbf{A}uto\\textbf{R}egressive":[85],"(AAR)":[86],"modeling":[87],"framework":[88,128],"is":[89],"further":[90],"proposed,":[91],"which":[92],"shifts":[93],"next-token":[95],"prediction":[97],"next-scale":[99],"prediction,":[101],"significantly":[102],"reducing":[103],"training":[105],"cost":[106],"inference":[108,134],"time.":[109],"To":[110],"validate":[111],"effectiveness":[113],"proposed":[116,126],"approach,":[117],"comprehensively":[119],"design":[121],"choices":[122],"demonstrate":[124],"AAR":[127],"achieves":[129],"\\textbf{35}$\\times$":[132],"faster":[133],"speed":[135],"+\\textbf{1.33}":[137],"Fr\\'echet":[138],"Distance":[140],"(FAD)":[141],"against":[142],"baselines":[143],"AudioSet":[146],"benchmark.":[147],"Code:":[148],"\\url{https://github.com/qiuk2/AAR}.":[149]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4402502977","counts_by_year":[],"updated_date":"2024-11-23T01:45:43.052186","created_date":"2024-09-14"}