iBet uBet web content aggregator. Adding the entire web to your favor.
iBet uBet web content aggregator. Adding the entire web to your favor.



Link to original content: https://api.openalex.org/works/doi:10.48550/ARXIV.2406.03476
{"id":"https://openalex.org/W4399448194","doi":"https://doi.org/10.48550/arxiv.2406.03476","title":"Does your data spark joy? Performance gains from domain upsampling at\n the end of training","display_name":"Does your data spark joy? Performance gains from domain upsampling at\n the end of training","publication_year":2024,"publication_date":"2024-06-05","ids":{"openalex":"https://openalex.org/W4399448194","doi":"https://doi.org/10.48550/arxiv.2406.03476"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.03476","pdf_url":"https://arxiv.org/pdf/2406.03476","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.03476","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002287514","display_name":"Cody Blakeney","orcid":"https://orcid.org/0000-0002-1412-2813"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Blakeney, Cody","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036307641","display_name":"Mansheej Paul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Paul, Mansheej","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087527566","display_name":"Brett W. Larsen","orcid":"https://orcid.org/0000-0002-8922-3918"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Larsen, Brett W.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090516335","display_name":"Sean Owen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Owen, Sean","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078716102","display_name":"Jonathan Frankle","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Frankle, Jonathan","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":85},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Natural Language Processing","score":0.558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Natural Language Processing","score":0.558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/upsampling","display_name":"Upsampling","score":0.8113879},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.74372864},{"id":"https://openalex.org/keywords/topic-modeling","display_name":"Topic Modeling","score":0.518026}],"concepts":[{"id":"https://openalex.org/C110384440","wikidata":"https://www.wikidata.org/wiki/Q1143270","display_name":"Upsampling","level":3,"score":0.8113879},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.74372864},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6681309},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5649261},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5539672},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.39640197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2853136},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.14046389},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.08002284},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.05765146},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.03476","pdf_url":"https://arxiv.org/pdf/2406.03476","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.03476","pdf_url":"https://arxiv.org/pdf/2406.03476","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4385574037","https://openalex.org/W4310746709","https://openalex.org/W3155117723","https://openalex.org/W2748952813","https://openalex.org/W2607795551","https://openalex.org/W2281134365","https://openalex.org/W2062399876","https://openalex.org/W1991429770","https://openalex.org/W1983892167"],"abstract_inverted_index":{"Pretraining":[0],"datasets":[1,37,101,217,245],"for":[2,148,153,164,192,218],"large":[3,15,44],"language":[4,198],"models":[5],"(LLMs)":[6],"have":[7],"grown":[8],"to":[9,30,49,53,95,106,113,125,128,143,179,187,209,237,257],"trillions":[10],"of":[11,14,17,34,62,85,111,174,181,215,229,242,252],"tokens":[12],"composed":[13],"amounts":[16],"CommonCrawl":[18],"(CC)":[19],"web":[20,79],"scrape":[21],"along":[22],"with":[23,64,170,239],"smaller,":[24],"domain-specific":[25,36],"datasets.":[26],"It":[27],"is":[28,47,190],"expensive":[29],"understand":[31],"the":[32,59,71,75,82,97,109,144,172,194,213,235,240],"impact":[33,241],"these":[35],"on":[38,117,131,135,140],"model":[39,151,162],"capabilities":[40,200],"as":[41,166],"training":[42,112,182],"at":[43,108,211,246,249],"FLOP":[45],"scales":[46],"required":[48],"reveal":[50],"significant":[51],"changes":[52],"difficult":[54,118],"and":[55,81,137,183,201],"emergent":[56],"benchmarks.":[57,119,203],"Given":[58],"increasing":[60],"cost":[61,255],"experimenting":[63],"pretraining":[65,244,259],"data,":[66],"how":[67,94],"does":[68],"one":[69],"determine":[70],"optimal":[72,191],"balance":[73],"between":[74,196],"diversity":[76],"in":[77],"general":[78,197],"scrapes":[80],"information":[83],"density":[84],"domain":[86,99,175,207],"specific":[87,100],"data?":[88],"In":[89],"this":[90,226],"work,":[91],"we":[92],"show":[93],"leverage":[96],"smaller":[98],"by":[102,222],"upsampling":[103,176,208],"them":[104,224],"relative":[105,142],"CC":[107],"end":[110],"drive":[114],"performance":[115],"improvements":[116],"This":[120,231],"simple":[121],"technique":[122],"allows":[123],"us":[124],"improve":[126],"up":[127,234],"6.90":[129],"pp":[130,134,139],"MMLU,":[132],"8.26":[133],"GSM8K,":[136],"6.17":[138],"HumanEval":[141],"base":[145],"data":[146],"mix":[147],"a":[149],"7B":[150],"trained":[152,163],"1":[154],"trillion":[155],"(T)":[156],"tokens,":[157],"thus":[158],"rivaling":[159],"Llama-2":[160],"(7B)$\\unicode{x2014}$a":[161],"twice":[165],"long.":[167],"We":[168,204],"experiment":[169,238],"ablating":[171],"duration":[173],"from":[177],"5%":[178],"30%":[180],"find":[184],"that":[185],"10%":[186],"20%":[188],"percent":[189],"navigating":[193],"tradeoff":[195],"modeling":[199],"targeted":[202],"also":[205],"use":[206],"characterize":[210],"scale":[212],"utility":[214],"individual":[216],"improving":[219],"various":[220],"benchmarks":[221],"removing":[223],"during":[225],"final":[227],"phase":[228],"training.":[230],"tool":[232],"opens":[233],"ability":[236],"different":[243],"scale,":[247],"but":[248],"an":[250],"order":[251],"magnitude":[253],"lower":[254],"compared":[256],"full":[258],"runs.":[260]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4399448194","counts_by_year":[],"updated_date":"2024-11-20T03:50:00.641416","created_date":"2024-06-08"}