{"id":"https://openalex.org/W4401306538","doi":"https://doi.org/10.48550/arxiv.2407.21325","title":"EdgeLLM: A Highly Efficient CPU-FPGA Heterogeneous Edge Accelerator for\n Large Language Models","display_name":"EdgeLLM: A Highly Efficient CPU-FPGA Heterogeneous Edge Accelerator for\n Large Language Models","publication_year":2024,"publication_date":"2024-07-31","ids":{"openalex":"https://openalex.org/W4401306538","doi":"https://doi.org/10.48550/arxiv.2407.21325"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.21325","pdf_url":"http://arxiv.org/pdf/2407.21325","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2407.21325","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038495391","display_name":"Mingqiang Huang","orcid":"https://orcid.org/0000-0002-7794-3985"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Mingqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055425079","display_name":"Ao Shen","orcid":"https://orcid.org/0000-0003-4722-2367"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Ao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101661976","display_name":"Kai Li","orcid":"https://orcid.org/0000-0002-2977-4152"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109758412","display_name":"Haoxiang Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Haoxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100772375","display_name":"Boyu Li","orcid":"https://orcid.org/0000-0001-9709-9673"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Boyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034853402","display_name":"Hao Yu","orcid":"https://orcid.org/0000-0002-2674-4118"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Hao","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.998318,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":96},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7939,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7939,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.7468,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7035,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/heterogeneous-computing","display_name":"Heterogeneous Computing","score":0.607053},{"id":"https://openalex.org/keywords/high-performance-computing","display_name":"High-Performance Computing","score":0.577787},{"id":"https://openalex.org/keywords/gpu-computing","display_name":"GPU Computing","score":0.568813},{"id":"https://openalex.org/keywords/multicore-architectures","display_name":"Multicore Architectures","score":0.56874},{"id":"https://openalex.org/keywords/performance-optimization","display_name":"Performance Optimization","score":0.56189}],"concepts":[{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.7290095},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.6633681},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6143217},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.43766296},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3783983},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.34355032},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.10001263}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.21325","pdf_url":"http://arxiv.org/pdf/2407.21325","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.21325","pdf_url":"http://arxiv.org/pdf/2407.21325","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4200391368","https://openalex.org/W2570254841","https://openalex.org/W2386041993","https://openalex.org/W2363944576","https://openalex.org/W2355315220","https://openalex.org/W2351041855","https://openalex.org/W2111241003","https://openalex.org/W2096844293","https://openalex.org/W1967938402","https://openalex.org/W1608572506"],"abstract_inverted_index":{"The":[0,168],"rapid":[1],"advancements":[2],"in":[3,120,208],"artificial":[4],"intelligence":[5],"(AI),":[6],"particularly":[7,32],"the":[8,23,64,74,108,121,157,161,190],"Large":[9],"Language":[10],"Models":[11],"(LLMs),":[12],"have":[13,117],"profoundly":[14],"affected":[15],"our":[16,178],"daily":[17],"work":[18],"and":[19,46,80,90,127,142,159,184,197,214],"communication":[20],"forms.":[21],"However,":[22],"colossal":[24],"scale":[25],"of":[26,67,97,114,156,206,210],"LLM":[27,215],"presents":[28],"significant":[29],"operational":[30],"challenges,":[31],"when":[33],"attempting":[34],"to":[35,61,94,107],"deploy":[36],"them":[37],"on":[38,69,164,173,195],"resource-constrained":[39],"edge":[40],"devices":[41],"such":[42,123],"as":[43,124],"smartphones,":[44],"robots,":[45],"embedded":[47],"systems.":[48],"In":[49],"this":[50],"work,":[51],"we":[52,101,145],"proposed":[53,146],"EdgeLLM,":[54],"an":[55,147],"efficient":[56],"CPU-FPGA":[57,165],"heterogeneous":[58,166],"acceleration":[59],"framework,":[60],"markedly":[62],"enhance":[63],"computational":[65],"efficiency":[66,188],"LLMs":[68],"edge.":[70],"We":[71],"first":[72],"analyzed":[73],"whole":[75,162],"operators":[76,105,158],"within":[77],"AI":[78,98],"models":[79],"developed":[81,102],"a":[82],"universal":[83],"data":[84,110,140],"parallelism":[85],"scheme,":[86],"which":[87],"is":[88],"generic":[89],"can":[91,152],"be":[92],"adapted":[93],"any":[95],"type":[96],"algorithm.":[99],"Then,":[100],"fully-customized":[103],"hardware":[104],"according":[106],"designated":[109],"formats.":[111],"A":[112],"multitude":[113],"optimization":[115],"techniques":[116],"been":[118,171],"integrated":[119],"design,":[122],"approximate":[125],"FP16*INT4":[126],"FP16*FP16":[128],"computation":[129],"engines,":[130],"group":[131],"vector":[132],"systolic":[133],"arrays,":[134],"log-scale":[135],"structured":[136],"sparsity,":[137],"asynchronous":[138],"between":[139],"transfer":[141],"processing.":[143],"Finally,":[144],"end-to-end":[148],"compilation":[149],"scheme":[150],"that":[151],"dynamically":[153],"compile":[154],"all":[155],"map":[160],"model":[163],"system.":[167],"design":[169],"has":[170],"deployed":[172],"AMD":[174],"Xilinx":[175],"VCU128":[176],"FPGA,":[177],"accelerator":[179,205],"achieves":[180],"1.67x":[181],"higher":[182,186],"throughput":[183],"7.4x":[185],"energy":[187],"than":[189,202],"commercial":[191],"GPU":[192],"(NVIDIA":[193],"A100-SXM4-80G)":[194],"ChatGLM2-6B,":[196],"shows":[198],"10%~20%":[199],"better":[200],"performance":[201],"state-of-the-art":[203],"FPGA":[204],"FlightLLM":[207],"terms":[209],"HBM":[211],"bandwidth":[212],"utilization":[213],"throughput.":[216]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4401306538","counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2024-12-04T20:21:31.133175","created_date":"2024-08-04"}