{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T13:25:54Z","timestamp":1730208354502,"version":"3.28.0"},"reference-count":58,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,31]]},"DOI":"10.1109\/cluster52292.2023.00015","type":"proceedings-article","created":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T19:21:45Z","timestamp":1700594505000},"page":"82-94","source":"Crossref","is-referenced-by-count":0,"title":["Prophet: Fine-grained Load Balancing for Parallel Training of Large-scale MoE Models"],"prefix":"10.1109","author":[{"given":"Wei","family":"Wang","sequence":"first","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Zhiquan","family":"Lai","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Shengwei","family":"Li","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Weijie","family":"Liu","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Keshi","family":"Ge","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Yujie","family":"Liu","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Ao","family":"Shen","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]},{"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[{"name":"National University Of Defense Technology,National Laboratory for Parallel and Distributed Processing(PDL) College Of Computer,Changsha,China"}]}],"member":"263","reference":[{"article-title":"Gshard: Scaling giant models with conditional computation and automatic sharding","year":"2020","author":"Lepikhin","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1093\/oed\/8724410687"},{"first-page":"18 332","article-title":"Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai 
scale","volume-title":"International Conference on Machine Learning","author":"Rajbhandari","key":"ref3"},{"issue":"1","key":"ref4","first-page":"5232","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"The Journal of Machine Learning Research"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"article-title":"Se-moe: A scalable and efficient mixture-of-experts distributed training and inference system","year":"2022","author":"Shen","key":"ref6"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref7"},{"issue":"1","key":"ref8","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"volume":"32","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","volume-title":"Advances in neural information processing systems","author":"Yang","key":"ref9"},{"article-title":"Roberta: A robustly optimized bert pretraining approach","year":"2019","author":"Liu","key":"ref10"},{"issue":"8","key":"ref11","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref12","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"article-title":"Scaling laws for neural language models","year":"2020","author":"Kaplan","key":"ref13"},{"first-page":"5547","article-title":"Glam: Efficient scaling of language models with mixture-of-experts","volume-title":"International Conference on Machine Learn ing","author":"Du","key":"ref14"},{"key":"ref15","first-page":"430","article-title":"Pathways: Asynchronous distributed dataflow for ml","volume-title":"Proceedings of Machine Learning and Systems","volume":"4","author":"Barham"},{"first-page":"18","article-title":"A Unied Architecture for Accelerating Distributed DNN Training in Heterogeneous GPU\/CPU Clusters","author":"Jiang","key":"ref16"},{"article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","year":"2019","author":"Shoeybi","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TAC.2007.898077"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2008.83"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/s00450-011-0170-4"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45706-2_112"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2017.2768413"},{"article-title":"M6: A chinese multimodal pretrainer","year":"2021","author":"Lin","key":"ref23"},{"article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","year":"2017","author":"Shazeer","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737367"},{"article-title":"Openwebtext corpus","year":"2019","author":"Gokaslan","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"article-title":"Skip-thought vectors","year":"2015","author":"Kiros","key":"ref28"},{"article-title":"Pipetransformer: 
Automated elastic pipelining for distributed training of transformers","year":"2021","author":"He","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747065"},{"key":"ref31","first-page":"29 335","article-title":"Dselect-k: Differentiable selection in the mixture of experts with applications to multi-task learning","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Hazimeh"},{"key":"ref32","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Riquelme"},{"first-page":"4057","article-title":"Unified scaling laws for routed language models","volume-title":"International Conference on Machine Learning","author":"Clark","key":"ref33"},{"key":"ref34","first-page":"17 555","article-title":"Hash layers for large sparse models","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Roller"},{"article-title":"M6-t: Exploring sparse expert models and beyond","year":"2021","author":"Yang","key":"ref35"},{"article-title":"Taming sparsely activated transformer with stochastic experts","year":"2021","author":"Zuo","key":"ref36"},{"first-page":"6265","article-title":"Base layers: Simplifying training of large, sparse models","volume-title":"International Conference on Machine Learning","author":"Lewis","key":"ref37"},{"key":"ref38","first-page":"7103","article-title":"Mixture-of-experts with expert choice routing","volume-title":"Advances in Neural Information Processing Systems","volume":"35","author":"Zhou"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER51413.2022.00042"},{"key":"ref41","first-page":"24 829","article-title":"Piper: Multidimensional planner for dnn parallelization","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Tarnawski"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER51413.2022.00043"},{"first-page":"307","article-title":"Hetpipe: Enabling large dnn training on (whimpy) heterogeneous gpu clusters through integration of pipelined model parallelism and data parallelism","volume-title":"Proceedings of the 2020 USENIX Conference on Usenix Annual Technical Conference","author":"Park","key":"ref43"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3472456.3472497"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref47","first-page":"479","article-title":"Pipelined backpropagation at scale: training large models without batches","volume-title":"Proceedings of Machine Learning and Systems","volume":"3","author":"Kosson"},{"key":"ref48","first-page":"269","article-title":"Pipemare: Asynchronous pipeline parallel dnn training","volume-title":"Proceedings of Machine Learning and Systems","volume":"3","author":"Yang"},{"first-page":"1","article-title":"Efficient large-scale language model training on gpu clusters using megatron-lm","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and 
Analysis","author":"Narayanan","key":"ref49"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3247001"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"article-title":"Priority-based Parameter Propagation for Distributed DNN Training","year":"2019","author":"Jayarajan","key":"ref52"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.14778\/3503585.3503590"},{"first-page":"350","article-title":"Mercury: A Simple Transport Layer Scheduler to Accelerate Distributed DNN Training","volume-title":"IEEE INFOCOM 2022 - IEEE Conference on Computer Communications","author":"Duan","key":"ref54"},{"first-page":"16","article-title":"Better Together: Jointly Optimizing ML Collective Scheduling and Execution Planning using SYNDICATE","author":"Mahajan","key":"ref55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"first-page":"626","article-title":"Preemptive All-reduce Scheduling for Expediting Distributed DNN Training","volume-title":"IEEE INFOCOM 2020 - IEEE Conference on Computer Communications","author":"Bao","key":"ref57"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545011"}],"event":{"name":"2023 IEEE International Conference on Cluster Computing (CLUSTER)","start":{"date-parts":[[2023,10,31]]},"location":"Santa Fe, NM, USA","end":{"date-parts":[[2023,11,3]]}},"container-title":["2023 IEEE International Conference on Cluster Computing (CLUSTER)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10319881\/10319941\/10319949.pdf?arnumber=10319949","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,14]],"date-time":"2024-03-14T04:18:02Z","timestamp":1710389882000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10319949\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,31]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/cluster52292.2023.00015","relation":{},"subject":[],"published":{"date-parts":[[2023,10,31]]}}}