{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,7]],"date-time":"2024-07-07T02:17:09Z","timestamp":1720318629632},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2019,7,1]],"date-time":"2019-07-01T00:00:00Z","timestamp":1561939200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2020,4,19]],"date-time":"2020-04-19T00:00:00Z","timestamp":1587254400000},"content-version":"am","delay-in-days":293,"URL":"http:\/\/www.elsevier.com\/open-access\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/501100008982","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008982","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Parallel Computing"],"published-print":{"date-parts":[[2019,7]]},"DOI":"10.1016\/j.parco.2019.03.005","type":"journal-article","created":{"date-parts":[[2019,4,4]],"date-time":"2019-04-04T16:01:03Z","timestamp":1554393663000},"page":"141-152","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":11,"special_numbering":"C","title":["Optimized large-message broadcast for deep learning workloads: MPI, MPI+NCCL, or NCCL2?"],"prefix":"10.1016","volume":"85","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-6272-3760","authenticated-orcid":false,"given":"Ammar Ahmad","family":"Awan","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0001-7937-7035","authenticated-orcid":false,"given":"Karthik Vadambacheri","family":"Manian","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-6752-3135","authenticated-orcid":false,"given":"Ching-Hsiang","family":"Chu","sequence":"additional","affiliation":[]},{"given":"Hari","family":"Subramoni","sequence":"additional","affiliation":[]},{"given":"Dhabaleswar K.","family":"Panda","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.parco.2019.03.005_bib0001","article-title":"Neural machine translation and sequence-to-sequence models: a tutorial","author":"Neubig","year":"2017","journal-title":"CoRR"},{"key":"10.1016\/j.parco.2019.03.005_bib0002","article-title":"Edinburgh neural machine translation systems for WMT 16","author":"Sennrich","year":"2016","journal-title":"CoRR"},{"key":"10.1016\/j.parco.2019.03.005_bib0003","unstructured":"F.N. Iandola, K. Ashraf, M.W. Moskewicz, K. Keutzer, FireCaffe: near-linear acceleration of deep neural network training on compute clusters, (2015) arXiv:1511.00175."},{"key":"10.1016\/j.parco.2019.03.005_bib0004","series-title":"Proceedings of the 22Nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","first-page":"193","article-title":"S-Caffe: co-designing mpi runtimes and caffe for scalable deep learning on modern GPU clusters","author":"Awan","year":"2017"},{"key":"10.1016\/j.parco.2019.03.005_bib0005","unstructured":"Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, T. Darrell, Caffe: convolutional architecture for fast feature embedding, (2014) arXiv:1408.5093."},{"key":"10.1016\/j.parco.2019.03.005_bib0006","article-title":"TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems, 2015","author":"Abadi","year":"2016","journal-title":"Software available from tensorflow. org"},{"key":"10.1016\/j.parco.2019.03.005_sbref0005","series-title":"Cognitive Toolkit","author":"Microsoft","year":"2018"},{"key":"10.1016\/j.parco.2019.03.005_bib0008","series-title":"2016 IEEE International Conference on Cloud Computing Technology and Science (CloudCom)","first-page":"144","article-title":"Re-designing CNTK deep learning framework on modern GPU enabled clusters","author":"Banerjee","year":"2016"},{"key":"10.1016\/j.parco.2019.03.005_sbref0007","series-title":"NVIDIA Collective Communications Library (NCCL)","author":"NVIDIA","year":"2016"},{"key":"10.1016\/j.parco.2019.03.005_sbref0008","series-title":"Optimized Primitives for Collective Multi-GPU Communication","author":"NVIDIA","year":"2016"},{"key":"10.1016\/j.parco.2019.03.005_bib0011","unstructured":"K. Simonyan, A. Zisserman, Very deep convolutional networks for large-scale image recognition, (2014) arXiv:1409.1556."},{"key":"10.1016\/j.parco.2019.03.005_bib0012","unstructured":"MVAPICH2: MPI over InfiniBand, 10GigE\/iWARP and RoCE, 2001, (https:\/\/mvapich.cse.ohio-state.edu\/). [Online; accessed April 3, 2019]."},{"key":"10.1016\/j.parco.2019.03.005_bib0013","series-title":"Proceedings of the 23rd European MPI Users\u2019 Group Meeting","first-page":"15","article-title":"Efficient large message broadcast using NCCL and CUDA-aware MPI for deep learning","author":"Awan","year":"2016"},{"key":"10.1016\/j.parco.2019.03.005_bib0014","series-title":"Optimized Broadcast for Deep Learning Workloads on Dense-GPU InfiniBand Clusters: MPI or NCCL?","author":"Awan","year":"2018"},{"key":"10.1016\/j.parco.2019.03.005_sbref0012","series-title":"CS-Storm GPU-Accelerated Cluster Supercomputer","author":"Cray","year":"2015"},{"key":"10.1016\/j.parco.2019.03.005_bib0016","doi-asserted-by":"crossref","first-page":"85","DOI":"10.1016\/j.neunet.2014.09.003","article-title":"Deep learning in neural networks: an overview","volume":"61","author":"Schmidhuber","year":"2015","journal-title":"Neural Netw."},{"key":"10.1016\/j.parco.2019.03.005_bib0017","series-title":"Proceedings of the Machine Learning on HPC Environments","first-page":"8:1","article-title":"An in-depth performance characterization of CPU- and GPU-based DNN training on modern architectures","author":"Awan","year":"2017"},{"key":"10.1016\/j.parco.2019.03.005_sbref0015","series-title":"AI Research & Development: NVIDIA DGX Systems","author":"NVIDIA","year":"2018"},{"key":"10.1016\/j.parco.2019.03.005_sbref0016","series-title":"DGX-1: Essential Instrument of AI Research","author":"NVIDIA","year":"2017"},{"key":"10.1016\/j.parco.2019.03.005_sbref0017","series-title":"SUMMIT","author":"Oak Ridge National Laboratory","year":"2018"},{"key":"10.1016\/j.parco.2019.03.005_sbref0018","series-title":"TOP 500 Supercomputer Sites","author":"Meuer","year":"1993"},{"key":"10.1016\/j.parco.2019.03.005_sbref0019","series-title":"Open MPI : Open Source High Performance Computing","author":"The Open MPI Development Team","year":"2004"},{"key":"10.1016\/j.parco.2019.03.005_sbref0020","series-title":"NVIDIA Collective Communication Library (NCCL)","author":"NVIDIA","year":"2016"},{"issue":"1","key":"10.1016\/j.parco.2019.03.005_bib0024","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1177\/1094342005051521","article-title":"Optimization of collective communication operations in MPICH","volume":"19","author":"Thakur","year":"2005","journal-title":"Int. J. High Perform. Comput. Appl."},{"key":"10.1016\/j.parco.2019.03.005_bib0025","series-title":"2015 44th International Conference on Parallel Processing Workshops","first-page":"111","article-title":"A bandwidth-saving optimization for MPI broadcast collective operation","author":"Zhou","year":"2015"},{"key":"10.1016\/j.parco.2019.03.005_bib0026","series-title":"Seventh IEEE International Symposium on Cluster Computing and the Grid (CCGrid \u201907)","first-page":"487","article-title":"High-performance MPI broadcast algorithm for grid environments utilizing multi-lane NICs","author":"Chiba","year":"2007"},{"key":"10.1016\/j.parco.2019.03.005_bib0027","series-title":"Proceedings 20th IEEE International Parallel Distributed Processing Symposium","first-page":"8","article-title":"Efficient SMP-aware MPI-level broadcast over InfiniBand\u2019s hardware multicast","author":"Mamidala","year":"2006"},{"key":"10.1016\/j.parco.2019.03.005_bib0028","series-title":"Proceedings of the 21st IEEE International Parallel & Distributed Processing Symposium (CAC\u201907 Workshop)","first-page":"232","article-title":"A practically constant-time MPI broadcast algorithm for large-scale InfiniBand clusters with multicast","author":"Hoefler","year":"2007"},{"key":"10.1016\/j.parco.2019.03.005_bib0029","series-title":"2014 21st International Conference on High Performance Computing (HiPC)","first-page":"1","article-title":"A high performance broadcast design with hardware multicast and GPUDirect RDMA for streaming applications on infiniband clusters","author":"Venkatesh","year":"2014"},{"key":"10.1016\/j.parco.2019.03.005_bib0030","series-title":"2017 46th International Conference on Parallel Processing (ICPP)","first-page":"161","article-title":"Efficient and scalable multi-source streaming broadcast on GPU clusters for deep learning","author":"Chu","year":"2017"},{"key":"10.1016\/j.parco.2019.03.005_bib0031","series-title":"Proceedings of IEEE Scalable High Performance Computing Conference","first-page":"357","article-title":"Interprocessor collective communication library (InterCom)","author":"Barnett","year":"1994"},{"key":"10.1016\/j.parco.2019.03.005_bib0032","series-title":"Technical Report","article-title":"CollMark: MPI Collective Communication Benchmark","author":"Shroff","year":"2000"},{"key":"10.1016\/j.parco.2019.03.005_bib0033","series-title":"Parallel Processing (ICPP), 2013 42nd International Conference on","first-page":"80","article-title":"Efficient inter-node MPI communication using GPUDirect RDMA for InfiniBand clusters with NVIDIA GPUs","author":"Potluri","year":"2013"},{"key":"10.1016\/j.parco.2019.03.005_sbref0031","series-title":"An Introduction to CUDA-Aware MPI","author":"Kraus","year":"2013"},{"key":"10.1016\/j.parco.2019.03.005_bib0035","series-title":"2014 21st International Conference on High Performance Computing (HiPC)","first-page":"1","article-title":"Designing efficient small message transfer mechanism for inter-node MPI communication on InfiniBand GPU clusters","author":"Shi","year":"2014"},{"key":"10.1016\/j.parco.2019.03.005_bib0036","unstructured":"KESCH Supercomputer Website, 2019, (https:\/\/www.cscs.ch\/computers\/kesch-escha-meteoswiss\/). [Online; accessed April 3, 2019]."},{"key":"10.1016\/j.parco.2019.03.005_sbref0034","series-title":"Ohio Supercomputer Center","author":"Center","year":"1987"},{"key":"10.1016\/j.parco.2019.03.005_sbref0035","series-title":"OSU Micro-Benchmarks","author":"Network Based Computing Laboratory","year":"2015"},{"key":"10.1016\/j.parco.2019.03.005_sbref0036","series-title":"NCCL Tests","author":"NVIDIA","year":"2016"},{"key":"10.1016\/j.parco.2019.03.005_bib0040","series-title":"Advances in Neural Information Processing Systems","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"Krizhevsky","year":"2012"},{"key":"10.1016\/j.parco.2019.03.005_bib0041","series-title":"Parallel and Distributed Processing Symposium, 2004. Proceedings. 18th International","first-page":"10","article-title":"Fast and scalable MPI-level broadcast using InfiniBand\u2019s hardware multicast support","author":"Liu","year":"2004"},{"key":"10.1016\/j.parco.2019.03.005_bib0042","series-title":"2013 IEEE 21st Annual Symposium on High-Performance Interconnects","first-page":"63","article-title":"Designing optimized MPI broadcast and allreduce for many integrated core (MIC) InfiniBand clusters","author":"Kandalla","year":"2013"},{"key":"10.1016\/j.parco.2019.03.005_bib0043","series-title":"Proceedings of the 19th European Conference on Recent Advances in the Message Passing Interface","first-page":"132","article-title":"Leveraging MPI\u2019s one-sided communication interface for shared-memory programming","author":"Hoefler","year":"2012"}],"container-title":["Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167819118303284?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167819118303284?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2021,4,12]],"date-time":"2021-04-12T21:43:21Z","timestamp":1618263801000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167819118303284"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,7]]},"references-count":43,"alternative-id":["S0167819118303284"],"URL":"http:\/\/dx.doi.org\/10.1016\/j.parco.2019.03.005","relation":{},"ISSN":["0167-8191"],"issn-type":[{"value":"0167-8191","type":"print"}],"subject":[],"published":{"date-parts":[[2019,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Optimized large-message broadcast for deep learning workloads: MPI, MPI+NCCL, or NCCL2?","name":"articletitle","label":"Article Title"},{"value":"Parallel Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.parco.2019.03.005","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2019 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}