{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,12]],"date-time":"2024-09-12T18:10:27Z","timestamp":1726164627337},"publisher-location":"Cham","reference-count":32,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031073113"},{"type":"electronic","value":"9783031073120"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-07312-0_1","type":"book-chapter","created":{"date-parts":[[2022,5,28]],"date-time":"2022-05-28T23:03:31Z","timestamp":1653779011000},"page":"3-25","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Accelerating MPI All-to-All Communication with\u00a0Online Compression on\u00a0Modern GPU Clusters"],"prefix":"10.1007","author":[{"given":"Qinghua","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Pouya","family":"Kousha","sequence":"additional","affiliation":[]},{"given":"Quentin","family":"Anthony","sequence":"additional","affiliation":[]},{"given":"Kawthar","family":"Shafie Khorassani","sequence":"additional","affiliation":[]},{"given":"Aamir","family":"Shafi","sequence":"additional","affiliation":[]},{"given":"Hari","family":"Subramoni","sequence":"additional","affiliation":[]},{"given":"Dhabaleswar K.","family":"Panda","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,5,29]]},"reference":[{"issue":"11","key":"1_CR1","doi-asserted-by":"publisher","first-page":"1143","DOI":"10.1109\/71.642949","volume":"8","author":"J Bruck","year":"1997","unstructured":"Bruck, J., Ho, C.T., Kipnis, S., Upfal, E., Weathersby, D.: Efficient algorithms for All-to-All communications in multiport message-passing systems. IEEE Trans. Parallel Distrib. Syst. 8(11), 1143\u20131156 (1997)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Chu, C.H., Kousha, P., Awan, A.A., Khorassani, K.S., Subramoni, H., Panda, D.K.: NV-group: link-efficient reduction for distributed deep learning on modern dense GPU systems. In: Proceedings of the 34th ACM International Conference on Supercomputing (2020)","DOI":"10.1145\/3392717.3392771"},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Di, S., Cappello, F.: Fast error-bounded lossy HPC data compression with SZ. In: International Parallel and Distributed Processing Symposium (IPDPS) (2016)","DOI":"10.1109\/IPDPS.2016.11"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Filgueira, R., Singh, D., Calder\u00f3n, A., Carretero, J.: CoMPI: enhancing MPI based applications performance and scalability using run-time compression. In: European Parallel Virtual Machine\/Message Passing Interface Users\u2019 Group Meeting, pp. 207\u2013218 (2009)","DOI":"10.1007\/978-3-642-03770-2_27"},{"key":"1_CR5","unstructured":"IBM: IBM Spectrum MPI: accelerating high-performance application parallelization (2018). https:\/\/www.ibm.com\/us-en\/marketplace\/spectrum-mpi. Accessed 13 May 2022"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Jin, S., et al.: Understanding GPU-Based Lossy Compression for Extreme-Scale Cosmological Simulations. ArXiv:abs\/2004.00224 (2020)","DOI":"10.1109\/IPDPS47924.2020.00021"},{"key":"1_CR7","doi-asserted-by":"publisher","unstructured":"Kale, L., Kumar, S., Varadarajan, K.: A framework for collective personalized communication. In: Proceedings International Parallel and Distributed Processing Symposium, p. 9 (2003). https:\/\/doi.org\/10.1109\/IPDPS.2003.1213166","DOI":"10.1109\/IPDPS.2003.1213166"},{"key":"1_CR8","doi-asserted-by":"publisher","unstructured":"Khorassani, K.S., Chu, C.H., Anthony, Q.G., Subramoni, H., Panda, D.K.: Adaptive and hierarchical large message All-to-All communication algorithms for large-scale dense GPU systems. In: 2021 IEEE\/ACM 21st International Symposium on Cluster, Cloud and Internet Computing (CCGrid), pp. 113\u2013122 (2021). https:\/\/doi.org\/10.1109\/CCGrid51090.2021.00021","DOI":"10.1109\/CCGrid51090.2021.00021"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Khorassani, K.S., Chu, C.H., Subramoni, H., Panda, D.K.: Performance evaluation of MPI libraries on GPU-enabled OpenPOWER architectures: early experiences. In: International Workshop on OpenPOWER for HPC (IWOPH 19) at the 2019 ISC High Performance Conference (2018)","DOI":"10.1007\/978-3-030-34356-9_28"},{"key":"1_CR10","unstructured":"Kim, Y.J., et al.: Scalable and efficient MOE training for multitask multilingual models (2021)"},{"key":"1_CR11","doi-asserted-by":"publisher","unstructured":"Kousha, P., et al.: Designing a profiling and visualization tool for scalable and in-depth analysis of high-performance GPU clusters. In: 2019 IEEE 26th International Conference on High Performance Computing, Data, and Analytics (HiPC), pp. 93\u2013102 (2019). https:\/\/doi.org\/10.1109\/HiPC.2019.00022","DOI":"10.1109\/HiPC.2019.00022"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Kousha, P., et al.: INAM: Cross-Stack Profiling and Analysis of Communication in MPI-Based Applications. Association for Computing Machinery, New York, NY, USA (2021). https:\/\/doi.org\/10.1145\/3437359.3465582","DOI":"10.1145\/3437359.3465582"},{"key":"1_CR13","unstructured":"Lawrence Livermore National Laboratory: lassen\u2014high performance computing (2018). https:\/\/hpc.llnl.gov\/hardware\/platforms\/lassen. Accessed 13 March 2022"},{"key":"1_CR14","doi-asserted-by":"publisher","unstructured":"Lindstrom, P.: Fixed-rate compressed floating-point arrays. IEEE Trans. Visualiz. Comput. Graph. 20 (2014). https:\/\/doi.org\/10.1109\/TVCG.2014.2346458","DOI":"10.1109\/TVCG.2014.2346458"},{"key":"1_CR15","unstructured":"Liquid Submerged System - Texas Advanced Computing Center, Frontera - Specifications. https:\/\/www.tacc.utexas.edu\/systems\/frontera"},{"key":"1_CR16","unstructured":"Longhorn - Texas Advanced Computing Center Frontera - User Guide. https:\/\/portal.tacc.utexas.edu\/user-guides\/longhorn"},{"key":"1_CR17","unstructured":"Network-Based Computing Laboratory: MVAPICH: MPI over InfiniBand, Omni-Path, Ethernet\/iWARP, and RoCE (2001). http:\/\/mvapich.cse.ohio-state.edu\/. Accessed 13 March 2022"},{"key":"1_CR18","unstructured":"NVIDIA: NVIDIA GPUDirect (2011). https:\/\/developer.nvidia.com\/gpudirect. Accessed 13 March 2022"},{"key":"1_CR19","unstructured":"NVIDIA: nvCOMP (2020). https:\/\/github.com\/NVIDIA\/nvcomp. Accessed 13 March 2022"},{"key":"1_CR20","unstructured":"Open MPI: Open MPI: Open Source High Performance Computing (2004). https:\/\/www.open-mpi.org\/. Accessed 13 March 2022"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Potluri, S., Hamidouche, K., Venkatesh, A., Bureddy, D., Panda, D.K.: Efficient inter-node MPI communication using GPUDirect RDMA for infiniBand clusters with NVIDIA GPUs. In: 42nd International Conference on Parallel Processing (ICPP), pp. 80\u201389. IEEE (2013)","DOI":"10.1109\/ICPP.2013.17"},{"key":"1_CR22","doi-asserted-by":"publisher","unstructured":"Rasley, J., Rajbhandari, S., Ruwase, O., He, Y.: Deepspeed: system optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 3505\u20133506. KDD 2020, Association for Computing Machinery, New York, NY, USA (2020). https:\/\/doi.org\/10.1145\/3394486.3406703","DOI":"10.1145\/3394486.3406703"},{"key":"1_CR23","doi-asserted-by":"publisher","unstructured":"Ravikumar, K., Appelhans, D., Yeung, P.K.: GPU acceleration of extreme scale pseudo-spectral simulations of turbulence using asynchronism. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. SC 2019, Association for Computing Machinery, New York, NY, USA (2019). https:\/\/doi.org\/10.1145\/3295500.3356209","DOI":"10.1145\/3295500.3356209"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Sharkawi, S.S., Chochia, G.A.: Communication protocol optimization for enhanced GPU performance. IBM J. Res. Develop. 64(3\/4), 9:1\u20139:9 (2020)","DOI":"10.1147\/JRD.2020.2967311"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Shi, R., et al.: Designing efficient small message transfer mechanism for inter-node MPI communication on InfiniBand GPU clusters. In: 2014 21st International Conference on High Performance Computing (HiPC), pp. 1\u201310 (2014)","DOI":"10.1109\/HiPC.2014.7116873"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Singh, A.K., Potluri, S., Wang, H., Kandalla, K., Sur, S., Panda, D.K.: MPI AlltoAll personalized exchange on GPGPU clusters: design alternatives and benefit. In: 2011 IEEE International Conference on Cluster Computing, pp. 420\u2013427 (2011)","DOI":"10.1109\/CLUSTER.2011.67"},{"key":"1_CR27","unstructured":"Singh, A.K.: Optimizing All-to-All and Allgather Communications on GPGPU Clusters. Master\u2019s thesis, The Ohio State University (2012)"},{"key":"1_CR28","doi-asserted-by":"publisher","unstructured":"Singh, A.K., Potluri, S., Wang, H., Kandalla, K., Sur, S., Panda, D.K.: MPI AlltoAll personalized exchange on GPGPU clusters: design alternatives and benefit. In: 2011 IEEE International Conference on Cluster Computing, pp. 420\u2013427 (2011). https:\/\/doi.org\/10.1109\/CLUSTER.2011.67","DOI":"10.1109\/CLUSTER.2011.67"},{"key":"1_CR29","doi-asserted-by":"publisher","unstructured":"Thakur, R., Rabenseifner, R., Gropp, W.: Optimization of collective communication operations in MPICH. Int. J. High Perform. Comput. Appl. 19(1), 49\u201366 (2005). https:\/\/doi.org\/10.1177\/1094342005051521","DOI":"10.1177\/1094342005051521"},{"key":"1_CR30","doi-asserted-by":"publisher","unstructured":"Tian, J., et al.: CUSZ: an efficient GPU-based error-bounded lossy compression framework for scientific data. In: Proceedings of the ACM International Conference on Parallel Architectures and Compilation Techniques, pp. 3\u201315. PACT 2020, Association for Computing Machinery, New York, NY, USA (2020). https:\/\/doi.org\/10.1145\/3410463.3414624","DOI":"10.1145\/3410463.3414624"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Yang, A., Mukka, H., Hesaaraki, F., Burtscher, M.: MPC: a massively parallel compression algorithm for scientific data. In: IEEE Cluster Conference (2015)","DOI":"10.1109\/CLUSTER.2015.59"},{"key":"1_CR32","doi-asserted-by":"publisher","unstructured":"Zhou, Q., et al.: Designing high-performance MPI libraries with on-the-fly compression for modern GPU clusters*. In: 2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 444\u2013453 (2021). https:\/\/doi.org\/10.1109\/IPDPS49936.2021.00053","DOI":"10.1109\/IPDPS49936.2021.00053"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-07312-0_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,17]],"date-time":"2023-01-17T12:12:45Z","timestamp":1673957565000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-07312-0_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031073113","9783031073120"],"references-count":32,"URL":"http:\/\/dx.doi.org\/10.1007\/978-3-031-07312-0_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"29 May 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hamburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 May 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 June 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"37","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Linklings","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"53","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"18","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"34% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"For the workshops a 27 papers have been accepted for publication out of a total of 43 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}