{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T22:00:58Z","timestamp":1730325658786,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100020636","name":"Ministerio de Educaci\u00f3n y Formaci\u00f3n Profesional","doi-asserted-by":"publisher","award":["FPU19\/03974"],"id":[{"id":"10.13039\/501100020636","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010801","name":"Xunta de Galicia","doi-asserted-by":"publisher","award":["ED431C 2021\/30 and ED431G 2019\/01"],"id":[{"id":"10.13039\/501100010801","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004837","name":"Ministerio de Ciencia e Innovaci\u00f3n","doi-asserted-by":"publisher","award":["PID2019-104184RB-I00"],"id":[{"id":"10.13039\/501100004837","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,8]]},"DOI":"10.1145\/3559009.3569691","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T14:02:50Z","timestamp":1674828170000},"page":"135-147","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Probing the Efficacy of Hardware-Aware Weight Pruning to Optimize the SpMM Routine on Ampere GPUs"],"prefix":"10.1145","author":[{"given":"Roberto L.","family":"Castro","sequence":"first","affiliation":[{"name":"Universidade da Coru\u00f1a, A Coru\u00f1a, Spain"}]},{"given":"Diego","family":"Andrade","sequence":"additional","affiliation":[{"name":"Universidade da Coru\u00f1a, A Coru\u00f1a, Spain"}]},{"given":"Basilio B.","family":"Fraguela","sequence":"additional","affiliation":[{"name":"Universidade da Coru\u00f1a, A Coru\u00f1a, Spain"}]}],"member":"320","published-online":{"date-parts":[[2023,1,27]]},"reference":[{"volume-title":"Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, the World's Largest and Most Powerful Generative Language Model. Retrieved","year":"2021","author":"Ali Alvi Paresh Kharya","key":"e_1_3_2_1_1_1","unstructured":"Paresh Kharya Ali Alvi . 2021. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, the World's Largest and Most Powerful Generative Language Model. Retrieved December 3, 2021 from https:\/\/www.microsoft.com\/en-us\/research\/blog\/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model\/ Paresh Kharya Ali Alvi. 2021. Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, the World's Largest and Most Powerful Generative Language Model. Retrieved December 3, 2021 from https:\/\/www.microsoft.com\/en-us\/research\/blog\/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model\/"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3005348"},{"key":"e_1_3_2_1_3_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. 
Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arXiv:2005.14165 [cs.CL] Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arXiv:2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476182"},{"key":"e_1_3_2_1_5_1","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. arXiv:1410.0759 [cs.NE] Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. arXiv:1410.0759 [cs.NE]"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_7_1","unstructured":"Trevor Gale Erich Elsen and Sara Hooker. 2019. The State of Sparsity in Deep Neural Networks. arXiv:1902.09574 [cs.LG] Trevor Gale Erich Elsen and Sara Hooker. 2019. The State of Sparsity in Deep Neural Networks. arXiv:1902.09574 [cs.LG]"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433723"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453953.3453972"},{"volume-title":"Deep Learning Matrix Collection. Retrieved","year":"2021","author":"Research Google","key":"e_1_3_2_1_10_1","unstructured":"Google Research . 2020. Deep Learning Matrix Collection. Retrieved December 3, 2021 from https:\/\/github.com\/google-research\/google-research\/tree\/master\/sgk Google Research. 2020. Deep Learning Matrix Collection. Retrieved December 3, 2021 from https:\/\/github.com\/google-research\/google-research\/tree\/master\/sgk"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157251"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021745"},{"volume":"1","volume-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems -","author":"Han Song","key":"e_1_3_2_1_13_1","unstructured":"Song Han , Jeff Pool , John Tran , and William J. Dally . 2015. Learning Both Weights and Connections for Efficient Neural Networks . In Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 1 (Montreal, Canada) (NIPS'15). MIT Press, Cambridge, MA, USA, 1135--1143. Song Han, Jeff Pool, John Tran, and William J. Dally. 2015. Learning Both Weights and Connections for Efficient Neural Networks. In Proceedings of the 28th International Conference on Neural Information Processing Systems - Volume 1 (Montreal, Canada) (NIPS'15). 
MIT Press, Cambridge, MA, USA, 1135--1143."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.155"},{"key":"e_1_3_2_1_15_1","first-page":"1","article-title":"Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural networks","volume":"22","author":"Hoefler Torsten","year":"2021","unstructured":"Torsten Hoefler , Dan Alistarh , Tal Ben-Nun , Nikoli Dryden , and Alexandra Peste . 2021 . Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural networks . Journal of Machine Learning Research 22 , 241 (2021), 1 -- 124 . Torsten Hoefler, Dan Alistarh, Tal Ben-Nun, Nikoli Dryden, and Alexandra Peste. 2021. Sparsity in Deep Learning: Pruning and growth for efficient inference and training in neural networks. Journal of Machine Learning Research 22, 241 (2021), 1--124.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_16_1","first-page":"21099","article-title":"Accelerated sparse neural training: A provable and efficient method to find n: m transposable masks","volume":"34","author":"Hubara Itay","year":"2021","unstructured":"Itay Hubara , Brian Chmiel , Moshe Island , Ron Banner , Joseph Naor , and Daniel Soudry . 2021 . Accelerated sparse neural training: A provable and efficient method to find n: m transposable masks . Advances in Neural Information Processing Systems 34 (2021), 21099 -- 21111 . Itay Hubara, Brian Chmiel, Moshe Island, Ron Banner, Joseph Naor, and Daniel Soudry. 2021. Accelerated sparse neural training: A provable and efficient method to find n: m transposable masks. Advances in Neural Information Processing Systems 34 (2021), 21099--21111.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Fran\u00e7ois Lagunas Ella Charlaix Victor Sanh and Alexander M Rush. 2021. Block pruning for faster transformers. arXiv:2109.04838 [cs.LG] Fran\u00e7ois Lagunas Ella Charlaix Victor Sanh and Alexander M Rush. 2021. Block pruning for faster transformers. arXiv:2109.04838 [cs.LG]","DOI":"10.18653\/v1\/2021.emnlp-main.829"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE48585.2020.9116287"},{"key":"e_1_3_2_1_19_1","unstructured":"Mingbao Lin Yuchao Li Yuxin Zhang Bohong Chen Fei Chao Mengdi Wang Shen Li Jun Yang and Rongrong Ji. 2021. 1\u00d7N Block Pattern for Network Sparsity. arXiv:2105.14713 [cs.CV] Mingbao Lin Yuchao Li Yuxin Zhang Bohong Chen Fei Chao Mengdi Wang Shen Li Jun Yang and Rongrong Ji. 2021. 1\u00d7N Block Pattern for Network Sparsity. arXiv:2105.14713 [cs.CV]"},{"volume-title":"Zhangyang Wang, and Mykola Pechenizkiy.","year":"2022","author":"Liu Shiwei","key":"e_1_3_2_1_20_1","unstructured":"Shiwei Liu , Tianlong Chen , Xiaohan Chen , Li Shen , Decebal Constantin Mocanu , Zhangyang Wang, and Mykola Pechenizkiy. 2022 . The unreasonable effectiveness of random pruning: Return of the most naive baseline for sparse training. arXiv:2202.02643 [cs.LG] Shiwei Liu, Tianlong Chen, Xiaohan Chen, Li Shen, Decebal Constantin Mocanu, Zhangyang Wang, and Mykola Pechenizkiy. 2022. The unreasonable effectiveness of random pruning: Return of the most naive baseline for sparse training. arXiv:2202.02643 [cs.LG]"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Huizi Mao Song Han Jeff Pool Wenshuo Li Xingyu Liu Yu Wang and William J Dally. 2017. Exploring the regularity of sparse structure in convolutional neural networks. 
arXiv:1705.08922 [cs.LG] Huizi Mao Song Han Jeff Pool Wenshuo Li Xingyu Liu Yu Wang and William J Dally. 2017. Exploring the regularity of sparse structure in convolutional neural networks. arXiv:1705.08922 [cs.LG]","DOI":"10.1109\/CVPRW.2017.241"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS51385.2021.00016"},{"volume-title":"Retrieved","year":"2012","author":"Micikevicius Paulius","key":"e_1_3_2_1_23_1","unstructured":"Paulius Micikevicius . 2012 . GPU Performance Analysis and Optimization . Retrieved April 20, 2022 from https:\/\/on-demand.gputechconf.com\/gtc\/2012\/presentations\/S0514-GTC2012-GPU-Performance-Analysis.pdf Paulius Micikevicius. 2012. GPU Performance Analysis and Optimization. Retrieved April 20, 2022 from https:\/\/on-demand.gputechconf.com\/gtc\/2012\/presentations\/S0514-GTC2012-GPU-Performance-Analysis.pdf"},{"volume-title":"Jeff Pool, Darko Stosic, Dusan Stosic, Ganesh Venkatesh, Chong Yu, and Paulius Micikevicius.","year":"2021","author":"Mishra Asit","key":"e_1_3_2_1_24_1","unstructured":"Asit Mishra , Jorge Albericio Latorre , Jeff Pool, Darko Stosic, Dusan Stosic, Ganesh Venkatesh, Chong Yu, and Paulius Micikevicius. 2021 . Accelerating sparse deep neural networks. arXiv:2104.08378 [cs.LG] Asit Mishra, Jorge Albericio Latorre, Jeff Pool, Darko Stosic, Dusan Stosic, Ganesh Venkatesh, Chong Yu, and Paulius Micikevicius. 2021. Accelerating sparse deep neural networks. arXiv:2104.08378 [cs.LG]"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00098"},{"volume-title":"Retrieved","year":"2021","key":"e_1_3_2_1_26_1","unstructured":"Nvidia. 2021 . NVIDIA A100 Tensor Core GPU Architecture . Retrieved April 23, 2022 from https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf Nvidia. 2021. NVIDIA A100 Tensor Core GPU Architecture. Retrieved April 23, 2022 from https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf"},{"volume-title":"Nvidia Ampere GA102 GPU architecture. Retrieved","year":"2022","key":"e_1_3_2_1_27_1","unstructured":"Nvidia. 2021. Nvidia Ampere GA102 GPU architecture. Retrieved February 1, 2022 from https:\/\/www.nvidia.com\/content\/PDF\/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf Nvidia. 2021. Nvidia Ampere GA102 GPU architecture. Retrieved February 1, 2022 from https:\/\/www.nvidia.com\/content\/PDF\/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.pdf"},{"volume-title":"cuSparse Library. Retrieved","year":"2022","key":"e_1_3_2_1_28_1","unstructured":"Nvidia. 2022. cuSparse Library. Retrieved February 2 2022 from https:\/\/docs.nvidia.com\/pdf\/CUSPARSE_Library.pdf Nvidia. 2022. cuSparse Library. Retrieved February 2 2022 from https:\/\/docs.nvidia.com\/pdf\/CUSPARSE_Library.pdf"},{"volume-title":"cuSPARSELt: A High-Performance CUDA Library for Sparse Matrix-Matrix Multiplication. Retrieved","year":"2022","key":"e_1_3_2_1_29_1","unstructured":"Nvidia. 2022. cuSPARSELt: A High-Performance CUDA Library for Sparse Matrix-Matrix Multiplication. Retrieved February 2, 2022 from https:\/\/docs.nvidia.com\/cuda\/cusparselt\/index.html Nvidia. 2022. cuSPARSELt: A High-Performance CUDA Library for Sparse Matrix-Matrix Multiplication. Retrieved February 2, 2022 from https:\/\/docs.nvidia.com\/cuda\/cusparselt\/index.html"},{"key":"e_1_3_2_1_30_1","unstructured":"Nvidia. 2022. Nsight Compute. 
Retrieved February 2 2022 from https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html Nvidia. 2022. Nsight Compute. Retrieved February 2 2022 from https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html"},{"volume-title":"Advances in neural information processing systems 13","year":"2000","author":"Rasmussen Carl Edward","key":"e_1_3_2_1_31_1","unstructured":"Carl Edward Rasmussen and Zoubin Ghahramani . 2000. Occam's Razor . Advances in neural information processing systems 13 ( 2000 ), 294--300. Carl Edward Rasmussen and Zoubin Ghahramani. 2000. Occam's Razor. Advances in neural information processing systems 13 (2000), 294--300."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414648"},{"key":"e_1_3_2_1_33_1","first-page":"20378","article-title":"Movement pruning: Adaptive sparsity by fine-tuning","volume":"33","author":"Sanh Victor","year":"2020","unstructured":"Victor Sanh , Thomas Wolf , and Alexander Rush . 2020 . Movement pruning: Adaptive sparsity by fine-tuning . Advances in Neural Information Processing Systems 33 (2020), 20378 -- 20389 . Victor Sanh, Thomas Wolf, and Alexander Rush. 2020. Movement pruning: Adaptive sparsity by fine-tuning. Advances in Neural Information Processing Systems 33 (2020), 20378--20389.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"How the Fermi Thread Block Scheduler Works. Retrieved","year":"2021","author":"Pai Sreepathi","key":"e_1_3_2_1_34_1","unstructured":"Sreepathi Pai . 2014. How the Fermi Thread Block Scheduler Works. Retrieved February 3, 2021 from https:\/\/www.cs.rochester.edu\/~sree\/fermi-tbs\/fermi-tbs.html Sreepathi Pai. 2014. How the Fermi Thread Block Scheduler Works. Retrieved February 3, 2021 from https:\/\/www.cs.rochester.edu\/~sree\/fermi-tbs\/fermi-tbs.html"},{"key":"e_1_3_2_1_35_1","first-page":"20721","article-title":"DominoSearch: Find layer-wise fine-grained N: M sparse schemes from dense neural networks","volume":"34","author":"Sun Wei","year":"2021","unstructured":"Wei Sun , Aojun Zhou , Sander Stuijk , Rob Wijnhoven , Andrew O Nelson , Henk Corp oraal, 2021 . DominoSearch: Find layer-wise fine-grained N: M sparse schemes from dense neural networks . Advances in Neural Information Processing Systems 34 (2021), 20721 -- 20732 . Wei Sun, Aojun Zhou, Sander Stuijk, Rob Wijnhoven, Andrew O Nelson, Henk Corporaal, et al. 2021. DominoSearch: Find layer-wise fine-grained N: M sparse schemes from dense neural networks. Advances in Neural Information Processing Systems 34 (2021), 20721--20732.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Learning structured sparsity in deep neural networks. Advances in neural information processing systems 29","year":"2016","author":"Wen Wei","key":"e_1_3_2_1_36_1","unstructured":"Wei Wen , Chunpeng Wu , Yandan Wang , Yiran Chen , and Hai Li. 2016. Learning structured sparsity in deep neural networks. Advances in neural information processing systems 29 ( 2016 ), 2074--2082. Wei Wen, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li. 2016. Learning structured sparsity in deep neural networks. 
Advances in neural information processing systems 29 (2016), 2074--2082."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00071"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3332466.3374520"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00011"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358269"},{"key":"e_1_3_2_1_41_1","unstructured":"Neta Zmora Guy Jacob Lev Zlotnik Bar Elharar and Gal Novik. 2019. Neural Network Distiller: A Python Package For DNN Compression Research. arXiv:1910.12232 [cs.LG] Neta Zmora Guy Jacob Lev Zlotnik Bar Elharar and Gal Novik. 2019. Neural Network Distiller: A Python Package For DNN Compression Research. arXiv:1910.12232 [cs.LG]"}],"event":{"name":"PACT '22: International Conference on Parallel Architectures and Compilation Techniques","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IFIP WG 10.3 IFIP WG 10.3","IEEE CS"],"location":"Chicago Illinois","acronym":"PACT '22"},"container-title":["Proceedings of the International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3559009.3569691","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T07:42:13Z","timestamp":1696837333000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3559009.3569691"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,8]]},"references-count":41,"alternative-id":["10.1145\/3559009.3569691","10.1145\/3559009"],"URL":"http:\/\/dx.doi.org\/10.1145\/3559009.3569691","relation":{},"subject":[],"published":{"date-parts":[[2022,10,8]]},"assertion":[{"value":"2023-01-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
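
The metadata above is the Crossref work record for DOI 10.1145/3559009.3569691, as served by the public Crossref REST API. As a minimal sketch, not part of the original record, the snippet below shows one way such a record can be retrieved and a few of its fields read; it assumes network access and Python's standard library only, and the User-Agent string and contact address are placeholders rather than anything mandated by the API.

```python
# Minimal sketch: fetch the Crossref work record for the paper above and
# print a few of the fields that appear in this metadata record.
import json
import urllib.request

DOI = "10.1145/3559009.3569691"
URL = f"https://api.crossref.org/works/{DOI}"

req = urllib.request.Request(
    URL,
    # A descriptive User-Agent with a contact address is the Crossref
    # "polite pool" convention; the address here is only a placeholder.
    headers={"User-Agent": "metadata-check/0.1 (mailto:user@example.com)"},
)
with urllib.request.urlopen(req, timeout=30) as resp:
    # The API wraps the record in an envelope; the bibliographic
    # fields shown above live under the "message" key.
    work = json.load(resp)["message"]

print(work["title"][0])                                          # paper title
print(", ".join(f"{a['given']} {a['family']}" for a in work["author"]))
print("DOI:", work["DOI"])
print("References deposited:", work.get("reference-count"))
print("Cited by (Crossref):", work.get("is-referenced-by-count"))
```

The envelope returned by the API has the shape {"status": "ok", "message-type": "work", "message": {...}}, which is why the sketch indexes into "message" before reading the title, author list, and citation counts.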