{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,2,25]],"date-time":"2024-02-25T14:42:11Z","timestamp":1708872131789},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2017,3,24]],"date-time":"2017-03-24T00:00:00Z","timestamp":1490313600000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Plan for Science, Technology, and Innovation (MAARIFAH)","award":["12-INF3008-04"]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2017,9]]},"DOI":"10.1007\/s11227-017-1972-3","type":"journal-article","created":{"date-parts":[[2017,3,24]],"date-time":"2017-03-24T09:38:30Z","timestamp":1490348310000},"page":"3761-3795","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["SpMV and BiCG-Stab optimization for a class of hepta-diagonal-sparse matrices on GPU"],"prefix":"10.1007","volume":"73","author":[{"given":"Mayez A.","family":"Al-Mouhamed","sequence":"first","affiliation":[]},{"given":"Ayaz H.","family":"Khan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,3,24]]},"reference":[{"key":"1972_CR1","doi-asserted-by":"crossref","unstructured":"Abu-Sufah W, Karim A (2012) An effective approach for implementing sparse matrix\u2013vector multiplication on graphics processing units. In: IEEE 14th International Conference on High Performance Computing and Communication, pp 453\u2013460","DOI":"10.1109\/HPCC.2012.68"},{"key":"1972_CR2","doi-asserted-by":"crossref","unstructured":"Ahamed C, Kassim A, Frdric M (2012) Iterative methods for sparse linear systems on graphics processing unit. In: IEEE 14th International Conference on High Performance Computing and Communication, pp 836\u2013842","DOI":"10.1109\/HPCC.2012.118"},{"key":"1972_CR3","unstructured":"Alejandro D, Polanco R (2009) Collective communication and barrier synchronization on NVIDIA CUDA GPU. MS thesis, University of Kentucky"},{"key":"1972_CR4","doi-asserted-by":"crossref","unstructured":"Aliaga J, Perez J, Quintana-Orti E, Anzt H (2013) Reformulated conjugate gradient for the energy-aware solution of linear systems on GPUs. In: 42nd International Conference on Parallel Processing (ICPP), pp 320\u2013329","DOI":"10.1109\/ICPP.2013.41"},{"key":"1972_CR5","unstructured":"Anzt H, Tomov S, Dongarra J (2014) Implementing a sparse matrix vector product for the sell-c\/sell-c- $$\\sigma $$ \u03c3 formats on NVIDIA GPUs. Technical report on UT-EECS-14-727, University of Tennessee"},{"issue":"3","key":"1972_CR6","doi-asserted-by":"crossref","first-page":"366","DOI":"10.1177\/1094342015580139","volume":"29","author":"H Anzt","year":"2015","unstructured":"Anzt H, Tomov S, Luszczek P, Sawyer W, Dongarra J (2015) Acceleration of GPU-based krylov solvers via data transfer reduction. Int J High Perform Comput Appl 29(3):366\u2013383","journal-title":"Int J High Perform Comput Appl"},{"key":"1972_CR7","doi-asserted-by":"crossref","unstructured":"Balay S, Abhyankar S, Adams M, Brown J, Brune P, Buschelman K, Eijkhout V, Gropp W, Kaushik D, Knepley M et al (2014) PETSc users manual revision 3.5. Technical report on Argonne National Laboratory (ANL)","DOI":"10.2172\/1178109"},{"key":"1972_CR8","unstructured":"Bell N, Garland M (2008) Efficient sparse matrix\u2013vector multiplication on CUDA. Technical report on NVR-2008-004, NVIDIA Corporation"},{"key":"1972_CR9","doi-asserted-by":"crossref","unstructured":"Bell N, Garland M (2009) Implementing sparse matrix\u2013vector multiplication on throughput-oriented processors. In: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis, SC \u201909. ACM, pp 18:1\u201318:11","DOI":"10.1145\/1654059.1654078"},{"key":"1972_CR10","unstructured":"Bell N, Garland M (2012) CUSP: generic parallel algorithms for sparse matrix and graph computations. https:\/\/code.google.com\/archive\/p\/cusp-library\/"},{"key":"1972_CR11","unstructured":"Bordawekar R, Baskaran MM (2008) Optimizing sparse matrix\u2013vector multiplication on GPUs. Technical report on RC24704, IMB Research"},{"issue":"3","key":"1972_CR12","doi-asserted-by":"crossref","first-page":"205","DOI":"10.1080\/17445760802337010","volume":"24","author":"L Buatois","year":"2009","unstructured":"Buatois L, Caumon G, Lvy B (2009) Concurrent number cruncher: a GPU implementation of a general sparse linear solver. Int J Parallel Emerg Distrib Syst 24(3):205\u2013223","journal-title":"Int J Parallel Emerg Distrib Syst"},{"issue":"1","key":"1972_CR13","first-page":"1:1","volume":"38","author":"TA Davis","year":"2011","unstructured":"Davis TA, Hu Y (2011) The University of Florida sparse matrix collection. ACM Trans Math Softw 38(1):1:1\u20131:25","journal-title":"ACM Trans Math Softw"},{"key":"1972_CR14","first-page":"0","volume":"2","author":"N Goharian","year":"2003","unstructured":"Goharian N, Jain A, Sun Q (2003) Comparative analysis of sparse matrix algorithms for information retrieval. Computer 2:0\u20134","journal-title":"Computer"},{"key":"1972_CR15","doi-asserted-by":"crossref","unstructured":"Greathouse JL, Daga M (2014) Efficient sparse matrix\u2013vector multiplication on GPUs using the CSR storage format. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC \u201914. IEEE Press, Piscataway, pp 769\u2013780","DOI":"10.1109\/SC.2014.68"},{"key":"1972_CR16","doi-asserted-by":"crossref","unstructured":"Grillo L, de\u00a0Sande F, Reyes R (2014) Performance evaluation of OpenACC compilers. In: 22nd Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP), pp 656\u2013663","DOI":"10.1109\/PDP.2014.29"},{"issue":"3","key":"1972_CR17","doi-asserted-by":"crossref","first-page":"397","DOI":"10.1145\/1089014.1089021","volume":"31","author":"MA Heroux","year":"2005","unstructured":"Heroux MA, Bartlett RA, Howle VE, Hoekstra RJ, Hu JJ, Kolda TG, Lehoucq RB, Long KR, Pawlowski RP, Phipps ET et al (2005) An overview of the Trilinos project. ACM Trans Math Softw 31(3):397\u2013423","journal-title":"ACM Trans Math Softw"},{"key":"1972_CR18","unstructured":"Hoberock J, Bell N (2010) Thrust: a parallel template library. https:\/\/thrust.github.io\/"},{"key":"1972_CR19","doi-asserted-by":"crossref","unstructured":"Huan G, Qian Z (2012) A new method of sparse matrix\u2013vector multiplication on GPU. In: 2nd International Conference on Computer Science and Network Technology (ICCSNT), pp 954\u2013958","DOI":"10.1109\/ICCSNT.2012.6526085"},{"key":"1972_CR20","unstructured":"Khan AH, Al-Mouhamed M, Firdaus LA (2015) Evaluation of global synchronization for iterative algebra algorithms on many-core. In: 16th IEEE\/ACIS International Conference on Software Engineering, Artificial Intelligence, Networking, and Parallel\/Distributed Computing (SNPD), pp 1\u201316"},{"key":"1972_CR21","doi-asserted-by":"crossref","unstructured":"Lee S, Vetter JS (2012) Early evaluation of directive-based GPU programming models for productive exascale computing. In: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, SC \u201912. IEEE Computer Society Press, Los Alamitos, pp 23:1\u201323:11","DOI":"10.1109\/SC.2012.51"},{"issue":"5","key":"1972_CR22","doi-asserted-by":"crossref","first-page":"S209","DOI":"10.1137\/120883153","volume":"35","author":"D Lowell","year":"2013","unstructured":"Lowell D, Godwin J, Holewinski J, Karthik D, Choudary C, Mametjanov A, Norris B, Sabin G, Sadayappan P, Sarich J (2013) Stencil-aware GPU optimization of iterative solvers. SIAM J Sci Comput 35(5):S209\u2013S228","journal-title":"SIAM J Sci Comput"},{"key":"1972_CR23","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1016\/j.jpdc.2016.03.011","volume":"9394","author":"M Maggioni","year":"2016","unstructured":"Maggioni M, Berger-Wolf T (2016) Optimization techniques for sparse matrixvector multiplication on {GPUs}. J Parallel Distrib Comput 9394:66\u201386","journal-title":"J Parallel Distrib Comput"},{"key":"1972_CR24","doi-asserted-by":"crossref","unstructured":"Matam KK, Kothapalli K (2011) Accelerating sparse matrix vector multiplication in iterative methods using GPU. In: IEEE International Conference on Parallel Processing (ICPP), pp 612\u2013621","DOI":"10.1109\/ICPP.2011.82"},{"key":"1972_CR25","doi-asserted-by":"crossref","unstructured":"Nagasaka Y, Nukada A, Matsuoka S (2016) Adaptive multi-level blocking optimization for sparse matrix vector multiplication on GPU. Procedia Comput Sci 80:131\u2013142 (International Conference on Computational Science 2016)","DOI":"10.1016\/j.procs.2016.05.304"},{"key":"1972_CR26","doi-asserted-by":"crossref","unstructured":"Neelima B, Reddy GRM, Raghavendra PS (2014) Predicting an optimal sparse matrix format for SpMV computation on GPU. In: IEEE International Parallel & Distributed Processing Symposium Workshops (IPDPSW), pp 1427\u20131436","DOI":"10.1109\/IPDPSW.2014.160"},{"key":"1972_CR27","unstructured":"NVIDIA: CUDA basic linear algebra subroutines (cublas) library, CUDA sparse matrix (cusparse) library. https:\/\/developer.nvidia.com\/gpu-accelerated-libraries"},{"key":"1972_CR28","unstructured":"NVIDIA (2013) Tuning CUDA applications for kepler. http:\/\/docs.nvidia.com\/cuda\/kepler-tuning-guide\/index.html . Accessed 10 June 2013"},{"key":"1972_CR29","unstructured":"NVIDIA (2008) NVIDIA CUDA programming guide 2.0. NVIDIA"},{"key":"1972_CR30","unstructured":"NVIDIA (2011) CUDA toolkit. https:\/\/developer.nvidia.com\/cuda-toolkit"},{"key":"1972_CR31","unstructured":"NVIDIA (2014) cuSparse library. https:\/\/developer.nvidia.com\/cusparse"},{"key":"1972_CR32","doi-asserted-by":"crossref","unstructured":"Owens JD, Luebke D, Govindaraju N, Harris M, Kr\u00fcger J, Lefohn AE, Purcell TJ (2007) A survey of general-purpose computation on graphics hardware. In: Computer graphics forum, vol 26. Wiley Online Library, pp 80\u2013113","DOI":"10.1111\/j.1467-8659.2007.01012.x"},{"key":"1972_CR33","doi-asserted-by":"crossref","DOI":"10.1137\/1.9780898718003","volume-title":"Iterative methods for sparse linear systems","author":"Y Saad","year":"2003","unstructured":"Saad Y (2003) Iterative methods for sparse linear systems, 2nd edn. Society for Industrial and Applied Mathematics, Philadelphia","edition":"2"},{"issue":"5","key":"1972_CR34","doi-asserted-by":"crossref","first-page":"232","DOI":"10.1016\/j.parco.2009.12.005","volume":"36","author":"S Tomov","year":"2010","unstructured":"Tomov S, Dongarra J, Baboulin M (2010) Towards dense linear algebra for hybrid GPU accelerated manycore systems. Parallel Comput 36(5):232\u2013240","journal-title":"Parallel Comput"},{"key":"1972_CR35","unstructured":"Xiao S, c. Feng W (2010) Inter-block GPU communication via fast barrier synchronization. In: IEEE International Symposium on Parallel Distributed Processing (IPDPS), pp 1\u201312"},{"issue":"8","key":"1972_CR36","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1145\/2692916.2555255","volume":"49","author":"SEA Yan","year":"2014","unstructured":"Yan SEA (2014) yaSpMV: Yet another SpMV framework on GPUs. ACM SIGPLAN Notices 49(8):107\u2013118","journal-title":"ACM SIGPLAN Notices"},{"key":"1972_CR37","doi-asserted-by":"crossref","unstructured":"Yang LT (2000) Data distribution and communication schemes for IQMR method on massively distributed memory computers. In: Proceedings of the International Workshop on Parallel Processing, ICPPW, Toronto, Canada, August 21\u201324, pp 299\u2013306","DOI":"10.1109\/ICPPW.2000.869116"},{"key":"1972_CR38","unstructured":"Zaza A (2015) A CUDA based parallel multi-phase oil reservoir simulator. PhD thesis, Computer Engineering Department, King Fahd University of Petroleum & Minerals (KFUPM), Saudi Arabia"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-017-1972-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-017-1972-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-017-1972-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,20]],"date-time":"2019-09-20T04:34:32Z","timestamp":1568954072000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-017-1972-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,3,24]]},"references-count":38,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2017,9]]}},"alternative-id":["1972"],"URL":"https:\/\/doi.org\/10.1007\/s11227-017-1972-3","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,3,24]]}}}