iBet uBet web content aggregator. Adding the entire web to your favor.
iBet uBet web content aggregator. Adding the entire web to your favor.



Link to original content: https://api.crossref.org/works/10.1109/HPCA53966.2022.00056
{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T13:26:26Z","timestamp":1725715586287},"reference-count":113,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T00:00:00Z","timestamp":1648771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T00:00:00Z","timestamp":1648771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,4]]},"DOI":"10.1109\/hpca53966.2022.00056","type":"proceedings-article","created":{"date-parts":[[2022,5,17]],"date-time":"2022-05-17T19:50:17Z","timestamp":1652817017000},"page":"676-691","source":"Crossref","is-referenced-by-count":2,"title":["Only Buffer When You Need To: Reducing On-chip GPU Traffic with Reconfigurable Local Atomic Buffers"],"prefix":"10.1109","author":[{"given":"Preyesh","family":"Dalmia","sequence":"first","affiliation":[{"name":"University of Wisconsin-Madison"}]},{"given":"Rohan","family":"Mahapatra","sequence":"additional","affiliation":[{"name":"University of California,San Diego"}]},{"given":"Matthew D.","family":"Sinclair","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison,AMD Research"}]}],"member":"263","reference":[{"journal-title":"CoRR","article-title":"Deep residual learning for image recognition","year":"2015","author":"he","key":"ref39"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"year":"2017","key":"ref33","article-title":"Hot Chips 2017: A Closer Look At Google’s TPU v2"},{"key":"ref32","first-page":"96","article-title":"Unifying Primary Cache, Scratch, and Register File Memories in a Throughput Processor","author":"gebhart","year":"2012","journal-title":"Micro"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/2701618"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/NoCS.2013.6558404"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001163"},{"key":"ref36","first-page":"1","article-title":"Graphicionado: A high-performance and energy-efficient accelerator for graph analytics","author":"ham","year":"2016","journal-title":"Micro"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1983.1676201"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00091"},{"key":"ref27","first-page":"1","article-title":"KLAP: Kernel launch aggregation and promotion for optimizing dynamic parallelism","author":"el hajj","year":"2016","journal-title":"Micro"},{"key":"ref29","first-page":"1","article-title":"A Configurable Cloud-scale DNN Processor for Real-time AI","author":"fowers","year":"2018","journal-title":"ISCA"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.15439\/2015F86"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080248"},{"key":"ref24","first-page":"2024","article-title":"Persistent RNNs: Stashing Recurrent Weights On-Chip","author":"diamos","year":"2016","journal-title":"ICML"},{"article-title":"9th DIMACS Implementation Challenge","year":"2006","author":"demetrescu","key":"ref23"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/2602988.2602993"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2008.4563095"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3038228.3038239"},{"journal-title":"CoRR","article-title":"Exploring Modern GPU Memory System Design Challenges through Accurate Modeling","year":"2018","author":"khairy","key":"ref50"},{"journal-title":"Micro","article-title":"In-Register Parameter Caching for Dynamic Neural Nets with Virtual Persistent Processor Specialization","year":"2018","author":"khorasani","key":"ref51"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.42"},{"key":"ref58","first-page":"31","article-title":"GraphChi: LargeScale Graph Computation on Just a PC","author":"kyrola","year":"2012","journal-title":"OSDI"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/2889488"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750374"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.76"},{"key":"ref53","first-page":"75","article-title":"Accelerating Irregular Algorithms on GPGPUs Using Fine-Grain Hardware Worklists","author":"kim","year":"2014","journal-title":"Micro"},{"year":"2021","key":"ref52","article-title":"The OpenCL Specification"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835930"},{"key":"ref4","first-page":"26:1","article-title":"Lazy Release Consistency for GPUs","author":"alsop","year":"2016","journal-title":"Micro"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001138"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750411"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00031"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"ref49","first-page":"473","article-title":"AccelSim: An Extensible Simulation Framework for Validated GPU Modeling","author":"khairy","year":"2020","journal-title":"ISCA"},{"journal-title":"ch Benchmarking for Graph Clustering and Partitioning","first-page":"161","year":"2018","author":"bader","key":"ref7"},{"article-title":"Flexible Support for Fast Parallel Commutative Updates","year":"2017","author":"balaji","key":"ref9"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.14778\/3157794.3157799"},{"article-title":"CUTLASS: Fast Linear Algebra in CUDA C++","year":"2017","author":"kerr","key":"ref48"},{"key":"ref47","first-page":"738","article-title":"Accel-Wattch: A Power Modeling Framework for Modern GPUs","author":"kandiah","year":"2021","journal-title":"Micro"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541981"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2013.6557152"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00070"},{"year":"2015","key":"ref43","article-title":"HSA Platform System Architecture Specification"},{"year":"2020","key":"ref73","article-title":"libcu++: The C++ Standard Library for Your Entire System"},{"year":"2020","key":"ref72","article-title":"CUDA C++ Programming Guide"},{"year":"2018","key":"ref71","article-title":"NVIDIA cuDNN: GPU Accelerated Deep Learning"},{"key":"ref70","first-page":"693","article-title":"HOGWILD!: A Lock-free Approach to Parallelizing Stochastic Gradient Descent","author":"niu","year":"2011","journal-title":"NeurIPS"},{"key":"ref76","first-page":"41","article-title":"Fine-Grained DRAM: Energy-Efficient DRAM for Extreme Bandwidth Systems","author":"o\u2019connor","year":"2017","journal-title":"Micro"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080254"},{"year":"2017","key":"ref74","article-title":"Inside Volta: The World’s Most Advanced Data Center GPU"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694350"},{"article-title":"Histogram calculation in CUDA","year":"2007","author":"podlozhnyuk","key":"ref78"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3350755.3400266"},{"journal-title":"The OpenCL Specification version 2 0","year":"2015","author":"howes","key":"ref60"},{"journal-title":"CoRR","article-title":"Analyzing Machine Learning Workloads Using a Detailed GPU Simulator","year":"2018","author":"lew","key":"ref62"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485964"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00028"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3322127"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304043"},{"journal-title":"Garaph Efficient GPU-accelerated Graph Processing on a Single Machine with Balanced Replication","first-page":"195","year":"2017","author":"ma","key":"ref66"},{"article-title":"NVIDIA CUB Library","year":"2020","author":"merrill","key":"ref67"},{"key":"ref68","first-page":"1009","article-title":"PHI: Architectural Support for Synchronization- and Bandwidth-Efficient Commutative Scatter Updates","author":"mukkara","year":"2019","journal-title":"Micro"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2994149"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458533"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2016.7581278"},{"article-title":"Ecornn: Efficient computing of lstm rnn training on gpus","year":"2018","author":"zheng","key":"ref109"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522351"},{"journal-title":"NeurIPS","article-title":"AutoSync: Learning to Synchronize for Data-Parallel Distributed Deep Learning","year":"2020","author":"zhang","key":"ref108"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080206"},{"key":"ref107","first-page":"13","article-title":"Exploiting Commutativity to Reduce the Cost of Updates to Shared Data in Cache-Coherent Systems","author":"zhang","year":"2015","journal-title":"Micro"},{"key":"ref93","first-page":"647","article-title":"Efficient GPU Synchronization without Scopes: Saying No to Complex Consistency Models","author":"sinclair","year":"2015","journal-title":"Micro"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080215"},{"key":"ref92","first-page":"535","article-title":"Maximizing CNN Accelerator Efficiency Through Resource Partitioning","author":"shen","year":"2017","journal-title":"ISCA"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00043"},{"key":"ref91","first-page":"14","article-title":"ISAAC: A Convolutional Neural Network Accelerator with In-situ Analog Arithmetic in Crossbars","author":"shafiee","year":"2016","journal-title":"ISCA"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2011.24"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322254"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1145\/2814270.2814283"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1145\/2851141.2851145"},{"journal-title":"ICLRE","article-title":"Sparse Persistent RNNs: Squeezing Large Recurrent Networks On-Chip","year":"2018","author":"zhu","key":"ref111"},{"journal-title":"IISWC","article-title":"TBD: Benchmarking and Analyzing Deep Neural Network Training","year":"2018","author":"zhu","key":"ref112"},{"journal-title":"LearningSys","article-title":"SpeeDO: Parallelizing Stochastic Gradient Descent for Deep Convolutional Neural Network","year":"2015","author":"zheng","key":"ref110"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00042"},{"key":"ref99","first-page":"372","article-title":"NVBit: A Dynamic Binary Instrumentation Framework for NVIDIA GPUs","author":"villa","year":"2019","journal-title":"Micro"},{"journal-title":"IISWC","article-title":"One Size Doesn’t Fit All: Quantifying Performance Portability of Graph Applications on GPUs","year":"2019","author":"sorensen","key":"ref96"},{"journal-title":"IMPACT Technical Report IMPACT-12-01","year":"2012","author":"stratton","key":"ref97"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375591"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2010.5650274"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2013.6704684"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"ref16","first-page":"981","article-title":"DeterministicAtomic Buffering","author":"chou","year":"2020","journal-title":"Micro"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001165"},{"journal-title":"Tech Rep","article-title":"FullSpeed Deterministic Bit-Accurate Parallel Floating-Point Summation on Multi- and Many-Core Architectures","year":"2014","author":"collange","key":"ref17"},{"journal-title":"CoRR","article-title":"Modeling Deep Learning Accelerator Enabled GPUs","year":"2018","author":"raihan","key":"ref81"},{"journal-title":"Tech Rep","article-title":"Virtual Local Stores: Enabling Software-Managed Memory Hierarchies in Mainstream Computing Environments","year":"2009","author":"cook","key":"ref18"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00054"},{"journal-title":"SysML Keynote","article-title":"Hardware for Deep Learning","year":"2018","author":"dally","key":"ref19"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"article-title":"Training long short-term memory with sparsified stochastic gradient descent","year":"2016","author":"zhu","key":"ref113"},{"key":"ref80","first-page":"457","article-title":"Heterogeneous System Coherence for Integrated CPU-GPU Systems","author":"power","year":"2013","journal-title":"Micro"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/237090.237144"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.40"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00017"},{"journal-title":"Backpropagation The Basic Theory","first-page":"1","year":"1995","author":"rumelhart","key":"ref87"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00027"}],"event":{"name":"2022 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","start":{"date-parts":[[2022,4,2]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2022,4,6]]}},"container-title":["2022 IEEE International Symposium on High-Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9773179\/9773180\/09773230.pdf?arnumber=9773230","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,20]],"date-time":"2022-06-20T21:34:49Z","timestamp":1655760889000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9773230\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4]]},"references-count":113,"URL":"http:\/\/dx.doi.org\/10.1109\/hpca53966.2022.00056","relation":{},"subject":[],"published":{"date-parts":[[2022,4]]}}}