{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T21:36:29Z","timestamp":1730237789992,"version":"3.28.0"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100002428","name":"Austrian Science Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002428","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100010002","name":"Ministry of Education","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100010002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,1]]},"DOI":"10.1109\/iccv51070.2023.00267","type":"proceedings-article","created":{"date-parts":[[2024,1,15]],"date-time":"2024-01-15T20:55:59Z","timestamp":1705352159000},"page":"2839-2850","source":"Crossref","is-referenced-by-count":7,"title":["MAtch, eXpand and Improve: Unsupervised Finetuning for Zero-Shot Action Recognition with Language Knowledge"],"prefix":"10.1109","author":[{"given":"Wei","family":"Lin","sequence":"first","affiliation":[{"name":"Graz University of Technology,Institute of Computer Graphics and Vision,Austria"}]},{"given":"Leonid","family":"Karlinsky","sequence":"additional","affiliation":[{"name":"MIT-IBM Watson AI Lab,USA"}]},{"given":"Nina","family":"Shvetsova","sequence":"additional","affiliation":[{"name":"Goethe University Frankfurt,Germany"}]},{"given":"Horst","family":"Possegger","sequence":"additional","affiliation":[{"name":"Graz University of Technology,Institute of Computer Graphics and Vision,Austria"}]},{"given":"Mateusz","family":"Kozinski","sequence":"additional","affiliation":[{"name":"Graz University of Technology,Institute of Computer Graphics and Vision,Austria"}]},{"given":"Rameswar","family":"Panda","sequence":"additional","affiliation":[{"name":"MIT-IBM Watson AI Lab,USA"}]},{"given":"Rogerio","family":"Feris","sequence":"additional","affiliation":[{"name":"MIT-IBM Watson AI Lab,USA"}]},{"given":"Hilde","family":"Kuehne","sequence":"additional","affiliation":[{"name":"MIT-IBM Watson AI Lab,USA"}]},{"given":"Horst","family":"Bischof","sequence":"additional","affiliation":[{"name":"Graz University of Technology,Institute of Computer Graphics and Vision,Austria"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00467"},{"key":"ref3","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"NeurIPS","volume":"33","author":"Brown"},{"article-title":"A short note about kinetics-600","year":"2018","author":"Carreira","key":"ref4"},{"article-title":"Fitclip: Refining large-scale pretrained image-text models for zero-shot video understanding tasks","volume-title":"BMVC","author":"Castro","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00610"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01338"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"ICLR","author":"Dosovitskiy","key":"ref9"},{"article-title":"Cloob: Modern hopfield networks with infoloob outperform clip","year":"2021","author":"F\u00fcrst","key":"ref10"},{"article-title":"Pyramidclip: Hierarchical feature alignment for vision-language model pretraining","year":"2022","author":"Gao","key":"ref11"},{"article-title":"Cyclip: Cyclic contrastive language-image pretraining","year":"2022","author":"Goel","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"article-title":"Open-vocabulary object detection via vision and language knowledge distillation","volume-title":"ICLR","author":"Gu","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.318"},{"article-title":"Patching open-vocabulary models by interpolating weights","volume-title":"NeurIPS","author":"Ilharco","key":"ref16"},{"first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International Conference on Machine Learning","author":"Jia","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"article-title":"The kinetics human action video dataset","year":"2017","author":"Kay","key":"ref19"},{"first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"International Conference on Machine Learning","author":"Kim","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"article-title":"Language-driven semantic segmentation","volume-title":"ICLR","author":"Li","key":"ref22"},{"article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","year":"2022","author":"Li","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01600"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"article-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm","year":"2021","author":"Li","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1405.0312"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995353"},{"article-title":"Decoupled weight decay regularization","volume-title":"ICLR","author":"Loshchilov","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01022"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2901464"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_7"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.117"},{"first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00633"},{"article-title":"Bridging the gap between object and image-level representations for open-vocabulary detection","volume-title":"NeurIPS","author":"Rasheed","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6872"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"article-title":"Ucf101: A dataset of 101 human actions classes from videos in the wild","year":"2012","author":"Soomro","key":"ref44"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00517"},{"article-title":"Actionclip: A new paradigm for video action recognition","year":"2021","author":"Wang","key":"ref47"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71249-9_6"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25386"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"article-title":"Filip: Fine-grained interactive language-image pre-training","year":"2021","author":"Yao","key":"ref52"},{"article-title":"When and why vision-language models behave like bags-of-words, and what to do about it?","volume-title":"The Eleventh International Conference on Learning Representations","author":"Yuksekgonul","key":"ref53"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1099"},{"article-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling","volume-title":"ECCV","author":"Zhang","key":"ref55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-demos.4"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"}],"event":{"name":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","start":{"date-parts":[[2023,10,1]]},"location":"Paris, France","end":{"date-parts":[[2023,10,6]]}},"container-title":["2023 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10376473\/10376477\/10377160.pdf?arnumber=10377160","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T01:09:15Z","timestamp":1705540155000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10377160\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,1]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/iccv51070.2023.00267","relation":{},"subject":[],"published":{"date-parts":[[2023,10,1]]}}}