{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T14:37:16Z","timestamp":1730212636286,"version":"3.28.0"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1109\/cvpr52688.2022.01600","type":"proceedings-article","created":{"date-parts":[[2022,9,27]],"date-time":"2022-09-27T19:56:41Z","timestamp":1664308601000},"page":"16474-16483","source":"Crossref","is-referenced-by-count":19,"title":["WebQA: Multihop and Multimodal QA"],"prefix":"10.1109","author":[{"given":"Yingshan","family":"Chang","sequence":"first","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Guihong","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft, Bing Search"}]},{"given":"Mridu","family":"Narang","sequence":"additional","affiliation":[{"name":"Microsoft, Bing Search"}]},{"given":"Jianfeng","family":"Gao","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Hisami","family":"Suzuki","sequence":"additional","affiliation":[{"name":"Microsoft, Bing Search"}]},{"given":"Yonatan","family":"Bisk","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]}],"member":"263","reference":[{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"journal-title":"ArXiv Preprint","article-title":"An empirical study of gpt-3 for few-shot knowledge-based vqa","year":"2021","author":"yang","key":"ref33"},{"key":"ref32","first-page":"1358","article-title":"Recipeqa: A challenge dataset for multi-modal comprehension of cooking recipes","author":"yagcioglu","year":"0","journal-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00021"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.501"},{"journal-title":"ArXiv Preprint","article-title":"Bertscore: Evaluating text generation with bert","year":"2019","author":"zhang","key":"ref37"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"journal-title":"ArXiv Preprint","article-title":"Bartscore: Evaluating generated text as text generation","year":"2021","author":"yuan","key":"ref35"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"7879","DOI":"10.1609\/aaai.v34i05.6294","article-title":"Many-modalqa: Modality disambiguation and qa over diverse inputs","volume":"34","author":"hannan","year":"0","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"journal-title":"ArXiv Preprint","article-title":"Delphi: Towards machine ethics and 
norms","year":"2021","author":"jiang","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1167"},{"key":"ref16","first-page":"121","article-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks","author":"li","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref17","first-page":"13","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"lu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"journal-title":"Proceedings annual meeting of the Association for Computational Linguistics","article-title":"What in-gredients make for an effective crowdsourcing protocol for difficult nlu data collection tasks?","year":"0","author":"nangia","key":"ref19"},{"journal-title":"ArXiv Preprint","article-title":"Multimodalqa: Complex question answering over text, tables and images","year":"2021","author":"talmor","key":"ref28"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.703"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1059"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.463"},{"key":"ref6","first-page":"104","article-title":"Uniter: Universal image-text representation learning","author":"chen","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref5","article-title":"Language models are few-shot learners","volume":"5","author":"brown","year":"2020","journal-title":"ArXiv"},{"journal-title":"ArXiv Preprint","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"devlin","key":"ref8"},{"key":"ref7","first-page":"1931","article-title":"Unifying vision-and-language tasks via text generation","author":"cho","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"journal-title":"ArXiv Preprint","article-title":"Exploring the limits of large scale pretraining","year":"2021","author":"abnar","key":"ref1"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"journal-title":"Learning Transferable Visual Models From Natural Language Supervision","year":"2021","author":"radford","key":"ref22"},{"key":"ref21","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.418"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1561\/1500000019"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1644"},{"journal-title":"ArXiv Preprint","article-title":"Vl-bert: Pre-training of generic visuallinguistic representations","year":"2019","author":"weijie","key":"ref25"}],"event":{"name":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2022,6,18]]},"location":"New Orleans, LA, 
USA","end":{"date-parts":[[2022,6,24]]}},"container-title":["2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9878378\/9878366\/09879677.pdf?arnumber=9879677","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,14]],"date-time":"2022-10-14T21:01:05Z","timestamp":1665781265000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9879677\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6]]},"references-count":38,"URL":"http:\/\/dx.doi.org\/10.1109\/cvpr52688.2022.01600","relation":{},"subject":[],"published":{"date-parts":[[2022,6]]}}}