{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T19:09:28Z","timestamp":1732043368756},"reference-count":101,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T00:00:00Z","timestamp":1722470400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1016\/j.inffus.2024.102367","type":"journal-article","created":{"date-parts":[[2024,3,21]],"date-time":"2024-03-21T23:54:21Z","timestamp":1711065261000},"page":"102367","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":7,"special_numbering":"C","title":["GPT-4V with emotion: A zero-shot benchmark for Generalized Emotion Recognition"],"prefix":"10.1016","volume":"108","author":[{"ORCID":"http:\/\/orcid.org\/0000-0001-9477-0599","authenticated-orcid":false,"given":"Zheng","family":"Lian","sequence":"first","affiliation":[]},{"given":"Licai","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Haiyang","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Kang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zhuofan","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Gu","sequence":"additional","affiliation":[]},{"given":"Bin","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jianhua","family":"Tao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.inffus.2024.102367_b1","doi-asserted-by":"crossref","unstructured":"Q. You, H. Jin, J. Luo, Visual sentiment analysis by attending on local image regions, in: Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, 2017, pp. 
231\u2013237.","DOI":"10.1609\/aaai.v31i1.10501"},{"key":"10.1016\/j.inffus.2024.102367_b2","article-title":"Smin: Semi-supervised multi-modal interaction network for conversational emotion recognition","author":"Lian","year":"2022","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.inffus.2024.102367_b3","doi-asserted-by":"crossref","unstructured":"J. Yang, Q. Huang, T. Ding, D. Lischinski, D. Cohen-Or, H. Huang, EmoSet: A large-scale visual emotion dataset with rich attributes, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 20383\u201320394.","DOI":"10.1109\/ICCV51070.2023.01864"},{"key":"10.1016\/j.inffus.2024.102367_b4","series-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision)","first-page":"1","author":"Yang","year":"2023"},{"issue":"8","key":"10.1016\/j.inffus.2024.102367_b5","doi-asserted-by":"crossref","first-page":"1440","DOI":"10.1049\/iet-ipr.2019.1270","article-title":"Survey on visual sentiment analysis","volume":"14","author":"Ortis","year":"2020","journal-title":"IET Image Process."},{"key":"10.1016\/j.inffus.2024.102367_b6","series-title":"MultiMedia Modeling: 22nd International Conference, MMM 2016, Miami, FL, USA, January 4-6, 2016, Proceedings, Part II 22","first-page":"15","article-title":"Sentiment analysis on multi-view social data","author":"Niu","year":"2016"},{"key":"10.1016\/j.inffus.2024.102367_b7","doi-asserted-by":"crossref","DOI":"10.1109\/TAFFC.2022.3205170","article-title":"Deep learning for micro-expression recognition: A survey","author":"Li","year":"2022","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"3","key":"10.1016\/j.inffus.2024.102367_b8","doi-asserted-by":"crossref","first-page":"1195","DOI":"10.1109\/TAFFC.2020.2981446","article-title":"Deep facial expression recognition: A survey","volume":"13","author":"Li","year":"2020","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.inffus.2024.102367_b9","doi-asserted-by":"crossref","unstructured":"Y. Wang, Y. Sun, Y. Huang, Z. Liu, S. Gao, W. Zhang, W. Ge, W. Zhang, Ferv39k: A large-scale multi-scene dataset for facial expression recognition in videos, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 20922\u201320931.","DOI":"10.1109\/CVPR52688.2022.02025"},{"key":"10.1016\/j.inffus.2024.102367_b10","series-title":"MERBench: A unified evaluation benchmark for multimodal emotion recognition","author":"Lian","year":"2024"},{"key":"10.1016\/j.inffus.2024.102367_b11","article-title":"Facial action coding system","author":"Ekman","year":"1978","journal-title":"Environ. Psychol. Nonverbal Behav."},{"issue":"2","key":"10.1016\/j.inffus.2024.102367_b12","doi-asserted-by":"crossref","first-page":"268","DOI":"10.1037\/0033-2909.115.2.268","article-title":"Strong evidence for universals in facial expressions: A reply to Russell\u2019s mistaken critique","volume":"115","author":"Ekman","year":"1994","journal-title":"Psychol. Bull."},{"key":"10.1016\/j.inffus.2024.102367_b13","series-title":"Internet Imaging VI","first-page":"56","article-title":"Multimodal approaches for emotion recognition: a survey","volume":"Vol. 5670","author":"Sebe","year":"2005"},{"key":"10.1016\/j.inffus.2024.102367_b14","doi-asserted-by":"crossref","first-page":"985","DOI":"10.1109\/TASLP.2021.3049898","article-title":"CTNet: Conversational transformer network for emotion recognition","volume":"29","author":"Lian","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. 
Process."},{"key":"10.1016\/j.inffus.2024.102367_b15","doi-asserted-by":"crossref","DOI":"10.1017\/ATSIP.2014.11","article-title":"Survey on audiovisual emotion recognition: databases, features, and data fusion strategies","volume":"3","author":"Wu","year":"2014","journal-title":"APSIPA Trans. Signal Inf. Process."},{"key":"10.1016\/j.inffus.2024.102367_b16","doi-asserted-by":"crossref","unstructured":"Q. You, J. Luo, H. Jin, J. Yang, Robust image sentiment analysis using progressively trained and domain transferred deep networks, in: Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence, 2015, pp. 381\u2013388.","DOI":"10.1609\/aaai.v29i1.9179"},{"key":"10.1016\/j.inffus.2024.102367_b17","doi-asserted-by":"crossref","unstructured":"D. Borth, R. Ji, T. Chen, T. Breuel, S.-F. Chang, Large-scale visual sentiment ontology and detectors using adjective noun pairs, in: Proceedings of the 21st ACM International Conference on Multimedia, 2013, pp. 223\u2013232.","DOI":"10.1145\/2502081.2502282"},{"key":"10.1016\/j.inffus.2024.102367_b18","doi-asserted-by":"crossref","unstructured":"Q. You, J. Luo, H. Jin, J. Yang, Building a large scale dataset for image emotion recognition: the fine print and the benchmark, in: Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence, 2016, pp. 308\u2013314.","DOI":"10.1609\/aaai.v30i1.9987"},{"key":"10.1016\/j.inffus.2024.102367_b19","series-title":"Proceedings of the 10th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition","first-page":"1","article-title":"CASME database: A dataset of spontaneous micro-expressions collected from neutralized faces","author":"Yan","year":"2013"},{"issue":"1","key":"10.1016\/j.inffus.2024.102367_b20","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0086041","article-title":"CASME II: An improved spontaneous micro-expression database and the baseline evaluation","volume":"9","author":"Yan","year":"2014","journal-title":"PLoS ONE"},{"issue":"1","key":"10.1016\/j.inffus.2024.102367_b21","doi-asserted-by":"crossref","first-page":"116","DOI":"10.1109\/TAFFC.2016.2573832","article-title":"Samm: A spontaneous micro-facial movement dataset","volume":"9","author":"Davison","year":"2016","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.inffus.2024.102367_b22","series-title":"IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops","first-page":"94","article-title":"The extended cohn-kanade dataset (ck+): A complete dataset for action unit and emotion-specified expression","author":"Lucey","year":"2010"},{"key":"10.1016\/j.inffus.2024.102367_b23","doi-asserted-by":"crossref","unstructured":"A. Dhall, O. Ramana Murthy, R. Goecke, J. Joshi, T. Gedeon, Video and image based emotion recognition challenges in the wild: Emotiw 2015, in: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, 2015, pp. 423\u2013426.","DOI":"10.1145\/2818346.2829994"},{"key":"10.1016\/j.inffus.2024.102367_b24","doi-asserted-by":"crossref","unstructured":"S. Li, W. Deng, J. Du, Reliable crowdsourcing and deep locality-preserving learning for expression recognition in the wild, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 2852\u20132861.","DOI":"10.1109\/CVPR.2017.277"},{"key":"10.1016\/j.inffus.2024.102367_b25","doi-asserted-by":"crossref","unstructured":"E. Barsoum, C. Zhang, C.C. Ferrer, Z. 
Zhang, Training deep networks for facial expression recognition with crowd-sourced label distribution, in: Proceedings of the 18th ACM International Conference on Multimodal Interaction, 2016, pp. 279\u2013283.","DOI":"10.1145\/2993148.2993165"},{"issue":"1","key":"10.1016\/j.inffus.2024.102367_b26","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/TAFFC.2017.2740923","article-title":"Affectnet: A database for facial expression, valence, and arousal computing in the wild","volume":"10","author":"Mollahosseini","year":"2017","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.inffus.2024.102367_b27","doi-asserted-by":"crossref","unstructured":"X. Jiang, Y. Zong, W. Zheng, C. Tang, W. Xia, C. Lu, J. Liu, Dfew: A large-scale database for recognizing dynamic facial expressions in the wild, in: Proceedings of the 28th ACM International Conference on Multimedia, 2020, pp. 2881\u20132889.","DOI":"10.1145\/3394171.3413620"},{"issue":"5","key":"10.1016\/j.inffus.2024.102367_b28","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0196391","article-title":"The ryerson audio-visual database of emotional speech and song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in north American english","volume":"13","author":"Livingstone","year":"2018","journal-title":"PLoS One"},{"key":"10.1016\/j.inffus.2024.102367_b29","series-title":"Proceedings of the 22nd International Conference on Data Engineering Workshops","article-title":"The enterface\u201905 audio-visual emotion database","author":"Martin","year":"2006"},{"key":"10.1016\/j.inffus.2024.102367_b30","doi-asserted-by":"crossref","unstructured":"A. Zadeh, M. Chen, S. Poria, E. Cambria, L.-P. Morency, Tensor fusion network for multimodal sentiment analysis, in: Proceedings of the Conference on Empirical Methods in Natural Language Processing, 2017, pp. 1103\u20131114.","DOI":"10.18653\/v1\/D17-1115"},{"key":"10.1016\/j.inffus.2024.102367_b31","doi-asserted-by":"crossref","unstructured":"W. Yu, H. Xu, F. Meng, Y. Zhu, Y. Ma, J. Wu, J. Zou, K. Yang, Ch-sims: A chinese multimodal sentiment analysis dataset with fine-grained annotation of modality, in: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 2020, pp. 3718\u20133727.","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"10.1016\/j.inffus.2024.102367_b32","doi-asserted-by":"crossref","unstructured":"Z. Lian, H. Sun, L. Sun, K. Chen, M. Xu, K. Wang, K. Xu, Y. He, Y. Li, J. Zhao, et al., Mer 2023: Multi-label learning, modality robustness, and semi-supervised learning, in: Proceedings of the 31st ACM International Conference on Multimedia, 2023, pp. 
9610\u20139614.","DOI":"10.1145\/3581783.3612836"},{"key":"10.1016\/j.inffus.2024.102367_b33","series-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b34","series-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b35","series-title":"Videochat: Chat-centric video understanding","author":"Li","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b36","series-title":"Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities","author":"Zhang","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b37","series-title":"Pandagpt: One model to instruction-follow them all","author":"Su","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b38","series-title":"AlpacaEval: An automatic evaluator of instruction-following models","author":"Li","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b39","unstructured":"P. Lu, H. Bansal, T. Xia, J. Liu, C. Li, H. Hajishirzi, H. Cheng, K.-W. Chang, M. Galley, J. Gao, MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts, in: Proceedings of the International Conference on Learning Representations, ICLR, 2024, pp. 1\u2013116."},{"key":"10.1016\/j.inffus.2024.102367_b40","series-title":"Mm-vid: Advancing video understanding with gpt-4v (ision)","author":"Lin","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b41","series-title":"Can gpt-4v (ision) serve medical applications? case studies on gpt-4v for multimodal medical diagnosis","author":"Wu","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b42","series-title":"GPT4Vis: What can GPT-4 do for zero-shot visual recognition?","author":"Wu","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b43","doi-asserted-by":"crossref","unstructured":"N. Xu, W. Mao, Multisentinet: A deep semantic network for multimodal sentiment analysis, in: Proceedings of the 2017 ACM on Conference on Information and Knowledge Management, 2017, pp. 2399\u20132402.","DOI":"10.1145\/3132847.3133142"},{"key":"10.1016\/j.inffus.2024.102367_b44","article-title":"Multimodal sentiment analysis with image-text interaction network","author":"Zhu","year":"2022","journal-title":"IEEE Trans. Multimed."},{"issue":"4","key":"10.1016\/j.inffus.2024.102367_b45","doi-asserted-by":"crossref","first-page":"1868","DOI":"10.1109\/TAFFC.2022.3197761","article-title":"Disentangling identity and pose for facial expression recognition","volume":"13","author":"Jiang","year":"2022","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.inffus.2024.102367_b46","doi-asserted-by":"crossref","first-page":"249","DOI":"10.1109\/TIP.2020.3035042","article-title":"Joint local and global information learning with single apex frame detection for micro-expression recognition","volume":"30","author":"Li","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.inffus.2024.102367_b47","doi-asserted-by":"crossref","unstructured":"Y. Wang, Y. Sun, W. Song, S. Gao, Y. Huang, Z. Chen, W. Ge, W. Zhang, Dpcnet: Dual path multi-excitation collaborative network for facial expression representation learning in videos, in: Proceedings of the 30th ACM International Conference on Multimedia, 2022, pp. 
101\u2013110.","DOI":"10.1145\/3503161.3547865"},{"key":"10.1016\/j.inffus.2024.102367_b48","series-title":"From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos","author":"Chen","year":"2023"},{"issue":"4","key":"10.1016\/j.inffus.2024.102367_b49","doi-asserted-by":"crossref","first-page":"1738","DOI":"10.1121\/1.399423","article-title":"Perceptual linear predictive (PLP) analysis of speech","volume":"87","author":"Hermansky","year":"1990","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.inffus.2024.102367_b50","doi-asserted-by":"crossref","unstructured":"S. Zhao, Y. Gao, X. Jiang, H. Yao, T.-S. Chua, X. Sun, Exploring principles-of-art features for image emotion recognition, in: Proceedings of the 22nd ACM International Conference on Multimedia, 2014, pp. 47\u201356.","DOI":"10.1145\/2647868.2654930"},{"key":"10.1016\/j.inffus.2024.102367_b51","series-title":"Deepsentibank: Visual sentiment concept classification with deep convolutional neural networks","author":"Chen","year":"2014"},{"key":"10.1016\/j.inffus.2024.102367_b52","unstructured":"K. Simonyan, A. Zisserman, Very deep convolutional networks for large-scale image recognition, in: Proceedings of the International Conference on Learning Representations, ICLR, 2015, pp. 1\u201314."},{"issue":"9","key":"10.1016\/j.inffus.2024.102367_b53","doi-asserted-by":"crossref","first-page":"2513","DOI":"10.1109\/TMM.2018.2803520","article-title":"Visual sentiment prediction based on automatic discovery of affective regions","volume":"20","author":"Yang","year":"2018","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.inffus.2024.102367_b54","series-title":"Computer Vision\u2013ACCV 2014: 12th Asian Conference on Computer Vision, Singapore, Singapore, November 1-5, 2014, Revised Selected Papers, Part I 12","first-page":"525","article-title":"Lbp with six intersection points: Reducing redundant information in lbp-top for micro-expression recognition","author":"Wang","year":"2015"},{"issue":"3","key":"10.1016\/j.inffus.2024.102367_b55","doi-asserted-by":"crossref","first-page":"626","DOI":"10.1109\/TMM.2019.2931351","article-title":"Spatiotemporal recurrent convolutional networks for recognizing spontaneous micro-expressions","volume":"22","author":"Xia","year":"2019","journal-title":"IEEE Trans. 
Multimed."},{"key":"10.1016\/j.inffus.2024.102367_b56","series-title":"2018 25th IEEE International Conference on Image Processing","first-page":"3094","article-title":"Can micro-expression be recognized based on single apex frame?","author":"Li","year":"2018"},{"key":"10.1016\/j.inffus.2024.102367_b57","doi-asserted-by":"crossref","first-page":"184537","DOI":"10.1109\/ACCESS.2019.2960629","article-title":"Recognizing spontaneous micro-expression using a three-stream convolutional neural network","volume":"7","author":"Song","year":"2019","journal-title":"IEEE Access"},{"key":"10.1016\/j.inffus.2024.102367_b58","series-title":"Natural Language Processing and Chinese Computing: 4th CCF Conference, NLPCC 2015, Nanchang, China, October 9-13, 2015, Proceedings 4","first-page":"159","article-title":"Convolutional neural networks for multimedia sentiment analysis","author":"Cai","year":"2015"},{"issue":"41","key":"10.1016\/j.inffus.2024.102367_b59","first-page":"1","article-title":"Visual and textual sentiment analysis of a microblog using deep convolutional neural networks","volume":"9","author":"Yu","year":"2016","journal-title":"Algorithms"},{"key":"10.1016\/j.inffus.2024.102367_b60","doi-asserted-by":"crossref","unstructured":"N. Xu, W. Mao, G. Chen, A co-memory network for multimodal sentiment analysis, in: The 41st International ACM SIGIR Conference on Research & Development in Information Retrieval, 2018, pp. 929\u2013932.","DOI":"10.1145\/3209978.3210093"},{"key":"10.1016\/j.inffus.2024.102367_b61","doi-asserted-by":"crossref","first-page":"4014","DOI":"10.1109\/TMM.2020.3035277","article-title":"Image-text multimodal emotion classification via multi-view attentional network","volume":"23","author":"Yang","year":"2020","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.inffus.2024.102367_b62","unstructured":"Y.-H.H. Tsai, P.P. Liang, A. Zadeh, L.-P. Morency, R. Salakhutdinov, Learning factorized multimodal representations, in: Proceedings of the 7th International Conference on Learning Representations, 2019, pp. 1\u201320."},{"key":"10.1016\/j.inffus.2024.102367_b63","doi-asserted-by":"crossref","unstructured":"D. Hazarika, R. Zimmermann, S. Poria, MISA: Modality-Invariant and-Specific Representations for Multimodal Sentiment Analysis, in: Proceedings of the 28th ACM International Conference on Multimedia, 2020, pp. 1122\u20131131.","DOI":"10.1145\/3394171.3413678"},{"key":"10.1016\/j.inffus.2024.102367_b64","doi-asserted-by":"crossref","unstructured":"A. Zadeh, P.P. Liang, N. Mazumder, S. Poria, E. Cambria, L.-P. Morency, Memory fusion network for multi-view sequential learning, in: Proceedings of the AAAI Conference on Artificial Intelligence, 2018, pp. 5634\u20135641.","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"10.1016\/j.inffus.2024.102367_b65","doi-asserted-by":"crossref","unstructured":"W. Han, H. Chen, S. Poria, Improving Multimodal Fusion with Hierarchical Mutual Information Maximization for Multimodal Sentiment Analysis, in: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, 2021, pp. 9180\u20139192.","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"10.1016\/j.inffus.2024.102367_b66","doi-asserted-by":"crossref","unstructured":"Y.-H.H. Tsai, S. Bai, P.P. Liang, J.Z. Kolter, L.-P. Morency, R. Salakhutdinov, Multimodal Transformer for Unaligned Multimodal Language Sequences, in: Proceedings of the 57th Conference of the Association for Computational Linguistics, 2019, pp. 
6558\u20136569.","DOI":"10.18653\/v1\/P19-1656"},{"key":"10.1016\/j.inffus.2024.102367_b67","series-title":"ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"4087","article-title":"Cross-VAE: Towards disentangling expression from identity for human faces","author":"Wu","year":"2020"},{"key":"10.1016\/j.inffus.2024.102367_b68","series-title":"2017 12th IEEE International Conference on Automatic Face & Gesture Recognition","first-page":"558","article-title":"Identity-aware convolutional neural network for facial expression recognition","author":"Meng","year":"2017"},{"key":"10.1016\/j.inffus.2024.102367_b69","doi-asserted-by":"crossref","unstructured":"K. Wang, X. Peng, J. Yang, S. Lu, Y. Qiao, Suppressing uncertainties for large-scale facial expression recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6897\u20136906.","DOI":"10.1109\/CVPR42600.2020.00693"},{"key":"10.1016\/j.inffus.2024.102367_b70","doi-asserted-by":"crossref","first-page":"4057","DOI":"10.1109\/TIP.2019.2956143","article-title":"Region attention networks for pose and occlusion robust facial expression recognition","volume":"29","author":"Wang","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.inffus.2024.102367_b71","series-title":"2018 13th IEEE International Conference on Automatic Face & Gesture Recognition","first-page":"294","article-title":"Identity-adaptive facial expression recognition through expression regeneration using conditional generative adversarial networks","author":"Yang","year":"2018"},{"key":"10.1016\/j.inffus.2024.102367_b72","doi-asserted-by":"crossref","unstructured":"Z. Zhao, Q. Liu, F. Zhou, Robust lightweight facial expression recognition network with label distribution training, in: Proceedings of the AAAI Conference on Artificial Intelligence, 2021, pp. 3510\u20133519.","DOI":"10.1609\/aaai.v35i4.16465"},{"key":"10.1016\/j.inffus.2024.102367_b73","doi-asserted-by":"crossref","first-page":"6544","DOI":"10.1109\/TIP.2021.3093397","article-title":"Learning deep global multi-scale and local attention features for facial expression recognition in the wild","volume":"30","author":"Zhao","year":"2021","journal-title":"IEEE Trans. 
Image Process."},{"key":"10.1016\/j.inffus.2024.102367_b74","series-title":"2019 IEEE International Conference on Image Processing","first-page":"31","article-title":"Disentangled feature based adversarial learning for facial expression recognition","author":"Bai","year":"2019"},{"key":"10.1016\/j.inffus.2024.102367_b75","doi-asserted-by":"crossref","first-page":"108906","DOI":"10.1109\/ACCESS.2019.2930359","article-title":"Cross-domain facial expression recognition based on transductive deep transfer learning","volume":"7","author":"Yan","year":"2019","journal-title":"IEEE Access"},{"key":"10.1016\/j.inffus.2024.102367_b76","series-title":"European Conference on Computer Vision","first-page":"418","article-title":"Learn from all: Erasing attention consistency for noisy label facial expression recognition","author":"Zhang","year":"2022"},{"key":"10.1016\/j.inffus.2024.102367_b77","series-title":"Learning to amend facial expression representation via de-albino and affinity","author":"Shi","year":"2021"},{"key":"10.1016\/j.inffus.2024.102367_b78","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.patcog.2018.11.001","article-title":"Hard negative generation for identity-disentangled facial expression recognition","volume":"88","author":"Liu","year":"2019","journal-title":"Pattern Recognit."},{"issue":"2","key":"10.1016\/j.inffus.2024.102367_b79","doi-asserted-by":"crossref","first-page":"199","DOI":"10.3390\/biomimetics8020199","article-title":"Distract your attention: Multi-head cross attention network for facial expression recognition","volume":"8","author":"Wen","year":"2023","journal-title":"Biomimetics"},{"key":"10.1016\/j.inffus.2024.102367_b80","doi-asserted-by":"crossref","unstructured":"F. Xue, Q. Wang, G. Guo, Transfer: Learning relation-aware facial expression representations with transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 3601\u20133610.","DOI":"10.1109\/ICCV48922.2021.00358"},{"key":"10.1016\/j.inffus.2024.102367_b81","doi-asserted-by":"crossref","first-page":"2016","DOI":"10.1109\/TIP.2021.3049955","article-title":"Adaptively learning facial expression representation via cf labels and distillation","volume":"30","author":"Li","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.inffus.2024.102367_b82","series-title":"2020 25th International Conference on Pattern Recognition","first-page":"9460","article-title":"Facial expression recognition by using a disentangled identity-invariant expression representation","author":"Ali","year":"2021"},{"key":"10.1016\/j.inffus.2024.102367_b83","doi-asserted-by":"crossref","unstructured":"C. Zheng, M. Mendieta, C. Chen, Poster: A pyramid cross-fusion transformer network for facial expression recognition, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 3146\u20133155.","DOI":"10.1109\/ICCVW60793.2023.00339"},{"issue":"3\u20134","key":"10.1016\/j.inffus.2024.102367_b84","first-page":"1","article-title":"FaceCaps for facial expression recognition","volume":"32","author":"Wu","year":"2021","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"10.1016\/j.inffus.2024.102367_b85","series-title":"POSTER V2: A simpler and stronger facial expression recognition network","author":"Mao","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b86","doi-asserted-by":"crossref","unstructured":"D. Tran, L. Bourdev, R. Fergus, L. Torresani, M. 
Paluri, Learning spatiotemporal features with 3d convolutional networks, in: Proceedings of the IEEE International Conference on Computer Vision, 2015, pp. 4489\u20134497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"10.1016\/j.inffus.2024.102367_b87","doi-asserted-by":"crossref","unstructured":"H. Wang, B. Li, S. Wu, S. Shen, F. Liu, S. Ding, A. Zhou, Rethinking the Learning Paradigm for Dynamic Facial Expression Recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 17958\u201317968.","DOI":"10.1109\/CVPR52729.2023.01722"},{"key":"10.1016\/j.inffus.2024.102367_b88","series-title":"2019 8th International Conference on Affective Computing and Intelligent Interaction","first-page":"552","article-title":"Multimodal and temporal perception of audio-visual cues for emotion recognition","author":"Ghaleb","year":"2019"},{"key":"10.1016\/j.inffus.2024.102367_b89","doi-asserted-by":"crossref","first-page":"48807","DOI":"10.1109\/ACCESS.2019.2907271","article-title":"A deep spatial and temporal aggregation framework for video-based facial expression recognition","volume":"7","author":"Pan","year":"2019","journal-title":"IEEE Access"},{"key":"10.1016\/j.inffus.2024.102367_b90","series-title":"Spatio-temporal transformer for dynamic facial expression recognition in the wild","author":"Ma","year":"2022"},{"key":"10.1016\/j.inffus.2024.102367_b91","series-title":"Msaf: Multimodal split attention fusion","author":"Su","year":"2020"},{"key":"10.1016\/j.inffus.2024.102367_b92","doi-asserted-by":"crossref","first-page":"7381","DOI":"10.1007\/s00521-020-05557-4","article-title":"Enhanced convolutional LSTM with spatial and temporal skip connections and temporal gates for facial expression recognition from video","volume":"33","author":"Miyoshi","year":"2021","journal-title":"Neural Comput. Appl."},{"key":"10.1016\/j.inffus.2024.102367_b93","doi-asserted-by":"crossref","unstructured":"H. Li, H. Niu, Z. Zhu, F. Zhao, Intensity-aware loss for dynamic facial expression recognition in the wild, in: Proceedings of the AAAI Conference on Artificial Intelligence, 2023, pp. 67\u201375.","DOI":"10.1609\/aaai.v37i1.25077"},{"key":"10.1016\/j.inffus.2024.102367_b94","series-title":"2019 IEEE International Conference on Image Processing","first-page":"3866","article-title":"Frame attention networks for facial expression recognition in videos","author":"Meng","year":"2019"},{"key":"10.1016\/j.inffus.2024.102367_b95","series-title":"SVFAP: Self-supervised video facial affect perceiver","author":"Sun","year":"2023"},{"key":"10.1016\/j.inffus.2024.102367_b96","doi-asserted-by":"crossref","unstructured":"L. Sun, Z. Lian, B. Liu, J. Tao, MAE-DFER: Efficient Masked Autoencoder for Self-supervised Dynamic Facial Expression Recognition, in: Proceedings of the 31st ACM International Conference on Multimedia, 2023, pp. 6110\u20136121.","DOI":"10.1145\/3581783.3612365"},{"key":"10.1016\/j.inffus.2024.102367_b97","article-title":"Spatial-temporal graphs plus transformers for geometry-guided facial expression recognition","author":"Zhao","year":"2022","journal-title":"IEEE Trans. Affect. 
Comput."},{"key":"10.1016\/j.inffus.2024.102367_b98","series-title":"A cross-modal fusion network based on self-attention and residual structure for multimodal emotion recognition","author":"Fu","year":"2021"},{"issue":"10","key":"10.1016\/j.inffus.2024.102367_b99","doi-asserted-by":"crossref","first-page":"1175","DOI":"10.1109\/34.954607","article-title":"Toward machine emotional intelligence: Analysis of affective physiological state","volume":"23","author":"Picard","year":"2001","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.inffus.2024.102367_b100","article-title":"The biases of pre-trained language models: An empirical study on prompt-based sentiment analysis and emotion detection","author":"Mao","year":"2022","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.inffus.2024.102367_b101","series-title":"Explainable multimodal emotion reasoning","author":"Lian","year":"2023"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253524001453?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253524001453?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,5,8]],"date-time":"2024-05-08T20:42:33Z","timestamp":1715200953000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1566253524001453"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8]]},"references-count":101,"alternative-id":["S1566253524001453"],"URL":"https:\/\/doi.org\/10.1016\/j.inffus.2024.102367","relation":{},"ISSN":["1566-2535"],"issn-type":[{"value":"1566-2535","type":"print"}],"subject":[],"published":{"date-parts":[[2024,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"GPT-4V with emotion: A zero-shot benchmark for Generalized Emotion Recognition","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2024.102367","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"102367"}}
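This payload has the shape of a Crossref REST API `/works/{DOI}` response: a `status`/`message-type` envelope wrapping the work metadata (title, authors, journal, license, funder, and the `reference` array). As a hedged illustration only, the sketch below re-fetches the same record from the public Crossref endpoint and reads a few of the fields visible above; the `requests` dependency, network access, and the variable names are assumptions for the example, not part of the record.

```python
# Minimal sketch: fetch this Crossref work record and read a few fields.
# Assumptions: the "requests" package is installed and the network is reachable.
import requests

DOI = "10.1016/j.inffus.2024.102367"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()

work = resp.json()["message"]            # Crossref wraps the record in "message"
print(work["title"][0])                  # article title
print(work["container-title"][0])        # journal title (Information Fusion)
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
print("reference count:", work["reference-count"])
```

Fields such as `title`, `container-title`, and `author` are lists in the Crossref schema, which is why the example indexes the first element or iterates over the author dictionaries.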