{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T05:04:27Z","timestamp":1730783067929,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62336008"],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680669","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"10449-10458","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["SpeechEE: A Novel Benchmark for Speech Event Extraction"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0009-0006-4060-3123","authenticated-orcid":false,"given":"Bin","family":"Wang","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-6335-1340","authenticated-orcid":false,"given":"Meishan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-3026-6347","authenticated-orcid":false,"given":"Hao","family":"Fei","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-8714-4151","authenticated-orcid":false,"given":"Yu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-0513-5540","authenticated-orcid":false,"given":"Bobo","family":"Li","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-6192-1194","authenticated-orcid":false,"given":"Shengqiong","family":"Wu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-8106-9768","authenticated-orcid":false,"given":"Wei","family":"Ji","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-3895-5510","authenticated-orcid":false,"given":"Min","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"volume-title":"Weinstein","year":"1992","author":"Andersen Peggy M.","key":"e_1_3_2_1_1_1","unstructured":"Peggy M. Andersen, Philip J. Hayes, Alison K. Huettner, Linda M. Schmandt, Irene B. Nirenburg, and Steven P. Weinstein. 1992. Automatic extraction of facts from press releases to generate news stories. In Proceedings of the ANLC. 170--177."},{"volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems","year":"2020","author":"Baevski Alexei","key":"e_1_3_2_1_2_1","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems, Vol. 33 (2020), 12449--12460."},{"volume-title":"Proceedings of the COLING. 1953--1964","year":"2022","author":"Cao Hu","key":"e_1_3_2_1_3_1","unstructured":"Hu Cao, Jingye Li, Fangfang Su, Fei Li, Hao Fei, Shengqiong Wu, Bobo Li, Liang Zhao, and Donghong Ji. 2022. OneEE: A One-Stage Framework for Fast Overlapping and Nested Event Extraction. In Proceedings of the COLING. 1953--1964."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.8"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1017"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461939"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2765834"},{"volume-title":"Weischedel","year":"2004","author":"Doddington George R.","key":"e_1_3_2_1_8_1","unstructured":"George R. Doddington, Alexis Mitchell, Mark A. Przybocki, Lance A. Ramshaw, Stephanie M. Strassel, and Ralph M. Weischedel. 2004. The Automatic Content Extraction (ACE) Program - Tasks, Data, and Evaluation. In Proceedings of the LREC. European Language Resources Association."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.718"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.329"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00730"},{"volume-title":"Proceedings of the International Conference on Machine Learning.","year":"2024","author":"Fei Hao","key":"e_1_3_2_1_12_1","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, and Wynne Hsu. 2024. Video-of-thought: Step-by-step video reasoning from perception to cognition. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_2_1_13_1","first-page":"15460","article-title":"Lasuie: Unifying information extraction with latent adaptive structure-aware generative language model","volume":"35","author":"Fei Hao","year":"2022","unstructured":"Hao Fei, Shengqiong Wu, Jingye Li, Bobo Li, Fei Li, Libo Qin, Meishan Zhang, Min Zhang, and Tat-Seng Chua. 2022. Lasuie: Unifying information extraction with latent adaptive structure-aware generative language model. Advances in Neural Information Processing Systems, Vol. 35 (2022), 15460--15475.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Proceedings of the International Conference on Machine Learning. 6373--6391","year":"2022","author":"Fei Hao","key":"e_1_3_2_1_14_1","unstructured":"Hao Fei, Shengqiong Wu, Yafeng Ren, and Meishan Zhang. 2022. Matching structure for dual learning. In Proceedings of the International Conference on Machine Learning. 6373--6391."},{"volume-title":"VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing.","year":"2024","author":"Fei Hao","key":"e_1_3_2_1_15_1","unstructured":"Hao Fei, Shengqiong Wu, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing. (2024)."},{"volume-title":"2024 d. Enhancing video-language representations with structural spatio-temporal alignment","year":"2024","author":"Fei Hao","key":"e_1_3_2_1_16_1","unstructured":"Hao Fei, Shengqiong Wu, Meishan Zhang, Min Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024 d. Enhancing video-language representations with structural spatio-temporal alignment. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-2011"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639513"},{"volume-title":"Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415","year":"2016","author":"Hendrycks Dan","key":"e_1_3_2_1_19_1","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.138"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"volume-title":"Voss","year":"2018","author":"Huang Lifu","key":"e_1_3_2_1_22_1","unstructured":"Lifu Huang, Heng Ji, Kyunghyun Cho, Ido Dagan, Sebastian Riedel, and Clare R. Voss. 2018. Zero-Shot Transfer Learning for Event Extraction. In Proceedings of the ACL. 2160--2170."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3183434"},{"volume-title":"Proceedings of the NeurIPS.","year":"2020","author":"Khosla Prannay","key":"e_1_3_2_1_25_1","unstructured":"Prannay Khosla, Piotr Teterwak, Chen Wang, Aaron Sarna, Yonglong Tian, Phillip Isola, Aaron Maschinot, Ce Liu, and Dilip Krishnan. 2020. Supervised Contrastive Learning. In Proceedings of the NeurIPS."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btg1023"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612053"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.73"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21344"},{"volume-title":"Fine-grained semantically aligned vision-language pre-training. Advances in neural information processing systems","year":"2022","author":"Li Juncheng","key":"e_1_3_2_1_30_1","unstructured":"Juncheng Li, Xin He, Longhui Wei, Long Qian, Linchao Zhu, Lingxi Xie, Yueting Zhuang, Qi Tian, and Siliang Tang. 2022. Fine-grained semantically aligned vision-language pre-training. Advances in neural information processing systems, Vol. 35 (2022), 7290--7303."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3274139"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.230"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.69"},{"volume-title":"Proceedings of the ACL. 789--797","year":"2010","author":"Liao Shasha","key":"e_1_3_2_1_34_1","unstructured":"Shasha Liao and Ralph Grishman. 2010. Using Document Level Cross-Event Inference to Improve Event Extraction. In Proceedings of the ACL. 789--797."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.713"},{"volume-title":"Proceedings of the ICLR.","year":"2019","author":"Loshchilov Ilya","key":"e_1_3_2_1_36_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In Proceedings of the ICLR."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.217"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10231"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645677"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1034"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-2060"},{"volume-title":"Proceedings of the ICML. 28492--28518","year":"2023","author":"Radford Alec","key":"e_1_3_2_1_42_1","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine Mcleavey, and Ilya Sutskever. 2023. Robust Speech Recognition via Large-Scale Weak Supervision. In Proceedings of the ICML. 28492--28518."},{"volume-title":"A Novel Global Feature-Oriented Relational Triple Extraction Model based on Table Filling. CoRR","year":"2021","author":"Ren Feiliang","key":"e_1_3_2_1_43_1","unstructured":"Feiliang Ren, Longhui Zhang, Shujuan Yin, Xiaofeng Zhao, Shilei Liu, Bochao Li, and Yaduo Liu. 2021. A Novel Global Feature-Oriented Relational Triple Extraction Model based on Table Filling. CoRR, Vol. abs\/2109.06705 (2021)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498409"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6401"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746137"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.376"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461883"},{"key":"e_1_3_2_1_50_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1585"},{"volume-title":"ACE 2005 Multilingual Training Corpus. LDC2006T06","year":"2006","author":"Walker Christopher","key":"e_1_3_2_1_52_1","unstructured":"Christopher Walker, Stephanie Strassel, Julie Medero, and Kazuaki Maeda. 2006. ACE 2005 Multilingual Training Corpus. LDC2006T06. Web Download. Philadelphia: Linguistic Data Consortium (2006)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.823"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.146"},{"volume-title":"Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127","year":"2024","author":"Wu Shengqiong","key":"e_1_3_2_1_55_1","unstructured":"Shengqiong Wu, Hao Fei, Xiangtai Li, Jiayi Ji, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127 (2024)."},{"volume-title":"Proceedings of the International Conference on Machine Learning.","year":"2024","author":"Wu Shengqiong","key":"e_1_3_2_1_56_1","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024. NExT-GPT: Any-to-Any Multimodal LLM. In Proceedings of the International Conference on Machine Learning."},{"volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems. 79240--79259","year":"2023","author":"Wu Shengqiong","key":"e_1_3_2_1_57_1","unstructured":"Shengqiong Wu, Hao Fei, Hanwang Zhang, and Tat-Seng Chua. 2023. Imagine that! abstract-to-intricate text-to-image synthesis with scene graph hallucination diffusion. In Proceedings of the 37th International Conference on Neural Information Processing Systems. 79240--79259."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.738"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3074014"},{"volume-title":"Mitchell","year":"2016","author":"Yang Bishan","key":"e_1_3_2_1_60_1","unstructured":"Bishan Yang and Tom M. Mitchell. 2016. Joint Extraction of Events and Entities within a Document Context. In Proceedings of the NAACL. 289--299."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1522"},{"key":"e_1_3_2_1_62_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zhang Ao","year":"2024","unstructured":"Ao Zhang, Hao Fei, Yuan Yao, Wei Ji, Li Li, Zhiyuan Liu, and Tat-Seng Chua. 2024. Vpgtrans: Transfer visual prompt generator across llms. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"volume-title":"Recognizing Everything from All Modalities at Once: Grounded Multimodal Universal Information Extraction. arXiv preprint arXiv:2406.03701","year":"2024","author":"Zhang Meishan","key":"e_1_3_2_1_63_1","unstructured":"Meishan Zhang, Hao Fei, Bin Wang, Shengqiong Wu, Yixin Cao, Fei Li, and Min Zhang. 2024. Recognizing Everything from All Modalities at Once: Grounded Multimodal Universal Information Extraction. arXiv preprint arXiv:2406.03701 (2024)."},{"volume-title":"Chen Change Loy, and Shuicheng Yan","year":"2024","author":"Zhang Tao","key":"e_1_3_2_1_64_1","unstructured":"Tao Zhang, Xiangtai Li, Hao Fei, Haobo Yuan, Shengqiong Wu, Shunping Ji, Chen Change Loy, and Shuicheng Yan. 2024. Omg-llava: Bridging image-level, object-level, pixel-level reasoning and understanding. arXiv preprint arXiv:2406.19389 (2024)."},{"volume-title":"End-to-end contextual asr based on posterior distribution adaptation for hybrid ctc\/attention system. CoRR","year":"2022","author":"Zhang Zhengyi","key":"e_1_3_2_1_65_1","unstructured":"Zhengyi Zhang and Pan Zhou. 2022. End-to-end contextual asr based on posterior distribution adaptation for hybrid ctc\/attention system. CoRR, Vol. abs\/2202.09003 (2022)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612096"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680669","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T05:27:24Z","timestamp":1730698044000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680669"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":66,"alternative-id":["10.1145\/3664647.3680669","10.1145\/3664647"],"URL":"http:\/\/dx.doi.org\/10.1145\/3664647.3680669","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}