{"id":"https://openalex.org/W4399795520","doi":"https://doi.org/10.48550/arxiv.2406.11833","title":"MMDU: A Multi-Turn Multi-Image Dialog Understanding Benchmark and\n Instruction-Tuning Dataset for LVLMs","display_name":"MMDU: A Multi-Turn Multi-Image Dialog Understanding Benchmark and\n Instruction-Tuning Dataset for LVLMs","publication_year":2024,"publication_date":"2024-06-17","ids":{"openalex":"https://openalex.org/W4399795520","doi":"https://doi.org/10.48550/arxiv.2406.11833"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.11833","pdf_url":"https://arxiv.org/pdf/2406.11833","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.11833","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100350070","display_name":"Ziyu Liu","orcid":"https://orcid.org/0009-0005-4103-7824"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059463562","display_name":"Tao Chu","orcid":"https://orcid.org/0000-0002-3190-7452"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chu, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005200501","display_name":"Yuhang Zang","orcid":"https://orcid.org/0000-0003-1110-5062"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zang, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113244350","display_name":"Xilin Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Xilin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055238399","display_name":"Xiaoyi Dong","orcid":"https://orcid.org/0000-0002-4654-835X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Xiaoyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100406335","display_name":"Pan Zhang","orcid":"https://orcid.org/0000-0001-8496-2730"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Pan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065261561","display_name":"Zijian Liang","orcid":"https://orcid.org/0000-0002-0195-571X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Zijian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005925571","display_name":"Yuanjun Xiong","orcid":"https://orcid.org/0000-0002-6391-4921"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Yuanjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100748135","display_name":"Yu Qiao","orcid":"https://orcid.org/0000-0002-1889-2567"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010087030","display_name":"Dahua Lin","orcid":"https://orcid.org/0000-0002-8865-7896"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Dahua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100365340","display_name":"Jiaqi Wang","orcid":"https://orcid.org/0000-0001-6877-5353"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jiaqi","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":86},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Visual Question Answering in Images and Videos","score":0.9797,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Visual Question Answering in Images and Videos","score":0.9797,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Statistical Machine Translation and Natural Language Processing","score":0.9227,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.79887474},{"id":"https://openalex.org/keywords/multilingual-neural-machine-translation","display_name":"Multilingual Neural Machine Translation","score":0.564789},{"id":"https://openalex.org/keywords/multimodal-fusion","display_name":"Multimodal Fusion","score":0.549673},{"id":"https://openalex.org/keywords/visual-question-answering","display_name":"Visual Question Answering","score":0.549177},{"id":"https://openalex.org/keywords/image-captioning","display_name":"Image Captioning","score":0.544093},{"id":"https://openalex.org/keywords/language-understanding","display_name":"Language Understanding","score":0.536962}],"concepts":[{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.9225278},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.79887474},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6840899},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.62595785},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.50789744},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3634847},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36287418},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.14042026},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.101480246},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.07501602}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.11833","pdf_url":"https://arxiv.org/pdf/2406.11833","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.11833","pdf_url":"https://arxiv.org/pdf/2406.11833","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4233992201","https://openalex.org/W2417260800","https://openalex.org/W2368721880","https://openalex.org/W2292950558","https://openalex.org/W2283130723","https://openalex.org/W2117933979","https://openalex.org/W2104718772","https://openalex.org/W2098987383","https://openalex.org/W1596203174","https://openalex.org/W103938586"],"abstract_inverted_index":{"Generating":[0],"natural":[1],"and":[2,52,86,96,102,115,122,147,159,203,207,212,233],"meaningful":[3],"responses":[4],"to":[5,94,110,162,183],"communicate":[6],"with":[7,50,130],"multi-modal":[8],"human":[9,128],"inputs":[10],"is":[11,151,239],"a":[12,46,83,88,139],"fundamental":[13],"capability":[14],"of":[15,72,133,141,168],"Large":[16],"Vision-Language":[17],"Models(LVLMs).":[18],"While":[19],"current":[20,163,230],"open-source":[21,120,176,193],"LVLMs":[22,73,171,177,194],"demonstrate":[23,190],"promising":[24],"performance":[25],"in":[26,37,45,74,100],"simplified":[27],"scenarios":[28,40],"such":[29,41],"as":[30,42],"single-turn":[31],"single-image":[32],"input,":[33],"they":[34],"fall":[35],"short":[36],"real-world":[38,75,234],"conversation":[39],"following":[43],"instructions":[44],"long":[47],"context":[48],"history":[49],"multi-turn":[51,101],"multi-images.":[53],"Existing":[54],"LVLM":[55,231],"benchmarks":[56,158,214],"primarily":[57],"focus":[58],"on":[59,195,210],"single-choice":[60],"questions":[61],"or":[62],"short-form":[63],"responses,":[64],"which":[65,150],"do":[66],"not":[67],"adequately":[68],"assess":[69],"the":[70,107,112,119,124,131,134,223,227],"capabilities":[71],"human-AI":[76],"interaction":[77],"applications.":[78],"Therefore,":[79],"we":[80],"introduce":[81],"MMDU,":[82],"comprehensive":[84],"benchmark,":[85],"MMDU-45k,":[87],"large-scale":[89],"instruction":[90,186],"tuning":[91,187],"dataset,":[92],"designed":[93],"evaluate":[95],"improve":[97],"LVLMs'":[98],"abilities":[99],"multi-image":[103],"conversations.":[104],"We":[105,189],"employ":[106],"clustering":[108],"algorithm":[109],"ffnd":[111],"relevant":[113],"images":[114],"textual":[116],"descriptions":[117],"from":[118],"Wikipedia":[121],"construct":[123],"question-answer":[125],"pairs":[126],"by":[127],"annotators":[129],"assistance":[132],"GPT-4o":[135],"model.":[136],"MMDU":[137,173,211],"has":[138],"maximum":[140],"18k":[142],"image+text":[143],"tokens,":[144],"20":[145],"images,":[146],"27":[148],"turns,":[149],"at":[152,241],"least":[153],"5x":[154],"longer":[155,202],"than":[156],"previous":[157],"poses":[160],"challenges":[161],"LVLMs.":[164],"Our":[165,220],"in-depth":[166],"analysis":[167],"15":[169],"representative":[170],"using":[172],"reveals":[174],"that":[175,191],"lag":[178],"behind":[179],"closed-source":[180],"counterparts":[181],"due":[182],"limited":[184],"conversational":[185],"data.":[188],"ffne-tuning":[192],"MMDU-45k":[196],"signiffcantly":[197],"address":[198],"this":[199],"gap,":[200],"generating":[201],"more":[204],"accurate":[205],"conversations,":[206],"improving":[208],"scores":[209],"existing":[213],"(MMStar:":[215],"+1.1%,":[216],"MathVista:":[217],"+1.5%,":[218],"ChartQA:+1.2%).":[219],"contributions":[221],"pave":[222],"way":[224],"for":[225],"bridging":[226],"gap":[228],"between":[229],"models":[232],"application":[235],"demands.":[236],"This":[237],"project":[238],"available":[240],"https://github.com/Liuziyu77/MMDU.":[242]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4399795520","counts_by_year":[],"updated_date":"2024-10-17T07:40:36.749388","created_date":"2024-06-19"}