{"id":"https://openalex.org/W4400435090","doi":"https://doi.org/10.48550/arxiv.2407.04622","title":"On scalable oversight with weak LLMs judging strong LLMs","display_name":"On scalable oversight with weak LLMs judging strong LLMs","publication_year":2024,"publication_date":"2024-07-05","ids":{"openalex":"https://openalex.org/W4400435090","doi":"https://doi.org/10.48550/arxiv.2407.04622"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2407.04622","pdf_url":"https://arxiv.org/pdf/2407.04622","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.04622","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069438696","display_name":"Zachary Kenton","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kenton, Zachary","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258054","display_name":"Noah Y. Siegel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siegel, Noah Y.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027979882","display_name":"J\u00e1nos Kram\u00e1r","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kram\u00e1r, J\u00e1nos","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258055","display_name":"Jonah Brown-Cohen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brown-Cohen, Jonah","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258056","display_name":"Samuel Albanie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Albanie, Samuel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258057","display_name":"Jannis Bulian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bulian, Jannis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258058","display_name":"Rishabh Agarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agarwal, Rishabh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059597480","display_name":"David Lindner","orcid":"https://orcid.org/0000-0001-7051-7433"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindner, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258059","display_name":"Yunhao Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Yunhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100258053","display_name":"Noah D. Goodman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goodman, Noah D.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5012971694","display_name":"Rohin Shah","orcid":"https://orcid.org/0000-0002-0656-2800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Rohin","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.999954,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":84,"max":93},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9305,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9305,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11182","display_name":"Auction Theory and Applications","score":0.9271,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10927","display_name":"Access Control and Trust","score":0.9071,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.37640172},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.35674578}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2407.04622","pdf_url":"https://arxiv.org/pdf/2407.04622","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2407.04622","pdf_url":"https://arxiv.org/pdf/2407.04622","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W594353338","https://openalex.org/W4391375266","https://openalex.org/W4390697879","https://openalex.org/W2949263084","https://openalex.org/W2922049016","https://openalex.org/W2765153054","https://openalex.org/W2748952813","https://openalex.org/W2743539335","https://openalex.org/W2596173151","https://openalex.org/W2070214669"],"abstract_inverted_index":{"Scalable":[0],"oversight":[1],"protocols":[2],"aim":[3],"to":[4,7,22,32,41,76,106,131,139,180,187,192],"enable":[5],"humans":[6],"accurately":[8],"supervise":[9],"superhuman":[10],"AI.":[11,55],"In":[12],"this":[13],"paper":[14],"we":[15,184,195,212],"study":[16],"debate,":[17],"where":[18,27,47],"two":[19],"AI's":[20],"compete":[21],"convince":[23,33],"a":[24,28,34,42,85,98],"judge;":[25],"consultancy,":[26],"single":[29,99],"AI":[30,64],"tries":[31],"judge":[35,49,74,219],"that":[36,118,214],"asks":[37],"questions;":[38],"and":[39,66,92,112],"compare":[40],"baseline":[43],"of":[44,88,149],"direct":[45,140,160],"question-answering,":[46],"the":[48,54,73,126,134,143,147,170,203],"just":[50],"answers":[51],"outright":[52],"without":[53,167],"We":[56,82,116],"use":[57],"large":[58],"language":[59],"models":[60,75,217],"(LLMs)":[61],"as":[62,67],"both":[63],"agents":[65],"stand-ins":[68],"for":[69,133],"human":[70],"judges,":[71],"taking":[72],"be":[77],"weaker":[78],"than":[79,208,224],"agent":[80],"models.":[81],"benchmark":[83],"on":[84,97,146],"diverse":[86],"range":[87],"asymmetries":[89],"between":[90],"judges":[91,197],"agents,":[93],"extending":[94],"previous":[95,226],"work":[96,175],"extractive":[100,152],"QA":[101,153],"task":[102],"with":[103,155],"information":[104,156,168],"asymmetry,":[105],"also":[107],"include":[108],"mathematics,":[109],"coding,":[110],"logic":[111],"multimodal":[113],"reasoning":[114],"asymmetries.":[115],"find":[117,196,213],"debate":[119,138,158,207],"outperforms":[120,159],"consultancy":[121],"across":[122],"all":[123],"tasks":[124,154,166],"when":[125],"consultant":[127],"is":[128],"randomly":[129],"assigned":[130,176],"argue":[132,181,193],"correct/incorrect":[135],"answer.":[136],"Comparing":[137],"question":[141,161],"answering,":[142,162],"results":[144,171],"depend":[145],"type":[148],"task:":[150],"in":[151,164,206,209,225],"asymmetry":[157,169],"but":[163],"other":[165],"are":[172,198],"mixed.":[173],"Previous":[174],"debaters/consultants":[177],"an":[178],"answer":[179,191,205],"for.":[182],"When":[183],"allow":[185],"them":[186],"instead":[188],"choose":[189],"which":[190],"for,":[194],"less":[199],"frequently":[200],"convinced":[201],"by":[202],"wrong":[204],"consultancy.":[210],"Further,":[211],"stronger":[215],"debater":[216],"increase":[218],"accuracy,":[220],"though":[221],"more":[222],"modestly":[223],"studies.":[227]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4400435090","counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2024-12-10T14:04:35.189731","created_date":"2024-07-09"}