{"id":"https://openalex.org/W4389364443","doi":"https://doi.org/10.48550/arxiv.2312.02119","title":"Tree of Attacks: Jailbreaking Black-Box LLMs Automatically","display_name":"Tree of Attacks: Jailbreaking Black-Box LLMs Automatically","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4389364443","doi":"https://doi.org/10.48550/arxiv.2312.02119"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2312.02119","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2312.02119","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065512381","display_name":"Anay Mehrotra","orcid":"https://orcid.org/0000-0002-8566-5452"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mehrotra, Anay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083461583","display_name":"Manolis Zampetakis","orcid":"https://orcid.org/0009-0005-4967-5927"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zampetakis, Manolis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021913606","display_name":"Paul Kassianik","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kassianik, Paul","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108249434","display_name":"Blaine Nelson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nelson, Blaine","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005817824","display_name":"Hyrum S. Anderson","orcid":"https://orcid.org/0009-0009-4720-6907"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anderson, Hyrum","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050591232","display_name":"Yaron Singer","orcid":"https://orcid.org/0000-0001-6811-9102"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singer, Yaron","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5020548562","display_name":"Amin Karbasi","orcid":"https://orcid.org/0000-0002-5898-0289"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Karbasi, Amin","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.787004,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":85,"max":89},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Natural Language Processing","score":0.9893,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Natural Language Processing","score":0.9893,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Statistical Machine Translation and Natural Language Processing","score":0.9821,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Empirical Studies in Software Engineering","score":0.9395,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.7942184},{"id":"https://openalex.org/keywords/tree","display_name":"Tree (set theory)","score":0.6527591},{"id":"https://openalex.org/keywords/topic-modeling","display_name":"Topic Modeling","score":0.523678}],"concepts":[{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.7942184},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.6527591},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.65067905},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4014483},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.114298254},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.08013287},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C6557445","wikidata":"https://www.wikidata.org/wiki/Q173113","display_name":"Agronomy","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2312.02119","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.02119","pdf_url":"http://arxiv.org/pdf/2312.02119","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2312.02119","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2312.02119","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Peace, justice, and strong institutions","score":0.81,"id":"https://metadata.un.org/sdg/16"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4285312748","https://openalex.org/W3203767529","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2382290278","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2157743338","https://openalex.org/W2001405890"],"abstract_inverted_index":{"While":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"display":[5],"versatile":[6],"functionality,":[7],"they":[8],"continue":[9],"to":[10,47,55,77,88,97,114],"generate":[11],"harmful,":[12],"biased,":[13],"and":[14,83,105,132],"toxic":[15],"content,":[16],"as":[17],"demonstrated":[18],"by":[19,157],"the":[20,48,67,71,78,85,108,115,139,166],"prevalence":[21],"of":[22,31,66,103,111,138,146,153],"human-designed":[23],"jailbreaks.":[24,91,173],"In":[25,117],"this":[26],"work,":[27],"we":[28,120],"present":[29],"Tree":[30],"Attacks":[32],"with":[33],"Pruning":[34],"(TAP),":[35],"an":[36,53],"automated":[37],"method":[38,170],"for":[39,134,171],"generating":[40,172],"jailbreaks":[41,70],"that":[42,122,126],"only":[43,142],"requires":[44],"black-box":[45,169],"access":[46],"target":[49],"LLM.":[50],"TAP":[51,80,96,123,149],"utilizes":[52],"LLM":[54],"iteratively":[56],"refine":[57],"candidate":[58],"(attack)":[59],"prompts":[60,69,76,104,125,140],"using":[61,141],"tree-of-thought":[62,93],"reasoning":[63,94],"until":[64],"one":[65],"generated":[68],"target.":[72,116],"Crucially,":[73],"before":[74],"sending":[75],"target,":[79],"assesses":[81],"them":[82],"prunes":[84],"ones":[86],"unlikely":[87],"result":[89],"in":[90],"Using":[92],"allows":[95],"navigate":[98],"a":[99,143],"large":[100],"search":[101],"space":[102],"pruning":[106],"reduces":[107],"total":[109],"number":[110,145],"queries":[112],"sent":[113],"empirical":[118],"evaluations,":[119],"observe":[121],"generates":[124],"jailbreak":[127],"state-of-the-art":[128,158,168],"LLMs":[129,155],"(including":[130],"GPT4":[131],"GPT4-Turbo)":[133],"more":[135],"than":[136],"80%":[137],"small":[144],"queries.":[147],"Interestingly,":[148],"is":[150],"also":[151],"capable":[152],"jailbreaking":[154],"protected":[156],"guardrails,":[159],"e.g.,":[160],"LlamaGuard.":[161],"This":[162],"significantly":[163],"improves":[164],"upon":[165],"previous":[167]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4389364443","counts_by_year":[{"year":2024,"cited_by_count":3}],"updated_date":"2024-11-26T15:03:53.097349","created_date":"2023-12-06"}