-
Notifications
You must be signed in to change notification settings - Fork 660
FEAT: Generalize Colloquial Wordswap Attack Converter #1348
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
c687a5f
675e6ce
a616b7e
698e4b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| father: ["papa", "itay", "tay", "dad", "erpat"] | ||
| mother: ["mama", "inay", "nay", "mom", "ermat"] | ||
| grandfather: ["lolo", "lo", "lolo paps"] | ||
| grandmother: ["lola", "la", "lola mams"] | ||
| girl: ["nene", "inday", "ganda"] | ||
| boy: ["totoy", "dodong", "pogi"] | ||
| son: ["anak", "junior", "boy"] | ||
| daughter: ["anak", "nene", "ga"] | ||
| aunty: ["tita", "tiya", "tante"] | ||
| aunt: ["tita", "tiyang"] | ||
| man: ["kuya", "boss", "pare", "lodi", "chong"] | ||
| woman: ["ate", "miss", "ganda", "marites"] | ||
| uncle: ["tito", "tiyo", "tito paps"] | ||
| sister: ["ate", "sis", "sissy"] | ||
| brother: ["kuya", "bro", "paps", "tol"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| father: ["papa", "bauji", "abba", "pitaji", "dad", "pop"] | ||
| mother: ["mummy", "maa", "ammi", "mataji", "amman"] | ||
| grandfather: ["dada", "nana", "dadaji", "nanaji"] | ||
| grandmother: ["dadi", "nani", "dadiji", "naniji"] | ||
| girl: ["beti", "kudi", "larki", "gudiya"] | ||
| boy: ["beta", "munda", "larka", "chokra"] | ||
| son: ["beta", "ladla", "puttar"] | ||
| daughter: ["beti", "bitiya", "laado"] | ||
| aunty: ["aunty", "mausi", "bua", "chachi", "mami"] | ||
| aunt: ["aunty", "mausi", "bua", "chachi", "mami"] | ||
| man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] | ||
| woman: ["behenji", "didiji", "bandi", "memsaab"] | ||
| uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] | ||
| sister: ["didi", "behen", "behna", "sis"] | ||
| brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| father: ["pops", "old man", "dadman", "pa"] | ||
| mother: ["mumsy", "moms", "mummy", "ma"] | ||
| grandfather: ["grandad", "gramps"] | ||
| grandmother: ["nan", "nanna", "gran"] | ||
| girl: ["gyal", "ting", "shorty", " peng ting"] | ||
| boy: ["yute", "man", "lil man"] | ||
| son: ["yute", "junior", "son-son"] | ||
| daughter: ["princess", "baby girl"] | ||
| aunty: ["aunty", "tantie"] | ||
| aunt: ["aunty", "aunt"] | ||
| man: ["mandem", "man", "geezer", "bloke"] | ||
| woman: ["gyal", "lady", "madam"] | ||
| uncle: ["unc", "uncle"] | ||
| sister: ["sis", "sissy"] | ||
| brother: ["bruv", "bredrin", "fam", "blood", "brudda"] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| father: ["papa", "lao bei", "lim pei", "bapa", "appa"] | ||
| mother: ["mama", "amma", "ibu"] | ||
| grandfather: ["ah gong", "thatha", "dato"] | ||
| grandmother: ["ah ma", "patti", "nenek"] | ||
| girl: ["ah ger", "ponnu"] | ||
| boy: ["ah boy", "boi", "payyan"] | ||
| son: ["ah boy", "boi", "payyan"] | ||
| daughter: ["ah ger", "ponnu"] | ||
| aunty: ["makcik", "maami"] | ||
| aunt: ["makcik", "maami"] | ||
| man: ["ah beng", "shuai ge"] | ||
| woman: ["ah lian", "xiao mei"] | ||
| uncle: ["encik", "unker"] | ||
| sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] | ||
| brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| father: ["daddy", "pa", "pops", "old man"] | ||
| mother: ["mama", "momma", "ma"] | ||
| grandfather: ["pawpaw", "grandpappy", "gramps"] | ||
| grandmother: ["mawmaw", "meemaw", "nana", "grammy"] | ||
| girl: ["gal", "missy", "sugar"] | ||
| boy: ["bubba", "sonny", "boy"] | ||
| son: ["junior", "bub", "son"] | ||
| daughter: ["sissy", "honey", "sugar"] | ||
| aunty: ["auntie"] | ||
| aunt: ["auntie"] | ||
| man: ["fella", "hoss", "bubba"] | ||
| woman: ["ma'am", "lady", "missy"] | ||
| uncle: ["unk"] | ||
| sister: ["sissy", "sis"] | ||
| brother: ["bubba", "bro"] |
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5,6 +5,9 @@ | |||||||||||
| import re | ||||||||||||
| from typing import Dict, List, Optional | ||||||||||||
|
|
||||||||||||
| import yaml | ||||||||||||
|
|
||||||||||||
| from pyrit.common.path import DATASETS_PATH | ||||||||||||
|
||||||||||||
| from pyrit.identifiers import ConverterIdentifier | ||||||||||||
| from pyrit.models import PromptDataType | ||||||||||||
| from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter | ||||||||||||
|
|
@@ -19,37 +22,53 @@ class ColloquialWordswapConverter(PromptConverter): | |||||||||||
| SUPPORTED_OUTPUT_TYPES = ("text",) | ||||||||||||
|
|
||||||||||||
| def __init__( | ||||||||||||
| self, deterministic: bool = False, custom_substitutions: Optional[Dict[str, List[str]]] = None | ||||||||||||
| self, | ||||||||||||
| deterministic: bool = False, | ||||||||||||
| *, | ||||||||||||
|
Comment on lines
+26
to
+27
|
||||||||||||
| deterministic: bool = False, | |
| *, | |
| *, | |
| deterministic: bool = False, |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo in comment: "singaporean substituions" -> "singaporean substitutions".
| # if neither custom_sub nor wordswap_path is given then singaporean substituions are used | |
| # if neither custom_sub nor wordswap_path is given then singaporean substitutions are used |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`wordswap_path` is appended with `/` directly, so an absolute path (or a path containing `..`) can escape the `colloquial_wordswaps` directory and load an arbitrary YAML file. Constrain this to files within the intended directory (e.g., resolve the candidate path and verify it is relative to `wordswap_directory`, and reject absolute paths / path traversal).
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The docstring says invalid YAML raises ValueError, but yaml.safe_load can raise yaml.YAMLError which will currently escape. Catch yaml.YAMLError and re-raise a ValueError (ideally including the file path) to match the documented behavior.
| data = yaml.safe_load(f) | |
| try: | |
| data = yaml.safe_load(f) | |
| except yaml.YAMLError as exc: | |
| raise ValueError(f"Invalid YAML format in wordswap file: {file_path}") from exc |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
New error paths were introduced (missing YAML file, invalid YAML, invalid mapping format) but there are no unit tests covering these failure modes. Add tests that assert the specific exception types/messages for these branches to prevent regressions.
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only the top-level YAML type is validated. If any substitution value is not a non-empty list[str], convert_async will crash (IndexError/random.choice) or produce non-string outputs. Validate that keys are strings (and normalize to lowercase if needed) and that each value is a non-empty list of strings before storing it.
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Converter behavior now depends on the selected wordswap source (custom dict vs specific YAML file), but the identifier logic doesn't currently incorporate that source. Please update the converter identifier so two instances with different wordswap_path/substitutions don't end up with the same identifier (e.g., include wordswap_path or a stable hash of the substitutions content).
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,21 +61,24 @@ async def test_colloquial_non_deterministic(input_text): | |
| assert output_word == input_word | ||
|
|
||
|
|
||
| # Test for custom substitutions | ||
| # Test for nondefault substitutions. | ||
| @pytest.mark.asyncio | ||
| @pytest.mark.parametrize( | ||
| "input_text,custom_substitutions,expected_output", | ||
| "input_text,wordswap_path,expected_output", | ||
| [ | ||
| ("father", {"father": ["appa", "darth vader"]}, "appa"), # Custom substitution father -> appa | ||
| ("father", "filipino.yaml", "papa"), | ||
| ("woman", "indian.yaml", "behenji"), | ||
| ("son", "southern_american.yaml", "junior"), | ||
| ("man", "multicultural_london.yaml", "mandem"), | ||
| ], | ||
| ) | ||
| async def test_colloquial_custom_substitutions(input_text, custom_substitutions, expected_output): | ||
| converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_substitutions) | ||
| async def test_colloquial_custom_substitutions(input_text, wordswap_path, expected_output): | ||
| converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) | ||
| result = await converter.convert_async(prompt=input_text) | ||
| assert result.output_text == expected_output | ||
|
|
||
|
|
||
| # Test for empty custom substitutions | ||
| # Test for empty wordswap_path | ||
| @pytest.mark.asyncio | ||
| @pytest.mark.parametrize( | ||
| "input_text,expected_output", | ||
|
|
@@ -84,7 +87,7 @@ async def test_colloquial_custom_substitutions(input_text, custom_substitutions, | |
| ], | ||
| ) | ||
| async def test_colloquial_empty_custom_substitutions(input_text, expected_output): | ||
| converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions={}) | ||
| converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=None) | ||
| result = await converter.convert_async(prompt=input_text) | ||
|
Comment on lines
81
to
91
|
||
| assert result.output_text == expected_output | ||
|
|
||
|
|
@@ -105,6 +108,22 @@ async def test_multiple_words(input_text, expected_output): | |
| assert result.output_text == expected_output | ||
|
|
||
|
|
||
| # Test multiple word prompts for custom colloquialism | ||
| @pytest.mark.asyncio | ||
| @pytest.mark.parametrize( | ||
| "input_text,wordswap_path,expected_output", | ||
| [ | ||
| ("father and mother", "indian.yaml", "papa and mummy"), | ||
| ("brother and sister", "southern_american.yaml", "bubba and sissy"), | ||
| ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"), | ||
| ], | ||
| ) | ||
| async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output): | ||
| converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) | ||
| result = await converter.convert_async(prompt=input_text) | ||
| assert result.output_text == expected_output | ||
|
|
||
|
|
||
| # Test for awkward spacing | ||
| @pytest.mark.asyncio | ||
| @pytest.mark.parametrize( | ||
|
|
@@ -139,3 +158,19 @@ def test_colloquial_converter_input_supported() -> None: | |
| converter = ColloquialWordswapConverter() | ||
| assert converter.input_supported("text") is True | ||
| assert converter.input_supported("image_path") is False | ||
|
|
||
|
|
||
| # Test that the constructor raises a ValueError when both custom_substitutions and wordswap_path are provided. | ||
| def test_init_conflict_custom_substitutions_and_path(): | ||
| with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"): | ||
| ColloquialWordswapConverter(custom_substitutions={"foo": ["bar"]}, wordswap_path="some_file.yaml") | ||
|
|
||
|
|
||
| # Test that a directly passed substitutions dictionary is applied correctly during prompt conversion | ||
| @pytest.mark.asyncio | ||
| async def test_init_with_custom_substitutions_dict(): | ||
| custom_subs = {"hello": ["hi", "hey"], "world": ["earth"]} | ||
| # Use deterministic=True to ensure it picks the first item ("hi") for assertion | ||
| converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_subs) | ||
| result = await converter.convert_async(prompt="Hello world") | ||
| assert result.output_text == "hi earth" | ||
|
Comment on lines
111
to
176
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The newly-added YAML value includes a leading space (" peng ting"), which will introduce unexpected spacing in outputs. Remove the leading whitespace so substitutions are clean.