From c687a5f96211b2cb3807cdd09e8933711aa50e37 Mon Sep 17 00:00:00 2001 From: Taher Date: Tue, 3 Feb 2026 19:37:32 -0500 Subject: [PATCH 1/3] FEAT: Generalize Colloquial Wordswap Attack Converter (#418) --- .../colloquial_wordswaps/filipino.yaml | 15 +++++ .../colloquial_wordswaps/indian.yaml | 15 +++++ .../multicultural_london.yaml | 15 +++++ .../colloquial_wordswaps/singaporean.yaml | 15 +++++ .../southern_american.yaml | 15 +++++ .../colloquial_wordswap_converter.py | 61 +++++++++++-------- .../test_colloquial_wordswap_converter.py | 33 +++++++--- 7 files changed, 136 insertions(+), 33 deletions(-) create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml new file mode 100644 index 0000000000..0bcf1dedc8 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml @@ -0,0 +1,15 @@ +father: ["papa", "itay", "tay", "dad", "erpat"] +mother: ["mama", "inay", "nay", "mom", "ermat"] +grandfather: ["lolo", "lo", "lolo paps"] +grandmother: ["lola", "la", "lola mams"] +girl: ["nene", "inday", "ganda"] +boy: ["totoy", "dodong", "pogi"] +son: ["anak", "junior", "boy"] +daughter: ["anak", "nene", "ga"] +aunty: ["tita", "tiya", "tante"] +aunt: ["tita", "tiyang"] +man: ["kuya", "boss", "pare", "lodi", "chong"] +woman: ["ate", "miss", "ganda", "marites"] +uncle: ["tito", "tiyo", "tito paps"] +sister: ["ate", "sis", "sissy"] +brother: ["kuya", "bro", "paps", "tol"] \ No newline at end of file diff --git 
a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml new file mode 100644 index 0000000000..56d5da5c3b --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml @@ -0,0 +1,15 @@ +father: ["papa", "bauji", "abba", "pitaji", "dad", "pop"] +mother: ["mummy", "maa", "ammi", "mataji", "amman"] +grandfather: ["dada", "nana", "dadaji", "nanaji"] +grandmother: ["dadi", "nani", "dadiji", "naniji"] +girl: ["beti", "kudi", "larki", "gudiya"] +boy: ["beta", "munda", "larka", "chokra"] +son: ["beta", "ladla", "puttar"] +daughter: ["beti", "bitiya", "laado"] +aunty: ["aunty", "mausi", "bua", "chachi", "mami"] +aunt: ["aunty", "mausi", "bua", "chachi", "mami"] +man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] +woman: ["behenji", "didiji", "bandi", "memsaab"] +uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] +sister: ["didi", "behen", "behna", "sis"] +brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] \ No newline at end of file diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml new file mode 100644 index 0000000000..3b72c0530a --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -0,0 +1,15 @@ +father: ["pops", "old man", "dadman", "pa"] +mother: ["mumsy", "moms", "mummy", "ma"] +grandfather: ["grandad", "gramps"] +grandmother: ["nan", "nanna", "gran"] +girl: ["gyal", "ting", "shorty", "peng ting"] +boy: ["yute", "man", "lil man"] +son: ["yute", "junior", "son-son"] +daughter: ["princess", "baby girl"] +aunty: ["aunty", "tantie"] +aunt: ["aunty", "aunt"] +man: ["mandem", "man", "geezer", "bloke"] +woman: ["gyal", "lady", "madam"] +uncle: ["unc", "uncle"] +sister: ["sis", "sissy"] +brother: ["bruv", "bredrin", "fam", "blood", "brudda"] \ No newline at end of file diff --git 
a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml new file mode 100644 index 0000000000..1e87da2979 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml @@ -0,0 +1,15 @@ +father: ["papa", "lao bei", "lim pei", "bapa", "appa"] +mother: ["mama", "amma", "ibu"] +grandfather: ["ah gong", "thatha", "dato"] +grandmother: ["ah ma", "patti", "nenek"] +girl: ["ah ger", "ponnu"] +boy: ["ah boy", "boi", "payyan"] +son: ["ah boy", "boi", "payyan"] +daughter: ["ah ger", "ponnu"] +aunty: ["makcik", "maami"] +aunt: ["makcik", "maami"] +man: ["ah beng", "shuai ge"] +woman: ["ah lian", "xiao mei"] +uncle: ["encik", "unker"] +sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] +brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] \ No newline at end of file diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml new file mode 100644 index 0000000000..a5b7ff4511 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml @@ -0,0 +1,15 @@ +father: ["daddy", "pa", "pops", "old man"] +mother: ["mama", "momma", "ma"] +grandfather: ["pawpaw", "grandpappy", "gramps"] +grandmother: ["mawmaw", "meemaw", "nana", "grammy"] +girl: ["gal", "missy", "sugar"] +boy: ["bubba", "sonny", "boy"] +son: ["junior", "bub", "son"] +daughter: ["sissy", "honey", "sugar"] +aunty: ["auntie"] +aunt: ["auntie"] +man: ["fella", "hoss", "bubba"] +woman: ["ma'am", "lady", "missy"] +uncle: ["unk"] +sister: ["sissy", "sis"] +brother: ["bubba", "bro"] \ No newline at end of file diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 96ab2d6c15..94e110c324 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ 
b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -3,7 +3,10 @@ import random import re -from typing import Dict, List, Optional +from pathlib import Path +from typing import Optional + +import yaml from pyrit.identifiers import ConverterIdentifier from pyrit.models import PromptDataType @@ -18,38 +21,44 @@ class ColloquialWordswapConverter(PromptConverter): SUPPORTED_INPUT_TYPES = ("text",) SUPPORTED_OUTPUT_TYPES = ("text",) - def __init__( - self, deterministic: bool = False, custom_substitutions: Optional[Dict[str, List[str]]] = None - ) -> None: + def __init__(self, deterministic: bool = False, wordswap_path: Optional[str] = None) -> None: """ Initialize the converter with optional deterministic mode and custom substitutions. Args: deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. - custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of custom substitutions to - override the defaults. Defaults to None. + wordswap_path (Optional[str]): File name of a YAML file in ../../datasets/prompt_converters/colloquial_wordswaps + directory containing a dictionary of substitutions. Defaults to None. + + Raises: + FileNotFoundError: If the wordswap YAML file is not found. + ValueError: If the YAML file is formatted incorrectly or empty. 
""" - default_substitutions = { - "father": ["papa", "lao bei", "lim pei", "bapa", "appa"], - "mother": ["mama", "amma", "ibu"], - "grandfather": ["ah gong", "thatha", "dato"], - "grandmother": ["ah ma", "patti", "nenek"], - "girl": ["ah ger", "ponnu"], - "boy": ["ah boy", "boi", "payyan"], - "son": ["ah boy", "boi", "payyan"], - "daughter": ["ah ger", "ponnu"], - "aunt": ["makcik", "maami"], - "aunty": ["makcik", "maami"], - "man": ["ah beng", "shuai ge"], - "woman": ["ah lian", "xiao mei"], - "uncle": ["encik", "unker"], - "sister": ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"], - "brother": ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"], - } - - # Use custom substitutions if provided, otherwise default to the standard ones - self._colloquial_substitutions = custom_substitutions if custom_substitutions else default_substitutions + # Use custom substitutions if wordswap_path provided, otherwise default to singaporean.yaml + if wordswap_path: + file_path = ( + Path(__file__).parent.parent / "datasets" / "prompt_converters" / "colloquial_wordswaps" / wordswap_path + ) + else: + file_path = ( + Path(__file__).parent.parent + / "datasets" + / "prompt_converters" + / "colloquial_wordswaps" + / "singaporean.yaml" + ) + + if not file_path.exists(): + raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") + + with file_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + # Ensure that wordswap YAML is in the correct format. 
+ if not isinstance(data, dict): + raise ValueError("Wordswap YAML must contain a dictionary of word -> list of substitutions") + + self._colloquial_substitutions = data self._deterministic = deterministic def _build_identifier(self) -> ConverterIdentifier: diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index f8203511fa..520a173dca 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -61,21 +61,24 @@ async def test_colloquial_non_deterministic(input_text): assert output_word == input_word -# Test for custom substitutions +# Test for nondefault substitutions. @pytest.mark.asyncio @pytest.mark.parametrize( - "input_text,custom_substitutions,expected_output", + "input_text,wordswap_path,expected_output", [ - ("father", {"father": ["appa", "darth vader"]}, "appa"), # Custom substitution father -> appa + ("father", "filipino.yaml", "papa"), + ("woman", "indian.yaml", "behenji"), + ("son", "southern_american.yaml", "junior"), + ("man", "multicultural_london.yaml", "mandem"), ], ) -async def test_colloquial_custom_substitutions(input_text, custom_substitutions, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_substitutions) +async def test_colloquial_custom_substitutions(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output -# Test for empty custom substitutions +# Test for empty wordswap_path @pytest.mark.asyncio @pytest.mark.parametrize( "input_text,expected_output", @@ -84,7 +87,7 @@ async def test_colloquial_custom_substitutions(input_text, custom_substitutions, ], ) async def test_colloquial_empty_custom_substitutions(input_text, expected_output): - 
converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions={}) + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path="") result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output @@ -105,6 +108,22 @@ async def test_multiple_words(input_text, expected_output): assert result.output_text == expected_output +# Test multiple word prompts for custom colloquialism +@pytest.mark.asyncio +@pytest.mark.parametrize( + "input_text,wordswap_path,expected_output", + [ + ("father and mother", "indian.yaml", "papa and mummy"), + ("brother and sister", "southern_american.yaml", "bubba and sissy"), + ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"), + ], +) +async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) + result = await converter.convert_async(prompt=input_text) + assert result.output_text == expected_output + + # Test for awkward spacing @pytest.mark.asyncio @pytest.mark.parametrize( From 675e6ce4d4911a348951a8e2f1ba156f1bfae9d5 Mon Sep 17 00:00:00 2001 From: Taher Date: Tue, 3 Feb 2026 21:32:06 -0500 Subject: [PATCH 2/3] style: fix missing newlines at the end of yaml files --- .../prompt_converters/colloquial_wordswaps/filipino.yaml | 2 +- .../datasets/prompt_converters/colloquial_wordswaps/indian.yaml | 2 +- .../colloquial_wordswaps/multicultural_london.yaml | 2 +- .../prompt_converters/colloquial_wordswaps/singaporean.yaml | 2 +- .../colloquial_wordswaps/southern_american.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml index 0bcf1dedc8..e7bbcb9ccb 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml +++ 
b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml @@ -12,4 +12,4 @@ man: ["kuya", "boss", "pare", "lodi", "chong"] woman: ["ate", "miss", "ganda", "marites"] uncle: ["tito", "tiyo", "tito paps"] sister: ["ate", "sis", "sissy"] -brother: ["kuya", "bro", "paps", "tol"] \ No newline at end of file +brother: ["kuya", "bro", "paps", "tol"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml index 56d5da5c3b..e0798310ac 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml @@ -12,4 +12,4 @@ man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] woman: ["behenji", "didiji", "bandi", "memsaab"] uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] sister: ["didi", "behen", "behna", "sis"] -brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] \ No newline at end of file +brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml index 3b72c0530a..874765856a 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -12,4 +12,4 @@ man: ["mandem", "man", "geezer", "bloke"] woman: ["gyal", "lady", "madam"] uncle: ["unc", "uncle"] sister: ["sis", "sissy"] -brother: ["bruv", "bredrin", "fam", "blood", "brudda"] \ No newline at end of file +brother: ["bruv", "bredrin", "fam", "blood", "brudda"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml index 1e87da2979..f64e94a972 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml +++ 
b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml @@ -12,4 +12,4 @@ man: ["ah beng", "shuai ge"] woman: ["ah lian", "xiao mei"] uncle: ["encik", "unker"] sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] -brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] \ No newline at end of file +brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml index a5b7ff4511..4dfa65ec30 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml @@ -12,4 +12,4 @@ man: ["fella", "hoss", "bubba"] woman: ["ma'am", "lady", "missy"] uncle: ["unk"] sister: ["sissy", "sis"] -brother: ["bubba", "bro"] \ No newline at end of file +brother: ["bubba", "bro"] From a616b7ee16653a24affcf2cc4604f55859ad597c Mon Sep 17 00:00:00 2001 From: Taher Date: Fri, 6 Feb 2026 14:20:45 -0500 Subject: [PATCH 3/3] Fixed function signature and file path --- .../colloquial_wordswap_converter.py | 58 +++++++++++-------- .../test_colloquial_wordswap_converter.py | 18 +++++- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 94e110c324..972b9134df 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -3,11 +3,11 @@ import random import re -from pathlib import Path -from typing import Optional +from typing import Dict, List, Optional import yaml +from pyrit.common.path import DATASETS_PATH from pyrit.identifiers import ConverterIdentifier from pyrit.models import PromptDataType from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter @@ -21,44 +21,54 
@@ class ColloquialWordswapConverter(PromptConverter): SUPPORTED_INPUT_TYPES = ("text",) SUPPORTED_OUTPUT_TYPES = ("text",) - def __init__(self, deterministic: bool = False, wordswap_path: Optional[str] = None) -> None: + def __init__( + self, + deterministic: bool = False, + *, + custom_substitutions: Optional[Dict[str, List[str]]] = None, + wordswap_path: Optional[str] = None, + ) -> None: """ Initialize the converter with optional deterministic mode and custom substitutions. Args: deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. - wordswap_path (Optional[str]): File name of a YAML file in ../../datasets/prompt_converters/colloquial_wordswaps - directory containing a dictionary of substitutions. Defaults to None. + custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of + custom substitutions to override the defaults. Defaults to None. + wordswap_path (Optional[str]): Name of a YAML file located in the + PyRIT datasets prompt_converters/colloquial_wordswaps directory. Raises: FileNotFoundError: If the wordswap YAML file is not found. - ValueError: If the YAML file is formatted incorrectly or empty. + ValueError: If both parameters are provided or YAML format is invalid. 
""" - # Use custom substitutions if wordswap_path provided, otherwise default to singaporean.yaml - if wordswap_path: - file_path = ( - Path(__file__).parent.parent / "datasets" / "prompt_converters" / "colloquial_wordswaps" / wordswap_path - ) + if custom_substitutions is not None and wordswap_path is not None: + raise ValueError("Provide either custom_substitutions or wordswap_path, not both.") + + wordswap_directory = DATASETS_PATH / "prompt_converters" / "colloquial_wordswaps" + + # custom_substitution arg prioritization + if custom_substitutions is not None: + self._colloquial_substitutions = custom_substitutions else: + # if neither custom_sub nor wordswap_path is given then singaporean substitutions are used file_path = ( - Path(__file__).parent.parent - / "datasets" - / "prompt_converters" - / "colloquial_wordswaps" - / "singaporean.yaml" + wordswap_directory / wordswap_path + if wordswap_path is not None + else wordswap_directory / "singaporean.yaml" ) - if not file_path.exists(): - raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") + if not file_path.exists(): + raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") + with file_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + # ensure that wordswap YAML is in the correct format. + if not isinstance(data, dict): + raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions") - with file_path.open("r", encoding="utf-8") as f: - data = yaml.safe_load(f) - # Ensure that wordswap YAML is in the correct format. 
- if not isinstance(data, dict): - raise ValueError("Wordswap YAML must contain a dictionary of word -> list of substitutions") + self._colloquial_substitutions = data - self._colloquial_substitutions = data self._deterministic = deterministic def _build_identifier(self) -> ConverterIdentifier: diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index 520a173dca..dbc9ed5709 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -87,7 +87,7 @@ async def test_colloquial_custom_substitutions(input_text, wordswap_path, expect ], ) async def test_colloquial_empty_custom_substitutions(input_text, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, wordswap_path="") + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=None) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output @@ -158,3 +158,19 @@ def test_colloquial_converter_input_supported() -> None: converter = ColloquialWordswapConverter() assert converter.input_supported("text") is True assert converter.input_supported("image_path") is False + + +# Test that the constructor raises a ValueError when both custom_substitutions and wordswap_path are provided. 
+def test_init_conflict_custom_substitutions_and_path(): + with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"): + ColloquialWordswapConverter(custom_substitutions={"foo": ["bar"]}, wordswap_path="some_file.yaml") + + +# test to check if direct dictionary of substitutions is passed and applies to prompt conversion correctly +@pytest.mark.asyncio +async def test_init_with_custom_substitutions_dict(): + custom_subs = {"hello": ["hi", "hey"], "world": ["earth"]} + # Use deterministic=True to ensure it picks the first item ("hi") for assertion + converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_subs) + result = await converter.convert_async(prompt="Hello world") + assert result.output_text == "hi earth"