diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml new file mode 100644 index 0000000000..e7bbcb9ccb --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml @@ -0,0 +1,15 @@ +father: ["papa", "itay", "tay", "dad", "erpat"] +mother: ["mama", "inay", "nay", "mom", "ermat"] +grandfather: ["lolo", "lo", "lolo paps"] +grandmother: ["lola", "la", "lola mams"] +girl: ["nene", "inday", "ganda"] +boy: ["totoy", "dodong", "pogi"] +son: ["anak", "junior", "boy"] +daughter: ["anak", "nene", "ga"] +aunty: ["tita", "tiya", "tante"] +aunt: ["tita", "tiyang"] +man: ["kuya", "boss", "pare", "lodi", "chong"] +woman: ["ate", "miss", "ganda", "marites"] +uncle: ["tito", "tiyo", "tito paps"] +sister: ["ate", "sis", "sissy"] +brother: ["kuya", "bro", "paps", "tol"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml new file mode 100644 index 0000000000..e0798310ac --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml @@ -0,0 +1,15 @@ +father: ["papa", "bauji", "abba", "pitaji", "dad", "pop"] +mother: ["mummy", "maa", "ammi", "mataji", "amman"] +grandfather: ["dada", "nana", "dadaji", "nanaji"] +grandmother: ["dadi", "nani", "dadiji", "naniji"] +girl: ["beti", "kudi", "larki", "gudiya"] +boy: ["beta", "munda", "larka", "chokra"] +son: ["beta", "ladla", "puttar"] +daughter: ["beti", "bitiya", "laado"] +aunty: ["aunty", "mausi", "bua", "chachi", "mami"] +aunt: ["aunty", "mausi", "bua", "chachi", "mami"] +man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] +woman: ["behenji", "didiji", "bandi", "memsaab"] +uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] +sister: ["didi", "behen", "behna", "sis"] +brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] diff --git 
a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml new file mode 100644 index 0000000000..874765856a --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -0,0 +1,15 @@ +father: ["pops", "old man", "dadman", "pa"] +mother: ["mumsy", "moms", "mummy", "ma"] +grandfather: ["grandad", "gramps"] +grandmother: ["nan", "nanna", "gran"] +girl: ["gyal", "ting", "shorty", "peng ting"] +boy: ["yute", "man", "lil man"] +son: ["yute", "junior", "son-son"] +daughter: ["princess", "baby girl"] +aunty: ["aunty", "tantie"] +aunt: ["aunty", "aunt"] +man: ["mandem", "man", "geezer", "bloke"] +woman: ["gyal", "lady", "madam"] +uncle: ["unc", "uncle"] +sister: ["sis", "sissy"] +brother: ["bruv", "bredrin", "fam", "blood", "brudda"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml new file mode 100644 index 0000000000..f64e94a972 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml @@ -0,0 +1,15 @@ +father: ["papa", "lao bei", "lim pei", "bapa", "appa"] +mother: ["mama", "amma", "ibu"] +grandfather: ["ah gong", "thatha", "dato"] +grandmother: ["ah ma", "patti", "nenek"] +girl: ["ah ger", "ponnu"] +boy: ["ah boy", "boi", "payyan"] +son: ["ah boy", "boi", "payyan"] +daughter: ["ah ger", "ponnu"] +aunty: ["makcik", "maami"] +aunt: ["makcik", "maami"] +man: ["ah beng", "shuai ge"] +woman: ["ah lian", "xiao mei"] +uncle: ["encik", "unker"] +sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] +brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml new file mode 100644 index 0000000000..4dfa65ec30 --- 
/dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml @@ -0,0 +1,15 @@ +father: ["daddy", "pa", "pops", "old man"] +mother: ["mama", "momma", "ma"] +grandfather: ["pawpaw", "grandpappy", "gramps"] +grandmother: ["mawmaw", "meemaw", "nana", "grammy"] +girl: ["gal", "missy", "sugar"] +boy: ["bubba", "sonny", "boy"] +son: ["junior", "bub", "son"] +daughter: ["sissy", "honey", "sugar"] +aunty: ["auntie"] +aunt: ["auntie"] +man: ["fella", "hoss", "bubba"] +woman: ["ma'am", "lady", "missy"] +uncle: ["unk"] +sister: ["sissy", "sis"] +brother: ["bubba", "bro"] diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 96ab2d6c15..972b9134df 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -5,6 +5,9 @@ import re from typing import Dict, List, Optional +import yaml + +from pyrit.common.path import DATASETS_PATH from pyrit.identifiers import ConverterIdentifier from pyrit.models import PromptDataType from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter @@ -19,7 +22,11 @@ class ColloquialWordswapConverter(PromptConverter): SUPPORTED_OUTPUT_TYPES = ("text",) def __init__( - self, deterministic: bool = False, custom_substitutions: Optional[Dict[str, List[str]]] = None + self, + deterministic: bool = False, + *, + custom_substitutions: Optional[Dict[str, List[str]]] = None, + wordswap_path: Optional[str] = None, ) -> None: """ Initialize the converter with optional deterministic mode and custom substitutions. @@ -27,29 +34,41 @@ def __init__( Args: deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. - custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of custom substitutions to - override the defaults. Defaults to None. 
+ custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of + custom substitutions to override the defaults. Defaults to None. + wordswap_path (Optional[str]): Name of a YAML file located in the + PyRIT datasets prompt_converters/colloquial_wordswaps directory. + + Raises: + FileNotFoundError: If the wordswap YAML file is not found. + ValueError: If both custom_substitutions and wordswap_path are provided, or the YAML is not a mapping. """ - default_substitutions = { - "father": ["papa", "lao bei", "lim pei", "bapa", "appa"], - "mother": ["mama", "amma", "ibu"], - "grandfather": ["ah gong", "thatha", "dato"], - "grandmother": ["ah ma", "patti", "nenek"], - "girl": ["ah ger", "ponnu"], - "boy": ["ah boy", "boi", "payyan"], - "son": ["ah boy", "boi", "payyan"], - "daughter": ["ah ger", "ponnu"], - "aunt": ["makcik", "maami"], - "aunty": ["makcik", "maami"], - "man": ["ah beng", "shuai ge"], - "woman": ["ah lian", "xiao mei"], - "uncle": ["encik", "unker"], - "sister": ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"], - "brother": ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"], - } - - # Use custom substitutions if provided, otherwise default to the standard ones - self._colloquial_substitutions = custom_substitutions if custom_substitutions else default_substitutions + if custom_substitutions is not None and wordswap_path is not None: + raise ValueError("Provide either custom_substitutions or wordswap_path, not both.") + + wordswap_directory = DATASETS_PATH / "prompt_converters" / "colloquial_wordswaps" + + # custom_substitutions takes priority when provided + if custom_substitutions is not None: + self._colloquial_substitutions = custom_substitutions + else: + # if neither custom_substitutions nor wordswap_path is given then singaporean substitutions are used + file_path = ( + wordswap_directory / wordswap_path + if wordswap_path is not None + else wordswap_directory / "singaporean.yaml" + ) + + if not file_path.exists(): + raise FileNotFoundError(f"Colloquial wordswap file not 
found: {file_path}") + with file_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + # ensure that wordswap YAML is in the correct format. + if not isinstance(data, dict): + raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions") + + self._colloquial_substitutions = data + self._deterministic = deterministic def _build_identifier(self) -> ConverterIdentifier: diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index f8203511fa..dbc9ed5709 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -61,21 +61,24 @@ async def test_colloquial_non_deterministic(input_text): assert output_word == input_word -# Test for custom substitutions +# Test for nondefault substitutions. @pytest.mark.asyncio @pytest.mark.parametrize( - "input_text,custom_substitutions,expected_output", + "input_text,wordswap_path,expected_output", [ - ("father", {"father": ["appa", "darth vader"]}, "appa"), # Custom substitution father -> appa + ("father", "filipino.yaml", "papa"), + ("woman", "indian.yaml", "behenji"), + ("son", "southern_american.yaml", "junior"), + ("man", "multicultural_london.yaml", "mandem"), ], ) -async def test_colloquial_custom_substitutions(input_text, custom_substitutions, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_substitutions) +async def test_colloquial_custom_substitutions(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output -# Test for empty custom substitutions +# Test for empty wordswap_path @pytest.mark.asyncio @pytest.mark.parametrize( "input_text,expected_output", @@ -84,7 +87,7 @@ async def 
test_colloquial_custom_substitutions(input_text, custom_substitutions, ], ) async def test_colloquial_empty_custom_substitutions(input_text, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions={}) + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=None) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output @@ -105,6 +108,22 @@ async def test_multiple_words(input_text, expected_output): assert result.output_text == expected_output +# Test multiple word prompts for custom colloquialism +@pytest.mark.asyncio +@pytest.mark.parametrize( + "input_text,wordswap_path,expected_output", + [ + ("father and mother", "indian.yaml", "papa and mummy"), + ("brother and sister", "southern_american.yaml", "bubba and sissy"), + ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"), + ], +) +async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) + result = await converter.convert_async(prompt=input_text) + assert result.output_text == expected_output + + # Test for awkward spacing @pytest.mark.asyncio @pytest.mark.parametrize( @@ -139,3 +158,19 @@ def test_colloquial_converter_input_supported() -> None: converter = ColloquialWordswapConverter() assert converter.input_supported("text") is True assert converter.input_supported("image_path") is False + + +# Test that the constructor raises a ValueError when both custom_substitutions and wordswap_path are provided. 
+def test_init_conflict_custom_substitutions_and_path(): + with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"): + ColloquialWordswapConverter(custom_substitutions={"foo": ["bar"]}, wordswap_path="some_file.yaml") + + +# test to check if direct dictionary of substitutions is passed and applies to prompt conversion correctly +@pytest.mark.asyncio +async def test_init_with_custom_substitutions_dict(): + custom_subs = {"hello": ["hi", "hey"], "world": ["earth"]} + # Use deterministic=True to ensure it picks the first item ("hi") for assertion + converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_subs) + result = await converter.convert_async(prompt="Hello world") + assert result.output_text == "hi earth"