diff --git a/pyrit/datasets/jailbreak/text_jailbreak.py b/pyrit/datasets/jailbreak/text_jailbreak.py index 737c0c938..7aebda10a 100644 --- a/pyrit/datasets/jailbreak/text_jailbreak.py +++ b/pyrit/datasets/jailbreak/text_jailbreak.py @@ -39,7 +39,8 @@ def __init__( # Track the template source for error reporting self.template_source: str = "" # Count how many template sources are provided - template_sources = [template_path, template_file_name, string_template, random_template] + template_sources = [template_path, template_file_name, + string_template, random_template] provided_sources = [source for source in template_sources if source] if len(provided_sources) != 1: @@ -57,26 +58,30 @@ def __init__( # Get all yaml files in the jailbreak directory and its subdirectories jailbreak_dir = JAILBREAK_TEMPLATES_PATH # Get all yaml files but exclude those in multi_parameter subdirectory - yaml_files = [f for f in jailbreak_dir.rglob("*.yaml") if "multi_parameter" not in f.parts] + yaml_files = [f for f in jailbreak_dir.rglob( + "*.yaml") if "multi_parameter" not in f.parts] if not yaml_files: raise ValueError( "No YAML templates found in jailbreak directory (excluding multi_parameter subdirectory)" ) if template_file_name: - matching_files = [f for f in yaml_files if f.name == template_file_name] + matching_files = [ + f for f in yaml_files if f.name == template_file_name] if not matching_files: raise ValueError( f"Template file '{template_file_name}' not found in jailbreak directory or its subdirectories" ) if len(matching_files) > 1: - raise ValueError(f"Multiple files named '{template_file_name}' found in jailbreak directory") + raise ValueError( + f"Multiple files named '{template_file_name}' found in jailbreak directory") self.template = SeedPrompt.from_yaml_file(matching_files[0]) self.template_source = str(matching_files[0]) else: while True: random_template_path = random.choice(yaml_files) - self.template = SeedPrompt.from_yaml_file(random_template_path) + 
self.template = SeedPrompt.from_yaml_file( + random_template_path) if self.template.parameters == ["prompt"]: self.template_source = str(random_template_path) @@ -86,10 +91,12 @@ def __init__( break except ValueError as e: # Template has syntax errors - fail fast with clear error - raise ValueError(f"Invalid jailbreak template '{random_template_path}': {str(e)}") from e + raise ValueError( + f"Invalid jailbreak template '{random_template_path}': {str(e)}") from e # Validate that all required parameters (except 'prompt') are provided in kwargs - required_params = [p for p in self.template.parameters if p != "prompt"] + required_params = [ + p for p in self.template.parameters if p != "prompt"] missing_params = [p for p in required_params if p not in kwargs] if missing_params: raise ValueError( @@ -101,7 +108,38 @@ def __init__( if kwargs: kwargs.pop("prompt", None) # Apply remaining kwargs to the template while preserving template variables - self.template.value = self.template.render_template_value_silent(**kwargs) + self.template.value = self.template.render_template_value_silent( + **kwargs) + + @classmethod + def get_all_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]: + """ + Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH. + + Args: + k (int, optional): Number of distinct jailbreak templates to sample at random. None to get all. + + Returns: + List[str]: List of jailbreak template file names. + + Raises: + ValueError: If no jailbreak templates are found in the jailbreak directory. + ValueError: If k is larger than the number of templates that exist. + """ + jailbreak_template_names = [ + str(f.stem) + ".yaml" for f in JAILBREAK_TEMPLATES_PATH.glob("*.yaml")] + if not jailbreak_template_names: + raise ValueError( + "No jailbreak templates found in the jailbreak directory") + + if k: + if k > len(jailbreak_template_names): + raise ValueError( + f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!" 
+ ) + jailbreak_template_names = random.sample( + jailbreak_template_names, k=k) + return jailbreak_template_names @classmethod def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]: diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py index e28676db7..9e036477f 100644 --- a/pyrit/scenario/scenarios/airt/jailbreak.py +++ b/pyrit/scenario/scenarios/airt/jailbreak.py @@ -28,6 +28,14 @@ TrueFalseScorer, ) +""" +TODO REMOVE +Featurelist +- [ ] Enhanced JailbreakStrategy +- [X] n tries per jailbreak +- [ ] Choose subset of jailbreaks explicitly +""" + class JailbreakStrategy(ScenarioStrategy): """ @@ -93,7 +101,9 @@ def __init__( objective_scorer: Optional[TrueFalseScorer] = None, include_baseline: bool = False, scenario_result_id: Optional[str] = None, - n_jailbreaks: Optional[int] = 3, + k_jailbreaks: Optional[int] = None, + which_jailbreaks: Optional[List[str]] = None, + num_tries: int = 1, ) -> None: """ Initialize the jailbreak scenario. @@ -104,13 +114,30 @@ def __init__( include_baseline (bool): Whether to include a baseline atomic attack that sends all objectives without modifications. Defaults to True. scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume. - n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them. + k_jailbreaks (Optional[int]): Choose k random jailbreaks rather than using all of them. + num_tries (int): Number of times to try each jailbreak. + which_jailbreaks (Optional[List[str]]): Dedicated list of jailbreaks to run. + + Raises: + ValueError: If both which_jailbreaks and k_jailbreaks are provided, as random selection + is incompatible with a predetermined list. + """ + if which_jailbreaks and k_jailbreaks: + raise ValueError( + "Please provide only one of `k_jailbreaks` (random selection) or `which_jailbreaks` (specific selection)." 
+ ) + if not objective_scorer: objective_scorer = self._get_default_objective_scorer() - self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) + self._scorer_config = AttackScoringConfig( + objective_scorer=objective_scorer) + + self._k = k_jailbreaks + self._n = num_tries - self._n = n_jailbreaks + self._validate_jailbreaks_subset(which_jailbreaks) + self._which_jailbreaks = which_jailbreaks super().__init__( name="Jailbreak", @@ -138,9 +165,12 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: refusal_scorer = TrueFalseInverterScorer( scorer=SelfAskRefusalScorer( chat_target=OpenAIChatTarget( - endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"), - api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), + endpoint=os.environ.get( + "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"), + api_key=os.environ.get( + "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"), + model_name=os.environ.get( + "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), ) ) ) @@ -168,10 +198,10 @@ def _get_all_jailbreak_templates(self) -> List[str]: Returns: List[str]: List of jailbreak template file names. 
""" - if not self._n: + if not self._k: return TextJailBreak.get_all_jailbreak_templates() else: - return TextJailBreak.get_all_jailbreak_templates(n=self._n) + return TextJailBreak.get_all_jailbreak_templates(self._k) async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack: """ @@ -188,12 +218,14 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na # Create the jailbreak converter jailbreak_converter = TextJailbreakConverter( - jailbreak_template=TextJailBreak(template_file_name=jailbreak_template_name) + jailbreak_template=TextJailBreak( + template_file_name=jailbreak_template_name) ) # Create converter configuration converter_config = AttackConverterConfig( - request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter]) + request_converters=PromptConverterConfiguration.from_converters( + converters=[jailbreak_converter]) ) # Create the attack @@ -218,6 +250,9 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]: Returns: List[AtomicAttack]: List of atomic attacks to execute, one per jailbreak template. + + Raises: + ValueError: If self._which_jailbreaks is not a subset of all jailbreak templates. 
""" atomic_attacks: List[AtomicAttack] = [] @@ -227,8 +262,28 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]: # Get all jailbreak template names jailbreak_template_names = self._get_all_jailbreak_templates() + if self._which_jailbreaks: + requested = set(self._which_jailbreaks) + jailbreak_template_names = [name for name in jailbreak_template_names if name in requested] + if not jailbreak_template_names: + raise ValueError( + f"Error: could not find templates `{self._which_jailbreaks}`!") + for template_name in jailbreak_template_names: - atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name) - atomic_attacks.append(atomic_attack) + for _ in range(self._n): + atomic_attack = await self._get_atomic_attack_from_jailbreak_async( + jailbreak_template_name=template_name + ) + atomic_attacks.append(atomic_attack) return atomic_attacks + + def _validate_jailbreaks_subset(self, jailbreaks: Optional[List[str]]) -> None: + """Raise ValueError if any explicitly requested jailbreak template is unknown.""" + if not jailbreaks: + return + known = set(TextJailBreak.get_all_jailbreak_templates()) + unknown = [name for name in jailbreaks if name not in known] + if unknown: + raise ValueError( + f"Error: could not find templates `{unknown}`!") diff --git a/tests/unit/scenarios/test_jailbreak.py b/tests/unit/scenarios/test_jailbreak.py index 047334131..29d7408a5 100644 --- a/tests/unit/scenarios/test_jailbreak.py +++ b/tests/unit/scenarios/test_jailbreak.py @@ -18,6 +18,13 @@ @pytest.fixture def mock_random_n() -> int: + """Mock constant for n-many attempts per jailbreak.""" + return 2 + + +@pytest.fixture +def mock_random_k() -> int: + """Mock constant for k-many jailbreak templates to be used.""" return 3 @@ -43,7 +50,8 @@ def mock_memory_seed_groups() -> List[SeedGroup]: def mock_objective_target() -> PromptTarget: """Create a mock objective target for testing.""" mock = MagicMock(spec=PromptTarget) - 
mock.get_identifier.return_value = {"__type__": "MockObjectiveTarget", "__module__": "test"} + mock.get_identifier.return_value = { + "__type__": "MockObjectiveTarget", "__module__": "test"} return mock @@ 
-51,7 +59,8 @@ def mock_objective_target() -> PromptTarget: def mock_objective_scorer() -> TrueFalseInverterScorer: """Create a mock scorer for testing.""" mock = MagicMock(spec=TrueFalseInverterScorer) - mock.get_identifier.return_value = {"__type__": "MockObjectiveScorer", "__module__": "test"} + mock.get_identifier.return_value = { + "__type__": "MockObjectiveScorer", "__module__": "test"} return mock @@ -106,6 +115,23 @@ def test_init_with_custom_scorer(self, mock_objective_scorer, mock_memory_seed_g scenario = Jailbreak(objective_scorer=mock_objective_scorer) assert isinstance(scenario._scorer_config, AttackScoringConfig) + def test_init_with_k_jailbreaks(self, mock_memory_seed_groups, mock_random_k): + """Test initialization with k_jailbreaks provided.""" + with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): + scenario = Jailbreak(k_jailbreaks=mock_random_k) + assert scenario._k == mock_random_k + + def test_init_with_num_tries(self, mock_memory_seed_groups, mock_random_n): + """Test initialization with num_tries provided.""" + with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): + scenario = Jailbreak(num_tries=mock_random_n) + assert scenario._n == mock_random_n + + def test_init_raises_exception_when_both_k_and_which_jailbreaks(self, mock_random_k): + """Test that k_jailbreaks and which_jailbreaks cannot be combined.""" + with pytest.raises(ValueError): + Jailbreak(k_jailbreaks=mock_random_k, which_jailbreaks=["any_template.yaml"]) + @pytest.mark.asyncio async def test_init_raises_exception_when_no_datasets_available(self, mock_objective_target, mock_objective_scorer): """Test that initialization raises ValueError when datasets are not available in memory.""" @@ -146,7 +172,8 @@ async def test_attack_generation_for_pyrit( ) await scenario.initialize_async( - objective_target=mock_objective_target, scenario_strategies=[pyrit_jailbreak_strategy] + 
objective_target=mock_objective_target, scenario_strategies=[ + pyrit_jailbreak_strategy] ) atomic_attacks = await scenario._get_atomic_attacks_async() for run in atomic_attacks: @@ -198,13 +225,33 @@ async def test_get_all_jailbreak_templates( 
@pytest.mark.asyncio async def test_get_some_jailbreak_templates( - self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n + self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_k ): """Test that random jailbreak template selection works.""" with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): - scenario = Jailbreak(objective_scorer=mock_objective_scorer, n_jailbreaks=mock_random_n) + scenario = Jailbreak( + objective_scorer=mock_objective_scorer, k_jailbreaks=mock_random_k) await scenario.initialize_async(objective_target=mock_objective_target) - assert len(scenario._get_all_jailbreak_templates()) == 3 + assert len(scenario._get_all_jailbreak_templates() + ) == mock_random_k + + @pytest.mark.asyncio + async def test_custom_num_tries( + self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n + ): + """Test that num_tries successfully tries each jailbreak template n-many times.""" + with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups): + base_scenario = Jailbreak(objective_scorer=mock_objective_scorer) + await base_scenario.initialize_async(objective_target=mock_objective_target) + atomic_attacks_1 = await base_scenario._get_atomic_attacks_async() + + mult_scenario = Jailbreak( + objective_scorer=mock_objective_scorer, num_tries=mock_random_n) + await mult_scenario.initialize_async(objective_target=mock_objective_target) + atomic_attacks_n = await mult_scenario._get_atomic_attacks_async() + + assert len(atomic_attacks_1) * \ + mock_random_n == len(atomic_attacks_n) @pytest.mark.usefixtures(*FIXTURES)