Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 46 additions & 8 deletions pyrit/datasets/jailbreak/text_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def __init__(
# Track the template source for error reporting
self.template_source: str = "<unknown>"
# Count how many template sources are provided
template_sources = [template_path, template_file_name, string_template, random_template]
template_sources = [template_path, template_file_name,
string_template, random_template]
provided_sources = [source for source in template_sources if source]

if len(provided_sources) != 1:
Expand All @@ -57,26 +58,30 @@ def __init__(
# Get all yaml files in the jailbreak directory and its subdirectories
jailbreak_dir = JAILBREAK_TEMPLATES_PATH
# Get all yaml files but exclude those in multi_parameter subdirectory
yaml_files = [f for f in jailbreak_dir.rglob("*.yaml") if "multi_parameter" not in f.parts]
yaml_files = [f for f in jailbreak_dir.rglob(
"*.yaml") if "multi_parameter" not in f.parts]
if not yaml_files:
raise ValueError(
"No YAML templates found in jailbreak directory (excluding multi_parameter subdirectory)"
)

if template_file_name:
matching_files = [f for f in yaml_files if f.name == template_file_name]
matching_files = [
f for f in yaml_files if f.name == template_file_name]
if not matching_files:
raise ValueError(
f"Template file '{template_file_name}' not found in jailbreak directory or its subdirectories"
)
if len(matching_files) > 1:
raise ValueError(f"Multiple files named '{template_file_name}' found in jailbreak directory")
raise ValueError(
f"Multiple files named '{template_file_name}' found in jailbreak directory")
self.template = SeedPrompt.from_yaml_file(matching_files[0])
self.template_source = str(matching_files[0])
else:
while True:
random_template_path = random.choice(yaml_files)
self.template = SeedPrompt.from_yaml_file(random_template_path)
self.template = SeedPrompt.from_yaml_file(
random_template_path)

if self.template.parameters == ["prompt"]:
self.template_source = str(random_template_path)
Expand All @@ -86,10 +91,12 @@ def __init__(
break
except ValueError as e:
# Template has syntax errors - fail fast with clear error
raise ValueError(f"Invalid jailbreak template '{random_template_path}': {str(e)}") from e
raise ValueError(
f"Invalid jailbreak template '{random_template_path}': {str(e)}") from e

# Validate that all required parameters (except 'prompt') are provided in kwargs
required_params = [p for p in self.template.parameters if p != "prompt"]
required_params = [
p for p in self.template.parameters if p != "prompt"]
missing_params = [p for p in required_params if p not in kwargs]
if missing_params:
raise ValueError(
Expand All @@ -101,7 +108,38 @@ def __init__(
if kwargs:
kwargs.pop("prompt", None)
# Apply remaining kwargs to the template while preserving template variables
self.template.value = self.template.render_template_value_silent(**kwargs)
self.template.value = self.template.render_template_value_silent(
**kwargs)

@classmethod
def get_all_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]:
    """
    Retrieve jailbreak template file names from JAILBREAK_TEMPLATES_PATH.

    Only ``*.yaml`` files matched by a non-recursive glob of the directory are considered.

    Args:
        k (int, optional): Number of distinct jailbreak templates to pick at random.
            None (or 0) returns every template.

    Returns:
        List[str]: List of jailbreak template file names.

    Raises:
        ValueError: If no jailbreak templates are found in the jailbreak directory.
        ValueError: If k is negative or larger than the number of templates that exist.
    """
    jailbreak_template_names = [f"{f.stem}.yaml" for f in JAILBREAK_TEMPLATES_PATH.glob("*.yaml")]
    if not jailbreak_template_names:
        raise ValueError("No jailbreak templates found in the jailbreak directory")

    if k:
        if k < 0:
            raise ValueError(f"Cannot pull a negative number of jailbreaks (got {k})!")
        if k > len(jailbreak_template_names):
            raise ValueError(
                f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
            )
        # Sample WITHOUT replacement: random.choices could return the same template
        # several times, silently yielding fewer than k distinct jailbreaks.
        jailbreak_template_names = random.sample(jailbreak_template_names, k)
    return jailbreak_template_names

@classmethod
def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
Expand Down
81 changes: 68 additions & 13 deletions pyrit/scenario/scenarios/airt/jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
TrueFalseScorer,
)

"""
TODO REMOVE
Featurelist
- [ ] Enhanced JailbreakStrategy
- [X] n tries per jailbreak
- [ ] Choose subset of jailbreaks explicitly
"""


class JailbreakStrategy(ScenarioStrategy):
"""
Expand Down Expand Up @@ -93,7 +101,9 @@ def __init__(
objective_scorer: Optional[TrueFalseScorer] = None,
include_baseline: bool = False,
scenario_result_id: Optional[str] = None,
n_jailbreaks: Optional[int] = 3,
k_jailbreaks: Optional[int] = None,
which_jailbreaks: Optional[List[str]] = None,
num_tries: int = 1,
) -> None:
"""
Initialize the jailbreak scenario.
Expand All @@ -104,13 +114,30 @@ def __init__(
include_baseline (bool): Whether to include a baseline atomic attack that sends all
objectives without modifications. Defaults to False.
scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
k_jailbreaks (Optional[int]): Choose k random jailbreaks rather than using all of them.
num_tries (int): Number of times to try each jailbreak. Defaults to 1.
which_jailbreaks (Optional[List[str]]): Dedicated list of jailbreaks to run.

Raises:
ValueError: If both which_jailbreaks and k_jailbreaks are provided, as random selection
is incompatible with a predetermined list.

"""
if which_jailbreaks and k_jailbreaks:
raise ValueError(
"Please provide only one of `k_jailbreaks` (random selection) or `which_jailbreaks` (specific selection)."
)

if not objective_scorer:
objective_scorer = self._get_default_objective_scorer()
self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)
self._scorer_config = AttackScoringConfig(
objective_scorer=objective_scorer)

self._k = k_jailbreaks
self._n = num_tries

self._n = n_jailbreaks
self._validate_jailbreaks_subset(which_jailbreaks)
self._which_jailbreaks = which_jailbreaks

super().__init__(
name="Jailbreak",
Expand Down Expand Up @@ -138,9 +165,12 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
refusal_scorer = TrueFalseInverterScorer(
scorer=SelfAskRefusalScorer(
chat_target=OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
endpoint=os.environ.get(
"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get(
"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get(
"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
)
)
)
Expand Down Expand Up @@ -168,10 +198,10 @@ def _get_all_jailbreak_templates(self) -> List[str]:
Returns:
List[str]: List of jailbreak template file names.
"""
if not self._n:
if not self._k:
return TextJailBreak.get_all_jailbreak_templates()
else:
return TextJailBreak.get_all_jailbreak_templates(n=self._n)
return TextJailBreak.get_all_jailbreak_templates(k=self._k)

async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
"""
Expand All @@ -188,12 +218,14 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na

# Create the jailbreak converter
jailbreak_converter = TextJailbreakConverter(
jailbreak_template=TextJailBreak(template_file_name=jailbreak_template_name)
jailbreak_template=TextJailBreak(
template_file_name=jailbreak_template_name)
)

# Create converter configuration
converter_config = AttackConverterConfig(
request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
request_converters=PromptConverterConfiguration.from_converters(
converters=[jailbreak_converter])
)

# Create the attack
Expand All @@ -218,6 +250,9 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:

Returns:
List[AtomicAttack]: List of atomic attacks to execute, one per jailbreak template.

Raises:
ValueError: If self._which_jailbreaks is not a subset of all jailbreak templates.
"""
atomic_attacks: List[AtomicAttack] = []

Expand All @@ -227,8 +262,28 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
# Get all jailbreak template names
jailbreak_template_names = self._get_all_jailbreak_templates()

if self._which_jailbreaks:
jailbreak_template_names = list(
set(jailbreak_template_names) & set(self._which_jailbreaks))
if not jailbreak_template_names:
raise ValueError(
f"Error: could not find templates `{jailbreak_template_names}`!")

for template_name in jailbreak_template_names:
atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
atomic_attacks.append(atomic_attack)
for _ in range(0, self._n):
atomic_attack = await self._get_atomic_attack_from_jailbreak_async(
jailbreak_template_name=template_name
)
atomic_attacks.append(atomic_attack)

return atomic_attacks

def _validate_jailbreak_subset(self, jailbreaks: List[str]):
    """
    Placeholder for validating a caller-supplied list of jailbreaks.

    Not implemented yet: every call raises NotImplementedError.

    NOTE(review): __init__ invokes `_validate_jailbreaks_subset` (plural "jailbreaks"),
    which does not match this method's name - confirm which spelling is intended.

    Args:
        jailbreaks (List[str]): Jailbreaks to validate; presumably template file
            names - TODO confirm against the caller.

    Raises:
        NotImplementedError: Always, until the validation logic is written.
    """
    raise NotImplementedError
57 changes: 51 additions & 6 deletions tests/unit/scenarios/test_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@

@pytest.fixture
def mock_random_n() -> int:
    """Fixed value (2) that tests pass as ``num_tries``, i.e. attempts per jailbreak."""
    return 2


@pytest.fixture
def mock_random_k() -> int:
    """Fixed value (3) that tests pass as ``k_jailbreaks``, i.e. how many templates to select."""
    return 3


Expand All @@ -43,15 +50,17 @@ def mock_memory_seed_groups() -> List[SeedGroup]:
def mock_objective_target() -> PromptTarget:
    """Create a mock objective target for testing.

    Returns:
        PromptTarget: A MagicMock constrained to the PromptTarget spec whose
        ``get_identifier`` returns a fixed identifier dict.
    """
    mock = MagicMock(spec=PromptTarget)
    # Single assignment: the identifier was previously set twice with the same value.
    mock.get_identifier.return_value = {"__type__": "MockObjectiveTarget", "__module__": "test"}
    return mock


@pytest.fixture
def mock_objective_scorer() -> TrueFalseInverterScorer:
    """Create a mock scorer for testing.

    Returns:
        TrueFalseInverterScorer: A MagicMock constrained to the TrueFalseInverterScorer
        spec whose ``get_identifier`` returns a fixed identifier dict.
    """
    mock = MagicMock(spec=TrueFalseInverterScorer)
    # Single assignment: the identifier was previously set twice with the same value.
    mock.get_identifier.return_value = {"__type__": "MockObjectiveScorer", "__module__": "test"}
    return mock


Expand Down Expand Up @@ -106,6 +115,21 @@ def test_init_with_custom_scorer(self, mock_objective_scorer, mock_memory_seed_g
scenario = Jailbreak(objective_scorer=mock_objective_scorer)
assert isinstance(scenario._scorer_config, AttackScoringConfig)

def test_init_with_k_jailbreaks(self, mock_memory_seed_groups, mock_random_k):
    """Test initialization with k_jailbreaks provided."""
    # mock_memory_seed_groups must be requested as a fixture; otherwise the bare name
    # refers to the fixture function itself rather than the seed groups it produces.
    with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
        scenario = Jailbreak(k_jailbreaks=mock_random_k)
        assert scenario._k == mock_random_k

def test_init_with_num_tries(self, mock_memory_seed_groups, mock_random_n):
    """Test initialization with num_tries provided."""
    # mock_memory_seed_groups must be requested as a fixture; otherwise the bare name
    # refers to the fixture function itself rather than the seed groups it produces.
    with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
        scenario = Jailbreak(num_tries=mock_random_n)
        assert scenario._n == mock_random_n

def test_init_raises_exception_when_both_k_and_which_jailbreaks(self, mock_random_k):
    """Test that passing both k_jailbreaks and which_jailbreaks raises ValueError."""
    # Both arguments must be truthy to trigger the guard, hence the non-empty list.
    with pytest.raises(ValueError, match="Please provide only one of"):
        Jailbreak(k_jailbreaks=mock_random_k, which_jailbreaks=["example.yaml"])

@pytest.mark.asyncio
async def test_init_raises_exception_when_no_datasets_available(self, mock_objective_target, mock_objective_scorer):
"""Test that initialization raises ValueError when datasets are not available in memory."""
Expand Down Expand Up @@ -146,7 +170,8 @@ async def test_attack_generation_for_pyrit(
)

await scenario.initialize_async(
objective_target=mock_objective_target, scenario_strategies=[pyrit_jailbreak_strategy]
objective_target=mock_objective_target, scenario_strategies=[
pyrit_jailbreak_strategy]
)
atomic_attacks = await scenario._get_atomic_attacks_async()
for run in atomic_attacks:
Expand Down Expand Up @@ -198,13 +223,33 @@ async def test_get_all_jailbreak_templates(

@pytest.mark.asyncio
async def test_get_some_jailbreak_templates(
    self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_k
):
    """Test that random jailbreak template selection returns exactly k templates."""
    with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
        # The constructor argument is k_jailbreaks; the former n_jailbreaks name
        # (and the mock_random_n fixture) no longer apply to template selection.
        scenario = Jailbreak(objective_scorer=mock_objective_scorer, k_jailbreaks=mock_random_k)
        await scenario.initialize_async(objective_target=mock_objective_target)
        assert len(scenario._get_all_jailbreak_templates()) == mock_random_k

@pytest.mark.asyncio
async def test_custom_num_tries(
    self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n
):
    """Check that num_tries scales the number of atomic attacks by that factor."""
    seed_groups_patch = patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups)
    with seed_groups_patch:
        # Reference run: default number of tries per template.
        single_try = Jailbreak(objective_scorer=mock_objective_scorer)
        await single_try.initialize_async(objective_target=mock_objective_target)
        baseline_attacks = await single_try._get_atomic_attacks_async()

        # Same scenario, but each template is attempted mock_random_n times.
        repeated = Jailbreak(objective_scorer=mock_objective_scorer, num_tries=mock_random_n)
        await repeated.initialize_async(objective_target=mock_objective_target)
        repeated_attacks = await repeated._get_atomic_attacks_async()

        expected_total = len(baseline_attacks) * mock_random_n
        assert expected_total == len(repeated_attacks)


@pytest.mark.usefixtures(*FIXTURES)
Expand Down
Loading