diff --git a/pyrit/datasets/jailbreak/text_jailbreak.py b/pyrit/datasets/jailbreak/text_jailbreak.py
index 737c0c938..bb6918785 100644
--- a/pyrit/datasets/jailbreak/text_jailbreak.py
+++ b/pyrit/datasets/jailbreak/text_jailbreak.py
@@ -104,12 +104,12 @@ def __init__(
             self.template.value = self.template.render_template_value_silent(**kwargs)
 
     @classmethod
-    def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
+    def get_all_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]:
         """
         Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH.
 
         Args:
-            n (int, optional): Number of jailbreak templates to return. None to get all.
+            k (int, optional): Number of distinct jailbreak templates to return. None to get all.
 
         Returns:
             List[str]: List of jailbreak template file names.
@@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
         if not jailbreak_template_names:
             raise ValueError("No jailbreak templates found in the jailbreak directory")
 
-        if n:
-            if n > len(jailbreak_template_names):
+        if k:
+            if k > len(jailbreak_template_names):
                 raise ValueError(
-                    f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
+                    f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
                 )
-            jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
+            jailbreak_template_names = random.sample(jailbreak_template_names, k=k)
         return jailbreak_template_names
 
     def get_jailbreak_system_prompt(self) -> str:
diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py
index e28676db7..167844af2 100644
--- a/pyrit/scenario/scenarios/airt/jailbreak.py
+++ b/pyrit/scenario/scenarios/airt/jailbreak.py
@@ -8,9 +8,13 @@
 from pyrit.common import apply_defaults
 from pyrit.datasets import TextJailBreak
 from pyrit.executor.attack.core.attack_config import (
+    AttackAdversarialConfig,
     AttackConverterConfig,
     AttackScoringConfig,
 )
+from pyrit.executor.attack.multi_turn.crescendo import CrescendoAttack
+from pyrit.executor.attack.multi_turn.red_teaming import RedTeamingAttack
+from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
 from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
 from pyrit.models import SeedAttackGroup
 from pyrit.prompt_converter import TextJailbreakConverter
@@ -19,9 +23,7 @@
 from pyrit.scenario.core.atomic_attack import AtomicAttack
 from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
 from pyrit.scenario.core.scenario import Scenario
-from pyrit.scenario.core.scenario_strategy import (
-    ScenarioStrategy,
-)
+from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
 from pyrit.score import (
     SelfAskRefusalScorer,
     TrueFalseInverterScorer,
@@ -31,13 +33,30 @@
 
 class JailbreakStrategy(ScenarioStrategy):
     """
-    Strategy for single-turn jailbreak attacks.
-
-    There is currently only one, running all jailbreaks.
+    Strategy for jailbreak attacks.
     """
 
+    # Aggregate members (special markers that expand to strategies with matching tags)
     ALL = ("all", {"all"})
-    PYRIT = ("pyrit", {"pyrit"})
+    SINGLE_TURN = ("single_turn", {"single_turn"})
+    MULTI_TURN = ("multi_turn", {"multi_turn"})
+
+    # Strategies for tweaking jailbreak efficacy through attack patterns
+    ManyShot = ("many_shot", {"single_turn"})
+    PromptSending = ("prompt_sending", {"single_turn"})
+    Crescendo = ("crescendo", {"multi_turn"})
+    RedTeaming = ("red_teaming", {"multi_turn"})
+
+    @classmethod
+    def get_aggregate_tags(cls) -> set[str]:
+        """
+        Get the set of tags that represent aggregate categories.
+
+        Returns:
+            set[str]: Set of tags that are aggregate markers.
+        """
+        # Include base class aggregates ("all") and add scenario-specific ones
+        return super().get_aggregate_tags() | {"single_turn", "multi_turn"}
 
 
 class Jailbreak(Scenario):
@@ -93,7 +112,9 @@ def __init__(
         objective_scorer: Optional[TrueFalseScorer] = None,
         include_baseline: bool = False,
         scenario_result_id: Optional[str] = None,
-        n_jailbreaks: Optional[int] = 3,
+        k: Optional[int] = None,
+        n: int = 1,
+        jailbreaks: Optional[List[str]] = None,
     ) -> None:
         """
         Initialize the jailbreak scenario.
@@ -104,13 +125,30 @@ def __init__(
             include_baseline (bool): Whether to include a baseline atomic attack that sends all objectives
                 without modifications. Defaults to True.
             scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
-            n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
+            k (Optional[int]): Choose k random jailbreaks rather than using all of them.
+            n (int): Number of times to try each jailbreak.
+            jailbreaks (Optional[List[str]]): Dedicated list of jailbreak template names to run.
+
+        Raises:
+            ValueError: If both jailbreaks and k are provided, as random selection
+                is incompatible with a predetermined list.
+
         """
+        if jailbreaks and k:
+            raise ValueError("Please provide only one of `k` (random selection) or `jailbreaks` (specific selection).")
+
         if not objective_scorer:
             objective_scorer = self._get_default_objective_scorer()
         self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)
 
-        self._n = n_jailbreaks
+        self._k = k
+        self._n = n
+
+        if jailbreaks:
+            self._validate_jailbreaks_subset(jailbreaks)
+            self._jailbreaks = jailbreaks
+        else:
+            self._jailbreaks = TextJailBreak.get_all_jailbreak_templates(k=k)
 
         super().__init__(
             name="Jailbreak",
@@ -124,6 +162,21 @@ def __init__(
         # Will be resolved in _get_atomic_attacks_async
         self._seed_groups: Optional[List[SeedAttackGroup]] = None
 
+    def _validate_jailbreaks_subset(self, jailbreaks: List[str]) -> None:
+        """
+        Validate that the provided jailbreaks exist before moving on with initialization.
+
+        Args:
+            jailbreaks (List[str]): List of jailbreak names.
+
+        Raises:
+            ValueError: If jailbreaks not discovered.
+        """
+        all_templates = TextJailBreak.get_all_jailbreak_templates()
+        diff = set(jailbreaks) - set(all_templates)
+        if len(diff) > 0:
+            raise ValueError(f"Error: could not find templates `{diff}`!")
+
     def _get_default_objective_scorer(self) -> TrueFalseScorer:
         """
         Retrieve the default objective scorer.
@@ -146,6 +199,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
         )
         return refusal_scorer
 
+    def _get_default_adversarial_target(self) -> OpenAIChatTarget:
+        """
+        Create and retrieve the default adversarial target.
+
+        Returns:
+            OpenAIChatTarget: Default adversarial target using an unfiltered endpoint.
+        """
+        return OpenAIChatTarget(
+            endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
+            api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
+            model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
+            temperature=1.2,
+        )
+
     def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
         """
         Resolve seed groups from dataset configuration.
@@ -168,20 +235,26 @@ def _get_all_jailbreak_templates(self) -> List[str]:
         Returns:
             List[str]: List of jailbreak template file names.
         """
-        if not self._n:
+        if not self._k:
             return TextJailBreak.get_all_jailbreak_templates()
         else:
-            return TextJailBreak.get_all_jailbreak_templates(n=self._n)
+            return TextJailBreak.get_all_jailbreak_templates(k=self._k)
 
-    async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
+    async def _get_atomic_attack_from_strategy_async(
+        self, *, strategy: str, jailbreak_template_name: str
+    ) -> AtomicAttack:
         """
         Create an atomic attack for a specific jailbreak template.
 
         Args:
+            strategy (str): Value of the JailbreakStrategy to use, e.g. "crescendo".
             jailbreak_template_name (str): Name of the jailbreak template file.
 
         Returns:
             AtomicAttack: An atomic attack using the specified jailbreak template.
+
+        Raises:
+            ValueError: If an invalid strategy is provided.
         """
         # objective_target is guaranteed to be non-None by parent class validation
         assert self._objective_target is not None
@@ -196,12 +269,29 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
             request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
         )
 
-        # Create the attack
-        attack = PromptSendingAttack(
-            objective_target=self._objective_target,
-            attack_scoring_config=self._scorer_config,
-            attack_converter_config=converter_config,
-        )
+        attack = None
+        args = {
+            "objective_target": self._objective_target,
+            "attack_scoring_config": self._scorer_config,
+            "attack_converter_config": converter_config,
+        }
+        adversarial_config = AttackAdversarialConfig(target=self._get_default_adversarial_target())
+        match strategy:
+            case "many_shot":
+                attack = ManyShotJailbreakAttack(**args)
+            case "prompt_sending":
+                attack = PromptSendingAttack(**args)
+            case "crescendo":
+                args["attack_adversarial_config"] = adversarial_config
+                attack = CrescendoAttack(**args)
+            case "red_teaming":
+                args["attack_adversarial_config"] = adversarial_config
+                attack = RedTeamingAttack(**args)
+            case _:
+                raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")
+
+        if not attack:
+            raise ValueError("Attack cannot be None!")
 
         # Extract template name without extension for the atomic attack name
         template_name = Path(jailbreak_template_name).stem
@@ -218,17 +308,24 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
 
         Returns:
             List[AtomicAttack]: List of atomic attacks to execute, one per jailbreak template.
+
+        Raises:
+            ValueError: If a selected strategy value has no matching attack implementation.
         """
         atomic_attacks: List[AtomicAttack] = []
 
         # Retrieve seed prompts based on selected strategies
         self._seed_groups = self._resolve_seed_groups()
 
-        # Get all jailbreak template names
-        jailbreak_template_names = self._get_all_jailbreak_templates()
+        strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
+            composites=self._scenario_composites, strategy_type=JailbreakStrategy
+        )
 
-        for template_name in jailbreak_template_names:
-            atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
-            atomic_attacks.append(atomic_attack)
+        for strategy in strategies:
+            for template_name in self._jailbreaks:
+                atomic_attack = await self._get_atomic_attack_from_strategy_async(
+                    strategy=strategy, jailbreak_template_name=template_name
+                )
+                atomic_attacks.extend([atomic_attack] * self._n)
 
         return atomic_attacks
diff --git a/tests/unit/scenarios/test_jailbreak.py b/tests/unit/scenarios/test_jailbreak.py
index 047334131..4569e8039 100644
--- a/tests/unit/scenarios/test_jailbreak.py
+++ b/tests/unit/scenarios/test_jailbreak.py
@@ -9,6 +9,9 @@
 import pytest
 
 from pyrit.executor.attack.core.attack_config import AttackScoringConfig
+from pyrit.executor.attack.multi_turn.crescendo import CrescendoAttack
+from pyrit.executor.attack.multi_turn.red_teaming import RedTeamingAttack
+from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
 from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
 from pyrit.models import SeedGroup, SeedObjective
 from pyrit.prompt_target import PromptTarget
@@ -16,8 +19,21 @@
 from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer
 
 
+@pytest.fixture
+def mock_jailbreaks() -> List[str]:
+    """Mock constant for jailbreak subset."""
+    return ["aim", "dan_1", "tuo"]
+
+
 @pytest.fixture
 def mock_random_n() -> int:
+    """Mock constant for n-many attempts per jailbreak."""
+    return 2
+
+
+@pytest.fixture
+def mock_random_k() -> int:
+    """Mock constant for k-many jailbreak templates to be used."""
     return 3
 
 
@@ -61,8 +77,33 @@ def all_jailbreak_strategy() -> JailbreakStrategy:
 
 
 @pytest.fixture
-def pyrit_jailbreak_strategy() -> JailbreakStrategy:
-    return JailbreakStrategy.PYRIT
+def singleturn_jailbreak_strategy() -> JailbreakStrategy:
+    return JailbreakStrategy.SINGLE_TURN
+
+
+@pytest.fixture
+def multiturn_jailbreak_strategy() -> JailbreakStrategy:
+    return JailbreakStrategy.MULTI_TURN
+
+
+@pytest.fixture
+def manyshot_jailbreak_strategy() -> JailbreakStrategy:
+    return JailbreakStrategy.ManyShot
+
+
+@pytest.fixture
+def promptsending_jailbreak_strategy() -> JailbreakStrategy:
+    return JailbreakStrategy.PromptSending
+
+
+@pytest.fixture
+def crescendo_jailbreak_strategy() -> JailbreakStrategy:
+    return JailbreakStrategy.Crescendo
+
+
+@pytest.fixture
+def redteaming_jailbreak_strategy() -> JailbreakStrategy:
+    return JailbreakStrategy.RedTeaming
 
 
 @pytest.fixture
@@ -106,6 +147,26 @@ def test_init_with_custom_scorer(self, mock_objective_scorer, mock_memory_seed_g
             scenario = Jailbreak(objective_scorer=mock_objective_scorer)
             assert isinstance(scenario._scorer_config, AttackScoringConfig)
 
+    def test_init_with_k_jailbreaks(self, mock_random_k, mock_memory_seed_groups):
+        """Test initialization with k provided."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(k=mock_random_k)
+            assert scenario._k == mock_random_k
+
+    def test_init_with_num_tries(self, mock_random_n, mock_memory_seed_groups):
+        """Test initialization with n provided."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(n=mock_random_n)
+            assert scenario._n == mock_random_n
+
+    def test_init_raises_exception_when_both_k_and_which_jailbreaks(self, mock_random_k, mock_jailbreaks):
+        """Test failure on providing mutually exclusive arguments."""
+
+        with pytest.raises(
+            ValueError, match=r"Please provide only one of `k` \(random selection\) or `jailbreaks`"
+        ):
+            Jailbreak(k=mock_random_k, jailbreaks=mock_jailbreaks)
+
     @pytest.mark.asyncio
     async def test_init_raises_exception_when_no_datasets_available(self, mock_objective_target, mock_objective_scorer):
         """Test that initialization raises ValueError when datasets are not available in memory."""
@@ -136,22 +197,95 @@ async def test_attack_generation_for_all(
             assert all(hasattr(run, "_attack") for run in atomic_attacks)
 
     @pytest.mark.asyncio
-    async def test_attack_generation_for_pyrit(
-        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, pyrit_jailbreak_strategy
+    async def test_attack_generation_for_singleturn(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, singleturn_jailbreak_strategy
     ):
         """Test that the single turn attack generation works."""
         with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
-            scenario = Jailbreak(
-                objective_scorer=mock_objective_scorer,
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[singleturn_jailbreak_strategy]
             )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, (PromptSendingAttack, ManyShotJailbreakAttack))
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_multiturn(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, multiturn_jailbreak_strategy
+    ):
+        """Test that the multi turn attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
 
             await scenario.initialize_async(
-                objective_target=mock_objective_target, scenario_strategies=[pyrit_jailbreak_strategy]
+                objective_target=mock_objective_target, scenario_strategies=[multiturn_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, (CrescendoAttack, RedTeamingAttack))
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_manyshot(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, manyshot_jailbreak_strategy
+    ):
+        """Test that the manyshot attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[manyshot_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, ManyShotJailbreakAttack)
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_promptsending(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, promptsending_jailbreak_strategy
+    ):
+        """Test that the prompt sending attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[promptsending_jailbreak_strategy]
             )
             atomic_attacks = await scenario._get_atomic_attacks_async()
             for run in atomic_attacks:
                 assert isinstance(run._attack, PromptSendingAttack)
 
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_crescendo(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, crescendo_jailbreak_strategy
+    ):
+        """Test that the crescendo attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[crescendo_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, CrescendoAttack)
+
+    @pytest.mark.asyncio
+    async def test_attack_generation_for_redteaming(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, redteaming_jailbreak_strategy
+    ):
+        """Test that the red teaming attack generation works."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+
+            await scenario.initialize_async(
+                objective_target=mock_objective_target, scenario_strategies=[redteaming_jailbreak_strategy]
+            )
+            atomic_attacks = await scenario._get_atomic_attacks_async()
+            for run in atomic_attacks:
+                assert isinstance(run._attack, RedTeamingAttack)
+
     @pytest.mark.asyncio
     async def test_attack_runs_include_objectives(
         self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups
@@ -198,13 +332,29 @@ async def test_get_all_jailbreak_templates(
 
     @pytest.mark.asyncio
     async def test_get_some_jailbreak_templates(
-        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_k
     ):
         """Test that random jailbreak template selection works."""
         with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
-            scenario = Jailbreak(objective_scorer=mock_objective_scorer, n_jailbreaks=mock_random_n)
+            scenario = Jailbreak(objective_scorer=mock_objective_scorer, k=mock_random_k)
             await scenario.initialize_async(objective_target=mock_objective_target)
-            assert len(scenario._get_all_jailbreak_templates()) == 3
+            assert len(scenario._get_all_jailbreak_templates()) == mock_random_k
+
+    @pytest.mark.asyncio
+    async def test_custom_num_tries(
+        self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_random_n
+    ):
+        """Test that n successfully tries each jailbreak template n-many times."""
+        with patch.object(Jailbreak, "_resolve_seed_groups", return_value=mock_memory_seed_groups):
+            base_scenario = Jailbreak(objective_scorer=mock_objective_scorer)
+            await base_scenario.initialize_async(objective_target=mock_objective_target)
+            atomic_attacks_1 = await base_scenario._get_atomic_attacks_async()
+
+            mult_scenario = Jailbreak(objective_scorer=mock_objective_scorer, n=mock_random_n)
+            await mult_scenario.initialize_async(objective_target=mock_objective_target)
+            atomic_attacks_n = await mult_scenario._get_atomic_attacks_async()
+
+            assert len(atomic_attacks_1) * mock_random_n == len(atomic_attacks_n)
 
 
 @pytest.mark.usefixtures(*FIXTURES)