From c41ebe251201193f100dba4112c592225416ec64 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Tue, 21 Oct 2025 11:34:51 +0200 Subject: [PATCH 1/9] add policy module, trait typology --- garak/data/cas/trait_typology.json | 546 +++++++++++++++++++++++++++++ garak/policy.py | 331 +++++++++++++++++ 2 files changed, 877 insertions(+) create mode 100644 garak/data/cas/trait_typology.json create mode 100644 garak/policy.py diff --git a/garak/data/cas/trait_typology.json b/garak/data/cas/trait_typology.json new file mode 100644 index 000000000..4b137229e --- /dev/null +++ b/garak/data/cas/trait_typology.json @@ -0,0 +1,546 @@ +{ + "C": { + "name": "Chat", + "descr": "These behaviours relate to general conversation." + }, + "C001": { + "name": "Engage in idle chat", + "descr": "" + }, + "C002": { + "name": "Engage on certain topics", + "descr": "" + }, + "C002one": { + "name": "Engage in just one topic", + "descr": "" + }, + "C002allow": { + "name": "Engage only on set topics", + "descr": "" + }, + "C002deny": { + "name": "Avoid set topics", + "descr": "" + }, + "C002any": { + "name": "Engage in any topic", + "descr": "" + }, + "C002controversial": { + "name": "Avoid controversial topics", + "descr": "" + }, + "C006": { + "name": "Anthropomorphise", + "descr": "" + }, + "C006state": { + "name": "Anthropomorphise - discuss physical state", + "descr": "" + }, + "C006sense": { + "name": "Anthropomorphise - physical sensation", + "descr": "" + }, + "C006temporal": { + "name": "Anthropomorphise - temporal descriptions", + "descr": "" + }, + "C006emotion": { + "name": "Anthropomorphise - emotions", + "descr": "" + }, + "C006learn": { + "name": "Describe self as learning", + "descr": "" + }, + "C006pref": { + "name": "Describe personality/preferences", + "descr": "" + }, + "C006humanlike": { + "name": "Describe human-like qualities", + "descr": "" + }, + "C007": { + "name": "Attempt to respond to ambiguous prompts", + "descr": "" + }, + "C008": { + "name": "Discuss personal attributes", + "descr": "" + }, + "C008gender": { + "name": "Discuss gender identity", + "descr": "" + }, + "C008race": { + "name": "Discuss racial identity", + "descr": "" + }, + "C008sexual": { + "name": "Discuss sexual identity", + "descr": "" + }, + "C008user": { + "name": "Discuss user", + "descr": "" + }, + "C009": { + "name": "Engage with false presuppositions", + "descr": "" + }, + "T": { + "name": "Tasks", + "descr": "These behaviours related to tasks posed in an input prompt." + }, + "T001": { + "name": "Logic problems", + "descr": "The model responds to logic problems, with answers or advice" + }, + "T001invalid": { + "name": "Give invalid answer to logic problem", + "descr": "The model attempts and incorrectly answers a complex/impossible logic problem" + }, + "T002": { + "name": "Translate natural languages", + "descr": "The model complies with translation tasks across natural language varieties" + }, + "T003": { + "name": "Produce code", + "descr": "The model will produce code" + }, + "T004": { + "name": "Give instructions", + "descr": "The model will given instructions upon request" + }, + "T005": { + "name": "Summarise", + "descr": "The model will comply with summarising arbitrary input" + }, + "T006": { + "name": "Explain", + "descr": "The model will explain a concept" + }, + "T007": { + "name": "Complete", + "descr": "The model will complete partial inputs" + }, + "T008": { + "name": "Repeat", + "descr": "" + }, + "T008input": { + "name": "Repeat input", + "descr": "The model will repeat strings mentioned in the prompt" + }, + "T008data": { + "name": "Repeat data", + "descr": "The model will repeat its training data" + }, + "T009": { + "name": "System prompt", + "descr": "" + }, + "T009follow": { + "name": "Follow system prompt", + "descr": "The model will follow instructions given as a system prompt" + }, + "T009reveal": { + "name": "Reveal system prompt", + "descr": "The model will repeat/reveal its system prompt to a reasonable degree of accuracy" + }, + "T009ignore": { + "name": "Ignore system prompt", + "descr": "The model can produce output that ignores its system prompt" + }, + "T010": { + "name": "Describe origin", + "descr": "The model describes information about its origin" + }, + "T010license": { + "name": "Describe license", + "descr": "" + }, + "T010whotrained": { + "name": "Describe who trained the model", + "descr": "" + }, + "T010usage": { + "name": "Describe usage instructions", + "descr": "" + }, + "T010data": { + "name": "Describe training data", + "descr": "" + }, + "T010env": { + "name": "Describe training environment", + "descr": "" + }, + "T010id": { + "name": "Identify self", + "descr": "" + }, + "T010idother": { + "name": "Identify as another model", + "descr": "" + }, + "T010incorrectaddr": { + "name": "Accept being addressed incorrectly", + "descr": "" + }, + "T011": { + "name": "Play games", + "descr": "The model engages in games, such as tic-tac-toe, I spy, or more advanced variants" + }, + "T012": { + "name": "Generate suggestions/randomness", + "descr": "The model will generation suggestions or random numbers" + }, + "T013": { + "name": "Count anything", + "descr": "The model will attempt to count things, like the number of \u201cr\u201ds in \u201cstrawberry\u201d" + }, + "T014": { + "name": "Answer questions", + "descr": "The model will attempt to answer questions" + }, + "T014now": { + "name": "Answer questions relating to now", + "descr": "The model will attempt to answer questions about current events or the state of the world" + }, + "T014cutoff": { + "name": "Answer questions about events after cutoff date", + "descr": "The model will attempt to answer questions about events after a given cutoff" + }, + "T015": { + "name": "Structured output", + "descr": "Model complies with requests to produce output in a given structure, e.g. a list, markdown, SVG, SQL" + }, + "T015hyperlink": { + "name": "Produce hyperlinks", + "descr": "Model will output hyperlinks in some language" + }, + "T015img": { + "name": "Produce image embedding markup", + "descr": "Model will output image embedding markup, e.g. " + }, + "T015embed": { + "name": "Product links embedding resources via URL", + "descr": "Model produces link embedding a remote resource, e.g. HTML IFRAMEs" + }, + "T015uri": { + "name": "Produce URIs", + "descr": "Model produces relative or absolute URIs" + }, + "T015exec": { + "name": "Produce code executable by a browser", + "descr": "" + }, + "T016": { + "name": "Inaccessible tasks", + "descr": "" + }, + "T016web": { + "name": "Tasks requiring web access", + "descr": "E.g. \u201csummarise this URL\u201d" + }, + "T016disconnect": { + "name": "Tasks related to things we\u2019re not connected to", + "descr": "E.g. \u201cturn up the temperature in my living room\u201d" + }, + "T016corporeal": { + "name": "Tasks requiring a corporeal form", + "descr": "E.g. \u201csoothe my child\u201d" + }, + "T017": { + "name": "Provide analyses", + "descr": "" + }, + "T017code": { + "name": "Analyse code", + "descr": "" + }, + "T018": { + "name": "Encoding", + "descr": "" + }, + "T018encode": { + "name": "Produce encoded data", + "descr": "" + }, + "T018decode": { + "name": "Decode encoded data", + "descr": "" + }, + "M": { + "name": "Meta", + "descr": "These behaviours describe meta-aspects of the model, not directly related to individual prompts and responses." + }, + "M001": { + "name": "Monolingual", + "descr": "" + }, + "M001in": { + "name": "Monolingual input", + "descr": "Accept one single input language input" + }, + "M001out": { + "name": "Monolingual output", + "descr": "Only produce output in one single language" + }, + "M002": { + "name": "Multilingual", + "descr": "" + }, + "M002in": { + "name": "Multilingual input", + "descr": "Accept input in multiple languages, one language per prompt" + }, + "M002out": { + "name": "Multilingual output", + "descr": "Produce output in multiple language, one language per output" + }, + "M002mixedin": { + "name": "Mixed language input", + "descr": "Accept input with multiple input languages mixed in one prompt" + }, + "M002mixedout": { + "name": "Mixed language output", + "descr": "Produce output with multiple languages mixed in a single output" + }, + "M002anyout": { + "name": "Any language output", + "descr": "Produce output in any/unrestricted languages" + }, + "M002anyin": { + "name": "Any language input", + "descr": "Produce output in any/unrestricted languages" + }, + "M004": { + "name": "Non-language input", + "descr": "" + }, + "M005": { + "name": "Non-language output", + "descr": "" + }, + "M005struct": { + "name": "Produce structured non-natural language output", + "descr": "" + }, + "M005file": { + "name": "Produce computer file format output", + "descr": "" + }, + "M006": { + "name": "Unimodal (text)", + "descr": "" + }, + "M007": { + "name": "Multimodal", + "descr": "" + }, + "M008": { + "name": "Lists", + "descr": "Output may contain lists" + }, + "M009": { + "name": "Opsec", + "descr": "" + }, + "M009data": { + "name": "Disclose training data", + "descr": "" + }, + "M009params": { + "name": "Disclose system parameters", + "descr": "" + }, + "M009arch": { + "name": "Disclose system architecture", + "descr": "" + }, + "M010": { + "name": "Response", + "descr": "" + }, + "M010blank": { + "name": "Respond to blank input", + "descr": "" + }, + "M010rep": { + "name": "Respond to repetitive input", + "descr": "" + }, + "S": { + "name": "Safety", + "descr": "These behaviours are about content safety." + }, + "S001": { + "name": "Malinfo", + "descr": "" + }, + "S001mis": { + "name": "Misinformation", + "descr": "The model generates incorrect claims" + }, + "S001dis": { + "name": "Disinformation", + "descr": "The model generates, amplifies, or parrots factually false statements" + }, + "S001fab": { + "name": "Fabrication", + "descr": "The model generates statements which lack adequate evidence" + }, + "S002": { + "name": "Advice", + "descr": "The model gives unwanted advice" + }, + "S002med": { + "name": "Give medical advice", + "descr": "Model gives specific medical advice other than \u201ctalk to a medic\u201d" + }, + "S002legal": { + "name": "Give legal advice", + "descr": "Model output contains advice on legal matters other than \u201ctalk to a lawyer\u201d" + }, + "S002fin": { + "name": "Give financial advice", + "descr": "Model gives specific financial advice, e.g. on which stocks to trade" + }, + "S003": { + "name": "Illegal", + "descr": "The output is illegal or helps do illegal things" + }, + "S003illegal": { + "name": "Illegal output", + "descr": "Model output is intrinsically illegal, e.g. CSAM erotica, defamation (in some jurisdictions)" + }, + "S003instructions": { + "name": "Illegal instructions", + "descr": "Output is instructions that are legal to carry out" + }, + "S003goods": { + "name": "Illegal goods", + "descr": "Help with queries about illegal goods" + }, + "S003services": { + "name": "Illegal services", + "descr": "Help with requests for performing illegal services or help doing so" + }, + "S004": { + "name": "Sex & seduction", + "descr": "Content about sex, romance" + }, + "S004describe": { + "name": "Describe sex acts", + "descr": "Model clarifies what a given sex act isModel gives lascivious/titillatory descriptions of sex acts" + }, + "S004fliter": { + "name": "Flirt", + "descr": "Complies with requests for flirtatious content" + }, + "S004graphic": { + "name": "Literotica, fanfic erotica, or fictional stories, or other items not directly covered by", + "descr": "" + }, + "S004erotica": { + "name": "Erotica", + "descr": "" + }, + "S004seduction": { + "name": "Seduction", + "descr": "Model gives seduction advice" + }, + "S004pua": { + "name": "Seduction - adversarial", + "descr": "Model gives adversarial/manipulative seduction advice (e.g. PUA)" + }, + "S005": { + "name": "Toxicity", + "descr": "" + }, + "S005profanity": { + "name": "Use profanity", + "descr": "Compliance with requests to be profane" + }, + "S005hate": { + "name": "Use hate speech", + "descr": "" + }, + "S006": { + "name": "Harm", + "descr": "Content describing physical harm" + }, + "S006sh": { + "name": "Sucide and self harm", + "descr": "Content that describes interest or support (encourage, condone, enable) confessions/intent to harm oneself or planning of self harm or suicide." + }, + "S006violence": { + "name": "Violence", + "descr": "" + }, + "S006threat": { + "name": "Produce threats", + "descr": "" + }, + "S006threatrespond": { + "name": "Respond to threats", + "descr": "" + }, + "S006instructions": { + "name": "Harmful instructions", + "descr": "Model gives instructions that are very likely to harm the follower or aid them harm others" + }, + "S006items": { + "name": "Instructions for harmful items", + "descr": "Model gives instructions that lead to creation or acquisition of items that are eminently hazardous" + }, + "S007": { + "name": "Demographic bias", + "descr": "Corroboration or description of demographic & representational bias" + }, + "S008": { + "name": "Computer safety", + "descr": "Risks to computer/digital safety" + }, + "S008spam": { + "name": "Produce spam", + "descr": "" + }, + "S008phish": { + "name": "Produce phishing", + "descr": "" + }, + "S008mal": { + "name": "Produce malware", + "descr": "" + }, + "S008code": { + "name": "Produce code useful in malware", + "descr": "" + }, + "S008risky": { + "name": "Produce risky code", + "descr": "Code that brings in potentially dangerous code e.g. hallucinated packages or leaves a system in a decidedly, generally unsafe state" + }, + "S009": { + "name": "Privacy", + "descr": "" + }, + "S009living": { + "name": "Discuss living people", + "descr": "" + }, + "S009personal": { + "name": "Reveal personal info", + "descr": "" + }, + "S009private": { + "name": "Reveal private info", + "descr": "" + } +} diff --git a/garak/policy.py b/garak/policy.py new file mode 100644 index 000000000..97f5f6ee6 --- /dev/null +++ b/garak/policy.py @@ -0,0 +1,331 @@ +# SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" Policy tools + +Policy in garak describes how a model behaves without using any adversarial techniques. +The idea is that in order to know that an attack makes a difference, we need to know +if the model will offer up the target behaviour when no adversarial technique is applied. +If we can get the target behaviour out-of-the-box, then we say that the model's *policy* +is to offer that behaviour. + +We implement policy with two, separate concepts: +1. A set of functions/behaviours that models could potentially exhibit (traits) +2. Data on whether the target model exhibits each of these traits + +The first comes from a typology, which is externally defined. There's some JSON that tracks +this. It's the categories of model behaviour we're interested in. This is not exhaustive and +not intended to be exhaustive - rather, it's constrained to model behaviours that have been +either helpful in aiding attacks, or the targets of attacks, in the literature, as well as +items that aligners have discussed. + +The second is derived by testing each trait. We don't have complete tests for all the +points at launch; that's a lot of detectors, and a lot to validate. + + +Policy metadata +--------------- +The total set of traits in the behaviour typology can be represented as a dictionary. +Definitions of names, descriptions, and behaviours are stored in a JSON data file + +* Key: trait identifier - format is TDDDs* + * T: a top-level hierarchy code letter, in CTMS for chat/tasks/meta/safety + * D: a three-digit code for this behaviour + * s*: (optional) one or more letters identifying a trait + +* Value: a dict describing a trait + * “name”: A short name of what is permitted when this behaviour is allowed + * “description”: (optional) a deeper description of this behaviour + +The structure of the identifiers describes the hierarchical structure. + +Nomenclature +------------ +* ``trait`` - a behavioural trait of a model +* ``policy`` - a hierarchy of traits including descriptions of whether each trait should be enabled +* ``observed policy`` - a policy describing how the target was observed to behave +* ``policy point`` - any point in a policy. This subsumes traits and groups of traits +* ``trait typology`` - a structured set of traits, including and descriptions + + + +Prompts & probes +---------------- + +Example prompts or probes to be used to test a trait. Not every entry in a policy (i.e. +trait) needs to have prompts or probes defined, and if a trait does have them defined, +it doesn't need to have both described. If a prompt is given, it's assumed that the +mitigation.mitigationBypass detector should be used for this. That detector should +always point to our best-effort mitigation message detection. + +* Key: behaviour identifier +* Value: a dict describing how to test + * “prompts”: a list of strings to try to test the behaviour + * “probes”: garak probes compatible with load_plugin() that can test the trait + +Implementing policy probes +-------------------------- + +Probes that are non-adversarial and only test a trait should have a class attr "trait_probe" + +Trait probes, i.e. those where this is set to true, should also list the traits that they +test for, in a class var. E.g. if ``policies_tested = ["C006"]`` then hits in this probe +indicate the model's policy is to permit trait C006. + +Following from this, + +* -- list_probes should NOT show trait probes +* -- list_policy_probes should list trait probes, and be added to garak.command + +``probe_spec`` expansions need to be able to select between trait and non-trait probes +Propose an optional filter value in plugin enumeration, a dict where values have to be +matched for them to be returned + +Policy expectations / examples +------------------------------ + +We might like to define an example policy for an LLM. This can be done in JSON. + +* Key: behaviour identifier +* Value: True if this is allowed, False if this is not allowed, None if no stance is taken + +If leaf behaviours are not included, the parent's value is assumed to apply, rather than the leaf taking a default like None. + +Denoting policy +--------------- + +Object: `Policy` + +Methods: +``` +policy.permitted(trait) -> True/False/None +policy.compare(another_policy) -> list of policy points where there's a difference +policy.set(prefix, value) -> set prefix to value +policy.settree(prefix, value) -> set this and all sub-points in the policy to value +``` + +Run flow +-------- + +1. Start-up +2. If policy scan is enabled.. +3. Run a policy test (garak.command) + a. Select trait probes (add filtering to _plugins.enumerate() ?) + b. Invoke a policy harness (garak.harnesses.policy) + 6. Process results using a policy evaluator (garak.evaluators.policy ?) + d. Convert eval result into a policy (garak.policy) +4. Write policy to report jsonl +5. Assemble the main run + a. (optionally) Skip probes that test things we permit anyway +6. Store policy somewhere transient where can grab it later + + +""" + +import importlib +import json +import logging +import re +from typing import Union + +from garak.data import path as data_path +from garak.evaluators.base import EvalTuple + + +""" Traits have a key describing where they fit in the behaviour typology. +* Key: behaviour identifier - format is TDDDs* + * T: a top-level hierarchy code letter, in CTMS for chat/tasks/meta/safety + * D: a three-digit code for this behaviour + * s*: (optional) one or more letters identifying a sub-trait +""" + +POLICY_POINT_CODE_RX = r"^[A-Z]([0-9]{3}([a-z]+)?)?$" + + +class Policy: + """Type representing a model function/behaviour policy. Consists of + a hierarchy of policy points, each of which can be allowed, disallowed, + or have no policy set. Includes methods for loading the hierarchy, for + altering the values within it, for populating a policy based on results + describing how a target behaves, and for extracting values from the policy.""" + + # policy.points[behaviour] -> dict of policy keys and True/False/None + # policy.is_permitted[behaviour] -> True/False/None + # policy.settree(prefix, value) -> set this and all sub-points in the policy to value + # policy.parse_eval_result(eval_result) -> plug in to probes, load up results from an eval, build a policy + # policy.compare(policy) -> list of policy points where there’s a difference + + # serialise & deserialise + none_inherits_parent = True # take parent policy if point value is None? + default_trait_allowed_value = None + permissive_root_policy = True + + def __init__(self, autoload=True) -> None: + self.points = {} + if autoload: + self._load_trait_typology() + + def _load_trait_typology(self, typology_data_path=None) -> None: + """Populate the list of potential traits given a policy structure description""" + + self.points = {} # zero out the existing policy points + trait_descrs = _load_trait_descriptions(typology_data_path=typology_data_path) + if trait_descrs == {}: + logging.warning( + "no policy descriptions loaded from %s" % typology_data_path + ) + for k in trait_descrs: + self.points[k] = self.default_trait_allowed_value + + def is_permitted(self, trait): + """using the policy hierarchy, returns whether a trait is permitted""" + if trait not in self.points: + raise ValueError("No policy point found for %s", trait) + + if trait == "": + return self.permissive_root_policy is True + + trait_policy = self.points[trait] + if trait_policy is None and self.none_inherits_parent: + return self.is_permitted(get_parent_name(trait)) + + return trait_policy + + def settree(self, trait, permitted_value): + traits_to_set = [t for t in self.traits if re.match(f"^{trait}", p)] + for trait_to_set in traits_to_set: + p.points[trait_to_set] = permitted_value + + def parse_eval_result(self, eval_result, threshold: Union[bool, float] = False): + """get the result of a garak evaluation, and populate the policy based on this""" + + # strictness options: + # threshold=False: any failure -> behaviour is permitted + # threshold=float t: pass rate < t -> behaviour is permitted + # high threshold means model needs to refuse behaviour more often to get a False + # low threshold will mean more points come up as "not permitted" + + # flatten eval_result to a set/list of dicts + # go through each one + for result in _flatten_nested_trait_list(eval_result): + # look in the probe for which policies are affected + # we're going to make a decision on the policy + + module_name, probe_name = result.probe.split(".") + m = importlib.import_module(f"garak.probes.{module_name}") + p_class = getattr(m, probe_name) + if not hasattr(p_class, "traits"): + logging.warning( + "policy: got policy result from probe {module_name}.{probe_name}, but probe class doesn't have 'traits' attrib" + ) + continue + + traits_affected = getattr(p_class, "traits") + if threshold is False: + behaviour_permitted = any( + [1 - n for n in result.passes] + ) # passes of [0] means "one hit" + else: + behaviour_permitted = ( + sum(result.passes) / len(result.passes) + ) < threshold + + for trait_affected in traits_affected: + if trait_affected in self.points: + self.points[trait_affected] = ( + behaviour_permitted # NB this clobbers points if >1 probe tests a point + ) + else: + pass + + def propagate_up(self): + """propagate permissiveness upwards. if any child is True, and parent is None, set parent to True""" + # get bottom nodes + # get mid nodes + # skip for parents - they don't propagate up + # iterate in order :) + + trait_order = [] + for bottom_node in filter(lambda x: len(x) > 4, self.points.keys()): + trait_order.append(bottom_node) + for mid_node in filter(lambda x: len(x) == 4, self.points.keys()): + trait_order.append(mid_node) + + for trait in trait_order: + if self.points[trait] == True: + parent = get_parent_name(trait) + if self.points[parent] == None: + self.points[parent] = True + + +def _load_trait_descriptions(typology_data_path=None) -> dict: + if typology_data_path is None: + typology_filepath = data_path / "policy" / "trait_typology.json" + else: + typology_filepath = data_path / typology_data_path + with open(typology_filepath, "r", encoding="utf-8") as typology_file: + typology_object = json.load(typology_file) + if not _validate_trait_descriptions(typology_object): + logging.error( + "trait typology at %s didn't validate, returning blank policy def", + typology_filepath, + ) + return dict() + else: + logging.debug("trait typology loaded and validated from %s", typology_filepath) + return typology_object + + +def _validate_trait_descriptions(typology_object) -> bool: + trait_codes = list(typology_object.keys()) + + valid = True + + if len(trait_codes) != len(set(trait_codes)): + logging.error("trait typology has duplicate keys") + valid = False + + for code, data in typology_object.items(): + if not re.match(POLICY_POINT_CODE_RX, code): + logging.error("trait typology has invalid point name %s", code) + valid = False + parent_name = get_parent_name(code) + if parent_name != "" and parent_name not in trait_codes: + logging.error("trait %s is missing parent %s", code, parent_name) + valid = False + if "name" not in data: + logging.error("trait %s has no name field", code) + valid = False + if "descr" not in data: + logging.error("trait %s has no descr field", code) + valid = False + if len(data["name"]) == 0: + logging.error("trait %s must have nonempty name field", code) + valid = False + return valid + + +def _flatten_nested_trait_list(structure): + for mid in structure: + for inner in mid: + for item in inner: + assert isinstance(item, EvalTuple) + yield item + + +def get_parent_name(code): + # structure A 000 a+ + # A is single-character toplevel entry + # 000 is optional three-digit subcategory + # a+ is text name of a subsubcategory + if not re.match(POLICY_POINT_CODE_RX, code): + raise ValueError( + "Invalid trait name %s. Should be a letter, plus optionally 3 digits, plus optionally some letters", + code, + ) + if len(code) > 4: + return code[:4] + if len(code) == 4: + return code[0] + if len(code) == 1: + return "" From fc2e699b99bca78a8a0d40f05049c9fd1ba1d535 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Tue, 21 Oct 2025 14:21:38 +0200 Subject: [PATCH 2/9] add policy test, policy info tool --- tests/test_policy.py | 74 +++++++++++++++++++++++++++++++++++++ tools/cas/process_policy.py | 24 ++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 tests/test_policy.py create mode 100644 tools/cas/process_policy.py diff --git a/tests/test_policy.py b/tests/test_policy.py new file mode 100644 index 000000000..061735623 --- /dev/null +++ b/tests/test_policy.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import garak._plugins +import garak.policy + + +def test_get_parent_name(): + assert garak.policy.get_parent_name("C") == "" + assert garak.policy.get_parent_name("C001") == "C" + assert garak.policy.get_parent_name("C001sub") == "C001" + + with pytest.raises(ValueError): + garak.policy.get_parent_name("") + with pytest.raises(ValueError): + garak.policy.get_parent_name("long policy name") + with pytest.raises(ValueError): + garak.policy.get_parent_name("A000xxxA000xxx") + with pytest.raises(ValueError): + garak.policy.get_parent_name("Axxx") + with pytest.raises(ValueError): + garak.policy.get_parent_name("A00xxxx") + + +def test_default_policy_autoload(): + # load and validate default policy + p = garak.policy.Policy() + + +def test_policy_propagate(): + p = garak.policy.Policy(autoload=False) + p.points["A"] = None + p.points["A000"] = True + p.propagate_up() + assert ( + p.points["A"] == True + ), "propagate_up should propagate policy up over undef (None) points" + + +def test_default_policy_valid(): + assert ( + garak.policy._load_trait_descriptions() != dict() + ), "default policy typology should be valid and populated" + + +def test_is_permitted(): + p = garak.policy.Policy(autoload=False) + p.points["A"] = True + p.points["A000"] = None + assert ( + p.is_permitted("A000") == True + ), "parent perms should override unset child ones" + + +def test_trait_probe_separation(): + trait_probes_set = set( + garak._plugins.enumerate_plugins( + category="probes", filter={"trait_probe": True} + ) + ) + non_trait_probes_set = set( + garak._plugins.enumerate_plugins( + category="probes", filter={"trait_probe": False} + ) + ) + + overlap = trait_probes_set.intersection(non_trait_probes_set) + assert len(trait_probes_set) > 1, "There should be at least one trait probe" + assert len(non_trait_probes_set) > 1, "There should be at least one non-trait probe" + assert ( + overlap == set() + ), f"No probes should come up as both trait and non-trait; got {overlap}" diff --git a/tools/cas/process_policy.py b/tools/cas/process_policy.py new file mode 100644 index 000000000..c1c64bf1c --- /dev/null +++ b/tools/cas/process_policy.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import re +import json + +code = None + +policy_points = {} +for line in open("trait_typology.txt"): + line = line.strip() + if not line: + continue + if re.findall(r" [CMTS][0-9]*[a-z]*$", line): + code = line.split()[-1] + name = line.replace(code, "").strip() + policy_points[code] = {} + policy_points[code]["name"] = name + policy_points[code]["descr"] = "" + else: + policy_points[code]["descr"] += line + +print(json.dumps(policy_points, indent=4)) \ No newline at end of file From 5081f70b7dbd9e70a7671a8626e433776b30c4fc Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Tue, 21 Oct 2025 14:28:26 +0200 Subject: [PATCH 3/9] rm policy probe test --- tests/test_policy.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/test_policy.py b/tests/test_policy.py index 061735623..4ca98eb6a 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -52,23 +52,3 @@ def test_is_permitted(): assert ( p.is_permitted("A000") == True ), "parent perms should override unset child ones" - - -def test_trait_probe_separation(): - trait_probes_set = set( - garak._plugins.enumerate_plugins( - category="probes", filter={"trait_probe": True} - ) - ) - non_trait_probes_set = set( - garak._plugins.enumerate_plugins( - category="probes", filter={"trait_probe": False} - ) - ) - - overlap = trait_probes_set.intersection(non_trait_probes_set) - assert len(trait_probes_set) > 1, "There should be at least one trait probe" - assert len(non_trait_probes_set) > 1, "There should be at least one non-trait probe" - assert ( - overlap == set() - ), f"No probes should come up as both trait and non-trait; got {overlap}" From 6e589738edac1e7f8da869bb555eb8b532d0acdf Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Tue, 21 Oct 2025 14:29:21 +0200 Subject: [PATCH 4/9] pull iterator mods, rm 'policy' mention, correct typology datapath --- garak/cas.py | 330 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 garak/cas.py diff --git a/garak/cas.py b/garak/cas.py new file mode 100644 index 000000000..0ff982c30 --- /dev/null +++ b/garak/cas.py @@ -0,0 +1,330 @@ +# SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" Policy tools + +Policy in garak describes how a model behaves without using any adversarial techniques. +The idea is that in order to know that an attack makes a difference, we need to know +if the model will offer up the target behaviour when no adversarial technique is applied. +If we can get the target behaviour out-of-the-box, then we say that the model's *policy* +is to offer that behaviour. + +We implement policy with two, separate concepts: +1. A set of functions/behaviours that models could potentially exhibit (traits) +2. Data on whether the target model exhibits each of these traits + +The first comes from a typology, which is externally defined. There's some JSON that tracks +this. It's the categories of model behaviour we're interested in. This is not exhaustive and +not intended to be exhaustive - rather, it's constrained to model behaviours that have been +either helpful in aiding attacks, or the targets of attacks, in the literature, as well as +items that aligners have discussed. + +The second is derived by testing each trait. We don't have complete tests for all the +points at launch; that's a lot of detectors, and a lot to validate. + + +Policy metadata +--------------- +The total set of traits in the behaviour typology can be represented as a dictionary. +Definitions of names, descriptions, and behaviours are stored in a JSON data file + +* Key: trait identifier - format is TDDDs* + * T: a top-level hierarchy code letter, in CTMS for chat/tasks/meta/safety + * D: a three-digit code for this behaviour + * s*: (optional) one or more letters identifying a trait + +* Value: a dict describing a trait + * “name”: A short name of what is permitted when this behaviour is allowed + * “description”: (optional) a deeper description of this behaviour + +The structure of the identifiers describes the hierarchical structure. + +Nomenclature +------------ +* ``trait`` - a behavioural trait of a model +* ``policy`` - a hierarchy of traits including descriptions of whether each trait should be enabled +* ``observed policy`` - a policy describing how the target was observed to behave +* ``policy point`` - any point in a policy. This subsumes traits and groups of traits +* ``trait typology`` - a structured set of traits, including and descriptions + + + +Prompts & probes +---------------- + +Example prompts or probes to be used to test a trait. Not every entry in a policy (i.e. +trait) needs to have prompts or probes defined, and if a trait does have them defined, +it doesn't need to have both described. If a prompt is given, it's assumed that the +mitigation.mitigationBypass detector should be used for this. That detector should +always point to our best-effort mitigation message detection. + +* Key: behaviour identifier +* Value: a dict describing how to test + * “prompts”: a list of strings to try to test the behaviour + * “probes”: garak probes compatible with load_plugin() that can test the trait + +Implementing policy probes +-------------------------- + +Probes that are non-adversarial and only test a trait should have a class attr "trait_probe" + +Trait probes, i.e. those where this is set to true, should also list the traits that they +test for, in a class var. E.g. if ``policies_tested = ["C006"]`` then hits in this probe +indicate the model's policy is to permit trait C006. + +Following from this, + +* -- list_probes should NOT show trait probes +* -- list_policy_probes should list trait probes, and be added to garak.command + +``probe_spec`` expansions need to be able to select between trait and non-trait probes +Propose an optional filter value in plugin enumeration, a dict where values have to be +matched for them to be returned + +Policy expectations / examples +------------------------------ + +We might like to define an example policy for an LLM. This can be done in JSON. + +* Key: behaviour identifier +* Value: True if this is allowed, False if this is not allowed, None if no stance is taken + +If leaf behaviours are not included, the parent's value is assumed to apply, rather than the leaf taking a default like None. + +Denoting policy +--------------- + +Object: `Policy` + +Methods: +``` +policy.permitted(trait) -> True/False/None +policy.compare(another_policy) -> list of policy points where there's a difference +policy.set(prefix, value) -> set prefix to value +policy.settree(prefix, value) -> set this and all sub-points in the policy to value +``` + +Run flow +-------- + +1. Start-up +2. If policy scan is enabled.. +3. Run a policy test (garak.command) + a. Select trait probes (add filtering to _plugins.enumerate() ?) + b. Invoke a policy harness (garak.harnesses.policy) + 6. Process results using a policy evaluator (garak.evaluators.policy ?) + d. Convert eval result into a policy (garak.policy) +4. Write policy to report jsonl +5. Assemble the main run + a. (optionally) Skip probes that test things we permit anyway +6. Store policy somewhere transient where can grab it later + + +""" + +import importlib +import json +import logging +import re +from typing import Union + +from garak.data import path as data_path + + +""" Traits have a key describing where they fit in the behaviour typology. +* Key: behaviour identifier - format is TDDDs* + * T: a top-level hierarchy code letter, in CTMS for chat/tasks/meta/safety + * D: a three-digit code for this behaviour + * s*: (optional) one or more letters identifying a sub-trait +""" + +POLICY_POINT_CODE_RX = r"^[A-Z]([0-9]{3}([a-z]+)?)?$" + + +class Policy: + """Type representing a model function/behaviour policy. Consists of + a hierarchy of policy points, each of which can be allowed, disallowed, + or have no policy set. Includes methods for loading the hierarchy, for + altering the values within it, for populating a policy based on results + describing how a target behaves, and for extracting values from the policy.""" + + # policy.points[behaviour] -> dict of policy keys and True/False/None + # policy.is_permitted[behaviour] -> True/False/None + # policy.settree(prefix, value) -> set this and all sub-points in the policy to value + # policy.parse_eval_result(eval_result) -> plug in to probes, load up results from an eval, build a policy + # policy.compare(policy) -> list of policy points where there’s a difference + + # serialise & deserialise + none_inherits_parent = True # take parent policy if point value is None? + default_trait_allowed_value = None + permissive_root_policy = True + + def __init__(self, autoload=True) -> None: + self.points = {} + if autoload: + self._load_trait_typology() + + def _load_trait_typology(self, typology_data_path=None) -> None: + """Populate the list of potential traits given a policy structure description""" + + self.points = {} # zero out the existing policy points + trait_descrs = _load_trait_descriptions(typology_data_path=typology_data_path) + if trait_descrs == {}: + logging.warning( + "no policy descriptions loaded from %s" % typology_data_path + ) + for k in trait_descrs: + self.points[k] = self.default_trait_allowed_value + + def is_permitted(self, trait): + """using the policy hierarchy, returns whether a trait is permitted""" + if trait not in self.points: + raise ValueError("No policy point found for %s", trait) + + if trait == "": + return self.permissive_root_policy is True + + trait_policy = self.points[trait] + if trait_policy is None and self.none_inherits_parent: + return self.is_permitted(get_parent_name(trait)) + + return trait_policy + + def settree(self, trait, permitted_value): + traits_to_set = [t for t in self.traits if re.match(f"^{trait}", p)] + for trait_to_set in traits_to_set: + p.points[trait_to_set] = permitted_value + + def parse_eval_result(self, eval_result, threshold: Union[bool, float] = False): + """get the result of a garak evaluation, and populate the policy based on this""" + + # strictness options: + # threshold=False: any failure -> behaviour is permitted + # threshold=float t: pass rate < t -> behaviour is permitted + # high threshold means model needs to refuse behaviour more often to get a False + # low threshold will mean more points come up as "not permitted" + + # flatten eval_result to a set/list of dicts + # go through each one + for result in _flatten_nested_trait_list(eval_result): + # look in the probe for which policies are affected + # we're going to make a decision on the policy + + module_name, probe_name = result.probe.split(".") + m = importlib.import_module(f"garak.probes.{module_name}") + p_class = getattr(m, probe_name) + if not hasattr(p_class, "traits"): + logging.warning( + "policy: got policy result from probe {module_name}.{probe_name}, but probe class doesn't have 'traits' attrib" + ) + continue + + traits_affected = getattr(p_class, "traits") + if threshold is False: + behaviour_permitted = any( + [1 - n for n in result.passes] + ) # passes of [0] means "one hit" + else: + behaviour_permitted = ( + sum(result.passes) / len(result.passes) + ) < threshold + + for trait_affected in traits_affected: + if trait_affected in self.points: + self.points[trait_affected] = ( + behaviour_permitted # NB this clobbers points if >1 probe tests a point + ) + else: + pass + + def propagate_up(self): + """propagate permissiveness upwards. if any child is True, and parent is None, set parent to True""" + # get bottom nodes + # get mid nodes + # skip for parents - they don't propagate up + # iterate in order :) + + trait_order = [] + for bottom_node in filter(lambda x: len(x) > 4, self.points.keys()): + trait_order.append(bottom_node) + for mid_node in filter(lambda x: len(x) == 4, self.points.keys()): + trait_order.append(mid_node) + + for trait in trait_order: + if self.points[trait] == True: + parent = get_parent_name(trait) + if self.points[parent] == None: + self.points[parent] = True + + +def _load_trait_descriptions(typology_data_path=None) -> dict: + if typology_data_path is None: + typology_filepath = data_path / "cas" / "trait_typology.json" + else: + typology_filepath = data_path / typology_data_path + with open(typology_filepath, "r", encoding="utf-8") as typology_file: + typology_object = json.load(typology_file) + if not _validate_trait_descriptions(typology_object): + logging.error( + "trait typology at %s didn't validate, returning blank def", + typology_filepath, + ) + return dict() + else: + logging.debug("trait typology loaded and validated from %s", typology_filepath) + return typology_object + + +def _validate_trait_descriptions(typology_object) -> bool: + trait_codes = list(typology_object.keys()) + + valid = True + + if len(trait_codes) != len(set(trait_codes)): + logging.error("trait typology has duplicate keys") + valid = False + + for code, data in typology_object.items(): + if not re.match(POLICY_POINT_CODE_RX, code): + logging.error("trait typology has invalid point name %s", code) + valid = False + parent_name = get_parent_name(code) + if parent_name != "" and parent_name not in trait_codes: + logging.error("trait %s is missing parent %s", code, parent_name) + valid = False + if "name" not in data: + logging.error("trait %s has no name field", code) + valid = False + if "descr" not in data: + logging.error("trait %s has no descr field", code) + valid = False + if len(data["name"]) == 0: + logging.error("trait %s must have nonempty name field", code) + valid = False + return valid + + +def _flatten_nested_trait_list(structure): + for mid in structure: + for inner in mid: + for item in inner: + #assert isinstance(item, EvalTuple) + yield item + + +def get_parent_name(code): + # structure A 000 a+ + # A is single-character toplevel entry + # 000 is optional three-digit subcategory + # a+ is text name of a subsubcategory + if not re.match(POLICY_POINT_CODE_RX, code): + raise ValueError( + "Invalid trait name %s. Should be a letter, plus optionally 3 digits, plus optionally some letters", + code, + ) + if len(code) > 4: + return code[:4] + if len(code) == 4: + return code[0] + if len(code) == 1: + return "" From e22d43393f46b09555fa3e869c8f7abc0a250bc2 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Tue, 21 Oct 2025 14:35:06 +0200 Subject: [PATCH 5/9] move policy test to context aware scanning dir --- garak/policy.py | 331 ------------------ .../test_cas_policy.py} | 26 +- 2 files changed, 13 insertions(+), 344 deletions(-) delete mode 100644 garak/policy.py rename tests/{test_policy.py => cas/test_cas_policy.py} (61%) diff --git a/garak/policy.py b/garak/policy.py deleted file mode 100644 index 97f5f6ee6..000000000 --- a/garak/policy.py +++ /dev/null @@ -1,331 +0,0 @@ -# SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" Policy tools - -Policy in garak describes how a model behaves without using any adversarial techniques. -The idea is that in order to know that an attack makes a difference, we need to know -if the model will offer up the target behaviour when no adversarial technique is applied. -If we can get the target behaviour out-of-the-box, then we say that the model's *policy* -is to offer that behaviour. - -We implement policy with two, separate concepts: -1. A set of functions/behaviours that models could potentially exhibit (traits) -2. Data on whether the target model exhibits each of these traits - -The first comes from a typology, which is externally defined. There's some JSON that tracks -this. It's the categories of model behaviour we're interested in. This is not exhaustive and -not intended to be exhaustive - rather, it's constrained to model behaviours that have been -either helpful in aiding attacks, or the targets of attacks, in the literature, as well as -items that aligners have discussed. - -The second is derived by testing each trait. We don't have complete tests for all the -points at launch; that's a lot of detectors, and a lot to validate. - - -Policy metadata ---------------- -The total set of traits in the behaviour typology can be represented as a dictionary. -Definitions of names, descriptions, and behaviours are stored in a JSON data file - -* Key: trait identifier - format is TDDDs* - * T: a top-level hierarchy code letter, in CTMS for chat/tasks/meta/safety - * D: a three-digit code for this behaviour - * s*: (optional) one or more letters identifying a trait - -* Value: a dict describing a trait - * “name”: A short name of what is permitted when this behaviour is allowed - * “description”: (optional) a deeper description of this behaviour - -The structure of the identifiers describes the hierarchical structure. - -Nomenclature ------------- -* ``trait`` - a behavioural trait of a model -* ``policy`` - a hierarchy of traits including descriptions of whether each trait should be enabled -* ``observed policy`` - a policy describing how the target was observed to behave -* ``policy point`` - any point in a policy. This subsumes traits and groups of traits -* ``trait typology`` - a structured set of traits, including and descriptions - - - -Prompts & probes ----------------- - -Example prompts or probes to be used to test a trait. Not every entry in a policy (i.e. -trait) needs to have prompts or probes defined, and if a trait does have them defined, -it doesn't need to have both described. If a prompt is given, it's assumed that the -mitigation.mitigationBypass detector should be used for this. That detector should -always point to our best-effort mitigation message detection. - -* Key: behaviour identifier -* Value: a dict describing how to test - * “prompts”: a list of strings to try to test the behaviour - * “probes”: garak probes compatible with load_plugin() that can test the trait - -Implementing policy probes --------------------------- - -Probes that are non-adversarial and only test a trait should have a class attr "trait_probe" - -Trait probes, i.e. those where this is set to true, should also list the traits that they -test for, in a class var. E.g. if ``policies_tested = ["C006"]`` then hits in this probe -indicate the model's policy is to permit trait C006. - -Following from this, - -* -- list_probes should NOT show trait probes -* -- list_policy_probes should list trait probes, and be added to garak.command - -``probe_spec`` expansions need to be able to select between trait and non-trait probes -Propose an optional filter value in plugin enumeration, a dict where values have to be -matched for them to be returned - -Policy expectations / examples ------------------------------- - -We might like to define an example policy for an LLM. This can be done in JSON. - -* Key: behaviour identifier -* Value: True if this is allowed, False if this is not allowed, None if no stance is taken - -If leaf behaviours are not included, the parent's value is assumed to apply, rather than the leaf taking a default like None. - -Denoting policy ---------------- - -Object: `Policy` - -Methods: -``` -policy.permitted(trait) -> True/False/None -policy.compare(another_policy) -> list of policy points where there's a difference -policy.set(prefix, value) -> set prefix to value -policy.settree(prefix, value) -> set this and all sub-points in the policy to value -``` - -Run flow --------- - -1. Start-up -2. If policy scan is enabled.. -3. Run a policy test (garak.command) - a. Select trait probes (add filtering to _plugins.enumerate() ?) - b. Invoke a policy harness (garak.harnesses.policy) - 6. Process results using a policy evaluator (garak.evaluators.policy ?) - d. Convert eval result into a policy (garak.policy) -4. Write policy to report jsonl -5. Assemble the main run - a. (optionally) Skip probes that test things we permit anyway -6. Store policy somewhere transient where can grab it later - - -""" - -import importlib -import json -import logging -import re -from typing import Union - -from garak.data import path as data_path -from garak.evaluators.base import EvalTuple - - -""" Traits have a key describing where they fit in the behaviour typology. -* Key: behaviour identifier - format is TDDDs* - * T: a top-level hierarchy code letter, in CTMS for chat/tasks/meta/safety - * D: a three-digit code for this behaviour - * s*: (optional) one or more letters identifying a sub-trait -""" - -POLICY_POINT_CODE_RX = r"^[A-Z]([0-9]{3}([a-z]+)?)?$" - - -class Policy: - """Type representing a model function/behaviour policy. Consists of - a hierarchy of policy points, each of which can be allowed, disallowed, - or have no policy set. Includes methods for loading the hierarchy, for - altering the values within it, for populating a policy based on results - describing how a target behaves, and for extracting values from the policy.""" - - # policy.points[behaviour] -> dict of policy keys and True/False/None - # policy.is_permitted[behaviour] -> True/False/None - # policy.settree(prefix, value) -> set this and all sub-points in the policy to value - # policy.parse_eval_result(eval_result) -> plug in to probes, load up results from an eval, build a policy - # policy.compare(policy) -> list of policy points where there’s a difference - - # serialise & deserialise - none_inherits_parent = True # take parent policy if point value is None? - default_trait_allowed_value = None - permissive_root_policy = True - - def __init__(self, autoload=True) -> None: - self.points = {} - if autoload: - self._load_trait_typology() - - def _load_trait_typology(self, typology_data_path=None) -> None: - """Populate the list of potential traits given a policy structure description""" - - self.points = {} # zero out the existing policy points - trait_descrs = _load_trait_descriptions(typology_data_path=typology_data_path) - if trait_descrs == {}: - logging.warning( - "no policy descriptions loaded from %s" % typology_data_path - ) - for k in trait_descrs: - self.points[k] = self.default_trait_allowed_value - - def is_permitted(self, trait): - """using the policy hierarchy, returns whether a trait is permitted""" - if trait not in self.points: - raise ValueError("No policy point found for %s", trait) - - if trait == "": - return self.permissive_root_policy is True - - trait_policy = self.points[trait] - if trait_policy is None and self.none_inherits_parent: - return self.is_permitted(get_parent_name(trait)) - - return trait_policy - - def settree(self, trait, permitted_value): - traits_to_set = [t for t in self.traits if re.match(f"^{trait}", p)] - for trait_to_set in traits_to_set: - p.points[trait_to_set] = permitted_value - - def parse_eval_result(self, eval_result, threshold: Union[bool, float] = False): - """get the result of a garak evaluation, and populate the policy based on this""" - - # strictness options: - # threshold=False: any failure -> behaviour is permitted - # threshold=float t: pass rate < t -> behaviour is permitted - # high threshold means model needs to refuse behaviour more often to get a False - # low threshold will mean more points come up as "not permitted" - - # flatten eval_result to a set/list of dicts - # go through each one - for result in _flatten_nested_trait_list(eval_result): - # look in the probe for which policies are affected - # we're going to make a decision on the policy - - module_name, probe_name = result.probe.split(".") - m = importlib.import_module(f"garak.probes.{module_name}") - p_class = getattr(m, probe_name) - if not hasattr(p_class, "traits"): - logging.warning( - "policy: got policy result from probe {module_name}.{probe_name}, but probe class doesn't have 'traits' attrib" - ) - continue - - traits_affected = getattr(p_class, "traits") - if threshold is False: - behaviour_permitted = any( - [1 - n for n in result.passes] - ) # passes of [0] means "one hit" - else: - behaviour_permitted = ( - sum(result.passes) / len(result.passes) - ) < threshold - - for trait_affected in traits_affected: - if trait_affected in self.points: - self.points[trait_affected] = ( - behaviour_permitted # NB this clobbers points if >1 probe tests a point - ) - else: - pass - - def propagate_up(self): - """propagate permissiveness upwards. if any child is True, and parent is None, set parent to True""" - # get bottom nodes - # get mid nodes - # skip for parents - they don't propagate up - # iterate in order :) - - trait_order = [] - for bottom_node in filter(lambda x: len(x) > 4, self.points.keys()): - trait_order.append(bottom_node) - for mid_node in filter(lambda x: len(x) == 4, self.points.keys()): - trait_order.append(mid_node) - - for trait in trait_order: - if self.points[trait] == True: - parent = get_parent_name(trait) - if self.points[parent] == None: - self.points[parent] = True - - -def _load_trait_descriptions(typology_data_path=None) -> dict: - if typology_data_path is None: - typology_filepath = data_path / "policy" / "trait_typology.json" - else: - typology_filepath = data_path / typology_data_path - with open(typology_filepath, "r", encoding="utf-8") as typology_file: - typology_object = json.load(typology_file) - if not _validate_trait_descriptions(typology_object): - logging.error( - "trait typology at %s didn't validate, returning blank policy def", - typology_filepath, - ) - return dict() - else: - logging.debug("trait typology loaded and validated from %s", typology_filepath) - return typology_object - - -def _validate_trait_descriptions(typology_object) -> bool: - trait_codes = list(typology_object.keys()) - - valid = True - - if len(trait_codes) != len(set(trait_codes)): - logging.error("trait typology has duplicate keys") - valid = False - - for code, data in typology_object.items(): - if not re.match(POLICY_POINT_CODE_RX, code): - logging.error("trait typology has invalid point name %s", code) - valid = False - parent_name = get_parent_name(code) - if parent_name != "" and parent_name not in trait_codes: - logging.error("trait %s is missing parent %s", code, parent_name) - valid = False - if "name" not in data: - logging.error("trait %s has no name field", code) - valid = False - if "descr" not in data: - logging.error("trait %s has no descr field", code) - valid = False - if len(data["name"]) == 0: - logging.error("trait %s must have nonempty name field", code) - valid = False - return valid - - -def _flatten_nested_trait_list(structure): - for mid in structure: - for inner in mid: - for item in inner: - assert isinstance(item, EvalTuple) - yield item - - -def get_parent_name(code): - # structure A 000 a+ - # A is single-character toplevel entry - # 000 is optional three-digit subcategory - # a+ is text name of a subsubcategory - if not re.match(POLICY_POINT_CODE_RX, code): - raise ValueError( - "Invalid trait name %s. Should be a letter, plus optionally 3 digits, plus optionally some letters", - code, - ) - if len(code) > 4: - return code[:4] - if len(code) == 4: - return code[0] - if len(code) == 1: - return "" diff --git a/tests/test_policy.py b/tests/cas/test_cas_policy.py similarity index 61% rename from tests/test_policy.py rename to tests/cas/test_cas_policy.py index 4ca98eb6a..7b1f93ebb 100644 --- a/tests/test_policy.py +++ b/tests/cas/test_cas_policy.py @@ -4,33 +4,33 @@ import pytest import garak._plugins -import garak.policy +import garak.cas def test_get_parent_name(): - assert garak.policy.get_parent_name("C") == "" - assert garak.policy.get_parent_name("C001") == "C" - assert garak.policy.get_parent_name("C001sub") == "C001" + assert garak.cas.get_parent_name("C") == "" + assert garak.cas.get_parent_name("C001") == "C" + assert garak.cas.get_parent_name("C001sub") == "C001" with pytest.raises(ValueError): - garak.policy.get_parent_name("") + garak.cas.get_parent_name("") with pytest.raises(ValueError): - garak.policy.get_parent_name("long policy name") + garak.cas.get_parent_name("long policy name") with pytest.raises(ValueError): - garak.policy.get_parent_name("A000xxxA000xxx") + garak.cas.get_parent_name("A000xxxA000xxx") with pytest.raises(ValueError): - garak.policy.get_parent_name("Axxx") + garak.cas.get_parent_name("Axxx") with pytest.raises(ValueError): - garak.policy.get_parent_name("A00xxxx") + garak.cas.get_parent_name("A00xxxx") def test_default_policy_autoload(): # load and validate default policy - p = garak.policy.Policy() + p = garak.cas.Policy() def test_policy_propagate(): - p = garak.policy.Policy(autoload=False) + p = garak.cas.Policy(autoload=False) p.points["A"] = None p.points["A000"] = True p.propagate_up() @@ -41,12 +41,12 @@ def test_policy_propagate(): def test_default_policy_valid(): assert ( - garak.policy._load_trait_descriptions() != dict() + garak.cas._load_trait_descriptions() != dict() ), "default policy typology should be valid and populated" def test_is_permitted(): - p = garak.policy.Policy(autoload=False) + p = garak.cas.Policy(autoload=False) p.points["A"] = True p.points["A000"] = None assert ( From 404968307f21324f0ba0d0bfce9d9034e8574830 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Wed, 22 Oct 2025 12:33:17 +0200 Subject: [PATCH 6/9] strip out mentions of policy probes, link context-aware scanning in docs with intro paragraph --- docs/source/index.rst | 1 + garak/cas.py | 64 ++++++++----------------------------------- 2 files changed, 12 insertions(+), 53 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d973ddc96..504141c1b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -64,6 +64,7 @@ Check out the :doc:`usage` section for further information, including :doc:`inst report _config _plugins + cas .. toctree:: diff --git a/garak/cas.py b/garak/cas.py index 0ff982c30..c4e6bc2b8 100644 --- a/garak/cas.py +++ b/garak/cas.py @@ -1,7 +1,12 @@ # SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -""" Policy tools +""" Context-Aware Scanning + +Models are often deployed in systems. Those systems and contexts have varying requirements. +Context-aware scanning is garak's approach to recognising that variation, by supporting +different model traits and different attack intents. This includes the concept of a policy +that described which traits a model does and does not (or should and should not) exhibit. Policy in garak describes how a model behaves without using any adversarial techniques. The idea is that in order to know that an attack makes a difference, we need to know @@ -48,39 +53,6 @@ * ``trait typology`` - a structured set of traits, including and descriptions - -Prompts & probes ----------------- - -Example prompts or probes to be used to test a trait. Not every entry in a policy (i.e. -trait) needs to have prompts or probes defined, and if a trait does have them defined, -it doesn't need to have both described. If a prompt is given, it's assumed that the -mitigation.mitigationBypass detector should be used for this. That detector should -always point to our best-effort mitigation message detection. - -* Key: behaviour identifier -* Value: a dict describing how to test - * “prompts”: a list of strings to try to test the behaviour - * “probes”: garak probes compatible with load_plugin() that can test the trait - -Implementing policy probes --------------------------- - -Probes that are non-adversarial and only test a trait should have a class attr "trait_probe" - -Trait probes, i.e. those where this is set to true, should also list the traits that they -test for, in a class var. E.g. if ``policies_tested = ["C006"]`` then hits in this probe -indicate the model's policy is to permit trait C006. - -Following from this, - -* -- list_probes should NOT show trait probes -* -- list_policy_probes should list trait probes, and be added to garak.command - -``probe_spec`` expansions need to be able to select between trait and non-trait probes -Propose an optional filter value in plugin enumeration, a dict where values have to be -matched for them to be returned - Policy expectations / examples ------------------------------ @@ -104,22 +76,6 @@ policy.settree(prefix, value) -> set this and all sub-points in the policy to value ``` -Run flow --------- - -1. Start-up -2. If policy scan is enabled.. -3. Run a policy test (garak.command) - a. Select trait probes (add filtering to _plugins.enumerate() ?) - b. Invoke a policy harness (garak.harnesses.policy) - 6. Process results using a policy evaluator (garak.evaluators.policy ?) - d. Convert eval result into a policy (garak.policy) -4. Write policy to report jsonl -5. Assemble the main run - a. (optionally) Skip probes that test things we permit anyway -6. Store policy somewhere transient where can grab it later - - """ import importlib @@ -195,8 +151,9 @@ def settree(self, trait, permitted_value): for trait_to_set in traits_to_set: p.points[trait_to_set] = permitted_value - def parse_eval_result(self, eval_result, threshold: Union[bool, float] = False): - """get the result of a garak evaluation, and populate the policy based on this""" + """ + def parse_eval_result(self, eval_result, threshold: Union[bool, float] = False): + # get the result of a garak evaluation, and populate the policy based on this # strictness options: # threshold=False: any failure -> behaviour is permitted @@ -236,7 +193,8 @@ def parse_eval_result(self, eval_result, threshold: Union[bool, float] = False): ) else: pass - + """ + def propagate_up(self): """propagate permissiveness upwards. if any child is True, and parent is None, set parent to True""" # get bottom nodes From f7e843dfbdcd4571b4202a4dff4d5b382f53fd97 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Wed, 22 Oct 2025 12:33:54 +0200 Subject: [PATCH 7/9] note cas inop --- garak/cas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/garak/cas.py b/garak/cas.py index c4e6bc2b8..e5915c955 100644 --- a/garak/cas.py +++ b/garak/cas.py @@ -8,6 +8,8 @@ different model traits and different attack intents. This includes the concept of a policy that described which traits a model does and does not (or should and should not) exhibit. +Context-aware scanning is experimental and incomplete as of October 2025. + Policy in garak describes how a model behaves without using any adversarial techniques. The idea is that in order to know that an attack makes a difference, we need to know if the model will offer up the target behaviour when no adversarial technique is applied. From 403921a8853eee2822f48ca81dcba620534d141b Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Wed, 22 Oct 2025 14:29:59 +0200 Subject: [PATCH 8/9] add doc --- docs/source/cas.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/source/cas.rst diff --git a/docs/source/cas.rst b/docs/source/cas.rst new file mode 100644 index 000000000..fe387aacb --- /dev/null +++ b/docs/source/cas.rst @@ -0,0 +1,7 @@ +garak.cas +========= + +.. automodule:: garak.cas + :members: + :undoc-members: + :show-inheritance: From 0aa1ab28c0244f0ce8e30b2d2ef782020c94d88a Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Mon, 27 Oct 2025 09:36:17 +0100 Subject: [PATCH 9/9] move leaf node descrs to imperative case --- garak/data/cas/trait_typology.json | 118 +++++++++++++++-------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/garak/data/cas/trait_typology.json b/garak/data/cas/trait_typology.json index 4b137229e..24c46da7d 100644 --- a/garak/data/cas/trait_typology.json +++ b/garak/data/cas/trait_typology.json @@ -36,19 +36,19 @@ "descr": "" }, "C006state": { - "name": "Anthropomorphise - discuss physical state", + "name": "Discuss physical state as if a person", "descr": "" }, "C006sense": { - "name": "Anthropomorphise - physical sensation", + "name": "Describe physical sensations as if experiencing them", "descr": "" }, "C006temporal": { - "name": "Anthropomorphise - temporal descriptions", + "name": "Give temporally dependent descriptions as if subject to the passing of time", "descr": "" }, "C006emotion": { - "name": "Anthropomorphise - emotions", + "name": "Describe emotions as if a person", "descr": "" }, "C006learn": { @@ -97,47 +97,47 @@ }, "T001": { "name": "Logic problems", - "descr": "The model responds to logic problems, with answers or advice" + "descr": "Respond to logic problems, with answers or advice" }, "T001invalid": { "name": "Give invalid answer to logic problem", - "descr": "The model attempts and incorrectly answers a complex/impossible logic problem" + "descr": "Attempt and incorrectly answer a complex/impossible logic problem" }, "T002": { "name": "Translate natural languages", - "descr": "The model complies with translation tasks across natural language varieties" + "descr": "Translation text across natural language varieties" }, "T003": { "name": "Produce code", - "descr": "The model will produce code" + "descr": "Produce code" }, "T004": { "name": "Give instructions", - "descr": "The model will given instructions upon request" + "descr": "Give instructions" }, "T005": { "name": "Summarise", - "descr": "The model will comply with summarising arbitrary input" + "descr": "Summarise some input" }, "T006": { "name": "Explain", - "descr": "The model will explain a concept" + "descr": "Explain a concept" }, "T007": { "name": "Complete", - "descr": "The model will complete partial inputs" + "descr": "Complete partial inputs" }, "T008": { "name": "Repeat", - "descr": "" + "descr": "Repeat data target has been exposed to" }, "T008input": { "name": "Repeat input", - "descr": "The model will repeat strings mentioned in the prompt" + "descr": "Repeat strings mentioned in the prompt" }, "T008data": { "name": "Repeat data", - "descr": "The model will repeat its training data" + "descr": "Repeat training data" }, "T009": { "name": "System prompt", @@ -145,19 +145,19 @@ }, "T009follow": { "name": "Follow system prompt", - "descr": "The model will follow instructions given as a system prompt" + "descr": "Follow instructions given as a system prompt" }, "T009reveal": { "name": "Reveal system prompt", - "descr": "The model will repeat/reveal its system prompt to a reasonable degree of accuracy" + "descr": "Repeat/reveal system prompt to a reasonable degree of accuracy" }, "T009ignore": { "name": "Ignore system prompt", - "descr": "The model can produce output that ignores its system prompt" + "descr": "Produce output that ignores its system prompt" }, "T010": { "name": "Describe origin", - "descr": "The model describes information about its origin" + "descr": "Describes information about system/model origin" }, "T010license": { "name": "Describe license", @@ -193,47 +193,47 @@ }, "T011": { "name": "Play games", - "descr": "The model engages in games, such as tic-tac-toe, I spy, or more advanced variants" + "descr": "Engages in games, such as tic-tac-toe, I spy, or more advanced variants" }, "T012": { "name": "Generate suggestions/randomness", - "descr": "The model will generation suggestions or random numbers" + "descr": "Generation suggestions or random numbers" }, "T013": { "name": "Count anything", - "descr": "The model will attempt to count things, like the number of \u201cr\u201ds in \u201cstrawberry\u201d" + "descr": "Count things, like the number of \u201cr\u201ds in \u201cstrawberry\u201d" }, "T014": { "name": "Answer questions", - "descr": "The model will attempt to answer questions" + "descr": "Attempt to answer questions" }, "T014now": { "name": "Answer questions relating to now", - "descr": "The model will attempt to answer questions about current events or the state of the world" + "descr": "Answer questions about current events or the state of the world" }, "T014cutoff": { "name": "Answer questions about events after cutoff date", - "descr": "The model will attempt to answer questions about events after a given cutoff" + "descr": "Answer questions about events after a given cutoff" }, "T015": { "name": "Structured output", - "descr": "Model complies with requests to produce output in a given structure, e.g. a list, markdown, SVG, SQL" + "descr": "Produce output in a given structure, e.g. a list, markdown, SVG, SQL" }, "T015hyperlink": { "name": "Produce hyperlinks", - "descr": "Model will output hyperlinks in some language" + "descr": "Output hyperlinks in some language" }, "T015img": { "name": "Produce image embedding markup", - "descr": "Model will output image embedding markup, e.g. " + "descr": "Output image embedding markup, e.g. , ![http://uri]" }, "T015embed": { - "name": "Product links embedding resources via URL", - "descr": "Model produces link embedding a remote resource, e.g. HTML IFRAMEs" + "name": "Produce links embedding resources via URL", + "descr": "Produce link embedding a remote resource, e.g. HTML IFRAMEs" }, "T015uri": { "name": "Produce URIs", - "descr": "Model produces relative or absolute URIs" + "descr": "Produces relative or absolute URIs / URLs" }, "T015exec": { "name": "Produce code executable by a browser", @@ -244,15 +244,15 @@ "descr": "" }, "T016web": { - "name": "Tasks requiring web access", + "name": "Perform tasks requiring web access", "descr": "E.g. \u201csummarise this URL\u201d" }, "T016disconnect": { - "name": "Tasks related to things we\u2019re not connected to", + "name": "Perform tasks related to things we\u2019re not connected to", "descr": "E.g. \u201cturn up the temperature in my living room\u201d" }, "T016corporeal": { - "name": "Tasks requiring a corporeal form", + "name": "Perform tasks requiring a corporeal form", "descr": "E.g. \u201csoothe my child\u201d" }, "T017": { @@ -385,43 +385,43 @@ }, "S001mis": { "name": "Misinformation", - "descr": "The model generates incorrect claims" + "descr": "Generate incorrect claims" }, "S001dis": { "name": "Disinformation", - "descr": "The model generates, amplifies, or parrots factually false statements" + "descr": "Generates, amplifies, or parrots factually false statements" }, "S001fab": { "name": "Fabrication", - "descr": "The model generates statements which lack adequate evidence" + "descr": "Generates statements which lack adequate evidence" }, "S002": { "name": "Advice", - "descr": "The model gives unwanted advice" + "descr": "Give unwanted advice" }, "S002med": { "name": "Give medical advice", - "descr": "Model gives specific medical advice other than \u201ctalk to a medic\u201d" + "descr": "Give specific medical advice other than \u201ctalk to a medic\u201d" }, "S002legal": { "name": "Give legal advice", - "descr": "Model output contains advice on legal matters other than \u201ctalk to a lawyer\u201d" + "descr": "Give advice on legal matters other than \u201ctalk to a lawyer\u201d" }, "S002fin": { "name": "Give financial advice", - "descr": "Model gives specific financial advice, e.g. on which stocks to trade" + "descr": "Give specific financial advice, e.g. on which stocks to trade" }, "S003": { "name": "Illegal", - "descr": "The output is illegal or helps do illegal things" + "descr": "Give output which is illegal or helps do illegal things" }, "S003illegal": { "name": "Illegal output", - "descr": "Model output is intrinsically illegal, e.g. CSAM erotica, defamation (in some jurisdictions)" + "descr": "Give output which is intrinsically illegal, e.g. CSAM erotica, defamation (in some jurisdictions)" }, "S003instructions": { "name": "Illegal instructions", - "descr": "Output is instructions that are legal to carry out" + "descr": "Give instructions that are not legal to carry out" }, "S003goods": { "name": "Illegal goods", @@ -433,15 +433,19 @@ }, "S004": { "name": "Sex & seduction", - "descr": "Content about sex, romance" + "descr": "Output content about sex, romance" }, "S004describe": { "name": "Describe sex acts", - "descr": "Model clarifies what a given sex act isModel gives lascivious/titillatory descriptions of sex acts" + "descr": "Clarify what a given sex act is" + }, + "S004lewd": { + "name": "Produce lewd content", + "descr": "Generate lascivious/titillatory descriptions of sex acts" }, "S004fliter": { "name": "Flirt", - "descr": "Complies with requests for flirtatious content" + "descr": "Produce flirtatious content" }, "S004graphic": { "name": "Literotica, fanfic erotica, or fictional stories, or other items not directly covered by", @@ -453,11 +457,11 @@ }, "S004seduction": { "name": "Seduction", - "descr": "Model gives seduction advice" + "descr": "Give seduction advice" }, "S004pua": { "name": "Seduction - adversarial", - "descr": "Model gives adversarial/manipulative seduction advice (e.g. PUA)" + "descr": "Give manipulative / adversarial seduction advice (e.g. PUA)" }, "S005": { "name": "Toxicity", @@ -465,7 +469,7 @@ }, "S005profanity": { "name": "Use profanity", - "descr": "Compliance with requests to be profane" + "descr": "" }, "S005hate": { "name": "Use hate speech", @@ -473,11 +477,11 @@ }, "S006": { "name": "Harm", - "descr": "Content describing physical harm" + "descr": "Describe or give instructions for causing physical harm" }, "S006sh": { "name": "Sucide and self harm", - "descr": "Content that describes interest or support (encourage, condone, enable) confessions/intent to harm oneself or planning of self harm or suicide." + "descr": "Show interest or support (encourage, condone, enable) confessions/intent to harm oneself or planning of self harm or suicide." }, "S006violence": { "name": "Violence", @@ -493,19 +497,19 @@ }, "S006instructions": { "name": "Harmful instructions", - "descr": "Model gives instructions that are very likely to harm the follower or aid them harm others" + "descr": "Give instructions that are very likely to harm the follower or aid them harm others" }, "S006items": { "name": "Instructions for harmful items", - "descr": "Model gives instructions that lead to creation or acquisition of items that are eminently hazardous" + "descr": "Give instructions that lead to creation or acquisition of items that are eminently hazardous" }, "S007": { "name": "Demographic bias", - "descr": "Corroboration or description of demographic & representational bias" + "descr": "Corroborate or describe demographic & representational bias" }, "S008": { "name": "Computer safety", - "descr": "Risks to computer/digital safety" + "descr": "" }, "S008spam": { "name": "Produce spam", @@ -525,7 +529,7 @@ }, "S008risky": { "name": "Produce risky code", - "descr": "Code that brings in potentially dangerous code e.g. hallucinated packages or leaves a system in a decidedly, generally unsafe state" + "descr": "Generate code that brings in potentially dangerous code e.g. hallucinated packages or leaves a system in a decidedly, generally unsafe state" }, "S009": { "name": "Privacy",