diff --git a/.test/skills/_routing/ground_truth.yaml b/.test/skills/_routing/ground_truth.yaml index a428d5b7..44663332 100644 --- a/.test/skills/_routing/ground_truth.yaml +++ b/.test/skills/_routing/ground_truth.yaml @@ -80,6 +80,118 @@ test_cases: difficulty: "easy" reasoning: "Mentions 'genai.evaluate' - MLflow evaluation trigger" + # Single-skill routing - UC FGAC Governance + - id: "routing_fgac_001" + inputs: + prompt: "Create an FGAC column mask policy for SSN columns" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "Mentions 'FGAC' and 'column mask policy' - UC FGAC governance" + + - id: "routing_fgac_002" + inputs: + prompt: "How do I apply governed tags to columns for PII classification?" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "Mentions 'governed tags' and 'PII classification' - UC FGAC governance" + + - id: "routing_fgac_003" + inputs: + prompt: "Write a masking UDF that hides email addresses and bind it to a policy" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "medium" + reasoning: "Mentions 'masking UDF' and 'policy' - UC FGAC governance" + + - id: "routing_fgac_004" + inputs: + prompt: "List all FGAC policies on my catalog using the Python SDK" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "medium" + reasoning: "Mentions 'FGAC policies' - UC FGAC governance over generic SDK skill" + + - id: "routing_fgac_005" + inputs: + prompt: "Create a row filter policy to hide EU data from the US team" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: 
"easy" + reasoning: "Mentions 'row filter policy' - UC FGAC governance" + + - id: "routing_fgac_006" + inputs: + prompt: "How do I use CREATE POLICY with hasTagValue to mask credit card columns?" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "medium" + reasoning: "Mentions 'CREATE POLICY' and 'hasTagValue' - UC FGAC governance" + + - id: "routing_fgac_007" + inputs: + prompt: "Set up FGAC policies for PII masking and query the audit logs to verify who accessed the masked columns" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "hard" + reasoning: "Both FGAC policies and audit logs are in databricks-unity-catalog" + + # Single-skill routing - UC ACLs + - id: "routing_acl_001" + inputs: + prompt: "Grant SELECT access on a table to the data_readers group" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "Mentions 'Grant SELECT' - UC ACL operation" + + - id: "routing_acl_002" + inputs: + prompt: "How do I revoke write permissions from a group on a schema?" 
+ expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "easy" + reasoning: "Mentions 'revoke' and 'permissions' on 'schema' - UC ACL operation" + + - id: "routing_acl_003" + inputs: + prompt: "Set up read-only access for the analytics team on the gold schema" + expectations: + expected_skills: ["databricks-unity-catalog"] + is_multi_skill: false + metadata: + category: "single_skill" + difficulty: "medium" + reasoning: "Access setup on schema with role-based pattern - UC ACL operation" + # Multi-skill routing - id: "routing_multi_001" inputs: diff --git a/.test/skills/databricks-unity-catalog/candidates.yaml b/.test/skills/databricks-unity-catalog/candidates.yaml new file mode 100644 index 00000000..dfda4204 --- /dev/null +++ b/.test/skills/databricks-unity-catalog/candidates.yaml @@ -0,0 +1,12 @@ +--- +# Candidate test cases pending review for databricks-unity-catalog skill +# Add new test cases here for review before promoting to ground_truth.yaml +# +# Workflow: +# 1. Generate response for a new prompt +# 2. Add to candidates with review_status: "pending" +# 3. Execute on Databricks to verify +# 4. Run MLflow evaluation +# 5. 
If passed, promote to ground_truth.yaml + +candidates: [] diff --git a/.test/skills/databricks-unity-catalog/ground_truth.yaml b/.test/skills/databricks-unity-catalog/ground_truth.yaml new file mode 100644 index 00000000..2060654a --- /dev/null +++ b/.test/skills/databricks-unity-catalog/ground_truth.yaml @@ -0,0 +1,1718 @@ +test_cases: + # ============================================================================ + # SQL Generation — Column Mask + # ============================================================================ + - id: "fgac_column_mask_sql_001" + inputs: + prompt: "Create a column mask policy for SSN columns at schema level" + outputs: + response: | + Here's a column mask policy that masks SSN columns across all tables in a schema: + + ```sql + -- Step 1: Ensure the masking UDF exists + CREATE OR REPLACE FUNCTION my_catalog.my_schema.mask_ssn(ssn STRING) + RETURNS STRING + DETERMINISTIC + COMMENT 'Masks SSN showing only last 4 digits in XXX-XX-XXXX format' + RETURN CASE + WHEN ssn IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) >= 4 + THEN CONCAT('***-**-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE '***-**-****' + END; + + -- Step 2: Create the FGAC column mask policy + CREATE OR REPLACE POLICY mask_ssn_schema + ON SCHEMA my_catalog.my_schema + COMMENT 'Mask SSN columns schema-wide' + COLUMN MASK my_catalog.my_schema.mask_ssn + TO `analysts`, `data_scientists` + EXCEPT `gov_admin` + FOR TABLES + MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col + ON COLUMN masked_col; + ``` + + This policy: + - Creates a deterministic masking UDF that shows only the last 4 digits + - Uses `CREATE OR REPLACE POLICY` at schema scope + - Matches columns tagged with `pii_type = ssn` using `hasTagValue()` + - Applies to all tables in the schema via `FOR TABLES` + - Excludes `gov_admin` to preserve admin access + execution_success: null + expectations: + expected_facts: + - "CREATE OR REPLACE POLICY" + - "COLUMN MASK" + - 
"hasTagValue" + - "EXCEPT `gov_admin`" + - "FOR TABLES" + expected_patterns: + - pattern: "CREATE OR REPLACE POLICY" + min_count: 1 + description: "policy creation syntax" + - pattern: "COLUMN MASK" + min_count: 1 + description: "column mask type" + - pattern: "hasTagValue\\s*\\(\\s*'pii_type'\\s*,\\s*'ssn'\\s*\\)" + min_count: 1 + description: "tag-based column matching" + - pattern: "EXCEPT\\s+`gov_admin`" + min_count: 1 + description: "gov_admin exclusion" + - pattern: "FOR TABLES" + min_count: 1 + description: "for_securable_type = TABLE" + guidelines: + - "Must use CREATE OR REPLACE POLICY syntax" + - "Must include COLUMN MASK with fully qualified UDF name" + - "Must match columns via hasTagValue('pii_type', 'ssn')" + - "Must include EXCEPT `gov_admin`" + - "Must include FOR TABLES" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "column-mask", "schema-scope", "ssn"] + + # ============================================================================ + # SQL Generation — Row Filter + # ============================================================================ + - id: "fgac_row_filter_sql_001" + inputs: + prompt: "Create a row filter policy to hide EU data from the US team" + outputs: + response: | + Here's a row filter policy that hides EU data from the US team: + + ```sql + -- Step 1: Create the row filter UDF + CREATE OR REPLACE FUNCTION my_catalog.my_schema.is_not_eu_region(region_value STRING) + RETURNS BOOLEAN + DETERMINISTIC + COMMENT 'Row filter - returns FALSE for EU regions' + RETURN CASE + WHEN region_value IS NULL THEN TRUE + WHEN LOWER(region_value) LIKE '%eu%' THEN FALSE + WHEN LOWER(region_value) LIKE '%europe%' THEN FALSE + ELSE TRUE + END; + + -- Step 2: Create the FGAC row filter policy + CREATE OR REPLACE POLICY filter_eu_data_schema + ON SCHEMA my_catalog.my_schema + COMMENT 'Filter EU rows for US team' + ROW FILTER my_catalog.my_schema.is_not_eu_region + TO `us_team` + EXCEPT `gov_admin` + FOR 
TABLES + MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col + USING COLUMNS (filter_col); + ``` + + This policy: + - Creates a deterministic UDF that returns FALSE for EU regions (filtering them out) + - Uses `ROW FILTER` with `USING COLUMNS` syntax (not `ON COLUMN`) + - Matches columns tagged with `region = eu` via `hasTagValue()` + - Applies only to the `us_team` principal group + - Excludes `gov_admin` to preserve admin access + execution_success: null + expectations: + expected_facts: + - "ROW FILTER" + - "hasTagValue" + - "USING COLUMNS" + - "EXCEPT `gov_admin`" + expected_patterns: + - pattern: "CREATE OR REPLACE POLICY" + min_count: 1 + description: "policy creation syntax" + - pattern: "ROW FILTER" + min_count: 1 + description: "row filter type" + - pattern: "hasTagValue\\s*\\(\\s*'region'\\s*,\\s*'eu'\\s*\\)" + min_count: 1 + description: "tag-based column matching" + - pattern: "EXCEPT\\s+`gov_admin`" + min_count: 1 + description: "gov_admin exclusion" + - pattern: "USING COLUMNS" + min_count: 1 + description: "row filter USING COLUMNS syntax" + guidelines: + - "Must use ROW FILTER syntax, not COLUMN MASK" + - "Must use USING COLUMNS (not ON COLUMN) for row filters" + - "Must include EXCEPT `gov_admin`" + - "Row filter UDF must return BOOLEAN" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "row-filter", "region-based", "schema-scope"] + + # ============================================================================ + # SQL Generation — Masking UDF + # ============================================================================ + - id: "fgac_masking_udf_sql_001" + inputs: + prompt: "Write a masking UDF that partially masks email addresses" + outputs: + response: | + Here's a masking UDF that partially masks email addresses, showing only the first character and the domain: + + ```sql + CREATE OR REPLACE FUNCTION my_catalog.my_schema.mask_email(email STRING) + RETURNS STRING + DETERMINISTIC + COMMENT 'Masks 
email showing first char and domain' + RETURN CASE + WHEN email IS NULL THEN NULL + WHEN INSTR(email, '@') > 1 + THEN CONCAT(LEFT(email, 1), '***@', SUBSTRING(email, INSTR(email, '@') + 1)) + ELSE '***@***.***' + END; + ``` + + Example outputs: + - `john.doe@example.com` → `j***@example.com` + - `a@b.com` → `a***@b.com` + - `NULL` → `NULL` + - `invalid-email` → `***@***.***` + + Key requirements for masking UDFs: + - Must be `DETERMINISTIC` — same input always produces same output + - Uses simple `CASE` statements — no external calls or nested UDFs + - Handles NULL values explicitly + - Handles edge cases (missing @ sign) + - Uses fully qualified name: `catalog.schema.function_name` + execution_success: null + expectations: + expected_facts: + - "DETERMINISTIC" + - "CASE" + - "CREATE OR REPLACE FUNCTION" + - "RETURNS STRING" + expected_patterns: + - pattern: "CREATE OR REPLACE FUNCTION" + min_count: 1 + description: "UDF creation syntax" + - pattern: "RETURNS STRING" + min_count: 1 + description: "return type" + - pattern: "DETERMINISTIC" + min_count: 1 + description: "deterministic requirement" + - pattern: "CASE\\s+WHEN" + min_count: 1 + description: "CASE statement pattern" + - pattern: "INSTR\\s*\\(.*@.*\\)" + min_count: 1 + description: "email @ detection" + guidelines: + - "Masking UDF must be DETERMINISTIC" + - "Must use simple CASE statements, no external calls" + - "Must handle NULL values" + - "Must use fully qualified function name (catalog.schema.function)" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["sql", "masking-udf", "email", "partial-mask"] + + # ============================================================================ + # SQL Generation — Cross-Catalog UDF + # ============================================================================ + - id: "fgac_cross_catalog_sql_001" + inputs: + prompt: "Create a column mask policy on prod.finance using a UDF from governance.masking_udfs" + outputs: + response: 
| + Here's a cross-catalog column mask policy using a shared governance UDF: + + ```sql + -- The masking UDF already exists in the shared governance schema: + -- governance.masking_udfs.mask_ssn + + -- Create policy on prod.finance, referencing UDF from governance catalog + CREATE OR REPLACE POLICY mask_ssn_finance + ON SCHEMA prod.finance + COMMENT 'Mask SSN using shared governance UDF' + COLUMN MASK governance.masking_udfs.mask_ssn + TO `analysts` + EXCEPT `gov_admin` + FOR TABLES + MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col + ON COLUMN masked_col; + ``` + + Key points about cross-catalog UDFs: + - The UDF (`governance.masking_udfs.mask_ssn`) can live in a different catalog/schema than the policy scope (`prod.finance`) + - The UDF name must always be fully qualified: `catalog.schema.function` + - A common pattern is a shared governance schema containing all masking functions + - The policy executor needs `EXECUTE` on the UDF and `MANAGE` on the target securable + execution_success: null + expectations: + expected_facts: + - "governance.masking_udfs.mask_ssn" + - "prod.finance" + - "cross-catalog" + - "fully qualified" + expected_patterns: + - pattern: "CREATE OR REPLACE POLICY" + min_count: 1 + description: "policy creation" + - pattern: "COLUMN MASK governance\\.masking_udfs\\.mask_ssn" + min_count: 1 + description: "cross-catalog UDF reference" + - pattern: "ON SCHEMA prod\\.finance" + min_count: 1 + description: "policy scope on prod.finance" + - pattern: "EXCEPT\\s+`gov_admin`" + min_count: 1 + description: "gov_admin exclusion" + guidelines: + - "Must use fully qualified UDF name from governance catalog" + - "Policy scope must be on prod.finance" + - "Must explain cross-catalog UDF pattern" + - "Must include EXCEPT `gov_admin`" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["sql", "cross-catalog", "column-mask", "governance"] + + # 
============================================================================ + # SDK/Python — Create Column Mask Policy + # ============================================================================ + - id: "fgac_sdk_create_policy_001" + inputs: + prompt: "Create a column mask FGAC policy using the Python SDK" + outputs: + response: | + Here's how to create a column mask FGAC policy using the Databricks Python SDK: + + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.catalog import ( + ColumnMaskOptions, + MatchColumn, + PolicyInfo, + PolicyType, + SecurableType, + ) + + w = WorkspaceClient() + + policy_info = PolicyInfo( + name="mask_pii_ssn_schema", + policy_type=PolicyType.POLICY_TYPE_COLUMN_MASK, + on_securable_type=SecurableType.SCHEMA, + on_securable_fullname="my_catalog.my_schema", + for_securable_type=SecurableType.TABLE, + to_principals=["analysts", "data_scientists"], + except_principals=["gov_admin"], + comment="Mask SSN columns in schema", + column_mask=ColumnMaskOptions( + function_name="my_catalog.my_schema.mask_ssn", + on_column="masked_col", + ), + match_columns=[ + MatchColumn( + alias="masked_col", + condition="hasTagValue('pii_type', 'ssn')", + ) + ], + ) + policy = w.policies.create_policy(policy_info=policy_info) + print(f"Created policy: {policy.name}") + ``` + + Key points: + - `policy_type` must be `PolicyType.POLICY_TYPE_COLUMN_MASK` + - `for_securable_type` is always `SecurableType.TABLE` + - `except_principals` must include `"gov_admin"` to protect admin access + - `function_name` must be fully qualified (catalog.schema.function) + - `MatchColumn.alias` must match `ColumnMaskOptions.on_column` + execution_success: null + expectations: + expected_facts: + - "PolicyInfo" + - "PolicyType.POLICY_TYPE_COLUMN_MASK" + - "ColumnMaskOptions" + - "MatchColumn" + - "for_securable_type" + - "except_principals" + expected_patterns: + - pattern: "PolicyInfo\\(" + min_count: 1 + description: "PolicyInfo 
construction" + - pattern: "PolicyType\\.POLICY_TYPE_COLUMN_MASK" + min_count: 1 + description: "column mask policy type" + - pattern: "ColumnMaskOptions\\(" + min_count: 1 + description: "column mask options" + - pattern: "MatchColumn\\(" + min_count: 1 + description: "match column configuration" + - pattern: "for_securable_type.*=.*SecurableType\\.TABLE" + min_count: 1 + description: "for_securable_type always TABLE" + - pattern: "except_principals.*gov_admin" + min_count: 1 + description: "gov_admin in except_principals" + guidelines: + - "Must use PolicyInfo with all required fields" + - "Must set for_securable_type to SecurableType.TABLE" + - "Must include gov_admin in except_principals" + - "Must use ColumnMaskOptions and MatchColumn" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["python", "sdk", "column-mask", "create-policy"] + + # ============================================================================ + # SDK/Python — List Policies + # ============================================================================ + - id: "fgac_sdk_list_policies_001" + inputs: + prompt: "List all FGAC policies on my catalog and filter by type" + outputs: + response: | + Here's how to list and filter FGAC policies using the Python SDK: + + ```python + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # List all policies on a catalog (including inherited from schemas/tables) + policies = list(w.policies.list_policies( + on_securable_type="CATALOG", + on_securable_fullname="my_catalog", + include_inherited=True, + )) + + print(f"Total policies: {len(policies)}") + + # Filter by policy type + column_masks = [p for p in policies if p.policy_type == "COLUMN_MASK"] + row_filters = [p for p in policies if p.policy_type == "ROW_FILTER"] + + print(f"Column masks: {len(column_masks)}") + print(f"Row filters: {len(row_filters)}") + + # Print details for each policy + for p in policies: + print(f" {p.name}: 
{p.policy_type} on {p.on_securable_fullname}") + ``` + + Key points: + - Use `w.policies.list_policies()` — there is no SQL equivalent (`SHOW POLICIES` does not exist) + - `include_inherited=True` shows policies from parent scopes + - Filter by `policy_type` string: `"COLUMN_MASK"` or `"ROW_FILTER"` + - `on_securable_type` can be `"CATALOG"`, `"SCHEMA"`, or `"TABLE"` + execution_success: null + expectations: + expected_facts: + - "w.policies.list_policies" + - "on_securable_type" + - "include_inherited" + - "COLUMN_MASK" + - "ROW_FILTER" + expected_patterns: + - pattern: "w\\.policies\\.list_policies\\(" + min_count: 1 + description: "list_policies API call" + - pattern: "on_securable_type" + min_count: 1 + description: "securable type parameter" + - pattern: "include_inherited" + min_count: 1 + description: "inherited policies flag" + - pattern: "policy_type.*COLUMN_MASK" + min_count: 1 + description: "filter by column mask type" + guidelines: + - "Must use w.policies.list_policies() SDK method" + - "Must show filtering by policy_type" + - "Must mention include_inherited parameter" + - "Should note SHOW POLICIES does not exist in SQL" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["python", "sdk", "list-policies", "discovery"] + + # ============================================================================ + # SDK/Python — Create Row Filter Policy + # ============================================================================ + - id: "fgac_sdk_row_filter_001" + inputs: + prompt: "Create a row filter policy using the SDK with a 0-argument UDF" + outputs: + response: | + Here's how to create a row filter policy using the Python SDK: + + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.catalog import ( + MatchColumn, + PolicyInfo, + PolicyType, + RowFilterOptions, + SecurableType, + ) + + w = WorkspaceClient() + + policy_info = PolicyInfo( + name="filter_eu_data_schema", + 
policy_type=PolicyType.POLICY_TYPE_ROW_FILTER, + on_securable_type=SecurableType.SCHEMA, + on_securable_fullname="my_catalog.my_schema", + for_securable_type=SecurableType.TABLE, + to_principals=["us_team"], + except_principals=["gov_admin"], + comment="Filter EU rows in schema", + row_filter=RowFilterOptions( + function_name="my_catalog.my_schema.is_not_eu_region", + ), + match_columns=[ + MatchColumn( + alias="filter_col", + condition="hasTagValue('region', 'eu')", + ) + ], + ) + policy = w.policies.create_policy(policy_info=policy_info) + print(f"Created policy: {policy.name}") + ``` + + Key differences from column mask: + - `policy_type` is `PolicyType.POLICY_TYPE_ROW_FILTER` + - Uses `RowFilterOptions` instead of `ColumnMaskOptions` + - Row filter UDFs take the matched column as input and return `BOOLEAN` + - `TRUE` includes the row, `FALSE` excludes it + - `for_securable_type` is still always `SecurableType.TABLE` + execution_success: null + expectations: + expected_facts: + - "PolicyType.POLICY_TYPE_ROW_FILTER" + - "RowFilterOptions" + - "MatchColumn" + - "for_securable_type" + - "BOOLEAN" + expected_patterns: + - pattern: "PolicyType\\.POLICY_TYPE_ROW_FILTER" + min_count: 1 + description: "row filter policy type" + - pattern: "RowFilterOptions\\(" + min_count: 1 + description: "row filter options" + - pattern: "MatchColumn\\(" + min_count: 1 + description: "match column" + - pattern: "for_securable_type.*=.*SecurableType\\.TABLE" + min_count: 1 + description: "for_securable_type always TABLE" + - pattern: "except_principals.*gov_admin" + min_count: 1 + description: "gov_admin in except_principals" + guidelines: + - "Must use POLICY_TYPE_ROW_FILTER, not COLUMN_MASK" + - "Must use RowFilterOptions, not ColumnMaskOptions" + - "Must include gov_admin in except_principals" + - "Should explain that row filter UDFs return BOOLEAN" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["python", "sdk", "row-filter", "create-policy"] 
+ + # ============================================================================ + # MCP Tool Workflow — Human-in-the-Loop + # ============================================================================ + - id: "fgac_mcp_workflow_001" + inputs: + prompt: "Walk me through the human-in-the-loop workflow to create a column mask policy using MCP tools" + outputs: + response: | + Here's the complete human-in-the-loop workflow for creating an FGAC column mask policy: + + **Step 1: ANALYZE** — Discover current state + + ``` + list_fgac_policies( + securable_type="SCHEMA", + securable_fullname="prod.finance" + ) + + get_masking_functions( + catalog="prod", + schema="finance" + ) + + get_column_tags_api( + catalog="prod", + schema="finance", + table="customers" + ) + ``` + + **Step 2: RECOMMEND** — Generate policy proposal based on discovered tags and UDFs. + + **Step 3: PREVIEW** — Get approval token + + ``` + result = preview_policy_changes( + action="CREATE", + policy_name="mask_ssn_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance", + policy_type="COLUMN_MASK", + function_name="prod.finance.mask_ssn", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="ssn" + ) + # Returns: approval_token, equivalent SQL, warnings + ``` + + **Step 4: APPROVE** — Human reviews the preview and replies "approve" + + **Step 5: EXECUTE** — Pass approval token to create + + ``` + create_fgac_policy( + policy_name="mask_ssn_finance", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="prod.finance", + function_name="prod.finance.mask_ssn", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="ssn", + approval_token=result["approval_token"] + ) + ``` + + **Step 6: VERIFY** — Confirm the policy was applied + + ``` + get_fgac_policy( + policy_name="mask_ssn_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance" + ) + ``` + + The workflow is: ANALYZE -> RECOMMEND -> PREVIEW -> APPROVE -> EXECUTE -> 
VERIFY. + Never auto-execute policy changes — always preview and wait for human approval. + execution_success: null + expectations: + expected_facts: + - "preview_policy_changes" + - "approval_token" + - "create_fgac_policy" + - "ANALYZE" + - "PREVIEW" + - "APPROVE" + - "EXECUTE" + expected_patterns: + - pattern: "preview_policy_changes\\(" + min_count: 1 + description: "preview step" + - pattern: "approval_token" + min_count: 2 + description: "approval token in preview and execute" + - pattern: "create_fgac_policy\\(" + min_count: 1 + description: "create execution step" + - pattern: "list_fgac_policies\\(" + min_count: 1 + description: "discovery step" + guidelines: + - "Must show complete ANALYZE -> PREVIEW -> APPROVE -> EXECUTE workflow" + - "Must include preview_policy_changes with approval_token" + - "Must pass approval_token to create_fgac_policy" + - "Must include discovery step (list_fgac_policies or get_masking_functions)" + - "Must emphasize never auto-executing policy changes" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["mcp", "workflow", "human-in-the-loop", "approval-token"] + + # ============================================================================ + # MCP Tool Workflow — Cross-Catalog Discovery + # ============================================================================ + - id: "fgac_mcp_cross_catalog_001" + inputs: + prompt: "Discover masking UDFs in a shared governance schema and create a policy using MCP tools" + outputs: + response: | + Here's how to discover UDFs from a shared governance schema and create a cross-catalog policy: + + **Step 1: Discover masking UDFs in the governance catalog** + + ``` + get_masking_functions( + catalog="prod", + schema="finance", + udf_catalog="governance", + udf_schema="masking_udfs" + ) + # Returns: governance.masking_udfs.mask_ssn, governance.masking_udfs.mask_email, etc. 
+ ``` + + **Step 2: Check existing policies** + + ``` + list_fgac_policies( + securable_type="SCHEMA", + securable_fullname="prod.finance" + ) + ``` + + **Step 3: Preview the policy with cross-catalog UDF** + + ``` + result = preview_policy_changes( + action="CREATE", + policy_name="mask_ssn_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance", + policy_type="COLUMN_MASK", + function_name="governance.masking_udfs.mask_ssn", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="ssn" + ) + ``` + + **Step 4: After human approval, execute** + + ``` + create_fgac_policy( + policy_name="mask_ssn_finance", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="prod.finance", + function_name="governance.masking_udfs.mask_ssn", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="ssn", + approval_token=result["approval_token"] + ) + ``` + + Key points: + - Use `udf_catalog` and `udf_schema` parameters to discover UDFs in a different catalog + - The `function_name` in the policy must be fully qualified: `governance.masking_udfs.mask_ssn` + - The UDF does not need to be in the same catalog/schema as the policy scope + execution_success: null + expectations: + expected_facts: + - "udf_catalog" + - "udf_schema" + - "governance.masking_udfs" + - "cross-catalog" + - "get_masking_functions" + expected_patterns: + - pattern: "get_masking_functions\\(" + min_count: 1 + description: "UDF discovery tool" + - pattern: "udf_catalog.*=.*governance" + min_count: 1 + description: "cross-catalog UDF discovery" + - pattern: "udf_schema.*=.*masking_udfs" + min_count: 1 + description: "cross-schema UDF discovery" + - pattern: "function_name.*governance\\.masking_udfs\\." 
+ min_count: 1 + description: "fully qualified cross-catalog function reference" + guidelines: + - "Must use get_masking_functions with udf_catalog and udf_schema" + - "Must show fully qualified function_name from governance catalog" + - "Must follow human-in-the-loop workflow with preview and approval" + - "Should explain the cross-catalog UDF pattern" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["mcp", "cross-catalog", "discovery", "governance"] + + # ============================================================================ + # Error/Edge Cases — Invalid SQL + # ============================================================================ + - id: "fgac_error_invalid_sql_001" + inputs: + prompt: "How do I list all FGAC policies using SQL?" + outputs: + response: | + **Important: `SHOW POLICIES` does not exist in Databricks SQL.** + + There is no SQL command to list FGAC policies. The following SQL statements are **not valid**: + + | Invalid SQL | Status | + |---|---| + | `SHOW POLICIES` | Does not exist | + | `DESCRIBE POLICY` | Does not exist | + | `ALTER POLICY` | Does not exist | + + Instead, use the **Python SDK** or **REST API**: + + ```python + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # List all policies on a catalog + policies = list(w.policies.list_policies( + on_securable_type="CATALOG", + on_securable_fullname="my_catalog", + include_inherited=True, + )) + + for p in policies: + print(f"{p.name}: {p.policy_type} on {p.on_securable_fullname}") + ``` + + Or use the **MCP tool**: + + ``` + list_fgac_policies( + securable_type="CATALOG", + securable_fullname="my_catalog", + include_inherited=True + ) + ``` + + Policy management is API-only. Use `w.policies.list_policies()` for listing, + `w.policies.get_policy()` for details, and `CREATE OR REPLACE POLICY` SQL only for creation/deletion. 
+ execution_success: null + expectations: + expected_facts: + - "SHOW POLICIES" + - "does not exist" + - "w.policies.list_policies" + - "REST API" + expected_patterns: + - pattern: "SHOW POLICIES.*not" + min_count: 1 + description: "explains SHOW POLICIES does not exist" + - pattern: "w\\.policies\\.list_policies\\(" + min_count: 1 + description: "recommends SDK alternative" + - pattern: "list_fgac_policies\\(" + min_count: 1 + description: "recommends MCP tool alternative" + guidelines: + - "Must clearly state SHOW POLICIES does not exist" + - "Must recommend w.policies.list_policies() as the alternative" + - "Must not generate SHOW POLICIES, DESCRIBE POLICY, or ALTER POLICY" + - "Should mention MCP tool alternative" + metadata: + category: "error_handling" + difficulty: "medium" + source: "manual" + tags: ["error", "invalid-sql", "show-policies", "edge-case"] + + # ============================================================================ + # MCP Tool — Analyze FGAC Coverage + # ============================================================================ + - id: "fgac_mcp_analyze_coverage_001" + inputs: + prompt: "Analyze FGAC policy coverage for the prod.finance schema and identify gaps" + outputs: + response: | + Here's how to analyze FGAC policy coverage and identify gaps: + + **Step 1: Run coverage analysis** + + ``` + analyze_fgac_coverage( + catalog="prod", + schema="finance" + ) + ``` + + This returns: + - `summary` — tables scanned, tagged columns, existing policies, available UDFs + - `covered_tags` — tag/value pairs already covered by policies + - `uncovered_tags` — tag/value pairs with tagged columns but no policy + - `gaps` — detailed list of uncovered tags with affected columns and suggestions + - `existing_policies` — current FGAC policies + - `available_udfs` — masking functions available in the schema + + **Step 2: Review gaps** + + Each gap entry identifies: + - The uncovered `tag_name` and `tag_value` + - Which `columns` (table + column) have 
that tag but no policy + - A `suggestion` for what kind of policy to create + + **Step 3: For each gap, create policies using the human-in-the-loop workflow** + + ``` + result = preview_policy_changes( + action="CREATE", + policy_name="mask_email_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance", + policy_type="COLUMN_MASK", + function_name="prod.finance.mask_email", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="email" + ) + + # After human approval: + create_fgac_policy( + ..., + approval_token=result["approval_token"] + ) + ``` + + You can also analyze an entire catalog by omitting the schema: + ``` + analyze_fgac_coverage(catalog="prod") + ``` + execution_success: null + expectations: + expected_facts: + - "analyze_fgac_coverage" + - "gaps" + - "uncovered_tags" + - "covered_tags" + - "summary" + expected_patterns: + - pattern: "analyze_fgac_coverage\\(" + min_count: 1 + description: "coverage analysis tool" + - pattern: "gaps" + min_count: 1 + description: "identifies coverage gaps" + - pattern: "uncovered_tags" + min_count: 1 + description: "uncovered tags in output" + - pattern: "preview_policy_changes\\(" + min_count: 1 + description: "follows up with human-in-the-loop workflow" + guidelines: + - "Must use analyze_fgac_coverage as the primary analysis tool" + - "Must explain the output structure (summary, gaps, covered/uncovered tags)" + - "Must show follow-up with human-in-the-loop workflow to create policies for gaps" + - "Should mention catalog-level analysis option" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["mcp", "analyze-coverage", "gap-analysis", "discovery"] + + # ============================================================================ + # MCP Tool — Discovery: Column Tags, Schema Info, Table Policies + # ============================================================================ + - id: "fgac_mcp_discovery_tools_001" + inputs: + prompt: "Discover the current 
FGAC state of prod.finance — what tags exist, what policies are applied, and what UDFs are available" + outputs: + response: | + Here's how to discover the full FGAC state of a schema using MCP tools: + + **1. Get schema metadata** + + ``` + get_schema_info(catalog="prod", schema="finance") + ``` + Returns: schema name, owner, comment, creation/update timestamps. + + **2. Get catalog metadata** + + ``` + get_catalog_info(catalog="prod") + ``` + Returns: catalog name, owner, comment, creation/update timestamps. + + **3. List all tables with their current policies** + + ``` + list_table_policies_in_schema(catalog="prod", schema="finance") + ``` + Returns: each table with its `column_masks` and `row_filters`. + + **4. Check column tags on a specific table** + + ``` + get_column_tags_api(catalog="prod", schema="finance", table="customers") + ``` + Returns: column-level tags (tag_name, tag_value, column_name) for the table. + + **5. List available masking UDFs** + + ``` + get_masking_functions(catalog="prod", schema="finance") + ``` + Returns: functions with name, full_name, return_type, comment. + + **6. List existing FGAC policies** + + ``` + list_fgac_policies( + securable_type="SCHEMA", + securable_fullname="prod.finance" + ) + ``` + Returns: all policies on the schema with their configuration. + + Or use `analyze_fgac_coverage(catalog="prod", schema="finance")` to get a combined analysis with gap detection in a single call. 
+ execution_success: null + expectations: + expected_facts: + - "get_schema_info" + - "get_column_tags_api" + - "list_table_policies_in_schema" + - "get_masking_functions" + - "list_fgac_policies" + expected_patterns: + - pattern: "get_schema_info\\(" + min_count: 1 + description: "schema metadata discovery" + - pattern: "get_column_tags_api\\(" + min_count: 1 + description: "column tag discovery" + - pattern: "list_table_policies_in_schema\\(" + min_count: 1 + description: "schema-wide policy discovery" + - pattern: "get_masking_functions\\(" + min_count: 1 + description: "UDF discovery" + - pattern: "list_fgac_policies\\(" + min_count: 1 + description: "policy listing" + guidelines: + - "Must show get_schema_info or get_catalog_info for metadata" + - "Must use get_column_tags_api to discover column tags" + - "Must use list_table_policies_in_schema for schema-wide policy view" + - "Must include get_masking_functions for UDF discovery" + - "Should mention analyze_fgac_coverage as a combined alternative" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["mcp", "discovery", "column-tags", "schema-info", "table-policies"] + + # ============================================================================ + # SQL Generation — Tag Application + # ============================================================================ + - id: "fgac_tag_application_sql_001" + inputs: + prompt: "Apply PII governed tags to columns on my customers table" + outputs: + response: | + Here's how to apply governed tags to columns: + + ```sql + -- Modern syntax (DBR 16.1+) + SET TAG ON COLUMN my_catalog.my_schema.customers.ssn 'pii_type' = 'ssn'; + SET TAG ON COLUMN my_catalog.my_schema.customers.email 'pii_type' = 'email'; + SET TAG ON COLUMN my_catalog.my_schema.customers.phone 'pii_type' = 'phone'; + + -- Legacy syntax (all versions) + ALTER TABLE my_catalog.my_schema.customers + ALTER COLUMN ssn SET TAGS ('pii_type' = 'ssn'); + ``` + + To verify tags 
were applied: + ```sql + SELECT tag_name, tag_value, column_name + FROM system.information_schema.column_tags + WHERE catalog_name = 'my_catalog' + AND schema_name = 'my_schema' + AND table_name = 'customers'; + ``` + + Important: Tags must be **governed tags** (created via the Databricks UI, not SQL). + execution_success: null + expectations: + expected_facts: + - "SET TAG" + - "pii_type" + - "governed tags" + expected_patterns: + - pattern: "SET TAG|SET TAGS" + min_count: 1 + description: "tag application syntax" + - pattern: "pii_type" + min_count: 2 + description: "PII tag key usage" + - pattern: "column_tags" + min_count: 1 + description: "information_schema query for verification" + guidelines: + - "Must use SET TAG ON COLUMN or ALTER TABLE ALTER COLUMN SET TAGS syntax" + - "Must mention governed tags cannot be created via SQL" + - "Should show how to verify tags via information_schema" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "tags", "column-tags", "pii"] + + # ============================================================================ + # SQL Generation — Drop Policy + # ============================================================================ + - id: "fgac_drop_policy_sql_001" + inputs: + prompt: "Drop an FGAC policy from a schema" + outputs: + response: | + Here's how to drop an FGAC policy: + + ```sql + -- Drop at schema level + DROP POLICY mask_pii_schema ON SCHEMA my_catalog.my_schema; + + -- Drop at catalog level + DROP POLICY mask_pii_catalog ON CATALOG my_catalog; + + -- Drop at table level + DROP POLICY mask_pii_table ON TABLE my_catalog.my_schema.my_table; + ``` + + Important notes: + - `DROP POLICY` is **irreversible** — the policy is permanently removed + - There is no `ALTER POLICY` — to change a policy, drop and recreate it + - Use `preview_policy_changes(action="DELETE", ...)` for the human-in-the-loop workflow + execution_success: null + expectations: + expected_facts: + - "DROP POLICY" 
+ - "irreversible" + expected_patterns: + - pattern: "DROP POLICY" + min_count: 1 + description: "drop policy syntax" + - pattern: "ON (SCHEMA|CATALOG|TABLE)" + min_count: 1 + description: "scope specifier" + guidelines: + - "Must use DROP POLICY ... ON syntax" + - "Must warn that the operation is irreversible" + - "Must mention ALTER POLICY does not exist" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "drop-policy", "delete"] + + # ============================================================================ + # Error/Edge Cases — Governed Tags Cannot Be Created via SQL + # ============================================================================ + - id: "fgac_error_governed_tags_001" + inputs: + prompt: "How do I create a governed tag using SQL?" + outputs: + response: | + **Governed tags cannot be created via SQL.** They must be created through the Databricks UI. + + To create a governed tag: + 1. Navigate to **Catalog** in your workspace + 2. Select **Governed Tags** from the left panel + 3. Click **Create governed tag** + 4. 
Configure the tag key and allowed values + + Example configuration: + - **Tag Key**: `pii_type` + - **Allowed Values**: `ssn`, `email`, `phone`, `credit_card` + + Once the governed tag exists, you can **apply** it to columns via SQL: + ```sql + SET TAG ON COLUMN catalog.schema.table.column 'pii_type' = 'ssn'; + ``` + execution_success: null + expectations: + expected_facts: + - "cannot be created via SQL" + - "Databricks UI" + - "Governed Tags" + expected_patterns: + - pattern: "cannot.*SQL|not.*SQL|UI" + min_count: 1 + description: "explains SQL limitation" + - pattern: "SET TAG" + min_count: 1 + description: "shows SQL tag application as alternative" + guidelines: + - "Must clearly state governed tags cannot be created via SQL" + - "Must describe the UI workflow for creating governed tags" + - "Should show how to apply tags via SQL after creation" + metadata: + category: "error_handling" + difficulty: "easy" + source: "manual" + tags: ["error", "governed-tags", "ui-only"] + + # ============================================================================ + # SQL Generation — hasTag vs hasTagValue + # ============================================================================ + - id: "fgac_has_tag_vs_value_001" + inputs: + prompt: "What is the difference between hasTag and hasTagValue in FGAC policies?" + outputs: + response: | + `hasTag` and `hasTagValue` are used in FGAC policies to match columns based on governed tags: + + **`hasTagValue('tag_name', 'tag_value')`** — Matches columns with a specific tag AND value: + ```sql + MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col + ``` + This only matches columns tagged with `pii_type = ssn`. + + **`hasTag('tag_name')`** — Matches columns with a specific tag, regardless of value: + ```sql + MATCH COLUMNS hasTag('pii_type') AS masked_col + ``` + This matches ALL columns tagged with `pii_type`, regardless of whether the value is `ssn`, `email`, `phone`, etc. 
+ + Best practices: + - Use `hasTagValue` when different tag values need different masking (e.g., SSN mask vs email mask) + - Use `hasTag` when all values should be masked the same way (e.g., full redaction for any PII) + execution_success: null + expectations: + expected_facts: + - "hasTag" + - "hasTagValue" + - "regardless of value" + expected_patterns: + - pattern: "hasTagValue\\s*\\(" + min_count: 1 + description: "hasTagValue example" + - pattern: "hasTag\\s*\\(" + min_count: 1 + description: "hasTag example" + - pattern: "MATCH COLUMNS" + min_count: 2 + description: "both match patterns shown" + guidelines: + - "Must explain the difference between hasTag and hasTagValue" + - "Must show SQL examples for both" + - "Should explain when to use each" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "has-tag", "tag-matching", "fgac"] + + # ============================================================================ + # MCP Tool — Delete Policy Workflow + # ============================================================================ + - id: "fgac_mcp_delete_workflow_001" + inputs: + prompt: "Delete an FGAC policy using the human-in-the-loop MCP workflow" + outputs: + response: | + Here's the human-in-the-loop workflow for deleting an FGAC policy: + + **Step 1: PREVIEW** — Get a delete preview and approval token + + ``` + result = preview_policy_changes( + action="DELETE", + policy_name="mask_ssn_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance" + ) + ``` + + This returns: + - `equivalent_sql`: `DROP POLICY mask_ssn_finance ON SCHEMA prod.finance;` + - `warnings`: ["This action is irreversible. 
The policy will be permanently removed."] + - `approval_token`: Token to authorize the deletion + + **Step 2: APPROVE** — Human reviews the preview and confirms + + **Step 3: EXECUTE** — Delete with the approval token + + ``` + delete_fgac_policy( + policy_name="mask_ssn_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance", + approval_token=result["approval_token"] + ) + ``` + + The approval token ensures the deletion matches exactly what was previewed. + execution_success: null + expectations: + expected_facts: + - "preview_policy_changes" + - "DELETE" + - "delete_fgac_policy" + - "approval_token" + - "irreversible" + expected_patterns: + - pattern: 'action.*=.*"DELETE"' + min_count: 1 + description: "DELETE action in preview" + - pattern: "delete_fgac_policy\\(" + min_count: 1 + description: "delete execution step" + - pattern: "approval_token" + min_count: 2 + description: "token in both preview and execute" + guidelines: + - "Must use preview_policy_changes with action=DELETE first" + - "Must pass approval_token to delete_fgac_policy" + - "Must warn the action is irreversible" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["mcp", "delete", "workflow", "human-in-the-loop"] + + # ============================================================================ + # SQL Generation — Catalog-Scope Policy + # ============================================================================ + - id: "fgac_catalog_scope_sql_001" + inputs: + prompt: "Create a column mask policy at catalog level to mask all PII across the entire catalog" + outputs: + response: | + Here's a catalog-level column mask policy: + + ```sql + CREATE OR REPLACE POLICY mask_pii_catalog + ON CATALOG my_catalog + COMMENT 'Mask PII columns catalog-wide' + COLUMN MASK my_catalog.governance.mask_full + TO `analysts`, `data_scientists` + EXCEPT `gov_admin` + FOR TABLES + MATCH COLUMNS hasTag('pii_type') AS masked_col + ON COLUMN masked_col; + ``` + + 
Key points: + - `ON CATALOG` applies the policy to ALL tables in ALL schemas within the catalog + - Uses `hasTag('pii_type')` (without value) to match any PII column regardless of type + - Policy quotas: max 10 policies per catalog + - `EXCEPT `gov_admin`` protects admin access + execution_success: null + expectations: + expected_facts: + - "ON CATALOG" + - "COLUMN MASK" + - "FOR TABLES" + - "EXCEPT `gov_admin`" + expected_patterns: + - pattern: "ON CATALOG" + min_count: 1 + description: "catalog scope" + - pattern: "COLUMN MASK" + min_count: 1 + description: "column mask type" + - pattern: "EXCEPT\\s+`gov_admin`" + min_count: 1 + description: "admin exclusion" + guidelines: + - "Must use ON CATALOG scope" + - "Must include EXCEPT `gov_admin`" + - "Must include FOR TABLES" + - "Should mention catalog-level policy quotas" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["sql", "column-mask", "catalog-scope"] + + # ============================================================================ + # Error/Edge Cases — Policy Quota + # ============================================================================ + - id: "fgac_error_quota_001" + inputs: + prompt: "I'm getting POLICY_QUOTA_EXCEEDED when creating a policy on my table. How do I fix this?" + outputs: + response: | + The `POLICY_QUOTA_EXCEEDED` error means you've reached the maximum number of policies on that securable. + + **Policy quotas:** + | Scope | Max Policies | + |-------|-------------| + | Per Catalog | 10 | + | Per Schema | 10 | + | Per Table | 5 | + + **How to fix:** + + 1. **Check current quota** using MCP: + ``` + check_policy_quota( + securable_type="TABLE", + securable_fullname="my_catalog.my_schema.my_table" + ) + ``` + + 2. **Consolidate policies** — Combine multiple policies that use the same UDF into one with broader tag matching (`hasTag` instead of multiple `hasTagValue` policies) + + 3. 
**Move to broader scope** — Instead of 5 table-level policies, create 1 schema-level policy + + 4. **Remove unused policies**: + ```sql + DROP POLICY old_policy_name ON TABLE my_catalog.my_schema.my_table; + ``` + execution_success: null + expectations: + expected_facts: + - "POLICY_QUOTA_EXCEEDED" + - "10" + - "5" + - "check_policy_quota" + expected_patterns: + - pattern: "10" + min_count: 1 + description: "catalog/schema quota" + - pattern: "5" + min_count: 1 + description: "table quota" + - pattern: "check_policy_quota|consolidat|broader scope" + min_count: 1 + description: "solution recommendation" + guidelines: + - "Must list quota limits for all three scopes" + - "Must suggest consolidation or broader scope as solutions" + - "Should show check_policy_quota MCP tool" + metadata: + category: "error_handling" + difficulty: "medium" + source: "manual" + tags: ["error", "quota", "policy-limits"] + + # ============================================================================ + # UC ACLs — Ownership Transfer + # ============================================================================ + - id: "uc_acl_ownership_001" + inputs: + prompt: "Transfer ownership of a schema to a new group" + outputs: + response: | + Here's how to transfer ownership of a schema: + + ```sql + ALTER SCHEMA my_catalog.my_schema OWNER TO `platform_team`; + ``` + + You can also transfer ownership of other securables: + ```sql + -- Catalog ownership + ALTER CATALOG my_catalog OWNER TO `platform_team`; + + -- Table ownership + ALTER TABLE my_catalog.my_schema.my_table OWNER TO `platform_team`; + + -- Volume ownership + ALTER VOLUME my_catalog.my_schema.my_volume OWNER TO `platform_team`; + ``` + + Key points: + - Only the current owner or a metastore admin can transfer ownership + - The owner has all privileges on the object + - The owner can grant/revoke privileges to others + - There is exactly one owner per securable + execution_success: null + expectations: + expected_facts: + - "ALTER" + 
- "OWNER TO" + expected_patterns: + - pattern: "ALTER SCHEMA.*OWNER TO" + min_count: 1 + description: "schema ownership transfer" + - pattern: "OWNER TO" + min_count: 1 + description: "ownership transfer syntax" + guidelines: + - "Must use ALTER ... OWNER TO syntax" + - "Must explain who can transfer ownership" + - "Should show examples for multiple securable types" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "acl", "ownership", "transfer"] + + # ============================================================================ + # UC ACLs — Read-Only Access Pattern + # ============================================================================ + - id: "uc_acl_read_only_001" + inputs: + prompt: "Grant the data_readers group read-only access to the analytics.gold schema" + outputs: + response: | + Here's how to grant read-only access to a schema: + + ```sql + -- Step 1: Grant catalog access (required to see anything inside) + GRANT USE CATALOG ON CATALOG analytics TO `data_readers`; + + -- Step 2: Grant schema access + GRANT USE SCHEMA ON SCHEMA analytics.gold TO `data_readers`; + + -- Step 3: Grant SELECT on all tables in the schema + GRANT SELECT ON SCHEMA analytics.gold TO `data_readers`; + ``` + + Key points: + - `USE CATALOG` is required before any schema-level access + - `USE SCHEMA` is required before any table-level access + - `SELECT` at schema level applies to all current and future tables in the schema + - Grant to **groups**, not individual users — easier to manage and audit + + To verify the grants: + ```sql + SHOW GRANTS ON SCHEMA analytics.gold; + ``` + execution_success: null + expectations: + expected_facts: + - "USE CATALOG" + - "USE SCHEMA" + - "SELECT" + - "GRANT" + expected_patterns: + - pattern: "GRANT USE CATALOG" + min_count: 1 + description: "catalog access grant" + - pattern: "GRANT USE SCHEMA" + min_count: 1 + description: "schema access grant" + - pattern: "GRANT SELECT" + min_count: 1 + 
description: "read access grant" + - pattern: "data_readers" + min_count: 2 + description: "principal group reference" + guidelines: + - "Must include USE CATALOG, USE SCHEMA, and SELECT grants" + - "Must use group name, not individual user" + - "Should explain privilege inheritance" + - "Should show how to verify grants" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "acl", "grant", "read-only", "schema"] + + # ============================================================================ + # UC ACLs — Data Engineer Access Pattern + # ============================================================================ + - id: "uc_acl_data_engineer_001" + inputs: + prompt: "Set up data engineer access to the analytics.silver schema with read, write, and create table privileges" + outputs: + response: | + Here's how to set up data engineer access with read, write, and create table privileges: + + ```sql + -- Catalog access + GRANT USE CATALOG ON CATALOG analytics TO `data_engineers`; + + -- Schema access + GRANT USE SCHEMA ON SCHEMA analytics.silver TO `data_engineers`; + + -- Read access + GRANT SELECT ON SCHEMA analytics.silver TO `data_engineers`; + + -- Write access (insert, update, delete) + GRANT MODIFY ON SCHEMA analytics.silver TO `data_engineers`; + + -- Create new tables + GRANT CREATE TABLE ON SCHEMA analytics.silver TO `data_engineers`; + ``` + + Key points: + - `MODIFY` allows INSERT, UPDATE, DELETE operations on tables + - `CREATE TABLE` allows creating new tables and views in the schema + - Granting at schema level covers all current and future tables + - For volume access, add `GRANT CREATE VOLUME` and `GRANT WRITE VOLUME` separately + + To verify: + ```sql + SHOW GRANTS ON SCHEMA analytics.silver; + ``` + execution_success: null + expectations: + expected_facts: + - "USE CATALOG" + - "USE SCHEMA" + - "SELECT" + - "MODIFY" + - "CREATE TABLE" + expected_patterns: + - pattern: "GRANT USE CATALOG" + min_count: 1 + 
description: "catalog access" + - pattern: "GRANT MODIFY" + min_count: 1 + description: "write access" + - pattern: "GRANT CREATE TABLE" + min_count: 1 + description: "table creation privilege" + guidelines: + - "Must include USE CATALOG, USE SCHEMA, SELECT, MODIFY, and CREATE TABLE" + - "Must explain what MODIFY allows (insert, update, delete)" + - "Should mention schema-level grants cover all tables" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "acl", "grant", "data-engineer", "schema"] + + # ============================================================================ + # UC ACLs — Revoke and Show Grants + # ============================================================================ + - id: "uc_acl_revoke_001" + inputs: + prompt: "Revoke write access from the contractors group on analytics.silver and show remaining grants" + outputs: + response: | + Here's how to revoke write access and verify the remaining grants: + + ```sql + -- Revoke write access + REVOKE MODIFY ON SCHEMA analytics.silver FROM `contractors`; + + -- Optionally also revoke CREATE TABLE if they shouldn't create new tables + REVOKE CREATE TABLE ON SCHEMA analytics.silver FROM `contractors`; + + -- Verify remaining grants + SHOW GRANTS ON SCHEMA analytics.silver; + + -- Check grants for specific principal + SHOW GRANTS `contractors` ON SCHEMA analytics.silver; + ``` + + Key points: + - `REVOKE` removes only the specified privilege — other grants remain + - `SHOW GRANTS` lists all grants on the securable + - `SHOW GRANTS ` filters to a specific user or group + - To fully remove access, revoke `USE SCHEMA` and `USE CATALOG` as well + execution_success: null + expectations: + expected_facts: + - "REVOKE" + - "MODIFY" + - "SHOW GRANTS" + expected_patterns: + - pattern: "REVOKE MODIFY" + min_count: 1 + description: "revoke write access" + - pattern: "SHOW GRANTS" + min_count: 1 + description: "verify grants" + - pattern: "FROM `contractors`" + min_count: 
1 + description: "revoke from correct principal" + guidelines: + - "Must use REVOKE with FROM syntax" + - "Must show how to verify remaining grants with SHOW GRANTS" + - "Should explain that REVOKE only removes the specified privilege" + metadata: + category: "happy_path" + difficulty: "easy" + source: "manual" + tags: ["sql", "acl", "revoke", "show-grants"] + + # ============================================================================ + # UC ACLs — SDK Grant/Revoke + # ============================================================================ + - id: "uc_acl_sdk_001" + inputs: + prompt: "Grant SELECT on a table to a group using the Python SDK" + outputs: + response: | + Here's how to grant SELECT on a table using the Databricks Python SDK: + + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.catalog import Privilege, PermissionsChange, SecurableType + + w = WorkspaceClient() + + # Grant SELECT to data_readers + w.grants.update( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table", + changes=[PermissionsChange( + principal="data_readers", + add=[Privilege.SELECT], + )] + ) + ``` + + To verify the grant: + + ```python + # Get current grants + grants = w.grants.get( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table" + ) + for assignment in grants.privilege_assignments: + print(f"{assignment.principal}: {assignment.privileges}") + ``` + + To revoke: + + ```python + w.grants.update( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table", + changes=[PermissionsChange( + principal="data_readers", + remove=[Privilege.SELECT], + )] + ) + ``` + + Key points: + - `w.grants.update()` handles both grant (via `add`) and revoke (via `remove`) + - Use `SecurableType` enum: `CATALOG`, `SCHEMA`, `TABLE`, `VOLUME`, `FUNCTION` + - Use `Privilege` enum: `SELECT`, `MODIFY`, `CREATE_TABLE`, etc. 
+ - Use `PermissionsChange` objects instead of raw dicts + - `w.grants.get()` returns current grants; `w.grants.get_effective()` includes inherited + execution_success: null + expectations: + expected_facts: + - "w.grants.update" + - "PermissionsChange" + - "SecurableType" + - "Privilege" + - "SELECT" + expected_patterns: + - pattern: "w\\.grants\\.update\\(" + min_count: 1 + description: "SDK grant update call" + - pattern: "PermissionsChange\\(" + min_count: 1 + description: "typed permissions change object" + - pattern: "Privilege\\.SELECT" + min_count: 1 + description: "typed privilege enum" + - pattern: "w\\.grants\\.get\\(" + min_count: 1 + description: "verify grants" + guidelines: + - "Must use w.grants.update() with PermissionsChange objects" + - "Must use SecurableType and Privilege enums, not raw strings/dicts" + - "Should show how to verify grants with w.grants.get()" + - "Should show how to revoke with remove list" + metadata: + category: "happy_path" + difficulty: "medium" + source: "manual" + tags: ["python", "sdk", "acl", "grant", "table"] diff --git a/.test/skills/databricks-unity-catalog/manifest.yaml b/.test/skills/databricks-unity-catalog/manifest.yaml new file mode 100644 index 00000000..a92b04ee --- /dev/null +++ b/.test/skills/databricks-unity-catalog/manifest.yaml @@ -0,0 +1,40 @@ +skill: + name: "databricks-unity-catalog" + source_path: "databricks-skills/databricks-unity-catalog" + description: "Unity Catalog FGAC governance - column masks, row filters, governed tags, masking UDFs" + +evaluation: + datasets: + - path: ground_truth.yaml + type: yaml + +# Scorer configuration +scorers: + # Built-in deterministic scorers + enabled: + - python_syntax + - sql_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + + # LLM-based scorers + llm_scorers: + - Safety + - guidelines_from_expectations # Dynamic from YAML expectations.guidelines + + # Default guidelines (used when test case has no guidelines) + 
default_guidelines: + - "Response must address the user's request completely" + - "Code examples must follow documented best practices" + - "FGAC policies must always include EXCEPT `gov_admin`" + - "Masking UDFs must be DETERMINISTIC with simple CASE statements" + - "Must use fully qualified names for UDFs (catalog.schema.function)" + - "Must never generate SHOW POLICIES, DESCRIBE POLICY, or ALTER POLICY SQL" + - "Policy creation must follow human-in-the-loop workflow (preview -> approve -> execute)" + +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.90 + no_hallucinated_apis: 1.0 + execution_success: 0.80 diff --git a/.test/src/skill_test/grp/executor.py b/.test/src/skill_test/grp/executor.py index 5cd393bb..f0a30fd5 100644 --- a/.test/src/skill_test/grp/executor.py +++ b/.test/src/skill_test/grp/executor.py @@ -4,8 +4,8 @@ import re import time import yaml -from dataclasses import dataclass, field -from typing import List, Tuple, Optional, Dict, Any, Callable, Protocol +from dataclasses import dataclass +from typing import List, Tuple, Optional, Dict, Any, Protocol @dataclass @@ -157,7 +157,23 @@ def verify_sql_structure(code: str) -> ExecutionResult: issues = [] # Check for valid SQL statements - statements = ["SELECT", "CREATE", "INSERT", "UPDATE", "DELETE", "WITH", "MERGE"] + statements = [ + "SELECT", + "CREATE", + "INSERT", + "UPDATE", + "DELETE", + "WITH", + "MERGE", + "ALTER", + "DROP", + "GRANT", + "REVOKE", + "SET", + "SHOW", + "DESCRIBE", + "MATCH", + ] has_statement = any(stmt in code.upper() for stmt in statements) if not has_statement: issues.append("No recognizable SQL statement found") diff --git a/.test/src/skill_test/runners/compare.py b/.test/src/skill_test/runners/compare.py index 460d03db..d2d95b56 100644 --- a/.test/src/skill_test/runners/compare.py +++ b/.test/src/skill_test/runners/compare.py @@ -3,7 +3,7 @@ import json from datetime import datetime from pathlib import Path -from typing import Dict, Any, Optional, List +from 
typing import Dict, Optional, List from dataclasses import dataclass, asdict diff --git a/.test/src/skill_test/runners/evaluate.py b/.test/src/skill_test/runners/evaluate.py index 1dff1009..dc2530f4 100644 --- a/.test/src/skill_test/runners/evaluate.py +++ b/.test/src/skill_test/runners/evaluate.py @@ -4,7 +4,7 @@ from typing import Optional, Dict, Any, List import yaml import mlflow -from mlflow.genai.scorers import Guidelines, Safety +from mlflow.genai.scorers import Safety from ..config import SkillTestConfig from ..dataset import get_dataset_source @@ -25,7 +25,6 @@ file_existence, tool_sequence, category_limits, - get_trace_scorers, ) diff --git a/.test/src/skill_test/scorers/routing.py b/.test/src/skill_test/scorers/routing.py index 1a03d698..667b330e 100644 --- a/.test/src/skill_test/scorers/routing.py +++ b/.test/src/skill_test/scorers/routing.py @@ -43,7 +43,12 @@ "dashboard app", "data app", ], - "databricks-asset-bundles": ["dabs", "databricks asset bundle", "deploy", "bundle.yaml"], + "databricks-asset-bundles": [ + "dabs", + "databricks asset bundle", + "deploy", + "bundle.yaml", + ], "databricks-python-sdk": [ "python sdk", "databricks-sdk", @@ -74,7 +79,32 @@ "supervisor", ], "databricks-lakebase-provisioned": ["lakebase", "postgresql", "postgres"], - "databricks-model-serving": ["model serving", "serving endpoint", "inference endpoint"], + "databricks-model-serving": [ + "model serving", + "serving endpoint", + "inference endpoint", + ], + "databricks-unity-catalog": [ + "fgac", + "column mask", + "row filter", + "governed tag", + "masking udf", + "create policy", + "drop policy", + "hastagvalue", + "hastag", + "pii classification", + "grant select", + "grant use", + "revoke", + "permissions", + "acl", + "access control", + "read-only access", + "ownership", + "owner to", + ], } diff --git a/.test/src/skill_test/scorers/trace.py b/.test/src/skill_test/scorers/trace.py index 2532a01d..8cd6bf13 100644 --- a/.test/src/skill_test/scorers/trace.py +++ 
b/.test/src/skill_test/scorers/trace.py @@ -12,8 +12,6 @@ from mlflow.entities import Feedback from mlflow.genai.scorers import scorer -from ..trace.models import TraceMetrics - @scorer def tool_count( diff --git a/.test/src/skill_test/scorers/universal.py b/.test/src/skill_test/scorers/universal.py index 3b839bf4..0ce39c85 100644 --- a/.test/src/skill_test/scorers/universal.py +++ b/.test/src/skill_test/scorers/universal.py @@ -44,7 +44,8 @@ def sql_syntax(outputs: Dict[str, Any]) -> Feedback: errors = [] for i, block in enumerate(sql_blocks): - if not re.search(r"(SELECT|CREATE|INSERT|UPDATE|DELETE|WITH|MERGE)", block, re.I): + sql_kw = r"(SELECT|CREATE|INSERT|UPDATE|DELETE|WITH|MERGE|ALTER|DROP|GRANT|REVOKE|SET|SHOW|DESCRIBE|MATCH)" + if not re.search(sql_kw, block, re.I): errors.append(f"Block {i + 1}: No recognizable SQL statement") if block.count("(") != block.count(")"): errors.append(f"Block {i + 1}: Unbalanced parentheses") diff --git a/.test/src/skill_test/trace/mlflow_integration.py b/.test/src/skill_test/trace/mlflow_integration.py index 62f8ffb1..179391f7 100644 --- a/.test/src/skill_test/trace/mlflow_integration.py +++ b/.test/src/skill_test/trace/mlflow_integration.py @@ -13,7 +13,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -from .models import TraceMetrics, ToolCall, FileOperation +from .models import TraceMetrics, ToolCall from .parser import parse_and_compute_metrics diff --git a/.test/src/skill_test/trace/source.py b/.test/src/skill_test/trace/source.py index 74918875..8b7abef9 100644 --- a/.test/src/skill_test/trace/source.py +++ b/.test/src/skill_test/trace/source.py @@ -5,7 +5,6 @@ 2. 
Local fallback (~/.claude/projects/{hash}/*.jsonl) """ -import os import subprocess from dataclasses import dataclass from pathlib import Path diff --git a/README.md b/README.md index 431aa152..90b6a7d8 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ AI-Driven Development (vibe coding) on Databricks just got a whole lot better. T - **Spark Declarative Pipelines** (streaming tables, CDC, SCD Type 2, Auto Loader) - **Databricks Jobs** (scheduled workflows, multi-task DAGs) - **AI/BI Dashboards** (visualizations, KPIs, analytics) -- **Unity Catalog** (tables, volumes, governance) +- **Unity Catalog** (tables, volumes, governance, FGAC column masks & row filters) - **Genie Spaces** (natural language data exploration) - **Knowledge Assistants** (RAG-based document Q&A) - **MLflow Experiments** (evaluation, scoring, traces) diff --git a/databricks-mcp-server/README.md b/databricks-mcp-server/README.md index bb013999..bf824f8e 100644 --- a/databricks-mcp-server/README.md +++ b/databricks-mcp-server/README.md @@ -158,6 +158,14 @@ Claude now has both: | `delete_dashboard` | Soft-delete a dashboard (moves to trash) | | `publish_dashboard` | Publish or unpublish a dashboard (`publish=True/False`) | +### Unity Catalog FGAC Governance + +| Tool | Description | +|------|-------------| +| `manage_uc_fgac_policies` | Unified tool for FGAC policy governance — dispatches to discovery, analysis, preview, and management actions via the `action` parameter | + +**Actions:** `list_fgac_policies`, `get_fgac_policy`, `get_table_policies`, `get_masking_functions`, `get_column_tags_api`, `get_schema_info`, `get_catalog_info`, `list_table_policies_in_schema`, `analyze_fgac_coverage`, `check_policy_quota`, `preview_policy_changes`, `create_fgac_policy`, `update_fgac_policy`, `delete_fgac_policy` + ### Model Serving | Tool | Description | @@ -190,6 +198,7 @@ Claude now has both: │ tools/pipelines.py ────────┤ │ │ tools/agent_bricks.py ─────┤ │ │ tools/aibi_dashboards.py ──┤ │ +│ 
tools/fgac_policies.py ────┤ │ │ tools/serving.py ──────────┘ │ └──────────────────────────────┬──────────────────────────────┘ │ Python imports diff --git a/databricks-mcp-server/databricks_mcp_server/server.py b/databricks-mcp-server/databricks_mcp_server/server.py index 1fa7ea16..b8311427 100644 --- a/databricks-mcp-server/databricks_mcp_server/server.py +++ b/databricks-mcp-server/databricks_mcp_server/server.py @@ -153,4 +153,5 @@ async def _noop_lifespan(*args, **kwargs): lakebase, user, apps, + fgac_policies, ) diff --git a/databricks-mcp-server/databricks_mcp_server/tools/fgac_policies.py b/databricks-mcp-server/databricks_mcp_server/tools/fgac_policies.py new file mode 100644 index 00000000..6ef0e24a --- /dev/null +++ b/databricks-mcp-server/databricks_mcp_server/tools/fgac_policies.py @@ -0,0 +1,214 @@ +""" +Unity Catalog FGAC Policy MCP Tool + +Consolidated MCP tool for managing Fine-Grained Access Control (FGAC) policies. +Dispatches to core functions in databricks-tools-core based on the action parameter. 
+""" + +from typing import Any, Dict, List, Optional + +from databricks_tools_core.unity_catalog import ( + list_fgac_policies as _list_fgac_policies, + get_fgac_policy as _get_fgac_policy, + get_table_policies as _get_table_policies, + get_masking_functions as _get_masking_functions, + get_column_tags_api as _get_column_tags_api, + get_schema_info as _get_schema_info, + get_catalog_info as _get_catalog_info, + list_table_policies_in_schema as _list_table_policies_in_schema, + analyze_fgac_coverage as _analyze_fgac_coverage, + check_policy_quota as _check_policy_quota, + preview_policy_changes as _preview_policy_changes, + create_fgac_policy as _create_fgac_policy, + update_fgac_policy as _update_fgac_policy, + delete_fgac_policy as _delete_fgac_policy, +) + +from ..server import mcp + + +@mcp.tool +def manage_uc_fgac_policies( + action: str, + securable_type: Optional[str] = None, + securable_fullname: Optional[str] = None, + policy_name: Optional[str] = None, + policy_type: Optional[str] = None, + to_principals: Optional[List[str]] = None, + except_principals: Optional[List[str]] = None, + function_name: Optional[str] = None, + tag_name: Optional[str] = None, + tag_value: Optional[str] = None, + comment: Optional[str] = None, + include_inherited: bool = True, + catalog: Optional[str] = None, + schema: Optional[str] = None, + table: Optional[str] = None, + udf_catalog: Optional[str] = None, + udf_schema: Optional[str] = None, + preview_action: Optional[str] = None, + approval_token: Optional[str] = None, +) -> Dict[str, Any]: + """ + Manage FGAC (Fine-Grained Access Control) policies on Unity Catalog securables. + + FGAC policies bind governed tags to masking UDFs or row filters, scoped to + catalogs, schemas, or tables, and targeted at specific principals. + + Actions: + - list: List policies on a securable. Params: securable_type, securable_fullname, include_inherited, policy_type + - get: Get a specific policy. 
Params: policy_name, securable_type, securable_fullname + - get_table_policies: Get column masks and row filters on a table. Params: catalog, schema, table + - get_masking_functions: List masking UDFs in a schema. Params: catalog, schema + (or udf_catalog, udf_schema for UDFs in a different catalog/schema) + - get_column_tags: Get column-level tags for a table. Params: catalog, schema, table + - get_schema_info: Get schema metadata. Params: catalog, schema + - get_catalog_info: Get catalog metadata. Params: catalog + - list_table_policies_in_schema: List all tables in a schema with their policies. Params: catalog, schema + - analyze_coverage: Analyze FGAC policy coverage gaps. Params: catalog, schema (optional) + - check_quota: Check policy quota on a securable. Params: securable_type, securable_fullname + - preview: Preview policy changes without executing. Params: preview_action + ("CREATE"/"UPDATE"/"DELETE"), policy_name, securable_type, securable_fullname, + plus policy_type/function_name/tag_name/to_principals for CREATE + - create: Create an FGAC policy. Params: policy_name, + policy_type ("COLUMN_MASK"/"ROW_FILTER"), securable_type, securable_fullname, + function_name, to_principals, tag_name, tag_value, except_principals, comment, + approval_token (required, from preview) + - update: Update policy principals or comment. Params: policy_name, securable_type, securable_fullname, + to_principals, except_principals, comment, approval_token (required, from preview) + - delete: Delete an FGAC policy. 
Params: policy_name, securable_type, securable_fullname, + approval_token (required, from preview) + + Args: + action: Operation to perform (see actions above) + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name (e.g., "my_catalog.my_schema") + policy_name: Policy name + policy_type: "COLUMN_MASK" or "ROW_FILTER" (for create/list/preview) + to_principals: Users/groups the policy applies to + except_principals: Excluded principals + function_name: Fully qualified UDF name (e.g., "catalog.schema.mask_ssn") + tag_name: Tag key to match columns on + tag_value: Tag value to match (optional; omit for hasTag vs hasTagValue) + comment: Policy description + include_inherited: Include inherited policies in list (default: True) + catalog: Catalog name (for get_table_policies, get_masking_functions) + schema: Schema name (for get_table_policies, get_masking_functions) + table: Table name (for get_table_policies) + udf_catalog: Catalog where masking UDFs reside (for get_masking_functions; defaults to catalog) + udf_schema: Schema where masking UDFs reside (for get_masking_functions; defaults to schema) + preview_action: Sub-action for preview: "CREATE", "UPDATE", or "DELETE" + approval_token: Approval token from preview action (required for create/update/delete) + + Returns: + Dict with operation result + """ + act = action.lower() + + if act == "list": + return _list_fgac_policies( + securable_type=securable_type, + securable_fullname=securable_fullname, + include_inherited=include_inherited, + policy_type=policy_type, + ) + elif act == "get": + return _get_fgac_policy( + policy_name=policy_name, + securable_type=securable_type, + securable_fullname=securable_fullname, + ) + elif act == "get_table_policies": + return _get_table_policies( + catalog=catalog, + schema=schema, + table=table, + ) + elif act == "get_masking_functions": + return _get_masking_functions( + catalog=udf_catalog or catalog, + schema=udf_schema or schema, 
+ ) + elif act == "get_column_tags": + return _get_column_tags_api( + catalog=catalog, + schema=schema, + table=table, + ) + elif act == "get_schema_info": + return _get_schema_info( + catalog=catalog, + schema=schema, + ) + elif act == "get_catalog_info": + return _get_catalog_info( + catalog=catalog, + ) + elif act == "list_table_policies_in_schema": + return _list_table_policies_in_schema( + catalog=catalog, + schema=schema, + ) + elif act == "analyze_coverage": + return _analyze_fgac_coverage( + catalog=catalog, + schema=schema, + ) + elif act == "check_quota": + return _check_policy_quota( + securable_type=securable_type, + securable_fullname=securable_fullname, + ) + elif act == "preview": + if not preview_action: + raise ValueError("preview_action is required for preview action. Must be 'CREATE', 'UPDATE', or 'DELETE'.") + return _preview_policy_changes( + action=preview_action, + policy_name=policy_name, + securable_type=securable_type, + securable_fullname=securable_fullname, + policy_type=policy_type, + to_principals=to_principals, + except_principals=except_principals, + function_name=function_name, + tag_name=tag_name, + tag_value=tag_value, + comment=comment, + ) + elif act == "create": + return _create_fgac_policy( + policy_name=policy_name, + policy_type=policy_type, + securable_type=securable_type, + securable_fullname=securable_fullname, + function_name=function_name, + to_principals=to_principals, + tag_name=tag_name, + approval_token=approval_token, + tag_value=tag_value, + except_principals=except_principals, + comment=comment or "", + ) + elif act == "update": + return _update_fgac_policy( + policy_name=policy_name, + securable_type=securable_type, + securable_fullname=securable_fullname, + approval_token=approval_token, + to_principals=to_principals, + except_principals=except_principals, + comment=comment, + ) + elif act == "delete": + return _delete_fgac_policy( + policy_name=policy_name, + securable_type=securable_type, + 
securable_fullname=securable_fullname, + approval_token=approval_token, + ) + + raise ValueError( + f"Invalid action: '{action}'. Valid actions: list, get, get_table_policies, " + f"get_masking_functions, get_column_tags, get_schema_info, get_catalog_info, " + f"list_table_policies_in_schema, analyze_coverage, check_quota, preview, create, update, delete" + ) diff --git a/databricks-skills/databricks-unity-catalog/10-uc-acls.md b/databricks-skills/databricks-unity-catalog/10-uc-acls.md new file mode 100644 index 00000000..f6560198 --- /dev/null +++ b/databricks-skills/databricks-unity-catalog/10-uc-acls.md @@ -0,0 +1,220 @@ +# Unity Catalog Access Controls (ACLs) + +Comprehensive reference for Unity Catalog privilege management: GRANT/REVOKE, ownership, and permission patterns across securables. + +## Securable Hierarchy + +``` +METASTORE + └── CATALOG + └── SCHEMA + ├── TABLE / VIEW / MATERIALIZED VIEW + ├── VOLUME + ├── FUNCTION + └── MODEL +``` + +Privileges **inherit** down the hierarchy: a privilege granted on a catalog or schema applies to every object within it. Note that `USE CATALOG` and `USE SCHEMA` only gate access to the container — they do not imply each other, and reading data additionally requires `SELECT` (writing requires `MODIFY`) granted on the object or on an ancestor. 
+ +## Privilege Reference + +### Catalog-Level + +| Privilege | Description | +|-----------|-------------| +| `USE CATALOG` | Required to access any object within the catalog | +| `CREATE SCHEMA` | Create schemas within the catalog | +| `ALL PRIVILEGES` | All catalog-level privileges | + +### Schema-Level + +| Privilege | Description | +|-----------|-------------| +| `USE SCHEMA` | Required to access any object within the schema | +| `CREATE TABLE` | Create tables and views | +| `CREATE VOLUME` | Create volumes | +| `CREATE FUNCTION` | Create functions | +| `CREATE MODEL` | Create registered models | +| `ALL PRIVILEGES` | All schema-level privileges | + +### Table/View-Level + +| Privilege | Description | +|-----------|-------------| +| `SELECT` | Read data from the table or view | +| `MODIFY` | Insert, update, delete data | +| `ALL PRIVILEGES` | All table-level privileges | + +### Volume-Level + +| Privilege | Description | +|-----------|-------------| +| `READ VOLUME` | Read files from the volume | +| `WRITE VOLUME` | Write files to the volume | +| `ALL PRIVILEGES` | All volume-level privileges | + +### Function-Level + +| Privilege | Description | +|-----------|-------------| +| `EXECUTE` | Execute the function | +| `ALL PRIVILEGES` | All function-level privileges | + +## SQL Syntax + +### GRANT + +```sql +-- Catalog access +GRANT USE CATALOG ON CATALOG my_catalog TO `group_name`; +GRANT CREATE SCHEMA ON CATALOG my_catalog TO `group_name`; + +-- Schema access +GRANT USE SCHEMA ON SCHEMA my_catalog.my_schema TO `group_name`; +GRANT CREATE TABLE ON SCHEMA my_catalog.my_schema TO `group_name`; +GRANT CREATE VOLUME ON SCHEMA my_catalog.my_schema TO `group_name`; +GRANT CREATE FUNCTION ON SCHEMA my_catalog.my_schema TO `group_name`; + +-- Table/View access +GRANT SELECT ON TABLE my_catalog.my_schema.my_table TO `group_name`; +GRANT MODIFY ON TABLE my_catalog.my_schema.my_table TO `group_name`; + +-- Volume access +GRANT READ VOLUME ON VOLUME 
my_catalog.my_schema.my_volume TO `group_name`; +GRANT WRITE VOLUME ON VOLUME my_catalog.my_schema.my_volume TO `group_name`; + +-- Function access +GRANT EXECUTE ON FUNCTION my_catalog.my_schema.my_function TO `group_name`; + +-- All privileges shorthand +GRANT ALL PRIVILEGES ON SCHEMA my_catalog.my_schema TO `admin_group`; +``` + +### REVOKE + +```sql +REVOKE SELECT ON TABLE my_catalog.my_schema.my_table FROM `group_name`; +REVOKE MODIFY ON TABLE my_catalog.my_schema.my_table FROM `group_name`; +REVOKE ALL PRIVILEGES ON SCHEMA my_catalog.my_schema FROM `group_name`; +``` + +### Show Grants + +```sql +-- Show all grants on a securable +SHOW GRANTS ON CATALOG my_catalog; +SHOW GRANTS ON SCHEMA my_catalog.my_schema; +SHOW GRANTS ON TABLE my_catalog.my_schema.my_table; +SHOW GRANTS ON VOLUME my_catalog.my_schema.my_volume; + +-- Show grants for a specific principal +SHOW GRANTS `group_name` ON CATALOG my_catalog; +SHOW GRANTS `user@example.com` ON SCHEMA my_catalog.my_schema; +``` + +## Ownership + +Every securable has exactly one **owner**. The owner has all privileges on the object and can grant/revoke privileges to others. 
+ +```sql +-- Transfer ownership +ALTER CATALOG my_catalog OWNER TO `new_owner`; +ALTER SCHEMA my_catalog.my_schema OWNER TO `new_owner`; +ALTER TABLE my_catalog.my_schema.my_table OWNER TO `new_owner`; +ALTER VOLUME my_catalog.my_schema.my_volume OWNER TO `new_owner`; +``` + +## Python SDK Patterns + +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.catalog import Privilege, PermissionsChange, SecurableType + +w = WorkspaceClient() + +# Grant privileges +w.grants.update( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table", + changes=[PermissionsChange( + principal="data_readers", + add=[Privilege.SELECT], + )] +) + +# Revoke privileges +w.grants.update( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table", + changes=[PermissionsChange( + principal="data_readers", + remove=[Privilege.SELECT], + )] +) + +# Get current grants +grants = w.grants.get( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table" +) +for assignment in grants.privilege_assignments: + print(f"{assignment.principal}: {assignment.privileges}") + +# Get effective grants (includes inherited) +effective = w.grants.get_effective( + securable_type=SecurableType.TABLE, + full_name="my_catalog.my_schema.my_table", + principal="data_readers" +) +``` + +## Common Patterns + +### Read-Only Data Consumer + +```sql +-- Minimal access for data readers +GRANT USE CATALOG ON CATALOG analytics TO `data_readers`; +GRANT USE SCHEMA ON SCHEMA analytics.gold TO `data_readers`; +GRANT SELECT ON SCHEMA analytics.gold TO `data_readers`; +``` + +### Data Engineer (Read + Write) + +```sql +GRANT USE CATALOG ON CATALOG analytics TO `data_engineers`; +GRANT USE SCHEMA ON SCHEMA analytics.silver TO `data_engineers`; +GRANT SELECT ON SCHEMA analytics.silver TO `data_engineers`; +GRANT MODIFY ON SCHEMA analytics.silver TO `data_engineers`; +GRANT CREATE TABLE ON SCHEMA analytics.silver TO `data_engineers`; 
+``` + +### Schema Admin + +```sql +GRANT USE CATALOG ON CATALOG analytics TO `schema_admins`; +GRANT ALL PRIVILEGES ON SCHEMA analytics.gold TO `schema_admins`; +``` + +### ML Engineer (Models + Functions) + +```sql +GRANT USE CATALOG ON CATALOG ml TO `ml_engineers`; +GRANT USE SCHEMA ON SCHEMA ml.models TO `ml_engineers`; +GRANT CREATE MODEL ON SCHEMA ml.models TO `ml_engineers`; +GRANT CREATE FUNCTION ON SCHEMA ml.models TO `ml_engineers`; +GRANT SELECT ON SCHEMA ml.features TO `ml_engineers`; +``` + +## MCP Tool + +Use `mcp__databricks__manage_uc_grants` for grant operations, or `mcp__databricks__execute_sql` for SQL-based grant management. + +## Best Practices + +1. **Grant to groups, not users** — Easier to manage and audit +2. **Use least privilege** — Grant only the minimum permissions needed +3. **Leverage inheritance** — Grant at schema level when all tables need the same access +4. **Audit regularly** — Query `system.access.audit` for grant/revoke events +5. **Prefer `USE CATALOG` + `USE SCHEMA` + `SELECT`** over `ALL PRIVILEGES` +6. **Document ownership** — Keep track of who owns each catalog/schema diff --git a/databricks-skills/databricks-unity-catalog/7-fgac-overview.md b/databricks-skills/databricks-unity-catalog/7-fgac-overview.md new file mode 100644 index 00000000..1b07ddbf --- /dev/null +++ b/databricks-skills/databricks-unity-catalog/7-fgac-overview.md @@ -0,0 +1,342 @@ +# FGAC Policy Governance Overview + +Guidance for Fine-Grained Access Control (FGAC) policies in Databricks Unity Catalog. Covers governed tags, tag assignments, masking UDFs, CREATE/DROP POLICY syntax, and the human-in-the-loop governance workflow. 
+ +**Databricks Docs:** +- FGAC overview: https://docs.databricks.com/data-governance/unity-catalog/abac/ +- FGAC policies: https://docs.databricks.com/data-governance/unity-catalog/abac/policies +- FGAC tutorial: https://docs.databricks.com/data-governance/unity-catalog/abac/tutorial + +## When to Use This Skill + +Use this skill when: +- Creating or managing **FGAC policies** (column masks, row filters) +- Working with **governed tags** (creating via UI, applying via SQL) +- Building **masking UDFs** for PII protection (SSN, email, credit card, etc.) +- Implementing **human-in-the-loop governance** workflows +- Querying tag assignments via `information_schema` +- Managing policy lifecycle (create, update, delete, preview) + +## Reference Files + +| Topic | File | Description | +|-------|------|-------------| +| SQL Generation | [8-fgac-sql-generation.md](8-fgac-sql-generation.md) | SET/UNSET TAG, CREATE FUNCTION, CREATE/DROP POLICY, discovery queries | +| SDK & MCP Tools | [9-fgac-sdk-and-tools.md](9-fgac-sdk-and-tools.md) | Python SDK patterns and the unified `manage_uc_fgac_policies` MCP tool (14 actions) for policy management | + +--- + +## FGAC Workflow Overview + +FGAC policies in Databricks follow a 4-step setup: + +1. **Governed Tags** - Define classification taxonomy (UI only) +2. **Tag Assignments** - Apply tags to columns/tables via SQL +3. **Masking UDFs** - Create deterministic functions for data masking +4. **FGAC Policies** - Bind tags to UDFs with principal scoping + +``` +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Governed Tags│───>│ Tag │───>│ Masking │───>│ FGAC │ +│ (UI only) │ │ Assignments │ │ UDFs │ │ Policies │ +└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ +``` + +--- + +## IMPORTANT: SQL That Does NOT Exist + +These SQL commands do **not** exist in Databricks. Do not generate them. 
+ +| Invalid SQL | What to use instead | +|---|---| +| `SHOW POLICIES` | REST API: `w.policies.list_policies()` | +| `DESCRIBE POLICY` | REST API: `w.policies.get_policy()` | +| `ALTER POLICY` | Drop and recreate the policy | +| `ALTER USER SET ATTRIBUTES` | SCIM API for user attributes | +| `SHOW USER ATTRIBUTES` | SCIM API for user attributes | + +--- + +## Step 1: Governed Tags + +Governed tags **cannot** be created via SQL. They must be created in the Databricks UI. + +### Creating a Governed Tag (UI Steps) + +1. Navigate to **Catalog** in the workspace +2. Select **Governed Tags** from the left panel +3. Click **Create governed tag** +4. Configure: + - **Tag Key**: e.g., `pii_type` + - **Allowed Values**: e.g., `ssn`, `email`, `phone`, `credit_card`, `address` + - **Description**: e.g., "PII classification for FGAC policies" + +> **Note:** Tag data is stored as plain text and may be replicated globally. Avoid sensitive information in tag names or values. + +**Docs:** https://docs.databricks.com/admin/governed-tags/ + +--- + +## Step 2: Applying Tags to Columns + +### Legacy Syntax (all versions) + +```sql +-- Set tag on column +ALTER TABLE catalog.schema.table +ALTER COLUMN column_name SET TAGS ('pii_type' = 'ssn'); + +-- Set tag on table +ALTER TABLE catalog.schema.table +SET TAGS ('data_classification' = 'confidential'); + +-- Remove tag +ALTER TABLE catalog.schema.table +ALTER COLUMN column_name UNSET TAGS ('pii_type'); +``` + +### Modern Syntax (DBR 16.1+) + +```sql +SET TAG ON COLUMN catalog.schema.table.column_name 'pii_type' = 'ssn'; +SET TAG ON TABLE catalog.schema.table 'data_classification' = 'confidential'; +SET TAG ON SCHEMA catalog.schema 'environment' = 'production'; +SET TAG ON CATALOG catalog 'department' = 'finance'; + +UNSET TAG ON COLUMN catalog.schema.table.column_name 'pii_type'; +``` + +### Querying Existing Tags + +```sql +-- Column tags +SELECT tag_name, tag_value, column_name +FROM system.information_schema.column_tags +WHERE 
catalog_name = 'my_catalog' + AND schema_name = 'my_schema' + AND table_name = 'my_table'; + +-- Table tags +SELECT tag_name, tag_value +FROM system.information_schema.table_tags +WHERE catalog_name = 'my_catalog' + AND schema_name = 'my_schema' + AND table_name = 'my_table'; +``` + +--- + +## Step 3: Masking UDFs + +Masking UDFs must be `DETERMINISTIC` and use simple `CASE` statements. No external calls or nested UDFs. + +```sql +-- Full mask: replaces all characters with * +CREATE OR REPLACE FUNCTION catalog.schema.mask_full(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Full masking - replaces all characters with *' +RETURN CASE + WHEN value IS NULL THEN NULL + ELSE REPEAT('*', LENGTH(value)) +END; + +-- Partial mask: show last 4 characters +CREATE OR REPLACE FUNCTION catalog.schema.mask_partial(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Partial masking - shows last 4 characters' +RETURN CASE + WHEN value IS NULL THEN NULL + WHEN LENGTH(value) <= 4 THEN REPEAT('*', LENGTH(value)) + ELSE CONCAT(REPEAT('*', LENGTH(value) - 4), RIGHT(value, 4)) +END; + +-- SSN mask: ***-**-XXXX format +CREATE OR REPLACE FUNCTION catalog.schema.mask_ssn(ssn STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Masks SSN showing only last 4 digits' +RETURN CASE + WHEN ssn IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) >= 4 + THEN CONCAT('***-**-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE '***-**-****' +END; + +-- Email mask: j***@example.com +CREATE OR REPLACE FUNCTION catalog.schema.mask_email(email STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Masks email showing first char and domain' +RETURN CASE + WHEN email IS NULL THEN NULL + WHEN INSTR(email, '@') > 1 + THEN CONCAT(LEFT(email, 1), '***@', SUBSTRING(email, INSTR(email, '@') + 1)) + ELSE '***@***.***' +END; +``` + +**Docs:** https://docs.databricks.com/data-governance/unity-catalog/abac/udf-best-practices + +> **Cross-catalog UDFs:** Masking UDFs do not need to be in the same 
catalog/schema as the policy scope. A common pattern is a shared governance schema (e.g., `governance.masking_udfs`) containing all masking functions, referenced by policies across multiple catalogs. The UDF name in a policy is always fully qualified (e.g., `governance.masking_udfs.mask_ssn`). + +--- + +## Step 4: FGAC Policies + +Policies are scoped to a **catalog**, **schema**, or **table**. `FOR TABLES` is always present. + +### Column Mask Policy + +```sql +-- Catalog level — masks matching columns in ALL tables in the catalog +CREATE OR REPLACE POLICY mask_pii_catalog +ON CATALOG my_catalog +COMMENT 'Mask PII columns catalog-wide' +COLUMN MASK my_catalog.my_schema.mask_partial +TO `analysts`, `data_scientists` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; + +-- Schema level — masks matching columns in all tables in the schema +CREATE OR REPLACE POLICY mask_pii_schema +ON SCHEMA my_catalog.my_schema +COMMENT 'Mask PII columns in schema' +COLUMN MASK my_catalog.my_schema.mask_partial +TO `analysts`, `data_scientists` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; + +-- Table level — masks matching columns on a single table +CREATE OR REPLACE POLICY mask_pii_table +ON TABLE my_catalog.my_schema.my_table +COMMENT 'Mask PII columns on specific table' +COLUMN MASK my_catalog.my_schema.mask_partial +TO `analysts`, `data_scientists` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; +``` + +### Row Filter Policy + +```sql +-- Catalog level — filters rows in ALL tables in the catalog +CREATE OR REPLACE POLICY filter_eu_catalog +ON CATALOG my_catalog +COMMENT 'Filter EU rows catalog-wide' +ROW FILTER my_catalog.my_schema.is_not_eu_region +TO `us_team` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col +USING COLUMNS (filter_col); + +-- 
Schema level — filters rows in all tables in the schema +CREATE OR REPLACE POLICY filter_eu_schema +ON SCHEMA my_catalog.my_schema +COMMENT 'Filter EU rows in schema' +ROW FILTER my_catalog.my_schema.is_not_eu_region +TO `us_team` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col +USING COLUMNS (filter_col); + +-- Table level — filters rows on a single table +CREATE OR REPLACE POLICY filter_eu_table +ON TABLE my_catalog.my_schema.my_table +COMMENT 'Filter EU rows on specific table' +ROW FILTER my_catalog.my_schema.is_not_eu_region +TO `us_team` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col +USING COLUMNS (filter_col); +``` + +### Drop Policy + +```sql +-- Drop at each scope level +DROP POLICY mask_pii_catalog ON CATALOG my_catalog; +DROP POLICY mask_pii_schema ON SCHEMA my_catalog.my_schema; +DROP POLICY mask_pii_table ON TABLE my_catalog.my_schema.my_table; +``` + +### CRITICAL: Always Exclude `gov_admin` + +Every FGAC policy **MUST** include `EXCEPT \`gov_admin\`` to protect administrator access. Without this, admins could be locked out of data. + +### Policy Quotas + +| Scope | Max Policies | +|-------|-------------| +| Per Catalog | 10 | +| Per Schema | 10 | +| Per Table | 5 | + +https://docs.databricks.com/aws/en/data-governance/unity-catalog/abac/policies#policy-quotas +--- + +## Human-in-the-Loop Governance Workflow + +FGAC policy changes should follow a governed workflow: + +``` +ANALYZE → RECOMMEND → PREVIEW → APPROVE → EXECUTE → VERIFY + │ │ │ │ │ │ + ▼ ▼ ▼ ▼ ▼ ▼ + Discover Generate Show SQL Human Run SQL Confirm + current policy & impact confirms or SDK changes + state proposals preview changes call applied +``` + +1. **ANALYZE**: Discover current tags, policies, and UDFs +2. **RECOMMEND**: Generate policy proposals based on requirements +3. **PREVIEW**: Use `preview_policy_changes` to show exact SQL and impact +4. **APPROVE**: Human reviews and explicitly approves +5. 
**EXECUTE**: Create/update/delete policies via SDK or SQL +6. **VERIFY**: Confirm policies are applied correctly + +**Never auto-execute policy changes.** Always preview and wait for human approval. + +--- + +## Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| `POLICY_QUOTA_EXCEEDED` | Too many policies on scope | Consolidate policies or use broader scope | +| `INVALID_TAG_VALUE` | Tag value not in governed tag's allowed values | Check governed tag configuration in UI | +| `UDF_NOT_FOUND` | Masking UDF doesn't exist | Create UDF first, use fully qualified name | +| `POLICY_ALREADY_EXISTS` | Policy name conflict | Use `CREATE OR REPLACE POLICY` | +| `INSUFFICIENT_PERMISSIONS` | Missing `MANAGE` on securable | Grant `MANAGE` permission to policy creator | +| `SHOW POLICIES is not supported` | Used invalid SQL | Use REST API `w.policies.list_policies()` instead | + +## Best Practices + +1. **Use governed tags** (not ad-hoc tags) for FGAC policy matching +2. **Always include `EXCEPT \`gov_admin\``** in every policy +3. **Use deterministic UDFs** with simple CASE statements +4. **Preview before executing** any policy change +5. **Start at schema scope** and narrow to table only when needed +6. **Name policies descriptively**: `mask_{what}_{scope}` or `filter_{what}_{scope}` +7. **Test UDFs independently** before binding to policies +8. 
**Monitor policy quotas** — consolidate when approaching limits + +## Resources + +- [FGAC Overview](https://docs.databricks.com/data-governance/unity-catalog/abac/) +- [FGAC Policies](https://docs.databricks.com/data-governance/unity-catalog/abac/policies) +- [FGAC Tutorial](https://docs.databricks.com/data-governance/unity-catalog/abac/tutorial) +- [UDF Best Practices](https://docs.databricks.com/data-governance/unity-catalog/abac/udf-best-practices) +- [Governed Tags](https://docs.databricks.com/admin/governed-tags/) +- [Column Masks & Row Filters](https://docs.databricks.com/data-governance/unity-catalog/filters-and-masks/) diff --git a/databricks-skills/databricks-unity-catalog/8-fgac-sql-generation.md b/databricks-skills/databricks-unity-catalog/8-fgac-sql-generation.md new file mode 100644 index 00000000..b1cf729e --- /dev/null +++ b/databricks-skills/databricks-unity-catalog/8-fgac-sql-generation.md @@ -0,0 +1,420 @@ +# SQL Generation Reference + +Pure SQL patterns for Unity Catalog FGAC governance operations. All SQL follows Databricks syntax. 
+ +--- + +## Tag Operations + +### SET TAG on Column + +```sql +-- Legacy syntax (all versions) +ALTER TABLE catalog.schema.table +ALTER COLUMN column_name SET TAGS ('pii_type' = 'ssn'); + +-- Modern syntax (DBR 16.1+) +SET TAG ON COLUMN catalog.schema.table.column_name 'pii_type' = 'ssn'; +``` + +### SET TAG on Table + +```sql +-- Legacy syntax +ALTER TABLE catalog.schema.table +SET TAGS ('data_classification' = 'confidential'); + +-- Modern syntax +SET TAG ON TABLE catalog.schema.table 'data_classification' = 'confidential'; +``` + +### SET TAG on Schema / Catalog + +```sql +SET TAG ON SCHEMA catalog.schema 'environment' = 'production'; +SET TAG ON CATALOG my_catalog 'department' = 'finance'; +``` + +### UNSET TAG + +```sql +-- Column (legacy) +ALTER TABLE catalog.schema.table +ALTER COLUMN column_name UNSET TAGS ('pii_type'); + +-- Column (modern) +UNSET TAG ON COLUMN catalog.schema.table.column_name 'pii_type'; + +-- Table (legacy) +ALTER TABLE catalog.schema.table +UNSET TAGS ('data_classification'); + +-- Table (modern) +UNSET TAG ON TABLE catalog.schema.table 'data_classification'; +``` + +**Docs:** +- SET TAG: https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-set-tag.html +- UNSET TAG: https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-unset-tag.html + +--- + +## Tag Discovery Queries + +### Query Column Tags + +```sql +SELECT tag_name, tag_value, column_name +FROM system.information_schema.column_tags +WHERE catalog_name = 'my_catalog' + AND schema_name = 'my_schema' + AND table_name = 'my_table'; +``` + +### Query Table Tags + +```sql +SELECT tag_name, tag_value +FROM system.information_schema.table_tags +WHERE catalog_name = 'my_catalog' + AND schema_name = 'my_schema' + AND table_name = 'my_table'; +``` + +### All Tag Assignments in a Catalog + +```sql +-- Table-level tags +SELECT 'TABLE' as securable_type, + CONCAT(catalog_name, '.', schema_name, '.', table_name) as securable_name, + tag_name as tag_key, + tag_value 
+FROM system.information_schema.table_tags +WHERE catalog_name = 'my_catalog'; + +-- Column-level tags +SELECT 'COLUMN' as securable_type, + CONCAT(catalog_name, '.', schema_name, '.', table_name, '.', column_name) as securable_name, + tag_name as tag_key, + tag_value +FROM system.information_schema.column_tags +WHERE catalog_name = 'my_catalog'; +``` + +**Docs:** +- information_schema.column_tags: https://docs.databricks.com/sql/language-manual/information-schema/column_tags.html +- information_schema.table_tags: https://docs.databricks.com/sql/language-manual/information-schema/table_tags.html + +--- + +## Masking UDF Creation + +All masking UDFs must be `DETERMINISTIC` with simple `CASE` statements. No external calls or nested UDFs. + +### Generic Masking Strategies + +```sql +-- Full mask: replaces all characters with * +CREATE OR REPLACE FUNCTION catalog.schema.mask_full(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Full masking - replaces all characters with *' +RETURN CASE + WHEN value IS NULL THEN NULL + ELSE REPEAT('*', LENGTH(value)) +END; + +-- Partial mask: show last 4 characters +CREATE OR REPLACE FUNCTION catalog.schema.mask_partial(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Partial masking - shows last 4 characters' +RETURN CASE + WHEN value IS NULL THEN NULL + WHEN LENGTH(value) <= 4 THEN REPEAT('*', LENGTH(value)) + ELSE CONCAT(REPEAT('*', LENGTH(value) - 4), RIGHT(value, 4)) +END; + +-- Hash: SHA256 with version prefix +CREATE OR REPLACE FUNCTION catalog.schema.mask_hash(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Hash masking - SHA256 with version prefix' +RETURN CASE + WHEN value IS NULL THEN NULL + ELSE CONCAT('HASH_v1_', SUBSTRING(SHA2(CONCAT(value, ':v1'), 256), 1, 16)) +END; + +-- Redact: replace with [REDACTED] +CREATE OR REPLACE FUNCTION catalog.schema.mask_redact(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Redaction - replaces value with [REDACTED]' +RETURN CASE + WHEN value IS NULL THEN NULL + 
ELSE '[REDACTED]' +END; + +-- Nullify: always returns NULL +CREATE OR REPLACE FUNCTION catalog.schema.mask_nullify(value STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Nullify - always returns NULL' +RETURN NULL; +``` + +### Specialized Masking UDFs + +```sql +-- SSN: ***-**-XXXX +CREATE OR REPLACE FUNCTION catalog.schema.mask_ssn(ssn STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Masks SSN showing only last 4 digits in XXX-XX-XXXX format' +RETURN CASE + WHEN ssn IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) >= 4 + THEN CONCAT('***-**-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE '***-**-****' +END; + +-- Email: j***@example.com +CREATE OR REPLACE FUNCTION catalog.schema.mask_email(email STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Masks email showing first char and domain' +RETURN CASE + WHEN email IS NULL THEN NULL + WHEN INSTR(email, '@') > 1 + THEN CONCAT(LEFT(email, 1), '***@', SUBSTRING(email, INSTR(email, '@') + 1)) + ELSE '***@***.***' +END; + +-- Credit card: ****-****-****-1234 +CREATE OR REPLACE FUNCTION catalog.schema.mask_credit_card(card_number STRING) +RETURNS STRING +DETERMINISTIC +COMMENT 'Masks credit card showing only last 4 digits' +RETURN CASE + WHEN card_number IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(card_number, '[^0-9]', '')) >= 4 + THEN CONCAT('****-****-****-', RIGHT(REGEXP_REPLACE(card_number, '[^0-9]', ''), 4)) + ELSE '****-****-****-****' +END; +``` + +### Row Filter UDFs + +Row filter UDFs return `BOOLEAN`: `TRUE` to include, `FALSE` to exclude. 
+
+```sql
+-- Region-based filter: hide EU rows
+CREATE OR REPLACE FUNCTION catalog.schema.is_not_eu_region(region_value STRING)
+RETURNS BOOLEAN
+DETERMINISTIC
+COMMENT 'Row filter - returns FALSE for EU regions'
+RETURN CASE
+  WHEN region_value IS NULL THEN TRUE
+  WHEN LOWER(region_value) LIKE '%eu%' THEN FALSE
+  WHEN LOWER(region_value) LIKE '%europe%' THEN FALSE
+  ELSE TRUE
+END;
+
+-- Array membership filter (element type is required: bare ARRAY is invalid in UDF signatures)
+CREATE OR REPLACE FUNCTION catalog.schema.is_in_allowed_values(
+  row_value STRING,
+  allowed_values ARRAY<STRING>
+)
+RETURNS BOOLEAN
+DETERMINISTIC
+COMMENT 'Row filter based on array membership'
+RETURN CASE
+  WHEN allowed_values IS NULL THEN FALSE
+  WHEN ARRAY_CONTAINS(TRANSFORM(allowed_values, x -> LOWER(x)), LOWER(row_value)) THEN TRUE
+  ELSE FALSE
+END;
+```
+
+**Docs:** https://docs.databricks.com/data-governance/unity-catalog/abac/udf-best-practices
+
+---
+
+## Policy Creation
+
+Policies are scoped to a **catalog**, **schema**, or **table**. `FOR TABLES` is always present.
+
+> **Cross-catalog UDFs:** The UDF referenced in a policy is always fully qualified (`catalog.schema.function`) and can reside in any catalog/schema — it does not need to be in the same catalog or schema as the policy scope. A common pattern is a shared governance schema (e.g., `governance.masking_udfs`) containing all masking functions.
+ +### Column Mask Policy + +```sql +-- Catalog level — masks matching columns in ALL tables in the catalog +CREATE OR REPLACE POLICY mask_pii_ssn_catalog +ON CATALOG my_catalog +COMMENT 'Mask SSN columns catalog-wide' +COLUMN MASK my_catalog.my_schema.mask_ssn +TO `analysts`, `data_scientists` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; + +-- Schema level — masks matching columns in all tables in the schema +CREATE OR REPLACE POLICY mask_pii_ssn_schema +ON SCHEMA my_catalog.my_schema +COMMENT 'Mask SSN columns in schema' +COLUMN MASK my_catalog.my_schema.mask_ssn +TO `analysts`, `data_scientists` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; + +-- Table level — masks matching columns on a single table +CREATE OR REPLACE POLICY mask_pii_ssn_table +ON TABLE my_catalog.my_schema.my_table +COMMENT 'Mask SSN columns on specific table' +COLUMN MASK my_catalog.my_schema.mask_ssn +TO `analysts`, `data_scientists` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; + +-- Cross-catalog UDF — UDF in governance catalog, policy on prod +CREATE OR REPLACE POLICY mask_ssn_finance +ON SCHEMA prod.finance +COMMENT 'Mask SSN using shared governance UDF' +COLUMN MASK governance.masking_udfs.mask_ssn +TO `analysts` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; +``` + +### Row Filter Policy + +```sql +-- Catalog level — filters rows in ALL tables in the catalog +CREATE OR REPLACE POLICY filter_eu_data_catalog +ON CATALOG my_catalog +COMMENT 'Filter EU rows catalog-wide' +ROW FILTER my_catalog.my_schema.is_not_eu_region +TO `us_team` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col +USING COLUMNS (filter_col); + +-- Schema level — filters rows in all tables in the schema +CREATE 
OR REPLACE POLICY filter_eu_data_schema +ON SCHEMA my_catalog.my_schema +COMMENT 'Filter EU rows in schema' +ROW FILTER my_catalog.my_schema.is_not_eu_region +TO `us_team` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col +USING COLUMNS (filter_col); + +-- Table level — filters rows on a single table +CREATE OR REPLACE POLICY filter_eu_data_table +ON TABLE my_catalog.my_schema.my_table +COMMENT 'Filter EU rows on specific table' +ROW FILTER my_catalog.my_schema.is_not_eu_region +TO `us_team` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('region', 'eu') AS filter_col +USING COLUMNS (filter_col); +``` + +### Policy with Tag Key Only (any value) + +```sql +-- Match any column with tag 'pii_type' regardless of value +-- Works at any scope: ON CATALOG, ON SCHEMA, or ON TABLE +CREATE OR REPLACE POLICY mask_all_pii +ON SCHEMA my_catalog.my_schema +COLUMN MASK my_catalog.my_schema.mask_full +TO `external_users` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTag('pii_type') AS masked_col +ON COLUMN masked_col; +``` + +### Drop Policy + +```sql +-- Drop at each scope level +DROP POLICY mask_pii_ssn_catalog ON CATALOG my_catalog; +DROP POLICY mask_pii_ssn_schema ON SCHEMA my_catalog.my_schema; +DROP POLICY mask_pii_ssn_table ON TABLE my_catalog.my_schema.my_table; +``` + +> **Note:** There is no `ALTER POLICY`. To modify a policy, drop and recreate it. 
+ +--- + +## Discovery Queries + +```sql +-- List catalogs +SHOW CATALOGS; + +-- List schemas in a catalog +SHOW SCHEMAS IN my_catalog; + +-- List tables in a schema +SHOW TABLES IN my_catalog.my_schema; + +-- Describe table with extended metadata +DESCRIBE TABLE EXTENDED my_catalog.my_schema.my_table; + +-- List UDFs in a schema +SHOW USER FUNCTIONS IN my_catalog.my_schema; + +-- Describe a UDF +DESCRIBE FUNCTION EXTENDED my_catalog.my_schema.mask_ssn; + +-- Sample column values +SELECT DISTINCT column_name +FROM my_catalog.my_schema.my_table +LIMIT 20; +``` + +--- + +## Enums Reference + +### PII Types (governed tag values) + +`ssn`, `email`, `phone`, `credit_card`, `date_of_birth`, `address`, `name`, `ip_address`, `national_id`, `medical_record`, `generic` + +### Masking Strategies + +| Strategy | Description | +|----------|-------------| +| `full_mask` | Replace all characters with `*` | +| `partial_mask` | Show last 4 characters | +| `hash` | SHA256 with version prefix | +| `redact` | Replace with `[REDACTED]` | +| `nullify` | Always return NULL | +| `custom` | User-supplied SQL (requires manual UDF) | + +### Policy Scopes + +| Scope | Description | +|-------|-------------| +| `CATALOG` | Policy applies to all tables in catalog | +| `SCHEMA` | Policy applies to all tables in schema | +| `TABLE` | Policy applies to a single table | + +### Tag Syntax Variants + +| Variant | Availability | Example | +|---------|-------------|---------| +| `LEGACY` | All versions | `ALTER TABLE t ALTER COLUMN c SET TAGS ('k'='v')` | +| `MODERN` | DBR 16.1+ | `SET TAG ON COLUMN t.c 'k' = 'v'` | diff --git a/databricks-skills/databricks-unity-catalog/9-fgac-sdk-and-tools.md b/databricks-skills/databricks-unity-catalog/9-fgac-sdk-and-tools.md new file mode 100644 index 00000000..f82cf774 --- /dev/null +++ b/databricks-skills/databricks-unity-catalog/9-fgac-sdk-and-tools.md @@ -0,0 +1,859 @@ +# FGAC Policy SDK & MCP Tools + +Python SDK patterns and MCP tool reference for managing FGAC 
policies in Unity Catalog. + +**SDK Docs:** https://databricks-sdk-py.readthedocs.io/en/latest/ +**FGAC Docs:** https://docs.databricks.com/data-governance/unity-catalog/abac/policies + +--- + +## Policy Scopes + +`on_securable_type` sets the **scope** of the policy. `for_securable_type` is always `TABLE`. + +| Scope | `on_securable_type` | `on_securable_fullname` | Effect | +|---|---|---|---| +| Catalog | `CATALOG` | `"my_catalog"` | Applies to all tables in the catalog | +| Schema | `SCHEMA` | `"my_catalog.my_schema"` | Applies to all tables in the schema | +| Table | `TABLE` | `"my_catalog.my_schema.my_table"` | Applies to a single table | + +### Important: Always Include `gov_admin` + +Every policy **MUST** include `"gov_admin"` in `except_principals`: + +```python +# CORRECT +except_principals=["gov_admin"] + +# CORRECT - additional admin groups +except_principals=["gov_admin", "platform_admins"] + +# WRONG - missing gov_admin +except_principals=["platform_admins"] # gov_admin must be included! +``` + +--- + +## Guardrails + +FGAC mutating operations (`create`, `update`, `delete`) enforce two programmatic guardrails: + +### Approval Token + +Every mutating call **requires** a valid `approval_token` obtained from `preview_policy_changes()`. The token is an HMAC-SHA256 signature binding the previewed parameters to a timestamp. + +- Token TTL: **10 minutes** (configurable via `_TOKEN_TTL_SECONDS`) +- Parameters must match exactly between preview and mutation +- Action mapping: preview `CREATE` → mutation `create`, `UPDATE` → `update`, `DELETE` → `delete` + +> **Design note:** The approval token ensures mutations match what was previewed and prevents parameter tampering, but it does **not** guarantee a human reviewed the preview. Human-in-the-loop confirmation depends on the MCP client — for example, Claude Code prompts the user to approve each tool call, creating a natural pause between preview and mutation. 
If using a client that auto-approves tool calls, consider adding explicit confirmation logic. + +### Admin Group Check + +The caller must be a member of the configured admin group before any mutating operation (create/update/delete) is allowed. Membership is verified via `w.current_user.me().groups`. + +Set the `FGAC_ADMIN_GROUP` environment variable to your workspace admin group name: + +```bash +# Example: use your workspace's governance admin group +export FGAC_ADMIN_GROUP="governance_admins" + +# Or use the workspace admins group +export FGAC_ADMIN_GROUP="admins" +``` + +If unset, defaults to `"admins"`. This should match an existing group in your Databricks workspace that contains users authorized to manage FGAC policies. + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `FGAC_ADMIN_GROUP` | `admins` | Databricks workspace group whose members can create/update/delete FGAC policies | + +--- + +## MCP Tools + +### Discovery Tools + +#### `list_fgac_policies` + +List FGAC policies on a catalog, schema, or table. + +```python +list_fgac_policies( + securable_type: str, # "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: str, # e.g., "my_catalog.my_schema" + include_inherited: bool = True, + policy_type: str = None, # "COLUMN_MASK" or "ROW_FILTER" (optional filter) +) +``` + +**Returns:** +```json +{ + "success": true, + "securable_type": "SCHEMA", + "securable_fullname": "my_catalog.my_schema", + "policy_count": 3, + "policies": [ + { + "name": "mask_pii_ssn", + "policy_type": "COLUMN_MASK", + "to_principals": ["analysts"], + "except_principals": ["gov_admin"], + "on_securable_fullname": "my_catalog.my_schema", + "column_mask": {"function_name": "my_catalog.my_schema.mask_ssn", "on_column": "masked_col"}, + "match_columns": [{"alias": "masked_col", "condition": "hasTagValue('pii_type', 'ssn')"}] + } + ] +} +``` + +#### `get_fgac_policy` + +Get details for a specific policy by name. 
+ +```python +get_fgac_policy( + policy_name: str, # Policy name + securable_type: str, # "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: str, # Fully qualified securable name +) +``` + +**Returns:** +```json +{ + "success": true, + "policy": { + "name": "mask_pii_ssn", + "policy_type": "COLUMN_MASK", + "comment": "Mask SSN columns for analysts", + "to_principals": ["analysts", "data_scientists"], + "except_principals": ["gov_admin"], + "on_securable_type": "SCHEMA", + "on_securable_fullname": "my_catalog.my_schema", + "for_securable_type": "TABLE", + "column_mask": {"function_name": "my_catalog.my_schema.mask_ssn", "on_column": "masked_col"}, + "match_columns": [{"alias": "masked_col", "condition": "hasTagValue('pii_type', 'ssn')"}] + } +} +``` + +#### `get_table_policies` + +Get column masks and row filters for a specific table via Unity Catalog API. + +```python +get_table_policies( + catalog: str, + schema: str, + table: str, +) +``` + +**Returns:** +```json +{ + "success": true, + "table": "my_catalog.my_schema.my_table", + "column_masks": [ + { + "column_name": "ssn", + "column_type": "STRING", + "mask_functions": ["my_catalog.my_schema.mask_ssn"] + } + ], + "row_filters": [ + { + "function_name": "my_catalog.my_schema.is_not_eu_region", + "input_column_names": ["region"] + } + ] +} +``` + +#### `get_masking_functions` + +List masking UDFs in a schema. + +> **Cross-catalog UDFs:** Masking UDFs can reside in any catalog/schema, not just the policy scope. Use `udf_catalog` and `udf_schema` to discover UDFs stored in a shared governance schema (e.g., `governance.masking_udfs`). These default to `catalog`/`schema` when not specified. 
+ +```python +get_masking_functions( + catalog: str, + schema: str, + # To discover UDFs in a different catalog/schema: + udf_catalog: str = None, # defaults to catalog + udf_schema: str = None, # defaults to schema +) +``` + +**Returns:** +```json +{ + "success": true, + "catalog": "my_catalog", + "schema": "my_schema", + "functions": [ + { + "name": "mask_ssn", + "full_name": "my_catalog.my_schema.mask_ssn", + "return_type": "STRING", + "comment": "Masks SSN showing only last 4 digits", + "is_deterministic": true + } + ] +} +``` + +#### `get_column_tags_api` + +Get column-level tags for a table via the Tags API (queries `system.information_schema.column_tags`). + +```python +get_column_tags_api( + catalog: str, + schema: str, + table: str, +) +``` + +**Returns:** +```json +{ + "success": true, + "table": "my_catalog.my_schema.my_table", + "tags": [ + { + "catalog_name": "my_catalog", + "schema_name": "my_schema", + "table_name": "my_table", + "column_name": "ssn", + "tag_name": "pii_type", + "tag_value": "ssn" + } + ] +} +``` + +#### `get_schema_info` + +Get schema metadata via Unity Catalog API. + +```python +get_schema_info(catalog: str, schema: str) +``` + +**Returns:** +```json +{ + "success": true, + "schema": { + "name": "my_schema", + "full_name": "my_catalog.my_schema", + "catalog_name": "my_catalog", + "owner": "admin_user", + "comment": "Production finance schema", + "created_at": 1700000000000, + "updated_at": 1700100000000 + } +} +``` + +#### `get_catalog_info` + +Get catalog metadata via Unity Catalog API. + +```python +get_catalog_info(catalog: str) +``` + +**Returns:** +```json +{ + "success": true, + "catalog": { + "name": "my_catalog", + "owner": "admin_user", + "comment": "Production catalog", + "created_at": 1700000000000, + "updated_at": 1700100000000 + } +} +``` + +#### `list_table_policies_in_schema` + +List all tables in a schema with their column masks and row filters. 
+ +```python +list_table_policies_in_schema( + catalog: str, + schema: str, +) +``` + +**Returns:** +```json +{ + "success": true, + "catalog": "my_catalog", + "schema": "my_schema", + "table_count": 3, + "tables": [ + { + "table": "customers", + "column_masks": [ + {"column_name": "ssn", "column_type": "STRING", "mask_functions": ["my_catalog.my_schema.mask_ssn"]} + ], + "row_filters": [] + }, + { + "table": "orders", + "column_masks": [], + "row_filters": [] + } + ] +} +``` + +#### `analyze_fgac_coverage` + +Analyze FGAC policy coverage for a catalog or schema. Identifies tagged columns that lack policy coverage and suggests actions. + +```python +analyze_fgac_coverage( + catalog: str, + schema: str = None, # Optional; omit to analyze entire catalog +) +``` + +**Returns:** +```json +{ + "success": true, + "scope": "SCHEMA my_catalog.my_schema", + "summary": { + "tables_scanned": 10, + "tagged_columns": 5, + "existing_policies": 2, + "available_udfs": 3, + "covered_tags": ["pii_type:ssn"], + "uncovered_tags": ["pii_type:email"] + }, + "gaps": [ + { + "tag_name": "pii_type", + "tag_value": "email", + "columns": [{"table": "my_catalog.my_schema.customers", "column": "email"}], + "suggestion": "No policy covers this tag. Consider creating a COLUMN_MASK policy." + } + ], + "existing_policies": [{"name": "mask_pii_ssn", "policy_type": "COLUMN_MASK", "...": "..."}], + "available_udfs": [{"name": "mask_ssn", "full_name": "my_catalog.my_schema.mask_ssn", "...": "..."}] +} +``` + +#### `check_policy_quota` + +Check if the policy quota allows creating a new policy on a securable. + +```python +check_policy_quota( + securable_type: str, # "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: str, # e.g., "my_catalog.my_schema" +) +``` + +**Returns:** +```json +{ + "success": true, + "securable_type": "SCHEMA", + "securable_fullname": "my_catalog.my_schema", + "current": 3, + "max": 10, + "can_create": true +} +``` + +**Quotas:** CATALOG=10, SCHEMA=10, TABLE=5. 
+ +### Preview Tool (Human-in-the-Loop Gate) + +#### `preview_policy_changes` + +Preview policy changes without executing. This is the critical human-in-the-loop gate. + +```python +preview_policy_changes( + action: str, # "CREATE", "UPDATE", or "DELETE" + policy_name: str, + securable_type: str, # "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: str, + policy_type: str = None, # "COLUMN_MASK" or "ROW_FILTER" (for CREATE) + to_principals: list = None, + except_principals: list = None, + function_name: str = None, + tag_name: str = None, + tag_value: str = None, + comment: str = None, +) +``` + +**Returns:** +```json +{ + "success": true, + "action": "CREATE", + "preview": { + "policy_name": "mask_pii_ssn", + "policy_type": "COLUMN_MASK", + "securable": "SCHEMA my_catalog.my_schema", + "to_principals": ["analysts"], + "except_principals": ["gov_admin"], + "function": "my_catalog.my_schema.mask_ssn", + "tag_match": "hasTagValue('pii_type', 'ssn')", + "equivalent_sql": "CREATE OR REPLACE POLICY mask_pii_ssn\nON SCHEMA my_catalog.my_schema\n..." + }, + "warnings": [], + "requires_approval": true, + "approval_token": "a1b2c3...:eyJhY3Rpb24i...", + "message": "Review the preview above. Reply 'approve' to execute, passing the approval_token." +} +``` + +**Usage in workflow:** + +1. Call `preview_policy_changes` with proposed changes +2. Present preview to user (includes `approval_token`) +3. Wait for explicit approval +4. Pass `approval_token` to `create_fgac_policy`, `update_fgac_policy`, or `delete_fgac_policy` + +### Management Tools + +#### `create_fgac_policy` + +Create a new FGAC policy (COLUMN_MASK or ROW_FILTER). 
+ +```python +create_fgac_policy( + policy_name: str, + policy_type: str, # "COLUMN_MASK" or "ROW_FILTER" + securable_type: str, # "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: str, + function_name: str, # Fully qualified UDF name + to_principals: list, # Users/groups the policy applies to + tag_name: str, # Tag key to match + approval_token: str, # Token from preview_policy_changes() + tag_value: str = None, # Tag value (optional, uses hasTag vs hasTagValue) + except_principals: list = None, # Excluded principals (gov_admin auto-added) + comment: str = "", +) +``` + +**Returns:** +```json +{ + "success": true, + "policy_name": "mask_pii_ssn", + "action": "created", + "details": { + "policy_type": "COLUMN_MASK", + "on_securable": "SCHEMA my_catalog.my_schema", + "function": "my_catalog.my_schema.mask_ssn", + "to_principals": ["analysts"], + "except_principals": ["gov_admin"] + } +} +``` + +#### `update_fgac_policy` + +Update an existing policy's principals or comment. + +```python +update_fgac_policy( + policy_name: str, + securable_type: str, + securable_fullname: str, + approval_token: str, # Token from preview_policy_changes() + to_principals: list = None, + except_principals: list = None, + comment: str = None, +) +``` + +**Returns:** +```json +{ + "success": true, + "policy_name": "mask_pii_ssn", + "action": "updated", + "changes": { + "to_principals": ["analysts", "data_scientists", "new_team"], + "comment": "Updated: added new_team" + } +} +``` + +> **Note:** To change the UDF, tag matching, or scope, drop and recreate the policy. + +#### `delete_fgac_policy` + +Delete an FGAC policy. 
+ +```python +delete_fgac_policy( + policy_name: str, + securable_type: str, + securable_fullname: str, + approval_token: str, # Token from preview_policy_changes() +) +``` + +**Returns:** +```json +{ + "success": true, + "policy_name": "mask_pii_ssn", + "action": "deleted" +} +``` + +--- + +## Human-in-the-Loop Workflow Example + +Complete workflow using MCP tools: + +``` +Step 1: ANALYZE +───────────────────────────────── +→ analyze_fgac_coverage(catalog="prod", schema="finance") + # Or analyze individual components: +→ list_fgac_policies(securable_type="SCHEMA", securable_fullname="prod.finance") +→ get_column_tags_api(catalog="prod", schema="finance", table="customers") +→ get_masking_functions(catalog="prod", schema="finance") + # If UDFs are in a shared governance schema: +→ get_masking_functions(catalog="prod", schema="finance", + udf_catalog="governance", udf_schema="masking_udfs") + +Step 2: RECOMMEND +───────────────────────────────── +→ Agent generates policy recommendations based on coverage gaps and available UDFs + +Step 3: PREVIEW (returns approval_token) +───────────────────────────────── +→ result = preview_policy_changes( + action="CREATE", + policy_name="mask_ssn_finance", + securable_type="SCHEMA", + securable_fullname="prod.finance", + policy_type="COLUMN_MASK", + function_name="governance.masking_udfs.mask_ssn", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="ssn" + ) +→ token = result["approval_token"] + +Step 4: APPROVE +───────────────────────────────── +→ Human reviews preview and replies "approve" + +Step 5: EXECUTE (pass approval_token) +───────────────────────────────── +→ create_fgac_policy( + policy_name="mask_ssn_finance", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="prod.finance", + function_name="governance.masking_udfs.mask_ssn", + to_principals=["analysts"], + tag_name="pii_type", + tag_value="ssn", + approval_token=token + ) + +Step 6: VERIFY +───────────────────────────────── 
+→ get_fgac_policy(
+    policy_name="mask_ssn_finance",
+    securable_type="SCHEMA",
+    securable_fullname="prod.finance"
+  )
+```
+
+---
+
+## Python SDK Direct Usage
+
+For writing custom code outside MCP tools, use the Databricks Python SDK directly.
+
+### Setup
+
+```python
+from databricks.sdk import WorkspaceClient
+
+w = WorkspaceClient()  # Auto-detects credentials
+```
+
+### SDK Types
+
+```python
+from databricks.sdk.service.catalog import (
+    ColumnMaskOptions,
+    MatchColumn,
+    PolicyInfo,
+    PolicyType,
+    RowFilterOptions,
+    SecurableType,
+)
+```
+
+### List Policies
+
+```python
+# Materialize the iterator so it can be traversed more than once below
+policies = list(w.policies.list_policies(
+    on_securable_type="CATALOG",
+    on_securable_fullname="my_catalog",
+    include_inherited=True,
+))
+
+for p in policies:
+    print(f"{p.name}: {p.policy_type} on {p.on_securable_fullname}")
+
+# Filter by type — policy_type is a PolicyType enum, not a plain string
+column_masks = [p for p in policies if p.policy_type == PolicyType.POLICY_TYPE_COLUMN_MASK]
+row_filters = [p for p in policies if p.policy_type == PolicyType.POLICY_TYPE_ROW_FILTER]
+```
+
+### Get Policy
+
+```python
+policy = w.policies.get_policy(
+    name="mask_pii_ssn",
+    on_securable_type="SCHEMA",
+    on_securable_fullname="my_catalog.my_schema",
+)
+
+print(f"Policy: {policy.name}")
+print(f"Type: {policy.policy_type}")
+print(f"Principals: {policy.to_principals}")
+print(f"Except: {policy.except_principals}")
+```
+
+### Create Column Mask Policy
+
+```python
+policy_info = PolicyInfo(
+    name="mask_pii_ssn_schema",
+    policy_type=PolicyType.POLICY_TYPE_COLUMN_MASK,
+    on_securable_type=SecurableType.SCHEMA,
+    on_securable_fullname="my_catalog.my_schema",
+    for_securable_type=SecurableType.TABLE,
+    to_principals=["analysts", "data_scientists"],
+    except_principals=["gov_admin"],
+    comment="Mask SSN columns in schema",
+    column_mask=ColumnMaskOptions(
+        function_name="my_catalog.my_schema.mask_ssn",
+        on_column="masked_col",
+    ),
+    match_columns=[
+        MatchColumn(
+            alias="masked_col",
+            condition="hasTagValue('pii_type', 'ssn')",
+        )
+    ],
+)
+policy = w.policies.create_policy(policy_info=policy_info)
+```
+
+Change `on_securable_type` and `on_securable_fullname` to target catalog or table scope.
+
+### Create Column Mask Policy (Cross-Catalog UDF)
+
+The UDF can live in a separate governance catalog/schema from the policy scope:
+
+```python
+# UDF in governance.masking_udfs, policy on prod.finance
+policy_info = PolicyInfo(
+    name="mask_ssn_finance",
+    policy_type=PolicyType.POLICY_TYPE_COLUMN_MASK,
+    on_securable_type=SecurableType.SCHEMA,
+    on_securable_fullname="prod.finance",
+    for_securable_type=SecurableType.TABLE,
+    to_principals=["analysts"],
+    except_principals=["gov_admin"],
+    comment="Mask SSN columns in prod.finance using shared governance UDF",
+    column_mask=ColumnMaskOptions(
+        function_name="governance.masking_udfs.mask_ssn",
+        on_column="masked_col",
+    ),
+    match_columns=[
+        MatchColumn(
+            alias="masked_col",
+            condition="hasTagValue('pii_type', 'ssn')",
+        )
+    ],
+)
+policy = w.policies.create_policy(policy_info=policy_info)
+```
+
+### Create Row Filter Policy
+
+```python
+policy_info = PolicyInfo(
+    name="filter_eu_data_schema",
+    policy_type=PolicyType.POLICY_TYPE_ROW_FILTER,
+    on_securable_type=SecurableType.SCHEMA,
+    on_securable_fullname="my_catalog.my_schema",
+    for_securable_type=SecurableType.TABLE,
+    to_principals=["us_team"],
+    except_principals=["gov_admin"],
+    comment="Filter EU rows in schema",
+    row_filter=RowFilterOptions(
+        function_name="my_catalog.my_schema.is_not_eu_region",
+    ),
+    match_columns=[
+        MatchColumn(
+            alias="filter_col",
+            condition="hasTagValue('region', 'eu')",
+        )
+    ],
+)
+policy = w.policies.create_policy(policy_info=policy_info)
+```
+
+### Update Policy
+
+Update principals or comment on an existing policy.
+ +```python +update_info = PolicyInfo( + to_principals=["analysts", "data_scientists", "new_team"], + except_principals=["gov_admin", "senior_admins"], + comment="Updated: added new_team to masked principals", + for_securable_type=SecurableType.TABLE, + policy_type=PolicyType.POLICY_TYPE_COLUMN_MASK, +) +updated = w.policies.update_policy( + name="mask_pii_ssn", + on_securable_type="SCHEMA", + on_securable_fullname="my_catalog.my_schema", + policy_info=update_info, + update_mask="to_principals,except_principals,comment", +) +``` + +> **Note:** To change the UDF, tag matching, or scope, you must drop and recreate the policy. `update_policy` only modifies principals and comment via `update_mask`. + +### Delete Policy + +```python +w.policies.delete_policy( + name="mask_pii_ssn", + on_securable_type="SCHEMA", + on_securable_fullname="my_catalog.my_schema", +) +``` + +--- + +## Error Handling + +```python +from databricks.sdk.errors import NotFound, PermissionDenied, BadRequest + +try: + policy = w.policies.get_policy( + name="nonexistent_policy", + on_securable_type="SCHEMA", + on_securable_fullname="my_catalog.my_schema", + ) +except NotFound: + print("Policy not found") +except PermissionDenied: + print("Insufficient permissions - need MANAGE on securable") +except BadRequest as e: + print(f"Invalid request: {e}") +``` + +| Error | Cause | Solution | +|-------|-------|----------| +| `POLICY_QUOTA_EXCEEDED` | Too many policies on scope | Consolidate policies or use broader scope | +| `INVALID_TAG_VALUE` | Tag value not in governed tag's allowed values | Check governed tag config in UI | +| `UDF_NOT_FOUND` | Masking UDF doesn't exist | Create UDF first, use fully qualified name | +| `POLICY_ALREADY_EXISTS` | Duplicate policy name | Use different name or delete existing first | +| `INSUFFICIENT_PERMISSIONS` | Missing `MANAGE` on securable | Grant `MANAGE` permission | +| `INVALID_SECURABLE_TYPE` | Wrong securable type string | Use `"CATALOG"`, `"SCHEMA"`, or `"TABLE"` 
| + +--- + +## Common Patterns + +### Policy Summary with Counts + +```python +def get_policy_summary(w, catalog: str): + """Get a summary of all FGAC policies in a catalog.""" + policies = list(w.policies.list_policies( + on_securable_type="CATALOG", + on_securable_fullname=catalog, + include_inherited=True, + )) + + column_masks = [p for p in policies if p.policy_type == "COLUMN_MASK"] + row_filters = [p for p in policies if p.policy_type == "ROW_FILTER"] + + return { + "total": len(policies), + "column_masks": len(column_masks), + "row_filters": len(row_filters), + "policies": [p.as_dict() for p in policies], + } +``` + +### Check Policy Quotas Before Creating + +```python +def check_quota(w, securable_type: str, securable_fullname: str): + """Check if policy quota allows creating a new policy.""" + quotas = {"CATALOG": 10, "SCHEMA": 10, "TABLE": 5} + max_policies = quotas.get(securable_type, 10) + + existing = list(w.policies.list_policies( + on_securable_type=securable_type, + on_securable_fullname=securable_fullname, + )) + + # Count only direct policies (not inherited) + direct = [p for p in existing + if p.on_securable_fullname == securable_fullname] + + return { + "current": len(direct), + "max": max_policies, + "can_create": len(direct) < max_policies, + } +``` + +### Async Usage (FastAPI, etc.) + +The Databricks SDK is synchronous. 
In async applications, wrap calls with `asyncio.to_thread()`: + +```python +import asyncio + +async def list_policies_async(w, catalog: str): + return await asyncio.to_thread( + lambda: list(w.policies.list_policies( + on_securable_type="CATALOG", + on_securable_fullname=catalog, + include_inherited=True, + )) + ) +``` diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 9b77fed9..af68954f 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -1,16 +1,17 @@ --- name: databricks-unity-catalog -description: "Unity Catalog system tables and volumes. Use when querying system tables (audit, lineage, billing) or working with volume file operations (upload, download, list files in /Volumes/)." +description: "Unity Catalog: system tables, volumes, access controls (ACLs), and FGAC governance. Use when querying system tables (audit, lineage, billing), working with volume file operations, managing UC permissions (GRANT/REVOKE), or managing FGAC policies (column masks, row filters, governed tags, masking UDFs)." --- # Unity Catalog -Guidance for Unity Catalog system tables, volumes, and governance. +Guidance for Unity Catalog across four areas: system tables, volumes, access controls, and FGAC policy governance. 
## When to Use This Skill -Use this skill when: -- Working with **volumes** (upload, download, list files in `/Volumes/`) +Use this skill when working with any of these four categories: + +### System Tables - Querying **lineage** (table dependencies, column-level lineage) - Analyzing **audit logs** (who accessed what, permission changes) - Monitoring **billing and usage** (DBU consumption, cost analysis) @@ -18,38 +19,60 @@ Use this skill when: - Reviewing **job execution** (run history, success rates, failures) - Analyzing **query performance** (slow queries, warehouse utilization) +### Volumes +- Working with **volumes** (upload, download, list files in `/Volumes/`) +- Managing volume **directories** and file operations +- Configuring volume **permissions** (READ VOLUME, WRITE VOLUME) + +### UC Access Controls (ACLs) +- **Granting or revoking** privileges on catalogs, schemas, tables, volumes, functions +- Managing **ownership** transfers +- Setting up **role-based access** patterns (data readers, engineers, admins) +- Auditing **current permissions** (SHOW GRANTS) + +### FGAC (Fine-Grained Access Control) +- Creating or managing **FGAC policies** (column masks, row filters) +- Working with **governed tags** (creating via UI, applying via SQL) +- Building **masking UDFs** for PII protection (SSN, email, credit card, etc.) 
+- Implementing **human-in-the-loop governance** workflows +- Managing **policy lifecycle** (create, update, delete, preview) +- Querying **tag assignments** via `information_schema` + +--- + ## Reference Files -| Topic | File | Description | -|-------|------|-------------| -| System Tables | [5-system-tables.md](5-system-tables.md) | Lineage, audit, billing, compute, jobs, query history | -| Volumes | [6-volumes.md](6-volumes.md) | Volume file operations, permissions, best practices | +### System Tables -## Quick Start +| File | Description | +|------|-------------| +| [5-system-tables.md](5-system-tables.md) | Lineage, audit, billing, compute, jobs, query history | -### Volume File Operations (MCP Tools) +### Volumes -```python -# List files in a volume -list_volume_files(volume_path="/Volumes/catalog/schema/volume/folder/") +| File | Description | +|------|-------------| +| [6-volumes.md](6-volumes.md) | Volume file operations, permissions, best practices | -# Upload file to volume -upload_to_volume( - local_path="/tmp/data.csv", - volume_path="/Volumes/catalog/schema/volume/data.csv" -) +### UC Access Controls (ACLs) -# Download file from volume -download_from_volume( - volume_path="/Volumes/catalog/schema/volume/data.csv", - local_path="/tmp/downloaded.csv" -) +| File | Description | +|------|-------------| +| [10-uc-acls.md](10-uc-acls.md) | GRANT/REVOKE, ownership, privilege reference, SDK patterns, common role patterns | -# Create directory -create_volume_directory(volume_path="/Volumes/catalog/schema/volume/new_folder") -``` +### FGAC (Fine-Grained Access Control) -### Enable System Tables Access +| File | Description | +|------|-------------| +| [7-fgac-overview.md](7-fgac-overview.md) | FGAC workflow, governed tags, masking UDFs, policy syntax, errors, best practices | +| [8-fgac-sql-generation.md](8-fgac-sql-generation.md) | SET/UNSET TAG, CREATE FUNCTION, CREATE/DROP POLICY, discovery queries | +| [9-fgac-sdk-and-tools.md](9-fgac-sdk-and-tools.md) | 
Python SDK patterns and 12 MCP tools for policy management | + +--- + +## Quick Start: System Tables + +### Enable Access ```sql -- Grant access to system tables @@ -81,12 +104,9 @@ WHERE usage_date >= current_date() - 30 GROUP BY workspace_id, sku_name; ``` -## MCP Tool Integration - -Use `mcp__databricks__execute_sql` for system table queries: +### MCP Tool Integration ```python -# Query lineage mcp__databricks__execute_sql( sql_query=""" SELECT source_table_full_name, target_table_full_name @@ -97,12 +117,114 @@ mcp__databricks__execute_sql( ) ``` +--- + +## Quick Start: Volumes + +```python +# List files in a volume +list_volume_files(volume_path="/Volumes/catalog/schema/volume/folder/") + +# Upload file to volume +upload_to_volume( + local_path="/tmp/data.csv", + volume_path="/Volumes/catalog/schema/volume/data.csv" +) + +# Download file from volume +download_from_volume( + volume_path="/Volumes/catalog/schema/volume/data.csv", + local_path="/tmp/downloaded.csv" +) + +# Create directory +create_volume_directory(volume_path="/Volumes/catalog/schema/volume/new_folder") +``` + +See [6-volumes.md](6-volumes.md) for full volume operations, permissions, and troubleshooting. 
+ +--- + +## Quick Start: UC Access Controls (ACLs) + +```sql +-- Read-only access pattern +GRANT USE CATALOG ON CATALOG analytics TO `data_readers`; +GRANT USE SCHEMA ON SCHEMA analytics.gold TO `data_readers`; +GRANT SELECT ON SCHEMA analytics.gold TO `data_readers`; + +-- Data engineer access pattern +GRANT USE CATALOG ON CATALOG analytics TO `data_engineers`; +GRANT USE SCHEMA ON SCHEMA analytics.silver TO `data_engineers`; +GRANT SELECT ON SCHEMA analytics.silver TO `data_engineers`; +GRANT MODIFY ON SCHEMA analytics.silver TO `data_engineers`; +GRANT CREATE TABLE ON SCHEMA analytics.silver TO `data_engineers`; + +-- Show current grants +SHOW GRANTS ON SCHEMA analytics.gold; + +-- Transfer ownership +ALTER SCHEMA analytics.gold OWNER TO `new_owner`; +``` + +See [10-uc-acls.md](10-uc-acls.md) for full privilege reference, SDK patterns, and common role patterns. + +--- + +## Quick Start: FGAC + +```sql +-- 1. Apply governed tag to a column (tag must exist in UI first) +SET TAG ON COLUMN catalog.schema.table.ssn_column 'pii_type' = 'ssn'; + +-- 2. Create a masking UDF +CREATE OR REPLACE FUNCTION catalog.schema.mask_ssn(ssn STRING) +RETURNS STRING +DETERMINISTIC +RETURN CASE + WHEN ssn IS NULL THEN NULL + WHEN LENGTH(REGEXP_REPLACE(ssn, '[^0-9]', '')) >= 4 + THEN CONCAT('***-**-', RIGHT(REGEXP_REPLACE(ssn, '[^0-9]', ''), 4)) + ELSE '***-**-****' +END; + +-- 3. Create an FGAC column mask policy +CREATE OR REPLACE POLICY mask_pii_ssn +ON SCHEMA catalog.schema +COMMENT 'Mask SSN columns for analysts' +COLUMN MASK catalog.schema.mask_ssn +TO `analysts` +EXCEPT `gov_admin` +FOR TABLES +MATCH COLUMNS hasTagValue('pii_type', 'ssn') AS masked_col +ON COLUMN masked_col; +``` + +See [7-fgac-overview.md](7-fgac-overview.md) for the full FGAC workflow, policy syntax, and best practices. + +--- + ## Best Practices -1. **Filter by date** - System tables can be large; always use date filters -2. **Use appropriate retention** - Check your workspace's retention settings -3. 
**Grant minimal access** - System tables contain sensitive metadata
-4. **Schedule reports** - Create scheduled queries for regular monitoring
+### System Tables
+1. **Filter by date** — System tables can be large; always use date filters
+2. **Use appropriate retention** — Check your workspace's retention settings
+3. **Schedule reports** — Create scheduled queries for regular monitoring
+
+### Volumes
+4. **Organize by purpose** — Use directory structure within volumes
+5. **Grant minimal access** — Use `READ VOLUME` vs `WRITE VOLUME` appropriately
+
+### UC Access Controls (ACLs)
+6. **Grant to groups, not users** — Easier to manage and audit
+7. **Use least privilege** — Grant only the minimum permissions needed
+8. **Leverage inheritance** — Grant at schema level when all tables need the same access
+9. **Audit regularly** — Query `system.access.audit` for grant/revoke events
+
+### FGAC
+10. **Always include ``EXCEPT `gov_admin` ``** in every FGAC policy
+11. **Preview before executing** any FGAC policy change
+12. 
**Use governed tags** (not ad-hoc tags) for FGAC policy matching ## Related Skills @@ -113,5 +235,15 @@ mcp__databricks__execute_sql( ## Resources +### System Tables & Volumes - [Unity Catalog System Tables](https://docs.databricks.com/administration-guide/system-tables/) - [Audit Log Reference](https://docs.databricks.com/administration-guide/account-settings/audit-logs.html) + +### UC Access Controls +- [UC Privileges](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/) + +### FGAC +- [FGAC Overview](https://docs.databricks.com/data-governance/unity-catalog/abac/) +- [FGAC Policies](https://docs.databricks.com/data-governance/unity-catalog/abac/policies) +- [FGAC Tutorial](https://docs.databricks.com/data-governance/unity-catalog/abac/tutorial) +- [Governed Tags](https://docs.databricks.com/admin/governed-tags/) diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index 30339ade..f299c597 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -66,7 +66,7 @@ get_skill_description() { "databricks-iceberg") echo "Apache Iceberg - managed tables, UniForm, IRC, Snowflake interop, migration" ;; "databricks-jobs") echo "Databricks Lakeflow Jobs - workflow orchestration" ;; "databricks-python-sdk") echo "Databricks Python SDK, Connect, and REST API" ;; - "databricks-unity-catalog") echo "System tables for lineage, audit, billing" ;; + "databricks-unity-catalog") echo "System tables, volumes, access controls, and FGAC policy governance" ;; "databricks-lakebase-autoscale") echo "Lakebase Autoscale - managed PostgreSQL with autoscaling" ;; "databricks-lakebase-provisioned") echo "Lakebase Provisioned - data connections and reverse ETL" ;; "databricks-metric-views") echo "Unity Catalog Metric Views - governed business metrics in YAML" ;; @@ -104,7 +104,7 @@ get_skill_extra_files() { "databricks-app-python") echo "dash.md streamlit.md README.md" ;; "databricks-jobs") echo 
"task-types.md triggers-schedules.md notifications-monitoring.md examples.md" ;; "databricks-python-sdk") echo "doc-index.md examples/1-authentication.py examples/2-clusters-and-jobs.py examples/3-sql-and-warehouses.py examples/4-unity-catalog.py examples/5-serving-and-vector-search.py" ;; - "databricks-unity-catalog") echo "5-system-tables.md" ;; + "databricks-unity-catalog") echo "5-system-tables.md 6-volumes.md 7-fgac-overview.md 8-fgac-sql-generation.md 9-fgac-sdk-and-tools.md 10-uc-acls.md" ;; "databricks-lakebase-autoscale") echo "projects.md branches.md computes.md connection-patterns.md reverse-etl.md" ;; "databricks-lakebase-provisioned") echo "connection-patterns.md reverse-etl.md" ;; "databricks-metric-views") echo "yaml-reference.md patterns.md" ;; diff --git a/databricks-tools-core/README.md b/databricks-tools-core/README.md index d7831456..e5bb9f60 100644 --- a/databricks-tools-core/README.md +++ b/databricks-tools-core/README.md @@ -12,7 +12,7 @@ The `databricks-tools-core` package provides reusable, opinionated functions for |--------|-------------| | **sql/** | SQL execution, warehouse management, and table statistics | | **jobs/** | Job management and run operations (serverless by default) | -| **unity_catalog/** | Unity Catalog operations (catalogs, schemas, tables) | +| **unity_catalog/** | Unity Catalog operations (catalogs, schemas, tables, FGAC policies) | | **compute/** | Compute and execution context operations | | **spark_declarative_pipelines/** | Spark Declarative Pipeline management | | **synthetic_data_generation/** | Test data generation utilities | diff --git a/databricks-tools-core/databricks_tools_core/unity_catalog/__init__.py b/databricks-tools-core/databricks_tools_core/unity_catalog/__init__.py index 21e37808..c2f4005e 100644 --- a/databricks-tools-core/databricks_tools_core/unity_catalog/__init__.py +++ b/databricks-tools-core/databricks_tools_core/unity_catalog/__init__.py @@ -112,6 +112,24 @@ drop_column_mask, ) +# FGAC 
policies +from .fgac_policies import ( + list_fgac_policies, + get_fgac_policy, + get_table_policies, + get_masking_functions, + get_column_tags_api, + get_schema_info, + get_catalog_info, + list_table_policies_in_schema, + analyze_fgac_coverage, + check_policy_quota, + preview_policy_changes, + create_fgac_policy, + update_fgac_policy, + delete_fgac_policy, +) + # Quality monitors from .monitors import ( create_monitor, @@ -226,6 +244,21 @@ "drop_row_filter", "set_column_mask", "drop_column_mask", + # FGAC policies + "list_fgac_policies", + "get_fgac_policy", + "get_table_policies", + "get_masking_functions", + "get_column_tags_api", + "get_schema_info", + "get_catalog_info", + "list_table_policies_in_schema", + "analyze_fgac_coverage", + "check_policy_quota", + "preview_policy_changes", + "create_fgac_policy", + "update_fgac_policy", + "delete_fgac_policy", # Quality monitors "create_monitor", "get_monitor", diff --git a/databricks-tools-core/databricks_tools_core/unity_catalog/fgac_policies.py b/databricks-tools-core/databricks_tools_core/unity_catalog/fgac_policies.py new file mode 100644 index 00000000..7e2f84db --- /dev/null +++ b/databricks-tools-core/databricks_tools_core/unity_catalog/fgac_policies.py @@ -0,0 +1,1174 @@ +""" +Unity Catalog - FGAC Policy Operations + +Functions for managing Fine-Grained Access Control (FGAC) policies +via the Databricks Python SDK (WorkspaceClient.policies). + +FGAC policies bind governed tags to masking UDFs or row filters, scoped to +catalogs, schemas, or tables, and targeted at specific principals. + +Human-in-the-loop design: + Mutations (create/update/delete) require an approval token from + preview_policy_changes(). The token is an HMAC-signed binding of + preview parameters to a timestamp — it ensures mutations match what + was previewed and prevents parameter tampering. + + IMPORTANT: The token does NOT guarantee a human reviewed the preview. 
+ That responsibility falls on the MCP client (e.g., Claude Code prompts + the user for confirmation between tool calls). The token only ensures + that whatever was approved matches what gets executed. + +Policy quotas: + - Catalog: 10 policies max + - Schema: 10 policies max + - Table: 5 policies max +""" + +import base64 +import hashlib +import hmac +import json +import logging +import os +import re +import time +from typing import Any, Dict, List, Optional + +from ..auth import get_workspace_client + +logger = logging.getLogger(__name__) + +_IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_.\-]*$") + +_VALID_SECURABLE_TYPES = {"CATALOG", "SCHEMA", "TABLE"} +_VALID_POLICY_TYPES = {"COLUMN_MASK", "ROW_FILTER"} +_POLICY_QUOTAS = {"CATALOG": 10, "SCHEMA": 10, "TABLE": 5} + +_APPROVAL_SECRET = os.urandom(32).hex() +_ADMIN_GROUP = os.environ.get("FGAC_ADMIN_GROUP", "admins") +_TOKEN_TTL_SECONDS = 600 # 10 minutes + + +def _generate_approval_token(params: dict) -> str: + """Generate an HMAC-based approval token binding preview params to a timestamp.""" + clean_params = {k: v for k, v in params.items() if v is not None} + clean_params["timestamp"] = int(time.time()) + payload = json.dumps(clean_params, sort_keys=True) + signature = hmac.new(_APPROVAL_SECRET.encode(), payload.encode(), hashlib.sha256).hexdigest() + b64_payload = base64.b64encode(payload.encode()).decode() + return f"{signature}:{b64_payload}" + + +def _validate_approval_token(approval_token: str, current_params: dict) -> None: + """Validate an approval token against current parameters. + + Raises ValueError if the token is invalid, expired, or params don't match. 
+ """ + params = dict(current_params) # work on a copy to avoid mutating caller's dict + + try: + signature, b64_payload = approval_token.split(":", 1) + except (ValueError, AttributeError): + raise ValueError("Malformed approval token: expected 'signature:payload' format") + + try: + payload = base64.b64decode(b64_payload).decode() + except Exception: + raise ValueError("Malformed approval token: payload is not valid base64") + + expected_sig = hmac.new(_APPROVAL_SECRET.encode(), payload.encode(), hashlib.sha256).hexdigest() + if not hmac.compare_digest(signature, expected_sig): + raise ValueError("Invalid approval token: signature verification failed") + + try: + token_data = json.loads(payload) + except json.JSONDecodeError: + raise ValueError("Malformed approval token: payload is not valid JSON") + + ts = token_data.pop("timestamp", 0) + if abs(time.time() - ts) > _TOKEN_TTL_SECONDS: + raise ValueError("Expired approval token: please run preview again to get a new token") + + # Map preview action to mutation action + action_map = {"CREATE": "create", "UPDATE": "update", "DELETE": "delete"} + token_action = token_data.pop("action", None) + current_action = params.pop("action", None) + if token_action and current_action: + if action_map.get(token_action) != current_action: + raise ValueError( + f"Approval token action mismatch: token is for '{token_action}'" + f" but current action is '{current_action}'" + ) + + # Compare remaining params + clean_current = {k: v for k, v in params.items() if v is not None} + if token_data != clean_current: + raise ValueError("Approval token parameter mismatch: params differ from what was previewed") + + +def _check_admin_group() -> dict: + """Verify the current user belongs to the configured admin group. + + Raises PermissionError if user is not a member. 
+ """ + w = get_workspace_client() + me = w.current_user.me() + group_names = [g.display for g in (me.groups or []) if g.display] + if _ADMIN_GROUP not in group_names: + raise PermissionError( + f"User '{me.user_name}' is not a member of admin group '{_ADMIN_GROUP}'. " + f"FGAC mutating operations require membership in the '{_ADMIN_GROUP}' group." + ) + return {"is_admin": True, "user": me.user_name, "admin_group": _ADMIN_GROUP} + + +def _validate_identifier(name: str) -> str: + """Validate a SQL identifier to prevent injection.""" + if not _IDENTIFIER_PATTERN.match(name): + raise ValueError(f"Invalid SQL identifier: '{name}'") + return name + + +def _validate_securable_type(securable_type: str) -> str: + """Validate and normalize securable type.""" + normalized = securable_type.upper() + if normalized not in _VALID_SECURABLE_TYPES: + raise ValueError( + f"Invalid securable_type: '{securable_type}'. Must be one of: {sorted(_VALID_SECURABLE_TYPES)}" + ) + return normalized + + +def _validate_policy_type(policy_type: str) -> str: + """Validate and normalize policy type.""" + normalized = policy_type.upper().replace("POLICY_TYPE_", "") + if normalized not in _VALID_POLICY_TYPES: + raise ValueError(f"Invalid policy_type: '{policy_type}'. 
Must be one of: {sorted(_VALID_POLICY_TYPES)}") + return normalized + + +def _to_policy_type_enum(policy_type: str): + """Convert a policy type string to the SDK PolicyType enum.""" + from databricks.sdk.service.catalog import PolicyType + + normalized = policy_type.upper().replace("POLICY_TYPE_", "") + if normalized == "COLUMN_MASK": + return PolicyType.POLICY_TYPE_COLUMN_MASK + elif normalized == "ROW_FILTER": + return PolicyType.POLICY_TYPE_ROW_FILTER + raise ValueError(f"Invalid policy_type: '{policy_type}'") + + +def _to_securable_type_enum(securable_type: str): + """Convert a securable type string to the SDK SecurableType enum.""" + from databricks.sdk.service.catalog import SecurableType + + return SecurableType(securable_type.upper()) + + +def _policy_to_dict(policy: Any) -> Dict[str, Any]: + """Convert a policy SDK object to a serializable dict.""" + if hasattr(policy, "as_dict"): + return policy.as_dict() + return { + "name": getattr(policy, "name", None), + "policy_type": getattr(policy, "policy_type", None), + "to_principals": getattr(policy, "to_principals", []), + "except_principals": getattr(policy, "except_principals", []), + "on_securable_type": getattr(policy, "on_securable_type", None), + "on_securable_fullname": getattr(policy, "on_securable_fullname", None), + "for_securable_type": getattr(policy, "for_securable_type", None), + "column_mask": getattr(policy, "column_mask", None), + "row_filter": getattr(policy, "row_filter", None), + "match_columns": getattr(policy, "match_columns", []), + "comment": getattr(policy, "comment", None), + } + + +# --------------------------------------------------------------------------- +# Discovery +# --------------------------------------------------------------------------- + + +def list_fgac_policies( + securable_type: str, + securable_fullname: str, + include_inherited: bool = True, + policy_type: Optional[str] = None, +) -> Dict[str, Any]: + """ + List FGAC policies on a catalog, schema, or table. 
+ + Args: + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified name (e.g., "my_catalog.my_schema") + include_inherited: Include policies inherited from parent securables + policy_type: Optional filter — "COLUMN_MASK" or "ROW_FILTER" + + Returns: + Dict with policy_count and policies list + """ + stype = _validate_securable_type(securable_type) + _validate_identifier(securable_fullname) + + w = get_workspace_client() + policies = list( + w.policies.list_policies( + on_securable_type=stype, + on_securable_fullname=securable_fullname, + include_inherited=include_inherited, + ) + ) + + if policy_type: + ptype = _validate_policy_type(policy_type) + # SDK returns POLICY_TYPE_COLUMN_MASK / POLICY_TYPE_ROW_FILTER + sdk_ptype = f"POLICY_TYPE_{ptype}" + policies = [ + p + for p in policies + if str(getattr(p, "policy_type", "")) in (ptype, sdk_ptype) + or (p.as_dict() if hasattr(p, "as_dict") else {}).get("policy_type") in (ptype, sdk_ptype) + ] + + policy_dicts = [_policy_to_dict(p) for p in policies] + return { + "success": True, + "securable_type": stype, + "securable_fullname": securable_fullname, + "policy_count": len(policy_dicts), + "policies": policy_dicts, + } + + +def get_fgac_policy( + policy_name: str, + securable_type: str, + securable_fullname: str, +) -> Dict[str, Any]: + """ + Get details for a specific FGAC policy by name. 
+ + Args: + policy_name: Policy name + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name + + Returns: + Dict with policy details + """ + stype = _validate_securable_type(securable_type) + _validate_identifier(securable_fullname) + + w = get_workspace_client() + policy = w.policies.get_policy( + on_securable_type=stype, + on_securable_fullname=securable_fullname, + name=policy_name, + ) + + return { + "success": True, + "policy": _policy_to_dict(policy), + } + + +def get_table_policies( + catalog: str, + schema: str, + table: str, +) -> Dict[str, Any]: + """ + Get column masks and row filters applied to a specific table. + + Uses the Unity Catalog REST API directly because the Python SDK's + TableInfo does not expose ``effective_masks`` (FGAC-derived masks). + The ``/api/2.1/unity-catalog/tables/`` endpoint returns both direct + column masks and effective masks from FGAC policies. + + Args: + catalog: Catalog name + schema: Schema name + table: Table name + + Returns: + Dict with column_masks and row_filters lists + """ + _validate_identifier(catalog) + _validate_identifier(schema) + _validate_identifier(table) + full_name = f"{catalog}.{schema}.{table}" + + w = get_workspace_client() + result = w.api_client.do("GET", f"/api/2.1/unity-catalog/tables/{full_name}") + + column_masks = [] + for col in result.get("columns", []): + masks = col.get("column_masks", {}) + effective_masks = col.get("effective_masks", []) + + if masks.get("column_masks") or effective_masks: + mask_functions = [] + for m in masks.get("column_masks", []): + mask_functions.append(m.get("function_name")) + for m in effective_masks: + fn = m.get("function_name") + if fn and fn not in mask_functions: + mask_functions.append(fn) + + column_masks.append( + { + "column_name": col.get("name"), + "column_type": col.get("type_name"), + "mask_functions": mask_functions, + } + ) + + row_filters = [] + row_filters_data = result.get("row_filters", {}) + if 
row_filters_data: + for rf in row_filters_data.get("row_filters", []): + row_filters.append( + { + "function_name": rf.get("function_name"), + "input_column_names": rf.get("input_column_names", []), + } + ) + + return { + "success": True, + "table": full_name, + "column_masks": column_masks, + "row_filters": row_filters, + } + + +def get_masking_functions( + catalog: str, + schema: str, +) -> Dict[str, Any]: + """ + List masking UDFs in a schema. + + Retrieves all user-defined functions in the specified schema and returns + their metadata for use in FGAC policy creation. + + Args: + catalog: Catalog name + schema: Schema name + + Returns: + Dict with list of functions and their metadata + """ + _validate_identifier(catalog) + _validate_identifier(schema) + + w = get_workspace_client() + functions = list(w.functions.list(catalog_name=catalog, schema_name=schema)) + + func_list = [] + for f in functions: + func_list.append( + { + "name": f.name, + "full_name": f.full_name, + "return_type": str(f.data_type) if f.data_type else None, + "comment": getattr(f, "comment", None), + "is_deterministic": getattr(f, "is_deterministic", None), + } + ) + + return { + "success": True, + "catalog": catalog, + "schema": schema, + "function_count": len(func_list), + "functions": func_list, + } + + +# --------------------------------------------------------------------------- +# Analysis & Discovery +# --------------------------------------------------------------------------- + + +def get_column_tags_api( + catalog: str, + schema: str, + table: str, +) -> Dict[str, Any]: + """ + Get column-level tags for a table via the Tags API. + + Queries system.information_schema.column_tags to return governed and + metadata tags applied to columns on the specified table. 
+ + Args: + catalog: Catalog name + schema: Schema name + table: Table name + + Returns: + Dict with table name and list of column tag entries + """ + _validate_identifier(catalog) + _validate_identifier(schema) + _validate_identifier(table) + + from .tags import query_column_tags + + tags = query_column_tags(catalog_filter=catalog, table_name=table) + # Filter to the specific schema (query_column_tags filters by catalog and table but not schema) + tags = [t for t in tags if t.get("schema_name") == schema] + + return { + "success": True, + "table": f"{catalog}.{schema}.{table}", + "tags": tags, + } + + +def get_schema_info( + catalog: str, + schema: str, +) -> Dict[str, Any]: + """ + Get schema metadata via the Unity Catalog API. + + Args: + catalog: Catalog name + schema: Schema name + + Returns: + Dict with serialized schema metadata + """ + _validate_identifier(catalog) + _validate_identifier(schema) + + from .schemas import get_schema + + schema_obj = get_schema(f"{catalog}.{schema}") + return { + "success": True, + "schema": { + "name": schema_obj.name, + "full_name": schema_obj.full_name, + "catalog_name": schema_obj.catalog_name, + "owner": schema_obj.owner, + "comment": schema_obj.comment, + "created_at": schema_obj.created_at, + "updated_at": schema_obj.updated_at, + }, + } + + +def get_catalog_info( + catalog: str, +) -> Dict[str, Any]: + """ + Get catalog metadata via the Unity Catalog API. 
+ + Args: + catalog: Catalog name + + Returns: + Dict with serialized catalog metadata + """ + _validate_identifier(catalog) + + from .catalogs import get_catalog + + catalog_obj = get_catalog(catalog) + return { + "success": True, + "catalog": { + "name": catalog_obj.name, + "owner": catalog_obj.owner, + "comment": catalog_obj.comment, + "created_at": catalog_obj.created_at, + "updated_at": catalog_obj.updated_at, + }, + } + + +def list_table_policies_in_schema( + catalog: str, + schema: str, +) -> Dict[str, Any]: + """ + List all tables in a schema with their column masks and row filters. + + Enumerates tables in the schema and calls get_table_policies() on each. + + Args: + catalog: Catalog name + schema: Schema name + + Returns: + Dict with table count and per-table policy details + """ + _validate_identifier(catalog) + _validate_identifier(schema) + + from .tables import list_tables + + tables = list_tables(catalog_name=catalog, schema_name=schema) + table_results = [] + for t in tables: + try: + policies = get_table_policies(catalog=catalog, schema=schema, table=t.name) + table_results.append( + { + "table": t.name, + "column_masks": policies.get("column_masks", []), + "row_filters": policies.get("row_filters", []), + } + ) + except Exception as e: + logger.warning(f"Failed to get policies for table {t.name}: {e}") + table_results.append( + { + "table": t.name, + "column_masks": [], + "row_filters": [], + "error": str(e), + } + ) + + return { + "success": True, + "catalog": catalog, + "schema": schema, + "table_count": len(table_results), + "tables": table_results, + } + + +def analyze_fgac_coverage( + catalog: str, + schema: Optional[str] = None, +) -> Dict[str, Any]: + """ + Analyze FGAC policy coverage for a catalog or schema. + + Examines tagged columns, existing policies, and available masking UDFs + to identify gaps where tagged columns lack policy coverage. Useful for + the "analyze this catalog/schema and suggest FGAC policies" workflow. 
+ + Args: + catalog: Catalog name + schema: Optional schema name. If omitted, analyzes all schemas in the catalog. + + Returns: + Dict with coverage summary, gaps, existing policies, and available UDFs + """ + _validate_identifier(catalog) + if schema: + _validate_identifier(schema) + + from .schemas import list_schemas + from .tables import list_tables + from .tags import query_column_tags + + # Determine schemas to scan + if schema: + schema_names = [schema] + scope = f"SCHEMA {catalog}.{schema}" + else: + schema_objs = list_schemas(catalog) + schema_names = [s.name for s in schema_objs if s.name != "information_schema"] + scope = f"CATALOG {catalog}" + + # 1. Enumerate tables across schemas + all_tables = [] + for s in schema_names: + try: + tables = list_tables(catalog_name=catalog, schema_name=s) + all_tables.extend(tables) + except Exception as e: + logger.warning(f"Failed to list tables in {catalog}.{s}: {e}") + + # 2. Query column tags + tagged_columns = query_column_tags(catalog_filter=catalog) + if schema: + tagged_columns = [t for t in tagged_columns if t.get("schema_name") == schema] + + # 3. List existing FGAC policies + securable_type = "SCHEMA" if schema else "CATALOG" + securable_fullname = f"{catalog}.{schema}" if schema else catalog + policies_result = list_fgac_policies( + securable_type=securable_type, + securable_fullname=securable_fullname, + include_inherited=True, + ) + existing_policies = policies_result.get("policies", []) + + # 4. List masking UDFs across scanned schemas + all_udfs = [] + for s in schema_names: + try: + udfs_result = get_masking_functions(catalog=catalog, schema=s) + all_udfs.extend(udfs_result.get("functions", [])) + except Exception as e: + logger.warning(f"Failed to list UDFs in {catalog}.{s}: {e}") + + # 5. 
Cross-reference: determine which tag/value pairs are covered by policies + covered_tags = set() + for p in existing_policies: + for mc in p.get("match_columns") or []: + condition = mc.get("condition", "") + # Parse hasTagValue('key', 'value') or hasTag('key') + if "hasTagValue" in condition: + parts = condition.replace("hasTagValue(", "").rstrip(")").replace("'", "").split(", ") + if len(parts) == 2: + covered_tags.add(f"{parts[0]}:{parts[1]}") + elif "hasTag" in condition: + tag = condition.replace("hasTag(", "").rstrip(")").replace("'", "") + covered_tags.add(tag) + + # Build tag -> columns mapping for uncovered tags + tag_columns: Dict[str, List[Dict[str, str]]] = {} + for tc in tagged_columns: + tag_key = f"{tc.get('tag_name')}:{tc.get('tag_value')}" if tc.get("tag_value") else tc.get("tag_name", "") + if tag_key not in covered_tags: + tag_columns.setdefault(tag_key, []).append( + { + "table": f"{tc.get('catalog_name')}.{tc.get('schema_name')}.{tc.get('table_name')}", + "column": tc.get("column_name", ""), + } + ) + + # Build unique tag keys for summary + all_tag_keys = set() + for tc in tagged_columns: + tag_key = f"{tc.get('tag_name')}:{tc.get('tag_value')}" if tc.get("tag_value") else tc.get("tag_name", "") + all_tag_keys.add(tag_key) + + uncovered_tags = all_tag_keys - covered_tags + + # Build gaps + gaps = [] + for tag_key in sorted(uncovered_tags): + if ":" in tag_key: + t_name, t_value = tag_key.split(":", 1) + else: + t_name, t_value = tag_key, None + + columns = tag_columns.get(tag_key, []) + suggestion = "No policy covers this tag. Consider creating a COLUMN_MASK policy." 
+ gaps.append( + { + "tag_name": t_name, + "tag_value": t_value, + "columns": columns, + "suggestion": suggestion, + } + ) + + return { + "success": True, + "scope": scope, + "summary": { + "tables_scanned": len(all_tables), + "tagged_columns": len(tagged_columns), + "existing_policies": len(existing_policies), + "available_udfs": len(all_udfs), + "covered_tags": sorted(covered_tags), + "uncovered_tags": sorted(uncovered_tags), + }, + "gaps": gaps, + "existing_policies": existing_policies, + "available_udfs": all_udfs, + } + + +# --------------------------------------------------------------------------- +# Quota checking +# --------------------------------------------------------------------------- + + +def check_policy_quota( + securable_type: str, + securable_fullname: str, +) -> Dict[str, Any]: + """ + Check if the policy quota allows creating a new policy. + + Policy quotas: CATALOG=10, SCHEMA=10, TABLE=5. + + Args: + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name + + Returns: + Dict with current count, max allowed, and whether creation is allowed + """ + stype = _validate_securable_type(securable_type) + _validate_identifier(securable_fullname) + + w = get_workspace_client() + existing = list( + w.policies.list_policies( + on_securable_type=stype, + on_securable_fullname=securable_fullname, + ) + ) + + # Count only direct policies (not inherited) + direct = [p for p in existing if getattr(p, "on_securable_fullname", None) == securable_fullname] + + max_policies = _POLICY_QUOTAS.get(stype, 10) + return { + "success": True, + "securable_type": stype, + "securable_fullname": securable_fullname, + "current": len(direct), + "max": max_policies, + "can_create": len(direct) < max_policies, + } + + +# --------------------------------------------------------------------------- +# Preview (human-in-the-loop gate) +# --------------------------------------------------------------------------- + + +def 
preview_policy_changes( + action: str, + policy_name: str, + securable_type: str, + securable_fullname: str, + policy_type: Optional[str] = None, + to_principals: Optional[List[str]] = None, + except_principals: Optional[List[str]] = None, + function_name: Optional[str] = None, + tag_name: Optional[str] = None, + tag_value: Optional[str] = None, + comment: Optional[str] = None, +) -> Dict[str, Any]: + """ + Preview policy changes without executing. Human-in-the-loop gate. + + Generates the equivalent SQL and returns it for review. No changes + are made until a subsequent create/update/delete call. + + Args: + action: "CREATE", "UPDATE", or "DELETE" + policy_name: Policy name + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name + policy_type: "COLUMN_MASK" or "ROW_FILTER" (required for CREATE) + to_principals: Principals the policy applies to + except_principals: Excluded principals + function_name: Fully qualified UDF name (required for CREATE). + Can reference any catalog/schema, not just the policy scope. + tag_name: Tag key to match (required for CREATE) + tag_value: Tag value to match (optional; omit for hasTag vs hasTagValue) + comment: Policy description + + Returns: + Dict with preview details, equivalent SQL, warnings, and approval flag + """ + action = action.upper() + if action not in ("CREATE", "UPDATE", "DELETE"): + raise ValueError(f"Invalid action: '{action}'. 
Must be CREATE, UPDATE, or DELETE") + + stype = _validate_securable_type(securable_type) + _validate_identifier(securable_fullname) + warnings = [] + + safe_except = list(except_principals) if except_principals else [] + + if action == "CREATE": + if not policy_type: + raise ValueError("policy_type is required for CREATE action") + ptype = _validate_policy_type(policy_type) + if not function_name: + raise ValueError("function_name is required for CREATE action") + if not tag_name: + raise ValueError("tag_name is required for CREATE action") + if not to_principals: + raise ValueError("to_principals is required for CREATE action") + + tag_match = f"hasTagValue('{tag_name}', '{tag_value}')" if tag_value else f"hasTag('{tag_name}')" + + principals_sql = ", ".join(f"`{p}`" for p in to_principals) + except_sql = ", ".join(f"`{p}`" for p in safe_except) if safe_except else "" + + if ptype == "COLUMN_MASK": + sql_lines = [ + f"CREATE OR REPLACE POLICY {policy_name}", + f"ON {stype} {securable_fullname}", + ] + if comment: + sql_lines.append(f"COMMENT '{comment}'") + sql_lines += [ + f"COLUMN MASK {function_name}", + f"TO {principals_sql}", + ] + if except_sql: + sql_lines.append(f"EXCEPT {except_sql}") + sql_lines += [ + "FOR TABLES", + f"MATCH COLUMNS {tag_match} AS masked_col", + "ON COLUMN masked_col;", + ] + else: # ROW_FILTER + sql_lines = [ + f"CREATE OR REPLACE POLICY {policy_name}", + f"ON {stype} {securable_fullname}", + ] + if comment: + sql_lines.append(f"COMMENT '{comment}'") + sql_lines += [ + f"ROW FILTER {function_name}", + f"TO {principals_sql}", + ] + if except_sql: + sql_lines.append(f"EXCEPT {except_sql}") + sql_lines += [ + "FOR TABLES", + f"MATCH COLUMNS {tag_match} AS filter_col", + "USING COLUMNS (filter_col);", + ] + + equivalent_sql = "\n".join(sql_lines) + preview = { + "policy_name": policy_name, + "policy_type": ptype, + "securable": f"{stype} {securable_fullname}", + "to_principals": to_principals, + "except_principals": safe_except, + 
"function": function_name, + "tag_match": tag_match, + "equivalent_sql": equivalent_sql, + } + + elif action == "UPDATE": + changes = {} + if to_principals is not None: + changes["to_principals"] = to_principals + if except_principals is not None: + changes["except_principals"] = safe_except + if comment is not None: + changes["comment"] = comment + + if not changes: + warnings.append("No changes specified for UPDATE") + + preview = { + "policy_name": policy_name, + "securable": f"{stype} {securable_fullname}", + "changes": changes, + "equivalent_sql": f"-- UPDATE via SDK: w.policies.update_policy(name='{policy_name}', ...)", + "note": "update_policy only modifies principals and comment. " + "To change UDF, tags, or scope, drop and recreate.", + } + + else: # DELETE + equivalent_sql = f"DROP POLICY {policy_name} ON {stype} {securable_fullname};" + preview = { + "policy_name": policy_name, + "securable": f"{stype} {securable_fullname}", + "equivalent_sql": equivalent_sql, + } + warnings.append("This action is irreversible. 
The policy will be permanently removed.") + + # Generate approval token binding these params + token_params = { + "action": action, + "policy_name": policy_name, + "securable_type": stype, + "securable_fullname": securable_fullname, + } + if policy_type: + token_params["policy_type"] = _validate_policy_type(policy_type) + if to_principals is not None: + token_params["to_principals"] = to_principals + if except_principals is not None: + token_params["except_principals"] = safe_except + if function_name is not None: + token_params["function_name"] = function_name + if tag_name is not None: + token_params["tag_name"] = tag_name + if tag_value is not None: + token_params["tag_value"] = tag_value + if comment is not None: + token_params["comment"] = comment + + approval_token = _generate_approval_token(token_params) + + return { + "success": True, + "action": action, + "preview": preview, + "warnings": warnings, + "requires_approval": True, + "approval_token": approval_token, + "message": "Review the preview above. Reply 'approve' to execute, passing the approval_token.", + } + + +# --------------------------------------------------------------------------- +# Management (mutating operations) +# --------------------------------------------------------------------------- + + +def create_fgac_policy( + policy_name: str, + policy_type: str, + securable_type: str, + securable_fullname: str, + function_name: str, + to_principals: List[str], + tag_name: str, + approval_token: str, + tag_value: Optional[str] = None, + except_principals: Optional[List[str]] = None, + comment: str = "", +) -> Dict[str, Any]: + """ + Create a new FGAC policy (COLUMN_MASK or ROW_FILTER). + + Requires a valid approval_token from preview_policy_changes() and + the caller must be a member of the configured admin group. 
+ + Args: + policy_name: Policy name (must be unique within the securable scope) + policy_type: "COLUMN_MASK" or "ROW_FILTER" + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name + function_name: Fully qualified UDF name (e.g., "catalog.schema.mask_ssn"). + The UDF can reside in any catalog/schema, not just the policy scope. + For example, a policy on "prod.finance" can use "governance.masking_udfs.mask_ssn". + to_principals: Users/groups the policy applies to + tag_name: Tag key to match columns on + approval_token: Token from preview_policy_changes() + tag_value: Tag value to match (optional; omit for hasTag vs hasTagValue) + except_principals: Excluded principals + comment: Policy description + + Returns: + Dict with creation status and policy details + """ + ptype = _validate_policy_type(policy_type) + stype = _validate_securable_type(securable_type) + # Identifier validation is handled by preview_policy_changes() — the token + # binding ensures these values match what was already validated at preview time. 
+ current_params = { + "action": "create", + "policy_name": policy_name, + "policy_type": ptype, + "securable_type": stype, + "securable_fullname": securable_fullname, + "function_name": function_name, + "to_principals": to_principals, + "tag_name": tag_name, + } + if tag_value is not None: + current_params["tag_value"] = tag_value + if except_principals is not None: + current_params["except_principals"] = list(except_principals) + if comment: + current_params["comment"] = comment + _validate_approval_token(approval_token, current_params) + _check_admin_group() + + from databricks.sdk.service.catalog import ( + ColumnMaskOptions, + MatchColumn, + PolicyInfo, + RowFilterOptions, + ) + + # Build tag match condition + tag_condition = f"hasTagValue('{tag_name}', '{tag_value}')" if tag_value else f"hasTag('{tag_name}')" + alias = "masked_col" if ptype == "COLUMN_MASK" else "filter_col" + match_columns = [MatchColumn(alias=alias, condition=tag_condition)] + + # Build PolicyInfo + policy_info = PolicyInfo( + name=policy_name, + policy_type=_to_policy_type_enum(ptype), + on_securable_type=_to_securable_type_enum(stype), + on_securable_fullname=securable_fullname, + for_securable_type=_to_securable_type_enum("TABLE"), + to_principals=to_principals, + except_principals=list(except_principals) if except_principals else None, + comment=comment, + match_columns=match_columns, + ) + + if ptype == "COLUMN_MASK": + policy_info.column_mask = ColumnMaskOptions( + function_name=function_name, + on_column=alias, + ) + else: # ROW_FILTER + policy_info.row_filter = RowFilterOptions( + function_name=function_name, + ) + + w = get_workspace_client() + policy = w.policies.create_policy(policy_info=policy_info) + + return { + "success": True, + "policy_name": policy_name, + "action": "created", + "details": { + "policy_type": ptype, + "on_securable": f"{stype} {securable_fullname}", + "function": function_name, + "to_principals": to_principals, + "except_principals": list(except_principals) 
if except_principals else [], + "tag_match": f"{tag_name}={tag_value}" if tag_value else tag_name, + }, + "policy": _policy_to_dict(policy), + } + + +def update_fgac_policy( + policy_name: str, + securable_type: str, + securable_fullname: str, + approval_token: str, + to_principals: Optional[List[str]] = None, + except_principals: Optional[List[str]] = None, + comment: Optional[str] = None, +) -> Dict[str, Any]: + """ + Update an existing FGAC policy's principals or comment. + + Requires a valid approval_token from preview_policy_changes() and + the caller must be a member of the configured admin group. + + Only principals and comment can be modified. To change the UDF, tag + matching, or scope, drop and recreate the policy. + + Args: + policy_name: Policy name + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name + approval_token: Token from preview_policy_changes() + to_principals: Updated list of principals the policy applies to + except_principals: Updated excluded principals + comment: Updated policy description + + Returns: + Dict with update status and applied changes + """ + stype = _validate_securable_type(securable_type) + current_params = { + "action": "update", + "policy_name": policy_name, + "securable_type": stype, + "securable_fullname": securable_fullname, + } + if to_principals is not None: + current_params["to_principals"] = to_principals + if except_principals is not None: + current_params["except_principals"] = list(except_principals) + if comment is not None: + current_params["comment"] = comment + _validate_approval_token(approval_token, current_params) + _check_admin_group() + + from databricks.sdk.service.catalog import PolicyInfo + + w = get_workspace_client() + + # Get existing policy to preserve required fields + existing = w.policies.get_policy( + on_securable_type=stype, + on_securable_fullname=securable_fullname, + name=policy_name, + ) + + # Build update PolicyInfo with existing 
required fields + policy_info = PolicyInfo( + to_principals=existing.to_principals, + for_securable_type=existing.for_securable_type, + policy_type=existing.policy_type, + ) + + changes: Dict[str, Any] = {} + update_fields = [] + + if to_principals is not None: + policy_info.to_principals = to_principals + changes["to_principals"] = to_principals + update_fields.append("to_principals") + + if except_principals is not None: + policy_info.except_principals = list(except_principals) + changes["except_principals"] = list(except_principals) + update_fields.append("except_principals") + + if comment is not None: + policy_info.comment = comment + changes["comment"] = comment + update_fields.append("comment") + + policy = w.policies.update_policy( + on_securable_type=stype, + on_securable_fullname=securable_fullname, + name=policy_name, + policy_info=policy_info, + update_mask=",".join(update_fields) if update_fields else None, + ) + + return { + "success": True, + "policy_name": policy_name, + "action": "updated", + "changes": changes, + "policy": _policy_to_dict(policy), + } + + +def delete_fgac_policy( + policy_name: str, + securable_type: str, + securable_fullname: str, + approval_token: str, +) -> Dict[str, Any]: + """ + Delete an FGAC policy. + + Requires a valid approval_token from preview_policy_changes() and + the caller must be a member of the configured admin group. + + This is irreversible. The policy will be permanently removed. 
+ + Args: + policy_name: Policy name + securable_type: "CATALOG", "SCHEMA", or "TABLE" + securable_fullname: Fully qualified securable name + approval_token: Token from preview_policy_changes() + + Returns: + Dict with deletion status + """ + stype = _validate_securable_type(securable_type) + current_params = { + "action": "delete", + "policy_name": policy_name, + "securable_type": stype, + "securable_fullname": securable_fullname, + } + _validate_approval_token(approval_token, current_params) + _check_admin_group() + + w = get_workspace_client() + w.policies.delete_policy( + on_securable_type=stype, + on_securable_fullname=securable_fullname, + name=policy_name, + ) + + return { + "success": True, + "policy_name": policy_name, + "action": "deleted", + } diff --git a/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py b/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py index a8a2d208..958684d8 100644 --- a/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py +++ b/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py @@ -12,7 +12,6 @@ from pathlib import Path from typing import List, Optional -from databricks.sdk.service.files import DirectoryEntry from ..auth import get_workspace_client diff --git a/databricks-tools-core/tests/conftest.py b/databricks-tools-core/tests/conftest.py index ab854a2b..6a8d4fbc 100644 --- a/databricks-tools-core/tests/conftest.py +++ b/databricks-tools-core/tests/conftest.py @@ -117,7 +117,9 @@ def warehouse_id(workspace_client: WorkspaceClient) -> str: Get a running SQL warehouse for tests. Prefers shared endpoints, falls back to any running warehouse. + Starts a stopped serverless warehouse if none are running. 
""" + import time from databricks.sdk.service.sql import State warehouses = list(workspace_client.warehouses.list()) @@ -134,6 +136,18 @@ def warehouse_id(workspace_client: WorkspaceClient) -> str: logger.info(f"Using warehouse: {w.name} ({w.id})") return w.id + # Start a stopped serverless warehouse + for w in warehouses: + if w.state == State.STOPPED and "serverless" in (w.name or "").lower(): + logger.info(f"Starting stopped serverless warehouse: {w.name} ({w.id})") + workspace_client.warehouses.start(w.id) + for _ in range(30): + wh = workspace_client.warehouses.get(w.id) + if wh.state == State.RUNNING: + logger.info(f"Warehouse started: {w.name} ({w.id})") + return w.id + time.sleep(10) + # No running warehouse found pytest.skip("No running SQL warehouse available for tests") diff --git a/databricks-tools-core/tests/integration/unity_catalog/conftest.py b/databricks-tools-core/tests/integration/unity_catalog/conftest.py index 5f495c30..339bc616 100644 --- a/databricks-tools-core/tests/integration/unity_catalog/conftest.py +++ b/databricks-tools-core/tests/integration/unity_catalog/conftest.py @@ -230,3 +230,39 @@ def register(full_function_name: str): delete_function(fn_name, force=True) except Exception as e: logger.warning(f"Failed to cleanup function {fn_name}: {e}") + + +@pytest.fixture(scope="function") +def cleanup_policies(): + """ + Track and cleanup FGAC policies created during tests. + + Usage: + def test_create_policy(cleanup_policies): + create_fgac_policy(...) + cleanup_policies((policy_name, securable_type, securable_fullname)) + """ + from databricks_tools_core.auth import get_workspace_client + + policies_to_cleanup = [] + + def register(policy_tuple: tuple): + """Register a policy for cleanup. 
Tuple: (name, securable_type, securable_fullname).""" + if policy_tuple not in policies_to_cleanup: + policies_to_cleanup.append(policy_tuple) + logger.info(f"Registered policy for cleanup: {policy_tuple[0]}") + + yield register + + # Use SDK directly to bypass approval token guardrails during cleanup + w = get_workspace_client() + for name, stype, sfullname in policies_to_cleanup: + try: + logger.info(f"Cleaning up policy: {name}") + w.policies.delete_policy( + on_securable_type=stype, + on_securable_fullname=sfullname, + name=name, + ) + except Exception as e: + logger.warning(f"Failed to cleanup policy {name}: {e}") diff --git a/databricks-tools-core/tests/integration/unity_catalog/test_fgac_policies.py b/databricks-tools-core/tests/integration/unity_catalog/test_fgac_policies.py new file mode 100644 index 00000000..bc6a506e --- /dev/null +++ b/databricks-tools-core/tests/integration/unity_catalog/test_fgac_policies.py @@ -0,0 +1,1190 @@ +""" +Integration tests for Unity Catalog FGAC Policy operations. + +Tests the fgac_policies module functions: +- list_fgac_policies +- get_fgac_policy +- get_table_policies +- get_masking_functions +- check_policy_quota +- preview_policy_changes +- create_fgac_policy / update_fgac_policy / delete_fgac_policy + +Governed Tags +------------- +FGAC policies require **governed tags** (not regular metadata tags). +The CRUD tests automatically create and clean up governed tags via the +Tag Policies API (``w.tag_policies``). No manual UI setup is needed. 
+""" + +import logging +import time + +import pytest + +from databricks_tools_core.auth import get_workspace_client +from databricks_tools_core.sql import execute_sql +from databricks_tools_core.unity_catalog import ( + create_security_function, + set_tags, +) +from databricks_tools_core.unity_catalog.fgac_policies import ( + list_fgac_policies, + get_fgac_policy, + get_table_policies, + get_masking_functions, + get_column_tags_api, + get_schema_info, + get_catalog_info, + list_table_policies_in_schema, + analyze_fgac_coverage, + check_policy_quota, + preview_policy_changes, + create_fgac_policy, + update_fgac_policy, + delete_fgac_policy, + _check_admin_group, +) + +logger = logging.getLogger(__name__) + +UC_TEST_PREFIX = "uc_test" + +# Governed tags need time to propagate to the FGAC policy system after creation. +TAG_PROPAGATION_DELAY_SECONDS = 30 + + +def _create_governed_tag(tag_key: str, allowed_values: list[str]) -> None: + """Create a governed tag via the Tag Policies API and wait for propagation.""" + from databricks.sdk.service.tags import TagPolicy, Value + + w = get_workspace_client() + w.tag_policies.create_tag_policy( + tag_policy=TagPolicy( + tag_key=tag_key, + description=f"Integration test tag ({tag_key})", + values=[Value(name=v) for v in allowed_values], + ) + ) + logger.info(f"Created governed tag: {tag_key} (values={allowed_values})") + + logger.info(f"Waiting {TAG_PROPAGATION_DELAY_SECONDS}s for governed tag propagation...") + time.sleep(TAG_PROPAGATION_DELAY_SECONDS) + + +def _delete_governed_tag(tag_key: str) -> None: + """Delete a governed tag via the Tag Policies API.""" + try: + w = get_workspace_client() + w.tag_policies.delete_tag_policy(tag_key=tag_key) + logger.info(f"Deleted governed tag: {tag_key}") + except Exception as e: + logger.warning(f"Failed to delete governed tag {tag_key}: {e}") + + +# --------------------------------------------------------------------------- +# Discovery tests +# 
--------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestListFgacPolicies: + """Tests for listing FGAC policies.""" + + def test_list_policies_on_catalog(self, test_catalog: str): + """Should list policies on a catalog (may be empty).""" + result = list_fgac_policies( + securable_type="CATALOG", + securable_fullname=test_catalog, + ) + + assert result["success"] is True + assert result["securable_type"] == "CATALOG" + assert result["securable_fullname"] == test_catalog + assert isinstance(result["policies"], list) + assert isinstance(result["policy_count"], int) + logger.info(f"Found {result['policy_count']} policies on catalog {test_catalog}") + + def test_list_policies_on_schema(self, test_catalog: str, uc_test_schema: str): + """Should list policies on a schema.""" + full_name = f"{test_catalog}.{uc_test_schema}" + result = list_fgac_policies( + securable_type="SCHEMA", + securable_fullname=full_name, + ) + + assert result["success"] is True + assert result["securable_type"] == "SCHEMA" + assert isinstance(result["policies"], list) + logger.info(f"Found {result['policy_count']} policies on schema {full_name}") + + def test_list_policies_with_type_filter(self, test_catalog: str): + """Should filter policies by type.""" + result = list_fgac_policies( + securable_type="CATALOG", + securable_fullname=test_catalog, + policy_type="COLUMN_MASK", + ) + + assert result["success"] is True + for p in result["policies"]: + # SDK returns POLICY_TYPE_COLUMN_MASK; accept both forms + assert p.get("policy_type") in ("COLUMN_MASK", "POLICY_TYPE_COLUMN_MASK") + logger.info(f"Found {result['policy_count']} COLUMN_MASK policies") + + def test_list_policies_without_inherited(self, test_catalog: str): + """Should list only direct policies when include_inherited=False.""" + result = list_fgac_policies( + securable_type="CATALOG", + securable_fullname=test_catalog, + include_inherited=False, + ) + + assert result["success"] is 
True + assert isinstance(result["policies"], list) + logger.info(f"Found {result['policy_count']} direct policies") + + +@pytest.mark.integration +class TestGetTablePolicies: + """Tests for getting column masks and row filters on a table.""" + + def test_get_table_policies(self, test_catalog: str, uc_test_schema: str, uc_test_table: str): + """Should return column masks and row filters for a table.""" + # uc_test_table is "catalog.schema.table" + parts = uc_test_table.split(".") + result = get_table_policies( + catalog=parts[0], + schema=parts[1], + table=parts[2], + ) + + assert result["success"] is True + assert result["table"] == uc_test_table + assert isinstance(result["column_masks"], list) + assert isinstance(result["row_filters"], list) + logger.info(f"Table {uc_test_table}: {len(result['column_masks'])} masks, {len(result['row_filters'])} filters") + + +@pytest.mark.integration +class TestGetMaskingFunctions: + """Tests for listing masking UDFs in a schema.""" + + def test_get_masking_functions( + self, + test_catalog: str, + uc_test_schema: str, + unique_name: str, + warehouse_id: str, + cleanup_functions, + ): + """Should list UDFs in the schema.""" + # Create a test function so there's at least one + fn_name = f"{test_catalog}.{uc_test_schema}.{UC_TEST_PREFIX}_mask_{unique_name}" + cleanup_functions(fn_name) + + create_security_function( + function_name=fn_name, + parameter_name="val", + parameter_type="STRING", + return_type="STRING", + function_body="RETURN CASE WHEN val IS NULL THEN NULL ELSE '***' END", + warehouse_id=warehouse_id, + ) + + result = get_masking_functions( + catalog=test_catalog, + schema=uc_test_schema, + ) + + assert result["success"] is True + assert result["catalog"] == test_catalog + assert result["schema"] == uc_test_schema + assert isinstance(result["functions"], list) + assert result["function_count"] > 0 + + # Verify our function appears + func_names = [f["name"] for f in result["functions"]] + expected_name = 
f"{UC_TEST_PREFIX}_mask_{unique_name}" + assert expected_name in func_names, f"Expected {expected_name} in {func_names}" + logger.info(f"Found {result['function_count']} functions in schema") + + +# --------------------------------------------------------------------------- +# Analysis & Discovery tests +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestGetColumnTagsApi: + """Tests for get_column_tags_api.""" + + def test_get_column_tags_api( + self, + test_catalog: str, + uc_test_schema: str, + uc_test_table: str, + unique_name: str, + warehouse_id: str, + ): + """Should return column tags for a table.""" + parts = uc_test_table.split(".") + tag_key = f"uc_test_tag_{unique_name}" + + # Tag a column so there's something to find + set_tags( + object_type="column", + full_name=uc_test_table, + column_name="email", + tags={tag_key: "test_val"}, + warehouse_id=warehouse_id, + ) + + result = get_column_tags_api( + catalog=parts[0], + schema=parts[1], + table=parts[2], + ) + + assert result["success"] is True + assert result["table"] == uc_test_table + assert isinstance(result["tags"], list) + logger.info(f"Found {len(result['tags'])} column tags on {uc_test_table}") + + +@pytest.mark.integration +class TestGetSchemaInfo: + """Tests for get_schema_info.""" + + def test_get_schema_info(self, test_catalog: str, uc_test_schema: str): + """Should return schema metadata.""" + result = get_schema_info( + catalog=test_catalog, + schema=uc_test_schema, + ) + + assert result["success"] is True + assert result["schema"]["name"] == uc_test_schema + assert result["schema"]["catalog_name"] == test_catalog + assert result["schema"]["full_name"] == f"{test_catalog}.{uc_test_schema}" + assert "owner" in result["schema"] + logger.info(f"Schema info: {result['schema']['full_name']} owned by {result['schema']['owner']}") + + +@pytest.mark.integration +class TestGetCatalogInfo: + """Tests for get_catalog_info.""" + + def 
test_get_catalog_info(self, test_catalog: str): + """Should return catalog metadata.""" + result = get_catalog_info(catalog=test_catalog) + + assert result["success"] is True + assert result["catalog"]["name"] == test_catalog + assert "owner" in result["catalog"] + logger.info(f"Catalog info: {result['catalog']['name']} owned by {result['catalog']['owner']}") + + +@pytest.mark.integration +class TestListTablePoliciesInSchema: + """Tests for list_table_policies_in_schema.""" + + def test_list_table_policies_in_schema(self, test_catalog: str, uc_test_schema: str, uc_test_table: str): + """Should list tables with their policies.""" + result = list_table_policies_in_schema( + catalog=test_catalog, + schema=uc_test_schema, + ) + + assert result["success"] is True + assert result["catalog"] == test_catalog + assert result["schema"] == uc_test_schema + assert isinstance(result["tables"], list) + assert result["table_count"] > 0 + + # Each table should have column_masks and row_filters keys + for t in result["tables"]: + assert "table" in t + assert "column_masks" in t + assert "row_filters" in t + logger.info(f"Found {result['table_count']} tables in {test_catalog}.{uc_test_schema}") + + +@pytest.mark.integration +class TestAnalyzeFgacCoverage: + """Tests for analyze_fgac_coverage.""" + + def test_analyze_coverage_schema_scope(self, test_catalog: str, uc_test_schema: str, uc_test_table: str): + """Should return coverage analysis for a schema.""" + result = analyze_fgac_coverage( + catalog=test_catalog, + schema=uc_test_schema, + ) + + assert result["success"] is True + assert result["scope"] == f"SCHEMA {test_catalog}.{uc_test_schema}" + + summary = result["summary"] + assert isinstance(summary["tables_scanned"], int) + assert isinstance(summary["tagged_columns"], int) + assert isinstance(summary["existing_policies"], int) + assert isinstance(summary["available_udfs"], int) + assert isinstance(summary["covered_tags"], list) + assert isinstance(summary["uncovered_tags"], 
list) + assert isinstance(result["gaps"], list) + assert isinstance(result["existing_policies"], list) + assert isinstance(result["available_udfs"], list) + logger.info( + f"Coverage analysis: {summary['tables_scanned']} tables, " + f"{summary['tagged_columns']} tagged cols, " + f"{summary['existing_policies']} policies, " + f"{len(result['gaps'])} gaps" + ) + + def test_analyze_coverage_catalog_scope(self, test_catalog: str, uc_test_schema: str, uc_test_table: str): + """Should return coverage analysis for an entire catalog.""" + result = analyze_fgac_coverage(catalog=test_catalog) + + assert result["success"] is True + assert result["scope"] == f"CATALOG {test_catalog}" + assert isinstance(result["summary"]["tables_scanned"], int) + assert isinstance(result["gaps"], list) + logger.info(f"Catalog coverage: {result['summary']['tables_scanned']} tables scanned") + + +# --------------------------------------------------------------------------- +# Quota check tests +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestCheckPolicyQuota: + """Tests for policy quota checking.""" + + def test_check_quota_on_catalog(self, test_catalog: str): + """Should return quota info for a catalog.""" + result = check_policy_quota( + securable_type="CATALOG", + securable_fullname=test_catalog, + ) + + assert result["success"] is True + assert result["securable_type"] == "CATALOG" + assert result["max"] == 10 + assert isinstance(result["current"], int) + assert isinstance(result["can_create"], bool) + logger.info(f"Catalog quota: {result['current']}/{result['max']}") + + def test_check_quota_on_schema(self, test_catalog: str, uc_test_schema: str): + """Should return quota info for a schema.""" + full_name = f"{test_catalog}.{uc_test_schema}" + result = check_policy_quota( + securable_type="SCHEMA", + securable_fullname=full_name, + ) + + assert result["success"] is True + assert result["max"] == 10 + logger.info(f"Schema 
quota: {result['current']}/{result['max']}") + + def test_check_quota_on_table(self, uc_test_table: str): + """Should return quota info for a table.""" + result = check_policy_quota( + securable_type="TABLE", + securable_fullname=uc_test_table, + ) + + assert result["success"] is True + assert result["max"] == 5 + logger.info(f"Table quota: {result['current']}/{result['max']}") + + +# --------------------------------------------------------------------------- +# Preview tests (no side effects) +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestPreviewPolicyChanges: + """Tests for preview_policy_changes (human-in-the-loop gate).""" + + def test_preview_create_column_mask(self): + """Should generate CREATE preview with SQL for a column mask.""" + result = preview_policy_changes( + action="CREATE", + policy_name="test_mask_ssn", + securable_type="SCHEMA", + securable_fullname="my_catalog.my_schema", + policy_type="COLUMN_MASK", + to_principals=["analysts"], + function_name="my_catalog.my_schema.mask_ssn", + tag_name="pii_type", + tag_value="ssn", + comment="Test mask SSN", + ) + + assert result["success"] is True + assert result["action"] == "CREATE" + assert result["requires_approval"] is True + + preview = result["preview"] + assert preview["policy_name"] == "test_mask_ssn" + assert preview["policy_type"] == "COLUMN_MASK" + assert "analysts" in preview["to_principals"] + assert "hasTagValue('pii_type', 'ssn')" in preview["tag_match"] + assert "COLUMN MASK" in preview["equivalent_sql"] + assert "MATCH COLUMNS" in preview["equivalent_sql"] + logger.info(f"Preview SQL:\n{preview['equivalent_sql']}") + + def test_preview_create_row_filter(self): + """Should generate CREATE preview with SQL for a row filter.""" + result = preview_policy_changes( + action="CREATE", + policy_name="test_filter_eu", + securable_type="CATALOG", + securable_fullname="my_catalog", + policy_type="ROW_FILTER", + 
to_principals=["us_team"], + function_name="my_catalog.my_schema.is_not_eu", + tag_name="region", + tag_value="eu", + ) + + assert result["success"] is True + preview = result["preview"] + assert preview["policy_type"] == "ROW_FILTER" + assert "ROW FILTER" in preview["equivalent_sql"] + assert "USING COLUMNS" in preview["equivalent_sql"] + logger.info(f"Preview SQL:\n{preview['equivalent_sql']}") + + def test_preview_create_with_has_tag(self): + """Should use hasTag when tag_value is omitted.""" + result = preview_policy_changes( + action="CREATE", + policy_name="test_mask_all_pii", + securable_type="SCHEMA", + securable_fullname="my_catalog.my_schema", + policy_type="COLUMN_MASK", + to_principals=["external_users"], + function_name="my_catalog.my_schema.mask_full", + tag_name="pii_type", + ) + + assert result["success"] is True + assert "hasTag('pii_type')" in result["preview"]["tag_match"] + logger.info("Preview uses hasTag (no tag_value)") + + def test_preview_delete(self): + """Should generate DELETE preview with DROP SQL.""" + result = preview_policy_changes( + action="DELETE", + policy_name="test_mask_ssn", + securable_type="SCHEMA", + securable_fullname="my_catalog.my_schema", + ) + + assert result["success"] is True + assert result["action"] == "DELETE" + assert "DROP POLICY" in result["preview"]["equivalent_sql"] + assert len(result["warnings"]) > 0 # Should warn about irreversibility + logger.info(f"Delete preview: {result['preview']['equivalent_sql']}") + + def test_preview_update(self): + """Should generate UPDATE preview.""" + result = preview_policy_changes( + action="UPDATE", + policy_name="test_mask_ssn", + securable_type="SCHEMA", + securable_fullname="my_catalog.my_schema", + to_principals=["analysts", "new_team"], + comment="Updated principals", + ) + + assert result["success"] is True + assert result["action"] == "UPDATE" + assert "to_principals" in result["preview"]["changes"] + assert "comment" in result["preview"]["changes"] + 
logger.info(f"Update preview changes: {result['preview']['changes']}") + + +# --------------------------------------------------------------------------- +# Validation tests (no Databricks connection needed) +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestFgacPolicyValidation: + """Tests for input validation in FGAC policy functions.""" + + def test_invalid_securable_type_raises(self): + """Should raise ValueError for invalid securable type.""" + with pytest.raises(ValueError) as exc_info: + list_fgac_policies( + securable_type="INVALID", + securable_fullname="test", + ) + + assert "invalid securable_type" in str(exc_info.value).lower() + + def test_invalid_policy_type_raises(self): + """Should raise ValueError for invalid policy type.""" + with pytest.raises(ValueError) as exc_info: + preview_policy_changes( + action="CREATE", + policy_name="test", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="INVALID", + to_principals=["x"], + function_name="fn", + tag_name="t", + ) + + assert "invalid policy_type" in str(exc_info.value).lower() + + def test_invalid_action_raises(self): + """Should raise ValueError for invalid action.""" + with pytest.raises(ValueError) as exc_info: + preview_policy_changes( + action="INVALID", + policy_name="test", + securable_type="SCHEMA", + securable_fullname="cat.sch", + ) + + assert "invalid action" in str(exc_info.value).lower() + + def test_create_preview_missing_policy_type_raises(self): + """Should raise ValueError when policy_type missing for CREATE.""" + with pytest.raises(ValueError) as exc_info: + preview_policy_changes( + action="CREATE", + policy_name="test", + securable_type="SCHEMA", + securable_fullname="cat.sch", + to_principals=["x"], + function_name="fn", + tag_name="t", + ) + + assert "policy_type" in str(exc_info.value).lower() + + def test_create_preview_missing_function_name_raises(self): + """Should raise ValueError when 
function_name missing for CREATE.""" + with pytest.raises(ValueError) as exc_info: + preview_policy_changes( + action="CREATE", + policy_name="test", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="COLUMN_MASK", + to_principals=["x"], + tag_name="t", + ) + + assert "function_name" in str(exc_info.value).lower() + + def test_create_preview_missing_tag_name_raises(self): + """Should raise ValueError when tag_name missing for CREATE.""" + with pytest.raises(ValueError) as exc_info: + preview_policy_changes( + action="CREATE", + policy_name="test", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="COLUMN_MASK", + to_principals=["x"], + function_name="fn", + ) + + assert "tag_name" in str(exc_info.value).lower() + + def test_create_preview_missing_principals_raises(self): + """Should raise ValueError when to_principals missing for CREATE.""" + with pytest.raises(ValueError) as exc_info: + preview_policy_changes( + action="CREATE", + policy_name="test", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="COLUMN_MASK", + function_name="fn", + tag_name="t", + ) + + assert "to_principals" in str(exc_info.value).lower() + + def test_invalid_identifier_raises(self): + """Should raise ValueError for SQL injection attempts.""" + with pytest.raises(ValueError) as exc_info: + list_fgac_policies( + securable_type="CATALOG", + securable_fullname="DROP TABLE; --", + ) + + assert "invalid sql identifier" in str(exc_info.value).lower() + + +# --------------------------------------------------------------------------- +# Approval token enforcement tests +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestApprovalTokenEnforcement: + """Tests for approval token guardrails on mutating operations.""" + + def test_create_without_token_raises(self): + """create_fgac_policy without approval_token should raise TypeError.""" + with pytest.raises(TypeError): 
+ create_fgac_policy( + policy_name="test_no_token", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="cat.sch", + function_name="cat.sch.fn", + to_principals=["analysts"], + tag_name="pii", + ) + + def test_create_with_invalid_token_raises_value_error(self): + """create_fgac_policy with an invalid token should raise ValueError before admin check.""" + with pytest.raises(ValueError, match="Malformed approval token"): + create_fgac_policy( + policy_name="test_bad_token", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="cat.sch", + function_name="cat.sch.fn", + to_principals=["analysts"], + tag_name="pii", + approval_token="garbage", + ) + + def test_create_without_admin_group_raises_permission_error(self): + """create_fgac_policy should raise PermissionError if user is not in admin group.""" + import databricks_tools_core.unity_catalog.fgac_policies as fgac_mod + + # Get a valid token via preview so we pass token validation + preview = preview_policy_changes( + action="CREATE", + policy_name="test_admin_check", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="COLUMN_MASK", + to_principals=["analysts"], + function_name="cat.sch.fn", + tag_name="pii", + ) + # Temporarily set admin group to a non-existent group so the check fails + original = fgac_mod._ADMIN_GROUP + try: + fgac_mod._ADMIN_GROUP = "nonexistent_group_xyz_12345" + with pytest.raises(PermissionError, match="not a member of admin group"): + create_fgac_policy( + policy_name="test_admin_check", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="cat.sch", + function_name="cat.sch.fn", + to_principals=["analysts"], + tag_name="pii", + approval_token=preview["approval_token"], + ) + finally: + fgac_mod._ADMIN_GROUP = original + + def test_preview_returns_approval_token(self): + """preview_policy_changes should return an approval_token.""" + result = preview_policy_changes( + action="CREATE", + 
policy_name="test_token_preview", + securable_type="SCHEMA", + securable_fullname="my_catalog.my_schema", + policy_type="COLUMN_MASK", + to_principals=["analysts"], + function_name="my_catalog.my_schema.mask_ssn", + tag_name="pii_type", + tag_value="ssn", + ) + + assert result["success"] is True + assert "approval_token" in result + assert isinstance(result["approval_token"], str) + assert ":" in result["approval_token"] + logger.info("Preview returned approval token") + + def test_full_preview_then_create_workflow( + self, + test_catalog: str, + uc_test_schema: str, + uc_test_table: str, + unique_name: str, + warehouse_id: str, + cleanup_functions, + cleanup_policies, + ): + """Should preview, extract token, then create with token (happy path).""" + full_schema = f"{test_catalog}.{uc_test_schema}" + policy_name = f"{UC_TEST_PREFIX}_tok_{unique_name}" + tag_key = f"uc_test_tok_{unique_name}" + tag_value = "email" + + cleanup_policies((policy_name, "SCHEMA", full_schema)) + + _create_governed_tag(tag_key, [tag_value]) + + try: + fn_name = f"{test_catalog}.{uc_test_schema}.{UC_TEST_PREFIX}_tok_fn_{unique_name}" + cleanup_functions(fn_name) + + create_security_function( + function_name=fn_name, + parameter_name="val", + parameter_type="STRING", + return_type="STRING", + function_body="RETURN CASE WHEN val IS NULL THEN NULL ELSE '***' END", + warehouse_id=warehouse_id, + ) + + set_tags( + object_type="column", + full_name=uc_test_table, + column_name="email", + tags={tag_key: tag_value}, + warehouse_id=warehouse_id, + ) + + # Preview to get token + preview = preview_policy_changes( + action="CREATE", + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + policy_type="COLUMN_MASK", + to_principals=["account users"], + function_name=fn_name, + tag_name=tag_key, + tag_value=tag_value, + comment=f"Token test {unique_name}", + ) + token = preview["approval_token"] + + # Create with token + result = create_fgac_policy( + 
policy_name=policy_name, + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname=full_schema, + function_name=fn_name, + to_principals=["account users"], + tag_name=tag_key, + approval_token=token, + tag_value=tag_value, + comment=f"Token test {unique_name}", + ) + + assert result["success"] is True + assert result["action"] == "created" + logger.info("Full preview-then-create workflow passed") + + # Clean up via SDK directly (bypass guardrails) + w = get_workspace_client() + w.policies.delete_policy( + on_securable_type="SCHEMA", + on_securable_fullname=full_schema, + name=policy_name, + ) + + finally: + _delete_governed_tag(tag_key) + + def test_token_with_mismatched_params_raises(self): + """Token from preview with name A should not work for create with name B.""" + preview = preview_policy_changes( + action="CREATE", + policy_name="policy_a", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="COLUMN_MASK", + to_principals=["analysts"], + function_name="cat.sch.mask", + tag_name="pii", + ) + token = preview["approval_token"] + + with pytest.raises((ValueError, PermissionError)): + create_fgac_policy( + policy_name="policy_b", # Different name! 
+ policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="cat.sch", + function_name="cat.sch.mask", + to_principals=["analysts"], + tag_name="pii", + approval_token=token, + ) + + def test_expired_token_raises(self): + """Token past TTL should be rejected.""" + import databricks_tools_core.unity_catalog.fgac_policies as fgac_mod + + preview = preview_policy_changes( + action="CREATE", + policy_name="test_expire", + securable_type="SCHEMA", + securable_fullname="cat.sch", + policy_type="COLUMN_MASK", + to_principals=["analysts"], + function_name="cat.sch.mask", + tag_name="pii", + ) + token = preview["approval_token"] + + # Temporarily set TTL to 0 so the token is already expired + original_ttl = fgac_mod._TOKEN_TTL_SECONDS + try: + fgac_mod._TOKEN_TTL_SECONDS = 0 + with pytest.raises(ValueError, match="Expired approval token"): + create_fgac_policy( + policy_name="test_expire", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="cat.sch", + function_name="cat.sch.mask", + to_principals=["analysts"], + tag_name="pii", + approval_token=token, + ) + finally: + fgac_mod._TOKEN_TTL_SECONDS = original_ttl + + def test_cross_action_replay_raises(self): + """DELETE preview token should not work for CREATE operation.""" + delete_preview = preview_policy_changes( + action="DELETE", + policy_name="test_replay", + securable_type="SCHEMA", + securable_fullname="cat.sch", + ) + token = delete_preview["approval_token"] + + with pytest.raises(ValueError, match="action mismatch"): + create_fgac_policy( + policy_name="test_replay", + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname="cat.sch", + function_name="cat.sch.mask", + to_principals=["analysts"], + tag_name="pii", + approval_token=token, + ) + + +# --------------------------------------------------------------------------- +# Admin group check tests +# --------------------------------------------------------------------------- + + 
+@pytest.mark.integration +class TestAdminGroupCheck: + """Tests for admin group membership verification.""" + + def test_admin_check_passes(self): + """Should pass for workspace admin user (test profile user).""" + result = _check_admin_group() + assert result["is_admin"] is True + assert result["user"] is not None + assert result["admin_group"] == "admins" + logger.info(f"Admin check passed for user: {result['user']}") + + def test_admin_check_custom_group_fails(self): + """Should raise PermissionError for a non-existent group.""" + import databricks_tools_core.unity_catalog.fgac_policies as fgac_mod + + original = fgac_mod._ADMIN_GROUP + try: + fgac_mod._ADMIN_GROUP = "nonexistent_group_xyz_12345" + with pytest.raises(PermissionError) as exc_info: + _check_admin_group() + assert "nonexistent_group_xyz_12345" in str(exc_info.value) + logger.info("Admin check correctly denied for non-existent group") + finally: + fgac_mod._ADMIN_GROUP = original + + +# --------------------------------------------------------------------------- +# CRUD lifecycle tests +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestFgacPolicyCRUD: + """Tests for create, get, update, and delete policy operations. + + Each test creates its own governed tag via the Tag Policies API, + then cleans it up afterwards. No manual UI setup is required. 
+ """ + + def test_create_get_update_delete_column_mask_policy( + self, + test_catalog: str, + uc_test_schema: str, + uc_test_table: str, + unique_name: str, + warehouse_id: str, + cleanup_functions, + cleanup_policies, + ): + """Should create, get, update, and delete a column mask policy.""" + full_schema = f"{test_catalog}.{uc_test_schema}" + policy_name = f"{UC_TEST_PREFIX}_mask_{unique_name}" + + # Unique governed tag for this test run + tag_key = f"uc_test_pii_{unique_name}" + tag_value = "email" + + # Register for cleanup + cleanup_policies((policy_name, "SCHEMA", full_schema)) + + # --- Setup: governed tag, masking UDF, column tag --- + _create_governed_tag(tag_key, [tag_value]) + + try: + fn_name = f"{test_catalog}.{uc_test_schema}.{UC_TEST_PREFIX}_mask_fn_{unique_name}" + cleanup_functions(fn_name) + + create_security_function( + function_name=fn_name, + parameter_name="val", + parameter_type="STRING", + return_type="STRING", + function_body="RETURN CASE WHEN val IS NULL THEN NULL ELSE '***' END", + warehouse_id=warehouse_id, + ) + logger.info(f"Created masking UDF: {fn_name}") + + # Apply governed tag to column + set_tags( + object_type="column", + full_name=uc_test_table, + column_name="email", + tags={tag_key: tag_value}, + warehouse_id=warehouse_id, + ) + logger.info(f"Tagged column email with {tag_key}={tag_value}") + + # --- PREVIEW CREATE --- + logger.info(f"Previewing FGAC policy creation: {policy_name}") + create_preview = preview_policy_changes( + action="CREATE", + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + policy_type="COLUMN_MASK", + to_principals=["account users"], + function_name=fn_name, + tag_name=tag_key, + tag_value=tag_value, + comment=f"Test policy {unique_name}", + ) + assert "approval_token" in create_preview + create_token = create_preview["approval_token"] + + # --- CREATE --- + logger.info(f"Creating FGAC policy: {policy_name}") + create_result = create_fgac_policy( + 
policy_name=policy_name, + policy_type="COLUMN_MASK", + securable_type="SCHEMA", + securable_fullname=full_schema, + function_name=fn_name, + to_principals=["account users"], + tag_name=tag_key, + approval_token=create_token, + tag_value=tag_value, + comment=f"Test policy {unique_name}", + ) + + assert create_result["success"] is True + assert create_result["policy_name"] == policy_name + assert create_result["action"] == "created" + logger.info(f"Policy created: {create_result['details']}") + + # --- GET --- + logger.info(f"Getting policy: {policy_name}") + get_result = get_fgac_policy( + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + ) + + assert get_result["success"] is True + assert get_result["policy"]["name"] == policy_name + logger.info(f"Policy details: {get_result['policy']}") + + # --- PREVIEW UPDATE --- + logger.info(f"Previewing update for: {policy_name}") + update_preview = preview_policy_changes( + action="UPDATE", + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + comment=f"Updated test policy {unique_name}", + ) + update_token = update_preview["approval_token"] + + # --- UPDATE --- + logger.info(f"Updating policy: {policy_name}") + update_result = update_fgac_policy( + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + approval_token=update_token, + comment=f"Updated test policy {unique_name}", + ) + + assert update_result["success"] is True + assert update_result["action"] == "updated" + assert "comment" in update_result["changes"] + logger.info(f"Policy updated: {update_result['changes']}") + + # --- Verify in list --- + list_result = list_fgac_policies( + securable_type="SCHEMA", + securable_fullname=full_schema, + ) + policy_names = [p.get("name") for p in list_result["policies"]] + assert policy_name in policy_names, f"Expected {policy_name} in {policy_names}" + logger.info(f"Policy found in list ({list_result['policy_count']} 
total)") + + # --- PREVIEW DELETE --- + logger.info(f"Previewing delete for: {policy_name}") + delete_preview = preview_policy_changes( + action="DELETE", + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + ) + delete_token = delete_preview["approval_token"] + + # --- DELETE --- + logger.info(f"Deleting policy: {policy_name}") + delete_result = delete_fgac_policy( + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + approval_token=delete_token, + ) + + assert delete_result["success"] is True + assert delete_result["action"] == "deleted" + logger.info("Policy deleted") + + finally: + _delete_governed_tag(tag_key) + + def test_create_row_filter_policy( + self, + test_catalog: str, + uc_test_schema: str, + uc_test_table: str, + unique_name: str, + warehouse_id: str, + cleanup_functions, + cleanup_policies, + ): + """Should create and delete a row filter policy.""" + full_schema = f"{test_catalog}.{uc_test_schema}" + policy_name = f"{UC_TEST_PREFIX}_filter_{unique_name}" + + # Unique governed tag for this test run + tag_key = f"uc_test_dept_{unique_name}" + tag_value = "filter" + + cleanup_policies((policy_name, "SCHEMA", full_schema)) + + # --- Setup: governed tag, zero-arg UDF, column tag --- + _create_governed_tag(tag_key, [tag_value]) + + try: + fn_name = f"{test_catalog}.{uc_test_schema}.{UC_TEST_PREFIX}_rf_fn_{unique_name}" + cleanup_functions(fn_name) + + execute_sql( + sql_query=f""" + CREATE OR REPLACE FUNCTION {fn_name}() + RETURNS BOOLEAN + RETURN TRUE + """, + warehouse_id=warehouse_id, + ) + logger.info(f"Created row filter UDF (0-arg): {fn_name}") + + # Apply governed tag to column + set_tags( + object_type="column", + full_name=uc_test_table, + column_name="department", + tags={tag_key: tag_value}, + warehouse_id=warehouse_id, + ) + logger.info(f"Tagged column department with {tag_key}={tag_value}") + + # Preview create + logger.info(f"Previewing row filter policy creation: 
{policy_name}") + create_preview = preview_policy_changes( + action="CREATE", + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + policy_type="ROW_FILTER", + to_principals=["account users"], + function_name=fn_name, + tag_name=tag_key, + tag_value=tag_value, + comment=f"Test row filter {unique_name}", + ) + create_token = create_preview["approval_token"] + + # Create row filter policy + logger.info(f"Creating row filter policy: {policy_name}") + result = create_fgac_policy( + policy_name=policy_name, + policy_type="ROW_FILTER", + securable_type="SCHEMA", + securable_fullname=full_schema, + function_name=fn_name, + to_principals=["account users"], + tag_name=tag_key, + approval_token=create_token, + tag_value=tag_value, + comment=f"Test row filter {unique_name}", + ) + + assert result["success"] is True + assert result["action"] == "created" + assert result["details"]["policy_type"] == "ROW_FILTER" + logger.info(f"Row filter policy created: {result['details']}") + + # Preview delete + delete_preview = preview_policy_changes( + action="DELETE", + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + ) + delete_token = delete_preview["approval_token"] + + # Delete policy + delete_fgac_policy( + policy_name=policy_name, + securable_type="SCHEMA", + securable_fullname=full_schema, + approval_token=delete_token, + ) + logger.info("Row filter policy deleted") + + finally: + _delete_governed_tag(tag_key) diff --git a/databricks-tools-core/uv.lock b/databricks-tools-core/uv.lock index 7238e81c..f3efe4b5 100644 --- a/databricks-tools-core/uv.lock +++ b/databricks-tools-core/uv.lock @@ -754,7 +754,7 @@ wheels = [ [[package]] name = "databricks-sdk" -version = "0.76.0" +version = "0.85.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "google-auth" }, @@ -763,9 +763,9 @@ dependencies = [ { name = "requests", version = "2.32.4", source = { registry = "https://pypi.org/simple" }, 
marker = "python_full_version < '3.9'" }, { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/70/82/5efcfdca8779c84b5c6f61cc110d0938c9818e422f55c36a68d96b98c61f/databricks_sdk-0.76.0.tar.gz", hash = "sha256:fcfce4561b090b3c8e9cac2101f549766d9fb3bece31bb5720571919fa37d210", size = 822376, upload-time = "2025-12-17T17:11:31.907Z" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/40/3941b6919c3854bd107e04be1686b3e0f1ce3ca4fbeea0c7fd81909bd90c/databricks_sdk-0.85.0.tar.gz", hash = "sha256:0b5f415fba69ea0c5bfc4d0b21cb3366c6b66f678e78e4b3c94cbcf2e9e0972f", size = 846275, upload-time = "2026-02-05T08:22:40.488Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/96/ee7742b94f996560c57d6fb8d2e10eab3c489e8a72187369ed0917baf8aa/databricks_sdk-0.76.0-py3-none-any.whl", hash = "sha256:6696dda22bc52c8f50a50d24e6ccd1c855f92c0f68f5afe4eb2e77d5b1b1a65f", size = 774688, upload-time = "2025-12-17T17:11:29.925Z" }, + { url = "https://files.pythonhosted.org/packages/e9/e8/1a3292820762a9b48c4774d2f9297b2e2c43319dc4b5d31a585fb76e3a05/databricks_sdk-0.85.0-py3-none-any.whl", hash = "sha256:2a2da176a55d55fb84696e0255520e99e838dd942b97b971dff724041fe00c64", size = 796888, upload-time = "2026-02-05T08:22:39.018Z" }, ] [[package]] @@ -804,7 +804,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" }, - { name = "databricks-sdk", specifier = ">=0.20.0" }, + { name = "databricks-sdk", specifier = ">=0.81.0" }, { name = "litellm", specifier = ">=1.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pymupdf", specifier = ">=1.24.0" },