diff --git a/.gitignore b/.gitignore index 385994fa..a170605d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Databricks AI Dev Kit .ai-dev-kit/ .claude/ - +.local # Python __pycache__/ diff --git a/.test/README.md b/.test/README.md index d5c8fe46..d2bbb2db 100644 --- a/.test/README.md +++ b/.test/README.md @@ -233,3 +233,17 @@ uv pip install -e ".test/" uv run pytest .test/tests/ uv run python .test/scripts/regression.py ``` + +--- + +## Troubleshooting + +### MLflow evaluation not returning results + +If `/skill-test mlflow` hangs or doesn't return results, run manually with debug logging: + +```bash +MLFLOW_LOG_LEVEL=DEBUG uv run python .test/scripts/mlflow_eval.py +``` + +This will show detailed MLflow API calls and help identify connection or authentication issues. diff --git a/.test/baselines/databricks-synthetic-data-gen/baseline.yaml b/.test/baselines/databricks-synthetic-data-gen/baseline.yaml new file mode 100644 index 00000000..b43273c8 --- /dev/null +++ b/.test/baselines/databricks-synthetic-data-gen/baseline.yaml @@ -0,0 +1,21 @@ +run_id: '20260303_071721' +created_at: '2026-03-03T07:17:21.838623' +skill_name: databricks-synthetic-data-gen +metrics: + pass_rate: 1.0 + total_tests: 4 + passed_tests: 4 + failed_tests: 0 +test_results: +- id: grp_20260302_113344 + passed: true + execution_mode: local +- id: gen_serverless_job_catalog_json_002 + passed: true + execution_mode: local +- id: grp_20260302_retail_csv_3tables_003 + passed: true + execution_mode: local +- id: grp_20260303_manufacturing_delta_streaming_004 + passed: true + execution_mode: local diff --git a/.test/scripts/mlflow_eval.py b/.test/scripts/mlflow_eval.py index caa2e45c..93278e4d 100755 --- a/.test/scripts/mlflow_eval.py +++ b/.test/scripts/mlflow_eval.py @@ -2,29 +2,65 @@ """Run MLflow evaluation for a skill. 
Usage: - python mlflow_eval.py [--filter-category ] [--run-name ] + python mlflow_eval.py [--filter-category ] [--run-name ] [--timeout ] Environment Variables: DATABRICKS_CONFIG_PROFILE - Databricks CLI profile (default: "DEFAULT") MLFLOW_TRACKING_URI - Set to "databricks" for Databricks MLflow MLFLOW_EXPERIMENT_NAME - Experiment path (e.g., "/Users/{user}/skill-test") + MLFLOW_LLM_JUDGE_TIMEOUT - Timeout in seconds for LLM judge evaluation (default: 120) """ +import os import sys +import signal import argparse +# Close stdin and disable tqdm progress bars when run non-interactively +# This fixes hanging issues with tqdm/MLflow progress bars in background tasks +if not sys.stdin.isatty(): + try: + sys.stdin.close() + sys.stdin = open(os.devnull, 'r') + except Exception: + pass + # Disable tqdm progress bars + os.environ.setdefault("TQDM_DISABLE", "1") + # Import common utilities from _common import setup_path, print_result, handle_error +class TimeoutException(Exception): + pass + + +def timeout_handler(signum, frame): + raise TimeoutException("MLflow evaluation timed out") + + def main(): parser = argparse.ArgumentParser(description="Run MLflow evaluation for a skill") parser.add_argument("skill_name", help="Name of skill to evaluate") parser.add_argument("--filter-category", help="Filter by test category") parser.add_argument("--run-name", help="Custom MLflow run name") + parser.add_argument( + "--timeout", + type=int, + default=120, + help="Timeout in seconds for evaluation (default: 120)", + ) args = parser.parse_args() setup_path() + # Set up signal-based timeout (Unix only) + if hasattr(signal, 'SIGALRM'): + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(args.timeout) + else: + # Windows: SIGALRM not available - no timeout enforcement + print("WARNING: Timeout not supported on Windows - test may run indefinitely", file=sys.stderr) + try: from skill_test.runners import evaluate_skill @@ -34,6 +70,10 @@ def main(): run_name=args.run_name, ) + # 
Cancel the alarm if we succeeded + if hasattr(signal, 'SIGALRM'): + signal.alarm(0) + # Convert to standard result format if result.get("run_id"): result["success"] = True @@ -42,7 +82,19 @@ def main(): sys.exit(print_result(result)) + except TimeoutException as e: + result = { + "success": False, + "skill_name": args.skill_name, + "error": f"Evaluation timed out after {args.timeout} seconds. This may indicate LLM judge endpoint issues.", + "error_type": "timeout", + } + sys.exit(print_result(result)) + except Exception as e: + # Cancel alarm on any exception + if hasattr(signal, 'SIGALRM'): + signal.alarm(0) sys.exit(handle_error(e, args.skill_name)) diff --git a/.test/skills/_routing/ground_truth.yaml b/.test/skills/_routing/ground_truth.yaml index a428d5b7..f9948433 100644 --- a/.test/skills/_routing/ground_truth.yaml +++ b/.test/skills/_routing/ground_truth.yaml @@ -99,7 +99,7 @@ test_cases: prompt: "Generate synthetic customer data and evaluate the agent quality with MLflow scorers" expectations: expected_skills: - - "databricks-synthetic-data-generation" + - "databricks-synthetic-data-gen" - "databricks-mlflow-evaluation" is_multi_skill: true metadata: diff --git a/.test/skills/databricks-synthetic-data-gen/candidates.yaml b/.test/skills/databricks-synthetic-data-gen/candidates.yaml new file mode 100644 index 00000000..c6c2a270 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/candidates.yaml @@ -0,0 +1,7 @@ +# Candidates for databricks-synthetic-data-gen skill +# Test cases pending review before promotion to ground_truth.yaml +# +# Use `/skill-test databricks-synthetic-data-gen add` to create new candidates +# Use `/skill-test databricks-synthetic-data-gen review` to promote candidates to ground truth + +candidates: [] diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt new file 
mode 100644 index 00000000..16065794 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt @@ -0,0 +1,331 @@ +Here's the Python script to generate product catalog data, followed by the serverless job definition: + +```python +""" +Synthetic Product Catalog Data Generation +Serverless Databricks Job with JSON Output + +Generates: +- products (3,000 rows): product_id, name, category (weighted), price (log-normal), inventory_count +- sales (10,000 rows): sale_id, product_id (FK), quantity, sale_date, discount_pct + +Popular product categories have more sales (weighted sampling). +""" + +import os +from pyspark.sql import SparkSession, functions as F +from pyspark.sql.window import Window +from pyspark.sql.types import StringType, DoubleType, IntegerType +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + +# ============================================================================= +# CONFIGURATION +# ============================================================================= +CATALOG = "my_catalog" +SCHEMA = "devkit_gen2_test_small_json" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +NUM_PRODUCTS = 3_000 +NUM_SALES = 10_000 +NUM_PARTITIONS = 16 + +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) +SEED = 42 + +# ============================================================================= +# SPARK SESSION +# ============================================================================= + +def is_databricks_runtime(): + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +if is_databricks_runtime(): + spark = SparkSession.builder.getOrCreate() + print("Running on Databricks Runtime") +else: + from databricks.connect import DatabricksSession + spark = DatabricksSession.builder.serverless(True).getOrCreate() + print("Running with Databricks Connect (serverless)") + +print("=" * 60) 
+print("PRODUCT CATALOG DATA GENERATION") +print("=" * 60) +print(f"Catalog: {CATALOG}") +print(f"Schema: {SCHEMA}") +print(f"Products: {NUM_PRODUCTS:,}") +print(f"Sales: {NUM_SALES:,}") +print("=" * 60) + +# ============================================================================= +# CREATE INFRASTRUCTURE +# ============================================================================= +print("\nCreating infrastructure...") +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") + +# ============================================================================= +# PANDAS UDFs +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_product_name(categories: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + + templates = { + "Electronics": ["Smart", "Wireless", "Digital", "Pro", "Ultra"], + "Home & Garden": ["Premium", "Deluxe", "Classic", "Modern", "Natural"], + "Clothing": ["Designer", "Casual", "Comfort", "Luxury", "Sport"], + "Sports": ["Pro", "Elite", "Performance", "Outdoor", "Active"], + "Books": ["Complete", "Essential", "Ultimate", "Practical", "Advanced"], + "Toys": ["Fun", "Creative", "Educational", "Super", "Magic"], + "Beauty": ["Natural", "Premium", "Radiance", "Pure", "Glow"], + "Automotive": ["Pro", "Heavy-Duty", "Premium", "Performance", "Ultra"], + } + + products = { + "Electronics": ["Headphones", "Speaker", "Charger", "Watch", "Camera"], + "Home & Garden": ["Lamp", "Planter", "Organizer", "Rug", "Vase"], + "Clothing": ["T-Shirt", "Jacket", "Pants", "Sweater", "Dress"], + "Sports": ["Ball", "Racket", "Mat", "Gloves", "Bag"], + "Books": ["Guide", "Handbook", "Manual", "Edition", "Collection"], + "Toys": ["Game", "Puzzle", "Building Set", "Robot", "Craft Kit"], + "Beauty": ["Serum", "Cream", "Lotion", "Mask", "Oil"], + "Automotive": ["Tool Kit", "Cover", "Mat", "Cleaner", "Polish"], + } + 
+ names = [] + for category in categories: + template_list = templates.get(category, ["Premium"]) + product_list = products.get(category, ["Item"]) + template = np.random.choice(template_list) + product = np.random.choice(product_list) + color = fake.color_name() + names.append(f"{template} {color} {product}") + + return pd.Series(names) + + +@F.pandas_udf(DoubleType()) +def generate_price(categories: pd.Series) -> pd.Series: + price_params = { + "Electronics": (4.5, 0.8), + "Home & Garden": (3.8, 0.7), + "Clothing": (3.5, 0.6), + "Sports": (4.0, 0.7), + "Books": (2.8, 0.4), + "Toys": (3.2, 0.6), + "Beauty": (3.3, 0.5), + "Automotive": (4.2, 0.8), + } + + prices = [] + for category in categories: + mu, sigma = price_params.get(category, (3.5, 0.6)) + price = float(np.random.lognormal(mu, sigma)) + price = round(price) - 0.01 if price > 1 else round(price, 2) + prices.append(max(0.99, price)) + + return pd.Series(prices) + + +@F.pandas_udf(IntegerType()) +def generate_inventory(ids: pd.Series) -> pd.Series: + inventory = (np.random.pareto(a=2.0, size=len(ids)) + 1) * 20 + return pd.Series(inventory.astype(int)) + + +# ============================================================================= +# GENERATE PRODUCTS TABLE (Master) +# ============================================================================= +print(f"\nGenerating {NUM_PRODUCTS:,} products...") + +products_df = ( + spark.range(0, NUM_PRODUCTS, numPartitions=NUM_PARTITIONS) + .select( + F.concat(F.lit("PROD-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("product_id"), + F.col("id").alias("_idx"), + ) + .withColumn( + "category", + F.when(F.rand(SEED) < 0.25, "Electronics") + .when(F.rand(SEED + 1) < 0.45, "Home & Garden") + .when(F.rand(SEED + 2) < 0.63, "Clothing") + .when(F.rand(SEED + 3) < 0.75, "Sports") + .when(F.rand(SEED + 4) < 0.85, "Books") + .when(F.rand(SEED + 5) < 0.93, "Toys") + .when(F.rand(SEED + 6) < 0.98, "Beauty") + .otherwise("Automotive") + ) + .withColumn("name", 
fake_product_name(F.col("category"))) + .withColumn("price", generate_price(F.col("category"))) + .withColumn("inventory_count", generate_inventory(F.col("_idx"))) + .drop("_idx") +) + +products_final = products_df.select( + "product_id", "name", "category", "price", "inventory_count" +) + +products_json_path = f"{VOLUME_PATH}/products" +print(f"Writing products to {products_json_path}...") +products_final.write.mode("overwrite").json(products_json_path) + +products_for_fk = spark.read.json(products_json_path).select("product_id", "category") +product_count = products_for_fk.count() +print(f"Products written: {product_count:,}") + +# ============================================================================= +# GENERATE SALES TABLE (with Referential Integrity) +# ============================================================================= +print(f"\nGenerating {NUM_SALES:,} sales with referential integrity...") + +product_weights = products_for_fk.select( + "product_id", + "category", + F.when(F.col("category") == "Electronics", 3.0) + .when(F.col("category") == "Home & Garden", 2.5) + .when(F.col("category") == "Clothing", 2.0) + .when(F.col("category") == "Sports", 1.5) + .when(F.col("category") == "Books", 1.2) + .when(F.col("category") == "Toys", 1.0) + .when(F.col("category") == "Beauty", 0.8) + .otherwise(0.5).alias("weight") +) + +weighted_products = ( + product_weights + .select( + F.col("product_id"), + F.col("category"), + F.explode(F.array_repeat(F.col("product_id"), F.col("weight").cast("int"))).alias("_dup") + ) + .drop("_dup") +) + +sampled_products = ( + weighted_products + .orderBy(F.rand(SEED + 20)) + .limit(NUM_SALES) + .withColumn("sale_rank", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) + .select("product_id", "category", "sale_rank") +) + +sales_base = ( + spark.range(0, NUM_SALES, numPartitions=NUM_PARTITIONS) + .withColumn("sale_rank", F.row_number().over(Window.orderBy(F.col("id")))) + .withColumn( + "sale_id", + 
F.concat(F.lit("SALE-"), F.lpad(F.col("id").cast("string"), 6, "0")) + ) +) + +sales_with_products = sales_base.join( + sampled_products, + on="sale_rank", + how="inner" +) + +sales_df = ( + sales_with_products + .withColumn( + "quantity", + F.when(F.rand(SEED + 21) < 0.60, 1) + .when(F.rand(SEED + 22) < 0.85, 2) + .when(F.rand(SEED + 23) < 0.95, 3) + .otherwise(F.floor(F.rand(SEED + 24) * 5 + 4).cast("int")) + ) + .withColumn( + "sale_date", + F.date_add(F.lit(START_DATE.date()), (F.rand(SEED + 25) * 180).cast("int")) + ) + .withColumn( + "discount_pct", + F.when(F.rand(SEED + 26) < 0.70, 0.0) + .when(F.rand(SEED + 27) < 0.85, 0.10) + .when(F.rand(SEED + 28) < 0.95, 0.20) + .otherwise(0.30) + ) +) + +sales_final = sales_df.select( + "sale_id", "product_id", "quantity", "sale_date", "discount_pct" +) + +sales_json_path = f"{VOLUME_PATH}/sales" +print(f"Writing sales to {sales_json_path}...") +sales_final.write.mode("overwrite").json(sales_json_path) + +sales_count = spark.read.json(sales_json_path).count() +print(f"Sales written: {sales_count:,}") + +# ============================================================================= +# VALIDATION +# ============================================================================= +print("\n" + "=" * 60) +print("VALIDATION") +print("=" * 60) + +products_check = spark.read.json(products_json_path) +sales_check = spark.read.json(sales_json_path) + +print(f"\nRow counts:") +print(f" Products: {products_check.count():,}") +print(f" Sales: {sales_check.count():,}") + +print(f"\nSales per category (Electronics should have most):") +sales_with_category = sales_check.join( + products_check.select("product_id", "category"), + on="product_id" +) +sales_with_category.groupBy("category").agg( + F.count("*").alias("total_sales"), + F.sum("quantity").alias("total_quantity") +).orderBy(F.desc("total_sales")).show() + +orphan_sales = sales_check.join( + products_check.select("product_id"), + on="product_id", + how="left_anti" +).count() 
+print(f"\nReferential integrity:") +print(f" Orphan sales (no matching product): {orphan_sales}") + +print("\n" + "=" * 60) +print("GENERATION COMPLETE") +print("=" * 60) +print(f"\nOutput location: {VOLUME_PATH}") +``` + +To run this as a serverless job, create the job with this configuration: + +```json +{ + "name": "generate_product_catalog_data", + "tasks": [ + { + "task_key": "generate_data", + "spark_python_task": { + "python_file": "/Workspace/Users/your_email/generate_product_catalog_data.py" + }, + "environment_key": "default" + } + ], + "environments": [ + { + "environment_key": "default", + "spec": { + "client": "4", + "dependencies": [ + "faker", + "pandas", + "numpy" + ] + } + } + ] +} +``` diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt new file mode 100644 index 00000000..39997ba5 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt @@ -0,0 +1,233 @@ +I'll generate the e-commerce data locally using Polars and Faker, then upload to Unity Catalog. 
+ +## Generation Plan + +**Output Location:** `ai_dev_kit.devkit_gen1_test_small_parquet` +**Volume:** `/Volumes/devkit/devkit_gen1_test_small_parquet/raw_data/` + +| Table | Columns | Rows | +|-------|---------|------| +| customers | customer_id, name, email, tier, region, created_at | 5,000 | +| orders | order_id, customer_id (FK), amount, order_date, status | 15,000 | + +**Order Distribution by Tier:** +- Enterprise (10%): ~8 orders per customer average +- Pro (30%): ~3 orders per customer average +- Free (60%): ~1.5 orders per customer average + +Here's the complete script: + +```python +import polars as pl +import numpy as np +from faker import Faker +from datetime import datetime, timedelta +import random +from databricks.sdk import WorkspaceClient + +# Configuration +CATALOG = "devkit" +SCHEMA = "devkit_gen1_test_small_parquet" +NUM_CUSTOMERS = 5000 +NUM_ORDERS = 15000 + +fake = Faker() +Faker.seed(42) +np.random.seed(42) +random.seed(42) + +# Date ranges +END_DATE = datetime.now() +CUSTOMER_START_DATE = END_DATE - timedelta(days=730) # 2 years +ORDER_START_DATE = END_DATE - timedelta(days=180) # 6 months + +# ============================================================================= +# Generate Customers +# ============================================================================= +print("Generating customers...") + +# Tier distribution: Free 60%, Pro 30%, Enterprise 10% +tier_weights = [0.6, 0.3, 0.1] +tiers = random.choices(["Free", "Pro", "Enterprise"], weights=tier_weights, k=NUM_CUSTOMERS) + +# Region distribution +region_weights = [0.4, 0.3, 0.2, 0.1] +regions = random.choices(["US-East", "US-West", "EU", "APAC"], weights=region_weights, k=NUM_CUSTOMERS) + +customers = pl.DataFrame({ + "customer_id": [f"CUST-{i:05d}" for i in range(NUM_CUSTOMERS)], + "name": [fake.name() for _ in range(NUM_CUSTOMERS)], + "email": [fake.email() for _ in range(NUM_CUSTOMERS)], + "tier": tiers, + "region": regions, + "created_at": [ + 
fake.date_time_between(start_date=CUSTOMER_START_DATE, end_date=END_DATE) + for _ in range(NUM_CUSTOMERS) + ], +}) + +print(f"Generated {len(customers)} customers") +print(customers.group_by("tier").len().sort("tier")) + +# ============================================================================= +# Generate Orders with Tier-Weighted Distribution +# ============================================================================= +print("\nGenerating orders with tier-weighted distribution...") + +# Calculate orders per tier to achieve ~15,000 total with weighted distribution +# Enterprise: ~8 orders/customer, Pro: ~3, Free: ~1.5 +tier_counts = customers.group_by("tier").len() +enterprise_count = tier_counts.filter(pl.col("tier") == "Enterprise")["len"][0] +pro_count = tier_counts.filter(pl.col("tier") == "Pro")["len"][0] +free_count = tier_counts.filter(pl.col("tier") == "Free")["len"][0] + +# Target distribution +enterprise_orders_per = 8 +pro_orders_per = 3 +free_orders_per = 1.5 + +total_target = enterprise_count * enterprise_orders_per + pro_count * pro_orders_per + free_count * free_orders_per +scale = NUM_ORDERS / total_target + +enterprise_orders = int(enterprise_count * enterprise_orders_per * scale) +pro_orders = int(pro_count * pro_orders_per * scale) +free_orders = NUM_ORDERS - enterprise_orders - pro_orders + +# Get customer IDs by tier +enterprise_customers = customers.filter(pl.col("tier") == "Enterprise")["customer_id"].to_list() +pro_customers = customers.filter(pl.col("tier") == "Pro")["customer_id"].to_list() +free_customers = customers.filter(pl.col("tier") == "Free")["customer_id"].to_list() + +# Generate customer_ids for orders +order_customer_ids = ( + random.choices(enterprise_customers, k=enterprise_orders) + + random.choices(pro_customers, k=pro_orders) + + random.choices(free_customers, k=free_orders) +) +random.shuffle(order_customer_ids) + +# Create customer tier lookup +customer_tier_map = dict(zip(customers["customer_id"].to_list(), 
customers["tier"].to_list())) + +# Generate amounts based on tier (log-normal distribution) +def generate_amount(tier: str) -> float: + if tier == "Enterprise": + return round(float(np.random.lognormal(6.2, 0.8)), 2) # mean ~$500 + elif tier == "Pro": + return round(float(np.random.lognormal(5.0, 0.7)), 2) # mean ~$150 + else: + return round(float(np.random.lognormal(3.9, 0.6)), 2) # mean ~$50 + +amounts = [generate_amount(customer_tier_map[cid]) for cid in order_customer_ids] + +# Status distribution: delivered 65%, shipped 15%, processing 10%, pending 5%, cancelled 5% +statuses = random.choices( + ["delivered", "shipped", "processing", "pending", "cancelled"], + weights=[0.65, 0.15, 0.10, 0.05, 0.05], + k=NUM_ORDERS +) + +orders = pl.DataFrame({ + "order_id": [f"ORD-{i:06d}" for i in range(NUM_ORDERS)], + "customer_id": order_customer_ids, + "amount": amounts, + "order_date": [ + fake.date_between(start_date=ORDER_START_DATE.date(), end_date=END_DATE.date()) + for _ in range(NUM_ORDERS) + ], + "status": statuses, +}) + +print(f"Generated {len(orders)} orders") + +# Verify tier-weighted distribution +orders_with_tier = orders.join( + customers.select(["customer_id", "tier"]), + on="customer_id" +) +print("\nOrders per tier:") +print(orders_with_tier.group_by("tier").len().sort("tier")) +print("\nAverage orders per customer by tier:") +print( + orders_with_tier.group_by("tier") + .agg(pl.len().alias("total_orders")) + .join(tier_counts, on="tier") + .with_columns((pl.col("total_orders") / pl.col("len")).alias("avg_orders_per_customer")) + .select(["tier", "avg_orders_per_customer"]) + .sort("tier") +) + +# ============================================================================= +# Save Locally as Parquet +# ============================================================================= +print("\nSaving to local Parquet files...") +local_path = "synthetic_data_output" +import os +os.makedirs(local_path, exist_ok=True) + 
+customers.write_parquet(f"{local_path}/customers.parquet") +orders.write_parquet(f"{local_path}/orders.parquet") +print(f"Saved to {local_path}/") + +# ============================================================================= +# Upload to Unity Catalog Volume +# ============================================================================= +print("\nUploading to Unity Catalog...") + +w = WorkspaceClient() + +# Create schema and volume if needed +from databricks.sdk.service.catalog import VolumeType + +try: + w.schemas.create(name=SCHEMA, catalog_name=CATALOG) + print(f"Created schema {CATALOG}.{SCHEMA}") +except Exception as e: + if "SCHEMA_ALREADY_EXISTS" in str(e): + print(f"Schema {CATALOG}.{SCHEMA} already exists") + else: + raise + +try: + w.volumes.create( + catalog_name=CATALOG, + schema_name=SCHEMA, + name="raw_data", + volume_type=VolumeType.MANAGED + ) + print(f"Created volume {CATALOG}.{SCHEMA}.raw_data") +except Exception as e: + if "already exists" in str(e).lower(): + print(f"Volume {CATALOG}.{SCHEMA}.raw_data already exists") + else: + raise + +# Upload files +volume_path = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +with open(f"{local_path}/customers.parquet", "rb") as f: + w.files.upload(f"{volume_path}/customers.parquet", f, overwrite=True) +print(f"Uploaded customers.parquet to {volume_path}/") + +with open(f"{local_path}/orders.parquet", "rb") as f: + w.files.upload(f"{volume_path}/orders.parquet", f, overwrite=True) +print(f"Uploaded orders.parquet to {volume_path}/") + +print(f"\n✅ Complete! Data available at:") +print(f" {volume_path}/customers.parquet") +print(f" {volume_path}/orders.parquet") +``` + +This script: +1. Generates 5,000 customers with weighted tier distribution (Free 60%, Pro 30%, Enterprise 10%) +2. Generates 15,000 orders with tier-weighted customer assignment (Enterprise ~8x, Pro ~3x, Free ~1.5x orders per customer) +3. Uses log-normal distributions for realistic order amounts by tier +4. 
Saves locally as Parquet files +5. Creates the schema and volume in Unity Catalog if needed +6. Uploads the Parquet files to the volume + +Run with: +```bash +uv pip install polars faker numpy databricks-sdk +uv run python generate_ecommerce_data.py +``` diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt new file mode 100644 index 00000000..55be3712 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt @@ -0,0 +1,297 @@ +Here's the complete script to generate synthetic retail order data with 3 tables and full referential integrity: + +```python +""" +Synthetic Retail Order Data Generation +====================================== +3 related tables with full referential integrity: +- customers (10,000 rows) +- orders (50,000 rows) +- line_items (150,000 rows) + +Output: CSV files with headers to Unity Catalog volume +Execution: Databricks Connect with serverless compute +""" + +import os +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType, IntegerType +from pyspark.sql.window import Window +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + +# ============================================================================= +# CONFIGURATION +# ============================================================================= +CATALOG = "ai_dev_kit" +SCHEMA = "devkit_gen3_test_medium_csv" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +NUM_CUSTOMERS = 10_000 +NUM_ORDERS = 50_000 +NUM_LINE_ITEMS = 150_000 +NUM_PARTITIONS = 32 +SEED = 42 + +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) + +# ============================================================================= +# SPARK SESSION +# 
============================================================================= +def is_databricks_runtime(): + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +if is_databricks_runtime(): + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + print("Running on Databricks Runtime") +else: + from databricks.connect import DatabricksSession, DatabricksEnv + + # Install dependencies on serverless cluster + env = DatabricksEnv().withDependencies("faker", "pandas", "numpy") + spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + print("Running with Databricks Connect (serverless with managed dependencies)") + +# ============================================================================= +# CREATE INFRASTRUCTURE +# ============================================================================= +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") + +# ============================================================================= +# PANDAS UDFs +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.name() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_email(names: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + emails = [] + for name in names: + parts = name.lower().split() + if len(parts) >= 2: + email = f"{parts[0]}.{parts[-1]}@{fake.free_email_domain()}" + else: + email = f"{parts[0]}{np.random.randint(100, 999)}@{fake.free_email_domain()}" + emails.append(email) + return pd.Series(emails) + +@F.pandas_udf(StringType()) +def fake_product_name(ids: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + product_types = ["Chair", "Table", "Lamp", "Desk", "Shelf", "Cabinet", "Sofa", 
"Rug", + "Mirror", "Clock", "Vase", "Frame", "Pillow", "Blanket", "Candle", + "Mug", "Bowl", "Plate", "Glass", "Bottle", "Box", "Bag", "Hat", + "Watch", "Headphones", "Speaker", "Charger", "Cable", "Case"] + products = [] + for _ in range(len(ids)): + color = fake.color_name() + adj = fake.word().capitalize() + product = np.random.choice(product_types) + products.append(f"{color} {adj} {product}") + return pd.Series(products) + +@F.pandas_udf(DoubleType()) +def generate_unit_price(ids: pd.Series) -> pd.Series: + """Log-normal unit prices (median ~$35, range $5-$500)""" + prices = np.random.lognormal(mean=3.5, sigma=0.7, size=len(ids)) + prices = np.clip(prices, 5.0, 500.0) + return pd.Series(np.round(prices, 2)) + +# ============================================================================= +# GENERATE CUSTOMERS TABLE +# ============================================================================= +customers_df = ( + spark.range(0, NUM_CUSTOMERS, numPartitions=NUM_PARTITIONS) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + F.col("id").alias("_idx") + ) + .withColumn("name", fake_name(F.col("_idx"))) + .withColumn("email", fake_email(F.col("name"))) + .withColumn("membership_level", + F.when(F.rand(SEED) < 0.50, "Bronze") + .when(F.rand(SEED + 1) < 0.80, "Silver") + .when(F.rand(SEED + 2) < 0.95, "Gold") + .otherwise("Platinum") + ) + .withColumn("region", + F.when(F.rand(SEED + 3) < 0.30, "US-East") + .when(F.rand(SEED + 4) < 0.55, "US-West") + .when(F.rand(SEED + 5) < 0.80, "EU") + .when(F.rand(SEED + 6) < 0.95, "APAC") + .otherwise("Other") + ) + .drop("_idx") +) + +# Write to temp Delta table (no .cache() on serverless) +customers_tmp = f"{CATALOG}.{SCHEMA}._tmp_customers" +customers_df.write.mode("overwrite").saveAsTable(customers_tmp) +customers_df = spark.table(customers_tmp) + +# ============================================================================= +# GENERATE ORDERS TABLE (weighted by 
membership level) +# ============================================================================= +customer_weights = customers_df.select( + "customer_id", + "membership_level", + F.when(F.col("membership_level") == "Platinum", 10.0) + .when(F.col("membership_level") == "Gold", 7.0) + .when(F.col("membership_level") == "Silver", 5.0) + .otherwise(3.0).alias("weight") +) + +weighted_customers = ( + customer_weights + .withColumn("replicate_count", (F.col("weight") * 2).cast("int")) + .select( + F.col("customer_id"), + F.explode(F.array_repeat(F.col("customer_id"), F.col("replicate_count"))).alias("_dup") + ) + .drop("_dup") +) + +sampled_customers = ( + weighted_customers + .orderBy(F.rand(SEED + 10)) + .limit(NUM_ORDERS) + .withColumn("_row", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) + .select("customer_id", "_row") +) + +orders_base = ( + spark.range(0, NUM_ORDERS, numPartitions=NUM_PARTITIONS) + .withColumn("order_id", + F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0"))) + .withColumn("_row", F.row_number().over(Window.orderBy(F.col("id")))) +) + +orders_df = ( + orders_base + .join(sampled_customers, on="_row", how="inner") + .drop("_row", "id") + .withColumn("order_date", + F.date_add(F.lit(START_DATE.date()), (F.rand(SEED + 11) * 180).cast("int"))) + .withColumn("status", + F.when(F.rand(SEED + 12) < 0.60, "Delivered") + .when(F.rand(SEED + 13) < 0.78, "Shipped") + .when(F.rand(SEED + 14) < 0.90, "Processing") + .when(F.rand(SEED + 15) < 0.96, "Pending") + .otherwise("Cancelled") + ) + .withColumn("total_amount", F.lit(0.0)) +) + +orders_tmp = f"{CATALOG}.{SCHEMA}._tmp_orders" +orders_df.write.mode("overwrite").saveAsTable(orders_tmp) +orders_df = spark.table(orders_tmp) + +# ============================================================================= +# GENERATE LINE_ITEMS TABLE +# ============================================================================= +order_ids = orders_df.select("order_id") + 
+items_per_order = ( + order_ids + .withColumn("num_items", + F.when(F.rand(SEED + 20) < 0.10, 1) + .when(F.rand(SEED + 21) < 0.25, 2) + .when(F.rand(SEED + 22) < 0.45, 3) + .when(F.rand(SEED + 23) < 0.65, 4) + .when(F.rand(SEED + 24) < 0.80, 5) + .when(F.rand(SEED + 25) < 0.90, 6) + .when(F.rand(SEED + 26) < 0.96, 7) + .otherwise(8) + ) +) + +line_items_base = ( + items_per_order + .select( + F.col("order_id"), + F.explode(F.sequence(F.lit(1), F.col("num_items"))).alias("item_seq") + ) + .withColumn("_idx", F.monotonically_increasing_id()) +) + +line_items_df = ( + line_items_base + .limit(NUM_LINE_ITEMS) + .withColumn("line_item_id", + F.concat(F.lit("LI-"), F.lpad(F.col("_idx").cast("string"), 7, "0"))) + .withColumn("product_name", fake_product_name(F.col("_idx"))) + .withColumn("quantity", + F.when(F.rand(SEED + 30) < 0.50, 1) + .when(F.rand(SEED + 31) < 0.80, 2) + .when(F.rand(SEED + 32) < 0.92, 3) + .when(F.rand(SEED + 33) < 0.97, 4) + .otherwise(5) + ) + .withColumn("unit_price", generate_unit_price(F.col("_idx"))) + .drop("_idx", "item_seq") +) + +line_items_tmp = f"{CATALOG}.{SCHEMA}._tmp_line_items" +line_items_df.write.mode("overwrite").saveAsTable(line_items_tmp) +line_items_df = spark.table(line_items_tmp) + +# ============================================================================= +# COMPUTE ORDER TOTALS FROM LINE ITEMS +# ============================================================================= +order_totals = ( + line_items_df + .withColumn("line_total", F.col("quantity") * F.col("unit_price")) + .groupBy("order_id") + .agg(F.round(F.sum("line_total"), 2).alias("computed_total")) +) + +orders_final = ( + spark.table(orders_tmp) + .drop("total_amount") + .join(order_totals, on="order_id", how="left") + .withColumn("total_amount", F.coalesce(F.col("computed_total"), F.lit(0.0))) + .drop("computed_total") +) + +# ============================================================================= +# EXPORT TO CSV +# 
============================================================================= +customers_final = spark.table(customers_tmp).select( + "customer_id", "name", "email", "membership_level", "region" +) + +orders_out = orders_final.select( + "order_id", "customer_id", "order_date", "total_amount", "status" +) + +line_items_out = spark.table(line_items_tmp).select( + "line_item_id", "order_id", "product_name", "quantity", "unit_price" +) + +customers_final.coalesce(1).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/customers") +orders_out.coalesce(1).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/orders") +line_items_out.coalesce(4).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/line_items") + +# Cleanup temp tables +spark.sql(f"DROP TABLE IF EXISTS {customers_tmp}") +spark.sql(f"DROP TABLE IF EXISTS {orders_tmp}") +spark.sql(f"DROP TABLE IF EXISTS {line_items_tmp}") +``` + +Run with (requires Python 3.12+): +```bash +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas +uv run python generate_retail_data.py +``` diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt new file mode 100644 index 00000000..bd6f8ef4 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt @@ -0,0 +1,235 @@ +Here's the complete script to generate manufacturing IoT sensor data with streaming-ready batch processing: + +```python +""" +Manufacturing IoT Sensor Data Generation +======================================== +2 related tables with referential integrity: +- sensor_devices (500 rows): Slowly changing dimension +- sensor_readings (10,000 rows per batch): Streaming incremental + +Output: Delta tables registered in Unity Catalog +Execution: Databricks 
Connect with serverless compute (Python 3.12+) +""" + +from databricks.connect import DatabricksSession, DatabricksEnv +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType, TimestampType +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + +# Configuration +CATALOG = "ai_dev_kit" +SCHEMA = "devkit_gen4_test_large_delta" + +# Row counts +NUM_DEVICES = 500 +NUM_READINGS_PER_BATCH = 10000 + +# Date range for readings (last 24 hours for streaming simulation) +END_TIME = datetime.now() +START_TIME = END_TIME - timedelta(hours=24) + +# Setup with managed dependencies (databricks-connect 16.4+) +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy") +spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + +print(f"Connected to Databricks serverless compute") +print(f"Output location: {CATALOG}.{SCHEMA}") + +# Create schema if not exists +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +print(f"Schema {CATALOG}.{SCHEMA} ready") + + +# ============================================================================= +# Pandas UDFs for data generation +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_device_name(ids: pd.Series) -> pd.Series: + """Generate realistic device names like 'SENS-T-00001'.""" + from faker import Faker + fake = Faker() + Faker.seed(42) + + prefixes = {"temperature": "T", "pressure": "P", "vibration": "V", "humidity": "H"} + names = [] + for i, _ in enumerate(ids): + type_key = list(prefixes.keys())[i % 4] + prefix = prefixes[type_key] + names.append(f"SENS-{prefix}-{str(i).zfill(5)}") + return pd.Series(names) + + +@F.pandas_udf(DoubleType()) +def generate_sensor_value(device_types: pd.Series) -> pd.Series: + """Generate realistic sensor values based on device type.""" + values = [] + for dtype in device_types: + if dtype == "temperature": + 
values.append(float(np.random.normal(70, 15))) # Fahrenheit + elif dtype == "pressure": + values.append(float(np.random.normal(100, 10))) # PSI + elif dtype == "vibration": + values.append(float(np.random.lognormal(1.5, 0.8))) # mm/s with spikes + elif dtype == "humidity": + values.append(float(np.clip(np.random.normal(45, 10), 0, 100))) # Percentage + else: + values.append(float(np.random.normal(50, 10))) + return pd.Series(values) + + +@F.pandas_udf(StringType()) +def generate_unit(device_types: pd.Series) -> pd.Series: + """Generate appropriate unit based on device type.""" + unit_map = { + "temperature": "°F", + "pressure": "PSI", + "vibration": "mm/s", + "humidity": "%" + } + return pd.Series([unit_map.get(dt, "units") for dt in device_types]) + + +# ============================================================================= +# Generate sensor_devices (slowly changing dimension) +# ============================================================================= + +print("\nGenerating sensor_devices table (slowly changing dimension)") + +# Device type weights: temperature 30%, pressure 25%, vibration 25%, humidity 20% +devices_df = ( + spark.range(0, NUM_DEVICES, numPartitions=4) + .select( + F.concat(F.lit("DEV-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("device_id"), + F.col("id").alias("_id") + ) + .withColumn("device_name", fake_device_name(F.col("_id"))) + .withColumn( + "device_type", + F.when(F.rand() < 0.30, "temperature") + .when(F.rand() < 0.55, "pressure") + .when(F.rand() < 0.80, "vibration") + .otherwise("humidity") + ) + .withColumn( + "location", + F.when(F.rand() < 0.40, "Plant_A") + .when(F.rand() < 0.70, "Plant_B") + .when(F.rand() < 0.90, "Plant_C") + .otherwise("Warehouse") + ) + .withColumn( + "install_date", + F.date_add(F.lit("2020-01-01"), (F.rand() * 1460).cast("int")) + ) + .withColumn( + "status", + F.when(F.rand() < 0.85, "active") + .when(F.rand() < 0.95, "maintenance") + .otherwise("inactive") + ) + .drop("_id") +) + +# 
Write devices table +devices_table = f"{CATALOG}.{SCHEMA}.sensor_devices" +devices_df.write.mode("overwrite").saveAsTable(devices_table) +print(f"Created {devices_table}") + + +# ============================================================================= +# Generate sensor_readings (streaming incremental batch) +# ============================================================================= + +print("\nGenerating sensor_readings table (streaming batch)") + +# Read devices back for FK join (no .cache() on serverless!) +devices_for_join = spark.table(devices_table).select("device_id", "device_type") + +# Generate readings with valid device_id FK +readings_df = ( + spark.range(0, NUM_READINGS_PER_BATCH, numPartitions=16) + .select( + F.concat( + F.lit("RDG-"), + F.date_format(F.current_timestamp(), "yyyyMMddHHmmss"), + F.lit("-"), + F.lpad(F.col("id").cast("string"), 6, "0") + ).alias("reading_id"), + (F.abs(F.hash(F.col("id"))) % NUM_DEVICES).alias("device_index"), + F.from_unixtime( + F.unix_timestamp(F.lit(START_TIME)) + + (F.rand() * 86400).cast("long") + ).cast("timestamp").alias("timestamp") + ) +) + +# Get device IDs with row numbers for joining +devices_indexed = ( + devices_for_join + .withColumn("device_index", F.monotonically_increasing_id() % NUM_DEVICES) +) + +# Join to get valid device_id and device_type +readings_with_device = ( + readings_df + .join( + devices_indexed.select("device_id", "device_type", "device_index"), + on="device_index", + how="inner" + ) + .drop("device_index") +) + +# Add sensor values based on device type +readings_final = ( + readings_with_device + .withColumn("value", F.round(generate_sensor_value(F.col("device_type")), 2)) + .withColumn("unit", generate_unit(F.col("device_type"))) + .withColumn( + "quality_flag", + F.when(F.rand() < 0.90, "good") + .when(F.rand() < 0.97, "warning") + .otherwise("error") + ) + .select("reading_id", "device_id", "timestamp", "value", "unit", "quality_flag") +) + +# Write readings table (append 
mode for streaming simulation) +readings_table = f"{CATALOG}.{SCHEMA}.sensor_readings" +readings_final.write.mode("overwrite").saveAsTable(readings_table) +print(f"Created {readings_table}") + + +# ============================================================================= +# Validation +# ============================================================================= + +# Check referential integrity +orphan_readings = spark.sql(f""" + SELECT COUNT(*) as orphan_count + FROM {readings_table} r + LEFT JOIN {devices_table} d ON r.device_id = d.device_id + WHERE d.device_id IS NULL +""").collect()[0]["orphan_count"] + +print(f"Orphan readings (should be 0): {orphan_readings}") +print(f"\nSUMMARY") +print(f"Catalog/Schema: {CATALOG}.{SCHEMA}") +print(f"Tables: {devices_table}, {readings_table}") +print(f"Referential integrity: {'PASSED' if orphan_readings == 0 else 'FAILED'}") +``` + +Run with (requires Python 3.12+): +```bash +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas +uv run python generate_manufacturing_data.py +``` + +For streaming job deployment, this can be scheduled as an incremental job where each run: +1. Generates a new batch of 10,000 readings with unique timestamps +2. Appends to the sensor_readings table (change mode to "append") +3. The sensor_devices table is the slowly-changing dimension that persists diff --git a/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml b/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml new file mode 100644 index 00000000..799c0c19 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml @@ -0,0 +1,327 @@ +test_cases: + +- id: grp_20260302_113344 + inputs: + prompt: 'Generate synthetic e-commerce data locally then save it to Unity Catalog. 
+ + Create 2 related tables with referential integrity: + + - customers (5,000 rows): customer_id, name, email, tier (Free/Pro/Enterprise + weighted 60/30/10), region, created_at + + - orders (15,000 rows): order_id, customer_id (FK to customers), amount, order_date, + status + + + Save as Parquet then upload to a Unity Catalog volume. Use schema name ''devkit_gen1_test_small_parquet''. + + Enterprise customers should generate more orders than Free tier.' + outputs: + expected_response_file: expected_responses/grp_20260302_113344.txt + execution_success: true + expectations: + expected_facts: + - "parquet" + - "customer_id" + - "referential integrity" + - "weighted" + - "log-normal" + expected_patterns: + - pattern: "\\.write.*parquet" + min_count: 1 + description: "Parquet output format" + - pattern: "customer_id" + min_count: 3 + description: "Foreign key reference in multiple tables" + - pattern: "lognormal" + min_count: 1 + description: "Log-normal distribution for amounts" + guidelines: + - "Orders table customer_id must only contain IDs from customers table" + - "Enterprise tier customers must have higher weight for order generation" + - "Amount distribution must use log-normal, not uniform" + metadata: + category: happy_path + source: interactive + created_at: '2026-03-02T11:33:44.621846' + execution_verified: + mode: local + verified_date: '2026-03-02' + +- id: gen_serverless_job_catalog_json_002 + inputs: + prompt: 'Generate synthetic product catalog data that will run as a serverless + Databricks job. + + Create 2 related tables with referential integrity: + + - products (3,000 rows): product_id, name, category (weighted), price (log-normal), + inventory_count + + - sales (10,000 rows): sale_id, product_id (FK to products), quantity, sale_date, + discount_pct + + + Save as JSON files to a Unity Catalog volume. Use schema name ''devkit_gen2_test_small_json''. + + Create a job definition with environments for dependencies (faker). 
+ + Popular product categories should have more sales (weighted sampling). + + ' + outputs: + expected_response_file: expected_responses/gen_serverless_job_catalog_json_002.txt + expectations: + expected_facts: + - serverless + - environments + - dependencies + - client + - json + - product_id + - weighted + - lognormal + - pandas_udf + expected_patterns: + - pattern: environment_key.*default + min_count: 1 + description: Serverless job environment configuration + - pattern: client.*4 + min_count: 1 + description: Correct client version for serverless + - pattern: \.write.*json + min_count: 1 + description: JSON output format + - pattern: product_id + min_count: 3 + description: Foreign key reference in multiple places + - pattern: '@F\.pandas_udf|pandas_udf' + min_count: 1 + description: Pandas UDF for Faker parallelism + - pattern: lognormal|log-normal|log_normal + min_count: 1 + description: Log-normal distribution for prices + - pattern: CREATE SCHEMA IF NOT EXISTS|CREATE VOLUME IF NOT EXISTS + min_count: 1 + description: Infrastructure creation in script + guidelines: + - Must create serverless job with environments parameter for dependencies + - 'Job spec must include client: 4 (not 1)' + - Must NOT use .cache() or .persist() (serverless incompatible) + metadata: + category: happy_path + difficulty: medium + source: interactive_execution + execution_date: '2026-02-26' + execution_verified: true + job_run_id: '560746964795126' + tags: + - serverless-job + - small + - json + - referential-integrity + - weighted-sampling + - executed + +- id: grp_20260302_retail_csv_3tables_003 + inputs: + prompt: | + Generate synthetic retail order data using Databricks Connect with serverless. 
+ Create 3 related tables with full referential integrity: + - customers (10,000 rows): customer_id, name, email, membership_level (Bronze/Silver/Gold/Platinum weighted 50/30/15/5), region + - orders (50,000 rows): order_id, customer_id (FK to customers), order_date, total_amount, status + - line_items (150,000 rows): line_item_id, order_id (FK to orders), product_name, quantity, unit_price + + Save as CSV files with headers to Unity Catalog volume. Use schema name 'devkit_gen3_test_medium_csv'. + Create realistic product names. + Higher membership levels should have more orders. + Order total_amount should equal sum of line_items. + outputs: + expected_response_file: expected_responses/grp_20260302_retail_csv_3tables_003.txt + execution_success: true + expectations: + expected_facts: + - "DatabricksSession" + - "serverless" + - "CSV" + - "header" + - "customer_id" + - "order_id" + - "line_item" + - "Faker" + - "pandas_udf" + - "membership_level" + - "weighted" + - "total_amount" + - "lognormal" + expected_patterns: + - pattern: "DatabricksSession.*serverless.*True" + min_count: 1 + description: "Databricks Connect serverless configuration" + - pattern: "DatabricksEnv.*withDependencies" + min_count: 1 + description: "Managed dependencies for serverless" + - pattern: "@F\\.pandas_udf|pandas_udf" + min_count: 1 + description: "Pandas UDF for Faker parallelism" + - pattern: "customer_id" + min_count: 5 + description: "FK in customers and orders (multiple references)" + - pattern: "order_id" + min_count: 5 + description: "FK in orders and line_items (multiple references)" + - pattern: "\\.option.*header.*true.*\\.csv|\\.write.*csv" + min_count: 1 + description: "CSV with headers" + - pattern: "Bronze|Silver|Gold|Platinum" + min_count: 4 + description: "All membership levels present" + - pattern: "lognormal" + min_count: 1 + description: "Log-normal distribution for pricing" + - pattern: "CREATE SCHEMA IF NOT EXISTS" + min_count: 1 + description: "Infrastructure 
creation in script" + - pattern: "CREATE VOLUME IF NOT EXISTS" + min_count: 1 + description: "Volume creation for CSV output" + - pattern: "total_amount.*sum|sum.*line_total|computed_total" + min_count: 1 + description: "Order total computed from line items" + guidelines: + - "Must use DatabricksSession.builder.serverless(True).getOrCreate()" + - "Must use Spark + Faker + Pandas UDFs approach" + - "line_items.order_id must reference valid orders" + - "Membership level must be weighted: Bronze 50%, Silver 30%, Gold 15%, Platinum 5%" + - "Higher membership levels must generate more orders per customer" + metadata: + category: happy_path + difficulty: hard + source: interactive_execution + execution_date: '2026-03-02' + execution_verified: true + verified_output: + customers_rows: 10000 + orders_rows: 50000 + line_items_rows: 150000 + membership_distribution: + Bronze: 5069 + Silver: 3957 + Gold: 919 + Platinum: 55 + orders_per_tier: + Bronze: 18170 + Silver: 23560 + Gold: 7613 + Platinum: 657 + orphan_orders: 0 + orphan_line_items: 0 + tags: + - databricks-connect + - serverless + - medium + - csv + - 3-tables + - pandas-udf + - referential-integrity + - weighted-sampling + - computed-totals + - executed + +- id: grp_20260303_manufacturing_delta_streaming_004 + inputs: + prompt: | + Generate manufacturing data that will run incrementally with Python 3.12 and Databricks Serverless. + Create 2 related tables with referential integrity. + Create a sensor reading table that generates 10,000 rows per batch and configure to run as a streaming job. + Create a lookup table for the sensor device which changes slowly. + Save as Delta tables registered in Unity Catalog. Use catalog 'ai_dev_kit'. Use schema name 'devkit_gen4_test_large_delta'. 
+ outputs: + expected_response_file: expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt + execution_success: true + expectations: + expected_facts: + - "DatabricksSession" + - "serverless" + - "Delta" + - "saveAsTable" + - "device_id" + - "sensor" + - "reading" + - "Faker" + - "pandas_udf" + - "streaming" + - "incremental" + - "batch" + - "slowly changing" + - "lognormal" + expected_patterns: + - pattern: "DatabricksSession.*serverless.*True" + min_count: 1 + description: "Databricks Connect serverless configuration" + - pattern: "DatabricksEnv.*withDependencies" + min_count: 1 + description: "Managed dependencies for serverless" + - pattern: "@F\\.pandas_udf|pandas_udf" + min_count: 1 + description: "Pandas UDF for Faker parallelism" + - pattern: "device_id" + min_count: 3 + description: "FK in devices and readings (multiple references)" + - pattern: "\\.write.*saveAsTable|saveAsTable" + min_count: 2 + description: "Delta table output for both tables" + - pattern: "CREATE SCHEMA IF NOT EXISTS" + min_count: 1 + description: "Infrastructure creation in script" + - pattern: "sensor_devices|sensor_readings" + min_count: 2 + description: "Both sensor tables present" + - pattern: "temperature|pressure|vibration|humidity" + min_count: 4 + description: "All device types present" + - pattern: "lognormal" + min_count: 1 + description: "Log-normal distribution for vibration sensor values" + - pattern: "mode.*overwrite|mode.*append" + min_count: 1 + description: "Write mode for streaming support" + guidelines: + - "Must use DatabricksSession.builder.serverless(True).getOrCreate()" + - "Must use Spark + Faker + Pandas UDFs approach" + - "Must maintain referential integrity between devices and readings" + - "Must use Delta tables (saveAsTable) not file formats" + - "sensor_readings should support incremental batch processing" + - "Vibration should use log-normal for occasional spikes" + metadata: + category: happy_path + difficulty: medium + source: 
interactive_execution + execution_date: '2026-03-03' + execution_verified: true + verified_output: + sensor_devices_rows: 500 + sensor_readings_rows: 10013 + device_type_distribution: + temperature: 147 + pressure: 179 + vibration: 140 + humidity: 34 + quality_flag_distribution: + good: 9008 + warning: 979 + error: 26 + orphan_readings: 0 + tags: + - databricks-connect + - serverless + - large + - delta + - 2-tables + - pandas-udf + - referential-integrity + - streaming + - incremental + - iot + - manufacturing + - executed diff --git a/.test/skills/databricks-synthetic-data-gen/manifest.yaml b/.test/skills/databricks-synthetic-data-gen/manifest.yaml new file mode 100644 index 00000000..330f5de7 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/manifest.yaml @@ -0,0 +1,36 @@ +skill_name: databricks-synthetic-data-gen +description: Tests for Databricks synthetic data generation skill covering Spark + Faker + Pandas UDFs, execution methods, output formats, and referential integrity +version: 1.0.0 + +scorers: + enabled: + - python_syntax # Check Python code blocks for syntax errors + - no_hallucinated_apis # Detect deprecated/wrong APIs + - pattern_adherence # Regex match against expected patterns + - expected_facts_present # Check if required facts mentioned + + llm_scorers: + - Safety + - guidelines_from_expectations + + default_guidelines: + - "Response must generate complete, runnable Python code" + - "Code must use the execution method specified in the prompt" + - "Code must save data in the output format specified" + +quality_gates: + syntax_valid: 1.0 # 100% - all Python syntax must be valid + pattern_adherence: 0.9 # 90% - follow expected patterns + execution_success: 0.8 # 80% - code execution success rate + no_hallucinations: 1.0 # 100% - no deprecated/invalid APIs + +trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + Write: 15 + Edit: 15 + token_budget: + max_total: 200000 + required_tools: [] + banned_tools: [] diff --git 
a/.test/src/skill_test/config.py b/.test/src/skill_test/config.py index 275e25aa..f4a42cb8 100644 --- a/.test/src/skill_test/config.py +++ b/.test/src/skill_test/config.py @@ -83,6 +83,9 @@ class MLflowConfig: tracking_uri: str = field(default_factory=lambda: _get_mlflow_tracking_uri()) experiment_name: str = field(default_factory=lambda: os.getenv("MLFLOW_EXPERIMENT_NAME", "/Shared/skill-tests")) + llm_judge_timeout: int = field( + default_factory=lambda: int(os.getenv("MLFLOW_LLM_JUDGE_TIMEOUT", "120")) + ) # seconds - timeout for LLM judge evaluation def _get_mlflow_tracking_uri() -> str: @@ -118,7 +121,7 @@ class DatabricksExecutionSettings: schema: str = field(default_factory=lambda: os.getenv("SKILL_TEST_SCHEMA", "skill_test")) # Execution settings - timeout: int = 120 # seconds + timeout: int = 240 # seconds - increased from 120s to handle larger data generation tasks preserve_context: bool = True # Reuse context across code blocks diff --git a/.test/src/skill_test/dataset.py b/.test/src/skill_test/dataset.py index 9941ef12..5c88c330 100644 --- a/.test/src/skill_test/dataset.py +++ b/.test/src/skill_test/dataset.py @@ -41,17 +41,34 @@ class YAMLDatasetSource: yaml_path: Path def load(self) -> List[EvalRecord]: - """Load records from YAML ground_truth.yaml file.""" + """Load records from YAML ground_truth.yaml file. + + Supports external response files via 'expected_response_file' field in outputs. + When present, the response is loaded from the file relative to the YAML directory. 
+ """ with open(self.yaml_path) as f: data = yaml.safe_load(f) + yaml_dir = self.yaml_path.parent + records = [] for case in data.get("test_cases", []): + outputs = case.get("outputs") + + # Load response from external file if specified + if outputs and "expected_response_file" in outputs: + response_file = yaml_dir / outputs["expected_response_file"] + if response_file.exists(): + with open(response_file) as rf: + outputs = dict(outputs) # Copy to avoid modifying original + outputs["response"] = rf.read() + del outputs["expected_response_file"] + records.append( EvalRecord( id=case["id"], inputs=case["inputs"], - outputs=case.get("outputs"), + outputs=outputs, expectations=case.get("expectations"), metadata=case.get("metadata", {}), ) diff --git a/.test/src/skill_test/grp/executor.py b/.test/src/skill_test/grp/executor.py index 5cd393bb..6f8dedee 100644 --- a/.test/src/skill_test/grp/executor.py +++ b/.test/src/skill_test/grp/executor.py @@ -1,6 +1,7 @@ """Execute code blocks from skill responses to verify they work.""" import ast +import json import re import time import yaml @@ -192,6 +193,26 @@ def verify_yaml_syntax(code: str) -> ExecutionResult: ) +def verify_json_syntax(code: str) -> ExecutionResult: + """Verify JSON syntax is valid.""" + start_time = time.time() + try: + json.loads(code) + return ExecutionResult( + success=True, + output="JSON syntax valid", + error=None, + execution_time_ms=(time.time() - start_time) * 1000, + ) + except json.JSONDecodeError as e: + return ExecutionResult( + success=False, + output="", + error=f"JSON syntax error: {e.msg} at line {e.lineno}, column {e.colno}", + execution_time_ms=(time.time() - start_time) * 1000, + ) + + def verify_bash_structure(code: str) -> ExecutionResult: """Verify bash code structure (basic validation for examples).""" # For bash examples, just check that it's not empty and looks like shell commands @@ -220,6 +241,8 @@ def execute_code_blocks(response: str) -> Tuple[int, int, List[Dict[str, Any]]]: 
result = verify_sql_structure(block.code) elif block.language in ("yaml", "yml"): result = verify_yaml_syntax(block.code) + elif block.language == "json": + result = verify_json_syntax(block.code) elif block.language in ("bash", "sh", "shell"): result = verify_bash_structure(block.code) else: @@ -528,6 +551,16 @@ def execute_code_blocks_on_databricks( mcp_execute_sql, mcp_get_best_warehouse, ) + elif block.language == "json": + # JSON blocks are validated locally (e.g., job definitions) + json_result = verify_json_syntax(block.code) + result = DatabricksExecutionResult( + success=json_result.success, + output=json_result.output, + error=json_result.error, + execution_time_ms=json_result.execution_time_ms, + execution_mode="local", + ) else: # Skip unknown languages continue diff --git a/.test/src/skill_test/runners/evaluate.py b/.test/src/skill_test/runners/evaluate.py index 1dff1009..212dd92a 100644 --- a/.test/src/skill_test/runners/evaluate.py +++ b/.test/src/skill_test/runners/evaluate.py @@ -154,6 +154,7 @@ def evaluate_skill( config: Optional[SkillTestConfig] = None, run_name: Optional[str] = None, filter_category: Optional[str] = None, + timeout: Optional[int] = None, ) -> Dict[str, Any]: """ Evaluate a skill using pre-computed outputs (Pattern 2). 
@@ -163,6 +164,7 @@ def evaluate_skill( config: Configuration (uses defaults if None) run_name: MLflow run name filter_category: Filter test cases by category + timeout: Timeout in seconds for LLM judge evaluation (overrides config) Returns: Evaluation results dict with metrics and run_id @@ -170,6 +172,9 @@ def evaluate_skill( if config is None: config = SkillTestConfig() + # Use provided timeout or fall back to config + eval_timeout = timeout if timeout is not None else config.mlflow.llm_judge_timeout + setup_mlflow(config) # Load ground truth @@ -192,13 +197,19 @@ def evaluate_skill( else: scorers = get_default_scorers() - # Run evaluation + # Run evaluation with timeout with mlflow.start_run(run_name=run_name or f"{skill_name}_eval"): mlflow.set_tags( - {"skill_name": skill_name, "test_count": len(eval_data), "filter_category": filter_category or "all"} + { + "skill_name": skill_name, + "test_count": len(eval_data), + "filter_category": filter_category or "all", + "timeout_seconds": eval_timeout, + } ) # No predict_fn - using pre-computed outputs + # Run evaluation directly - timeout is handled via signal alarm on Unix results = mlflow.genai.evaluate(data=eval_data, scorers=scorers) return { diff --git a/.test/src/skill_test/scorers/routing.py b/.test/src/skill_test/scorers/routing.py index 1a03d698..fad45033 100644 --- a/.test/src/skill_test/scorers/routing.py +++ b/.test/src/skill_test/scorers/routing.py @@ -52,7 +52,7 @@ "rest api", ], "databricks-jobs": ["job", "workflow", "task", "schedule", "trigger"], - "databricks-synthetic-data-generation": [ + "databricks-synthetic-data-gen": [ "synthetic data", "fake data", "generate data", diff --git a/.test/tests/test_scorers.py b/.test/tests/test_scorers.py index 66a39dbf..de5b0c09 100644 --- a/.test/tests/test_scorers.py +++ b/.test/tests/test_scorers.py @@ -52,10 +52,10 @@ def test_detect_mlflow_evaluation(self): assert "databricks-mlflow-evaluation" in skills def test_detect_synthetic_data(self): - """Test 
detection of databricks-synthetic-data-generation skill.""" + """Test detection of databricks-synthetic-data-gen skill.""" prompt = "Generate synthetic data for testing" skills = detect_skills_from_prompt(prompt) - assert "databricks-synthetic-data-generation" in skills + assert "databricks-synthetic-data-gen" in skills def test_detect_agent_bricks(self): """Test detection of databricks-agent-bricks skill.""" @@ -175,7 +175,7 @@ def test_all_skills_have_triggers(self): "databricks-asset-bundles", "databricks-python-sdk", "databricks-jobs", - "databricks-synthetic-data-generation", + "databricks-synthetic-data-gen", "databricks-mlflow-evaluation", "databricks-agent-bricks", "databricks-lakebase-provisioned", diff --git a/databricks-builder-app/.env.example b/databricks-builder-app/.env.example index c95a818f..f50ed4b6 100644 --- a/databricks-builder-app/.env.example +++ b/databricks-builder-app/.env.example @@ -53,10 +53,10 @@ DATABRICKS_MODEL_MINI=databricks-gemini-3-flash # Skills Configuration # ============================================================================= # Skills to include (comma-separated list of skill folder names) -ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation +ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation # Optional: Add additional skills (example with databricks- prefixed skills) -# ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation +# ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation # Test mode: only enable Skill tool (useful for debugging) 
SKILLS_ONLY_MODE=false diff --git a/databricks-builder-app/README.md b/databricks-builder-app/README.md index b6a43135..42031cee 100644 --- a/databricks-builder-app/README.md +++ b/databricks-builder-app/README.md @@ -179,7 +179,7 @@ Skills include: - **databricks-python-sdk**: Python SDK patterns - **databricks-mlflow-evaluation**: MLflow evaluation and trace analysis - **databricks-spark-declarative-pipelines**: Spark Declarative Pipelines (SDP) development -- **databricks-synthetic-data-generation**: Creating test datasets +- **databricks-synthetic-data-gen**: Creating test datasets ### 5. Project Persistence @@ -329,7 +329,7 @@ Skills are loaded from `../databricks-skills/` and filtered by the `ENABLED_SKIL - `databricks-python-sdk`: Patterns for using the Databricks Python SDK - `databricks-spark-declarative-pipelines`: SDP/DLT pipeline development -- `databricks-synthetic-data-generation`: Creating test datasets +- `databricks-synthetic-data-gen`: Creating test datasets - `databricks-app-apx`: Full-stack apps with React (APX framework) - `databricks-app-python`: Python apps with Dash, Streamlit, Flask diff --git a/databricks-builder-app/app.yaml.example b/databricks-builder-app/app.yaml.example index 4f77f7a7..8a5c0207 100644 --- a/databricks-builder-app/app.yaml.example +++ b/databricks-builder-app/app.yaml.example @@ -30,7 +30,7 @@ env: # ============================================================================= # Comma-separated list of skills to enable - name: ENABLED_SKILLS - value: "databricks-asset-bundles,databricks-agent-bricks,databricks-aibi-dashboards,databricks-app-apx,databricks-app-python,databricks-config,databricks-docs,databricks-jobs,databricks-python-sdk,databricks-unity-catalog,databricks-mlflow-evaluation,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation" + value: 
"databricks-asset-bundles,databricks-agent-bricks,databricks-aibi-dashboards,databricks-app-apx,databricks-app-python,databricks-config,databricks-docs,databricks-jobs,databricks-python-sdk,databricks-unity-catalog,databricks-mlflow-evaluation,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation" - name: SKILLS_ONLY_MODE value: "false" diff --git a/databricks-builder-app/client/src/pages/DocPage.tsx b/databricks-builder-app/client/src/pages/DocPage.tsx index f8b7b29c..b7ee35ec 100644 --- a/databricks-builder-app/client/src/pages/DocPage.tsx +++ b/databricks-builder-app/client/src/pages/DocPage.tsx @@ -92,7 +92,7 @@ function OverviewSection() { Skills explain how to do things and reference the tools from databricks-tools-core.

- {['databricks-asset-bundles/', 'databricks-app-apx/', 'databricks-app-python/', 'databricks-python-sdk/', 'databricks-mlflow-evaluation/', 'databricks-spark-declarative-pipelines/', 'databricks-synthetic-data-generation/'].map((skill) => ( + {['databricks-asset-bundles/', 'databricks-app-apx/', 'databricks-app-python/', 'databricks-python-sdk/', 'databricks-mlflow-evaluation/', 'databricks-spark-declarative-pipelines/', 'databricks-synthetic-data-gen/'].map((skill) => ( {skill} @@ -204,7 +204,7 @@ function OverviewSection() {

Read Skill

- Claude reads databricks-synthetic-data-generation/ skill to learn best practices + Claude reads databricks-synthetic-data-gen/ skill to learn best practices

{['Non-linear distributions', 'Referential integrity', 'Time patterns', 'Row coherence'].map((item) => ( diff --git a/databricks-builder-app/server/services/system_prompt.py b/databricks-builder-app/server/services/system_prompt.py index 5b7b4fef..fd18f6cf 100644 --- a/databricks-builder-app/server/services/system_prompt.py +++ b/databricks-builder-app/server/services/system_prompt.py @@ -5,7 +5,7 @@ # Mapping of user request patterns to skill names for the selection guide. # Only entries whose skill is enabled will be included in the prompt. _SKILL_GUIDE_ENTRIES = [ - ('Generate data, synthetic data, fake data, test data', 'databricks-synthetic-data-generation'), + ('Generate data, synthetic data, fake data, test data', 'databricks-synthetic-data-gen'), ('Pipeline, ETL, bronze/silver/gold, data transformation', 'databricks-spark-declarative-pipelines'), ('Dashboard, visualization, BI, charts', 'databricks-aibi-dashboards'), ('Job, workflow, schedule, automation', 'databricks-jobs'), diff --git a/databricks-skills/README.md b/databricks-skills/README.md index afaccd9d..29a79ae8 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -58,7 +58,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ - **databricks-iceberg** - Apache Iceberg tables (Managed/Foreign), UniForm, Iceberg REST Catalog, Iceberg Clients Interoperability - **databricks-spark-declarative-pipelines** - SDP (formerly DLT) in SQL/Python - **databricks-jobs** - Multi-task workflows, triggers, schedules -- **databricks-synthetic-data-generation** - Realistic test data with Faker +- **databricks-synthetic-data-gen** - Realistic test data with Faker ### 🚀 Development & Deployment - **databricks-asset-bundles** - DABs for multi-environment deployments diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 4aff7acb..04be7dad 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md 
+++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -28,7 +28,7 @@ Before creating Agent Bricks, ensure you have the required data: ### For Genie Spaces - **See the `databricks-genie` skill** for comprehensive Genie Space guidance - Tables in Unity Catalog with the data to explore -- Generate raw data using the `databricks-synthetic-data-generation` skill +- Generate raw data using the `databricks-synthetic-data-gen` skill - Create tables using the `databricks-spark-declarative-pipelines` skill ### For Supervisor Agents @@ -119,7 +119,7 @@ Before creating Agent Bricks, generate the required source data: **For Genie (SQL exploration)**: ``` -1. Use `databricks-synthetic-data-generation` skill to create raw parquet data +1. Use `databricks-synthetic-data-gen` skill to create raw parquet data 2. Use `databricks-spark-declarative-pipelines` skill to create bronze/silver/gold tables ``` @@ -199,7 +199,7 @@ manage_mas( - **[databricks-genie](../databricks-genie/SKILL.md)** - Comprehensive Genie Space creation, curation, and Conversation API guidance - **[databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md)** - Generate synthetic PDFs to feed into Knowledge Assistants -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Create raw data for Genie Space tables +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Create raw data for Genie Space tables - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - Deploy custom agent endpoints used as MAS agents - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Build vector indexes for RAG applications paired with KAs diff --git a/databricks-skills/databricks-genie/SKILL.md 
b/databricks-skills/databricks-genie/SKILL.md index 576771da..e5b32b6e 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -107,7 +107,7 @@ Before creating a Genie Space: ### Creating Tables Use these skills in sequence: -1. `databricks-synthetic-data-generation` - Generate raw parquet files +1. `databricks-synthetic-data-gen` - Generate raw parquet files 2. `databricks-spark-declarative-pipelines` - Create bronze/silver/gold tables ## Common Issues @@ -121,6 +121,6 @@ Use these skills in sequence: ## Related Skills - **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Use Genie Spaces as agents inside Supervisor Agents -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate raw parquet data to populate tables for Genie +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate raw parquet data to populate tables for Genie - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Manage the catalogs, schemas, and tables Genie queries diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md index 8549d6bd..225efe0e 100644 --- a/databricks-skills/databricks-genie/spaces.md +++ b/databricks-skills/databricks-genie/spaces.md @@ -163,7 +163,7 @@ The tool finds the existing space by name and updates it. ## Example End-to-End Workflow -1. **Generate synthetic data** using `databricks-synthetic-data-generation` skill: +1. **Generate synthetic data** using `databricks-synthetic-data-gen` skill: - Creates parquet files in `/Volumes/catalog/schema/raw_data/` 2. 
**Create tables** using `databricks-spark-declarative-pipelines` skill: diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index 48a698f8..60afef0b 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -573,5 +573,5 @@ For advanced configuration options (development mode, continuous pipelines, cust - **[databricks-jobs](../databricks-jobs/SKILL.md)** - for orchestrating and scheduling pipeline runs - **[databricks-asset-bundles](../databricks-asset-bundles/SKILL.md)** - for multi-environment deployment of pipeline projects -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - for generating test data to feed into pipelines +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - for generating test data to feed into pipelines - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - for catalog/schema/volume management and governance diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md new file mode 100644 index 00000000..5bd95e58 --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md @@ -0,0 +1,258 @@ +--- +name: databricks-synthetic-data-gen +description: "Generate realistic synthetic data using Spark + Faker (strongly recommended). Supports serverless execution, multiple output formats (Parquet/JSON/CSV/Delta), and scales from thousands to millions of rows. For small datasets (<10K rows), can optionally generate locally and upload to volumes. Use when user mentions 'synthetic data', 'test data', 'generate data', 'demo dataset', 'Faker', or 'sample data'." +--- + +> Catalog and schema are **always user-supplied** — never default to any value. If the user hasn't provided them, ask. 
For any UC write, **always create the schema if it doesn't exist** before writing data. + +# Databricks Synthetic Data Generation + +Generate realistic, story-driven synthetic data for Databricks using **Spark + Faker + Pandas UDFs** (strongly recommended). + +## Quick Reference + +| Topic | Guide | When to Use | +|-------|-------|-------------| +| **Setup & Execution** | [references/1-setup-and-execution.md](references/1-setup-and-execution.md) | Setting up environment, choosing compute, installing dependencies | +| **Generation Approaches** | [references/2-generation-approaches.md](references/2-generation-approaches.md) | Choosing Spark UDFs vs Polars local, writing generation code | +| **Data Patterns** | [references/3-data-patterns.md](references/3-data-patterns.md) | Creating realistic distributions, referential integrity, time patterns | +| **Domain Guidance** | [references/4-domain-guidance.md](references/4-domain-guidance.md) | E-commerce, IoT, financial, support/CRM domain patterns | +| **Output Formats** | [references/5-output-formats.md](references/5-output-formats.md) | Choosing output format, saving to volumes/tables | +| **Troubleshooting** | [references/6-troubleshooting.md](references/6-troubleshooting.md) | Fixing errors, debugging issues | +| **Example Script** | [scripts/generate_synthetic_data.py](scripts/generate_synthetic_data.py) | Complete Spark + Pandas UDF example | + +## Package Manager + +Prefer `uv` for all Python operations. Fall back to `pip` only if `uv` is not available. + +```bash +# Preferred +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays +uv run python generate_data.py + +# Fallback if uv not available +pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays +python generate_data.py +``` + +## Critical Rules + +1. **Strongly prefer to use Spark + Faker + Pandas UDFs** for data generation (scalable, parallel) +2. 
**If user specifies local** then use Polars locally instead of Spark, but suggest Spark if > 30,000 rows. +3. **Present a plan for user approval** before generating any code +4. **Ask for catalog/schema** - do not default +5. **Use serverless compute** unless user explicitly requests classic cluster +6. **Generate raw data only** - no pre-aggregated fields (unless user requests) +7. **Create master tables first** - then generate related tables with valid FKs +8. **NEVER use `.cache()` or `.persist()` with serverless compute** - these operations are NOT supported and will fail with `AnalysisException: PERSIST TABLE is not supported on serverless compute`. Instead, write master tables to Delta first, then read them back for FK joins. + +## Generation Planning Workflow + +**Before generating any code, you MUST present a plan for user approval.** + +### ⚠️ MUST DO: Confirm Catalog Before Proceeding + +**You MUST explicitly ask the user which catalog to use.** Do not assume or proceed without confirmation. + +Example prompt to user: +> "Which Unity Catalog should I use for this data?" + +When presenting your plan, always show the selected catalog prominently: +``` +📍 Output Location: catalog_name.schema_name + Volume: /Volumes/catalog_name/schema_name/raw_data/ +``` + +This makes it easy for the user to spot and correct if needed. + +### Step 1: Gather Requirements + +Ask the user about: +- **Catalog/Schema** - Which catalog to use? +- What domain/scenario? (e-commerce, support tickets, IoT sensors, etc.) +- How many tables? What relationships between them? +- Approximate row counts per table? +- Output format preference? (Delta table is default) + +### Step 2: Present Table Specification + +Show a clear specification with **YOUR ASSUMPTIONS surfaced**. 
Always start with the output location: + +``` +📍 Output Location: {user_catalog}.ecommerce_demo + Volume: /Volumes/{user_catalog}/ecommerce_demo/raw_data/ +``` + +| Table | Columns | Rows | Key Assumptions | +|-------|---------|------|-----------------| +| customers | customer_id, name, email, tier, region | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% | +| orders | order_id, customer_id (FK), amount, status | 15,000 | Enterprise customers generate 5x more orders | + +**Assumptions I'm making:** +- Amount distribution: log-normal by tier (Enterprise ~$1800, Pro ~$245, Free ~$55) +- Status: 65% delivered, 15% shipped, 10% processing, 5% pending, 5% cancelled + +**Ask user**: "Does this look correct? Any adjustments to the catalog, tables, or distributions?" + +### Step 3: Ask About Data Features + +- [x] Skew (non-uniform distributions) - **Enabled by default** +- [x] Joins (referential integrity) - **Enabled by default** +- [ ] Bad data injection (for data quality testing) +- [ ] Multi-language text +- [ ] Incremental mode (append vs overwrite) + +### Pre-Generation Checklist + +- [ ] **Catalog confirmed** - User explicitly approved which catalog to use +- [ ] Output location shown prominently in plan (easy to spot/change) +- [ ] Table specification shown and approved +- [ ] Assumptions about distributions confirmed +- [ ] User confirmed compute preference (serverless recommended) +- [ ] Data features selected + +**Do NOT proceed to code generation until user approves the plan, including the catalog.** + +## Quick Start: Spark + Faker + Pandas UDFs + +```python +from databricks.connect import DatabricksSession, DatabricksEnv +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType +import pandas as pd +import numpy as np + +# Setup with managed dependencies (databricks-connect 16.4+) +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy") +spark = 
DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + +# Define Pandas UDFs +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + return pd.Series([fake.name() for _ in range(len(ids))]) + +@F.pandas_udf(DoubleType()) +def generate_amount(tiers: pd.Series) -> pd.Series: + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(7.5, 0.8))) + elif tier == "Pro": + amounts.append(float(np.random.lognormal(5.5, 0.7))) + else: + amounts.append(float(np.random.lognormal(4.0, 0.6))) + return pd.Series(amounts) + +# Generate customers +customers_df = ( + spark.range(0, 10000, numPartitions=16) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_name(F.col("id")).alias("name"), + F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + ) + .withColumn("arr", generate_amount(F.col("tier"))) +) + +# Save to Unity Catalog +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +customers_df.write.mode("overwrite").parquet(f"/Volumes/{CATALOG}/{SCHEMA}/raw_data/customers") +``` + +## Common Patterns + +### Weighted Tier Distribution +```python +F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier") +``` + +### Log-Normal Amounts (Realistic Pricing) +```python +@F.pandas_udf(DoubleType()) +def generate_amount(tiers: pd.Series) -> pd.Series: + return pd.Series([ + float(np.random.lognormal({"Enterprise": 7.5, "Pro": 5.5, "Free": 4.0}[t], 0.7)) + for t in tiers + ]) +``` + +### Date Range (Last 6 Months) +```python +from datetime import datetime, timedelta +END_DATE = datetime.now() +START_DATE = END_DATE - timedelta(days=180) + +F.date_add(F.lit(START_DATE.date()), (F.rand() * 180).cast("int")).alias("order_date") 
+``` + +### Infrastructure Creation +```python +# Always in script - assume catalog exists +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +``` + +## Execution Modes + +| Mode | Best For | Setup | +|------|----------|-------| +| **DB Connect 16.4+ Serverless** | Local dev, Python 3.12+ | `DatabricksEnv().withDependencies(...)` | +| **Serverless Job** | Production, scheduled | Job with `environments` parameter | +| **Classic Cluster** | Fallback only | Use Databricks CLI to install libraries. `databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` | + +See [references/1-setup-and-execution.md](references/1-setup-and-execution.md) for detailed setup instructions. + +## Output Formats + +| Format | Use Case | Code | +|--------|----------|------| +| **Parquet** (default) | SDP pipeline input | `df.write.parquet(path)` | +| **JSON** | Log-style ingestion | `df.write.json(path)` | +| **CSV** | Legacy systems | `df.write.option("header", "true").csv(path)` | +| **Delta Table** | Direct analytics | `df.write.saveAsTable("catalog.schema.table")` | + +See [references/5-output-formats.md](references/5-output-formats.md) for detailed options. 
+ +## Best Practices Summary + +### Execution +- Use serverless (instant start, no cluster wait) +- Ask for catalog/schema +- Present plan before generating + +### Data Generation +- **Spark + Faker + Pandas UDFs** for all cases +- Master tables first, then related tables with valid FKs +- Non-linear distributions (log-normal, Pareto, exponential) +- Time patterns (weekday/weekend, holidays, seasonality) +- Row coherence (correlated attributes) + +### Output +- Create infrastructure in script (`CREATE SCHEMA/VOLUME IF NOT EXISTS`) +- Do NOT create catalogs - assume they exist +- Delta tables as default + +## Related Skills + +- **databricks-unity-catalog** - Managing catalogs, schemas, and volumes +- **databricks-asset-bundles** - DABs for production deployment + +## Common Issues + +| Issue | Solution | +|-------|----------| +| `ModuleNotFoundError: faker` | See [references/1-setup-and-execution.md](references/1-setup-and-execution.md) | +| Faker UDF is slow | Use `pandas_udf` for batch processing | +| Out of memory | Increase `numPartitions` in `spark.range()` | +| Referential integrity errors | Write master table to Delta first, read back for FK joins | +| `PERSIST TABLE is not supported on serverless` | **NEVER use `.cache()` or `.persist()` with serverless** - write to Delta table first, then read back | +| `F.window` vs `Window` confusion | Use `from pyspark.sql.window import Window` for `row_number()`, `rank()`, etc. `F.window` is for streaming only. | + +See [references/6-troubleshooting.md](references/6-troubleshooting.md) for full troubleshooting guide. 
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md b/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md new file mode 100644 index 00000000..3ec36fbc --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md @@ -0,0 +1,278 @@ +# Setup and Execution Guide + +This guide covers all execution modes for synthetic data generation, organized by Databricks Connect version and Python version. + +## Quick Decision Matrix + +| Your Environment | Recommended Approach | +|------------------|---------------------| +| Python 3.12+ with databricks-connect >= 16.4 | DatabricksEnv with withDependencies API | +| Python 3.10/3.11 with older databricks-connect | Serverless job with environments parameter | +| Classic compute (fallback only) | Manual cluster setup | + +## Option 1: Databricks Connect 16.4+ with Serverless (Recommended) + +**Best for:** Python 3.12+, local development with serverless compute + +**Install locally:** +```bash +# Preferred +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays + +# Fallback if uv not available +pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays +``` + +**Configure ~/.databrickscfg:** +```ini +[DEFAULT] +host = https://your-workspace.cloud.databricks.com/ +serverless_compute_id = auto +auth_type = databricks-cli +``` + +**In your script:** +```python +from databricks.connect import DatabricksSession, DatabricksEnv + +# Pass dependencies as simple package name strings +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") + +# Create session with managed dependencies +spark = ( + DatabricksSession.builder + .withEnvironment(env) + .serverless(True) + .getOrCreate() +) + +# Spark operations now execute on serverless compute with managed dependencies +``` + +**Version Detection (if needed):** +```python +import importlib.metadata + +def 
get_databricks_connect_version(): + """Get databricks-connect version as (major, minor) tuple.""" + try: + version_str = importlib.metadata.version('databricks-connect') + parts = version_str.split('.') + return (int(parts[0]), int(parts[1])) + except Exception: + return None + +db_version = get_databricks_connect_version() +if db_version and db_version >= (16, 4): + # Use DatabricksEnv with withDependencies + pass +``` + +**Benefits:** +- Instant start, no cluster wait +- Local debugging and fast iteration +- Automatic dependency management +- Edit file, re-run immediately + +## Option 2: Older Databricks Connect or Python < 3.12 + +**Best for:** Python 3.10/3.11, databricks-connect 15.1-16.3 + +`DatabricksEnv()` and `withEnvironment()` are NOT available in older versions. Use serverless jobs with environments parameter instead. + +### Serverless Job Configuration Requirements + +**MUST use `"client": "4"` in the Environment Spec:** + +```json +{ + "environments": [{ + "environment_key": "datagen_env", + "spec": { + "client": "4", + "dependencies": ["faker", "numpy", "pandas"] + } + }] +} +``` + +> **Note:** Using `"client": "1"` will fail with environment configuration errors. 
+
+### Script Deployment
+
+Deploy Python files (.py) to the workspace for serverless jobs:
+
+```bash
+databricks workspace import /Users/<username>@databricks.com/scripts/my_script.py \
+  --file ./my_script.py --format AUTO
+
+databricks workspace list /Users/<username>@databricks.com/scripts/
+```
+
+**Job config must reference the workspace path:**
+
+```json
+{
+  "spark_python_task": {
+    "python_file": "/Users/<username>@databricks.com/scripts/my_script.py"
+  },
+  "environment_key": "datagen_env"
+}
+```
+
+**DABs bundle configuration:**
+```yaml
+# databricks.yml
+bundle:
+  name: synthetic-data-gen
+
+resources:
+  jobs:
+    generate_data:
+      name: "Generate Synthetic Data"
+      tasks:
+        - task_key: generate
+          spark_python_task:
+            python_file: ./src/generate_data.py
+          environment_key: default
+
+environments:
+  default:
+    spec:
+      client: "4"
+      dependencies:
+        - faker
+        - numpy
+        - pandas
+        - holidays
+```
+
+## Option 3: Classic Cluster
+
+**Use when:** Serverless unavailable, or specific cluster features needed (GPUs, custom init scripts)
+
+### Step 1: Check Python Version Compatibility
+
+Pandas UDFs require matching Python minor versions between local and cluster.
+
+```bash
+# Check local Python
+uv run python --version  # or: python --version
+
+# Check cluster DBR version → Python version
+# DBR 17.x = Python 3.12
+# DBR 15.4 LTS = Python 3.11
+# DBR 14.3 LTS = Python 3.10
+databricks clusters get <cluster-id> | grep spark_version
+```
+
+### Step 2a: If Versions Match → Use Databricks Connect
+
+```bash
+# Install matching databricks-connect version (must match DBR major.minor)
+uv pip install "databricks-connect==17.3.*" faker numpy pandas holidays
+```
+
+```bash
+# Install libraries on cluster
+databricks libraries install --json '{"cluster_id": "<cluster-id>", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'
+
+# Wait for INSTALLED status
+databricks libraries cluster-status <cluster-id>
+```
+
+```python
+# Run locally via Databricks Connect
+from databricks.connect import DatabricksSession
+
+spark = DatabricksSession.builder.clusterId("<cluster-id>").getOrCreate()
+# Your Spark code runs on the cluster
+```
+
+### Step 2b: If Versions Don't Match → Submit as Job
+
+**Ask user for approval before submitting.** Example prompt:
+> "Your local Python (3.11) doesn't match the cluster (3.12). Pandas UDFs require matching versions. Should I submit this as a job to run directly on the cluster instead?"
+
+```bash
+# Upload script to workspace
+databricks workspace import /Users/you@company.com/scripts/generate_data.py \
+  --file generate_data.py --format AUTO --overwrite
+
+# Submit job to run on cluster
+databricks jobs submit --json '{
+  "run_name": "Generate Data",
+  "tasks": [{
+    "task_key": "generate",
+    "existing_cluster_id": "<cluster-id>",
+    "spark_python_task": {
+      "python_file": "/Users/you@company.com/scripts/generate_data.py"
+    }
+  }]
+}'
+```
+
+### Classic Cluster Decision Flow
+
+```
+Local Python == Cluster Python?
+  ├─ YES → Install libs on cluster, run via Databricks Connect
+  └─ NO → Ask user: "Submit as job instead?"
+ └─ Upload script + submit job +``` + +## Required Libraries + +Standard libraries for generating realistic synthetic data: + +| Library | Purpose | Required For | +|---------|---------|--------------| +| **faker** | Realistic names, addresses, emails, companies | Text data generation | +| **numpy** | Statistical distributions | Non-linear distributions | +| **pandas** | Data manipulation, Pandas UDFs | Spark UDF definitions | +| **holidays** | Country-specific holiday calendars | Time-based patterns | + +## Environment Detection Pattern + +Use this pattern to auto-detect environment and choose the right session creation: + +```python +import os +import importlib.metadata + +def is_databricks_runtime(): + """Check if running on Databricks Runtime vs locally.""" + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +def get_databricks_connect_version(): + """Get databricks-connect version as (major, minor) tuple or None.""" + try: + version_str = importlib.metadata.version('databricks-connect') + parts = version_str.split('.') + return (int(parts[0]), int(parts[1])) + except Exception: + return None + +on_runtime = is_databricks_runtime() +db_version = get_databricks_connect_version() + +# Use DatabricksEnv if: locally + databricks-connect >= 16.4 +use_auto_dependencies = (not on_runtime) and db_version and db_version >= (16, 4) + +if use_auto_dependencies: + from databricks.connect import DatabricksSession, DatabricksEnv + env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") + spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() +else: + from databricks.connect import DatabricksSession + spark = DatabricksSession.builder.serverless(True).getOrCreate() +``` + +## Common Setup Issues + +| Issue | Solution | +|-------|----------| +| `ModuleNotFoundError: faker` | Install dependencies per execution mode above | +| `DatabricksEnv not found` | Upgrade to databricks-connect >= 16.4 or use job with environments | 
+| `serverless_compute_id` error | Add `serverless_compute_id = auto` to ~/.databrickscfg | +| Classic cluster startup slow | Use serverless instead (instant start) | diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md b/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md new file mode 100644 index 00000000..5d6feeca --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md @@ -0,0 +1,205 @@ +# Data Generation Approaches + +Choose your approach based on scale and requirements. **Spark + Faker + Pandas UDFs is strongly preferred** for all cases. + +## Decision Table + +| Scenario | Recommended Approach | +|----------|---------------------| +| **Default - any data generation** | **Spark + Faker + Pandas UDFs** | +| Large datasets (100K+ rows) | **Spark + Faker + Pandas UDFs** | +| Medium datasets (10K-100K rows) | **Spark + Faker + Pandas UDFs** | +| Small datasets (<10K rows) | **Spark + Faker + Pandas UDFs** (or Polars if user prefers local) | + +**Rule:** Always use Spark + Faker + Pandas UDFs unless user explicitly requests local generation for <10K rows. 
+ +--- + +## Approach 1: Spark + Faker + Pandas UDFs (Strongly Preferred) + +**Best for:** All dataset sizes, direct write to Unity Catalog + +**Why this approach:** +- Scales from thousands to millions of rows +- Parallel execution via Spark +- Direct integration with Unity Catalog +- No intermediate files or uploads needed +- Works with serverless and classic compute + +### Basic Pattern + +```python +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType +from faker import Faker +import pandas as pd +import numpy as np + +# Define Pandas UDFs for Faker data (batch processing for parallelism) +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.name() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_company(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.company() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_email(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.email() for _ in range(len(ids))]) + +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + """Generate amount based on tier using log-normal distribution.""" + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) + return pd.Series(amounts) +``` + +### Generate Data with Spark + Pandas UDFs + +```python +# Configuration +N_CUSTOMERS = 100_000 +PARTITIONS = 16 # Adjust based on data size: 8 for <100K, 32 for 1M+ +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Generate customers with Spark + Pandas UDFs +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("CUST-"), 
F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_name(F.col("id")).alias("name"), + fake_company(F.col("id")).alias("company"), + fake_email(F.col("id")).alias("email"), + F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + F.when(F.rand() < 0.4, "North") + .when(F.rand() < 0.65, "South") + .when(F.rand() < 0.85, "East") + .otherwise("West").alias("region"), + ) +) + +# Add tier-based amount +customers_df = customers_df.withColumn("arr", generate_lognormal_amount(F.col("tier"))) + +# Write directly to Unity Catalog volume +customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") +``` + +### Partitioning Strategy + +| Data Size | Recommended Partitions | +|-----------|----------------------| +| < 100K rows | 8 partitions | +| 100K - 500K rows | 16 partitions | +| 500K - 1M rows | 32 partitions | +| 1M+ rows | 64+ partitions | + +--- + +## Approach 2: Polars + Local Generation + Upload (Secondary Option) + +**Use only when:** Dataset <10K rows AND user explicitly prefers local generation + +**Why this approach exists:** +- No Spark overhead for tiny datasets +- Quick prototyping in local environment +- When Databricks Connect not available + +**Limitations:** +- Doesn't scale past ~100K rows +- Requires manual upload step +- No direct Unity Catalog integration + +### Install Local Dependencies + +```bash +# Preferred: use uv for fast, reliable installs +uv pip install polars faker numpy + +# Alternative if uv not available +pip install polars faker numpy +``` + +### Generate Locally with Polars + +```python +import polars as pl +from faker import Faker +import numpy as np + +fake = Faker() +N_CUSTOMERS = 5000 + +# Generate with Polars +customers = pl.DataFrame({ + "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], + "name": [fake.name() for _ in range(N_CUSTOMERS)], + "email": [fake.email() for _ in range(N_CUSTOMERS)], + "tier": np.random.choice(["Free", "Pro", 
"Enterprise"], N_CUSTOMERS, p=[0.6, 0.3, 0.1]).tolist(), + "region": np.random.choice(["North", "South", "East", "West"], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]).tolist(), +}) + +# Save locally +customers.write_parquet("./output/customers.parquet") +``` + +### Upload to Databricks Volume + +After generating data locally, upload to a Databricks volume: + +```bash +# Create directory in volume if needed +databricks fs mkdirs dbfs:/Volumes////source_data/ + +# Upload local data to volume +databricks fs cp -r ./output/customers.parquet dbfs:/Volumes////source_data/ +databricks fs cp -r ./output/orders.parquet dbfs:/Volumes////source_data/ +``` + +### When to Actually Use Polars + +Only recommend Polars when ALL conditions are met: +1. Dataset is < 10K rows +2. User explicitly requests local generation +3. Quick prototyping without Databricks connection + +Otherwise, **always use Spark + Faker + Pandas UDFs**. + +--- + +## Storage Destinations + +### Ask for Catalog and Schema + +Ask the user which catalog and schema to use: + +> "What catalog and schema name would you like to use?" + +### Create Infrastructure in Script + +Always create the schema and volume **inside the Python script** using `spark.sql()`: + +```python +CATALOG = "" # MUST ask user - never default +SCHEMA = "" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Note: Assume catalog exists - do NOT create it +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +``` + +**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume. 
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md b/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md new file mode 100644 index 00000000..351f1bd7 --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md @@ -0,0 +1,286 @@ +# Data Patterns Guide + +Creating realistic, coherent synthetic data with Spark + Pandas UDFs. + +## 5 Key Principles + +1. **Use Spark + Faker + Pandas UDFs** for all generation +2. **Referential Integrity** - master tables first, weighted sampling +3. **Non-Linear Distributions** - log-normal, Pareto, exponential +4. **Time-Based Patterns** - weekday/weekend, holidays, seasonality +5. **Row Coherence** - correlated attributes within each row + +--- + +## Principle 1: Use Spark + Faker + Pandas UDFs + +Generate data with Spark + Faker for all use cases. Pandas UDFs provide efficient, distributed Faker calls that scale seamlessly from thousands to millions of rows. 
+ +### Define Pandas UDFs + +```python +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType +from faker import Faker +import pandas as pd +import numpy as np + +@F.pandas_udf(StringType()) +def fake_company(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.company() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_address(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.address().replace('\n', ', ') for _ in range(len(ids))]) + +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) + return pd.Series(amounts) +``` + +### Generate with Spark + +```python +# Adjust numPartitions based on scale: 8 for <100K, 32 for 1M+ +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=16) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_company(F.col("id")).alias("name"), + F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + ) +) +customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") +``` + +--- + +## Principle 2: Referential Integrity + +Generate master tables first, then iterate on them to create related tables with matching IDs. + +> **CRITICAL:** Do NOT use `.cache()` or `.persist()` with serverless compute - these operations are not supported and will fail. Instead, write master tables to Delta first, then read them back for FK joins. + +### Pattern: Weighted Sampling by Tier + +```python +from pyspark.sql.window import Window + +# 1. 
Generate customers (master table) with index for FK mapping +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS) + .select( + F.col("id").alias("customer_idx"), # Keep index for FK joins + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + F.when(F.rand(SEED) < 0.6, "Free") + .when(F.rand(SEED) < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + ) +) + +# 2. Write to Delta table (do NOT use cache with serverless!) +customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers") + +# 3. Read back for FK lookups +customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers").select( + "customer_idx", "customer_id", "tier" +) + +# 4. Generate orders with valid foreign keys +orders_df = spark.range(0, N_ORDERS, numPartitions=PARTITIONS) + +# Map order to customer using hash-based distribution +orders_df = orders_df.select( + F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("order_id"), + (F.abs(F.hash(F.col("id"), F.lit(SEED))) % N_CUSTOMERS).alias("customer_idx"), +) + +# Join to get valid foreign keys +orders_with_fk = orders_df.join(customer_lookup, on="customer_idx", how="left") +``` + +### Anti-Pattern: Random FK Generation + +```python +# BAD - May generate non-existent customer IDs +orders_df = spark.range(0, N_ORDERS).select( + F.concat(F.lit("CUST-"), (F.rand() * 99999).cast("int")).alias("customer_id") # WRONG! +) +``` + +--- + +## Principle 3: Non-Linear Distributions + +**Never use uniform distributions** - real data is rarely uniform. 
+ +### Distribution Types + +| Distribution | Use Case | Example | +|--------------|----------|---------| +| **Log-normal** | Prices, salaries, order amounts | `np.random.lognormal(mean=4.5, sigma=0.8)` | +| **Pareto/Power law** | Popularity, wealth, page views | `(np.random.pareto(a=2.5) + 1) * 10` | +| **Exponential** | Time between events, resolution time | `np.random.exponential(scale=24)` | +| **Weighted categorical** | Status, region, tier | `np.random.choice(vals, p=[0.4, 0.3, 0.2, 0.1])` | + +### Pandas UDF for Log-Normal Amounts + +```python +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + """Generate amount based on tier using log-normal distribution.""" + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) # ~$1800 avg + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) # ~$245 avg + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) # ~$55 avg + return pd.Series(amounts) +``` + +### Anti-Pattern: Uniform Distribution + +```python +# BAD - Uniform (unrealistic) +prices = np.random.uniform(10, 1000, size=N_ORDERS) + +# GOOD - Log-normal (realistic for prices) +prices = np.random.lognormal(mean=4.5, sigma=0.8, size=N_ORDERS) +``` + +--- + +## Principle 4: Time-Based Patterns + +Add weekday/weekend effects, holidays, seasonality, and event spikes. 
+ +### Holiday and Weekday Multipliers + +```python +import holidays +from datetime import datetime, timedelta + +# Load holiday calendar +US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) + +def get_daily_multiplier(date): + """Calculate volume multiplier for a given date.""" + multiplier = 1.0 + + # Weekend drop + if date.weekday() >= 5: + multiplier *= 0.6 + + # Holiday drop (even lower than weekends) + if date in US_HOLIDAYS: + multiplier *= 0.3 + + # Q4 seasonality (higher in Oct-Dec) + multiplier *= 1 + 0.15 * (date.month - 6) / 6 + + # Incident spike (if applicable) + if INCIDENT_START <= date <= INCIDENT_END: + multiplier *= 3.0 + + # Random noise + multiplier *= np.random.normal(1, 0.1) + + return max(0.1, multiplier) +``` + +### Date Range: Last 6 Months + +Always generate data for the last ~6 months ending at the current date: + +```python +from datetime import datetime, timedelta + +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) +``` + +--- + +## Principle 5: Row Coherence + +Attributes within a row should correlate logically. 
+
+### Coherent Ticket Generation
+
+```python
+@F.pandas_udf("struct<priority:string, resolution_hours:double, csat_score:int>")
+def generate_coherent_ticket(tiers: pd.Series) -> pd.DataFrame:
+    """Generate coherent ticket where attributes correlate."""
+    results = []
+    for tier in tiers:
+        # Priority correlates with tier
+        if tier == 'Enterprise':
+            priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2])
+        else:
+            priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3])
+
+        # Resolution time correlates with priority
+        resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
+        resolution_hours = np.random.exponential(scale=resolution_scale[priority])
+
+        # CSAT correlates with resolution time
+        if resolution_hours < 4:
+            csat = np.random.choice([4, 5], p=[0.3, 0.7])
+        elif resolution_hours < 24:
+            csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3])
+        else:
+            csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2])
+
+        results.append({
+            "priority": priority,
+            "resolution_hours": round(resolution_hours, 1),
+            "csat_score": int(csat),
+        })
+
+    return pd.DataFrame(results)
+```
+
+### Correlation Examples
+
+| Attribute A | Attribute B | Correlation |
+|------------|-------------|-------------|
+| Customer tier | Order amount | Enterprise = higher amounts |
+| Ticket priority | Resolution time | Critical = faster resolution |
+| Resolution time | CSAT score | Faster = higher satisfaction |
+| Region | Product preference | Regional variations |
+| Time of day | Transaction type | Business hours = B2B |
+
+---
+
+## Data Volume for Aggregation
+
+Generate enough data so patterns remain visible after downstream aggregation:
+
+| Grain | Minimum Records | Rationale |
+|-------|-----------------|-----------|
+| Daily time series | 50-100/day | See trends after weekly rollup |
+| Per category | 500+ per category | Statistical significance |
+| Per customer | 5-20 events/customer | Customer-level analysis |
+| Total rows | 10K-50K minimum | Patterns 
survive GROUP BY | + +```python +# Example: 8000 tickets over 180 days = ~44/day average +# After weekly aggregation: ~310 records per week +N_TICKETS = 8000 +N_CUSTOMERS = 2500 # Each has ~3 tickets on average +N_ORDERS = 25000 # ~10 orders per customer average +``` diff --git a/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md b/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md new file mode 100644 index 00000000..0519bcce --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md @@ -0,0 +1,256 @@ +# Domain-Specific Guidance + +Realistic patterns for common data domains. All examples use Spark + Faker + Pandas UDFs. + +--- + +## Retail/E-commerce + +### Tables +``` +customers → orders → order_items → products +``` + +### Key Patterns + +| Pattern | Implementation | +|---------|----------------| +| Seasonal spikes | Q4 holiday shopping (1.5-2x volume in Nov-Dec) | +| Cart abandonment | ~70% of carts never complete | +| Loyalty tier progression | Free → Pro → Enterprise over time | +| Regional pricing | 5-15% price variation by region | + +### Realistic Distributions + +```python +@F.pandas_udf(DoubleType()) +def generate_order_amount(tiers: pd.Series) -> pd.Series: + """E-commerce order amounts by tier.""" + amounts = [] + for tier in tiers: + if tier == "Premium": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.9))) # ~$245 avg + elif tier == "Standard": + amounts.append(float(np.random.lognormal(mean=4.2, sigma=0.7))) # ~$67 avg + else: # Basic + amounts.append(float(np.random.lognormal(mean=3.5, sigma=0.6))) # ~$33 avg + return pd.Series(amounts) + +# Order status with cart abandonment +status_weights = [0.70, 0.08, 0.07, 0.10, 0.05] # abandoned, pending, processing, shipped, delivered +``` + +### Schema Example + +```python +# Products +products_df = spark.range(0, N_PRODUCTS).select( + F.concat(F.lit("PROD-"), F.lpad(F.col("id").cast("string"), 
5, "0")).alias("product_id"), + fake_product_name(F.col("id")).alias("name"), + F.array(F.lit("Electronics"), F.lit("Clothing"), F.lit("Home"), F.lit("Sports"))[ + (F.rand() * 4).cast("int") + ].alias("category"), + generate_price(F.col("id")).alias("base_price"), +) +``` + +--- + +## Support/CRM + +### Tables +``` +accounts → contacts → tickets → interactions +``` + +### Key Patterns + +| Pattern | Implementation | +|---------|----------------| +| Incident spikes | 3-5x volume during outages | +| Resolution by priority | Critical: 4h avg, Low: 72h avg | +| Enterprise contacts | 5-10 contacts per account vs 1-2 for SMB | +| CSAT correlation | Faster resolution = higher satisfaction | + +### Realistic Distributions + +```python +@F.pandas_udf("struct") +def generate_ticket_metrics(tiers: pd.Series) -> pd.DataFrame: + """Support ticket metrics with correlated attributes.""" + results = [] + for tier in tiers: + # Priority correlates with tier + if tier == 'Enterprise': + priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) + else: + priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3]) + + # Resolution time by priority (exponential distribution) + resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} + resolution_hours = np.random.exponential(scale=resolution_scale[priority]) + + # CSAT correlates with resolution time + if resolution_hours < 4: + csat = np.random.choice([4, 5], p=[0.3, 0.7]) + elif resolution_hours < 24: + csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3]) + else: + csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2]) + + results.append({"priority": priority, "resolution_hours": round(resolution_hours, 1), "csat": int(csat)}) + return pd.DataFrame(results) +``` + +### Schema Example + +```python +# Tickets with coherent attributes +tickets_df = ( + spark.range(0, N_TICKETS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("TKT-"), 
F.lpad(F.col("id").cast("string"), 6, "0")).alias("ticket_id"), + # FK to customer (weighted by tier) + ... + ) + .withColumn("metrics", generate_ticket_metrics(F.col("tier"))) + .select("*", "metrics.*") + .drop("metrics") +) +``` + +--- + +## Manufacturing/IoT + +### Tables +``` +equipment → sensors → readings → maintenance_orders +``` + +### Key Patterns + +| Pattern | Implementation | +|---------|----------------| +| Sensor lifecycle | Normal → degraded → failure progression | +| Anomaly precursors | Anomalies precede maintenance by 2-7 days | +| Seasonal production | Summer/winter production variations | +| Equipment age | Failure rate increases with age | + +### Realistic Distributions + +```python +@F.pandas_udf(DoubleType()) +def generate_sensor_reading(equipment_ages: pd.Series) -> pd.Series: + """Sensor readings with age-based degradation.""" + readings = [] + for age_days in equipment_ages: + # Base reading with age-based drift + base = 100.0 + drift = (age_days / 365) * 5 # 5 units drift per year + noise = np.random.normal(0, 2) + + # Occasional anomalies (more likely with age) + anomaly_prob = min(0.01 + (age_days / 365) * 0.02, 0.1) + if np.random.random() < anomaly_prob: + noise += np.random.choice([-1, 1]) * np.random.exponential(10) + + readings.append(base + drift + noise) + return pd.Series(readings) +``` + +### Schema Example + +```python +# Sensor readings time series +readings_df = ( + spark.range(0, N_READINGS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("READ-"), F.col("id").cast("string")).alias("reading_id"), + # FK to sensor + ((F.col("id") % N_SENSORS) + 1).alias("sensor_id"), + F.date_add(F.lit(START_DATE.date()), (F.col("id") / READINGS_PER_DAY).cast("int")).alias("timestamp"), + generate_sensor_reading(F.col("equipment_age")).alias("value"), + ) +) +``` + +--- + +## Financial Services + +### Tables +``` +accounts → transactions → payments → fraud_flags +``` + +### Key Patterns + +| Pattern | Implementation | 
+|---------|----------------| +| Transaction power law | 80% of volume from 20% of accounts | +| Fraud patterns | Unusual times, amounts, locations | +| Balance consistency | Transactions maintain positive balance | +| Regulatory compliance | No negative balances, valid amounts | + +### Realistic Distributions + +```python +@F.pandas_udf(DoubleType()) +def generate_transaction_amount(account_types: pd.Series) -> pd.Series: + """Transaction amounts following power law by account type.""" + amounts = [] + for acct_type in account_types: + if acct_type == "Corporate": + # Power law for corporate (few large transactions) + amount = (np.random.pareto(a=1.5) + 1) * 1000 + elif acct_type == "Premium": + amount = np.random.lognormal(mean=6, sigma=1.2) + else: # Standard + amount = np.random.lognormal(mean=4, sigma=0.8) + amounts.append(min(amount, 1_000_000)) # Cap at $1M + return pd.Series(amounts) + +@F.pandas_udf(BooleanType()) +def generate_fraud_flag(amounts: pd.Series, hours: pd.Series) -> pd.Series: + """Flag suspicious transactions based on amount and time.""" + flags = [] + for amount, hour in zip(amounts, hours): + # Higher fraud probability for: large amounts + unusual hours + base_prob = 0.001 + if amount > 5000: + base_prob *= 3 + if hour < 6 or hour > 22: + base_prob *= 2 + flags.append(np.random.random() < base_prob) + return pd.Series(flags) +``` + +### Schema Example + +```python +# Transactions with fraud indicators +transactions_df = ( + spark.range(0, N_TRANSACTIONS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("TXN-"), F.lpad(F.col("id").cast("string"), 10, "0")).alias("transaction_id"), + # FK to account + ... + generate_transaction_amount(F.col("account_type")).alias("amount"), + F.hour(F.col("timestamp")).alias("hour"), + ) + .withColumn("is_suspicious", generate_fraud_flag(F.col("amount"), F.col("hour"))) +) +``` + +--- + +## General Best Practices + +1. **Start with domain tables**: Define the core entities and relationships first +2. 
**Add domain-specific distributions**: Use realistic statistical patterns for your domain +3. **Include edge cases**: Every domain has edge cases (returns, cancellations, failures) +4. **Time-based patterns matter**: Most domains have daily/weekly/seasonal patterns +5. **Correlate attributes**: Attributes within a row should make business sense together + +**Note:** These are guidance patterns, not rigid schemas. Adapt to user's specific requirements. diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md new file mode 100644 index 00000000..c283a82c --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md @@ -0,0 +1,178 @@ +# Output Formats Guide + +Where and how to save generated synthetic data. + +## Create Infrastructure in Script + +Always create the schema and volume **inside the Python script** using `spark.sql()`. Do NOT make separate MCP SQL calls - it's much slower. + +```python +CATALOG = "" # MUST ask user - never default +SCHEMA = "" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Note: Assume catalog exists - do NOT create it +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +``` + +**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume. 
+ +--- + +## Format Comparison + +| Format | Use Case | Extension | Best For | +|--------|----------|-----------|----------| +| **Parquet** | SDP pipeline input | `.parquet` or none | Best compression, query performance | +| **JSON** | Log-style ingestion | `.json` | Simulating external data feeds | +| **CSV** | Legacy systems | `.csv` | Human-readable, spreadsheet import | +| **Delta Table** | Default - Direct analytics | N/A | Treat as bronze tables for ETL or skip ETL and query immediately | + +--- + +## Parquet to Volumes (Default) + +Standard format for SDP pipeline input. Best compression and query performance. + +```python +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Save as parquet files (directory format) +customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") +orders_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") +tickets_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") +``` + +**Notes:** +- Files may not use a file extension or might end with `.parquet` +- Spark writes as a directory with part files +- Use `mode("overwrite")` for one-time generation +- Use `mode("append")` for incremental/scheduled jobs + +--- + +## JSON to Volumes + +Common pattern for simulating SDP ingestion from external data feeds (logs, webhooks). + +```python +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Save as JSON files +customers_df.write.mode("overwrite").json(f"{VOLUME_PATH}/customers_json") +orders_df.write.mode("overwrite").json(f"{VOLUME_PATH}/orders_json") +``` + +**When to use:** +- Simulating log ingestion +- External API data feeds +- User explicitly requests JSON format + +--- + +## CSV to Volumes + +Common pattern for simulating data from legacy systems or spreadsheet exports. 
+ +```python +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Save as CSV with headers +customers_df.write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/customers_csv") +orders_df.write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/orders_csv") +``` + +**Options:** +```python +# Full options for CSV +df.write \ + .mode("overwrite") \ + .option("header", "true") \ + .option("delimiter", ",") \ + .option("quote", '"') \ + .option("escape", "\\") \ + .csv(f"{VOLUME_PATH}/data_csv") +``` + +**When to use:** +- Legacy system integration +- Human-readable data +- Spreadsheet import testing + +--- + +## Delta Table (Unity Catalog) + +Write directly to managed Delta tables when data is ready for analytics consumption (skip SDP pipeline). + +```python +# Ensure schema exists +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") + +# Save as managed Delta tables +customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers") +orders_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.orders") + +# With additional options +customers_df.write \ + .mode("overwrite") \ + .option("overwriteSchema", "true") \ + .saveAsTable(f"{CATALOG}.{SCHEMA}.customers") +``` + +**When to use:** +- User wants data ready to query immediately +- Skip the SDP bronze/silver/gold pipeline +- Direct SQL analytics + +--- + +## Write Modes + +| Mode | Behavior | Use Case | +|------|----------|----------| +| `overwrite` | Replace existing data | One-time generation, regeneration | +| `append` | Add to existing data | Incremental/scheduled jobs | +| `ignore` | Skip if exists | Idempotent generation | +| `error` | Fail if exists | Safety check | + +### Incremental Generation Pattern + +```python +WRITE_MODE = "append" # For scheduled jobs + +# Only generate new records since last run +from datetime import datetime, timedelta + +LAST_RUN = datetime.now() - timedelta(days=1) +END_DATE = datetime.now() + +# Generate only new data 
+new_orders_df = generate_orders(start_date=LAST_RUN, end_date=END_DATE) +new_orders_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders") +``` + +--- + +## Validation After Write + +After successful execution, validate the generated data: + +```python +# Read back and verify +customers_check = spark.read.parquet(f"{VOLUME_PATH}/customers") +orders_check = spark.read.parquet(f"{VOLUME_PATH}/orders") + +print(f"Customers: {customers_check.count():,} rows") +print(f"Orders: {orders_check.count():,} rows") + +# Verify distributions +customers_check.groupBy("tier").count().show() +orders_check.describe("amount").show() +``` + +Or use `get_volume_folder_details` MCP tool: +- `volume_path`: "my_catalog/my_schema/raw_data/customers" +- `format`: "parquet" +- `table_stat_level`: "SIMPLE" diff --git a/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md new file mode 100644 index 00000000..420b3500 --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md @@ -0,0 +1,324 @@ +# Troubleshooting Guide + +Common issues and solutions for synthetic data generation. + +## Environment Issues + +### ModuleNotFoundError: faker (or other library) + +**Problem:** Dependencies not available in execution environment. + +**Solutions by execution mode:** + +| Mode | Solution | +|------|----------| +| **DB Connect 16.4+** | Use `DatabricksEnv().withDependencies("faker", "pandas", ...)` | +| **Older DB Connect with Serverless** | Create job with `environments` parameter | +| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` | +| **Classic cluster** | Use Databricks CLI to install libraries. 
`databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` | + +```python +# For DB Connect 16.4+ +from databricks.connect import DatabricksSession, DatabricksEnv + +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") +spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() +``` + +### DatabricksEnv not found + +**Problem:** Using older databricks-connect version. + +**Solution:** Upgrade to 16.4+ or use job-based approach: + +```bash +# Upgrade (prefer uv, fall back to pip) +uv pip install "databricks-connect>=16.4,<17.4" +# or: pip install "databricks-connect>=16.4,<17.4" + +# Or use job with environments parameter instead +``` + +### serverless_compute_id error + +**Problem:** Missing serverless configuration. + +**Solution:** Add to `~/.databrickscfg`: + +```ini +[DEFAULT] +host = https://your-workspace.cloud.databricks.com/ +serverless_compute_id = auto +auth_type = databricks-cli +``` + +--- + +## Execution Issues + +### CRITICAL: cache() and persist() NOT supported on serverless + +**Problem:** Using `.cache()` or `.persist()` on serverless compute fails with: +``` +AnalysisException: [NOT_SUPPORTED_WITH_SERVERLESS] PERSIST TABLE is not supported on serverless compute. +``` + +**Why this happens:** Serverless compute does not support caching DataFrames in memory. This is a fundamental limitation of the serverless architecture. + +**Solution:** Write master tables to Delta first, then read them back for FK joins: + +```python +# BAD - will fail on serverless +customers_df = spark.range(0, N_CUSTOMERS)... +customers_df.cache() # ❌ FAILS: "PERSIST TABLE is not supported on serverless compute" + +# GOOD - write to Delta, then read back +customers_df = spark.range(0, N_CUSTOMERS)... 
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers") +customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers") # ✓ Read from Delta +``` + +**Best practice for referential integrity:** +1. Generate master table (e.g., customers) +2. Write to Delta table +3. Read back for FK lookup joins +4. Generate child tables (e.g., orders, tickets) with valid FKs +5. Write child tables to Delta + +--- + +### Serverless job fails to start + +**Possible causes:** +1. Workspace doesn't have serverless enabled +2. Unity Catalog permissions missing +3. Invalid environment configuration + +**Solutions:** +```python +# Verify serverless is available +# Try creating a simple job first to test + +# Check Unity Catalog permissions +spark.sql("SELECT current_catalog(), current_schema()") +``` + +### Classic cluster startup slow (3-8 minutes) + +**Problem:** Clusters take time to start. + +**Solution:** Switch to serverless: + +```python +# Instead of: +# spark = DatabricksSession.builder.clusterId("xxx").getOrCreate() + +# Use: +spark = DatabricksSession.builder.serverless(True).getOrCreate() +``` + +### "Either base environment or version must be provided" + +**Problem:** Missing `client` in job environment spec. + +**Solution:** Add `"client": "4"` to the spec: + +```python +{ + "environments": [{ + "environment_key": "datagen_env", + "spec": { + "client": "4", # Required! + "dependencies": ["faker", "numpy", "pandas"] + } + }] +} +``` + +--- + +## Data Generation Issues + +### AttributeError: 'function' object has no attribute 'partitionBy' + +**Problem:** Using `F.window` instead of `Window` for analytical window functions. 
+ +```python +# WRONG - F.window is for time-based tumbling/sliding windows (streaming) +window_spec = F.window.partitionBy("account_id").orderBy("contact_id") +# Error: AttributeError: 'function' object has no attribute 'partitionBy' + +# CORRECT - Window is for analytical window specifications +from pyspark.sql.window import Window +window_spec = Window.partitionBy("account_id").orderBy("contact_id") +``` + +**When to use Window:** For analytical functions like `row_number()`, `rank()`, `lead()`, `lag()`: + +```python +from pyspark.sql.window import Window + +# Mark first contact per account as primary +window_spec = Window.partitionBy("account_id").orderBy("contact_id") +contacts_df = contacts_df.withColumn( + "is_primary", + F.row_number().over(window_spec) == 1 +) +``` + +--- + +### Faker UDF is slow + +**Problem:** Single-row UDFs don't parallelize well. + +**Solution:** Use `pandas_udf` for batch processing: + +```python +# SLOW - scalar UDF +@F.udf(returnType=StringType()) +def slow_fake_name(): + return Faker().name() + +# FAST - pandas UDF (batch processing) +@F.pandas_udf(StringType()) +def fast_fake_name(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.name() for _ in range(len(ids))]) +``` + +### Out of memory with large data + +**Problem:** Not enough partitions for data size. + +**Solution:** Increase partitions: + +```python +# For large datasets (1M+ rows) +customers_df = spark.range(0, N_CUSTOMERS, numPartitions=64) # Increase from default +``` + +| Data Size | Recommended Partitions | +|-----------|----------------------| +| < 100K | 8 | +| 100K - 500K | 16 | +| 500K - 1M | 32 | +| 1M+ | 64+ | + +### Context corrupted on classic cluster + +**Problem:** Stale execution context. 
+
+**Solution:** Create fresh context (omit context_id), reinstall libraries:
+
+```python
+# Don't reuse context_id if you see strange errors
+# Let it create a new context
+```
+
+### Referential integrity violations
+
+**Problem:** Foreign keys reference non-existent parent records.
+
+**Solution:** Write master table to Delta first, then read back for FK joins:
+
+```python
+# 1. Generate and WRITE master table (do NOT use cache with serverless!)
+customers_df = spark.range(0, N_CUSTOMERS)...
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+
+# 2. Read back for FK lookups
+customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers").select("customer_id", "tier")
+
+# 3. Generate child table with valid FKs (derive the FK, then join on it)
+orders_df = (
+    spark.range(0, N_ORDERS)
+    .withColumn(
+        "customer_id",
+        F.concat(F.lit("CUST-"), F.lpad((F.abs(F.hash("id")) % N_CUSTOMERS).cast("string"), 5, "0")),
+    )
+    .join(customer_lookup, on="customer_id", how="left")
+)
+```
+
+> **WARNING:** Do NOT use `.cache()` or `.persist()` with serverless compute. See the dedicated section above.
+
+---
+
+## Data Quality Issues
+
+### Uniform distributions (unrealistic)
+
+**Problem:** All customers have similar order counts, amounts are evenly distributed.
+
+**Solution:** Use non-linear distributions:
+
+```python
+# BAD - uniform
+amounts = np.random.uniform(10, 1000, N)
+
+# GOOD - log-normal (realistic)
+amounts = np.random.lognormal(mean=5, sigma=0.8, size=N)
+```
+
+### Missing time-based patterns
+
+**Problem:** Data doesn't reflect weekday/weekend or seasonal patterns.
+
+**Solution:** Add multipliers:
+
+```python
+import holidays
+
+US_HOLIDAYS = holidays.US(years=[2024, 2025])
+
+def get_multiplier(date):
+    mult = 1.0
+    if date.weekday() >= 5:  # Weekend
+        mult *= 0.6
+    if date in US_HOLIDAYS:
+        mult *= 0.3
+    return mult
+```
+
+### Incoherent row attributes
+
+**Problem:** Enterprise customer has low-value orders, critical ticket has slow resolution.
+ +**Solution:** Correlate attributes: + +```python +# Priority based on tier +if tier == 'Enterprise': + priority = np.random.choice(['Critical', 'High'], p=[0.4, 0.6]) +else: + priority = np.random.choice(['Medium', 'Low'], p=[0.6, 0.4]) + +# Resolution based on priority +resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} +resolution_hours = np.random.exponential(scale=resolution_scale[priority]) +``` + +--- + +## Validation Steps + +After generation, verify your data: + +```python +# 1. Check row counts +print(f"Customers: {customers_df.count():,}") +print(f"Orders: {orders_df.count():,}") + +# 2. Verify distributions +customers_df.groupBy("tier").count().show() +orders_df.describe("amount").show() + +# 3. Check referential integrity +orphans = orders_df.join( + customers_df, + orders_df.customer_id == customers_df.customer_id, + "left_anti" +) +print(f"Orphan orders: {orphans.count()}") + +# 4. Verify date range +orders_df.select(F.min("order_date"), F.max("order_date")).show() +``` diff --git a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py new file mode 100644 index 00000000..b9f953fa --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py @@ -0,0 +1,390 @@ +"""Generate synthetic data using Spark + Faker + Pandas UDFs. 
+ +This is the recommended approach for ALL data generation tasks: +- Scales from thousands to millions of rows +- Parallel execution via Spark +- Direct write to Unity Catalog +- Works with serverless and classic compute + +Auto-detects environment and uses: +- DatabricksEnv with managed dependencies if databricks-connect >= 16.4 (local) +- Standard session if running on Databricks Runtime or older databricks-connect +""" +import sys +import os +from pyspark.sql import functions as F +from pyspark.sql.window import Window +from pyspark.sql.types import StringType, DoubleType, StructType, StructField, IntegerType +import numpy as np +import pandas as pd +from datetime import datetime, timedelta + +# ============================================================================= +# CONFIGURATION +# ============================================================================= +# Compute - Serverless strongly recommended +USE_SERVERLESS = True # Set to False and provide CLUSTER_ID for classic compute +CLUSTER_ID = None # Only used if USE_SERVERLESS=False + +# Storage - Update these for your environment +CATALOG = "" # REQUIRED: replace with your catalog +SCHEMA = "" # REQUIRED: replace with your schema +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Data sizes +N_CUSTOMERS = 10_000 +N_ORDERS = 50_000 +PARTITIONS = 16 # Adjust: 8 for <100K, 32 for 1M+ + +# Date range - last 6 months from today +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) + +# Write mode - "overwrite" for one-time, "append" for incremental +WRITE_MODE = "overwrite" + +# Bad data injection for testing data quality rules +INJECT_BAD_DATA = False # Set to True to inject bad data +BAD_DATA_CONFIG = { + "null_rate": 0.02, # 2% nulls in required fields + "outlier_rate": 0.01, # 1% impossible values + "orphan_fk_rate": 0.01, # 1% orphan foreign keys +} + +# Reproducibility +SEED = 42 + +# Tier distribution: Free 60%, Pro 
30%, Enterprise 10% +TIER_PROBS = [0.6, 0.3, 0.1] + +# Region distribution +REGION_PROBS = [0.4, 0.25, 0.2, 0.15] + +# ============================================================================= +# ENVIRONMENT DETECTION AND SESSION CREATION +# ============================================================================= + +def is_databricks_runtime(): + """Check if running on Databricks Runtime vs locally.""" + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +def get_databricks_connect_version(): + """Get databricks-connect version as (major, minor) tuple or None.""" + try: + import importlib.metadata + version_str = importlib.metadata.version('databricks-connect') + parts = version_str.split('.') + return (int(parts[0]), int(parts[1])) + except Exception: + return None + +# Detect environment +on_runtime = is_databricks_runtime() +db_version = get_databricks_connect_version() + +print("=" * 80) +print("ENVIRONMENT DETECTION") +print("=" * 80) +print(f"Running on Databricks Runtime: {on_runtime}") +if db_version: + print(f"databricks-connect version: {db_version[0]}.{db_version[1]}") +else: + print("databricks-connect: not available") + +# Use DatabricksEnv with managed dependencies if: +# - Running locally (not on Databricks Runtime) +# - databricks-connect >= 16.4 +use_managed_deps = (not on_runtime) and db_version and db_version >= (16, 4) + +if use_managed_deps: + print("Using DatabricksEnv with managed dependencies") + print("=" * 80) + from databricks.connect import DatabricksSession, DatabricksEnv + + env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") + + if USE_SERVERLESS: + spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + print("Connected to serverless compute with managed dependencies!") + else: + if not CLUSTER_ID: + raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") + spark = DatabricksSession.builder.withEnvironment(env).clusterId(CLUSTER_ID).getOrCreate() + 
print(f"Connected to cluster with managed dependencies!") +else: + print("Using standard session (dependencies must be pre-installed)") + print("=" * 80) + + # Check that UDF dependencies are available + print("\nChecking UDF dependencies...") + missing_deps = [] + + try: + from faker import Faker + print(" faker: OK") + except ImportError: + missing_deps.append("faker") + print(" faker: MISSING") + + try: + import pandas as pd + print(" pandas: OK") + except ImportError: + missing_deps.append("pandas") + print(" pandas: MISSING") + + if missing_deps: + print("\n" + "=" * 80) + print("ERROR: Missing dependencies for UDFs") + print("=" * 80) + print(f"Missing: {', '.join(missing_deps)}") + if on_runtime: + print('\nSolution: Install libraries via Databricks CLI:') + print(' databricks libraries install --json \'{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}\'') + else: + print("\nSolution: Upgrade to databricks-connect >= 16.4 for managed deps") + print(" Or create a job with environment settings") + print("=" * 80) + sys.exit(1) + + print("\nAll dependencies available") + print("=" * 80) + + from databricks.connect import DatabricksSession + + if USE_SERVERLESS: + spark = DatabricksSession.builder.serverless(True).getOrCreate() + print("Connected to serverless compute") + else: + if not CLUSTER_ID: + raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") + spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate() + print(f"Connected to cluster ") + +# Import Faker for UDF definitions +from faker import Faker + +# ============================================================================= +# DEFINE PANDAS UDFs FOR FAKER DATA +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + """Generate realistic person names.""" + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.name() 
for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_company(ids: pd.Series) -> pd.Series: + """Generate realistic company names.""" + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.company() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_address(ids: pd.Series) -> pd.Series: + """Generate realistic addresses.""" + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.address().replace('\n', ', ') for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_email(names: pd.Series) -> pd.Series: + """Generate email based on name.""" + emails = [] + for name in names: + if name: + domain = name.lower().replace(" ", ".").replace(",", "")[:20] + emails.append(f"{domain}@example.com") + else: + emails.append("unknown@example.com") + return pd.Series(emails) + +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + """Generate amount based on tier using log-normal distribution.""" + np.random.seed(SEED) + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) # ~$1800 avg + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) # ~$245 avg + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) # ~$55 avg + return pd.Series(amounts) + +# ============================================================================= +# CREATE INFRASTRUCTURE +# ============================================================================= +print("\nCreating infrastructure...") +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +print(f"Infrastructure ready: {VOLUME_PATH}") + +# ============================================================================= +# GENERATE CUSTOMERS (Master Table) +# ============================================================================= +print(f"\nGenerating 
{N_CUSTOMERS:,} customers...") + +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_name(F.col("id")).alias("name"), + fake_company(F.col("id")).alias("company"), + fake_address(F.col("id")).alias("address"), + # Tier distribution: Free 60%, Pro 30%, Enterprise 10% + F.when(F.rand(SEED) < TIER_PROBS[0], "Free") + .when(F.rand(SEED) < TIER_PROBS[0] + TIER_PROBS[1], "Pro") + .otherwise("Enterprise").alias("tier"), + # Region distribution + F.when(F.rand(SEED) < REGION_PROBS[0], "North") + .when(F.rand(SEED) < REGION_PROBS[0] + REGION_PROBS[1], "South") + .when(F.rand(SEED) < REGION_PROBS[0] + REGION_PROBS[1] + REGION_PROBS[2], "East") + .otherwise("West").alias("region"), + # Created date (within last 2 years before start date) + F.date_sub(F.lit(START_DATE.date()), (F.rand(SEED) * 730).cast("int")).alias("created_at"), + ) +) + +# Add tier-based ARR and email +customers_df = ( + customers_df + .withColumn("arr", F.round(generate_lognormal_amount(F.col("tier")), 2)) + .withColumn("email", fake_email(F.col("name"))) +) + +# Save customers +customers_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/customers") +print(f" Saved customers to {VOLUME_PATH}/customers") + +# Show tier distribution +print("\n Tier distribution:") +customers_df.groupBy("tier").count().orderBy("tier").show() + +# ============================================================================= +# GENERATE ORDERS (Child Table with Referential Integrity) +# ============================================================================= +print(f"\nGenerating {N_ORDERS:,} orders with referential integrity...") + +# Write customer lookup to temp Delta table (no .cache() on serverless!) 
+customers_tmp_table = f"{CATALOG}.{SCHEMA}._tmp_customers_lookup" +customers_df.select("customer_id", "tier").write.mode("overwrite").saveAsTable(customers_tmp_table) +customer_lookup = spark.table(customers_tmp_table) + +# Generate orders base +orders_df = ( + spark.range(0, N_ORDERS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("order_id"), + # Generate customer_idx for FK join (hash-based distribution) + (F.abs(F.hash(F.col("id"), F.lit(SEED))) % N_CUSTOMERS).alias("customer_idx"), + # Order status + F.when(F.rand(SEED) < 0.65, "delivered") + .when(F.rand(SEED) < 0.80, "shipped") + .when(F.rand(SEED) < 0.90, "processing") + .when(F.rand(SEED) < 0.95, "pending") + .otherwise("cancelled").alias("status"), + # Order date within date range + F.date_add(F.lit(START_DATE.date()), (F.rand(SEED) * 180).cast("int")).alias("order_date"), + ) +) + +# Add customer_idx to lookup for join +customer_lookup_with_idx = customer_lookup.withColumn( + "customer_idx", + (F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1).cast("int") +) + +# Join to get customer_id and tier as foreign key +orders_with_fk = ( + orders_df + .join(customer_lookup_with_idx, on="customer_idx", how="left") + .drop("customer_idx") +) + +# Add tier-based amount +orders_with_fk = orders_with_fk.withColumn( + "amount", + F.round(generate_lognormal_amount(F.col("tier")), 2) +) + +# ============================================================================= +# INJECT BAD DATA (OPTIONAL) +# ============================================================================= +if INJECT_BAD_DATA: + print("\nInjecting bad data for quality testing...") + + # Calculate counts + null_count = int(N_ORDERS * BAD_DATA_CONFIG["null_rate"]) + outlier_count = int(N_ORDERS * BAD_DATA_CONFIG["outlier_rate"]) + orphan_count = int(N_ORDERS * BAD_DATA_CONFIG["orphan_fk_rate"]) + + # Add bad data flags + orders_with_fk = 
orders_with_fk.withColumn( + "row_num", + F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) + ) + + # Inject nulls in customer_id for first null_count rows + orders_with_fk = orders_with_fk.withColumn( + "customer_id", + F.when(F.col("row_num") <= null_count, None).otherwise(F.col("customer_id")) + ) + + # Inject negative amounts for next outlier_count rows + orders_with_fk = orders_with_fk.withColumn( + "amount", + F.when( + (F.col("row_num") > null_count) & (F.col("row_num") <= null_count + outlier_count), + F.lit(-999.99) + ).otherwise(F.col("amount")) + ) + + # Inject orphan FKs for next orphan_count rows + orders_with_fk = orders_with_fk.withColumn( + "customer_id", + F.when( + (F.col("row_num") > null_count + outlier_count) & + (F.col("row_num") <= null_count + outlier_count + orphan_count), + F.lit("CUST-NONEXISTENT") + ).otherwise(F.col("customer_id")) + ) + + orders_with_fk = orders_with_fk.drop("row_num") + + print(f" Injected {null_count} null customer_ids") + print(f" Injected {outlier_count} negative amounts") + print(f" Injected {orphan_count} orphan foreign keys") + +# Drop tier column (not needed in final output) +orders_final = orders_with_fk.drop("tier") + +# Save orders +orders_final.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders") +print(f" Saved orders to {VOLUME_PATH}/orders") + +# Show status distribution +print("\n Status distribution:") +orders_final.groupBy("status").count().orderBy("status").show() + +# ============================================================================= +# CLEANUP AND SUMMARY +# ============================================================================= +spark.sql(f"DROP TABLE IF EXISTS {customers_tmp_table}") + +print("\n" + "=" * 80) +print("GENERATION COMPLETE") +print("=" * 80) +print(f"Catalog: {CATALOG}") +print(f"Schema: {SCHEMA}") +print(f"Volume: {VOLUME_PATH}") +print(f"\nGenerated data:") +print(f" - customers: {N_CUSTOMERS:,} rows") +print(f" - orders: {N_ORDERS:,} rows") 
+if INJECT_BAD_DATA: + print(f" - Bad data injected: nulls, outliers, orphan FKs") +print(f"\nDate range: {START_DATE.date()} to {END_DATE.date()}") +print("=" * 80) diff --git a/databricks-skills/databricks-synthetic-data-generation/SKILL.md b/databricks-skills/databricks-synthetic-data-generation/SKILL.md deleted file mode 100644 index ce2a17cf..00000000 --- a/databricks-skills/databricks-synthetic-data-generation/SKILL.md +++ /dev/null @@ -1,660 +0,0 @@ ---- -name: databricks-synthetic-data-generation -description: "Generate realistic synthetic data using Faker and Spark, with non-linear distributions, integrity constraints, and save to Databricks. Use when creating test data, demo datasets, or synthetic tables." ---- - -# Synthetic Data Generation - -Generate realistic, story-driven synthetic data for Databricks using Python with Faker and Spark. - -## Common Libraries - -These libraries are useful for generating realistic synthetic data: - -- **faker**: Generates realistic names, addresses, emails, companies, dates, etc. -- **holidays**: Provides country-specific holiday calendars for realistic date patterns - -These are typically NOT pre-installed on Databricks. Install them using `execute_databricks_command` tool: -- `code`: "%pip install faker holidays" - -Save the returned `cluster_id` and `context_id` for subsequent calls. - -## Workflow - -1. **Write Python code to a local file** in the project (e.g., `scripts/generate_data.py`) -2. **Execute on Databricks** using the `run_python_file_on_databricks` MCP tool -3. **If execution fails**: Edit the local file to fix the error, then re-execute -4. **Reuse the context** for follow-up executions by passing the returned `cluster_id` and `context_id` - -**Always work with local files first, then execute.** This makes debugging easier - you can see and edit the code. - -### Context Reuse Pattern - -The first execution auto-selects a running cluster and creates an execution context. 
**Reuse this context for follow-up calls** - it's much faster (~1s vs ~15s) and shares variables/imports: - -**First execution** - use `run_python_file_on_databricks` tool: -- `file_path`: "scripts/generate_data.py" - -Returns: `{ success, output, error, cluster_id, context_id, ... }` - -Save `cluster_id` and `context_id` for follow-up calls. - -**If execution fails:** -1. Read the error from the result -2. Edit the local Python file to fix the issue -3. Re-execute with same context using `run_python_file_on_databricks` tool: - - `file_path`: "scripts/generate_data.py" - - `cluster_id`: "" - - `context_id`: "" - -**Follow-up executions** reuse the context (faster, shares state): -- `file_path`: "scripts/validate_data.py" -- `cluster_id`: "" -- `context_id`: "" - -### Handling Failures - -When execution fails: -1. Read the error from the result -2. **Edit the local Python file** to fix the issue -3. Re-execute using the same `cluster_id` and `context_id` (faster, keeps installed libraries) -4. If the context is corrupted, omit `context_id` to create a fresh one - -### Installing Libraries - -Databricks provides Spark, pandas, numpy, and common data libraries by default. **Only install a library if you get an import error.** - -Use `execute_databricks_command` tool: -- `code`: "%pip install faker" -- `cluster_id`: "" -- `context_id`: "" - -The library is immediately available in the same context. - -**Note:** Keeping the same `context_id` means installed libraries persist across calls. - -## Storage Destination - -### Ask for Schema Name - -By default, use the `ai_dev_kit` catalog. Ask the user which schema to use: - -> "I'll save the data to `ai_dev_kit.`. What schema name would you like to use? (You can also specify a different catalog if needed.)" - -If the user provides just a schema name, use `ai_dev_kit.{schema}`. If they provide `catalog.schema`, use that instead. 
- -### Create Infrastructure in the Script - -Always create the catalog, schema, and volume **inside the Python script** using `spark.sql()`. Do NOT make separate MCP SQL calls - it's much slower. - -The `spark` variable is available by default on Databricks clusters. - -```python -# ============================================================================= -# CREATE INFRASTRUCTURE (inside the Python script) -# ============================================================================= -spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}") -spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") -spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") -``` - -### Save to Volume as Raw Data (Never Tables) - -**Always save data to a Volume as parquet files, never directly to tables** (unless the user explicitly requests tables). This is the input for the downstream Spark Declarative Pipeline (SDP) that will handle bronze/silver/gold layers. - -```python -VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" - -# Save as parquet files (raw data) -spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") -spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") -spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") -``` - -## Raw Data Only - No Pre-Aggregated Fields (Unless Instructed Otherwise) - -**By default, generate raw, transactional data only.** Do not create fields that represent sums, totals, averages, or counts. 
- -- One row = one event/transaction/record -- No columns like `total_orders`, `sum_revenue`, `avg_csat`, `order_count` -- Each row has its own individual values, not rollups - -**Why?** A Spark Declarative Pipeline (SDP) will typically be built after data generation to: -- Ingest raw data (bronze layer) -- Clean and validate (silver layer) -- Aggregate and compute metrics (gold layer) - -The synthetic data is the **source** for this pipeline. Aggregations happen downstream. - -**Note:** If the user specifically requests aggregated fields or summary tables, follow their instructions. - -```python -# GOOD - Raw transactional data -# Customer table: one row per customer, no aggregated fields -customers_data.append({ - "customer_id": cid, - "name": fake.company(), - "tier": "Enterprise", - "region": "North", -}) - -# Order table: one row per order -orders_data.append({ - "order_id": f"ORD-{i:06d}", - "customer_id": cid, - "amount": 150.00, # This order's amount - "order_date": "2024-10-15", -}) - -# BAD - Don't add pre-aggregated fields -# customers_data.append({ -# "customer_id": cid, -# "total_orders": 47, # NO - this is an aggregation -# "total_revenue": 12500.00, # NO - this is a sum -# "avg_order_value": 265.95, # NO - this is an average -# }) -``` - -## Temporality and Data Volume - -### Date Range: Last 6 Months from Today - -**Always generate data for the last ~6 months ending at the current date.** This ensures: -- Data feels current and relevant for demos -- Recent patterns are visible in dashboards -- Downstream aggregations (daily/weekly/monthly) have enough history - -```python -from datetime import datetime, timedelta - -# Dynamic date range - last 6 months from today -END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) -START_DATE = END_DATE - timedelta(days=180) - -# Place special events within this range (e.g., incident 3 weeks ago) -INCIDENT_END = END_DATE - timedelta(days=21) -INCIDENT_START = INCIDENT_END - 
timedelta(days=10) -``` - -### Data Volume for Aggregation - -Generate enough data so patterns remain visible after downstream aggregation (SDP pipelines often aggregate by day/week/region/category). Rules of thumb: - -| Grain | Minimum Records | Rationale | -|-------|-----------------|-----------| -| Daily time series | 50-100/day | See trends after weekly rollup | -| Per category | 500+ per category | Statistical significance | -| Per customer | 5-20 events/customer | Enough for customer-level analysis | -| Total rows | 10K-50K minimum | Patterns survive GROUP BY | - -```python -# Example: 8000 tickets over 180 days = ~44/day average -# After weekly aggregation: ~310 records per week per category -# After monthly by region: still enough to see patterns -N_TICKETS = 8000 -N_CUSTOMERS = 2500 # Each has ~3 tickets on average -N_ORDERS = 25000 # ~10 orders per customer average -``` - -## Script Structure - -Always structure scripts with configuration variables at the top: - -```python -"""Generate synthetic data for [use case].""" -import numpy as np -import pandas as pd -from datetime import datetime, timedelta -from faker import Faker -import holidays -from pyspark.sql import SparkSession - -# ============================================================================= -# CONFIGURATION - Edit these values -# ============================================================================= -CATALOG = "my_catalog" -SCHEMA = "my_schema" -VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" - -# Data sizes - enough for aggregation patterns to survive -N_CUSTOMERS = 2500 -N_ORDERS = 25000 -N_TICKETS = 8000 - -# Date range - last 6 months from today -END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) -START_DATE = END_DATE - timedelta(days=180) - -# Special events (within the date range) -INCIDENT_END = END_DATE - timedelta(days=21) -INCIDENT_START = INCIDENT_END - timedelta(days=10) - -# Holiday calendar for realistic patterns -US_HOLIDAYS = 
holidays.US(years=[START_DATE.year, END_DATE.year]) - -# Reproducibility -SEED = 42 - -# ============================================================================= -# SETUP -# ============================================================================= -np.random.seed(SEED) -Faker.seed(SEED) -fake = Faker() -spark = SparkSession.builder.getOrCreate() - -# ... rest of script -``` - -## Key Principles - -### 1. Use Pandas for Generation, Spark for Saving - -Generate data with pandas (faster, easier), convert to Spark for saving: - -```python -import pandas as pd - -# Generate with pandas -customers_pdf = pd.DataFrame({ - "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], - "name": [fake.company() for _ in range(N_CUSTOMERS)], - "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]), - "region": np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]), - "created_at": [fake.date_between(start_date='-2y', end_date='-6m') for _ in range(N_CUSTOMERS)], -}) - -# Convert to Spark and save -customers_df = spark.createDataFrame(customers_pdf) -customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") -``` - -### 2. Iterate on DataFrames for Referential Integrity - -Generate master tables first, then iterate on them to create related tables with matching IDs: - -```python -# 1. Generate customers (master table) -customers_pdf = pd.DataFrame({ - "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], - "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]), - # ... -}) - -# 2. 
Create lookup for foreign key generation -customer_ids = customers_pdf["customer_id"].tolist() -customer_tier_map = dict(zip(customers_pdf["customer_id"], customers_pdf["tier"])) - -# Weight by tier - Enterprise customers generate more orders -tier_weights = customers_pdf["tier"].map({'Enterprise': 5.0, 'Pro': 2.0, 'Free': 1.0}) -customer_weights = (tier_weights / tier_weights.sum()).tolist() - -# 3. Generate orders with valid foreign keys and tier-based logic -orders_data = [] -for i in range(N_ORDERS): - cid = np.random.choice(customer_ids, p=customer_weights) - tier = customer_tier_map[cid] - - # Amount depends on tier - if tier == 'Enterprise': - amount = np.random.lognormal(7, 0.8) - elif tier == 'Pro': - amount = np.random.lognormal(5, 0.7) - else: - amount = np.random.lognormal(3.5, 0.6) - - orders_data.append({ - "order_id": f"ORD-{i:06d}", - "customer_id": cid, - "amount": round(amount, 2), - "order_date": fake.date_between(start_date=START_DATE, end_date=END_DATE), - }) - -orders_pdf = pd.DataFrame(orders_data) - -# 4. Generate tickets that reference both customers and orders -order_ids = orders_pdf["order_id"].tolist() -tickets_data = [] -for i in range(N_TICKETS): - cid = np.random.choice(customer_ids, p=customer_weights) - oid = np.random.choice(order_ids) # Or None for general inquiry - - tickets_data.append({ - "ticket_id": f"TKT-{i:06d}", - "customer_id": cid, - "order_id": oid if np.random.random() > 0.3 else None, - # ... - }) - -tickets_pdf = pd.DataFrame(tickets_data) -``` - -### 3. 
Non-Linear Distributions - -**Never use uniform distributions** - real data is rarely uniform: - -```python -# BAD - Uniform (unrealistic) -prices = np.random.uniform(10, 1000, size=N_ORDERS) - -# GOOD - Log-normal (realistic for prices, salaries, order amounts) -prices = np.random.lognormal(mean=4.5, sigma=0.8, size=N_ORDERS) - -# GOOD - Pareto/power law (popularity, wealth, page views) -popularity = (np.random.pareto(a=2.5, size=N_PRODUCTS) + 1) * 10 - -# GOOD - Exponential (time between events, resolution time) -resolution_hours = np.random.exponential(scale=24, size=N_TICKETS) - -# GOOD - Weighted categorical -regions = np.random.choice( - ['North', 'South', 'East', 'West'], - size=N_CUSTOMERS, - p=[0.40, 0.25, 0.20, 0.15] -) -``` - -### 4. Time-Based Patterns - -Add weekday/weekend effects, holidays, seasonality, and event spikes: - -```python -import holidays - -# Load holiday calendar -US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) - -def get_daily_multiplier(date): - """Calculate volume multiplier for a given date.""" - multiplier = 1.0 - - # Weekend drop - if date.weekday() >= 5: - multiplier *= 0.6 - - # Holiday drop (even lower than weekends) - if date in US_HOLIDAYS: - multiplier *= 0.3 - - # Q4 seasonality (higher in Oct-Dec) - multiplier *= 1 + 0.15 * (date.month - 6) / 6 - - # Incident spike - if INCIDENT_START <= date <= INCIDENT_END: - multiplier *= 3.0 - - # Random noise - multiplier *= np.random.normal(1, 0.1) - - return max(0.1, multiplier) - -# Distribute tickets across dates with realistic patterns -date_range = pd.date_range(START_DATE, END_DATE, freq='D') -daily_volumes = [int(BASE_DAILY_TICKETS * get_daily_multiplier(d)) for d in date_range] -``` - -### 5. 
Row Coherence - -Attributes within a row should correlate logically: - -```python -def generate_ticket(customer_id, tier, date): - """Generate a coherent ticket where attributes correlate.""" - - # Priority correlates with tier - if tier == 'Enterprise': - priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) - else: - priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3]) - - # Resolution time correlates with priority - resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} - resolution_hours = np.random.exponential(scale=resolution_scale[priority]) - - # CSAT correlates with resolution time - if resolution_hours < 4: - csat = np.random.choice([4, 5], p=[0.3, 0.7]) - elif resolution_hours < 24: - csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3]) - else: - csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2]) - - return { - "customer_id": customer_id, - "priority": priority, - "resolution_hours": round(resolution_hours, 1), - "csat_score": csat, - "created_at": date, - } -``` - -## Complete Example - -Save as `scripts/generate_data.py`: - -```python -"""Generate synthetic customer, order, and ticket data.""" -import numpy as np -import pandas as pd -from datetime import datetime, timedelta -from faker import Faker -import holidays -from pyspark.sql import SparkSession - -# ============================================================================= -# CONFIGURATION -# ============================================================================= -CATALOG = "my_catalog" -SCHEMA = "my_schema" -VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" - -N_CUSTOMERS = 2500 -N_ORDERS = 25000 -N_TICKETS = 8000 - -# Date range - last 6 months from today -END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) -START_DATE = END_DATE - timedelta(days=180) - -# Special events (within the date range) -INCIDENT_END = END_DATE - timedelta(days=21) -INCIDENT_START = 
INCIDENT_END - timedelta(days=10) - -# Holiday calendar -US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) - -SEED = 42 - -# ============================================================================= -# SETUP -# ============================================================================= -np.random.seed(SEED) -Faker.seed(SEED) -fake = Faker() -spark = SparkSession.builder.getOrCreate() - -# ============================================================================= -# CREATE INFRASTRUCTURE -# ============================================================================= -print(f"Creating catalog/schema/volume if needed...") -spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}") -spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") -spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") - -print(f"Generating: {N_CUSTOMERS:,} customers, {N_ORDERS:,} orders, {N_TICKETS:,} tickets") - -# ============================================================================= -# 1. 
CUSTOMERS (Master Table) -# ============================================================================= -print("Generating customers...") - -customers_pdf = pd.DataFrame({ - "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], - "name": [fake.company() for _ in range(N_CUSTOMERS)], - "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]), - "region": np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]), -}) - -# ARR correlates with tier -customers_pdf["arr"] = customers_pdf["tier"].apply( - lambda t: round(np.random.lognormal(11, 0.5), 2) if t == 'Enterprise' - else round(np.random.lognormal(8, 0.6), 2) if t == 'Pro' else 0 -) - -# Lookups for foreign keys -customer_ids = customers_pdf["customer_id"].tolist() -customer_tier_map = dict(zip(customers_pdf["customer_id"], customers_pdf["tier"])) -tier_weights = customers_pdf["tier"].map({'Enterprise': 5.0, 'Pro': 2.0, 'Free': 1.0}) -customer_weights = (tier_weights / tier_weights.sum()).tolist() - -print(f" Created {len(customers_pdf):,} customers") - -# ============================================================================= -# 2. 
ORDERS (References Customers) -# ============================================================================= -print("Generating orders...") - -orders_data = [] -for i in range(N_ORDERS): - cid = np.random.choice(customer_ids, p=customer_weights) - tier = customer_tier_map[cid] - amount = np.random.lognormal(7 if tier == 'Enterprise' else 5 if tier == 'Pro' else 3.5, 0.7) - - orders_data.append({ - "order_id": f"ORD-{i:06d}", - "customer_id": cid, - "amount": round(amount, 2), - "status": np.random.choice(['completed', 'pending', 'cancelled'], p=[0.85, 0.10, 0.05]), - "order_date": fake.date_between(start_date=START_DATE, end_date=END_DATE), - }) - -orders_pdf = pd.DataFrame(orders_data) -print(f" Created {len(orders_pdf):,} orders") - -# ============================================================================= -# 3. TICKETS (References Customers, with incident spike) -# ============================================================================= -print("Generating tickets...") - -def get_daily_volume(date, base=25): - vol = base * (0.6 if date.weekday() >= 5 else 1.0) - if date in US_HOLIDAYS: - vol *= 0.3 # Even lower on holidays - if INCIDENT_START <= date <= INCIDENT_END: - vol *= 3.0 - return int(vol * np.random.normal(1, 0.15)) - -# Distribute tickets across dates -tickets_data = [] -ticket_idx = 0 -for day in pd.date_range(START_DATE, END_DATE): - daily_count = get_daily_volume(day.to_pydatetime()) - is_incident = INCIDENT_START <= day.to_pydatetime() <= INCIDENT_END - - for _ in range(daily_count): - if ticket_idx >= N_TICKETS: - break - - cid = np.random.choice(customer_ids, p=customer_weights) - tier = customer_tier_map[cid] - - # Category - Auth dominates during incident - if is_incident: - category = np.random.choice(['Auth', 'Network', 'Billing', 'Account'], p=[0.65, 0.15, 0.1, 0.1]) - else: - category = np.random.choice(['Auth', 'Network', 'Billing', 'Account'], p=[0.25, 0.30, 0.25, 0.20]) - - # Priority correlates with tier - priority = 
np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) if tier == 'Enterprise' \ - else np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3]) - - # Resolution time correlates with priority - res_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} - resolution = np.random.exponential(scale=res_scale[priority]) - - # CSAT degrades during incident for Auth - if is_incident and category == 'Auth': - csat = np.random.choice([1, 2, 3, 4, 5], p=[0.15, 0.25, 0.35, 0.2, 0.05]) - else: - csat = 5 if resolution < 4 else (4 if resolution < 12 else np.random.choice([2, 3, 4], p=[0.2, 0.5, 0.3])) - - tickets_data.append({ - "ticket_id": f"TKT-{ticket_idx:06d}", - "customer_id": cid, - "category": category, - "priority": priority, - "resolution_hours": round(resolution, 1), - "csat_score": csat, - "created_at": day.strftime("%Y-%m-%d"), - }) - ticket_idx += 1 - - if ticket_idx >= N_TICKETS: - break - -tickets_pdf = pd.DataFrame(tickets_data) -print(f" Created {len(tickets_pdf):,} tickets") - -# ============================================================================= -# 4. SAVE TO VOLUME -# ============================================================================= -print(f"\nSaving to {VOLUME_PATH}...") - -spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") -spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") -spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") - -print("Done!") - -# ============================================================================= -# 5. 
VALIDATION -# ============================================================================= -print("\n=== VALIDATION ===") -print(f"Tier distribution: {customers_pdf['tier'].value_counts(normalize=True).to_dict()}") -print(f"Avg order by tier: {orders_pdf.merge(customers_pdf[['customer_id', 'tier']]).groupby('tier')['amount'].mean().to_dict()}") - -incident_tickets = tickets_pdf[tickets_pdf['created_at'].between( - INCIDENT_START.strftime("%Y-%m-%d"), INCIDENT_END.strftime("%Y-%m-%d") -)] -print(f"Incident period tickets: {len(incident_tickets):,} ({len(incident_tickets)/len(tickets_pdf)*100:.1f}%)") -print(f"Incident Auth %: {(incident_tickets['category'] == 'Auth').mean()*100:.1f}%") -``` - -Execute using `run_python_file_on_databricks` tool: -- `file_path`: "scripts/generate_data.py" - -If it fails, edit the file and re-run with the same `cluster_id` and `context_id`. - -### Validate Generated Data - -After successful execution, use `get_volume_folder_details` tool to verify the generated data: -- `volume_path`: "my_catalog/my_schema/raw_data/customers" -- `format`: "parquet" -- `table_stat_level`: "SIMPLE" - -This returns schema, row counts, and column statistics to confirm the data was written correctly. - -## Best Practices - -1. **Ask for schema**: Default to `ai_dev_kit` catalog, ask user for schema name -2. **Create infrastructure**: Use `CREATE CATALOG/SCHEMA/VOLUME IF NOT EXISTS` -3. **Raw data only**: No `total_x`, `sum_x`, `avg_x` fields - SDP pipeline computes those -4. **Save to Volume, not tables**: Write parquet to `/Volumes/{catalog}/{schema}/raw_data/` -5. **Configuration at top**: All sizes, dates, and paths as variables -6. **Dynamic dates**: Use `datetime.now() - timedelta(days=180)` for last 6 months -7. **Pandas for generation**: Faster and easier than Spark for row-by-row logic -8. **Master tables first**: Generate customers, then orders reference customer_ids -9. **Weighted sampling**: Enterprise customers generate more activity -10. 
**Distributions**: Log-normal for values, exponential for times, weighted categorical -11. **Time patterns**: Weekday/weekend, holidays, seasonality, event spikes -12. **Row coherence**: Priority affects resolution time affects CSAT -13. **Volume for aggregation**: 10K-50K rows minimum so patterns survive GROUP BY -14. **Always use files**: Write to local file, execute, edit if error, re-execute -15. **Context reuse**: Pass `cluster_id` and `context_id` for faster iterations -16. **Libraries**: Install `faker` and `holidays` first; most others are pre-installed - -## Related Skills - -- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - for building bronze/silver/gold pipelines on top of generated data -- **[databricks-aibi-dashboards](../databricks-aibi-dashboards/SKILL.md)** - for visualizing the generated data in dashboards -- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - for managing catalogs, schemas, and volumes where data is stored diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 553eba97..30f34e3d 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -110,7 +110,7 @@ mcp__databricks__execute_sql( - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - for pipelines that write to Unity Catalog tables - **[databricks-jobs](../databricks-jobs/SKILL.md)** - for job execution data visible in system tables -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - for generating data stored in Unity Catalog Volumes +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - for generating data stored in Unity Catalog Volumes - **[databricks-aibi-dashboards](../databricks-aibi-dashboards/SKILL.md)** - for building dashboards on top of Unity 
Catalog data ## Resources diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 7666f21b..ee9abf05 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -190,5 +190,5 @@ AZURE_OPENAI_DEPLOYMENT=gpt-4o - **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Create Knowledge Assistants that ingest the generated PDFs - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Index generated documents for semantic search and RAG -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate structured tabular data (complement to unstructured PDFs) +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate structured tabular data (complement to unstructured PDFs) - **[databricks-mlflow-evaluation](../databricks-mlflow-evaluation/SKILL.md)** - Evaluate RAG systems using the generated question/guideline pairs diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md index efd52b0d..e3d3f48a 100644 --- a/databricks-skills/databricks-zerobus-ingest/SKILL.md +++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md @@ -218,7 +218,7 @@ The timestamp generation must use microseconds for Databricks. 
- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - General SDK patterns and WorkspaceClient for table/schema management - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Downstream pipeline processing of ingested data - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Managing catalogs, schemas, and tables that Zerobus writes to -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate test data to feed into Zerobus producers +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate test data to feed into Zerobus producers - **[databricks-config](../databricks-config/SKILL.md)** - Profile and authentication setup ## Resources diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index ff8d9b86..763489c8 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -42,7 +42,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills" MLFLOW_REPO_REF="main" # Databricks skills (hosted in this repo) -DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config 
databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -76,7 +76,7 @@ get_skill_description() { "databricks-spark-declarative-pipelines") echo "Spark Declarative Pipelines (SDP/LDP/DLT)" ;; "spark-python-data-source") echo "Spark custom Python data sources" ;; "databricks-spark-structured-streaming") echo "Spark Structured Streaming patterns and best practices" ;; - "databricks-synthetic-data-generation") echo "Synthetic test data generation" ;; + "databricks-synthetic-data-gen") echo "Synthetic test data generation" ;; "databricks-unstructured-pdf-generation") echo "Generate synthetic PDFs for RAG" ;; "databricks-vector-search") echo "Vector Search - endpoints, indexes, and queries for RAG" ;; "databricks-zerobus-ingest") echo "Zerobus Ingest - gRPC data ingestion into Delta tables" ;; diff --git a/install.ps1 b/install.ps1 index f144b5ac..38e4a2a0 100644 --- a/install.ps1 +++ b/install.ps1 @@ -78,11 +78,11 @@ $script:ProfileProvided = $false $script:Skills = @( "databricks-agent-bricks", "databricks-aibi-dashboards", "databricks-app-apx", "databricks-app-python", "databricks-asset-bundles", "databricks-config", "databricks-dbsql", "databricks-docs", "databricks-genie", - "databricks-jobs", "databricks-metric-views", "databricks-model-serving", 
"databricks-python-sdk", - "databricks-unity-catalog", "databricks-vector-search", "databricks-zerobus-ingest", - "databricks-lakebase-autoscale", "databricks-lakebase-provisioned", "databricks-mlflow-evaluation", - "databricks-spark-declarative-pipelines", "spark-python-data-source", "databricks-spark-structured-streaming", - "databricks-synthetic-data-generation", "databricks-unstructured-pdf-generation" + "databricks-iceberg", "databricks-jobs", "databricks-lakebase-autoscale", "databricks-lakebase-provisioned", + "databricks-metric-views", "databricks-mlflow-evaluation", "databricks-model-serving", "databricks-parsing", + "databricks-python-sdk", "databricks-spark-declarative-pipelines", "databricks-spark-structured-streaming", + "databricks-synthetic-data-gen", "databricks-unity-catalog", "databricks-unstructured-pdf-generation", + "databricks-vector-search", "databricks-zerobus-ingest", "spark-python-data-source" ) # MLflow skills (fetched from mlflow/skills repo) diff --git a/install.sh b/install.sh index c347b13e..61b98d42 100755 --- a/install.sh +++ b/install.sh @@ -74,7 +74,7 @@ MIN_SDK_VERSION="0.85.0" G='\033[0;32m' Y='\033[1;33m' R='\033[0;31m' BL='\033[0;34m' B='\033[1m' D='\033[2m' N='\033[0m' # Databricks skills (bundled in repo) -SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-app-apx databricks-app-python databricks-asset-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-app-apx databricks-app-python 
databricks-asset-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs"