diff --git a/.gitignore b/.gitignore
index 385994fa..a170605d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
# Databricks AI Dev Kit
.ai-dev-kit/
.claude/
-
+.local
# Python
__pycache__/
diff --git a/.test/README.md b/.test/README.md
index d5c8fe46..d2bbb2db 100644
--- a/.test/README.md
+++ b/.test/README.md
@@ -233,3 +233,17 @@ uv pip install -e ".test/"
uv run pytest .test/tests/
uv run python .test/scripts/regression.py
```
+
+---
+
+## Troubleshooting
+
+### MLflow evaluation not returning results
+
+If `/skill-test mlflow` hangs or doesn't return results, run manually with debug logging:
+
+```bash
+MLFLOW_LOG_LEVEL=DEBUG uv run python .test/scripts/mlflow_eval.py
+```
+
+This will show detailed MLflow API calls and help identify connection or authentication issues.
diff --git a/.test/baselines/databricks-synthetic-data-gen/baseline.yaml b/.test/baselines/databricks-synthetic-data-gen/baseline.yaml
new file mode 100644
index 00000000..b43273c8
--- /dev/null
+++ b/.test/baselines/databricks-synthetic-data-gen/baseline.yaml
@@ -0,0 +1,21 @@
+run_id: '20260303_071721'
+created_at: '2026-03-03T07:17:21.838623'
+skill_name: databricks-synthetic-data-gen
+metrics:
+ pass_rate: 1.0
+ total_tests: 4
+ passed_tests: 4
+ failed_tests: 0
+test_results:
+- id: grp_20260302_113344
+ passed: true
+ execution_mode: local
+- id: gen_serverless_job_catalog_json_002
+ passed: true
+ execution_mode: local
+- id: grp_20260302_retail_csv_3tables_003
+ passed: true
+ execution_mode: local
+- id: grp_20260303_manufacturing_delta_streaming_004
+ passed: true
+ execution_mode: local
diff --git a/.test/scripts/mlflow_eval.py b/.test/scripts/mlflow_eval.py
index caa2e45c..93278e4d 100755
--- a/.test/scripts/mlflow_eval.py
+++ b/.test/scripts/mlflow_eval.py
@@ -2,29 +2,65 @@
"""Run MLflow evaluation for a skill.
Usage:
- python mlflow_eval.py [--filter-category ] [--run-name ]
+    python mlflow_eval.py <skill_name> [--filter-category <category>] [--run-name <name>] [--timeout <seconds>]
Environment Variables:
DATABRICKS_CONFIG_PROFILE - Databricks CLI profile (default: "DEFAULT")
MLFLOW_TRACKING_URI - Set to "databricks" for Databricks MLflow
MLFLOW_EXPERIMENT_NAME - Experiment path (e.g., "/Users/{user}/skill-test")
+    MLFLOW_LLM_JUDGE_TIMEOUT - Timeout in seconds for LLM judge evaluation (default: 120; note: this script currently honors only the --timeout flag)
"""
+import os
import sys
+import signal
import argparse
+# Close stdin and disable tqdm progress bars when run non-interactively
+# This fixes hanging issues with tqdm/MLflow progress bars in background tasks
+if not sys.stdin.isatty():
+ try:
+ sys.stdin.close()
+ sys.stdin = open(os.devnull, 'r')
+ except Exception:
+ pass
+ # Disable tqdm progress bars
+ os.environ.setdefault("TQDM_DISABLE", "1")
+
# Import common utilities
from _common import setup_path, print_result, handle_error
+class TimeoutException(Exception):
+ pass
+
+
+def timeout_handler(signum, frame):
+ raise TimeoutException("MLflow evaluation timed out")
+
+
def main():
parser = argparse.ArgumentParser(description="Run MLflow evaluation for a skill")
parser.add_argument("skill_name", help="Name of skill to evaluate")
parser.add_argument("--filter-category", help="Filter by test category")
parser.add_argument("--run-name", help="Custom MLflow run name")
+ parser.add_argument(
+ "--timeout",
+ type=int,
+ default=120,
+ help="Timeout in seconds for evaluation (default: 120)",
+ )
args = parser.parse_args()
setup_path()
+ # Set up signal-based timeout (Unix only)
+ if hasattr(signal, 'SIGALRM'):
+ signal.signal(signal.SIGALRM, timeout_handler)
+ signal.alarm(args.timeout)
+ else:
+ # Windows: SIGALRM not available - no timeout enforcement
+ print("WARNING: Timeout not supported on Windows - test may run indefinitely", file=sys.stderr)
+
try:
from skill_test.runners import evaluate_skill
@@ -34,6 +70,10 @@ def main():
run_name=args.run_name,
)
+ # Cancel the alarm if we succeeded
+ if hasattr(signal, 'SIGALRM'):
+ signal.alarm(0)
+
# Convert to standard result format
if result.get("run_id"):
result["success"] = True
@@ -42,7 +82,19 @@ def main():
sys.exit(print_result(result))
+    except TimeoutException:
+ result = {
+ "success": False,
+ "skill_name": args.skill_name,
+ "error": f"Evaluation timed out after {args.timeout} seconds. This may indicate LLM judge endpoint issues.",
+ "error_type": "timeout",
+ }
+ sys.exit(print_result(result))
+
except Exception as e:
+ # Cancel alarm on any exception
+ if hasattr(signal, 'SIGALRM'):
+ signal.alarm(0)
sys.exit(handle_error(e, args.skill_name))
diff --git a/.test/skills/_routing/ground_truth.yaml b/.test/skills/_routing/ground_truth.yaml
index a428d5b7..f9948433 100644
--- a/.test/skills/_routing/ground_truth.yaml
+++ b/.test/skills/_routing/ground_truth.yaml
@@ -99,7 +99,7 @@ test_cases:
prompt: "Generate synthetic customer data and evaluate the agent quality with MLflow scorers"
expectations:
expected_skills:
- - "databricks-synthetic-data-generation"
+ - "databricks-synthetic-data-gen"
- "databricks-mlflow-evaluation"
is_multi_skill: true
metadata:
diff --git a/.test/skills/databricks-synthetic-data-gen/candidates.yaml b/.test/skills/databricks-synthetic-data-gen/candidates.yaml
new file mode 100644
index 00000000..c6c2a270
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/candidates.yaml
@@ -0,0 +1,7 @@
+# Candidates for databricks-synthetic-data-gen skill
+# Test cases pending review before promotion to ground_truth.yaml
+#
+# Use `/skill-test databricks-synthetic-data-gen add` to create new candidates
+# Use `/skill-test databricks-synthetic-data-gen review` to promote candidates to ground truth
+
+candidates: []
diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt
new file mode 100644
index 00000000..16065794
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt
@@ -0,0 +1,331 @@
+Here's the Python script to generate product catalog data, followed by the serverless job definition:
+
+```python
+"""
+Synthetic Product Catalog Data Generation
+Serverless Databricks Job with JSON Output
+
+Generates:
+- products (3,000 rows): product_id, name, category (weighted), price (log-normal), inventory_count
+- sales (10,000 rows): sale_id, product_id (FK), quantity, sale_date, discount_pct
+
+Popular product categories have more sales (weighted sampling).
+"""
+
+import os
+from pyspark.sql import SparkSession, functions as F
+from pyspark.sql.window import Window
+from pyspark.sql.types import StringType, DoubleType, IntegerType
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+# =============================================================================
+# CONFIGURATION
+# =============================================================================
+CATALOG = "my_catalog"
+SCHEMA = "devkit_gen2_test_small_json"
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+NUM_PRODUCTS = 3_000
+NUM_SALES = 10_000
+NUM_PARTITIONS = 16
+
+END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+START_DATE = END_DATE - timedelta(days=180)
+SEED = 42
+
+# =============================================================================
+# SPARK SESSION
+# =============================================================================
+
+def is_databricks_runtime():
+ return "DATABRICKS_RUNTIME_VERSION" in os.environ
+
+if is_databricks_runtime():
+ spark = SparkSession.builder.getOrCreate()
+ print("Running on Databricks Runtime")
+else:
+ from databricks.connect import DatabricksSession
+ spark = DatabricksSession.builder.serverless(True).getOrCreate()
+ print("Running with Databricks Connect (serverless)")
+
+print("=" * 60)
+print("PRODUCT CATALOG DATA GENERATION")
+print("=" * 60)
+print(f"Catalog: {CATALOG}")
+print(f"Schema: {SCHEMA}")
+print(f"Products: {NUM_PRODUCTS:,}")
+print(f"Sales: {NUM_SALES:,}")
+print("=" * 60)
+
+# =============================================================================
+# CREATE INFRASTRUCTURE
+# =============================================================================
+print("\nCreating infrastructure...")
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+
+# =============================================================================
+# PANDAS UDFs
+# =============================================================================
+
+@F.pandas_udf(StringType())
+def fake_product_name(categories: pd.Series) -> pd.Series:
+ from faker import Faker
+ fake = Faker()
+
+ templates = {
+ "Electronics": ["Smart", "Wireless", "Digital", "Pro", "Ultra"],
+ "Home & Garden": ["Premium", "Deluxe", "Classic", "Modern", "Natural"],
+ "Clothing": ["Designer", "Casual", "Comfort", "Luxury", "Sport"],
+ "Sports": ["Pro", "Elite", "Performance", "Outdoor", "Active"],
+ "Books": ["Complete", "Essential", "Ultimate", "Practical", "Advanced"],
+ "Toys": ["Fun", "Creative", "Educational", "Super", "Magic"],
+ "Beauty": ["Natural", "Premium", "Radiance", "Pure", "Glow"],
+ "Automotive": ["Pro", "Heavy-Duty", "Premium", "Performance", "Ultra"],
+ }
+
+ products = {
+ "Electronics": ["Headphones", "Speaker", "Charger", "Watch", "Camera"],
+ "Home & Garden": ["Lamp", "Planter", "Organizer", "Rug", "Vase"],
+ "Clothing": ["T-Shirt", "Jacket", "Pants", "Sweater", "Dress"],
+ "Sports": ["Ball", "Racket", "Mat", "Gloves", "Bag"],
+ "Books": ["Guide", "Handbook", "Manual", "Edition", "Collection"],
+ "Toys": ["Game", "Puzzle", "Building Set", "Robot", "Craft Kit"],
+ "Beauty": ["Serum", "Cream", "Lotion", "Mask", "Oil"],
+ "Automotive": ["Tool Kit", "Cover", "Mat", "Cleaner", "Polish"],
+ }
+
+ names = []
+ for category in categories:
+ template_list = templates.get(category, ["Premium"])
+ product_list = products.get(category, ["Item"])
+ template = np.random.choice(template_list)
+ product = np.random.choice(product_list)
+ color = fake.color_name()
+ names.append(f"{template} {color} {product}")
+
+ return pd.Series(names)
+
+
+@F.pandas_udf(DoubleType())
+def generate_price(categories: pd.Series) -> pd.Series:
+ price_params = {
+ "Electronics": (4.5, 0.8),
+ "Home & Garden": (3.8, 0.7),
+ "Clothing": (3.5, 0.6),
+ "Sports": (4.0, 0.7),
+ "Books": (2.8, 0.4),
+ "Toys": (3.2, 0.6),
+ "Beauty": (3.3, 0.5),
+ "Automotive": (4.2, 0.8),
+ }
+
+ prices = []
+ for category in categories:
+ mu, sigma = price_params.get(category, (3.5, 0.6))
+ price = float(np.random.lognormal(mu, sigma))
+ price = round(price) - 0.01 if price > 1 else round(price, 2)
+ prices.append(max(0.99, price))
+
+ return pd.Series(prices)
+
+
+@F.pandas_udf(IntegerType())
+def generate_inventory(ids: pd.Series) -> pd.Series:
+ inventory = (np.random.pareto(a=2.0, size=len(ids)) + 1) * 20
+ return pd.Series(inventory.astype(int))
+
+
+# =============================================================================
+# GENERATE PRODUCTS TABLE (Master)
+# =============================================================================
+print(f"\nGenerating {NUM_PRODUCTS:,} products...")
+
+products_df = (
+ spark.range(0, NUM_PRODUCTS, numPartitions=NUM_PARTITIONS)
+ .select(
+ F.concat(F.lit("PROD-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("product_id"),
+ F.col("id").alias("_idx"),
+ )
+ .withColumn(
+ "category",
+ F.when(F.rand(SEED) < 0.25, "Electronics")
+ .when(F.rand(SEED + 1) < 0.45, "Home & Garden")
+ .when(F.rand(SEED + 2) < 0.63, "Clothing")
+ .when(F.rand(SEED + 3) < 0.75, "Sports")
+ .when(F.rand(SEED + 4) < 0.85, "Books")
+ .when(F.rand(SEED + 5) < 0.93, "Toys")
+ .when(F.rand(SEED + 6) < 0.98, "Beauty")
+ .otherwise("Automotive")
+ )
+ .withColumn("name", fake_product_name(F.col("category")))
+ .withColumn("price", generate_price(F.col("category")))
+ .withColumn("inventory_count", generate_inventory(F.col("_idx")))
+ .drop("_idx")
+)
+
+products_final = products_df.select(
+ "product_id", "name", "category", "price", "inventory_count"
+)
+
+products_json_path = f"{VOLUME_PATH}/products"
+print(f"Writing products to {products_json_path}...")
+products_final.write.mode("overwrite").json(products_json_path)
+
+products_for_fk = spark.read.json(products_json_path).select("product_id", "category")
+product_count = products_for_fk.count()
+print(f"Products written: {product_count:,}")
+
+# =============================================================================
+# GENERATE SALES TABLE (with Referential Integrity)
+# =============================================================================
+print(f"\nGenerating {NUM_SALES:,} sales with referential integrity...")
+
+product_weights = products_for_fk.select(
+ "product_id",
+ "category",
+ F.when(F.col("category") == "Electronics", 3.0)
+ .when(F.col("category") == "Home & Garden", 2.5)
+ .when(F.col("category") == "Clothing", 2.0)
+ .when(F.col("category") == "Sports", 1.5)
+ .when(F.col("category") == "Books", 1.2)
+ .when(F.col("category") == "Toys", 1.0)
+ .when(F.col("category") == "Beauty", 0.8)
+ .otherwise(0.5).alias("weight")
+)
+
+weighted_products = (
+ product_weights
+ .select(
+ F.col("product_id"),
+ F.col("category"),
+ F.explode(F.array_repeat(F.col("product_id"), F.col("weight").cast("int"))).alias("_dup")
+ )
+ .drop("_dup")
+)
+
+sampled_products = (
+ weighted_products
+ .orderBy(F.rand(SEED + 20))
+ .limit(NUM_SALES)
+ .withColumn("sale_rank", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
+ .select("product_id", "category", "sale_rank")
+)
+
+sales_base = (
+ spark.range(0, NUM_SALES, numPartitions=NUM_PARTITIONS)
+ .withColumn("sale_rank", F.row_number().over(Window.orderBy(F.col("id"))))
+ .withColumn(
+ "sale_id",
+ F.concat(F.lit("SALE-"), F.lpad(F.col("id").cast("string"), 6, "0"))
+ )
+)
+
+sales_with_products = sales_base.join(
+ sampled_products,
+ on="sale_rank",
+ how="inner"
+)
+
+sales_df = (
+ sales_with_products
+ .withColumn(
+ "quantity",
+ F.when(F.rand(SEED + 21) < 0.60, 1)
+ .when(F.rand(SEED + 22) < 0.85, 2)
+ .when(F.rand(SEED + 23) < 0.95, 3)
+ .otherwise(F.floor(F.rand(SEED + 24) * 5 + 4).cast("int"))
+ )
+ .withColumn(
+ "sale_date",
+ F.date_add(F.lit(START_DATE.date()), (F.rand(SEED + 25) * 180).cast("int"))
+ )
+ .withColumn(
+ "discount_pct",
+ F.when(F.rand(SEED + 26) < 0.70, 0.0)
+ .when(F.rand(SEED + 27) < 0.85, 0.10)
+ .when(F.rand(SEED + 28) < 0.95, 0.20)
+ .otherwise(0.30)
+ )
+)
+
+sales_final = sales_df.select(
+ "sale_id", "product_id", "quantity", "sale_date", "discount_pct"
+)
+
+sales_json_path = f"{VOLUME_PATH}/sales"
+print(f"Writing sales to {sales_json_path}...")
+sales_final.write.mode("overwrite").json(sales_json_path)
+
+sales_count = spark.read.json(sales_json_path).count()
+print(f"Sales written: {sales_count:,}")
+
+# =============================================================================
+# VALIDATION
+# =============================================================================
+print("\n" + "=" * 60)
+print("VALIDATION")
+print("=" * 60)
+
+products_check = spark.read.json(products_json_path)
+sales_check = spark.read.json(sales_json_path)
+
+print(f"\nRow counts:")
+print(f" Products: {products_check.count():,}")
+print(f" Sales: {sales_check.count():,}")
+
+print(f"\nSales per category (Electronics should have most):")
+sales_with_category = sales_check.join(
+ products_check.select("product_id", "category"),
+ on="product_id"
+)
+sales_with_category.groupBy("category").agg(
+ F.count("*").alias("total_sales"),
+ F.sum("quantity").alias("total_quantity")
+).orderBy(F.desc("total_sales")).show()
+
+orphan_sales = sales_check.join(
+ products_check.select("product_id"),
+ on="product_id",
+ how="left_anti"
+).count()
+print(f"\nReferential integrity:")
+print(f" Orphan sales (no matching product): {orphan_sales}")
+
+print("\n" + "=" * 60)
+print("GENERATION COMPLETE")
+print("=" * 60)
+print(f"\nOutput location: {VOLUME_PATH}")
+```
+
+To run this as a serverless job, create the job with this configuration:
+
+```json
+{
+ "name": "generate_product_catalog_data",
+ "tasks": [
+ {
+ "task_key": "generate_data",
+ "spark_python_task": {
+ "python_file": "/Workspace/Users/your_email/generate_product_catalog_data.py"
+ },
+ "environment_key": "default"
+ }
+ ],
+ "environments": [
+ {
+ "environment_key": "default",
+ "spec": {
+ "client": "4",
+ "dependencies": [
+ "faker",
+ "pandas",
+ "numpy"
+ ]
+ }
+ }
+ ]
+}
+```
diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt
new file mode 100644
index 00000000..39997ba5
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt
@@ -0,0 +1,233 @@
+I'll generate the e-commerce data locally using Polars and Faker, then upload to Unity Catalog.
+
+## Generation Plan
+
+**Output Location:** `devkit.devkit_gen1_test_small_parquet`
+**Volume:** `/Volumes/devkit/devkit_gen1_test_small_parquet/raw_data/`
+
+| Table | Columns | Rows |
+|-------|---------|------|
+| customers | customer_id, name, email, tier, region, created_at | 5,000 |
+| orders | order_id, customer_id (FK), amount, order_date, status | 15,000 |
+
+**Order Distribution by Tier:**
+- Enterprise (10%): ~8 orders per customer average
+- Pro (30%): ~3 orders per customer average
+- Free (60%): ~1.5 orders per customer average
+
+Here's the complete script:
+
+```python
+import polars as pl
+import numpy as np
+from faker import Faker
+from datetime import datetime, timedelta
+import random
+from databricks.sdk import WorkspaceClient
+
+# Configuration
+CATALOG = "devkit"
+SCHEMA = "devkit_gen1_test_small_parquet"
+NUM_CUSTOMERS = 5000
+NUM_ORDERS = 15000
+
+fake = Faker()
+Faker.seed(42)
+np.random.seed(42)
+random.seed(42)
+
+# Date ranges
+END_DATE = datetime.now()
+CUSTOMER_START_DATE = END_DATE - timedelta(days=730) # 2 years
+ORDER_START_DATE = END_DATE - timedelta(days=180) # 6 months
+
+# =============================================================================
+# Generate Customers
+# =============================================================================
+print("Generating customers...")
+
+# Tier distribution: Free 60%, Pro 30%, Enterprise 10%
+tier_weights = [0.6, 0.3, 0.1]
+tiers = random.choices(["Free", "Pro", "Enterprise"], weights=tier_weights, k=NUM_CUSTOMERS)
+
+# Region distribution
+region_weights = [0.4, 0.3, 0.2, 0.1]
+regions = random.choices(["US-East", "US-West", "EU", "APAC"], weights=region_weights, k=NUM_CUSTOMERS)
+
+customers = pl.DataFrame({
+ "customer_id": [f"CUST-{i:05d}" for i in range(NUM_CUSTOMERS)],
+ "name": [fake.name() for _ in range(NUM_CUSTOMERS)],
+ "email": [fake.email() for _ in range(NUM_CUSTOMERS)],
+ "tier": tiers,
+ "region": regions,
+ "created_at": [
+ fake.date_time_between(start_date=CUSTOMER_START_DATE, end_date=END_DATE)
+ for _ in range(NUM_CUSTOMERS)
+ ],
+})
+
+print(f"Generated {len(customers)} customers")
+print(customers.group_by("tier").len().sort("tier"))
+
+# =============================================================================
+# Generate Orders with Tier-Weighted Distribution
+# =============================================================================
+print("\nGenerating orders with tier-weighted distribution...")
+
+# Calculate orders per tier to achieve ~15,000 total with weighted distribution
+# Enterprise: ~8 orders/customer, Pro: ~3, Free: ~1.5
+tier_counts = customers.group_by("tier").len()
+enterprise_count = tier_counts.filter(pl.col("tier") == "Enterprise")["len"][0]
+pro_count = tier_counts.filter(pl.col("tier") == "Pro")["len"][0]
+free_count = tier_counts.filter(pl.col("tier") == "Free")["len"][0]
+
+# Target distribution
+enterprise_orders_per = 8
+pro_orders_per = 3
+free_orders_per = 1.5
+
+total_target = enterprise_count * enterprise_orders_per + pro_count * pro_orders_per + free_count * free_orders_per
+scale = NUM_ORDERS / total_target
+
+enterprise_orders = int(enterprise_count * enterprise_orders_per * scale)
+pro_orders = int(pro_count * pro_orders_per * scale)
+free_orders = NUM_ORDERS - enterprise_orders - pro_orders
+
+# Get customer IDs by tier
+enterprise_customers = customers.filter(pl.col("tier") == "Enterprise")["customer_id"].to_list()
+pro_customers = customers.filter(pl.col("tier") == "Pro")["customer_id"].to_list()
+free_customers = customers.filter(pl.col("tier") == "Free")["customer_id"].to_list()
+
+# Generate customer_ids for orders
+order_customer_ids = (
+ random.choices(enterprise_customers, k=enterprise_orders) +
+ random.choices(pro_customers, k=pro_orders) +
+ random.choices(free_customers, k=free_orders)
+)
+random.shuffle(order_customer_ids)
+
+# Create customer tier lookup
+customer_tier_map = dict(zip(customers["customer_id"].to_list(), customers["tier"].to_list()))
+
+# Generate amounts based on tier (log-normal distribution)
+def generate_amount(tier: str) -> float:
+ if tier == "Enterprise":
+ return round(float(np.random.lognormal(6.2, 0.8)), 2) # mean ~$500
+ elif tier == "Pro":
+ return round(float(np.random.lognormal(5.0, 0.7)), 2) # mean ~$150
+ else:
+ return round(float(np.random.lognormal(3.9, 0.6)), 2) # mean ~$50
+
+amounts = [generate_amount(customer_tier_map[cid]) for cid in order_customer_ids]
+
+# Status distribution: delivered 65%, shipped 15%, processing 10%, pending 5%, cancelled 5%
+statuses = random.choices(
+ ["delivered", "shipped", "processing", "pending", "cancelled"],
+ weights=[0.65, 0.15, 0.10, 0.05, 0.05],
+ k=NUM_ORDERS
+)
+
+orders = pl.DataFrame({
+ "order_id": [f"ORD-{i:06d}" for i in range(NUM_ORDERS)],
+ "customer_id": order_customer_ids,
+ "amount": amounts,
+ "order_date": [
+ fake.date_between(start_date=ORDER_START_DATE.date(), end_date=END_DATE.date())
+ for _ in range(NUM_ORDERS)
+ ],
+ "status": statuses,
+})
+
+print(f"Generated {len(orders)} orders")
+
+# Verify tier-weighted distribution
+orders_with_tier = orders.join(
+ customers.select(["customer_id", "tier"]),
+ on="customer_id"
+)
+print("\nOrders per tier:")
+print(orders_with_tier.group_by("tier").len().sort("tier"))
+print("\nAverage orders per customer by tier:")
+print(
+ orders_with_tier.group_by("tier")
+ .agg(pl.len().alias("total_orders"))
+ .join(tier_counts, on="tier")
+ .with_columns((pl.col("total_orders") / pl.col("len")).alias("avg_orders_per_customer"))
+ .select(["tier", "avg_orders_per_customer"])
+ .sort("tier")
+)
+
+# =============================================================================
+# Save Locally as Parquet
+# =============================================================================
+print("\nSaving to local Parquet files...")
+local_path = "synthetic_data_output"
+import os
+os.makedirs(local_path, exist_ok=True)
+
+customers.write_parquet(f"{local_path}/customers.parquet")
+orders.write_parquet(f"{local_path}/orders.parquet")
+print(f"Saved to {local_path}/")
+
+# =============================================================================
+# Upload to Unity Catalog Volume
+# =============================================================================
+print("\nUploading to Unity Catalog...")
+
+w = WorkspaceClient()
+
+# Create schema and volume if needed
+from databricks.sdk.service.catalog import VolumeType
+
+try:
+ w.schemas.create(name=SCHEMA, catalog_name=CATALOG)
+ print(f"Created schema {CATALOG}.{SCHEMA}")
+except Exception as e:
+ if "SCHEMA_ALREADY_EXISTS" in str(e):
+ print(f"Schema {CATALOG}.{SCHEMA} already exists")
+ else:
+ raise
+
+try:
+ w.volumes.create(
+ catalog_name=CATALOG,
+ schema_name=SCHEMA,
+ name="raw_data",
+ volume_type=VolumeType.MANAGED
+ )
+ print(f"Created volume {CATALOG}.{SCHEMA}.raw_data")
+except Exception as e:
+ if "already exists" in str(e).lower():
+ print(f"Volume {CATALOG}.{SCHEMA}.raw_data already exists")
+ else:
+ raise
+
+# Upload files
+volume_path = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+with open(f"{local_path}/customers.parquet", "rb") as f:
+ w.files.upload(f"{volume_path}/customers.parquet", f, overwrite=True)
+print(f"Uploaded customers.parquet to {volume_path}/")
+
+with open(f"{local_path}/orders.parquet", "rb") as f:
+ w.files.upload(f"{volume_path}/orders.parquet", f, overwrite=True)
+print(f"Uploaded orders.parquet to {volume_path}/")
+
+print(f"\n✅ Complete! Data available at:")
+print(f" {volume_path}/customers.parquet")
+print(f" {volume_path}/orders.parquet")
+```
+
+This script:
+1. Generates 5,000 customers with weighted tier distribution (Free 60%, Pro 30%, Enterprise 10%)
+2. Generates 15,000 orders with tier-weighted customer assignment (Enterprise ~8x, Pro ~3x, Free ~1.5x orders per customer)
+3. Uses log-normal distributions for realistic order amounts by tier
+4. Saves locally as Parquet files
+5. Creates the schema and volume in Unity Catalog if needed
+6. Uploads the Parquet files to the volume
+
+Run with:
+```bash
+uv pip install polars faker numpy databricks-sdk
+uv run python generate_ecommerce_data.py
+```
diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt
new file mode 100644
index 00000000..55be3712
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt
@@ -0,0 +1,297 @@
+Here's the complete script to generate synthetic retail order data with 3 tables and full referential integrity:
+
+```python
+"""
+Synthetic Retail Order Data Generation
+======================================
+3 related tables with full referential integrity:
+- customers (10,000 rows)
+- orders (50,000 rows)
+- line_items (150,000 rows)
+
+Output: CSV files with headers to Unity Catalog volume
+Execution: Databricks Connect with serverless compute
+"""
+
+import os
+from pyspark.sql import functions as F
+from pyspark.sql.types import StringType, DoubleType, IntegerType
+from pyspark.sql.window import Window
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+# =============================================================================
+# CONFIGURATION
+# =============================================================================
+CATALOG = "ai_dev_kit"
+SCHEMA = "devkit_gen3_test_medium_csv"
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+NUM_CUSTOMERS = 10_000
+NUM_ORDERS = 50_000
+NUM_LINE_ITEMS = 150_000
+NUM_PARTITIONS = 32
+SEED = 42
+
+END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+START_DATE = END_DATE - timedelta(days=180)
+
+# =============================================================================
+# SPARK SESSION
+# =============================================================================
+def is_databricks_runtime():
+ return "DATABRICKS_RUNTIME_VERSION" in os.environ
+
+if is_databricks_runtime():
+ from pyspark.sql import SparkSession
+ spark = SparkSession.builder.getOrCreate()
+ print("Running on Databricks Runtime")
+else:
+ from databricks.connect import DatabricksSession, DatabricksEnv
+
+ # Install dependencies on serverless cluster
+ env = DatabricksEnv().withDependencies("faker", "pandas", "numpy")
+ spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate()
+ print("Running with Databricks Connect (serverless with managed dependencies)")
+
+# =============================================================================
+# CREATE INFRASTRUCTURE
+# =============================================================================
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+
+# =============================================================================
+# PANDAS UDFs
+# =============================================================================
+
+@F.pandas_udf(StringType())
+def fake_name(ids: pd.Series) -> pd.Series:
+ from faker import Faker
+ fake = Faker()
+ Faker.seed(SEED)
+ return pd.Series([fake.name() for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_email(names: pd.Series) -> pd.Series:
+ from faker import Faker
+ fake = Faker()
+ emails = []
+ for name in names:
+ parts = name.lower().split()
+ if len(parts) >= 2:
+ email = f"{parts[0]}.{parts[-1]}@{fake.free_email_domain()}"
+ else:
+ email = f"{parts[0]}{np.random.randint(100, 999)}@{fake.free_email_domain()}"
+ emails.append(email)
+ return pd.Series(emails)
+
+@F.pandas_udf(StringType())
+def fake_product_name(ids: pd.Series) -> pd.Series:
+ from faker import Faker
+ fake = Faker()
+ product_types = ["Chair", "Table", "Lamp", "Desk", "Shelf", "Cabinet", "Sofa", "Rug",
+ "Mirror", "Clock", "Vase", "Frame", "Pillow", "Blanket", "Candle",
+ "Mug", "Bowl", "Plate", "Glass", "Bottle", "Box", "Bag", "Hat",
+ "Watch", "Headphones", "Speaker", "Charger", "Cable", "Case"]
+ products = []
+ for _ in range(len(ids)):
+ color = fake.color_name()
+ adj = fake.word().capitalize()
+ product = np.random.choice(product_types)
+ products.append(f"{color} {adj} {product}")
+ return pd.Series(products)
+
+@F.pandas_udf(DoubleType())
+def generate_unit_price(ids: pd.Series) -> pd.Series:
+ """Log-normal unit prices (median ~$35, range $5-$500)"""
+ prices = np.random.lognormal(mean=3.5, sigma=0.7, size=len(ids))
+ prices = np.clip(prices, 5.0, 500.0)
+ return pd.Series(np.round(prices, 2))
+
+# =============================================================================
+# GENERATE CUSTOMERS TABLE
+# =============================================================================
+customers_df = (
+ spark.range(0, NUM_CUSTOMERS, numPartitions=NUM_PARTITIONS)
+ .select(
+ F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"),
+ F.col("id").alias("_idx")
+ )
+ .withColumn("name", fake_name(F.col("_idx")))
+ .withColumn("email", fake_email(F.col("name")))
+ .withColumn("membership_level",
+ F.when(F.rand(SEED) < 0.50, "Bronze")
+ .when(F.rand(SEED + 1) < 0.80, "Silver")
+ .when(F.rand(SEED + 2) < 0.95, "Gold")
+ .otherwise("Platinum")
+ )
+ .withColumn("region",
+ F.when(F.rand(SEED + 3) < 0.30, "US-East")
+ .when(F.rand(SEED + 4) < 0.55, "US-West")
+ .when(F.rand(SEED + 5) < 0.80, "EU")
+ .when(F.rand(SEED + 6) < 0.95, "APAC")
+ .otherwise("Other")
+ )
+ .drop("_idx")
+)
+
+# Write to temp Delta table (no .cache() on serverless)
+customers_tmp = f"{CATALOG}.{SCHEMA}._tmp_customers"
+customers_df.write.mode("overwrite").saveAsTable(customers_tmp)
+customers_df = spark.table(customers_tmp)
+
+# =============================================================================
+# GENERATE ORDERS TABLE (weighted by membership level)
+# =============================================================================
+customer_weights = customers_df.select(
+ "customer_id",
+ "membership_level",
+ F.when(F.col("membership_level") == "Platinum", 10.0)
+ .when(F.col("membership_level") == "Gold", 7.0)
+ .when(F.col("membership_level") == "Silver", 5.0)
+ .otherwise(3.0).alias("weight")
+)
+
+weighted_customers = (
+ customer_weights
+ .withColumn("replicate_count", (F.col("weight") * 2).cast("int"))
+ .select(
+ F.col("customer_id"),
+ F.explode(F.array_repeat(F.col("customer_id"), F.col("replicate_count"))).alias("_dup")
+ )
+ .drop("_dup")
+)
+
+sampled_customers = (
+ weighted_customers
+ .orderBy(F.rand(SEED + 10))
+ .limit(NUM_ORDERS)
+ .withColumn("_row", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
+ .select("customer_id", "_row")
+)
+
+orders_base = (
+ spark.range(0, NUM_ORDERS, numPartitions=NUM_PARTITIONS)
+ .withColumn("order_id",
+ F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")))
+ .withColumn("_row", F.row_number().over(Window.orderBy(F.col("id"))))
+)
+
+orders_df = (
+ orders_base
+ .join(sampled_customers, on="_row", how="inner")
+ .drop("_row", "id")
+ .withColumn("order_date",
+ F.date_add(F.lit(START_DATE.date()), (F.rand(SEED + 11) * 180).cast("int")))
+ .withColumn("status",
+ F.when(F.rand(SEED + 12) < 0.60, "Delivered")
+ .when(F.rand(SEED + 13) < 0.78, "Shipped")
+ .when(F.rand(SEED + 14) < 0.90, "Processing")
+ .when(F.rand(SEED + 15) < 0.96, "Pending")
+ .otherwise("Cancelled")
+ )
+ .withColumn("total_amount", F.lit(0.0))
+)
+
+orders_tmp = f"{CATALOG}.{SCHEMA}._tmp_orders"
+orders_df.write.mode("overwrite").saveAsTable(orders_tmp)
+orders_df = spark.table(orders_tmp)
+
+# =============================================================================
+# GENERATE LINE_ITEMS TABLE
+# =============================================================================
+order_ids = orders_df.select("order_id")
+
+items_per_order = (
+ order_ids
+ .withColumn("num_items",
+ F.when(F.rand(SEED + 20) < 0.10, 1)
+ .when(F.rand(SEED + 21) < 0.25, 2)
+ .when(F.rand(SEED + 22) < 0.45, 3)
+ .when(F.rand(SEED + 23) < 0.65, 4)
+ .when(F.rand(SEED + 24) < 0.80, 5)
+ .when(F.rand(SEED + 25) < 0.90, 6)
+ .when(F.rand(SEED + 26) < 0.96, 7)
+ .otherwise(8)
+ )
+)
+
+line_items_base = (
+ items_per_order
+ .select(
+ F.col("order_id"),
+ F.explode(F.sequence(F.lit(1), F.col("num_items"))).alias("item_seq")
+ )
+ .withColumn("_idx", F.monotonically_increasing_id())
+)
+
+line_items_df = (
+ line_items_base
+ .limit(NUM_LINE_ITEMS)
+ .withColumn("line_item_id",
+ F.concat(F.lit("LI-"), F.lpad(F.col("_idx").cast("string"), 7, "0")))
+ .withColumn("product_name", fake_product_name(F.col("_idx")))
+ .withColumn("quantity",
+ F.when(F.rand(SEED + 30) < 0.50, 1)
+ .when(F.rand(SEED + 31) < 0.80, 2)
+ .when(F.rand(SEED + 32) < 0.92, 3)
+ .when(F.rand(SEED + 33) < 0.97, 4)
+ .otherwise(5)
+ )
+ .withColumn("unit_price", generate_unit_price(F.col("_idx")))
+ .drop("_idx", "item_seq")
+)
+
+line_items_tmp = f"{CATALOG}.{SCHEMA}._tmp_line_items"
+line_items_df.write.mode("overwrite").saveAsTable(line_items_tmp)
+line_items_df = spark.table(line_items_tmp)
+
+# =============================================================================
+# COMPUTE ORDER TOTALS FROM LINE ITEMS
+# =============================================================================
+order_totals = (
+ line_items_df
+ .withColumn("line_total", F.col("quantity") * F.col("unit_price"))
+ .groupBy("order_id")
+ .agg(F.round(F.sum("line_total"), 2).alias("computed_total"))
+)
+
+orders_final = (
+ spark.table(orders_tmp)
+ .drop("total_amount")
+ .join(order_totals, on="order_id", how="left")
+ .withColumn("total_amount", F.coalesce(F.col("computed_total"), F.lit(0.0)))
+ .drop("computed_total")
+)
+
+# =============================================================================
+# EXPORT TO CSV
+# =============================================================================
+customers_final = spark.table(customers_tmp).select(
+ "customer_id", "name", "email", "membership_level", "region"
+)
+
+orders_out = orders_final.select(
+ "order_id", "customer_id", "order_date", "total_amount", "status"
+)
+
+line_items_out = spark.table(line_items_tmp).select(
+ "line_item_id", "order_id", "product_name", "quantity", "unit_price"
+)
+
+customers_final.coalesce(1).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/customers")
+orders_out.coalesce(1).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/orders")
+line_items_out.coalesce(4).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/line_items")
+
+# Cleanup temp tables
+spark.sql(f"DROP TABLE IF EXISTS {customers_tmp}")
+spark.sql(f"DROP TABLE IF EXISTS {orders_tmp}")
+spark.sql(f"DROP TABLE IF EXISTS {line_items_tmp}")
+```
+
+Run with (requires Python 3.12+):
+```bash
+uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas
+uv run python generate_retail_data.py
+```
diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt
new file mode 100644
index 00000000..bd6f8ef4
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt
@@ -0,0 +1,235 @@
+Here's the complete script to generate manufacturing IoT sensor data with streaming-ready batch processing:
+
+```python
+"""
+Manufacturing IoT Sensor Data Generation
+========================================
+2 related tables with referential integrity:
+- sensor_devices (500 rows): Slowly changing dimension
+- sensor_readings (10,000 rows per batch): Streaming incremental
+
+Output: Delta tables registered in Unity Catalog
+Execution: Databricks Connect with serverless compute (Python 3.12+)
+"""
+
+from databricks.connect import DatabricksSession, DatabricksEnv
+from pyspark.sql import functions as F
+from pyspark.sql.types import StringType, DoubleType, TimestampType
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+# Configuration
+CATALOG = "ai_dev_kit"
+SCHEMA = "devkit_gen4_test_large_delta"
+
+# Row counts
+NUM_DEVICES = 500
+NUM_READINGS_PER_BATCH = 10000
+
+# Date range for readings (last 24 hours for streaming simulation)
+END_TIME = datetime.now()
+START_TIME = END_TIME - timedelta(hours=24)
+
+# Setup with managed dependencies (databricks-connect 16.4+)
+env = DatabricksEnv().withDependencies("faker", "pandas", "numpy")
+spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate()
+
+print(f"Connected to Databricks serverless compute")
+print(f"Output location: {CATALOG}.{SCHEMA}")
+
+# Create schema if not exists
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+print(f"Schema {CATALOG}.{SCHEMA} ready")
+
+
+# =============================================================================
+# Pandas UDFs for data generation
+# =============================================================================
+
+@F.pandas_udf(StringType())
+def fake_device_name(ids: pd.Series) -> pd.Series:
+ """Generate realistic device names like 'SENS-T-00001'."""
+ from faker import Faker
+ fake = Faker()
+ Faker.seed(42)
+
+ prefixes = {"temperature": "T", "pressure": "P", "vibration": "V", "humidity": "H"}
+ names = []
+ for i, _ in enumerate(ids):
+ type_key = list(prefixes.keys())[i % 4]
+ prefix = prefixes[type_key]
+ names.append(f"SENS-{prefix}-{str(i).zfill(5)}")
+ return pd.Series(names)
+
+
+@F.pandas_udf(DoubleType())
+def generate_sensor_value(device_types: pd.Series) -> pd.Series:
+ """Generate realistic sensor values based on device type."""
+ values = []
+ for dtype in device_types:
+ if dtype == "temperature":
+ values.append(float(np.random.normal(70, 15))) # Fahrenheit
+ elif dtype == "pressure":
+ values.append(float(np.random.normal(100, 10))) # PSI
+ elif dtype == "vibration":
+ values.append(float(np.random.lognormal(1.5, 0.8))) # mm/s with spikes
+ elif dtype == "humidity":
+ values.append(float(np.clip(np.random.normal(45, 10), 0, 100))) # Percentage
+ else:
+ values.append(float(np.random.normal(50, 10)))
+ return pd.Series(values)
+
+
+@F.pandas_udf(StringType())
+def generate_unit(device_types: pd.Series) -> pd.Series:
+ """Generate appropriate unit based on device type."""
+ unit_map = {
+ "temperature": "°F",
+ "pressure": "PSI",
+ "vibration": "mm/s",
+ "humidity": "%"
+ }
+ return pd.Series([unit_map.get(dt, "units") for dt in device_types])
+
+
+# =============================================================================
+# Generate sensor_devices (slowly changing dimension)
+# =============================================================================
+
+print("\nGenerating sensor_devices table (slowly changing dimension)")
+
+# Device type weights: temperature 30%, pressure 25%, vibration 25%, humidity 20%
+devices_df = (
+ spark.range(0, NUM_DEVICES, numPartitions=4)
+ .select(
+ F.concat(F.lit("DEV-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("device_id"),
+ F.col("id").alias("_id")
+ )
+ .withColumn("device_name", fake_device_name(F.col("_id")))
+ .withColumn(
+ "device_type",
+ F.when(F.rand() < 0.30, "temperature")
+ .when(F.rand() < 0.55, "pressure")
+ .when(F.rand() < 0.80, "vibration")
+ .otherwise("humidity")
+ )
+ .withColumn(
+ "location",
+ F.when(F.rand() < 0.40, "Plant_A")
+ .when(F.rand() < 0.70, "Plant_B")
+ .when(F.rand() < 0.90, "Plant_C")
+ .otherwise("Warehouse")
+ )
+ .withColumn(
+ "install_date",
+ F.date_add(F.lit("2020-01-01"), (F.rand() * 1460).cast("int"))
+ )
+ .withColumn(
+ "status",
+ F.when(F.rand() < 0.85, "active")
+ .when(F.rand() < 0.95, "maintenance")
+ .otherwise("inactive")
+ )
+ .drop("_id")
+)
+
+# Write devices table
+devices_table = f"{CATALOG}.{SCHEMA}.sensor_devices"
+devices_df.write.mode("overwrite").saveAsTable(devices_table)
+print(f"Created {devices_table}")
+
+
+# =============================================================================
+# Generate sensor_readings (streaming incremental batch)
+# =============================================================================
+
+print("\nGenerating sensor_readings table (streaming batch)")
+
+# Read devices back for FK join (no .cache() on serverless!)
+devices_for_join = spark.table(devices_table).select("device_id", "device_type")
+
+# Generate readings with valid device_id FK
+readings_df = (
+ spark.range(0, NUM_READINGS_PER_BATCH, numPartitions=16)
+ .select(
+ F.concat(
+ F.lit("RDG-"),
+ F.date_format(F.current_timestamp(), "yyyyMMddHHmmss"),
+ F.lit("-"),
+ F.lpad(F.col("id").cast("string"), 6, "0")
+ ).alias("reading_id"),
+ (F.abs(F.hash(F.col("id"))) % NUM_DEVICES).alias("device_index"),
+ F.from_unixtime(
+ F.unix_timestamp(F.lit(START_TIME)) +
+ (F.rand() * 86400).cast("long")
+ ).cast("timestamp").alias("timestamp")
+ )
+)
+
+# Get device IDs with row numbers for joining
+devices_indexed = (
+ devices_for_join
+ .withColumn("device_index", F.monotonically_increasing_id() % NUM_DEVICES)
+)
+
+# Join to get valid device_id and device_type
+readings_with_device = (
+ readings_df
+ .join(
+ devices_indexed.select("device_id", "device_type", "device_index"),
+ on="device_index",
+ how="inner"
+ )
+ .drop("device_index")
+)
+
+# Add sensor values based on device type
+readings_final = (
+ readings_with_device
+ .withColumn("value", F.round(generate_sensor_value(F.col("device_type")), 2))
+ .withColumn("unit", generate_unit(F.col("device_type")))
+ .withColumn(
+ "quality_flag",
+ F.when(F.rand() < 0.90, "good")
+ .when(F.rand() < 0.97, "warning")
+ .otherwise("error")
+ )
+ .select("reading_id", "device_id", "timestamp", "value", "unit", "quality_flag")
+)
+
+# Write readings table (overwrite for the initial batch; switch to "append" for streaming runs)
+readings_table = f"{CATALOG}.{SCHEMA}.sensor_readings"
+readings_final.write.mode("overwrite").saveAsTable(readings_table)
+print(f"Created {readings_table}")
+
+
+# =============================================================================
+# Validation
+# =============================================================================
+
+# Check referential integrity
+orphan_readings = spark.sql(f"""
+ SELECT COUNT(*) as orphan_count
+ FROM {readings_table} r
+ LEFT JOIN {devices_table} d ON r.device_id = d.device_id
+ WHERE d.device_id IS NULL
+""").collect()[0]["orphan_count"]
+
+print(f"Orphan readings (should be 0): {orphan_readings}")
+print(f"\nSUMMARY")
+print(f"Catalog/Schema: {CATALOG}.{SCHEMA}")
+print(f"Tables: {devices_table}, {readings_table}")
+print(f"Referential integrity: {'PASSED' if orphan_readings == 0 else 'FAILED'}")
+```
+
+Run with (requires Python 3.12+):
+```bash
+uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas
+uv run python generate_manufacturing_data.py
+```
+
+For streaming job deployment, this can be scheduled as an incremental job where each run:
+1. Generates a new batch of 10,000 readings with unique timestamps
+2. Appends to the sensor_readings table (change mode to "append")
+3. The sensor_devices table is the slowly-changing dimension that persists
diff --git a/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml b/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml
new file mode 100644
index 00000000..799c0c19
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml
@@ -0,0 +1,327 @@
+test_cases:
+
+- id: grp_20260302_113344
+ inputs:
+ prompt: 'Generate synthetic e-commerce data locally then save it to Unity Catalog.
+
+ Create 2 related tables with referential integrity:
+
+ - customers (5,000 rows): customer_id, name, email, tier (Free/Pro/Enterprise
+ weighted 60/30/10), region, created_at
+
+ - orders (15,000 rows): order_id, customer_id (FK to customers), amount, order_date,
+ status
+
+
+ Save as Parquet then upload to a Unity Catalog volume. Use schema name ''devkit_gen1_test_small_parquet''.
+
+ Enterprise customers should generate more orders than Free tier.'
+ outputs:
+ expected_response_file: expected_responses/grp_20260302_113344.txt
+ execution_success: true
+ expectations:
+ expected_facts:
+ - "parquet"
+ - "customer_id"
+ - "referential integrity"
+ - "weighted"
+ - "log-normal"
+ expected_patterns:
+ - pattern: "\\.write.*parquet"
+ min_count: 1
+ description: "Parquet output format"
+ - pattern: "customer_id"
+ min_count: 3
+ description: "Foreign key reference in multiple tables"
+ - pattern: "lognormal"
+ min_count: 1
+ description: "Log-normal distribution for amounts"
+ guidelines:
+ - "Orders table customer_id must only contain IDs from customers table"
+ - "Enterprise tier customers must have higher weight for order generation"
+ - "Amount distribution must use log-normal, not uniform"
+ metadata:
+ category: happy_path
+ source: interactive
+ created_at: '2026-03-02T11:33:44.621846'
+ execution_verified:
+ mode: local
+ verified_date: '2026-03-02'
+
+- id: gen_serverless_job_catalog_json_002
+ inputs:
+ prompt: 'Generate synthetic product catalog data that will run as a serverless
+ Databricks job.
+
+ Create 2 related tables with referential integrity:
+
+ - products (3,000 rows): product_id, name, category (weighted), price (log-normal),
+ inventory_count
+
+ - sales (10,000 rows): sale_id, product_id (FK to products), quantity, sale_date,
+ discount_pct
+
+
+ Save as JSON files to a Unity Catalog volume. Use schema name ''devkit_gen2_test_small_json''.
+
+ Create a job definition with environments for dependencies (faker).
+
+ Popular product categories should have more sales (weighted sampling).
+
+ '
+ outputs:
+ expected_response_file: expected_responses/gen_serverless_job_catalog_json_002.txt
+ expectations:
+ expected_facts:
+ - serverless
+ - environments
+ - dependencies
+ - client
+ - json
+ - product_id
+ - weighted
+ - lognormal
+ - pandas_udf
+ expected_patterns:
+ - pattern: environment_key.*default
+ min_count: 1
+ description: Serverless job environment configuration
+ - pattern: client.*4
+ min_count: 1
+ description: Correct client version for serverless
+ - pattern: \.write.*json
+ min_count: 1
+ description: JSON output format
+ - pattern: product_id
+ min_count: 3
+ description: Foreign key reference in multiple places
+ - pattern: '@F\.pandas_udf|pandas_udf'
+ min_count: 1
+ description: Pandas UDF for Faker parallelism
+ - pattern: lognormal|log-normal|log_normal
+ min_count: 1
+ description: Log-normal distribution for prices
+ - pattern: CREATE SCHEMA IF NOT EXISTS|CREATE VOLUME IF NOT EXISTS
+ min_count: 1
+ description: Infrastructure creation in script
+ guidelines:
+ - Must create serverless job with environments parameter for dependencies
+ - 'Job spec must include client: 4 (not 1)'
+ - Must NOT use .cache() or .persist() (serverless incompatible)
+ metadata:
+ category: happy_path
+ difficulty: medium
+ source: interactive_execution
+ execution_date: '2026-02-26'
+ execution_verified: true
+ job_run_id: '560746964795126'
+ tags:
+ - serverless-job
+ - small
+ - json
+ - referential-integrity
+ - weighted-sampling
+ - executed
+
+- id: grp_20260302_retail_csv_3tables_003
+ inputs:
+ prompt: |
+ Generate synthetic retail order data using Databricks Connect with serverless.
+ Create 3 related tables with full referential integrity:
+ - customers (10,000 rows): customer_id, name, email, membership_level (Bronze/Silver/Gold/Platinum weighted 50/30/15/5), region
+ - orders (50,000 rows): order_id, customer_id (FK to customers), order_date, total_amount, status
+ - line_items (150,000 rows): line_item_id, order_id (FK to orders), product_name, quantity, unit_price
+
+ Save as CSV files with headers to Unity Catalog volume. Use schema name 'devkit_gen3_test_medium_csv'.
+ Create realistic product names.
+ Higher membership levels should have more orders.
+ Order total_amount should equal sum of line_items.
+ outputs:
+ expected_response_file: expected_responses/grp_20260302_retail_csv_3tables_003.txt
+ execution_success: true
+ expectations:
+ expected_facts:
+ - "DatabricksSession"
+ - "serverless"
+ - "CSV"
+ - "header"
+ - "customer_id"
+ - "order_id"
+ - "line_item"
+ - "Faker"
+ - "pandas_udf"
+ - "membership_level"
+ - "weighted"
+ - "total_amount"
+ - "lognormal"
+ expected_patterns:
+ - pattern: "DatabricksSession.*serverless.*True"
+ min_count: 1
+ description: "Databricks Connect serverless configuration"
+ - pattern: "DatabricksEnv.*withDependencies"
+ min_count: 1
+ description: "Managed dependencies for serverless"
+ - pattern: "@F\\.pandas_udf|pandas_udf"
+ min_count: 1
+ description: "Pandas UDF for Faker parallelism"
+ - pattern: "customer_id"
+ min_count: 5
+ description: "FK in customers and orders (multiple references)"
+ - pattern: "order_id"
+ min_count: 5
+ description: "FK in orders and line_items (multiple references)"
+ - pattern: "\\.option.*header.*true.*\\.csv|\\.write.*csv"
+ min_count: 1
+ description: "CSV with headers"
+ - pattern: "Bronze|Silver|Gold|Platinum"
+ min_count: 4
+ description: "All membership levels present"
+ - pattern: "lognormal"
+ min_count: 1
+ description: "Log-normal distribution for pricing"
+ - pattern: "CREATE SCHEMA IF NOT EXISTS"
+ min_count: 1
+ description: "Infrastructure creation in script"
+ - pattern: "CREATE VOLUME IF NOT EXISTS"
+ min_count: 1
+ description: "Volume creation for CSV output"
+ - pattern: "total_amount.*sum|sum.*line_total|computed_total"
+ min_count: 1
+ description: "Order total computed from line items"
+ guidelines:
+ - "Must use DatabricksSession.builder.serverless(True).getOrCreate()"
+ - "Must use Spark + Faker + Pandas UDFs approach"
+ - "line_items.order_id must reference valid orders"
+ - "Membership level must be weighted: Bronze 50%, Silver 30%, Gold 15%, Platinum 5%"
+ - "Higher membership levels must generate more orders per customer"
+ metadata:
+ category: happy_path
+ difficulty: hard
+ source: interactive_execution
+ execution_date: '2026-03-02'
+ execution_verified: true
+ verified_output:
+ customers_rows: 10000
+ orders_rows: 50000
+ line_items_rows: 150000
+ membership_distribution:
+ Bronze: 5069
+ Silver: 3957
+ Gold: 919
+ Platinum: 55
+ orders_per_tier:
+ Bronze: 18170
+ Silver: 23560
+ Gold: 7613
+ Platinum: 657
+ orphan_orders: 0
+ orphan_line_items: 0
+ tags:
+ - databricks-connect
+ - serverless
+ - medium
+ - csv
+ - 3-tables
+ - pandas-udf
+ - referential-integrity
+ - weighted-sampling
+ - computed-totals
+ - executed
+
+- id: grp_20260303_manufacturing_delta_streaming_004
+ inputs:
+ prompt: |
+ Generate manufacturing data that will run incrementally with Python 3.12 and Databricks Serverless.
+ Create 2 related tables with referential integrity.
+ Create a sensor reading table that generates 10,000 rows per batch and configure to run as a streaming job.
+ Create a lookup table for the sensor device which changes slowly.
+ Save as Delta tables registered in Unity Catalog. Use catalog 'ai_dev_kit'. Use schema name 'devkit_gen4_test_large_delta'.
+ outputs:
+ expected_response_file: expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt
+ execution_success: true
+ expectations:
+ expected_facts:
+ - "DatabricksSession"
+ - "serverless"
+ - "Delta"
+ - "saveAsTable"
+ - "device_id"
+ - "sensor"
+ - "reading"
+ - "Faker"
+ - "pandas_udf"
+ - "streaming"
+ - "incremental"
+ - "batch"
+ - "slowly changing"
+ - "lognormal"
+ expected_patterns:
+ - pattern: "DatabricksSession.*serverless.*True"
+ min_count: 1
+ description: "Databricks Connect serverless configuration"
+ - pattern: "DatabricksEnv.*withDependencies"
+ min_count: 1
+ description: "Managed dependencies for serverless"
+ - pattern: "@F\\.pandas_udf|pandas_udf"
+ min_count: 1
+ description: "Pandas UDF for Faker parallelism"
+ - pattern: "device_id"
+ min_count: 3
+ description: "FK in devices and readings (multiple references)"
+ - pattern: "\\.write.*saveAsTable|saveAsTable"
+ min_count: 2
+ description: "Delta table output for both tables"
+ - pattern: "CREATE SCHEMA IF NOT EXISTS"
+ min_count: 1
+ description: "Infrastructure creation in script"
+ - pattern: "sensor_devices|sensor_readings"
+ min_count: 2
+ description: "Both sensor tables present"
+ - pattern: "temperature|pressure|vibration|humidity"
+ min_count: 4
+ description: "All device types present"
+ - pattern: "lognormal"
+ min_count: 1
+ description: "Log-normal distribution for vibration sensor values"
+ - pattern: "mode.*overwrite|mode.*append"
+ min_count: 1
+ description: "Write mode for streaming support"
+ guidelines:
+ - "Must use DatabricksSession.builder.serverless(True).getOrCreate()"
+ - "Must use Spark + Faker + Pandas UDFs approach"
+ - "Must maintain referential integrity between devices and readings"
+ - "Must use Delta tables (saveAsTable) not file formats"
+ - "sensor_readings should support incremental batch processing"
+ - "Vibration should use log-normal for occasional spikes"
+ metadata:
+ category: happy_path
+ difficulty: medium
+ source: interactive_execution
+ execution_date: '2026-03-03'
+ execution_verified: true
+ verified_output:
+ sensor_devices_rows: 500
+ sensor_readings_rows: 10013
+ device_type_distribution:
+ temperature: 147
+ pressure: 179
+ vibration: 140
+ humidity: 34
+ quality_flag_distribution:
+ good: 9008
+ warning: 979
+ error: 26
+ orphan_readings: 0
+ tags:
+ - databricks-connect
+ - serverless
+ - large
+ - delta
+ - 2-tables
+ - pandas-udf
+ - referential-integrity
+ - streaming
+ - incremental
+ - iot
+ - manufacturing
+ - executed
diff --git a/.test/skills/databricks-synthetic-data-gen/manifest.yaml b/.test/skills/databricks-synthetic-data-gen/manifest.yaml
new file mode 100644
index 00000000..330f5de7
--- /dev/null
+++ b/.test/skills/databricks-synthetic-data-gen/manifest.yaml
@@ -0,0 +1,36 @@
+skill_name: databricks-synthetic-data-gen
+description: Tests for Databricks synthetic data generation skill covering Spark + Faker + Pandas UDFs, execution methods, output formats, and referential integrity
+version: 1.0.0
+
+scorers:
+ enabled:
+ - python_syntax # Check Python code blocks for syntax errors
+ - no_hallucinated_apis # Detect deprecated/wrong APIs
+ - pattern_adherence # Regex match against expected patterns
+ - expected_facts_present # Check if required facts mentioned
+
+ llm_scorers:
+ - Safety
+ - guidelines_from_expectations
+
+ default_guidelines:
+ - "Response must generate complete, runnable Python code"
+ - "Code must use the execution method specified in the prompt"
+ - "Code must save data in the output format specified"
+
+quality_gates:
+ syntax_valid: 1.0 # 100% - all Python syntax must be valid
+ pattern_adherence: 0.9 # 90% - follow expected patterns
+ execution_success: 0.8 # 80% - code execution success rate
+ no_hallucinations: 1.0 # 100% - no deprecated/invalid APIs
+
+trace_expectations:
+ tool_limits:
+ Bash: 10
+ Read: 20
+ Write: 15
+ Edit: 15
+ token_budget:
+ max_total: 200000
+ required_tools: []
+ banned_tools: []
diff --git a/.test/src/skill_test/config.py b/.test/src/skill_test/config.py
index 275e25aa..f4a42cb8 100644
--- a/.test/src/skill_test/config.py
+++ b/.test/src/skill_test/config.py
@@ -83,6 +83,9 @@ class MLflowConfig:
tracking_uri: str = field(default_factory=lambda: _get_mlflow_tracking_uri())
experiment_name: str = field(default_factory=lambda: os.getenv("MLFLOW_EXPERIMENT_NAME", "/Shared/skill-tests"))
+ llm_judge_timeout: int = field(
+ default_factory=lambda: int(os.getenv("MLFLOW_LLM_JUDGE_TIMEOUT", "120"))
+ ) # seconds - timeout for LLM judge evaluation
def _get_mlflow_tracking_uri() -> str:
@@ -118,7 +121,7 @@ class DatabricksExecutionSettings:
schema: str = field(default_factory=lambda: os.getenv("SKILL_TEST_SCHEMA", "skill_test"))
# Execution settings
- timeout: int = 120 # seconds
+ timeout: int = 240 # seconds - increased from 120s to handle larger data generation tasks
preserve_context: bool = True # Reuse context across code blocks
diff --git a/.test/src/skill_test/dataset.py b/.test/src/skill_test/dataset.py
index 9941ef12..5c88c330 100644
--- a/.test/src/skill_test/dataset.py
+++ b/.test/src/skill_test/dataset.py
@@ -41,17 +41,34 @@ class YAMLDatasetSource:
yaml_path: Path
def load(self) -> List[EvalRecord]:
- """Load records from YAML ground_truth.yaml file."""
+ """Load records from YAML ground_truth.yaml file.
+
+ Supports external response files via 'expected_response_file' field in outputs.
+ When present, the response is loaded from the file relative to the YAML directory.
+ """
with open(self.yaml_path) as f:
data = yaml.safe_load(f)
+ yaml_dir = self.yaml_path.parent
+
records = []
for case in data.get("test_cases", []):
+ outputs = case.get("outputs")
+
+ # Load response from external file if specified
+ if outputs and "expected_response_file" in outputs:
+ response_file = yaml_dir / outputs["expected_response_file"]
+ if response_file.exists():
+ with open(response_file) as rf:
+ outputs = dict(outputs) # Copy to avoid modifying original
+ outputs["response"] = rf.read()
+ del outputs["expected_response_file"]
+
records.append(
EvalRecord(
id=case["id"],
inputs=case["inputs"],
- outputs=case.get("outputs"),
+ outputs=outputs,
expectations=case.get("expectations"),
metadata=case.get("metadata", {}),
)
diff --git a/.test/src/skill_test/grp/executor.py b/.test/src/skill_test/grp/executor.py
index 5cd393bb..6f8dedee 100644
--- a/.test/src/skill_test/grp/executor.py
+++ b/.test/src/skill_test/grp/executor.py
@@ -1,6 +1,7 @@
"""Execute code blocks from skill responses to verify they work."""
import ast
+import json
import re
import time
import yaml
@@ -192,6 +193,26 @@ def verify_yaml_syntax(code: str) -> ExecutionResult:
)
+def verify_json_syntax(code: str) -> ExecutionResult:
+ """Verify JSON syntax is valid."""
+ start_time = time.time()
+ try:
+ json.loads(code)
+ return ExecutionResult(
+ success=True,
+ output="JSON syntax valid",
+ error=None,
+ execution_time_ms=(time.time() - start_time) * 1000,
+ )
+ except json.JSONDecodeError as e:
+ return ExecutionResult(
+ success=False,
+ output="",
+ error=f"JSON syntax error: {e.msg} at line {e.lineno}, column {e.colno}",
+ execution_time_ms=(time.time() - start_time) * 1000,
+ )
+
+
def verify_bash_structure(code: str) -> ExecutionResult:
"""Verify bash code structure (basic validation for examples)."""
# For bash examples, just check that it's not empty and looks like shell commands
@@ -220,6 +241,8 @@ def execute_code_blocks(response: str) -> Tuple[int, int, List[Dict[str, Any]]]:
result = verify_sql_structure(block.code)
elif block.language in ("yaml", "yml"):
result = verify_yaml_syntax(block.code)
+ elif block.language == "json":
+ result = verify_json_syntax(block.code)
elif block.language in ("bash", "sh", "shell"):
result = verify_bash_structure(block.code)
else:
@@ -528,6 +551,16 @@ def execute_code_blocks_on_databricks(
mcp_execute_sql,
mcp_get_best_warehouse,
)
+ elif block.language == "json":
+ # JSON blocks are validated locally (e.g., job definitions)
+ json_result = verify_json_syntax(block.code)
+ result = DatabricksExecutionResult(
+ success=json_result.success,
+ output=json_result.output,
+ error=json_result.error,
+ execution_time_ms=json_result.execution_time_ms,
+ execution_mode="local",
+ )
else:
# Skip unknown languages
continue
diff --git a/.test/src/skill_test/runners/evaluate.py b/.test/src/skill_test/runners/evaluate.py
index 1dff1009..212dd92a 100644
--- a/.test/src/skill_test/runners/evaluate.py
+++ b/.test/src/skill_test/runners/evaluate.py
@@ -154,6 +154,7 @@ def evaluate_skill(
config: Optional[SkillTestConfig] = None,
run_name: Optional[str] = None,
filter_category: Optional[str] = None,
+ timeout: Optional[int] = None,
) -> Dict[str, Any]:
"""
Evaluate a skill using pre-computed outputs (Pattern 2).
@@ -163,6 +164,7 @@ def evaluate_skill(
config: Configuration (uses defaults if None)
run_name: MLflow run name
filter_category: Filter test cases by category
+ timeout: Timeout in seconds for LLM judge evaluation (overrides config)
Returns:
Evaluation results dict with metrics and run_id
@@ -170,6 +172,9 @@ def evaluate_skill(
if config is None:
config = SkillTestConfig()
+ # Use provided timeout or fall back to config
+ eval_timeout = timeout if timeout is not None else config.mlflow.llm_judge_timeout
+
setup_mlflow(config)
# Load ground truth
@@ -192,13 +197,19 @@ def evaluate_skill(
else:
scorers = get_default_scorers()
- # Run evaluation
+ # Run evaluation with timeout
with mlflow.start_run(run_name=run_name or f"{skill_name}_eval"):
mlflow.set_tags(
- {"skill_name": skill_name, "test_count": len(eval_data), "filter_category": filter_category or "all"}
+ {
+ "skill_name": skill_name,
+ "test_count": len(eval_data),
+ "filter_category": filter_category or "all",
+ "timeout_seconds": eval_timeout,
+ }
)
# No predict_fn - using pre-computed outputs
+    # Run evaluation directly; eval_timeout is recorded in run tags (TODO: enforce as a hard timeout)
results = mlflow.genai.evaluate(data=eval_data, scorers=scorers)
return {
diff --git a/.test/src/skill_test/scorers/routing.py b/.test/src/skill_test/scorers/routing.py
index 1a03d698..fad45033 100644
--- a/.test/src/skill_test/scorers/routing.py
+++ b/.test/src/skill_test/scorers/routing.py
@@ -52,7 +52,7 @@
"rest api",
],
"databricks-jobs": ["job", "workflow", "task", "schedule", "trigger"],
- "databricks-synthetic-data-generation": [
+ "databricks-synthetic-data-gen": [
"synthetic data",
"fake data",
"generate data",
diff --git a/.test/tests/test_scorers.py b/.test/tests/test_scorers.py
index 66a39dbf..de5b0c09 100644
--- a/.test/tests/test_scorers.py
+++ b/.test/tests/test_scorers.py
@@ -52,10 +52,10 @@ def test_detect_mlflow_evaluation(self):
assert "databricks-mlflow-evaluation" in skills
def test_detect_synthetic_data(self):
- """Test detection of databricks-synthetic-data-generation skill."""
+ """Test detection of databricks-synthetic-data-gen skill."""
prompt = "Generate synthetic data for testing"
skills = detect_skills_from_prompt(prompt)
- assert "databricks-synthetic-data-generation" in skills
+ assert "databricks-synthetic-data-gen" in skills
def test_detect_agent_bricks(self):
"""Test detection of databricks-agent-bricks skill."""
@@ -175,7 +175,7 @@ def test_all_skills_have_triggers(self):
"databricks-asset-bundles",
"databricks-python-sdk",
"databricks-jobs",
- "databricks-synthetic-data-generation",
+ "databricks-synthetic-data-gen",
"databricks-mlflow-evaluation",
"databricks-agent-bricks",
"databricks-lakebase-provisioned",
diff --git a/databricks-builder-app/.env.example b/databricks-builder-app/.env.example
index c95a818f..f50ed4b6 100644
--- a/databricks-builder-app/.env.example
+++ b/databricks-builder-app/.env.example
@@ -53,10 +53,10 @@ DATABRICKS_MODEL_MINI=databricks-gemini-3-flash
# Skills Configuration
# =============================================================================
# Skills to include (comma-separated list of skill folder names)
-ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation
+ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation
# Optional: Add additional skills (example with databricks- prefixed skills)
-# ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation
+# ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation
# Test mode: only enable Skill tool (useful for debugging)
SKILLS_ONLY_MODE=false
diff --git a/databricks-builder-app/README.md b/databricks-builder-app/README.md
index b6a43135..42031cee 100644
--- a/databricks-builder-app/README.md
+++ b/databricks-builder-app/README.md
@@ -179,7 +179,7 @@ Skills include:
- **databricks-python-sdk**: Python SDK patterns
- **databricks-mlflow-evaluation**: MLflow evaluation and trace analysis
- **databricks-spark-declarative-pipelines**: Spark Declarative Pipelines (SDP) development
-- **databricks-synthetic-data-generation**: Creating test datasets
+- **databricks-synthetic-data-gen**: Creating test datasets
### 5. Project Persistence
@@ -329,7 +329,7 @@ Skills are loaded from `../databricks-skills/` and filtered by the `ENABLED_SKIL
- `databricks-python-sdk`: Patterns for using the Databricks Python SDK
- `databricks-spark-declarative-pipelines`: SDP/DLT pipeline development
-- `databricks-synthetic-data-generation`: Creating test datasets
+- `databricks-synthetic-data-gen`: Creating test datasets
- `databricks-app-apx`: Full-stack apps with React (APX framework)
- `databricks-app-python`: Python apps with Dash, Streamlit, Flask
diff --git a/databricks-builder-app/app.yaml.example b/databricks-builder-app/app.yaml.example
index 4f77f7a7..8a5c0207 100644
--- a/databricks-builder-app/app.yaml.example
+++ b/databricks-builder-app/app.yaml.example
@@ -30,7 +30,7 @@ env:
# =============================================================================
# Comma-separated list of skills to enable
- name: ENABLED_SKILLS
- value: "databricks-asset-bundles,databricks-agent-bricks,databricks-aibi-dashboards,databricks-app-apx,databricks-app-python,databricks-config,databricks-docs,databricks-jobs,databricks-python-sdk,databricks-unity-catalog,databricks-mlflow-evaluation,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation"
+ value: "databricks-asset-bundles,databricks-agent-bricks,databricks-aibi-dashboards,databricks-app-apx,databricks-app-python,databricks-config,databricks-docs,databricks-jobs,databricks-python-sdk,databricks-unity-catalog,databricks-mlflow-evaluation,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation"
- name: SKILLS_ONLY_MODE
value: "false"
diff --git a/databricks-builder-app/client/src/pages/DocPage.tsx b/databricks-builder-app/client/src/pages/DocPage.tsx
index f8b7b29c..b7ee35ec 100644
--- a/databricks-builder-app/client/src/pages/DocPage.tsx
+++ b/databricks-builder-app/client/src/pages/DocPage.tsx
@@ -92,7 +92,7 @@ function OverviewSection() {
Skills explain how to do things and reference the tools from databricks-tools-core.
- {['databricks-asset-bundles/', 'databricks-app-apx/', 'databricks-app-python/', 'databricks-python-sdk/', 'databricks-mlflow-evaluation/', 'databricks-spark-declarative-pipelines/', 'databricks-synthetic-data-generation/'].map((skill) => (
+ {['databricks-asset-bundles/', 'databricks-app-apx/', 'databricks-app-python/', 'databricks-python-sdk/', 'databricks-mlflow-evaluation/', 'databricks-spark-declarative-pipelines/', 'databricks-synthetic-data-gen/'].map((skill) => (
{skill}
@@ -204,7 +204,7 @@ function OverviewSection() {
Read Skill
- Claude reads databricks-synthetic-data-generation/ skill to learn best practices
+ Claude reads databricks-synthetic-data-gen/ skill to learn best practices
{['Non-linear distributions', 'Referential integrity', 'Time patterns', 'Row coherence'].map((item) => (
diff --git a/databricks-builder-app/server/services/system_prompt.py b/databricks-builder-app/server/services/system_prompt.py
index 5b7b4fef..fd18f6cf 100644
--- a/databricks-builder-app/server/services/system_prompt.py
+++ b/databricks-builder-app/server/services/system_prompt.py
@@ -5,7 +5,7 @@
# Mapping of user request patterns to skill names for the selection guide.
# Only entries whose skill is enabled will be included in the prompt.
_SKILL_GUIDE_ENTRIES = [
- ('Generate data, synthetic data, fake data, test data', 'databricks-synthetic-data-generation'),
+ ('Generate data, synthetic data, fake data, test data', 'databricks-synthetic-data-gen'),
('Pipeline, ETL, bronze/silver/gold, data transformation', 'databricks-spark-declarative-pipelines'),
('Dashboard, visualization, BI, charts', 'databricks-aibi-dashboards'),
('Job, workflow, schedule, automation', 'databricks-jobs'),
diff --git a/databricks-skills/README.md b/databricks-skills/README.md
index afaccd9d..29a79ae8 100644
--- a/databricks-skills/README.md
+++ b/databricks-skills/README.md
@@ -58,7 +58,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/
- **databricks-iceberg** - Apache Iceberg tables (Managed/Foreign), UniForm, Iceberg REST Catalog, Iceberg Clients Interoperability
- **databricks-spark-declarative-pipelines** - SDP (formerly DLT) in SQL/Python
- **databricks-jobs** - Multi-task workflows, triggers, schedules
-- **databricks-synthetic-data-generation** - Realistic test data with Faker
+- **databricks-synthetic-data-gen** - Realistic test data with Faker
### 🚀 Development & Deployment
- **databricks-asset-bundles** - DABs for multi-environment deployments
diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md
index 4aff7acb..04be7dad 100644
--- a/databricks-skills/databricks-agent-bricks/SKILL.md
+++ b/databricks-skills/databricks-agent-bricks/SKILL.md
@@ -28,7 +28,7 @@ Before creating Agent Bricks, ensure you have the required data:
### For Genie Spaces
- **See the `databricks-genie` skill** for comprehensive Genie Space guidance
- Tables in Unity Catalog with the data to explore
-- Generate raw data using the `databricks-synthetic-data-generation` skill
+- Generate raw data using the `databricks-synthetic-data-gen` skill
- Create tables using the `databricks-spark-declarative-pipelines` skill
### For Supervisor Agents
@@ -119,7 +119,7 @@ Before creating Agent Bricks, generate the required source data:
**For Genie (SQL exploration)**:
```
-1. Use `databricks-synthetic-data-generation` skill to create raw parquet data
+1. Use `databricks-synthetic-data-gen` skill to create raw parquet data
2. Use `databricks-spark-declarative-pipelines` skill to create bronze/silver/gold tables
```
@@ -199,7 +199,7 @@ manage_mas(
- **[databricks-genie](../databricks-genie/SKILL.md)** - Comprehensive Genie Space creation, curation, and Conversation API guidance
- **[databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md)** - Generate synthetic PDFs to feed into Knowledge Assistants
-- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Create raw data for Genie Space tables
+- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Create raw data for Genie Space tables
- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces
- **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - Deploy custom agent endpoints used as MAS agents
- **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Build vector indexes for RAG applications paired with KAs
diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md
index 576771da..e5b32b6e 100644
--- a/databricks-skills/databricks-genie/SKILL.md
+++ b/databricks-skills/databricks-genie/SKILL.md
@@ -107,7 +107,7 @@ Before creating a Genie Space:
### Creating Tables
Use these skills in sequence:
-1. `databricks-synthetic-data-generation` - Generate raw parquet files
+1. `databricks-synthetic-data-gen` - Generate raw parquet files
2. `databricks-spark-declarative-pipelines` - Create bronze/silver/gold tables
## Common Issues
@@ -121,6 +121,6 @@ Use these skills in sequence:
## Related Skills
- **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Use Genie Spaces as agents inside Supervisor Agents
-- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate raw parquet data to populate tables for Genie
+- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate raw parquet data to populate tables for Genie
- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces
- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Manage the catalogs, schemas, and tables Genie queries
diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md
index 8549d6bd..225efe0e 100644
--- a/databricks-skills/databricks-genie/spaces.md
+++ b/databricks-skills/databricks-genie/spaces.md
@@ -163,7 +163,7 @@ The tool finds the existing space by name and updates it.
## Example End-to-End Workflow
-1. **Generate synthetic data** using `databricks-synthetic-data-generation` skill:
+1. **Generate synthetic data** using `databricks-synthetic-data-gen` skill:
- Creates parquet files in `/Volumes/catalog/schema/raw_data/`
2. **Create tables** using `databricks-spark-declarative-pipelines` skill:
diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md
index 48a698f8..60afef0b 100644
--- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md
+++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md
@@ -573,5 +573,5 @@ For advanced configuration options (development mode, continuous pipelines, cust
- **[databricks-jobs](../databricks-jobs/SKILL.md)** - for orchestrating and scheduling pipeline runs
- **[databricks-asset-bundles](../databricks-asset-bundles/SKILL.md)** - for multi-environment deployment of pipeline projects
-- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - for generating test data to feed into pipelines
+- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - for generating test data to feed into pipelines
- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - for catalog/schema/volume management and governance
diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md
new file mode 100644
index 00000000..5bd95e58
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md
@@ -0,0 +1,258 @@
+---
+name: databricks-synthetic-data-gen
+description: "Generate realistic synthetic data using Spark + Faker (strongly recommended). Supports serverless execution, multiple output formats (Parquet/JSON/CSV/Delta), and scales from thousands to millions of rows. For small datasets (<10K rows), can optionally generate locally and upload to volumes. Use when user mentions 'synthetic data', 'test data', 'generate data', 'demo dataset', 'Faker', or 'sample data'."
+---
+
+> Catalog and schema are **always user-supplied** — never default to any value. If the user hasn't provided them, ask. For any UC write, **always create the schema if it doesn't exist** before writing data.
+
+# Databricks Synthetic Data Generation
+
+Generate realistic, story-driven synthetic data for Databricks using **Spark + Faker + Pandas UDFs** (strongly recommended).
+
+## Quick Reference
+
+| Topic | Guide | When to Use |
+|-------|-------|-------------|
+| **Setup & Execution** | [references/1-setup-and-execution.md](references/1-setup-and-execution.md) | Setting up environment, choosing compute, installing dependencies |
+| **Generation Approaches** | [references/2-generation-approaches.md](references/2-generation-approaches.md) | Choosing Spark UDFs vs Polars local, writing generation code |
+| **Data Patterns** | [references/3-data-patterns.md](references/3-data-patterns.md) | Creating realistic distributions, referential integrity, time patterns |
+| **Domain Guidance** | [references/4-domain-guidance.md](references/4-domain-guidance.md) | E-commerce, IoT, financial, support/CRM domain patterns |
+| **Output Formats** | [references/5-output-formats.md](references/5-output-formats.md) | Choosing output format, saving to volumes/tables |
+| **Troubleshooting** | [references/6-troubleshooting.md](references/6-troubleshooting.md) | Fixing errors, debugging issues |
+| **Example Script** | [scripts/generate_synthetic_data.py](scripts/generate_synthetic_data.py) | Complete Spark + Pandas UDF example |
+
+## Package Manager
+
+Prefer `uv` for all Python operations. Fall back to `pip` only if `uv` is not available.
+
+```bash
+# Preferred
+uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays
+uv run python generate_data.py
+
+# Fallback if uv not available
+pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays
+python generate_data.py
+```
+
+## Critical Rules
+
+1. **Strongly prefer to use Spark + Faker + Pandas UDFs** for data generation (scalable, parallel)
+2. **If the user requests local generation**, use Polars instead of Spark, but recommend Spark for datasets over 10,000 rows.
+3. **Present a plan for user approval** before generating any code
+4. **Ask for catalog/schema** - do not default
+5. **Use serverless compute** unless user explicitly requests classic cluster
+6. **Generate raw data only** - no pre-aggregated fields (unless user requests)
+7. **Create master tables first** - then generate related tables with valid FKs
+8. **NEVER use `.cache()` or `.persist()` with serverless compute** - these operations are NOT supported and will fail with `AnalysisException: PERSIST TABLE is not supported on serverless compute`. Instead, write master tables to Delta first, then read them back for FK joins.
+
+## Generation Planning Workflow
+
+**Before generating any code, you MUST present a plan for user approval.**
+
+### ⚠️ MUST DO: Confirm Catalog Before Proceeding
+
+**You MUST explicitly ask the user which catalog to use.** Do not assume or proceed without confirmation.
+
+Example prompt to user:
+> "Which Unity Catalog should I use for this data?"
+
+When presenting your plan, always show the selected catalog prominently:
+```
+📍 Output Location: catalog_name.schema_name
+ Volume: /Volumes/catalog_name/schema_name/raw_data/
+```
+
+This makes it easy for the user to spot and correct if needed.
+
+### Step 1: Gather Requirements
+
+Ask the user about:
+- **Catalog/Schema** - Which catalog to use?
+- What domain/scenario? (e-commerce, support tickets, IoT sensors, etc.)
+- How many tables? What relationships between them?
+- Approximate row counts per table?
+- Output format preference? (Delta table is default)
+
+### Step 2: Present Table Specification
+
+Show a clear specification with **YOUR ASSUMPTIONS surfaced**. Always start with the output location:
+
+```
+📍 Output Location: {user_catalog}.ecommerce_demo
+ Volume: /Volumes/{user_catalog}/ecommerce_demo/raw_data/
+```
+
+| Table | Columns | Rows | Key Assumptions |
+|-------|---------|------|-----------------|
+| customers | customer_id, name, email, tier, region | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% |
+| orders | order_id, customer_id (FK), amount, status | 15,000 | Enterprise customers generate 5x more orders |
+
+**Assumptions I'm making:**
+- Amount distribution: log-normal by tier (Enterprise ~$1800, Pro ~$245, Free ~$55)
+- Status: 65% delivered, 15% shipped, 10% processing, 5% pending, 5% cancelled
+
+**Ask user**: "Does this look correct? Any adjustments to the catalog, tables, or distributions?"
+
+### Step 3: Ask About Data Features
+
+- [x] Skew (non-uniform distributions) - **Enabled by default**
+- [x] Joins (referential integrity) - **Enabled by default**
+- [ ] Bad data injection (for data quality testing)
+- [ ] Multi-language text
+- [ ] Incremental mode (append vs overwrite)
+
+### Pre-Generation Checklist
+
+- [ ] **Catalog confirmed** - User explicitly approved which catalog to use
+- [ ] Output location shown prominently in plan (easy to spot/change)
+- [ ] Table specification shown and approved
+- [ ] Assumptions about distributions confirmed
+- [ ] User confirmed compute preference (serverless recommended)
+- [ ] Data features selected
+
+**Do NOT proceed to code generation until user approves the plan, including the catalog.**
+
+## Quick Start: Spark + Faker + Pandas UDFs
+
+```python
+from databricks.connect import DatabricksSession, DatabricksEnv
+from pyspark.sql import functions as F
+from pyspark.sql.types import StringType, DoubleType
+import pandas as pd
+import numpy as np
+
+# Setup with managed dependencies (databricks-connect 16.4+)
+env = DatabricksEnv().withDependencies("faker", "pandas", "numpy")
+spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate()
+
+# Define Pandas UDFs
+@F.pandas_udf(StringType())
+def fake_name(ids: pd.Series) -> pd.Series:
+ from faker import Faker
+ fake = Faker()
+ return pd.Series([fake.name() for _ in range(len(ids))])
+
+@F.pandas_udf(DoubleType())
+def generate_amount(tiers: pd.Series) -> pd.Series:
+ amounts = []
+ for tier in tiers:
+ if tier == "Enterprise":
+ amounts.append(float(np.random.lognormal(7.5, 0.8)))
+ elif tier == "Pro":
+ amounts.append(float(np.random.lognormal(5.5, 0.7)))
+ else:
+ amounts.append(float(np.random.lognormal(4.0, 0.6)))
+ return pd.Series(amounts)
+
+# Generate customers
+customers_df = (
+ spark.range(0, 10000, numPartitions=16)
+ .select(
+ F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"),
+ fake_name(F.col("id")).alias("name"),
+        F.when(F.rand() < 0.6, "Free")
+         .when(F.rand() < 0.75, "Pro")  # independent draw: 0.4 * 0.75 = 30% Pro, 10% Enterprise
+         .otherwise("Enterprise").alias("tier"),
+ )
+ .withColumn("arr", generate_amount(F.col("tier")))
+)
+
+# Save to Unity Catalog
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+customers_df.write.mode("overwrite").parquet(f"/Volumes/{CATALOG}/{SCHEMA}/raw_data/customers")
+```
+
+## Common Patterns
+
+### Weighted Tier Distribution
+```python
+F.when(F.rand() < 0.6, "Free")
+ .when(F.rand() < 0.75, "Pro")  # independent draw: 0.4 * 0.75 = 30% Pro, 10% Enterprise
+ .otherwise("Enterprise").alias("tier")
+```
+
+### Log-Normal Amounts (Realistic Pricing)
+```python
+@F.pandas_udf(DoubleType())
+def generate_amount(tiers: pd.Series) -> pd.Series:
+ return pd.Series([
+ float(np.random.lognormal({"Enterprise": 7.5, "Pro": 5.5, "Free": 4.0}[t], 0.7))
+ for t in tiers
+ ])
+```
+
+### Date Range (Last 6 Months)
+```python
+from datetime import datetime, timedelta
+END_DATE = datetime.now()
+START_DATE = END_DATE - timedelta(days=180)
+
+F.date_add(F.lit(START_DATE.date()), (F.rand() * 180).cast("int")).alias("order_date")
+```
+
+### Infrastructure Creation
+```python
+# Always in script - assume catalog exists
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+```
+
+## Execution Modes
+
+| Mode | Best For | Setup |
+|------|----------|-------|
+| **DB Connect 16.4+ Serverless** | Local dev, Python 3.12+ | `DatabricksEnv().withDependencies(...)` |
+| **Serverless Job** | Production, scheduled | Job with `environments` parameter |
+| **Classic Cluster** | Fallback only | Use Databricks CLI to install libraries. `databricks libraries install --json '{"cluster_id": "CLUSTER_ID", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` |
+
+See [references/1-setup-and-execution.md](references/1-setup-and-execution.md) for detailed setup instructions.
+
+## Output Formats
+
+| Format | Use Case | Code |
+|--------|----------|------|
+| **Parquet** (default) | SDP pipeline input | `df.write.parquet(path)` |
+| **JSON** | Log-style ingestion | `df.write.json(path)` |
+| **CSV** | Legacy systems | `df.write.option("header", "true").csv(path)` |
+| **Delta Table** | Direct analytics | `df.write.saveAsTable("catalog.schema.table")` |
+
+See [references/5-output-formats.md](references/5-output-formats.md) for detailed options.
+
+## Best Practices Summary
+
+### Execution
+- Use serverless (instant start, no cluster wait)
+- Ask for catalog/schema
+- Present plan before generating
+
+### Data Generation
+- **Spark + Faker + Pandas UDFs** for all cases
+- Master tables first, then related tables with valid FKs
+- Non-linear distributions (log-normal, Pareto, exponential)
+- Time patterns (weekday/weekend, holidays, seasonality)
+- Row coherence (correlated attributes)
+
+### Output
+- Create infrastructure in script (`CREATE SCHEMA/VOLUME IF NOT EXISTS`)
+- Do NOT create catalogs - assume they exist
+- Delta tables as default
+
+## Related Skills
+
+- **databricks-unity-catalog** - Managing catalogs, schemas, and volumes
+- **databricks-asset-bundles** - DABs for production deployment
+
+## Common Issues
+
+| Issue | Solution |
+|-------|----------|
+| `ModuleNotFoundError: faker` | See [references/1-setup-and-execution.md](references/1-setup-and-execution.md) |
+| Faker UDF is slow | Use `pandas_udf` for batch processing |
+| Out of memory | Increase `numPartitions` in `spark.range()` |
+| Referential integrity errors | Write master table to Delta first, read back for FK joins |
+| `PERSIST TABLE is not supported on serverless` | **NEVER use `.cache()` or `.persist()` with serverless** - write to Delta table first, then read back |
+| `F.window` vs `Window` confusion | Use `from pyspark.sql.window import Window` for `row_number()`, `rank()`, etc. `F.window` buckets rows into time windows for time-based aggregation — it is not for row ordering. |
+
+See [references/6-troubleshooting.md](references/6-troubleshooting.md) for full troubleshooting guide.
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md b/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md
new file mode 100644
index 00000000..3ec36fbc
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md
@@ -0,0 +1,278 @@
+# Setup and Execution Guide
+
+This guide covers all execution modes for synthetic data generation, organized by Databricks Connect version and Python version.
+
+## Quick Decision Matrix
+
+| Your Environment | Recommended Approach |
+|------------------|---------------------|
+| Python 3.12+ with databricks-connect >= 16.4 | DatabricksEnv with withDependencies API |
+| Python 3.10/3.11 with older databricks-connect | Serverless job with environments parameter |
+| Classic compute (fallback only) | Manual cluster setup |
+
+## Option 1: Databricks Connect 16.4+ with Serverless (Recommended)
+
+**Best for:** Python 3.12+, local development with serverless compute
+
+**Install locally:**
+```bash
+# Preferred
+uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays
+
+# Fallback if uv not available
+pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays
+```
+
+**Configure ~/.databrickscfg:**
+```ini
+[DEFAULT]
+host = https://your-workspace.cloud.databricks.com/
+serverless_compute_id = auto
+auth_type = databricks-cli
+```
+
+**In your script:**
+```python
+from databricks.connect import DatabricksSession, DatabricksEnv
+
+# Pass dependencies as simple package name strings
+env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays")
+
+# Create session with managed dependencies
+spark = (
+ DatabricksSession.builder
+ .withEnvironment(env)
+ .serverless(True)
+ .getOrCreate()
+)
+
+# Spark operations now execute on serverless compute with managed dependencies
+```
+
+**Version Detection (if needed):**
+```python
+import importlib.metadata
+
+def get_databricks_connect_version():
+ """Get databricks-connect version as (major, minor) tuple."""
+ try:
+ version_str = importlib.metadata.version('databricks-connect')
+ parts = version_str.split('.')
+ return (int(parts[0]), int(parts[1]))
+ except Exception:
+ return None
+
+db_version = get_databricks_connect_version()
+if db_version and db_version >= (16, 4):
+ # Use DatabricksEnv with withDependencies
+ pass
+```
+
+**Benefits:**
+- Instant start, no cluster wait
+- Local debugging and fast iteration
+- Automatic dependency management
+- Edit file, re-run immediately
+
+## Option 2: Older Databricks Connect or Python < 3.12
+
+**Best for:** Python 3.10/3.11, databricks-connect 15.1-16.3
+
+`DatabricksEnv()` and `withEnvironment()` are NOT available in older versions. Use serverless jobs with environments parameter instead.
+
+### Serverless Job Configuration Requirements
+
+**MUST use `"client": "4"` in the Environment Spec:**
+
+```json
+{
+ "environments": [{
+ "environment_key": "datagen_env",
+ "spec": {
+ "client": "4",
+ "dependencies": ["faker", "numpy", "pandas"]
+ }
+ }]
+}
+```
+
+> **Note:** Using `"client": "1"` will fail with environment configuration errors.
+
+### Script Deployment
+
+Deploy Python files (.py) to the workspace for serverless jobs:
+
+```bash
+databricks workspace import /Users/your.name@databricks.com/scripts/my_script.py \
+  --file ./my_script.py --format AUTO
+
+databricks workspace list /Users/your.name@databricks.com/scripts/
+```
+
+**Job config must reference the workspace path:**
+
+```json
+{
+ "spark_python_task": {
+    "python_file": "/Users/your.name@databricks.com/scripts/my_script.py"
+ },
+ "environment_key": "datagen_env"
+}
+```
+
+**DABs bundle configuration:**
+```yaml
+# databricks.yml
+bundle:
+ name: synthetic-data-gen
+
+resources:
+ jobs:
+ generate_data:
+ name: "Generate Synthetic Data"
+ tasks:
+ - task_key: generate
+ spark_python_task:
+ python_file: ./src/generate_data.py
+ environment_key: default
+
+environments:
+ default:
+ spec:
+ client: "4"
+ dependencies:
+ - faker
+ - numpy
+ - pandas
+ - holidays
+```
+
+## Option 3: Classic Cluster
+
+**Use when:** Serverless unavailable, or specific cluster features needed (GPUs, custom init scripts)
+
+### Step 1: Check Python Version Compatibility
+
+Pandas UDFs require matching Python minor versions between local and cluster.
+
+```bash
+# Check local Python
+uv run python --version # or: python --version
+
+# Check cluster DBR version → Python version
+# DBR 17.x = Python 3.12
+# DBR 15.4 LTS = Python 3.11
+# DBR 14.3 LTS = Python 3.10
+databricks clusters get CLUSTER_ID | grep spark_version
+```
+
+### Step 2a: If Versions Match → Use Databricks Connect
+
+```bash
+# Install matching databricks-connect version (must match DBR major.minor)
+uv pip install "databricks-connect==17.3.*" faker numpy pandas holidays
+```
+
+```bash
+# Install libraries on cluster
+databricks libraries install --json '{"cluster_id": "CLUSTER_ID", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'
+
+# Wait for INSTALLED status
+databricks libraries cluster-status CLUSTER_ID
+```
+
+```python
+# Run locally via Databricks Connect
+from databricks.connect import DatabricksSession
+
+spark = DatabricksSession.builder.clusterId("CLUSTER_ID").getOrCreate()
+# Your Spark code runs on the cluster
+```
+
+### Step 2b: If Versions Don't Match → Submit as Job
+
+**Ask user for approval before submitting.** Example prompt:
+> "Your local Python (3.11) doesn't match the cluster (3.12). Pandas UDFs require matching versions. Should I submit this as a job to run directly on the cluster instead?"
+
+```bash
+# Upload script to workspace
+databricks workspace import /Users/you@company.com/scripts/generate_data.py \
+ --file generate_data.py --format AUTO --overwrite
+
+# Submit job to run on cluster
+databricks jobs submit --json '{
+ "run_name": "Generate Data",
+ "tasks": [{
+ "task_key": "generate",
+    "existing_cluster_id": "CLUSTER_ID",
+ "spark_python_task": {
+ "python_file": "/Users/you@company.com/scripts/generate_data.py"
+ }
+ }]
+}'
+```
+
+### Classic Cluster Decision Flow
+
+```
+Local Python == Cluster Python?
+ ├─ YES → Install libs on cluster, run via Databricks Connect
+ └─ NO → Ask user: "Submit as job instead?"
+ └─ Upload script + submit job
+```
+
+## Required Libraries
+
+Standard libraries for generating realistic synthetic data:
+
+| Library | Purpose | Required For |
+|---------|---------|--------------|
+| **faker** | Realistic names, addresses, emails, companies | Text data generation |
+| **numpy** | Statistical distributions | Non-linear distributions |
+| **pandas** | Data manipulation, Pandas UDFs | Spark UDF definitions |
+| **holidays** | Country-specific holiday calendars | Time-based patterns |
+
+## Environment Detection Pattern
+
+Use this pattern to auto-detect environment and choose the right session creation:
+
+```python
+import os
+import importlib.metadata
+
+def is_databricks_runtime():
+ """Check if running on Databricks Runtime vs locally."""
+ return "DATABRICKS_RUNTIME_VERSION" in os.environ
+
+def get_databricks_connect_version():
+ """Get databricks-connect version as (major, minor) tuple or None."""
+ try:
+ version_str = importlib.metadata.version('databricks-connect')
+ parts = version_str.split('.')
+ return (int(parts[0]), int(parts[1]))
+ except Exception:
+ return None
+
+on_runtime = is_databricks_runtime()
+db_version = get_databricks_connect_version()
+
+# Use DatabricksEnv if: locally + databricks-connect >= 16.4
+use_auto_dependencies = (not on_runtime) and db_version and db_version >= (16, 4)
+
+if use_auto_dependencies:
+ from databricks.connect import DatabricksSession, DatabricksEnv
+ env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays")
+ spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate()
+else:
+ from databricks.connect import DatabricksSession
+ spark = DatabricksSession.builder.serverless(True).getOrCreate()
+```
+
+## Common Setup Issues
+
+| Issue | Solution |
+|-------|----------|
+| `ModuleNotFoundError: faker` | Install dependencies per execution mode above |
+| `DatabricksEnv not found` | Upgrade to databricks-connect >= 16.4 or use job with environments |
+| `serverless_compute_id` error | Add `serverless_compute_id = auto` to ~/.databrickscfg |
+| Classic cluster startup slow | Use serverless instead (instant start) |
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md b/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md
new file mode 100644
index 00000000..5d6feeca
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md
@@ -0,0 +1,205 @@
+# Data Generation Approaches
+
+Choose your approach based on scale and requirements. **Spark + Faker + Pandas UDFs is strongly preferred** for all cases.
+
+## Decision Table
+
+| Scenario | Recommended Approach |
+|----------|---------------------|
+| **Default - any data generation** | **Spark + Faker + Pandas UDFs** |
+| Large datasets (100K+ rows) | **Spark + Faker + Pandas UDFs** |
+| Medium datasets (10K-100K rows) | **Spark + Faker + Pandas UDFs** |
+| Small datasets (<10K rows) | **Spark + Faker + Pandas UDFs** (or Polars if user prefers local) |
+
+**Rule:** Always use Spark + Faker + Pandas UDFs unless user explicitly requests local generation for <10K rows.
+
+---
+
+## Approach 1: Spark + Faker + Pandas UDFs (Strongly Preferred)
+
+**Best for:** All dataset sizes, direct write to Unity Catalog
+
+**Why this approach:**
+- Scales from thousands to millions of rows
+- Parallel execution via Spark
+- Direct integration with Unity Catalog
+- No intermediate files or uploads needed
+- Works with serverless and classic compute
+
+### Basic Pattern
+
+```python
+from pyspark.sql import functions as F
+from pyspark.sql.types import StringType, DoubleType
+from faker import Faker
+import pandas as pd
+import numpy as np
+
+# Define Pandas UDFs for Faker data (batch processing for parallelism)
+@F.pandas_udf(StringType())
+def fake_name(ids: pd.Series) -> pd.Series:
+ fake = Faker()
+ return pd.Series([fake.name() for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_company(ids: pd.Series) -> pd.Series:
+ fake = Faker()
+ return pd.Series([fake.company() for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_email(ids: pd.Series) -> pd.Series:
+ fake = Faker()
+ return pd.Series([fake.email() for _ in range(len(ids))])
+
+@F.pandas_udf(DoubleType())
+def generate_lognormal_amount(tiers: pd.Series) -> pd.Series:
+ """Generate amount based on tier using log-normal distribution."""
+ amounts = []
+ for tier in tiers:
+ if tier == "Enterprise":
+ amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8)))
+ elif tier == "Pro":
+ amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7)))
+ else:
+ amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6)))
+ return pd.Series(amounts)
+```
+
+### Generate Data with Spark + Pandas UDFs
+
+```python
+# Configuration
+N_CUSTOMERS = 100_000
+PARTITIONS = 16 # Adjust based on data size: 8 for <100K, 32 for 1M+
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Generate customers with Spark + Pandas UDFs
+customers_df = (
+ spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS)
+ .select(
+ F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"),
+ fake_name(F.col("id")).alias("name"),
+ fake_company(F.col("id")).alias("company"),
+ fake_email(F.col("id")).alias("email"),
+ F.when(F.rand() < 0.6, "Free")
+ .when(F.rand() < 0.9, "Pro")
+ .otherwise("Enterprise").alias("tier"),
+ F.when(F.rand() < 0.4, "North")
+ .when(F.rand() < 0.65, "South")
+ .when(F.rand() < 0.85, "East")
+ .otherwise("West").alias("region"),
+ )
+)
+
+# Add tier-based amount
+customers_df = customers_df.withColumn("arr", generate_lognormal_amount(F.col("tier")))
+
+# Write directly to Unity Catalog volume
+customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers")
+```
+
+### Partitioning Strategy
+
+| Data Size | Recommended Partitions |
+|-----------|----------------------|
+| < 100K rows | 8 partitions |
+| 100K - 500K rows | 16 partitions |
+| 500K - 1M rows | 32 partitions |
+| 1M+ rows | 64+ partitions |
+
+---
+
+## Approach 2: Polars + Local Generation + Upload (Secondary Option)
+
+**Use only when:** Dataset <10K rows AND user explicitly prefers local generation
+
+**Why this approach exists:**
+- No Spark overhead for tiny datasets
+- Quick prototyping in local environment
+- When Databricks Connect not available
+
+**Limitations:**
+- Doesn't scale past ~100K rows
+- Requires manual upload step
+- No direct Unity Catalog integration
+
+### Install Local Dependencies
+
+```bash
+# Preferred: use uv for fast, reliable installs
+uv pip install polars faker numpy
+
+# Alternative if uv not available
+pip install polars faker numpy
+```
+
+### Generate Locally with Polars
+
+```python
+import polars as pl
+from faker import Faker
+import numpy as np
+
+fake = Faker()
+N_CUSTOMERS = 5000
+
+# Generate with Polars
+customers = pl.DataFrame({
+ "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)],
+ "name": [fake.name() for _ in range(N_CUSTOMERS)],
+ "email": [fake.email() for _ in range(N_CUSTOMERS)],
+ "tier": np.random.choice(["Free", "Pro", "Enterprise"], N_CUSTOMERS, p=[0.6, 0.3, 0.1]).tolist(),
+ "region": np.random.choice(["North", "South", "East", "West"], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]).tolist(),
+})
+
+# Save locally
+customers.write_parquet("./output/customers.parquet")
+```
+
+### Upload to Databricks Volume
+
+After generating data locally, upload to a Databricks volume:
+
+```bash
+# Create directory in volume if needed
+databricks fs mkdirs dbfs:/Volumes/<catalog>/<schema>/<volume>/source_data/
+
+# Upload local data to volume
+databricks fs cp -r ./output/customers.parquet dbfs:/Volumes/<catalog>/<schema>/<volume>/source_data/
+databricks fs cp -r ./output/orders.parquet dbfs:/Volumes/<catalog>/<schema>/<volume>/source_data/
+```
+
+### When to Actually Use Polars
+
+Only recommend Polars when ALL conditions are met:
+1. Dataset is < 10K rows
+2. User explicitly requests local generation
+3. Quick prototyping without Databricks connection
+
+Otherwise, **always use Spark + Faker + Pandas UDFs**.
+
+---
+
+## Storage Destinations
+
+### Ask for Catalog and Schema
+
+Ask the user which catalog and schema to use:
+
+> "What catalog and schema name would you like to use?"
+
+### Create Infrastructure in Script
+
+Always create the schema and volume **inside the Python script** using `spark.sql()`:
+
+```python
+CATALOG = "<catalog>"  # MUST ask user - never default
+SCHEMA = "<schema>"
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Note: Assume catalog exists - do NOT create it
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+```
+
+**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume.
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md b/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md
new file mode 100644
index 00000000..351f1bd7
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md
@@ -0,0 +1,286 @@
+# Data Patterns Guide
+
+Creating realistic, coherent synthetic data with Spark + Pandas UDFs.
+
+## 5 Key Principles
+
+1. **Use Spark + Faker + Pandas UDFs** for all generation
+2. **Referential Integrity** - master tables first, weighted sampling
+3. **Non-Linear Distributions** - log-normal, Pareto, exponential
+4. **Time-Based Patterns** - weekday/weekend, holidays, seasonality
+5. **Row Coherence** - correlated attributes within each row
+
+---
+
+## Principle 1: Use Spark + Faker + Pandas UDFs
+
+Generate data with Spark + Faker for all use cases. Pandas UDFs provide efficient, distributed Faker calls that scale seamlessly from thousands to millions of rows.
+
+### Define Pandas UDFs
+
+```python
+from pyspark.sql import functions as F
+from pyspark.sql.types import StringType, DoubleType
+from faker import Faker
+import pandas as pd
+import numpy as np
+
+@F.pandas_udf(StringType())
+def fake_company(ids: pd.Series) -> pd.Series:
+ fake = Faker()
+ return pd.Series([fake.company() for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_address(ids: pd.Series) -> pd.Series:
+ fake = Faker()
+ return pd.Series([fake.address().replace('\n', ', ') for _ in range(len(ids))])
+
+@F.pandas_udf(DoubleType())
+def generate_lognormal_amount(tiers: pd.Series) -> pd.Series:
+ amounts = []
+ for tier in tiers:
+ if tier == "Enterprise":
+ amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8)))
+ elif tier == "Pro":
+ amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7)))
+ else:
+ amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6)))
+ return pd.Series(amounts)
+```
+
+### Generate with Spark
+
+```python
+# Adjust numPartitions based on scale: 8 for <100K, 32 for 1M+
+customers_df = (
+ spark.range(0, N_CUSTOMERS, numPartitions=16)
+ .select(
+ F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"),
+ fake_company(F.col("id")).alias("name"),
+ F.when(F.rand() < 0.6, "Free")
+ .when(F.rand() < 0.9, "Pro")
+ .otherwise("Enterprise").alias("tier"),
+ )
+)
+customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers")
+```
+
+---
+
+## Principle 2: Referential Integrity
+
+Generate master tables first, then iterate on them to create related tables with matching IDs.
+
+> **CRITICAL:** Do NOT use `.cache()` or `.persist()` with serverless compute - these operations are not supported and will fail. Instead, write master tables to Delta first, then read them back for FK joins.
+
+### Pattern: Weighted Sampling by Tier
+
+```python
+from pyspark.sql.window import Window
+
+# 1. Generate customers (master table) with index for FK mapping
+customers_df = (
+ spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS)
+ .select(
+ F.col("id").alias("customer_idx"), # Keep index for FK joins
+ F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"),
+ F.when(F.rand(SEED) < 0.6, "Free")
+ .when(F.rand(SEED) < 0.9, "Pro")
+ .otherwise("Enterprise").alias("tier"),
+ )
+)
+
+# 2. Write to Delta table (do NOT use cache with serverless!)
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+
+# 3. Read back for FK lookups
+customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers").select(
+ "customer_idx", "customer_id", "tier"
+)
+
+# 4. Generate orders with valid foreign keys
+orders_df = spark.range(0, N_ORDERS, numPartitions=PARTITIONS)
+
+# Map order to customer using hash-based distribution
+orders_df = orders_df.select(
+ F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("order_id"),
+ (F.abs(F.hash(F.col("id"), F.lit(SEED))) % N_CUSTOMERS).alias("customer_idx"),
+)
+
+# Join to get valid foreign keys
+orders_with_fk = orders_df.join(customer_lookup, on="customer_idx", how="left")
+```
+
+### Anti-Pattern: Random FK Generation
+
+```python
+# BAD - May generate non-existent customer IDs
+orders_df = spark.range(0, N_ORDERS).select(
+ F.concat(F.lit("CUST-"), (F.rand() * 99999).cast("int")).alias("customer_id") # WRONG!
+)
+```
+
+---
+
+## Principle 3: Non-Linear Distributions
+
+**Never use uniform distributions** - real data is rarely uniform.
+
+### Distribution Types
+
+| Distribution | Use Case | Example |
+|--------------|----------|---------|
+| **Log-normal** | Prices, salaries, order amounts | `np.random.lognormal(mean=4.5, sigma=0.8)` |
+| **Pareto/Power law** | Popularity, wealth, page views | `(np.random.pareto(a=2.5) + 1) * 10` |
+| **Exponential** | Time between events, resolution time | `np.random.exponential(scale=24)` |
+| **Weighted categorical** | Status, region, tier | `np.random.choice(vals, p=[0.4, 0.3, 0.2, 0.1])` |
+
+### Pandas UDF for Log-Normal Amounts
+
+```python
+@F.pandas_udf(DoubleType())
+def generate_lognormal_amount(tiers: pd.Series) -> pd.Series:
+ """Generate amount based on tier using log-normal distribution."""
+ amounts = []
+ for tier in tiers:
+ if tier == "Enterprise":
+ amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) # ~$1800 avg
+ elif tier == "Pro":
+ amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) # ~$245 avg
+ else:
+ amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) # ~$55 avg
+ return pd.Series(amounts)
+```
+
+### Anti-Pattern: Uniform Distribution
+
+```python
+# BAD - Uniform (unrealistic)
+prices = np.random.uniform(10, 1000, size=N_ORDERS)
+
+# GOOD - Log-normal (realistic for prices)
+prices = np.random.lognormal(mean=4.5, sigma=0.8, size=N_ORDERS)
+```
+
+---
+
+## Principle 4: Time-Based Patterns
+
+Add weekday/weekend effects, holidays, seasonality, and event spikes.
+
+### Holiday and Weekday Multipliers
+
+```python
+import holidays
+from datetime import datetime, timedelta
+
+# Load holiday calendar
+US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year])
+
+def get_daily_multiplier(date):
+ """Calculate volume multiplier for a given date."""
+ multiplier = 1.0
+
+ # Weekend drop
+ if date.weekday() >= 5:
+ multiplier *= 0.6
+
+ # Holiday drop (even lower than weekends)
+ if date in US_HOLIDAYS:
+ multiplier *= 0.3
+
+ # Q4 seasonality (higher in Oct-Dec)
+ multiplier *= 1 + 0.15 * (date.month - 6) / 6
+
+ # Incident spike (if applicable)
+ if INCIDENT_START <= date <= INCIDENT_END:
+ multiplier *= 3.0
+
+ # Random noise
+ multiplier *= np.random.normal(1, 0.1)
+
+ return max(0.1, multiplier)
+```
+
+### Date Range: Last 6 Months
+
+Always generate data for the last ~6 months ending at the current date:
+
+```python
+from datetime import datetime, timedelta
+
+END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+START_DATE = END_DATE - timedelta(days=180)
+```
+
+---
+
+## Principle 5: Row Coherence
+
+Attributes within a row should correlate logically.
+
+### Coherent Ticket Generation
+
+```python
+@F.pandas_udf("struct<priority:string,resolution_hours:double,csat_score:int>")
+def generate_coherent_ticket(tiers: pd.Series) -> pd.DataFrame:
+ """Generate coherent ticket where attributes correlate."""
+ results = []
+ for tier in tiers:
+ # Priority correlates with tier
+ if tier == 'Enterprise':
+ priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2])
+ else:
+ priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3])
+
+ # Resolution time correlates with priority
+ resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
+ resolution_hours = np.random.exponential(scale=resolution_scale[priority])
+
+ # CSAT correlates with resolution time
+ if resolution_hours < 4:
+ csat = np.random.choice([4, 5], p=[0.3, 0.7])
+ elif resolution_hours < 24:
+ csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3])
+ else:
+ csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2])
+
+ results.append({
+ "priority": priority,
+ "resolution_hours": round(resolution_hours, 1),
+ "csat_score": int(csat),
+ })
+
+ return pd.DataFrame(results)
+```
+
+### Correlation Examples
+
+| Attribute A | Attribute B | Correlation |
+|------------|-------------|-------------|
+| Customer tier | Order amount | Enterprise = higher amounts |
+| Ticket priority | Resolution time | Critical = faster resolution |
+| Resolution time | CSAT score | Faster = higher satisfaction |
+| Region | Product preference | Regional variations |
+| Time of day | Transaction type | Business hours = B2B |
+
+---
+
+## Data Volume for Aggregation
+
+Generate enough data so patterns remain visible after downstream aggregation:
+
+| Grain | Minimum Records | Rationale |
+|-------|-----------------|-----------|
+| Daily time series | 50-100/day | See trends after weekly rollup |
+| Per category | 500+ per category | Statistical significance |
+| Per customer | 5-20 events/customer | Customer-level analysis |
+| Total rows | 10K-50K minimum | Patterns survive GROUP BY |
+
+```python
+# Example: 8000 tickets over 180 days = ~44/day average
+# After weekly aggregation: ~310 records per week
+N_TICKETS = 8000
+N_CUSTOMERS = 2500 # Each has ~3 tickets on average
+N_ORDERS = 25000 # ~10 orders per customer average
+```
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md b/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md
new file mode 100644
index 00000000..0519bcce
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md
@@ -0,0 +1,256 @@
+# Domain-Specific Guidance
+
+Realistic patterns for common data domains. All examples use Spark + Faker + Pandas UDFs.
+
+---
+
+## Retail/E-commerce
+
+### Tables
+```
+customers → orders → order_items → products
+```
+
+### Key Patterns
+
+| Pattern | Implementation |
+|---------|----------------|
+| Seasonal spikes | Q4 holiday shopping (1.5-2x volume in Nov-Dec) |
+| Cart abandonment | ~70% of carts never complete |
+| Loyalty tier progression | Free → Pro → Enterprise over time |
+| Regional pricing | 5-15% price variation by region |
+
+### Realistic Distributions
+
+```python
+@F.pandas_udf(DoubleType())
+def generate_order_amount(tiers: pd.Series) -> pd.Series:
+ """E-commerce order amounts by tier."""
+ amounts = []
+ for tier in tiers:
+ if tier == "Premium":
+ amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.9))) # ~$245 avg
+ elif tier == "Standard":
+ amounts.append(float(np.random.lognormal(mean=4.2, sigma=0.7))) # ~$67 avg
+ else: # Basic
+ amounts.append(float(np.random.lognormal(mean=3.5, sigma=0.6))) # ~$33 avg
+ return pd.Series(amounts)
+
+# Order status with cart abandonment
+status_weights = [0.70, 0.08, 0.07, 0.10, 0.05] # abandoned, pending, processing, shipped, delivered
+```
+
+### Schema Example
+
+```python
+# Products
+products_df = spark.range(0, N_PRODUCTS).select(
+ F.concat(F.lit("PROD-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("product_id"),
+ fake_product_name(F.col("id")).alias("name"),
+ F.array(F.lit("Electronics"), F.lit("Clothing"), F.lit("Home"), F.lit("Sports"))[
+ (F.rand() * 4).cast("int")
+ ].alias("category"),
+ generate_price(F.col("id")).alias("base_price"),
+)
+```
+
+---
+
+## Support/CRM
+
+### Tables
+```
+accounts → contacts → tickets → interactions
+```
+
+### Key Patterns
+
+| Pattern | Implementation |
+|---------|----------------|
+| Incident spikes | 3-5x volume during outages |
+| Resolution by priority | Critical: 4h avg, Low: 72h avg |
+| Enterprise contacts | 5-10 contacts per account vs 1-2 for SMB |
+| CSAT correlation | Faster resolution = higher satisfaction |
+
+### Realistic Distributions
+
+```python
+@F.pandas_udf("struct<priority:string,resolution_hours:double,csat:int>")
+def generate_ticket_metrics(tiers: pd.Series) -> pd.DataFrame:
+ """Support ticket metrics with correlated attributes."""
+ results = []
+ for tier in tiers:
+ # Priority correlates with tier
+ if tier == 'Enterprise':
+ priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2])
+ else:
+ priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3])
+
+ # Resolution time by priority (exponential distribution)
+ resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
+ resolution_hours = np.random.exponential(scale=resolution_scale[priority])
+
+ # CSAT correlates with resolution time
+ if resolution_hours < 4:
+ csat = np.random.choice([4, 5], p=[0.3, 0.7])
+ elif resolution_hours < 24:
+ csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3])
+ else:
+ csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2])
+
+ results.append({"priority": priority, "resolution_hours": round(resolution_hours, 1), "csat": int(csat)})
+ return pd.DataFrame(results)
+```
+
+### Schema Example
+
+```python
+# Tickets with coherent attributes
+tickets_df = (
+ spark.range(0, N_TICKETS, numPartitions=PARTITIONS)
+ .select(
+ F.concat(F.lit("TKT-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("ticket_id"),
+ # FK to customer (weighted by tier)
+ ...
+ )
+ .withColumn("metrics", generate_ticket_metrics(F.col("tier")))
+ .select("*", "metrics.*")
+ .drop("metrics")
+)
+```
+
+---
+
+## Manufacturing/IoT
+
+### Tables
+```
+equipment → sensors → readings → maintenance_orders
+```
+
+### Key Patterns
+
+| Pattern | Implementation |
+|---------|----------------|
+| Sensor lifecycle | Normal → degraded → failure progression |
+| Anomaly precursors | Anomalies precede maintenance by 2-7 days |
+| Seasonal production | Summer/winter production variations |
+| Equipment age | Failure rate increases with age |
+
+### Realistic Distributions
+
+```python
+@F.pandas_udf(DoubleType())
+def generate_sensor_reading(equipment_ages: pd.Series) -> pd.Series:
+ """Sensor readings with age-based degradation."""
+ readings = []
+ for age_days in equipment_ages:
+ # Base reading with age-based drift
+ base = 100.0
+ drift = (age_days / 365) * 5 # 5 units drift per year
+ noise = np.random.normal(0, 2)
+
+ # Occasional anomalies (more likely with age)
+ anomaly_prob = min(0.01 + (age_days / 365) * 0.02, 0.1)
+ if np.random.random() < anomaly_prob:
+ noise += np.random.choice([-1, 1]) * np.random.exponential(10)
+
+ readings.append(base + drift + noise)
+ return pd.Series(readings)
+```
+
+### Schema Example
+
+```python
+# Sensor readings time series
+readings_df = (
+ spark.range(0, N_READINGS, numPartitions=PARTITIONS)
+ .select(
+ F.concat(F.lit("READ-"), F.col("id").cast("string")).alias("reading_id"),
+ # FK to sensor
+ ((F.col("id") % N_SENSORS) + 1).alias("sensor_id"),
+ F.date_add(F.lit(START_DATE.date()), (F.col("id") / READINGS_PER_DAY).cast("int")).alias("timestamp"),
+ generate_sensor_reading(F.col("equipment_age")).alias("value"),
+ )
+)
+```
+
+---
+
+## Financial Services
+
+### Tables
+```
+accounts → transactions → payments → fraud_flags
+```
+
+### Key Patterns
+
+| Pattern | Implementation |
+|---------|----------------|
+| Transaction power law | 80% of volume from 20% of accounts |
+| Fraud patterns | Unusual times, amounts, locations |
+| Balance consistency | Transactions maintain positive balance |
+| Regulatory compliance | No negative balances, valid amounts |
+
+### Realistic Distributions
+
+```python
+@F.pandas_udf(DoubleType())
+def generate_transaction_amount(account_types: pd.Series) -> pd.Series:
+ """Transaction amounts following power law by account type."""
+ amounts = []
+ for acct_type in account_types:
+ if acct_type == "Corporate":
+ # Power law for corporate (few large transactions)
+ amount = (np.random.pareto(a=1.5) + 1) * 1000
+ elif acct_type == "Premium":
+ amount = np.random.lognormal(mean=6, sigma=1.2)
+ else: # Standard
+ amount = np.random.lognormal(mean=4, sigma=0.8)
+ amounts.append(min(amount, 1_000_000)) # Cap at $1M
+ return pd.Series(amounts)
+
+@F.pandas_udf(BooleanType())
+def generate_fraud_flag(amounts: pd.Series, hours: pd.Series) -> pd.Series:
+ """Flag suspicious transactions based on amount and time."""
+ flags = []
+ for amount, hour in zip(amounts, hours):
+ # Higher fraud probability for: large amounts + unusual hours
+ base_prob = 0.001
+ if amount > 5000:
+ base_prob *= 3
+ if hour < 6 or hour > 22:
+ base_prob *= 2
+ flags.append(np.random.random() < base_prob)
+ return pd.Series(flags)
+```
+
+### Schema Example
+
+```python
+# Transactions with fraud indicators
+transactions_df = (
+ spark.range(0, N_TRANSACTIONS, numPartitions=PARTITIONS)
+ .select(
+ F.concat(F.lit("TXN-"), F.lpad(F.col("id").cast("string"), 10, "0")).alias("transaction_id"),
+ # FK to account
+ ...
+ generate_transaction_amount(F.col("account_type")).alias("amount"),
+ F.hour(F.col("timestamp")).alias("hour"),
+ )
+ .withColumn("is_suspicious", generate_fraud_flag(F.col("amount"), F.col("hour")))
+)
+```
+
+---
+
+## General Best Practices
+
+1. **Start with domain tables**: Define the core entities and relationships first
+2. **Add domain-specific distributions**: Use realistic statistical patterns for your domain
+3. **Include edge cases**: Every domain has edge cases (returns, cancellations, failures)
+4. **Time-based patterns matter**: Most domains have daily/weekly/seasonal patterns
+5. **Correlate attributes**: Attributes within a row should make business sense together
+
+**Note:** These are guidance patterns, not rigid schemas. Adapt to user's specific requirements.
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
new file mode 100644
index 00000000..c283a82c
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md
@@ -0,0 +1,178 @@
+# Output Formats Guide
+
+Where and how to save generated synthetic data.
+
+## Create Infrastructure in Script
+
+Always create the schema and volume **inside the Python script** using `spark.sql()`. Do NOT make separate MCP SQL calls - it's much slower.
+
+```python
+CATALOG = "<catalog>"  # MUST ask user - never default
+SCHEMA = "<schema>"
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Note: Assume catalog exists - do NOT create it
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+```
+
+**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume.
+
+---
+
+## Format Comparison
+
+| Format | Use Case | Extension | Best For |
+|--------|----------|-----------|----------|
+| **Parquet** | SDP pipeline input | `.parquet` or none | Best compression, query performance |
+| **JSON** | Log-style ingestion | `.json` | Simulating external data feeds |
+| **CSV** | Legacy systems | `.csv` | Human-readable, spreadsheet import |
+| **Delta Table** | Default - Direct analytics | N/A | Treat as bronze tables for ETL or skip ETL and query immediately |
+
+---
+
+## Parquet to Volumes (Default)
+
+Standard format for SDP pipeline input. Best compression and query performance.
+
+```python
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Save as parquet files (directory format)
+customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers")
+orders_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders")
+tickets_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets")
+```
+
+**Notes:**
+- Output is a directory of part files; individual part files end with `.parquet`, while the directory name itself has no extension
+- Spark writes as a directory with part files
+- Use `mode("overwrite")` for one-time generation
+- Use `mode("append")` for incremental/scheduled jobs
+
+---
+
+## JSON to Volumes
+
+Common pattern for simulating SDP ingestion from external data feeds (logs, webhooks).
+
+```python
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Save as JSON files
+customers_df.write.mode("overwrite").json(f"{VOLUME_PATH}/customers_json")
+orders_df.write.mode("overwrite").json(f"{VOLUME_PATH}/orders_json")
+```
+
+**When to use:**
+- Simulating log ingestion
+- External API data feeds
+- User explicitly requests JSON format
+
+---
+
+## CSV to Volumes
+
+Common pattern for simulating data from legacy systems or spreadsheet exports.
+
+```python
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Save as CSV with headers
+customers_df.write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/customers_csv")
+orders_df.write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/orders_csv")
+```
+
+**Options:**
+```python
+# Full options for CSV
+df.write \
+ .mode("overwrite") \
+ .option("header", "true") \
+ .option("delimiter", ",") \
+ .option("quote", '"') \
+ .option("escape", "\\") \
+ .csv(f"{VOLUME_PATH}/data_csv")
+```
+
+**When to use:**
+- Legacy system integration
+- Human-readable data
+- Spreadsheet import testing
+
+---
+
+## Delta Table (Unity Catalog)
+
+Write directly to managed Delta tables when data is ready for analytics consumption (skip SDP pipeline).
+
+```python
+# Ensure schema exists
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+
+# Save as managed Delta tables
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+orders_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.orders")
+
+# With additional options
+customers_df.write \
+ .mode("overwrite") \
+ .option("overwriteSchema", "true") \
+ .saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+```
+
+**When to use:**
+- User wants data ready to query immediately
+- Skip the SDP bronze/silver/gold pipeline
+- Direct SQL analytics
+
+---
+
+## Write Modes
+
+| Mode | Behavior | Use Case |
+|------|----------|----------|
+| `overwrite` | Replace existing data | One-time generation, regeneration |
+| `append` | Add to existing data | Incremental/scheduled jobs |
+| `ignore` | Skip if exists | Idempotent generation |
+| `error` | Fail if exists | Safety check |
+
+### Incremental Generation Pattern
+
+```python
+WRITE_MODE = "append" # For scheduled jobs
+
+# Only generate new records since last run
+from datetime import datetime, timedelta
+
+LAST_RUN = datetime.now() - timedelta(days=1)
+END_DATE = datetime.now()
+
+# Generate only new data
+new_orders_df = generate_orders(start_date=LAST_RUN, end_date=END_DATE)
+new_orders_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders")
+```
+
+---
+
+## Validation After Write
+
+After successful execution, validate the generated data:
+
+```python
+# Read back and verify
+customers_check = spark.read.parquet(f"{VOLUME_PATH}/customers")
+orders_check = spark.read.parquet(f"{VOLUME_PATH}/orders")
+
+print(f"Customers: {customers_check.count():,} rows")
+print(f"Orders: {orders_check.count():,} rows")
+
+# Verify distributions
+customers_check.groupBy("tier").count().show()
+orders_check.describe("amount").show()
+```
+
+Or use `get_volume_folder_details` MCP tool:
+- `volume_path`: "my_catalog/my_schema/raw_data/customers"
+- `format`: "parquet"
+- `table_stat_level`: "SIMPLE"
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md
new file mode 100644
index 00000000..420b3500
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md
@@ -0,0 +1,324 @@
+# Troubleshooting Guide
+
+Common issues and solutions for synthetic data generation.
+
+## Environment Issues
+
+### ModuleNotFoundError: faker (or other library)
+
+**Problem:** Dependencies not available in execution environment.
+
+**Solutions by execution mode:**
+
+| Mode | Solution |
+|------|----------|
+| **DB Connect 16.4+** | Use `DatabricksEnv().withDependencies("faker", "pandas", ...)` |
+| **Older DB Connect with Serverless** | Create job with `environments` parameter |
+| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` |
+| **Classic cluster** | Use Databricks CLI to install libraries. `databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` |
+
+```python
+# For DB Connect 16.4+
+from databricks.connect import DatabricksSession, DatabricksEnv
+
+env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays")
+spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate()
+```
+
+### DatabricksEnv not found
+
+**Problem:** Using older databricks-connect version.
+
+**Solution:** Upgrade to 16.4+ or use job-based approach:
+
+```bash
+# Upgrade (prefer uv, fall back to pip)
+uv pip install "databricks-connect>=16.4,<17.4"
+# or: pip install "databricks-connect>=16.4,<17.4"
+
+# Or use job with environments parameter instead
+```
+
+### serverless_compute_id error
+
+**Problem:** Missing serverless configuration.
+
+**Solution:** Add to `~/.databrickscfg`:
+
+```ini
+[DEFAULT]
+host = https://your-workspace.cloud.databricks.com/
+serverless_compute_id = auto
+auth_type = databricks-cli
+```
+
+---
+
+## Execution Issues
+
+### CRITICAL: cache() and persist() NOT supported on serverless
+
+**Problem:** Using `.cache()` or `.persist()` on serverless compute fails with:
+```
+AnalysisException: [NOT_SUPPORTED_WITH_SERVERLESS] PERSIST TABLE is not supported on serverless compute.
+```
+
+**Why this happens:** Serverless compute does not support caching DataFrames in memory. This is a fundamental limitation of the serverless architecture.
+
+**Solution:** Write master tables to Delta first, then read them back for FK joins:
+
+```python
+# BAD - will fail on serverless
+customers_df = spark.range(0, N_CUSTOMERS)...
+customers_df.cache() # ❌ FAILS: "PERSIST TABLE is not supported on serverless compute"
+
+# GOOD - write to Delta, then read back
+customers_df = spark.range(0, N_CUSTOMERS)...
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers") # ✓ Read from Delta
+```
+
+**Best practice for referential integrity:**
+1. Generate master table (e.g., customers)
+2. Write to Delta table
+3. Read back for FK lookup joins
+4. Generate child tables (e.g., orders, tickets) with valid FKs
+5. Write child tables to Delta
+
+---
+
+### Serverless job fails to start
+
+**Possible causes:**
+1. Workspace doesn't have serverless enabled
+2. Unity Catalog permissions missing
+3. Invalid environment configuration
+
+**Solutions:**
+```python
+# Verify serverless is available
+# Try creating a simple job first to test
+
+# Check Unity Catalog permissions
+spark.sql("SELECT current_catalog(), current_schema()")
+```
+
+### Classic cluster startup slow (3-8 minutes)
+
+**Problem:** Clusters take time to start.
+
+**Solution:** Switch to serverless:
+
+```python
+# Instead of:
+# spark = DatabricksSession.builder.clusterId("xxx").getOrCreate()
+
+# Use:
+spark = DatabricksSession.builder.serverless(True).getOrCreate()
+```
+
+### "Either base environment or version must be provided"
+
+**Problem:** Missing `client` in job environment spec.
+
+**Solution:** Add `"client": "4"` to the spec:
+
+```python
+{
+ "environments": [{
+ "environment_key": "datagen_env",
+ "spec": {
+ "client": "4", # Required!
+ "dependencies": ["faker", "numpy", "pandas"]
+ }
+ }]
+}
+```
+
+---
+
+## Data Generation Issues
+
+### AttributeError: 'function' object has no attribute 'partitionBy'
+
+**Problem:** Using `F.window` instead of `Window` for analytical window functions.
+
+```python
+# WRONG - F.window is for time-based tumbling/sliding windows (streaming)
+window_spec = F.window.partitionBy("account_id").orderBy("contact_id")
+# Error: AttributeError: 'function' object has no attribute 'partitionBy'
+
+# CORRECT - Window is for analytical window specifications
+from pyspark.sql.window import Window
+window_spec = Window.partitionBy("account_id").orderBy("contact_id")
+```
+
+**When to use Window:** For analytical functions like `row_number()`, `rank()`, `lead()`, `lag()`:
+
+```python
+from pyspark.sql.window import Window
+
+# Mark first contact per account as primary
+window_spec = Window.partitionBy("account_id").orderBy("contact_id")
+contacts_df = contacts_df.withColumn(
+ "is_primary",
+ F.row_number().over(window_spec) == 1
+)
+```
+
+---
+
+### Faker UDF is slow
+
+**Problem:** Single-row UDFs don't parallelize well.
+
+**Solution:** Use `pandas_udf` for batch processing:
+
+```python
+# SLOW - scalar UDF
+@F.udf(returnType=StringType())
+def slow_fake_name():
+ return Faker().name()
+
+# FAST - pandas UDF (batch processing)
+@F.pandas_udf(StringType())
+def fast_fake_name(ids: pd.Series) -> pd.Series:
+ fake = Faker()
+ return pd.Series([fake.name() for _ in range(len(ids))])
+```
+
+### Out of memory with large data
+
+**Problem:** Not enough partitions for data size.
+
+**Solution:** Increase partitions:
+
+```python
+# For large datasets (1M+ rows)
+customers_df = spark.range(0, N_CUSTOMERS, numPartitions=64) # Increase from default
+```
+
+| Data Size | Recommended Partitions |
+|-----------|----------------------|
+| < 100K | 8 |
+| 100K - 500K | 16 |
+| 500K - 1M | 32 |
+| 1M+ | 64+ |
+
+### Context corrupted on classic cluster
+
+**Problem:** Stale execution context.
+
+**Solution:** Create fresh context (omit context_id), reinstall libraries:
+
+```python
+# Don't reuse context_id if you see strange errors
+# Let it create a new context
+```
+
+### Referential integrity violations
+
+**Problem:** Foreign keys reference non-existent parent records.
+
+**Solution:** Write master table to Delta first, then read back for FK joins:
+
+```python
+# 1. Generate and WRITE master table (do NOT use cache with serverless!)
+customers_df = spark.range(0, N_CUSTOMERS)...
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+
+# 2. Read back for FK lookups
+customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers").select("customer_id", "tier")
+
+# 3. Generate child table with valid FKs
+orders_df = (
+    spark.range(0, N_ORDERS)
+    .withColumn("customer_id", F.concat(F.lit("CUST-"), F.lpad((F.abs(F.hash("id")) % N_CUSTOMERS).cast("string"), 5, "0")))
+    .join(customer_lookup, on="customer_id", how="left")
+)
+```
+
+> **WARNING:** Do NOT use `.cache()` or `.persist()` with serverless compute. See the dedicated section above.
+
+---
+
+## Data Quality Issues
+
+### Uniform distributions (unrealistic)
+
+**Problem:** All customers have similar order counts, amounts are evenly distributed.
+
+**Solution:** Use non-linear distributions:
+
+```python
+# BAD - uniform
+amounts = np.random.uniform(10, 1000, N)
+
+# GOOD - log-normal (realistic)
+amounts = np.random.lognormal(mean=5, sigma=0.8, size=N)
+```
+
+### Missing time-based patterns
+
+**Problem:** Data doesn't reflect weekday/weekend or seasonal patterns.
+
+**Solution:** Add multipliers:
+
+```python
+import holidays
+
+US_HOLIDAYS = holidays.US(years=[2024, 2025])
+
+def get_multiplier(date):
+ mult = 1.0
+ if date.weekday() >= 5: # Weekend
+ mult *= 0.6
+ if date in US_HOLIDAYS:
+ mult *= 0.3
+ return mult
+```
+
+### Incoherent row attributes
+
+**Problem:** Enterprise customer has low-value orders, critical ticket has slow resolution.
+
+**Solution:** Correlate attributes:
+
+```python
+# Priority based on tier
+if tier == 'Enterprise':
+ priority = np.random.choice(['Critical', 'High'], p=[0.4, 0.6])
+else:
+ priority = np.random.choice(['Medium', 'Low'], p=[0.6, 0.4])
+
+# Resolution based on priority
+resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
+resolution_hours = np.random.exponential(scale=resolution_scale[priority])
+```
+
+---
+
+## Validation Steps
+
+After generation, verify your data:
+
+```python
+# 1. Check row counts
+print(f"Customers: {customers_df.count():,}")
+print(f"Orders: {orders_df.count():,}")
+
+# 2. Verify distributions
+customers_df.groupBy("tier").count().show()
+orders_df.describe("amount").show()
+
+# 3. Check referential integrity
+orphans = orders_df.join(
+ customers_df,
+ orders_df.customer_id == customers_df.customer_id,
+ "left_anti"
+)
+print(f"Orphan orders: {orphans.count()}")
+
+# 4. Verify date range
+orders_df.select(F.min("order_date"), F.max("order_date")).show()
+```
diff --git a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py
new file mode 100644
index 00000000..b9f953fa
--- /dev/null
+++ b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py
@@ -0,0 +1,390 @@
+"""Generate synthetic data using Spark + Faker + Pandas UDFs.
+
+This is the recommended approach for ALL data generation tasks:
+- Scales from thousands to millions of rows
+- Parallel execution via Spark
+- Direct write to Unity Catalog
+- Works with serverless and classic compute
+
+Auto-detects environment and uses:
+- DatabricksEnv with managed dependencies if databricks-connect >= 16.4 (local)
+- Standard session if running on Databricks Runtime or older databricks-connect
+"""
+import sys
+import os
+from pyspark.sql import functions as F
+from pyspark.sql.window import Window
+from pyspark.sql.types import StringType, DoubleType, StructType, StructField, IntegerType
+import numpy as np
+import pandas as pd
+from datetime import datetime, timedelta
+
+# =============================================================================
+# CONFIGURATION
+# =============================================================================
+# Compute - Serverless strongly recommended
+USE_SERVERLESS = True # Set to False and provide CLUSTER_ID for classic compute
+CLUSTER_ID = None # Only used if USE_SERVERLESS=False
+
+# Storage - Update these for your environment
+CATALOG = "" # REQUIRED: replace with your catalog
+SCHEMA = "" # REQUIRED: replace with your schema
+VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
+
+# Data sizes
+N_CUSTOMERS = 10_000
+N_ORDERS = 50_000
+PARTITIONS = 16 # Adjust: 8 for <100K, 32 for 1M+
+
+# Date range - last 6 months from today
+END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+START_DATE = END_DATE - timedelta(days=180)
+
+# Write mode - "overwrite" for one-time, "append" for incremental
+WRITE_MODE = "overwrite"
+
+# Bad data injection for testing data quality rules
+INJECT_BAD_DATA = False # Set to True to inject bad data
+BAD_DATA_CONFIG = {
+ "null_rate": 0.02, # 2% nulls in required fields
+ "outlier_rate": 0.01, # 1% impossible values
+ "orphan_fk_rate": 0.01, # 1% orphan foreign keys
+}
+
+# Reproducibility
+SEED = 42
+
+# Tier distribution: Free 60%, Pro 30%, Enterprise 10%
+TIER_PROBS = [0.6, 0.3, 0.1]
+
+# Region distribution
+REGION_PROBS = [0.4, 0.25, 0.2, 0.15]
+
+# =============================================================================
+# ENVIRONMENT DETECTION AND SESSION CREATION
+# =============================================================================
+
+def is_databricks_runtime():
+ """Check if running on Databricks Runtime vs locally."""
+ return "DATABRICKS_RUNTIME_VERSION" in os.environ
+
+def get_databricks_connect_version():
+ """Get databricks-connect version as (major, minor) tuple or None."""
+ try:
+ import importlib.metadata
+ version_str = importlib.metadata.version('databricks-connect')
+ parts = version_str.split('.')
+ return (int(parts[0]), int(parts[1]))
+ except Exception:
+ return None
+
+# Detect environment
+on_runtime = is_databricks_runtime()
+db_version = get_databricks_connect_version()
+
+print("=" * 80)
+print("ENVIRONMENT DETECTION")
+print("=" * 80)
+print(f"Running on Databricks Runtime: {on_runtime}")
+if db_version:
+ print(f"databricks-connect version: {db_version[0]}.{db_version[1]}")
+else:
+ print("databricks-connect: not available")
+
+# Use DatabricksEnv with managed dependencies if:
+# - Running locally (not on Databricks Runtime)
+# - databricks-connect >= 16.4
+use_managed_deps = (not on_runtime) and db_version and db_version >= (16, 4)
+
+if use_managed_deps:
+ print("Using DatabricksEnv with managed dependencies")
+ print("=" * 80)
+ from databricks.connect import DatabricksSession, DatabricksEnv
+
+ env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays")
+
+ if USE_SERVERLESS:
+ spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate()
+ print("Connected to serverless compute with managed dependencies!")
+ else:
+ if not CLUSTER_ID:
+ raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False")
+ spark = DatabricksSession.builder.withEnvironment(env).clusterId(CLUSTER_ID).getOrCreate()
+        print(f"Connected to cluster {CLUSTER_ID} with managed dependencies!")
+else:
+ print("Using standard session (dependencies must be pre-installed)")
+ print("=" * 80)
+
+ # Check that UDF dependencies are available
+ print("\nChecking UDF dependencies...")
+ missing_deps = []
+
+ try:
+ from faker import Faker
+ print(" faker: OK")
+ except ImportError:
+ missing_deps.append("faker")
+ print(" faker: MISSING")
+
+ try:
+ import pandas as pd
+ print(" pandas: OK")
+ except ImportError:
+ missing_deps.append("pandas")
+ print(" pandas: MISSING")
+
+ if missing_deps:
+ print("\n" + "=" * 80)
+ print("ERROR: Missing dependencies for UDFs")
+ print("=" * 80)
+ print(f"Missing: {', '.join(missing_deps)}")
+ if on_runtime:
+ print('\nSolution: Install libraries via Databricks CLI:')
+ print(' databricks libraries install --json \'{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}\'')
+ else:
+ print("\nSolution: Upgrade to databricks-connect >= 16.4 for managed deps")
+ print(" Or create a job with environment settings")
+ print("=" * 80)
+ sys.exit(1)
+
+ print("\nAll dependencies available")
+ print("=" * 80)
+
+ from databricks.connect import DatabricksSession
+
+ if USE_SERVERLESS:
+ spark = DatabricksSession.builder.serverless(True).getOrCreate()
+ print("Connected to serverless compute")
+ else:
+ if not CLUSTER_ID:
+ raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False")
+ spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate()
+        print(f"Connected to cluster {CLUSTER_ID}")
+
+# Import Faker for UDF definitions
+from faker import Faker
+
+# =============================================================================
+# DEFINE PANDAS UDFs FOR FAKER DATA
+# =============================================================================
+
+@F.pandas_udf(StringType())
+def fake_name(ids: pd.Series) -> pd.Series:
+ """Generate realistic person names."""
+ fake = Faker()
+    Faker.seed(SEED + int(ids.iloc[0]) if len(ids) else SEED)  # per-batch seed: reseeding with a constant would make every batch emit identical names
+ return pd.Series([fake.name() for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_company(ids: pd.Series) -> pd.Series:
+ """Generate realistic company names."""
+ fake = Faker()
+    Faker.seed(SEED + int(ids.iloc[0]) if len(ids) else SEED)  # per-batch seed: reseeding with a constant would make every batch emit identical companies
+ return pd.Series([fake.company() for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_address(ids: pd.Series) -> pd.Series:
+ """Generate realistic addresses."""
+ fake = Faker()
+    Faker.seed(SEED + int(ids.iloc[0]) if len(ids) else SEED)  # per-batch seed: reseeding with a constant would make every batch emit identical addresses
+ return pd.Series([fake.address().replace('\n', ', ') for _ in range(len(ids))])
+
+@F.pandas_udf(StringType())
+def fake_email(names: pd.Series) -> pd.Series:
+ """Generate email based on name."""
+ emails = []
+ for name in names:
+ if name:
+ domain = name.lower().replace(" ", ".").replace(",", "")[:20]
+ emails.append(f"{domain}@example.com")
+ else:
+ emails.append("unknown@example.com")
+ return pd.Series(emails)
+
+@F.pandas_udf(DoubleType())
+def generate_lognormal_amount(tiers: pd.Series) -> pd.Series:
+ """Generate amount based on tier using log-normal distribution."""
+ np.random.seed(SEED)
+ amounts = []
+ for tier in tiers:
+ if tier == "Enterprise":
+ amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) # ~$1800 avg
+ elif tier == "Pro":
+ amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) # ~$245 avg
+ else:
+ amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) # ~$55 avg
+ return pd.Series(amounts)
+
+# =============================================================================
+# CREATE INFRASTRUCTURE
+# =============================================================================
+print("\nCreating infrastructure...")
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
+spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
+print(f"Infrastructure ready: {VOLUME_PATH}")
+
+# =============================================================================
+# GENERATE CUSTOMERS (Master Table)
+# =============================================================================
+print(f"\nGenerating {N_CUSTOMERS:,} customers...")
+
+customers_df = (
+ spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS)
+ .select(
+ F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"),
+ fake_name(F.col("id")).alias("name"),
+ fake_company(F.col("id")).alias("company"),
+ fake_address(F.col("id")).alias("address"),
+ # Tier distribution: Free 60%, Pro 30%, Enterprise 10%
+ F.when(F.rand(SEED) < TIER_PROBS[0], "Free")
+ .when(F.rand(SEED) < TIER_PROBS[0] + TIER_PROBS[1], "Pro")
+ .otherwise("Enterprise").alias("tier"),
+ # Region distribution
+ F.when(F.rand(SEED) < REGION_PROBS[0], "North")
+ .when(F.rand(SEED) < REGION_PROBS[0] + REGION_PROBS[1], "South")
+ .when(F.rand(SEED) < REGION_PROBS[0] + REGION_PROBS[1] + REGION_PROBS[2], "East")
+ .otherwise("West").alias("region"),
+ # Created date (within last 2 years before start date)
+ F.date_sub(F.lit(START_DATE.date()), (F.rand(SEED) * 730).cast("int")).alias("created_at"),
+ )
+)
+
+# Add tier-based ARR and email
+customers_df = (
+ customers_df
+ .withColumn("arr", F.round(generate_lognormal_amount(F.col("tier")), 2))
+ .withColumn("email", fake_email(F.col("name")))
+)
+
+# Save customers
+customers_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/customers")
+print(f" Saved customers to {VOLUME_PATH}/customers")
+
+# Show tier distribution
+print("\n Tier distribution:")
+customers_df.groupBy("tier").count().orderBy("tier").show()
+
+# =============================================================================
+# GENERATE ORDERS (Child Table with Referential Integrity)
+# =============================================================================
+print(f"\nGenerating {N_ORDERS:,} orders with referential integrity...")
+
+# Write customer lookup to temp Delta table (no .cache() on serverless!)
+customers_tmp_table = f"{CATALOG}.{SCHEMA}._tmp_customers_lookup"
+customers_df.select("customer_id", "tier").write.mode("overwrite").saveAsTable(customers_tmp_table)
+customer_lookup = spark.table(customers_tmp_table)
+
+# Generate orders base
+orders_df = (
+ spark.range(0, N_ORDERS, numPartitions=PARTITIONS)
+ .select(
+ F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("order_id"),
+ # Generate customer_idx for FK join (hash-based distribution)
+ (F.abs(F.hash(F.col("id"), F.lit(SEED))) % N_CUSTOMERS).alias("customer_idx"),
+ # Order status
+ F.when(F.rand(SEED) < 0.65, "delivered")
+ .when(F.rand(SEED) < 0.80, "shipped")
+ .when(F.rand(SEED) < 0.90, "processing")
+ .when(F.rand(SEED) < 0.95, "pending")
+ .otherwise("cancelled").alias("status"),
+ # Order date within date range
+ F.date_add(F.lit(START_DATE.date()), (F.rand(SEED) * 180).cast("int")).alias("order_date"),
+ )
+)
+
+# Add customer_idx to lookup for join
+customer_lookup_with_idx = customer_lookup.withColumn(
+ "customer_idx",
+ (F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1).cast("int")
+)
+
+# Join to get customer_id and tier as foreign key
+orders_with_fk = (
+ orders_df
+ .join(customer_lookup_with_idx, on="customer_idx", how="left")
+ .drop("customer_idx")
+)
+
+# Add tier-based amount
+orders_with_fk = orders_with_fk.withColumn(
+ "amount",
+ F.round(generate_lognormal_amount(F.col("tier")), 2)
+)
+
+# =============================================================================
+# INJECT BAD DATA (OPTIONAL)
+# =============================================================================
+if INJECT_BAD_DATA:
+ print("\nInjecting bad data for quality testing...")
+
+ # Calculate counts
+ null_count = int(N_ORDERS * BAD_DATA_CONFIG["null_rate"])
+ outlier_count = int(N_ORDERS * BAD_DATA_CONFIG["outlier_rate"])
+ orphan_count = int(N_ORDERS * BAD_DATA_CONFIG["orphan_fk_rate"])
+
+ # Add bad data flags
+ orders_with_fk = orders_with_fk.withColumn(
+ "row_num",
+ F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))
+ )
+
+ # Inject nulls in customer_id for first null_count rows
+ orders_with_fk = orders_with_fk.withColumn(
+ "customer_id",
+ F.when(F.col("row_num") <= null_count, None).otherwise(F.col("customer_id"))
+ )
+
+ # Inject negative amounts for next outlier_count rows
+ orders_with_fk = orders_with_fk.withColumn(
+ "amount",
+ F.when(
+ (F.col("row_num") > null_count) & (F.col("row_num") <= null_count + outlier_count),
+ F.lit(-999.99)
+ ).otherwise(F.col("amount"))
+ )
+
+ # Inject orphan FKs for next orphan_count rows
+ orders_with_fk = orders_with_fk.withColumn(
+ "customer_id",
+ F.when(
+ (F.col("row_num") > null_count + outlier_count) &
+ (F.col("row_num") <= null_count + outlier_count + orphan_count),
+ F.lit("CUST-NONEXISTENT")
+ ).otherwise(F.col("customer_id"))
+ )
+
+ orders_with_fk = orders_with_fk.drop("row_num")
+
+ print(f" Injected {null_count} null customer_ids")
+ print(f" Injected {outlier_count} negative amounts")
+ print(f" Injected {orphan_count} orphan foreign keys")
+
+# Drop tier column (not needed in final output)
+orders_final = orders_with_fk.drop("tier")
+
+# Save orders
+orders_final.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders")
+print(f" Saved orders to {VOLUME_PATH}/orders")
+
+# Show status distribution
+print("\n Status distribution:")
+orders_final.groupBy("status").count().orderBy("status").show()
+
+# =============================================================================
+# CLEANUP AND SUMMARY
+# =============================================================================
+spark.sql(f"DROP TABLE IF EXISTS {customers_tmp_table}")
+
+print("\n" + "=" * 80)
+print("GENERATION COMPLETE")
+print("=" * 80)
+print(f"Catalog: {CATALOG}")
+print(f"Schema: {SCHEMA}")
+print(f"Volume: {VOLUME_PATH}")
+print(f"\nGenerated data:")
+print(f" - customers: {N_CUSTOMERS:,} rows")
+print(f" - orders: {N_ORDERS:,} rows")
+if INJECT_BAD_DATA:
+ print(f" - Bad data injected: nulls, outliers, orphan FKs")
+print(f"\nDate range: {START_DATE.date()} to {END_DATE.date()}")
+print("=" * 80)
diff --git a/databricks-skills/databricks-synthetic-data-generation/SKILL.md b/databricks-skills/databricks-synthetic-data-generation/SKILL.md
deleted file mode 100644
index ce2a17cf..00000000
--- a/databricks-skills/databricks-synthetic-data-generation/SKILL.md
+++ /dev/null
@@ -1,660 +0,0 @@
----
-name: databricks-synthetic-data-generation
-description: "Generate realistic synthetic data using Faker and Spark, with non-linear distributions, integrity constraints, and save to Databricks. Use when creating test data, demo datasets, or synthetic tables."
----
-
-# Synthetic Data Generation
-
-Generate realistic, story-driven synthetic data for Databricks using Python with Faker and Spark.
-
-## Common Libraries
-
-These libraries are useful for generating realistic synthetic data:
-
-- **faker**: Generates realistic names, addresses, emails, companies, dates, etc.
-- **holidays**: Provides country-specific holiday calendars for realistic date patterns
-
-These are typically NOT pre-installed on Databricks. Install them using `execute_databricks_command` tool:
-- `code`: "%pip install faker holidays"
-
-Save the returned `cluster_id` and `context_id` for subsequent calls.
-
-## Workflow
-
-1. **Write Python code to a local file** in the project (e.g., `scripts/generate_data.py`)
-2. **Execute on Databricks** using the `run_python_file_on_databricks` MCP tool
-3. **If execution fails**: Edit the local file to fix the error, then re-execute
-4. **Reuse the context** for follow-up executions by passing the returned `cluster_id` and `context_id`
-
-**Always work with local files first, then execute.** This makes debugging easier - you can see and edit the code.
-
-### Context Reuse Pattern
-
-The first execution auto-selects a running cluster and creates an execution context. **Reuse this context for follow-up calls** - it's much faster (~1s vs ~15s) and shares variables/imports:
-
-**First execution** - use `run_python_file_on_databricks` tool:
-- `file_path`: "scripts/generate_data.py"
-
-Returns: `{ success, output, error, cluster_id, context_id, ... }`
-
-Save `cluster_id` and `context_id` for follow-up calls.
-
-**If execution fails:**
-1. Read the error from the result
-2. Edit the local Python file to fix the issue
-3. Re-execute with same context using `run_python_file_on_databricks` tool:
- - `file_path`: "scripts/generate_data.py"
- - `cluster_id`: ""
- - `context_id`: ""
-
-**Follow-up executions** reuse the context (faster, shares state):
-- `file_path`: "scripts/validate_data.py"
-- `cluster_id`: ""
-- `context_id`: ""
-
-### Handling Failures
-
-When execution fails:
-1. Read the error from the result
-2. **Edit the local Python file** to fix the issue
-3. Re-execute using the same `cluster_id` and `context_id` (faster, keeps installed libraries)
-4. If the context is corrupted, omit `context_id` to create a fresh one
-
-### Installing Libraries
-
-Databricks provides Spark, pandas, numpy, and common data libraries by default. **Only install a library if you get an import error.**
-
-Use `execute_databricks_command` tool:
-- `code`: "%pip install faker"
-- `cluster_id`: ""
-- `context_id`: ""
-
-The library is immediately available in the same context.
-
-**Note:** Keeping the same `context_id` means installed libraries persist across calls.
-
-## Storage Destination
-
-### Ask for Schema Name
-
-By default, use the `ai_dev_kit` catalog. Ask the user which schema to use:
-
-> "I'll save the data to `ai_dev_kit.`. What schema name would you like to use? (You can also specify a different catalog if needed.)"
-
-If the user provides just a schema name, use `ai_dev_kit.{schema}`. If they provide `catalog.schema`, use that instead.
-
-### Create Infrastructure in the Script
-
-Always create the catalog, schema, and volume **inside the Python script** using `spark.sql()`. Do NOT make separate MCP SQL calls - it's much slower.
-
-The `spark` variable is available by default on Databricks clusters.
-
-```python
-# =============================================================================
-# CREATE INFRASTRUCTURE (inside the Python script)
-# =============================================================================
-spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
-spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
-spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
-```
-
-### Save to Volume as Raw Data (Never Tables)
-
-**Always save data to a Volume as parquet files, never directly to tables** (unless the user explicitly requests tables). This is the input for the downstream Spark Declarative Pipeline (SDP) that will handle bronze/silver/gold layers.
-
-```python
-VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
-
-# Save as parquet files (raw data)
-spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers")
-spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders")
-spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets")
-```
-
-## Raw Data Only - No Pre-Aggregated Fields (Unless Instructed Otherwise)
-
-**By default, generate raw, transactional data only.** Do not create fields that represent sums, totals, averages, or counts.
-
-- One row = one event/transaction/record
-- No columns like `total_orders`, `sum_revenue`, `avg_csat`, `order_count`
-- Each row has its own individual values, not rollups
-
-**Why?** A Spark Declarative Pipeline (SDP) will typically be built after data generation to:
-- Ingest raw data (bronze layer)
-- Clean and validate (silver layer)
-- Aggregate and compute metrics (gold layer)
-
-The synthetic data is the **source** for this pipeline. Aggregations happen downstream.
-
-**Note:** If the user specifically requests aggregated fields or summary tables, follow their instructions.
-
-```python
-# GOOD - Raw transactional data
-# Customer table: one row per customer, no aggregated fields
-customers_data.append({
- "customer_id": cid,
- "name": fake.company(),
- "tier": "Enterprise",
- "region": "North",
-})
-
-# Order table: one row per order
-orders_data.append({
- "order_id": f"ORD-{i:06d}",
- "customer_id": cid,
- "amount": 150.00, # This order's amount
- "order_date": "2024-10-15",
-})
-
-# BAD - Don't add pre-aggregated fields
-# customers_data.append({
-# "customer_id": cid,
-# "total_orders": 47, # NO - this is an aggregation
-# "total_revenue": 12500.00, # NO - this is a sum
-# "avg_order_value": 265.95, # NO - this is an average
-# })
-```
-
-## Temporality and Data Volume
-
-### Date Range: Last 6 Months from Today
-
-**Always generate data for the last ~6 months ending at the current date.** This ensures:
-- Data feels current and relevant for demos
-- Recent patterns are visible in dashboards
-- Downstream aggregations (daily/weekly/monthly) have enough history
-
-```python
-from datetime import datetime, timedelta
-
-# Dynamic date range - last 6 months from today
-END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
-START_DATE = END_DATE - timedelta(days=180)
-
-# Place special events within this range (e.g., incident 3 weeks ago)
-INCIDENT_END = END_DATE - timedelta(days=21)
-INCIDENT_START = INCIDENT_END - timedelta(days=10)
-```
-
-### Data Volume for Aggregation
-
-Generate enough data so patterns remain visible after downstream aggregation (SDP pipelines often aggregate by day/week/region/category). Rules of thumb:
-
-| Grain | Minimum Records | Rationale |
-|-------|-----------------|-----------|
-| Daily time series | 50-100/day | See trends after weekly rollup |
-| Per category | 500+ per category | Statistical significance |
-| Per customer | 5-20 events/customer | Enough for customer-level analysis |
-| Total rows | 10K-50K minimum | Patterns survive GROUP BY |
-
-```python
-# Example: 8000 tickets over 180 days = ~44/day average
-# After weekly aggregation: ~310 records per week per category
-# After monthly by region: still enough to see patterns
-N_TICKETS = 8000
-N_CUSTOMERS = 2500 # Each has ~3 tickets on average
-N_ORDERS = 25000 # ~10 orders per customer average
-```
-
-## Script Structure
-
-Always structure scripts with configuration variables at the top:
-
-```python
-"""Generate synthetic data for [use case]."""
-import numpy as np
-import pandas as pd
-from datetime import datetime, timedelta
-from faker import Faker
-import holidays
-from pyspark.sql import SparkSession
-
-# =============================================================================
-# CONFIGURATION - Edit these values
-# =============================================================================
-CATALOG = "my_catalog"
-SCHEMA = "my_schema"
-VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
-
-# Data sizes - enough for aggregation patterns to survive
-N_CUSTOMERS = 2500
-N_ORDERS = 25000
-N_TICKETS = 8000
-
-# Date range - last 6 months from today
-END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
-START_DATE = END_DATE - timedelta(days=180)
-
-# Special events (within the date range)
-INCIDENT_END = END_DATE - timedelta(days=21)
-INCIDENT_START = INCIDENT_END - timedelta(days=10)
-
-# Holiday calendar for realistic patterns
-US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year])
-
-# Reproducibility
-SEED = 42
-
-# =============================================================================
-# SETUP
-# =============================================================================
-np.random.seed(SEED)
-Faker.seed(SEED)
-fake = Faker()
-spark = SparkSession.builder.getOrCreate()
-
-# ... rest of script
-```
-
-## Key Principles
-
-### 1. Use Pandas for Generation, Spark for Saving
-
-Generate data with pandas (faster, easier), convert to Spark for saving:
-
-```python
-import pandas as pd
-
-# Generate with pandas
-customers_pdf = pd.DataFrame({
- "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)],
- "name": [fake.company() for _ in range(N_CUSTOMERS)],
- "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]),
- "region": np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]),
- "created_at": [fake.date_between(start_date='-2y', end_date='-6m') for _ in range(N_CUSTOMERS)],
-})
-
-# Convert to Spark and save
-customers_df = spark.createDataFrame(customers_pdf)
-customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers")
-```
-
-### 2. Iterate on DataFrames for Referential Integrity
-
-Generate master tables first, then iterate on them to create related tables with matching IDs:
-
-```python
-# 1. Generate customers (master table)
-customers_pdf = pd.DataFrame({
- "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)],
- "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]),
- # ...
-})
-
-# 2. Create lookup for foreign key generation
-customer_ids = customers_pdf["customer_id"].tolist()
-customer_tier_map = dict(zip(customers_pdf["customer_id"], customers_pdf["tier"]))
-
-# Weight by tier - Enterprise customers generate more orders
-tier_weights = customers_pdf["tier"].map({'Enterprise': 5.0, 'Pro': 2.0, 'Free': 1.0})
-customer_weights = (tier_weights / tier_weights.sum()).tolist()
-
-# 3. Generate orders with valid foreign keys and tier-based logic
-orders_data = []
-for i in range(N_ORDERS):
- cid = np.random.choice(customer_ids, p=customer_weights)
- tier = customer_tier_map[cid]
-
- # Amount depends on tier
- if tier == 'Enterprise':
- amount = np.random.lognormal(7, 0.8)
- elif tier == 'Pro':
- amount = np.random.lognormal(5, 0.7)
- else:
- amount = np.random.lognormal(3.5, 0.6)
-
- orders_data.append({
- "order_id": f"ORD-{i:06d}",
- "customer_id": cid,
- "amount": round(amount, 2),
- "order_date": fake.date_between(start_date=START_DATE, end_date=END_DATE),
- })
-
-orders_pdf = pd.DataFrame(orders_data)
-
-# 4. Generate tickets that reference both customers and orders
-order_ids = orders_pdf["order_id"].tolist()
-tickets_data = []
-for i in range(N_TICKETS):
- cid = np.random.choice(customer_ids, p=customer_weights)
- oid = np.random.choice(order_ids) # Or None for general inquiry
-
- tickets_data.append({
- "ticket_id": f"TKT-{i:06d}",
- "customer_id": cid,
- "order_id": oid if np.random.random() > 0.3 else None,
- # ...
- })
-
-tickets_pdf = pd.DataFrame(tickets_data)
-```
-
-### 3. Non-Linear Distributions
-
-**Never use uniform distributions** - real data is rarely uniform:
-
-```python
-# BAD - Uniform (unrealistic)
-prices = np.random.uniform(10, 1000, size=N_ORDERS)
-
-# GOOD - Log-normal (realistic for prices, salaries, order amounts)
-prices = np.random.lognormal(mean=4.5, sigma=0.8, size=N_ORDERS)
-
-# GOOD - Pareto/power law (popularity, wealth, page views)
-popularity = (np.random.pareto(a=2.5, size=N_PRODUCTS) + 1) * 10
-
-# GOOD - Exponential (time between events, resolution time)
-resolution_hours = np.random.exponential(scale=24, size=N_TICKETS)
-
-# GOOD - Weighted categorical
-regions = np.random.choice(
- ['North', 'South', 'East', 'West'],
- size=N_CUSTOMERS,
- p=[0.40, 0.25, 0.20, 0.15]
-)
-```
-
-### 4. Time-Based Patterns
-
-Add weekday/weekend effects, holidays, seasonality, and event spikes:
-
-```python
-import holidays
-
-# Load holiday calendar
-US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year])
-
-def get_daily_multiplier(date):
- """Calculate volume multiplier for a given date."""
- multiplier = 1.0
-
- # Weekend drop
- if date.weekday() >= 5:
- multiplier *= 0.6
-
- # Holiday drop (even lower than weekends)
- if date in US_HOLIDAYS:
- multiplier *= 0.3
-
- # Q4 seasonality (higher in Oct-Dec)
- multiplier *= 1 + 0.15 * (date.month - 6) / 6
-
- # Incident spike
- if INCIDENT_START <= date <= INCIDENT_END:
- multiplier *= 3.0
-
- # Random noise
- multiplier *= np.random.normal(1, 0.1)
-
- return max(0.1, multiplier)
-
-# Distribute tickets across dates with realistic patterns
-date_range = pd.date_range(START_DATE, END_DATE, freq='D')
-daily_volumes = [int(BASE_DAILY_TICKETS * get_daily_multiplier(d)) for d in date_range]
-```
-
-### 5. Row Coherence
-
-Attributes within a row should correlate logically:
-
-```python
-def generate_ticket(customer_id, tier, date):
- """Generate a coherent ticket where attributes correlate."""
-
- # Priority correlates with tier
- if tier == 'Enterprise':
- priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2])
- else:
- priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3])
-
- # Resolution time correlates with priority
- resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
- resolution_hours = np.random.exponential(scale=resolution_scale[priority])
-
- # CSAT correlates with resolution time
- if resolution_hours < 4:
- csat = np.random.choice([4, 5], p=[0.3, 0.7])
- elif resolution_hours < 24:
- csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3])
- else:
- csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2])
-
- return {
- "customer_id": customer_id,
- "priority": priority,
- "resolution_hours": round(resolution_hours, 1),
- "csat_score": csat,
- "created_at": date,
- }
-```
-
-## Complete Example
-
-Save as `scripts/generate_data.py`:
-
-```python
-"""Generate synthetic customer, order, and ticket data."""
-import numpy as np
-import pandas as pd
-from datetime import datetime, timedelta
-from faker import Faker
-import holidays
-from pyspark.sql import SparkSession
-
-# =============================================================================
-# CONFIGURATION
-# =============================================================================
-CATALOG = "my_catalog"
-SCHEMA = "my_schema"
-VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data"
-
-N_CUSTOMERS = 2500
-N_ORDERS = 25000
-N_TICKETS = 8000
-
-# Date range - last 6 months from today
-END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
-START_DATE = END_DATE - timedelta(days=180)
-
-# Special events (within the date range)
-INCIDENT_END = END_DATE - timedelta(days=21)
-INCIDENT_START = INCIDENT_END - timedelta(days=10)
-
-# Holiday calendar
-US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year])
-
-SEED = 42
-
-# =============================================================================
-# SETUP
-# =============================================================================
-np.random.seed(SEED)
-Faker.seed(SEED)
-fake = Faker()
-spark = SparkSession.builder.getOrCreate()
-
-# =============================================================================
-# CREATE INFRASTRUCTURE
-# =============================================================================
-print(f"Creating catalog/schema/volume if needed...")
-spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
-spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}")
-spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data")
-
-print(f"Generating: {N_CUSTOMERS:,} customers, {N_ORDERS:,} orders, {N_TICKETS:,} tickets")
-
-# =============================================================================
-# 1. CUSTOMERS (Master Table)
-# =============================================================================
-print("Generating customers...")
-
-customers_pdf = pd.DataFrame({
- "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)],
- "name": [fake.company() for _ in range(N_CUSTOMERS)],
- "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]),
- "region": np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]),
-})
-
-# ARR correlates with tier
-customers_pdf["arr"] = customers_pdf["tier"].apply(
- lambda t: round(np.random.lognormal(11, 0.5), 2) if t == 'Enterprise'
- else round(np.random.lognormal(8, 0.6), 2) if t == 'Pro' else 0
-)
-
-# Lookups for foreign keys
-customer_ids = customers_pdf["customer_id"].tolist()
-customer_tier_map = dict(zip(customers_pdf["customer_id"], customers_pdf["tier"]))
-tier_weights = customers_pdf["tier"].map({'Enterprise': 5.0, 'Pro': 2.0, 'Free': 1.0})
-customer_weights = (tier_weights / tier_weights.sum()).tolist()
-
-print(f" Created {len(customers_pdf):,} customers")
-
-# =============================================================================
-# 2. ORDERS (References Customers)
-# =============================================================================
-print("Generating orders...")
-
-orders_data = []
-for i in range(N_ORDERS):
- cid = np.random.choice(customer_ids, p=customer_weights)
- tier = customer_tier_map[cid]
- amount = np.random.lognormal(7 if tier == 'Enterprise' else 5 if tier == 'Pro' else 3.5, 0.7)
-
- orders_data.append({
- "order_id": f"ORD-{i:06d}",
- "customer_id": cid,
- "amount": round(amount, 2),
- "status": np.random.choice(['completed', 'pending', 'cancelled'], p=[0.85, 0.10, 0.05]),
- "order_date": fake.date_between(start_date=START_DATE, end_date=END_DATE),
- })
-
-orders_pdf = pd.DataFrame(orders_data)
-print(f" Created {len(orders_pdf):,} orders")
-
-# =============================================================================
-# 3. TICKETS (References Customers, with incident spike)
-# =============================================================================
-print("Generating tickets...")
-
-def get_daily_volume(date, base=25):
- vol = base * (0.6 if date.weekday() >= 5 else 1.0)
- if date in US_HOLIDAYS:
- vol *= 0.3 # Even lower on holidays
- if INCIDENT_START <= date <= INCIDENT_END:
- vol *= 3.0
- return int(vol * np.random.normal(1, 0.15))
-
-# Distribute tickets across dates
-tickets_data = []
-ticket_idx = 0
-for day in pd.date_range(START_DATE, END_DATE):
- daily_count = get_daily_volume(day.to_pydatetime())
- is_incident = INCIDENT_START <= day.to_pydatetime() <= INCIDENT_END
-
- for _ in range(daily_count):
- if ticket_idx >= N_TICKETS:
- break
-
- cid = np.random.choice(customer_ids, p=customer_weights)
- tier = customer_tier_map[cid]
-
- # Category - Auth dominates during incident
- if is_incident:
- category = np.random.choice(['Auth', 'Network', 'Billing', 'Account'], p=[0.65, 0.15, 0.1, 0.1])
- else:
- category = np.random.choice(['Auth', 'Network', 'Billing', 'Account'], p=[0.25, 0.30, 0.25, 0.20])
-
- # Priority correlates with tier
- priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) if tier == 'Enterprise' \
- else np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3])
-
- # Resolution time correlates with priority
- res_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
- resolution = np.random.exponential(scale=res_scale[priority])
-
- # CSAT degrades during incident for Auth
- if is_incident and category == 'Auth':
- csat = np.random.choice([1, 2, 3, 4, 5], p=[0.15, 0.25, 0.35, 0.2, 0.05])
- else:
- csat = 5 if resolution < 4 else (4 if resolution < 12 else np.random.choice([2, 3, 4], p=[0.2, 0.5, 0.3]))
-
- tickets_data.append({
- "ticket_id": f"TKT-{ticket_idx:06d}",
- "customer_id": cid,
- "category": category,
- "priority": priority,
- "resolution_hours": round(resolution, 1),
- "csat_score": csat,
- "created_at": day.strftime("%Y-%m-%d"),
- })
- ticket_idx += 1
-
- if ticket_idx >= N_TICKETS:
- break
-
-tickets_pdf = pd.DataFrame(tickets_data)
-print(f" Created {len(tickets_pdf):,} tickets")
-
-# =============================================================================
-# 4. SAVE TO VOLUME
-# =============================================================================
-print(f"\nSaving to {VOLUME_PATH}...")
-
-spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers")
-spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders")
-spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets")
-
-print("Done!")
-
-# =============================================================================
-# 5. VALIDATION
-# =============================================================================
-print("\n=== VALIDATION ===")
-print(f"Tier distribution: {customers_pdf['tier'].value_counts(normalize=True).to_dict()}")
-print(f"Avg order by tier: {orders_pdf.merge(customers_pdf[['customer_id', 'tier']]).groupby('tier')['amount'].mean().to_dict()}")
-
-incident_tickets = tickets_pdf[tickets_pdf['created_at'].between(
- INCIDENT_START.strftime("%Y-%m-%d"), INCIDENT_END.strftime("%Y-%m-%d")
-)]
-print(f"Incident period tickets: {len(incident_tickets):,} ({len(incident_tickets)/len(tickets_pdf)*100:.1f}%)")
-print(f"Incident Auth %: {(incident_tickets['category'] == 'Auth').mean()*100:.1f}%")
-```
-
-Execute using `run_python_file_on_databricks` tool:
-- `file_path`: "scripts/generate_data.py"
-
-If it fails, edit the file and re-run with the same `cluster_id` and `context_id`.
-
-### Validate Generated Data
-
-After successful execution, use `get_volume_folder_details` tool to verify the generated data:
-- `volume_path`: "my_catalog/my_schema/raw_data/customers"
-- `format`: "parquet"
-- `table_stat_level`: "SIMPLE"
-
-This returns schema, row counts, and column statistics to confirm the data was written correctly.
-
-## Best Practices
-
-1. **Ask for schema**: Default to `ai_dev_kit` catalog, ask user for schema name
-2. **Create infrastructure**: Use `CREATE CATALOG/SCHEMA/VOLUME IF NOT EXISTS`
-3. **Raw data only**: No `total_x`, `sum_x`, `avg_x` fields - SDP pipeline computes those
-4. **Save to Volume, not tables**: Write parquet to `/Volumes/{catalog}/{schema}/raw_data/`
-5. **Configuration at top**: All sizes, dates, and paths as variables
-6. **Dynamic dates**: Use `datetime.now() - timedelta(days=180)` for last 6 months
-7. **Pandas for generation**: Faster and easier than Spark for row-by-row logic
-8. **Master tables first**: Generate customers, then orders reference customer_ids
-9. **Weighted sampling**: Enterprise customers generate more activity
-10. **Distributions**: Log-normal for values, exponential for times, weighted categorical
-11. **Time patterns**: Weekday/weekend, holidays, seasonality, event spikes
-12. **Row coherence**: Priority affects resolution time affects CSAT
-13. **Volume for aggregation**: 10K-50K rows minimum so patterns survive GROUP BY
-14. **Always use files**: Write to local file, execute, edit if error, re-execute
-15. **Context reuse**: Pass `cluster_id` and `context_id` for faster iterations
-16. **Libraries**: Install `faker` and `holidays` first; most others are pre-installed
-
-## Related Skills
-
-- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - for building bronze/silver/gold pipelines on top of generated data
-- **[databricks-aibi-dashboards](../databricks-aibi-dashboards/SKILL.md)** - for visualizing the generated data in dashboards
-- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - for managing catalogs, schemas, and volumes where data is stored
diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md
index 553eba97..30f34e3d 100644
--- a/databricks-skills/databricks-unity-catalog/SKILL.md
+++ b/databricks-skills/databricks-unity-catalog/SKILL.md
@@ -110,7 +110,7 @@ mcp__databricks__execute_sql(
- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - for pipelines that write to Unity Catalog tables
- **[databricks-jobs](../databricks-jobs/SKILL.md)** - for job execution data visible in system tables
-- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - for generating data stored in Unity Catalog Volumes
+- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - for generating data stored in Unity Catalog Volumes
- **[databricks-aibi-dashboards](../databricks-aibi-dashboards/SKILL.md)** - for building dashboards on top of Unity Catalog data
## Resources
diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md
index 7666f21b..ee9abf05 100644
--- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md
+++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md
@@ -190,5 +190,5 @@ AZURE_OPENAI_DEPLOYMENT=gpt-4o
- **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Create Knowledge Assistants that ingest the generated PDFs
- **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Index generated documents for semantic search and RAG
-- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate structured tabular data (complement to unstructured PDFs)
+- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate structured tabular data (complement to unstructured PDFs)
- **[databricks-mlflow-evaluation](../databricks-mlflow-evaluation/SKILL.md)** - Evaluate RAG systems using the generated question/guideline pairs
diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md
index efd52b0d..e3d3f48a 100644
--- a/databricks-skills/databricks-zerobus-ingest/SKILL.md
+++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md
@@ -218,7 +218,7 @@ The timestamp generation must use microseconds for Databricks.
- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - General SDK patterns and WorkspaceClient for table/schema management
- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Downstream pipeline processing of ingested data
- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Managing catalogs, schemas, and tables that Zerobus writes to
-- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate test data to feed into Zerobus producers
+- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate test data to feed into Zerobus producers
- **[databricks-config](../databricks-config/SKILL.md)** - Profile and authentication setup
## Resources
diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh
index ff8d9b86..763489c8 100755
--- a/databricks-skills/install_skills.sh
+++ b/databricks-skills/install_skills.sh
@@ -42,7 +42,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills"
MLFLOW_REPO_REF="main"
# Databricks skills (hosted in this repo)
-DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source"
+DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source"
# MLflow skills (fetched from mlflow/skills repo)
MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs"
@@ -76,7 +76,7 @@ get_skill_description() {
"databricks-spark-declarative-pipelines") echo "Spark Declarative Pipelines (SDP/LDP/DLT)" ;;
"spark-python-data-source") echo "Spark custom Python data sources" ;;
"databricks-spark-structured-streaming") echo "Spark Structured Streaming patterns and best practices" ;;
- "databricks-synthetic-data-generation") echo "Synthetic test data generation" ;;
+ "databricks-synthetic-data-gen") echo "Synthetic test data generation" ;;
"databricks-unstructured-pdf-generation") echo "Generate synthetic PDFs for RAG" ;;
"databricks-vector-search") echo "Vector Search - endpoints, indexes, and queries for RAG" ;;
"databricks-zerobus-ingest") echo "Zerobus Ingest - gRPC data ingestion into Delta tables" ;;
diff --git a/install.ps1 b/install.ps1
index f144b5ac..38e4a2a0 100644
--- a/install.ps1
+++ b/install.ps1
@@ -78,11 +78,11 @@ $script:ProfileProvided = $false
$script:Skills = @(
"databricks-agent-bricks", "databricks-aibi-dashboards", "databricks-app-apx", "databricks-app-python",
"databricks-asset-bundles", "databricks-config", "databricks-dbsql", "databricks-docs", "databricks-genie",
- "databricks-jobs", "databricks-metric-views", "databricks-model-serving", "databricks-python-sdk",
- "databricks-unity-catalog", "databricks-vector-search", "databricks-zerobus-ingest",
- "databricks-lakebase-autoscale", "databricks-lakebase-provisioned", "databricks-mlflow-evaluation",
- "databricks-spark-declarative-pipelines", "spark-python-data-source", "databricks-spark-structured-streaming",
- "databricks-synthetic-data-generation", "databricks-unstructured-pdf-generation"
+ "databricks-iceberg", "databricks-jobs", "databricks-lakebase-autoscale", "databricks-lakebase-provisioned",
+ "databricks-metric-views", "databricks-mlflow-evaluation", "databricks-model-serving", "databricks-parsing",
+ "databricks-python-sdk", "databricks-spark-declarative-pipelines", "databricks-spark-structured-streaming",
+ "databricks-synthetic-data-gen", "databricks-unity-catalog", "databricks-unstructured-pdf-generation",
+ "databricks-vector-search", "databricks-zerobus-ingest", "spark-python-data-source"
)
# MLflow skills (fetched from mlflow/skills repo)
diff --git a/install.sh b/install.sh
index c347b13e..61b98d42 100755
--- a/install.sh
+++ b/install.sh
@@ -74,7 +74,7 @@ MIN_SDK_VERSION="0.85.0"
G='\033[0;32m' Y='\033[1;33m' R='\033[0;31m' BL='\033[0;34m' B='\033[1m' D='\033[2m' N='\033[0m'
# Databricks skills (bundled in repo)
-SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-app-apx databricks-app-python databricks-asset-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source"
+SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-app-apx databricks-app-python databricks-asset-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source"
# MLflow skills (fetched from mlflow/skills repo)
MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs"