diff --git a/.gitignore b/.gitignore index 385994fa..a170605d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Databricks AI Dev Kit .ai-dev-kit/ .claude/ - +.local # Python __pycache__/ diff --git a/.test/README.md b/.test/README.md index d5c8fe46..d2bbb2db 100644 --- a/.test/README.md +++ b/.test/README.md @@ -233,3 +233,17 @@ uv pip install -e ".test/" uv run pytest .test/tests/ uv run python .test/scripts/regression.py ``` + +--- + +## Troubleshooting + +### MLflow evaluation not returning results + +If `/skill-test mlflow` hangs or doesn't return results, run manually with debug logging: + +```bash +MLFLOW_LOG_LEVEL=DEBUG uv run python .test/scripts/mlflow_eval.py +``` + +This will show detailed MLflow API calls and help identify connection or authentication issues. diff --git a/.test/baselines/databricks-synthetic-data-gen/baseline.yaml b/.test/baselines/databricks-synthetic-data-gen/baseline.yaml new file mode 100644 index 00000000..b43273c8 --- /dev/null +++ b/.test/baselines/databricks-synthetic-data-gen/baseline.yaml @@ -0,0 +1,21 @@ +run_id: '20260303_071721' +created_at: '2026-03-03T07:17:21.838623' +skill_name: databricks-synthetic-data-gen +metrics: + pass_rate: 1.0 + total_tests: 4 + passed_tests: 4 + failed_tests: 0 +test_results: +- id: grp_20260302_113344 + passed: true + execution_mode: local +- id: gen_serverless_job_catalog_json_002 + passed: true + execution_mode: local +- id: grp_20260302_retail_csv_3tables_003 + passed: true + execution_mode: local +- id: grp_20260303_manufacturing_delta_streaming_004 + passed: true + execution_mode: local diff --git a/.test/scripts/mlflow_eval.py b/.test/scripts/mlflow_eval.py index caa2e45c..93278e4d 100755 --- a/.test/scripts/mlflow_eval.py +++ b/.test/scripts/mlflow_eval.py @@ -2,29 +2,65 @@ """Run MLflow evaluation for a skill. 
Usage: - python mlflow_eval.py [--filter-category ] [--run-name ] + python mlflow_eval.py [--filter-category ] [--run-name ] [--timeout ] Environment Variables: DATABRICKS_CONFIG_PROFILE - Databricks CLI profile (default: "DEFAULT") MLFLOW_TRACKING_URI - Set to "databricks" for Databricks MLflow MLFLOW_EXPERIMENT_NAME - Experiment path (e.g., "/Users/{user}/skill-test") + MLFLOW_LLM_JUDGE_TIMEOUT - Timeout in seconds for LLM judge evaluation (default: 120) """ +import os import sys +import signal import argparse +# Close stdin and disable tqdm progress bars when run non-interactively +# This fixes hanging issues with tqdm/MLflow progress bars in background tasks +if not sys.stdin.isatty(): + try: + sys.stdin.close() + sys.stdin = open(os.devnull, 'r') + except Exception: + pass + # Disable tqdm progress bars + os.environ.setdefault("TQDM_DISABLE", "1") + # Import common utilities from _common import setup_path, print_result, handle_error +class TimeoutException(Exception): + pass + + +def timeout_handler(signum, frame): + raise TimeoutException("MLflow evaluation timed out") + + def main(): parser = argparse.ArgumentParser(description="Run MLflow evaluation for a skill") parser.add_argument("skill_name", help="Name of skill to evaluate") parser.add_argument("--filter-category", help="Filter by test category") parser.add_argument("--run-name", help="Custom MLflow run name") + parser.add_argument( + "--timeout", + type=int, + default=120, + help="Timeout in seconds for evaluation (default: 120)", + ) args = parser.parse_args() setup_path() + # Set up signal-based timeout (Unix only) + if hasattr(signal, 'SIGALRM'): + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(args.timeout) + else: + # Windows: SIGALRM not available - no timeout enforcement + print("WARNING: Timeout not supported on Windows - test may run indefinitely", file=sys.stderr) + try: from skill_test.runners import evaluate_skill @@ -34,6 +70,10 @@ def main(): run_name=args.run_name, ) + # 
Cancel the alarm if we succeeded + if hasattr(signal, 'SIGALRM'): + signal.alarm(0) + # Convert to standard result format if result.get("run_id"): result["success"] = True @@ -42,7 +82,19 @@ def main(): sys.exit(print_result(result)) + except TimeoutException as e: + result = { + "success": False, + "skill_name": args.skill_name, + "error": f"Evaluation timed out after {args.timeout} seconds. This may indicate LLM judge endpoint issues.", + "error_type": "timeout", + } + sys.exit(print_result(result)) + except Exception as e: + # Cancel alarm on any exception + if hasattr(signal, 'SIGALRM'): + signal.alarm(0) sys.exit(handle_error(e, args.skill_name)) diff --git a/.test/skills/_routing/ground_truth.yaml b/.test/skills/_routing/ground_truth.yaml index a428d5b7..f9948433 100644 --- a/.test/skills/_routing/ground_truth.yaml +++ b/.test/skills/_routing/ground_truth.yaml @@ -99,7 +99,7 @@ test_cases: prompt: "Generate synthetic customer data and evaluate the agent quality with MLflow scorers" expectations: expected_skills: - - "databricks-synthetic-data-generation" + - "databricks-synthetic-data-gen" - "databricks-mlflow-evaluation" is_multi_skill: true metadata: diff --git a/.test/skills/databricks-synthetic-data-gen/candidates.yaml b/.test/skills/databricks-synthetic-data-gen/candidates.yaml new file mode 100644 index 00000000..c6c2a270 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/candidates.yaml @@ -0,0 +1,7 @@ +# Candidates for databricks-synthetic-data-gen skill +# Test cases pending review before promotion to ground_truth.yaml +# +# Use `/skill-test databricks-synthetic-data-gen add` to create new candidates +# Use `/skill-test databricks-synthetic-data-gen review` to promote candidates to ground truth + +candidates: [] diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt new file 
mode 100644 index 00000000..16065794 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/gen_serverless_job_catalog_json_002.txt @@ -0,0 +1,331 @@ +Here's the Python script to generate product catalog data, followed by the serverless job definition: + +```python +""" +Synthetic Product Catalog Data Generation +Serverless Databricks Job with JSON Output + +Generates: +- products (3,000 rows): product_id, name, category (weighted), price (log-normal), inventory_count +- sales (10,000 rows): sale_id, product_id (FK), quantity, sale_date, discount_pct + +Popular product categories have more sales (weighted sampling). +""" + +import os +from pyspark.sql import SparkSession, functions as F +from pyspark.sql.window import Window +from pyspark.sql.types import StringType, DoubleType, IntegerType +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + +# ============================================================================= +# CONFIGURATION +# ============================================================================= +CATALOG = "my_catalog" +SCHEMA = "devkit_gen2_test_small_json" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +NUM_PRODUCTS = 3_000 +NUM_SALES = 10_000 +NUM_PARTITIONS = 16 + +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) +SEED = 42 + +# ============================================================================= +# SPARK SESSION +# ============================================================================= + +def is_databricks_runtime(): + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +if is_databricks_runtime(): + spark = SparkSession.builder.getOrCreate() + print("Running on Databricks Runtime") +else: + from databricks.connect import DatabricksSession + spark = DatabricksSession.builder.serverless(True).getOrCreate() + print("Running with Databricks Connect (serverless)") + +print("=" * 60) 
+print("PRODUCT CATALOG DATA GENERATION") +print("=" * 60) +print(f"Catalog: {CATALOG}") +print(f"Schema: {SCHEMA}") +print(f"Products: {NUM_PRODUCTS:,}") +print(f"Sales: {NUM_SALES:,}") +print("=" * 60) + +# ============================================================================= +# CREATE INFRASTRUCTURE +# ============================================================================= +print("\nCreating infrastructure...") +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") + +# ============================================================================= +# PANDAS UDFs +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_product_name(categories: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + + templates = { + "Electronics": ["Smart", "Wireless", "Digital", "Pro", "Ultra"], + "Home & Garden": ["Premium", "Deluxe", "Classic", "Modern", "Natural"], + "Clothing": ["Designer", "Casual", "Comfort", "Luxury", "Sport"], + "Sports": ["Pro", "Elite", "Performance", "Outdoor", "Active"], + "Books": ["Complete", "Essential", "Ultimate", "Practical", "Advanced"], + "Toys": ["Fun", "Creative", "Educational", "Super", "Magic"], + "Beauty": ["Natural", "Premium", "Radiance", "Pure", "Glow"], + "Automotive": ["Pro", "Heavy-Duty", "Premium", "Performance", "Ultra"], + } + + products = { + "Electronics": ["Headphones", "Speaker", "Charger", "Watch", "Camera"], + "Home & Garden": ["Lamp", "Planter", "Organizer", "Rug", "Vase"], + "Clothing": ["T-Shirt", "Jacket", "Pants", "Sweater", "Dress"], + "Sports": ["Ball", "Racket", "Mat", "Gloves", "Bag"], + "Books": ["Guide", "Handbook", "Manual", "Edition", "Collection"], + "Toys": ["Game", "Puzzle", "Building Set", "Robot", "Craft Kit"], + "Beauty": ["Serum", "Cream", "Lotion", "Mask", "Oil"], + "Automotive": ["Tool Kit", "Cover", "Mat", "Cleaner", "Polish"], + } + 
+ names = [] + for category in categories: + template_list = templates.get(category, ["Premium"]) + product_list = products.get(category, ["Item"]) + template = np.random.choice(template_list) + product = np.random.choice(product_list) + color = fake.color_name() + names.append(f"{template} {color} {product}") + + return pd.Series(names) + + +@F.pandas_udf(DoubleType()) +def generate_price(categories: pd.Series) -> pd.Series: + price_params = { + "Electronics": (4.5, 0.8), + "Home & Garden": (3.8, 0.7), + "Clothing": (3.5, 0.6), + "Sports": (4.0, 0.7), + "Books": (2.8, 0.4), + "Toys": (3.2, 0.6), + "Beauty": (3.3, 0.5), + "Automotive": (4.2, 0.8), + } + + prices = [] + for category in categories: + mu, sigma = price_params.get(category, (3.5, 0.6)) + price = float(np.random.lognormal(mu, sigma)) + price = round(price) - 0.01 if price > 1 else round(price, 2) + prices.append(max(0.99, price)) + + return pd.Series(prices) + + +@F.pandas_udf(IntegerType()) +def generate_inventory(ids: pd.Series) -> pd.Series: + inventory = (np.random.pareto(a=2.0, size=len(ids)) + 1) * 20 + return pd.Series(inventory.astype(int)) + + +# ============================================================================= +# GENERATE PRODUCTS TABLE (Master) +# ============================================================================= +print(f"\nGenerating {NUM_PRODUCTS:,} products...") + +products_df = ( + spark.range(0, NUM_PRODUCTS, numPartitions=NUM_PARTITIONS) + .select( + F.concat(F.lit("PROD-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("product_id"), + F.col("id").alias("_idx"), + ) + .withColumn( + "category", + F.when(F.rand(SEED) < 0.25, "Electronics") + .when(F.rand(SEED + 1) < 0.45, "Home & Garden") + .when(F.rand(SEED + 2) < 0.63, "Clothing") + .when(F.rand(SEED + 3) < 0.75, "Sports") + .when(F.rand(SEED + 4) < 0.85, "Books") + .when(F.rand(SEED + 5) < 0.93, "Toys") + .when(F.rand(SEED + 6) < 0.98, "Beauty") + .otherwise("Automotive") + ) + .withColumn("name", 
fake_product_name(F.col("category"))) + .withColumn("price", generate_price(F.col("category"))) + .withColumn("inventory_count", generate_inventory(F.col("_idx"))) + .drop("_idx") +) + +products_final = products_df.select( + "product_id", "name", "category", "price", "inventory_count" +) + +products_json_path = f"{VOLUME_PATH}/products" +print(f"Writing products to {products_json_path}...") +products_final.write.mode("overwrite").json(products_json_path) + +products_for_fk = spark.read.json(products_json_path).select("product_id", "category") +product_count = products_for_fk.count() +print(f"Products written: {product_count:,}") + +# ============================================================================= +# GENERATE SALES TABLE (with Referential Integrity) +# ============================================================================= +print(f"\nGenerating {NUM_SALES:,} sales with referential integrity...") + +product_weights = products_for_fk.select( + "product_id", + "category", + F.when(F.col("category") == "Electronics", 3.0) + .when(F.col("category") == "Home & Garden", 2.5) + .when(F.col("category") == "Clothing", 2.0) + .when(F.col("category") == "Sports", 1.5) + .when(F.col("category") == "Books", 1.2) + .when(F.col("category") == "Toys", 1.0) + .when(F.col("category") == "Beauty", 0.8) + .otherwise(0.5).alias("weight") +) + +weighted_products = ( + product_weights + .select( + F.col("product_id"), + F.col("category"), + F.explode(F.array_repeat(F.col("product_id"), F.col("weight").cast("int"))).alias("_dup") + ) + .drop("_dup") +) + +sampled_products = ( + weighted_products + .orderBy(F.rand(SEED + 20)) + .limit(NUM_SALES) + .withColumn("sale_rank", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) + .select("product_id", "category", "sale_rank") +) + +sales_base = ( + spark.range(0, NUM_SALES, numPartitions=NUM_PARTITIONS) + .withColumn("sale_rank", F.row_number().over(Window.orderBy(F.col("id")))) + .withColumn( + "sale_id", + 
F.concat(F.lit("SALE-"), F.lpad(F.col("id").cast("string"), 6, "0")) + ) +) + +sales_with_products = sales_base.join( + sampled_products, + on="sale_rank", + how="inner" +) + +sales_df = ( + sales_with_products + .withColumn( + "quantity", + F.when(F.rand(SEED + 21) < 0.60, 1) + .when(F.rand(SEED + 22) < 0.85, 2) + .when(F.rand(SEED + 23) < 0.95, 3) + .otherwise(F.floor(F.rand(SEED + 24) * 5 + 4).cast("int")) + ) + .withColumn( + "sale_date", + F.date_add(F.lit(START_DATE.date()), (F.rand(SEED + 25) * 180).cast("int")) + ) + .withColumn( + "discount_pct", + F.when(F.rand(SEED + 26) < 0.70, 0.0) + .when(F.rand(SEED + 27) < 0.85, 0.10) + .when(F.rand(SEED + 28) < 0.95, 0.20) + .otherwise(0.30) + ) +) + +sales_final = sales_df.select( + "sale_id", "product_id", "quantity", "sale_date", "discount_pct" +) + +sales_json_path = f"{VOLUME_PATH}/sales" +print(f"Writing sales to {sales_json_path}...") +sales_final.write.mode("overwrite").json(sales_json_path) + +sales_count = spark.read.json(sales_json_path).count() +print(f"Sales written: {sales_count:,}") + +# ============================================================================= +# VALIDATION +# ============================================================================= +print("\n" + "=" * 60) +print("VALIDATION") +print("=" * 60) + +products_check = spark.read.json(products_json_path) +sales_check = spark.read.json(sales_json_path) + +print(f"\nRow counts:") +print(f" Products: {products_check.count():,}") +print(f" Sales: {sales_check.count():,}") + +print(f"\nSales per category (Electronics should have most):") +sales_with_category = sales_check.join( + products_check.select("product_id", "category"), + on="product_id" +) +sales_with_category.groupBy("category").agg( + F.count("*").alias("total_sales"), + F.sum("quantity").alias("total_quantity") +).orderBy(F.desc("total_sales")).show() + +orphan_sales = sales_check.join( + products_check.select("product_id"), + on="product_id", + how="left_anti" +).count() 
+print(f"\nReferential integrity:") +print(f" Orphan sales (no matching product): {orphan_sales}") + +print("\n" + "=" * 60) +print("GENERATION COMPLETE") +print("=" * 60) +print(f"\nOutput location: {VOLUME_PATH}") +``` + +To run this as a serverless job, create the job with this configuration: + +```json +{ + "name": "generate_product_catalog_data", + "tasks": [ + { + "task_key": "generate_data", + "spark_python_task": { + "python_file": "/Workspace/Users/your_email/generate_product_catalog_data.py" + }, + "environment_key": "default" + } + ], + "environments": [ + { + "environment_key": "default", + "spec": { + "client": "4", + "dependencies": [ + "faker", + "pandas", + "numpy" + ] + } + } + ] +} +``` diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt new file mode 100644 index 00000000..39997ba5 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_113344.txt @@ -0,0 +1,233 @@ +I'll generate the e-commerce data locally using Polars and Faker, then upload to Unity Catalog. 
+ +## Generation Plan + +**Output Location:** `ai_dev_kit.devkit_gen1_test_small_parquet` +**Volume:** `/Volumes/devkit/devkit_gen1_test_small_parquet/raw_data/` + +| Table | Columns | Rows | +|-------|---------|------| +| customers | customer_id, name, email, tier, region, created_at | 5,000 | +| orders | order_id, customer_id (FK), amount, order_date, status | 15,000 | + +**Order Distribution by Tier:** +- Enterprise (10%): ~8 orders per customer average +- Pro (30%): ~3 orders per customer average +- Free (60%): ~1.5 orders per customer average + +Here's the complete script: + +```python +import polars as pl +import numpy as np +from faker import Faker +from datetime import datetime, timedelta +import random +from databricks.sdk import WorkspaceClient + +# Configuration +CATALOG = "devkit" +SCHEMA = "devkit_gen1_test_small_parquet" +NUM_CUSTOMERS = 5000 +NUM_ORDERS = 15000 + +fake = Faker() +Faker.seed(42) +np.random.seed(42) +random.seed(42) + +# Date ranges +END_DATE = datetime.now() +CUSTOMER_START_DATE = END_DATE - timedelta(days=730) # 2 years +ORDER_START_DATE = END_DATE - timedelta(days=180) # 6 months + +# ============================================================================= +# Generate Customers +# ============================================================================= +print("Generating customers...") + +# Tier distribution: Free 60%, Pro 30%, Enterprise 10% +tier_weights = [0.6, 0.3, 0.1] +tiers = random.choices(["Free", "Pro", "Enterprise"], weights=tier_weights, k=NUM_CUSTOMERS) + +# Region distribution +region_weights = [0.4, 0.3, 0.2, 0.1] +regions = random.choices(["US-East", "US-West", "EU", "APAC"], weights=region_weights, k=NUM_CUSTOMERS) + +customers = pl.DataFrame({ + "customer_id": [f"CUST-{i:05d}" for i in range(NUM_CUSTOMERS)], + "name": [fake.name() for _ in range(NUM_CUSTOMERS)], + "email": [fake.email() for _ in range(NUM_CUSTOMERS)], + "tier": tiers, + "region": regions, + "created_at": [ + 
fake.date_time_between(start_date=CUSTOMER_START_DATE, end_date=END_DATE) + for _ in range(NUM_CUSTOMERS) + ], +}) + +print(f"Generated {len(customers)} customers") +print(customers.group_by("tier").len().sort("tier")) + +# ============================================================================= +# Generate Orders with Tier-Weighted Distribution +# ============================================================================= +print("\nGenerating orders with tier-weighted distribution...") + +# Calculate orders per tier to achieve ~15,000 total with weighted distribution +# Enterprise: ~8 orders/customer, Pro: ~3, Free: ~1.5 +tier_counts = customers.group_by("tier").len() +enterprise_count = tier_counts.filter(pl.col("tier") == "Enterprise")["len"][0] +pro_count = tier_counts.filter(pl.col("tier") == "Pro")["len"][0] +free_count = tier_counts.filter(pl.col("tier") == "Free")["len"][0] + +# Target distribution +enterprise_orders_per = 8 +pro_orders_per = 3 +free_orders_per = 1.5 + +total_target = enterprise_count * enterprise_orders_per + pro_count * pro_orders_per + free_count * free_orders_per +scale = NUM_ORDERS / total_target + +enterprise_orders = int(enterprise_count * enterprise_orders_per * scale) +pro_orders = int(pro_count * pro_orders_per * scale) +free_orders = NUM_ORDERS - enterprise_orders - pro_orders + +# Get customer IDs by tier +enterprise_customers = customers.filter(pl.col("tier") == "Enterprise")["customer_id"].to_list() +pro_customers = customers.filter(pl.col("tier") == "Pro")["customer_id"].to_list() +free_customers = customers.filter(pl.col("tier") == "Free")["customer_id"].to_list() + +# Generate customer_ids for orders +order_customer_ids = ( + random.choices(enterprise_customers, k=enterprise_orders) + + random.choices(pro_customers, k=pro_orders) + + random.choices(free_customers, k=free_orders) +) +random.shuffle(order_customer_ids) + +# Create customer tier lookup +customer_tier_map = dict(zip(customers["customer_id"].to_list(), 
customers["tier"].to_list())) + +# Generate amounts based on tier (log-normal distribution) +def generate_amount(tier: str) -> float: + if tier == "Enterprise": + return round(float(np.random.lognormal(6.2, 0.8)), 2) # mean ~$500 + elif tier == "Pro": + return round(float(np.random.lognormal(5.0, 0.7)), 2) # mean ~$150 + else: + return round(float(np.random.lognormal(3.9, 0.6)), 2) # mean ~$50 + +amounts = [generate_amount(customer_tier_map[cid]) for cid in order_customer_ids] + +# Status distribution: delivered 65%, shipped 15%, processing 10%, pending 5%, cancelled 5% +statuses = random.choices( + ["delivered", "shipped", "processing", "pending", "cancelled"], + weights=[0.65, 0.15, 0.10, 0.05, 0.05], + k=NUM_ORDERS +) + +orders = pl.DataFrame({ + "order_id": [f"ORD-{i:06d}" for i in range(NUM_ORDERS)], + "customer_id": order_customer_ids, + "amount": amounts, + "order_date": [ + fake.date_between(start_date=ORDER_START_DATE.date(), end_date=END_DATE.date()) + for _ in range(NUM_ORDERS) + ], + "status": statuses, +}) + +print(f"Generated {len(orders)} orders") + +# Verify tier-weighted distribution +orders_with_tier = orders.join( + customers.select(["customer_id", "tier"]), + on="customer_id" +) +print("\nOrders per tier:") +print(orders_with_tier.group_by("tier").len().sort("tier")) +print("\nAverage orders per customer by tier:") +print( + orders_with_tier.group_by("tier") + .agg(pl.len().alias("total_orders")) + .join(tier_counts, on="tier") + .with_columns((pl.col("total_orders") / pl.col("len")).alias("avg_orders_per_customer")) + .select(["tier", "avg_orders_per_customer"]) + .sort("tier") +) + +# ============================================================================= +# Save Locally as Parquet +# ============================================================================= +print("\nSaving to local Parquet files...") +local_path = "synthetic_data_output" +import os +os.makedirs(local_path, exist_ok=True) + 
+customers.write_parquet(f"{local_path}/customers.parquet") +orders.write_parquet(f"{local_path}/orders.parquet") +print(f"Saved to {local_path}/") + +# ============================================================================= +# Upload to Unity Catalog Volume +# ============================================================================= +print("\nUploading to Unity Catalog...") + +w = WorkspaceClient() + +# Create schema and volume if needed +from databricks.sdk.service.catalog import VolumeType + +try: + w.schemas.create(name=SCHEMA, catalog_name=CATALOG) + print(f"Created schema {CATALOG}.{SCHEMA}") +except Exception as e: + if "SCHEMA_ALREADY_EXISTS" in str(e): + print(f"Schema {CATALOG}.{SCHEMA} already exists") + else: + raise + +try: + w.volumes.create( + catalog_name=CATALOG, + schema_name=SCHEMA, + name="raw_data", + volume_type=VolumeType.MANAGED + ) + print(f"Created volume {CATALOG}.{SCHEMA}.raw_data") +except Exception as e: + if "already exists" in str(e).lower(): + print(f"Volume {CATALOG}.{SCHEMA}.raw_data already exists") + else: + raise + +# Upload files +volume_path = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +with open(f"{local_path}/customers.parquet", "rb") as f: + w.files.upload(f"{volume_path}/customers.parquet", f, overwrite=True) +print(f"Uploaded customers.parquet to {volume_path}/") + +with open(f"{local_path}/orders.parquet", "rb") as f: + w.files.upload(f"{volume_path}/orders.parquet", f, overwrite=True) +print(f"Uploaded orders.parquet to {volume_path}/") + +print(f"\n✅ Complete! Data available at:") +print(f" {volume_path}/customers.parquet") +print(f" {volume_path}/orders.parquet") +``` + +This script: +1. Generates 5,000 customers with weighted tier distribution (Free 60%, Pro 30%, Enterprise 10%) +2. Generates 15,000 orders with tier-weighted customer assignment (Enterprise ~8x, Pro ~3x, Free ~1.5x orders per customer) +3. Uses log-normal distributions for realistic order amounts by tier +4. 
Saves locally as Parquet files +5. Creates the schema and volume in Unity Catalog if needed +6. Uploads the Parquet files to the volume + +Run with: +```bash +uv pip install polars faker numpy databricks-sdk +uv run python generate_ecommerce_data.py +``` diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt new file mode 100644 index 00000000..55be3712 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260302_retail_csv_3tables_003.txt @@ -0,0 +1,297 @@ +Here's the complete script to generate synthetic retail order data with 3 tables and full referential integrity: + +```python +""" +Synthetic Retail Order Data Generation +====================================== +3 related tables with full referential integrity: +- customers (10,000 rows) +- orders (50,000 rows) +- line_items (150,000 rows) + +Output: CSV files with headers to Unity Catalog volume +Execution: Databricks Connect with serverless compute +""" + +import os +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType, IntegerType +from pyspark.sql.window import Window +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + +# ============================================================================= +# CONFIGURATION +# ============================================================================= +CATALOG = "ai_dev_kit" +SCHEMA = "devkit_gen3_test_medium_csv" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +NUM_CUSTOMERS = 10_000 +NUM_ORDERS = 50_000 +NUM_LINE_ITEMS = 150_000 +NUM_PARTITIONS = 32 +SEED = 42 + +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) + +# ============================================================================= +# SPARK SESSION +# 
============================================================================= +def is_databricks_runtime(): + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +if is_databricks_runtime(): + from pyspark.sql import SparkSession + spark = SparkSession.builder.getOrCreate() + print("Running on Databricks Runtime") +else: + from databricks.connect import DatabricksSession, DatabricksEnv + + # Install dependencies on serverless cluster + env = DatabricksEnv().withDependencies("faker", "pandas", "numpy") + spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + print("Running with Databricks Connect (serverless with managed dependencies)") + +# ============================================================================= +# CREATE INFRASTRUCTURE +# ============================================================================= +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") + +# ============================================================================= +# PANDAS UDFs +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.name() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_email(names: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + emails = [] + for name in names: + parts = name.lower().split() + if len(parts) >= 2: + email = f"{parts[0]}.{parts[-1]}@{fake.free_email_domain()}" + else: + email = f"{parts[0]}{np.random.randint(100, 999)}@{fake.free_email_domain()}" + emails.append(email) + return pd.Series(emails) + +@F.pandas_udf(StringType()) +def fake_product_name(ids: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + product_types = ["Chair", "Table", "Lamp", "Desk", "Shelf", "Cabinet", "Sofa", 
"Rug", + "Mirror", "Clock", "Vase", "Frame", "Pillow", "Blanket", "Candle", + "Mug", "Bowl", "Plate", "Glass", "Bottle", "Box", "Bag", "Hat", + "Watch", "Headphones", "Speaker", "Charger", "Cable", "Case"] + products = [] + for _ in range(len(ids)): + color = fake.color_name() + adj = fake.word().capitalize() + product = np.random.choice(product_types) + products.append(f"{color} {adj} {product}") + return pd.Series(products) + +@F.pandas_udf(DoubleType()) +def generate_unit_price(ids: pd.Series) -> pd.Series: + """Log-normal unit prices (median ~$35, range $5-$500)""" + prices = np.random.lognormal(mean=3.5, sigma=0.7, size=len(ids)) + prices = np.clip(prices, 5.0, 500.0) + return pd.Series(np.round(prices, 2)) + +# ============================================================================= +# GENERATE CUSTOMERS TABLE +# ============================================================================= +customers_df = ( + spark.range(0, NUM_CUSTOMERS, numPartitions=NUM_PARTITIONS) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + F.col("id").alias("_idx") + ) + .withColumn("name", fake_name(F.col("_idx"))) + .withColumn("email", fake_email(F.col("name"))) + .withColumn("membership_level", + F.when(F.rand(SEED) < 0.50, "Bronze") + .when(F.rand(SEED + 1) < 0.80, "Silver") + .when(F.rand(SEED + 2) < 0.95, "Gold") + .otherwise("Platinum") + ) + .withColumn("region", + F.when(F.rand(SEED + 3) < 0.30, "US-East") + .when(F.rand(SEED + 4) < 0.55, "US-West") + .when(F.rand(SEED + 5) < 0.80, "EU") + .when(F.rand(SEED + 6) < 0.95, "APAC") + .otherwise("Other") + ) + .drop("_idx") +) + +# Write to temp Delta table (no .cache() on serverless) +customers_tmp = f"{CATALOG}.{SCHEMA}._tmp_customers" +customers_df.write.mode("overwrite").saveAsTable(customers_tmp) +customers_df = spark.table(customers_tmp) + +# ============================================================================= +# GENERATE ORDERS TABLE (weighted by 
membership level) +# ============================================================================= +customer_weights = customers_df.select( + "customer_id", + "membership_level", + F.when(F.col("membership_level") == "Platinum", 10.0) + .when(F.col("membership_level") == "Gold", 7.0) + .when(F.col("membership_level") == "Silver", 5.0) + .otherwise(3.0).alias("weight") +) + +weighted_customers = ( + customer_weights + .withColumn("replicate_count", (F.col("weight") * 2).cast("int")) + .select( + F.col("customer_id"), + F.explode(F.array_repeat(F.col("customer_id"), F.col("replicate_count"))).alias("_dup") + ) + .drop("_dup") +) + +sampled_customers = ( + weighted_customers + .orderBy(F.rand(SEED + 10)) + .limit(NUM_ORDERS) + .withColumn("_row", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) + .select("customer_id", "_row") +) + +orders_base = ( + spark.range(0, NUM_ORDERS, numPartitions=NUM_PARTITIONS) + .withColumn("order_id", + F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0"))) + .withColumn("_row", F.row_number().over(Window.orderBy(F.col("id")))) +) + +orders_df = ( + orders_base + .join(sampled_customers, on="_row", how="inner") + .drop("_row", "id") + .withColumn("order_date", + F.date_add(F.lit(START_DATE.date()), (F.rand(SEED + 11) * 180).cast("int"))) + .withColumn("status", + F.when(F.rand(SEED + 12) < 0.60, "Delivered") + .when(F.rand(SEED + 13) < 0.78, "Shipped") + .when(F.rand(SEED + 14) < 0.90, "Processing") + .when(F.rand(SEED + 15) < 0.96, "Pending") + .otherwise("Cancelled") + ) + .withColumn("total_amount", F.lit(0.0)) +) + +orders_tmp = f"{CATALOG}.{SCHEMA}._tmp_orders" +orders_df.write.mode("overwrite").saveAsTable(orders_tmp) +orders_df = spark.table(orders_tmp) + +# ============================================================================= +# GENERATE LINE_ITEMS TABLE +# ============================================================================= +order_ids = orders_df.select("order_id") + 
+items_per_order = ( + order_ids + .withColumn("num_items", + F.when(F.rand(SEED + 20) < 0.10, 1) + .when(F.rand(SEED + 21) < 0.25, 2) + .when(F.rand(SEED + 22) < 0.45, 3) + .when(F.rand(SEED + 23) < 0.65, 4) + .when(F.rand(SEED + 24) < 0.80, 5) + .when(F.rand(SEED + 25) < 0.90, 6) + .when(F.rand(SEED + 26) < 0.96, 7) + .otherwise(8) + ) +) + +line_items_base = ( + items_per_order + .select( + F.col("order_id"), + F.explode(F.sequence(F.lit(1), F.col("num_items"))).alias("item_seq") + ) + .withColumn("_idx", F.monotonically_increasing_id()) +) + +line_items_df = ( + line_items_base + .limit(NUM_LINE_ITEMS) + .withColumn("line_item_id", + F.concat(F.lit("LI-"), F.lpad(F.col("_idx").cast("string"), 7, "0"))) + .withColumn("product_name", fake_product_name(F.col("_idx"))) + .withColumn("quantity", + F.when(F.rand(SEED + 30) < 0.50, 1) + .when(F.rand(SEED + 31) < 0.80, 2) + .when(F.rand(SEED + 32) < 0.92, 3) + .when(F.rand(SEED + 33) < 0.97, 4) + .otherwise(5) + ) + .withColumn("unit_price", generate_unit_price(F.col("_idx"))) + .drop("_idx", "item_seq") +) + +line_items_tmp = f"{CATALOG}.{SCHEMA}._tmp_line_items" +line_items_df.write.mode("overwrite").saveAsTable(line_items_tmp) +line_items_df = spark.table(line_items_tmp) + +# ============================================================================= +# COMPUTE ORDER TOTALS FROM LINE ITEMS +# ============================================================================= +order_totals = ( + line_items_df + .withColumn("line_total", F.col("quantity") * F.col("unit_price")) + .groupBy("order_id") + .agg(F.round(F.sum("line_total"), 2).alias("computed_total")) +) + +orders_final = ( + spark.table(orders_tmp) + .drop("total_amount") + .join(order_totals, on="order_id", how="left") + .withColumn("total_amount", F.coalesce(F.col("computed_total"), F.lit(0.0))) + .drop("computed_total") +) + +# ============================================================================= +# EXPORT TO CSV +# 
============================================================================= +customers_final = spark.table(customers_tmp).select( + "customer_id", "name", "email", "membership_level", "region" +) + +orders_out = orders_final.select( + "order_id", "customer_id", "order_date", "total_amount", "status" +) + +line_items_out = spark.table(line_items_tmp).select( + "line_item_id", "order_id", "product_name", "quantity", "unit_price" +) + +customers_final.coalesce(1).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/customers") +orders_out.coalesce(1).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/orders") +line_items_out.coalesce(4).write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/line_items") + +# Cleanup temp tables +spark.sql(f"DROP TABLE IF EXISTS {customers_tmp}") +spark.sql(f"DROP TABLE IF EXISTS {orders_tmp}") +spark.sql(f"DROP TABLE IF EXISTS {line_items_tmp}") +``` + +Run with (requires Python 3.12+): +```bash +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas +uv run python generate_retail_data.py +``` diff --git a/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt new file mode 100644 index 00000000..bd6f8ef4 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt @@ -0,0 +1,235 @@ +Here's the complete script to generate manufacturing IoT sensor data with streaming-ready batch processing: + +```python +""" +Manufacturing IoT Sensor Data Generation +======================================== +2 related tables with referential integrity: +- sensor_devices (500 rows): Slowly changing dimension +- sensor_readings (10,000 rows per batch): Streaming incremental + +Output: Delta tables registered in Unity Catalog +Execution: Databricks 
Connect with serverless compute (Python 3.12+) +""" + +from databricks.connect import DatabricksSession, DatabricksEnv +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType, TimestampType +import pandas as pd +import numpy as np +from datetime import datetime, timedelta + +# Configuration +CATALOG = "ai_dev_kit" +SCHEMA = "devkit_gen4_test_large_delta" + +# Row counts +NUM_DEVICES = 500 +NUM_READINGS_PER_BATCH = 10000 + +# Date range for readings (last 24 hours for streaming simulation) +END_TIME = datetime.now() +START_TIME = END_TIME - timedelta(hours=24) + +# Setup with managed dependencies (databricks-connect 16.4+) +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy") +spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + +print(f"Connected to Databricks serverless compute") +print(f"Output location: {CATALOG}.{SCHEMA}") + +# Create schema if not exists +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +print(f"Schema {CATALOG}.{SCHEMA} ready") + + +# ============================================================================= +# Pandas UDFs for data generation +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_device_name(ids: pd.Series) -> pd.Series: + """Generate realistic device names like 'SENS-T-00001'.""" + from faker import Faker + fake = Faker() + Faker.seed(42) + + prefixes = {"temperature": "T", "pressure": "P", "vibration": "V", "humidity": "H"} + names = [] + for i, _ in enumerate(ids): + type_key = list(prefixes.keys())[i % 4] + prefix = prefixes[type_key] + names.append(f"SENS-{prefix}-{str(i).zfill(5)}") + return pd.Series(names) + + +@F.pandas_udf(DoubleType()) +def generate_sensor_value(device_types: pd.Series) -> pd.Series: + """Generate realistic sensor values based on device type.""" + values = [] + for dtype in device_types: + if dtype == "temperature": + 
values.append(float(np.random.normal(70, 15))) # Fahrenheit + elif dtype == "pressure": + values.append(float(np.random.normal(100, 10))) # PSI + elif dtype == "vibration": + values.append(float(np.random.lognormal(1.5, 0.8))) # mm/s with spikes + elif dtype == "humidity": + values.append(float(np.clip(np.random.normal(45, 10), 0, 100))) # Percentage + else: + values.append(float(np.random.normal(50, 10))) + return pd.Series(values) + + +@F.pandas_udf(StringType()) +def generate_unit(device_types: pd.Series) -> pd.Series: + """Generate appropriate unit based on device type.""" + unit_map = { + "temperature": "°F", + "pressure": "PSI", + "vibration": "mm/s", + "humidity": "%" + } + return pd.Series([unit_map.get(dt, "units") for dt in device_types]) + + +# ============================================================================= +# Generate sensor_devices (slowly changing dimension) +# ============================================================================= + +print("\nGenerating sensor_devices table (slowly changing dimension)") + +# Device type weights: temperature 30%, pressure 25%, vibration 25%, humidity 20% +devices_df = ( + spark.range(0, NUM_DEVICES, numPartitions=4) + .select( + F.concat(F.lit("DEV-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("device_id"), + F.col("id").alias("_id") + ) + .withColumn("device_name", fake_device_name(F.col("_id"))) + .withColumn( + "device_type", + F.when(F.rand() < 0.30, "temperature") + .when(F.rand() < 0.55, "pressure") + .when(F.rand() < 0.80, "vibration") + .otherwise("humidity") + ) + .withColumn( + "location", + F.when(F.rand() < 0.40, "Plant_A") + .when(F.rand() < 0.70, "Plant_B") + .when(F.rand() < 0.90, "Plant_C") + .otherwise("Warehouse") + ) + .withColumn( + "install_date", + F.date_add(F.lit("2020-01-01"), (F.rand() * 1460).cast("int")) + ) + .withColumn( + "status", + F.when(F.rand() < 0.85, "active") + .when(F.rand() < 0.95, "maintenance") + .otherwise("inactive") + ) + .drop("_id") +) + +# 
Write devices table +devices_table = f"{CATALOG}.{SCHEMA}.sensor_devices" +devices_df.write.mode("overwrite").saveAsTable(devices_table) +print(f"Created {devices_table}") + + +# ============================================================================= +# Generate sensor_readings (streaming incremental batch) +# ============================================================================= + +print("\nGenerating sensor_readings table (streaming batch)") + +# Read devices back for FK join (no .cache() on serverless!) +devices_for_join = spark.table(devices_table).select("device_id", "device_type") + +# Generate readings with valid device_id FK +readings_df = ( + spark.range(0, NUM_READINGS_PER_BATCH, numPartitions=16) + .select( + F.concat( + F.lit("RDG-"), + F.date_format(F.current_timestamp(), "yyyyMMddHHmmss"), + F.lit("-"), + F.lpad(F.col("id").cast("string"), 6, "0") + ).alias("reading_id"), + (F.abs(F.hash(F.col("id"))) % NUM_DEVICES).alias("device_index"), + F.from_unixtime( + F.unix_timestamp(F.lit(START_TIME)) + + (F.rand() * 86400).cast("long") + ).cast("timestamp").alias("timestamp") + ) +) + +# Get device IDs with row numbers for joining +devices_indexed = ( + devices_for_join + .withColumn("device_index", F.monotonically_increasing_id() % NUM_DEVICES) +) + +# Join to get valid device_id and device_type +readings_with_device = ( + readings_df + .join( + devices_indexed.select("device_id", "device_type", "device_index"), + on="device_index", + how="inner" + ) + .drop("device_index") +) + +# Add sensor values based on device type +readings_final = ( + readings_with_device + .withColumn("value", F.round(generate_sensor_value(F.col("device_type")), 2)) + .withColumn("unit", generate_unit(F.col("device_type"))) + .withColumn( + "quality_flag", + F.when(F.rand() < 0.90, "good") + .when(F.rand() < 0.97, "warning") + .otherwise("error") + ) + .select("reading_id", "device_id", "timestamp", "value", "unit", "quality_flag") +) + +# Write readings table (append 
mode for streaming simulation) +readings_table = f"{CATALOG}.{SCHEMA}.sensor_readings" +readings_final.write.mode("overwrite").saveAsTable(readings_table) +print(f"Created {readings_table}") + + +# ============================================================================= +# Validation +# ============================================================================= + +# Check referential integrity +orphan_readings = spark.sql(f""" + SELECT COUNT(*) as orphan_count + FROM {readings_table} r + LEFT JOIN {devices_table} d ON r.device_id = d.device_id + WHERE d.device_id IS NULL +""").collect()[0]["orphan_count"] + +print(f"Orphan readings (should be 0): {orphan_readings}") +print(f"\nSUMMARY") +print(f"Catalog/Schema: {CATALOG}.{SCHEMA}") +print(f"Tables: {devices_table}, {readings_table}") +print(f"Referential integrity: {'PASSED' if orphan_readings == 0 else 'FAILED'}") +``` + +Run with (requires Python 3.12+): +```bash +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas +uv run python generate_manufacturing_data.py +``` + +For streaming job deployment, this can be scheduled as an incremental job where each run: +1. Generates a new batch of 10,000 readings with unique timestamps +2. Appends to the sensor_readings table (change mode to "append") +3. The sensor_devices table is the slowly-changing dimension that persists diff --git a/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml b/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml new file mode 100644 index 00000000..799c0c19 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/ground_truth.yaml @@ -0,0 +1,327 @@ +test_cases: + +- id: grp_20260302_113344 + inputs: + prompt: 'Generate synthetic e-commerce data locally then save it to Unity Catalog. 
+ + Create 2 related tables with referential integrity: + + - customers (5,000 rows): customer_id, name, email, tier (Free/Pro/Enterprise + weighted 60/30/10), region, created_at + + - orders (15,000 rows): order_id, customer_id (FK to customers), amount, order_date, + status + + + Save as Parquet then upload to a Unity Catalog volume. Use schema name ''devkit_gen1_test_small_parquet''. + + Enterprise customers should generate more orders than Free tier.' + outputs: + expected_response_file: expected_responses/grp_20260302_113344.txt + execution_success: true + expectations: + expected_facts: + - "parquet" + - "customer_id" + - "referential integrity" + - "weighted" + - "log-normal" + expected_patterns: + - pattern: "\\.write.*parquet" + min_count: 1 + description: "Parquet output format" + - pattern: "customer_id" + min_count: 3 + description: "Foreign key reference in multiple tables" + - pattern: "lognormal" + min_count: 1 + description: "Log-normal distribution for amounts" + guidelines: + - "Orders table customer_id must only contain IDs from customers table" + - "Enterprise tier customers must have higher weight for order generation" + - "Amount distribution must use log-normal, not uniform" + metadata: + category: happy_path + source: interactive + created_at: '2026-03-02T11:33:44.621846' + execution_verified: + mode: local + verified_date: '2026-03-02' + +- id: gen_serverless_job_catalog_json_002 + inputs: + prompt: 'Generate synthetic product catalog data that will run as a serverless + Databricks job. + + Create 2 related tables with referential integrity: + + - products (3,000 rows): product_id, name, category (weighted), price (log-normal), + inventory_count + + - sales (10,000 rows): sale_id, product_id (FK to products), quantity, sale_date, + discount_pct + + + Save as JSON files to a Unity Catalog volume. Use schema name ''devkit_gen2_test_small_json''. + + Create a job definition with environments for dependencies (faker). 
+ + Popular product categories should have more sales (weighted sampling). + + ' + outputs: + expected_response_file: expected_responses/gen_serverless_job_catalog_json_002.txt + expectations: + expected_facts: + - serverless + - environments + - dependencies + - client + - json + - product_id + - weighted + - lognormal + - pandas_udf + expected_patterns: + - pattern: environment_key.*default + min_count: 1 + description: Serverless job environment configuration + - pattern: client.*4 + min_count: 1 + description: Correct client version for serverless + - pattern: \.write.*json + min_count: 1 + description: JSON output format + - pattern: product_id + min_count: 3 + description: Foreign key reference in multiple places + - pattern: '@F\.pandas_udf|pandas_udf' + min_count: 1 + description: Pandas UDF for Faker parallelism + - pattern: lognormal|log-normal|log_normal + min_count: 1 + description: Log-normal distribution for prices + - pattern: CREATE SCHEMA IF NOT EXISTS|CREATE VOLUME IF NOT EXISTS + min_count: 1 + description: Infrastructure creation in script + guidelines: + - Must create serverless job with environments parameter for dependencies + - 'Job spec must include client: 4 (not 1)' + - Must NOT use .cache() or .persist() (serverless incompatible) + metadata: + category: happy_path + difficulty: medium + source: interactive_execution + execution_date: '2026-02-26' + execution_verified: true + job_run_id: '560746964795126' + tags: + - serverless-job + - small + - json + - referential-integrity + - weighted-sampling + - executed + +- id: grp_20260302_retail_csv_3tables_003 + inputs: + prompt: | + Generate synthetic retail order data using Databricks Connect with serverless. 
+ Create 3 related tables with full referential integrity: + - customers (10,000 rows): customer_id, name, email, membership_level (Bronze/Silver/Gold/Platinum weighted 50/30/15/5), region + - orders (50,000 rows): order_id, customer_id (FK to customers), order_date, total_amount, status + - line_items (150,000 rows): line_item_id, order_id (FK to orders), product_name, quantity, unit_price + + Save as CSV files with headers to Unity Catalog volume. Use schema name 'devkit_gen3_test_medium_csv'. + Create realistic product names. + Higher membership levels should have more orders. + Order total_amount should equal sum of line_items. + outputs: + expected_response_file: expected_responses/grp_20260302_retail_csv_3tables_003.txt + execution_success: true + expectations: + expected_facts: + - "DatabricksSession" + - "serverless" + - "CSV" + - "header" + - "customer_id" + - "order_id" + - "line_item" + - "Faker" + - "pandas_udf" + - "membership_level" + - "weighted" + - "total_amount" + - "lognormal" + expected_patterns: + - pattern: "DatabricksSession.*serverless.*True" + min_count: 1 + description: "Databricks Connect serverless configuration" + - pattern: "DatabricksEnv.*withDependencies" + min_count: 1 + description: "Managed dependencies for serverless" + - pattern: "@F\\.pandas_udf|pandas_udf" + min_count: 1 + description: "Pandas UDF for Faker parallelism" + - pattern: "customer_id" + min_count: 5 + description: "FK in customers and orders (multiple references)" + - pattern: "order_id" + min_count: 5 + description: "FK in orders and line_items (multiple references)" + - pattern: "\\.option.*header.*true.*\\.csv|\\.write.*csv" + min_count: 1 + description: "CSV with headers" + - pattern: "Bronze|Silver|Gold|Platinum" + min_count: 4 + description: "All membership levels present" + - pattern: "lognormal" + min_count: 1 + description: "Log-normal distribution for pricing" + - pattern: "CREATE SCHEMA IF NOT EXISTS" + min_count: 1 + description: "Infrastructure 
creation in script" + - pattern: "CREATE VOLUME IF NOT EXISTS" + min_count: 1 + description: "Volume creation for CSV output" + - pattern: "total_amount.*sum|sum.*line_total|computed_total" + min_count: 1 + description: "Order total computed from line items" + guidelines: + - "Must use DatabricksSession.builder.serverless(True).getOrCreate()" + - "Must use Spark + Faker + Pandas UDFs approach" + - "line_items.order_id must reference valid orders" + - "Membership level must be weighted: Bronze 50%, Silver 30%, Gold 15%, Platinum 5%" + - "Higher membership levels must generate more orders per customer" + metadata: + category: happy_path + difficulty: hard + source: interactive_execution + execution_date: '2026-03-02' + execution_verified: true + verified_output: + customers_rows: 10000 + orders_rows: 50000 + line_items_rows: 150000 + membership_distribution: + Bronze: 5069 + Silver: 3957 + Gold: 919 + Platinum: 55 + orders_per_tier: + Bronze: 18170 + Silver: 23560 + Gold: 7613 + Platinum: 657 + orphan_orders: 0 + orphan_line_items: 0 + tags: + - databricks-connect + - serverless + - medium + - csv + - 3-tables + - pandas-udf + - referential-integrity + - weighted-sampling + - computed-totals + - executed + +- id: grp_20260303_manufacturing_delta_streaming_004 + inputs: + prompt: | + Generate manufacturing data that will run incrementally with Python 3.12 and Databricks Serverless. + Create 2 related tables with referential integrity. + Create a sensor reading table that generates 10,000 rows per batch and configure to run as a streaming job. + Create a lookup table for the sensor device which changes slowly. + Save as Delta tables registered in Unity Catalog. Use catalog 'ai_dev_kit'. Use schema name 'devkit_gen4_test_large_delta'. 
+ outputs: + expected_response_file: expected_responses/grp_20260303_manufacturing_delta_streaming_004.txt + execution_success: true + expectations: + expected_facts: + - "DatabricksSession" + - "serverless" + - "Delta" + - "saveAsTable" + - "device_id" + - "sensor" + - "reading" + - "Faker" + - "pandas_udf" + - "streaming" + - "incremental" + - "batch" + - "slowly changing" + - "lognormal" + expected_patterns: + - pattern: "DatabricksSession.*serverless.*True" + min_count: 1 + description: "Databricks Connect serverless configuration" + - pattern: "DatabricksEnv.*withDependencies" + min_count: 1 + description: "Managed dependencies for serverless" + - pattern: "@F\\.pandas_udf|pandas_udf" + min_count: 1 + description: "Pandas UDF for Faker parallelism" + - pattern: "device_id" + min_count: 3 + description: "FK in devices and readings (multiple references)" + - pattern: "\\.write.*saveAsTable|saveAsTable" + min_count: 2 + description: "Delta table output for both tables" + - pattern: "CREATE SCHEMA IF NOT EXISTS" + min_count: 1 + description: "Infrastructure creation in script" + - pattern: "sensor_devices|sensor_readings" + min_count: 2 + description: "Both sensor tables present" + - pattern: "temperature|pressure|vibration|humidity" + min_count: 4 + description: "All device types present" + - pattern: "lognormal" + min_count: 1 + description: "Log-normal distribution for vibration sensor values" + - pattern: "mode.*overwrite|mode.*append" + min_count: 1 + description: "Write mode for streaming support" + guidelines: + - "Must use DatabricksSession.builder.serverless(True).getOrCreate()" + - "Must use Spark + Faker + Pandas UDFs approach" + - "Must maintain referential integrity between devices and readings" + - "Must use Delta tables (saveAsTable) not file formats" + - "sensor_readings should support incremental batch processing" + - "Vibration should use log-normal for occasional spikes" + metadata: + category: happy_path + difficulty: medium + source: 
interactive_execution + execution_date: '2026-03-03' + execution_verified: true + verified_output: + sensor_devices_rows: 500 + sensor_readings_rows: 10013 + device_type_distribution: + temperature: 147 + pressure: 179 + vibration: 140 + humidity: 34 + quality_flag_distribution: + good: 9008 + warning: 979 + error: 26 + orphan_readings: 0 + tags: + - databricks-connect + - serverless + - large + - delta + - 2-tables + - pandas-udf + - referential-integrity + - streaming + - incremental + - iot + - manufacturing + - executed diff --git a/.test/skills/databricks-synthetic-data-gen/manifest.yaml b/.test/skills/databricks-synthetic-data-gen/manifest.yaml new file mode 100644 index 00000000..330f5de7 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-gen/manifest.yaml @@ -0,0 +1,36 @@ +skill_name: databricks-synthetic-data-gen +description: Tests for Databricks synthetic data generation skill covering Spark + Faker + Pandas UDFs, execution methods, output formats, and referential integrity +version: 1.0.0 + +scorers: + enabled: + - python_syntax # Check Python code blocks for syntax errors + - no_hallucinated_apis # Detect deprecated/wrong APIs + - pattern_adherence # Regex match against expected patterns + - expected_facts_present # Check if required facts mentioned + + llm_scorers: + - Safety + - guidelines_from_expectations + + default_guidelines: + - "Response must generate complete, runnable Python code" + - "Code must use the execution method specified in the prompt" + - "Code must save data in the output format specified" + +quality_gates: + syntax_valid: 1.0 # 100% - all Python syntax must be valid + pattern_adherence: 0.9 # 90% - follow expected patterns + execution_success: 0.8 # 80% - code execution success rate + no_hallucinations: 1.0 # 100% - no deprecated/invalid APIs + +trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + Write: 15 + Edit: 15 + token_budget: + max_total: 200000 + required_tools: [] + banned_tools: [] diff --git 
a/.test/src/skill_test/config.py b/.test/src/skill_test/config.py index 275e25aa..f4a42cb8 100644 --- a/.test/src/skill_test/config.py +++ b/.test/src/skill_test/config.py @@ -83,6 +83,9 @@ class MLflowConfig: tracking_uri: str = field(default_factory=lambda: _get_mlflow_tracking_uri()) experiment_name: str = field(default_factory=lambda: os.getenv("MLFLOW_EXPERIMENT_NAME", "/Shared/skill-tests")) + llm_judge_timeout: int = field( + default_factory=lambda: int(os.getenv("MLFLOW_LLM_JUDGE_TIMEOUT", "120")) + ) # seconds - timeout for LLM judge evaluation def _get_mlflow_tracking_uri() -> str: @@ -118,7 +121,7 @@ class DatabricksExecutionSettings: schema: str = field(default_factory=lambda: os.getenv("SKILL_TEST_SCHEMA", "skill_test")) # Execution settings - timeout: int = 120 # seconds + timeout: int = 240 # seconds - increased from 120s to handle larger data generation tasks preserve_context: bool = True # Reuse context across code blocks diff --git a/.test/src/skill_test/dataset.py b/.test/src/skill_test/dataset.py index 9941ef12..5c88c330 100644 --- a/.test/src/skill_test/dataset.py +++ b/.test/src/skill_test/dataset.py @@ -41,17 +41,34 @@ class YAMLDatasetSource: yaml_path: Path def load(self) -> List[EvalRecord]: - """Load records from YAML ground_truth.yaml file.""" + """Load records from YAML ground_truth.yaml file. + + Supports external response files via 'expected_response_file' field in outputs. + When present, the response is loaded from the file relative to the YAML directory. 
+ """ with open(self.yaml_path) as f: data = yaml.safe_load(f) + yaml_dir = self.yaml_path.parent + records = [] for case in data.get("test_cases", []): + outputs = case.get("outputs") + + # Load response from external file if specified + if outputs and "expected_response_file" in outputs: + response_file = yaml_dir / outputs["expected_response_file"] + if response_file.exists(): + with open(response_file) as rf: + outputs = dict(outputs) # Copy to avoid modifying original + outputs["response"] = rf.read() + del outputs["expected_response_file"] + records.append( EvalRecord( id=case["id"], inputs=case["inputs"], - outputs=case.get("outputs"), + outputs=outputs, expectations=case.get("expectations"), metadata=case.get("metadata", {}), ) diff --git a/.test/src/skill_test/grp/executor.py b/.test/src/skill_test/grp/executor.py index 5cd393bb..6f8dedee 100644 --- a/.test/src/skill_test/grp/executor.py +++ b/.test/src/skill_test/grp/executor.py @@ -1,6 +1,7 @@ """Execute code blocks from skill responses to verify they work.""" import ast +import json import re import time import yaml @@ -192,6 +193,26 @@ def verify_yaml_syntax(code: str) -> ExecutionResult: ) +def verify_json_syntax(code: str) -> ExecutionResult: + """Verify JSON syntax is valid.""" + start_time = time.time() + try: + json.loads(code) + return ExecutionResult( + success=True, + output="JSON syntax valid", + error=None, + execution_time_ms=(time.time() - start_time) * 1000, + ) + except json.JSONDecodeError as e: + return ExecutionResult( + success=False, + output="", + error=f"JSON syntax error: {e.msg} at line {e.lineno}, column {e.colno}", + execution_time_ms=(time.time() - start_time) * 1000, + ) + + def verify_bash_structure(code: str) -> ExecutionResult: """Verify bash code structure (basic validation for examples).""" # For bash examples, just check that it's not empty and looks like shell commands @@ -220,6 +241,8 @@ def execute_code_blocks(response: str) -> Tuple[int, int, List[Dict[str, Any]]]: 
result = verify_sql_structure(block.code) elif block.language in ("yaml", "yml"): result = verify_yaml_syntax(block.code) + elif block.language == "json": + result = verify_json_syntax(block.code) elif block.language in ("bash", "sh", "shell"): result = verify_bash_structure(block.code) else: @@ -528,6 +551,16 @@ def execute_code_blocks_on_databricks( mcp_execute_sql, mcp_get_best_warehouse, ) + elif block.language == "json": + # JSON blocks are validated locally (e.g., job definitions) + json_result = verify_json_syntax(block.code) + result = DatabricksExecutionResult( + success=json_result.success, + output=json_result.output, + error=json_result.error, + execution_time_ms=json_result.execution_time_ms, + execution_mode="local", + ) else: # Skip unknown languages continue diff --git a/.test/src/skill_test/runners/evaluate.py b/.test/src/skill_test/runners/evaluate.py index 1dff1009..212dd92a 100644 --- a/.test/src/skill_test/runners/evaluate.py +++ b/.test/src/skill_test/runners/evaluate.py @@ -154,6 +154,7 @@ def evaluate_skill( config: Optional[SkillTestConfig] = None, run_name: Optional[str] = None, filter_category: Optional[str] = None, + timeout: Optional[int] = None, ) -> Dict[str, Any]: """ Evaluate a skill using pre-computed outputs (Pattern 2). 
@@ -163,6 +164,7 @@ def evaluate_skill( config: Configuration (uses defaults if None) run_name: MLflow run name filter_category: Filter test cases by category + timeout: Timeout in seconds for LLM judge evaluation (overrides config) Returns: Evaluation results dict with metrics and run_id @@ -170,6 +172,9 @@ def evaluate_skill( if config is None: config = SkillTestConfig() + # Use provided timeout or fall back to config + eval_timeout = timeout if timeout is not None else config.mlflow.llm_judge_timeout + setup_mlflow(config) # Load ground truth @@ -192,13 +197,19 @@ def evaluate_skill( else: scorers = get_default_scorers() - # Run evaluation + # Run evaluation with timeout with mlflow.start_run(run_name=run_name or f"{skill_name}_eval"): mlflow.set_tags( - {"skill_name": skill_name, "test_count": len(eval_data), "filter_category": filter_category or "all"} + { + "skill_name": skill_name, + "test_count": len(eval_data), + "filter_category": filter_category or "all", + "timeout_seconds": eval_timeout, + } ) # No predict_fn - using pre-computed outputs + # Run evaluation directly - timeout is handled via signal alarm on Unix results = mlflow.genai.evaluate(data=eval_data, scorers=scorers) return { diff --git a/.test/src/skill_test/scorers/routing.py b/.test/src/skill_test/scorers/routing.py index 1a03d698..fad45033 100644 --- a/.test/src/skill_test/scorers/routing.py +++ b/.test/src/skill_test/scorers/routing.py @@ -52,7 +52,7 @@ "rest api", ], "databricks-jobs": ["job", "workflow", "task", "schedule", "trigger"], - "databricks-synthetic-data-generation": [ + "databricks-synthetic-data-gen": [ "synthetic data", "fake data", "generate data", diff --git a/.test/tests/test_scorers.py b/.test/tests/test_scorers.py index 66a39dbf..de5b0c09 100644 --- a/.test/tests/test_scorers.py +++ b/.test/tests/test_scorers.py @@ -52,10 +52,10 @@ def test_detect_mlflow_evaluation(self): assert "databricks-mlflow-evaluation" in skills def test_detect_synthetic_data(self): - """Test 
detection of databricks-synthetic-data-generation skill.""" + """Test detection of databricks-synthetic-data-gen skill.""" prompt = "Generate synthetic data for testing" skills = detect_skills_from_prompt(prompt) - assert "databricks-synthetic-data-generation" in skills + assert "databricks-synthetic-data-gen" in skills def test_detect_agent_bricks(self): """Test detection of databricks-agent-bricks skill.""" @@ -175,7 +175,7 @@ def test_all_skills_have_triggers(self): "databricks-asset-bundles", "databricks-python-sdk", "databricks-jobs", - "databricks-synthetic-data-generation", + "databricks-synthetic-data-gen", "databricks-mlflow-evaluation", "databricks-agent-bricks", "databricks-lakebase-provisioned", diff --git a/databricks-builder-app/.env.example b/databricks-builder-app/.env.example index c95a818f..f50ed4b6 100644 --- a/databricks-builder-app/.env.example +++ b/databricks-builder-app/.env.example @@ -53,10 +53,10 @@ DATABRICKS_MODEL_MINI=databricks-gemini-3-flash # Skills Configuration # ============================================================================= # Skills to include (comma-separated list of skill folder names) -ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation +ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation # Optional: Add additional skills (example with databricks- prefixed skills) -# ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation +# ENABLED_SKILLS=databricks-agent-bricks,databricks-python-sdk,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation # Test mode: only enable Skill tool (useful for debugging) 
SKILLS_ONLY_MODE=false diff --git a/databricks-builder-app/README.md b/databricks-builder-app/README.md index b6a43135..42031cee 100644 --- a/databricks-builder-app/README.md +++ b/databricks-builder-app/README.md @@ -179,7 +179,7 @@ Skills include: - **databricks-python-sdk**: Python SDK patterns - **databricks-mlflow-evaluation**: MLflow evaluation and trace analysis - **databricks-spark-declarative-pipelines**: Spark Declarative Pipelines (SDP) development -- **databricks-synthetic-data-generation**: Creating test datasets +- **databricks-synthetic-data-gen**: Creating test datasets ### 5. Project Persistence @@ -329,7 +329,7 @@ Skills are loaded from `../databricks-skills/` and filtered by the `ENABLED_SKIL - `databricks-python-sdk`: Patterns for using the Databricks Python SDK - `databricks-spark-declarative-pipelines`: SDP/DLT pipeline development -- `databricks-synthetic-data-generation`: Creating test datasets +- `databricks-synthetic-data-gen`: Creating test datasets - `databricks-app-apx`: Full-stack apps with React (APX framework) - `databricks-app-python`: Python apps with Dash, Streamlit, Flask diff --git a/databricks-builder-app/app.yaml.example b/databricks-builder-app/app.yaml.example index 4f77f7a7..8a5c0207 100644 --- a/databricks-builder-app/app.yaml.example +++ b/databricks-builder-app/app.yaml.example @@ -30,7 +30,7 @@ env: # ============================================================================= # Comma-separated list of skills to enable - name: ENABLED_SKILLS - value: "databricks-asset-bundles,databricks-agent-bricks,databricks-aibi-dashboards,databricks-app-apx,databricks-app-python,databricks-config,databricks-docs,databricks-jobs,databricks-python-sdk,databricks-unity-catalog,databricks-mlflow-evaluation,databricks-spark-declarative-pipelines,databricks-synthetic-data-generation,databricks-unstructured-pdf-generation" + value: 
"databricks-asset-bundles,databricks-agent-bricks,databricks-aibi-dashboards,databricks-app-apx,databricks-app-python,databricks-config,databricks-docs,databricks-jobs,databricks-python-sdk,databricks-unity-catalog,databricks-mlflow-evaluation,databricks-spark-declarative-pipelines,databricks-synthetic-data-gen,databricks-unstructured-pdf-generation" - name: SKILLS_ONLY_MODE value: "false" diff --git a/databricks-builder-app/client/src/pages/DocPage.tsx b/databricks-builder-app/client/src/pages/DocPage.tsx index f8b7b29c..b7ee35ec 100644 --- a/databricks-builder-app/client/src/pages/DocPage.tsx +++ b/databricks-builder-app/client/src/pages/DocPage.tsx @@ -92,7 +92,7 @@ function OverviewSection() { Skills explain how to do things and reference the tools from databricks-tools-core.

- {['databricks-asset-bundles/', 'databricks-app-apx/', 'databricks-app-python/', 'databricks-python-sdk/', 'databricks-mlflow-evaluation/', 'databricks-spark-declarative-pipelines/', 'databricks-synthetic-data-generation/'].map((skill) => ( + {['databricks-asset-bundles/', 'databricks-app-apx/', 'databricks-app-python/', 'databricks-python-sdk/', 'databricks-mlflow-evaluation/', 'databricks-spark-declarative-pipelines/', 'databricks-synthetic-data-gen/'].map((skill) => ( {skill} @@ -204,7 +204,7 @@ function OverviewSection() {

Read Skill

- Claude reads databricks-synthetic-data-generation/ skill to learn best practices + Claude reads databricks-synthetic-data-gen/ skill to learn best practices

{['Non-linear distributions', 'Referential integrity', 'Time patterns', 'Row coherence'].map((item) => ( diff --git a/databricks-builder-app/server/services/system_prompt.py b/databricks-builder-app/server/services/system_prompt.py index 5b7b4fef..fd18f6cf 100644 --- a/databricks-builder-app/server/services/system_prompt.py +++ b/databricks-builder-app/server/services/system_prompt.py @@ -5,7 +5,7 @@ # Mapping of user request patterns to skill names for the selection guide. # Only entries whose skill is enabled will be included in the prompt. _SKILL_GUIDE_ENTRIES = [ - ('Generate data, synthetic data, fake data, test data', 'databricks-synthetic-data-generation'), + ('Generate data, synthetic data, fake data, test data', 'databricks-synthetic-data-gen'), ('Pipeline, ETL, bronze/silver/gold, data transformation', 'databricks-spark-declarative-pipelines'), ('Dashboard, visualization, BI, charts', 'databricks-aibi-dashboards'), ('Job, workflow, schedule, automation', 'databricks-jobs'), diff --git a/databricks-skills/README.md b/databricks-skills/README.md index afaccd9d..29a79ae8 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -58,7 +58,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ - **databricks-iceberg** - Apache Iceberg tables (Managed/Foreign), UniForm, Iceberg REST Catalog, Iceberg Clients Interoperability - **databricks-spark-declarative-pipelines** - SDP (formerly DLT) in SQL/Python - **databricks-jobs** - Multi-task workflows, triggers, schedules -- **databricks-synthetic-data-generation** - Realistic test data with Faker +- **databricks-synthetic-data-gen** - Realistic test data with Faker ### 🚀 Development & Deployment - **databricks-asset-bundles** - DABs for multi-environment deployments diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 4aff7acb..04be7dad 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md 
+++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -28,7 +28,7 @@ Before creating Agent Bricks, ensure you have the required data: ### For Genie Spaces - **See the `databricks-genie` skill** for comprehensive Genie Space guidance - Tables in Unity Catalog with the data to explore -- Generate raw data using the `databricks-synthetic-data-generation` skill +- Generate raw data using the `databricks-synthetic-data-gen` skill - Create tables using the `databricks-spark-declarative-pipelines` skill ### For Supervisor Agents @@ -119,7 +119,7 @@ Before creating Agent Bricks, generate the required source data: **For Genie (SQL exploration)**: ``` -1. Use `databricks-synthetic-data-generation` skill to create raw parquet data +1. Use `databricks-synthetic-data-gen` skill to create raw parquet data 2. Use `databricks-spark-declarative-pipelines` skill to create bronze/silver/gold tables ``` @@ -199,7 +199,7 @@ manage_mas( - **[databricks-genie](../databricks-genie/SKILL.md)** - Comprehensive Genie Space creation, curation, and Conversation API guidance - **[databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md)** - Generate synthetic PDFs to feed into Knowledge Assistants -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Create raw data for Genie Space tables +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Create raw data for Genie Space tables - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - Deploy custom agent endpoints used as MAS agents - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Build vector indexes for RAG applications paired with KAs diff --git a/databricks-skills/databricks-genie/SKILL.md 
b/databricks-skills/databricks-genie/SKILL.md index 576771da..e5b32b6e 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -107,7 +107,7 @@ Before creating a Genie Space: ### Creating Tables Use these skills in sequence: -1. `databricks-synthetic-data-generation` - Generate raw parquet files +1. `databricks-synthetic-data-gen` - Generate raw parquet files 2. `databricks-spark-declarative-pipelines` - Create bronze/silver/gold tables ## Common Issues @@ -121,6 +121,6 @@ Use these skills in sequence: ## Related Skills - **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Use Genie Spaces as agents inside Supervisor Agents -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate raw parquet data to populate tables for Genie +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate raw parquet data to populate tables for Genie - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Manage the catalogs, schemas, and tables Genie queries diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md index 8549d6bd..225efe0e 100644 --- a/databricks-skills/databricks-genie/spaces.md +++ b/databricks-skills/databricks-genie/spaces.md @@ -163,7 +163,7 @@ The tool finds the existing space by name and updates it. ## Example End-to-End Workflow -1. **Generate synthetic data** using `databricks-synthetic-data-generation` skill: +1. **Generate synthetic data** using `databricks-synthetic-data-gen` skill: - Creates parquet files in `/Volumes/catalog/schema/raw_data/` 2. 
**Create tables** using `databricks-spark-declarative-pipelines` skill: diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index 48a698f8..60afef0b 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -573,5 +573,5 @@ For advanced configuration options (development mode, continuous pipelines, cust - **[databricks-jobs](../databricks-jobs/SKILL.md)** - for orchestrating and scheduling pipeline runs - **[databricks-asset-bundles](../databricks-asset-bundles/SKILL.md)** - for multi-environment deployment of pipeline projects -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - for generating test data to feed into pipelines +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - for generating test data to feed into pipelines - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - for catalog/schema/volume management and governance diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md new file mode 100644 index 00000000..5bd95e58 --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md @@ -0,0 +1,258 @@ +--- +name: databricks-synthetic-data-gen +description: "Generate realistic synthetic data using Spark + Faker (strongly recommended). Supports serverless execution, multiple output formats (Parquet/JSON/CSV/Delta), and scales from thousands to millions of rows. For small datasets (<10K rows), can optionally generate locally and upload to volumes. Use when user mentions 'synthetic data', 'test data', 'generate data', 'demo dataset', 'Faker', or 'sample data'." +--- + +> Catalog and schema are **always user-supplied** — never default to any value. If the user hasn't provided them, ask. 
For any UC write, **always create the schema if it doesn't exist** before writing data. + +# Databricks Synthetic Data Generation + +Generate realistic, story-driven synthetic data for Databricks using **Spark + Faker + Pandas UDFs** (strongly recommended). + +## Quick Reference + +| Topic | Guide | When to Use | +|-------|-------|-------------| +| **Setup & Execution** | [references/1-setup-and-execution.md](references/1-setup-and-execution.md) | Setting up environment, choosing compute, installing dependencies | +| **Generation Approaches** | [references/2-generation-approaches.md](references/2-generation-approaches.md) | Choosing Spark UDFs vs Polars local, writing generation code | +| **Data Patterns** | [references/3-data-patterns.md](references/3-data-patterns.md) | Creating realistic distributions, referential integrity, time patterns | +| **Domain Guidance** | [references/4-domain-guidance.md](references/4-domain-guidance.md) | E-commerce, IoT, financial, support/CRM domain patterns | +| **Output Formats** | [references/5-output-formats.md](references/5-output-formats.md) | Choosing output format, saving to volumes/tables | +| **Troubleshooting** | [references/6-troubleshooting.md](references/6-troubleshooting.md) | Fixing errors, debugging issues | +| **Example Script** | [scripts/generate_synthetic_data.py](scripts/generate_synthetic_data.py) | Complete Spark + Pandas UDF example | + +## Package Manager + +Prefer `uv` for all Python operations. Fall back to `pip` only if `uv` is not available. + +```bash +# Preferred +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays +uv run python generate_data.py + +# Fallback if uv not available +pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays +python generate_data.py +``` + +## Critical Rules + +1. **Strongly prefer to use Spark + Faker + Pandas UDFs** for data generation (scalable, parallel) +2. 
**If user specifies local** then use Polars locally instead of Spark, but suggest Spark if > 30,000 rows. +3. **Present a plan for user approval** before generating any code +4. **Ask for catalog/schema** - do not default +5. **Use serverless compute** unless user explicitly requests classic cluster +6. **Generate raw data only** - no pre-aggregated fields (unless user requests) +7. **Create master tables first** - then generate related tables with valid FKs +8. **NEVER use `.cache()` or `.persist()` with serverless compute** - these operations are NOT supported and will fail with `AnalysisException: PERSIST TABLE is not supported on serverless compute`. Instead, write master tables to Delta first, then read them back for FK joins. + +## Generation Planning Workflow + +**Before generating any code, you MUST present a plan for user approval.** + +### ⚠️ MUST DO: Confirm Catalog Before Proceeding + +**You MUST explicitly ask the user which catalog to use.** Do not assume or proceed without confirmation. + +Example prompt to user: +> "Which Unity Catalog should I use for this data?" + +When presenting your plan, always show the selected catalog prominently: +``` +📍 Output Location: catalog_name.schema_name + Volume: /Volumes/catalog_name/schema_name/raw_data/ +``` + +This makes it easy for the user to spot and correct if needed. + +### Step 1: Gather Requirements + +Ask the user about: +- **Catalog/Schema** - Which catalog to use? +- What domain/scenario? (e-commerce, support tickets, IoT sensors, etc.) +- How many tables? What relationships between them? +- Approximate row counts per table? +- Output format preference? (Delta table is default) + +### Step 2: Present Table Specification + +Show a clear specification with **YOUR ASSUMPTIONS surfaced**. 
Always start with the output location: + +``` +📍 Output Location: {user_catalog}.ecommerce_demo + Volume: /Volumes/{user_catalog}/ecommerce_demo/raw_data/ +``` + +| Table | Columns | Rows | Key Assumptions | +|-------|---------|------|-----------------| +| customers | customer_id, name, email, tier, region | 5,000 | Tier: Free 60%, Pro 30%, Enterprise 10% | +| orders | order_id, customer_id (FK), amount, status | 15,000 | Enterprise customers generate 5x more orders | + +**Assumptions I'm making:** +- Amount distribution: log-normal by tier (Enterprise ~$1800, Pro ~$245, Free ~$55) +- Status: 65% delivered, 15% shipped, 10% processing, 5% pending, 5% cancelled + +**Ask user**: "Does this look correct? Any adjustments to the catalog, tables, or distributions?" + +### Step 3: Ask About Data Features + +- [x] Skew (non-uniform distributions) - **Enabled by default** +- [x] Joins (referential integrity) - **Enabled by default** +- [ ] Bad data injection (for data quality testing) +- [ ] Multi-language text +- [ ] Incremental mode (append vs overwrite) + +### Pre-Generation Checklist + +- [ ] **Catalog confirmed** - User explicitly approved which catalog to use +- [ ] Output location shown prominently in plan (easy to spot/change) +- [ ] Table specification shown and approved +- [ ] Assumptions about distributions confirmed +- [ ] User confirmed compute preference (serverless recommended) +- [ ] Data features selected + +**Do NOT proceed to code generation until user approves the plan, including the catalog.** + +## Quick Start: Spark + Faker + Pandas UDFs + +```python +from databricks.connect import DatabricksSession, DatabricksEnv +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType +import pandas as pd +import numpy as np + +# Setup with managed dependencies (databricks-connect 16.4+) +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy") +spark = 
DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + +# Define Pandas UDFs +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + from faker import Faker + fake = Faker() + return pd.Series([fake.name() for _ in range(len(ids))]) + +@F.pandas_udf(DoubleType()) +def generate_amount(tiers: pd.Series) -> pd.Series: + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(7.5, 0.8))) + elif tier == "Pro": + amounts.append(float(np.random.lognormal(5.5, 0.7))) + else: + amounts.append(float(np.random.lognormal(4.0, 0.6))) + return pd.Series(amounts) + +# Generate customers +customers_df = ( + spark.range(0, 10000, numPartitions=16) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_name(F.col("id")).alias("name"), + F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + ) + .withColumn("arr", generate_amount(F.col("tier"))) +) + +# Save to Unity Catalog +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +customers_df.write.mode("overwrite").parquet(f"/Volumes/{CATALOG}/{SCHEMA}/raw_data/customers") +``` + +## Common Patterns + +### Weighted Tier Distribution +```python +F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier") +``` + +### Log-Normal Amounts (Realistic Pricing) +```python +@F.pandas_udf(DoubleType()) +def generate_amount(tiers: pd.Series) -> pd.Series: + return pd.Series([ + float(np.random.lognormal({"Enterprise": 7.5, "Pro": 5.5, "Free": 4.0}[t], 0.7)) + for t in tiers + ]) +``` + +### Date Range (Last 6 Months) +```python +from datetime import datetime, timedelta +END_DATE = datetime.now() +START_DATE = END_DATE - timedelta(days=180) + +F.date_add(F.lit(START_DATE.date()), (F.rand() * 180).cast("int")).alias("order_date") 
+``` + +### Infrastructure Creation +```python +# Always in script - assume catalog exists +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +``` + +## Execution Modes + +| Mode | Best For | Setup | +|------|----------|-------| +| **DB Connect 16.4+ Serverless** | Local dev, Python 3.12+ | `DatabricksEnv().withDependencies(...)` | +| **Serverless Job** | Production, scheduled | Job with `environments` parameter | +| **Classic Cluster** | Fallback only | Use Databricks CLI to install libraries. `databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` | + +See [references/1-setup-and-execution.md](references/1-setup-and-execution.md) for detailed setup instructions. + +## Output Formats + +| Format | Use Case | Code | +|--------|----------|------| +| **Parquet** (default) | SDP pipeline input | `df.write.parquet(path)` | +| **JSON** | Log-style ingestion | `df.write.json(path)` | +| **CSV** | Legacy systems | `df.write.option("header", "true").csv(path)` | +| **Delta Table** | Direct analytics | `df.write.saveAsTable("catalog.schema.table")` | + +See [references/5-output-formats.md](references/5-output-formats.md) for detailed options. 
+ +## Best Practices Summary + +### Execution +- Use serverless (instant start, no cluster wait) +- Ask for catalog/schema +- Present plan before generating + +### Data Generation +- **Spark + Faker + Pandas UDFs** for all cases +- Master tables first, then related tables with valid FKs +- Non-linear distributions (log-normal, Pareto, exponential) +- Time patterns (weekday/weekend, holidays, seasonality) +- Row coherence (correlated attributes) + +### Output +- Create infrastructure in script (`CREATE SCHEMA/VOLUME IF NOT EXISTS`) +- Do NOT create catalogs - assume they exist +- Delta tables as default + +## Related Skills + +- **databricks-unity-catalog** - Managing catalogs, schemas, and volumes +- **databricks-asset-bundles** - DABs for production deployment + +## Common Issues + +| Issue | Solution | +|-------|----------| +| `ModuleNotFoundError: faker` | See [references/1-setup-and-execution.md](references/1-setup-and-execution.md) | +| Faker UDF is slow | Use `pandas_udf` for batch processing | +| Out of memory | Increase `numPartitions` in `spark.range()` | +| Referential integrity errors | Write master table to Delta first, read back for FK joins | +| `PERSIST TABLE is not supported on serverless` | **NEVER use `.cache()` or `.persist()` with serverless** - write to Delta table first, then read back | +| `F.window` vs `Window` confusion | Use `from pyspark.sql.window import Window` for `row_number()`, `rank()`, etc. `F.window` is for streaming only. | + +See [references/6-troubleshooting.md](references/6-troubleshooting.md) for full troubleshooting guide. 
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md b/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md new file mode 100644 index 00000000..3ec36fbc --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/1-setup-and-execution.md @@ -0,0 +1,278 @@ +# Setup and Execution Guide + +This guide covers all execution modes for synthetic data generation, organized by Databricks Connect version and Python version. + +## Quick Decision Matrix + +| Your Environment | Recommended Approach | +|------------------|---------------------| +| Python 3.12+ with databricks-connect >= 16.4 | DatabricksEnv with withDependencies API | +| Python 3.10/3.11 with older databricks-connect | Serverless job with environments parameter | +| Classic compute (fallback only) | Manual cluster setup | + +## Option 1: Databricks Connect 16.4+ with Serverless (Recommended) + +**Best for:** Python 3.12+, local development with serverless compute + +**Install locally:** +```bash +# Preferred +uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays + +# Fallback if uv not available +pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays +``` + +**Configure ~/.databrickscfg:** +```ini +[DEFAULT] +host = https://your-workspace.cloud.databricks.com/ +serverless_compute_id = auto +auth_type = databricks-cli +``` + +**In your script:** +```python +from databricks.connect import DatabricksSession, DatabricksEnv + +# Pass dependencies as simple package name strings +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") + +# Create session with managed dependencies +spark = ( + DatabricksSession.builder + .withEnvironment(env) + .serverless(True) + .getOrCreate() +) + +# Spark operations now execute on serverless compute with managed dependencies +``` + +**Version Detection (if needed):** +```python +import importlib.metadata + +def 
get_databricks_connect_version(): + """Get databricks-connect version as (major, minor) tuple.""" + try: + version_str = importlib.metadata.version('databricks-connect') + parts = version_str.split('.') + return (int(parts[0]), int(parts[1])) + except Exception: + return None + +db_version = get_databricks_connect_version() +if db_version and db_version >= (16, 4): + # Use DatabricksEnv with withDependencies + pass +``` + +**Benefits:** +- Instant start, no cluster wait +- Local debugging and fast iteration +- Automatic dependency management +- Edit file, re-run immediately + +## Option 2: Older Databricks Connect or Python < 3.12 + +**Best for:** Python 3.10/3.11, databricks-connect 15.1-16.3 + +`DatabricksEnv()` and `withEnvironment()` are NOT available in older versions. Use serverless jobs with environments parameter instead. + +### Serverless Job Configuration Requirements + +**MUST use `"client": "4"` in the Environment Spec:** + +```json +{ + "environments": [{ + "environment_key": "datagen_env", + "spec": { + "client": "4", + "dependencies": ["faker", "numpy", "pandas"] + } + }] +} +``` + +> **Note:** Using `"client": "1"` will fail with environment configuration errors. 
+
+### Script Deployment
+
+Deploy Python files (.py) to the workspace for serverless jobs:
+
+```bash
+databricks workspace import /Users/<username>@databricks.com/scripts/my_script.py \
+  --file ./my_script.py --format AUTO
+
+databricks workspace list /Users/<username>@databricks.com/scripts/
+```
+
+**Job config must reference the workspace path:**
+
+```json
+{
+  "spark_python_task": {
+    "python_file": "/Users/<username>@databricks.com/scripts/my_script.py"
+  },
+  "environment_key": "datagen_env"
+}
+```
+
+**DABs bundle configuration:**
+```yaml
+# databricks.yml
+bundle:
+  name: synthetic-data-gen
+
+resources:
+  jobs:
+    generate_data:
+      name: "Generate Synthetic Data"
+      tasks:
+        - task_key: generate
+          spark_python_task:
+            python_file: ./src/generate_data.py
+          environment_key: default
+
+environments:
+  default:
+    spec:
+      client: "4"
+      dependencies:
+        - faker
+        - numpy
+        - pandas
+        - holidays
+```
+
+## Option 3: Classic Cluster
+
+**Use when:** Serverless unavailable, or specific cluster features needed (GPUs, custom init scripts)
+
+### Step 1: Check Python Version Compatibility
+
+Pandas UDFs require matching Python minor versions between local and cluster.
+
+```bash
+# Check local Python
+uv run python --version  # or: python --version
+
+# Check cluster DBR version → Python version
+# DBR 17.x = Python 3.12
+# DBR 15.4 LTS = Python 3.11
+# DBR 14.3 LTS = Python 3.10
+databricks clusters get <cluster-id> | grep spark_version
+```
+
+### Step 2a: If Versions Match → Use Databricks Connect
+
+```bash
+# Install matching databricks-connect version (must match DBR major.minor)
+uv pip install "databricks-connect==17.3.*" faker numpy pandas holidays
+```
+
+```bash
+# Install libraries on cluster
+databricks libraries install --json '{"cluster_id": "<cluster-id>", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'
+
+# Wait for INSTALLED status
+databricks libraries cluster-status <cluster-id>
+```
+
+```python
+# Run locally via Databricks Connect
+from databricks.connect import DatabricksSession
+
+spark = DatabricksSession.builder.clusterId("<cluster-id>").getOrCreate()
+# Your Spark code runs on the cluster
+```
+
+### Step 2b: If Versions Don't Match → Submit as Job
+
+**Ask user for approval before submitting.** Example prompt:
+> "Your local Python (3.11) doesn't match the cluster (3.12). Pandas UDFs require matching versions. Should I submit this as a job to run directly on the cluster instead?"
+
+```bash
+# Upload script to workspace
+databricks workspace import /Users/you@company.com/scripts/generate_data.py \
+  --file generate_data.py --format AUTO --overwrite
+
+# Submit job to run on cluster
+databricks jobs submit --json '{
+  "run_name": "Generate Data",
+  "tasks": [{
+    "task_key": "generate",
+    "existing_cluster_id": "<cluster-id>",
+    "spark_python_task": {
+      "python_file": "/Users/you@company.com/scripts/generate_data.py"
+    }
+  }]
+}'
+```
+
+### Classic Cluster Decision Flow
+
+```
+Local Python == Cluster Python?
+  ├─ YES → Install libs on cluster, run via Databricks Connect
+  └─ NO → Ask user: "Submit as job instead?"
+ └─ Upload script + submit job +``` + +## Required Libraries + +Standard libraries for generating realistic synthetic data: + +| Library | Purpose | Required For | +|---------|---------|--------------| +| **faker** | Realistic names, addresses, emails, companies | Text data generation | +| **numpy** | Statistical distributions | Non-linear distributions | +| **pandas** | Data manipulation, Pandas UDFs | Spark UDF definitions | +| **holidays** | Country-specific holiday calendars | Time-based patterns | + +## Environment Detection Pattern + +Use this pattern to auto-detect environment and choose the right session creation: + +```python +import os +import importlib.metadata + +def is_databricks_runtime(): + """Check if running on Databricks Runtime vs locally.""" + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +def get_databricks_connect_version(): + """Get databricks-connect version as (major, minor) tuple or None.""" + try: + version_str = importlib.metadata.version('databricks-connect') + parts = version_str.split('.') + return (int(parts[0]), int(parts[1])) + except Exception: + return None + +on_runtime = is_databricks_runtime() +db_version = get_databricks_connect_version() + +# Use DatabricksEnv if: locally + databricks-connect >= 16.4 +use_auto_dependencies = (not on_runtime) and db_version and db_version >= (16, 4) + +if use_auto_dependencies: + from databricks.connect import DatabricksSession, DatabricksEnv + env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") + spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() +else: + from databricks.connect import DatabricksSession + spark = DatabricksSession.builder.serverless(True).getOrCreate() +``` + +## Common Setup Issues + +| Issue | Solution | +|-------|----------| +| `ModuleNotFoundError: faker` | Install dependencies per execution mode above | +| `DatabricksEnv not found` | Upgrade to databricks-connect >= 16.4 or use job with environments | 
+| `serverless_compute_id` error | Add `serverless_compute_id = auto` to ~/.databrickscfg | +| Classic cluster startup slow | Use serverless instead (instant start) | diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md b/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md new file mode 100644 index 00000000..5d6feeca --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/2-generation-approaches.md @@ -0,0 +1,205 @@ +# Data Generation Approaches + +Choose your approach based on scale and requirements. **Spark + Faker + Pandas UDFs is strongly preferred** for all cases. + +## Decision Table + +| Scenario | Recommended Approach | +|----------|---------------------| +| **Default - any data generation** | **Spark + Faker + Pandas UDFs** | +| Large datasets (100K+ rows) | **Spark + Faker + Pandas UDFs** | +| Medium datasets (10K-100K rows) | **Spark + Faker + Pandas UDFs** | +| Small datasets (<10K rows) | **Spark + Faker + Pandas UDFs** (or Polars if user prefers local) | + +**Rule:** Always use Spark + Faker + Pandas UDFs unless user explicitly requests local generation for <10K rows. 
+ +--- + +## Approach 1: Spark + Faker + Pandas UDFs (Strongly Preferred) + +**Best for:** All dataset sizes, direct write to Unity Catalog + +**Why this approach:** +- Scales from thousands to millions of rows +- Parallel execution via Spark +- Direct integration with Unity Catalog +- No intermediate files or uploads needed +- Works with serverless and classic compute + +### Basic Pattern + +```python +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType +from faker import Faker +import pandas as pd +import numpy as np + +# Define Pandas UDFs for Faker data (batch processing for parallelism) +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.name() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_company(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.company() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_email(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.email() for _ in range(len(ids))]) + +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + """Generate amount based on tier using log-normal distribution.""" + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) + return pd.Series(amounts) +``` + +### Generate Data with Spark + Pandas UDFs + +```python +# Configuration +N_CUSTOMERS = 100_000 +PARTITIONS = 16 # Adjust based on data size: 8 for <100K, 32 for 1M+ +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Generate customers with Spark + Pandas UDFs +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("CUST-"), 
F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_name(F.col("id")).alias("name"), + fake_company(F.col("id")).alias("company"), + fake_email(F.col("id")).alias("email"), + F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + F.when(F.rand() < 0.4, "North") + .when(F.rand() < 0.65, "South") + .when(F.rand() < 0.85, "East") + .otherwise("West").alias("region"), + ) +) + +# Add tier-based amount +customers_df = customers_df.withColumn("arr", generate_lognormal_amount(F.col("tier"))) + +# Write directly to Unity Catalog volume +customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") +``` + +### Partitioning Strategy + +| Data Size | Recommended Partitions | +|-----------|----------------------| +| < 100K rows | 8 partitions | +| 100K - 500K rows | 16 partitions | +| 500K - 1M rows | 32 partitions | +| 1M+ rows | 64+ partitions | + +--- + +## Approach 2: Polars + Local Generation + Upload (Secondary Option) + +**Use only when:** Dataset <10K rows AND user explicitly prefers local generation + +**Why this approach exists:** +- No Spark overhead for tiny datasets +- Quick prototyping in local environment +- When Databricks Connect not available + +**Limitations:** +- Doesn't scale past ~100K rows +- Requires manual upload step +- No direct Unity Catalog integration + +### Install Local Dependencies + +```bash +# Preferred: use uv for fast, reliable installs +uv pip install polars faker numpy + +# Alternative if uv not available +pip install polars faker numpy +``` + +### Generate Locally with Polars + +```python +import polars as pl +from faker import Faker +import numpy as np + +fake = Faker() +N_CUSTOMERS = 5000 + +# Generate with Polars +customers = pl.DataFrame({ + "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], + "name": [fake.name() for _ in range(N_CUSTOMERS)], + "email": [fake.email() for _ in range(N_CUSTOMERS)], + "tier": np.random.choice(["Free", "Pro", 
"Enterprise"], N_CUSTOMERS, p=[0.6, 0.3, 0.1]).tolist(), + "region": np.random.choice(["North", "South", "East", "West"], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]).tolist(), +}) + +# Save locally +customers.write_parquet("./output/customers.parquet") +``` + +### Upload to Databricks Volume + +After generating data locally, upload to a Databricks volume: + +```bash +# Create directory in volume if needed +databricks fs mkdirs dbfs:/Volumes////source_data/ + +# Upload local data to volume +databricks fs cp -r ./output/customers.parquet dbfs:/Volumes////source_data/ +databricks fs cp -r ./output/orders.parquet dbfs:/Volumes////source_data/ +``` + +### When to Actually Use Polars + +Only recommend Polars when ALL conditions are met: +1. Dataset is < 10K rows +2. User explicitly requests local generation +3. Quick prototyping without Databricks connection + +Otherwise, **always use Spark + Faker + Pandas UDFs**. + +--- + +## Storage Destinations + +### Ask for Catalog and Schema + +Ask the user which catalog and schema to use: + +> "What catalog and schema name would you like to use?" + +### Create Infrastructure in Script + +Always create the schema and volume **inside the Python script** using `spark.sql()`: + +```python +CATALOG = "" # MUST ask user - never default +SCHEMA = "" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Note: Assume catalog exists - do NOT create it +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +``` + +**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume. 
diff --git a/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md b/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md new file mode 100644 index 00000000..351f1bd7 --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/3-data-patterns.md @@ -0,0 +1,286 @@ +# Data Patterns Guide + +Creating realistic, coherent synthetic data with Spark + Pandas UDFs. + +## 5 Key Principles + +1. **Use Spark + Faker + Pandas UDFs** for all generation +2. **Referential Integrity** - master tables first, weighted sampling +3. **Non-Linear Distributions** - log-normal, Pareto, exponential +4. **Time-Based Patterns** - weekday/weekend, holidays, seasonality +5. **Row Coherence** - correlated attributes within each row + +--- + +## Principle 1: Use Spark + Faker + Pandas UDFs + +Generate data with Spark + Faker for all use cases. Pandas UDFs provide efficient, distributed Faker calls that scale seamlessly from thousands to millions of rows. 
+ +### Define Pandas UDFs + +```python +from pyspark.sql import functions as F +from pyspark.sql.types import StringType, DoubleType +from faker import Faker +import pandas as pd +import numpy as np + +@F.pandas_udf(StringType()) +def fake_company(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.company() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_address(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.address().replace('\n', ', ') for _ in range(len(ids))]) + +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) + return pd.Series(amounts) +``` + +### Generate with Spark + +```python +# Adjust numPartitions based on scale: 8 for <100K, 32 for 1M+ +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=16) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_company(F.col("id")).alias("name"), + F.when(F.rand() < 0.6, "Free") + .when(F.rand() < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + ) +) +customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") +``` + +--- + +## Principle 2: Referential Integrity + +Generate master tables first, then iterate on them to create related tables with matching IDs. + +> **CRITICAL:** Do NOT use `.cache()` or `.persist()` with serverless compute - these operations are not supported and will fail. Instead, write master tables to Delta first, then read them back for FK joins. + +### Pattern: Weighted Sampling by Tier + +```python +from pyspark.sql.window import Window + +# 1. 
Generate customers (master table) with index for FK mapping +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS) + .select( + F.col("id").alias("customer_idx"), # Keep index for FK joins + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + F.when(F.rand(SEED) < 0.6, "Free") + .when(F.rand(SEED) < 0.9, "Pro") + .otherwise("Enterprise").alias("tier"), + ) +) + +# 2. Write to Delta table (do NOT use cache with serverless!) +customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers") + +# 3. Read back for FK lookups +customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers").select( + "customer_idx", "customer_id", "tier" +) + +# 4. Generate orders with valid foreign keys +orders_df = spark.range(0, N_ORDERS, numPartitions=PARTITIONS) + +# Map order to customer using hash-based distribution +orders_df = orders_df.select( + F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("order_id"), + (F.abs(F.hash(F.col("id"), F.lit(SEED))) % N_CUSTOMERS).alias("customer_idx"), +) + +# Join to get valid foreign keys +orders_with_fk = orders_df.join(customer_lookup, on="customer_idx", how="left") +``` + +### Anti-Pattern: Random FK Generation + +```python +# BAD - May generate non-existent customer IDs +orders_df = spark.range(0, N_ORDERS).select( + F.concat(F.lit("CUST-"), (F.rand() * 99999).cast("int")).alias("customer_id") # WRONG! +) +``` + +--- + +## Principle 3: Non-Linear Distributions + +**Never use uniform distributions** - real data is rarely uniform. 
+ +### Distribution Types + +| Distribution | Use Case | Example | +|--------------|----------|---------| +| **Log-normal** | Prices, salaries, order amounts | `np.random.lognormal(mean=4.5, sigma=0.8)` | +| **Pareto/Power law** | Popularity, wealth, page views | `(np.random.pareto(a=2.5) + 1) * 10` | +| **Exponential** | Time between events, resolution time | `np.random.exponential(scale=24)` | +| **Weighted categorical** | Status, region, tier | `np.random.choice(vals, p=[0.4, 0.3, 0.2, 0.1])` | + +### Pandas UDF for Log-Normal Amounts + +```python +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + """Generate amount based on tier using log-normal distribution.""" + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) # ~$1800 avg + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) # ~$245 avg + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) # ~$55 avg + return pd.Series(amounts) +``` + +### Anti-Pattern: Uniform Distribution + +```python +# BAD - Uniform (unrealistic) +prices = np.random.uniform(10, 1000, size=N_ORDERS) + +# GOOD - Log-normal (realistic for prices) +prices = np.random.lognormal(mean=4.5, sigma=0.8, size=N_ORDERS) +``` + +--- + +## Principle 4: Time-Based Patterns + +Add weekday/weekend effects, holidays, seasonality, and event spikes. 
+ +### Holiday and Weekday Multipliers + +```python +import holidays +from datetime import datetime, timedelta + +# Load holiday calendar +US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) + +def get_daily_multiplier(date): + """Calculate volume multiplier for a given date.""" + multiplier = 1.0 + + # Weekend drop + if date.weekday() >= 5: + multiplier *= 0.6 + + # Holiday drop (even lower than weekends) + if date in US_HOLIDAYS: + multiplier *= 0.3 + + # Q4 seasonality (higher in Oct-Dec) + multiplier *= 1 + 0.15 * (date.month - 6) / 6 + + # Incident spike (if applicable) + if INCIDENT_START <= date <= INCIDENT_END: + multiplier *= 3.0 + + # Random noise + multiplier *= np.random.normal(1, 0.1) + + return max(0.1, multiplier) +``` + +### Date Range: Last 6 Months + +Always generate data for the last ~6 months ending at the current date: + +```python +from datetime import datetime, timedelta + +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) +``` + +--- + +## Principle 5: Row Coherence + +Attributes within a row should correlate logically. 
+
+### Coherent Ticket Generation
+
+```python
+@F.pandas_udf("struct<priority:string, resolution_hours:double, csat_score:int>")
+def generate_coherent_ticket(tiers: pd.Series) -> pd.DataFrame:
+    """Generate coherent ticket where attributes correlate."""
+    results = []
+    for tier in tiers:
+        # Priority correlates with tier
+        if tier == 'Enterprise':
+            priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2])
+        else:
+            priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3])
+
+        # Resolution time correlates with priority
+        resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72}
+        resolution_hours = np.random.exponential(scale=resolution_scale[priority])
+
+        # CSAT correlates with resolution time
+        if resolution_hours < 4:
+            csat = np.random.choice([4, 5], p=[0.3, 0.7])
+        elif resolution_hours < 24:
+            csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3])
+        else:
+            csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2])
+
+        results.append({
+            "priority": priority,
+            "resolution_hours": round(resolution_hours, 1),
+            "csat_score": int(csat),
+        })
+
+    return pd.DataFrame(results)
+```
+
+### Correlation Examples
+
+| Attribute A | Attribute B | Correlation |
+|------------|-------------|-------------|
+| Customer tier | Order amount | Enterprise = higher amounts |
+| Ticket priority | Resolution time | Critical = faster resolution |
+| Resolution time | CSAT score | Faster = higher satisfaction |
+| Region | Product preference | Regional variations |
+| Time of day | Transaction type | Business hours = B2B |
+
+---
+
+## Data Volume for Aggregation
+
+Generate enough data so patterns remain visible after downstream aggregation:
+
+| Grain | Minimum Records | Rationale |
+|-------|-----------------|-----------|
+| Daily time series | 50-100/day | See trends after weekly rollup |
+| Per category | 500+ per category | Statistical significance |
+| Per customer | 5-20 events/customer | Customer-level analysis |
+| Total rows | 10K-50K minimum | Patterns 
survive GROUP BY | + +```python +# Example: 8000 tickets over 180 days = ~44/day average +# After weekly aggregation: ~310 records per week +N_TICKETS = 8000 +N_CUSTOMERS = 2500 # Each has ~3 tickets on average +N_ORDERS = 25000 # ~10 orders per customer average +``` diff --git a/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md b/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md new file mode 100644 index 00000000..0519bcce --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/4-domain-guidance.md @@ -0,0 +1,256 @@ +# Domain-Specific Guidance + +Realistic patterns for common data domains. All examples use Spark + Faker + Pandas UDFs. + +--- + +## Retail/E-commerce + +### Tables +``` +customers → orders → order_items → products +``` + +### Key Patterns + +| Pattern | Implementation | +|---------|----------------| +| Seasonal spikes | Q4 holiday shopping (1.5-2x volume in Nov-Dec) | +| Cart abandonment | ~70% of carts never complete | +| Loyalty tier progression | Free → Pro → Enterprise over time | +| Regional pricing | 5-15% price variation by region | + +### Realistic Distributions + +```python +@F.pandas_udf(DoubleType()) +def generate_order_amount(tiers: pd.Series) -> pd.Series: + """E-commerce order amounts by tier.""" + amounts = [] + for tier in tiers: + if tier == "Premium": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.9))) # ~$245 avg + elif tier == "Standard": + amounts.append(float(np.random.lognormal(mean=4.2, sigma=0.7))) # ~$67 avg + else: # Basic + amounts.append(float(np.random.lognormal(mean=3.5, sigma=0.6))) # ~$33 avg + return pd.Series(amounts) + +# Order status with cart abandonment +status_weights = [0.70, 0.08, 0.07, 0.10, 0.05] # abandoned, pending, processing, shipped, delivered +``` + +### Schema Example + +```python +# Products +products_df = spark.range(0, N_PRODUCTS).select( + F.concat(F.lit("PROD-"), F.lpad(F.col("id").cast("string"), 
5, "0")).alias("product_id"), + fake_product_name(F.col("id")).alias("name"), + F.array(F.lit("Electronics"), F.lit("Clothing"), F.lit("Home"), F.lit("Sports"))[ + (F.rand() * 4).cast("int") + ].alias("category"), + generate_price(F.col("id")).alias("base_price"), +) +``` + +--- + +## Support/CRM + +### Tables +``` +accounts → contacts → tickets → interactions +``` + +### Key Patterns + +| Pattern | Implementation | +|---------|----------------| +| Incident spikes | 3-5x volume during outages | +| Resolution by priority | Critical: 4h avg, Low: 72h avg | +| Enterprise contacts | 5-10 contacts per account vs 1-2 for SMB | +| CSAT correlation | Faster resolution = higher satisfaction | + +### Realistic Distributions + +```python +@F.pandas_udf("struct") +def generate_ticket_metrics(tiers: pd.Series) -> pd.DataFrame: + """Support ticket metrics with correlated attributes.""" + results = [] + for tier in tiers: + # Priority correlates with tier + if tier == 'Enterprise': + priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) + else: + priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3]) + + # Resolution time by priority (exponential distribution) + resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} + resolution_hours = np.random.exponential(scale=resolution_scale[priority]) + + # CSAT correlates with resolution time + if resolution_hours < 4: + csat = np.random.choice([4, 5], p=[0.3, 0.7]) + elif resolution_hours < 24: + csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3]) + else: + csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2]) + + results.append({"priority": priority, "resolution_hours": round(resolution_hours, 1), "csat": int(csat)}) + return pd.DataFrame(results) +``` + +### Schema Example + +```python +# Tickets with coherent attributes +tickets_df = ( + spark.range(0, N_TICKETS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("TKT-"), 
F.lpad(F.col("id").cast("string"), 6, "0")).alias("ticket_id"), + # FK to customer (weighted by tier) + ... + ) + .withColumn("metrics", generate_ticket_metrics(F.col("tier"))) + .select("*", "metrics.*") + .drop("metrics") +) +``` + +--- + +## Manufacturing/IoT + +### Tables +``` +equipment → sensors → readings → maintenance_orders +``` + +### Key Patterns + +| Pattern | Implementation | +|---------|----------------| +| Sensor lifecycle | Normal → degraded → failure progression | +| Anomaly precursors | Anomalies precede maintenance by 2-7 days | +| Seasonal production | Summer/winter production variations | +| Equipment age | Failure rate increases with age | + +### Realistic Distributions + +```python +@F.pandas_udf(DoubleType()) +def generate_sensor_reading(equipment_ages: pd.Series) -> pd.Series: + """Sensor readings with age-based degradation.""" + readings = [] + for age_days in equipment_ages: + # Base reading with age-based drift + base = 100.0 + drift = (age_days / 365) * 5 # 5 units drift per year + noise = np.random.normal(0, 2) + + # Occasional anomalies (more likely with age) + anomaly_prob = min(0.01 + (age_days / 365) * 0.02, 0.1) + if np.random.random() < anomaly_prob: + noise += np.random.choice([-1, 1]) * np.random.exponential(10) + + readings.append(base + drift + noise) + return pd.Series(readings) +``` + +### Schema Example + +```python +# Sensor readings time series +readings_df = ( + spark.range(0, N_READINGS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("READ-"), F.col("id").cast("string")).alias("reading_id"), + # FK to sensor + ((F.col("id") % N_SENSORS) + 1).alias("sensor_id"), + F.date_add(F.lit(START_DATE.date()), (F.col("id") / READINGS_PER_DAY).cast("int")).alias("timestamp"), + generate_sensor_reading(F.col("equipment_age")).alias("value"), + ) +) +``` + +--- + +## Financial Services + +### Tables +``` +accounts → transactions → payments → fraud_flags +``` + +### Key Patterns + +| Pattern | Implementation | 
+|---------|----------------| +| Transaction power law | 80% of volume from 20% of accounts | +| Fraud patterns | Unusual times, amounts, locations | +| Balance consistency | Transactions maintain positive balance | +| Regulatory compliance | No negative balances, valid amounts | + +### Realistic Distributions + +```python +@F.pandas_udf(DoubleType()) +def generate_transaction_amount(account_types: pd.Series) -> pd.Series: + """Transaction amounts following power law by account type.""" + amounts = [] + for acct_type in account_types: + if acct_type == "Corporate": + # Power law for corporate (few large transactions) + amount = (np.random.pareto(a=1.5) + 1) * 1000 + elif acct_type == "Premium": + amount = np.random.lognormal(mean=6, sigma=1.2) + else: # Standard + amount = np.random.lognormal(mean=4, sigma=0.8) + amounts.append(min(amount, 1_000_000)) # Cap at $1M + return pd.Series(amounts) + +@F.pandas_udf(BooleanType()) +def generate_fraud_flag(amounts: pd.Series, hours: pd.Series) -> pd.Series: + """Flag suspicious transactions based on amount and time.""" + flags = [] + for amount, hour in zip(amounts, hours): + # Higher fraud probability for: large amounts + unusual hours + base_prob = 0.001 + if amount > 5000: + base_prob *= 3 + if hour < 6 or hour > 22: + base_prob *= 2 + flags.append(np.random.random() < base_prob) + return pd.Series(flags) +``` + +### Schema Example + +```python +# Transactions with fraud indicators +transactions_df = ( + spark.range(0, N_TRANSACTIONS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("TXN-"), F.lpad(F.col("id").cast("string"), 10, "0")).alias("transaction_id"), + # FK to account + ... + generate_transaction_amount(F.col("account_type")).alias("amount"), + F.hour(F.col("timestamp")).alias("hour"), + ) + .withColumn("is_suspicious", generate_fraud_flag(F.col("amount"), F.col("hour"))) +) +``` + +--- + +## General Best Practices + +1. **Start with domain tables**: Define the core entities and relationships first +2. 
**Add domain-specific distributions**: Use realistic statistical patterns for your domain +3. **Include edge cases**: Every domain has edge cases (returns, cancellations, failures) +4. **Time-based patterns matter**: Most domains have daily/weekly/seasonal patterns +5. **Correlate attributes**: Attributes within a row should make business sense together + +**Note:** These are guidance patterns, not rigid schemas. Adapt to user's specific requirements. diff --git a/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md new file mode 100644 index 00000000..c283a82c --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/5-output-formats.md @@ -0,0 +1,178 @@ +# Output Formats Guide + +Where and how to save generated synthetic data. + +## Create Infrastructure in Script + +Always create the schema and volume **inside the Python script** using `spark.sql()`. Do NOT make separate MCP SQL calls - it's much slower. + +```python +CATALOG = "" # MUST ask user - never default +SCHEMA = "" +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Note: Assume catalog exists - do NOT create it +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +``` + +**Important:** Do NOT create catalogs - assume they already exist. Only create schema and volume. 
+ +--- + +## Format Comparison + +| Format | Use Case | Extension | Best For | +|--------|----------|-----------|----------| +| **Parquet** | SDP pipeline input | `.parquet` or none | Best compression, query performance | +| **JSON** | Log-style ingestion | `.json` | Simulating external data feeds | +| **CSV** | Legacy systems | `.csv` | Human-readable, spreadsheet import | +| **Delta Table** | Default - Direct analytics | N/A | Treat as bronze tables for ETL or skip ETL and query immediately | + +--- + +## Parquet to Volumes (Default) + +Standard format for SDP pipeline input. Best compression and query performance. + +```python +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Save as parquet files (directory format) +customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") +orders_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") +tickets_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") +``` + +**Notes:** +- Files may not use a file extension or might end with `.parquet` +- Spark writes as a directory with part files +- Use `mode("overwrite")` for one-time generation +- Use `mode("append")` for incremental/scheduled jobs + +--- + +## JSON to Volumes + +Common pattern for simulating SDP ingestion from external data feeds (logs, webhooks). + +```python +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Save as JSON files +customers_df.write.mode("overwrite").json(f"{VOLUME_PATH}/customers_json") +orders_df.write.mode("overwrite").json(f"{VOLUME_PATH}/orders_json") +``` + +**When to use:** +- Simulating log ingestion +- External API data feeds +- User explicitly requests JSON format + +--- + +## CSV to Volumes + +Common pattern for simulating data from legacy systems or spreadsheet exports. 
+ +```python +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Save as CSV with headers +customers_df.write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/customers_csv") +orders_df.write.mode("overwrite").option("header", "true").csv(f"{VOLUME_PATH}/orders_csv") +``` + +**Options:** +```python +# Full options for CSV +df.write \ + .mode("overwrite") \ + .option("header", "true") \ + .option("delimiter", ",") \ + .option("quote", '"') \ + .option("escape", "\\") \ + .csv(f"{VOLUME_PATH}/data_csv") +``` + +**When to use:** +- Legacy system integration +- Human-readable data +- Spreadsheet import testing + +--- + +## Delta Table (Unity Catalog) + +Write directly to managed Delta tables when data is ready for analytics consumption (skip SDP pipeline). + +```python +# Ensure schema exists +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") + +# Save as managed Delta tables +customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers") +orders_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.orders") + +# With additional options +customers_df.write \ + .mode("overwrite") \ + .option("overwriteSchema", "true") \ + .saveAsTable(f"{CATALOG}.{SCHEMA}.customers") +``` + +**When to use:** +- User wants data ready to query immediately +- Skip the SDP bronze/silver/gold pipeline +- Direct SQL analytics + +--- + +## Write Modes + +| Mode | Behavior | Use Case | +|------|----------|----------| +| `overwrite` | Replace existing data | One-time generation, regeneration | +| `append` | Add to existing data | Incremental/scheduled jobs | +| `ignore` | Skip if exists | Idempotent generation | +| `error` | Fail if exists | Safety check | + +### Incremental Generation Pattern + +```python +WRITE_MODE = "append" # For scheduled jobs + +# Only generate new records since last run +from datetime import datetime, timedelta + +LAST_RUN = datetime.now() - timedelta(days=1) +END_DATE = datetime.now() + +# Generate only new data 
+new_orders_df = generate_orders(start_date=LAST_RUN, end_date=END_DATE) +new_orders_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders") +``` + +--- + +## Validation After Write + +After successful execution, validate the generated data: + +```python +# Read back and verify +customers_check = spark.read.parquet(f"{VOLUME_PATH}/customers") +orders_check = spark.read.parquet(f"{VOLUME_PATH}/orders") + +print(f"Customers: {customers_check.count():,} rows") +print(f"Orders: {orders_check.count():,} rows") + +# Verify distributions +customers_check.groupBy("tier").count().show() +orders_check.describe("amount").show() +``` + +Or use `get_volume_folder_details` MCP tool: +- `volume_path`: "my_catalog/my_schema/raw_data/customers" +- `format`: "parquet" +- `table_stat_level`: "SIMPLE" diff --git a/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md new file mode 100644 index 00000000..420b3500 --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/references/6-troubleshooting.md @@ -0,0 +1,324 @@ +# Troubleshooting Guide + +Common issues and solutions for synthetic data generation. + +## Environment Issues + +### ModuleNotFoundError: faker (or other library) + +**Problem:** Dependencies not available in execution environment. + +**Solutions by execution mode:** + +| Mode | Solution | +|------|----------| +| **DB Connect 16.4+** | Use `DatabricksEnv().withDependencies("faker", "pandas", ...)` | +| **Older DB Connect with Serverless** | Create job with `environments` parameter | +| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` | +| **Classic cluster** | Use Databricks CLI to install libraries. 
`databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` | + +```python +# For DB Connect 16.4+ +from databricks.connect import DatabricksSession, DatabricksEnv + +env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") +spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() +``` + +### DatabricksEnv not found + +**Problem:** Using older databricks-connect version. + +**Solution:** Upgrade to 16.4+ or use job-based approach: + +```bash +# Upgrade (prefer uv, fall back to pip) +uv pip install "databricks-connect>=16.4,<17.4" +# or: pip install "databricks-connect>=16.4,<17.4" + +# Or use job with environments parameter instead +``` + +### serverless_compute_id error + +**Problem:** Missing serverless configuration. + +**Solution:** Add to `~/.databrickscfg`: + +```ini +[DEFAULT] +host = https://your-workspace.cloud.databricks.com/ +serverless_compute_id = auto +auth_type = databricks-cli +``` + +--- + +## Execution Issues + +### CRITICAL: cache() and persist() NOT supported on serverless + +**Problem:** Using `.cache()` or `.persist()` on serverless compute fails with: +``` +AnalysisException: [NOT_SUPPORTED_WITH_SERVERLESS] PERSIST TABLE is not supported on serverless compute. +``` + +**Why this happens:** Serverless compute does not support caching DataFrames in memory. This is a fundamental limitation of the serverless architecture. + +**Solution:** Write master tables to Delta first, then read them back for FK joins: + +```python +# BAD - will fail on serverless +customers_df = spark.range(0, N_CUSTOMERS)... +customers_df.cache() # ❌ FAILS: "PERSIST TABLE is not supported on serverless compute" + +# GOOD - write to Delta, then read back +customers_df = spark.range(0, N_CUSTOMERS)... 
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers") +customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers") # ✓ Read from Delta +``` + +**Best practice for referential integrity:** +1. Generate master table (e.g., customers) +2. Write to Delta table +3. Read back for FK lookup joins +4. Generate child tables (e.g., orders, tickets) with valid FKs +5. Write child tables to Delta + +--- + +### Serverless job fails to start + +**Possible causes:** +1. Workspace doesn't have serverless enabled +2. Unity Catalog permissions missing +3. Invalid environment configuration + +**Solutions:** +```python +# Verify serverless is available +# Try creating a simple job first to test + +# Check Unity Catalog permissions +spark.sql("SELECT current_catalog(), current_schema()") +``` + +### Classic cluster startup slow (3-8 minutes) + +**Problem:** Clusters take time to start. + +**Solution:** Switch to serverless: + +```python +# Instead of: +# spark = DatabricksSession.builder.clusterId("xxx").getOrCreate() + +# Use: +spark = DatabricksSession.builder.serverless(True).getOrCreate() +``` + +### "Either base environment or version must be provided" + +**Problem:** Missing `client` in job environment spec. + +**Solution:** Add `"client": "4"` to the spec: + +```python +{ + "environments": [{ + "environment_key": "datagen_env", + "spec": { + "client": "4", # Required! + "dependencies": ["faker", "numpy", "pandas"] + } + }] +} +``` + +--- + +## Data Generation Issues + +### AttributeError: 'function' object has no attribute 'partitionBy' + +**Problem:** Using `F.window` instead of `Window` for analytical window functions. 
+ +```python +# WRONG - F.window is for time-based tumbling/sliding windows (streaming) +window_spec = F.window.partitionBy("account_id").orderBy("contact_id") +# Error: AttributeError: 'function' object has no attribute 'partitionBy' + +# CORRECT - Window is for analytical window specifications +from pyspark.sql.window import Window +window_spec = Window.partitionBy("account_id").orderBy("contact_id") +``` + +**When to use Window:** For analytical functions like `row_number()`, `rank()`, `lead()`, `lag()`: + +```python +from pyspark.sql.window import Window + +# Mark first contact per account as primary +window_spec = Window.partitionBy("account_id").orderBy("contact_id") +contacts_df = contacts_df.withColumn( + "is_primary", + F.row_number().over(window_spec) == 1 +) +``` + +--- + +### Faker UDF is slow + +**Problem:** Single-row UDFs don't parallelize well. + +**Solution:** Use `pandas_udf` for batch processing: + +```python +# SLOW - scalar UDF +@F.udf(returnType=StringType()) +def slow_fake_name(): + return Faker().name() + +# FAST - pandas UDF (batch processing) +@F.pandas_udf(StringType()) +def fast_fake_name(ids: pd.Series) -> pd.Series: + fake = Faker() + return pd.Series([fake.name() for _ in range(len(ids))]) +``` + +### Out of memory with large data + +**Problem:** Not enough partitions for data size. + +**Solution:** Increase partitions: + +```python +# For large datasets (1M+ rows) +customers_df = spark.range(0, N_CUSTOMERS, numPartitions=64) # Increase from default +``` + +| Data Size | Recommended Partitions | +|-----------|----------------------| +| < 100K | 8 | +| 100K - 500K | 16 | +| 500K - 1M | 32 | +| 1M+ | 64+ | + +### Context corrupted on classic cluster + +**Problem:** Stale execution context. 
+
+**Solution:** Create fresh context (omit context_id), reinstall libraries:
+
+```python
+# Don't reuse context_id if you see strange errors
+# Let it create a new context
+```
+
+### Referential integrity violations
+
+**Problem:** Foreign keys reference non-existent parent records.
+
+**Solution:** Write master table to Delta first, then read back for FK joins:
+
+```python
+# 1. Generate and WRITE master table (do NOT use cache with serverless!)
+customers_df = spark.range(0, N_CUSTOMERS)...
+customers_df.write.mode("overwrite").saveAsTable(f"{CATALOG}.{SCHEMA}.customers")
+
+# 2. Read back for FK lookups
+customer_lookup = spark.table(f"{CATALOG}.{SCHEMA}.customers").select("customer_id", "tier")
+
+# 3. Generate child table with valid FKs (derive the FK, then join on it)
+orders_df = (
+    spark.range(0, N_ORDERS)
+    .withColumn(
+        "customer_id",
+        F.concat(F.lit("CUST-"), F.lpad((F.abs(F.hash("id")) % N_CUSTOMERS).cast("string"), 5, "0")),
+    )
+    .join(customer_lookup, on="customer_id", how="left")
+)
+```
+
+> **WARNING:** Do NOT use `.cache()` or `.persist()` with serverless compute. See the dedicated section above.
+
+---
+
+## Data Quality Issues
+
+### Uniform distributions (unrealistic)
+
+**Problem:** All customers have similar order counts, amounts are evenly distributed.
+
+**Solution:** Use non-linear distributions:
+
+```python
+# BAD - uniform
+amounts = np.random.uniform(10, 1000, N)
+
+# GOOD - log-normal (realistic)
+amounts = np.random.lognormal(mean=5, sigma=0.8, size=N)
+```
+
+### Missing time-based patterns
+
+**Problem:** Data doesn't reflect weekday/weekend or seasonal patterns.
+
+**Solution:** Add multipliers:
+
+```python
+import holidays
+
+US_HOLIDAYS = holidays.US(years=[2024, 2025])
+
+def get_multiplier(date):
+    mult = 1.0
+    if date.weekday() >= 5:  # Weekend
+        mult *= 0.6
+    if date in US_HOLIDAYS:
+        mult *= 0.3
+    return mult
+```
+
+### Incoherent row attributes
+
+**Problem:** Enterprise customer has low-value orders, critical ticket has slow resolution.
+ +**Solution:** Correlate attributes: + +```python +# Priority based on tier +if tier == 'Enterprise': + priority = np.random.choice(['Critical', 'High'], p=[0.4, 0.6]) +else: + priority = np.random.choice(['Medium', 'Low'], p=[0.6, 0.4]) + +# Resolution based on priority +resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} +resolution_hours = np.random.exponential(scale=resolution_scale[priority]) +``` + +--- + +## Validation Steps + +After generation, verify your data: + +```python +# 1. Check row counts +print(f"Customers: {customers_df.count():,}") +print(f"Orders: {orders_df.count():,}") + +# 2. Verify distributions +customers_df.groupBy("tier").count().show() +orders_df.describe("amount").show() + +# 3. Check referential integrity +orphans = orders_df.join( + customers_df, + orders_df.customer_id == customers_df.customer_id, + "left_anti" +) +print(f"Orphan orders: {orphans.count()}") + +# 4. Verify date range +orders_df.select(F.min("order_date"), F.max("order_date")).show() +``` diff --git a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py new file mode 100644 index 00000000..b9f953fa --- /dev/null +++ b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py @@ -0,0 +1,390 @@ +"""Generate synthetic data using Spark + Faker + Pandas UDFs. 
+ +This is the recommended approach for ALL data generation tasks: +- Scales from thousands to millions of rows +- Parallel execution via Spark +- Direct write to Unity Catalog +- Works with serverless and classic compute + +Auto-detects environment and uses: +- DatabricksEnv with managed dependencies if databricks-connect >= 16.4 (local) +- Standard session if running on Databricks Runtime or older databricks-connect +""" +import sys +import os +from pyspark.sql import functions as F +from pyspark.sql.window import Window +from pyspark.sql.types import StringType, DoubleType, StructType, StructField, IntegerType +import numpy as np +import pandas as pd +from datetime import datetime, timedelta + +# ============================================================================= +# CONFIGURATION +# ============================================================================= +# Compute - Serverless strongly recommended +USE_SERVERLESS = True # Set to False and provide CLUSTER_ID for classic compute +CLUSTER_ID = None # Only used if USE_SERVERLESS=False + +# Storage - Update these for your environment +CATALOG = "" # REQUIRED: replace with your catalog +SCHEMA = "" # REQUIRED: replace with your schema +VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + +# Data sizes +N_CUSTOMERS = 10_000 +N_ORDERS = 50_000 +PARTITIONS = 16 # Adjust: 8 for <100K, 32 for 1M+ + +# Date range - last 6 months from today +END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +START_DATE = END_DATE - timedelta(days=180) + +# Write mode - "overwrite" for one-time, "append" for incremental +WRITE_MODE = "overwrite" + +# Bad data injection for testing data quality rules +INJECT_BAD_DATA = False # Set to True to inject bad data +BAD_DATA_CONFIG = { + "null_rate": 0.02, # 2% nulls in required fields + "outlier_rate": 0.01, # 1% impossible values + "orphan_fk_rate": 0.01, # 1% orphan foreign keys +} + +# Reproducibility +SEED = 42 + +# Tier distribution: Free 60%, Pro 
30%, Enterprise 10% +TIER_PROBS = [0.6, 0.3, 0.1] + +# Region distribution +REGION_PROBS = [0.4, 0.25, 0.2, 0.15] + +# ============================================================================= +# ENVIRONMENT DETECTION AND SESSION CREATION +# ============================================================================= + +def is_databricks_runtime(): + """Check if running on Databricks Runtime vs locally.""" + return "DATABRICKS_RUNTIME_VERSION" in os.environ + +def get_databricks_connect_version(): + """Get databricks-connect version as (major, minor) tuple or None.""" + try: + import importlib.metadata + version_str = importlib.metadata.version('databricks-connect') + parts = version_str.split('.') + return (int(parts[0]), int(parts[1])) + except Exception: + return None + +# Detect environment +on_runtime = is_databricks_runtime() +db_version = get_databricks_connect_version() + +print("=" * 80) +print("ENVIRONMENT DETECTION") +print("=" * 80) +print(f"Running on Databricks Runtime: {on_runtime}") +if db_version: + print(f"databricks-connect version: {db_version[0]}.{db_version[1]}") +else: + print("databricks-connect: not available") + +# Use DatabricksEnv with managed dependencies if: +# - Running locally (not on Databricks Runtime) +# - databricks-connect >= 16.4 +use_managed_deps = (not on_runtime) and db_version and db_version >= (16, 4) + +if use_managed_deps: + print("Using DatabricksEnv with managed dependencies") + print("=" * 80) + from databricks.connect import DatabricksSession, DatabricksEnv + + env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") + + if USE_SERVERLESS: + spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() + print("Connected to serverless compute with managed dependencies!") + else: + if not CLUSTER_ID: + raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") + spark = DatabricksSession.builder.withEnvironment(env).clusterId(CLUSTER_ID).getOrCreate() + 
print(f"Connected to cluster with managed dependencies!") +else: + print("Using standard session (dependencies must be pre-installed)") + print("=" * 80) + + # Check that UDF dependencies are available + print("\nChecking UDF dependencies...") + missing_deps = [] + + try: + from faker import Faker + print(" faker: OK") + except ImportError: + missing_deps.append("faker") + print(" faker: MISSING") + + try: + import pandas as pd + print(" pandas: OK") + except ImportError: + missing_deps.append("pandas") + print(" pandas: MISSING") + + if missing_deps: + print("\n" + "=" * 80) + print("ERROR: Missing dependencies for UDFs") + print("=" * 80) + print(f"Missing: {', '.join(missing_deps)}") + if on_runtime: + print('\nSolution: Install libraries via Databricks CLI:') + print(' databricks libraries install --json \'{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}\'') + else: + print("\nSolution: Upgrade to databricks-connect >= 16.4 for managed deps") + print(" Or create a job with environment settings") + print("=" * 80) + sys.exit(1) + + print("\nAll dependencies available") + print("=" * 80) + + from databricks.connect import DatabricksSession + + if USE_SERVERLESS: + spark = DatabricksSession.builder.serverless(True).getOrCreate() + print("Connected to serverless compute") + else: + if not CLUSTER_ID: + raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") + spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate() + print(f"Connected to cluster ") + +# Import Faker for UDF definitions +from faker import Faker + +# ============================================================================= +# DEFINE PANDAS UDFs FOR FAKER DATA +# ============================================================================= + +@F.pandas_udf(StringType()) +def fake_name(ids: pd.Series) -> pd.Series: + """Generate realistic person names.""" + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.name() 
for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_company(ids: pd.Series) -> pd.Series: + """Generate realistic company names.""" + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.company() for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_address(ids: pd.Series) -> pd.Series: + """Generate realistic addresses.""" + fake = Faker() + Faker.seed(SEED) + return pd.Series([fake.address().replace('\n', ', ') for _ in range(len(ids))]) + +@F.pandas_udf(StringType()) +def fake_email(names: pd.Series) -> pd.Series: + """Generate email based on name.""" + emails = [] + for name in names: + if name: + domain = name.lower().replace(" ", ".").replace(",", "")[:20] + emails.append(f"{domain}@example.com") + else: + emails.append("unknown@example.com") + return pd.Series(emails) + +@F.pandas_udf(DoubleType()) +def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: + """Generate amount based on tier using log-normal distribution.""" + np.random.seed(SEED) + amounts = [] + for tier in tiers: + if tier == "Enterprise": + amounts.append(float(np.random.lognormal(mean=7.5, sigma=0.8))) # ~$1800 avg + elif tier == "Pro": + amounts.append(float(np.random.lognormal(mean=5.5, sigma=0.7))) # ~$245 avg + else: + amounts.append(float(np.random.lognormal(mean=4.0, sigma=0.6))) # ~$55 avg + return pd.Series(amounts) + +# ============================================================================= +# CREATE INFRASTRUCTURE +# ============================================================================= +print("\nCreating infrastructure...") +spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") +spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") +print(f"Infrastructure ready: {VOLUME_PATH}") + +# ============================================================================= +# GENERATE CUSTOMERS (Master Table) +# ============================================================================= +print(f"\nGenerating 
{N_CUSTOMERS:,} customers...") + +customers_df = ( + spark.range(0, N_CUSTOMERS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("CUST-"), F.lpad(F.col("id").cast("string"), 5, "0")).alias("customer_id"), + fake_name(F.col("id")).alias("name"), + fake_company(F.col("id")).alias("company"), + fake_address(F.col("id")).alias("address"), + # Tier distribution: Free 60%, Pro 30%, Enterprise 10% + F.when(F.rand(SEED) < TIER_PROBS[0], "Free") + .when(F.rand(SEED) < TIER_PROBS[0] + TIER_PROBS[1], "Pro") + .otherwise("Enterprise").alias("tier"), + # Region distribution + F.when(F.rand(SEED) < REGION_PROBS[0], "North") + .when(F.rand(SEED) < REGION_PROBS[0] + REGION_PROBS[1], "South") + .when(F.rand(SEED) < REGION_PROBS[0] + REGION_PROBS[1] + REGION_PROBS[2], "East") + .otherwise("West").alias("region"), + # Created date (within last 2 years before start date) + F.date_sub(F.lit(START_DATE.date()), (F.rand(SEED) * 730).cast("int")).alias("created_at"), + ) +) + +# Add tier-based ARR and email +customers_df = ( + customers_df + .withColumn("arr", F.round(generate_lognormal_amount(F.col("tier")), 2)) + .withColumn("email", fake_email(F.col("name"))) +) + +# Save customers +customers_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/customers") +print(f" Saved customers to {VOLUME_PATH}/customers") + +# Show tier distribution +print("\n Tier distribution:") +customers_df.groupBy("tier").count().orderBy("tier").show() + +# ============================================================================= +# GENERATE ORDERS (Child Table with Referential Integrity) +# ============================================================================= +print(f"\nGenerating {N_ORDERS:,} orders with referential integrity...") + +# Write customer lookup to temp Delta table (no .cache() on serverless!) 
+customers_tmp_table = f"{CATALOG}.{SCHEMA}._tmp_customers_lookup" +customers_df.select("customer_id", "tier").write.mode("overwrite").saveAsTable(customers_tmp_table) +customer_lookup = spark.table(customers_tmp_table) + +# Generate orders base +orders_df = ( + spark.range(0, N_ORDERS, numPartitions=PARTITIONS) + .select( + F.concat(F.lit("ORD-"), F.lpad(F.col("id").cast("string"), 6, "0")).alias("order_id"), + # Generate customer_idx for FK join (hash-based distribution) + (F.abs(F.hash(F.col("id"), F.lit(SEED))) % N_CUSTOMERS).alias("customer_idx"), + # Order status + F.when(F.rand(SEED) < 0.65, "delivered") + .when(F.rand(SEED) < 0.80, "shipped") + .when(F.rand(SEED) < 0.90, "processing") + .when(F.rand(SEED) < 0.95, "pending") + .otherwise("cancelled").alias("status"), + # Order date within date range + F.date_add(F.lit(START_DATE.date()), (F.rand(SEED) * 180).cast("int")).alias("order_date"), + ) +) + +# Add customer_idx to lookup for join +customer_lookup_with_idx = customer_lookup.withColumn( + "customer_idx", + (F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1).cast("int") +) + +# Join to get customer_id and tier as foreign key +orders_with_fk = ( + orders_df + .join(customer_lookup_with_idx, on="customer_idx", how="left") + .drop("customer_idx") +) + +# Add tier-based amount +orders_with_fk = orders_with_fk.withColumn( + "amount", + F.round(generate_lognormal_amount(F.col("tier")), 2) +) + +# ============================================================================= +# INJECT BAD DATA (OPTIONAL) +# ============================================================================= +if INJECT_BAD_DATA: + print("\nInjecting bad data for quality testing...") + + # Calculate counts + null_count = int(N_ORDERS * BAD_DATA_CONFIG["null_rate"]) + outlier_count = int(N_ORDERS * BAD_DATA_CONFIG["outlier_rate"]) + orphan_count = int(N_ORDERS * BAD_DATA_CONFIG["orphan_fk_rate"]) + + # Add bad data flags + orders_with_fk = 
orders_with_fk.withColumn( + "row_num", + F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) + ) + + # Inject nulls in customer_id for first null_count rows + orders_with_fk = orders_with_fk.withColumn( + "customer_id", + F.when(F.col("row_num") <= null_count, None).otherwise(F.col("customer_id")) + ) + + # Inject negative amounts for next outlier_count rows + orders_with_fk = orders_with_fk.withColumn( + "amount", + F.when( + (F.col("row_num") > null_count) & (F.col("row_num") <= null_count + outlier_count), + F.lit(-999.99) + ).otherwise(F.col("amount")) + ) + + # Inject orphan FKs for next orphan_count rows + orders_with_fk = orders_with_fk.withColumn( + "customer_id", + F.when( + (F.col("row_num") > null_count + outlier_count) & + (F.col("row_num") <= null_count + outlier_count + orphan_count), + F.lit("CUST-NONEXISTENT") + ).otherwise(F.col("customer_id")) + ) + + orders_with_fk = orders_with_fk.drop("row_num") + + print(f" Injected {null_count} null customer_ids") + print(f" Injected {outlier_count} negative amounts") + print(f" Injected {orphan_count} orphan foreign keys") + +# Drop tier column (not needed in final output) +orders_final = orders_with_fk.drop("tier") + +# Save orders +orders_final.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders") +print(f" Saved orders to {VOLUME_PATH}/orders") + +# Show status distribution +print("\n Status distribution:") +orders_final.groupBy("status").count().orderBy("status").show() + +# ============================================================================= +# CLEANUP AND SUMMARY +# ============================================================================= +spark.sql(f"DROP TABLE IF EXISTS {customers_tmp_table}") + +print("\n" + "=" * 80) +print("GENERATION COMPLETE") +print("=" * 80) +print(f"Catalog: {CATALOG}") +print(f"Schema: {SCHEMA}") +print(f"Volume: {VOLUME_PATH}") +print(f"\nGenerated data:") +print(f" - customers: {N_CUSTOMERS:,} rows") +print(f" - orders: {N_ORDERS:,} rows") 
+if INJECT_BAD_DATA: + print(f" - Bad data injected: nulls, outliers, orphan FKs") +print(f"\nDate range: {START_DATE.date()} to {END_DATE.date()}") +print("=" * 80) diff --git a/databricks-skills/databricks-synthetic-data-generation/SKILL.md b/databricks-skills/databricks-synthetic-data-generation/SKILL.md deleted file mode 100644 index ce2a17cf..00000000 --- a/databricks-skills/databricks-synthetic-data-generation/SKILL.md +++ /dev/null @@ -1,660 +0,0 @@ ---- -name: databricks-synthetic-data-generation -description: "Generate realistic synthetic data using Faker and Spark, with non-linear distributions, integrity constraints, and save to Databricks. Use when creating test data, demo datasets, or synthetic tables." ---- - -# Synthetic Data Generation - -Generate realistic, story-driven synthetic data for Databricks using Python with Faker and Spark. - -## Common Libraries - -These libraries are useful for generating realistic synthetic data: - -- **faker**: Generates realistic names, addresses, emails, companies, dates, etc. -- **holidays**: Provides country-specific holiday calendars for realistic date patterns - -These are typically NOT pre-installed on Databricks. Install them using `execute_databricks_command` tool: -- `code`: "%pip install faker holidays" - -Save the returned `cluster_id` and `context_id` for subsequent calls. - -## Workflow - -1. **Write Python code to a local file** in the project (e.g., `scripts/generate_data.py`) -2. **Execute on Databricks** using the `run_python_file_on_databricks` MCP tool -3. **If execution fails**: Edit the local file to fix the error, then re-execute -4. **Reuse the context** for follow-up executions by passing the returned `cluster_id` and `context_id` - -**Always work with local files first, then execute.** This makes debugging easier - you can see and edit the code. - -### Context Reuse Pattern - -The first execution auto-selects a running cluster and creates an execution context. 
**Reuse this context for follow-up calls** - it's much faster (~1s vs ~15s) and shares variables/imports: - -**First execution** - use `run_python_file_on_databricks` tool: -- `file_path`: "scripts/generate_data.py" - -Returns: `{ success, output, error, cluster_id, context_id, ... }` - -Save `cluster_id` and `context_id` for follow-up calls. - -**If execution fails:** -1. Read the error from the result -2. Edit the local Python file to fix the issue -3. Re-execute with same context using `run_python_file_on_databricks` tool: - - `file_path`: "scripts/generate_data.py" - - `cluster_id`: "" - - `context_id`: "" - -**Follow-up executions** reuse the context (faster, shares state): -- `file_path`: "scripts/validate_data.py" -- `cluster_id`: "" -- `context_id`: "" - -### Handling Failures - -When execution fails: -1. Read the error from the result -2. **Edit the local Python file** to fix the issue -3. Re-execute using the same `cluster_id` and `context_id` (faster, keeps installed libraries) -4. If the context is corrupted, omit `context_id` to create a fresh one - -### Installing Libraries - -Databricks provides Spark, pandas, numpy, and common data libraries by default. **Only install a library if you get an import error.** - -Use `execute_databricks_command` tool: -- `code`: "%pip install faker" -- `cluster_id`: "" -- `context_id`: "" - -The library is immediately available in the same context. - -**Note:** Keeping the same `context_id` means installed libraries persist across calls. - -## Storage Destination - -### Ask for Schema Name - -By default, use the `ai_dev_kit` catalog. Ask the user which schema to use: - -> "I'll save the data to `ai_dev_kit.`. What schema name would you like to use? (You can also specify a different catalog if needed.)" - -If the user provides just a schema name, use `ai_dev_kit.{schema}`. If they provide `catalog.schema`, use that instead. 
- -### Create Infrastructure in the Script - -Always create the catalog, schema, and volume **inside the Python script** using `spark.sql()`. Do NOT make separate MCP SQL calls - it's much slower. - -The `spark` variable is available by default on Databricks clusters. - -```python -# ============================================================================= -# CREATE INFRASTRUCTURE (inside the Python script) -# ============================================================================= -spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}") -spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") -spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") -``` - -### Save to Volume as Raw Data (Never Tables) - -**Always save data to a Volume as parquet files, never directly to tables** (unless the user explicitly requests tables). This is the input for the downstream Spark Declarative Pipeline (SDP) that will handle bronze/silver/gold layers. - -```python -VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" - -# Save as parquet files (raw data) -spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") -spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") -spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") -``` - -## Raw Data Only - No Pre-Aggregated Fields (Unless Instructed Otherwise) - -**By default, generate raw, transactional data only.** Do not create fields that represent sums, totals, averages, or counts. 
- -- One row = one event/transaction/record -- No columns like `total_orders`, `sum_revenue`, `avg_csat`, `order_count` -- Each row has its own individual values, not rollups - -**Why?** A Spark Declarative Pipeline (SDP) will typically be built after data generation to: -- Ingest raw data (bronze layer) -- Clean and validate (silver layer) -- Aggregate and compute metrics (gold layer) - -The synthetic data is the **source** for this pipeline. Aggregations happen downstream. - -**Note:** If the user specifically requests aggregated fields or summary tables, follow their instructions. - -```python -# GOOD - Raw transactional data -# Customer table: one row per customer, no aggregated fields -customers_data.append({ - "customer_id": cid, - "name": fake.company(), - "tier": "Enterprise", - "region": "North", -}) - -# Order table: one row per order -orders_data.append({ - "order_id": f"ORD-{i:06d}", - "customer_id": cid, - "amount": 150.00, # This order's amount - "order_date": "2024-10-15", -}) - -# BAD - Don't add pre-aggregated fields -# customers_data.append({ -# "customer_id": cid, -# "total_orders": 47, # NO - this is an aggregation -# "total_revenue": 12500.00, # NO - this is a sum -# "avg_order_value": 265.95, # NO - this is an average -# }) -``` - -## Temporality and Data Volume - -### Date Range: Last 6 Months from Today - -**Always generate data for the last ~6 months ending at the current date.** This ensures: -- Data feels current and relevant for demos -- Recent patterns are visible in dashboards -- Downstream aggregations (daily/weekly/monthly) have enough history - -```python -from datetime import datetime, timedelta - -# Dynamic date range - last 6 months from today -END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) -START_DATE = END_DATE - timedelta(days=180) - -# Place special events within this range (e.g., incident 3 weeks ago) -INCIDENT_END = END_DATE - timedelta(days=21) -INCIDENT_START = INCIDENT_END - 
timedelta(days=10) -``` - -### Data Volume for Aggregation - -Generate enough data so patterns remain visible after downstream aggregation (SDP pipelines often aggregate by day/week/region/category). Rules of thumb: - -| Grain | Minimum Records | Rationale | -|-------|-----------------|-----------| -| Daily time series | 50-100/day | See trends after weekly rollup | -| Per category | 500+ per category | Statistical significance | -| Per customer | 5-20 events/customer | Enough for customer-level analysis | -| Total rows | 10K-50K minimum | Patterns survive GROUP BY | - -```python -# Example: 8000 tickets over 180 days = ~44/day average -# After weekly aggregation: ~310 records per week per category -# After monthly by region: still enough to see patterns -N_TICKETS = 8000 -N_CUSTOMERS = 2500 # Each has ~3 tickets on average -N_ORDERS = 25000 # ~10 orders per customer average -``` - -## Script Structure - -Always structure scripts with configuration variables at the top: - -```python -"""Generate synthetic data for [use case].""" -import numpy as np -import pandas as pd -from datetime import datetime, timedelta -from faker import Faker -import holidays -from pyspark.sql import SparkSession - -# ============================================================================= -# CONFIGURATION - Edit these values -# ============================================================================= -CATALOG = "my_catalog" -SCHEMA = "my_schema" -VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" - -# Data sizes - enough for aggregation patterns to survive -N_CUSTOMERS = 2500 -N_ORDERS = 25000 -N_TICKETS = 8000 - -# Date range - last 6 months from today -END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) -START_DATE = END_DATE - timedelta(days=180) - -# Special events (within the date range) -INCIDENT_END = END_DATE - timedelta(days=21) -INCIDENT_START = INCIDENT_END - timedelta(days=10) - -# Holiday calendar for realistic patterns -US_HOLIDAYS = 
holidays.US(years=[START_DATE.year, END_DATE.year]) - -# Reproducibility -SEED = 42 - -# ============================================================================= -# SETUP -# ============================================================================= -np.random.seed(SEED) -Faker.seed(SEED) -fake = Faker() -spark = SparkSession.builder.getOrCreate() - -# ... rest of script -``` - -## Key Principles - -### 1. Use Pandas for Generation, Spark for Saving - -Generate data with pandas (faster, easier), convert to Spark for saving: - -```python -import pandas as pd - -# Generate with pandas -customers_pdf = pd.DataFrame({ - "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], - "name": [fake.company() for _ in range(N_CUSTOMERS)], - "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]), - "region": np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]), - "created_at": [fake.date_between(start_date='-2y', end_date='-6m') for _ in range(N_CUSTOMERS)], -}) - -# Convert to Spark and save -customers_df = spark.createDataFrame(customers_pdf) -customers_df.write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") -``` - -### 2. Iterate on DataFrames for Referential Integrity - -Generate master tables first, then iterate on them to create related tables with matching IDs: - -```python -# 1. Generate customers (master table) -customers_pdf = pd.DataFrame({ - "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], - "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]), - # ... -}) - -# 2. 
Create lookup for foreign key generation -customer_ids = customers_pdf["customer_id"].tolist() -customer_tier_map = dict(zip(customers_pdf["customer_id"], customers_pdf["tier"])) - -# Weight by tier - Enterprise customers generate more orders -tier_weights = customers_pdf["tier"].map({'Enterprise': 5.0, 'Pro': 2.0, 'Free': 1.0}) -customer_weights = (tier_weights / tier_weights.sum()).tolist() - -# 3. Generate orders with valid foreign keys and tier-based logic -orders_data = [] -for i in range(N_ORDERS): - cid = np.random.choice(customer_ids, p=customer_weights) - tier = customer_tier_map[cid] - - # Amount depends on tier - if tier == 'Enterprise': - amount = np.random.lognormal(7, 0.8) - elif tier == 'Pro': - amount = np.random.lognormal(5, 0.7) - else: - amount = np.random.lognormal(3.5, 0.6) - - orders_data.append({ - "order_id": f"ORD-{i:06d}", - "customer_id": cid, - "amount": round(amount, 2), - "order_date": fake.date_between(start_date=START_DATE, end_date=END_DATE), - }) - -orders_pdf = pd.DataFrame(orders_data) - -# 4. Generate tickets that reference both customers and orders -order_ids = orders_pdf["order_id"].tolist() -tickets_data = [] -for i in range(N_TICKETS): - cid = np.random.choice(customer_ids, p=customer_weights) - oid = np.random.choice(order_ids) # Or None for general inquiry - - tickets_data.append({ - "ticket_id": f"TKT-{i:06d}", - "customer_id": cid, - "order_id": oid if np.random.random() > 0.3 else None, - # ... - }) - -tickets_pdf = pd.DataFrame(tickets_data) -``` - -### 3. 
Non-Linear Distributions - -**Never use uniform distributions** - real data is rarely uniform: - -```python -# BAD - Uniform (unrealistic) -prices = np.random.uniform(10, 1000, size=N_ORDERS) - -# GOOD - Log-normal (realistic for prices, salaries, order amounts) -prices = np.random.lognormal(mean=4.5, sigma=0.8, size=N_ORDERS) - -# GOOD - Pareto/power law (popularity, wealth, page views) -popularity = (np.random.pareto(a=2.5, size=N_PRODUCTS) + 1) * 10 - -# GOOD - Exponential (time between events, resolution time) -resolution_hours = np.random.exponential(scale=24, size=N_TICKETS) - -# GOOD - Weighted categorical -regions = np.random.choice( - ['North', 'South', 'East', 'West'], - size=N_CUSTOMERS, - p=[0.40, 0.25, 0.20, 0.15] -) -``` - -### 4. Time-Based Patterns - -Add weekday/weekend effects, holidays, seasonality, and event spikes: - -```python -import holidays - -# Load holiday calendar -US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) - -def get_daily_multiplier(date): - """Calculate volume multiplier for a given date.""" - multiplier = 1.0 - - # Weekend drop - if date.weekday() >= 5: - multiplier *= 0.6 - - # Holiday drop (even lower than weekends) - if date in US_HOLIDAYS: - multiplier *= 0.3 - - # Q4 seasonality (higher in Oct-Dec) - multiplier *= 1 + 0.15 * (date.month - 6) / 6 - - # Incident spike - if INCIDENT_START <= date <= INCIDENT_END: - multiplier *= 3.0 - - # Random noise - multiplier *= np.random.normal(1, 0.1) - - return max(0.1, multiplier) - -# Distribute tickets across dates with realistic patterns -date_range = pd.date_range(START_DATE, END_DATE, freq='D') -daily_volumes = [int(BASE_DAILY_TICKETS * get_daily_multiplier(d)) for d in date_range] -``` - -### 5. 
Row Coherence - -Attributes within a row should correlate logically: - -```python -def generate_ticket(customer_id, tier, date): - """Generate a coherent ticket where attributes correlate.""" - - # Priority correlates with tier - if tier == 'Enterprise': - priority = np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) - else: - priority = np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3]) - - # Resolution time correlates with priority - resolution_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} - resolution_hours = np.random.exponential(scale=resolution_scale[priority]) - - # CSAT correlates with resolution time - if resolution_hours < 4: - csat = np.random.choice([4, 5], p=[0.3, 0.7]) - elif resolution_hours < 24: - csat = np.random.choice([3, 4, 5], p=[0.2, 0.5, 0.3]) - else: - csat = np.random.choice([1, 2, 3, 4], p=[0.1, 0.3, 0.4, 0.2]) - - return { - "customer_id": customer_id, - "priority": priority, - "resolution_hours": round(resolution_hours, 1), - "csat_score": csat, - "created_at": date, - } -``` - -## Complete Example - -Save as `scripts/generate_data.py`: - -```python -"""Generate synthetic customer, order, and ticket data.""" -import numpy as np -import pandas as pd -from datetime import datetime, timedelta -from faker import Faker -import holidays -from pyspark.sql import SparkSession - -# ============================================================================= -# CONFIGURATION -# ============================================================================= -CATALOG = "my_catalog" -SCHEMA = "my_schema" -VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" - -N_CUSTOMERS = 2500 -N_ORDERS = 25000 -N_TICKETS = 8000 - -# Date range - last 6 months from today -END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) -START_DATE = END_DATE - timedelta(days=180) - -# Special events (within the date range) -INCIDENT_END = END_DATE - timedelta(days=21) -INCIDENT_START = 
INCIDENT_END - timedelta(days=10) - -# Holiday calendar -US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) - -SEED = 42 - -# ============================================================================= -# SETUP -# ============================================================================= -np.random.seed(SEED) -Faker.seed(SEED) -fake = Faker() -spark = SparkSession.builder.getOrCreate() - -# ============================================================================= -# CREATE INFRASTRUCTURE -# ============================================================================= -print(f"Creating catalog/schema/volume if needed...") -spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}") -spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") -spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") - -print(f"Generating: {N_CUSTOMERS:,} customers, {N_ORDERS:,} orders, {N_TICKETS:,} tickets") - -# ============================================================================= -# 1. 
CUSTOMERS (Master Table) -# ============================================================================= -print("Generating customers...") - -customers_pdf = pd.DataFrame({ - "customer_id": [f"CUST-{i:05d}" for i in range(N_CUSTOMERS)], - "name": [fake.company() for _ in range(N_CUSTOMERS)], - "tier": np.random.choice(['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1]), - "region": np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS, p=[0.4, 0.25, 0.2, 0.15]), -}) - -# ARR correlates with tier -customers_pdf["arr"] = customers_pdf["tier"].apply( - lambda t: round(np.random.lognormal(11, 0.5), 2) if t == 'Enterprise' - else round(np.random.lognormal(8, 0.6), 2) if t == 'Pro' else 0 -) - -# Lookups for foreign keys -customer_ids = customers_pdf["customer_id"].tolist() -customer_tier_map = dict(zip(customers_pdf["customer_id"], customers_pdf["tier"])) -tier_weights = customers_pdf["tier"].map({'Enterprise': 5.0, 'Pro': 2.0, 'Free': 1.0}) -customer_weights = (tier_weights / tier_weights.sum()).tolist() - -print(f" Created {len(customers_pdf):,} customers") - -# ============================================================================= -# 2. 
ORDERS (References Customers) -# ============================================================================= -print("Generating orders...") - -orders_data = [] -for i in range(N_ORDERS): - cid = np.random.choice(customer_ids, p=customer_weights) - tier = customer_tier_map[cid] - amount = np.random.lognormal(7 if tier == 'Enterprise' else 5 if tier == 'Pro' else 3.5, 0.7) - - orders_data.append({ - "order_id": f"ORD-{i:06d}", - "customer_id": cid, - "amount": round(amount, 2), - "status": np.random.choice(['completed', 'pending', 'cancelled'], p=[0.85, 0.10, 0.05]), - "order_date": fake.date_between(start_date=START_DATE, end_date=END_DATE), - }) - -orders_pdf = pd.DataFrame(orders_data) -print(f" Created {len(orders_pdf):,} orders") - -# ============================================================================= -# 3. TICKETS (References Customers, with incident spike) -# ============================================================================= -print("Generating tickets...") - -def get_daily_volume(date, base=25): - vol = base * (0.6 if date.weekday() >= 5 else 1.0) - if date in US_HOLIDAYS: - vol *= 0.3 # Even lower on holidays - if INCIDENT_START <= date <= INCIDENT_END: - vol *= 3.0 - return int(vol * np.random.normal(1, 0.15)) - -# Distribute tickets across dates -tickets_data = [] -ticket_idx = 0 -for day in pd.date_range(START_DATE, END_DATE): - daily_count = get_daily_volume(day.to_pydatetime()) - is_incident = INCIDENT_START <= day.to_pydatetime() <= INCIDENT_END - - for _ in range(daily_count): - if ticket_idx >= N_TICKETS: - break - - cid = np.random.choice(customer_ids, p=customer_weights) - tier = customer_tier_map[cid] - - # Category - Auth dominates during incident - if is_incident: - category = np.random.choice(['Auth', 'Network', 'Billing', 'Account'], p=[0.65, 0.15, 0.1, 0.1]) - else: - category = np.random.choice(['Auth', 'Network', 'Billing', 'Account'], p=[0.25, 0.30, 0.25, 0.20]) - - # Priority correlates with tier - priority = 
np.random.choice(['Critical', 'High', 'Medium'], p=[0.3, 0.5, 0.2]) if tier == 'Enterprise' \ - else np.random.choice(['Critical', 'High', 'Medium', 'Low'], p=[0.05, 0.2, 0.45, 0.3]) - - # Resolution time correlates with priority - res_scale = {'Critical': 4, 'High': 12, 'Medium': 36, 'Low': 72} - resolution = np.random.exponential(scale=res_scale[priority]) - - # CSAT degrades during incident for Auth - if is_incident and category == 'Auth': - csat = np.random.choice([1, 2, 3, 4, 5], p=[0.15, 0.25, 0.35, 0.2, 0.05]) - else: - csat = 5 if resolution < 4 else (4 if resolution < 12 else np.random.choice([2, 3, 4], p=[0.2, 0.5, 0.3])) - - tickets_data.append({ - "ticket_id": f"TKT-{ticket_idx:06d}", - "customer_id": cid, - "category": category, - "priority": priority, - "resolution_hours": round(resolution, 1), - "csat_score": csat, - "created_at": day.strftime("%Y-%m-%d"), - }) - ticket_idx += 1 - - if ticket_idx >= N_TICKETS: - break - -tickets_pdf = pd.DataFrame(tickets_data) -print(f" Created {len(tickets_pdf):,} tickets") - -# ============================================================================= -# 4. SAVE TO VOLUME -# ============================================================================= -print(f"\nSaving to {VOLUME_PATH}...") - -spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") -spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") -spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") - -print("Done!") - -# ============================================================================= -# 5. 
VALIDATION -# ============================================================================= -print("\n=== VALIDATION ===") -print(f"Tier distribution: {customers_pdf['tier'].value_counts(normalize=True).to_dict()}") -print(f"Avg order by tier: {orders_pdf.merge(customers_pdf[['customer_id', 'tier']]).groupby('tier')['amount'].mean().to_dict()}") - -incident_tickets = tickets_pdf[tickets_pdf['created_at'].between( - INCIDENT_START.strftime("%Y-%m-%d"), INCIDENT_END.strftime("%Y-%m-%d") -)] -print(f"Incident period tickets: {len(incident_tickets):,} ({len(incident_tickets)/len(tickets_pdf)*100:.1f}%)") -print(f"Incident Auth %: {(incident_tickets['category'] == 'Auth').mean()*100:.1f}%") -``` - -Execute using `run_python_file_on_databricks` tool: -- `file_path`: "scripts/generate_data.py" - -If it fails, edit the file and re-run with the same `cluster_id` and `context_id`. - -### Validate Generated Data - -After successful execution, use `get_volume_folder_details` tool to verify the generated data: -- `volume_path`: "my_catalog/my_schema/raw_data/customers" -- `format`: "parquet" -- `table_stat_level`: "SIMPLE" - -This returns schema, row counts, and column statistics to confirm the data was written correctly. - -## Best Practices - -1. **Ask for schema**: Default to `ai_dev_kit` catalog, ask user for schema name -2. **Create infrastructure**: Use `CREATE CATALOG/SCHEMA/VOLUME IF NOT EXISTS` -3. **Raw data only**: No `total_x`, `sum_x`, `avg_x` fields - SDP pipeline computes those -4. **Save to Volume, not tables**: Write parquet to `/Volumes/{catalog}/{schema}/raw_data/` -5. **Configuration at top**: All sizes, dates, and paths as variables -6. **Dynamic dates**: Use `datetime.now() - timedelta(days=180)` for last 6 months -7. **Pandas for generation**: Faster and easier than Spark for row-by-row logic -8. **Master tables first**: Generate customers, then orders reference customer_ids -9. **Weighted sampling**: Enterprise customers generate more activity -10. 
**Distributions**: Log-normal for values, exponential for times, weighted categorical -11. **Time patterns**: Weekday/weekend, holidays, seasonality, event spikes -12. **Row coherence**: Priority affects resolution time affects CSAT -13. **Volume for aggregation**: 10K-50K rows minimum so patterns survive GROUP BY -14. **Always use files**: Write to local file, execute, edit if error, re-execute -15. **Context reuse**: Pass `cluster_id` and `context_id` for faster iterations -16. **Libraries**: Install `faker` and `holidays` first; most others are pre-installed - -## Related Skills - -- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - for building bronze/silver/gold pipelines on top of generated data -- **[databricks-aibi-dashboards](../databricks-aibi-dashboards/SKILL.md)** - for visualizing the generated data in dashboards -- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - for managing catalogs, schemas, and volumes where data is stored diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 553eba97..30f34e3d 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -110,7 +110,7 @@ mcp__databricks__execute_sql( - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - for pipelines that write to Unity Catalog tables - **[databricks-jobs](../databricks-jobs/SKILL.md)** - for job execution data visible in system tables -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - for generating data stored in Unity Catalog Volumes +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - for generating data stored in Unity Catalog Volumes - **[databricks-aibi-dashboards](../databricks-aibi-dashboards/SKILL.md)** - for building dashboards on top of Unity 
Catalog data ## Resources diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 7666f21b..ee9abf05 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -190,5 +190,5 @@ AZURE_OPENAI_DEPLOYMENT=gpt-4o - **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Create Knowledge Assistants that ingest the generated PDFs - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Index generated documents for semantic search and RAG -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate structured tabular data (complement to unstructured PDFs) +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate structured tabular data (complement to unstructured PDFs) - **[databricks-mlflow-evaluation](../databricks-mlflow-evaluation/SKILL.md)** - Evaluate RAG systems using the generated question/guideline pairs diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md index efd52b0d..e3d3f48a 100644 --- a/databricks-skills/databricks-zerobus-ingest/SKILL.md +++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md @@ -218,7 +218,7 @@ The timestamp generation must use microseconds for Databricks. 
- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - General SDK patterns and WorkspaceClient for table/schema management - **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Downstream pipeline processing of ingested data - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Managing catalogs, schemas, and tables that Zerobus writes to -- **[databricks-synthetic-data-generation](../databricks-synthetic-data-generation/SKILL.md)** - Generate test data to feed into Zerobus producers +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate test data to feed into Zerobus producers - **[databricks-config](../databricks-config/SKILL.md)** - Profile and authentication setup ## Resources diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index ff8d9b86..763489c8 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -42,7 +42,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills" MLFLOW_REPO_REF="main" # Databricks skills (hosted in this repo) -DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config 
databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -76,7 +76,7 @@ get_skill_description() { "databricks-spark-declarative-pipelines") echo "Spark Declarative Pipelines (SDP/LDP/DLT)" ;; "spark-python-data-source") echo "Spark custom Python data sources" ;; "databricks-spark-structured-streaming") echo "Spark Structured Streaming patterns and best practices" ;; - "databricks-synthetic-data-generation") echo "Synthetic test data generation" ;; + "databricks-synthetic-data-gen") echo "Synthetic test data generation" ;; "databricks-unstructured-pdf-generation") echo "Generate synthetic PDFs for RAG" ;; "databricks-vector-search") echo "Vector Search - endpoints, indexes, and queries for RAG" ;; "databricks-zerobus-ingest") echo "Zerobus Ingest - gRPC data ingestion into Delta tables" ;; diff --git a/install.ps1 b/install.ps1 index f144b5ac..38e4a2a0 100644 --- a/install.ps1 +++ b/install.ps1 @@ -78,11 +78,11 @@ $script:ProfileProvided = $false $script:Skills = @( "databricks-agent-bricks", "databricks-aibi-dashboards", "databricks-app-apx", "databricks-app-python", "databricks-asset-bundles", "databricks-config", "databricks-dbsql", "databricks-docs", "databricks-genie", - "databricks-jobs", "databricks-metric-views", "databricks-model-serving", 
"databricks-python-sdk", - "databricks-unity-catalog", "databricks-vector-search", "databricks-zerobus-ingest", - "databricks-lakebase-autoscale", "databricks-lakebase-provisioned", "databricks-mlflow-evaluation", - "databricks-spark-declarative-pipelines", "spark-python-data-source", "databricks-spark-structured-streaming", - "databricks-synthetic-data-generation", "databricks-unstructured-pdf-generation" + "databricks-iceberg", "databricks-jobs", "databricks-lakebase-autoscale", "databricks-lakebase-provisioned", + "databricks-metric-views", "databricks-mlflow-evaluation", "databricks-model-serving", "databricks-parsing", + "databricks-python-sdk", "databricks-spark-declarative-pipelines", "databricks-spark-structured-streaming", + "databricks-synthetic-data-gen", "databricks-unity-catalog", "databricks-unstructured-pdf-generation", + "databricks-vector-search", "databricks-zerobus-ingest", "spark-python-data-source" ) # MLflow skills (fetched from mlflow/skills repo) diff --git a/install.sh b/install.sh index c347b13e..61b98d42 100755 --- a/install.sh +++ b/install.sh @@ -74,7 +74,7 @@ MIN_SDK_VERSION="0.85.0" G='\033[0;32m' Y='\033[1;33m' R='\033[0;31m' BL='\033[0;34m' B='\033[1m' D='\033[2m' N='\033[0m' # Databricks skills (bundled in repo) -SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-app-apx databricks-app-python databricks-asset-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-app-apx databricks-app-python 
databricks-asset-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs"