bbglab · FerriolCalvet · Apr 10, 2026 · Apr 3, 2025 · May 8, 2025 · May 14, 2025
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,5 @@ ste_notes.txt
 assets/HDP_files*
 scratch/
 scratchhhh/
+tests/test_data/all_samples.somatic.mutations.maf
+tests/test_data/all_samples_indv.depths.tsv.gz
diff --git a/bin/annotate_omega_failing.py b/bin/annotate_omega_failing.py
@@ -271,8 +271,26 @@ def main(omegas_file: str, compiled_flagged_files: str, output: str) -> None:
         lines = [ln.strip() for ln in fh if ln.strip()]
     flagged_paths = [Path(l) for l in lines]
 
+    # Read omegas with resilience to missing header lines
+    # Some aggregation steps may drop the header; if so, re-read with explicit names
+    def _read_omegas(path: Path) -> pd.DataFrame:
+        """Read the omegas TSV as strings, tolerating an empty file or a dropped header."""
+        columns = ["gene","sample","impact","mutations","dnds","pvalue","lower","upper"]
+        read_opts = {"sep": "\t", "dtype": str, "skip_blank_lines": True}
+        try:
+            df = pd.read_csv(path, header=0, **read_opts)
+        except pd.errors.EmptyDataError:
+            # Nothing parsable in the file: hand back an empty frame with the expected columns
+            return pd.DataFrame(columns=columns)
+        # If any expected column is absent, the first row is taken to be data rather than
+        # a header, so re-read every row as data and supply the column names explicitly
+        if not set(columns).issubset(map(str, df.columns)):
+            df = pd.read_csv(path, header=None, names=columns, **read_opts)
+        # Missing cells become empty strings instead of NaN
+        return df.fillna("")
+
     # Read omegas
-    omegas = pd.read_csv(omegas_path, sep="\t", header=0, dtype=str).fillna("")
+    omegas = _read_omegas(omegas_path)
 
     syn_flagged_sample, syn_flagged_gene, npa_flagged_sample, npa_flagged_gene = load_flagged_tables(flagged_paths)
 

diff --git a/bin/create_consensus_panel.py b/bin/create_consensus_panel.py
@@ -53,6 +53,9 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
     #####
     # Filter failing columns only for rows that pass the compliance threshold
     compliance_df_passing = compliance_df.filter(passing_rows)
+
+    print(f"DEBUG: Total positions passing compliance threshold: {compliance_df_passing.height}")
+    print(f"DEBUG: Number of samples: {compliance_df_passing.width}")
 
     # Invert all boolean values (True → False, False → True)
     failing_mask = pl.DataFrame([
@@ -70,6 +73,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
                     "Failed": True
                 })
 
+    print(f"DEBUG: Total failing entries found: {len(failing_columns_counts)}")
 
     if failing_columns_counts:
         failing_columns_counts_df = pl.DataFrame(failing_columns_counts)
@@ -79,6 +83,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
             .rename({"count": "FAILING_COUNT"})
         )
         failure_counts_filtered.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
+        print(f"DEBUG: Created failing_consensus.{version}.tsv with {failure_counts_filtered.height} samples")
 
 
 @click.command()

diff --git a/bin/create_panel_versions.py b/bin/create_panel_versions.py
@@ -1,14 +1,20 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
+"""
+create_panel_versions.py
 
-import click
-import pandas as pd
-import os
+Generates multiple VEP annotation panel subsets based on the 'IMPACT' column
+using the high-performance Polars library.
+
+Usage:
+    python create_panel_versions.py --compact-annot-panel-path <input_tsv> --output <output_prefix>
+"""
 
-# TODO: check pandas version 2.0.3
-# -- Auxiliary functions -- #
+import polars as pl
+import click
+import sys
 
-panel_impact_dict = {
+PANEL_IMPACT_DICT = {
 
     "protein_affecting": ["nonsense", "missense",
                             "essential_splice",
@@ -68,25 +74,33 @@
 
 }
 
-# -- Main function -- #
 
-def create_panel_versions(compact_annot_panel_path, output_path):
+def create_panel_versions(input_path: str, output_prefix: str) -> None:
+    """
+    Write one TSV per panel version, filtering rows by the 'IMPACT' column.
+
+    Reads the tab-separated file at input_path with Polars and exits with
+    status 1 if it cannot be read or has no 'IMPACT' column. Each output is
+    written to '<output_prefix>.<version>.tsv', plus '<output_prefix>.all.tsv'.
+    """
+    try:
+        df = pl.read_csv(input_path, separator="\t")
+    except Exception as e:
+        click.echo(f"Error reading input file: {e}", err=True)
+        sys.exit(1)
 
-    # Load VEP annotated panel, already compacted to have one variant per site
-    ## requires column named IMPACT with consequence type
-    compact_annot_panel_df = pd.read_csv(compact_annot_panel_path, sep = "\t")
+    if "IMPACT" not in df.columns:
+        click.echo("ERROR: 'IMPACT' column not found in input file.", err=True)
+        sys.exit(1)
 
-    # Create panel versions
-    for version in panel_impact_dict:
+    for version_name, impact_values in PANEL_IMPACT_DICT.items():
+        filtered = df.filter(pl.col("IMPACT").is_in(impact_values))
+        filtered.write_csv(f"{output_prefix}.{version_name}.tsv", separator="\t")
 
-        panel_version = compact_annot_panel_df.loc[compact_annot_panel_df["IMPACT"].isin(panel_impact_dict[version])]
-        panel_version.to_csv(f"{output_path}.{version}.tsv",
-                                sep = "\t", index = False)
+    # Also write the unfiltered table as the "all" version
+    df.write_csv(f"{output_prefix}.all.tsv", separator="\t")
 
-    # Store complete panel (better change this way of using this version in nextflow)
-    version = "all"
-    compact_annot_panel_df.to_csv(f"{output_path}.{version}.tsv",
-                                    sep = "\t", index = False)
+    click.echo("Panel versions generated successfully.")
 
 
 @click.command()