Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
035a0c7
dev: VEP chunk and VEP cache beegfs
migrau Apr 3, 2025
8ef2919
fix: use standard cache for ENSEMBLVEP_VEP
migrau May 8, 2025
40bb507
perf: improve VEP performance by converting input format
migrau May 14, 2025
bb21b25
fix: panel_postprocessing_annotation.py
migrau May 14, 2025
7c73d3b
fix: arguments safe_transform_context
migrau May 16, 2025
276152d
perf: chunking panel_custom_processing.py
migrau May 20, 2025
7bc3a16
perf: CREATECAPTUREDPANELS containers edited. create_panel_versions.p…
migrau May 22, 2025
346665d
fix: python3 container for CREATECAPTUREDPANELS
migrau Jun 4, 2025
08d8fad
fix: remove container option CREATECAPTUREDPANELS. fix conda versions…
migrau Jun 4, 2025
5c8ff55
fix: typo CREATECAPTUREDPANELS
migrau Jun 4, 2025
891ec85
fix: wave true only for CREATECAPTUREDPANELS
migrau Jun 4, 2025
e1fd6af
fix: syntax config module CREATECAPTUREDPANELS
migrau Jun 5, 2025
ca0ae01
fix: new way to specify wave for a single process
migrau Jun 5, 2025
5560c25
fix: toString added for wave
migrau Jun 5, 2025
c0c3e97
fix: wave label added
migrau Jun 5, 2025
24efcf6
fix: wave true for everything
migrau Jun 5, 2025
7734938
fix: wave false except CREATECAPTUREDPANELS
migrau Jun 5, 2025
b625332
fix: comma...
migrau Jun 5, 2025
8110a34
fix: wave removed. New container created
migrau Jun 5, 2025
e718e41
fix: Removed wave from nextflow.config
migrau Jun 6, 2025
9fd0ed7
fix: adjust memory requeriments
migrau Jun 30, 2025
abc85ed
perf: added new profile, nanoseq
migrau Jun 30, 2025
3e0b4b5
fix: naming withLabel config review
migrau Jul 1, 2025
61ec864
fix: nanoseq config resourceLimits
migrau Jul 1, 2025
0188172
fix: correct withName *
migrau Jul 1, 2025
b0e422a
fix: SITESFROMPOSITIONS memory test
migrau Jul 1, 2025
63dcea7
fix SITESFROMPOSITIONS
migrau Jul 1, 2025
7c2f56b
fix: SITESFROMPOSITIONS
migrau Jul 1, 2025
6e53f23
fix: fix profile
migrau Jul 1, 2025
e9d1b3b
fix: SITESFROMPOSITIONS config
migrau Jul 1, 2025
1dffd94
fix: POSTPROCESSVEPPANEL. Time
migrau Jul 2, 2025
24b170a
fix: RESOURCE LIMITS added
migrau Jul 3, 2025
d243ebc
fix: typo
migrau Jul 3, 2025
945c129
fix: update base.config
migrau Jul 3, 2025
198ff20
fix: adjust nanoconfig
migrau Jul 3, 2025
0cfd80f
Merge branch 'dev' into dev-chunk-optimization-POSTPROCESSVEPPANEL
migrau Nov 14, 2025
6c64f4d
fix: parallelization optional. Include sort for bedtools merge
migrau Nov 14, 2025
b2f12fd
fix: gene omega error: "No flagged entries found; skipping plots and …
migrau Nov 16, 2025
d4ed3c2
fix: Add debug logging and ensure failing_consensus file is always cr…
migrau Nov 18, 2025
4be3b45
feat: Add chunking support for SITESFROMPOSITIONS with genomic sorting
migrau Nov 19, 2025
e52cb76
feat: add parallel_processing_parameters section to schema for chunki…
migrau Nov 19, 2025
92580ce
update dnds genes list
FerriolCalvet Nov 21, 2025
485978f
Merge branch 'dev' into dev-chunk-optimization-POSTPROCESSVEPPANEL. S…
migrau Mar 18, 2026
de0463c
fix: Refactor VEP annotation processing: revert chunking from panel_…
migrau Mar 18, 2026
3179764
feat: Review Ferriol comments. custom processing of the panel fixed…
migrau Mar 18, 2026
09a6f0c
feat: Add mutation-specific QC plotting options and update tests
migrau Mar 19, 2026
e40939a
add: Mutation density test
migrau Mar 19, 2026
e111a7e
refactor: Replace pandas with polars for improved performance and rem…
migrau Mar 20, 2026
85dcfb9
Remove custom processing chunk size parameter from nextflow configura…
migrau Mar 22, 2026
73fbda3
feat: Replace nanoseq configuration with exome configuration for impr…
migrau Mar 28, 2026
d0c6176
feat: Optimize resource allocation for panel creation processes and a…
migrau Mar 30, 2026
fdbbdb5
fix: Correct typo in process name for panel comparison configuration
migrau Apr 2, 2026
c503b8e
feat: Update panel_sites_chunk_size to 1,000,000 for improved chunkin…
migrau Apr 8, 2026
3ecfb18
feat: Add Protein_position to schema overrides for VEP output file lo…
migrau Apr 8, 2026
835656f
feat: Update resource allocation for analysis processes to optimize p…
migrau Apr 9, 2026
330690e
Update Nextflow configuration and add input MAF test data
migrau Apr 10, 2026
a089e9a
Merge branch 'dev' into dev-chunk-optimization-POSTPROCESSVEPPANEL n
migrau Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ ste_notes.txt
assets/HDP_files*
scratch/
scratchhhh/
tests/test_data/all_samples.somatic.mutations.maf
tests/test_data/all_samples_indv.depths.tsv.gz
20 changes: 19 additions & 1 deletion bin/annotate_omega_failing.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,26 @@ def main(omegas_file: str, compiled_flagged_files: str, output: str) -> None:
lines = [ln.strip() for ln in fh if ln.strip()]
flagged_paths = [Path(l) for l in lines]

# Read omegas with resilience to missing header lines
# Some aggregation steps may drop the header; if so, re-read with explicit names
def _read_omegas(path: Path) -> pd.DataFrame:
try:
df = pd.read_csv(path, sep="\t", header=0, dtype=str, skip_blank_lines=True)
except pd.errors.EmptyDataError:
return pd.DataFrame(columns=["gene","sample","impact","mutations","dnds","pvalue","lower","upper"]) # empty
# If expected columns are missing (e.g., header was dropped), re-read with names
expected = {"gene","sample","impact","mutations","dnds","pvalue","lower","upper"}
if not expected.issubset(set(map(str, df.columns))):
df = pd.read_csv(path,
sep="\t",
header=None,
names=["gene","sample","impact","mutations","dnds","pvalue","lower","upper"],
dtype=str,
skip_blank_lines=True)
return df.fillna("")

# Read omegas
omegas = pd.read_csv(omegas_path, sep="\t", header=0, dtype=str).fillna("")
omegas = _read_omegas(omegas_path)

syn_flagged_sample, syn_flagged_gene, npa_flagged_sample, npa_flagged_gene = load_flagged_tables(flagged_paths)

Expand Down
5 changes: 5 additions & 0 deletions bin/create_consensus_panel.py
Comment thread
migrau marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
#####
# Filter failing columns only for rows that pass the compliance threshold
compliance_df_passing = compliance_df.filter(passing_rows)

print(f"DEBUG: Total positions passing compliance threshold: {compliance_df_passing.height}")
print(f"DEBUG: Number of samples: {compliance_df_passing.width}")

# Invert all boolean values (True → False, False → True)
failing_mask = pl.DataFrame([
Expand All @@ -70,6 +73,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
"Failed": True
})

print(f"DEBUG: Total failing entries found: {len(failing_columns_counts)}")

if failing_columns_counts:
failing_columns_counts_df = pl.DataFrame(failing_columns_counts)
Expand All @@ -79,6 +83,7 @@ def create_consensus_panel(compact_annot_panel_path, depths_path, version, conse
.rename({"count": "FAILING_COUNT"})
)
failure_counts_filtered.write_csv(f"failing_consensus.{version}.tsv", separator="\t")
print(f"DEBUG: Created failing_consensus.{version}.tsv with {failure_counts_filtered.height} samples")


@click.command()
Expand Down
56 changes: 35 additions & 21 deletions bin/create_panel_versions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
#!/usr/bin/env python
#!/usr/bin/env python3

"""
create_panel_versions.py

import click
import pandas as pd
import os
Generates multiple VEP annotation panel subsets based on the 'IMPACT' column
using the high-performance Polars library.

Usage:
python create_panel_versions.py --compact-annot-panel-path <input_tsv> --output <output_prefix>
"""

# TODO: check pandas version 2.0.3
# -- Auxiliary functions -- #
import polars as pl
import click
import sys

panel_impact_dict = {
PANEL_IMPACT_DICT = {

"protein_affecting": ["nonsense", "missense",
"essential_splice",
Expand Down Expand Up @@ -68,25 +74,33 @@

}

# -- Main function -- #

def create_panel_versions(compact_annot_panel_path, output_path):
def create_panel_versions(input_path: str, output_prefix: str) -> None:
"""
Generates panel subsets from a VEP-annotated file using Polars.

\b
INPUT_PATH: Path to the annotated TSV file.
OUTPUT_PREFIX: Prefix for the output files (e.g., 'output/panel').
"""
try:
df = pl.read_csv(input_path, separator="\t")
except Exception as e:
click.echo(f"Error reading input file: {e}", err=True)
sys.exit(1)

# Load VEP annotated panel, already compacted to have one variant per site
## requires column named IMPACT with consequence type
compact_annot_panel_df = pd.read_csv(compact_annot_panel_path, sep = "\t")
if "IMPACT" not in df.columns:
click.echo("ERROR: 'IMPACT' column not found in input file.", err=True)
sys.exit(1)

# Create panel versions
for version in panel_impact_dict:
for version_name, impact_values in PANEL_IMPACT_DICT.items():
filtered = df.filter(pl.col("IMPACT").is_in(impact_values))
filtered.write_csv(f"{output_prefix}.{version_name}.tsv", separator="\t")

panel_version = compact_annot_panel_df.loc[compact_annot_panel_df["IMPACT"].isin(panel_impact_dict[version])]
panel_version.to_csv(f"{output_path}.{version}.tsv",
sep = "\t", index = False)
# Write the full file as a version
df.write_csv(f"{output_prefix}.all.tsv", separator="\t")

# Store complete panel (better change this way of using this version in nextflow)
version = "all"
compact_annot_panel_df.to_csv(f"{output_path}.{version}.tsv",
sep = "\t", index = False)
click.echo("Panel versions generated successfully.")


@click.command()
Expand Down
Loading