bbglab · FerriolCalvet · Mar 6, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
diff --git a/README.md b/README.md
@@ -74,12 +74,14 @@ We are working to provide the biggest possible detail on the [usage](docs/usage.
 
 > **Sex and smoking bias in the selection of somatic mutations in human bladder**
 >
-> Ferriol Calvet, Raquel Blanco Martinez-Illescas, Ferran Muiños, Maria Tretiakova, Elena S. Latorre-Esteves, Jeanne Fredrickson, Maria Andrianova, Stefano Pellegrini, Axel Rosendahl Huber, Joan Enric Ramis-Zaldivar, Shuyi Charlotte An, Elana Thieme, Brendan F. Kohrn, Miguel L. Grau, Abel Gonzalez-Perez, Nuria Lopez-Bigas & Rosa Ana Risques
+> Ferriol Calvet*, Raquel Blanco Martinez-Illescas*, Ferran Muiños, Maria Tretiakova, Elena S. Latorre-Esteves, Jeanne Fredrickson, Maria Andrianova, Stefano Pellegrini, Axel Rosendahl Huber, Joan Enric Ramis-Zaldivar, Shuyi Charlotte An, Elana Thieme, Brendan F. Kohrn, Miguel L. Grau, Abel Gonzalez-Perez, Nuria Lopez-Bigas & Rosa Ana Risques
 >
 >_Nature_ (2025) doi:[10.1038/s41586-025-09521-x](https://doi.org/10.1038/s41586-025-09521-x)
+>
+> *these authors contributed equally and the order was decided randomly
 
 > **DeepClone, an end-to-end protocol to study somatic mutagenesis and selection at high resolution**
 >
-> Ferriol Calvet, Morena Pinheiro-Santin, Erika Lopez,Raquel Blanco Martinez-Illescas, Núria Samper, Miguel L. Grau, Ferran Muiños,Rocío Chamorro González, Maria Andrianova, Federica Brando,Stefano Pellegrini, Marta Huertas, Elisabet Figuerola-Bou,Coohleen Coombes,Brendan F. Kohrn, Jeanne Fredrickson, Rosa Ana Risques, Nuria Lopez-Bigas, Abel Gonzalez-Perez
+> Ferriol Calvet, Morena Pinheiro-Santin, Erika Lopez, Raquel Blanco Martinez-Illescas, Núria Samper, Miguel L. Grau, Ferran Muiños, Rocío Chamorro González, Maria Andrianova, Federica Brando, Stefano Pellegrini, Marta Huertas, Elisabet Figuerola-Bou, Coohleen Coombes, Brendan F. Kohrn, Jeanne Fredrickson, Rosa Ana Risques, Nuria Lopez-Bigas, Abel Gonzalez-Perez
 >
 > _protocols.io_ (2026) doi:[10.17504/protocols.io.dm6gp1jodgzp/v2](https://dx.doi.org/10.17504/protocols.io.dm6gp1jodgzp/v2)
diff --git a/bin/concat_sigprot_matrices.py b/bin/concat_sigprot_matrices.py
@@ -5,6 +5,14 @@
 import json
 
 
+# identify samples without mutations
+def remove_zero_mutation_samples(matrix):
+    zero_col = matrix.columns[matrix.sum(axis=0) == 0]
+    if len(zero_col) > 0:
+        print(f"Excluding samples without mutations: {list(zero_col)}")
+        matrix = matrix.drop(columns=zero_col)
+    return matrix
+
 def concat_sigprot_matrices(filename_of_matrices, samples_json_file, type_of_profile):
     """
     Concatenate signature profile matrices for samples and groups.
@@ -41,8 +49,11 @@ def concat_sigprot_matrices(filename_of_matrices, samples_json_file, type_of_pro
 
     # Save the groups matrix if it exists
     if groups_matrix is not None and groups_matrix.shape[0] > 0:
+        groups_matrix = remove_zero_mutation_samples(groups_matrix)
+
         groups_matrix.reset_index().to_csv(f"groups_matrix.{type_of_profile}.sp.tsv", sep='\t', header=True, index=False)
-        groups_matrix.round().astype(int).reset_index().to_csv(f"groups_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)
+        rounded_groups_matrix = remove_zero_mutation_samples(groups_matrix.round().astype(int))
+        rounded_groups_matrix.reset_index().to_csv(f"groups_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)
 
         # HDP
         groups_matrix = groups_matrix.transpose()
@@ -55,8 +66,11 @@ def concat_sigprot_matrices(filename_of_matrices, samples_json_file, type_of_pro
 
     # Save the samples matrix if it exists
     if samples_only_matrix is not None and samples_only_matrix.shape[0] > 0:
+        samples_only_matrix = remove_zero_mutation_samples(samples_only_matrix)
+
         samples_only_matrix.reset_index().to_csv(f"samples_matrix.{type_of_profile}.sp.tsv", sep='\t', header=True, index=False)
-        samples_only_matrix.round().astype(int).reset_index().to_csv(f"samples_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)
+        rounded_samples_only_matrix = remove_zero_mutation_samples(samples_only_matrix.round().astype(int))
+        rounded_samples_only_matrix.reset_index().to_csv(f"samples_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)
 
         # HDP
         samples_only_matrix = samples_only_matrix.transpose()

diff --git a/bin/mut_profile.py b/bin/mut_profile.py
@@ -98,6 +98,72 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method
                                             sep = "\t")
 
 
+
+def profile_stability(counts, denominator):
+    """
+    Compute stability score of a probability profile by
+    adding +1 to each channel individually and measuring
+    L1 (Total Variation) deviation.
+
+    Parameters
+    ----------
+    counts : array-like, shape (k,)
+        Raw integer counts per channel.
+    denominator : array-like, shape (k,)
+        Denominator for each channel (e.g., trinucleotide counts).
+
+    Returns
+    -------
+    results : dict
+        {
+            'mean_deviation': float,
+            'max_deviation': float,
+            'std_deviation': float,
+            'all_deviations': np.ndarray shape (k,)
+        }
+    """
+
+    # Convert to numpy arrays
+    counts = np.asarray(counts, dtype=float)
+    denominator = np.asarray(denominator, dtype=float)
+
+    if counts.ndim != 1:
+        raise ValueError("counts must be a 1D vector")
+    if denominator.ndim != 1:
+        raise ValueError("denominator must be a 1D vector")
+    if len(counts) != len(denominator):
+        raise ValueError("counts and denominator must have same length")
+    if np.any(denominator == 0):
+        raise ValueError("denominator contains zeros")
+
+    k = len(counts)
+
+    # ---- Baseline profile ----
+    ref_profile = counts / denominator
+    ref_profile = ref_profile / ref_profile.sum()
+    p = ref_profile
+
+    deviations = np.zeros(k)
+
+    # ---- Perturb each channel ----
+    for j in range(k):
+        perturbed = counts.copy()
+        perturbed[j] += 1
+
+        p_prime = perturbed / denominator
+        p_prime = p_prime / p_prime.sum()  # IMPORTANT: renormalize
+
+        deviations[j] = 0.5 * np.sum(np.abs(p - p_prime))
+
+    return {
+        "mean_deviation": deviations.mean(),
+        "min_deviation": deviations.min(),
+        "max_deviation": deviations.max(),
+        "std_deviation": deviations.std(),
+        "all_deviations": deviations.tolist()
+    }
+
+
 def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_counts_file, plot,
                                 wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False):
     """
@@ -116,8 +182,9 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
     total_mutations = np.sum(mutation_matrix[sample_name])
 
     # proportion of SBS mutations per trinucleotide in panel
-    mutation_matrix[sample_name] = mutation_matrix[sample_name] / total_mutations
-    mutation_matrix.to_csv(f"{sample_name}.proportion_mutations.tsv",
+    mutation_matrix_proportions = mutation_matrix.copy()
+    mutation_matrix_proportions[sample_name] = mutation_matrix_proportions[sample_name] / total_mutations
+    mutation_matrix_proportions.to_csv(f"{sample_name}.proportion_mutations.tsv",
                                 header = True,
                                 index = True,
                                 sep = "\t")
@@ -134,6 +201,13 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
     contextmut_depth = pd.DataFrame({sample_name : trinuc_per_contextmuts, "CONTEXT_MUT": contexts_formatted})
     contextmut_depth = contextmut_depth.set_index("CONTEXT_MUT")
 
+    # compute profile stability and store the outputs in a file
+    stability_output_dict = profile_stability(mutation_matrix[sample_name], contextmut_depth[sample_name])
+    stability_outputs = pd.DataFrame(stability_output_dict.items()).set_index(0).T
+    stability_outputs[["SAMPLE_ID", "mode"]] = sample_name.split(".")
+    stability_outputs[["SAMPLE_ID", "mode"] + list(stability_outputs.columns[:-2])].to_csv(f"{sample_name}.profile_stability.tsv", sep = "\t", index = False)
+
+
     # divide
     mut_probability = mutation_matrix.divide( contextmut_depth )
 
@@ -171,7 +245,7 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
                         output_f = f'{sample_name}.profile.pdf')
 
         # plot the profile as a percentage of SBS mutations seen in our sequenced panel
-        plot_profile(dict(zip(mutation_matrix.index, mutation_matrix[sample_name])),
+        plot_profile(dict(zip(mutation_matrix_proportions.index, mutation_matrix_proportions[sample_name])),
                         title=f'{sample_name} ({round(total_mutations)} muts)',
                         output_f = f'{sample_name}.profile.percentage.pdf')
 

diff --git a/bin/panel_postprocessing_annotation.py b/bin/panel_postprocessing_annotation.py
@@ -4,15 +4,13 @@
 import pandas as pd
 import numpy as np
 from itertools import product
-from bgreference import hg38, hg19, mm10, mm39
+from bgreference import hg38, mm39
 from utils_context import transform_context
 from utils_impacts import *
 from read_utils import custom_na_values
 
 assembly_name2function = {
     "hg38": hg38,
-    "hg19": hg19,
-    "mm10": mm10,
     "mm39": mm39
 }
 
@@ -199,7 +197,7 @@ def vep2summarizedannotation_panel(VEP_output_file, all_possible_sites_annotated
 
 @click.command()
 @click.option('--vep_output_file', type=click.Path(exists=True), required=True, help='Path to the VEP output file.')
-@click.option('--assembly', type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), default='hg38', help='Genome assembly.')
+@click.option('--assembly', type=click.Choice(['hg38', 'mm39']), default='hg38', help='Genome assembly.')
 @click.option('--output_file', type=click.Path(), required=True, help='Path to the output annotated file.')
 @click.option('--only_canonical', is_flag=True, default=False, help='Use only canonical transcripts.')
 def main(vep_output_file, assembly, output_file, only_canonical):

diff --git a/bin/postprocessing_annotation.py b/bin/postprocessing_annotation.py
@@ -4,16 +4,14 @@
 import click
 import pandas as pd
 
-from bgreference import hg38, hg19, mm10, mm39
+from bgreference import hg38, mm39
 
 from utils import vartype
 from utils_context import transform_context
 from utils_impacts import *
 from read_utils import custom_na_values
 
 assembly_name2function = {"hg38": hg38,
-                            "hg19": hg19,
-                            "mm10": mm10,
                             "mm39": mm39}
 
 
@@ -245,7 +243,7 @@ def vep2summarizedannotation(VEP_output_file, all_possible_sites_annotated_file,
 @click.command()
 @click.argument('vep_output_file', type=click.Path(exists=True))
 @click.argument('all_possible_sites_annotated_file', type=click.Path())
-@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Reference genome assembly name')
+@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'mm39']), help='Reference genome assembly name')
 @click.option('--all-underscore', is_flag=True, default=False, show_default=True, help='Whether to use _ to separate all parts of MUT_ID (default: False)')
 @click.option('--hotspots-annotation-file', default=None, type=click.Path(exists=False), help='Path to hotspots annotation file')
 @click.option('--gnomad-af-threshold', default=0.001, show_default=True, type=float, help='gnomAD allele frequency threshold')

diff --git a/bin/sites_table_from_positions.py b/bin/sites_table_from_positions.py
@@ -4,11 +4,9 @@
 import click
 import pandas as pd
 
-from bgreference import hg38, hg19, mm10, mm39
+from bgreference import hg38, mm39
 
 assembly_name2function = {"hg38": hg38,
-                            "hg19": hg19,
-                            "mm10": mm10,
                             "mm39": mm39
                             }
 
@@ -49,7 +47,6 @@ def generate_all_sites_4VEP(input_positions, genome, output_file_with_sites):
 
 @click.command()
 @click.option('--input-positions', required=True, type=click.Path(exists=True), help='Input positions file (TSV)')
-#@click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Genome assembly (hg38, hg19, mm10, mm39)')
 @click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'mm39']), help='Genome assembly (hg38, mm39)')
 @click.option('--output-file-with-sites', required=True, type=click.Path(), help='Output file for sites (TSV)')
 def main(input_positions, genome_assembly, output_file_with_sites):

diff --git a/bin/utils_context.py b/bin/utils_context.py
@@ -1,4 +1,4 @@
-from bgreference import hg38, hg19, mm10, mm39
+from bgreference import hg38, mm39
 
 from itertools import product
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -112,7 +112,7 @@ process {
     }
 
 
-    withName: QUERYMUTATIONS {
+    withName: 'QUERYMUTATIONS|QUERYMUTATIONSEXONS' {
         ext.prefix    = { ".subset_mutations" }
         ext.args      = ''
         ext.args2     = '-s 1 -b 2 -e 2'
@@ -245,7 +245,7 @@ process {
             ],
             [
                 mode: params.publish_dir_mode,
-                path: { "${params.outdir}/flagged_positions" },
+                path: { "${params.outdir}/processing_files/flagged_positions" },
                 pattern: "*{bed}",
             ]
         ]
@@ -557,6 +557,14 @@ process {
                             --seed 123"
     }
 
+    withName: 'SIGPROFILERASSIGNMENT|HDPREASSIGNMENT|SIGPROMATRIXGENERATOR' {
+        ext.genome_assembly = params.vep_genome == 'GRCh38'
+            ? 'GRCh38'
+            : params.vep_genome == 'GRCm39'
+                ? 'mm39'
+                : null
+    }
+
     withName: SIGPROFILERASSIGNMENT {
         publishDir = [
             [
@@ -567,20 +575,32 @@ process {
         ]
     }
 
+    withName: HDPREASSIGNMENT {
+        publishDir = [
+            [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/signatures/hdp_decomposition_spa" },
+                pattern: "**{txt,pdf}",
+            ]
+        ]
+    }
+
+    withName: REFORMATSIGNATURES {
+        publishDir = [
+            [
+                mode: params.publish_dir_mode,
+                path: { "${params.outdir}/signatures/signatures_hdp/signatures_reformatted" },
+                pattern: "**{tsv}",
+            ]
+        ]
+    }
+
     withName: SIGPROMATRIXGENERATOR {
         ext.args            = "--plot \
                             --tsb_stat \
                             --seqInfo \
                             --cushion 100"
-        ext.genome_assembly = params.vep_genome == 'GRCh38'
-            ? 'GRCh38'
-            : params.vep_genome == 'GRCh37'
-                ? 'GRCh37'
-                : params.vep_genome == 'GRCm38'
-                    ? 'mm10'
-                    : params.vep_genome == 'GRCm39'
-                        ? 'mm39'
-                        : null
+
         publishDir          = [
             [
                 mode: params.publish_dir_mode,
@@ -643,14 +663,12 @@ process {
                         --missing_values mean_samples"
     }
 
-    withName: 'SITESFROMPOSITIONS|SUMANNOTATION|SIGPROFILERASSIGNMENT|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' {
+    withName: 'SITESFROMPOSITIONS|SUMANNOTATION|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' {
         ext.assembly = params.vep_genome == 'GRCh38'
             ? 'hg38'
-            : params.vep_genome == 'GRCm38'
-                ? 'mm10'
-                : params.vep_genome == 'GRCm39'
-                    ? 'mm39'
-                    : null
+            : params.vep_genome == 'GRCm39'
+                ? 'mm39'
+                : null
     }
 
     withLabel : deepcsa_core {

diff --git a/conf/tools/omega.config b/conf/tools/omega.config
@@ -168,10 +168,8 @@ process {
     withName: 'PREPROCESSING.*|ESTIMATOR.*' {
         ext.assembly = params.vep_genome == 'GRCh38'
             ? 'hg38'
-            : params.vep_genome == 'GRCm38'
-                ? 'mm10'
-                : params.vep_genome == 'GRCm39'
-                    ? 'mm39'
-                    : null
+            : params.vep_genome == 'GRCm39'
+                ? 'mm39'
+                : null
     }
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -274,7 +274,7 @@ These files identify sites overlapping common SNPs and noisy or variable genomic
 - Nanoseq SNP: Common SNP positions that should be excluded from analysis
 - Nanoseq Noise: Regions with high noise or variability
 
-Both files are available for GRCh37 and GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute.
+Both files are available for GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute.
 
 ## Additional customizable parameters
 

diff --git a/modules/local/compute_profile/main.nf b/modules/local/compute_profile/main.nf
@@ -16,6 +16,7 @@ process COMPUTE_PROFILE {
     tuple val(meta), path("*.proportion_mutations.WGS.tsv") , optional:true , emit: wgs_proportions
     tuple val(meta), path("*.matrix.WGS.tsv")               , optional:true , emit: wgs
     tuple val(meta), path("*.matrix.WGS.sigprofiler.tsv")   , optional:true , emit: wgs_sigprofiler
+    tuple val(meta), path("*.profile_stability.tsv")                        , emit: profile_stability
 
     tuple val(meta), path("*.pdf")                          , optional:true , emit: plots