Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,14 @@ We are working to provide the biggest possible detail on the [usage](docs/usage.

> **Sex and smoking bias in the selection of somatic mutations in human bladder**
>
> Ferriol Calvet, Raquel Blanco Martinez-Illescas, Ferran Muiños, Maria Tretiakova, Elena S. Latorre-Esteves, Jeanne Fredrickson, Maria Andrianova, Stefano Pellegrini, Axel Rosendahl Huber, Joan Enric Ramis-Zaldivar, Shuyi Charlotte An, Elana Thieme, Brendan F. Kohrn, Miguel L. Grau, Abel Gonzalez-Perez, Nuria Lopez-Bigas & Rosa Ana Risques
> Ferriol Calvet*, Raquel Blanco Martinez-Illescas*, Ferran Muiños, Maria Tretiakova, Elena S. Latorre-Esteves, Jeanne Fredrickson, Maria Andrianova, Stefano Pellegrini, Axel Rosendahl Huber, Joan Enric Ramis-Zaldivar, Shuyi Charlotte An, Elana Thieme, Brendan F. Kohrn, Miguel L. Grau, Abel Gonzalez-Perez, Nuria Lopez-Bigas & Rosa Ana Risques
>
>_Nature_ (2025) doi:[10.1038/s41586-025-09521-x](https://doi.org/10.1038/s41586-025-09521-x)
>
> *these authors contributed equally and the order was decided randomly

> **DeepClone, an end-to-end protocol to study somatic mutagenesis and selection at high resolution**
>
> Ferriol Calvet, Morena Pinheiro-Santin, Erika Lopez,Raquel Blanco Martinez-Illescas, Núria Samper, Miguel L. Grau, Ferran Muiños,Rocío Chamorro González, Maria Andrianova, Federica Brando,Stefano Pellegrini, Marta Huertas, Elisabet Figuerola-Bou,Coohleen Coombes,Brendan F. Kohrn, Jeanne Fredrickson, Rosa Ana Risques, Nuria Lopez-Bigas, Abel Gonzalez-Perez
> Ferriol Calvet, Morena Pinheiro-Santin, Erika Lopez, Raquel Blanco Martinez-Illescas, Núria Samper, Miguel L. Grau, Ferran Muiños, Rocío Chamorro González, Maria Andrianova, Federica Brando, Stefano Pellegrini, Marta Huertas, Elisabet Figuerola-Bou, Coohleen Coombes, Brendan F. Kohrn, Jeanne Fredrickson, Rosa Ana Risques, Nuria Lopez-Bigas, Abel Gonzalez-Perez
>
> _protocols.io_ (2026) doi:[10.17504/protocols.io.dm6gp1jodgzp/v2](https://dx.doi.org/10.17504/protocols.io.dm6gp1jodgzp/v2)
18 changes: 16 additions & 2 deletions bin/concat_sigprot_matrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
import json


# identify samples without mutations
def remove_zero_mutation_samples(matrix):
zero_col = matrix.columns[matrix.sum(axis=0) == 0]
if len(zero_col) > 0:
print(f"Excluding samples without mutations: {list(zero_col)}")
matrix = matrix.drop(columns=zero_col)
return matrix

def concat_sigprot_matrices(filename_of_matrices, samples_json_file, type_of_profile):
"""
Concatenate signature profile matrices for samples and groups.
Expand Down Expand Up @@ -41,8 +49,11 @@ def concat_sigprot_matrices(filename_of_matrices, samples_json_file, type_of_pro

# Save the groups matrix if it exists
if groups_matrix is not None and groups_matrix.shape[0] > 0:
groups_matrix = remove_zero_mutation_samples(groups_matrix)

groups_matrix.reset_index().to_csv(f"groups_matrix.{type_of_profile}.sp.tsv", sep='\t', header=True, index=False)
Comment thread
FerriolCalvet marked this conversation as resolved.
groups_matrix.round().astype(int).reset_index().to_csv(f"groups_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)
rounded_groups_matrix = remove_zero_mutation_samples(groups_matrix.round().astype(int))
rounded_groups_matrix.reset_index().to_csv(f"groups_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)

# HDP
groups_matrix = groups_matrix.transpose()
Expand All @@ -55,8 +66,11 @@ def concat_sigprot_matrices(filename_of_matrices, samples_json_file, type_of_pro

# Save the samples matrix if it exists
if samples_only_matrix is not None and samples_only_matrix.shape[0] > 0:
samples_only_matrix = remove_zero_mutation_samples(samples_only_matrix)

samples_only_matrix.reset_index().to_csv(f"samples_matrix.{type_of_profile}.sp.tsv", sep='\t', header=True, index=False)
samples_only_matrix.round().astype(int).reset_index().to_csv(f"samples_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)
rounded_samples_only_matrix = remove_zero_mutation_samples(samples_only_matrix.round().astype(int))
rounded_samples_only_matrix.reset_index().to_csv(f"samples_matrix.{type_of_profile}.sp.round.tsv", sep='\t', header=True, index=False)

# HDP
samples_only_matrix = samples_only_matrix.transpose()
Expand Down
80 changes: 77 additions & 3 deletions bin/mut_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,72 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method
sep = "\t")



def profile_stability(counts, denominator):
"""
Compute stability score of a probability profile by
adding +1 to each channel individually and measuring
L1 (Total Variation) deviation.

Parameters
----------
counts : array-like, shape (k,)
Raw integer counts per channel.
denominator : array-like, shape (k,)
Denominator for each channel (e.g., trinucleotide counts).

Returns
-------
results : dict
{
'mean_deviation': float,
'max_deviation': float,
'std_deviation': float,
'all_deviations': np.ndarray shape (k,)
}
"""

# Convert to numpy arrays
counts = np.asarray(counts, dtype=float)
denominator = np.asarray(denominator, dtype=float)

if counts.ndim != 1:
raise ValueError("counts must be a 1D vector")
if denominator.ndim != 1:
raise ValueError("denominator must be a 1D vector")
if len(counts) != len(denominator):
raise ValueError("counts and denominator must have same length")
if np.any(denominator == 0):
raise ValueError("denominator contains zeros")

k = len(counts)

# ---- Baseline profile ----
ref_profile = counts / denominator
ref_profile = ref_profile / ref_profile.sum()
p = ref_profile

deviations = np.zeros(k)

# ---- Perturb each channel ----
for j in range(k):
perturbed = counts.copy()
perturbed[j] += 1

p_prime = perturbed / denominator
p_prime = p_prime / p_prime.sum() # IMPORTANT: renormalize

deviations[j] = 0.5 * np.sum(np.abs(p - p_prime))

return {
"mean_deviation": deviations.mean(),
"min_deviation": deviations.min(),
"max_deviation": deviations.max(),
"std_deviation": deviations.std(),
"all_deviations": deviations.tolist()
}


def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_counts_file, plot,
wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False):
"""
Expand All @@ -116,8 +182,9 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
total_mutations = np.sum(mutation_matrix[sample_name])

# proportion of SBS mutations per trinucleotide in panel
mutation_matrix[sample_name] = mutation_matrix[sample_name] / total_mutations
mutation_matrix.to_csv(f"{sample_name}.proportion_mutations.tsv",
mutation_matrix_proportions = mutation_matrix.copy()
mutation_matrix_proportions[sample_name] = mutation_matrix_proportions[sample_name] / total_mutations
mutation_matrix_proportions.to_csv(f"{sample_name}.proportion_mutations.tsv",
header = True,
index = True,
sep = "\t")
Expand All @@ -134,6 +201,13 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
contextmut_depth = pd.DataFrame({sample_name : trinuc_per_contextmuts, "CONTEXT_MUT": contexts_formatted})
contextmut_depth = contextmut_depth.set_index("CONTEXT_MUT")

# compute profile stability and store the outputs in a file
stability_output_dict = profile_stability(mutation_matrix[sample_name], contextmut_depth[sample_name])
stability_outputs = pd.DataFrame(stability_output_dict.items()).set_index(0).T
stability_outputs[["SAMPLE_ID", "mode"]] = sample_name.split(".")
stability_outputs[["SAMPLE_ID", "mode"] + list(stability_outputs.columns[:-2])].to_csv(f"{sample_name}.profile_stability.tsv", sep = "\t", index = False)


# divide
mut_probability = mutation_matrix.divide( contextmut_depth )

Expand Down Expand Up @@ -171,7 +245,7 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
output_f = f'{sample_name}.profile.pdf')

# plot the profile as a percentage of SBS mutations seen in our sequenced panel
plot_profile(dict(zip(mutation_matrix.index, mutation_matrix[sample_name])),
plot_profile(dict(zip(mutation_matrix_proportions.index, mutation_matrix_proportions[sample_name])),
title=f'{sample_name} ({round(total_mutations)} muts)',
output_f = f'{sample_name}.profile.percentage.pdf')

Expand Down
6 changes: 2 additions & 4 deletions bin/panel_postprocessing_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,13 @@
import pandas as pd
import numpy as np
from itertools import product
from bgreference import hg38, hg19, mm10, mm39
from bgreference import hg38, mm39
from utils_context import transform_context
from utils_impacts import *
from read_utils import custom_na_values

assembly_name2function = {
"hg38": hg38,
"hg19": hg19,
"mm10": mm10,
"mm39": mm39
}

Expand Down Expand Up @@ -199,7 +197,7 @@ def vep2summarizedannotation_panel(VEP_output_file, all_possible_sites_annotated

@click.command()
@click.option('--vep_output_file', type=click.Path(exists=True), required=True, help='Path to the VEP output file.')
@click.option('--assembly', type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), default='hg38', help='Genome assembly.')
@click.option('--assembly', type=click.Choice(['hg38', 'mm39']), default='hg38', help='Genome assembly.')
@click.option('--output_file', type=click.Path(), required=True, help='Path to the output annotated file.')
@click.option('--only_canonical', is_flag=True, default=False, help='Use only canonical transcripts.')
def main(vep_output_file, assembly, output_file, only_canonical):
Expand Down
6 changes: 2 additions & 4 deletions bin/postprocessing_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,14 @@
import click
import pandas as pd

from bgreference import hg38, hg19, mm10, mm39
from bgreference import hg38, mm39

from utils import vartype
from utils_context import transform_context
from utils_impacts import *
from read_utils import custom_na_values

assembly_name2function = {"hg38": hg38,
"hg19": hg19,
"mm10": mm10,
"mm39": mm39}


Expand Down Expand Up @@ -245,7 +243,7 @@ def vep2summarizedannotation(VEP_output_file, all_possible_sites_annotated_file,
@click.command()
@click.argument('vep_output_file', type=click.Path(exists=True))
@click.argument('all_possible_sites_annotated_file', type=click.Path())
@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Reference genome assembly name')
@click.option('--assembly-name', default='hg38', show_default=True, type=click.Choice(['hg38', 'mm39']), help='Reference genome assembly name')
@click.option('--all-underscore', is_flag=True, default=False, show_default=True, help='Whether to use _ to separate all parts of MUT_ID (default: False)')
@click.option('--hotspots-annotation-file', default=None, type=click.Path(exists=False), help='Path to hotspots annotation file')
@click.option('--gnomad-af-threshold', default=0.001, show_default=True, type=float, help='gnomAD allele frequency threshold')
Expand Down
5 changes: 1 addition & 4 deletions bin/sites_table_from_positions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
import click
import pandas as pd

from bgreference import hg38, hg19, mm10, mm39
from bgreference import hg38, mm39

assembly_name2function = {"hg38": hg38,
"hg19": hg19,
"mm10": mm10,
"mm39": mm39
}

Expand Down Expand Up @@ -49,7 +47,6 @@ def generate_all_sites_4VEP(input_positions, genome, output_file_with_sites):

@click.command()
@click.option('--input-positions', required=True, type=click.Path(exists=True), help='Input positions file (TSV)')
#@click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'hg19', 'mm10', 'mm39']), help='Genome assembly (hg38, hg19, mm10, mm39)')
@click.option('--genome-assembly', required=True, type=click.Choice(['hg38', 'mm39']), help='Genome assembly (hg38, mm39)')
@click.option('--output-file-with-sites', required=True, type=click.Path(), help='Output file for sites (TSV)')
def main(input_positions, genome_assembly, output_file_with_sites):
Expand Down
2 changes: 1 addition & 1 deletion bin/utils_context.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from bgreference import hg38, hg19, mm10, mm39
from bgreference import hg38, mm39

from itertools import product

Expand Down
52 changes: 35 additions & 17 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ process {
}


withName: QUERYMUTATIONS {
withName: 'QUERYMUTATIONS|QUERYMUTATIONSEXONS' {
ext.prefix = { ".subset_mutations" }
ext.args = ''
ext.args2 = '-s 1 -b 2 -e 2'
Expand Down Expand Up @@ -245,7 +245,7 @@ process {
],
[
mode: params.publish_dir_mode,
path: { "${params.outdir}/flagged_positions" },
path: { "${params.outdir}/processing_files/flagged_positions" },
pattern: "*{bed}",
]
]
Expand Down Expand Up @@ -557,6 +557,14 @@ process {
--seed 123"
}

withName: 'SIGPROFILERASSIGNMENT|HDPREASSIGNMENT|SIGPROMATRIXGENERATOR' {
ext.genome_assembly = params.vep_genome == 'GRCh38'
? 'GRCh38'
: params.vep_genome == 'GRCm39'
? 'mm39'
: null
}

withName: SIGPROFILERASSIGNMENT {
publishDir = [
[
Expand All @@ -567,20 +575,32 @@ process {
]
}

withName: HDPREASSIGNMENT {
publishDir = [
[
mode: params.publish_dir_mode,
path: { "${params.outdir}/signatures/hdp_decomposition_spa" },
pattern: "**{txt,pdf}",
]
]
}
Comment thread
FerriolCalvet marked this conversation as resolved.

withName: REFORMATSIGNATURES {
publishDir = [
[
mode: params.publish_dir_mode,
path: { "${params.outdir}/signatures/signatures_hdp/signatures_reformatted" },
pattern: "**{tsv}",
]
]
}

withName: SIGPROMATRIXGENERATOR {
ext.args = "--plot \
--tsb_stat \
--seqInfo \
--cushion 100"
ext.genome_assembly = params.vep_genome == 'GRCh38'
? 'GRCh38'
: params.vep_genome == 'GRCh37'
? 'GRCh37'
: params.vep_genome == 'GRCm38'
? 'mm10'
: params.vep_genome == 'GRCm39'
? 'mm39'
: null

publishDir = [
[
mode: params.publish_dir_mode,
Expand Down Expand Up @@ -643,14 +663,12 @@ process {
--missing_values mean_samples"
}

withName: 'SITESFROMPOSITIONS|SUMANNOTATION|SIGPROFILERASSIGNMENT|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' {
withName: 'SITESFROMPOSITIONS|SUMANNOTATION|ONCODRIVECLUSTL|POSTPROCESSVEPPANEL' {
ext.assembly = params.vep_genome == 'GRCh38'
? 'hg38'
: params.vep_genome == 'GRCm38'
? 'mm10'
: params.vep_genome == 'GRCm39'
? 'mm39'
: null
: params.vep_genome == 'GRCm39'
? 'mm39'
: null
}

withLabel : deepcsa_core {
Expand Down
8 changes: 3 additions & 5 deletions conf/tools/omega.config
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,8 @@ process {
withName: 'PREPROCESSING.*|ESTIMATOR.*' {
ext.assembly = params.vep_genome == 'GRCh38'
? 'hg38'
: params.vep_genome == 'GRCm38'
? 'mm10'
: params.vep_genome == 'GRCm39'
? 'mm39'
: null
: params.vep_genome == 'GRCm39'
? 'mm39'
: null
}
}
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ These files identify sites overlapping common SNPs and noisy or variable genomic
- Nanoseq SNP: Common SNP positions that should be excluded from analysis
- Nanoseq Noise: Regions with high noise or variability

Both files are available for GRCh37 and GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute.
Both files are available for GRCh38 at the [shared folder](https://drive.google.com/drive/folders/1wqkgpRTuf4EUhqCGSLA4fIg9qEEw3ZcL) from Iñigo Martincorena's group, at the Wellcome Sanger Institute.

## Additional customizable parameters

Expand Down
1 change: 1 addition & 0 deletions modules/local/compute_profile/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ process COMPUTE_PROFILE {
tuple val(meta), path("*.proportion_mutations.WGS.tsv") , optional:true , emit: wgs_proportions
tuple val(meta), path("*.matrix.WGS.tsv") , optional:true , emit: wgs
tuple val(meta), path("*.matrix.WGS.sigprofiler.tsv") , optional:true , emit: wgs_sigprofiler
tuple val(meta), path("*.profile_stability.tsv") , emit: profile_stability

tuple val(meta), path("*.pdf") , optional:true , emit: plots

Expand Down
Loading