diff --git a/Dockerfile b/Dockerfile index 6c2556e..fbc2b93 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM condaforge/miniforge3:latest LABEL io.github.snakemake.containerized="true" -LABEL io.github.snakemake.conda_env_hash="ed26b44bdff1bc9f7a99d3830ae812bf262f78427b95a45fa1ac1ce255c5f054" +LABEL io.github.snakemake.conda_env_hash="fb2a04b98efa75d8b830a16722a8717eb91a3c66659e3c78b90b0cee3a50db69" # Step 2: Retrieve conda environments @@ -117,12 +117,12 @@ COPY workflow/envs/quarto_render.yaml /conda-envs/96f3c1cec4b3ce5d72f708992272e9 # Conda environment: # source: workflow/envs/renv.yaml -# prefix: /conda-envs/8ad6cdcf265d30289788da99d5bf9fff +# prefix: /conda-envs/fe892aca096e6b2883923c8755f9ac77 # channels: # - conda-forge # - bioconda # dependencies: -# - r-base=4.3.2 +# - r-base=4.3.3 # - r-tidyverse==2.0.0 # - r-ggrepel==0.9.3 # - r-ggpubr==0.6.0 @@ -133,10 +133,9 @@ COPY workflow/envs/quarto_render.yaml /conda-envs/96f3c1cec4b3ce5d72f708992272e9 # - r-data.table==1.14.8 # - r-future.apply==1.11.0 # - r-scales==1.3.0 -# - r-showtext==0.9_6 # - r-logger==0.2.2 -RUN mkdir -p /conda-envs/8ad6cdcf265d30289788da99d5bf9fff -COPY workflow/envs/renv.yaml /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml +RUN mkdir -p /conda-envs/fe892aca096e6b2883923c8755f9ac77 +COPY workflow/envs/renv.yaml /conda-envs/fe892aca096e6b2883923c8755f9ac77/environment.yaml # Conda environment: # source: workflow/envs/snpeff.yaml @@ -149,6 +148,18 @@ COPY workflow/envs/renv.yaml /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/enviro RUN mkdir -p /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 COPY workflow/envs/snpeff.yaml /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml +# Conda environment: +# source: workflow/envs/tools.yaml +# prefix: /conda-envs/1f283441022c3c9d97669994a3c5e8bb +# channels: +# - conda-forge +# - bioconda +# dependencies: +# - bedtools==2.31.1 +# - bcftools==1.23 +RUN mkdir -p /conda-envs/1f283441022c3c9d97669994a3c5e8bb +COPY workflow/envs/tools.yaml /conda-envs/1f283441022c3c9d97669994a3c5e8bb/environment.yaml + # Conda environment: # source: workflow/envs/var_calling.yaml # prefix: /conda-envs/81e46c677a6cc0618c93963d57d17d3f @@ -173,8 +184,9 @@ RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --fil conda env create --prefix /conda-envs/04a3347f94ddf7e21c34bc49e5246076 --file /conda-envs/04a3347f94ddf7e21c34bc49e5246076/environment.yaml && \ conda env create --prefix /conda-envs/fb978640cd765c8a63bbcdc01f3a206b --file /conda-envs/fb978640cd765c8a63bbcdc01f3a206b/environment.yaml && \ conda env create --prefix /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 --file /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml && \ - conda env create --prefix /conda-envs/8ad6cdcf265d30289788da99d5bf9fff --file /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml && \ + conda env create --prefix /conda-envs/fe892aca096e6b2883923c8755f9ac77 --file /conda-envs/fe892aca096e6b2883923c8755f9ac77/environment.yaml && \ conda env create --prefix /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 --file /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml && \ + conda env create --prefix /conda-envs/1f283441022c3c9d97669994a3c5e8bb --file /conda-envs/1f283441022c3c9d97669994a3c5e8bb/environment.yaml && \ conda env create --prefix /conda-envs/81e46c677a6cc0618c93963d57d17d3f --file /conda-envs/81e46c677a6cc0618c93963d57d17d3f/environment.yaml && \ conda clean --all -y diff --git a/config/design_plots.R b/config/design_plots.R index 74e87d8..8952379 100644 --- a/config/design_plots.R +++ b/config/design_plots.R @@ -1,26 +1,14 @@ -# Jordi Sevilla - library(ggplot2) -library(showtext) - -# Ajustes #### -showtext_auto(enable = FALSE) -showtext_opts(dpi = 200) - -# Tema -font_add_google("Noto Sans", "Noto Sans") -showtext_auto() +# Theme theme_set(theme_minimal()) - theme_update( - text = element_text(size = 16, family = "Noto Sans"), - axis.title = element_text(size = 16), + text = element_text(size = 10, family = "Noto Sans"), + axis.title = element_text(size = 12), axis.line = element_line( linewidth = 0.5, colour = "grey40", - linetype = 1, - arrow = arrow(length = unit(3, "mm")) + linetype = 1 ), panel.grid = element_line(linewidth = 0.17, color = "lightgray") ) diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml new file mode 100644 index 0000000..a76cc14 --- /dev/null +++ b/workflow/envs/bedtools.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bedtools==2.31.1 diff --git a/workflow/envs/renv.yaml b/workflow/envs/renv.yaml index ceb6a99..9256d78 100644 --- a/workflow/envs/renv.yaml +++ b/workflow/envs/renv.yaml @@ -2,7 +2,7 @@ channels: - conda-forge - bioconda dependencies: - - r-base=4.3.2 + - r-base=4.3.3 - r-tidyverse==2.0.0 - r-ggrepel==0.9.3 - r-ggpubr==0.6.0 @@ -13,5 +13,4 @@ dependencies: - r-data.table==1.14.8 - r-future.apply==1.11.0 - r-scales==1.3.0 - - r-showtext==0.9_6 - r-logger==0.2.2 diff --git a/workflow/envs/tools.yaml b/workflow/envs/tools.yaml new file mode 100644 index 0000000..cc5ada1 --- /dev/null +++ b/workflow/envs/tools.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bedtools==2.31.1 + - bcftools==1.23 diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk index 0380940..ef1895c 100644 --- a/workflow/rules/evolution.smk +++ b/workflow/rules/evolution.smk @@ -21,6 +21,7 @@ rule n_s_sites: gb_qualifier_display = "gene", input: fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", + masked = OUTDIR / "sites_masked.bed", gb = OUTDIR/"reference.cds.gb", genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(), output: @@ -35,6 +36,7 @@ rule calculate_dnds: conda: "../envs/renv.yaml" input: n_s_sites = OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv", + masked = OUTDIR / "sites_masked.bed", variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", metadata = config["METADATA"] output: diff --git a/workflow/rules/sites.smk b/workflow/rules/sites.smk index 9ff7739..9bbbd41 100644 --- a/workflow/rules/sites.smk +++ b/workflow/rules/sites.smk @@ -1,3 +1,21 @@ +rule problematic_vcf_to_bed: + conda: + "../envs/tools.yaml" + params: + filters = ["mask"], + input: + vcf = lambda wildcards: select_problematic_vcf(), + output: + vcf = temp(OUTDIR / "sites_masked.vcf"), + bed = temp(OUTDIR / "sites_masked.bed"), + log: + LOGDIR / "problematic_vcf_to_bed" / "log.txt", + shell: + "FILTER_STR=$(echo \"{params.filters}\" | tr ' ' ',') && " + "bcftools view -f \"$FILTER_STR\" {input.vcf} >{output.vcf} 2>{log} && " + "bedtools merge -i {output.vcf} >{output.bed} 2>{log}" + + rule bcftools_mpileup_all_sites: threads: 1 conda: "../envs/var_calling.yaml" diff --git a/workflow/scripts/calculate_dnds.R b/workflow/scripts/calculate_dnds.R index 4c34688..d600936 100644 --- a/workflow/scripts/calculate_dnds.R +++ b/workflow/scripts/calculate_dnds.R @@ -8,6 +8,7 @@ sink(log, type = "output") library(dplyr) library(readr) library(tidyr) +library(purrr) library(logger) log_threshold(INFO) @@ -37,6 +38,21 @@ metadata <- read_delim(snakemake@input[["metadata"]]) %>% select(ID, interval) %>% rename(SAMPLE = ID) +log_info("Reading masked sites BED") +masked <- read_tsv( + snakemake@input$masked, + col_names = c("chrom", "start", "end"), + col_types = "cii", + comment = "#" +) %>% + mutate(POS = map2(start + 1, end, seq.int)) %>% + unnest(POS) %>% + pull(POS) + +log_info("Filtering variants on masked sites") +variants <- variants %>% + filter(!POS %in% masked, !is.na(SYNONYMOUS)) + log_debug("Adding metadata to variants table") variants <- left_join(variants, metadata) diff --git a/workflow/scripts/n_s_sites_from_fasta.py b/workflow/scripts/n_s_sites_from_fasta.py index cb7fa6f..62cf6ed 100644 --- a/workflow/scripts/n_s_sites_from_fasta.py +++ b/workflow/scripts/n_s_sites_from_fasta.py @@ -8,18 +8,50 @@ import pandas as pd from Bio import SeqIO from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature from Bio.Seq import Seq NTS = ("A", "C", "G", "T") +MASK_CHR = "N" def read_monofasta(path: str) -> SeqRecord: return SeqIO.read(path, format="fasta") +def read_masked_positions(bed_path: str) -> set: + masked = set() + with open(bed_path) as fh: + for line in fh: + if line.startswith("#") or not line.strip(): + continue + parts = line.strip().split("\t") + start, end = int(parts[1]), int(parts[2]) + masked.update(range(start, end)) + return masked + + +def mask_feature(feature: SeqFeature, record: SeqRecord, masked_positions: set) -> SeqRecord: + extracted = feature.extract(record) + seq_chars = list(str(extracted.seq).upper()) + # Handle reverse strand and compound locations + genomic_positions = [] + for part in feature.location.parts: + positions = list(range(int(part.start), int(part.end))) + if part.strand == -1: + positions.reverse() + genomic_positions.extend(positions) + # Mask sites on sequence + for i, pos in enumerate(genomic_positions): + if seq_chars[i] not in NTS or pos in masked_positions: + seq_chars[i] = MASK_CHR + extracted.seq = Seq("".join(seq_chars)) + return extracted + + def split_into_codons(seq: Seq) -> list: - """Split the complete CDS feature in to a list of codons""" + """Split the complete CDS feature in to a list of codons (if ACGT only)""" return [ seq[i:i + 3] for i in range(0, len(seq)-2, 3) if all(char in NTS for char in seq[i:i + 3]) ] @@ -79,6 +111,9 @@ def main(): with open(snakemake.input.genetic_code) as f: genetic_code = json.load(f) + logging.info("Reading masked sites BED") + masked = read_masked_positions(snakemake.input.masked) + logging.info("Reading GenBank file") gb = SeqIO.read(snakemake.input.gb, format="gb") @@ -95,7 +130,7 @@ def main(): logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}") else: logging.debug(f"Adding feature") - coding_records[identifier] = feature.extract(record) + coding_records[identifier] = mask_feature(feature, record, masked) logging.info(f"Splitting {len(coding_records)} records into codons") codons = get_feature_codons(coding_records)