Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM condaforge/miniforge3:latest
LABEL io.github.snakemake.containerized="true"
LABEL io.github.snakemake.conda_env_hash="ed26b44bdff1bc9f7a99d3830ae812bf262f78427b95a45fa1ac1ce255c5f054"
LABEL io.github.snakemake.conda_env_hash="dac62e7daa260f58e83880b4ad266bb819917087f9c7baee7cbb1337b1603244"

# Step 2: Retrieve conda environments

Expand All @@ -15,6 +15,17 @@ LABEL io.github.snakemake.conda_env_hash="ed26b44bdff1bc9f7a99d3830ae812bf262f78
RUN mkdir -p /conda-envs/9c24a867826615972cc288081976e7fc
COPY workflow/envs/afwdist.yaml /conda-envs/9c24a867826615972cc288081976e7fc/environment.yaml

# Conda environment:
# source: workflow/envs/bedtools.yaml
# prefix: /conda-envs/cef13cdc1c388b3087761b0222ec0613
# channels:
# - conda-forge
# - bioconda
# dependencies:
# - bedtools==2.31.1
RUN mkdir -p /conda-envs/cef13cdc1c388b3087761b0222ec0613
COPY workflow/envs/bedtools.yaml /conda-envs/cef13cdc1c388b3087761b0222ec0613/environment.yaml

# Conda environment:
# source: workflow/envs/biopython.yaml
# prefix: /conda-envs/162796cecea22d99c8702138f0c48e2f
Expand Down Expand Up @@ -165,6 +176,7 @@ COPY workflow/envs/var_calling.yaml /conda-envs/81e46c677a6cc0618c93963d57d17d3f
# Step 3: Generate conda environments

RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --file /conda-envs/9c24a867826615972cc288081976e7fc/environment.yaml && \
conda env create --prefix /conda-envs/cef13cdc1c388b3087761b0222ec0613 --file /conda-envs/cef13cdc1c388b3087761b0222ec0613/environment.yaml && \
conda env create --prefix /conda-envs/162796cecea22d99c8702138f0c48e2f --file /conda-envs/162796cecea22d99c8702138f0c48e2f/environment.yaml && \
conda env create --prefix /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a --file /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a/environment.yaml && \
conda env create --prefix /conda-envs/bb4c5f3a509433cc08861582fab4a705 --file /conda-envs/bb4c5f3a509433cc08861582fab4a705/environment.yaml && \
Expand Down
5 changes: 5 additions & 0 deletions workflow/envs/bedtools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- bedtools==2.31.1
2 changes: 2 additions & 0 deletions workflow/rules/evolution.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ rule n_s_sites:
gb_qualifier_display = "gene",
input:
fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
masked = OUTDIR / "sites_masked.bed",
gb = OUTDIR/"reference.cds.gb",
genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(),
output:
Expand All @@ -35,6 +36,7 @@ rule calculate_dnds:
conda: "../envs/renv.yaml"
input:
n_s_sites = OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv",
masked = OUTDIR / "sites_masked.bed",
variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
metadata = config["METADATA"]
output:
Expand Down
13 changes: 13 additions & 0 deletions workflow/rules/sites.smk
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
rule problematic_vcf_to_bed:
conda:
"../envs/bedtools.yaml"
input:
vcf = lambda wildcards: select_problematic_vcf(),
output:
bed = temp(OUTDIR / "sites_masked.bed"),
log:
LOGDIR / "build_problematic_bed" / "log.txt",
shell:
"bedtools merge -i {input.vcf} >{output.bed} 2>{log}"


rule bcftools_mpileup_all_sites:
threads: 1
conda: "../envs/var_calling.yaml"
Expand Down
16 changes: 16 additions & 0 deletions workflow/scripts/calculate_dnds.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ sink(log, type = "output")
library(dplyr)
library(readr)
library(tidyr)
library(purrr)
library(logger)

log_threshold(INFO)
Expand Down Expand Up @@ -37,6 +38,21 @@ metadata <- read_delim(snakemake@input[["metadata"]]) %>%
select(ID, interval) %>%
rename(SAMPLE = ID)

log_info("Reading masked sites BED")
masked <- read_tsv(
snakemake@input$masked,
col_names = c("chrom", "start", "end"),
col_types = "cii",
comment = "#"
) %>%
mutate(POS = map2(start + 1, end, seq.int)) %>%
unnest(POS) %>%
pull(POS)

log_info("Filtering variants on masked sites")
variants <- variants %>%
filter(!POS %in% masked, !is.na(SYNONYMOUS))

log_debug("Adding metadata to variants table")
variants <- left_join(variants, metadata)

Expand Down
39 changes: 37 additions & 2 deletions workflow/scripts/n_s_sites_from_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,50 @@
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature
from Bio.Seq import Seq


NTS = ("A", "C", "G", "T")
MASK_CHR = "N"


def read_monofasta(path: str) -> SeqRecord:
return SeqIO.read(path, format="fasta")


def read_masked_positions(bed_path: str) -> set:
masked = set()
with open(bed_path) as fh:
for line in fh:
if line.startswith("#") or not line.strip():
continue
parts = line.strip().split("\t")
start, end = int(parts[1]), int(parts[2])
masked.update(range(start, end))
return masked


def mask_feature(feature: SeqFeature, record: SeqRecord, masked_positions: set) -> SeqRecord:
extracted = feature.extract(record)
seq_chars = list(str(extracted.seq).upper())
# Handle reverse strand and compound locations
genomic_positions = []
for part in feature.location.parts:
positions = list(range(int(part.start), int(part.end)))
if part.strand == -1:
positions.reverse()
genomic_positions.extend(positions)
# Mask sites on sequence
for i, pos in enumerate(genomic_positions):
if seq_chars[i] not in NTS or pos in masked_positions:
seq_chars[i] = MASK_CHR
extracted.seq = Seq("".join(seq_chars))
return extracted


def split_into_codons(seq: Seq) -> list:
"""Split the complete CDS feature in to a list of codons"""
"""Split the complete CDS feature in to a list of codons (if ACGT only)"""
return [
seq[i:i + 3] for i in range(0, len(seq)-2, 3) if all(char in NTS for char in seq[i:i + 3])
]
Expand Down Expand Up @@ -79,6 +111,9 @@ def main():
with open(snakemake.input.genetic_code) as f:
genetic_code = json.load(f)

logging.info("Reading masked sites BED")
masked = read_masked_positions(snakemake.input.masked)

logging.info("Reading GenBank file")
gb = SeqIO.read(snakemake.input.gb, format="gb")

Expand All @@ -95,7 +130,7 @@ def main():
logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}")
else:
logging.debug(f"Adding feature")
coding_records[identifier] = feature.extract(record)
coding_records[identifier] = mask_feature(feature, record, masked)

logging.info(f"Splitting {len(coding_records)} records into codons")
codons = get_feature_codons(coding_records)
Expand Down
Loading