nf-core · LiaOb21 · Mar 20, 2026 · Mar 23, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/modules/nf-core/masurca/environment.yml b/modules/nf-core/masurca/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::masurca=4.1.4"
diff --git a/modules/nf-core/masurca/main.nf b/modules/nf-core/masurca/main.nf
@@ -0,0 +1,141 @@
+process MASURCA {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "ecoflowucl/masurca:v4.1.4"
+
+    input:
+    tuple val(meta), path(illumina), path(jump), path(pacbio), path(nanopore)
+    val fragment_mean
+    val fragment_stdev
+    val jump_mean
+    val jump_stdev
+    val extend_jump_reads
+    val graph_kmer_size
+    val use_linking_mates
+    val lhe_coverage
+    val mega_reads_one_pass
+    val limit_jump_coverage
+    val ca_parameters
+    val close_gaps
+    val jf_size
+
+
+    output:
+    tuple val(meta), path("assemble.sh")                               , emit: script
+    tuple val(meta), path("*scaffolds.fa.gz")                          , emit: scaffolds
+    tuple val(meta), path("*_masurca_config.txt")                      , emit: config
+    tuple val(meta), path("*-masurca.log")                             , emit: log
+    tuple val("${task.process}"), val('masurca'), eval("masurca --version | sed 's/version //g'"), topic: versions, emit: versions_masurca
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    //get input reads with absolute paths - illumina are mandatory, jump/pacbio/nanopore are optional
+    def illumina_reads = [illumina].flatten().collect { it.toRealPath() }.join(' ')
+    def jump_reads = jump ? [jump].flatten().collect { it.toRealPath() }.join(' ') : ""
+    def pacbio_file = pacbio ? pacbio.toRealPath() : ""
+    def nanopore_file = nanopore ? nanopore.toRealPath() : ""
+
+    // Configuration parameters with defaults from task.ext
+    // def extend_jump_reads = task.ext.extend_jump_reads != null ? task.ext.extend_jump_reads : 0
+    // def graph_kmer_size = task.ext.graph_kmer_size ?: 'auto'
+    // def use_linking_mates = task.ext.use_linking_mates != null ? task.ext.use_linking_mates : 0
+    // def lhe_coverage = task.ext.lhe_coverage ?: 25
+    // def mega_reads_one_pass = task.ext.mega_reads_one_pass != null ? task.ext.mega_reads_one_pass : 0
+    // def limit_jump_coverage = task.ext.limit_jump_coverage ?: 300
+    // def ca_parameters = task.ext.ca_parameters ?: 'cgwErrorRate=0.15'
+    // def close_gaps = task.ext.close_gaps != null ? task.ext.close_gaps : 0
+    // def jf_size = task.ext.jf_size ?: 200000000
+    """
+    echo "DATA" > ${prefix}_masurca_config.txt
+    echo "#Illumina paired end reads supplied as <two-character prefix> <fragment mean> <fragment stdev> <forward_reads> <reverse_reads>" >> ${prefix}_masurca_config.txt
+    echo "#if single-end, do not specify <reverse_reads>" >> ${prefix}_masurca_config.txt
+    echo "#MUST HAVE Illumina paired end reads to use MaSuRCA" >> ${prefix}_masurca_config.txt
+    echo "PE= pe ${fragment_mean} ${fragment_stdev} ${illumina_reads}" >> ${prefix}_masurca_config.txt
+
+    # Jump/mate pair reads (optional)
+    if [ -n "${jump_reads}" ]; then
+        echo "#Illumina mate pair reads supplied as <two-character prefix> <fragment mean> <fragment stdev> <forward_reads> <reverse_reads>" >> ${prefix}_masurca_config.txt
+        echo "JUMP= sh ${jump_mean} ${jump_stdev} ${jump_reads}" >> ${prefix}_masurca_config.txt
+    fi
+
+    # PacBio and Nanopore reads handling
+    # If both exist, concatenate them and supply as NANOPORE (per MaSuRCA docs)
+    if [ -n "${pacbio_file}" ] && [ -n "${nanopore_file}" ]; then
+        echo "#if you have both PacBio and Nanopore, supply both as NANOPORE type" >> ${prefix}_masurca_config.txt
+        cat ${pacbio_file} ${nanopore_file} > ${prefix}_long_reads.fastq.gz
+        echo "NANOPORE= ${prefix}_long_reads.fastq.gz" >> ${prefix}_masurca_config.txt
+    elif [ -n "${pacbio_file}" ]; then
+        echo "#PacBio/CCS reads must be in a single fasta or fastq file with absolute path" >> ${prefix}_masurca_config.txt
+        echo "PACBIO=${pacbio_file}" >> ${prefix}_masurca_config.txt
+    elif [ -n "${nanopore_file}" ]; then
+        echo "#Nanopore reads must be in a single fasta or fastq file with absolute path" >> ${prefix}_masurca_config.txt
+        echo "NANOPORE=${nanopore_file}" >> ${prefix}_masurca_config.txt
+    fi
+
+    echo "END" >> ${prefix}_masurca_config.txt
+
+
+    echo "" >> ${prefix}_masurca_config.txt
+    echo "PARAMETERS" >> ${prefix}_masurca_config.txt
+    echo "#set this to 1 if your Illumina jumping library reads are shorter than 100bp" >> ${prefix}_masurca_config.txt
+    echo "EXTEND_JUMP_READS=${extend_jump_reads}" >> ${prefix}_masurca_config.txt
+    echo "#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content" >> ${prefix}_masurca_config.txt
+    echo "GRAPH_KMER_SIZE = ${graph_kmer_size}" >> ${prefix}_masurca_config.txt
+    echo "#set this to 1 for all Illumina-only assemblies" >> ${prefix}_masurca_config.txt
+    echo "#set this to 0 if you have more than 15x coverage by long reads (Pacbio or Nanopore) or any other long reads/mate pairs (Illumina MP, Sanger, 454, etc)" >> ${prefix}_masurca_config.txt
+    echo "USE_LINKING_MATES = ${use_linking_mates}" >> ${prefix}_masurca_config.txt
+    echo "#use at most this much coverage by the longest Pacbio or Nanopore reads, discard the rest of the reads" >> ${prefix}_masurca_config.txt
+    echo "#can increase this to 30 or 35 if your reads are short (N50<7000bp)" >> ${prefix}_masurca_config.txt
+    echo "LHE_COVERAGE=${lhe_coverage}" >> ${prefix}_masurca_config.txt
+    echo "#set to 0 (default) to do two passes of mega-reads for slower, but higher quality assembly, otherwise set to 1" >> ${prefix}_masurca_config.txt
+    echo "MEGA_READS_ONE_PASS=${mega_reads_one_pass}" >> ${prefix}_masurca_config.txt
+    echo "#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms" >> ${prefix}_masurca_config.txt
+    echo "LIMIT_JUMP_COVERAGE = ${limit_jump_coverage}" >> ${prefix}_masurca_config.txt
+    echo "#these are the additional parameters to Celera Assembler.  do not worry about performance, number or processors or batch sizes -- these are computed automatically." >> ${prefix}_masurca_config.txt
+    echo "#CABOG ASSEMBLY ONLY: set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms." >> ${prefix}_masurca_config.txt
+    echo "CA_PARAMETERS = ${ca_parameters}" >> ${prefix}_masurca_config.txt
+    echo "#CABOG ASSEMBLY ONLY: whether to attempt to close gaps in scaffolds with Illumina  or long read data" >> ${prefix}_masurca_config.txt
+    echo "CLOSE_GAPS=${close_gaps}" >> ${prefix}_masurca_config.txt
+    echo "#number of cpus to use, set this to the number of CPUs/threads per node you will be using" >> ${prefix}_masurca_config.txt
+    echo "NUM_THREADS = ${task.cpus}" >> ${prefix}_masurca_config.txt
+    echo "#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*20" >> ${prefix}_masurca_config.txt
+    echo "JF_SIZE = ${jf_size}" >> ${prefix}_masurca_config.txt
+    echo "#ILLUMINA ONLY. Set this to 1 to use SOAPdenovo contigging/scaffolding module." >> ${prefix}_masurca_config.txt
+    echo "#Assembly will be worse but will run faster. Useful for very large (>=8Gbp) genomes from Illumina-only data" >> ${prefix}_masurca_config.txt
+    echo "SOAP_ASSEMBLY=0" >> ${prefix}_masurca_config.txt
+    echo "#If you are doing Hybrid Illumina paired end + Nanopore/PacBio assembly ONLY (no Illumina mate pairs or OTHER frg files)." >> ${prefix}_masurca_config.txt
+    echo "#Set this to 1 to use Flye assembler for final assembly of corrected mega-reads." >> ${prefix}_masurca_config.txt
+    echo "#A lot faster than CABOG, AND QUALITY IS THE SAME OR BETTER." >> ${prefix}_masurca_config.txt
+    echo "#Works well even when MEGA_READS_ONE_PASS is set to 1." >> ${prefix}_masurca_config.txt
+    echo "#DO NOT use if you have less than 15x coverage by long reads." >> ${prefix}_masurca_config.txt
+    echo "FLYE_ASSEMBLY=0" >> ${prefix}_masurca_config.txt
+    echo "END" >> ${prefix}_masurca_config.txt
+
+    # Generate assembly script
+    masurca ${prefix}_masurca_config.txt
+
+    ./assemble.sh > ${prefix}-masurca.log 2>&1
+
+    if [ -f CA*/primary.genome.scf.fasta ]; then
+        gzip -cn CA*/primary.genome.scf.fasta > ${prefix}.scaffolds.fa.gz
+    fi
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir -p CA
+    touch assemble.sh
+    touch ${prefix}_masurca_config.txt
+    echo "" | gzip > ${prefix}.scaffolds.fa.gz
+    touch ${prefix}-masurca.log
+    """
+}
diff --git a/modules/nf-core/masurca/meta.yml b/modules/nf-core/masurca/meta.yml
@@ -0,0 +1,167 @@
+name: "masurca"
+description: The MaSuRCA (Maryland Super Read Cabog Assembler) genome assembly
+  and analysis toolkit
+keywords:
+  - denovo
+  - assembly
+  - debruijn
+  - genomics
+tools:
+  - "masurca":
+      description: "MaSuRCA (Maryland Super-Read Celera Assembler) genome assembly software."
+      homepage: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      documentation: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      tool_dev_url: "https://github.com/alekseyzimin/masurca"
+      doi: "10.1101/gr.213405.116"
+      licence:
+        - "GPL v3"
+      identifier: biotools:masurca
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - illumina:
+        type: file
+        description: |
+          Input paired-end FastQ files (R1 and R2).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - jump:
+        type: file
+        description: |
+          Jump/mate-pair FastQ files.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - pacbio:
+        type: file
+        description: |
+          PacBio FastQ files.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - nanopore:
+        type: file
+        description: |
+          Nanopore FastQ files.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+  - fragment_mean:
+      type: integer
+      description: Mean fragment size for Illumina paired-end reads
+  - fragment_stdev:
+      type: integer
+      description: Standard deviation of fragment size for Illumina paired-end
+        reads
+  - jump_mean:
+      type: integer
+      description: Mean fragment size for jump/mate-pair reads
+  - jump_stdev:
+      type: integer
+      description: Standard deviation of fragment size for jump/mate-pair reads
+  - extend_jump_reads:
+      type: boolean
+      description: "Whether to extend jump reads (default: 0 - false; use 1 for true)"
+  - graph_kmer_size:
+      type: string
+      description: "K-mer size for the de Bruijn graph (default: 'auto', it can be an
+        integer or 'auto')"
+  - use_linking_mates:
+      type: boolean
+      description: "Whether to use linking mates (default: 0 - false; use 1 for true)"
+  - lhe_coverage:
+      type: integer
+      description: "LHE coverage (default: 25)"
+  - mega_reads_one_pass:
+      type: boolean
+      description: "Whether to perform one pass of mega-reads (default: 0 - false; use
+        1 for true)"
+  - limit_jump_coverage:
+      type: integer
+      description: "Limit for jump read coverage (default: 300)"
+  - ca_parameters:
+      type: string
+      description: "Parameters for the Celera Assembler (default: 'cgwErrorRate=0.15')"
+  - close_gaps:
+      type: boolean
+      description: "Whether to close gaps (default: 0 - false; use 1 for true)"
+  - jf_size:
+      type: integer
+      description: "Jellyfish hash size (default: 200000000)"
+output:
+  script:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "assemble.sh":
+          type: file
+          description: MaSuRCA assembly script
+          pattern: "assemble.sh"
+          ontologies: []
+  scaffolds:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*scaffolds.fa.gz":
+          type: file
+          description: Assembled scaffolds in FASTA format
+          pattern: "*-scaffolds.fa.gz"
+          ontologies:
+            - edam: http://edamontology.org/data_0925
+            - edam: http://edamontology.org/format_1929
+            - edam: http://edamontology.org/format_3989
+  config:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*_masurca_config.txt":
+          type: file
+          description: MaSuRCA configuration file
+          pattern: "*_masurca_config.txt"
+          ontologies: []
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*-masurca.log":
+          type: file
+          description: MaSuRCA assembly log file
+          pattern: "*-masurca.log"
+          ontologies: []
+  versions_masurca:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+authors:
+  - "@LiaOb21"
+maintainers:
+  - "@LiaOb21"