diff --git a/modules/nf-core/masurca/environment.yml b/modules/nf-core/masurca/environment.yml
new file mode 100644
index 00000000000..e29097684bf
--- /dev/null
+++ b/modules/nf-core/masurca/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::masurca=4.1.4"
diff --git a/modules/nf-core/masurca/main.nf b/modules/nf-core/masurca/main.nf
new file mode 100644
index 00000000000..d4890e07fde
--- /dev/null
+++ b/modules/nf-core/masurca/main.nf
@@ -0,0 +1,142 @@
+process MASURCA {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "ecoflowucl/masurca:v4.1.4"
+
+    input:
+    tuple val(meta), path(illumina), path(jump), path(pacbio), path(nanopore)
+    val fragment_mean
+    val fragment_stdev
+    val jump_mean
+    val jump_stdev
+    val extend_jump_reads
+    val graph_kmer_size
+    val use_linking_mates
+    val lhe_coverage
+    val mega_reads_one_pass
+    val limit_jump_coverage
+    val ca_parameters
+    val close_gaps
+    val jf_size
+
+    output:
+    tuple val(meta), path("assemble.sh")          , emit: script
+    tuple val(meta), path("*scaffolds.fa.gz")     , emit: scaffolds
+    tuple val(meta), path("*_masurca_config.txt") , emit: config
+    tuple val(meta), path("*-masurca.log")        , emit: log
+    tuple val("${task.process}"), val('masurca'), eval("masurca --version | sed 's/version //g'"), topic: versions, emit: versions_masurca
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def config = "${prefix}_masurca_config.txt"
+
+    // The MaSuRCA config takes absolute paths, so every staged input is resolved to its real path.
+    // Illumina reads are mandatory; jump, PacBio and Nanopore reads are optional (pass [] to omit them).
+    def illumina_reads = [illumina].flatten().collect { it.toRealPath() }.join(' ')
+    def jump_reads     = jump ? [jump].flatten().collect { it.toRealPath() }.join(' ') : ""
+
+    // PacBio and Nanopore reads must each be one file; fail early with a clear message otherwise.
+    def pacbio_files   = pacbio ? [pacbio].flatten() : []
+    def nanopore_files = nanopore ? [nanopore].flatten() : []
+    if (pacbio_files.size() > 1 || nanopore_files.size() > 1) {
+        error "MASURCA: PacBio and Nanopore reads must each be supplied as a single file"
+    }
+    def pacbio_file   = pacbio_files ? pacbio_files[0].toRealPath() : ""
+    def nanopore_file = nanopore_files ? nanopore_files[0].toRealPath() : ""
+    """
+    # Write the MaSuRCA configuration: everything echoed inside the group goes to ${config}
+    {
+        echo "DATA"
+        echo "#Illumina paired end reads supplied as: two-letter prefix, fragment mean, fragment stdev, forward reads, reverse reads"
+        echo "#if single-end, do not specify reverse reads"
+        echo "#MUST HAVE Illumina paired end reads to use MaSuRCA"
+        echo "PE= pe ${fragment_mean} ${fragment_stdev} ${illumina_reads}"
+
+        # Jump/mate pair reads (optional)
+        if [ -n "${jump_reads}" ]; then
+            echo "#Illumina mate pair reads supplied as: two-letter prefix, fragment mean, fragment stdev, forward reads, reverse reads"
+            echo "JUMP= sh ${jump_mean} ${jump_stdev} ${jump_reads}"
+        fi
+
+        # PacBio and Nanopore reads handling
+        # If both exist, concatenate them and supply as NANOPORE (per MaSuRCA docs).
+        # NOTE(review): plain concatenation assumes both files share the same format and compression - confirm.
+        if [ -n "${pacbio_file}" ] && [ -n "${nanopore_file}" ]; then
+            echo "#if you have both PacBio and Nanopore, supply both as NANOPORE type"
+            cat "${pacbio_file}" "${nanopore_file}" > ${prefix}_long_reads.fastq.gz
+            echo "NANOPORE= \$PWD/${prefix}_long_reads.fastq.gz"
+        elif [ -n "${pacbio_file}" ]; then
+            echo "#PacBio/CCS reads must be in a single fasta or fastq file with absolute path"
+            echo "PACBIO=${pacbio_file}"
+        elif [ -n "${nanopore_file}" ]; then
+            echo "#Nanopore reads must be in a single fasta or fastq file with absolute path"
+            echo "NANOPORE=${nanopore_file}"
+        fi
+
+        echo "END"
+        echo ""
+        echo "PARAMETERS"
+        echo "#set this to 1 if your Illumina jumping library reads are shorter than 100bp"
+        echo "EXTEND_JUMP_READS=${extend_jump_reads}"
+        echo "#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content"
+        echo "GRAPH_KMER_SIZE = ${graph_kmer_size}"
+        echo "#set this to 1 for all Illumina-only assemblies"
+        echo "#set this to 0 if you have more than 15x coverage by long reads (Pacbio or Nanopore) or any other long reads/mate pairs (Illumina MP, Sanger, 454, etc)"
+        echo "USE_LINKING_MATES = ${use_linking_mates}"
+        echo "#use at most this much coverage by the longest Pacbio or Nanopore reads, discard the rest of the reads"
+        echo "#can increase this to 30 or 35 if your reads are short (N50<7000bp)"
+        echo "LHE_COVERAGE=${lhe_coverage}"
+        echo "#set to 0 (default) to do two passes of mega-reads for slower, but higher quality assembly, otherwise set to 1"
+        echo "MEGA_READS_ONE_PASS=${mega_reads_one_pass}"
+        echo "#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms"
+        echo "LIMIT_JUMP_COVERAGE = ${limit_jump_coverage}"
+        echo "#these are the additional parameters to Celera Assembler. do not worry about performance, number or processors or batch sizes -- these are computed automatically."
+        echo "#CABOG ASSEMBLY ONLY: set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms."
+        echo "CA_PARAMETERS = ${ca_parameters}"
+        echo "#CABOG ASSEMBLY ONLY: whether to attempt to close gaps in scaffolds with Illumina or long read data"
+        echo "CLOSE_GAPS=${close_gaps}"
+        echo "#number of cpus to use, set this to the number of CPUs/threads per node you will be using"
+        echo "NUM_THREADS = ${task.cpus}"
+        echo "#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*20"
+        echo "JF_SIZE = ${jf_size}"
+        echo "#ILLUMINA ONLY. Set this to 1 to use SOAPdenovo contigging/scaffolding module."
+        echo "#Assembly will be worse but will run faster. Useful for very large (>=8Gbp) genomes from Illumina-only data"
+        echo "SOAP_ASSEMBLY=0"
+        echo "#If you are doing Hybrid Illumina paired end + Nanopore/PacBio assembly ONLY (no Illumina mate pairs or OTHER frg files)."
+        echo "#Set this to 1 to use Flye assembler for final assembly of corrected mega-reads."
+        echo "#A lot faster than CABOG, AND QUALITY IS THE SAME OR BETTER."
+        echo "#Works well even when MEGA_READS_ONE_PASS is set to 1."
+        echo "#DO NOT use if you have less than 15x coverage by long reads."
+        echo "FLYE_ASSEMBLY=0"
+        echo "END"
+    } > ${config}
+
+    # Generate assembly script
+    masurca ${config}
+
+    ./assemble.sh > ${prefix}-masurca.log 2>&1
+
+    # Exactly one CA*/primary.genome.scf.fasta is expected; fail loudly instead of silently skipping the output.
+    scaffolds=( CA*/primary.genome.scf.fasta )
+    if [ "\${#scaffolds[@]}" -ne 1 ] || [ ! -f "\${scaffolds[0]}" ]; then
+        echo "ERROR: expected exactly one CA*/primary.genome.scf.fasta, found: \${scaffolds[*]}" >&2
+        exit 1
+    fi
+    gzip -cn "\${scaffolds[0]}" > ${prefix}.scaffolds.fa.gz
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir -p CA
+    touch assemble.sh
+    touch ${prefix}_masurca_config.txt
+    echo "" | gzip > ${prefix}.scaffolds.fa.gz
+    touch ${prefix}-masurca.log
+    """
+}
diff --git a/modules/nf-core/masurca/meta.yml b/modules/nf-core/masurca/meta.yml
new file mode 100644
index 00000000000..f96165adb2f
--- /dev/null
+++ b/modules/nf-core/masurca/meta.yml
@@ -0,0 +1,167 @@
+name: "masurca"
+description: The MaSuRCA (Maryland Super-Read Celera Assembler) genome assembly
+  and analysis toolkit
+keywords:
+  - denovo
+  - assembly
+  - debruijn
+  - genomics
+tools:
+  - "masurca":
+      description: "MaSuRCA (Maryland Super-Read Celera Assembler) genome assembly software."
+      homepage: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      documentation: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      tool_dev_url: "https://github.com/alekseyzimin/masurca"
+      doi: "10.1101/gr.213405.116"
+      licence:
+        - "GPL v3"
+      identifier: biotools:masurca
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - illumina:
+        type: file
+        description: |
+          Input Illumina FastQ files (R1 and R2 for paired-end, or one file for single-end).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - jump:
+        type: file
+        description: |
+          Jump/mate-pair FastQ files (optional; pass [] to omit).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - pacbio:
+        type: file
+        description: |
+          PacBio FastQ file (optional; a single file, pass [] to omit).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - nanopore:
+        type: file
+        description: |
+          Nanopore FastQ file (optional; a single file, pass [] to omit).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+  - fragment_mean:
+      type: integer
+      description: Mean fragment size for Illumina paired-end reads
+  - fragment_stdev:
+      type: integer
+      description: Standard deviation of fragment size for Illumina paired-end
+        reads
+  - jump_mean:
+      type: integer
+      description: Mean fragment size for jump/mate-pair reads
+  - jump_stdev:
+      type: integer
+      description: Standard deviation of fragment size for jump/mate-pair reads
+  - extend_jump_reads:
+      type: integer
+      description: "Whether to extend jump reads, as a 0/1 flag (default: 0 - false; use 1 for true)"
+  - graph_kmer_size:
+      type: string
+      description: "K-mer size for the de Bruijn graph (default: 'auto', it can be an
+        integer or 'auto')"
+  - use_linking_mates:
+      type: integer
+      description: "Whether to use linking mates, as a 0/1 flag (default: 0 - false; use 1 for true)"
+  - lhe_coverage:
+      type: integer
+      description: "LHE coverage (default: 25)"
+  - mega_reads_one_pass:
+      type: integer
+      description: "Whether to perform one pass of mega-reads, as a 0/1 flag (default: 0 - false; use
+        1 for true)"
+  - limit_jump_coverage:
+      type: integer
+      description: "Limit for jump read coverage (default: 300)"
+  - ca_parameters:
+      type: string
+      description: "Parameters for the Celera Assembler (default: 'cgwErrorRate=0.15')"
+  - close_gaps:
+      type: integer
+      description: "Whether to close gaps, as a 0/1 flag (default: 0 - false; use 1 for true)"
+  - jf_size:
+      type: integer
+      description: "Jellyfish hash size (default: 200000000)"
+output:
+  script:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "assemble.sh":
+          type: file
+          description: MaSuRCA assembly script
+          pattern: "assemble.sh"
+          ontologies: []
+  scaffolds:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*scaffolds.fa.gz":
+          type: file
+          description: Assembled scaffolds in FASTA format
+          pattern: "*.scaffolds.fa.gz"
+          ontologies:
+            - edam: http://edamontology.org/data_0925
+            - edam: http://edamontology.org/format_1929
+            - edam: http://edamontology.org/format_3989
+  config:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*_masurca_config.txt":
+          type: file
+          description: MaSuRCA configuration file
+          pattern: "*_masurca_config.txt"
+          ontologies: []
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*-masurca.log":
+          type: file
+          description: MaSuRCA assembly log file
+          pattern: "*-masurca.log"
+          ontologies: []
+  versions_masurca:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+authors:
+  - "@LiaOb21"
+maintainers:
+  - "@LiaOb21"
diff --git a/modules/nf-core/masurca/tests/main.nf.test b/modules/nf-core/masurca/tests/main.nf.test
new file mode 100644
index 00000000000..dffba62d12a
--- /dev/null
+++ b/modules/nf-core/masurca/tests/main.nf.test
@@ -0,0 +1,384 @@
+nextflow_process {
+
+    name "Test Process MASURCA"
+    script "../main.nf"
+    process "MASURCA"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "masurca"
+
+    test("homo_sapiens - illumina - single_end") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz", checkIfExists: true)],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("homo_sapiens - illumina - paired_end") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("homo_sapiens - illumina - paired_end - with_jump") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz",
checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [ // jump/mate-pair reads
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test2_germline_1.fq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test2_germline_2.fq.gz", checkIfExists: true)
+                    ],
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 3600 // jump_mean
+                input[4] = 200 // jump_stdev
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot( // snapshots output file names and versions only, not file contents
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genomeassembler - hybrid - illumina_pacbio") { // Illumina paired-end + PacBio reads; no jump or Nanopore reads
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_1.fastq.gz", checkIfExists: true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/HiFi-Col-0_test_data.fastq.gz", checkIfExists: true)],
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot( // snapshots output file names and versions only, not file contents
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genomeassembler - hybrid - illumina_nanopore") { // Illumina paired-end + Nanopore reads; no jump or PacBio reads
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_1.fastq.gz", checkIfExists: true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/ONT-Col-0_test_data.fastq.gz", checkIfExists: true)]
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot( // snapshots output file names and versions only, not file contents
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genomeassembler - hybrid - illumina_pacbio_nanopore") { // both long-read inputs supplied together
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_1.fastq.gz", checkIfExists: true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/HiFi-Col-0_test_data.fastq.gz", checkIfExists: true)],
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/ONT-Col-0_test_data.fastq.gz", checkIfExists: true)]
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert
path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2 - illumina - paired_end - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/masurca/tests/main.nf.test.snap b/modules/nf-core/masurca/tests/main.nf.test.snap
new file mode 100644
index 00000000000..be7eb9380be
--- /dev/null
+++ b/modules/nf-core/masurca/tests/main.nf.test.snap
@@ -0,0 +1,156 @@
+{
+    "homo_sapiens - illumina - single_end": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T17:02:36.723413139",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "genomeassembler - hybrid - illumina_pacbio_nanopore": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T17:12:19.186769917",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "sarscov2 - illumina - paired_end - stub": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T20:15:28.52290314",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "genomeassembler - hybrid - illumina_pacbio": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T17:06:10.293289505",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "homo_sapiens - illumina - paired_end": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T17:23:14.003329552",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "homo_sapiens - illumina - paired_end - with_jump": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T17:33:32.958497714",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    },
+    "genomeassembler - hybrid - illumina_nanopore": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+                        "4.1.4"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-02T17:08:18.247500418",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.4"
+        }
+    }
+}
\ No newline at end of file