DOCKET/test_enrich_resample.nf at docket-active-dev · PriceLab/DOCKET · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env nextflow

// Directory containing the helper scripts invoked by the processes below.
scripts = "$baseDir/scripts"

/* Parameter defaults (override on the command line, e.g. --infile1 <path>) */
params.infile1 = 'test/dataset1.csv'
params.infile2 = 'test/dataset2.txt'
params.config = 'test/file_import.config'
// Also used as the publishDir target of every process in this file.
params.docket = 'test/merge_and_enrichment.docket'
// Passed to compute_pca.py as --n_comp.
params.pca_n = 50
params.fill_na = true

/* local variable names */
infile1 = file(params.infile1)
infile2 = file(params.infile2)
config = file(params.config)
docket = file(params.docket)
pca_n = params.pca_n
// NOTE(review): fill_na is not referenced by any process in this file.
fill_na = params.fill_na

process preprocess_input {
    // Pre-process the two input files (using the import config) to obtain
    // numeric data for clustering and attribute data for enrichment analysis.
    // The four declared outputs are copied into the docket directory.
    // NOTE(review): infile1, infile2 and config are interpolated directly into
    // the command instead of being declared in an input: block - confirm intended.
    publishDir "${docket}", mode: 'copy'

    output:
    file 'rows_numeric_data.txt.gz' into rows_numdata
    file 'cols_numeric_data.txt.gz' into cols_numdata
    file 'cols_attribute_data.json.gz' into cols_attrdata
    file 'cols_attribute_counts.json.gz' into cols_attrcounts

    script:
	"""
	${scripts}/preprocess_input.py \
	  --file1 ${infile1} \
	  --file2 ${infile2} \
	  --config_file ${config}
	"""
}

process compute_numeric_pca {
    // Compute PCA on the row-wise numeric data with compute_pca.py,
    // requesting pca_n components, and copy the result into the docket directory.
    publishDir "${docket}", mode: 'copy'

    input:
    // The only source wired in here is the row-wise numeric data emitted
    // by preprocess_input (channel rows_numdata).
    file rows_matrix from rows_numdata

    output:
    file 'rows_numeric_pca.pca.gz' into rowspca

    script:
    """
    ${scripts}/compute_pca.py \
      --source ${rows_matrix} \
      --out rows_numeric_pca.pca.gz \
      --n_comp ${pca_n}
    """
}

process cluster_numeric_hier {
    // Compute row-wise clustering: run cluster_hier.py on the PCA output and
    // copy the cluster label and cluster member files into the docket directory.
    publishDir "${docket}", mode: 'copy'

    input:
    file pca_result from rowspca

    output:
    file 'cluster_labels.txt.gz' into rows_hier_clust
    file 'cluster_members.json.gz' into row_clust_members

    script:
    """
    ${scripts}/cluster_hier.py \
      --source ${pca_result} \
      --cl_labels_out cluster_labels.txt.gz \
      --cl_members_out cluster_members.json.gz
    """
}

process compute_enrich {
    // Compute enrichment: run compute_enrich.py on the attribute data, the
    // attribute counts and the cluster labels, then copy enrichment_results.txt
    // into the docket directory.
    publishDir "${docket}", mode: 'copy'

    input:
    file attr_data from cols_attrdata
    file attr_counts from cols_attrcounts
    file cluster_labels from rows_hier_clust

    output:
    file 'enrichment_results.txt'

    script:
    """
    ${scripts}/compute_enrich.py \
      --attr_data ${attr_data} \
      --attr_counts ${attr_counts} \
      --cluster_data ${cluster_labels} \
      --out enrichment_results.txt
    """
}

process copy_notebooks {
    // Copy results.py and the Jupyter notebook for visualizing results from the
    // pipeline base directory into the task directory, so that publishDir
    // copies both into the docket directory.
    publishDir "${docket}", mode: 'copy'

    output:
    file 'results.py'
    file 'visualize-enrichment-results.ipynb'

    script:
    """
    cp '${baseDir}/common/results.py' .
    cp '${baseDir}/notebooks/visualize-enrichment-results.ipynb' .
    """
}