-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSnakefile
More file actions
113 lines (91 loc) · 3.58 KB
/
Snakefile
File metadata and controls
113 lines (91 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#### PROJECT SNAKEFILE #########################################################
## This file defines a workflow for a QuantGov project as executed by the
## SnakeMake utility. The default project uses a single QuantGov Estimator to
## predict a value for the documents in a single QuantGov Corpus.
#### PYTHON IMPORTS ###########################################################
## This section imports libraries and defines functionality to be used
## throughout the workflow
import os.path
import datetime
import zipfile
from pathlib import Path
def outpath(path):
    """Ensure cross-platform functionality for files in subdirectories.

    Converts *path* (a string or ``pathlib.Path``) to a string that uses the
    separator of the platform the workflow is running on.

    Going through ``Path`` normalises every separator in the path, and leaves
    a bare file name untouched. Joining the result of ``os.path.split`` only
    rewrote the last separator and turned ``'results.csv'`` into
    ``'/results.csv'`` because the empty directory part was joined as well.

    Args:
        path: Relative or absolute location of a workflow file.

    Returns:
        The location as a ``str`` with native path separators.
    """
    return str(Path(path))
def get_component_name(path):
    """Guess a component's name from the folder containing it.

    The stem of the final path component is used, so anything after its last
    dot is dropped. When the stem begins with ``corpus-`` or ``estimator-``
    that prefix is removed; otherwise the stem is returned as it is.

    Args:
        path: Folder of the component, as a string or ``pathlib.Path``.

    Returns:
        The guessed component name as a ``str``.
    """
    stem = Path(path).stem
    if not stem.startswith(('corpus-', 'estimator-')):
        return stem
    # Both prefixes end at the first dash, so keep everything after it
    return stem.partition('-')[2]
#### SNAKEFILE CONFIGURATION ##################################################
## This section defines Snakefile configuration and variables used throughout
## the workflow
# Load the project settings; the values are read below through `config`
configfile: 'config.yaml'

# Sub-workflow whose working directory is the folder named by
# config['estimator']; files wrapped in estimator(...) are requested from it
subworkflow estimator:
    workdir: config['estimator']
    configfile: 'config.yaml'

# Sub-workflow whose working directory is the folder named by
# config['target_corpus']; files wrapped in target_corpus(...) are requested
# from it
subworkflow target_corpus:
    workdir: config['target_corpus']
    configfile: 'config.yaml'
# Short names of the two components, guessed from their configured folders
estimator_name = get_component_name(config['estimator'])
target_name = get_component_name(config['target_corpus'])

# Folder that receives the packaged dataset files: data/<config['name']>
dataset_dir = Path('data') / config['name']

# strftime pattern (YYYYMMDD-HHMM) used to stamp the archive file name;
# see the Python datetime documentation
timestamp_format = '%Y%m%d-%H%M'
#### ESTIMATION ###############################################################
## This section defines the rule which performs the estimation on the target
## Corpus
# Runs `quantgov estimator estimate` on the three inputs, in the order they
# are listed, and writes the results to data/results.csv.
# The --probability flag is added only when config['probability'] is truthy;
# otherwise an empty string is substituted for {params}.
# NOTE(review): the third input is the corpus sub-workflow's 'timestamp' file
# rather than a corpus folder -- confirm this is what the command expects.
rule estimate:
    input:
        estimator('data/vectorizer.pickle'),
        estimator('data/model.pickle'),
        target_corpus('timestamp')
    params:
        '--probability' if config['probability'] else ''
    output:
        outpath('data/results.csv')
    shell:
        'quantgov estimator estimate {input} --outfile {output} {params}'
#### DATASET CREATION #########################################################
## This section packages files together in a dataset folder and archive. This
## is where to define any "finishing" tasks, such as renaming files or
## variables
# Destination files inside the dataset folder, named after the components
# (and the configured prediction type) that produced them
metadata = dataset_dir / '{}_metadata.csv'.format(target_name)
results = dataset_dir / '{}_{}_{}.csv'.format(
    target_name, estimator_name, config['prediction_type']
)


# Rebuilds the dataset folder from scratch, then copies each input to the
# output listed at the same position.
rule create_dataset:
    input:
        target_corpus('data/metadata.csv'),
        rules.estimate.output
    output:
        outpath(metadata),
        outpath(results)
    run:
        import shutil

        # Start from an empty folder; it is fine if none exists yet
        try:
            shutil.rmtree(str(dataset_dir))
        except FileNotFoundError:
            pass
        dataset_dir.mkdir(parents=True)

        for source, destination in zip(input, output):
            print('Copying {} to {}'.format(source, destination))
            shutil.copy(source, destination)
# Define dataset timestamp for archive: the modification time of the dataset
# folder when it exists, otherwise the current time
try:
    dataset_mtime = dataset_dir.stat().st_mtime
except FileNotFoundError:
    dataset_time = datetime.datetime.now().strftime(timestamp_format)
else:
    dataset_time = (
        datetime.datetime.fromtimestamp(dataset_mtime)
        .strftime(timestamp_format)
    )


# Packs the dataset files into data/<config['name']>_<timestamp>.zip, storing
# each file under its path relative to the dataset folder.
rule create_archive:
    input:
        rules.create_dataset.output
    output:
        outpath('data/{}_{}.zip'.format(config['name'], dataset_time))
    run:
        import zipfile

        with zipfile.ZipFile(output[0], 'w') as archive:
            for member in input:
                arcname = str(Path(member).relative_to(dataset_dir))
                archive.write(member, arcname)