-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdocket_study
executable file
·468 lines (443 loc) · 13.6 KB
/
docket_study
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
#!/usr/bin/env nextflow
/*
 * docket_study pipeline: top-level locations and parameters.
 * The shebang goes through /usr/bin/env because /bin/env does not exist on
 * every platform (e.g. macOS), which would make direct execution fail.
 */
/* directories holding the helper scripts invoked by the processes below */
dbin = "$baseDir/scripts"
lphbin = "$baseDir/data-fingerprints"
/* parameter defaults */
/* input table and its format; both are handed to ingest.pl */
params.infile = "$baseDir/test/test_data.txt"
params.format = 'table'
/* output directory; every process publishes into a subdirectory of it */
params.docket = "$baseDir/test/test_data.docket"
/* L value passed to the row/column fingerprint tools (compute_fp.pl, pca.py, serializeLPH.pl, annoy scripts) */
params.L = 500
/* L value passed to the same tools for the column-histogram fingerprints */
params.histL = 50
/* threshold argument passed to filterTable.pl when trimming fingerprints */
params.minTriples = 1
/* flag passed to ingest.pl */
params.skipDuplicates = 1
/* values passed as --clusters to plotpca.py for the row and column PCA plots */
params.rowclusters = 3
params.colclusters = 3
/* NOTE(review): not referenced by any process in this script */
params.comment = '#'
/* interpret parameters */
infile = file(params.infile)
/* NOTE(review): the processes read params.format directly; this variable is not used by them */
format = params.format
docket = file(params.docket)
L = params.L
histL = params.histL
/* SECTION: preparation */
process ingest_file {
    /* read the file, store contents as row-wise and column-wise json */
    /* temporary: keeping only the first instance (row) for each id */
    publishDir "$docket/data", mode: 'copyNoFollow'

    input:
    file infile

    output:
    file 'original_data.gz' into originaldata
    /* file 'original_file' */
    file 'cols_data.json.gz' into colsdata
    file 'rows_data.json.gz' into rowsdata
    file 'cleaned_data.txt.gz' into cleandata

    script:
    /*
     * The link points at the staged input ($infile) rather than the raw
     * params.infile string: a relative params.infile with a directory part
     * would not resolve from inside the task work directory and would leave
     * a dangling link, and a path containing spaces would make ln fail.
     * ingest.pl receives the staged input, the declared format and the
     * skipDuplicates flag; it is expected to write the declared outputs.
     */
    """
    ln -s $infile original_file
    ${dbin}/ingest.pl $infile $params.format $params.skipDuplicates
    """
}
/* END SECTION: preparation */
/* SECTION: data set overview */
process plot_dataset_overviews {
    /* Draw two whole-data-set overview images and publish them as visualizations. */
    publishDir "$docket/visualizations", mode: 'copy'

    input:
    file cleaned from cleandata

    output:
    file 'data_overview.png'
    file 'data_overview_nullsorted.png'

    script:
    /*
     * NOTE(review): the staged input is not passed to the plotting script;
     * plotMatrixOverview.pl is given the docket directory instead, so it
     * presumably reads the already-published data from there. The input
     * then only serves to order this process after ingest_file - confirm.
     */
    """
    $dbin/plotMatrixOverview.pl $docket data_overview
    $dbin/plotMatrixOverview.pl $docket data_overview_nullsorted nulls nulls
    """
}
/* END SECTION: data set overview */
/* SECTION: column histogram pipeline */
process column_histograms {
    /* Build per-column histograms of observed values, plus a column types file. */
    publishDir "$docket/data", mode: 'copy'

    input:
    file colsJson from colsdata

    output:
    file 'cols_hist.json.gz' into colshist
    file 'cols_types.json.gz' into colstypes

    script:
    /* compute_hist.pl is given the two plain .json names; gzip then yields the declared .gz outputs */
    """
    $dbin/compute_hist.pl $colsJson cols_hist.json cols_types.json
    gzip cols_hist.json cols_types.json
    """
}
process compute_col_hist_fingerprints {
    /* compute and serialize fingerprints of column-wise histograms of observed values */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file ch from colshist

    output:
    file 'cols_hist_fp.trim.gz' into colshistfp
    file 'cols_hist_fp.fp' into colshistfpser
    file 'cols_hist_fp.id' into colshistfpserids

    script:
    /*
     * Three steps: raw fingerprints -> filtered by params.minTriples ->
     * serialized under the 'cols_hist_fp' prefix (the .fp/.id outputs are
     * expected to come from serializeLPH.pl).
     * pipefail makes the task fail when compute_fp.pl or filterTable.pl
     * fails; otherwise gzip's zero exit status would hide the error and an
     * empty or truncated archive would flow downstream.
     */
    """
    set -o pipefail
    $dbin/compute_fp.pl $ch $histL | gzip -c > cols_hist_fp.raw.gz
    $dbin/filterTable.pl cols_hist_fp.raw.gz 1 $params.minTriples | gzip -c > cols_hist_fp.trim.gz
    $lphbin/serializeLPH.pl cols_hist_fp $histL 1 1 cols_hist_fp.trim.gz
    """
}
process PCA_col_hist_fingerprints {
    /* compute PCA on fingerprints of column histograms */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file chfp from colshistfp

    output:
    file 'cols_hist_fp.pca.gz' into colshistfppca

    script:
    /* pipefail: a failure of pca.py must fail the task instead of being masked by gzip */
    """
    set -o pipefail
    $dbin/pca.py $chfp --L $histL | gzip -c > cols_hist_fp.pca.gz
    """
}
process plot_PCA_col_hist_fingerprints {
    /* Plot PC1 against PC2 for the column-histogram fingerprints. */
    publishDir "$docket/visualizations", mode: 'copy'

    input:
    file pcaTable from colshistfppca

    output:
    file 'cols_hist_fp.pc1_pc2.pdf'
    file 'cols_hist_fp.pc1_pc2.png'

    script:
    /* second argument is the output prefix shared by the declared .pdf and .png files */
    """
    $dbin/plotpca.py $pcaTable cols_hist_fp.pc1_pc2
    """
}
process compare_col_hist_fingerprints {
    /* compare fingerprints of column-wise histograms of observed values */
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file chf from colshistfpser

    output:
    file 'cols_hist_fp.aaa.gz' into chfaaa
    file 'cols_hist_fp.aaa.hist' into chfaaahist

    script:
    /*
     * searchLPHs.pl is handed the .hist file name as an argument and its
     * standard output is gzipped into the .aaa.gz result.
     * pipefail: a failing search must fail the task rather than leave an
     * empty archive behind a successful gzip.
     */
    """
    set -o pipefail
    $lphbin/searchLPHs.pl $chf 0 1000000 cols_hist_fp.aaa.hist | gzip -c > cols_hist_fp.aaa.gz
    """
}
process index_col_hist_fingerprints {
    /* Build an annoy index over the column-histogram fingerprints. */
    /* failures are ignored, so a failed indexing task does not stop the pipeline */
    errorStrategy 'ignore'
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file trimmedFp from colshistfp

    output:
    file 'cols_hist_fp.tree' into colshistfpindex
    file 'cols_hist_fp.names' into colshistfpnames

    script:
    /* --out is the prefix of the declared .tree and .names outputs */
    """
    $dbin/annoyIndexGz.py --file $trimmedFp --L $histL --norm 1 --out cols_hist_fp
    """
}
process KNN_col_hist_fingerprints {
    /* find k nearest neighbors of column-wise histograms, using annoy */
    /* failures are ignored, so a failed query does not stop the pipeline */
    errorStrategy 'ignore'
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file chfi from colshistfpindex
    file chfn from colshistfpnames

    output:
    file 'cols_hist_fp.knn.gz'

    script:
    /*
     * pipefail: when annoyQueryAll.py fails the task is reported as failed
     * (and then ignored) instead of publishing an empty result archive.
     */
    """
    set -o pipefail
    $dbin/annoyQueryAll.py --index $chfi --names $chfn --L $histL --k 100 | gzip -c > cols_hist_fp.knn.gz
    """
}
/* END SECTION: column histogram pipeline */
/* SECTION: row histogram pipeline */
process row_histograms {
    /* Build per-row histograms of observed values, plus a row types file. */
    publishDir "$docket/data", mode: 'copy'

    input:
    file rowsJson from rowsdata

    output:
    file 'rows_hist.json.gz' into rowshist
    file 'rows_types.json.gz' into rowstypes

    script:
    /* compute_hist.pl is given the two plain .json names; gzip then yields the declared .gz outputs */
    """
    $dbin/compute_hist.pl $rowsJson rows_hist.json rows_types.json
    gzip rows_hist.json rows_types.json
    """
}
/* END SECTION: row histogram pipeline */
/* SECTION: row analysis pipeline */
process compute_row_fingerprints {
    /* row-wise fingerprints */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file rd from rowsdata

    output:
    file 'rows_allfp.raw.gz' into rowsallfp

    script:
    /* pipefail: a failure of compute_fp.pl must fail the task instead of being masked by gzip */
    """
    set -o pipefail
    $dbin/compute_fp.pl $rd $L | gzip -c > rows_allfp.raw.gz
    """
}
process trim_row_fingerprints {
    /* trim row-wise fingerprints by triples */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file rfp from rowsallfp

    output:
    file 'rows_fp.raw.gz' into rowstrimfp

    script:
    /*
     * filterTable.pl receives params.minTriples as its threshold argument.
     * pipefail: a failure of the filter must fail the task instead of being
     * masked by gzip.
     */
    """
    set -o pipefail
    $dbin/filterTable.pl $rfp 1 $params.minTriples | gzip -c > rows_fp.raw.gz
    """
}
process PCA_row_fingerprints {
    /* compute PCA on fingerprints of rows */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file rfp from rowstrimfp

    output:
    file 'rows_fp.pca.gz' into rowsfppca

    script:
    /* pipefail: a failure of pca.py must fail the task instead of being masked by gzip */
    """
    set -o pipefail
    $dbin/pca.py $rfp --L $L | gzip -c > rows_fp.pca.gz
    """
}
process center_row_fingerprints {
    /* center row-wise fingerprints */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file rfp from rowstrimfp

    output:
    file 'rows_fp.cent.gz' into rowscentfp

    script:
    /*
     * Decompress, run center_fp.pl as a stdin/stdout filter, recompress.
     * pipefail: an error in gunzip or center_fp.pl must fail the task
     * instead of being masked by the final gzip.
     */
    """
    set -o pipefail
    gunzip -c $rfp | $dbin/center_fp.pl | gzip -c > rows_fp.cent.gz
    """
}
process serialize_cent_row_fingerprints {
    /* Serialize the centered row fingerprints with serializeLPH.pl. */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file centeredFp from rowscentfp

    output:
    file 'rows_fp.fp' into rowsfp
    file 'rows_fp.id' into rowsfpids

    script:
    /* 'rows_fp' is the prefix of the declared .fp and .id outputs */
    """
    $lphbin/serializeLPH.pl rows_fp $L 1 0 $centeredFp
    """
}
process compare_row_fingerprints {
    /* compare fingerprints of rows */
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file rfp from rowsfp

    output:
    file 'rows_fp.aaa.gz' into rowsaaa
    file 'rows_fp.aaa.hist' into rowsaaahist

    script:
    /*
     * searchLPHs.pl is handed the .hist file name as an argument and its
     * standard output is gzipped into the .aaa.gz result.
     * pipefail: a failing search must fail the task rather than leave an
     * empty archive behind a successful gzip.
     */
    """
    set -o pipefail
    $lphbin/searchLPHs.pl $rfp 0 1000000 rows_fp.aaa.hist | gzip -c > rows_fp.aaa.gz
    """
}
process index_row_fingerprints {
    /* Build an annoy index over the trimmed row fingerprints. */
    /* failures are ignored, so a failed indexing task does not stop the pipeline */
    errorStrategy 'ignore'
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file trimmedFp from rowstrimfp

    output:
    file 'rows_fp.tree' into rowsfpindex
    file 'rows_fp.names' into rowsfpnames

    script:
    /* --out is the prefix of the declared .tree and .names outputs */
    """
    $dbin/annoyIndexGz.py --file $trimmedFp --L $L --norm 1 --out rows_fp
    """
}
process KNN_row_fingerprints {
    /* find k nearest neighbors of rows, using annoy */
    /* failures are ignored, so a failed query does not stop the pipeline */
    errorStrategy 'ignore'
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file rfpi from rowsfpindex
    file rfpn from rowsfpnames

    output:
    file 'rows_fp.knn.gz'

    script:
    /*
     * pipefail: when annoyQueryAll.py fails the task is reported as failed
     * (and then ignored) instead of publishing an empty result archive.
     */
    """
    set -o pipefail
    $dbin/annoyQueryAll.py --index $rfpi --names $rfpn --L $L --k 100 | gzip -c > rows_fp.knn.gz
    """
}
process row_cluster_hier {
    /* Compute row-wise clustering */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file rpca from rowsfppca

    output:
    file 'rows_hier_linkage.txt.gz' into rows_hier_linkage
    file 'rows_hier_clusters.txt.gz' into rows_hier_clust

    script:
    /*
     * The linkage file is named through --linkage_out, while the script's
     * standard output is gzipped into the clusters file.
     * pipefail: a failure of hierarchical_clustering.py must fail the task
     * instead of being masked by gzip.
     */
    """
    set -o pipefail
    $dbin/hierarchical_clustering.py --source $rpca --linkage_out rows_hier_linkage.txt.gz | gzip -c > rows_hier_clusters.txt.gz
    """
}
process plot_PCA_row_fingerprints {
    /* Plot PC1 against PC2 for the row fingerprints, using the hierarchical clustering result. */
    publishDir "$docket/visualizations", mode: 'copy'

    input:
    file pcaTable from rowsfppca
    file clusterTable from rows_hier_clust

    output:
    file 'rows_fp.pc1_pc2.pdf'
    file 'rows_fp.pc1_pc2.png'

    script:
    /* params.rowclusters is forwarded as the --clusters value */
    """
    $dbin/plotpca.py $pcaTable rows_fp.pc1_pc2 --clustering $clusterTable --clusters $params.rowclusters
    """
}
/* END SECTION: row analysis pipeline */
/* SECTION: column analysis pipeline */
process compute_col_fingerprints {
    /* col-wise fingerprints */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file cd from colsdata

    output:
    file 'cols_allfp.raw.gz' into colsallfp

    script:
    /* pipefail: a failure of compute_fp.pl must fail the task instead of being masked by gzip */
    """
    set -o pipefail
    $dbin/compute_fp.pl $cd $L | gzip -c > cols_allfp.raw.gz
    """
}
process trim_col_fingerprints {
    /* trim col-wise fingerprints by triples */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file cfp from colsallfp

    output:
    file 'cols_fp.raw.gz' into colstrimfp

    script:
    /*
     * filterTable.pl receives params.minTriples as its threshold argument,
     * followed by the extra arguments '0 id'.
     * pipefail: a failure of the filter must fail the task instead of being
     * masked by gzip.
     */
    """
    set -o pipefail
    $dbin/filterTable.pl $cfp 1 $params.minTriples 0 id | gzip -c > cols_fp.raw.gz
    """
}
process PCA_col_fingerprints {
    /* compute PCA on fingerprints of columns */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file cfp from colstrimfp

    output:
    file 'cols_fp.pca.gz' into colsfppca

    script:
    /* pipefail: a failure of pca.py must fail the task instead of being masked by gzip */
    """
    set -o pipefail
    $dbin/pca.py $cfp --L $L | gzip -c > cols_fp.pca.gz
    """
}
process center_col_fingerprints {
    /* center col-wise fingerprints */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file cfp from colstrimfp

    output:
    file 'cols_fp.cent.gz' into colscentfp

    script:
    /*
     * Decompress, run center_fp.pl as a stdin/stdout filter, recompress.
     * pipefail: an error in gunzip or center_fp.pl must fail the task
     * instead of being masked by the final gzip.
     */
    """
    set -o pipefail
    gunzip -c $cfp | $dbin/center_fp.pl | gzip -c > cols_fp.cent.gz
    """
}
process serialize_cent_col_fingerprints {
    /* Serialize the centered column fingerprints with serializeLPH.pl. */
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file centeredFp from colscentfp

    output:
    file 'cols_fp.fp' into colsfp
    file 'cols_fp.id' into colsfpids

    script:
    /* 'cols_fp' is the prefix of the declared .fp and .id outputs */
    """
    $lphbin/serializeLPH.pl cols_fp $L 1 1 $centeredFp
    """
}
process compare_col_fingerprints {
    /* compare fingerprints of columns */
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file cfp from colsfp

    output:
    file 'cols_fp.aaa.gz' into colsaaa
    file 'cols_fp.aaa.hist' into colsaaahist

    script:
    /*
     * searchLPHs.pl is handed the .hist file name as an argument and its
     * standard output is gzipped into the .aaa.gz result.
     * pipefail: a failing search must fail the task rather than leave an
     * empty archive behind a successful gzip.
     */
    """
    set -o pipefail
    $lphbin/searchLPHs.pl $cfp 0 1000000 cols_fp.aaa.hist | gzip -c > cols_fp.aaa.gz
    """
}
process index_col_fingerprints {
    /* Build an annoy index over the trimmed column fingerprints. */
    /* failures are ignored, so a failed indexing task does not stop the pipeline */
    errorStrategy 'ignore'
    publishDir "$docket/fingerprints", mode: 'copy'

    input:
    file trimmedFp from colstrimfp

    output:
    file 'cols_fp.tree' into colsfpindex
    file 'cols_fp.names' into colsfpnames

    script:
    /* --out is the prefix of the declared .tree and .names outputs */
    """
    $dbin/annoyIndexGz.py --file $trimmedFp --L $L --norm 1 --out cols_fp
    """
}
process KNN_col_fingerprints {
    /* find k nearest neighbors of columns, using annoy */
    /* failures are ignored, so a failed query does not stop the pipeline */
    errorStrategy 'ignore'
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file cfpi from colsfpindex
    file cfpn from colsfpnames

    output:
    file 'cols_fp.knn.gz'

    script:
    /*
     * pipefail: when annoyQueryAll.py fails the task is reported as failed
     * (and then ignored) instead of publishing an empty result archive.
     */
    """
    set -o pipefail
    $dbin/annoyQueryAll.py --index $cfpi --names $cfpn --L $L --k 100 | gzip -c > cols_fp.knn.gz
    """
}
process col_cluster_hier {
    /* Compute col-wise clustering */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file cpca from colsfppca

    output:
    file 'cols_hier_linkage.txt.gz' into cols_hier_linkage
    file 'cols_hier_clusters.txt.gz' into cols_hier_clust

    script:
    /*
     * The linkage file is named through --linkage_out, while the script's
     * standard output is gzipped into the clusters file.
     * pipefail: a failure of hierarchical_clustering.py must fail the task
     * instead of being masked by gzip.
     */
    """
    set -o pipefail
    $dbin/hierarchical_clustering.py --source $cpca --linkage_out cols_hier_linkage.txt.gz | gzip -c > cols_hier_clusters.txt.gz
    """
}
process plot_PCA_col_fingerprints {
    /* Plot PC1 against PC2 for the column fingerprints, using the hierarchical clustering result. */
    publishDir "$docket/visualizations", mode: 'copy'

    input:
    file pcaTable from colsfppca
    file clusterTable from cols_hier_clust

    output:
    file 'cols_fp.pc1_pc2.pdf'
    file 'cols_fp.pc1_pc2.png'

    script:
    /* params.colclusters is forwarded as the --clusters value */
    """
    $dbin/plotpca.py $pcaTable cols_fp.pc1_pc2 --clustering $clusterTable --clusters $params.colclusters
    """
}
process compute_numeric_associations {
    /* compute pairwise Spearman correlations between numerical columns */
    /* two publish targets: *.json.gz goes to knowledgegraphs, *assoc.gz goes to comparisons */
    publishDir "$docket/knowledgegraphs", pattern: "*.json.gz", mode: 'copy'
    publishDir "$docket/comparisons", pattern: "*assoc.gz", mode: 'copy'

    input:
    file data from cleandata
    file ctypes from colstypes

    output:
    file 'num_assoc.json.gz'
    file 'num_assoc.gz'

    script:
    /*
     * NOTE(review): only num_assoc.gz is written by the command line below;
     * num_assoc.json.gz is presumably created by numerical_associations.py
     * itself - confirm.
     * pipefail: a failure of the Python script must fail the task instead
     * of being masked by gzip.
     */
    """
    set -o pipefail
    $dbin/numerical_associations.py --infile $data --types_file $ctypes | gzip -c > num_assoc.gz
    """
}
process compute_categorical_associations {
    /* compute pairwise Cramer V and Theil's U between categorical columns */
    publishDir "$docket/comparisons", mode: 'copy'

    input:
    file data from cleandata
    file ctypes from colstypes

    output:
    file 'cat_assoc.gz'

    script:
    /* pipefail: a failure of the Python script must fail the task instead of being masked by gzip */
    """
    set -o pipefail
    $dbin/categorical_associations.py --infile $data --types_file $ctypes | gzip -c > cat_assoc.gz
    """
}
/* END SECTION: column analysis pipeline */
/* ENRICHMENT ANALYSIS */
process row_generate_cluster_membership {
    /* Generate the row cluster membership file from the row PCA table and the hierarchical linkage table. */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file pcaTable from rowsfppca
    file linkTable from rows_hier_linkage

    output:
    file 'row_cluster_members.json.gz' into rows_hclust_members

    script:
    /* the output file is named explicitly through --cl_members_out */
    """
    $dbin/generate_cluster_membership.py \
        --source $pcaTable \
        --link_table_in $linkTable \
        --cl_members_out row_cluster_members.json.gz
    """
}
process pairwise_occurrence_counts {
    /* Count occurrences for all attributes and for the parent/children trios at branch points of the cluster hierarchy. */
    publishDir "$docket/analyses", mode: 'copy'

    input:
    file colsJson from colsdata
    file members from rows_hclust_members

    output:
    file 'pairwise_occurrence_counts.json.gz' into occur_counts

    script:
    /* the output file is named explicitly through --counts_out */
    """
    $dbin/pairwise_occurrence_counts.py \
        --source $colsJson \
        --cluster_members $members \
        --counts_out pairwise_occurrence_counts.json.gz
    """
}
process generate_enrichment_results {
    /* Compute the enrichment results and publish them under analyses/enrichment. */
    publishDir "$docket/analyses/enrichment", mode: 'copy'

    input:
    file members from rows_hclust_members
    file countsJson from occur_counts

    output:
    file 'enrich_results_*.txt.gz' into enrichment_results

    script:
    /* no output name is passed; the files matching the declared glob are expected to be written by compute_chi_squared.py */
    """
    $dbin/compute_chi_squared.py \
        --counts_file $countsJson \
        --cluster_members $members
    """
}
process copy_notebooks {
    /* Copy the results helper module and the review notebook into the docket's visualizations folder. */
    publishDir "$docket/visualizations", mode: 'copy'

    output:
    file 'results.py'
    file 'review-docket-study-results.ipynb'

    script:
    /* both files are copied from the pipeline directory into the task directory so they can be published */
    """
    cp '$baseDir/common/results.py' .
    cp '$baseDir/notebooks/review-docket-study-results.ipynb' .
    """
}