-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinspect.py
executable file
·131 lines (106 loc) · 3.53 KB
/
inspect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python
import zipfile
from os.path import basename, splitext
import csv
import argparse
from itertools import izip_longest
from pprint import pprint
def inspect(path):
    """Print a short summary of the tab-separated index file at *path*.

    Writes the base name of the file, then one ``- <field>`` line for every
    column of the header row, then the number of records that follow the
    header.

    A file with no rows at all is reported as having zero records; calling
    ``next`` on the exhausted reader would otherwise raise ``StopIteration``.
    """
    # Every print is given one pre-formatted string, which renders the same
    # as a Python 2 print statement or as a print() call.
    print(basename(path))
    with open_indexfile(path) as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader, None)
        if header is None:
            # Completely empty file: there is no header to list.
            print('Record count: 0')
            return
        print('- ' + '\n- '.join(header))
        # The header row has been consumed, so only data rows are counted.
        print('Record count: %d' % sum(1 for _ in reader))
def compare(paths):
    """Compare the index files in *paths* field by field against the first.

    The header of every file must equal the header of the first file. On the
    first mismatch the two headers and their common fields (sorted, so the
    output is stable) are printed and the comparison is abandoned.

    Otherwise the files are read in lockstep and records are matched purely
    by position: for record ``i`` each field of every later file is checked
    against the first file, and each difference is printed. When a file has
    fewer records than the others, the positions it lacks are skipped rather
    than compared.

    All files opened here are closed before returning.
    """
    handles = []
    try:
        for path in paths:
            handles.append(open_indexfile(path))
        readers = [
            csv.DictReader(handle, delimiter='\t') for handle in handles
        ]
        first, rest = readers[0], readers[1:]
        fieldnames = first.fieldnames

        for reader in rest:
            if reader.fieldnames != fieldnames:
                print("Fieldnames don't match: %s != %s" % (
                    reader.fieldnames, fieldnames
                ))
                # DictReader.fieldnames is None for a file without a header
                # row; treat that as "no fields" instead of failing in set().
                common = set(reader.fieldnames or ()) & set(fieldnames or ())
                print("Common fields are " + ", ".join(sorted(common)))
                return

        # izip_longest pads exhausted readers with None.
        for i, rows in enumerate(izip_longest(*readers)):
            first_row = rows[0]
            if first_row is None:
                # The reference file has run out of records, so nothing that
                # follows can be compared against it.
                break
            for row in rows[1:]:
                if not row:
                    # This file has no record at position i.
                    continue
                for fieldname in fieldnames:
                    expected = first_row.get(fieldname)
                    actual = row.get(fieldname)
                    if expected != actual:
                        print(
                            "Difference detected in field %s for record %d: "
                            "%s != %s" % (fieldname, i, expected, actual)
                        )
    finally:
        for handle in handles:
            handle.close()
def diff(paths, id_field="filename"):
    """Report which record ids each index file in *paths* is missing.

    Every file is read as tab-separated data with a header row, and the value
    of the *id_field* column is collected from each record. The union of the
    ids seen across all files is printed, followed by one line per file that
    either lists the ids absent from that file (sorted, so the output is
    stable) or states that nothing is missing.

    Each file is closed as soon as its ids have been collected.

    Raises:
        KeyError: if a record has no *id_field* column.
    """
    ids = []
    for path in paths:
        with open_indexfile(path) as handle:
            reader = csv.DictReader(handle, delimiter='\t')
            # Only the ids are needed for the report, not the full rows.
            ids.append(set(row[id_field] for row in reader))

    all_ids = set().union(*ids)
    # NOTE(review): this dumps the raw id set ahead of the per-file report;
    # it looks like leftover debug output -- confirm before removing.
    print(all_ids)

    for idset, path in zip(ids, paths):
        missing = all_ids - idset
        if missing:
            print("%s is missing %s" % (path, ', '.join(sorted(missing))))
        else:
            print("%s has no missing records" % (path,))
# Pretty-print the records of every index file listed in `paths`.
#
# Each file is opened with open_indexfile and parsed as tab-delimited data by
# csv.DictReader, so every record is handed to pprint as a mapping of column
# name to value.
def print_indexfiles(paths):
# All files are opened up front, before anything is printed.
# NOTE(review): the handles are not explicitly closed anywhere in this
# function.
readers = [
csv.DictReader(open_indexfile(path), delimiter='\t') for path in paths
]
for path, reader in zip(paths, readers):
# The path is printed as a heading ahead of that file's records.
print path
for row in reader:
pprint(row)
# A bare print statement emits an empty line.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether this separator is written after every record or once per file --
# confirm against the original layout before restructuring.
print
def open_indexfile(path):
    """Open the index file at *path* and return a readable file object.

    A path whose extension is ``.zip`` (compared case-insensitively, so
    ``INDEX.ZIP`` is recognised too) is treated as a zip archive and the
    first member listed in the archive is opened. Any other path is opened
    directly with the built-in ``open``.

    Raises:
        IndexError: if the zip archive contains no members.
    """
    ext = splitext(basename(path))[1]
    if ext.lower() != '.zip':
        return open(path)

    zp = zipfile.ZipFile(path)
    members = zp.namelist()
    if not members:
        zp.close()
        # Same exception type that indexing the empty list would produce,
        # but with a message that names the offending archive.
        raise IndexError('zip archive %r contains no files' % (path,))
    return zp.open(members[0])
if __name__ == '__main__':
    # Command-line entry point. One of four modes is selected by flag and
    # stored in ``args.mode``; when several mode flags are given the last one
    # wins, and with no flag the mode defaults to 'inspect'.
    #
    # The description must be passed by keyword: the first positional
    # parameter of ArgumentParser is ``prog``, so a positional string would
    # replace the program name in the usage line instead of describing it.
    parser = argparse.ArgumentParser(description='Inspect index files.')
    parser.add_argument(
        '-i', '--inspect', action='store_const', const='inspect', dest='mode',
        default='inspect',
        help='list the header fields and record count of each file (default)'
    )
    parser.add_argument(
        '-c', '--compare', action='store_const', const='compare', dest='mode',
        help='compare the files record by record against the first file'
    )
    parser.add_argument(
        '-d', '--diff', action='store_const', const='diff', dest='mode',
        help='report which record ids are missing from each file'
    )
    parser.add_argument(
        '-p', '--print', action='store_const', const='print', dest='mode',
        help='pretty-print every record of each file'
    )
    parser.add_argument('indexfiles', nargs='+')
    args = parser.parse_args()

    # Every print is given one pre-formatted string, which renders the same
    # as a Python 2 print statement or as a print() call.
    joined = ', '.join(args.indexfiles)
    if args.mode == 'inspect':
        print('Inspecting ' + joined)
        # inspect works on one file at a time; the other modes take the list.
        for path in args.indexfiles:
            inspect(path)
    elif args.mode == 'compare':
        print('Comparing ' + joined)
        compare(args.indexfiles)
    elif args.mode == 'diff':
        print('Diffing ' + joined)
        diff(args.indexfiles)
    elif args.mode == 'print':
        print_indexfiles(args.indexfiles)