-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfix_authors_in_json.py
executable file
·135 lines (122 loc) · 5.02 KB
/
fix_authors_in_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
# fix_authors_in_json.py was a one-time script to fix problems in json_data for manuscripts.
__author__ = 'daisieh'
import re
import os
import sys
import json
import urllib
def rows_from_query(sql):
    """Run ``sql`` through the ``psql`` command-line client and return its rows.

    psql is invoked in unaligned mode (``-A``), so the output is expected to
    be a header line, one ``|``-separated line per row, and a trailing
    row-count line.

    Args:
        sql: the SQL statement handed to ``psql -c``.

    Returns:
        A list of rows, each a list of column strings, or None when psql
        printed two lines or fewer (no body rows, or the command produced
        no usable output).

    NOTE(review): every ``|`` is treated as a column separator, so a column
    value that itself contains ``|`` is split across several list items.
    """
    try:
        from shlex import quote as shell_quote  # Python 3
    except ImportError:
        from pipes import quote as shell_quote  # Python 2

    # Quote the whole statement for the shell so that double quotes, `$` and
    # backticks inside the SQL reach psql untouched.
    cmd = "psql -A -U dryad_app dryad_repo -c %s" % shell_quote(sql)
    pipe = os.popen(cmd)
    try:
        output = [line.strip().split('|') for line in pipe.readlines()]
    finally:
        # Always reap the child process, even if reading fails.
        pipe.close()

    # The output should have at least 3 lines: header, body rows, row count.
    if len(output) <= 2:
        return None
    return output[1:-1]
def update_json_row(json_data, manuscript_id):
    """Start a ``psql`` command that stores ``json_data`` on one manuscript row.

    Args:
        json_data: the new JSON text for the ``json_data`` column.
        manuscript_id: id of the row to update. It is interpolated into the
            SQL as-is. NOTE(review): presumably an integer key read back
            from the database - confirm before passing anything else.

    Returns:
        The pipe object from ``os.popen`` for the running psql command. The
        caller is expected to ``close()`` it; per the ``os.popen``
        documentation, ``close()`` returns None when the command exits with
        status 0 and the exit status otherwise.
    """
    try:
        from shlex import quote as shell_quote  # Python 3
    except ImportError:
        from pipes import quote as shell_quote  # Python 2

    # The payload is sent as a PostgreSQL dollar-quoted string. Pick a tag
    # that does not occur in the payload so it cannot end the literal early
    # (a bare `$$` delimiter breaks as soon as the JSON contains `$$`).
    tag = "json"
    while "$" + tag in json_data:
        tag += "x"
    sql = "update manuscript set json_data = $%s$%s$%s$ where manuscript_id = %s" % (
        tag, json_data, tag, manuscript_id)

    # Shell-quote the whole statement so apostrophes in the JSON survive.
    cmd = "psql -A -U dryad_app dryad_repo -c %s" % shell_quote(sql)
    return os.popen(cmd)
# Splits "Family, Given Names" at the first ", ".
_FAMILY_GIVEN_PATTERN = re.compile("(.*?), (.*)")

# Alternate-encoding name fields dropped once the plain fields are written.
_OBSOLETE_NAME_KEYS = (
    "unicodeFullName", "unicodeFamilyName", "unicodeGivenNames",
    "htmlfullName", "htmlfamilyName", "htmlgivenNames",
    "normalizedFullName", "normalizedFamilyName", "normalizedGivenNames",
)


def _fix_author_names(author):
    """Give one author dict plain fullName/familyName/givenNames, in place."""
    if "fullName" not in author:
        # Build the plain fields from the unicode ones (KeyError if absent),
        # then drop every alternate-encoding variant.
        family_name = author["unicodeFamilyName"]
        given_names = author["unicodeGivenNames"]
        for key in _OBSOLETE_NAME_KEYS:
            author.pop(key, None)
        author["fullName"] = family_name + ", " + given_names
        author["familyName"] = family_name
        author["givenNames"] = given_names
    elif "familyName" not in author:
        # Only a full name is present: split it if it has the expected shape.
        match = _FAMILY_GIVEN_PATTERN.match(author["fullName"])
        if match:
            author["familyName"] = match.group(1)
            author["givenNames"] = match.group(2)
    else:
        print("looks okay")


def update_json(manuscript_id, json_data):
    """Return ``json_data`` with author names and abstract normalized.

    Every author in the list stored under the first key of ``authors``, and
    the corresponding author, ends up with ``fullName``, ``familyName`` and
    ``givenNames`` entries where they can be derived. The abstract, when
    present and not null, is converted to ASCII with XML character
    references and percent-encoded, with spaces left readable.

    Args:
        manuscript_id: unused; kept so existing callers keep working.
        json_data: the manuscript record as JSON text.

    Returns:
        The updated record serialized with ``json.dumps``.

    Raises:
        KeyError: if ``authors`` or ``correspondingAuthor`` is missing, or an
            author has neither ``fullName`` nor the unicode name fields.
        IndexError: if ``authors`` is an empty object.
    """
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    manuscript = json.loads(json_data)

    authors = manuscript["authors"]
    # Only the author list under the first key is processed.
    for author in authors[list(authors)[0]]:
        _fix_author_names(author)

    corresponding = manuscript["correspondingAuthor"]["author"]
    _fix_author_names(corresponding)
    # Apply the empty defaults only AFTER normalizing. Setting them first
    # made the "split fullName" branch unreachable for this author.
    corresponding.setdefault("familyName", "")
    corresponding.setdefault("givenNames", "")

    abstract = manuscript.get("abstract")
    if abstract is not None:
        ascii_abstract = abstract.encode('ascii', 'xmlcharrefreplace')
        # quote() already turns apostrophes into %27, so no further quote
        # escaping is needed; only spaces are restored.
        manuscript["abstract"] = quote(ascii_abstract).replace('%20', ' ')

    return json.dumps(manuscript)
def main():
    """Rewrite json_data for every manuscript whose JSON lacks ``familyName``.

    Each selected row is passed through ``update_json`` and written back with
    ``update_json_row``. Rows whose update command does not exit cleanly are
    reported with a "reprocess" line followed by the JSON that failed.
    """
    bad_json_rows = rows_from_query("select manuscript_id, json_data, msid from manuscript where json_data NOT LIKE '%familyName%'")
    # rows_from_query returns None when nothing matched; treat it as empty
    # instead of crashing on len(None).
    if bad_json_rows is None:
        bad_json_rows = []
    print(len(bad_json_rows))

    for entry in bad_json_rows:
        # Rows are split on every "|", so JSON containing "|" arrives in
        # pieces. The first and last fields are the two ids; everything in
        # between is rejoined into the JSON column.
        manuscript_id = entry[0]
        msid = entry[-1]
        json_data = '|'.join(entry[1:-1])

        print("processing " + msid)
        good_json = update_json(manuscript_id, json_data)
        print(good_json)

        # close() yields None on a clean exit, the exit status otherwise.
        status = update_json_row(good_json, manuscript_id).close()
        if status is not None:
            print("reprocess " + msid)
            print(good_json)


# Run the fix only when executed as a script, not when imported.
if __name__ == '__main__':
    main()