-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathcommons_migrate_ids_to_wikidata.py
173 lines (154 loc) · 6.39 KB
/
commons_migrate_ids_to_wikidata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Migrate data from Commons to Wikidata
# Started 11 May 2018 by Mike Peel
from __future__ import unicode_literals
import pywikibot
import numpy as np
import time
import string
from pywikibot import pagegenerators
import urllib
from pibot_functions import *
# maxnum: the driver loop at the bottom of the script quits once this many
# pages have been edited; nummodified is the running count of edits.
maxnum, nummodified = 10000, 0

# Commons is the wiki being edited; repo is its linked data repository.
commons = pywikibot.Site('commons', 'commons')
repo = commons.data_repository()  # this is a DataSite object

# manual: when True, migratecat() asks for confirmation before each save.
# debug is not read anywhere in the visible part of this script.
debug, manual = True, False
# ---------------------------------------------------------------------------
# Template / property configuration.
#
# `templates` and `properties` are parallel lists: templates[i] is the Commons
# template whose first parameter holds the identifier stored on Wikidata under
# properties[i]. Both MUST be lists of the same length - migratecat() walks
# them by index, so a bare string would be iterated character by character.
#
# Inactive configurations are kept below (commented out) for reference.
# ---------------------------------------------------------------------------
# category = 'Category:World Heritage Sites by name'
# templates = ['World Heritage Site']
# properties = ['P757']
# category = 'Category:HPIP with known IDs'
# templates = ['HPIP']
# properties = ['P5094']
# category = 'Category:Protected areas with known WDPA-ID'
# templates = ['WDPA']
# properties = ['P809']
# category = 'Category:Listed buildings in England with known IDs'
# templates = ['Listed building England', 'listed building England']
# properties = ['P1216', 'P1216']
# category = 'Category:Buildings of Madrid with COAM Register number'
# templates = ['COAM']
# properties = ['P2917']

# Active configuration.
templates = ['South African Heritage Site']
properties = ['P3759']

# Not referenced in the visible part of this script.
toremove = ['/', '|']

# Template names that migratecat() strips when they appear bare, with the
# item's ID as (part of) their parameter, or with an empty parameter.
others = [
    'mainw', 'Mainw',
    'Interwiki from Wikidata', 'interwiki from Wikidata',
    'label', 'Label',
    'object location|wikidata=', 'object location|Wikidata=',
    'Object location|Wikidata=', 'Object location|wikidata=',
    "Interwiki from Wikidata", "interwiki from Wikidata",
    "Interwiki from wikidata", "interwiki from wikidata",
    "PeopleByName", "peopleByName",
    "Authority control", "authority control",
    "On Wikidata", "on Wikidata",
    "In Wikidata", "in Wikidata",
    "Wikidata", "wikidata",
    "en",
]

# Template prefixes that migratecat() strips when their parameter is the
# item's English Wikipedia article title.
enwp = ['mainw', 'Mainw', 'on Wikipedia|en=', 'On Wikipedia|en=']

# Edit summary used when saving.
savemessage = "Trim information provided through the Wikidata Infobox"

# Accepted spellings of the infobox template; a page is only processed when
# one of these occurs in its text, and all are normalised to the first form.
wikidatainfobox = [
    "Wikidata Infobox", "Wikidata infobox",
    "wikidata infobox", "wikidata Infobox",
    "Infobox Wikidata", "infobox Wikidata",
    "infobox wikidata", "Infobox wikidata",
]
def _enwp_title_variants(item_dict):
    """Return the item's enwiki title as-is and with a lower-cased first letter.

    An empty list is returned when the item has no enwiki sitelink or the
    title cannot be extracted.
    """
    try:
        enwp_link = get_sitelink_title(item_dict['sitelinks']['enwiki'])
        return [enwp_link, enwp_link[0].lower() + enwp_link[1:]]
    except Exception:
        # get_sitelink_title is not defined in this file (presumably it comes
        # from pibot_functions), so its failure modes are unknown here; keep
        # this step best-effort.
        return []


def _save_category(targetcat):
    """Save targetcat with the standard summary; return 1 on success, else 0."""
    try:
        targetcat.save(savemessage)
    except Exception:
        print("That didn't work!")
        return 0
    return 1


def migratecat(targetcat):
    """Strip templates from a Commons page that duplicate its Wikidata item.

    The page is only touched when it already carries a Wikidata Infobox and is
    linked to a Wikidata item. If that item has a P301 claim, the claim's
    target item is used instead. Templates listed in `templates` are removed
    when their ID parameter equals the first claim of the matching entry in
    `properties`; templates in `others` / `enwp` are removed when they only
    repeat the item ID or the enwiki title. Nothing is saved if any name from
    `templates` still occurs in the text afterwards.

    Args:
        targetcat: pywikibot page to process (the script passes pages from
            namespace 14 that transclude the configured template).

    Returns:
        1 if the page was saved, 0 otherwise.
    """
    print(targetcat)
    original_text = targetcat.get()
    target_text = original_text
    print(target_text)

    # Check that we have a Wikidata infobox here
    if not any(option in target_text for option in wikidatainfobox):
        print('No infobox')
        return 0

    # Fetch the info from Wikidata
    try:
        wd_item = pywikibot.ItemPage.fromPage(targetcat)
        item_dict = wd_item.get()
    except Exception:
        print('No Wikidata ID')
        return 0
    print(wd_item.title())

    # Or in the main topic: follow P301 from the linked item. (This used to
    # read an undefined name, so the fallback silently never ran.)
    try:
        topic_claims = item_dict['claims']['P301']
    except KeyError:
        topic_claims = []
    for topic_claim in topic_claims:
        try:
            topic_item = topic_claim.getTarget()
            topic_dict = topic_item.get()
        except Exception:
            # Unusable target (e.g. no value, or the item cannot be loaded):
            # keep whatever item we already have.
            continue
        wd_item, item_dict = topic_item, topic_dict

    enwp_links = _enwp_title_variants(item_dict)

    # Look for replacements. Only the first claim of each property is
    # considered, but every template/property pair gets its own chance, both
    # without and with the enwiki title as a second parameter.
    for template_name, prop in zip(templates, properties):
        try:
            first_claim = item_dict['claims'][prop][0]
            prefix = "{{" + template_name + "|" + first_claim.getTarget()
        except (KeyError, IndexError, TypeError):
            # No such claim, or its value is not a plain string.
            continue
        target_text = target_text.replace(prefix + "}}", "")
        for link in enwp_links:
            target_text = target_text.replace(prefix + "|" + link + "}}", "")

    # Remove other templates
    qid = wd_item.title()
    suffixes = ("}}", "|" + qid + "}}", qid + "}}", "| }}", " | }}")
    for name in others:
        for suffix in suffixes:
            target_text = target_text.replace("{{" + name + suffix, "")
    for name in enwp:
        for link in enwp_links:
            target_text = target_text.replace("{{" + name + "|" + link + "}}", "")

    # We should now not be able to find the original template here - but if we can, don't edit it.
    if any(option in target_text for option in templates):
        return 0

    # Only remove whitespace if we're making another change
    if target_text == original_text:
        return 0
    for variant in wikidatainfobox:
        target_text = target_text.replace("{{" + variant, '{{Wikidata Infobox')
    target_text = target_text.replace('\n\n\n', '\n')
    target_text = target_text.replace('\n\n\n', '\n')
    target_text = target_text.replace('\n\n{{Wikidata Infobox', '\n{{Wikidata Infobox')

    # Time to save it
    targetcat.text = target_text.strip()
    print(targetcat.text)
    if manual and raw_input("Save on Commons? ") != 'y':
        return 0
    return _save_category(targetcat)
# Work through every namespace-14 page that transcludes the first configured
# template, stopping once maxnum pages have been edited.
template = pywikibot.Page(commons, 'Template:' + templates[0])
targetcats = template.embeddedin(namespaces='14')

# Alternative entry point (disabled): walk the subcategories of `category`.
# cat = pywikibot.Category(commons,category)
# nummodified += migratecat(cat)
# targetcats = pagegenerators.SubCategoriesPageGenerator(cat, recurse=False);
for targetcat in targetcats:
    print(targetcat)
    print("\n" + targetcat.title())
    # print target.text
    # migratecat() returns 1 when it saved the page and 0 otherwise.
    nummodified = nummodified + migratecat(targetcat)
    if nummodified < maxnum:
        continue
    print('Reached the maximum of ' + str(maxnum) + ' entries modified, quitting!')
    exit()

print('Done! Edited ' + str(nummodified) + ' entries')
# EOF