-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathenwp_archaeologists.py
207 lines (190 loc) · 5.53 KB
/
enwp_archaeologists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Remove locally defined commons category links when bad or pointing to a redirect
# Mike Peel 22-May-2020 v1 - start
import pywikibot
import numpy as np
import time
import string
from pywikibot import pagegenerators
import urllib
from pibot_functions import *
# --- Run configuration -------------------------------------------------------
nummodified = 0  # count of edits saved this run; compared against maxnum below
wikidata_site = pywikibot.Site("wikidata", "wikidata")
repo = wikidata_site.data_repository()  # this is a DataSite object
enwp = pywikibot.Site('en', 'wikipedia')
debug = 1
trip = 1  # set to 0 to enable the skip-ahead resume logic in the main loop
maxnum = 10000  # hard cap on the number of entries modified per run
usecats = False  # True: walk enwp categories; False: use the Wikidata SPARQL query
targetitem = 'Q3621491'  # occupation: archaeologist
# Related occupation items also accepted as "archaeologist" (e.g. specialisms)
otheritems = ['Q56842676','Q15983985','Q25915497','Q1350189','Q52231239','Q26261971','Q26424344']
# Talk-page banner template aliases; the first entry is the canonical form
banner = ['WikiProject Archaeology','WP Archaeology','WP Archeology']
# Accepted spellings of the banner's women= parameter; first entry is canonical
tags = ['women=yes','women=y','women=Yes']
cats = ['Category:Women archaeologists','Category:British women archaeologists','Category:Indian women archaeologists']
if usecats:
    targetcats = cats
else:
    targetcats = [targetitem] + otheritems
# Main loop: for each target (category name or occupation QID) build a page
# generator, then for each page ensure (a) the Wikidata item looks like a
# woman archaeologist, (b) the enwp talk page carries the WikiProject banner
# with a women= tag, and (c) the article is in the women-archaeologists
# category.  All saves are confirmed interactively via input().
# NOTE(review): indentation reconstructed from control-flow logic — the
# original paste had whitespace stripped; nesting of the talk-page and
# category sections follows the guard flags (hastalk/hasbanner/haspage).
for targetcat in targetcats:
    print(targetcat)
    if usecats:
        cat = pywikibot.Category(enwp, targetcat)
        pages = pagegenerators.CategorizedPageGenerator(cat, recurse=False)
    else:
        # Women (P21=Q6581072) with this occupation who have an enwp article
        query = 'SELECT ?item WHERE {'\
            '?item wdt:P106 wd:'+targetcat+' .'\
            '?item wdt:P21 wd:Q6581072 .'\
            'FILTER EXISTS {'\
            '  ?wen schema:about ?item .'\
            '  ?wen schema:isPartOf <https://en.wikipedia.org/> .'\
            '}'\
            '}'
        print(query)
        pages = pagegenerators.WikidataSPARQLPageGenerator(query, site=repo)
    for page in pages:
        # Optional skip-ahead to resume broken runs: with trip=0, skip pages
        # until the named title is seen, then process normally from there.
        if trip == 0:
            if "Bonneau" in page.title():
                trip = 1
            else:
                print(page.title())
                continue
        # Cut-off at a maximum number of edits
        print("")
        print(nummodified)
        if nummodified >= maxnum:
            print('Reached the maximum of ' + str(maxnum) + ' entries modified, quitting!')
            exit()
        if usecats:
            # Category mode: page is an enwp article; fetch its Wikidata item.
            try:
                wd_item = pywikibot.ItemPage.fromPage(page)
                item_dict = wd_item.get()
                qid = wd_item.title()
                print("http://www.wikidata.org/wiki/"+qid)
            except Exception:
                # No item found: zero everything out and fall through
                # (the later claim checks will then prompt for manual review).
                print(page.title() + ' - no page found')
                wd_item = 0
                item_dict = 0
                qid = 0
                sitelink_check = 0
                # continue
        else:
            # SPARQL mode: page IS the Wikidata item; resolve the enwp article.
            wd_item = page
            item_dict = wd_item.get()
            qid = wd_item.title()
            page = pywikibot.Page(enwp, get_sitelink_title(item_dict['sitelinks']['enwiki']))
        print("\nhttp://en.wikipedia.org/wiki/" + page.title().replace(' ','_'))
        # Sanity check: instance of (P31) human (Q5)
        check_human = False
        try:
            p31 = item_dict['claims']['P31']
            for clm in p31:
                if 'Q5' in clm.getTarget().title():
                    check_human = True
        except Exception:
            input('No P31, check?')
        if not check_human:
            input('Not human, check?')
            continue
        # Sanity check: sex or gender (P21) female (Q6581072)
        check_woman = False
        try:
            p21 = item_dict['claims']['P21']
            for clm in p21:
                if 'Q6581072' in clm.getTarget().title():
                    check_woman = True
        except Exception:
            input('No P21, check?')
        if not check_woman:
            input('Not female, check?')
            continue
        # Occupation (P106) must be archaeologist or one of the related items;
        # offer to add the claim interactively if it is missing.
        check_occupation = False
        try:
            p106 = item_dict['claims']['P106']
            for clm in p106:
                if targetitem in clm.getTarget().title():
                    check_occupation = True
                else:
                    for alt in otheritems:
                        if alt in clm.getTarget().title():
                            check_occupation = True
        except Exception:
            pass  # no P106 claims; handled by the prompt below
        if not check_occupation:
            add_occupation = input('Not archaeologist, add it?')
            if add_occupation == 'y':
                newclaim = pywikibot.Claim(repo, 'P106')
                newclaim.setTarget(pywikibot.ItemPage(repo, targetitem))
                print(newclaim)
                wd_item.addClaim(newclaim, summary=u'Setting P106 value')
            else:
                continue
        # --- Talk page: ensure the WikiProject banner carries a women= tag ---
        talk = pywikibot.Page(enwp, 'Talk:'+page.title())
        hastalk = False
        hasbanner = False
        hastag = False
        try:
            talk_text = talk.get()
            hastalk = True
        except Exception:
            # Talk page does not exist; offer to create it with the banner.
            maketalk = input('No talk page, start it?')
            if maketalk == 'y':
                talk.text = "{{"+banner[0]+"|"+tags[0]+"}}"
                talk.save("Starting talk page, adding " + banner[0])
                nummodified += 1  # count the created talk page as an edit
                talk_text = talk.get()
        if hastalk:
            # Normalise banner aliases to the canonical name before searching
            for b in banner:
                talk_text = talk_text.replace(b,banner[0])
            test = talk_text.split(banner[0])
            if len(test) > 1:
                hasbanner = True
                # Only inspect the banner's own parameters (up to its '}}')
                test2 = test[1].split('}}')
                for tag in tags:
                    if tag in test2[0]:
                        hastag = True
            if not hastag:
                print('http://en.wikipedia.org/wiki/Talk:'+page.title().replace(' ','_'))
                if hasbanner:
                    talk_text = talk_text.replace(banner[0],banner[0]+'|'+tags[0])
                    if talk_text != talk.get():
                        print(talk_text)
                        savemessage = 'Add '+tags[0]+' parameter to '+banner[0]
                        print(savemessage)
                        savetalk = input('Save talk page?')
                        if savetalk == 'y':
                            talk.text = talk_text
                            talk.save(savemessage,minor=False)
                            nummodified += 1  # was never incremented in the original
                    else:
                        input('Not able to add tag, check?')
                else:
                    input('No banner found, add it manually?')
        # --- Article: ensure it is in the women-archaeologists category ---
        haspage = False
        try:
            target_text = page.get()
            haspage = True
        except Exception:
            input('No page, how did that happen?')
        hascat = False
        if haspage:
            print(cats)
            for testcat in cats:
                if testcat in target_text:
                    hascat = True
            if not hascat:
                # Insert after the last ']' (i.e. after the final existing
                # [[Category:...]] link) so the new category joins the others.
                pos = target_text.rfind(']')
                target_text = target_text[:pos+1] + '\n[['+cats[0]+']]'+target_text[pos+1:]
                if target_text != page.get():
                    print(target_text)
                    print("\nhttp://en.wikipedia.org/wiki/" + page.title().replace(' ','_'))
                    savemessage = 'Add [['+cats[0]+']]'
                    print(savemessage)
                    savepage = input('Save article?')
                    if savepage == 'y':
                        page.text = target_text
                        page.save(savemessage,minor=False)
                        nummodified += 1  # was never incremented in the original
                else:
                    input('Not able to add category, check?')
print('Done! Edited ' + str(nummodified) + ' entries')
# EOF