-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathcochrane_fr.py
143 lines (130 loc) · 6.33 KB
/
cochrane_fr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python
# -*- coding: utf-8 -*-
# License: MIT
import pywikibot
import re
import requests
import datetime
import locale
from pywikibot import pagegenerators
# Dry-run switch: the wiki page saves in this script are guarded by
# `debug == False` checks, so setting this to True avoids writing to the wiki.
debug = False
# Upper bound for the `nummodified` counter; the scan quits once it is reached.
maxnum = 1000
# Title of the wiki page that update reports are appended to. Its "/Archiver"
# subpage receives the report lines that get archived.
reportpage = u'Projet:Médecine/Cochrane/Bot'
# Switch to the French locale; `datestr` is later built with strftime("%B %Y"),
# whose month name depends on the active locale.
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
def update_report(page, old_pmid, new_pmid, previousreports):
    """Add an "outdated review" entry for *page* to the report page.

    The entry links the article, its edit form, the PubMed record that the
    article currently cites (*old_pmid*) and the record that replaces it
    (*new_pmid*). Nothing is saved when the very same entry is already
    present on the report page or inside *previousreports*.

    Uses the module-level ``site`` and ``reportpage``.
    """
    title = page.title()
    entry = (
        u'Article [[%s]] ([{{fullurl:%s|action=edit}} edit])'
        u' ancienne critique [https://www.ncbi.nlm.nih.gov/pubmed/%s PMID:%s]'
        u' nouvelle critique [https://www.ncbi.nlm.nih.gov/pubmed/%s PMID:%s]'
    ) % (title, title, old_pmid, old_pmid, new_pmid, new_pmid)

    log_page = pywikibot.Page(site, reportpage)
    current = log_page.get()

    already_listed = entry in current or entry in previousreports
    if not already_listed:
        # The entry becomes a new bullet; the trailing "~~~~~" is wikitext
        # that is stored as part of the line.
        log_page.text = current + "\n*" + entry + u' - ~~~~~'
        log_page.save(u'Rapport de mise à jour à inclure ' + title)
# Cache used by the scan further down: cited PMID -> PMID of the review that
# replaces it, or 0 when there is nothing to report for that PMID.
checkedpages = {}
site = pywikibot.Site('fr', 'wikipedia')

# Housekeeping before the scan: report lines that carry a {{Fait}}/{{fait}}
# template are moved from the report page to its archive subpage.
report = pywikibot.Page(site, reportpage)
report_text = report.get().splitlines()
archive = pywikibot.Page(site, reportpage + "/Archiver")
archive_text = archive.get()

handled_lines = []
pending_lines = []
for line in report_text:
    print(line)
    if "{{Fait" in line or "{{fait" in line:
        handled_lines.append(line)
    else:
        pending_lines.append(line)

# Every line is prefixed with a newline, exactly as if it had been appended
# one at a time.
archive_text = archive_text + "".join("\n" + entry for entry in handled_lines)
report_text_new = "".join("\n" + entry for entry in pending_lines)
print(report_text_new)
print(archive_text)

if not debug:
    archive.text = archive_text.strip()
    archive.save("Archivage d'anciens rapports")
    report.text = report_text_new.strip()
    report.save("Archivage d'anciens rapports")

# Everything reported so far (archived or still open); update_report() checks
# new entries against this text to avoid repeating them.
previousreports = archive_text + "\n" + report_text_new
# Search queries used to find articles citing a Cochrane review: the
# "périodique" and "titre" citation parameters, each in the four spacing
# variants around "|" and "=". Raw strings keep the backslash in "\|"
# without relying on an invalid escape sequence.
regexes = [
    r"insource:/\| périodique =.+Cochrane/",
    r"insource:/\| périodique=.+Cochrane/",
    r"insource:/\|périodique =.+Cochrane/",
    r"insource:/\|périodique=.+Cochrane/",
    r"insource:/\| titre =.+Cochrane/",
    # Was "titre:/\| title=.+Cochrane/": wrong keyword and wrong parameter
    # name, so the "| titre=" spacing variant was never searched.
    r"insource:/\| titre=.+Cochrane/",
    r"insource:/\|titre =.+Cochrane/",
    r"insource:/\|titre=.+Cochrane/",
]
# Number of pages visited by the scan.
i = 0
# Number of reports counted so far; compared against `maxnum`.
nummodified = 0
todaysdate = datetime.datetime.now()
# Lower-cased "month year" template parameter; the month name follows the
# active locale.
datestr = "|date = " + todaysdate.strftime("%B %Y").lower()
print(datestr)
# Address of the PubMed record page for a given PMID.
_PUBMED_URL = 'https://pubmed.ncbi.nlm.nih.gov/%s'
# "pmid" parameter of a citation template, followed by another parameter.
# NOTE(review): a pmid that is the last parameter before "}}" is not matched.
_PMID_PARAM_RE = re.compile(r'\|\s*?pmid\s*?\=\s*?(\d+?)\s*?\|')
# Link that PubMed labels "linked-update". It is matched against HTML from
# which all whitespace has been removed, hence no spaces between attributes.
_LINKED_UPDATE_RE = re.compile(
    r'data-ga-category="comment_correction"data-ga-action="(\d+?)"data-ga-label="linked-update">')


def _fetch_pubmed(pmid):
    """Return the HTML of the PubMed page for *pmid*.

    Raises requests.RequestException when the request fails or times out.
    """
    return requests.get(_PUBMED_URL % pmid, timeout=10.0).text


def _linked_update(html):
    """Return the PMID of the first "linked-update" link in *html*, or None."""
    match = _LINKED_UPDATE_RE.search(re.sub(r'\s+', '', html))
    return match.group(1) if match else None


def _resolve_update(pmid):
    """Return the PMID of the review replacing *pmid*, or 0 if there is none.

    The replacement is itself checked: if its page title starts with
    WITHDRAWN it is discarded, and if its page mentions WITHDRAWN and links
    to a further update, that further record is used instead (unless it is
    withdrawn as well). The chain is followed at most two steps.

    Raises requests.RequestException if any of the requests fails, so that
    the caller never caches a half-verified answer.
    """
    newer = _linked_update(_fetch_pubmed(pmid))
    if newer is None:
        return 0
    print(newer)

    newer_html = _fetch_pubmed(newer)
    # A withdrawn replacement must not be reported as an update.
    result = 0 if '<title>WITHDRAWN' in newer_html else newer
    if 'WITHDRAWN' in newer_html:
        newest = _linked_update(newer_html)
        if newest is not None:
            # Give up if this one has been withdrawn too.
            newest_html = _fetch_pubmed(newest)
            result = 0 if '<title>WITHDRAWN' in newest_html else newest
    return result


# Scan every article returned by the search queries and report each cited
# PMID for which PubMed lists a newer, non-withdrawn version.
# NOTE: the bot only writes to the report page; tagging the article itself
# with {{Passage à actualiser}} is not performed.
for regex in regexes:
    generator = pagegenerators.SearchPageGenerator(regex, site=site, namespaces=[0])
    gen = pagegenerators.PreloadingGenerator(generator)
    for page in gen:
        i += 1
        try:
            text = page.get()
        except Exception as err:
            # Best effort: a page that cannot be loaded is skipped, but
            # Ctrl-C and SystemExit are no longer swallowed.
            print('Skipping %s: %s' % (page.title(), err))
            continue

        pmids = _PMID_PARAM_RE.findall(text)
        print(len(pmids))
        for pmid in pmids:
            if pmid not in checkedpages:
                print(_PUBMED_URL % pmid)
                try:
                    checkedpages[pmid] = _resolve_update(pmid)
                except requests.RequestException:
                    # Nothing is cached, so the PMID is retried the next
                    # time it is encountered.
                    continue
            else:
                print('using cache for ' + pmid)

            newpmid = checkedpages[pmid]
            print(newpmid)
            if newpmid == 0:
                continue
            # Editors can silence the bot for one reference with this marker.
            if u'<!-- Aucune mise à jour nécessaire: ' + pmid + ' -->' in text:
                continue
            # Skip references that already carry the update tag.
            up = u'<!-- Nouvelle revue https://pubmed.ncbi.nlm.nih.gov/' + newpmid + u" -->{{Passage à actualiser"
            if up in text:
                continue

            if not debug:
                update_report(page, pmid, newpmid, previousreports)
                nummodified += 1
                if nummodified > maxnum - 1:
                    print('Reached the maximum of ' + str(maxnum) + ' pages modified, quitting!')
                    raise SystemExit

print(str(i) + " pages checked, " + str(nummodified) + " tagged!")