forked from WikiEducationFoundation/WikiEduDashboard
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrevision_importer.rb
137 lines (114 loc) · 4.42 KB
/
revision_importer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# frozen_string_literal: true
require "#{Rails.root}/lib/replica"
require "#{Rails.root}/lib/duplicate_article_deleter"
require "#{Rails.root}/lib/importers/article_importer"
#= Imports and updates revisions from Wikipedia into the dashboard database
class RevisionImporter
  # Number of extra days after the course end date that are still covered by
  # the import window.
  DAYS_TO_IMPORT_AFTER_COURSE_END = 30

  # Maximum number of fetched entries processed in one import pass; keeps
  # memory use bounded.
  IMPORT_SLICE_SIZE = 8000

  # wiki   - the wiki to import from. Its id scopes the Article and Revision
  #          lookups, and it is handed to Replica and DuplicateArticleDeleter.
  # course - the course whose students' revisions are imported. Its start and
  #          end dates bound the import window.
  def initialize(wiki, course)
    @wiki = wiki
    @course = course
  end

  # Fetches the revisions that still need importing for the course and
  # imports them, creating or updating Articles along the way.
  def import_new_revisions_for_course
    import_revisions(new_revisions_for_course)
  end

  ###########
  # Helpers #
  ###########
  private

  # Given a Course, get new revisions for the users in that course.
  # Returns the combined results for "new" and "old" users.
  def new_revisions_for_course
    results = []

    # Users with no revisions are considered "new". For them, we search for
    # revisions starting from the beginning of the course, in case they were
    # just added to the course.
    @new_users = users_with_no_revisions
    results += revisions_from_new_users unless @new_users.empty?

    # For users who already have revisions during the course, we assume that
    # previous updates imported their revisions prior to the latest revisions.
    # We only need to import revisions dated from the latest revision already
    # stored for the course onward.
    @old_users = @course.students - @new_users
    results += revisions_from_old_users unless @old_users.empty?

    results
  end

  # Revisions for users without any recorded revisions, searched from the
  # course start date.
  def revisions_from_new_users
    get_revisions(@new_users, course_start_date, end_of_update_period)
  end

  # Revisions for users who already have recorded revisions, searched from the
  # date of the latest stored revision of the course on this wiki. Falls back
  # to the course start date when no such revision exists.
  def revisions_from_old_users
    latest_rev = latest_revision_of_course
    start = latest_rev.blank? ? course_start_date : latest_rev.date.strftime('%Y%m%d')
    get_revisions(@old_users, start, end_of_update_period)
  end

  # Use revision data fetched from Replica to add new Revisions as well as
  # new Articles where appropriate.
  # The data is processed in slices of IMPORT_SLICE_SIZE entries to avoid
  # running out of memory.
  def import_revisions(data)
    data.each_slice(IMPORT_SLICE_SIZE) do |sub_data|
      import_revisions_slice(sub_data)
    end
  end

  # Get revisions made by a set of users between two dates.
  # start and end_date are 'YYYYMMDD' strings.
  # NOTE(review): 40 is presumably the number of users per Replica request;
  # confirm against Utils.chunk_requests.
  def get_revisions(users, start, end_date)
    Utils.chunk_requests(users, 40) do |block|
      Replica.new(@wiki).get_revisions block, start, end_date
    end
  end

  # Course start date formatted as 'YYYYMMDD'.
  def course_start_date
    @course.start.strftime('%Y%m%d')
  end

  # Last date of the import window, formatted as 'YYYYMMDD'.
  def end_of_update_period
    # Add one day so that the query does not end at the beginning of the last day.
    (@course.end + 1.day + DAYS_TO_IMPORT_AFTER_COURSE_END.days).strftime('%Y%m%d')
  end

  # Students of the course whose courses_users record has a revision_count of 0.
  def users_with_no_revisions
    @course.users.role('student')
           .joins(:courses_users)
           .where(courses_users: { revision_count: 0 })
  end

  # Most recent (by date) stored revision of the course on this wiki, or nil.
  def latest_revision_of_course
    @course.revisions.where(wiki_id: @wiki.id).order('date DESC').first
  end

  # Processes one slice of fetched data: updates or creates the articles,
  # collects the revisions that are not stored yet, resolves duplicate
  # articles and finally imports the collected revisions.
  # Each entry is destructured into a pair; only its second element (the
  # article data) is used.
  def import_revisions_slice(sub_data)
    @articles = []
    @revisions = []

    sub_data.each do |_a_id, article_data|
      process_article_and_revisions(article_data)
    end

    DuplicateArticleDeleter.new(@wiki).resolve_duplicates(@articles)
    Revision.import @revisions
  end

  # Updates or creates the Article described by article_data, then queues
  # each of its not-yet-stored revisions.
  def process_article_and_revisions(article_data)
    article = article_updated_from_data(article_data)
    @articles.push article

    article_data['revisions'].each do |rev_data|
      push_revision_record(rev_data, article)
    end
  end

  # Finds the Article for this wiki by mw_page_id, or builds a new one, and
  # saves it with the title and namespace from the fetched data.
  # Raises if the record cannot be saved (update!).
  def article_updated_from_data(article_data)
    attrs = article_data['article']
    article = Article.find_or_initialize_by(mw_page_id: attrs['mw_page_id'],
                                            wiki_id: @wiki.id)
    article.update!(title: attrs['title'], namespace: attrs['namespace'])
    article
  end

  # Queues a new Revision built from rev_data unless a revision with the same
  # mw_rev_id is already stored for this wiki.
  def push_revision_record(rev_data, article)
    # exists? avoids loading the whole record just to test for presence.
    return if Revision.exists?(mw_rev_id: rev_data['mw_rev_id'], wiki_id: @wiki.id)

    @revisions.push revision_from_data(rev_data, article)
  end

  # Builds an unsaved Revision from one fetched revision hash.
  # NOTE(review): the existence check in push_revision_record uses @wiki.id,
  # while wiki_id here comes from rev_data; presumably they are equal,
  # confirm against the Replica response.
  def revision_from_data(rev_data, article)
    Revision.new(mw_rev_id: rev_data['mw_rev_id'],
                 date: rev_data['date'],
                 characters: rev_data['characters'],
                 article_id: article.id,
                 mw_page_id: rev_data['mw_page_id'],
                 user_id: user_id_for(rev_data['username']),
                 new_article: string_to_boolean(rev_data['new_article']),
                 system: string_to_boolean(rev_data['system']),
                 wiki_id: rev_data['wiki_id'])
  end

  # Id of the User with the given username, or nil when there is none.
  # Results (including nil) are cached for the lifetime of this importer so
  # each distinct username is queried only once instead of once per revision.
  def user_id_for(username)
    @user_ids_by_username ||= {}
    return @user_ids_by_username[username] if @user_ids_by_username.key?(username)

    @user_ids_by_username[username] = User.find_by(username: username)&.id
  end

  # Converts the strings 'true' / 'false' to booleans.
  # Any other value yields nil.
  def string_to_boolean(string)
    case string
    when 'true' then true
    when 'false' then false
    end
  end
end