-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1948 from datadryad/3695-quarterly-clean-unmatche…
…d-institutions Create script to regularly match and update institutions
- Loading branch information
Showing
10 changed files
with
346 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Relative directory in which the ROR matcher jobs place their CSV report files.
REPORTS_DIR = 'reports'.freeze
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'csv' | ||
|
||
module Stash
  module Organization
    # Matches affiliations that have no ROR id against ROR organizations (by long name) and
    # folds every affiliation sharing that name into the single affiliation representing the ROR.
    # The traversal, counters and CSV buffer (@csv_rows, @updates_count) come from BaseRorMatcher.
    class AffiliationRorMatcher < BaseRorMatcher

      private

      # Connects all ROR-less affiliations named like `item` to the affiliation that carries `ror`.
      #
      # The action is always logged, added to the CSV buffer and counted; the authors are only
      # re-pointed and the duplicate affiliations destroyed when `perform_updates` is true.
      #
      # @param item [StashDatacite::Affiliation] the unmatched affiliation being processed
      # @param ror [#[]] matched ROR record, read through ror[:id] and ror[:name]
      def connect_to_ror(item, ror)
        ror_id = ror[:id]

        # Reuse the affiliation already linked to this ROR id, otherwise ask the model for one.
        # NOTE(review): from_ror_id presumably may hand back an unsaved record (the log below
        # falls back to 'new' when rep.id is nil) - confirm against the model.
        rep = StashDatacite::Affiliation.find_by(ror_id: ror_id)
        rep ||= StashDatacite::Affiliation.from_ror_id(ror_id: ror_id)
        to_fix = StashDatacite::Affiliation.where(ror_id: nil, long_name: item.long_name)

        update_affiliation_name(rep, ror)

        message = 'Replacing affiliations with'
        puts " - #{message} name \"#{item.long_name}\" (ids: #{to_fix.ids}) with \"#{ror[:name]}\" (id: #{rep.id || 'new'})"
        @csv_rows << [item.id, item.long_name, item.authors.count, message, ror[:name], ror[:id], rep.id]
        @updates_count += to_fix.count
        return unless perform_updates

        to_fix.each do |aff|
          # updating authors affiliation with new affiliation
          aff.authors.each do |author|
            author.affiliation = rep
          end
          aff.destroy
        end
      end

      # Renames the representative affiliation to the ROR name when the two differ.
      #
      # The previous name is captured before the record is touched so that the log line and
      # the CSV row report the name being replaced rather than the freshly written one.
      #
      # @param rep [StashDatacite::Affiliation] affiliation representing the ROR organization
      # @param ror [#[]] matched ROR record, read through ror[:name]
      def update_affiliation_name(rep, ror)
        previous_name = rep.long_name
        return if ror[:name] == previous_name

        rep.update(long_name: ror[:name]) if perform_updates
        message = 'Updating existing affiliation name'
        puts " - #{message} \"#{previous_name}\" (id: #{rep.id}) with \"#{ror[:name]}\""
        @csv_rows << [rep.id, previous_name, rep.authors.count, message, ror[:name]]
      end

      # Name used to look the record up in ROR.
      def record_name(item)
        item.long_name
      end

      # ROR id already stored on the record, if any.
      def record_ror_id(item)
        item.ror_id
      end

      # Only affiliations without a ROR id are candidates for matching.
      def base_items_query
        StashDatacite::Affiliation.where(ror_id: nil)
      end

      # Timestamped report path inside REPORTS_DIR; `filters_text` describes the applied filters.
      def report_file_name(filters_text)
        File.join(REPORTS_DIR, "affiliation_ror_matcher_report_#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{filters_text}.csv")
      end

      # Column titles of the CSV report.
      def report_headers
        ['Affiliation ID', 'Long Name', 'Authors Count', 'Message', 'ROR Name', 'ROR ID', 'New Affiliation ID']
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# frozen_string_literal: true | ||
|
||
module Stash
  module Organization
    # Brings the stored name of every ROR-linked affiliation back in line with the name
    # of its ROR organization.
    class AffiliationUpdater

      # Walks all affiliations whose long_name differs from the joined ror_org name and
      # overwrites long_name with the ROR name, pausing for 3 seconds after each 1000 records.
      def self.perform
        puts ''
        puts "Starting affiliation update: #{Time.now}"

        outdated = StashDatacite::Affiliation.joins(:ror_org).where('long_name != name')
        processed = 0

        outdated.find_each do |affiliation|
          puts "Updating affiliation with id: #{affiliation.id} from \"#{affiliation.long_name}\" to \"#{affiliation.ror_org.name}\""
          affiliation.update(long_name: affiliation.ror_org.name)

          processed += 1
          sleep 3 if (processed % 1000).zero?
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'csv'
require 'fileutils'
|
||
module Stash
  module Organization
    # Template for batch jobs that look up a ROR organization for each record of a collection,
    # hand unambiguous matches to the subclass and write a CSV report of what happened.
    #
    # Subclasses must implement: base_items_query, record_name, record_ror_id, connect_to_ror,
    # report_file_name and report_headers. They share @csv_rows (buffered report rows) and
    # @updates_count with this class.
    class BaseRorMatcher

      attr_reader :perform_updates

      # @param perform_updates [Boolean] stored and exposed through the reader; this class never
      #   reads it itself, it is there for subclasses to decide whether to persist changes
      # @param start_id [Integer, nil] lowest record id to process (inclusive)
      # @param end_id [Integer, nil] highest record id to process (inclusive)
      # @param start_created_at [#to_date, nil] earliest creation date to process
      # @param end_created_at [#to_date, nil] latest creation date to process
      def initialize(perform_updates: true, start_id: nil, end_id: nil, start_created_at: nil, end_created_at: nil)
        @perform_updates = perform_updates
        @end_id = end_id
        @start_id = start_id
        @start_created_at = start_created_at&.to_date
        @end_created_at = end_created_at&.to_date

        @updates_count = 0
        @multiple_ror_found_count = 0
        @no_ror_found_count = 0
        @csv_rows = []
      end

      # Runs the whole job: selects the records, starts the report, processes every record
      # and finishes with the summary.
      def perform
        items_to_be_mapped = filter_items

        start_report(items_to_be_mapped.count)
        map_items(items_to_be_mapped)
        end_report
      end

      private

      # Narrows the subclass query by whichever id / creation date bounds were supplied.
      def filter_items
        items = base_items_query
        items = items.where('id >= ?', @start_id) if @start_id
        items = items.where('id <= ?', @end_id) if @end_id
        items = items.where('created_at >= ?', @start_created_at) if @start_created_at
        # NOTE(review): the bound is a date; depending on how the database compares it with a
        # timestamp column, records created later on that same day may be left out - confirm
        # whether the end date is meant to be inclusive.
        items = items.where('created_at <= ?', @end_created_at) if @end_created_at
        items
      end

      # Prints the run banner and creates the report file. The banner text is kept in @text
      # because the report file name and the final summary are derived from it.
      def start_report(items_count)
        puts ''
        puts '========================================================================================'
        @text = "Processing #{items_count} records"
        @text += " starting with id: #{@start_id}" if @start_id
        @text += " ending with id: #{@end_id}" if @end_id
        @text += " starting from: #{@start_created_at}" if @start_created_at
        @text += " ending on: #{@end_created_at}" if @end_created_at
        @text += ':'
        puts @text
        initialize_csv_report
      end

      # Appends the summary (an empty separator row followed by the counters) to the report
      # and prints the same lines plus the report location.
      def end_report
        messages = [
          [],
          [@text.gsub('Processing', 'From')],
          [" - Updated: #{@updates_count} records."],
          [" - Multiple RORs found: #{@multiple_ror_found_count} records."],
          [" - No ROR found: #{@no_ror_found_count} records."]
        ]
        update_csv_report(messages)

        puts ''
        messages.each do |message|
          puts message.first
        end
        puts "Report file: #{@report_name}"
      end

      # Derives the report path from the banner text and writes the header row to a new file.
      def initialize_csv_report
        filters_text = @text.downcase.gsub(':', '').gsub(' ', '_')
        @report_name = report_file_name(filters_text)

        # CSV.open fails with Errno::ENOENT when the target directory is missing,
        # so make sure it exists before creating the file.
        FileUtils.mkdir_p(File.dirname(@report_name))

        CSV.open(@report_name, 'wb') do |csv|
          csv << report_headers
        end
      end

      # Appends the given rows to the report file.
      #
      # @param csv_rows [Array<Array>] rows to write, one array per CSV line
      def update_csv_report(csv_rows)
        CSV.open(@report_name, 'a+') do |csv|
          csv_rows.each do |row|
            csv << row
          end
        end
      end

      # Processes every record. On each 100th record the job sleeps for 2 seconds and the
      # buffered rows are written out and cleared; whatever is left is written at the end.
      def map_items(items_to_be_mapped)
        index = 0

        items_to_be_mapped.find_each do |item|
          index += 1
          if (index % 100).zero?
            sleep 2
            update_csv_report(@csv_rows)
            @csv_rows = []
          end

          handle_item(item, record_name(item), index)
        end
        update_csv_report(@csv_rows)
      end

      # Looks up ROR candidates for one record and acts on the number of matches:
      # none is only counted, exactly one is passed to connect_to_ror, several are
      # counted and reported without changing anything.
      #
      # @param item [Object] the record being processed
      # @param item_name [String] name used for the ROR lookup
      # @param index [Integer] 1-based position of the record in this run, used for logging
      def handle_item(item, item_name, index)
        puts ''
        puts "#{index}. Processing record \"#{item_name}\" (id: #{item.id}, created_at: #{item.created_at})"

        if record_ror_id(item).present?
          puts ' - ROR already updated'
          return
        end

        rors = StashEngine::RorOrg.find_by_name_for_auto_matching(item_name)
        case rors.count
        when 0
          @no_ror_found_count += 1
          # Do not add to CSV report, nor log file, as it will increase the file size too much
        when 1
          connect_to_ror(item, rors.first)
        else
          @multiple_ror_found_count += 1
          message = 'Found multiple RORs'
          # NOTE(review): this row always has the layout id, name, message, ROR names, ROR ids;
          # a subclass whose report_headers put another column before the message gets a
          # shifted row here - confirm whether that matters for the report consumers.
          @csv_rows << [item.id, item_name, message, rors.map { |ror| ror[:name] }.join("\n"), rors.map { |ror| ror[:id] }.join("\n")]
          puts " - #{message} for \"#{item_name}\""
        end
      end

      # Relation with the records that should be examined.
      def base_items_query
        raise NotImplementedError, 'Subclasses must implement base_items_query'
      end

      # Name of the record used for the ROR lookup.
      def record_name(item)
        raise NotImplementedError, 'Subclasses must implement record_name'
      end

      # ROR id already stored on the record; a present value makes handle_item skip the record.
      def record_ror_id(item)
        raise NotImplementedError, 'Subclasses must implement record_ror_id'
      end

      # Called with the record and the single ROR match found for it.
      def connect_to_ror(affiliation, ror)
        raise NotImplementedError, 'Subclasses must implement connect_to_ror'
      end

      # Path of the CSV report; receives the filter description derived from the banner text.
      def report_file_name(filters_text)
        raise NotImplementedError, 'Subclasses must implement report_file_name'
      end

      # Header row of the CSV report.
      def report_headers
        raise NotImplementedError, 'Subclasses must implement report_headers'
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'csv' | ||
|
||
module Stash
  module Organization
    # Matches contributors without a name identifier against ROR organizations by name.
    # Traversal, counters and the CSV buffer come from BaseRorMatcher.
    class ContributorRorMatcher < BaseRorMatcher

      private

      # Logs, reports and counts the match; the contributor itself is only rewritten
      # (name, identifier type and identifier) when perform_updates is true.
      def connect_to_ror(item, ror)
        matched_id = ror[:id]
        matched_name = ror[:name]
        action = 'Updating contributor with'

        puts " - #{action} name \"#{item.contributor_name}\" (ids: #{item.id}) with \"#{matched_name}\" (ror_id: #{matched_id})"
        @csv_rows.push([item.id, item.contributor_name, action, matched_name, matched_id])
        @updates_count += 1

        item.update(contributor_name: matched_name, identifier_type: 'ror', name_identifier_id: matched_id) if perform_updates
      end

      # Name used to look the contributor up in ROR.
      def record_name(item)
        item.contributor_name
      end

      # Identifier already stored on the contributor, if any.
      def record_ror_id(item)
        item.name_identifier_id
      end

      # Contributors whose name identifier is NULL or an empty string.
      def base_items_query
        StashDatacite::Contributor.where(name_identifier_id: [nil, ''])
      end

      # Timestamped report path inside REPORTS_DIR; `filters_text` describes the applied filters.
      def report_file_name(filters_text)
        timestamp = Time.now.strftime('%Y%m%d_%H%M%S')
        File.join(REPORTS_DIR, "contributor_ror_matcher_report_#{timestamp}_#{filters_text}.csv")
      end

      # Column titles of the CSV report.
      def report_headers
        ['Contributor ID', 'Contributor Name', 'Message', 'ROR Name', 'ROR ID']
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# :nocov: | ||
namespace :cleanup do

  # Runs the affiliation matcher with its default options.
  # example usage: RAILS_ENV=development bundle exec rake cleanup:affiliations_wo_ror
  # https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/affiliations.md#cleaning-affiliation-names
  desc 'Match Affiliations with ROR organizations'
  task affiliations_wo_ror: :environment do
    matcher = Stash::Organization::AffiliationRorMatcher.new
    matcher.perform
  end

  # Runs the contributor matcher with its default options.
  # example usage: RAILS_ENV=development bundle exec rake cleanup:contributors_wo_ror
  # https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/contributors.md#cleaning-contributor-names
  desc 'Match Contributors with ROR organizations'
  task contributors_wo_ror: :environment do
    matcher = Stash::Organization::ContributorRorMatcher.new
    matcher.perform
  end
end
# :nocov: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters