Merge pull request #1948 from datadryad/3695-quarterly-clean-unmatched-institutions

Create script to regularly match and update institutions
ryscher authored Dec 19, 2024
2 parents dfbf905 + 294842a commit 5b03f83
Showing 10 changed files with 346 additions and 3 deletions.
25 changes: 25 additions & 0 deletions app/models/stash_engine/ror_org.rb
@@ -53,6 +53,31 @@ def self.find_by_ror_name(query)
results.flatten.uniq
end

# Search the RorOrgs for the given string. This will search name, acronyms, aliases, etc.
# @return an Array of Hashes { id: 'https://ror.org/12345', name: 'Sample University' }
# This method is used for auto-matching scripts, where no human has to confirm the match.
def self.find_by_name_for_auto_matching(query)
max_results = 10
return [] unless query.present?

query = query.downcase
# First, find matches at the beginning of the name string, and exact matches in the acronyms/aliases
resp = where("LOWER(name) LIKE ? OR JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
"#{query}%", query.to_s, query.to_s).limit(max_results)
results = resp.map do |r|
{ id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
end

return results if results.any?

# If there are no matches, look for matches at the beginning of the acronyms/aliases
resp = where("JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
"#{query}%", "#{query}%").limit(max_results)
resp.map do |r|
{ id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
end
end

# Return the first match for the given name
# @return a StashEngine::RorOrg or nil
def self.find_first_by_ror_name(ror_name)
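For context, a minimal console sketch of how this lookup might be used by the auto-matching scripts; the query string and return values below are purely illustrative:

matches = StashEngine::RorOrg.find_by_name_for_auto_matching('sample university')
# => [{ id: 'https://ror.org/12345', name: 'Sample University', country: 'United States',
#       acronyms: [], aliases: [] }]
matches.first if matches.count == 1 # only a single, unambiguous hit is auto-applied (see BaseRorMatcher#handle_item)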
1 change: 1 addition & 0 deletions config/initializers/constants.rb
@@ -0,0 +1 @@
REPORTS_DIR = 'reports'.freeze
7 changes: 6 additions & 1 deletion cron/monthly.sh
@@ -23,4 +23,9 @@ bundle exec rails link_out:seed_genbank_ids >> /home/ec2-user/deploy/shared/log/
bundle exec rails link_out:publish >> /home/ec2-user/deploy/shared/log/link_out_publish.log 2>&1

# Update ROR organizations
bundle exec rails affiliation_import:update_ror_orgs >>/home/ec2-user/deploy/shared/log/ror_update.log 2>&1
bundle exec rails affiliation_import:update_ror_orgs >> /home/ec2-user/deploy/shared/log/ror_update.log 2>&1
bundle exec rails affiliation_import:update_affiliations_names >> /home/ec2-user/deploy/shared/log/affiliations_name_updates.log 2>&1

# Cleanup affiliation/contributor records
bundle exec rails cleanup:affiliations_wo_ror >> /home/ec2-user/deploy/shared/log/affiliations_wo_ror_cleanup.log 2>&1
bundle exec rails cleanup:contributors_wo_ror >> /home/ec2-user/deploy/shared/log/contributors_wo_ror_cleanup.log 2>&1
66 changes: 66 additions & 0 deletions lib/stash/organization/affiliation_ror_matcher.rb
@@ -0,0 +1,66 @@
# frozen_string_literal: true

require 'csv'

module Stash
module Organization
class AffiliationRorMatcher < BaseRorMatcher

private

def connect_to_ror(item, ror)
# puts '------------- connect_to_ror -------------'
ror_id = ror[:id]

rep = StashDatacite::Affiliation.find_by(ror_id: ror_id)
rep ||= StashDatacite::Affiliation.from_ror_id(ror_id: ror_id)
to_fix = StashDatacite::Affiliation.where(ror_id: nil, long_name: item.long_name)

update_affiliation_name(rep, ror)

message = 'Replacing affiliations with'
puts " - #{message} name \"#{item.long_name}\" (ids: #{to_fix.ids}) with \"#{ror[:name]}\" (id: #{rep.id || 'new'})"
@csv_rows << [item.id, item.long_name, item.authors.count, message, ror[:name], ror[:id], rep.id]
@updates_count += to_fix.count
return unless perform_updates

to_fix.each do |aff|
# updating authors affiliation with new affiliation
aff.authors.each do |author|
author.affiliation = rep
end
aff.destroy
end
end

def update_affiliation_name(rep, ror)
return if ror[:name] == rep.long_name

rep.update(long_name: ror[:name]) if perform_updates
message = 'Updating existing affiliation name'
puts " - #{message} \"#{rep.long_name}\" (id: #{rep.id}) with \"#{ror[:name]}\""
@csv_rows << [rep.id, rep.long_name, rep.authors.count, message, ror[:name]]
end

def record_name(item)
item.long_name
end

def record_ror_id(item)
item.ror_id
end

def base_items_query
StashDatacite::Affiliation.where(ror_id: nil)
end

def report_file_name(filters_text)
File.join(REPORTS_DIR, "affiliation_ror_matcher_report_#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{filters_text}.csv")
end

def report_headers
['Affiliation ID', 'Long Name', 'Authors Count', 'Message', 'ROR Name', 'ROR ID', 'New Affiliation ID']
end
end
end
end
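As a usage note, the matcher above can be exercised as a dry run from a Rails console before the cron task modifies any data. A sketch, assuming the reports/ directory already exists; with perform_updates: false only the CSV report and console summary are produced:

matcher = Stash::Organization::AffiliationRorMatcher.new(perform_updates: false, start_id: 1, end_id: 50_000)
matcher.perform
# writes reports/affiliation_ror_matcher_report_<timestamp>_<filters>.csv; no affiliations are changed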
22 changes: 22 additions & 0 deletions lib/stash/organization/affiliation_updater.rb
@@ -0,0 +1,22 @@
# frozen_string_literal: true

module Stash
module Organization
class AffiliationUpdater

def self.perform
puts ''
puts "Starting affiliation update: #{Time.now}"

index = 0
StashDatacite::Affiliation.joins(:ror_org).where('long_name != name').find_each do |record|
puts "Updating affiliation with id: #{record.id} from \"#{record.long_name}\" to \"#{record.ror_org.name}\""
record.update(long_name: record.ror_org.name)

index += 1
sleep 3 if index % 1000 == 0
end
end
end
end
end
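One subtlety in the query above: the unqualified where('long_name != name') relies on SQL resolving long_name to the affiliations table and name to the joined ROR organizations table. A more explicit equivalent might look like the sketch below, where the table names (dcs_affiliations, stash_engine_ror_orgs) are assumptions, not confirmed by this diff:

StashDatacite::Affiliation
  .joins(:ror_org)
  .where('dcs_affiliations.long_name != stash_engine_ror_orgs.name')
  .find_each { |affiliation| affiliation.update(long_name: affiliation.ror_org.name) }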
158 changes: 158 additions & 0 deletions lib/stash/organization/base_ror_matcher.rb
@@ -0,0 +1,158 @@
# frozen_string_literal: true

require 'csv'

module Stash
module Organization
class BaseRorMatcher

attr_reader :perform_updates

def initialize(perform_updates: true, start_id: nil, end_id: nil, start_created_at: nil, end_created_at: nil)
@perform_updates = perform_updates
@end_id = end_id
@start_id = start_id
@start_created_at = start_created_at&.to_date
@end_created_at = end_created_at&.to_date

@updates_count = 0
@multiple_ror_found_count = 0
@no_ror_found_count = 0
@csv_rows = []
end

def perform
items_to_be_mapped = filter_items

start_report(items_to_be_mapped.count)
map_items(items_to_be_mapped)
end_report
end

private

def filter_items
items = base_items_query
items = items.where('id >= ?', @start_id) if @start_id
items = items.where('id <= ?', @end_id) if @end_id
items = items.where('created_at >= ?', @start_created_at) if @start_created_at
items = items.where('created_at <= ?', @end_created_at) if @end_created_at
items
end

def start_report(items_count)
puts ''
puts '========================================================================================'
@text = "Processing #{items_count} records"
@text += " starting with id: #{@start_id}" if @start_id
@text += " ending with id: #{@end_id}" if @end_id
@text += " starting from: #{@start_created_at}" if @start_created_at
@text += " ending on: #{@end_created_at}" if @end_created_at
@text += ':'
puts @text
initialize_csv_report
end

def end_report
messages = [
[],
[@text.gsub('Processing', 'From')],
[" - Updated: #{@updates_count} records."],
[" - Multiple RORs found: #{@multiple_ror_found_count} records."],
[" - No ROR found: #{@no_ror_found_count} records."]
]
update_csv_report(messages)

puts ''
messages.each do |message|
puts message.first
end
puts "Report file: #{@report_name}"
end

def initialize_csv_report
filters_text = @text.downcase.gsub(':', '').gsub(' ', '_')
@report_name = report_file_name(filters_text)

@csv = CSV.open(@report_name, 'wb') do |csv|
csv << report_headers
end
end

def update_csv_report(csv_rows)
@csv = CSV.open(@report_name, 'a+') do |csv|
csv_rows.each do |row|
csv << row
end
end
end

def map_items(items_to_be_mapped)
index = 0

items_to_be_mapped.find_each do |item|
index += 1
if index % 100 == 0
sleep 2
update_csv_report(@csv_rows)
@csv_rows = []
end

handle_item(item, record_name(item), index)
end
update_csv_report(@csv_rows)
end

def handle_item(item, item_name, index)
puts ''
puts "#{index}. Processing record \"#{item_name}\" (id: #{item.id}, created_at: #{item.created_at})"

if record_ror_id(item).present?
puts ' - ROR already updated'
return
end

rors = StashEngine::RorOrg.find_by_name_for_auto_matching(item_name)
case rors.count
when 0
@no_ror_found_count += 1
# Do not add to CSV report, nor log file, as it will increase the file size too much
# message = 'Could not find ROR'
# @csv_rows << [item.id, item_name, message]
# puts " - #{message} for \"#{item_name}\""
when 1
connect_to_ror(item, rors.first)
else
@multiple_ror_found_count += 1
message = 'Found multiple RORs'
@csv_rows << [item.id, item_name, message, rors.map { |ror| ror[:name] }.join("\n"), rors.map { |ror| ror[:id] }.join("\n")]
puts " - #{message} for \"#{item_name}\""
end
end

def base_items_query
raise NotImplementedError, 'Subclasses must implement base_items_query'
end

def record_name(item)
raise NotImplementedError, 'Subclasses must implement record_name'
end

def record_ror_id(item)
raise NotImplementedError, 'Subclasses must implement record_ror_id'
end

def connect_to_ror(affiliation, ror)
raise NotImplementedError, 'Subclasses must implement connect_to_ror'
end

def report_file_name(filters_text)
raise NotImplementedError, 'Subclasses must implement report_file_name'
end

def report_headers
raise NotImplementedError, 'Subclasses must implement report_headers'
end
end
end
end
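To make the template-method contract concrete, here is a minimal, hypothetical subclass; FundingOrg and its columns do not exist in this codebase and are illustrative only:

module Stash
  module Organization
    class FundingOrgRorMatcher < BaseRorMatcher

      private

      # records still lacking a ROR id
      def base_items_query
        FundingOrg.where(ror_id: nil)
      end

      # string handed to RorOrg.find_by_name_for_auto_matching
      def record_name(item)
        item.org_name
      end

      # a non-nil value means the record is already matched and is skipped
      def record_ror_id(item)
        item.ror_id
      end

      # called only when exactly one ROR candidate was found
      def connect_to_ror(item, ror)
        @csv_rows << [item.id, item.org_name, 'Matched', ror[:name], ror[:id]]
        @updates_count += 1
        item.update(ror_id: ror[:id], org_name: ror[:name]) if perform_updates
      end

      def report_file_name(filters_text)
        File.join(REPORTS_DIR, "funding_org_ror_matcher_report_#{filters_text}.csv")
      end

      def report_headers
        ['Funding Org ID', 'Name', 'Message', 'ROR Name', 'ROR ID']
      end
    end
  end
end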
45 changes: 45 additions & 0 deletions lib/stash/organization/contributor_ror_matcher.rb
@@ -0,0 +1,45 @@
# frozen_string_literal: true

require 'csv'

module Stash
module Organization
class ContributorRorMatcher < BaseRorMatcher

private

def connect_to_ror(item, ror)
# puts '------------- connect_to_ror -------------'
ror_id = ror[:id]
message = 'Updating contributor with'
puts " - #{message} name \"#{item.contributor_name}\" (ids: #{item.id}) with \"#{ror[:name]}\" (ror_id: #{ror_id})"
@csv_rows << [item.id, item.contributor_name, message, ror[:name], ror[:id]]
@updates_count += 1

return unless perform_updates

item.update(contributor_name: ror[:name], identifier_type: 'ror', name_identifier_id: ror_id)
end

def record_name(item)
item.contributor_name
end

def record_ror_id(item)
item.name_identifier_id
end

def base_items_query
StashDatacite::Contributor.where(name_identifier_id: [nil, ''])
end

def report_file_name(filters_text)
File.join(REPORTS_DIR, "contributor_ror_matcher_report_#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{filters_text}.csv")
end

def report_headers
['Contributor ID', 'Contributor Name', 'Message', 'ROR Name', 'ROR ID']
end
end
end
end
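For illustration, the effect of a single unambiguous match on a contributor record might look like this (the record id, names, and ROR id are hypothetical):

contributor = StashDatacite::Contributor.find(123)
contributor.contributor_name # => "nsf"
# after ContributorRorMatcher.new.perform connects it to the matching ROR entry:
#   contributor_name    == "National Science Foundation"
#   identifier_type     == "ror"
#   name_identifier_id  == "https://ror.org/021nxhr62"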
5 changes: 5 additions & 0 deletions lib/tasks/affiliation_import.rake
@@ -22,6 +22,11 @@ namespace :affiliation_import do
Stash::Organization::RorUpdater.perform
end

desc 'Sync Affiliation name with ROR organizations name'
task update_affiliations_names: :environment do
Stash::Organization::AffiliationUpdater.perform
end

# example: RAILS_ENV=development bundle exec rake affiliation_import:process_ror_csv -- --affiliation_mode true
desc 'Process all of the CSV files'
task process_ror_csv: :environment do
18 changes: 18 additions & 0 deletions lib/tasks/cleanup.rake
@@ -0,0 +1,18 @@
# :nocov:
namespace :cleanup do

# example usage: RAILS_ENV=development bundle exec rake cleanup:affiliations_wo_ror
# https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/affiliations.md#cleaning-affiliation-names
desc 'Match Affiliations with ROR organizations'
task affiliations_wo_ror: :environment do
Stash::Organization::AffiliationRorMatcher.new.perform
end

# example usage: RAILS_ENV=development bundle exec rake cleanup:contributors_wo_ror
# https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/contributors.md#cleaning-contributor-names
desc 'Match Contributors with ROR organizations'
task contributors_wo_ror: :environment do
Stash::Organization::ContributorRorMatcher.new.perform
end
end
# :nocov:
2 changes: 0 additions & 2 deletions lib/tasks/stash_engine_tasks.rake
@@ -6,8 +6,6 @@ require 'stash/google/journal_g_mail'
require_relative 'identifier_rake_functions'
require_relative '../stash/action_required_reminder'

REPORTS_DIR = 'reports'.freeze

# rubocop:disable Metrics/BlockLength
namespace :identifiers do
desc 'Give resources missing a stash_engine_identifier one (run from main app, not engine)'