Merge pull request #1948 from datadryad/3695-quarterly-clean-unmatched-institutions

Create script to regularly match and update institutions
ryscher authored Dec 19, 2024
2 parents dfbf905 + 294842a commit 5b03f83
Showing 10 changed files with 346 additions and 3 deletions.
25 changes: 25 additions & 0 deletions app/models/stash_engine/ror_org.rb
@@ -53,6 +53,31 @@ def self.find_by_ror_name(query)
results.flatten.uniq
end

# Search the RorOrgs for the given string. This will search name, acronyms, aliases, etc.
# @return an Array of Hashes { id: 'https://ror.org/12345', name: 'Sample University' }
# This method is used for auto-matching scripts, where no human has to confirm the match.
def self.find_by_name_for_auto_matching(query)
max_results = 10
return [] unless query.present?

query = query.downcase
# First, find matches at the beginning of the name string, and exact matches in the acronyms/aliases
resp = where("LOWER(name) LIKE ? OR JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
"#{query}%", query.to_s, query.to_s).limit(max_results)
results = resp.map do |r|
{ id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
end

return results if results.any?

# If there are no matches, look for matches at the beginning of the acronyms/aliases
resp = where("JSON_SEARCH(LOWER(acronyms), 'all', ?) or JSON_SEARCH(LOWER(aliases), 'all', ?)",
"#{query}%", "#{query}%").limit(max_results)
resp.map do |r|
{ id: r.ror_id, name: r.name, country: r.country, acronyms: r.acronyms, aliases: r.aliases }
end
end

# Return the first match for the given name
# @return a StashEngine::RorOrg or nil
def self.find_first_by_ror_name(ror_name)
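For context, a minimal console sketch of how this lookup might be used by the auto-matching scripts; the query string and return values below are purely illustrative:

matches = StashEngine::RorOrg.find_by_name_for_auto_matching('sample university')
# => [{ id: 'https://ror.org/12345', name: 'Sample University', country: 'United States',
#       acronyms: [], aliases: [] }]
matches.first if matches.count == 1 # only a single, unambiguous hit is auto-applied (see BaseRorMatcher#handle_item)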
1 change: 1 addition & 0 deletions config/initializers/constants.rb
@@ -0,0 +1 @@
REPORTS_DIR = 'reports'.freeze
7 changes: 6 additions & 1 deletion cron/monthly.sh
@@ -23,4 +23,9 @@ bundle exec rails link_out:seed_genbank_ids >> /home/ec2-user/deploy/shared/log/
bundle exec rails link_out:publish >> /home/ec2-user/deploy/shared/log/link_out_publish.log 2>&1

# Update ROR organizations
bundle exec rails affiliation_import:update_ror_orgs >>/home/ec2-user/deploy/shared/log/ror_update.log 2>&1
bundle exec rails affiliation_import:update_ror_orgs >> /home/ec2-user/deploy/shared/log/ror_update.log 2>&1
bundle exec rails affiliation_import:update_affiliations_names >> /home/ec2-user/deploy/shared/log/affiliations_name_updates.log 2>&1

# Cleanup affiliation/contributor records
bundle exec rails cleanup:affiliations_wo_ror >> /home/ec2-user/deploy/shared/log/affiliations_wo_ror_cleanup.log 2>&1
bundle exec rails cleanup:contributors_wo_ror >> /home/ec2-user/deploy/shared/log/contributors_wo_ror_cleanup.log 2>&1
66 changes: 66 additions & 0 deletions lib/stash/organization/affiliation_ror_matcher.rb
@@ -0,0 +1,66 @@
# frozen_string_literal: true

require 'csv'

module Stash
module Organization
class AffiliationRorMatcher < BaseRorMatcher

private

def connect_to_ror(item, ror)
# puts '------------- connect_to_ror -------------'
ror_id = ror[:id]

rep = StashDatacite::Affiliation.find_by(ror_id: ror_id)
rep ||= StashDatacite::Affiliation.from_ror_id(ror_id: ror_id)
to_fix = StashDatacite::Affiliation.where(ror_id: nil, long_name: item.long_name)

update_affiliation_name(rep, ror)

message = 'Replacing affiliations with'
puts " - #{message} name \"#{item.long_name}\" (ids: #{to_fix.ids}) with \"#{ror[:name]}\" (id: #{rep.id || 'new'})"
@csv_rows << [item.id, item.long_name, item.authors.count, message, ror[:name], ror[:id], rep.id]
@updates_count += to_fix.count
return unless perform_updates

to_fix.each do |aff|
# updating authors affiliation with new affiliation
aff.authors.each do |author|
author.affiliation = rep
end
aff.destroy
end
end

def update_affiliation_name(rep, ror)
return if ror[:name] == rep.long_name

rep.update(long_name: ror[:name]) if perform_updates
message = 'Updating existing affiliation name'
puts " - #{message} \"#{rep.long_name}\" (id: #{rep.id}) with \"#{ror[:name]}\""
@csv_rows << [rep.id, rep.long_name, rep.authors.count, message, ror[:name]]
end

def record_name(item)
item.long_name
end

def record_ror_id(item)
item.ror_id
end

def base_items_query
StashDatacite::Affiliation.where(ror_id: nil)
end

def report_file_name(filters_text)
File.join(REPORTS_DIR, "affiliation_ror_matcher_report_#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{filters_text}.csv")
end

def report_headers
['Affiliation ID', 'Long Name', 'Authors Count', 'Message', 'ROR Name', 'ROR ID', 'New Affiliation ID']
end
end
end
end
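As a usage note, the matcher above can be exercised as a dry run from a Rails console before the cron task modifies any data. A sketch, assuming the reports/ directory already exists; with perform_updates: false only the CSV report and console summary are produced:

matcher = Stash::Organization::AffiliationRorMatcher.new(perform_updates: false, start_id: 1, end_id: 50_000)
matcher.perform
# writes reports/affiliation_ror_matcher_report_<timestamp>_<filters>.csv; no affiliations are changed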
22 changes: 22 additions & 0 deletions lib/stash/organization/affiliation_updater.rb
@@ -0,0 +1,22 @@
# frozen_string_literal: true

module Stash
module Organization
class AffiliationUpdater

def self.perform
puts ''
puts "Starting affiliation update: #{Time.now}"

index = 0
StashDatacite::Affiliation.joins(:ror_org).where('long_name != name').find_each do |record|
puts "Updating affiliation with id: #{record.id} from \"#{record.long_name}\" to \"#{record.ror_org.name}\""
record.update(long_name: record.ror_org.name)

index += 1
sleep 3 if index % 1000 == 0
end
end
end
end
end
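One subtlety in the query above: the unqualified where('long_name != name') relies on SQL resolving long_name to the affiliations table and name to the joined ROR organizations table. A more explicit equivalent might look like the sketch below, where the table names (dcs_affiliations, stash_engine_ror_orgs) are assumptions, not confirmed by this diff:

StashDatacite::Affiliation
  .joins(:ror_org)
  .where('dcs_affiliations.long_name != stash_engine_ror_orgs.name')
  .find_each { |affiliation| affiliation.update(long_name: affiliation.ror_org.name) }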
158 changes: 158 additions & 0 deletions lib/stash/organization/base_ror_matcher.rb
@@ -0,0 +1,158 @@
# frozen_string_literal: true

require 'csv'

module Stash
module Organization
class BaseRorMatcher

attr_reader :perform_updates

def initialize(perform_updates: true, start_id: nil, end_id: nil, start_created_at: nil, end_created_at: nil)
@perform_updates = perform_updates
@end_id = end_id
@start_id = start_id
@start_created_at = start_created_at&.to_date
@end_created_at = end_created_at&.to_date

@updates_count = 0
@multiple_ror_found_count = 0
@no_ror_found_count = 0
@csv_rows = []
end

def perform
items_to_be_mapped = filter_items

start_report(items_to_be_mapped.count)
map_items(items_to_be_mapped)
end_report
end

private

def filter_items
items = base_items_query
items = items.where('id >= ?', @start_id) if @start_id
items = items.where('id <= ?', @end_id) if @end_id
items = items.where('created_at >= ?', @start_created_at) if @start_created_at
items = items.where('created_at <= ?', @end_created_at) if @end_created_at
items
end

def start_report(items_count)
puts ''
puts '========================================================================================'
@text = "Processing #{items_count} records"
@text += " starting with id: #{@start_id}" if @start_id
@text += " ending with id: #{@end_id}" if @end_id
@text += " starting from: #{@start_created_at}" if @start_created_at
@text += " ending on: #{@end_created_at}" if @end_created_at
@text += ':'
puts @text
initialize_csv_report
end

def end_report
messages = [
[],
[@text.gsub('Processing', 'From')],
[" - Updated: #{@updates_count} records."],
[" - Multiple RORs found: #{@multiple_ror_found_count} records."],
[" - No ROR found: #{@no_ror_found_count} records."]
]
update_csv_report(messages)

puts ''
messages.each do |message|
puts message.first
end
puts "Report file: #{@report_name}"
end

def initialize_csv_report
filters_text = @text.downcase.gsub(':', '').gsub(' ', '_')
@report_name = report_file_name(filters_text)

@csv = CSV.open(@report_name, 'wb') do |csv|
csv << report_headers
end
end

def update_csv_report(csv_rows)
@csv = CSV.open(@report_name, 'a+') do |csv|
csv_rows.each do |row|
csv << row
end
end
end

def map_items(items_to_be_mapped)
index = 0

items_to_be_mapped.find_each do |item|
index += 1
if index % 100 == 0
sleep 2
update_csv_report(@csv_rows)
@csv_rows = []
end

handle_item(item, record_name(item), index)
end
update_csv_report(@csv_rows)
end

def handle_item(item, item_name, index)
puts ''
puts "#{index}. Processing record \"#{item_name}\" (id: #{item.id}, created_at: #{item.created_at})"

if record_ror_id(item).present?
puts ' - ROR already updated'
return
end

rors = StashEngine::RorOrg.find_by_name_for_auto_matching(item_name)
case rors.count
when 0
@no_ror_found_count += 1
# Do not add to CSV report, nor log file, as it will increase the file size too much
# message = 'Could not find ROR'
# @csv_rows << [item.id, item_name, message]
# puts " - #{message} for \"#{item_name}\""
when 1
connect_to_ror(item, rors.first)
else
@multiple_ror_found_count += 1
message = 'Found multiple RORs'
@csv_rows << [item.id, item_name, message, rors.map { |ror| ror[:name] }.join("\n"), rors.map { |ror| ror[:id] }.join("\n")]
puts " - #{message} for \"#{item_name}\""
end
end

def base_items_query
raise NotImplementedError, 'Subclasses must implement base_items_query'
end

def record_name(item)
raise NotImplementedError, 'Subclasses must implement record_name'
end

def record_ror_id(item)
raise NotImplementedError, 'Subclasses must implement record_ror_id'
end

def connect_to_ror(affiliation, ror)
raise NotImplementedError, 'Subclasses must implement connect_to_ror'
end

def report_file_name(filters_text)
raise NotImplementedError, 'Subclasses must implement report_file_name'
end

def report_headers
raise NotImplementedError, 'Subclasses must implement report_headers'
end
end
end
end
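To make the template-method contract concrete, here is a minimal, hypothetical subclass; FundingOrg and its columns do not exist in this codebase and are illustrative only:

module Stash
  module Organization
    class FundingOrgRorMatcher < BaseRorMatcher

      private

      # records still lacking a ROR id
      def base_items_query
        FundingOrg.where(ror_id: nil)
      end

      # string handed to RorOrg.find_by_name_for_auto_matching
      def record_name(item)
        item.org_name
      end

      # a non-nil value means the record is already matched and is skipped
      def record_ror_id(item)
        item.ror_id
      end

      # called only when exactly one ROR candidate was found
      def connect_to_ror(item, ror)
        @csv_rows << [item.id, item.org_name, 'Matched', ror[:name], ror[:id]]
        @updates_count += 1
        item.update(ror_id: ror[:id], org_name: ror[:name]) if perform_updates
      end

      def report_file_name(filters_text)
        File.join(REPORTS_DIR, "funding_org_ror_matcher_report_#{filters_text}.csv")
      end

      def report_headers
        ['Funding Org ID', 'Name', 'Message', 'ROR Name', 'ROR ID']
      end
    end
  end
end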
45 changes: 45 additions & 0 deletions lib/stash/organization/contributor_ror_matcher.rb
@@ -0,0 +1,45 @@
# frozen_string_literal: true

require 'csv'

module Stash
module Organization
class ContributorRorMatcher < BaseRorMatcher

private

def connect_to_ror(item, ror)
# puts '------------- connect_to_ror -------------'
ror_id = ror[:id]
message = 'Updating contributor with'
puts " - #{message} name \"#{item.contributor_name}\" (ids: #{item.id}) with \"#{ror[:name]}\" (ror_id: #{ror_id})"
@csv_rows << [item.id, item.contributor_name, message, ror[:name], ror[:id]]
@updates_count += 1

return unless perform_updates

item.update(contributor_name: ror[:name], identifier_type: 'ror', name_identifier_id: ror_id)
end

def record_name(item)
item.contributor_name
end

def record_ror_id(item)
item.name_identifier_id
end

def base_items_query
StashDatacite::Contributor.where(name_identifier_id: [nil, ''])
end

def report_file_name(filters_text)
File.join(REPORTS_DIR, "contributor_ror_matcher_report_#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{filters_text}.csv")
end

def report_headers
['Contributor ID', 'Contributor Name', 'Message', 'ROR Name', 'ROR ID']
end
end
end
end
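For illustration, the effect of a single unambiguous match on a contributor record might look like this (the record id, names, and ROR id are hypothetical):

contributor = StashDatacite::Contributor.find(123)
contributor.contributor_name # => "nsf"
# after ContributorRorMatcher.new.perform connects it to the matching ROR entry:
#   contributor_name    == "National Science Foundation"
#   identifier_type     == "ror"
#   name_identifier_id  == "https://ror.org/021nxhr62"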
5 changes: 5 additions & 0 deletions lib/tasks/affiliation_import.rake
@@ -22,6 +22,11 @@ namespace :affiliation_import do
Stash::Organization::RorUpdater.perform
end

desc 'Sync Affiliation name with ROR organizations name'
task update_affiliations_names: :environment do
Stash::Organization::AffiliationUpdater.perform
end

# example: RAILS_ENV=development bundle exec rake affiliation_import:process_ror_csv -- --affiliation_mode true
desc 'Process all of the CSV files'
task process_ror_csv: :environment do
18 changes: 18 additions & 0 deletions lib/tasks/cleanup.rake
@@ -0,0 +1,18 @@
# :nocov:
namespace :cleanup do

# example usage: RAILS_ENV=development bundle exec rake cleanup:affiliations_wo_ror
# https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/affiliations.md#cleaning-affiliation-names
desc 'Match Affiliations with ROR organizations'
task affiliations_wo_ror: :environment do
Stash::Organization::AffiliationRorMatcher.new.perform
end

# example usage: RAILS_ENV=development bundle exec rake cleanup:contributors_wo_ror
# https://github.com/datadryad/dryad-app/blob/main/documentation/technical_notes/contributors.md#cleaning-contributor-names
desc 'Match Contributors with ROR organizations'
task contributors_wo_ror: :environment do
Stash::Organization::ContributorRorMatcher.new.perform
end
end
# :nocov:
2 changes: 0 additions & 2 deletions lib/tasks/stash_engine_tasks.rake
@@ -6,8 +6,6 @@ require 'stash/google/journal_g_mail'
require_relative 'identifier_rake_functions'
require_relative '../stash/action_required_reminder'

REPORTS_DIR = 'reports'.freeze

# rubocop:disable Metrics/BlockLength
namespace :identifiers do
desc 'Give resources missing a stash_engine_identifier one (run from main app, not engine)'