Merge pull request #3322 from cisagov/za/3299-script-to-update-suborg…

…-values #3299: Script to update suborg values - [ZA]
cisagov · Jan 16, 2025 · 0d2c81c · 0d2c81c
2 parents 2ff8564 + fcac8a8
commit 0d2c81c
Show file tree

Hide file tree

Showing 7 changed files with 662 additions and 23 deletions.
diff --git a/docs/operations/data_migration.md b/docs/operations/data_migration.md
@@ -918,3 +918,38 @@ Example (only requests): `./manage.py create_federal_portfolio --branch "executi
 - Parameters #1-#2: Either `--agency_name` or `--branch` must be specified. Not both.
 - Parameters #2-#3, you cannot use `--both` while using these. You must specify either `--parse_requests` or `--parse_domains` seperately. While all of these parameters are optional in that you do not need to specify all of them,
 you must specify at least one to run this script.
+
+
+## Patch suborganizations
+This script deletes some duplicate suborganization data that exists in our database (one-time use).
+It works in two ways:
+1. If the only name difference between two suborg records is extra spaces or a capitalization difference,
+then we delete all duplicate records of this type.
+2. If the suborg name is one we manually list in the script, that record is deleted and replaced by the corrected name hardcoded next to it.
+
+Before it deletes records, it goes through each DomainInformation and DomainRequest object and updates the reference to "sub_organization" to match the non-duplicative record.
+
+### Running on sandboxes
+
+#### Step 1: Login to CloudFoundry
+```cf login -a api.fr.cloud.gov --sso```
+
+#### Step 2: SSH into your environment
+```cf ssh getgov-{space}```
+
+Example: `cf ssh getgov-za`
+
+#### Step 3: Create a shell instance
+```/tmp/lifecycle/shell```
+
+#### Step 4: Upload your csv to the desired sandbox
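+This script works directly against the database and does not read a csv, so there is nothing to upload for it. (If you do need to move other data onto a sandbox, [follow these steps](#use-scp-to-transfer-data-to-sandboxes).)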
+
+#### Step 5: Running the script
+To preview and then remove the duplicate suborganizations:
+```./manage.py patch_suborganizations```
+
+### Running locally
+
+#### Step 1: Running the script
+```docker-compose exec app ./manage.py patch_suborganizations```
diff --git a/src/registrar/management/commands/create_federal_portfolio.py b/src/registrar/management/commands/create_federal_portfolio.py
@@ -6,6 +6,7 @@
 from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
 from registrar.models import DomainInformation, DomainRequest, FederalAgency, Suborganization, Portfolio, User
 from registrar.models.utility.generic_helper import normalize_string
+from django.db.models import F, Q
 
 
 logger = logging.getLogger(__name__)
@@ -104,12 +105,17 @@ def handle(self, **options):
                 message = f"Failed to create portfolio '{federal_agency.agency}'"
                 TerminalHelper.colorful_logger(logger.info, TerminalColors.FAIL, message)
 
+        # POST PROCESS STEP: Add additional suborg info where applicable.
+        updated_suborg_count = self.post_process_all_suborganization_fields(agencies)
+        message = f"Added city and state_territory information to {updated_suborg_count} suborgs."
+        TerminalHelper.colorful_logger(logger.info, TerminalColors.MAGENTA, message)
+
         TerminalHelper.log_script_run_summary(
             self.updated_portfolios,
             self.failed_portfolios,
             self.skipped_portfolios,
             debug=False,
-            skipped_header="----- SOME PORTFOLIOS WERE SKIPPED -----",
+            skipped_header="----- SOME PORTFOLIOS WERENT CREATED -----",
             display_as_str=True,
         )
 
@@ -169,14 +175,11 @@ def post_process_started_domain_requests(self, agencies, portfolios):
 
     def handle_populate_portfolio(self, federal_agency, parse_domains, parse_requests, both):
         """Attempts to create a portfolio. If successful, this function will
-        also create new suborganizations.
-        Returns the portfolio for the given federal_agency.
-        """
-        portfolio, created = self.create_portfolio(federal_agency)
-        if created:
-            self.create_suborganizations(portfolio, federal_agency)
-            if parse_domains or both:
-                self.handle_portfolio_domains(portfolio, federal_agency)
+        also create new suborganizations"""
+        portfolio, _ = self.create_portfolio(federal_agency)
+        self.create_suborganizations(portfolio, federal_agency)
+        if parse_domains or both:
+            self.handle_portfolio_domains(portfolio, federal_agency)
 
         if parse_requests or both:
             self.handle_portfolio_requests(portfolio, federal_agency)
@@ -233,7 +236,6 @@ def create_suborganizations(self, portfolio: Portfolio, federal_agency: FederalA
             federal_agency=federal_agency, organization_name__isnull=False
         )
         org_names = set(valid_agencies.values_list("organization_name", flat=True))
-
         if not org_names:
             message = (
                 "Could not add any suborganizations."
@@ -352,3 +354,141 @@ def handle_portfolio_domains(self, portfolio: Portfolio, federal_agency: Federal
         DomainInformation.objects.bulk_update(domain_infos, ["portfolio", "sub_organization"])
         message = f"Added portfolio '{portfolio}' to {len(domain_infos)} domains."
         TerminalHelper.colorful_logger(logger.info, TerminalColors.OKGREEN, message)
+
+    def post_process_all_suborganization_fields(self, agencies):
+        """Batch updates suborganization locations from domain and request data.
+
+        Args:
+            agencies: List of FederalAgency objects to process
+
+        Returns:
+            The result of bulk_update over the suborganizations whose city or
+            state_territory actually changed (suborgs that were skipped for bad
+            data, or that already held the same values, are not written or counted).
+
+        Priority for location data:
+        1. Domain information
+        2. Domain request suborganization fields
+        3. Domain request standard fields
+        """
+        # Common filter between domaininformation / domain request.
+        # Filter by only the agencies we've updated thus far.
+        # Then, only process records without null portfolio, org name, or suborg name.
+        # Records whose org name equals the portfolio's own name are excluded.
+        base_filter = Q(
+            federal_agency__in=agencies,
+            portfolio__isnull=False,
+            organization_name__isnull=False,
+            sub_organization__isnull=False,
+        ) & ~Q(organization_name__iexact=F("portfolio__organization_name"))
+
+        # First: Remove null city / state_territory values on domain info / domain requests.
+        # We want to add city data if there is data to add to begin with!
+        domains = DomainInformation.objects.filter(
+            base_filter,
+            Q(city__isnull=False, state_territory__isnull=False),
+        )
+        requests = DomainRequest.objects.filter(
+            base_filter,
+            (
+                Q(city__isnull=False, state_territory__isnull=False)
+                | Q(suborganization_city__isnull=False, suborganization_state_territory__isnull=False)
+            ),
+        )
+
+        # Second: Group domains and requests by normalized organization name.
+        # This means that later down the line we have to account for "duplicate" org names.
+        domains_dict = {}
+        requests_dict = {}
+        for domain in domains:
+            normalized_name = normalize_string(domain.organization_name)
+            domains_dict.setdefault(normalized_name, []).append(domain)
+
+        for request in requests:
+            normalized_name = normalize_string(request.organization_name)
+            requests_dict.setdefault(normalized_name, []).append(request)
+
+        # Third: Get suborganizations to update
+        suborgs_to_edit = Suborganization.objects.filter(
+            Q(id__in=domains.values_list("sub_organization", flat=True))
+            | Q(id__in=requests.values_list("sub_organization", flat=True))
+        )
+
+        # Fourth: Process each suborg to add city / state territory info.
+        # The per-suborg step returns early (leaving the object untouched) when the data
+        # conflicts, so compare before/after to find the suborgs that really changed.
+        # Otherwise the count reported to the operator would include skipped records.
+        changed_suborgs = []
+        for suborg in suborgs_to_edit:
+            location_before = (suborg.city, suborg.state_territory)
+            self.post_process_suborganization_fields(suborg, domains_dict, requests_dict)
+            if (suborg.city, suborg.state_territory) != location_before:
+                changed_suborgs.append(suborg)
+
+        # Fifth: Perform a bulk update, limited to the records that changed
+        return Suborganization.objects.bulk_update(changed_suborgs, ["city", "state_territory"])
+
+    def post_process_suborganization_fields(self, suborg, domains_dict, requests_dict):
+        """Sets city / state_territory on one suborganization when its source records agree.
+
+        Args:
+            suborg: Suborganization to update (mutated in place; nothing is saved here)
+            domains_dict: Dict of domain info records grouped by normalized org name
+            requests_dict: Dict of domain requests grouped by normalized org name
+
+        Source priority: domain info first, then the request's suborganization_* fields,
+        then the request's standard city / state_territory fields. A source is only used
+        when every record of that type carries the same location. If no source qualifies,
+        a warning is logged and the suborg is left as-is.
+        """
+        lookup_key = normalize_string(suborg.name)
+        matching_domains = domains_dict.get(lookup_key, [])
+        matching_requests = requests_dict.get(lookup_key, [])
+
+        def unanimous(records, city_field, state_field, must_be_populated):
+            """True when every record shares the first record's city and state values."""
+            first = records[0]
+            expected_city = getattr(first, city_field)
+            expected_state = getattr(first, state_field)
+            for record in records:
+                city = getattr(record, city_field)
+                state = getattr(record, state_field)
+                if must_be_populated and not (city and state):
+                    return False
+                if city != expected_city or state != expected_state:
+                    return False
+            return True
+
+        # Domain info: usable only if all domain records agree on the location.
+        domain = None
+        if matching_domains and unanimous(matching_domains, "city", "state_territory", False):
+            domain = matching_domains[0]
+
+        # Domain requests: consensus again, but here the values must also be non-empty.
+        # When the records disagree we take the safe route and don't use that source.
+        request = None
+        use_suborg_location_for_request = True
+        use_location_for_request = True
+        if matching_requests:
+            use_suborg_location_for_request = unanimous(
+                matching_requests, "suborganization_city", "suborganization_state_territory", True
+            )
+            use_location_for_request = unanimous(matching_requests, "city", "state_territory", True)
+            if use_suborg_location_for_request or use_location_for_request:
+                request = matching_requests[0]
+
+        if not (domain or request):
+            message = f"Skipping adding city / state_territory information to suborg: {suborg}. Bad data."
+            TerminalHelper.colorful_logger(logger.warning, TerminalColors.YELLOW, message)
+            return
+
+        # Apply the first source that qualified, in priority order:
+        # domain info -> request suborganization fields -> request standard fields.
+        if domain:
+            suborg.city = normalize_string(domain.city, lowercase=False)
+            suborg.state_territory = domain.state_territory
+        elif request and use_suborg_location_for_request:
+            suborg.city = normalize_string(request.suborganization_city, lowercase=False)
+            suborg.state_territory = request.suborganization_state_territory
+        elif request and use_location_for_request:
+            suborg.city = normalize_string(request.city, lowercase=False)
+            suborg.state_territory = request.state_territory
+
+        message = (
+            f"Added city/state_territory to suborg: {suborg}. "
+            f"city - {suborg.city}, state - {suborg.state_territory}"
+        )
+        TerminalHelper.colorful_logger(logger.info, TerminalColors.MAGENTA, message)
diff --git a/src/registrar/management/commands/patch_suborganizations.py b/src/registrar/management/commands/patch_suborganizations.py
@@ -0,0 +1,133 @@
+import logging
+from django.core.management import BaseCommand
+from registrar.models import Suborganization, DomainRequest, DomainInformation
+from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
+from registrar.models.utility.generic_helper import count_capitals, normalize_string
+
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """One-time cleanup command that removes duplicate Suborganization records.
+
+    Duplicates are either names that normalize to the same string (extra spaces or
+    capitalization differences) or names listed by hand in `handle`. References on
+    DomainRequest / DomainInformation are re-pointed before the duplicates are deleted.
+    """
+
+    help = "Clean up duplicate suborganizations that differ only by spaces and capitalization"
+
+    def handle(self, **kwargs):
+        """Process manual deletions and find/remove duplicates. Shows preview
+        and updates DomainInformation / DomainRequest sub_organization references before deletion.
+
+        The reference updates and the delete run inside a single transaction, so a failure
+        part-way through rolls everything back instead of leaving references half-rewritten.
+        """
+        # Imported locally so this block does not depend on the module's import header.
+        from django.db import transaction
+
+        # First: get a preset list of records we want to delete.
+        # For extra_records_to_prune: the key gets deleted, the value gets kept.
+        extra_records_to_prune = {
+            normalize_string("Assistant Secretary for Preparedness and Response Office of the Secretary"): {
+                "replace_with": "Assistant Secretary for Preparedness and Response, Office of the Secretary"
+            },
+            normalize_string("US Geological Survey"): {"replace_with": "U.S. Geological Survey"},
+            normalize_string("USDA/OC"): {"replace_with": "USDA, Office of Communications"},
+            normalize_string("GSA, IC, OGP WebPortfolio"): {"replace_with": "GSA, IC, OGP Web Portfolio"},
+            normalize_string("USDA/ARS/NAL"): {"replace_with": "USDA, ARS, NAL"},
+        }
+
+        # Second: loop through every Suborganization and return a dict of what to keep, and what to delete
+        # for each duplicate or "incorrect" record. We do this by pruning records with extra spaces or bad caps
+        # Note that "extra_records_to_prune" is just a manual mapping.
+        records_to_prune = self.get_records_to_prune(extra_records_to_prune)
+        if not records_to_prune:
+            TerminalHelper.colorful_logger(logger.error, TerminalColors.FAIL, "No suborganizations to delete.")
+            return
+
+        # Third: Build a preview of the changes
+        total_records_to_remove = 0
+        preview_lines = ["The following records will be removed:"]
+        for data in records_to_prune.values():
+            keep = data.get("keep")
+            delete = data.get("delete")
+            if keep:
+                preview_lines.append(f"Keeping: '{keep.name}' (id: {keep.id})")
+            else:
+                # A hardcoded replacement that does not exist in the database leaves "keep"
+                # empty; the update below then sets sub_organization to NULL on referencing
+                # rows. Say so, so the operator is not confirming that blind.
+                preview_lines.append(
+                    "No replacement record found: sub_organization will be cleared on any "
+                    "DomainInformation / DomainRequest that references the records below."
+                )
+
+            for duplicate in delete:
+                preview_lines.append(f"Removing: '{duplicate.name}' (id: {duplicate.id})")
+                total_records_to_remove += 1
+            preview_lines.append("")
+        preview = "\n".join(preview_lines)
+
+        # Fourth: Get user confirmation and delete
+        if TerminalHelper.prompt_for_execution(
+            system_exit_on_terminate=True,
+            prompt_message=preview,
+            prompt_title=f"Remove {total_records_to_remove} suborganizations?",
+            verify_message="*** WARNING: This will replace the record on DomainInformation and DomainRequest! ***",
+        ):
+            try:
+                # All-or-nothing: an exception leaves the atomic block (rolling back every
+                # update made so far) before it is caught and logged below.
+                with transaction.atomic():
+                    # Update all references to point to the right suborg before deletion
+                    all_suborgs_to_remove = set()
+                    for record in records_to_prune.values():
+                        best_record = record["keep"]
+                        suborgs_to_remove = {dupe.id for dupe in record["delete"]}
+                        DomainRequest.objects.filter(sub_organization_id__in=suborgs_to_remove).update(
+                            sub_organization=best_record
+                        )
+                        DomainInformation.objects.filter(sub_organization_id__in=suborgs_to_remove).update(
+                            sub_organization=best_record
+                        )
+                        all_suborgs_to_remove.update(suborgs_to_remove)
+                    # Delete the suborgs
+                    delete_count, _ = Suborganization.objects.filter(id__in=all_suborgs_to_remove).delete()
+                TerminalHelper.colorful_logger(
+                    logger.info, TerminalColors.MAGENTA, f"Successfully deleted {delete_count} suborganizations."
+                )
+            except Exception as e:
+                TerminalHelper.colorful_logger(
+                    logger.error, TerminalColors.FAIL, f"Failed to delete suborganizations: {str(e)}"
+                )
+
+    def get_records_to_prune(self, extra_records_to_prune):
+        """Maps all suborgs into a dictionary with a record to keep, and an array of records to delete.
+
+        Args:
+            extra_records_to_prune: Dict keyed by the normalized name of a record to delete;
+                each value is a dict whose "replace_with" entry names the record to keep.
+
+        Returns:
+            Dict keyed by normalized name. Each value has "keep" (a Suborganization, or None
+            when the hardcoded replacement is not in the database) and "delete" (a list of
+            Suborganization records). Names with nothing to delete are omitted.
+        """
+        # First: Group all suborganization names by their "normalized" names (finding duplicates).
+        # Returns a dict that looks like this:
+        # {
+        #   "amtrak": [<Suborganization: AMTRAK>, <Suborganization: aMtRaK>, <Suborganization: AMTRAK  >],
+        #   "usda/oc": [<Suborganization: USDA/OC>],
+        #   ...etc
+        # }
+        #
+        name_groups = {}
+        for suborg in Suborganization.objects.all():
+            normalized_name = normalize_string(suborg.name)
+            name_groups.setdefault(normalized_name, []).append(suborg)
+
+        # Second: find the record we should keep, and the records we should delete
+        # Returns a dict that looks like this:
+        # {
+        #  "amtrak": {
+        #       "keep": <Suborganization: AMTRAK>
+        #       "delete": [<Suborganization: aMtRaK>, <Suborganization: AMTRAK  >]
+        #   },
+        #   "usda/oc": {
+        #       "keep": <Suborganization: USDA, Office of Communications>,
+        #       "delete": [<Suborganization: USDA/OC>]
+        #   },
+        #   ...etc
+        # }
+        records_to_prune = {}
+        for normalized_name, duplicate_suborgs in name_groups.items():
+            # Delete data from our preset list
+            if normalized_name in extra_records_to_prune:
+                # The 'keep' field expects a Suborganization but we just pass in a string, so this is just a workaround.
+                # This assumes that there is only one item in the name_group array (see usda/oc example).
+                # But this should be fine, given our data.
+                hardcoded_record_name = extra_records_to_prune[normalized_name]["replace_with"]
+                name_group = name_groups.get(normalize_string(hardcoded_record_name))
+                # None when the replacement name has no matching Suborganization.
+                keep = name_group[0] if name_group else None
+                records_to_prune[normalized_name] = {"keep": keep, "delete": duplicate_suborgs}
+            # Delete duplicates (extra spaces or casing differences)
+            elif len(duplicate_suborgs) > 1:
+                # Pick the best record (fewest spaces, most leading capitals)
+                best_record = max(
+                    duplicate_suborgs,
+                    key=lambda suborg: (-suborg.name.count(" "), count_capitals(suborg.name, leading_only=True)),
+                )
+                records_to_prune[normalized_name] = {
+                    "keep": best_record,
+                    "delete": [s for s in duplicate_suborgs if s != best_record],
+                }
+        return records_to_prune