Skip to content

Commit

Permalink
Merge pull request #3322 from cisagov/za/3299-script-to-update-suborg…
Browse files Browse the repository at this point in the history
…-values

#3299: Script to update suborg values - [ZA]
  • Loading branch information
zandercymatics authored Jan 16, 2025
2 parents 2ff8564 + fcac8a8 commit 0d2c81c
Show file tree
Hide file tree
Showing 7 changed files with 662 additions and 23 deletions.
35 changes: 35 additions & 0 deletions docs/operations/data_migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -918,3 +918,38 @@ Example (only requests): `./manage.py create_federal_portfolio --branch "executi
- Parameters #1-#2: Either `--agency_name` or `--branch` must be specified. Not both.
- Parameters #2-#3, you cannot use `--both` while using these. You must specify either `--parse_requests` or `--parse_domains` separately. While all of these parameters are optional in that you do not need to specify all of them,
you must specify at least one to run this script.


## Patch suborganizations
This script deletes some duplicate suborganization data that exists in our database (one-time use).
It works in two ways:
1. If the only name difference between two suborg records is extra spaces or a capitalization difference,
then we delete all duplicate records of this type.
2. If the suborg name is one we manually list in the script, we delete that record and point anything that referenced it at the replacement suborg listed alongside it.

Before it deletes records, it goes through each DomainInformation and DomainRequest object and updates the reference to "sub_organization" to match the non-duplicative record.

### Running on sandboxes

#### Step 1: Login to CloudFoundry
```cf login -a api.fr.cloud.gov --sso```

#### Step 2: SSH into your environment
```cf ssh getgov-{space}```

Example: `cf ssh getgov-za`

#### Step 3: Create a shell instance
```/tmp/lifecycle/shell```

#### Step 4: Upload your csv to the desired sandbox
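[Follow these steps](#use-scp-to-transfer-data-to-sandboxes) to upload the federal_cio csv to a sandbox of your choice. Note: `patch_suborganizations` itself takes no file argument; it only reads the Suborganization records that already exist in the database.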

#### Step 5: Running the script
To remove the duplicate suborganizations:
```./manage.py patch_suborganizations```

### Running locally

#### Step 1: Running the script
```docker-compose exec app ./manage.py patch_suborganizations```
160 changes: 150 additions & 10 deletions src/registrar/management/commands/create_federal_portfolio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
from registrar.models import DomainInformation, DomainRequest, FederalAgency, Suborganization, Portfolio, User
from registrar.models.utility.generic_helper import normalize_string
from django.db.models import F, Q


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -104,12 +105,17 @@ def handle(self, **options):
message = f"Failed to create portfolio '{federal_agency.agency}'"
TerminalHelper.colorful_logger(logger.info, TerminalColors.FAIL, message)

# POST PROCESS STEP: Add additional suborg info where applicable.
updated_suborg_count = self.post_process_all_suborganization_fields(agencies)
message = f"Added city and state_territory information to {updated_suborg_count} suborgs."
TerminalHelper.colorful_logger(logger.info, TerminalColors.MAGENTA, message)

TerminalHelper.log_script_run_summary(
self.updated_portfolios,
self.failed_portfolios,
self.skipped_portfolios,
debug=False,
skipped_header="----- SOME PORTFOLIOS WERE SKIPPED -----",
skipped_header="----- SOME PORTFOLIOS WERENT CREATED -----",
display_as_str=True,
)

Expand Down Expand Up @@ -169,14 +175,11 @@ def post_process_started_domain_requests(self, agencies, portfolios):

def handle_populate_portfolio(self, federal_agency, parse_domains, parse_requests, both):
"""Attempts to create a portfolio. If successful, this function will
also create new suborganizations.
Returns the portfolio for the given federal_agency.
"""
portfolio, created = self.create_portfolio(federal_agency)
if created:
self.create_suborganizations(portfolio, federal_agency)
if parse_domains or both:
self.handle_portfolio_domains(portfolio, federal_agency)
also create new suborganizations"""
portfolio, _ = self.create_portfolio(federal_agency)
self.create_suborganizations(portfolio, federal_agency)
if parse_domains or both:
self.handle_portfolio_domains(portfolio, federal_agency)

if parse_requests or both:
self.handle_portfolio_requests(portfolio, federal_agency)
Expand Down Expand Up @@ -233,7 +236,6 @@ def create_suborganizations(self, portfolio: Portfolio, federal_agency: FederalA
federal_agency=federal_agency, organization_name__isnull=False
)
org_names = set(valid_agencies.values_list("organization_name", flat=True))

if not org_names:
message = (
"Could not add any suborganizations."
Expand Down Expand Up @@ -352,3 +354,141 @@ def handle_portfolio_domains(self, portfolio: Portfolio, federal_agency: Federal
DomainInformation.objects.bulk_update(domain_infos, ["portfolio", "sub_organization"])
message = f"Added portfolio '{portfolio}' to {len(domain_infos)} domains."
TerminalHelper.colorful_logger(logger.info, TerminalColors.OKGREEN, message)

def post_process_all_suborganization_fields(self, agencies):
    """Batch updates suborganization locations from domain and request data.

    Args:
        agencies: List of FederalAgency objects to process

    Returns:
        int: Number of suborganizations whose city / state_territory was actually
            changed. Suborganizations that the per-record step skipped (or left
            with identical values) are neither written back nor counted.

    Priority for location data:
        1. Domain information
        2. Domain request suborganization fields
        3. Domain request standard fields
    """
    # Common filter between domaininformation / domain request.
    # Filter by only the agencies we've updated thus far.
    # Then, only process records without null portfolio, org name, or suborg name,
    # and exclude records whose org name matches the portfolio's own name (case-insensitive).
    base_filter = Q(
        federal_agency__in=agencies,
        portfolio__isnull=False,
        organization_name__isnull=False,
        sub_organization__isnull=False,
    ) & ~Q(organization_name__iexact=F("portfolio__organization_name"))

    # First: Remove null city / state_territory values on domain info / domain requests.
    # We want to add city data if there is data to add to begin with!
    domains = DomainInformation.objects.filter(
        base_filter,
        Q(city__isnull=False, state_territory__isnull=False),
    )
    requests = DomainRequest.objects.filter(
        base_filter,
        (
            Q(city__isnull=False, state_territory__isnull=False)
            | Q(suborganization_city__isnull=False, suborganization_state_territory__isnull=False)
        ),
    )

    # Second: Group domains and requests by normalized organization name.
    # This means that later down the line we have to account for "duplicate" org names.
    domains_dict = {}
    requests_dict = {}
    for domain in domains:
        normalized_name = normalize_string(domain.organization_name)
        domains_dict.setdefault(normalized_name, []).append(domain)

    for request in requests:
        normalized_name = normalize_string(request.organization_name)
        requests_dict.setdefault(normalized_name, []).append(request)

    # Third: Get suborganizations to update
    suborgs_to_edit = Suborganization.objects.filter(
        Q(id__in=domains.values_list("sub_organization", flat=True))
        | Q(id__in=requests.values_list("sub_organization", flat=True))
    )

    # Fourth: Process each suborg to add city / state territory info.
    # The per-suborg step may decide to skip a record, so compare the location
    # before and after to learn which records really need to be saved.
    changed_suborgs = []
    for suborg in suborgs_to_edit:
        location_before = (suborg.city, suborg.state_territory)
        self.post_process_suborganization_fields(suborg, domains_dict, requests_dict)
        if (suborg.city, suborg.state_territory) != location_before:
            changed_suborgs.append(suborg)

    # Fifth: Perform a bulk update, but only for the records that changed.
    # Returning early keeps the result an int even when there is nothing to write.
    if not changed_suborgs:
        return 0
    return Suborganization.objects.bulk_update(changed_suborgs, ["city", "state_territory"])

def post_process_suborganization_fields(self, suborg, domains_dict, requests_dict):
    """Fill in city / state_territory on one suborganization, when the data allows it.

    Args:
        suborg: Suborganization to update (mutated in place, not saved here)
        domains_dict: Dict of domain info records grouped by normalized org name
        requests_dict: Dict of domain requests grouped by normalized org name

    Source priority: domain info, then the request's suborganization fields, then the
    request's standard fields. A source is only trusted when every record of that kind
    agrees on the location; if no source qualifies, the suborg is left untouched and a
    warning is logged.
    """
    lookup_key = normalize_string(suborg.name)
    matching_domains = domains_dict.get(lookup_key, [])
    matching_requests = requests_dict.get(lookup_key, [])

    # Domain info is usable only if every matching record carries the same location.
    domain = None
    if matching_domains:
        first_domain = matching_domains[0]
        for candidate in matching_domains:
            if candidate.city != first_domain.city or candidate.state_territory != first_domain.state_territory:
                break
        else:
            domain = first_domain

    # Requests use the same consensus rule, checked separately for the suborganization
    # fields and the standard fields. Values must also be non-empty to count.
    request = None
    use_suborg_location_for_request = True
    use_location_for_request = True
    if matching_requests:
        first_request = matching_requests[0]

        def _all_agree(city_field, state_field):
            # True when every request has both fields set and equal to the first request's.
            wanted_city = getattr(first_request, city_field)
            wanted_state = getattr(first_request, state_field)
            for candidate in matching_requests:
                city = getattr(candidate, city_field)
                state = getattr(candidate, state_field)
                if not city or not state or city != wanted_city or state != wanted_state:
                    return False
            return True

        use_suborg_location_for_request = _all_agree("suborganization_city", "suborganization_state_territory")
        use_location_for_request = _all_agree("city", "state_territory")
        if use_suborg_location_for_request or use_location_for_request:
            request = first_request

    if not (domain or request):
        message = f"Skipping adding city / state_territory information to suborg: {suborg}. Bad data."
        TerminalHelper.colorful_logger(logger.warning, TerminalColors.YELLOW, message)
        return

    # PRIORITY:
    # 1. Domain info
    # 2. Domain request requested suborg fields
    # 3. Domain request normal fields
    # Past the guard above, a missing domain implies a request with at least one
    # agreeing field pair, so the final branch is the "normal fields" case.
    if domain:
        raw_city, state_territory = domain.city, domain.state_territory
    elif use_suborg_location_for_request:
        raw_city, state_territory = request.suborganization_city, request.suborganization_state_territory
    else:
        raw_city, state_territory = request.city, request.state_territory

    suborg.city = normalize_string(raw_city, lowercase=False)
    suborg.state_territory = state_territory

    message = (
        f"Added city/state_territory to suborg: {suborg}. "
        f"city - {suborg.city}, state - {suborg.state_territory}"
    )
    TerminalHelper.colorful_logger(logger.info, TerminalColors.MAGENTA, message)
133 changes: 133 additions & 0 deletions src/registrar/management/commands/patch_suborganizations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import logging
from django.core.management import BaseCommand
from registrar.models import Suborganization, DomainRequest, DomainInformation
from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
from registrar.models.utility.generic_helper import count_capitals, normalize_string


logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """One-time cleanup command that removes duplicate Suborganization records.

    Two kinds of records are removed:
      * records whose names differ from another record only by spacing / capitalization
      * records whose names appear in a hardcoded mapping inside ``handle``

    Before anything is deleted, DomainRequest and DomainInformation rows that point at a
    doomed record are re-pointed at the record that is kept.
    """

    help = "Clean up duplicate suborganizations that differ only by spaces and capitalization"

    def handle(self, **kwargs):
        """Process manual deletions and find/remove duplicates. Shows preview
        and updates DomainInformation / DomainRequest sub_organization references before deletion."""
        # Imported locally so this command stays self-contained.
        from django.db import transaction

        # First: get a preset list of records we want to delete.
        # For extra_records_to_prune: the key gets deleted, the value gets kept.
        extra_records_to_prune = {
            normalize_string("Assistant Secretary for Preparedness and Response Office of the Secretary"): {
                "replace_with": "Assistant Secretary for Preparedness and Response, Office of the Secretary"
            },
            normalize_string("US Geological Survey"): {"replace_with": "U.S. Geological Survey"},
            normalize_string("USDA/OC"): {"replace_with": "USDA, Office of Communications"},
            normalize_string("GSA, IC, OGP WebPortfolio"): {"replace_with": "GSA, IC, OGP Web Portfolio"},
            normalize_string("USDA/ARS/NAL"): {"replace_with": "USDA, ARS, NAL"},
        }

        # Second: loop through every Suborganization and return a dict of what to keep, and what to delete
        # for each duplicate or "incorrect" record. We do this by pruning records with extra spaces or bad caps
        # Note that "extra_records_to_prune" is just a manual mapping.
        records_to_prune = self.get_records_to_prune(extra_records_to_prune)
        if not records_to_prune:
            TerminalHelper.colorful_logger(logger.error, TerminalColors.FAIL, "No suborganizations to delete.")
            return

        # Third: Build a preview of the changes
        total_records_to_remove = 0
        preview_lines = ["The following records will be removed:"]
        for data in records_to_prune.values():
            keep = data.get("keep")
            delete = data.get("delete")
            # "keep" is None when a hardcoded replacement does not exist in the database.
            if keep:
                preview_lines.append(f"Keeping: '{keep.name}' (id: {keep.id})")

            for duplicate in delete:
                preview_lines.append(f"Removing: '{duplicate.name}' (id: {duplicate.id})")
                total_records_to_remove += 1
            preview_lines.append("")
        preview = "\n".join(preview_lines)

        # Fourth: Get user confirmation and delete
        if TerminalHelper.prompt_for_execution(
            system_exit_on_terminate=True,
            prompt_message=preview,
            prompt_title=f"Remove {total_records_to_remove} suborganizations?",
            verify_message="*** WARNING: This will replace the record on DomainInformation and DomainRequest! ***",
        ):
            try:
                # Run the reference updates and the delete as one unit so that a failure
                # part-way through does not leave some references re-pointed and others not.
                with transaction.atomic():
                    # Update all references to point to the right suborg before deletion
                    all_suborgs_to_remove = set()
                    for record in records_to_prune.values():
                        # May be None (no replacement found); references are then cleared.
                        best_record = record["keep"]
                        suborgs_to_remove = {dupe.id for dupe in record["delete"]}
                        DomainRequest.objects.filter(sub_organization_id__in=suborgs_to_remove).update(
                            sub_organization=best_record
                        )
                        DomainInformation.objects.filter(sub_organization_id__in=suborgs_to_remove).update(
                            sub_organization=best_record
                        )
                        all_suborgs_to_remove.update(suborgs_to_remove)
                    # Delete the suborgs
                    delete_count, _ = Suborganization.objects.filter(id__in=all_suborgs_to_remove).delete()
            except Exception as e:
                TerminalHelper.colorful_logger(
                    logger.error, TerminalColors.FAIL, f"Failed to delete suborganizations: {str(e)}"
                )
            else:
                # Only report success once the transaction has committed.
                TerminalHelper.colorful_logger(
                    logger.info, TerminalColors.MAGENTA, f"Successfully deleted {delete_count} suborganizations."
                )

    def get_records_to_prune(self, extra_records_to_prune):
        """Maps all suborgs into a dictionary with a record to keep, and an array of records to delete.

        Args:
            extra_records_to_prune: Dict keyed by the normalized name of a record to delete,
                whose value holds the display name of its replacement under "replace_with".

        Returns:
            dict: normalized name -> {"keep": Suborganization or None, "delete": [Suborganization, ...]}.
                Names that need no cleanup are omitted.
        """
        # First: Group all suborganization names by their "normalized" names (finding duplicates).
        # Returns a dict that looks like this:
        # {
        #   "amtrak": [<Suborganization: AMTRAK>, <Suborganization: aMtRaK>, <Suborganization: AMTRAK >],
        #   "usda/oc": [<Suborganization: USDA/OC>],
        #   ...etc
        # }
        #
        name_groups = {}
        for suborg in Suborganization.objects.all():
            normalized_name = normalize_string(suborg.name)
            name_groups.setdefault(normalized_name, []).append(suborg)

        # Second: find the record we should keep, and the records we should delete
        # Returns a dict that looks like this:
        # {
        #   "amtrak": {
        #       "keep": <Suborganization: AMTRAK>
        #       "delete": [<Suborganization: aMtRaK>, <Suborganization: AMTRAK >]
        #   },
        #   "usda/oc": {
        #       "keep": <Suborganization: USDA, Office of Communications>,
        #       "delete": [<Suborganization: USDA/OC>]
        #   },
        #   ...etc
        # }
        records_to_prune = {}
        for normalized_name, duplicate_suborgs in name_groups.items():
            # Delete data from our preset list
            if normalized_name in extra_records_to_prune:
                # The mapping only stores the replacement's name, so look its record up by
                # normalized name. If the replacement name itself has casing / spacing
                # duplicates, choose the same record the duplicate branch below keeps;
                # otherwise references could be pointed at a record that is about to be deleted.
                hardcoded_record_name = extra_records_to_prune[normalized_name]["replace_with"]
                replacement_group = name_groups.get(normalize_string(hardcoded_record_name))
                keep = self._pick_best_record(replacement_group) if replacement_group else None
                records_to_prune[normalized_name] = {"keep": keep, "delete": duplicate_suborgs}
            # Delete duplicates (extra spaces or casing differences)
            elif len(duplicate_suborgs) > 1:
                best_record = self._pick_best_record(duplicate_suborgs)
                records_to_prune[normalized_name] = {
                    "keep": best_record,
                    "delete": [s for s in duplicate_suborgs if s != best_record],
                }
        return records_to_prune

    @staticmethod
    def _pick_best_record(suborgs):
        """Return the preferred record from a non-empty group (fewest spaces, most leading capitals)."""
        return max(
            suborgs,
            key=lambda suborg: (-suborg.name.count(" "), count_capitals(suborg.name, leading_only=True)),
        )
Loading

0 comments on commit 0d2c81c

Please sign in to comment.