From 56eeda771780c230cd048e6b298d100c3755f263 Mon Sep 17 00:00:00 2001 From: fredkingham Date: Wed, 16 Nov 2022 09:01:29 +0000 Subject: [PATCH] Adds unit tests and fixes for the find duplicates check --- .../data_quality/checks/find_duplicates.py | 18 +- .../tests/test_find_duplicates.py | 177 ++++++++++++++++++ 2 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 plugins/data_quality/tests/test_find_duplicates.py diff --git a/plugins/data_quality/checks/find_duplicates.py b/plugins/data_quality/checks/find_duplicates.py index ad82130b6..1f2fa0b6a 100644 --- a/plugins/data_quality/checks/find_duplicates.py +++ b/plugins/data_quality/checks/find_duplicates.py @@ -43,8 +43,7 @@ def find_exact_duplicates(): # something unexpected has happened. raise ValueError(f'We have found {len(patients)} for hn {duplicate_hn}') dup_patients.append(patients) - cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() - cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable(dup_patients) email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' email_context = { "title": email_title, @@ -75,15 +74,18 @@ def find_zero_leading_duplicates(): without_zero_patients = list(Patient.objects.filter( demographics__hospital_number=without_zero )) - if len(with_zero_patients) > 1 or len(without_zero_patient) > 1: + if len(with_zero_patients) > 1 or len(without_zero_patients) > 1: # we expect these patients to be covered by the find_exact_duplicates # check continue - if without_zero_patients == 0: + if len(without_zero_patients) == 0: # we have no patients with the zero stripped so lets continue continue - dup_patients.append(with_zero[0], without_zero_patients[0]) - cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable() + dup_patients.append((with_zero_patients[0], without_zero_patients[0],)) + if len(dup_patients) == 0: + # if there are no duplicate patients, return + return + cleanable_duplicates, uncleanable_duplicates = calculate_deletable_undeletable(dup_patients) email_title = f'{len(cleanable_duplicates) + len (uncleanable_duplicates)} Exact hospital number duplicates' email_context = { "title": email_title, @@ -141,10 +143,12 @@ def calculate_deletable_undeletable(patient_tuples): for patients in patient_tuples: patient_1 = patients[0] patient_1_native_records = has_records(patient_1) + patient_1_hn = patient_1.demographics_set.get().hospital_number patient_2 = patients[1] patient_2_native_records = has_records(patient_2) + patient_2_hn = patient_2.demographics_set.get().hospital_number if len(patient_1_native_records) and len(patient_2_native_records): - logger.info(f'====== Unable to merge hn {duplicate_hn} as they both have duplicate records') + logger.info(f'====== Unable to merge hn {patient_1_hn}/{patient_2_hn} as they both have duplicate records') logger.info(f'=== Patient 1 ({patient_1.id}) has:') for record in patient_1_native_records: logger.info(f'{record.__class__.__name__} {record.id}') diff --git a/plugins/data_quality/tests/test_find_duplicates.py b/plugins/data_quality/tests/test_find_duplicates.py new file mode 100644 index 000000000..6adffa9d2 --- /dev/null +++ b/plugins/data_quality/tests/test_find_duplicates.py @@ -0,0 +1,177 @@ +from unittest import mock +from opal.core.test import OpalTestCase +from plugins.data_quality.checks import find_duplicates + + +@mock.patch('plugins.data_quality.utils.send_email') +class FindExactDuplicatesTestCase(OpalTestCase): + def test_finds_exact_cleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='111111', + ) + find_duplicates.find_exact_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [(patient_2, patient_1,)] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + + def test_finds_exact_uncleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode_1 = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode_1.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, episode_2 = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode_2.diagnosis_set.create() + diagnosis.condition = "Fever" + diagnosis.save() + find_duplicates.find_exact_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [(patient_1, patient_2,)] + ) + + def test_does_not_find_exact_duplicates(self, send_email): + patient_1, _ = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='22222222', + ) + find_duplicates.find_exact_duplicates() + self.assertFalse(send_email.called) + + +@mock.patch('plugins.data_quality.utils.send_email') +class FindLeadingZeroDuplicatesTestCase(OpalTestCase): + def test_finds_leading_zero_cleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='0111111', + ) + find_duplicates.find_zero_leading_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [(patient_2, patient_1,)] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [] + ) + + def test_finds_leading_zero_uncleanable_duplicates(self, send_email): + """ + Cleanable duplicates are when we can delete one without issue. + + In this case only patient 1 has a subrecord so + patient 2 can be deleted. + """ + patient_1, episode_1 = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + diagnosis = episode_1.diagnosis_set.create() + diagnosis.condition = "Cough" + diagnosis.save() + patient_2, episode_2 = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='0111111', + ) + diagnosis = episode_2.diagnosis_set.create() + diagnosis.condition = "Fever" + diagnosis.save() + find_duplicates.find_zero_leading_duplicates() + call_args = send_email.call_args[0] + self.assertEqual( + call_args[0], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["title"], '1 Exact hospital number duplicates' + ) + self.assertEqual( + call_args[2]["cleanable_duplicates"], [] + ) + self.assertEqual( + call_args[2]["uncleanable_duplicates"], [(patient_2,patient_1,)] + ) + + def test_does_not_find_leading_zero_duplicates(self, send_email): + patient_1, _ = self.new_patient_and_episode_please() + patient_1.demographics_set.update( + hospital_number='111111', + ) + patient_2, _ = self.new_patient_and_episode_please() + patient_2.demographics_set.update( + hospital_number='022222222', + ) + find_duplicates.find_zero_leading_duplicates() + self.assertFalse(send_email.called)