Referential integrity testing identified 10 tables with invalid foreign key references, totaling 273,091 orphaned records. This excludes concept mappings and external system table references.
These referential integrity violations would have a low-medium impact on analyses that rely on Appointment and Location data. Counts for other failures are low and will have a limited impact.
import snowflake.snowpark as snowpark
def main(session: snowpark.Session):
db = "Data_Store_OLIDS_UAT"
schema = "OLIDS_MASKED"
session.sql(f'USE DATABASE "{db}"').collect()
session.sql(f'USE SCHEMA "{schema}"').collect()
# Get all available columns to validate relationships
columns_query = f"""
SELECT table_name, column_name
FROM "{db}".INFORMATION_SCHEMA.COLUMNS
WHERE table_schema = '{schema}'
"""
available_columns = session.sql(columns_query).collect()
column_lookup = {(row['TABLE_NAME'], row['COLUMN_NAME']) for row in available_columns}
# All table-to-table relationships (excluding concept mappings and external system references)
relationships = [
('ALLERGY_INTOLERANCE', 'patient_id', 'PATIENT', 'id'),
('ALLERGY_INTOLERANCE', 'person_id', 'PERSON', 'id'),
('ALLERGY_INTOLERANCE', 'encounter_id', 'ENCOUNTER', 'id'),
('ALLERGY_INTOLERANCE', 'practitioner_id', 'PRACTITIONER', 'id'),
('APPOINTMENT', 'patient_id', 'PATIENT', 'id'),
('APPOINTMENT', 'person_id', 'PERSON', 'id'),
('APPOINTMENT', 'organisation_id', 'ORGANISATION', 'id'),
('APPOINTMENT', 'practitioner_in_role_id', 'PRACTITIONER_IN_ROLE', 'id'),
('APPOINTMENT', 'schedule_id', 'SCHEDULE', 'id'),
('APPOINTMENT_PRACTITIONER', 'appointment_id', 'APPOINTMENT', 'id'),
('APPOINTMENT_PRACTITIONER', 'practitioner_id', 'PRACTITIONER', 'id'),
('DIAGNOSTIC_ORDER', 'patient_id', 'PATIENT', 'id'),
('DIAGNOSTIC_ORDER', 'person_id', 'PERSON', 'id'),
('DIAGNOSTIC_ORDER', 'encounter_id', 'ENCOUNTER', 'id'),
('DIAGNOSTIC_ORDER', 'practitioner_id', 'PRACTITIONER', 'id'),
('DIAGNOSTIC_ORDER', 'parent_observation_id', 'OBSERVATION', 'id'),
('ENCOUNTER', 'patient_id', 'PATIENT', 'id'),
('ENCOUNTER', 'person_id', 'PERSON', 'id'),
('ENCOUNTER', 'appointment_id', 'APPOINTMENT', 'id'),
('ENCOUNTER', 'episode_of_care_id', 'EPISODE_OF_CARE', 'id'),
('ENCOUNTER', 'practitioner_id', 'PRACTITIONER', 'id'),
('ENCOUNTER', 'service_provider_organisation_id', 'ORGANISATION', 'id'),
('EPISODE_OF_CARE', 'patient_id', 'PATIENT', 'id'),
('EPISODE_OF_CARE', 'person_id', 'PERSON', 'id'),
('EPISODE_OF_CARE', 'organisation_id', 'ORGANISATION', 'id'),
('EPISODE_OF_CARE', 'care_manager_practitioner_id', 'PRACTITIONER', 'id'),
('FLAG', 'patient_id', 'PATIENT', 'id'),
('FLAG', 'person_id', 'PERSON', 'id'),
('LOCATION', 'managing_organisation_id', 'ORGANISATION', 'id'),
('LOCATION_CONTACT', 'location_id', 'LOCATION', 'id'),
('MEDICATION_ORDER', 'patient_id', 'PATIENT', 'id'),
('MEDICATION_ORDER', 'person_id', 'PERSON', 'id'),
('MEDICATION_ORDER', 'encounter_id', 'ENCOUNTER', 'id'),
('MEDICATION_ORDER', 'practitioner_id', 'PRACTITIONER', 'id'),
('MEDICATION_ORDER', 'organisation_id', 'ORGANISATION', 'id'),
('MEDICATION_ORDER', 'allergy_intolerance_id', 'ALLERGY_INTOLERANCE', 'id'),
('MEDICATION_ORDER', 'diagnostic_order_id', 'DIAGNOSTIC_ORDER', 'id'),
('MEDICATION_ORDER', 'medication_statement_id', 'MEDICATION_STATEMENT', 'id'),
('MEDICATION_ORDER', 'observation_id', 'OBSERVATION', 'id'),
('MEDICATION_ORDER', 'referral_request_id', 'REFERRAL_REQUEST', 'id'),
('MEDICATION_STATEMENT', 'patient_id', 'PATIENT', 'id'),
('MEDICATION_STATEMENT', 'person_id', 'PERSON', 'id'),
('MEDICATION_STATEMENT', 'encounter_id', 'ENCOUNTER', 'id'),
('MEDICATION_STATEMENT', 'practitioner_id', 'PRACTITIONER', 'id'),
('MEDICATION_STATEMENT', 'organisation_id', 'ORGANISATION', 'id'),
('MEDICATION_STATEMENT', 'allergy_intolerance_id', 'ALLERGY_INTOLERANCE', 'id'),
('MEDICATION_STATEMENT', 'diagnostic_order_id', 'DIAGNOSTIC_ORDER', 'id'),
('MEDICATION_STATEMENT', 'observation_id', 'OBSERVATION', 'id'),
('MEDICATION_STATEMENT', 'referral_request_id', 'REFERRAL_REQUEST', 'id'),
('OBSERVATION', 'patient_id', 'PATIENT', 'id'),
('OBSERVATION', 'person_id', 'PERSON', 'id'),
('OBSERVATION', 'encounter_id', 'ENCOUNTER', 'id'),
('OBSERVATION', 'parent_obervation_id', 'OBSERVATION', 'id'),
('OBSERVATION', 'practioner_id', 'PRACTITIONER', 'id'),
('ORGANISATION', 'parent_organisation_id', 'ORGANISATION', 'id'),
('PATIENT', 'registered_practice_id', 'ORGANISATION', 'id'),
('PATIENT_ADDRESS', 'patient_id', 'PATIENT', 'id'),
('PATIENT_ADDRESS', 'person_id', 'PERSON', 'id'),
('PATIENT_CONTACT', 'patient_id', 'PATIENT', 'id'),
('PATIENT_CONTACT', 'person_id', 'PERSON', 'id'),
('PATIENT_PERSON', 'patient_id', 'PATIENT', 'id'),
('PATIENT_PERSON', 'person_id', 'PERSON', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'patient_id', 'PATIENT', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'person_id', 'PERSON', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'practitioner_id', 'PRACTITIONER', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'organisation_id', 'ORGANISATION', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'episode_of_care_id', 'EPISODE_OF_CARE', 'id'),
('PERSON', 'requesting_patient_id', 'PATIENT', 'id'),
('PRACTITIONER_IN_ROLE', 'practitioner_id', 'PRACTITIONER', 'id'),
('PRACTITIONER_IN_ROLE', 'organisation_id', 'ORGANISATION', 'id'),
('PROCEDURE_REQUEST', 'patient_id', 'PATIENT', 'id'),
('PROCEDURE_REQUEST', 'person_id', 'PERSON', 'id'),
('PROCEDURE_REQUEST', 'encounter_id', 'ENCOUNTER', 'id'),
('PROCEDURE_REQUEST', 'practitioner_id', 'PRACTITIONER', 'id'),
('REFERRAL_REQUEST', 'patient_id', 'PATIENT', 'id'),
('REFERRAL_REQUEST', 'person_id', 'PERSON', 'id'),
('REFERRAL_REQUEST', 'encounter_id', 'ENCOUNTER', 'id'),
('REFERRAL_REQUEST', 'practitioner_id', 'PRACTITIONER', 'id'),
('REFERRAL_REQUEST', 'organisation_id', 'ORGANISATION', 'id'),
('REFERRAL_REQUEST', 'recipient_organisation_id', 'ORGANISATION', 'id'),
('REFERRAL_REQUEST', 'requester_organisation_id', 'ORGANISATION', 'id'),
('SCHEDULE', 'location_id', 'LOCATION', 'id'),
('SCHEDULE', 'practitioner_id', 'PRACTITIONER', 'id'),
('SCHEDULE_PRACTITIONER', 'practitioner_id', 'PRACTITIONER', 'id'),
('SCHEDULE_PRACTITIONER', 'schedule_id', 'SCHEDULE', 'id'),
]
results = []
total_invalid = 0
skipped_count = 0
for source_table, fk_column, ref_table, ref_column in relationships:
# Check if both columns exist before testing the relationship
source_exists = (source_table, fk_column) in column_lookup
ref_exists = (ref_table, ref_column) in column_lookup
if not source_exists or not ref_exists:
skipped_count += 1
missing = []
if not source_exists:
missing.append(f"{source_table}.{fk_column}")
if not ref_exists:
missing.append(f"{ref_table}.{ref_column}")
results.append({
'SOURCE_TABLE': source_table,
'FOREIGN_KEY': fk_column,
'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
'INVALID_REFERENCES': f"SKIPPED: Missing {', '.join(missing)}"
})
continue
try:
# Find records with foreign keys that don't exist in the referenced table
query = f"""
SELECT COUNT(*) as invalid_count
FROM "{db}"."{schema}"."{source_table}" src
LEFT JOIN "{db}"."{schema}"."{ref_table}" ref
ON src."{fk_column}" = ref."{ref_column}"
WHERE src."{fk_column}" IS NOT NULL
AND ref."{ref_column}" IS NULL
"""
result = session.sql(query).collect()[0]
invalid_count = result['INVALID_COUNT']
if invalid_count > 0:
total_invalid += invalid_count
results.append({
'SOURCE_TABLE': source_table,
'FOREIGN_KEY': fk_column,
'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
'INVALID_REFERENCES': invalid_count
})
except Exception as e:
results.append({
'SOURCE_TABLE': source_table,
'FOREIGN_KEY': fk_column,
'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
'INVALID_REFERENCES': f"ERROR: {str(e)[:50]}"
})
# Add summary
tested_count = len(relationships) - skipped_count
if not any(isinstance(r.get("INVALID_REFERENCES"), int) and r["INVALID_REFERENCES"] > 0 for r in results):
results.insert(0, {
'SOURCE_TABLE': 'SUMMARY',
'FOREIGN_KEY': f'Tested {tested_count} relationships, skipped {skipped_count}',
'REFERENCED_TABLE': 'All referential integrity checks passed',
'INVALID_REFERENCES': 0
})
else:
tables_with_issues = len([r for r in results if isinstance(r.get("INVALID_REFERENCES"), int) and r["INVALID_REFERENCES"] > 0])
results.insert(0, {
'SOURCE_TABLE': 'SUMMARY',
'FOREIGN_KEY': f'Tested {tested_count} relationships, skipped {skipped_count}',
'REFERENCED_TABLE': f'Found {tables_with_issues} tables with invalid references',
'INVALID_REFERENCES': total_invalid
})
return session.create_dataframe(results)
Problem Summary
Referential integrity testing identified 10 tables with invalid foreign key references, totaling 273,091 orphaned records. This excludes concept mappings and external system table references.
Findings
Impact
These referential integrity violations would have a low-medium impact on analyses that rely on Appointment and Location data. Counts for other failures are low and will have a limited impact.
The person_id failure relates to known issue #17
Test Evidence
Snowpark Python Script: