# validators.py
def run_tests(conn):
errors = []
with conn.cursor() as cur:
# Schema-Level & Key Constraints
# Check for duplicate IDs in Spatial
cur.execute("""
SELECT ID, COUNT(*)
FROM Spatial
GROUP BY ID
HAVING COUNT(*) > 1
""")
for row in cur.fetchall():
errors.append(f"[Spatial] Duplicate ID found: {row[0]}")
# Check for duplicate IDs in Linguistic
cur.execute("""
SELECT ID, COUNT(*)
FROM Linguistic
GROUP BY ID
HAVING COUNT(*) > 1
""")
for row in cur.fetchall():
errors.append(f"[Linguistic] Duplicate ID found: {row[0]}")
# Check for duplicate IDs in Temporal
cur.execute("""
SELECT ID, COUNT(*)
FROM Temporal
GROUP BY ID
HAVING COUNT(*) > 1
""")
for row in cur.fetchall():
errors.append(f"[Temporal] Duplicate ID found: {row[0]}")
# Check for duplicate IDs in Sources
cur.execute("""
SELECT ID, COUNT(*)
FROM Sources
GROUP BY ID
HAVING COUNT(*) > 1
""")
for row in cur.fetchall():
errors.append(f"[Sources] Duplicate ID found: {row[0]}")
# Check foreign key: Linguistic.SPATID must exist in Spatial
cur.execute("""
SELECT L.ID, L.SPATID
FROM Linguistic L
LEFT JOIN Spatial S ON L.SPATID = S.ID
WHERE L.SPATID IS NOT NULL AND S.ID IS NULL
""")
for row in cur.fetchall():
errors.append(f"[Linguistic] SPATID={row[1]} not found in Spatial (Linguistic.ID={row[0]})")
# Check foreign key: Temporal.LINGID must exist in Linguistic
cur.execute("""
SELECT T.ID, T.LINGID
FROM Temporal T
LEFT JOIN Linguistic L ON T.LINGID = L.ID
WHERE T.LINGID IS NOT NULL AND L.ID IS NULL
""")
for row in cur.fetchall():
errors.append(f"[Temporal] LINGID={row[1]} not found in Linguistic (Temporal.ID={row[0]})")
# (Composite key check) For each row in Temporal,
# LINGNAME should match the TOPONYM in the row of Linguistic where ID = LINGID.
cur.execute("""
SELECT T.ID, T.LINGID, T.LINGNAME, L.TOPONYM
FROM Temporal T
LEFT JOIN Linguistic L ON T.LINGID = L.ID
WHERE L.ID IS NOT NULL AND T.LINGNAME != L.TOPONYM
""")
for row in cur.fetchall():
errors.append(f"[Temporal] LINGNAME='{row[2]}' does not match TOPONYM='{row[3]}' for LINGID={row[1]} (Temporal.ID={row[0]})")
# Logic-Based Validations
# Example: In Temporal, if EVENT='ACTIVE' => END should be NULL (or at least not set)
cur.execute("""
SELECT ID, ENDYEAR, EVENT
FROM Temporal
WHERE EVENT='ACTIVE' AND ENDYEAR IS NOT NULL
""")
for row in cur.fetchall():
errors.append(f"[Temporal] EVENT=ACTIVE but END is not null for row ID={row[0]}")
# Example: If EVENT='CEASE' => OBJID and OBJNAME must be empty
cur.execute("""
SELECT ID, OBJID, OBJNAME
FROM Temporal
WHERE EVENT='CEASE' AND (OBJID IS NOT NULL OR OBJNAME IS NOT NULL AND OBJNAME != '')
""")
for row in cur.fetchall():
errors.append(f"[Temporal] EVENT=CEASE but OBJID/OBJNAME present for ID={row[0]}")
# Example: If EVENT in ('RENAME','MERGEIN') => OBJID/OBJNAME must not be empty
cur.execute("""
SELECT ID, EVENT, OBJID, OBJNAME
FROM Temporal
WHERE EVENT IN ('RENAME','MERGEIN')
AND (OBJID IS NULL OR OBJNAME IS NULL OR OBJNAME = '')
""")
for row in cur.fetchall():
errors.append(f"[Temporal] EVENT={row[1]} but OBJID/OBJNAME is missing (Temporal.ID={row[0]})")
# Data Quality Checks
# Note: Coordinate range checks are now in schema constraints
# EVENT values are checked via schema constraint
# LANG codes are checked via schema constraint
return errors