Chuvash Toponymy Database v.0.1

validators.py at trunk
Login

validators.py at trunk

File tests/validators.py artifact 3b7b929f49 on branch trunk


# validators.py
def run_tests(conn):
    errors = []
    with conn.cursor() as cur:
        # Schema-Level & Key Constraints
        # Check for duplicate IDs in Spatial
        cur.execute("""
            SELECT ID, COUNT(*) 
            FROM Spatial
            GROUP BY ID
            HAVING COUNT(*) > 1
        """)
        for row in cur.fetchall():
            errors.append(f"[Spatial] Duplicate ID found: {row[0]}")

        # Check for duplicate IDs in Linguistic
        cur.execute("""
            SELECT ID, COUNT(*) 
            FROM Linguistic
            GROUP BY ID
            HAVING COUNT(*) > 1
        """)
        for row in cur.fetchall():
            errors.append(f"[Linguistic] Duplicate ID found: {row[0]}")

        # Check for duplicate IDs in Temporal
        cur.execute("""
            SELECT ID, COUNT(*) 
            FROM Temporal
            GROUP BY ID
            HAVING COUNT(*) > 1
        """)
        for row in cur.fetchall():
            errors.append(f"[Temporal] Duplicate ID found: {row[0]}")

        # Check for duplicate IDs in Sources
        cur.execute("""
            SELECT ID, COUNT(*) 
            FROM Sources
            GROUP BY ID
            HAVING COUNT(*) > 1
        """)
        for row in cur.fetchall():
            errors.append(f"[Sources] Duplicate ID found: {row[0]}")

        # Check foreign key: Linguistic.SPATID must exist in Spatial
        cur.execute("""
            SELECT L.ID, L.SPATID
            FROM Linguistic L
            LEFT JOIN Spatial S ON L.SPATID = S.ID
            WHERE L.SPATID IS NOT NULL AND S.ID IS NULL
        """)
        for row in cur.fetchall():
            errors.append(f"[Linguistic] SPATID={row[1]} not found in Spatial (Linguistic.ID={row[0]})")

        # Check foreign key: Temporal.LINGID must exist in Linguistic
        cur.execute("""
            SELECT T.ID, T.LINGID
            FROM Temporal T
            LEFT JOIN Linguistic L ON T.LINGID = L.ID
            WHERE T.LINGID IS NOT NULL AND L.ID IS NULL
        """)
        for row in cur.fetchall():
            errors.append(f"[Temporal] LINGID={row[1]} not found in Linguistic (Temporal.ID={row[0]})")

        # (Composite key check) For each row in Temporal,
        #   LINGNAME should match the TOPONYM in the row of Linguistic where ID = LINGID.
        cur.execute("""
            SELECT T.ID, T.LINGID, T.LINGNAME, L.TOPONYM
            FROM Temporal T
            LEFT JOIN Linguistic L ON T.LINGID = L.ID
            WHERE L.ID IS NOT NULL AND T.LINGNAME != L.TOPONYM
        """)
        for row in cur.fetchall():
            errors.append(f"[Temporal] LINGNAME='{row[2]}' does not match TOPONYM='{row[3]}' for LINGID={row[1]} (Temporal.ID={row[0]})")

        # Logic-Based Validations
        # Example: In Temporal, if EVENT='ACTIVE' => END should be NULL (or at least not set)
        cur.execute("""
            SELECT ID, ENDYEAR, EVENT
            FROM Temporal
            WHERE EVENT='ACTIVE' AND ENDYEAR IS NOT NULL
        """)
        for row in cur.fetchall():
            errors.append(f"[Temporal] EVENT=ACTIVE but END is not null for row ID={row[0]}")

        # Example: If EVENT='CEASE' => OBJID and OBJNAME must be empty
        cur.execute("""
            SELECT ID, OBJID, OBJNAME
            FROM Temporal
            WHERE EVENT='CEASE' AND (OBJID IS NOT NULL OR OBJNAME IS NOT NULL AND OBJNAME != '')
        """)
        for row in cur.fetchall():
            errors.append(f"[Temporal] EVENT=CEASE but OBJID/OBJNAME present for ID={row[0]}")

        # Example: If EVENT in ('RENAME','MERGEIN') => OBJID/OBJNAME must not be empty
        cur.execute("""
            SELECT ID, EVENT, OBJID, OBJNAME
            FROM Temporal
            WHERE EVENT IN ('RENAME','MERGEIN')
              AND (OBJID IS NULL OR OBJNAME IS NULL OR OBJNAME = '')
        """)
        for row in cur.fetchall():
            errors.append(f"[Temporal] EVENT={row[1]} but OBJID/OBJNAME is missing (Temporal.ID={row[0]})")

        # Data Quality Checks
        # Note: Coordinate range checks are now in schema constraints
        # EVENT values are checked via schema constraint
        # LANG codes are checked via schema constraint

    return errors