# Chuvash Toponymy Database v.0.1
# Source artifact: 4c8662844db1e4c544d051eb5233806aed68886e8fb210b707b65c141590916f

import csv

def _find_main_row(group_rows):
    """Pick the group's primary record.

    Preference order: the first row that has both LAT and LON, then the
    first row with a RUS name, then the first row with a CHUV name.
    Returns None when no row in the group qualifies.
    """
    for qualifies in (
        lambda r: r['LAT'].strip() and r['LON'].strip(),
        lambda r: r['RUS'].strip(),
        lambda r: r['CHUV'].strip(),
    ):
        for r in group_rows:
            if qualifies(r):
                return r
    return None


def _official_name(main_row):
    """Return (OFFNAME, LANG) for the spatial record.

    The Russian name is preferred.  The Chuvash name is used only when the
    record stands alone (no REFID) and has no Russian name; otherwise the
    gap is flagged with the sentinel 'MISSING_RUS'.
    """
    rus = main_row['RUS'].strip()
    chuv = main_row['CHUV'].strip()
    if rus:
        return rus, 'RUS'
    if not main_row['REFID'].strip() and chuv:
        return chuv, 'CHU'
    # Fallback for the error scenario: no RUS but a REFID, or no CHUV either.
    return 'MISSING_RUS', 'RUS'


def _linguistic_entries(group_rows):
    """Expand every row of a group into name entries (CHUV first, then RUS).

    Each entry is a dict carrying the source row ID, language code, toponym,
    the combined ETYMSAV+ETYM text, the ORIG field and a pipe-joined
    COMMENTS string assembled from the auxiliary source columns.  A row
    with an empty name in a given language produces no entry for it.
    """
    entries = []
    for r in group_rows:
        row_id = int(r['ID'])
        comments = " | ".join([
            f"CORRECTION: {r['CORRECTION'].strip()}",
            f"HISTORIC: {r['HISTORIC'].strip()}",
            f"HISTCOMM: {r['HISTCOMM'].strip()}",
            f"DUBANOV: {r['DUBANOV'].strip()}",
            f"PAGE: {r['PAGE'].strip()}",
            f"COMMIDR: {r['COMMIDR'].strip()}",
            f"COMMSAV: {r['COMMSAV'].strip()}",
        ])
        # ETYM = ETYMSAV + ETYM, space-separated, skipping empty parts.
        etym = " ".join(
            part for part in (r['ETYMSAV'].strip(), r['ETYM'].strip()) if part
        )
        origin = r['ORIG'].strip()
        for lang_code, name_val in (('CHU', r['CHUV'].strip()),
                                    ('RUS', r['RUS'].strip())):
            if not name_val:
                continue  # no name in this language
            entries.append({
                'row_id': row_id,
                'lang': lang_code,
                'toponym': name_val,
                'etym': etym,
                'origin': origin,
                'comments': comments,
            })
    return entries


def transform_data(source_file, spatial_file, linguistic_file):
    """Split a raw toponymy CSV into normalized spatial and linguistic CSVs.

    source_file      -- ';'-delimited CSV (utf-8-sig) with one row per name;
                        rows may point at each other through the REFID column
                        (typically a RUS/CHUV pair referencing one another).
    spatial_file     -- output CSV: one row per place (the group's main row).
    linguistic_file  -- output CSV: one row per name, linked to the spatial
                        row via SPATID and to the group's main Russian name
                        via MAINID.
    """
    delimiter = ';'

    # Load the whole source into memory.
    with open(source_file, 'r', encoding='utf-8-sig') as f:
        rows = list(csv.DictReader(f, delimiter=delimiter))

    # Index rows by ID.  Each value is [refid, row] when REFID is set,
    # else [row]; the row itself is always the LAST element.
    rows_by_id = {}
    for row in rows:
        row_id = int(row['ID'].strip())
        refid_str = row['REFID'].strip()
        if refid_str:
            rows_by_id[row_id] = [int(refid_str), row]
        else:
            rows_by_id[row_id] = [row]

    # Group the rows: a row with a REFID is paired with its target row.
    # Mutually-referencing pairs produce a single group (first one seen).
    groups = {}
    for row_id, entry in rows_by_id.items():
        if isinstance(entry[0], int):  # REFID present
            refid = entry[0]
            if refid in groups:
                continue  # the partner row already formed this group
            target = rows_by_id.get(refid)
            if target is None:
                # Dangling REFID: keep the row as a singleton group
                # instead of crashing on the missing target.
                print(f"REFID {refid} of row {row_id} not found in source")
                groups[row_id] = [entry[1]]
            else:
                # target[-1] is the row whether or not the target itself
                # carries a REFID (the original [1] index assumed it did).
                groups[row_id] = [entry[1], target[-1]]
        else:
            groups[row_id] = entry  # singleton group: [row]

    with open(spatial_file, 'w', newline='', encoding='utf-8-sig') as fspatial, \
         open(linguistic_file, 'w', newline='', encoding='utf-8-sig') as fling:

        spatial_writer = csv.writer(fspatial, delimiter=';')
        linguistic_writer = csv.writer(fling, delimiter=';')

        # Header for spatial.csv
        spatial_writer.writerow([
            'ID',         # PK
            'LAT',        # float
            'LON',        # float
            'OFFNAME',    # official name (RUS preferred, see _official_name)
            'LANG',       # language of OFFNAME: RUS or CHU
            'CLASS',      # object class
            'TYPE',       # object type
            'DISTRICT',   # from ABSREF (nullable)
            'DOUBT',      # int (nullable) - empty for now
            'LANDMARK',   # from RELREF
            'COMMENTS',   # from GEOCOMMENT
            'OTHER'       # empty for now
        ])

        # Header for linguistic.csv
        linguistic_writer.writerow([
            'ID',         # PK
            'SPATID',     # FK -> Spatial.ID
            'DOUSPAT',    # int (nullable) - empty
            'MAINID',     # FK -> Linguistic.ID (the group's main RUS name)
            'TOPONYM',    # name
            'TOPFORMS',   # text (nullable) - empty
            'DOUTOPO',    # int (nullable) - empty
            'LANG',       # RUS or CHU
            'DOULANG',    # int (nullable) - empty
            'PRONUNC',    # text (nullable) - empty
            'DOUPRON',    # int (nullable) - empty
            'ETYM',       # ETYMSAV + ETYM
            'ORIGIN',     # ORIG field
            'COMMENTS',   # CORRECTION + HISTORIC + ... (see _linguistic_entries)
            'OTHER'       # empty for now
        ])

        # Auto-increment PK for linguistic rows.
        linguistic_id_counter = 1

        def write_linguistic_row(entry, spatid, mainid):
            """Write one linguistic row and return its newly assigned ID.

            When `mainid` is None the entry is itself the group's anchor,
            so its MAINID equals its own fresh ID.
            """
            nonlocal linguistic_id_counter
            current_id = linguistic_id_counter
            linguistic_id_counter += 1
            linguistic_writer.writerow([
                current_id,                                # ID
                spatid,                                    # SPATID
                '',                                        # DOUSPAT
                current_id if mainid is None else mainid,  # MAINID
                entry['toponym'],                          # TOPONYM
                '',                                        # TOPFORMS
                '',                                        # DOUTOPO
                entry['lang'],                             # LANG
                '',                                        # DOULANG
                '',                                        # PRONUNC
                '',                                        # DOUPRON
                entry['etym'],                             # ETYM
                entry['origin'],                           # ORIGIN
                entry['comments'],                         # COMMENTS
                ''                                         # OTHER
            ])
            return current_id

        # Process each group: write one spatial row plus its linguistic rows.
        for group_key, group_rows in groups.items():
            main_row = _find_main_row(group_rows)
            if main_row is None:
                # No usable record at all; skip the group instead of
                # dereferencing None below.
                print(f"\n\nNo main row for ID: {group_key}\n\n")
                continue

            spat_id = int(main_row['ID'])
            # Coordinates use a decimal comma in the source.
            lat_str = main_row['LAT'].replace(',', '.').strip()
            lon_str = main_row['LON'].replace(',', '.').strip()
            offname, offlang = _official_name(main_row)

            spatial_writer.writerow([
                spat_id,
                lat_str,
                lon_str,
                offname,
                offlang,
                main_row['OBJCLASS'].strip(),    # CLASS
                main_row['OBJTYPE'].strip(),     # TYPE
                main_row['ABSREF'].strip(),      # DISTRICT
                '',                              # DOUBT is empty
                main_row['RELREF'].strip(),      # LANDMARK
                main_row['GEOCOMMENT'].strip(),  # COMMENTS
                ''                               # OTHER is empty for now
            ])

            entries = _linguistic_entries(group_rows)

            # The MAINID anchor is the main row's own RUS name, if present.
            main_rus = main_row['RUS'].strip()
            main_rus_entry = next(
                (e for e in entries
                 if e['row_id'] == spat_id
                 and e['lang'] == 'RUS'
                 and e['toponym'] == main_rus
                 and main_rus
                 and main_rus != 'MISSING_RUS'),
                None)

            the_mainid = None
            if main_rus_entry is not None:
                # Insert the anchor first so its ID can be referenced,
                # and drop it from the list so it is not written twice.
                the_mainid = write_linguistic_row(main_rus_entry, spat_id, None)
                entries.remove(main_rus_entry)

            for entry in entries:
                if the_mainid is None:
                    # No main RUS name found (error scenario): promote the
                    # first inserted name to anchor and flag it for review.
                    print(f"Check the ID: {entry['row_id']}-{entry['toponym']}")
                    the_mainid = write_linguistic_row(entry, spat_id, None)
                else:
                    write_linguistic_row(entry, spat_id, the_mainid)
                        
# Example usage:
# transform_data('source.csv', 'spatial.csv', 'linguistic.csv')
#
# Script entry point: reads 'source.csv' from the working directory and
# writes 'spatial.csv' / 'linguistic.csv'; adjust the filenames as needed.

if __name__ == "__main__":
    transform_data('source.csv', 'spatial.csv', 'linguistic.csv')