ADDED data/csv/db_experiment/v0.3/2025_Feb_12_chatGPT_chat.txt Index: data/csv/db_experiment/v0.3/2025_Feb_12_chatGPT_chat.txt ================================================================== --- /dev/null +++ data/csv/db_experiment/v0.3/2025_Feb_12_chatGPT_chat.txt @@ -0,0 +1,99 @@ +Hello, ChatGPT! + +I need your help with table data transformation. + +This is the structure of the source csv file + +headers look like that: +ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM + +data rows could be several types +First one is the most frequent -- CHUV and RUS columns are not empty and REFID column is empty +ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM +23;55,983707;46,827595;YS;Атапай;Адабай;;;;деревня;ойконим;Моргаушский р-н;;;;чув. яз. имя (СЧЯ);CHV / TAT;От имени чуваша-язычника Атапай: ата 'отец', пай 'богатый'.;;12;;; + +The second type is quite frequent too -- CHUV column is empty, RUS is not empty, REFID is empty +ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM +69;56,015002;46,346179;YS;;Алексеевка;;;;деревня;ойконим;Ядринский р-н;;;;;;;;24;Алешкина - название деревни Алексеевка до 1924 года;; + +Third type is the rarest -- CHUV is not empty, RUS and REFID are empty +ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM +244;54,942062;47,721401;MS;Патирек;;;;;деревня;ойконим;Шемуршинский р-н;;;;;;;;52;;; + +Fourth type is not frequent -- CHUV, RUS and REFID columns are not empty 
+ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM +445;55,075314;47,695053;MS;Пулак;Булаково;;;;деревня;ойконим;Батыревский р-н;;;;;;Название на 1923 год - Плаккасси.;446;93;;;От р. Була или личного имени Булак. + +Fifth type -- CHUV column is empty, RUS is not empty, REFID is not empty +ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM +523;55,75605;46,358486;MS;;Верхнее Аккозино;;;;деревня;ойконим;Красночетайский р-н;;;;;;Рус. название от имени Акаҫ.;713;105;;; + +Sixth type -- CHUV column is not empty, RUS is empty, REFID is not empty +ID;LAT;LON;coordcommenttemp;CHUV;RUS;GEOCOMMENT;CORRECTION;HISTORIC;OBJTYPE;OBJCLASS;ABSREF;OLD2025;HISTCOMM;RELREF;ETYMSAV;ORIG;DUBANOV;REFID;PAGE;COMMIDR;COMMSAV;ETYM +594;;;;Малти Чиперуй;;;;;деревня;ойконим;Мариинско-Посадский р-н;;;;;;;399;85;;;От чув. чипер 'красивый' и уй 'поле'. + +About REFID column -- if the value is present in this field it means that two records (basically two strings in sorce.csv file) are connected, because it is the one spatial object (an entity having LAT and LON attributes) with several names (CHUV and RUS field in several records from source file). + +I want you to write python script transforming the source file into 2 csv files: spatial.csv and linguistic.csv + +Here is the specification of spatial table +"## 1. Spatial Table + +### Purpose: +The spatial table contains unique geographic entities, their locations, and relevant metadata. 
+ +### Columns: +- ID (int, PK): Unique identifier for each geographic object, here goes the ID from source file +- LAT, LON (float): Estimated geographic coordinates, LAT and LON field from source file +- OFFNAME (string): Usually the RUS field goes here (It is the RUS name for the location) -- only if the RUS name is absent in the record itself and absent in the record connected through REFID field, only in that case CHUV name is used to fill the OFFNAME field. +- LANG (string): The language of the name, using codes such as RUS for RUS name and CHU for CHUV name. +- CLASS (string): The OBJCLASS field from source file. +- TYPE (string): The OBJTYPE field from source file. +- DISTRICT (string, nullable): The ABSREF field. +- DOUBT (int, nullable): This field is empty for now. +- LANDMARK (string, nullable): The RELREF field. +- COMMENTS (text, nullable): The GEOCOMMENT field. +- OTHER (text, nullable): This field is empty for now." + +Here is the specification of linguistic table +"## 2. Linguistic Table + +### Purpose: +The linguistic table stores different names (and their variants) of the same geographic object across different languages and sources, etymological information and other linguistic metadata. Every CHUV name and RUS name (toponym) must have its own record in this table. + +### Columns: +- ID (int, PK): Unique identifier for each toponym entry, independent from the spatial table. +- SPATID (int, FK → Spatial.ID): Links the toponym to its corresponding spatial object. +- DOUSPAT (int, nullable): Empty for now. +- MAINID (int, FK → Linguistic.ID): See below, the logic of this field is connected with REFID column from source file +- TOPONYM (string): The attested name as recorded from sources, here goes CHUV field or RUS field +- TOPFORMS (text, nullable): Empty for now. +- DOUTOPO (int, nullable): Empty for now. +- LANG (string): RUS for RUS name or CHU for CHUV name. +- DOULANG (int, nullable): Empty for now. 
+- PRONUNC (text, nullable): Empty for now. +- DOUPRON (int, nullable): Empty for now. +- ETYM (text, nullable): The concatenation of ETYMSAV and ETYM fields. +- ORIGIN (string, nullable): The ORIG field. +- COMMENTS (text, nullable): The concatenation of CORRECTION, HISTORIC, HISTCOMM, DUBANOV, PAGE, COMMIDR, COMMSAV in the format: "CORRECTION: | HISTORIC: | and so on". +- OTHER (text, nullable): Empty for now." + +coordcommenttemp field is not needed + +The logic behind REFID column from the source table goes like that: +- if REFID column is empty, then the RUS name (if not empty) from the record goes to OFFNAME column of spatial table and ID goes to spatial table and linked records from linguistic table +- if it is not empty, it means there are several connected records in the source file with structure like that: +ID_1, LAT, LON, ... REFID_1 +ID_2, empty, empty, ... REFID_2 +ID_3, empty, empty, ... REFID_3 +etc +REFID_1 equals REFID_2 or REFID_3, REFID_2 equals REFID_1 and REFID_3 equals REFID_1 too etc +The source record with the LAT and LON filled is the main one (sometimes it is not the first one, maybe the second or third) others are secondary records: +- its ID goes to the spatial table's ID field +- its RUS name (if it is not present it means an error in the source data) goes to OFFNAME column of spatial table +- filling out of the linguistic table for main and secondary source records requires special care, bear with me -- +the RUS name of the main record needs to be processed first, its linguistic.ID (unique, remember) goes to the MAINID field of its own record (so linguistic.ID equals linguistic.MAINID for the RUS name of the main record) and into all linguistic.MAINID fields of connected names, namely, into linguistic.MAINID field for CHUV name (if present) of the main record and into linguistic.MAINID field for CHUV name and RUS name for all secondary source records + +Remember, official name, the spatial.OFFNAME, is usually in RUS column, only if RUS
# TODO: decide what to do with a REFID nest in which no row has a LAT-LON
# pair (such groups are currently reported and skipped).
"""Split the toponym source CSV into spatial.csv and linguistic.csv.

The source file is ';'-delimited, uses decimal commas in LAT/LON, and links
records that describe the same spatial object through the REFID column:
a non-empty REFID holds the ID of another record in the same file.
"""
import csv
from collections import defaultdict

# Output column order for spatial.csv.
SPATIAL_HEADER = [
    'ID',        # PK: ID of the main source row (the one with coordinates)
    'LAT',       # latitude, decimal point instead of the source's comma
    'LON',       # longitude, same conversion
    'OFFNAME',   # official name: RUS, falling back to CHUV (see OFFNAME rule)
    'LANG',      # language of OFFNAME: RUS or CHU
    'CLASS',     # OBJCLASS
    'TYPE',      # OBJTYPE
    'DISTRICT',  # ABSREF
    'DOUBT',     # empty for now
    'LANDMARK',  # RELREF
    'COMMENTS',  # GEOCOMMENT
    'OTHER',     # empty for now
]

# Output column order for linguistic.csv.
LINGUISTIC_HEADER = [
    'ID',        # PK: auto-increment, independent of spatial IDs
    'SPATID',    # FK -> spatial.ID
    'DOUSPAT',   # empty for now
    'MAINID',    # FK -> linguistic.ID of the object's main RUS name
    'TOPONYM',   # the attested CHUV or RUS name
    'TOPFORMS',  # empty for now
    'DOUTOPO',   # empty for now
    'LANG',      # RUS or CHU
    'DOULANG',   # empty for now
    'PRONUNC',   # empty for now
    'DOUPRON',   # empty for now
    'ETYM',      # ETYMSAV + ETYM
    'ORIGIN',    # ORIG
    'COMMENTS',  # labelled concatenation of CORRECTION, HISTORIC, ...
    'OTHER',     # empty for now
]


def _build_groups(rows_by_id):
    """Partition source rows into connected components of ID<->REFID links.

    Union-find is used so that nests of any size (ID_1 -> ID_2 -> ID_3 ...)
    form a single group; the previous pairwise lookup only supported two-row
    nests and raised IndexError on a REFID pointing at a record without a
    REFID of its own.

    :param rows_by_id: dict mapping int ID -> source row dict (file order)
    :return: list of groups, each a list of row dicts in file order
    """
    parent = {rid: rid for rid in rows_by_id}

    def find(node):
        # Find with path halving.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    for rid, row in rows_by_id.items():
        refid_str = row['REFID'].strip()
        if not refid_str:
            continue
        refid = int(refid_str)
        if refid not in rows_by_id:
            # Dangling reference in the source data: keep the row standalone
            # instead of crashing.
            print(f"REFID {refid} of row {rid} points to a missing record")
            continue
        parent[find(refid)] = find(rid)

    groups = defaultdict(list)
    for rid, row in rows_by_id.items():
        groups[find(rid)].append(row)
    return list(groups.values())


def _etym_field(row):
    """Concatenate the non-empty parts of ETYMSAV and ETYM with a space."""
    parts = (row['ETYMSAV'].strip(), row['ETYM'].strip())
    return " ".join(part for part in parts if part)


def _comments_field(row):
    """Build the labelled COMMENTS string: "CORRECTION: ... | HISTORIC: ...".

    Labels are kept even for empty values so the field layout stays uniform.
    """
    fields = ('CORRECTION', 'HISTORIC', 'HISTCOMM', 'DUBANOV',
              'PAGE', 'COMMIDR', 'COMMSAV')
    return " | ".join(f"{name}: {row[name].strip()}" for name in fields)


def _collect_entries(group_rows):
    """Return one candidate linguistic entry per non-empty CHUV/RUS name."""
    entries = []
    for row in group_rows:
        for lang, name in (('CHU', row['CHUV'].strip()),
                           ('RUS', row['RUS'].strip())):
            if not name:
                continue
            entries.append({
                'row_id': int(row['ID']),
                'lang': lang,
                'toponym': name,
                'etym': _etym_field(row),
                'origin': row['ORIG'].strip(),
                'comments': _comments_field(row),
            })
    return entries


def transform_data(source_file, spatial_file, linguistic_file):
    """Transform source_file into spatial_file and linguistic_file.

    Each spatial object produces one spatial row (taken from the main source
    row, i.e. the one carrying LAT/LON) and one linguistic row per attested
    name.  The main row's RUS name is written first so its linguistic ID can
    serve as MAINID both for itself and for every connected name; when it is
    missing (a data error, or a standalone CHUV-only record) the first
    available name becomes the main entry instead.  Groups in which no row
    has coordinates are skipped with a warning.

    Fixes over the previous version: the fallback path no longer raises
    KeyError on entry['TOPONYM'] (the key is 'toponym'), and REFID nests of
    any size are grouped correctly via union-find.
    """
    # 'utf-8-sig' transparently drops a BOM if the file was saved from Excel.
    with open(source_file, 'r', encoding='utf-8-sig') as f:
        rows_by_id = {int(row['ID'].strip()): row
                      for row in csv.DictReader(f, delimiter=';')}

    groups = _build_groups(rows_by_id)

    with open(spatial_file, 'w', newline='', encoding='utf-8') as fspatial, \
            open(linguistic_file, 'w', newline='', encoding='utf-8') as fling:
        spatial_writer = csv.writer(fspatial, delimiter=';')
        linguistic_writer = csv.writer(fling, delimiter=';')
        spatial_writer.writerow(SPATIAL_HEADER)
        linguistic_writer.writerow(LINGUISTIC_HEADER)

        ling_id = 1  # auto-increment PK for the linguistic table

        for group_rows in groups:
            # The main row of a group is the one with coordinates.
            main_row = next((row for row in group_rows
                             if row['LAT'].strip() and row['LON'].strip()),
                            None)
            if main_row is None:
                # TODO: no row in this REFID nest has a LAT-LON pair.
                print(f"No main row for IDs: {[row['ID'] for row in group_rows]}")
                continue

            spat_id = int(main_row['ID'])
            rus_main = main_row['RUS'].strip()
            chuv_main = main_row['CHUV'].strip()

            # OFFNAME rule: use RUS; use CHUV only when both RUS and REFID
            # are empty.  A missing RUS inside a REFID nest is a data error.
            if rus_main:
                offname, offlang = rus_main, 'RUS'
            elif not main_row['REFID'].strip() and chuv_main:
                offname, offlang = chuv_main, 'CHU'
            else:
                offname, offlang = 'MISSING_RUS', 'RUS'

            spatial_writer.writerow([
                spat_id,
                main_row['LAT'].replace(',', '.').strip(),
                main_row['LON'].replace(',', '.').strip(),
                offname,
                offlang,
                main_row['OBJCLASS'].strip(),
                main_row['OBJTYPE'].strip(),
                main_row['ABSREF'].strip(),
                '',                             # DOUBT
                main_row['RELREF'].strip(),
                main_row['GEOCOMMENT'].strip(),
                '',                             # OTHER
            ])

            entries = _collect_entries(group_rows)

            # Move the main row's RUS name (if any) to the front: the first
            # entry written gets MAINID == its own ID and anchors the rest.
            main_entry = next((e for e in entries
                               if e['row_id'] == spat_id and e['lang'] == 'RUS'),
                              None)
            if main_entry is not None:
                entries.remove(main_entry)
                entries.insert(0, main_entry)
            else:
                print(f"Check the ID: {spat_id} - no main RUS name")

            main_ling_id = None
            for entry in entries:
                if main_ling_id is None:
                    main_ling_id = ling_id  # self-reference for the main name
                linguistic_writer.writerow([
                    ling_id,            # ID
                    spat_id,            # SPATID
                    '',                 # DOUSPAT
                    main_ling_id,       # MAINID
                    entry['toponym'],   # TOPONYM
                    '',                 # TOPFORMS
                    '',                 # DOUTOPO
                    entry['lang'],      # LANG
                    '',                 # DOULANG
                    '',                 # PRONUNC
                    '',                 # DOUPRON
                    entry['etym'],      # ETYM
                    entry['origin'],    # ORIGIN
                    entry['comments'],  # COMMENTS
                    '',                 # OTHER
                ])
                ling_id += 1


if __name__ == "__main__":
    transform_data('source.csv', 'spatial.csv', 'linguistic.csv')