import csv
# Headers for the spatial output table.
_SPATIAL_HEADER = [
    'ID',        # PK
    'LAT',       # float
    'LON',       # float
    'OFFNAME',   # official name (string)
    'LANG',      # language of OFFNAME (string)
    'CLASS',     # string
    'TYPE',      # string
    'DISTRICT',  # string (nullable)
    'DOUBT',     # int (nullable) - left empty
    'LANDMARK',  # from RELREF
    'COMMENTS',  # from GEOCOMMENT
    'OTHER',     # empty for now
]

# Headers for the linguistic output table.
_LINGUISTIC_HEADER = [
    'ID',        # PK
    'SPATID',    # FK -> Spatial.ID
    'DOUSPAT',   # int (nullable) - left empty
    'MAINID',    # FK -> Linguistic.ID (the group's main RUS name)
    'TOPONYM',   # name
    'TOPFORMS',  # text (nullable) - left empty
    'DOUTOPO',   # int (nullable) - left empty
    'LANG',      # RUS or CHU
    'DOULANG',   # int (nullable) - left empty
    'PRONUNC',   # text (nullable) - left empty
    'DOUPRON',   # int (nullable) - left empty
    'ETYM',      # ETYMSAV + ETYM
    'ORIGIN',    # ORIG field
    'COMMENTS',  # CORRECTION + HISTORIC + ...
    'OTHER',     # empty for now
]

# Source columns folded into the linguistic COMMENTS field, in output order.
_COMMENT_FIELDS = ('CORRECTION', 'HISTORIC', 'HISTCOMM', 'DUBANOV',
                   'PAGE', 'COMMIDR', 'COMMSAV')


def _group_rows(rows):
    """Group source rows by ID, pairing rows linked through REFID.

    Returns {leading_row_id: [row] or [row, referenced_row]}.  When two rows
    reference each other, the pair is emitted once, keyed by whichever row
    appears first in the file; the partner's own key is suppressed.
    """
    # ID -> (REFID as int or None, row dict)
    by_id = {}
    for row in rows:
        row_id = int(row['ID'].strip())
        refid_str = row['REFID'].strip()
        by_id[row_id] = (int(refid_str) if refid_str else None, row)

    groups = {}
    for row_id, (refid, row) in by_id.items():
        if refid is None:
            groups[row_id] = [row]
        elif refid in groups:
            # The referenced row already leads a group; don't emit the same
            # pair twice.  NOTE(review): for a one-directional REFID this
            # drops the referring row entirely -- same as the original
            # behaviour; confirm that the data only has mutual REFID pairs.
            continue
        elif refid not in by_id:
            # BUG FIX: a REFID pointing at a missing ID raised KeyError in
            # the original; degrade to a standalone group with a warning.
            print(f"Dangling REFID {refid} on ID {row_id}")
            groups[row_id] = [row]
        else:
            # BUG FIX: the original indexed [1] into the referenced entry and
            # raised IndexError when the referenced row had no REFID itself;
            # the (refid, row) tuple layout makes the row lookup always safe.
            groups[row_id] = [row, by_id[refid][1]]
    return groups


def _find_main_row(group_rows):
    """Pick a group's main row.

    Preference order: first row with both LAT and LON, else first row with a
    RUS name, else first row with a CHUV name.  Returns None when no row
    qualifies.
    """
    for qualifies in (
        lambda r: r['LAT'].strip() and r['LON'].strip(),
        lambda r: r['RUS'].strip(),
        lambda r: r['CHUV'].strip(),
    ):
        for row in group_rows:
            if qualifies(row):
                return row
    return None


def _official_name(main_row):
    """Return (OFFNAME, LANG) for the spatial row.

    The RUS name wins when present.  The CHUV name is used only when the row
    has neither a RUS name nor a REFID (the CHUV fallback rule).  Otherwise a
    MISSING_RUS marker is emitted so the error case stays visible.
    """
    rus = main_row['RUS'].strip()
    if rus:
        return rus, 'RUS'
    chuv = main_row['CHUV'].strip()
    if chuv and not main_row['REFID'].strip():
        return chuv, 'CHU'
    return 'MISSING_RUS', 'RUS'


def _spatial_row(main_row, spat_id):
    """Build the output row for the spatial table from the group's main row."""
    offname, offlang = _official_name(main_row)
    return [
        spat_id,
        main_row['LAT'].replace(',', '.').strip(),  # normalise decimal comma
        main_row['LON'].replace(',', '.').strip(),
        offname,
        offlang,
        main_row['OBJCLASS'].strip(),
        main_row['OBJTYPE'].strip(),
        main_row['ABSREF'].strip(),   # DISTRICT
        '',                           # DOUBT is empty
        main_row['RELREF'].strip(),   # LANDMARK
        main_row['GEOCOMMENT'].strip(),
        '',                           # OTHER is empty for now
    ]


def _linguistic_entries(group_rows):
    """Collect one entry per non-empty CHUV/RUS name found in the group."""
    entries = []
    for row in group_rows:
        row_id = int(row['ID'])
        # Each row can contribute up to two names (CHUV first, then RUS).
        for lang_code, column in (('CHU', 'CHUV'), ('RUS', 'RUS')):
            toponym = row[column].strip()
            if not toponym:
                continue
            comments = " | ".join(
                f"{name}: {row[name].strip()}" for name in _COMMENT_FIELDS)
            # ETYM = ETYMSAV + ETYM, space-joined, skipping empty parts.
            etym = " ".join(
                part for part in (row['ETYMSAV'].strip(), row['ETYM'].strip())
                if part)
            entries.append({
                'row_id': row_id,
                'lang': lang_code,
                'toponym': toponym,
                'etym': etym,
                'origin': row['ORIG'].strip(),
                'comments': comments,
            })
    return entries


def transform_data(source_file, spatial_file, linguistic_file):
    """Split a combined toponym CSV into spatial and linguistic tables.

    Reads ``source_file`` (';'-delimited, utf-8-sig), groups rows linked
    through their REFID column, elects a main row per group, and writes one
    spatial row per group to ``spatial_file`` plus one linguistic row per
    non-empty RUS/CHUV name in the group to ``linguistic_file``.

    :param source_file: path to the source CSV
    :param spatial_file: path for the spatial output CSV (overwritten)
    :param linguistic_file: path for the linguistic output CSV (overwritten)
    """
    with open(source_file, 'r', encoding='utf-8-sig') as f:
        rows = list(csv.DictReader(f, delimiter=';'))
    groups = _group_rows(rows)

    with open(spatial_file, 'w', newline='', encoding='utf-8-sig') as fspatial, \
            open(linguistic_file, 'w', newline='', encoding='utf-8-sig') as fling:
        spatial_writer = csv.writer(fspatial, delimiter=';')
        linguistic_writer = csv.writer(fling, delimiter=';')
        spatial_writer.writerow(_SPATIAL_HEADER)
        linguistic_writer.writerow(_LINGUISTIC_HEADER)

        linguistic_id = 1  # auto-increment PK for the linguistic table
        for group_key, group_rows in groups.items():
            main_row = _find_main_row(group_rows)
            if main_row is None:
                # BUG FIX: the original printed this warning and then crashed
                # with a TypeError on ``int(main_row['ID'])``; skip the
                # unusable group instead (it has no names to export anyway).
                print(f"\n\nNo main row for ID: {group_key}\n\n")
                continue

            spat_id = int(main_row['ID'])
            spatial_writer.writerow(_spatial_row(main_row, spat_id))

            entries = _linguistic_entries(group_rows)
            # The main row's RUS name (if any) is inserted first; its new PK
            # becomes the MAINID every other name in the group points at.
            main_rus = main_row['RUS'].strip()
            main_entry = next(
                (e for e in entries
                 if e['row_id'] == spat_id and e['lang'] == 'RUS'
                 and e['toponym'] == main_rus
                 and main_rus not in ('', 'MISSING_RUS')),
                None)
            if main_entry is not None:
                ordered = [main_entry] + [e for e in entries
                                          if e is not main_entry]
            else:
                ordered = entries

            the_mainid = None
            for entry in ordered:
                if the_mainid is None and main_entry is None:
                    # No main RUS name exists (error scenario): the first
                    # inserted name becomes its own MAINID.
                    print(f"Check the ID: {entry['row_id']}-{entry['toponym']}")
                current_id = linguistic_id
                linguistic_id += 1
                if the_mainid is None:
                    the_mainid = current_id  # first row references itself
                linguistic_writer.writerow([
                    current_id,          # ID
                    spat_id,             # SPATID
                    '',                  # DOUSPAT
                    the_mainid,          # MAINID
                    entry['toponym'],    # TOPONYM
                    '',                  # TOPFORMS
                    '',                  # DOUTOPO
                    entry['lang'],       # LANG
                    '',                  # DOULANG
                    '',                  # PRONUNC
                    '',                  # DOUPRON
                    entry['etym'],       # ETYM
                    entry['origin'],     # ORIGIN
                    entry['comments'],   # COMMENTS
                    '',                  # OTHER
                ])
# Example usage:
# transform_data('source.csv', 'spatial.csv', 'linguistic.csv')
#
# The block below runs the transformation when this file is executed as a
# standalone script; adjust the filenames as needed:
if __name__ == "__main__":
    # Script entry point: transform the default source file into the two
    # output tables in the current working directory.
    transform_data('source.csv', 'spatial.csv', 'linguistic.csv')