Source code for twol.table2words

"""Reformats a paradigm table into a CSV file with one word form per row

© Kimmo Koskenniemi, 2017-2019

This is free software under the GPL 3 license.
"""

def step1():
    """Convert a paradigm table CSV into a one-word-form-per-row CSV.

    Command-line arguments are parsed with ``argparse`` (input path,
    output path and a few separator options).  The input is read with
    ``csv.DictReader``.  Rows whose ``ID`` value starts with ``?`` are
    skipped.  Within a row, only non-empty cells whose column label
    contains ``STM`` (and is not ``ID`` or ``KSK``) are processed:

    * the column label is split at the name separator and, when its first
      component is ``STM``, that component is replaced by the row's ``ID``;
    * square brackets and parentheses are removed from the cell, which is
      then split at whitespace into individual word forms;
    * word forms containing ``*`` are skipped.

    Every remaining word form is written to the output as one row with
    the fields ``MORPHEMES`` (the re-joined label) and ``MORPHS`` (the
    word form), preceded by a header row.

    The ``--morph-separator`` and ``--zero-symbol`` options are accepted
    but not used by this function.

    Returns:
        None.
    """
    import argparse
    import csv
    import re

    import twol.cfg as cfg

    version = cfg.timestamp(__file__)

    argparser = argparse.ArgumentParser(
        "python3 paratab2segcsv.py",
        description="Converts a tabular csv paradigm into"
        " one example per row CSV file. Version {} ".format(version))
    # NOTE: argparse applies ``default`` to a positional argument only when
    # it is declared with nargs='?' or '*', so both paths must still be
    # given on the command line.
    argparser.add_argument(
        "input",
        default="ksk-paradigms.csv",
        help="Paradigm table as a CSV file")
    argparser.add_argument(
        "output",
        default="ksk-seg-examp.csv",
        help="One example per row paradigm as a CSV file")
    argparser.add_argument(
        "-s", "--morph-separator",
        default=".",
        help="Boundary between the morphs in a table cell")
    argparser.add_argument(
        "-d", "--csv-delimiter",
        default=",",
        help="CSV delimiter between the two fields, default is ','")
    argparser.add_argument(
        "-n", "--name-separator",
        default=".",
        help="Separator between morpheme names"
        " in the morpheme list, default is '.'")
    argparser.add_argument(
        "-z", "--zero-symbol",
        default="Ø",
        help="Symbol to be inserted in word forms in order to"
        " align them, default is Ø. \nYou are discouraged to change it.")
    args = argparser.parse_args()

    # Loop invariants: compile the patterns and build the skip set once.
    bracket_re = re.compile(r'[][()]')
    whitespace_re = re.compile(r"\s+")
    ignored_labels = {"ID", "KSK"}

    # The input is opened first so that an unreadable input does not
    # truncate an existing output file.  ``newline=""`` is what the csv
    # module expects for both reading and writing; without it the writer
    # produces doubled carriage returns on Windows.  The ``with`` statement
    # closes (and flushes) both files even if processing fails midway.
    with open(args.input, "r", newline="") as csvfile, \
            open(args.output, "w", newline="") as out_file:
        reader = csv.DictReader(csvfile,
                                delimiter=args.csv_delimiter,
                                skipinitialspace=True)
        writer = csv.DictWriter(out_file,
                                ["MORPHEMES", "MORPHS"],
                                delimiter=args.csv_delimiter)
        writer.writeheader()

        for row in reader:
            if row["ID"].startswith("?"):
                continue
            # Process each cell of the row.
            for column_label, words in row.items():
                if (not words
                        or column_label in ignored_labels
                        or "STM" not in column_label):
                    continue
                morpheme_list = column_label.split(args.name_separator)
                if morpheme_list[0] == 'STM':
                    morpheme_list[0] = row['ID']
                # Identical for every word form of this cell.
                morphemes = args.name_separator.join(morpheme_list).strip()

                words_clean = bracket_re.sub('', words)
                for morphs in whitespace_re.split(words_clean):
                    if not morphs or '*' in morphs:
                        continue
                    writer.writerow({"MORPHEMES": morphemes,
                                     "MORPHS": morphs})
# Run the conversion only when this module is executed as a script.
if __name__ == "__main__":
    step1()