"""A script for converting zero-filled examples into examples with raw morphophonemes
Raw morphophonemes are named according to a set of principal forms
(principal parts) which are assumed to reflect all morphophonemic
alternations in a lexeme. The script assumes that the examples of
every lexeme are listed in the same order with respect to their forms.
The code still relies on the assumption that the stem comes as the
first morph (and thus some minor modifications would be required in
order to handle prefixing languages).
© Kimmo Koskenniemi, 2018. This is free software under GPL 3 license.
"""
def _read_affix_info(path, delimiter):
    """Read principal form names and affix morphophonemes from a CSV file.

    The first field of each row is a feature (or feature combination)
    name. A ``+`` in the second field marks the name as a principal form;
    any other value is stored as the morphophonemic representation of
    that feature.

    Returns a pair ``(principal_lst, feat2mphons)``: the principal form
    names in file order without duplicates, and a dict mapping feature
    names to their morphophonemic representations.

    Raises ValueError for a non-blank row that has fewer than two fields.
    """
    import csv

    principal_lst = []
    feat2mphons = {}
    with open(path, "r", newline="") as afffil:
        affrdr = csv.reader(afffil,
                            delimiter=delimiter,
                            skipinitialspace=True)
        for row in affrdr:
            if len(row) < 2:
                # csv.reader yields [] (or [""]) for blank lines; skip those
                # but refuse rows that carry a name without a second field.
                if not any(field.strip() for field in row):
                    continue
                raise ValueError(
                    "{}, line {}: expected two fields, got {!r}".format(
                        path, affrdr.line_num, row))
            feat, value = row[0], row[1]
            if value == "+":
                if feat not in principal_lst:
                    principal_lst.append(feat)
            else:
                feat2mphons[feat] = value
    return principal_lst, feat2mphons


def _read_zerofilled(path, delimiter, name_separator, morph_separator):
    """Read the zero-filled examples and group them by stem morpheme name.

    The CSV file must have the columns MORPHEMES, MORPHS and ZEROFILLED.
    Rows whose MORPHEMES or ZEROFILLED field is empty are skipped.

    Returns an ordered mapping from the stem morpheme name (the part of
    MORPHEMES before the first name separator) to a list of tuples
    ``(form_name, orig_morphs, zerof_morph_lst)`` in input order, where
    ``form_name`` is the rest of MORPHEMES and ``zerof_morph_lst`` is the
    ZEROFILLED value split once at the morph separator (stem first).
    """
    import collections
    import csv

    stem_morpheme_data = collections.OrderedDict()
    with open(path, "r", newline="") as infil:
        rdr = csv.DictReader(infil,
                             delimiter=delimiter,
                             skipinitialspace=True)
        for row in rdr:
            # DictReader fills the missing fields of a short row with None.
            names = (row["MORPHEMES"] or "").strip()
            orig_morphs = (row["MORPHS"] or "").strip()
            zerof_morphs = (row["ZEROFILLED"] or "").strip()
            if not names or not zerof_morphs:
                continue
            stem_name, _, form_name = names.partition(name_separator)
            zerof_morph_lst = zerof_morphs.split(morph_separator, maxsplit=1)
            stem_morpheme_data.setdefault(stem_name, []).append(
                (form_name, orig_morphs, zerof_morph_lst))
    return stem_morpheme_data


def _raw_stem_symbols(princ_zstem_lst):
    """Combine equally long principal stems into raw morphophoneme symbols.

    For each position the symbols of all principal stems are concatenated.
    If they are all identical the plain symbol is used, otherwise the
    concatenation is wrapped in braces, e.g. ``{sd}``.
    """
    raw_syms = []
    for column in zip(*princ_zstem_lst):
        if len(set(column)) == 1:
            raw_syms.append(column[0])  # abbreviate if all identical
        else:
            raw_syms.append("{" + "".join(column) + "}")
    return raw_syms


def main():
    """Command-line entry point: add a RAW column to zero-filled examples.

    Reads the affix info file and the zero-filled example file named on
    the command line, forms the raw morphophonemes of every stem from its
    principal forms, and writes a CSV file with the columns MORPHEMES,
    MORPHS, ZEROFILLED and RAW.  The output is written with the csv
    module's default delimiter; ``--csv-delimiter`` only affects reading.
    The ``--zero-symbol`` option is accepted but not used here.

    Raises ValueError if a stem has no principal forms among its examples
    or if its zero-filled principal stems differ in length, and KeyError
    if a feature of an example has no entry in the affix info file.
    """
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)
    import argparse
    argparser = argparse.ArgumentParser(
        "python3 zerofilled2raw.py",
        description="Forms raw morphophonemes out of zero-filled"
        " morphs and produces a space-separated pair string"
        " representation for the word suitable for"
        " twol-comp or twol-discov. Version {}".format(version))
    argparser.add_argument(
        "input",
        help="zero-filled example words as a CSV file")
    argparser.add_argument(
        "output",
        help="The output file in CSV format with a new column"
        " where the words are represented with raw"
        " morhpophonemes from zero-filling.")
    argparser.add_argument(
        "affix_info",
        help="Principal forms and morphophonemic affixes as a CSV file")
    argparser.add_argument(
        "-d", "--csv-delimiter",
        default=",",
        help="Delimiter between the fields, default=','")
    argparser.add_argument(
        "-s", "--morph-separator",
        default=".",
        help="Separator between morphs in the word form, default='.'")
    argparser.add_argument(
        "-n", "--name-separator",
        default=".",
        help="Separator between morpheme names in the morpheme list")
    argparser.add_argument(
        "-z", "--zero-symbol",
        default="Ø",
        help="Symbol inserted in word forms to align them")
    argparser.add_argument(
        "-v", "--verbosity",
        default=0,
        type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    import csv

    # principal_lst: the principal forms (principal parts), i.e. the forms
    # which determine the morphophonemic variations within the stem.
    principal_lst, feat2mphons = _read_affix_info(args.affix_info,
                                                  args.csv_delimiter)
    if args.verbosity >= 10:
        print("principal_lst =", principal_lst)
        print("feat2mphons =", feat2mphons)

    stem_morpheme_data = _read_zerofilled(args.input,
                                          args.csv_delimiter,
                                          args.name_separator,
                                          args.morph_separator)

    # newline="" lets the csv module control the line endings itself.
    with open(args.output, "w", newline="") as ofil:
        writer = csv.DictWriter(ofil, fieldnames=["MORPHEMES", "MORPHS",
                                                  "ZEROFILLED", "RAW"])
        writer.writeheader()
        for stem_morpheme, data_lst in stem_morpheme_data.items():
            if args.verbosity >= 10:
                print("*** stem_morpheme, data_lst:", stem_morpheme, data_lst)
            # select the zero-filled stems of the principal forms
            princ_zstem_lst = [zerof_morph_lst[0]
                               for form_name, _, zerof_morph_lst in data_lst
                               if form_name in principal_lst]
            if args.verbosity >= 10:
                print("*** princ_zstem_lst:", princ_zstem_lst)
            if not princ_zstem_lst:
                raise ValueError(
                    "no principal form found for stem morpheme {!r}".format(
                        stem_morpheme))
            if len(set(len(zstem) for zstem in princ_zstem_lst)) > 1:
                raise ValueError(
                    "zero-filled principal stems of {!r} differ in length:"
                    " {}".format(stem_morpheme, princ_zstem_lst))
            zstem_pairsym_str = " ".join(_raw_stem_symbols(princ_zstem_lst))

            # Output the data augmented with the representation with raw
            # morphophonemes.
            for form_name, orig_morphs, zerof_morph_lst in data_lst:
                form_part = (args.name_separator + form_name
                             if form_name else "")
                raw_lst = [zstem_pairsym_str]
                for feat in form_name.split(args.name_separator):
                    try:
                        raw_lst.append(feat2mphons[feat])
                    except KeyError:
                        raise KeyError(
                            "feature {!r} of {!r} has no morphophonemic"
                            " representation in {}".format(
                                feat, stem_morpheme + form_part,
                                args.affix_info)) from None
                # Build a fresh dict for every row: only the four output
                # columns may be passed to the DictWriter.
                writer.writerow({
                    "MORPHEMES": (stem_morpheme + form_part).strip(),
                    "MORPHS": orig_morphs,
                    "ZEROFILLED": args.morph_separator.join(zerof_morph_lst),
                    "RAW": " ".join(raw_lst),
                })
    return
if __name__ == "__main__":
    # Run the converter only when executed as a script, not on import.
    main()