# Source code for twol.words2zerofilled

"""Aligns morphs that occur in the example words by inserting zero symbols

© Kimmo Koskenniemi, 2017-2018. This is free software under the GPL 3 license.
"""

def main():
    """Align the morphs of example words by inserting zero symbols.

    Command-line entry point.  The work is done in four steps:

    1. Read the input CSV file (columns ``MORPHEMES`` and ``MORPHS``) and
       collect the distinct allomorphs of each morpheme name.
    2. Align the allomorphs of each morpheme with
       ``twol.multialign.multialign``.
    3. Build a mapping from each original morph to its zero-filled form.
    4. Write the output CSV file with the columns ``MORPHEMES``,
       ``MORPHS`` and ``ZEROFILLED``.

    All parameters come from the command line (see the argument
    definitions below).  Nothing is returned; progress messages are
    printed to standard output.
    """
    import argparse
    import collections
    import csv

    import twol.cfg as cfg

    version = cfg.timestamp(__file__)

    argparser = argparse.ArgumentParser(
        "python3 words2zerofilled.py",
        description="Aligns a set of word forms with morph boundaries"
        " Version {} ".format(version))
    # NOTE(review): the three positional arguments below are declared
    # without nargs="?", so argparse still requires them and their
    # ``default`` values are never applied.  Kept as is to avoid changing
    # the command-line contract.
    argparser.add_argument(
        "input",
        default="ksk-seg-examp.csv",
        help="morpheme names and segmented example words as a CSV file")
    argparser.add_argument(
        "output",
        default="ksk-alig-examp.csv",
        help="example words plus zero-filled aligned forms as a CSV file")
    argparser.add_argument(
        "alphabet",
        default="alphabet-test.text",
        help="An alphabet definition which determines"
        " the weights for morphophonemes")
    argparser.add_argument(
        "-s", "--morph-separator",
        default=".",
        help="Separator between morphs in the word form, default is '.'")
    argparser.add_argument(
        "-d", "--csv-delimiter",
        default=",",
        help="Delimiter between the fields")
    argparser.add_argument(
        "-n", "--name-separator",
        default=".",
        help="separator between morpheme names"
        " in the morpheme list, default is '.'")
    # NOTE(review): --zero-symbol is accepted but not used anywhere in this
    # function; the zero symbol "Ø" is hard-coded in steps 3 and 4.
    argparser.add_argument(
        "-z", "--zero-symbol",
        default="Ø",
        help="symbol to be inserted in word forms to align them")
    argparser.add_argument(
        "-x", "--extra-zeros", default=0, type=int,
        help="number of extra zeros to be tried in alignment")
    argparser.add_argument(
        "-v", "--verbosity", default=0, type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    cfg.verbosity = args.verbosity

    # STEP 1:
    # Read in the segmented words and collect the allomorphs of each morpheme

    # morphs_of_morpheme[morpheme_name] == ordered list of its unique
    # allomorphs.
    morphs_of_morpheme = {}
    # All example words; each word is a list of (morpheme, morph) pairs.
    seg_example_list = []
    # Names of the stem morphemes, i.e. the first morpheme of each example.
    stem_name_set = set()

    # newline="" is what the csv module asks for; the ``with`` block
    # guarantees the file is closed even if a row raises.
    with open(args.input, newline="") as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=args.csv_delimiter,
                                skipinitialspace=True)
        # ``i`` counts data rows (the header row is not included).
        for i, row in enumerate(reader, start=1):
            # DictReader fills missing trailing fields with None; treat
            # them as empty so that a short row is reported, not a crash.
            morphemes_field = row["MORPHEMES"] or ""
            morphs_field = row["MORPHS"] or ""
            morpheme_list = morphemes_field.strip().split(args.name_separator)
            morph_list = morphs_field.strip().split(args.morph_separator)
            if args.verbosity >= 25:
                print(row["MORPHEMES"])
                print(morpheme_list)
                print(row["MORPHS"])
                print(morph_list)
            if len(morpheme_list) != len(morph_list):
                print("** line", i, ":", row["MORPHEMES"],
                      "is incompatible with", row["MORPHS"])
                continue
            if not morpheme_list:
                continue
            stem_name_set.add(morpheme_list[0])
            name_morph_pair_lst = list(zip(morpheme_list, morph_list))
            if args.verbosity >= 10:
                print("name_morph_pair_lst", name_morph_pair_lst)
            seg_example_list.append(name_morph_pair_lst)
            for morpheme, morph in name_morph_pair_lst:
                if args.verbosity >= 10:
                    print("morpheme, morph:", morpheme, morph)
                morph = morph.strip()
                morphs = morphs_of_morpheme.setdefault(morpheme, [])
                if morph not in morphs:
                    morphs.append(morph)
    if args.verbosity >= 5:
        print("morphs_of_morpheme", morphs_of_morpheme)

    print("-- STEP 1 COMPLETED (seg_example_list, stem_name_set,"
          " morphs_of_morpheme done)--")

    # STEP 2:
    # align the allomorphs of each morpheme

    import twol.multialign as multialign

    multialign.init(args.alphabet, all_zero_weight=1)

    # All aligned morphs.  Key: morpheme name, value: the aligned result
    # taken from the best alignment returned by multialign (an empty list
    # when there is nothing to align or no alignment was found).
    alignments = {}

    for morpheme in sorted(morphs_of_morpheme):
        morphs = morphs_of_morpheme[morpheme]
        # Reset per morpheme so the diagnostic print below never sees an
        # unbound name or a stale value from the previous morpheme.
        aligned_results_lst = []
        aligned_morphs_lst = []
        # A morpheme whose only allomorph is the empty string needs no
        # alignment.
        if morphs != [""]:
            if args.verbosity >= 5:
                print("morphs:", morphs)
            aligned_results_lst = multialign.multialign(
                morphs,
                max_zeros=args.extra_zeros,
                best_count=1)
            if aligned_results_lst:
                _weight, aligned_morphs_lst = aligned_results_lst[0]
        if args.verbosity >= 5:
            print("aligned_results_lst:", aligned_results_lst)
        alignments[morpheme] = aligned_morphs_lst

    print("-- STEP 2 COMPLETED (alignments done) --")

    # STEP 3:
    # Compute the zero filled morphs out of the sequences of aligned symbols

    # aligned_morphs[morpheme][morph] == zero-filled morph
    aligned_morphs = {}

    for morpheme, aligned_morphs_lst in alignments.items():
        # e.g. "KOTA", ['kota', 'koda', 'kotØ', 'kodØ']
        if args.verbosity >= 5:
            print("aligned_morphs_lst:", aligned_morphs_lst)
        if aligned_morphs_lst:
            zero_filled_of_morph = collections.OrderedDict()
            for zerofm in aligned_morphs_lst:
                # The original morph is the zero-filled one minus zeros.
                zero_filled_of_morph[zerofm.replace("Ø", "")] = zerofm
        else:
            zero_filled_of_morph = {"": ""}
        aligned_morphs[morpheme] = zero_filled_of_morph
    if args.verbosity >= 5:
        print("aligned_morphs", aligned_morphs)

    print("-- STEP 3 COMPLETED (aligned_morphs done) --")

    # STEP 4:
    # Write the example word forms plus their zero filled morphs

    # The ``with`` block makes sure the output is flushed and closed.
    with open(args.output, "w", newline="") as out_file:
        writer = csv.DictWriter(out_file,
                                ["MORPHEMES", "MORPHS", "ZEROFILLED"],
                                delimiter=args.csv_delimiter)
        writer.writeheader()
        for seg_example in seg_example_list:
            if args.verbosity >= 20:
                print("seg_example:", seg_example)
            morpheme_lst = [morpheme for morpheme, _ in seg_example]
            morph_lst = [morph for _, morph in seg_example]
            # Step 1 stored the allomorphs stripped of surrounding
            # whitespace, so strip here as well before the lookup;
            # unknown morphs fall back to an empty zero-filled form.
            zero_filled_morph_lst = [
                aligned_morphs[morpheme].get(
                    morph.strip().replace("Ø", ""), "")
                for morpheme, morph in seg_example]
            if args.verbosity >= 20:
                print("zero_filled_morph_lst:", zero_filled_morph_lst)
            writer.writerow({
                "MORPHEMES": args.name_separator.join(morpheme_lst),
                "MORPHS": args.morph_separator.join(morph_lst),
                "ZEROFILLED": args.morph_separator.join(
                    zero_filled_morph_lst),
            })

    print("-- STEP 4 COMPLETED (zero-filled morphs and the CSV file done) --")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
# Source code for twol.words2zerofilled

# twol

# Navigation

# Related Topics