Source code for twol.metric
"""metric.py
Produces a weighted finite-state transducer (WFST) out of the alphabet
definition as interpreted by alphabet.py. The WFST is then used by the
aligner.py program.
Copyright 2020, Kimmo Koskenniemi
This is free software according to GNU GPL 3 license.
"""
import sys
import re
import twol.alphabet as alphabet
import twol.cfg as cfg
import hfst_dev as hfst
# Module-level containers, created empty when the module is imported.
# NOTE(review): neither alignment_fst() nor main() in this module reads or
# writes any of these three names; presumably they are used from elsewhere
# or are leftovers -- confirm before removing.
within_set_lst = []
forall_lst = []
pair_weight_dict = {}
def _pair_weight_items(symbol_set):
    """Return the weighted pair expressions for one set of symbols.

    For every symbol ``s`` of *symbol_set* the candidates are tried in this
    order: the pair ``s:Ø``, the pair ``Ø:s``, and then ``s:t`` for every
    ``t`` in the same set.  A candidate is kept only if
    ``alphabet.mphon_is_valid`` accepts the corresponding two-symbol
    string; its weight comes from ``alphabet.mphon_weight``.

    Each returned item is a string of the form ``in:out::weight``.
    """
    items = []
    for insym in symbol_set:
        # (string to validate, input symbol, output symbol); note that the
        # validated string for ``insym:Ø`` is "Ø" + insym, as in the
        # original code.
        candidates = [
            ("Ø" + insym, insym, "Ø"),
            (insym + "Ø", "Ø", insym),
        ]
        candidates.extend(
            (insym + outsym, insym, outsym) for outsym in symbol_set)
        for mphon, left, right in candidates:
            if alphabet.mphon_is_valid(mphon):
                items.append("{}:{}::{}".format(
                    left, right, alphabet.mphon_weight(mphon)))
    return items


def alignment_fst():
    """Build the weighted alignment FST from the loaded alphabet.

    The regular expression handed to ``hfst.regex`` is the union of

    * the weighted symbol pairs among consonants and among vowels
      (including pairs with ``Ø``),
    * every expression of ``alphabet.for_definitions_lst`` expanded once
      for each symbol of the set it names ("Consonants" or "Vowels",
      without ``Ø``), and
    * the expressions in ``alphabet.exception_lst``.

    Parts that turn out empty are left out of the union, so an alphabet
    without ``for`` definitions or without exceptions does not produce a
    regular expression with an empty operand (``...||`` or a trailing
    ``|``).  Because the result is Kleene-starred, omitting an empty
    operand does not change the accepted relation.

    Returns:
        The transducer after ``repeat_star()`` and ``minimize()``.

    Raises:
        KeyError: if a ``for`` definition names a set other than
            "Consonants" or "Vowels".
    """
    pair_weight_lst = _pair_weight_items(alphabet.consonant_set)
    pair_weight_lst.extend(_pair_weight_items(alphabet.vowel_set))
    pair_weight_str = "|".join(pair_weight_lst)
    if cfg.verbosity >= 20:
        print("\npair_weight_str:", pair_weight_str)

    # Sets a ``for`` definition may iterate over; Ø itself is excluded.
    loop_sets = {
        "Consonants": alphabet.consonant_set - {"Ø"},
        "Vowels": alphabet.vowel_set - {"Ø"},
    }
    if cfg.verbosity >= 20:
        print("\nloop_sets:", loop_sets)

    for_all_lst = []
    for expr, var, loop_set in alphabet.for_definitions_lst:
        if cfg.verbosity >= 25:
            print(expr, var, loop_set)
        for x in loop_sets[loop_set]:
            if cfg.verbosity >= 25:
                print(x, var, loop_set)
            # NOTE(review): ``var`` is used as a regex pattern and ``x`` as
            # a replacement template; this presumes neither contains regex
            # metacharacters or backslashes -- confirm against alphabet.py.
            item_expr = re.sub(var, x, expr)
            if cfg.verbosity >= 25:
                print("item_expr:", item_expr)
            for_all_lst.append(item_expr)
    for_all_str = "|".join(for_all_lst)
    if cfg.verbosity >= 20:
        print("\nfor_all_str:", for_all_str)

    exceptions_str = " | ".join(alphabet.exception_lst)
    if cfg.verbosity >= 20:
        print("\nexceptions_str:", exceptions_str)

    # Join only the non-empty parts so that the union never contains an
    # empty operand.
    parts = [part
             for part in (pair_weight_str, for_all_str, exceptions_str)
             if part.strip()]
    fst = hfst.regex("|".join(parts))
    fst.repeat_star()
    fst.minimize()
    return fst
def main():
    """Command-line entry point: build the metric FST and write it to a file.

    Parses the command line (positional ``alphabet`` and ``metrics``,
    optional ``-v/--verbosity``), stores the verbosity and the all-zero
    weight in ``cfg``, reads the alphabet definition, builds the FST with
    ``alignment_fst()`` and writes it to the ``metrics`` file.

    The output stream is closed even if writing fails.
    """
    import argparse

    version = cfg.timestamp(__file__)
    # The description is reproduced verbatim; argparse's default formatter
    # re-wraps it when printing the help text.
    arpar = argparse.ArgumentParser(
        "twol-metric",
        description="""Builds a distance metric FST out of an alphabet
description. See
https://pytwolc.readthedocs.io/en/latest/alignment.html
for detailed instructions. Version {}""".format(version)
    )
    arpar.add_argument(
        "alphabet",
        help="An alphabet definition with features and similarity sets")
    arpar.add_argument(
        "metrics",
        help="FST which contains weights for preferring alternative alignments")
    arpar.add_argument(
        "-v", "--verbosity",
        help="Level of diagnostic output printed, default=0",
        type=int, default=0)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    # NOTE(review): presumably the weight given to an all-Ø correspondence;
    # confirm against how alphabet.py uses cfg.all_zero_weight.
    cfg.all_zero_weight = 1000

    alphabet.read_alphabet(args.alphabet)
    fst = alignment_fst()

    fstfile = hfst.HfstOutputStream(filename=args.metrics)
    try:
        fstfile.write(fst)
        fstfile.flush()
    finally:
        # Release the output file even when write/flush raises.
        fstfile.close()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()