"""A module for reading two-level examples
The examples are assumed to be in a space-separated one-level
representation, and they are compiled into a single automaton.
At the same time, the alphabet used in the examples is
collected in several forms.
cfg.examples_fst -- the transducer which accepts exactly the examples
cfg.symbol_pair_set -- a set of (input, output) symbol string pairs suitable
for e.g. hfst.rules.restriction
"""
import re
import hfst_dev as hfst
import twol.cfg as cfg
import twol.twbt as twbt
def pairs_to_fst(pair_set):
    """Build an FST that accepts any single symbol pair from *pair_set*.

    *pair_set* is an iterable of symbol pairs.  Every pair is added as
    its own one-step path to an iterable transducer, which is then
    converted to a ``hfst.HfstTransducer``, stripped of epsilons and
    minimized before being returned.
    """
    union_bfst = hfst.HfstIterableTransducer()
    for sym_pair in pair_set:
        # disjunct() expects a tokenized path (a tuple of pairs); each
        # path here consists of exactly one pair.
        one_step_path = (sym_pair,)
        union_bfst.disjunct(one_step_path, 0)
    result = hfst.HfstTransducer(union_bfst)
    result.remove_epsilons()
    result.minimize()
    return result
def read_fst(filename="examples.fst"):
    """Read a previously stored example FST and rebuild the alphabet sets.

    The transducer read from *filename* is stored in ``cfg.examples_fst``.
    Its ``x-pair_symbols`` property (written by ``read_examples``) is
    split on spaces, and every pair symbol found is recorded in
    ``cfg.pair_symbol_set``, ``cfg.symbol_pair_set``,
    ``cfg.input_symbol_set`` and ``cfg.output_symbol_set``.  Finally
    ``cfg.all_pairs_fst`` is rebuilt from ``cfg.symbol_pair_set``.

    Exits the program (``SystemExit``) with a message if *filename*
    does not exist.
    """
    import os
    import sys

    if not os.path.isfile(filename):
        # The message must be formatted with str.format(); passing the
        # name as a second argument to exit() raised a TypeError instead
        # of reporting the missing file.
        sys.exit("EXAMPLE FST FILE {} DOES NOT EXIST".format(filename))
    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Release the underlying file even if reading fails.
        exfile.close()
    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    pair_symbol_lst = re.split(r" +", pair_symbols)
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        insym, outsym = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
    cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
def read_examples(filename_lst=["test.pstr"], build_fsts=True):
    """Read pair-string examples from files and collect them in ``cfg``.

    Each file named in *filename_lst* contains one example per line,
    given as a whitespace-separated sequence of pair symbols.  Blank
    lines and lines starting with ``!`` are skipped, and anything after
    the first ``!`` on a line is dropped as a comment.  A line with a
    pair symbol that yields an empty input or output symbol is reported
    and skipped.

    Results are stored in the ``cfg`` module:

    * ``cfg.example_lst`` / ``cfg.example_set`` -- the examples as
      space-normalized pair-symbol strings
    * ``cfg.symbol_pair_set``, ``cfg.pair_symbol_set``,
      ``cfg.input_symbol_set``, ``cfg.output_symbol_set`` -- the alphabet
    * when *build_fsts* is true, also ``cfg.examples_fst`` (the union of
      all examples, named after the last file and carrying the
      ``x-pair_symbols`` property) and ``cfg.all_pairs_fst``

    Exits the program (``SystemExit``) if any listed file is missing.

    The default list is never mutated here, so sharing it between calls
    is harmless.
    """
    import fileinput
    import os
    import sys

    for fname in filename_lst:
        if not os.path.isfile(fname):
            sys.exit("EXAMPLE FILE {} DOES NOT EXIST".format(fname))
    if build_fsts:
        examples_bfst = hfst.HfstIterableTransducer()
    # The context manager closes the input files even if a line fails.
    with fileinput.input(filename_lst) as lines:
        for line_nl in lines:
            line = line_nl.strip()
            if not line or line.startswith("!"):
                continue
            # Drop a trailing "! comment" part of the line.
            line = line.split("!", maxsplit=1)[0].strip()
            pairsym_lst = re.split(r"\s+", line)
            symbol_pair_lst = [cfg.pairsym2sympair(pairsym)
                               for pairsym in pairsym_lst]
            if not all(insym and outsym
                       for insym, outsym in symbol_pair_lst):
                print("*** example contains an invalid pair symbol")
                print(line)
                continue
            if cfg.verbosity >= 30:
                print("symbol_pair_lst:", symbol_pair_lst)
            pair_symbol_str = " ".join(cfg.sympair2pairsym(insym, outsym)
                                       for insym, outsym in symbol_pair_lst)
            if cfg.verbosity >= 30:
                print("pair_symbol_str:", pair_symbol_str)
            cfg.example_lst.append(pair_symbol_str)
            cfg.example_set.add(pair_symbol_str)  # spaces normalized
            if build_fsts:
                examples_bfst.disjunct(symbol_pair_lst, 0)
            cfg.symbol_pair_set.update(symbol_pair_lst)
    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))
    if build_fsts:
        cfg.all_pairs_fst = pairs_to_fst(cfg.symbol_pair_set)
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename_lst[-1])
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False, title="Example file as FST")
    # Derive the input, output and pair-symbol alphabets in one pass.
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
        cfg.pair_symbol_set.add(cfg.sympair2pairsym(insym, outsym))
    if build_fsts:
        # Stored in the explicit "in:out" form; read_fst() splits this
        # property on spaces and passes each item to cfg.pairsym2sympair.
        pair_symbol_lst = [insym + ':' + outsym
                           for insym, outsym in cfg.symbol_pair_set]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
def main():
    """Command-line entry point of ``twexamp.py``.

    Converts examples in *pair string* format into a
    :term:`finite-state transducer` (FST).  Pair string files are plain
    human readable text files with one example per line, where each
    example is given as a space-separated sequence of pair symbols,
    e.g.::

        k a u p {pØ}:Ø {ao}:a s s {aä}:a

    The input files are given with ``-i`` and the optional output FST
    file with ``-o``, e.g.::

        $ twol-examp -i examples.pstr -o examples.fst

    Without ``-o`` the examples are read and compiled but no file is
    written.
    """
    import argparse

    parser = argparse.ArgumentParser("python3 twexamp.py")
    parser.add_argument(
        "-i", "--input",
        nargs='+', action='store',
        default=["examples.pstr"],
        help="list of example pair strings files")
    parser.add_argument(
        "-o", "--output",
        default="",
        help="file to which write the example FST")
    parser.add_argument(
        "-v", "--verbosity",
        type=int, default=0,
        help="level of diagnostic output")
    options = parser.parse_args()

    cfg.verbosity = options.verbosity
    read_examples(options.input, build_fsts=True)

    # Nothing more to do unless an output file was requested.
    if not options.output:
        return
    out_stream = hfst.HfstOutputStream(filename=options.output)
    out_stream.write(cfg.examples_fst)
    out_stream.flush()
    out_stream.close()
    print("--- example fst written to ", options.output, " ---")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()