# Source code for twol.twolcomp

# twol.py
# =======
# A compiler and tester for simplified two-level rules.
# Copyright (c) Kimmo Koskenniemi, 2019
# This program is free software according to the GPL 3 license
#
import sys

import re

import fileinput

import hfst as hfst

import twol.cfg as cfg

import twol.twbt as twbt

import twol.twexamp as twexamp

import twol.twrule as twrule

from twol.twparser import init as twparser_init

from twol.twparser import parse_rule




def _build_argument_parser(version):
    """Return the argparse parser for the two-level rule compiler.

    version -- string that is inserted into the program description
    """
    import argparse  # kept as a local import, as in the original main()

    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."
        " Version {}."
        " See https://pytwolc.readthedocs.io/en/latest/index.html"
        " or https://github.com/koskenni/twol"
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="Either one name of a FST file that contains the examples or"
        " a list of names of files which contain the PSTR form examples"
        " used for compiling the rules.",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="One or more files which contain the rules,"
        " either just one rule file or a file of defines"
        " as the first one and a part of the whole rule set"
        " as the second",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="File to which write the compiled rules if a name is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="File to which write the examples"
        " that were not accepted by all rules"
        " -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which write the wrong strings"
        " that are accepted by all rules -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of diagnostic output",
        type=int, default=0)
    return arpar


def _write_fsts(filename, fsts):
    """Write the transducers in fsts to the file filename as a HFST stream.

    The stream is closed even if writing one of the transducers fails.
    """
    stream = hfst.HfstOutputStream(filename=filename)
    try:
        for fst in fsts:
            stream.write(fst)
        stream.flush()
    finally:
        stream.close()


def main():
    """Compile two-level rules and test them against the examples.

    Command-line entry point.  The function

    1. parses the command-line options,
    2. reads the examples, either from a single ``.fst`` file or from one
       or more example files,
    3. reads the rule files line by line, joins the lines of a rule up to
       the terminating ``;``, parses the rule and compiles it with the
       ``twrule`` function that corresponds to the rule operator,
    4. depending on ``--thorough``, tests each compiled rule against the
       positive examples (level 1) and against negative examples
       (level 2) and prints the outcome, and
    5. optionally writes the lost examples (``--lost``), the wrongly
       accepted strings (``--wrong``) and the compiled rules
       (``--output``) into files.

    In the rule files, the part of a line after ``!`` is ignored, and
    lines from a ``STOP`` line up to the next ``START`` line are skipped.

    The function returns None.  A missing ``--rules`` option is reported
    through the argument parser, i.e. the program exits with a usage
    message.
    """
    version = cfg.timestamp(__file__)
    arpar = _build_argument_parser(version)
    args = arpar.parse_args()

    # The default of --rules is [None]; fileinput cannot open None, so
    # report the missing option at once instead of failing with a
    # TypeError after the examples have already been processed.
    if args.rules == [None]:
        arpar.error("no rule files were given, use -r/--rules")

    cfg.verbosity = args.verbosity
    if args.recursion:
        sys.setrecursionlimit(args.recursion)

    # The default of --examples is [None], so the first item must be
    # checked to be a string before its suffix can be tested.
    first_example = args.examples[0]
    if (len(args.examples) == 1
            and isinstance(first_example, str)
            and first_example.endswith(".fst")):
        twexamp.read_fst(first_example)
    else:
        twexamp.read_examples(args.examples)
    # NOTE(review): cfg.examples_fst is used below; presumably it is set
    # by the twexamp.read_* call above -- confirm against twexamp.

    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    parser = twparser_init()

    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

    twrule.init()

    # Rule operator --> function which compiles a rule of that type.
    rule_compilers = {
        "=>": twrule.rightarrow,
        "<=": twrule.output_coercion,
        "<--": twrule.input_coercion,
        "<=>": twrule.doublearrow,
        "/<=": twrule.center_exclusion,
    }

    skip = False
    all_rules_fst_lst = []
    line_lst = []        # stripped lines of the rule being collected
    line_nl_lst = []     # the raw lines read since the rule started

    with fileinput.input(args.rules) as rule_lines:
        for line_no, line_nl in enumerate(rule_lines, start=1):
            if not line_lst:
                line_nl_lst = []
            line_nl_lst.append(line_nl)
            # Everything after the first "!" is a comment.
            line = line_nl.split('!', maxsplit=1)[0].strip()
            if line == "START":
                skip = False
                continue
            elif line == "STOP":
                skip = True
            if skip or (not line) or line.startswith("!"):
                continue
            line_lst.append(line)
            if not line.endswith(";"):
                continue        # the rule continues on the next line
            rule_str = " ".join(line_lst)
            line_lst = []

            op, left, right = parse_rule(parser, rule_str,
                                         line_no, line_nl_lst)
            if op == "?" or not (left and right):
                continue

            if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
                print("\n")
                print(rule_str)

            if op == "=":
                # Nothing is compiled here for "=", only optional output.
                if cfg.verbosity >= 10:
                    print(left, op)
                    twbt.ppfst(right)
                continue
            compile_rule = rule_compilers.get(op)
            if compile_rule is None:
                print("Error: not a valid type of a rule", op)
                continue
            # NOTE(review): `line` is only the last physical line of a
            # rule which spans several lines; confirm whether `rule_str`
            # was intended as the first argument.
            R, selector_fst, MIXe = compile_rule(line, left, *right)
            R.set_name(rule_str[:30])
            if cfg.verbosity >= 10:
                twbt.ppfst(R)
            if args.lost or args.wrong or args.output:
                all_rules_fst_lst.append(R)

            if args.thorough > 0:
                # Examples picked by the selector must all pass the rule.
                selector_fst.intersect(cfg.examples_fst)
                selector_fst.minimize()
                if cfg.verbosity >= 20:
                    paths = selector_fst.extract_paths(output='raw')
                    # NOTE(review): print_raw_paths is neither defined nor
                    # imported in the visible part of this module --
                    # confirm where it comes from.
                    print_raw_paths(paths[0:20])
                passed_pos_examples_fst = selector_fst.copy()
                passed_pos_examples_fst.intersect(R)
                if passed_pos_examples_fst.compare(selector_fst):
                    print("All positive examples accepted")
                else:
                    lost_examples_fst = selector_fst.copy()
                    lost_examples_fst.minus(passed_pos_examples_fst)
                    lost_examples_fst.minimize()
                    print("** Some positive examples were rejected:")
                    lost_paths = lost_examples_fst.extract_paths(
                        output='raw')
                    print_raw_paths(lost_paths[0:20])

            if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
                # Build negative examples out of the examples and MIXe,
                # and remove those which are in the examples themselves.
                neg_examples_fsa = examples_fsa.copy()
                neg_examples_fsa.compose(MIXe)
                neg_examples_fsa.output_project()
                neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa,
                                                   separator="^")
                neg_examples_fst.minus(cfg.examples_fst)
                NG = examples_up_fsa.copy()
                NG.compose(neg_examples_fst)
                passed_neg_examples_fst = NG.copy()
                passed_neg_examples_fst.intersect(R)
                if passed_neg_examples_fst.compare(hfst.empty_fst()):
                    print("All negative examples rejected")
                else:
                    print("** Some negative examples accepted:")
                    npaths = passed_neg_examples_fst.extract_paths(
                        output='raw')
                    print_raw_paths(npaths[0:20])

    if args.lost or args.wrong:
        # Apply all compiled rules together to the input side of the
        # examples.
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        _write_fsts(args.lost, [lost_positive_examples_fst])
        print("wrote lost examples to", args.lost)
    if args.wrong:
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        _write_fsts(args.wrong, [WRONG])
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        _write_fsts(args.output, all_rules_fst_lst)
        print("wrote {} rule transducers to {}".format(
            len(all_rules_fst_lst), args.output))
    return
# Run the compiler only when this module is executed as a script.
if __name__ == "__main__":
    main()