Source code for twol.twparser

# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

import sys, re, json

from codecs import open

from pprint import pprint

import tatsu
from tatsu import compile
from tatsu.ast import AST
#from tatsu import ast
#from ast import AST
from tatsu.walkers import NodeWalker
#from tatsu import walkers
#from walkers import NodeWalker
from tatsu.exceptions import ParseException, FailedParse, ParseError, FailedSemantics

import hfst_dev as hfst

import twol.cfg as cfg

import twol.twexamp as twexamp

[docs]class DiscovDefSemantics(object):
[docs] def define(self, ast): cfg.definitions[ast.left] = ast.right return ("=", ast.left, ast.right)
[docs] def identifier(self, ast): string = ast.token.strip() return string
[docs] def union(self, ast): return ast.left | ast.right
[docs] def intersection(self, ast): return ast.left & ast.right
[docs] def difference(self, ast): return ast.left - ast.right
[docs] def Morphophonemic(self, ast): """Surface completion Returns a set which contains valid pair symbols x:y such that for x there is some pair x:z in the original set. For a single symbol pair k:g.m it is equivalent to k: """ pairsym_set = ast.expr.copy() insym_set = set() for sympair in pairsym_set: insym, outsym = cfg.pairsym2sympair(pairsym) insym_set.add(outsym) for insymbol, outsymbol in cfg.symbol_pair_set: if insymbol in insym_set: result_set.add(cfg.sympaif2pairsym(insymbol, outsymbol)) return result_set
[docs] def Surface(self, ast): """Morphophonemic completion Returns a set which contais valid pair symbols whose ouput side cepted by the output side of the argument. For a single pair symbol k:g.s it is equivalent to :g """ pairsym_set = ast.expr.copy() result_set = set() outsym_set = set() for pairsym in pairsym_set: insym, outsym = cfg.pairsym2sympair(pairsym) outsym_set.add(outsym) for insymbol, outsymbol in cfg.symbol_pair_set: if outsymbol in outsym_set: result_set.add(cfg.sympaif2pairsym(insymbol, outsymbol)) return result_set
[docs] def pair(self, ast): # print(f"in pair: {ast = }") #### up, lo = ast up_quoted = re.sub(r"([{}])", r"%\1", up) lo_quoted = lo ### ???? lo = re.sub(r"%(.)", r"\1", lo_quoted) # print(f"in pair: {up= }, {lo = }") #### failmsg = [] if up and (up not in cfg.input_symbol_set): failmsg.append(f"input symbol '{up}'") if lo and (lo not in cfg.output_symbol_set): failmsg.append(f"outputput symbol '{lo}'") if (up and lo and (up, lo) not in cfg.symbol_pair_set): failmsg.append(f"symbol pair '{up}:{lo}'") if failmsg: cfg.error_message = " and ".join(failmsg) + " not in alphabet" raise FailedSemantics(cfg.error_message) if up and lo: # it is e.g. "{aØ}:a" result_set = set([f"{up}:{lo}"]) return result_set elif up and (not lo): # it is e.g. "{aØ}:" result_set = set() for insym, outsym in cfg.symbol_pair_set: if insym == up: result_set.add(cfg.sympair2pairsym(insym, outsym)) return result_set elif (not up) and lo: # it is e.g. ":i" result_set = set() for insym, outsym in cfg.symbol_pair_set: if outsym == lo: result_set.add(cfg.sympair2pairsym(insym, outsym)) return result_set else: # it is ":" result_set = cfg.pair_symbol_set.copy() return result_set
[docs] def defined(self, ast): # print(f"in defined: {ast = }") #### string = ast if string in cfg.definitions: # print(f"in defined: {string} is a defined symbol") #### result_set = cfg.definitions[string].copy() return result_set else: cfg.error_message = f"in defined: {string} is not defined" raise FailedSemantics(cfg.error_message)
[docs] def outsym(self, ast): # print(f"in outsym: {ast = }") #### string = ast lo_quoted = string ### ???? lo = re.sub(r"%(.)", r"\1", lo_quoted) # print(f"in outsym: {lo = }, {lo_quoted = }") #### if (lo in cfg.output_symbol_set and (lo,lo) in cfg.symbol_pair_set): # print(f"symbol_or_pair: {string} is a surface symbol") #### result_set = set(cfg.sympair2pairsym(lo, lo)) return result_set else: cfg.error_message = f"in outsym: {string} is not in alphabet" raise FailedSemantics(cfg.error_message)
[docs]class TwolFstSemantics(object):
[docs] def define(self, ast): expr_fst = ast.right.copy() def_name = ast.left # print(f"define: {def_name = }") #### cfg.definitions[def_name] = expr_fst return ("=", ast.left, ast.right)
[docs] def identifier(self, ast): # print(f"in identifier: {ast = }") #### string = ast.strip() return string
[docs] def right_arrow_rule(self, ast): result = ("=>", ast.left, ast.right) return result
[docs] def output_coercion_rule(self, ast): result = ("<=", ast.left, ast.right) return result
[docs] def input_coercion_rule(self, ast): result = ("<--", ast.left, ast.right) return result
[docs] def double_arrow_rule(self, ast): result = ("<=>", ast.left, ast.right) return result
[docs] def exclusion_rule(self, ast): result = ("/<=", ast.left, ast.right) return result
[docs] def contexts(self, ast): result = ast.lst.copy() return result
[docs] def context_lst(self, ast): left_lst = ast.left.copy() right_lst = ast.right.copy() result = left_lst.copy() result.extend(right_lst) return result
[docs] def context(self, ast): lc = ast.left.copy() if ast.left else hfst.epsilon_fst() rc = ast.right.copy() if ast.right else hfst.epsilon_fst() lc.substitute("END", "BEGIN") #print(lc)### #print(rc)### result = [(lc, rc)] return result
[docs] def union(self, ast): # print(f"in union: {ast = }") #### name = "f[{ast.left.get_name()} | {ast.right.get_name()}]" result_fst = ast.left.copy() result_fst.disjunct(ast.right) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def intersection(self, ast): name = f"[{ast.left.get_name()} & {ast.right.get_name()}]" result_fst = ast.left.copy() result_fst.conjunct(ast.right) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def difference(self, ast): name = f"[{ast.left.get_name()} - {ast.right.get_name()}]" result_fst = ast.left.copy() result_fst.minus(ast.right) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def concatenation(self, ast): name = f"[{ast.left.get_name()} {ast.right.get_name()}]" result_fst = ast.left.copy() result_fst.concatenate(ast.right) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def Kleene_star(self, ast): name = f"[{ast.expr.get_name()}]*" result_fst = ast.expr.copy() result_fst.repeat_star() result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def Kleene_plus(self, ast): name = f"[{ast.expr.get_name()}]+" result_fst = ast.expr.copy() result_fst.repeat_plus() result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def Morphophonemic(self, ast): """Surface completion Returns a FST which accepts sequences of valid pairs whose input side is accepted by the input side of the argument. For a single symbol pair k:g it is equivalent to k: """ name = f"[{ast.expr.get_name()}].m" result_fst = ast.expr.copy() result_fst.input_project() all_pairs_fst = cfg.all_pairs_fst.copy() result_fst.compose(all_pairs_fst) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def Surface(self, ast): """Morphophonemic completion Returns a FST which accepts sequences of valid pairs whose ouput side is accepted by the output side of the argument. For a single symbol pair k:g it is equivalent to :g """ name = f"[{ast.expr.get_name()}].s" temp_fst = ast.expr.copy() temp_fst.output_project() result_fst = cfg.all_pairs_fst.copy() result_fst.compose(temp_fst) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def One_but_not(self, ast): name = r"\[{}]".format(ast.expr.get_name()) result_fst = cfg.all_pairs_fst.copy() result_fst.minus(ast.expr) result_fst.minimize() result_fst.set_name(name) return result_fst
[docs] def optexpression(self, ast): result_fst = ast.expr.copy() name = result_fst.get_name() result_fst.optionalize() result_fst.minimize() result_fst.set_name("({})".format(name)) return result_fst
[docs] def subexpression(self, ast): name = "[{}]".format(ast.expr.get_name()) result_fst = ast.expr.copy() result_fst.set_name(name) return result_fst
[docs] def pair(self, ast): # print(f"in pair: {ast = }") #### up, lo = ast up_quoted = re.sub(r"([{}])", r"%\1", up) lo_quoted = lo ### ???? lo = re.sub(r"%(.)", r"\1", lo_quoted) # print(f"in pair: {up= }, {lo = }") #### failmsg = [] if up and (up not in cfg.input_symbol_set): failmsg.append(f"input symbol '{up}'") if lo and (lo not in cfg.output_symbol_set): failmsg.append(f"outputput symbol '{lo}'") if (up and lo and (up, lo) not in cfg.symbol_pair_set): failmsg.append(f"symbol pair '{up}:{lo}'") if failmsg: cfg.error_message = " and ".join(failmsg) + " not in alphabet" raise FailedSemantics(cfg.error_message) if up and lo: # it is e.g. "{aØ}:a" result_fst = hfst.regex(up_quoted + ':' + lo_quoted) result_fst.set_name(f"{up}:{lo}") return result_fst elif up and (not lo): # it is e.g. "{aØ}:" result_fst = hfst.regex(up_quoted) result_fst.compose(cfg.all_pairs_fst) result_fst.set_name(f"{up}:") return result_fst elif (not up) and lo: # it is e.g. ":i" result_fst = cfg.all_pairs_fst.copy() lo_fst = hfst.regex(lo_quoted) result_fst.compose(lo_fst) result_fst.set_name(f":{lo}") return result_fst else: # it is ":" result_fst = cfg.all_pairs_fst.copy() result_fst.set_name("PI") return result_fst
[docs] def defined(self, ast): # print(f"in defined: {ast = }") #### string = ast if string in cfg.definitions: # print(f"in defined: {string} is a defined symbol") #### result_fst = cfg.definitions[string].copy() result_fst.set_name(string) return result_fst else: cfg.error_message = f"in defined: {string} is not defined" raise FailedSemantics(cfg.error_message)
[docs] def outsym(self, ast): # print(f"in outsym: {ast = }") #### string = ast lo_quoted = string ### ???? lo = re.sub(r"%(.)", r"\1", lo_quoted) # print(f"in outsym: {lo = }, {lo_quoted = }") #### if (lo in cfg.output_symbol_set and (lo,lo) in cfg.symbol_pair_set): # print(f"symbol_or_pair: {string} is a surface symbol") #### result_fst = hfst.regex(string) result_fst.set_name(string) return result_fst else: cfg.error_message = f"in outsym: {string} is not in alphabet" raise FailedSemantics(cfg.error_message)
[docs] def boundary(self, ast): result_fst = hfst.regex("END") # print(result_fst)#### result_fst.set_name(".#.") return result_fst
[docs]def init(): """Initializes the module and compiles and returns a tatsu parser grammar_file -- the name of the file containing the EBNF grammar for rules """ import os dir = os.path.dirname(os.path.abspath(__file__)) grammar_file = dir + "/twolcsyntax.ebnf" grammar = open(grammar_file).read() parser = compile(grammar) return parser
[docs]def parse_rule(parser, line_nl, line_no, line_lst, start="expr_start"): """Parse one rule or definiton or any constituent given as start parser -- a tatsu parser which parses the EBNF grammar for two-level rules line_nl -- the string that contains the rule or definition to be parsed keyword arguments: start -- the element in the EBNF grammar where to start the parsing """ line = line_nl.strip() # print(f"{line = }") #### # print(f"in parse_rule: {cfg.definitions.keys() = }") #### if (not line) or line[0] == '!': return "!", None, None # it was a comment or an empty line rulepat = r"^.* +(=|<=|=>|<=>|/<=|<--) +.*$" try: m = re.match(rulepat, line) if m: # print("groups:", m.groups()) #### if m.group(1) == '=': op, name, expr_fst = parser.parse(line, start='def_start', semantics=TwolFstSemantics()) return op, name, expr_fst elif m.group(1) in {'=>', '<=', '<=>', '/<=', '<--'}: op, x_fst, contexts = parser.parse(line, start='rul_start', semantics=TwolFstSemantics()) return op, x_fst, contexts else: return "?", None, None except ParseException as e: print("\n" + 40 * "*") print("ERROR WAS IN INPUT LINES:", line_no, "-", line_no + len(line_lst) - 1) print("".join(line_lst)) print("THE ERROR IS PROBABLY ABOVE THE '^' OR BEFORE IT") msg = str(e) lst = msg.split("\n") if len(lst) >= 3: print(lst[1]) print(lst[2], "<---", e.__class__.__name__, "HERE") if cfg.error_message: print("EXPLANATION:") print(" ", cfg.error_message) cfg.error_message = "" else: print(e) ### print(str(e)) print(40 * "*" + "\n") return "?", None, None
[docs]def main(): #import hfst import argparse import twol.twbt as twbt #import twol.cfg as cfg #import twol.twexamp as twexamp arpar = argparse.ArgumentParser( description="A compiler and tester for two-level rules") arpar.add_argument("start", help="start parsing from", default="expr_start") args = arpar.parse_args() twexamp.read_fst(filename="nounex.fst") parser = init() for line_nl in sys.stdin: line = line_nl.strip() #print(line) result = parser.parse(line, start=args.start, semantics=TwolFstSemantics()) if args.start == "def_start": op, left, right, source = result print(left, "=") twbt.ppfst(right) elif args.start == "rul_start": op, left, right, source = result twbt.ppfst(left) print(op) for lc, rc in right: twbt.ppfst(lc, title="left context") twbt.ppfst(rc, title="right context") elif args.start == "expr_start": fst = result #print(fst) twbt.ppfst(fst, True) elif op == "?": print("Incorrect: " + line) return
if __name__ == '__main__': main()