"""Module for compiling two-level rules out of FSTs for the components.
"""
__author__ = "© Kimmo Koskenniemi, 2018 - 2020"
__version__ = "2020-02-01"
import re
import hfst_dev as hfst
import twol.cfg as cfg
import twol.fs as fs
import twol.twbt as twbt
import twol.twexamp as twexamp
[docs]def init():
"""Initializes the module by computing several common FSTs
Assumes that twexamp.read_fst() has read in cfg.examples_fst and
initialized sone symbol sets.
"""
global pistar_fst, pistar_fsa, diamond_sym, diamond_fst
global trim_pre_fst, trim_post_fst
assert cfg.examples_fst, "cfg.examples_fst not loaded (by twexamp module)"
cfg.definitions["PAIRS"] = cfg.all_pairs_fst.copy()
cfg.definitions["PI"] = cfg.all_pairs_fst.copy()
diamond_sym = 'DIAMOND'
diamond_fst = hfst.regex(diamond_sym)
pi_fst = cfg.all_pairs_fst.copy()
pistar_fst = cfg.all_pairs_fst.copy()
pistar_fst.repeat_star()
pistar_fst.remove_epsilons()
pistar_fst.minimize()
pistar_fsa = hfst.fst_to_fsa(pistar_fst, separator='^')
pi_in_fst = pi_fst.copy()
pi_in_fst.input_project()
pi_out_fst = pi_fst.copy()
pi_out_fst.output_project()
pi_in_star_fst = pistar_fst.copy()
pi_in_star_fst.input_project()
pi_out_star_fst = pistar_fst.copy()
pi_out_star_fst.output_project()
if cfg.verbosity >= 20:
twbt.ppfst(pistar_fst, title="pistar_fst")
fst1 = fs.star(fs.crossprod(fs.expr("ZERO"),
pi_in_fst))
fst2 = fs.star(fs.concat(fst1,
fs.expr("ZERO:BEGIN")))
fst3 = fs.concat(fst2, pi_in_star_fst)
fst4 = fs.star(fs.concat(fs.expr("ZERO:END"),
fs.star(fs.crossprod(fs.expr("ZERO"),
pi_in_fst))))
trim_pre_fst = fs.concat(fst3, fst4)
trim_pre_fst.set_name("trim_pre_fst")
#trim_pre_fst = XRC.compile(
# "[[ZERO .x. [PI].u]* ZERO:BEGIN]* " \
# "[[PI].u]* " \
# "[ZERO:END [ZERO .x. [PI].u]*]*"
#)
fst1 = fs.star(fs.crossprod(pi_out_fst, fs.expr("ZERO")))
fst2 = fs.star(fs.concat(fst1, fs.expr("BEGIN:ZERO")))
fst3 = fs.concat(fst2, pi_out_star_fst)
fst4 = fs.star(fs.concat(fs.expr("END:ZERO"),
fs.star(fs.crossprod(pi_out_fst,
fs.expr("ZERO")))))
trim_post_fst = fs.concat(fst3, fst4)
trim_post_fst.set_name("trim_post_fst")
#trim_post_fst = XRC.compile(
# "[[[PI].l .x. ZERO]* BEGIN:ZERO]* " \
# "[[PI].l]* " \
# "[END:ZERO [[PI].l .x. ZERO]*]*"
#)
if cfg.verbosity >= 20:
twbt.ppfst(trim_pre_fst)
twbt.ppfst(trim_post_fst)
return
[docs]def quote(str):
"""Protect '{' and '}' with a % in xerox regular expressions.
>>> quote("a {ij}:j [a:c | b]")
"a %{ij%}:j [a:c | b]"
"""
return(re.sub(r"([{'}])", r"%\1", str))
[docs]def e(str):
"""Convert a two-level component expression into a FST.
str -- a string containing a (two-level) regular expression
Returns an FST which performs the mapping represented by str
corresponding to the expression.
"""
global XRC
# print("Regex string:", str) ##
if str == "":
return(XRC.compile("[]"))
F = XRC.compile(str)
F.minimize()
F.set_name(str)
if cfg.verbosity >= 5:
twbt.ppfst(F) ##
return(F)
[docs]def generalized_restriction(precondition_fst,
postcondition_fst):
"""Combines the precondition FST and the postcondition FST into a rule FST.
Conditions are formed out of the center/contexts of a rule.
Each condition contains exactly two diamond symbols.
Returns the generalized restriction as an FST.
"""
global pistar_fst, diamond_sym
temp_fst = precondition_fst.copy()
temp_fst.subtract(postcondition_fst)
temp_fst.minimize()
temp_fst.set_name("PRECOND-POSTCOND")
# twbt.ppfst(temp_fst, True) ##
temp_fst.substitute(diamond_sym, hfst.EPSILON)
# print(temp_fst.get_properties().items()) ##
temp_fst.minimize()
temp_fst.set_name("Diamonds removed")
# twbt.ppfst(temp_fst, True) ##
result_fst = pistar_fst.copy()
result_fst.minus(temp_fst)
result_fst.minimize()
# twbt.ppfst(result_fst, True) ##
result_fst.set_name("Doubly negated")
# twbt.ppfst(result_fst, True) ##
return(result_fst)
[docs]def x_to_condition(x_fst):
"""Computes and returns a condition FST out of the center FST of a rule.
x_fst -- the center or X-part of a rule (located in front of the
operator)
returns [PI* ¤ X ¤ PI*] where ¤ is the diamond (not in PI) as an FST
"""
global pistar_fst, diamond_fst
result_fst = pistar_fst.copy()
result_fst.concatenate(diamond_fst)
result_fst.concatenate(x_fst)
result_fst.concatenate(diamond_fst)
result_fst.concatenate(pistar_fst)
result_fst.minimize()
result_fst.set_name("PISTAR ¤ X ¤ PISTAR")
# twbt.ppfst(result_fst, True) ##
return(result_fst)
[docs]def begin_end(expr_fst):
"""Removes everything before BEGIN and after END from expr_fst
expr_fst -- an FST representing a set of pair strings str
Returns an FST which represents pair strings which are maximal
substrings of the above which do not contain any BEGIN or END
symbols.
"""
global trim_pre_fst, trim_post_fst
result_fst = trim_pre_fst.copy()
result_fst.compose(expr_fst)
# twbt.ppfst(result_fst, True) ##
result_fst.compose(trim_post_fst)
# twbt.ppfst(result_fst, True) ##
result_fst.substitute("ZERO", hfst.EPSILON)
result_fst.minimize()
# twbt.ppfst(result_fst, True) ##
return(result_fst)
[docs]def context_to_condition(left_context_fst,
right_context_fst):
"""Convert one context into a condition (for the generalized restriction)
left_context_fst -- the left context as an FST
right_context_fst -- the right context as an FST
Returns [PI* LC ¤ PI* ¤ RC PI*] as an FST
"""
global pistar_fst, diamond_fst
leftc_fst = pistar_fst.copy()
leftc_fst.concatenate(left_context_fst)
leftc_fst.minimize()
result_fst = begin_end(leftc_fst)
result_fst.concatenate(diamond_fst)
result_fst.concatenate(pistar_fst)
result_fst.concatenate(diamond_fst)
rightc_fst = right_context_fst.copy()
rightc_fst.concatenate(pistar_fst)
rightc_fst.minimize()
rightc_fst = begin_end(rightc_fst)
result_fst.concatenate(rightc_fst)
result_fst.minimize()
return(result_fst)
[docs]def contexts_to_condition(*contexts):
"""A list of contexsts is converted into a condition.
Each context in the list is converted separately and
the result is the union of these and is returned as an FST.
"""
global pistar_fst
result_fst = hfst.HfstTransducer()
for leftc, rightc in contexts:
#twbt.ppfst(leftc, title="leftc") ##
#twbt.ppfst(rightc, title="rightc") ##
context_fst = context_to_condition(leftc, rightc)
result_fst.disjunct(context_fst)
result_fst.minimize()
leftc_name = leftc.get_name()
rightc_name = rightc.get_name()
result_fst.set_name(leftc_name + "_" + rightc_name)
#twbt.ppfst(result_fst, title="result") ##
return(result_fst)
[docs]def mix_output(x_fst):
"""Computes an FSA that is used when creating negative examples
First, it computes an expression Y which represent all possible
(correct and incorrect) realizations of the input side of X. Then,
Y is transformed into an encoded FSA which can be a component of the
transformation of correct examples into incorrect ones.
x_fst -- the center FST (X part) of a rule Returns [X.u .o. PI*]
encoded as an FSA (i.e. maps pairs to themselves)
"""
global pistar_fst
result_fst = x_fst.copy()
result_fst.input_project()
result_fst.compose(pistar_fst)
result_fst.minimize()
result_encod_fsa = hfst.fst_to_fsa(result_fst, separator="^")
# twbt.ppfst(result_fsa, True) ##
return result_encod_fsa
[docs]def selector_from_x(x_fst):
"""Compute and return [PI* X PI*]"""
selector_fst = pistar_fst.copy() # starting to build it
selector_fst.concatenate(x_fst)
selector_fst.concatenate(pistar_fst) # now complete
selector_fst.set_name("Selector " + x_fst.get_name())
return selector_fst
[docs]def correct_to_incorrect(x_fst, side):
"""used for creating negative examples for <= rules
In order to make negative examples for <= rules we need to transform
the examples so that some correct input:output pairs are
changed so that the output part becomes different. The computed
encoded FST maps correct inputs to any possible outputs (correct or
incorrect).
x_fst -- the FST for the X part of the rule
side -- either "input" or "output"
returns: an fst (encoded as a fsa) which maps correct examples into
incorrect exs
"""
global pistar_fst, pistar_fsa
if side == "input":
mixed_fsa = mix_input(x_fst)
else:
mixed_fsa = mix_output(x_fst)
temp_encod_fsa = hfst.fst_to_fsa(x_fst, separator="^")
temp_encod_fsa.cross_product(mixed_fsa) # now maps corr X to all variations
# twbt.ppfst(temp_encod_fsa, True) ##
corr_to_incorr_encod_fst = pistar_fsa.copy()
corr_to_incorr_encod_fst.concatenate(temp_encod_fsa)
corr_to_incorr_encod_fst.concatenate(pistar_fsa)
corr_to_incorr_encod_fst.minimize() # now complete
corr_to_incorr_encod_fst.set_name("Correct to incorrect")
return corr_to_incorr_encod_fst
[docs]def incorrect_to_correct(x_fst):
"""Compute a transformation for right-arrow (=>) rules
In order to make negative examples for the => rules we need to
modify the examples so that some correct occurrences of X are
modified so that the output part of X becomes something else,
i.e. incorrect because it is in an unexpected context.
x_fst -- FST for the center part (X) of a rule
Returns: scrambler_fst -- an encoded FST which maps encoded
instances of X into all possible correct and incorrect pairs (where
the input symbol is the same but the output symbol perhaps
different).
"""
global pistar_fst, pistar_fsa
x_encod_fsa = hfst.fst_to_fsa(x_fst, separator="^")
mix_fst = mix_output(x_fst) # still an encoded fsa
mix_fst.cross_product(x_encod_fsa) # now fst
scrambler_fst = pistar_fsa.copy()
scrambler_fst.concatenate(mix_fst)
scrambler_fst.concatenate(pistar_fsa)
scrambler_fst.minimize() # now complete
scrambler_fst.set_name("Scrambler " + x_fst.get_name())
return scrambler_fst
[docs]def rightarrow(name, x_fst, *contexts):
"""Compiles rules like X => [LC1,RC1),...(LCk,RCk)]
name -- name to be given to the rule FST
x_fst -- the center (X) of the rule
\*contexts -- list of contexts, i.e. pairs of left and right context
Returns a triple:
rule_fst -- the compiled rule
selector_fst -- FST which selects examples which are relevant for
this rule
scrambler_fst -- an encoded FST which produces negative examples
"""
precondition_fst = x_to_condition(x_fst)
postcondition_fst = contexts_to_condition(*contexts)
rule_fst = generalized_restriction(precondition_fst, postcondition_fst)
rule_fst.set_name(name)
# twbt.ppfst(rule_fst, True) ##
selector_fst = selector_from_x(x_fst)
scrambler_fst = incorrect_to_correct(x_fst)
# twbt.ppfst(scrambler_fst, True) ##
return rule_fst, selector_fst, scrambler_fst
[docs]def output_coercion(name, x_fst, *contexts):
"""Compiles rules like X <= LC1 _ RC1, ..., LCk _ RCk
name -- name to be given to the rule FST
x_fst -- the center (X) of the rule
\*contexts -- list of contexts, i.e. pairs of left and right context
Returns a triple:
rule_fst -- the compiled rule
selector_fst -- FST which selects examples which are relevant for
this rule
scrambler_fst -- an encoded FST which produces negative examples
"""
global pistar_fst
postcondition_fst = x_to_condition(x_fst)
x_all_fst = x_fst.copy()
x_all_fst.input_project()
x_all_fst.compose(pistar_fst)
precondition_fst = x_to_condition(x_all_fst)
context_condition_fst = contexts_to_condition(*contexts)
precondition_fst.intersect(context_condition_fst)
rule_fst = generalized_restriction(precondition_fst, postcondition_fst)
rule_fst.set_name(name)
if cfg.verbosity >= 20:
twbt.ppfst(rule_fst, True)
x_any_fst = x_fst.copy()
x_any_fst.input_project()
x_any_fst.compose(pistar_fst)
###x_any_fst.minus(x_fst)
selector_fst = selector_from_x(x_any_fst)
scrambler_fst = correct_to_incorrect(x_fst, "output")
return rule_fst, selector_fst, scrambler_fst
[docs]def doublearrow(name, x_fst, *contexts):
"""Compiles rules like X <=> [LC1,RC1),...(LCk,RCk)]
name -- name to be given to the rule FST
x_fst -- the center (X) of the rule
\*contexts -- list of contexts, i.e. pairs of left and right
context
Returns a triple:
rule_fst -- the compiled rule
selector_fst -- FST which selects examples which are relevant for
this rule
scrambler_fst -- an encoded FST which produces negative examples
"""
rule_fst, selector_fst, scrambler_fst = rightarrow(name, x_fst, *contexts)
rule2_fst, selector2_fst, scrambler2_fst = output_coercion(name, x_fst, *contexts)
rule_fst.intersect(rule2_fst)
rule_fst.minimize()
rule_fst.set_name(name)
scrambler_fst.disjunct(scrambler2_fst)
scrambler_fst.minimize()
selector_fst.disjunct(selector2_fst)
selector_fst.minimize()
# twbt.ppfst(rule_fst, True) ##
return rule_fst, selector_fst, scrambler_fst
[docs]def center_exclusion(name, x_fst, *contexts):
"""Compiles rules like X /<= [LC1,RC1),...(LCk,RCk)]
name -- name to be given to the rule FST
x_fst -- the center (X) of the rule
\*contents -- list of contexts, i.e. pairs of left and right context
Returns a triple:
rule_fst -- the compiled rule
selector_fst -- FST which selects examples which are relevant for this rule
scrambler_fst -- empty_fst (negative examples not relevant for these rules)
"""
context_condition_fst = contexts_to_condition(*contexts)
x_condition_fst = x_to_condition(x_fst)
context_condition_fst.intersect(x_condition_fst)
null_fst = hfst.empty_fst()
rule_fst = generalized_restriction(context_condition_fst, null_fst)
rule_fst.set_name(name)
# twbt.ppfst(rule_fst, True) ##
selector_fst = selector_from_x(x_fst)
scrambler_fst = hfst.empty_fst()
return rule_fst, selector_fst, scrambler_fst
if __name__ == "__main__":
"""This module forms the rule FSTs out of the collection of component
FSTs which have been compiled out of the center parts, left and right
contexts of the rules. The formulas used in the module are those from
[ylijyrä2006]_. """
twex.read_examples()
init(1)
#define("V", "PI .o.[a|e|i|o|ä|ö]")
#define("C", "[PI .o. [h|l|n|s|t|v]] | %{ij%}:j")
R1 = doublearrow("{ao}:o <=> _ {ij}:",
e("%{ao%}:o"),
(e("[]"), e("[%{ij%} .o. PI]")))
twbt.ppfst(R1, True)
rule2_fst = doublearrow("{ij}:j <=> V :Ø* _ :Ø* V",
"%{ij%}:j",
("V [PI .o. Ø]*", "[PI .o. Ø]* V"))
twbt.ppfst(rule2_fst, True)
R3 = doublearrow("{tl}:l <=> _ CLOSED",
"%{tl%}:l",
("[]", "V %{ij%}:i* C [C | [PI .o. Ø]* END]"))
twbt.ppfst(R3, True)