# Source code for twol.alphabet

"""alphabet.py

Processes an alphabet definition file so that it can be used both by
aligner.py and by multialign.py.  In particular, it computes weights
for phoneme pairs or morphophonemes for the purposes of weighted
alignment.

Copyright 2017-2020, Kimmo Koskenniemi

This is free software according to GNU GPL 3 license.
"""

import sys, re
import twol.cfg as cfg

cost_of_zero_c = 25
"""Extra weight given to a consonant feature set when it contains Zero (Ø)."""

cost_of_zero_v = 10
"""Extra weight given to a vowel feature set when it contains Zero (Ø)."""

# One list of feature names per feature group, six groups in all.  Every
# group starts out holding only the shared pseudo-feature "Zero".
feature_lst_lst = [["Zero"] for _ in range(6)]

feature_bitpos = {}
"""Maps a feature name to its bit position within the 16 bit field of its group."""

mphon_to_binint_cache = {}
"""Maps a morphophoneme string to the 6*16 bit integer of its feature sets."""

binint_to_mphon_set = {}
"""Maps a 96 bit feature integer to the set of phonemes stored for it."""

vowel_set = set()
"""The phonemes which read_alphabet() has classified as vowels."""

consonant_set = set()
"""The phonemes which read_alphabet() has classified as consonants."""

mphon_weight_cache = {}
"""Maps a morphophoneme string to its already computed weight."""

phoneme_set_weight_cache = {}
"""Maps "".join(sorted(set(MPHON))) to the weight of that set of phonemes."""

for_definitions_lst = []
"""A list of ... FOR X IN ... definitions to be used in metric.py"""

exception_lst = []
"""A list of weighting exceptions to be used in metric.py"""

def spaced_bin_int(intg):
    """binint to human readable string conversion.

    :param intg: An integer holding six 16 bit feature fields
    :type intg: int
    :return: The binary digits of intg, zero padded to 96 digits and
        split by spaces into six groups of 16 digits
    :rtype: str
    """
    bits = "{:096b}".format(intg)
    return " ".join(bits[pos:pos + 16] for pos in range(0, 96, 16))

def mphon_to_binint(mphon):
    """Converts a morphophoneme into a binary integer that represents it.

    The integer of a morphophoneme with several characters is the
    bitwise OR of the integers of its characters.  Results for such
    morphophonemes are stored in mphon_to_binint_cache.

    :param mphon: A string of phonemes and possibly Øs
    :type mphon: str
    :return: A binary integer which represents the morphophoneme,
        0 for an empty string
    :rtype: int
    :raises SystemExit: if some character of mphon is not in the alphabet
    """
    if cfg.verbosity >= 25:
        print("mphon_to_binint({})".format(mphon))
    if not mphon:
        return 0
    if mphon in mphon_to_binint_cache:
        return mphon_to_binint_cache[mphon]
    if len(mphon) == 1:
        # single phonemes are entered into the cache only by read_alphabet()
        sys.exit("** '{}' NOT IN ALPHABET\n\n".format(mphon))
    old = mphon[:-1]
    new = mphon[-1:]
    if new not in mphon_to_binint_cache:
        sys.exit("** '{}' IN '{}' NOT IN ALPHABET\n\n".format(new, mphon))
    if old in mphon_to_binint_cache:
        old_binint = mphon_to_binint_cache[old]
    else:
        old_binint = mphon_to_binint(old)
    binint = old_binint | mphon_to_binint_cache[new]
    # cache on both paths so that later direct cache lookups succeed
    mphon_to_binint_cache[mphon] = binint
    return binint
    
def mphon_is_valid(mphon):
    """Tests whether a set of phonemes is possible.

    :param mphon: A sequence of phonemes a.k.a. morphophoneme, e.g. 'ij'.
    :type mphon: str

    :return: True if the set consists either of vowels (and semivowels and Øs) \
    or of consonants (and semivowels and Øs), otherwise False.

    :rtype: boolean
    """
    binint = mphon_to_binint(mphon)
    if cfg.verbosity >= 30:
        print("mphon_is_valid({}) == {}".format(mphon, spaced_bin_int(binint)) )
    # A feature which a phoneme lacks is stored as the full field 0xffff.
    # The set is impossible exactly when all six fields have become
    # full, i.e. (U,U,U,U,U,U), so that no bit of the 96 is left unset.
    return bool(~binint & 0xffffffffffffffffffffffff)
    
def mphon_weight(mphon):
    """Returns the weight of a morphophoneme

    The weight tables weight_c1 ... weight_v3 which are used here are
    set by read_alphabet(), so it must have been called first.

    :param mphon: A sequence of phonemes a.k.a. morphophoneme, e.g. 'ij'
    :type mphon: str

    :return: The weight of mphon based on the phonological features of
        its members: cfg.all_zero_weight if mphon consists of Øs only,
        otherwise a weight given in the alphabet or cached earlier,
        otherwise the smaller one of the consonant and vowel weights
    :rtype: number
    :raises SystemExit: if some character of mphon is not in the alphabet
    """
    if re.fullmatch(r"[Ø]+", mphon):
        return cfg.all_zero_weight
    phon_set_str = "".join(sorted(set(mphon)))
    if phon_set_str in phoneme_set_weight_cache:
        return phoneme_set_weight_cache[phon_set_str]
    if mphon in mphon_weight_cache:
        return mphon_weight_cache[mphon]
    # Compute the bit vector on demand instead of indexing the cache:
    # a morphophoneme which nobody has converted yet is not in it.
    mphon_int = mphon_to_binint(mphon)
    if cfg.verbosity >= 25:
        print("\nmphon_to_binint_cache[{}] = {}".
              format(mphon, spaced_bin_int(mphon_int)))
    w_cons = 999999
    w_vow = 999999
    high = mphon_int >> 48               # extract the 48 cons feature bits
    low = mphon_int & 0xffffffffffff     # extract the 48 voc feature bits
    if cfg.verbosity >= 25:
        print("{:048b}".format(high), "high")
        print("{:048b}".format(low), "low")
    # a half with all bits set has no features of that kind to weigh
    if high != 0xffffffffffff:
        c1 = high >> 32                  # place of articulation set
        c2 = (high >> 16) & 0xffff       # voicing set
        c3 = high & 0xffff               # manner of articulation set
        if cfg.verbosity >= 25:
            print("{:012b}, {:012b}, {:012b}".format(c1, c2, c3))
        w_cons = weight_c1[c1] + weight_c2[c2] + weight_c3[c3]
        if cfg.verbosity >= 25:
            print("\nmphon_weight info of a cons set:", hex(c1), weight_c1[c1],
                  hex(c2), weight_c2[c2], hex(c3), weight_c3[c3])
    if low != 0xffffffffffff:
        v1 = low >> 32                   # tongue height
        v2 = (low >> 16) & 0xffff        # backness
        v3 = low & 0xffff                # rounding
        if cfg.verbosity >= 25:
            print("{:012b}, {:012b}, {:012b}".format(v1, v2, v3))
        w_vow = weight_v1[v1] + weight_v2[v2] + weight_v3[v3]
        if cfg.verbosity >= 25:
            print("\nmphon_weight info of a vowel set:", hex(v1), weight_v1[v1],
                  hex(v2), weight_v2[v2], hex(v3), weight_v3[v3])
    w = min(w_cons, w_vow)
    if cfg.verbosity >= 25:
        print("\nmphon_int[{}]  = {}".format(mphon, spaced_bin_int(mphon_int)))
        print("mphon_weight[{}] = {}".format(mphon, w))
    mphon_weight_cache[mphon] = w
    phoneme_set_weight_cache[phon_set_str] = w
    return w

def read_alphabet(file_name):
    """Reads phoneme features, feature subsets with weights, and other definitions

    :param file_name: Name of the file that contains the alphabet definition
    :type file_name: str
    :raises SystemExit: if a line is not recognised, if a phoneme is
        defined twice or with other than six features, if a feature
        subset names an unknown feature or features from several groups,
        or if a group has more features than fit in its 16 bit field

    Text after a ``#`` is a comment and empty lines are skipped.  Every
    other line must have one of the following forms:

    * ``P = f1, f2, f3, f4, f5, f6`` -- the six features of phoneme P,
      three consonant features followed by three vowel features, any
      of which may be left empty
    * ``Feat1 Feat2 ... = N`` -- weight N for a subset of the features
      of one group
    * ``N = abc de ...`` -- weight N for each of the listed phoneme sets
    * ``Zero += C V`` -- the cost of a Zero in a consonant and in a
      vowel feature set
    * ``x:y ... x:y::N FOR V IN S`` -- stored in for_definitions_lst
    * ``x:y ... x:y::N`` -- stored in exception_lst

    Stores the computed sets and weights for feature subsets and other
    info in module variables for the use of other modules.

    """
    global weight_c1, weight_c2, weight_c3, weight_v1, weight_v2, weight_v3
    # These are assigned below when a "Zero += ..." line is found.
    # Without this declaration they would be locals of this function
    # and reading them would fail whenever the file has no such line.
    global cost_of_zero_c, cost_of_zero_v

    subset_lst = []              # list of pairs (set, weight)
    feature_group = {}           # the group to which this feature belongs
    features_of_phoneme = {}     # tuples of six features for a phoneme
    # the alphabet contains non-ASCII phonemes, so do not depend on the locale
    with open(file_name, "r", encoding="utf-8") as f:
        features_of_phoneme["Ø"] = ("Zero","Zero","Zero","Zero","Zero","Zero")
        for line_no, line_nl in enumerate(f, start=1):
            line = line_nl.split("#")[0].strip()
            if not line:
                continue
            mat_phon_feat = re.fullmatch(
                r":?(?P<symbol>(\w|')):? *= *(?P<feats>\w*( *, *\w*)+)",
                line)
            if mat_phon_feat:
                # it defines features of a phoneme
                symbol = mat_phon_feat.group("symbol")
                r_lst = [feat.strip()
                         for feat in mat_phon_feat.group("feats").split(",")]
                if len(r_lst) != 6:
                    msg = "** WRONG NUMBER OF FEATURES ON LINE {}:\n{}"
                    sys.exit(msg.format(line_no, line))
                if symbol in features_of_phoneme:
                    msg = "** {} ALREADY DEFINED. LINE {}:\n{}"
                    sys.exit(msg.format(symbol, line_no, line_nl))
                features_of_phoneme[symbol] = tuple(r_lst)
                # register each new non-empty feature in its own group
                for ls, feat in zip(feature_lst_lst, r_lst):
                    if feat and feat not in ls:
                        ls.append(feat)
                continue
            mat_feat_set = re.fullmatch(
                r"(?P<elements>\w\w\w+( +\w\w\w+)+) *= *(?P<weight>[0-9]+)",
                line)
            if mat_feat_set:
                # it defines a subset of features and its weight
                l_lst = mat_feat_set.group("elements").split()
                subset_lst.append((set(l_lst),
                                   int(mat_feat_set.group("weight"))))
                continue
            mat_phon_set = re.fullmatch(
                r"(?P<weight>[0-9]+) *= *(?P<elements>(\w|')+( +(\w|')+)+)",
                line)
            if mat_phon_set:
                # it defines a weight for each of the listed sets of phonemes
                weight = int(mat_phon_set.group("weight"))
                for phon_str in mat_phon_set.group("elements").split():
                    phon_set_str = "".join(sorted(set(phon_str)))
                    phoneme_set_weight_cache[phon_set_str] = weight
                continue
            mat_zero = re.fullmatch(
                r"Zero *[+]= *(?P<consw>[0-9]+) +(?P<voww>[0-9]+)",
                line)
            if mat_zero:
                # it defines the cost of including a Zero in feature sets
                cost_of_zero_c = int(mat_zero.group("consw"))
                cost_of_zero_v = int(mat_zero.group("voww"))
                continue
            mat_for_in = re.fullmatch(
                r"(?P<expr>(\w:\w +)*\w:\w::[0-9]+) +FOR +(?P<var>\w+) +IN +(?P<set>\w+)",
                line)
            if mat_for_in:
                # it defines a FOR IN definition
                for_definitions_lst.append((mat_for_in.group("expr"),
                                            mat_for_in.group("var"),
                                            mat_for_in.group("set")))
                continue
            mat_exception = re.fullmatch(r"(?P<expr>(\w:\w +)*\w:\w::[0-9]+)", line)
            if mat_exception:
                # it defines an exception list
                exception_lst.append(mat_exception.group("expr"))
                continue
            msg = "** INCORRECT ALPHABET DEFINITON LINE {}:\n {}"
            sys.exit(msg.format(line_no, line_nl))

    feature_set_lst = [set(lst) for lst in feature_lst_lst]
    #
    # now the alphabet data has been read in and extracted
    #
    if cfg.verbosity >= 20:
        print("\ncost_of_zero_c:", cost_of_zero_c)
        print("\ncost_of_zero_v:", cost_of_zero_v)
        print("\nfeature_set_lst:", feature_set_lst)
        print("\nfeature_lst_lst:", feature_lst_lst)
        print("\nsubset_lst:", subset_lst)
        print("\nfeatures_of_phoneme:", features_of_phoneme)
        print("\nfor_definitions_lst:", for_definitions_lst)
        print("\nexception_lst:", exception_lst)
    #
    # find the groups and bit positions of individual features
    #
    for group_no, feature_lst in enumerate(feature_lst_lst):
        if len(feature_lst) > 16:
            # a bit position beyond 15 would spill into the next field
            sys.exit("** TOO MANY FEATURES IN GROUP {}: {}".format(
                group_no, feature_lst))
        for bit_pos, feature in enumerate(feature_lst):
            feature_group[feature] = group_no
            feature_bitpos[feature] = bit_pos
    del feature_group["Zero"]   # feature Zero belongs all groups and
                                # needs special care
    if cfg.verbosity >= 20:
        print("\nfeature_group:", feature_group)
        print("\nfeature_bitpos:", feature_bitpos)
    #
    # An integer for each phoneme.  The integer represents six sets
    # (with one element in each set).  The sets are 16 bit long fields
    # of the binary representation of the integer, total of 96 bits.
    # Each set has a bit position reserved for each feature in the
    # respective group.  These integers or bit vectors can be combined
    # with each other using bit operations.
    #
    for phoneme, features in features_of_phoneme.items():
        intset = 0
        for feature in features:
            # an empty feature is stored as the full field 0xffff
            field = (1 << feature_bitpos[feature]) if feature else 0xffff
            intset = (intset << 16) | field
        mphon_to_binint_cache[phoneme] = intset
        binint_to_mphon_set.setdefault(intset, set()).add(phoneme)
    if cfg.verbosity >= 20:
        lst = [fon + "=" + spaced_bin_int(intg) for fon, intg in
               sorted(mphon_to_binint_cache.items())]
        s = "\n".join(lst)
        print("\nmphon_to_binint_cache")
        print(s)
        t = "\n".join([spaced_bin_int(intg) + " = " + str(fon_set)
                       for intg, fon_set in sorted(binint_to_mphon_set.items())])
        print("\nbinint_to_mphon_set")
        print(t)
    #
    # sets of vowels and consonants
    #
    # NOTE(review): because of the elif, a phoneme which has all six
    # features (and Ø itself) goes only to consonant_set -- confirm that
    # this is what the users of vowel_set expect.
    for phoneme, features in features_of_phoneme.items():
        if features[0] and features[1] and features[2]:
            consonant_set.add(phoneme)
        elif features[3] and features[4] and features[5]:
            vowel_set.add(phoneme)
    #
    # convert the subsets into integers which represent bit vectors of the sets
    #
    subset_bin_lst = []
    for subset, weight in subset_lst:
        # Zero is not in feature_group, so it is rejected here as well
        unknown = sorted(feat for feat in subset if feat not in feature_group)
        if unknown:
            sys.exit("** UNKNOWN FEATURES {} IN SUBSET {}".format(unknown, subset))
        group_lst = list({feature_group[feature] for feature in subset})
        if len(group_lst) == 1:
            group = group_lst[0]
        else:
            sys.exit("** FEATURES FROM SEVERAL GROUPS: {} = {}".format(subset, group_lst))
        bin_set = 0
        for feat in subset:
            bin_set = bin_set | (1 << feature_bitpos[feat])
        if cfg.verbosity > 25:
            print("\nsubset, bin_set, weight, group:", subset, bin(bin_set), weight, group)
        subset_bin_lst.append((bin_set, weight, group, bin(bin_set)))
    if cfg.verbosity > 20:
        print("\nsubset_bin_lst:", subset_bin_lst)
    #
    # compute weights for all possible feature sets in each of the six groups
    #
    weight_dict_lst = []
    for i, feature_lst in enumerate(feature_lst_lst):
        # the first three groups are consonant features, the rest vowel features
        zero_cost = cost_of_zero_c if i < 3 else cost_of_zero_v
        weight_dict = {
            0: -1,            # for the empty set
            1: -1,            # {"Zero"}
            0xffff: 999999,   # for the universal set U
        }
        l = len(feature_lst)
        # even integers are the feature sets which do not contain Zero (bit 0)
        for j in range(2, 1 << l, 2):
            w = 100
            for subset_bin, weight, group, bin_str in subset_bin_lst:
                if cfg.verbosity >= 25:
                    print("\nsubset_bin, weight, group, bin_str, i:",
                          bin(j), weight, group, bin_str, i)
                # features of j which are outside the subset; none left
                # means that the subset covers j
                test = j & ~subset_bin
                if cfg.verbosity >= 25:
                    print(">>> test:", bin(test)) ###
                if group == i and not test and weight < w:
                    w = weight
            weight_dict[j] = w
            weight_dict[j+1] = w + zero_cost
        # a single feature costs nothing, with a Zero just the cost of the Zero
        for j in range(1, l):
            weight_dict[1 << j] = 0
            weight_dict[(1 << j) +1] = zero_cost
        if cfg.verbosity > 20:
            print("\nweight_dict[{}]:".format(i), weight_dict)
        weight_dict_lst.append(weight_dict)
    (weight_c1, weight_c2, weight_c3, weight_v1, weight_v2, weight_v3) = weight_dict_lst

if __name__ == "__main__":
    # Manual smoke test: load the test alphabet from the current
    # directory and show the weight of the morphophoneme "ei".
    cfg.verbosity = 1
    read_alphabet("alphabet-test.text")
    mphon_is_valid("ei")     # result unused; caches the bit vector of "ei"
    ei_weight = mphon_weight("ei")
    print(ei_weight)
# Source code for twol.alphabet

# twol

# Navigation

# Related Topics