"""alphabet.py
Processes an alphabet definition file so that it can be used both by
aligner.py and by multialign.py. In particular, it computes weights
for phoneme pairs or morphophonemes for the purposes of weighted
alignment.
Copyright 2017-2020, Kimmo Koskenniemi
This is free software according to GNU GPL 3 license.
"""
import sys, re
import twol.cfg as cfg
#: Additional weight for a subset of consonant features if Ø belongs to it.
cost_of_zero_c = 25

#: Additional weight for a subset of vowel features if Ø belongs to it.
cost_of_zero_v = 10

#: Six separate lists of feature names, one for each group of features.
#: Every group starts with the feature "Zero" (bit position 0).
feature_lst_lst = [["Zero"] for _ in range(6)]

#: Maps a feature name to its bit position within the 16 bit field of
#: its group.
feature_bitpos = dict()

#: Maps a morphophoneme (a string of phonemes) to a 6*16 bit integer
#: which represents the feature sets of its member phonemes.
mphon_to_binint_cache = dict()

#: Maps a 96 bit integer to the set of phonemes which were stored with
#: exactly that bit vector.
binint_to_mphon_set = dict()

#: Phonemes which lack some consonant feature but have all three vowel
#: features (filled in by read_alphabet).
vowel_set = set()

#: Phonemes which have all three consonant features (filled in by
#: read_alphabet).
consonant_set = set()

#: A cache for morphophoneme weights, keyed by the morphophoneme string.
mphon_weight_cache = dict()

#: Weights of phoneme sets, keyed by "".join(sorted(set(MPHON))).
phoneme_set_weight_cache = dict()

#: A list of ... FOR X IN ... definitions to be used in metric.py
for_definitions_lst = list()

#: A list of weighting exceptions to be used in metric.py
exception_lst = list()
def spaced_bin_int(intg):
    """binint to human readable string conversion.

    :param intg: A bit vector of six 16 bit fields stored as an integer
    :type intg: int
    :return: The integer as binary digits, zero padded to 96 digits and
        cut into six groups of 16 digits which are separated by spaces
    :rtype: str
    """
    bs = "{:096b}".format(intg)
    # one 16 digit slice per feature group, most significant group first
    return " ".join(bs[start:start + 16] for start in range(0, 96, 16))
def mphon_to_binint(mphon):
    """Converts a morphophoneme into a binary integer that represents it.

    The integer is the bitwise OR of the integers of the member
    phonemes.  Results are stored in ``mphon_to_binint_cache`` so that
    every morphophoneme is computed only once and so that other
    functions can look the result up in the cache afterwards.

    :param mphon: A string of phonemes and possibly Øs
    :type mphon: str
    :return: A binary integer which represents the morphophoneme
        (0 for an empty string)
    :rtype: int
    :raises SystemExit: if some phoneme of mphon is not in the cache,
        i.e. it was not defined in the alphabet
    """
    if cfg.verbosity >= 25:
        print("mphon_to_binint({})".format(mphon))
    if not mphon:
        return 0
    if mphon in mphon_to_binint_cache:
        return mphon_to_binint_cache[mphon]
    if len(mphon) == 1:
        # single phonemes are entered into the cache only by read_alphabet
        sys.exit("** '{}' NOT IN ALPHABET\n\n".format(mphon))
    old = mphon[:-1]
    new = mphon[-1:]
    if new not in mphon_to_binint_cache:
        sys.exit("** '{}' IN '{}' NOT IN ALPHABET\n\n".format(new, mphon))
    if old in mphon_to_binint_cache:
        old_binint = mphon_to_binint_cache[old]
    else:
        # the prefix is not known yet: compute (and cache) it recursively
        old_binint = mphon_to_binint(old)
    binint = old_binint | mphon_to_binint_cache[new]
    # always remember the result, also when the prefix had to be computed
    mphon_to_binint_cache[mphon] = binint
    return binint
def mphon_is_valid(mphon):
    """Tests whether a set of phonemes is possible.

    The morphophoneme is rejected only when all 96 bits of its bit
    vector are set, i.e. when each of the six feature groups is the
    universal set U.  A phoneme which lacks a feature gets the universal
    set 0xffff in that group, so combining a phoneme without consonant
    features with a phoneme without vowel features fills every group.

    :param mphon: A sequence of phonemes a.k.a. morphophoneme, e.g. 'ij'.
    :type mphon: str
    :return: True if the set consists either of vowels (and semivowels and Øs) \
    or of consonants (and semivowels and Øs), otherwise False.
    :rtype: boolean
    """
    all_universal = 0xffffffffffffffffffffffff      # (U,U,U,U,U,U)
    binint = mphon_to_binint(mphon)
    if cfg.verbosity >= 30:
        print("mphon_is_valid({}) == {}".format(mphon, spaced_bin_int(binint)) )
    return (binint & all_universal) != all_universal
def mphon_weight(mphon):
    """Returns the weight of a morphophoneme

    The weight is looked up in the caches if possible.  Otherwise it is
    computed separately from the three consonant feature sets and from
    the three vowel feature sets of the morphophoneme, and the smaller
    of the two sums is stored in both caches and returned.

    The weight tables ``weight_c1`` ... ``weight_v3`` are assigned by
    ``read_alphabet`` which therefore has to be called before this
    function.

    :param mphon: A sequence of phonemes a.k.a. morphophoneme, e.g. 'ij'
    :type mphon: str
    :return: The weight of mphon based on the phonological features of its
        members, or ``cfg.all_zero_weight`` if mphon consists of Øs only
    :rtype: number
    """
    if re.fullmatch(r"[Ø]+", mphon):
        return cfg.all_zero_weight
    # explicitly given or previously computed weights are keyed by the
    # sorted set of phonemes, so the order of the phonemes does not matter
    phon_set_str = "".join(sorted(set(mphon)))
    if phon_set_str in phoneme_set_weight_cache:
        return phoneme_set_weight_cache[phon_set_str]
    if mphon in mphon_weight_cache:
        return mphon_weight_cache[mphon]
    # mphon_to_binint consults its cache first and computes the bit vector
    # if needed, so this works even if mphon has not been converted before
    mphon_int = mphon_to_binint(mphon)
    if cfg.verbosity >= 25:
        print("\nmphon_to_binint_cache[{}] = {}".
              format(mphon, spaced_bin_int(mphon_int)))
    w_cons = 999999
    w_vow = 999999
    high = mphon_int >> 48           # extract the 48 cons feature bits
    low = mphon_int & 0xffffffffffff  # extract the 48 voc feature bits
    if cfg.verbosity >= 25:
        print("{:048b}".format(high), "high")
        print("{:048b}".format(low), "low")
    if high != 0xffffffffffff:
        # not (U,U,U): the set can be weighted as a set of consonants
        c1 = high >> 32              # place of articulation set
        c2 = (high >> 16) & 0xffff   # voicing set
        c3 = high & 0xffff           # manner of articulation set
        if cfg.verbosity >= 25:
            print("{:012b}, {:012b}, {:012b}".format(c1, c2, c3))
        w_cons = weight_c1[c1] + weight_c2[c2] + weight_c3[c3]
        if cfg.verbosity >= 25:
            print("\nmphon_weight info of a cons set:", hex(c1), weight_c1[c1],
                  hex(c2), weight_c2[c2], hex(c3), weight_c3[c3])
    if low != 0xffffffffffff:
        # not (U,U,U): the set can be weighted as a set of vowels
        v1 = low >> 32               # tongue height
        v2 = (low >> 16) & 0xffff    # backness
        v3 = low & 0xffff            # rounding
        if cfg.verbosity >= 25:
            print("{:012b}, {:012b}, {:012b}".format(v1, v2, v3))
        w_vow = weight_v1[v1] + weight_v2[v2] + weight_v3[v3]
        if cfg.verbosity >= 25:
            print("\nmphon_weight info of a vowel set:", hex(v1), weight_v1[v1],
                  hex(v2), weight_v2[v2], hex(v3), weight_v3[v3])
    w = min(w_cons, w_vow)
    if cfg.verbosity >= 25:
        print("\nmphon_int[{}] = {}".format(mphon, spaced_bin_int(mphon_int)))
        print("mphon_weight[{}] = {}".format(mphon, w))
    mphon_weight_cache[mphon] = w
    phoneme_set_weight_cache[phon_set_str] = w
    return w
def read_alphabet(file_name):
    """Reads phoneme features, feature subsets with weights, and other definitions

    :param file_name: Name of the file that contains the alphabet definition
    :type file_name: str
    :raises SystemExit: if a line of the file is not a valid definition,
        a phoneme is defined twice, a phoneme does not have exactly six
        features, or a feature subset mixes features of several groups

    Stores the computed sets and weights for feature subsets and other
    info in module variables for the use of other modules.

    Everything after a ``#`` on a line is ignored.  Each remaining
    non-empty line must be one of the following definitions:

    * ``P = F1, F2, F3, F4, F5, F6`` -- the six features of phoneme P
      (a feature may be left empty)
    * ``FEAT FEAT ... = N`` -- weight N for a subset of features
    * ``N = PHONS PHONS ...`` -- weight N for the listed phoneme sets
    * ``Zero += C V`` -- the costs of including Zero in a consonant
      feature set (C) and in a vowel feature set (V)
    * ``X:Y ... X:Y::N FOR VAR IN SET`` -- a FOR ... IN definition
    * ``X:Y ... X:Y::N`` -- a weighting exception
    """
    global weight_c1, weight_c2, weight_c3, weight_v1, weight_v2, weight_v3
    # The Zero costs are assigned below when the file has a "Zero +=" line.
    # Without this declaration they would be locals of this function: the
    # module level values would never change and reading them would fail
    # whenever the file has no such line.
    global cost_of_zero_c, cost_of_zero_v

    subset_lst = []           # list of pairs (set, weight)
    feature_group = {}        # the group to which this feature belongs
    # tuples of six features for a phoneme
    features_of_phoneme = {"Ø": ("Zero", "Zero", "Zero", "Zero", "Zero", "Zero")}

    # the alphabet contains non-ASCII symbols (e.g. Ø), so the encoding is
    # given explicitly instead of relying on the platform default
    with open(file_name, "r", encoding="utf-8") as f:
        for line_no, line_nl in enumerate(f, start=1):
            line = line_nl.split("#")[0].strip()
            if not line:
                continue
            mat_phon_feat = re.fullmatch(
                r":?(?P<symbol>(\w|')):? *= *(?P<feats>\w*( *, *\w*)+)",
                line)
            if mat_phon_feat:
                # it defines features of a phoneme
                symbol = mat_phon_feat.group("symbol")
                r_lst = [feat.strip()
                         for feat in mat_phon_feat.group("feats").split(",")]
                if len(r_lst) != 6:
                    msg = "** WRONG NUMBER OF FEATURES ON LINE {}:\n{}"
                    sys.exit(msg.format(line_no, line))
                if symbol in features_of_phoneme:
                    msg = "** {} ALREADY DEFINED. LINE {}:\n{}"
                    sys.exit(msg.format(symbol, line_no, line_nl))
                features_of_phoneme[symbol] = tuple(r_lst)
                # collect new feature names into the list of their group
                for ls, feat in zip(feature_lst_lst, r_lst):
                    if feat and feat not in ls:
                        ls.append(feat)
                continue
            mat_feat_set = re.fullmatch(
                r"(?P<elements>\w\w\w+( +\w\w\w+)+) *= *(?P<weight>[0-9]+)",
                line)
            if mat_feat_set:
                # it defines a subset of features and its weight
                l_lst = mat_feat_set.group("elements").split()
                subset_lst.append((set(l_lst),
                                   int(mat_feat_set.group("weight"))))
                continue
            mat_phon_set = re.fullmatch(
                r"(?P<weight>[0-9]+) *= *(?P<elements>(\w|')+( +(\w|')+)+)",
                line)
            if mat_phon_set:
                # it defines a weight for some explicitly listed phoneme sets
                phon_set_str_lst = mat_phon_set.group("elements").split()
                weight = int(mat_phon_set.group("weight"))
                for phon_str in phon_set_str_lst:
                    phon_set_str = "".join(sorted(set(phon_str)))
                    phoneme_set_weight_cache[phon_set_str] = weight
                continue
            mat_zero = re.fullmatch(
                r"Zero *[+]= *(?P<consw>[0-9]+) +(?P<voww>[0-9]+)",
                line)
            if mat_zero:
                # it defines the cost of including a Zero in feature sets
                cost_of_zero_c = int(mat_zero.group("consw"))
                cost_of_zero_v = int(mat_zero.group("voww"))
                continue
            mat_for_in = re.fullmatch(
                r"(?P<expr>(\w:\w +)*\w:\w::[0-9]+) +FOR +(?P<var>\w+) +IN +(?P<set>\w+)",
                line)
            if mat_for_in:
                # it defines a FOR IN definition
                for_definitions_lst.append((mat_for_in.group("expr"),
                                            mat_for_in.group("var"),
                                            mat_for_in.group("set")))
                continue
            mat_exception = re.fullmatch(
                r"(?P<expr>(\w:\w +)*\w:\w::[0-9]+)", line)
            if mat_exception:
                # it defines an exception list
                exception_lst.append(mat_exception.group("expr"))
                continue
            msg = "** INCORRECT ALPHABET DEFINITON LINE {}:\n    {}"
            sys.exit(msg.format(line_no, line_nl))

    feature_set_lst = [set(lst) for lst in feature_lst_lst]
    #
    # now the alphabet data has been read in and extracted
    #
    if cfg.verbosity >= 20:
        print("\ncost_of_zero_c:", cost_of_zero_c)
        print("\ncost_of_zero_v:", cost_of_zero_v)
        print("\nfeature_set_lst:", feature_set_lst)
        print("\nfeature_lst_lst:", feature_lst_lst)
        print("\nsubset_lst:", subset_lst)
        print("\nfeatures_of_phoneme:", features_of_phoneme)
        print("\nfor_definitions_lst:", for_definitions_lst)
        print("\nexception_lst:", exception_lst)
    #
    # find the groups and bit positions of individual features
    #
    for group_no, feature_lst in enumerate(feature_lst_lst):
        for bit_pos, feature in enumerate(feature_lst):
            feature_group[feature] = group_no
            feature_bitpos[feature] = bit_pos
    del feature_group["Zero"]   # feature Zero belongs all groups and
                                # needs special care
    if cfg.verbosity >= 20:
        print("\nfeature_group:", feature_group)
        print("\nfeature_bitpos:", feature_bitpos)
    #
    # An integer for each phoneme.  The integer represents six sets
    # (with one element in each set).  The sets are 16 bit long fields
    # of the binary representation of the integer, total of 96 bits.
    # Each set has a bit position reserved for each feature in the
    # respective group.  These integers or bit vectors can be combined
    # with each other using bit operations.
    #
    for phoneme, features in features_of_phoneme.items():
        intset = 0
        for feature in features:
            if feature:
                field = 1 << feature_bitpos[feature]
            else:
                field = 0xffff  # a missing feature: the universal set U
            intset = (intset << 16) | field
        mphon_to_binint_cache[phoneme] = intset
        binint_to_mphon_set.setdefault(intset, set()).add(phoneme)
    if cfg.verbosity >= 20:
        lst = [fon + "=" + spaced_bin_int(intg) for fon, intg in
               sorted(mphon_to_binint_cache.items())]
        s = "\n".join(lst)
        print("\nmphon_to_binint_cache")
        print(s)
        t = "\n".join([spaced_bin_int(intg) + " = " + str(fon_set)
                       for intg, fon_set in sorted(binint_to_mphon_set.items())])
        print("\nbinint_to_mphon_set")
        print(t)
    #
    # sets of vowels and consonants
    #
    # NOTE(review): because of the elif, a phoneme which has all six
    # features (e.g. Ø) is added to consonant_set only, whereas the module
    # level documentation says that both sets include the semivowels --
    # confirm which one is intended before changing this.
    for phoneme, features in features_of_phoneme.items():
        if features[0] and features[1] and features[2]:
            consonant_set.add(phoneme)
        elif features[3] and features[4] and features[5]:
            vowel_set.add(phoneme)
    #
    # convert the subsets into integers which represent bit vectors of the sets
    #
    subset_bin_lst = []
    for subset, weight in subset_lst:
        group_lst = list({feature_group[feature] for feature in subset})
        if len(group_lst) != 1:
            sys.exit("** FEATURES FROM SEVERAL GROUPS: {} = {}".format(subset, group_lst))
        group = group_lst[0]
        bin_set = 0
        for feat in subset:
            bin_set |= 1 << feature_bitpos[feat]
        if cfg.verbosity > 25:
            print("\nsubset, bin_set, weight, group:", subset, bin(bin_set), weight, group)
        subset_bin_lst.append((bin_set, weight, group, bin(bin_set)))
    if cfg.verbosity > 20:
        print("\nsubset_bin_lst:", subset_bin_lst)
    #
    # compute weights for all possible feature sets in each of the six groups
    #
    weight_dict_lst = []
    for group_no, feature_lst in enumerate(feature_lst_lst):
        # the first three groups are consonant features, the rest vowel features
        zero_cost = cost_of_zero_c if group_no < 3 else cost_of_zero_v
        weight_dict = {}
        weight_dict[0] = -1               # for the empty set
        weight_dict[1] = -1               # {"Zero"}
        weight_dict[0xffff] = 999999      # for the universal set U
        n_features = len(feature_lst)
        # even integers are the sets without Zero (bit 0); j+1 is the same
        # set with Zero added
        for j in range(2, 1 << n_features, 2):
            w = 100
            for subset_bin, weight, group, bin_str in subset_bin_lst:
                if cfg.verbosity >= 25:
                    print("\nsubset_bin, weight, group, bin_str, i:",
                          bin(j), weight, group, bin_str, group_no)
                # the members of j which are outside the subset; the result
                # is zero exactly when j is contained in the subset
                test = ~(subset_bin | ~j)
                if cfg.verbosity >= 25:
                    print(">>> test:", bin(test))
                if group == group_no and not test and weight < w:
                    w = weight
            weight_dict[j] = w
            weight_dict[j + 1] = w + zero_cost
        # sets of a single feature cost nothing (apart from a possible Zero);
        # this overrides the values computed in the loop above
        for j in range(1, n_features):
            weight_dict[1 << j] = 0
            weight_dict[(1 << j) + 1] = zero_cost
        if cfg.verbosity > 20:
            print("\nweight_dict[{}]:".format(group_no), weight_dict)
        weight_dict_lst.append(weight_dict)
    (weight_c1, weight_c2, weight_c3,
     weight_v1, weight_v2, weight_v3) = weight_dict_lst
    return
if __name__ == "__main__":
    # A small manual check which is run when the module is executed as a
    # script: read a test alphabet and print the weight of one morphophoneme.
    cfg.verbosity = 1
    read_alphabet("alphabet-test.text")
    # mphon_is_valid() converts "ei" into its bit vector via
    # mphon_to_binint(), which also stores it in mphon_to_binint_cache
    mphon_is_valid("ei")
    print(mphon_weight("ei"))