Skip to content

Commit

Permalink
Re-implement python client (google#65)
Browse files Browse the repository at this point in the history
* Reimplement ZawgyiDetector without numpy
* Require Python 3.8
  • Loading branch information
blackblitz committed Dec 29, 2020
1 parent b6a4762 commit 92a7052
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 182 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ build/
# checking in composer.lock causes failures due to incompatible PHP versions
composer.lock
composer.phar
# python
__pycache__/
venv/
*.egg-info/
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ matrix:

# Python Client
- language: python
python: 3.7
python: 3.8
before_script:
- cd clients/python
- python setup.py install
Expand Down
18 changes: 8 additions & 10 deletions clients/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@
author='William (Wai Yan) Zhu',
author_email='williamzhu345@gmail.com',
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Topic :: Text Processing'
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.8',
'Topic :: Text Processing'
],
license='Apache License, Version 2.0',
description='Tools for processing font encodings used in Myanmar',
Expand All @@ -29,6 +28,5 @@
package_dir={'': 'src'},
include_package_data=True,
package_data={'myanmartools': ['resources/*']},
install_requires=['numpy>=1.18'],
python_requires='>=3.7'
python_requires='>=3.8'
)
30 changes: 3 additions & 27 deletions clients/python/src/myanmartools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,5 @@
'''
Myanmar Tools
=============
"""Tools for processing font encodings used in Myanmar."""

Myanmar Tools implements tools for processing font encodings used in Myanmar.
It currently supports Zawgyi detection.
To detect Zawgyi, create an instance of ZawgyiDetector, and call
``get_zawgyi_probability`` with a string::
from myanmartools import ZawgyiDetector
detector = ZawgyiDetector()
score = detector.get_zawgyi_probability('မ္း')
# score is now 0.999772 (very likely Zawgyi)
For Zawgyi-to-Unicode conversion, you can use the ICU library. Install it
using ``pip install PyICU``.
To convert Zawgyi to Unicode, create an instance of ICU Transliterator with
the transform ID "Zawgyi-my", and call :code:`transliterate` with a string::
from icu import Transliterator
converter = Transliterator.createInstance('Zawgyi-my')
output = converter.transliterate('မ္း')
# output is now 'မ်း'
'''
from .zawgyi_detector import ZawgyiDetector

__all__ = ['ZawgyiDetector']
169 changes: 91 additions & 78 deletions clients/python/src/myanmartools/_params.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,93 @@
from importlib.resources import open_binary
import numpy as np
"""Helper functions for reading parameters of the model file."""

from array import array
from itertools import chain, repeat
import struct
from typing import BinaryIO, cast, Final, Iterator, Tuple

# Code-point ranges of the characters that check_signature() joins into the
# model's character string, in the order STD, AFT, EXA, EXB, SPC.
# NOTE: that order is not ascending by code point: EXA starts at 0xAA60,
# EXB at 0xA9E0 and SPC at 0x2000.

# Myanmar Unicode characters before digits
STD: Final = range(0x1000, 0x103F + 1)
# Myanmar Unicode characters after digits
AFT: Final = range(0x104A, 0x109F + 1)
# Myanmar Extended-A Unicode characters
EXA: Final = range(0xAA60, 0xAA7F + 1)
# Myanmar Extended-B Unicode characters
EXB: Final = range(0xA9E0, 0xA9FF + 1)
# Unicode space characters
SPC: Final = range(0x2000, 0x200B + 1)


def check_signature(stream: BinaryIO) -> str:
    """
    Validate the model file header and return the model's characters.

    Reads, from the current position of ``stream``: the ``UZMODEL `` tag,
    the model version, the ssv field (version 2 only) and the ``BMARKOV ``
    tag with its version.

    The returned string joins the ranges STD, AFT, EXA and EXB, followed
    by SPC when ssv is 0. The ranges are joined in that fixed order, which
    is not ascending code-point order.

    Raises IOError for an unexpected tag or version, and ValueError for an
    unsupported ssv value.
    """
    if stream.read(8) != b'UZMODEL ':
        raise IOError('invalid uzmodel_tag')

    model_version = read_int(stream)
    if model_version not in (1, 2):
        raise IOError('invalid uzmodel_version')
    # only version 2 files store an ssv field; version 1 implies ssv 0
    ssv = read_int(stream) if model_version == 2 else 0

    if ssv not in (0, 1):
        raise ValueError('invalid ssv')
    # ssv 0 additionally covers the Unicode space characters
    blocks = (STD, AFT, EXA, EXB, SPC) if ssv == 0 else (STD, AFT, EXA, EXB)
    chars = ''.join(
        chr(code_point) for block in blocks for code_point in block
    )

    if stream.read(8) != b'BMARKOV ':
        raise IOError('invalid bmarkov_tag')
    if read_int(stream) != 0:
        raise IOError('invalid bmarkov_version')

    return chars


def read_params(stream: BinaryIO) -> 'array[float]':
    """
    Read the square parameter table that follows the file header.

    The table is returned flattened: entry (i, j) is stored at index
    ``i * size + j``. For each row the stream holds a count; a zero count
    leaves the row at 0, otherwise a row-wide default value is followed by
    ``count`` (column, value) pairs that override it.
    """
    size = read_short(stream)
    params = array('f', repeat(0, size * size))
    for row in range(size):
        count = read_short(stream)
        if count == 0:
            # nothing stored for this row; keep the zeros
            continue
        start = row * size
        # fill the whole row with its default value
        default = read_float(stream)
        params[start:start + size] = array('f', repeat(default, size))
        # then apply the explicitly stored entries
        for column, value in read_pairs(stream, count):
            params[start + column] = value
    return params


def read_short(stream: BinaryIO) -> int:
    """Decode the next 2 bytes of stream as a signed big-endian integer."""
    raw = stream.read(2)
    (value,) = struct.unpack('>h', raw)
    return cast(int, value)


def read_int(stream: BinaryIO) -> int:
    """Decode the next 4 bytes of stream as a signed big-endian integer."""
    raw = stream.read(4)
    (value,) = struct.unpack('>i', raw)
    return cast(int, value)


def read_float(stream: BinaryIO) -> float:
    """Decode the next 4 bytes of stream as a big-endian float."""
    raw = stream.read(4)
    (value,) = struct.unpack('>f', raw)
    return cast(float, value)


def get_mapping():
    '''
    Builds the lookup from Myanmar Unicode characters to their indices in
    the parameter array.

    Indices start at 1 and follow the order in which the character ranges
    are listed below.

    Returns
    -------
    dict
        A mapping from Myanmar Unicode characters to indices.
    '''
    # inclusive (first, last) code points of each range, in index order
    bounds = (
        (0x1000, 0x103F),
        (0x104A, 0x109F),
        (0xAA60, 0xAA7F),
        (0xA9E0, 0xA9FF),
        (0x2000, 0x200B),
    )

    mapping = {}
    for first, last in bounds:
        for code_point in range(first, last + 1):
            # the ranges do not overlap, so the size is the next free index
            mapping[chr(code_point)] = len(mapping) + 1
    return mapping

def load_params():
    '''
    Loads parameters as a 2d array, which are log likelihood ratios of
    Unicode to Zawgyi.

    The values are read from the ``zawgyiUnicodeModel.dat`` resource of the
    ``myanmartools.resources`` package.

    Returns
    -------
    numpy.ndarray
        Parameters as a 2d array.

    Raises
    ------
    IOError
        If a tag or version in the file header has an unexpected value.
    '''
    def read_char_array(f, size):
        # read `size` bytes and decode them as UTF-8 text
        return struct.unpack(f'{size}s', f.read(size))[0].decode('utf-8')

    def read_float(f):
        # 4-byte big-endian float
        return struct.unpack('>f', f.read(4))[0]

    def read_int(f):
        # 4-byte big-endian signed integer
        return struct.unpack('>i', f.read(4))[0]

    def read_short(f):
        # 2-byte big-endian signed integer
        return struct.unpack('>h', f.read(2))[0]

    with open_binary('myanmartools.resources', 'zawgyiUnicodeModel.dat') as f:
        # check signature
        uzmodel_tag = read_char_array(f, 8)
        if uzmodel_tag != 'UZMODEL ':
            raise IOError('incorrect uzmodel_tag')
        uzmodel_version = read_int(f)
        if uzmodel_version == 1:
            ssv = 0
        elif uzmodel_version == 2:
            # the ssv field is only present in version 2 files; it is read
            # to advance the stream but not used further in this function
            ssv = read_int(f)
        else:
            raise IOError('incorrect uzmodel_version')
        bmarkov_tag = read_char_array(f, 8)
        if bmarkov_tag != 'BMARKOV ':
            raise IOError('incorrect bmarkov_tag')
        bmarkov_version = read_int(f)
        if bmarkov_version != 0:
            raise IOError('incorrect bmarkov_version')

        # read params
        size = read_short(f)
        # np.empty leaves the contents uninitialised, so every row is
        # assigned explicitly below
        params = np.empty((size, size))
        for row in range(size):
            count = read_short(f)
            if count != 0:
                # first float is the default for the whole row
                params[row] = read_float(f)
                # followed by `count` (column, value) overrides
                for i in range(count):
                    col = read_short(f)
                    params[row, col] = read_float(f)
            else:
                params[row] = 0

    return params
def read_pairs(stream: BinaryIO, n: int) -> Iterator[Tuple[int, float]]:
    """
    Read n consecutive (short, float) records in big-endian order.

    All ``6 * n`` bytes are consumed from the stream immediately; the
    records themselves are decoded lazily as the result is iterated.
    """
    record = struct.Struct('>hf')
    raw = stream.read(record.size * n)
    return cast(Iterator[Tuple[int, float]], record.iter_unpack(raw))
9 changes: 9 additions & 0 deletions clients/python/src/myanmartools/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Resources for myanmartools.
This package contains:
- `zawgyiUnicodeModel.dat` - parameters of Zawgyi detector
- `compatability.tsv` - Zawgyi probabilities and input strings
resulting from Java implementation of Zawgyi detector
"""
115 changes: 67 additions & 48 deletions clients/python/src/myanmartools/zawgyi_detector.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,69 @@
import numpy as np
from ._params import get_mapping, load_params
"""Zawgyi detector module."""

from bisect import bisect_left
from importlib.resources import open_binary
from itertools import chain, filterfalse
from math import exp, inf, isnan, nan
from typing import Iterator, Optional

from ._params import check_signature, read_params


class ZawgyiDetector:
'''
An estimator that predicts Zawgyi using two Markov chains, one
for Unicode text and the other for Zawgyi text.
Attributes
----------
mapping : dict
A mapping of Myanmar Unicode characters to the corresponding indices
in the parameter array.
params : numpy.ndarray
A parameter array containing log likelihood ratios of
Unicode to Zawgyi.
'''
def __init__(self):
self.mapping = get_mapping()
self.params = load_params()

def get_zawgyi_probability(self, string):
'''
Computes Zawgyi probability.
Parameters
----------
string : str
String to predict Zawgyi on.
Returns
-------
float
Zawgyi probability between 0 and 1, or negative infinity
if there is no Myanmar Unicode character.
'''
indices = [self.mapping.get(char, 0) for char in string]
# include starting and ending state probabilities
previous = np.array([0] + indices)
current = np.array(indices + [0])
# ignore 0-to-0 transitions
mask = np.logical_or(previous != 0, current != 0)
# return negative inifinity if there are only 0-to-0 transitions,
# which happens when there is no Myanmar Unicode character
if not mask.any():
return -np.inf
# Pz/(Pu+Pz) = exp(logPz)/(exp(logPu)+exp(logPz))
# = 1/(1+exp(logPu-logPz))
return 1.0 / (1.0 +
np.exp(self.params[previous[mask], current[mask]].sum()))
"""A detector of Myanmar Zawgyi encoding."""

__slots__ = ['_chars', '_params']

def __init__(self) -> None:
"""Intialize the detector."""
with open_binary(
'myanmartools.resources',
'zawgyiUnicodeModel.dat'
) as stream:
self._chars = check_signature(stream)
self._params = read_params(stream)
# the 0 node is for foreign characters so mark as nan
self._params[0] = nan

def _state(self, char: Optional[str]) -> int:
"""
Return the state of a character.
Return 0 for foreign characters.
"""
if char is None:
return 0
i = bisect_left(self._chars, char)
if i < len(self._chars) and self._chars[i] == char:
return i + 1
return 0

def _llrs(self, string: str) -> Iterator[float]:
"""
Return the log-likelihood ratios of consecutive character pairs.
The first and last characters are paired with None on the left
and right respectively.
"""
size = len(self._chars) + 1
return map(
lambda i, j: self._params[self._state(i) * size + self._state(j)],
chain((None,), string),
chain(string, (None,))
)

def get_zawgyi_probability(self, string: str) -> float:
"""
Return the Zawgyi probability of a string.
Return negative infinity if there are only foreign characters.
"""
if all(map(isnan, self._llrs(string))):
return -inf
total = sum(filterfalse(isnan, self._llrs(string)))
# Pz/(Pu+Pz) = exp(lnPz)/(exp(lnPu)+exp(lnPz)) = 1/(1+exp(lnPu-lnPz))
# prevent overflow when positive
if total >= 0:
z = exp(-total)
return z / (z + 1)
return 1 / (1 + exp(total))
1 change: 1 addition & 0 deletions clients/python/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for myanmartools."""
Loading

0 comments on commit 92a7052

Please sign in to comment.