You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

130 lines
4.2 KiB

import sqlite3
import sys
import os
import re
import copy
from collections import OrderedDict
from json import JSONEncoder
"""
Library for pinyin-izing either a file or a block of text. Sadly, Python > 2.7 is required...
"""
# Roles a script (simplified / traditional) can play during a lookup.
NOT = 0        # never consult this script
PRIMARY = 1    # consult this script first
FALLBACK = 2   # consult this script only when the primary lookup finds nothing

# Output-format selectors.
# NOTE(review): pinyinize() does not read these; presumably they are consumed
# by callers that render its result -- confirm against the rest of the project.
PINYIN = 0
PINYIN_TRANSL = 1
PINYIN_HANZI = 2
PINYIN_TRANSL_HANZI = 3
TRANS = 4
TRANSL_HANZI = 5

# Fixed, parameterized lookup statements. The searched text is always bound
# through the "?" placeholder so quotes in the input can neither break the
# statement nor be interpreted by SQLite as a column name.
_LOOKUP_SQL = {
    "simplified": "SELECT pinyin, dict FROM entries WHERE simplified=?;",
    "traditional": "SELECT pinyin, dict FROM entries WHERE traditional=?;",
}


def _read_hanzi(hanzitf):
    """Return the text of *hanzitf* as a unicode string, or None if unsupported."""
    if hasattr(hanzitf, "read"):
        # File-like input: always read it from the beginning.
        hanzitf.seek(0)
        hanzitf = hanzitf.read()
    if isinstance(hanzitf, bytes):
        # Byte strings (a plain ``str`` on Python 2) are taken to be UTF-8.
        return hanzitf.decode("UTF-8")
    if isinstance(hanzitf, type(u"")):
        return hanzitf
    return None


def _lookup_columns(simplified, traditional):
    """Return the ``entries`` columns to query, in order, for the given roles."""
    if simplified == PRIMARY:
        if traditional == FALLBACK:
            return ("simplified", "traditional")
        return ("simplified",)
    if traditional == PRIMARY:
        if simplified == FALLBACK:
            return ("traditional", "simplified")
        return ("traditional",)
    # Without a PRIMARY script nothing is looked up at all.
    return ()


def _lookup_rows(conn, columns, text):
    """Return the (pinyin, dict) rows of the first column that knows *text*."""
    for column in columns:
        rows = conn.execute(_LOOKUP_SQL[column], (text,)).fetchall()
        if rows:
            return rows
    return []


def pinyinize(hanzitf, sqlitef, simplified=PRIMARY, traditional=FALLBACK, original=True, json=False):
    """
    Pinyin-ize a string or an open file given as "hanzitf".

    The text is scanned left to right. At each position the candidate
    sequence is extended one character at a time for as long as the
    dictionary database "sqlitef" (table ``entries`` with the columns
    ``simplified``, ``traditional``, ``pinyin`` and ``dict``) has a row for
    it; the longest sequence reached this way is recorded and the scan
    continues right behind it. Extension stops at the first candidate that
    is not in the dictionary, so a longer word is only found if each of its
    shorter prefixes is an entry too.

    Parameters:
      hanzitf      text (unicode or UTF-8 encoded bytes) or a file-like
                   object, which is rewound and read completely. The text is
                   used as-is; it is neither stripped nor lower-cased.
      sqlitef      path of the SQLite dictionary file.
      simplified,
      traditional  NOT, PRIMARY or FALLBACK. The PRIMARY script is queried
                   first; the other script is queried only if it is set to
                   FALLBACK and the primary query found nothing. If both are
                   PRIMARY, simplified wins. If neither is PRIMARY, nothing
                   is looked up.
      original     if True, a character that cannot be matched is copied to
                   the result; if False it is left out.
      json         if True, return the result as a JSON encoded string
                   instead of an OrderedDict.

    Returns None if "hanzitf" is neither text nor file-like, or if
    "sqlitef" does not exist. Otherwise returns an OrderedDict (or its JSON
    encoding) whose keys are the recognized input sequences in order of
    first appearance:

      {'input_str[n:m]': [
          [ 'pinyin', 'translation' ],  # first matching dictionary row only
          [ ' ', '' ]                   # only if more input follows the match
        ],
        'x': [ [ 'x', 'no translation / string found, using input' ] ],
        ...
      }

    Because the result is keyed by the sequence itself, a sequence that
    occurs more than once appears only once, holding the value of its last
    occurrence.
    """
    hanzi = _read_hanzi(hanzitf)
    if hanzi is None:
        return None
    if not os.path.exists(sqlitef):
        return None
    # NOTE: the original code called hanzi.strip().lower() here and threw the
    # result away. The no-op is dropped rather than "fixed", because
    # normalizing would change which dictionary entries match.

    columns = _lookup_columns(simplified, traditional)
    finres = OrderedDict()
    slconn = sqlite3.connect(sqlitef)
    try:
        pos = 0
        total = len(hanzi)
        while pos < total:
            entry = None
            matched = 0  # length of the longest sequence found at pos
            end = pos + 1
            while end <= total:
                rows = _lookup_rows(slconn, columns, hanzi[pos:end])
                if not rows:
                    break
                entry = [list(rows[0])]
                matched = end - pos
                end += 1
            if entry is None:
                # Not even the single character is known.
                if original:
                    char = hanzi[pos]
                    finres[char] = [[char, 'no translation / string found, using input']]
                pos += 1
                continue
            if pos + matched < total:
                # The match was ended by a following character, not by the
                # end of the input: add the separator entry.
                entry.append([" ", ""])
            # Key by what was actually matched, without the look-ahead
            # character that ended the match.
            finres[hanzi[pos:pos + matched]] = entry
            pos += matched
    finally:
        slconn.close()

    if not json:
        return finres
    return JSONEncoder().encode(finres)