|
|
|
|
@ -3,8 +3,8 @@ import sys
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import copy
|
|
|
|
|
import pprint
|
|
|
|
|
from collections import OrderedDict
|
|
|
|
|
from json import JSONEncoder
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Library for PinYin-izing either a file or a block of text. Sadly, python > 2.7 is required...
|
|
|
|
|
@ -22,14 +22,9 @@ TRANS = 4
|
|
|
|
|
TRANSL_HANZI = 5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=FALLBACK, original = True):
|
|
|
|
|
def pinyinize(hanzitf, sqlitef, simplified=PRIMARY, traditional=FALLBACK, original = True, json = False):
|
|
|
|
|
"""
|
|
|
|
|
Will pinyin-ize either a string or a file (open(foo,"X")) given as "hanzitf".
|
|
|
|
|
Pinyinization can be controlled by the mode parameter. The following
|
|
|
|
|
are possible:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
With the defined return-value semantics.
|
|
|
|
|
|
|
|
|
|
The pinyinization can be controlled via the simplified, traditional
|
|
|
|
|
and original parameters. simplified and traditional can be set on
|
|
|
|
|
@ -42,6 +37,18 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
|
|
|
|
|
something as simplified, then traditional is attempted. If original
|
|
|
|
|
is set to True instead of False, then the original input hanzi
|
|
|
|
|
is copied to output.
|
|
|
|
|
|
|
|
|
|
The json parameter makes the function return its data as a JSON-encoded string for use in web systems
|
|
|
|
|
|
|
|
|
|
The return value is an OrderedDict, the recognized input sequences
|
|
|
|
|
are the keys, the values are lists of lists. The Structure is:
|
|
|
|
|
{'input_str[n:m]': [
|
|
|
|
|
[ 'pinyin', 'translation' ],
|
|
|
|
|
[ 'pinyin', 'translation' ],
|
|
|
|
|
...
|
|
|
|
|
[ ' ', '' ] # space at the end so you can nicely print these
|
|
|
|
|
],
|
|
|
|
|
...
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if type(hanzitf)==file:
|
|
|
|
|
@ -50,13 +57,13 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
|
|
|
|
|
elif type(hanzitf)==str:
|
|
|
|
|
hanzi = hanzitf
|
|
|
|
|
else:
|
|
|
|
|
print "hanzitf was neither text nor file, was: %s"%type(hanzitf)
|
|
|
|
|
#print "hanzitf was neither text nor file, was: %s"%type(hanzitf)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
hanzi.strip().lower()
|
|
|
|
|
|
|
|
|
|
if not os.path.exists(sqlitef):
|
|
|
|
|
print "sqlite database file not found"
|
|
|
|
|
#print "sqlite database file not found"
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
slconn = sqlite3.connect(sqlitef)
|
|
|
|
|
@ -65,14 +72,14 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
|
|
|
|
|
finres = OrderedDict()
|
|
|
|
|
|
|
|
|
|
while hzwork:
|
|
|
|
|
print "\nhzwork still %s"%hzwork
|
|
|
|
|
#print "\nhzwork still %s"%hzwork
|
|
|
|
|
tmpstr = ""
|
|
|
|
|
lastres = ""
|
|
|
|
|
|
|
|
|
|
for c in copy.deepcopy(hzwork):
|
|
|
|
|
tmpstr += hzwork[0]
|
|
|
|
|
|
|
|
|
|
print "checking for %s (%s)"%(tmpstr,tmpstr.__repr__())
|
|
|
|
|
#print "checking for %s (%s)"%(tmpstr,tmpstr.__repr__())
|
|
|
|
|
sqlstr_s = 'SELECT pinyin, dict FROM entries WHERE simplified="%s";'%tmpstr
|
|
|
|
|
sqlstr_t = 'SELECT pinyin, dict FROM entries WHERE traditional="%s";'%tmpstr
|
|
|
|
|
res_s = None
|
|
|
|
|
@ -93,35 +100,30 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
|
|
|
|
|
if not res_t and simplified == FALLBACK:
|
|
|
|
|
res = res_s
|
|
|
|
|
|
|
|
|
|
print "res are: %s"%res
|
|
|
|
|
#print "res are: %s"%res
|
|
|
|
|
|
|
|
|
|
# now for the result work...
|
|
|
|
|
if res: # sequence of chars not found, using last result
|
|
|
|
|
print res[0]
|
|
|
|
|
lastres = [res[0]]
|
|
|
|
|
#print res[0]
|
|
|
|
|
lastres = [ list(res[0]) ]
|
|
|
|
|
hzwork.pop(0)
|
|
|
|
|
print "found and consumed %s"%res[0][0]
|
|
|
|
|
#print "found and consumed %s"%res[0][0]
|
|
|
|
|
elif not res and not lastres: # first char not found, using fallback barf
|
|
|
|
|
if original:
|
|
|
|
|
lastres = [('%s', 'no translation / string found, using input')]%tmpstr
|
|
|
|
|
lastres = [['%s', 'no translation / string found, using input']]%tmpstr
|
|
|
|
|
hzwork.pop(0)
|
|
|
|
|
print "%s not found, adding raw and breaking"%tmpstr
|
|
|
|
|
#print "%s not found, adding raw and breaking"%tmpstr
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
print "got empty result, breaking"
|
|
|
|
|
lastres.append( (" ","") )
|
|
|
|
|
#print "got empty result, breaking"
|
|
|
|
|
lastres.append( [" ",""] )
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
finres[tmpstr] = lastres
|
|
|
|
|
print "appending and deleting lastres %s, finres now: %s"%(lastres,finres)
|
|
|
|
|
|
|
|
|
|
pprint.pprint(finres)
|
|
|
|
|
print "\n\n\n"
|
|
|
|
|
for k,v in finres.iteritems():
|
|
|
|
|
print k
|
|
|
|
|
for i in v:
|
|
|
|
|
print v
|
|
|
|
|
print v[0]
|
|
|
|
|
print v[1]
|
|
|
|
|
print ""
|
|
|
|
|
#print "appending and deleting lastres %s, finres now: %s"%(lastres,finres)
|
|
|
|
|
|
|
|
|
|
if not json:
|
|
|
|
|
return finres
|
|
|
|
|
else:
|
|
|
|
|
enc = JSONEncoder()
|
|
|
|
|
return enc.encode( finres )
|
|
|
|
|
|