made it work ... ~ish?

master
Dario Ernst 14 years ago
parent 2dc4fe66c2
commit f48fe5bfc1

@ -3,8 +3,8 @@ import sys
import os
import re
import copy
import pprint
from collections import OrderedDict
from json import JSONEncoder
"""
Library for PinYin-izing either a file or a block of text. Sadly, python > 2.7 is required...
@ -22,14 +22,9 @@ TRANS = 4
TRANSL_HANZI = 5
def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=FALLBACK, original = True):
def pinyinize(hanzitf, sqlitef, simplified=PRIMARY, traditional=FALLBACK, original = True, json = False):
"""
Will pinyin-ize either a string or a file (open(foo,"X")) given as "hanzitf".
Pinyinization can be controlled by the mode parameter. The following
are possible:
With the defined return-value semantics.
The pinyinization can be controlled via the simplified, traditional
and original parameters. simplified and traditional can be set on
@ -42,6 +37,18 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
something as simplified, then traditional is attempted. If original
is set to True instead of False, then the original input hanzi
is copied to output.
The json parameter makes the function return its data JSON-formatted for use in web systems.
The return value is an OrderedDict: the recognized input sequences
are the keys, and the values are lists of lists. The structure is:
{'input_str[n:m]': [
[ 'pinyin', 'translation' ],
[ 'pinyin', 'translation' ],
...
[ ' ', '' ] # space at the end so you can nicely #print these
],
...
"""
if type(hanzitf)==file:
@ -50,13 +57,13 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
elif type(hanzitf)==str:
hanzi = hanzitf
else:
print "hanzitf was neither text nor file, was: %s"%type(hanzitf)
#print "hanzitf was neither text nor file, was: %s"%type(hanzitf)
return
hanzi.strip().lower()
if not os.path.exists(sqlitef):
print "sqlite database file not found"
#print "sqlite database file not found"
return None
slconn = sqlite3.connect(sqlitef)
@ -65,14 +72,14 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
finres = OrderedDict()
while hzwork:
print "\nhzwork still %s"%hzwork
#print "\nhzwork still %s"%hzwork
tmpstr = ""
lastres = ""
for c in copy.deepcopy(hzwork):
tmpstr += hzwork[0]
print "checking for %s (%s)"%(tmpstr,tmpstr.__repr__())
#print "checking for %s (%s)"%(tmpstr,tmpstr.__repr__())
sqlstr_s = 'SELECT pinyin, dict FROM entries WHERE simplified="%s";'%tmpstr
sqlstr_t = 'SELECT pinyin, dict FROM entries WHERE traditional="%s";'%tmpstr
res_s = None
@ -93,35 +100,30 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
if not res_t and simplified == FALLBACK:
res = res_s
print "res are: %s"%res
#print "res are: %s"%res
# now for the result work...
if res: # sequence of chars not found, using last result
print res[0]
lastres = [res[0]]
#print res[0]
lastres = [ list(res[0]) ]
hzwork.pop(0)
print "found and consumed %s"%res[0][0]
#print "found and consumed %s"%res[0][0]
elif not res and not lastres: # first char not found, using fallback barf
if original:
lastres = [('%s', 'no translation / string found, using input')]%tmpstr
lastres = [['%s', 'no translation / string found, using input']]%tmpstr
hzwork.pop(0)
print "%s not found, adding raw and breaking"%tmpstr
#print "%s not found, adding raw and breaking"%tmpstr
break
else:
print "got empty result, breaking"
lastres.append( (" ","") )
#print "got empty result, breaking"
lastres.append( [" ",""] )
break
finres[tmpstr] = lastres
print "appending and deleting lastres %s, finres now: %s"%(lastres,finres)
pprint.pprint(finres)
print "\n\n\n"
for k,v in finres.iteritems():
print k
for i in v:
print v
print v[0]
print v[1]
print ""
return finres
#print "appending and deleting lastres %s, finres now: %s"%(lastres,finres)
if not json:
return finres
else:
enc = JSONEncoder()
return enc.encode( finres )

Loading…
Cancel
Save