working commit

master
Dario Ernst 14 years ago
parent b36d6bd288
commit 2dc4fe66c2

@ -3,9 +3,11 @@ import sys
import os import os
import re import re
import copy import copy
import pprint
from collections import OrderedDict
""" """
Library for PinYin-izing either a file or a block of text Library for PinYin-izing either a file or a block of text. Sadly, Python >= 2.7 is required...
""" """
NOT = 0 NOT = 0
@ -25,12 +27,8 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
Will pinyin-ize either a string or a file (open(foo,"X")) given as "hanzitf". Will pinyin-ize either a string or a file (open(foo,"X")) given as "hanzitf".
Pinyinization can be controlled by the mode parameter. The following Pinyinization can be controlled by the mode parameter. The following
are possible: are possible:
PINYIN:
PINYIN_TRANSL
PINYIN_HANZI
PINYIN_TRANSL_HANZI
TRANSL
TRANSL_HANZI
With the defined return-value semantics. With the defined return-value semantics.
The pinyinization can be controlled via the simplified, traditional The pinyinization can be controlled via the simplified, traditional
@ -46,13 +44,13 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
is copied to output. is copied to output.
""" """
if(type(hanzitf)=="file"): if type(hanzitf)==file:
hanzitf.seek(0) hanzitf.seek(0)
hanzi = hanzitf.read() hanzi = hanzitf.read()
else if(type(hanzitf)=="str"): elif type(hanzitf)==str:
hanzi = hanzitf hanzi = hanzitf
else: else:
print "hanzitf was neither text nor file" print "hanzitf was neither text nor file, was: %s"%type(hanzitf)
return return
hanzi.strip().lower() hanzi.strip().lower()
@ -64,7 +62,8 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
slconn = sqlite3.connect(sqlitef) slconn = sqlite3.connect(sqlitef)
hzwork = list(hanzi.decode("UTF-8")) hzwork = list(hanzi.decode("UTF-8"))
finres = "" finres = OrderedDict()
while hzwork: while hzwork:
print "\nhzwork still %s"%hzwork print "\nhzwork still %s"%hzwork
tmpstr = "" tmpstr = ""
@ -74,46 +73,55 @@ def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=F
tmpstr += hzwork[0] tmpstr += hzwork[0]
print "checking for %s (%s)"%(tmpstr,tmpstr.__repr__()) print "checking for %s (%s)"%(tmpstr,tmpstr.__repr__())
sqlstr_s = 'SELECT pinyin FROM entries WHERE simplified="%s";'%(tmpstr,tmpstr) sqlstr_s = 'SELECT pinyin, dict FROM entries WHERE simplified="%s";'%tmpstr
sqlstr_t = 'SELECT pinyin FROM entries WHERE traditional="%s";'%(tmpstr,tmpstr) sqlstr_t = 'SELECT pinyin, dict FROM entries WHERE traditional="%s";'%tmpstr
res_s = None res_s = None
res_t = None res_t = None
res = None res = None
if simplified =! NOT: if simplified != NOT:
res_s = slconn.execute(sqlstr_s).fetchall() res_s = slconn.execute(sqlstr_s).fetchall()
if traditional =! NOT: if traditional != NOT:
res_t = slconn.execute(sqlstr_t).fetchall() res_t = slconn.execute(sqlstr_t).fetchall()
if simplified == PRIMARY: if simplified == PRIMARY:
res = res_s res = res_s
if not res_s && traditional == FALLBACK: if not res_s and traditional == FALLBACK:
res = res_t res = res_t
else if traditional == PRIMARY: elif traditional == PRIMARY:
res = res_t res = res_t
if not res_t && simplified == FALLBACK: if not res_t and simplified == FALLBACK:
res = res_s res = res_s
print "res are: %s"%res print "res are: %s"%res
# now for the result work... # now for the result work...
if res: # sequence of chars not found, using last result if res: # sequence of chars not found, using last result
lastres = res[0][0] print res[0]
lastres = [res[0]]
hzwork.pop(0) hzwork.pop(0)
print "found and consumed %s"%res[0][0] print "found and consumed %s"%res[0][0]
elif not res and not lastres: # first char not found, using fallback barf elif not res and not lastres: # first char not found, using fallback barf
if original: if original:
lastres = '%s'%tmpstr lastres = [('%s', 'no translation / string found, using input')]%tmpstr
hzwork.pop(0) hzwork.pop(0)
print "%s not found, adding raw and breaking"%tmpstr print "%s not found, adding raw and breaking"%tmpstr
break break
else: else:
print "got empty result, breaking" print "got empty result, breaking"
lastres += " " lastres.append( (" ","") )
break break
finres += lastres finres[tmpstr] = lastres
print "appending and deleting lastres %s, finres now: %s"%(lastres,finres) print "appending and deleting lastres %s, finres now: %s"%(lastres,finres)
print '\nfinished with:\n """\n%s\n"""\n'%finres pprint.pprint(finres)
print "\n\n\n"
for k,v in finres.iteritems():
print k
for i in v:
print v
print v[0]
print v[1]
print ""
return finres return finres

Loading…
Cancel
Save