started libbing and enhancing pynize

master
Dario Ernst 14 years ago
parent 142aa65c21
commit b36d6bd288

@ -1,66 +0,0 @@
import sqlite3
import sys
import os
import re
import copy
# Command-line pinyinizer.
#
# Usage: <script> [hanzi-text or file] [sqlite-file.sqlite]
#
# The first argument is either a path to a UTF-8 text file or the hanzi text
# itself; the second is an SQLite database with an `entries` table that has
# `simplified`, `traditional` and `pinyin` columns.  The text is converted by
# greedy matching: starting at the current position the candidate word is
# extended one character at a time for as long as the database knows it, and
# the pinyin of the last successful match is emitted.
if len(sys.argv) != 3:
    print("Usage: cedict2sqlite [hanzi-text or file] [sqlite-file.sqlite]")
    # The original bare `exit` was a no-op expression, so the script carried
    # on (and usually died with an IndexError).  Actually stop here.
    sys.exit(1)

hanzitf = sys.argv[1]
sqlitef = sys.argv[2]

if os.path.exists(hanzitf):
    # Read raw bytes and decode once below; `with` closes the handle.
    with open(hanzitf, "rb") as hanzifile:
        hanzi = hanzifile.read()
else:
    hanzi = hanzitf

# Command-line arguments are byte strings on Python 2 and text on Python 3;
# normalise to text exactly once at the boundary.
if isinstance(hanzi, bytes):
    hanzi = hanzi.decode("UTF-8")

# Strings are immutable, so the result has to be assigned to take effect.
# Only the whitespace trim is applied; lower-casing would alter Latin
# characters that are passed through or looked up verbatim.
hanzi = hanzi.strip()

if not os.path.exists(sqlitef):
    # Checked up front because sqlite3.connect() would silently create an
    # empty database file otherwise.
    print("sqlite database file not found")
    sys.exit()

# Bound parameters instead of string interpolation: quotes in the input can
# no longer break the statement, and a double-quoted value can no longer be
# interpreted by SQLite as an identifier.
sqlstr = 'SELECT pinyin FROM entries WHERE simplified=? OR traditional=?;'

slconn = sqlite3.connect(sqlitef)
hzwork = list(hanzi)
finres = ""
try:
    while hzwork:
        print("\nhzwork still %s" % hzwork)
        tmpstr = ""
        lastres = ""
        # hzwork shrinks inside the loop, so iterate over a fixed count
        # taken before the first pop.
        for _ in range(len(hzwork)):
            tmpstr += hzwork[0]
            print("checking for %s (%s)" % (tmpstr, repr(tmpstr)))
            res = slconn.execute(sqlstr, (tmpstr, tmpstr)).fetchall()
            print("res are: %s" % res)
            # now for the result work...
            if res:
                # Candidate is a known word: remember its pinyin, consume the
                # character and try to extend the word further.
                lastres = res[0][0]
                hzwork.pop(0)
                print("found and consumed %s" % res[0][0])
            elif not lastres:
                # Not even the first character is known: copy it through raw.
                lastres = '%s' % tmpstr
                hzwork.pop(0)
                print("%s not found, adding raw and breaking" % tmpstr)
                break
            else:
                # The extended candidate is unknown: keep the last match and
                # leave the current character to start the next word.
                print("got empty result, breaking")
                lastres += " "
                break
        finres += lastres
        print("appending and deleting lastres %s, finres now: %s" % (lastres, finres))
finally:
    slconn.close()

print('\nfinished with:\n """\n%s\n"""\n' % finres)

@ -0,0 +1,119 @@
import sqlite3
import sys
import os
import re
import copy
"""
Library for PinYin-izing either a file or a block of text
"""
# Match priority for the `simplified` / `traditional` arguments of
# pinyinize(): do not use the column at all, try it first, or try it only
# when the primary column yields nothing.
NOT = 0
PRIMARY = 1
FALLBACK = 2

# Output modes for the `mode` argument of pinyinize().  These share small
# integer values with the priority constants above but form a separate group.
PINYIN = 0
PINYIN_TRANSL = 1
PINYIN_HANZI = 2
PINYIN_TRANSL_HANZI = 3
TRANS = 4
# Alias matching the TRANSL spelling used by the sibling mode names; TRANS is
# kept so existing callers continue to work.
TRANSL = TRANS
TRANSL_HANZI = 5
def pinyinize(hanzitf, sqlitef, mode=PINYIN, simplified=PRIMARY, traditional=FALLBACK, original=True):
    """Convert hanzi text to pinyin by greedy dictionary lookup.

    Starting at the current position, the candidate word is extended one
    character at a time for as long as the database knows it; the pinyin of
    the last successful match is appended to the result, followed by a space
    when the match was ended by an unknown extension.

    Parameters:
        hanzitf: the text to convert, either as a string (text, or UTF-8
            encoded bytes) or as an open file-like object.  A file-like
            object is rewound with seek(0) and read completely.
        sqlitef: path to an SQLite database containing an `entries` table
            with `simplified`, `traditional` and `pinyin` columns.
        mode: one of PINYIN, PINYIN_TRANSL, PINYIN_HANZI,
            PINYIN_TRANSL_HANZI, TRANS, TRANSL_HANZI.  Accepted for forward
            compatibility; the current implementation does not consult it
            and always produces plain pinyin.
        simplified, traditional: NOT, PRIMARY or FALLBACK.  The PRIMARY
            column is queried first; if it has no match and the other column
            is marked FALLBACK, that one is queried.  When both are PRIMARY
            the simplified column wins and no fallback takes place.  If
            neither is PRIMARY no lookup is performed at all.
        original: when True, a character that cannot be looked up is copied
            to the output unchanged; when False it is dropped.

    Returns:
        The converted text, or None if `hanzitf` is neither text nor
        file-like or if `sqlitef` does not exist (a message is printed in
        both cases).

    Raises:
        UnicodeDecodeError: if byte input is not valid UTF-8.
    """
    # Duck-type the input instead of comparing type() against a string,
    # which can never be equal.
    if hasattr(hanzitf, "read"):
        hanzitf.seek(0)
        hanzi = hanzitf.read()
    elif isinstance(hanzitf, (bytes, type(u""))):
        hanzi = hanzitf
    else:
        print("hanzitf was neither text nor file")
        return None

    # Decode once at the boundary; everything below works on text.
    if isinstance(hanzi, bytes):
        hanzi = hanzi.decode("UTF-8")

    # Strings are immutable, so the result has to be assigned to take effect.
    # Only the whitespace trim is applied; lower-casing would alter Latin
    # characters that are passed through or looked up verbatim.
    hanzi = hanzi.strip()

    if not os.path.exists(sqlitef):
        # Checked up front because sqlite3.connect() would silently create
        # an empty database file otherwise.
        print("sqlite database file not found")
        return None

    # Bound parameters instead of string interpolation: quotes in the input
    # cannot break the statement or be interpreted as SQL.
    sqlstr_s = 'SELECT pinyin FROM entries WHERE simplified=?;'
    sqlstr_t = 'SELECT pinyin FROM entries WHERE traditional=?;'

    slconn = sqlite3.connect(sqlitef)
    hzwork = list(hanzi)
    finres = ""
    try:
        while hzwork:
            print("\nhzwork still %s" % hzwork)
            tmpstr = ""
            lastres = ""
            # hzwork shrinks inside the loop, so iterate over a fixed count
            # taken before the first pop.
            for _ in range(len(hzwork)):
                tmpstr += hzwork[0]
                print("checking for %s (%s)" % (tmpstr, repr(tmpstr)))

                # Query the primary column first and touch the fallback
                # column only when the primary one came back empty.
                res = None
                if simplified == PRIMARY:
                    res = slconn.execute(sqlstr_s, (tmpstr,)).fetchall()
                    if not res and traditional == FALLBACK:
                        res = slconn.execute(sqlstr_t, (tmpstr,)).fetchall()
                elif traditional == PRIMARY:
                    res = slconn.execute(sqlstr_t, (tmpstr,)).fetchall()
                    if not res and simplified == FALLBACK:
                        res = slconn.execute(sqlstr_s, (tmpstr,)).fetchall()
                print("res are: %s" % res)

                # now for the result work...
                if res:
                    # Candidate is a known word: remember its pinyin, consume
                    # the character and try to extend the word further.
                    lastres = res[0][0]
                    hzwork.pop(0)
                    print("found and consumed %s" % res[0][0])
                elif not lastres:
                    # Not even the first character is known.  It is always
                    # consumed so the outer loop makes progress; it is copied
                    # to the output only when `original` is set.
                    if original:
                        lastres = '%s' % tmpstr
                    hzwork.pop(0)
                    print("%s not found, adding raw and breaking" % tmpstr)
                    break
                else:
                    # The extended candidate is unknown: keep the last match
                    # and leave the current character to start the next word.
                    print("got empty result, breaking")
                    lastres += " "
                    break
            finres += lastres
            print("appending and deleting lastres %s, finres now: %s" % (lastres, finres))
    finally:
        slconn.close()

    print('\nfinished with:\n """\n%s\n"""\n' % finres)
    return finres
Loading…
Cancel
Save