parent
142aa65c21
commit
b36d6bd288
@ -1,66 +0,0 @@
|
||||
import sqlite3
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import copy
|
||||
|
||||
# One statement serves both scripts; the values are bound as parameters so
# quotes in the input can neither break nor alter the query.
_LOOKUP_SQL = 'SELECT pinyin FROM entries WHERE simplified=? OR traditional=?;'


def _transcribe(hanzi, slconn):
    """Return the pinyin transcription of the text *hanzi*.

    Works greedily from the left: the candidate string grows one character
    at a time for as long as the ``entries`` table has a row for it, and the
    pinyin of the last candidate that matched is emitted. A character that
    does not even match on its own is copied to the output unchanged.
    Progress messages are printed while working.

    hanzi  -- text (already decoded) to transcribe
    slconn -- open sqlite3 connection to the dictionary database
    """
    hzwork = list(hanzi)
    finres = ""
    while hzwork:
        print("\nhzwork still %s" % hzwork)
        tmpstr = ""
        lastres = ""

        # At most one round per remaining character; every round either
        # consumes the leading character or leaves the loop.
        for _ in range(len(hzwork)):
            tmpstr += hzwork[0]

            print("checking for %s (%s)" % (tmpstr, repr(tmpstr)))
            res = slconn.execute(_LOOKUP_SQL, (tmpstr, tmpstr)).fetchall()
            print("res are: %s" % res)

            if res:
                # Candidate is known: remember its reading and try to extend.
                lastres = res[0][0]
                hzwork.pop(0)
                print("found and consumed %s" % res[0][0])
            elif not lastres:
                # Not even the first character is known: pass it through raw.
                lastres = '%s' % tmpstr
                hzwork.pop(0)
                print("%s not found, adding raw and breaking" % tmpstr)
                break
            else:
                # The extension failed: keep the previous match and leave the
                # unmatched character for the next round of the outer loop.
                print("got empty result, breaking")
                lastres += " "
                break

        finres += lastres
        print("appending and deleting lastres %s, finres now: %s" % (lastres, finres))
    return finres


def main(argv):
    """Command-line entry point: transcribe hanzi text or a file to pinyin.

    argv -- full argument vector: program name, then either literal hanzi
            text or the path of a UTF-8 text file, then the path of the
            sqlite dictionary file.

    Prints the result and also returns it. Terminates via ``sys.exit()``
    when the argument count is wrong or the database file does not exist.
    """
    if len(argv) != 3:
        print("Usage: cedict2sqlite [hanzi-text or file] [sqlite-file.sqlite]")
        # The original bare name ``exit`` was never called, so the script
        # carried on with missing or surplus arguments.
        sys.exit()

    hanzitf = argv[1]
    sqlitef = argv[2]

    if os.path.exists(hanzitf):
        # Read raw bytes and decode explicitly below so the result does not
        # depend on the platform's default encoding.
        with open(hanzitf, "rb") as source:
            hanzi = source.read()
    else:
        hanzi = hanzitf

    if not os.path.exists(sqlitef):
        print("sqlite database file not found")
        sys.exit()

    if isinstance(hanzi, bytes):
        hanzi = hanzi.decode("UTF-8")
    # The original computed strip().lower() and threw the result away. Only
    # the strip is applied: lower-casing could stop Latin letters in the
    # input from matching dictionary entries, since the lookup uses SQL '='
    # (assumes the default case-sensitive collation -- verify against the
    # schema).
    hanzi = hanzi.strip()

    slconn = sqlite3.connect(sqlitef)
    try:
        finres = _transcribe(hanzi, slconn)
    finally:
        slconn.close()

    print('\nfinished with:\n """\n%s\n"""\n' % finres)
    return finres


if __name__ == "__main__":
    main(sys.argv)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,119 @@
|
||||
import sqlite3
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import copy
|
||||
|
||||
"""
|
||||
Library for PinYin-izing either a file or a block of text
|
||||
"""
|
||||
|
||||
# Role a script (simplified / traditional) plays in the dictionary lookup.
NOT = 0
PRIMARY = 1
FALLBACK = 2

# Values accepted for the ``mode`` parameter of pinyinize().
PINYIN = 0
PINYIN_TRANSL = 1
PINYIN_HANZI = 2
PINYIN_TRANSL_HANZI = 3
TRANS = 4
TRANSL = TRANS  # alias: the documentation and sibling names spell it TRANSL
TRANSL_HANZI = 5

# Fixed statements, one per script column; the looked-up text is always
# bound as a parameter, never formatted into the SQL.
_PINYIN_QUERIES = {
    "simplified": 'SELECT pinyin FROM entries WHERE simplified=?;',
    "traditional": 'SELECT pinyin FROM entries WHERE traditional=?;',
}


def _lookup_pinyin(slconn, text, simplified, traditional):
    """Fetch the pinyin rows for *text*, honouring PRIMARY / FALLBACK roles.

    The PRIMARY script is queried first (simplified wins if both are
    PRIMARY); the other script is queried only when that yields nothing and
    it is marked FALLBACK. Returns the list of rows from ``fetchall()``, or
    None when neither script is PRIMARY.
    """
    if simplified == PRIMARY:
        first, second, second_role = "simplified", "traditional", traditional
    elif traditional == PRIMARY:
        first, second, second_role = "traditional", "simplified", simplified
    else:
        return None

    res = slconn.execute(_PINYIN_QUERIES[first], (text,)).fetchall()
    if not res and second_role == FALLBACK:
        res = slconn.execute(_PINYIN_QUERIES[second], (text,)).fetchall()
    return res


def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=FALLBACK, original = True):
    """
    Pinyin-ize a string or an open file given as "hanzitf".

    hanzitf     -- text (byte strings are decoded as UTF-8) or a file-like
                   object; a file is rewound with seek(0) and read fully.
    sqlitef     -- path of the sqlite dictionary; it must contain a table
                   "entries" with the columns simplified, traditional and
                   pinyin.
    mode        -- one of PINYIN, PINYIN_TRANSL, PINYIN_HANZI,
                   PINYIN_TRANSL_HANZI, TRANSL (a.k.a. TRANS) or
                   TRANSL_HANZI. NOTE: the parameter is accepted but not
                   evaluated yet; the output is always plain pinyin.
    simplified,
    traditional -- NOT, PRIMARY or FALLBACK. The PRIMARY script is looked
                   up first; if it has no match, the script marked FALLBACK
                   is tried. For a text mixing both scripts the defaults
                   therefore make a best effort to read everything as
                   simplified and only then as traditional. If neither
                   script is PRIMARY, nothing is looked up.
    original    -- if True, a character without any match is copied to the
                   output unchanged; if False it is dropped.

    Matching is greedy from the left: the candidate grows one character at
    a time for as long as the dictionary knows it, and the pinyin of the
    last known candidate is emitted. Progress messages are printed.

    Returns the resulting text, or None if hanzitf is neither text nor a
    file or if the database file does not exist.
    """

    # The original compared type() against the strings "file" / "str",
    # which is never true; test for the capabilities actually needed.
    if hasattr(hanzitf, "read"):
        hanzitf.seek(0)
        hanzi = hanzitf.read()
    elif isinstance(hanzitf, (bytes, type(u""))):
        hanzi = hanzitf
    else:
        print("hanzitf was neither text nor file")
        return None

    if not os.path.exists(sqlitef):
        print("sqlite database file not found")
        return None

    if isinstance(hanzi, bytes):
        hanzi = hanzi.decode("UTF-8")
    # The original computed strip().lower() and discarded the result. Only
    # the strip is applied: lower-casing could stop Latin letters in the
    # input from matching dictionary entries, since the lookup uses SQL '='
    # (assumes the default case-sensitive collation -- verify against the
    # schema).
    hanzi = hanzi.strip()

    slconn = sqlite3.connect(sqlitef)
    try:
        hzwork = list(hanzi)
        finres = ""
        while hzwork:
            print("\nhzwork still %s" % hzwork)
            tmpstr = ""
            lastres = ""

            # At most one round per remaining character; every round either
            # consumes the leading character or leaves the loop.
            for _ in range(len(hzwork)):
                tmpstr += hzwork[0]

                print("checking for %s (%s)" % (tmpstr, repr(tmpstr)))
                res = _lookup_pinyin(slconn, tmpstr, simplified, traditional)
                print("res are: %s" % res)

                # now for the result work...
                if res:
                    # Candidate is known: remember its reading, try to extend.
                    lastres = res[0][0]
                    hzwork.pop(0)
                    print("found and consumed %s" % res[0][0])
                elif not lastres:
                    # Not even the first character is known.
                    if original:
                        lastres = '%s' % tmpstr
                    hzwork.pop(0)
                    print("%s not found, adding raw and breaking" % tmpstr)
                    break
                else:
                    # Extension failed: keep the previous match and leave the
                    # unmatched character for the next outer round.
                    print("got empty result, breaking")
                    lastres += " "
                    break

            finres += lastres
            print("appending and deleting lastres %s, finres now: %s" % (lastres, finres))
    finally:
        slconn.close()

    print('\nfinished with:\n """\n%s\n"""\n' % finres)
    return finres
|
||||
Loading…
Reference in new issue