You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

128 lines
3.9 KiB

import sqlite3
import sys
import os
import re
import copy
14 years ago
import pprint
from collections import OrderedDict
"""
Library for pinyin-izing either a file or a block of text. Sadly, Python >= 2.7 is required...
"""
# Priority values accepted by the `simplified` / `traditional` arguments of
# pinyinize(): NOT skips the lookup for that script variant, PRIMARY is tried
# first, FALLBACK is used only when the PRIMARY lookup returns nothing.
NOT, PRIMARY, FALLBACK = range(3)

# Values for the `mode` argument of pinyinize().
# NOTE(review): judging by the names these select which of pinyin /
# translation / hanzi end up in the output -- confirm against the consumers.
(
    PINYIN,
    PINYIN_TRANSL,
    PINYIN_HANZI,
    PINYIN_TRANSL_HANZI,
    TRANS,
    TRANSL_HANZI,
) = range(6)
def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=FALLBACK, original = True):
    """
    Look up the pinyin for a text, chunk by chunk, in a sqlite dictionary.

    The text is scanned left to right.  Starting at the current position
    the candidate string is extended one character at a time for as long as
    the candidate itself is found in the ``entries`` table; the first row
    of the last successful lookup is the result for that chunk.

    Parameters:
      hanzitf     -- the text to process: a byte string (decoded as UTF-8),
                     a unicode string, or an object with ``seek``/``read``
                     (it is rewound and read completely).
      sqlitef     -- path of the sqlite database.  It must contain a table
                     ``entries`` with the columns ``simplified``,
                     ``traditional``, ``pinyin`` and ``dict``.
      mode        -- accepted for interface compatibility; the lookup below
                     does not consult it.
      simplified  -- NOT, PRIMARY or FALLBACK: how the ``simplified`` column
                     takes part in the lookup.
      traditional -- NOT, PRIMARY or FALLBACK: same for ``traditional``.
                     The PRIMARY column is tried first; the FALLBACK column
                     is used only when PRIMARY yields no row.  If neither
                     argument is PRIMARY nothing is ever matched.
      original    -- when true, a character that cannot be matched at all
                     is copied to the result together with a marker text;
                     otherwise its value is the empty string.

    Returns:
      None if the input type is unsupported or the database file does not
      exist.  Otherwise an OrderedDict, in order of first appearance, that
      maps each consumed chunk of text to a list of 2-tuples: the first
      matching ``(pinyin, dict)`` row, followed by ``(" ", "")`` when the
      chunk ended because a longer candidate was not found.  An unmatched
      character maps to ``[(char, 'no translation / string found, using
      input')]`` or, with ``original`` false, to ``""``.

    NOTE: the result is keyed by chunk text, so a chunk that occurs more
    than once in the input keeps only the value of its last occurrence.

    Progress information is printed to stdout.
    """
    # Accept anything file-like as well as byte/unicode strings; the exact
    # type checks against `file` and `str` rejected unicode input.
    if hasattr(hanzitf, "read"):
        hanzitf.seek(0)
        hanzi = hanzitf.read()
    elif isinstance(hanzitf, (bytes, type(u""))):
        hanzi = hanzitf
    else:
        print("hanzitf was neither text nor file, was: %s" % type(hanzitf))
        return None
    if isinstance(hanzi, bytes):
        hanzi = hanzi.decode("UTF-8")
    # NOTE(review): the original called hanzi.strip().lower() here and threw
    # the result away, so the text was never normalised.  That no-op is
    # dropped instead of "fixed": lowercasing could stop entries containing
    # capital Latin letters from matching -- confirm the intended behaviour.

    if not os.path.exists(sqlitef):
        print("sqlite database file not found")
        return None

    # Bound parameters instead of string formatting: no SQL injection, and
    # the text can never be mistaken for a column name (SQLite may treat a
    # double-quoted token as an identifier).
    query_s = 'SELECT pinyin, dict FROM entries WHERE simplified=?;'
    query_t = 'SELECT pinyin, dict FROM entries WHERE traditional=?;'

    hzwork = list(hanzi)
    finres = OrderedDict()
    pos = 0  # index of the first character not consumed yet

    slconn = sqlite3.connect(sqlitef)
    try:
        while pos < len(hzwork):
            print("\nhzwork still %s" % (hzwork[pos:],))
            tmpstr = u""    # candidate currently being looked up
            consumed = u""  # part of the candidate that was really consumed
            lastres = ""
            while pos < len(hzwork):
                tmpstr += hzwork[pos]
                print("checking for %s (%s)" % (tmpstr, repr(tmpstr)))

                res_s = None
                res_t = None
                res = None
                if simplified != NOT:
                    res_s = slconn.execute(query_s, (tmpstr,)).fetchall()
                if traditional != NOT:
                    res_t = slconn.execute(query_t, (tmpstr,)).fetchall()

                if simplified == PRIMARY:
                    res = res_s
                    if not res_s and traditional == FALLBACK:
                        res = res_t
                elif traditional == PRIMARY:
                    res = res_t
                    if not res_t and simplified == FALLBACK:
                        res = res_s
                print("res are: %s" % (res,))

                if res:
                    # Candidate is known: remember it and try a longer one.
                    print(res[0])
                    lastres = [res[0]]
                    consumed = tmpstr
                    pos += 1
                    print("found and consumed %s" % res[0][0])
                elif not lastres:
                    # Not even the first character is known: skip it,
                    # optionally copying the raw input to the result.
                    if original:
                        lastres = [(tmpstr, 'no translation / string found, using input')]
                    consumed = tmpstr
                    pos += 1
                    print("%s not found, adding raw and breaking" % tmpstr)
                    break
                else:
                    # The longer candidate is unknown: keep the previous
                    # match.  The character just added is NOT consumed and
                    # starts the next chunk.
                    print("got empty result, breaking")
                    lastres.append((" ", ""))
                    break

            # Key by the consumed text; `tmpstr` may carry one extra
            # look-ahead character that still belongs to the next chunk.
            finres[consumed] = lastres
            print("appending and deleting lastres %s, finres now: %s" % (lastres, finres))
    finally:
        slconn.close()

    pprint.pprint(finres)
    print("\n\n\n")
    for k, v in finres.items():
        print(k)
        for i in v:
            print(i)
            print(i[0])
            print(i[1])
        print("")
    return finres