"""Library for pinyin-izing either a file or a block of Chinese text."""

import copy
import logging
import os
import re
import sqlite3
import sys
from contextlib import closing

_LOG = logging.getLogger(__name__)

# Usage levels for the ``simplified`` / ``traditional`` parameters.
NOT = 0
PRIMARY = 1
FALLBACK = 2

# Output modes for the ``mode`` parameter.
PINYIN = 0
PINYIN_TRANSL = 1
PINYIN_HANZI = 2
PINYIN_TRANSL_HANZI = 3
TRANS = 4
TRANSL = TRANS  # alias matching the name used in the pinyinize docstring
TRANSL_HANZI = 5

# One fixed, parameterized statement per searchable column; the column name
# never comes from user input, only the looked-up text does (bound via ``?``).
_QUERIES = {
    "simplified": "SELECT pinyin FROM entries WHERE simplified = ?",
    "traditional": "SELECT pinyin FROM entries WHERE traditional = ?",
}


def _read_text(hanzitf):
    """Return the input as decoded text, or None if it is not text/file-like."""
    if hasattr(hanzitf, "read"):
        hanzitf.seek(0)
        raw = hanzitf.read()
    elif isinstance(hanzitf, (bytes, type(u""))):
        raw = hanzitf
    else:
        return None
    if isinstance(raw, bytes):
        # Byte input (Python 2 ``str``, Python 3 ``bytes``) is taken as UTF-8.
        raw = raw.decode("UTF-8")
    return raw


def _lookup_order(simplified, traditional):
    """Return the column names to query, primary first, then the fallback."""
    if simplified == PRIMARY:
        if traditional == FALLBACK:
            return ("simplified", "traditional")
        return ("simplified",)
    if traditional == PRIMARY:
        if simplified == FALLBACK:
            return ("traditional", "simplified")
        return ("traditional",)
    return ()


def _lookup(conn, columns, text):
    """Return the pinyin of the first row matching text, or None if no row."""
    for column in columns:
        row = conn.execute(_QUERIES[column], (text,)).fetchone()
        if row is not None:
            # A NULL pinyin still counts as a match; treat it as empty text.
            return row[0] or ""
    return None


def pinyinize(hanzitf, sqlitef, mode=PINYIN, simplified=PRIMARY,
              traditional=FALLBACK, original=True):
    """Pinyin-ize a string or a file-like object.

    The input is stripped and lower-cased, then consumed from left to right.
    At each position the candidate is grown one character at a time for as
    long as it is still found in the ``entries`` table of the dictionary
    database; the pinyin of the longest candidate found that way is emitted.

    Parameters:
        hanzitf: Text, UTF-8 encoded bytes, or an object with ``seek`` and
            ``read`` (it is rewound to the start before reading).
        sqlitef: Path of the sqlite database file. It must contain a table
            ``entries`` with the columns ``simplified``, ``traditional``
            and ``pinyin``.
        mode: One of PINYIN, PINYIN_TRANSL, PINYIN_HANZI,
            PINYIN_TRANSL_HANZI, TRANSL (also available as TRANS) or
            TRANSL_HANZI. NOTE: accepted for API compatibility only; it is
            not acted on yet and the result always contains pinyin only.
        simplified: NOT, PRIMARY or FALLBACK - how the ``simplified`` column
            takes part in the lookup.
        traditional: NOT, PRIMARY or FALLBACK - likewise for the
            ``traditional`` column.
        original: If true, a character that cannot be matched at all is
            copied to the output unchanged; otherwise it is dropped.

    The PRIMARY column is queried first. Only if it has no match and the
    other column is marked FALLBACK is that one queried as well. When both
    are PRIMARY, ``simplified`` wins and ``traditional`` is not consulted.
    If neither is PRIMARY no lookup happens at all. For a mixed text with
    the defaults this gives a best effort to read everything as simplified,
    then as traditional, and finally to copy the raw input.

    Returns:
        The pinyin-ized text. Every pinyin segment is followed by a single
        space unless it ends the input; characters copied raw get no
        separator. Returns None (after logging an error) if ``hanzitf`` is
        neither text nor file-like, or if ``sqlitef`` does not exist.
    """
    hanzi = _read_text(hanzitf)
    if hanzi is None:
        _LOG.error("hanzitf was neither text nor file")
        return None
    hanzi = hanzi.strip().lower()

    if not os.path.exists(sqlitef):
        _LOG.error("sqlite database file not found")
        return None

    columns = _lookup_order(simplified, traditional)
    pieces = []
    pos = 0
    total = len(hanzi)

    with closing(sqlite3.connect(sqlitef)) as conn:
        while pos < total:
            end = pos
            pinyin = None
            # Extend the candidate while the dictionary still knows it.
            while end < total:
                found = _lookup(conn, columns, hanzi[pos:end + 1])
                if found is None:
                    break
                pinyin = found
                end += 1

            if pinyin is None:
                # Not even the single character is known: pass it through
                # (or drop it) and move on by exactly one character.
                _LOG.debug("%r not found", hanzi[pos])
                if original:
                    pieces.append(hanzi[pos])
                pos += 1
                continue

            _LOG.debug("found and consumed %r", hanzi[pos:end])
            pieces.append(pinyin)
            if end < total:
                # The match was cut short by an unknown continuation, so
                # separate it from whatever follows.
                pieces.append(" ")
            pos = end

    return "".join(pieces)