"""Library for pinyin-izing either a file or a block of Chinese text.

The pinyin readings are looked up in an SQLite database that has an
``entries`` table with the columns ``simplified``, ``traditional`` and
``pinyin``.

The code is written to run unchanged on Python 2 and Python 3.
"""

import copy
import logging
import os
import re
import sqlite3
import sys
from contextlib import closing

_log = logging.getLogger(__name__)

# Roles a script variant (simplified / traditional) can play in a lookup.
NOT = 0
PRIMARY = 1
FALLBACK = 2

# Output modes accepted by pinyinize().
PINYIN = 0
PINYIN_TRANSL = 1
PINYIN_HANZI = 2
PINYIN_TRANSL_HANZI = 3
TRANS = 4
TRANSL = TRANS  # alias: the pinyinize() documentation calls this mode TRANSL
TRANSL_HANZI = 5

# One fixed, parameterized statement per searchable column.  Column names
# cannot be bound as SQL parameters, so they are never built from input.
_QUERIES = {
    "simplified": "SELECT pinyin FROM entries WHERE simplified = ?;",
    "traditional": "SELECT pinyin FROM entries WHERE traditional = ?;",
}

_TEXT_TYPES = (bytes, type(u""))


def _read_hanzi(hanzitf):
    """Return the input as normalized unicode text, or None if unsupported.

    File-like objects are rewound and read completely; byte strings are
    decoded as UTF-8.  The text is stripped and lower-cased.
    """
    if hasattr(hanzitf, "read"):
        hanzitf.seek(0)
        text = hanzitf.read()
    elif isinstance(hanzitf, _TEXT_TYPES):
        text = hanzitf
    else:
        return None

    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return text.strip().lower()


def _lookup_columns(simplified, traditional):
    """Return the column names to query, in priority order."""
    if simplified == PRIMARY:
        columns = ["simplified"]
        if traditional == FALLBACK:
            columns.append("traditional")
    elif traditional == PRIMARY:
        columns = ["traditional"]
        if simplified == FALLBACK:
            columns.append("simplified")
    else:
        # Without a PRIMARY variant nothing is looked up at all.
        columns = []
    return columns


def _lookup(conn, columns, word):
    """Return the first matching row for word, or None if there is none."""
    for column in columns:
        rows = conn.execute(_QUERIES[column], (word,)).fetchall()
        if rows:
            return rows[0]
    return None


def _pinyinize_text(conn, hanzi, columns, original):
    """Convert hanzi to pinyin by repeatedly extending a matching prefix."""
    chars = list(hanzi)
    total = len(chars)
    pieces = []
    pos = 0

    while pos < total:
        candidate = u""
        pinyin = u""
        end = pos

        # Grow the candidate one character at a time for as long as the
        # dictionary knows it.  A longer word is therefore only found when
        # every shorter prefix of it is a dictionary entry as well.
        while end < total:
            candidate += chars[end]
            _log.debug("checking for %r", candidate)
            row = _lookup(conn, columns, candidate)
            if row is None:
                break
            pinyin = row[0] or u""
            end += 1

        if end == pos:
            # Not even the first character is known: optionally pass it
            # through unchanged, and always consume it to guarantee progress.
            if original:
                pieces.append(chars[pos])
            pos += 1
            continue

        pieces.append(pinyin)
        if end < total:
            # The match stopped before the end of the text, so separate it
            # from whatever follows.
            pieces.append(u" ")
        pos = end

    return u"".join(pieces)


def pinyinize(hanzitf, sqlitef, mode = PINYIN, simplified=PRIMARY, traditional=FALLBACK, original = True):
    """Pinyin-ize a string or an open file.

    Parameters:
        hanzitf: the text to convert.  Either a (unicode or UTF-8 encoded
            byte) string, or a file-like object, which is rewound and read
            completely.  Surrounding whitespace is stripped and the text is
            lower-cased before conversion.
        sqlitef: path to the SQLite dictionary file.  It must contain an
            ``entries`` table with ``simplified``, ``traditional`` and
            ``pinyin`` columns.
        mode: one of PINYIN, PINYIN_TRANSL, PINYIN_HANZI,
            PINYIN_TRANSL_HANZI, TRANS (alias TRANSL) or TRANSL_HANZI.  The
            value is accepted but currently has no effect: plain pinyin is
            produced for every mode.
        simplified, traditional: role of each script variant, one of NOT,
            PRIMARY or FALLBACK.  The PRIMARY variant is looked up first
            (simplified wins if both are PRIMARY); the other variant is
            consulted only if it is marked FALLBACK and the primary lookup
            found nothing.  If neither variant is PRIMARY, no lookup is
            done.
        original: if true, characters for which no pinyin is found are
            copied to the output unchanged; if false they are dropped.

    Returns:
        The converted text, or None if hanzitf is neither text nor a
        file-like object or if sqlitef does not exist.  A matched word is
        followed by a single space unless it ends the text.
    """
    hanzi = _read_hanzi(hanzitf)
    if hanzi is None:
        _log.error("hanzitf was neither text nor file")
        return None

    # sqlite3.connect() would silently create an empty database here.
    if not os.path.exists(sqlitef):
        _log.error("sqlite database file not found")
        return None

    columns = _lookup_columns(simplified, traditional)
    with closing(sqlite3.connect(sqlitef)) as conn:
        finres = _pinyinize_text(conn, hanzi, columns, original)

    _log.debug("finished with: %r", finres)
    return finres