import sqlite3
import sys
import os
import re
import copy
from collections import OrderedDict
from json import JSONEncoder

"""
Library for pinyin-izing either a file or a block of text.
Python 2.7 or newer is required (OrderedDict).
"""

# Role a script (simplified / traditional) plays during the dictionary lookup.
NOT = 0
PRIMARY = 1
FALLBACK = 2

# Output-mode constants. They are not used by pinyinize() itself; presumably
# they are consumed by callers of this module -- TODO confirm.
PINYIN = 0
PINYIN_TRANSL = 1
PINYIN_HANZI = 2
PINYIN_TRANSL_HANZI = 3
TRANS = 4
TRANSL_HANZI = 5


def _read_input(hanzitf):
    """Return the input as text, or None if it is neither text nor file-like."""
    if hasattr(hanzitf, "read"):
        # File-like object: always read it from the beginning.
        hanzitf.seek(0)
        data = hanzitf.read()
    else:
        data = hanzitf
    if isinstance(data, bytes):
        # Byte strings (the plain ``str`` of Python 2) are taken to be UTF-8.
        return data.decode("UTF-8")
    if isinstance(data, type(u"")):
        return data
    return None


def _lookup(conn, column, text):
    """Return every (pinyin, dict) row of ``entries`` whose *column* equals *text*."""
    # *column* is one of two fixed identifiers picked by _find_entries, never
    # user input; the searched text itself is passed as a bound parameter.
    query = 'SELECT pinyin, dict FROM entries WHERE %s=?;' % column
    return conn.execute(query, (text,)).fetchall()


def _find_entries(conn, text, simplified, traditional):
    """Look *text* up in the PRIMARY script column, then in the FALLBACK one."""
    if simplified == PRIMARY:
        primary = "simplified"
        fallback = "traditional" if traditional == FALLBACK else None
    elif traditional == PRIMARY:
        primary = "traditional"
        fallback = "simplified" if simplified == FALLBACK else None
    else:
        # Without a PRIMARY script nothing is ever considered a match.
        return None
    rows = _lookup(conn, primary, text)
    if not rows and fallback is not None:
        rows = _lookup(conn, fallback, text)
    return rows


def pinyinize(hanzitf, sqlitef, simplified=PRIMARY, traditional=FALLBACK, original = True, json = False):
    """
    Pinyin-ize either a string or an open file given as "hanzitf".

    The text is consumed from left to right.  At each position the
    candidate sequence is grown one character at a time for as long as
    the database still has an entry for it; the longest sequence reached
    this way becomes one key of the result.

    Parameters:
        hanzitf     -- text (unicode or UTF-8 encoded bytes) or a file-like
                       object; a file-like object is rewound with seek(0)
                       and read completely.
        sqlitef     -- path of the sqlite database.  It must contain a table
                       "entries" with the columns "simplified",
                       "traditional", "pinyin" and "dict".
        simplified,
        traditional -- NOT, PRIMARY or FALLBACK.  The PRIMARY script is
                       looked up first; if it yields nothing and the other
                       script is set to FALLBACK, that one is tried.  If
                       both are PRIMARY, simplified wins and there is no
                       fallback.
        original    -- if true, a character without any dictionary entry is
                       copied to the output together with an explanatory
                       note; if false, it is mapped to an empty string.
        json        -- if true, return the result encoded as a JSON string
                       instead of an OrderedDict.

    Returns:
        None if "hanzitf" is neither text nor file-like, or if "sqlitef"
        does not exist.  Otherwise an OrderedDict (or its JSON encoding)
        whose keys are the recognized input sequences, in order of first
        appearance, and whose values are lists of lists:

            {'input_str[n:m]': [
                 [ 'pinyin', 'translation' ],
                 [ ' ', '' ]    # only present when the sequence ended
                                # because a longer one was not found
             ],
             ...
            }

        Only the first database row of a match is used.  Because the
        result is a mapping, a sequence that occurs several times in the
        input appears once; later occurrences overwrite its value.
    """
    text = _read_input(hanzitf)
    if text is None:
        return None
    if not os.path.exists(sqlitef):
        return None

    finres = OrderedDict()
    slconn = sqlite3.connect(sqlitef)
    try:
        pos = 0
        total = len(text)
        while pos < total:
            end = pos              # text[pos:end] is the longest match so far
            entry = None
            extension_failed = False
            while end < total:
                rows = _find_entries(slconn, text[pos:end + 1],
                                     simplified, traditional)
                if not rows:
                    extension_failed = True
                    break
                entry = [list(rows[0])]
                end += 1

            if entry is None:
                # Not even the first character is known: emit it raw.
                char = text[pos]
                if original:
                    finres[char] = [[char, 'no translation / string found, using input']]
                else:
                    finres[char] = ""
                pos += 1
            else:
                if extension_failed:
                    # Separator so consecutive sequences print nicely.
                    entry.append([" ", ""])
                # Key is the matched sequence only, without the character
                # that could not be appended to it.
                finres[text[pos:end]] = entry
                pos = end
    finally:
        slconn.close()

    if not json:
        return finres
    return JSONEncoder().encode(finres)