#!/usr/bin/python
# Last updated: 13-09-2006:
#   added the unicode-bengali alphabet list
#   added --ancient option
#   fixed the problem of unit specification
# This code uses Xavier Defrang's Single-Pass Multiple Replace,
# Py Cookbook 81330 (=3.14 hardcopy) in an essential fashion.
"""mkbangtex: create a BangTeX file from phonetic (Roman script) Bengali.

The module can be run as a script (see ``usage``) or imported; importing it
has no side effects.  The code runs unchanged on Python 2.7 and Python 3.
"""

import getopt
import re
import string  # unused here; kept from the original import list
import sys
import time

try:
    from collections import UserDict
except ImportError:  # Python 2
    from UserDict import UserDict

version = "1.1"

usage = '''
This is mkbangtex Version %s.

It creates a BangTeX file from a Bengali document written
phonetically in Roman script. It works in two modes, which are
automatically detected:

  bengali: converts anything but single-word latex commands
           not enclosing within a pair of '@fn's
  foreign: converts only the portions enclosed within a pair of '@bn's

CAUTION: Only ONE mode can be used in a file

A piece of text enclosed within a pair of '@b2e's is converted from
Bengali to accented Roman script following the IAST (1912) rules.
Works in both modes.

[See accompanying documentation for further details]
---
Usage: mkbangtex [-h, --alphabet --transparent, --ancient, -o FILE] INPUT

  -h, --help          : print this page
  --alphabet          : transliteration table
  --transparent       : transparent symbols and default positioning
  --ancient           : ancient Bengali --- without antastha ya
  -o, --outfile FILE  : specify output file (default: INPUT.tex)
'''

bengali_alphabet = u'''
BENGALI ALPHABET

a( \u0985 ) aa( \u0986 ) i( \u0987 ) ii( \u0988 ) u( \u0989 ) uu( \u098A )
Ri( \u098B ) \u098C (not used) e( \u098F ) oi( \u0990 ) o( \u0993 ) ou( \u0994 )
------------------------------------------------
ka( \u0995 ) kha( \u0996 ) ga( \u0997 ) gha( \u0998 ) NG( \u0999 )
ca/cha( \u099A ) chha( \u099B ) ja( \u099C ) jha( \u099D ) NJ( \u099E )
Ta( \u099F ) Tha( \u09A0 ) Da( \u09A1 ) Dha( \u09A2 ) Na( \u09A3 )
ta( \u09A4 ) tha( \u09A5 ) da( \u09A6 ) dha( \u09A7 ) na( \u09A8 )
pa( \u09AA ) pha( \u09AB ) ba( \u09AC ) bha( \u09AD ) ma( \u09AE )
Ja( \u09AF ) ra( \u09B0 ) la( \u09B2 ) wa( \u09AC )
sha( \u09B6 ) Sha/shha( \u09B7 ) sa( \u09B8 ) ha/Ha( \u09B9 )
rh( \u09DC ) rhh( \u09DD ) _m( \u0982 ) ~h( \u0983 ) ~n( \u0981 )
------------------------------------------------
khaNDa ta: _t( \u09CE ); hasanta: _h( \u09CD ); k+shh: x
antastha y: y( \u09DF )
'''

# ---------------------------------------------------------------------------
# Delimiters for both modes
# ---------------------------------------------------------------------------
fnmod_delim = re.compile(r'@bengali|@bnstart|@bnend|@bn')
bnmod_delim = re.compile(r"@fn")
bndelim = re.compile(r'@@@')                # internal marker the delimiters map to
sndelim = re.compile(r'@b2e')               # for transliteration
fndelim = re.compile(r"(\$+|@@@)", re.X)    # dollar and @@@ treated together
dollar = re.compile(r"(\$)", re.X)          # a single dollar, delimiter kept

# define non-latex words
nonltx_word = re.compile(
    r"\s-?(?![\d+])\w*-?\(?\~?[\`]*\w+\S*|{o}\S+|{i}\S+", re.X)

# ---------------------------------------------------------------------------
# Patterns to be substituted
# ---------------------------------------------------------------------------
ns_k = re.compile(r'n(?=k|g|x)')            # nasals before k-barga
ns_c = re.compile(r'n(?=c|j)')              # nasals before c-barga
tal_s = re.compile(r'sh(?=[bcdfghjklmnpqrstwxyz])')  # sh before consonants (b2e)
xX = re.compile(r"x", re.I)
unga = re.compile(r"NG")
ch = re.compile(r"ch")
jh = re.compile(r"jh")
Dh = re.compile(r"Dh")
sh = re.compile(r"sh")
ha = re.compile(r"~ha|~h")
Ri = re.compile(r"Ri")
jn = re.compile(r"jn")
onuswar = re.compile(r"_m")
rikar = re.compile(r"_r")
hasanta = re.compile(r"_h")
rh = re.compile(r"rh")
Rh = re.compile(r"Rh")
R = re.compile(r"R")
J = re.compile(r"J")
oi = re.compile(r"oi")
ou = re.compile(r"ou")
aii = re.compile(r"aii")
ai = re.compile(r"ai")
au = re.compile(r"au")
aa = re.compile(r"aa")
ao = re.compile(r"ao")
a = re.compile(r"a")
ae = re.compile(r"ae")
ei = re.compile(r"ei")
eo = re.compile(r"eo")
eu = re.compile(r"eu")
io = re.compile(r"io")
iu = re.compile(r"iu")
ui = re.compile(r"ui")
ii = re.compile(r"ii")
oioi = re.compile(r"oioi")
III = re.compile(r"III")
XzX = re.compile(r"XzX")
VxV = re.compile(r"VxV")
chandra = re.compile(r"~n")
ng_prob = re.compile(r"NG(?=M|r)")
ngm = re.compile(r"NG/M")
Ha = re.compile(r'~Ho|~Ha')
H = re.compile(r"H")
uu = re.compile(r"uu")
inya = re.compile(r"NJ")
T = re.compile(r'T')
D = re.compile(r'D')
N = re.compile(r'N')
Th = re.compile(r'Th')
shh = re.compile(r'shh')
vw_b = re.compile(r'\b[aeiou]+')            # vowels at word-beginning
p_i = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=i)')
p_e = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=e)')
# for YA-kar's bangtex symbol
p_YA = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=(YA))')
p_o = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=o)')
p_ou = re.compile(r'([\^\~]|[b-df-hj-np-tv-z])+(?=ou)', re.I)
p_oi = re.compile(r'([\^\~]|[b-df-hj-np-tv-z])+(?=oi)', re.I)
p_begin_Ha = re.compile(r'\bh', re.X)       # Ha at word-beginning
yukta_bnj = re.compile(
    r'[kxgcjTDNtdmnpblhHSsXZ](?=[kxgcCjTDNtTdnpbHSsXZ])')
# t before ka, kha, pa, pha, s, X is khanda t
p_tkp = re.compile(r'(\_*)t(?=k|x|p|f|s|X)')
# consonants under which u, uu and Ri-kar need be shifted
anomaly_u = re.compile(
    r'(k|kR|NG|c|C|T|D|Z|t|ph|f|phR|bh|X)(?=u|{rR})|(T(?=R))')
u_kern = re.compile(r'em}u(?!u)')
uu_kern = re.compile(r'em}uu')

# Literal "_t" (khanda ta marker), used by b2e.
_KHANDA_T = re.compile(r'_t')


# ---------------------------------------------------------------------------
# Functions from Defrang's code
# ---------------------------------------------------------------------------
def _alternation(keys):
    """Compile a regex matching any of *keys* literally.

    Keys are tried longest-first so that a key which is a prefix of another
    key can never pre-empt the longer one, whatever the dict ordering.
    """
    ordered = sorted(keys, key=len, reverse=True)
    return re.compile("(%s)" % "|".join(map(re.escape, ordered)))


def multiple_replace(dict, text):
    """Replace in *text* every occurrence of a key of *dict* by its value.

    The replacement is done in a single pass, so replaced text is never
    rescanned.  An empty mapping leaves *text* unchanged.  Returns the new
    string.  (The parameter is called ``dict`` for backward compatibility.)
    """
    if not dict:
        return text
    regex = _alternation(dict.keys())
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: dict[mo.group(0)], text)


class Xlator(UserDict):
    """An all-in-one multiple string substitution class.

    The instance is a mapping ``{search string: replacement}``; ``xlat``
    applies all substitutions in one pass and stores the number of
    substitutions made in ``self.count``.
    """

    def _make_regex(self):
        """Build a regular expression object from the current keys."""
        return _alternation(self.keys())

    def __call__(self, mo):
        """Handler invoked for each regex match: count it and translate it."""
        self.count += 1
        return self[mo.group(0)]

    def xlat(self, txt):
        """Translate *txt* and return the modified text."""
        self.count = 0  # reset substitution counter
        if not self:
            # An empty alternation would match the empty string everywhere.
            return txt
        return self._make_regex().sub(self, txt)


# ---------------------------------------------------------------------------
# BENGALI functions for -fala's and -kar's
#
# The functions taking ``self`` below are plain module-level callbacks for
# ``re.sub``; ``self`` is the match object (name kept for compatibility).
# ---------------------------------------------------------------------------
def eikar(self):
    """Wrap the matched consonant cluster as ``\\*...*`` (used for i/e-kar)."""
    return r'\*' + self.group() + '*'


def okar(self):
    """Wrap the matched cluster as ``\\*...*ea`` (o-kar)."""
    return r'\*' + self.group() + '*ea'


def oukar(self):
    """Wrap the matched cluster as ``\\*...*e`` (ou-kar)."""
    return r'\*' + self.group() + '*' + 'e'


def oikar(self):
    """Wrap the matched cluster as ``\\*...*oi`` (oi-kar)."""
    return r'\*' + self.group() + '*' + 'oi'


def fala(self, c_list):
    """Build a -fala table: ``x + self`` maps to ``x + self.upper()``.

    *self* is the fala letter (a string) and *c_list* the letters it may
    follow.
    """
    return {x + self: x + self.upper() for x in c_list}


def nasa(self, replmnt):
    """Build a nasal table: ``'n' + x`` maps to ``replmnt + '/' + x``.

    *self* is an iterable of the letters that follow the nasal.
    """
    return {'n' + x: replmnt + '/' + x for x in self}


def caps(self):
    """Return the matched text upper-cased and wrapped in braces."""
    return '{' + self.group().upper() + '}'


def yukta(self):
    """Append ``/`` to the matched consonant (conjunct marker)."""
    return self.group() + '/'


def ukar(self):
    """Append a negative kern so that u/uu is shifted under the consonant."""
    return self.group() + r'{\kern-.23em}'


def YAkar(self):
    """Wrap the matched cluster as ``\\*...*`` (YA-kar's bangtex symbol)."""
    return r'\*' + self.group() + '*'


def y_blank_ancient(self):
    """Return the matched word with every ``y`` replaced by ``{}``."""
    return self.group().replace("y", "{}")


# ---------------------------------------------------------------------------
# LISTS and DICTIONARIES
# ---------------------------------------------------------------------------
# Define groups of letters from the alphabet:
barga_k = list('xGX')
barga_c = list('qC')

# Ha after vowels
dict_ha = fala('h', list('aeiouAEIOU'))

# Ref.  The original iterated over the string '[kxg...s]', which also turned
# "r[" and "r]" into "r/[" and "r/]"; the brackets were regex-class syntax
# left in a plain string and are not letters.
dict_rf = {'r' + x: 'r' + '/' + x for x in 'kxgGcqjCTFDZNtQdznpfbvmJlHPSs'}

# Nasals
dict_k = nasa(barga_k, 'VNV')
dict_c = nasa(barga_c, 'NJ')
dict_nasa = dict()
dict_nasa.update(dict_k)
dict_nasa.update(dict_c)

# Dictionary constructing the conjunctions
dict_conj = {
    "AA": "Aa",
    "VVV": "g/Y",
    "_t": "t//",
    "nh": "n/H",
    "Nh": "N/H",
    "mh": "HM",
    "\"V": "NN",
    "VV\"": "NNG",
    "VNV": "NG",
    "th/th": r"{\char211}",
    "rh/g": "rhg",
    ":/:": ":/",
    r"*YA": "*{aa}",    # for YA-kar's bangtex symbol
    r"YA": "{AA}",      # for YA-kar's bangtex symbol
}


# ---------------------------------------------------------------------------
# Substitution machinery
# ---------------------------------------------------------------------------
def _const(replacement):
    """Return a ``re.sub`` callback that always yields *replacement* verbatim.

    Passing strings such as ``\\d{ri}`` directly as a replacement template
    raises "bad escape" on Python 3.7+; a callback bypasses template parsing
    and gives the literal text on every Python version.
    """
    def _repl(_match):
        return replacement
    return _repl


def _apply(rules, text):
    """Apply ``(pattern, replacement)`` *rules* to *text*, in order.

    A replacement is either a callback or a string inserted verbatim.
    """
    for pattern, repl in rules:
        if not callable(repl):
            repl = _const(repl)
        text = pattern.sub(repl, text)
    return text


# ORDER IS IMPORTANT in all the rule tables below.
_B2E_RULES = (
    (ns_k, 'NG'), (ns_c, 'NJ'), (xX, 'kshh'), (ch, 'c'), (H, 'h'),
    (aa, r'{\=a}'), (ii, r'{\=\i}'), (uu, r'{\=u}'), (Ri, r'\d{ri}'),
    (unga, r'{\.n}'), (inya, r'\~{n}'),
    (Th, r'\d{th}'), (T, r'\d{t}'), (Dh, r'\d{dh}'), (D, r'\d{d}'),
    (N, r'\d n'), (shh, r'\d{s}'), (sh, r"{\'s}"),
    (chandra, r'\u{n}'), (onuswar, r'\d{m}'), (ha, r'\d{h}'),
    (hasanta, ''), (_KHANDA_T, 't'), (rikar, r'\d{ri}'),
    (rh, 'R'), (R, r'\d{R}'), (J, 'y'), (oi, 'ai'), (ou, 'au'),
)

_BN_LETTER_RULES = (
    (xX, 'X'), (unga, 'VNV'), (ch, 'c'), (jh, 'C'), (Dh, 'Z'), (sh, 'S'),
    (ha, '~ha'), (Ri, '{RR}'), (jn, 'VVV'), (vw_b, caps),
    (p_tkp, 't//'),             # khanDa t
    (onuswar, '{VV"}'),         # onu:sar
    (rikar, '{rR}'),            # ri-fala
    (hasanta, ':/:'),           # hasanta
    (p_begin_Ha, 'H'),          # Ha in middle must be before -kars
)

_BN_VOWEL_RULES = (
    (ao, 'aO'), (ei, 'eI'), (eo, 'eO'), (eu, 'eU'),
    (io, 'iO'), (iu, 'iU'), (ui, 'uI'),
    (p_e, eikar),
    (ii, 'III'),                # protect ii while i-kar is handled
    (p_i, eikar),
    (p_ou, oukar), (p_oi, oikar),   # these have to appear before o-kar
    (p_o, okar),
    (oioi, '{oi}'), (III, 'ii'),
    (aii, 'aII'), (ai, 'aI'), (ae, 'aE'), (au, 'aU'),
    (aa, 'VxV'), (ao, 'XzX'),   # ko --> ekao ---> eka
    (a, 'o'),                   # changing a to o
    (XzX, 'a'), (VxV, 'a'),
    (chandra, '"V'),
)

_BN_FINAL_RULES = (
    (Ha, 'h'),
    (ng_prob, 'NG/'),
    (ngm, 'NG/m'),              # only for the exceptional baNGmay
)

# Translators are built once instead of on every call of bn().
_XLAT_HA = Xlator(dict_ha)
_XLAT_NASA = Xlator(dict_nasa)
_XLAT_REF = Xlator(dict_rf)
_XLAT_R_FALA = Xlator(
    fala('r', list(r'bcdfghjklmpqstvwxzBCDFGHJLMPQRSTWXYZ\/')))
# r and h are included in this list
_XLAT_Y_FALA = Xlator(
    fala('y', list(r'bcdfghjklmnpqrstvwxyzBCDFGHJLMNPQRSTVWXYZ\/')))
_XLAT_CONJ = Xlator(dict_conj)
# The original looped over '[mlw]'; the '[' and ']' tables mapped every key
# to itself, so leaving them out does not change any output.
_XLAT_MLW_FALA = tuple(
    Xlator(fala(f, list('bcdfghjklmnpqstvwxyzBCDFGHJLMNPQRSTVWXYZ')))
    for f in 'mlw')


# ---------------------------------------------------------------------------
# FUNCTIONS for mkbangtex
# ---------------------------------------------------------------------------
def b2e(self):
    """Transliterate phonetic Bengali *self* (a string) to accented Roman.

    Returns the text with LaTeX accent macros inserted.
    """
    return _apply(_B2E_RULES, self)


def bn(self):
    """Convert phonetic Bengali *self* (a string) to BangTeX input.

    This is the "transparent" conversion, without the cosmetic kerning done
    by ``bn_cosmetic``.
    """
    text = _apply(_BN_LETTER_RULES, self)
    text = _XLAT_HA.xlat(text)
    text = _apply(_BN_VOWEL_RULES, text)
    # ORDERS IN FOLLOWING IMPORTANT
    text = _XLAT_NASA.xlat(text)
    text = yukta_bnj.sub(yukta, text)
    text = _XLAT_NASA.xlat(text)
    text = _XLAT_REF.xlat(text)
    text = _XLAT_R_FALA.xlat(text)
    text = p_YA.sub(YAkar, text)        # for YA-kar's bangtex symbol
    text = _XLAT_Y_FALA.xlat(text)
    text = _XLAT_CONJ.xlat(text)
    for translator in _XLAT_MLW_FALA:
        text = translator.xlat(text)
    return _apply(_BN_FINAL_RULES, text)


def bn_cosmetic(self):
    """Convert *self* with ``bn`` and then apply cosmetic positioning."""
    text = bn(self)
    # changing position of u and uu under anomalous consonants
    text = text.replace('Yuu', r'uu{\kern-.06em}Y')
    text = text.replace('Yu', r'u{\kern-.06em}Y')
    text = anomaly_u.sub(ukar, text)
    text = u_kern.sub(_const(r'em}u{\kern.23em}'), text)
    text = uu_kern.sub(_const(r'em}uu{\kern.23em}'), text)
    text = text.replace(r'{\kern-.23em}{rR}',
                        r'{\kern-.23em}{rR}{\kern.23em}')
    # for xwa need to lower ba-fala
    text = text.replace('XW', r'X\kern-.25em\lower.2em\hbox{W}\kern.25em')
    # for Twa need to position ba-fala
    text = text.replace('TW', r'T\kern-.1em\lower.3em\hbox{W}\kern.1em')
    # for Tru you need to lower R & u
    text = text.replace(
        r'T{\kern-.23em}Ru',
        r'T{\kern-.2em}\lower.1em\hbox{R}\lower.15em \hbox{u}\kern.2em')
    # changing shhT and shhTha from default to more orthodox forms
    text = text.replace('Sh/Th', r'{\char178\kern-2pt\char70}')
    text = text.replace('Sh/T', r'{\char178\kern0pt\char84}')
    return text


def bnfn(self):
    """``re.sub`` callback: apply ``bn`` to the matched text."""
    return bn(self.group())


def bnfn_cosmetic(self):
    """``re.sub`` callback: apply ``bn_cosmetic`` to the matched text."""
    return bn_cosmetic(self.group())


# ---------------------------------------------------------------------------
# SUBSTITUTIONS
# ---------------------------------------------------------------------------
def _romanize_b2e_spans(text):
    """Run ``b2e`` on every span enclosed in a pair of '@b2e' delimiters."""
    parts = sndelim.split(text)
    # Odd-numbered parts lie between an opening and a closing delimiter.
    parts[1::2] = [b2e(part).strip() for part in parts[1::2]]
    return "".join(parts)


def _convert_foreign(text, cosmetic, ancient):
    """FOREIGN MODE: convert only the portions enclosed in '@bn' pairs."""
    to_bangtex = bn_cosmetic if cosmetic else bn
    text = fnmod_delim.sub(_const("@@@"), text)
    text = _romanize_b2e_spans(text)
    chunks = bndelim.split(text)
    for i in range(1, len(chunks), 2):
        # The dollars are kept by the split, so every fourth piece lies
        # outside $...$.
        # NOTE(review): "$$...$$" splits into empty pieces here, so display
        # math looks like it gets converted; confirm whether `fndelim`'s
        # "\$+" was intended in this mode as well.
        pieces = dollar.split(chunks[i])
        for j in range(0, len(pieces), 4):
            piece = pieces[j]
            if ancient:                 # for --ancient option
                piece = piece.replace("y", "{}")
            pieces[j] = to_bangtex(piece)
        chunks[i] = "".join(pieces).strip()
    return "".join(chunks)


def _convert_bengali(text, cosmetic, ancient):
    """BENGALI MODE: convert all non-LaTeX words outside '@fn' pairs."""
    word_to_bangtex = bnfn_cosmetic if cosmetic else bnfn
    text = sndelim.sub(_const("@b2e @@@ "), text)
    text = bnmod_delim.sub(_const("@@@"), text)
    text = _romanize_b2e_spans(text)
    # Delimiters are kept by the split, so every fourth chunk is Bengali.
    chunks = fndelim.split(text)
    for i in range(0, len(chunks), 4):
        chunk = chunks[i]
        if ancient:                     # for --ancient option
            chunk = nonltx_word.sub(y_blank_ancient, chunk)
        chunk = nonltx_word.sub(word_to_bangtex, chunk)
        chunks[i] = chunk.strip("\n")
    # get rid of the remaining @@@ delimiters
    return "".join(chunks).replace("@@@", "")


def convert(text, cosmetic=True, ancient=False):
    """Convert a whole document (one string) to BangTeX.

    The mode is detected from the delimiters present: foreign mode if any
    '@bn'-type delimiter occurs, Bengali mode otherwise.  *cosmetic* selects
    ``bn_cosmetic`` over ``bn`` (False corresponds to --transparent);
    *ancient* blanks out 'y' (the --ancient option).
    """
    if fnmod_delim.search(text):
        return _convert_foreign(text, cosmetic, ancient)
    return _convert_bengali(text, cosmetic, ancient)


def _print_alphabet():
    """Write the transliteration table to standard output as UTF-8."""
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(bengali_alphabet.encode('utf-8') + b"\n")


def main(argv=None):
    """Command-line entry point.

    *argv* is the argument list without the program name (defaults to
    ``sys.argv[1:]``).  Returns the process exit status: 0 on success, 2 for
    a bad option, 1 when the input file is missing or unreadable.
    """
    if argv is None:
        argv = sys.argv[1:]
    started = time.time()

    try:
        opts, args = getopt.getopt(
            argv, 'ho:',
            ["help", "outfile=", "transparent", "alphabet", "ancient"])
    except getopt.GetoptError:
        # print help information and exit
        print(usage % version)
        return 2

    for opt, _value in opts:
        if opt in ("-h", "--help"):
            print(usage % version)
            return 0
        if opt == "--alphabet":
            _print_alphabet()
            return 0

    if not args:
        print("\nNo input file given.\n"
              "For help menu use: mkbangtex --help\n")
        return 1

    input_filename = args[0]
    try:
        with open(input_filename, 'r') as in_file:
            # treating the whole content as a SINGLE string
            text = in_file.read()
    except (IOError, OSError):
        print("\nCan't open %s for reading.\n"
              "For help menu use: mkbangtex --help\n" % input_filename)
        return 1
    print("\nmkbangtex %s\nGot %s. Processing ..."
          % (version, input_filename))

    out_file_name = input_filename + '.tex'
    cosmetic = True
    ancient = False
    for opt, value in opts:
        if opt in ("-o", "--outfile"):
            out_file_name = value
        elif opt == "--transparent":
            cosmetic = False
        elif opt == "--ancient":
            ancient = True

    if fnmod_delim.search(text):
        print("fnmode selected.")
    elif bnmod_delim.search(text):
        print("bnmode selected.")

    result = convert(text, cosmetic=cosmetic, ancient=ancient)

    with open(out_file_name, 'w') as out_file:
        out_file.write(result)

    time_taken = time.time() - started
    print("Processed %s in %ss. Output written in %s."
          % (input_filename, time_taken, out_file_name))
    return 0


if __name__ == "__main__":
    sys.exit(main())