#!/usr/bin/python
 
# Last updated: 13-09-2006:       
#			  added the unicode-bengali alphabet list
#                         added --ancient option
#                         fixed the problem of unit specification 
#  This code uses Xavier Defrang's Single-Pass Multiple Replace, 
#  Py Cookbook 81330 (=3.14 hardcopy) in an essential fashion.

from __future__ import nested_scopes

import string, re, sys, getopt, timing
timing.start()

# Program version; interpolated into the help text and the start-up banner.
version = "1.1"

# Help text.  It contains one %s placeholder, which is filled in with
# `version` when the text is printed for -h/--help.
usage = '''     
This is mkbangtex Version %s.
It creates a BangTeX file from a Bengali document written
phonetically in Roman script. It works in two modes, which 
are automatically detected:
bengali: converts anything but single-word latex  
         commands not enclosing within a pair of '@fn's
foreign: converts only the portions enclosed within a pair of '@bn's
CAUTION: Only ONE mode can be used in a  file 

A piece of text enclosed within a pair of '@b2e's is converted  
from Bengali to acctented Roman script following the IAST (1912) 
rules. Works in both modes. 
[See accompanying documentation for further details]
---

Usage: mkbangtex [-h, --alphabet --transparent, --ancient, -o <out_file>]  <in_file> 
        -h, --help    : print this page
        --alphabet    : transliteration table
        --transparent : transparent symbols and default positioning
        --ancient     : ancient Bengali --- without antastha ya 
        -o <out_file> : specify output file (default: <in_file>.tex)
        '''

# Transliteration table printed by --alphabet.  It is a unicode literal
# (each entry pairs a Roman spelling with a Bengali code point) and is
# encoded to UTF-8 at the point where it is printed.
bengali_alphabet = u'''
    BENGALI   ALPHABET 

a( \u0985 ) aa( \u0986 ) i( \u0987 ) ii( \u0988 ) 
u( \u0989 ) uu( \u098A ) Ri( \u098B )  \u098C (not used)
e( \u098F ) oi( \u0990 ) o( \u0993 ) ou( \u0994 )
------------------------------------------------
ka( \u0995 ) kha( \u0996 ) ga( \u0997 ) gha( \u0998 ) NG( \u0999 ) 
ca/cha( \u099A ) chha( \u099B ) ja( \u099C ) jha( \u099D ) NJ( \u099E )
Ta( \u099F ) Tha( \u09A0 ) Da( \u09A1 ) Dha( \u09A2 ) Na( \u09A3 )
ta( \u09A4 ) tha( \u09A5 ) da( \u09A6 ) dha( \u09A7 ) na( \u09A8 )
pa( \u09AA ) pha( \u09AB ) ba( \u09AC ) bha( \u09AD ) ma( \u09AE )
Ja( \u09AF ) ra( \u09B0 ) la( \u09B2 ) wa( \u09AC )
sha( \u09B6 ) Sha/shha( \u09B7 ) sa( \u09B8 ) ha/Ha( \u09B9 )
rh( \u09DC ) rhh( \u09DD ) _m( \u0982 ) ~h( \u0983 ) ~n( \u0981 )
------------------------------------------------
khaNDa ta: _t( \u09CE ); hasanta: _h( \u09CD ); k+shh: x antastha y: y( \u09DF )
'''


# Parse the command line.  "outfile=" takes an argument and is handled
# below as the long spelling of -o; without the "=" getopt would treat the
# output name that follows it as the input file.
try:
    opts, args = getopt.getopt(
        sys.argv[1:], 'ho:',
        ["help", "outfile=", "transparent", "alphabet", "ancient"])
except getopt.GetoptError:
    # Unknown option or missing option argument: show the help and fail.
    # The help text carries a %s placeholder, so it must be formatted here too.
    print(usage % version)
    sys.exit(2)

# Informational options: print and leave before any file is touched.
for o, a in opts:
    if o in ("-h", "--help"):
        print(usage % version)
        sys.exit()
    if o == "--alphabet":
        print(bengali_alphabet.encode('utf-8'))
        sys.exit()


# define input file

if not args:
    print('\nNo input file given.\n'
          'For help menu use: mkbangtex --help\n')
    sys.exit(2)

input_filename = args[0]

# Check if the file exists
try:
    fi = open(input_filename, 'r')
except IOError:
    # Report the name that was actually requested and signal the failure
    # to the calling shell with a non-zero status.
    print('\nCan\'t open %s for reading.\n'
          'For help menu use: mkbangtex --help\n' % input_filename)
    sys.exit(1)

print('\nmkbangtex %s \nGot %s. Processing ...' % (version, input_filename))


out_file_name = input_filename + '.tex'        # initialize the output file
# The two switches are kept as the strings "True"/"False" because the
# main part of the script compares them against exactly those strings.
cosmetix = "True"                              # initialize cosmetics to TRUE
Ancient_bng = "False"                          # initialize Ancient_bng to FALSE

# Set up the options
for o, a in opts:
    if o in ("-o", "--outfile"):
        out_file_name = a
    elif o == "--transparent":
        cosmetix = "False"
    elif o == "--ancient":
        Ancient_bng = "True"


# define and open output file
try:
    o_file = open(out_file_name, 'w')
except IOError:
    fi.close()
    print('\nCan\'t open %s for writing.\n'
          'For help menu use: mkbangtex --help\n' % out_file_name)
    sys.exit(1)

# setting up delimiters for both modes
fnmod_delim = re.compile(r'@bengali|@bnstart|@bnend|@bn')   
bnmod_delim = re.compile(r"@fn")
bndelim = re.compile(r'@@@')                  # internal marker the mode delimiters are rewritten to
sndelim = re.compile(r'@b2e')                 # for transliteration
fndelim = re.compile(r"(\$+|@@@)", re.X)      # dollar and @@@ treated together
dollar = re.compile(r"(\$)", re.X)            # one dollar sign; captured so split() keeps it
# NOTE(review): unlike fndelim, `dollar` matches a single '$', so a '$$'
# pair is split into two delimiters -- confirm this is intended.


# define non-latex words
nonltx_word = re.compile(r"\s-?(?![\d+])\w*-?\(?\~?[\`]*\w+\S*|{o}\S+|{i}\S+", re.X)

# compile patterns to be substituted
# (patterns that contain a backslash are written as raw strings so the
#  regex text never depends on how Python treats unknown string escapes)
ns_k = re.compile('n(?=k|g|x)')               # nasals before k-barga
ns_c = re.compile('n(?=c|j)')                 # nasals before c-barga
tal_s = re.compile(r'sh(?=[bcdfghjklmnpqrstwxyz])') 
                                              # sh followed by consonants, (for b2e)
xX = re.compile(r"x", re.I)
unga = re.compile(r"NG")
ch = re.compile(r"ch")
jh = re.compile(r"jh")
Dh = re.compile(r"Dh")
sh = re.compile(r"sh")
ha = re.compile(r"~ha|~h")
Ri = re.compile(r"Ri")
jn = re.compile(r"jn")
onuswar = re.compile(r"_m")
rikar = re.compile(r"_r")
hasanta = re.compile(r"_h")
rh = re.compile("rh")
Rh = re.compile("Rh")
R = re.compile("R")
J = re.compile("J")
oi = re.compile("oi")
ou = re.compile("ou")
aii = re.compile(r"aii")
ai = re.compile(r"ai")
au = re.compile(r"au")
aa = re.compile(r"aa")
ao = re.compile(r"ao")                        # (was compiled twice; once is enough)
a = re.compile(r"a")
ae = re.compile(r"ae")
ei = re.compile(r"ei")
eo = re.compile(r"eo")
eu = re.compile(r"eu")
io = re.compile(r"io")
iu = re.compile(r"iu")
ui = re.compile(r"ui")
ii = re.compile(r"ii")
oioi = re.compile(r"oioi")
III = re.compile(r"III")                      # III, XzX, VxV: temporary placeholders used inside bn()
XzX = re.compile(r"XzX")
VxV = re.compile(r"VxV")
chandra = re.compile(r"~n")
ng_prob = re.compile(r"NG(?=M|r)")
ngm = re.compile(r"NG/M")
Ha = re.compile(r'~Ho|~Ha')
H = re.compile("H")
uu = re.compile("uu")
inya = re.compile("NJ")
T = re.compile('T')
D = re.compile('D')
N = re.compile('N')
Th = re.compile('Th')
shh = re.compile('shh')
vw_b = re.compile(r'\b[aeiou]+')              # vowels at word-beginning
p_i = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=i)')
p_e = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=e)')
p_YA = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=(YA))')
                                              # for YA-kar's bangtex symbol
p_o = re.compile(r'([\^\~]|[b-dB-Df-hF-Hj-nJ-Np-tP-Tv-zV-Z])+(?=o)')
p_ou = re.compile(r'([\^\~]|[b-df-hj-np-tv-z])+(?=ou)', re.I)
p_oi = re.compile(r'([\^\~]|[b-df-hj-np-tv-z])+(?=oi)', re.I)
p_begin_Ha = re.compile(r'\bh', re.X)         # Ha  at word-beginning
yukta_bnj = re.compile(r'[kxgcjTDNtdmnpblhHSsXZ](?=[kxgcCjTDNtTdnpbHSsXZ])')
p_tkp = re.compile(r'(\_*)t(?=k|x|p|f|s|X)')  # t before ka,kha,pa,pha,s,X is khanda t
# consonants under which u, uu and Ri-kar need be shifted
anomaly_u = re.compile(r'(k|kR|NG|c|C|T|D|Z|t|ph|f|phR|bh|X)(?=u|{rR})|(T(?=R))')
u_kern = re.compile(r'em}u(?!u)')
uu_kern = re.compile(r'em}uu')


# DEFINING VARIOUS FUNCTIONS 
# 
# Functions from Defrang's code

def multiple_replace(dict, text):           

  """ Replace in 'text' all occurrences of any key in the given
  dictionary by its corresponding value.  Returns the new string.

  Keys are matched literally (they are regex-escaped) in one pass over
  'text', so the output of one replacement is never scanned again.
  When one key is a prefix of another, the longer key wins.  An empty
  dictionary leaves 'text' unchanged. """ 

  # With no keys the pattern would be "()", which matches the empty
  # string at every position and then fails looking up ''.
  if not dict:
    return text

  # Create a regular expression from the dictionary keys.  Alternation
  # takes the first branch that matches, so the longest keys go first;
  # otherwise the result would depend on the dictionary's key order.
  keys = sorted(dict.keys(), key=len, reverse=True)
  regex = re.compile("(%s)" % "|".join(map(re.escape, keys)))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.group(0)], text) 


from UserDict import UserDict 
class Xlator(UserDict):

  """ An all-in-one multiple string substitution class.

  The mapping's keys are the literal strings to look for and its values
  are their replacements.  xlat() performs all substitutions in a single
  pass; afterwards `count` holds the number of substitutions made. """ 

  # Substitutions made by the most recent xlat() call.  The class-level
  # default keeps __call__ usable even before xlat() has run.
  count = 0

  def _make_regex(self): 

    """ Build a regular expression object based on the keys of
    the current dictionary.  Longer keys are placed first so that a
    key which is a prefix of another key cannot hide the longer one. """

    keys = sorted(self.keys(), key=len, reverse=True)
    return re.compile("(%s)" % "|".join(map(re.escape, keys))) 

  def __call__(self, mo): 
    
    """ This handler will be invoked for each regex match """

    # Count substitutions
    self.count += 1

    # Look-up string
    return self[mo.group(0)]

  def xlat(self, txt): 

    """ Translate text, returns the modified text. """ 

    # Reset substitution counter
    self.count = 0 

    # An empty mapping would produce the pattern "()", which matches
    # everywhere and then fails on the lookup of ''.
    if not self.data:
      return txt

    # Process text
    return self._make_regex().sub(self, txt)


# BENGALI Functions for -fala's and -kar's

# Various -kar's 

def eikar(self):
  """re.sub callback: return the matched text between a backslash-star
  prefix and a closing star."""
  return r'\*%s*' % self.group()

def okar(self):
  """re.sub callback: return the matched text between a backslash-star
  prefix and the suffix '*ea'."""
  cluster = self.group()
  return ''.join(['\\*', cluster, '*ea'])

def oukar(self):
  """re.sub callback: return the matched text between a backslash-star
  prefix and the suffix '*e'."""
  cluster = self.group()
  return r'\*' + cluster + '*e'

def oikar(self):
  """re.sub callback: return the matched text between a backslash-star
  prefix and the suffix '*oi'."""
  return r'\*%s*oi' % self.group()


def fala(self, c_list):                       # Various -fala's
  """Build a table mapping each letter of c_list followed by `self`
  to the same letter followed by `self` in upper case."""
  raised = self.upper()
  table = {}
  for letter in c_list:
    table[letter + self] = letter + raised
  return table


def nasa(self, replmnt):                      # Nasals
  """Build a table mapping 'n' + letter to replmnt + '/' + letter for
  every letter in `self`."""
  joined = replmnt + '/'
  table = {}
  for letter in self:
    table['n' + letter] = joined + letter
  return table

def caps(self):
  """re.sub callback: return the matched text upper-cased and wrapped
  in braces."""
  return '{%s}' % self.group().upper()

def yukta(self):
  """re.sub callback: return the matched text followed by a slash."""
  return '%s/' % self.group()

def ukar(self):                               # for shifting u/uu under
  """re.sub callback: return the matched text followed by a negative
  kern of .23em."""                           # some consonants
  kern = r'{\kern-.23em}'
  return ''.join((self.group(), kern))

def YAkar(self):                              # for YA-kar's bangtex symbol
  """re.sub callback: return the matched text between a backslash-star
  prefix and a closing star."""
  cluster = self.group()
  return '\\*' + cluster + '*'


def y_blank_ancient(self):
  """re.sub callback for --ancient: return the matched text with every
  'y' replaced by an empty brace pair."""
  pieces = self.group().split("y")
  return "{}".join(pieces)
# LISTS and  DICTIONARIES:
#
# Define groups of letters from the alphabet:
barga_k = list('xGX')
barga_c = list('qC')
# Ha after vowels
dict_ha = fala('h', list('aeiouAEIOU'))
# Ref: 'r' + letter  -->  'r/' + letter.
# The letters are listed as a plain string.  The earlier '[...]' spelling
# was iterated character by character, so '[' and ']' became keys as well
# and "r[" / "r]" in the text were rewritten as if they were consonants.
dict_rf = dict([('r'+x, 'r'+'/'+x) for x in 'kxgGcqjCTFDZNtQdznpfbvmJlHPSs'])
# Nasals
dict_k =  nasa(barga_k,'VNV')
dict_c =  nasa(barga_c, 'NJ') 
dict_nasa = dict()
dict_nasa.update(dict_k)
dict_nasa.update(dict_c)
## Dictionary constructing the conjunctions
dict_conj = {
  "AA" : "Aa" ,
  "VVV":"g/Y",
  "_t" : "t//",
  "nh" : "n/H",
  "Nh" : "N/H",
  "mh" : "HM",
  "\"V" : "NN",
  "VV\"" : "NNG",
  "VNV" : "NG",
  "th/th" : r"{\char211}",
  "rh/g" : "rhg",
  ":/:" : ":/",
  r"*YA" : "*{aa}",                           # for YA-kar's bangtex symbol
  r"YA" : "{AA}",                             # for YA-kar's bangtex symbol
  } 

# FUNCTIONS for  mkbangtex

def b2e(self):                                # b2e : for transliteration
 """Return `self` (phonetic Bengali text) rewritten as accented Roman
 text expressed with TeX accent commands.

 The rules are applied one after another in the order listed; later
 rules see the output of earlier ones, so the order must not change.
 """
 # Rules applied before the khanda-ta marker '_t' is reduced to 't'.
 before_khanda_t = (
   (ns_k, 'NG'),
   (ns_c, 'NJ'),
   (xX, 'kshh'),
   (ch, 'c'),
   (H, 'h'),
   (aa, r'{\=a}'),
   (ii, r'{\=\i}'),
   (uu, r'{\=u}'),
   (Ri, r'\d{ri}'),
   (unga, r'{\.n}'),
   (inya, r'\~{n}'),
   (Th, r'\d{th}'),
   (T, r'\d{t}'),
   (Dh, r'\d{dh}'),
   (D, r'\d{d}'),
   (N, r'\d n'),
   (shh, r'\d{s}'),
   (sh, r"{\'s}"),
   (chandra, r'\u{n}'),
   (onuswar, r'\d{m}'),
   (ha, r'\d{h}'),
   (hasanta, ''),
 )
 # Rules applied afterwards.
 after_khanda_t = (
   (rikar, r'\d{ri}'),
   (rh, 'R'),
   (R, r'\d{R}'),
   (J, 'y'),
   (oi, 'ai'),
   (ou, 'au'),
 )

 roman = self
 for pattern, replacement in before_khanda_t:
  # a callable replacement inserts the text exactly as written
  roman = pattern.sub(lambda mo, out=replacement: out, roman)
 roman = roman.replace('_t', 't')
 for pattern, replacement in after_khanda_t:
  roman = pattern.sub(lambda mo, out=replacement: out, roman)
 return roman


def bn(self):                                 # bn : phonetic Roman --> BangTeX
 """Return `self` (phonetic Roman text) converted to BangTeX input.

 The conversion is a fixed sequence of substitutions.  Several steps
 park text in temporary placeholders (III, VxV, XzX, VNV, VVV) that a
 later step resolves, so the statements must stay in this order.
 """
 self = xX.sub(r"X", self)
 self = unga.sub("VNV", self)
 self = ch.sub("c", self)
 self = jh.sub("C", self)
 self = Dh.sub("Z", self)
 self = sh.sub("S", self)
 self = ha.sub("~ha", self)
 self = Ri.sub("{RR}", self)
 self = jn.sub("VVV", self)
 self = vw_b.sub(caps, self)
 self = p_tkp.sub(r't//', self)               # khanDa t
 self = onuswar.sub('{VV"}', self)            # onu:sar 
 self = rikar.sub('{rR}', self)               # ri-fala
 self = hasanta.sub(':/:', self)              # hasanta
 self = p_begin_Ha.sub('H', self)             # Ha in middle must be before -kars
 xlat = Xlator(dict_ha)
 self = xlat.xlat(self)
 self = ao.sub(r"aO", self)
 self = ei.sub(r"eI", self)
 self = eo.sub(r"eO", self)
 self = eu.sub(r"eU", self)
 self = io.sub(r"iO", self)
 self = iu.sub(r"iU", self)
 self = ui.sub(r"uI", self)
 self = p_e.sub(eikar, self)
 self = ii.sub('III', self)                   # park ii so the i-kar rule skips it
 self = p_i.sub(eikar, self)                  # these have to appear before 
 self = p_ou.sub(oukar, self)          
 self = p_oi.sub(oikar, self)   
 self = p_o.sub(okar, self)                   # changing a to o
 self = oioi.sub(r'{oi}', self)
 self = III.sub('ii', self)                   # restore the parked ii
 self = aii.sub('aII', self)
 self = ai.sub('aI',self)
 self = ae.sub('aE',self)
 self = au.sub('aU',self)
 self = aa.sub('VxV',self)                    # park aa and ao while a --> o
 self = ao.sub('XzX',self)                    # ko --> ekao ---> eka
 self = a.sub('o',self)
 self = XzX.sub('a',self)
 self = VxV.sub('a',self)
                                              # ORDERS IN FOLLOWING IMPORTANT
 self = chandra.sub('"V', self)
 xlat = Xlator(dict_nasa)
 self = xlat.xlat(self)
 self = yukta_bnj.sub(yukta,self)
 self = xlat.xlat(self)                       # nasal table again, after conjunct marking
 xlat = Xlator(dict_rf)
 self = xlat.xlat(self)
 xlat = Xlator(fala('r',list('bcdfghjklmpqstvwxzBCDFGHJLMPQRSTWXYZ\\/')))
 self = xlat.xlat(self)
 xlat = Xlator(fala('y',list('bcdfghjklmnpqrstvwxyzBCDFGHJLMNPQRSTVWXYZ\\/')))
                                              # putting in r,h in this list
 self = p_YA.sub(YAkar, self)                 # for YA-kar's bangtex symbol
 self = xlat.xlat(self)
 xlat = Xlator(dict_conj)
 self = xlat.xlat(self)
 # m-, l- and w-fala.  Only the three letters are iterated: the former
 # '[mlw]' spelling also looped over '[' and ']', which built two
 # translators per call that mapped every key to itself.
 for f in 'mlw':
   xlat = Xlator(fala(f,list('bcdfghjklmnpqstvwxyzBCDFGHJLMNPQRSTVWXYZ')))
   self = xlat.xlat(self)
 self = Ha.sub('h', self)
 # ng_prob is the module-level pattern; it no longer gets recompiled
 # under the same name on every call.
 self = ng_prob.sub('NG/',self)
 self = ngm.sub('NG/m', self)                 # only for the exceptional baNGmay
 return self


def bn_cosmetic(self):
 """Return bn(self) with additional kerning / lowering adjustments
 applied to the resulting BangTeX text."""
 out = bn(self)

 # u and uu written after Y are moved in front of it
 for old, new in (('Yuu', r'uu{\kern-.06em}Y'),
                  ('Yu', r'u{\kern-.06em}Y')):
  out = out.replace(old, new)

 # changing position of u and uu under anomalous consonants
 out = anomaly_u.sub(ukar, out)
 out = u_kern.sub(lambda mo: r'em}u{\kern.23em}', out)
 out = uu_kern.sub(lambda mo: r'em}uu{\kern.23em}', out)

 # replacement for Tru: continues on a second, indented line
 tru_fix = (r'T{\kern-.2em}\lower.1em\hbox{R}\lower.15em' + '\n'
            + ' ' * 44 + r'\hbox{u}\kern.2em')

 literal_fixes = (
   (r'{\kern-.23em}{rR}', r'{\kern-.23em}{rR}{\kern.23em}'),
   # for xwa need to lower ba-fala
   ('XW', r'X\kern-.25em\lower.2em\hbox{W}\kern.25em'),
   # for Twa need to position ba-fala
   ('TW', r'T\kern-.1em\lower.3em\hbox{W}\kern.1em'),
   # for Tru you need to lower R & u
   (r'T{\kern-.23em}Ru', tru_fix),
   # changing shhT and shhTha from default to more orthodox forms
   ('Sh/Th', r'{\char178\kern-2pt\char70}'),
   ('Sh/T', r'{\char178\kern0pt\char84}'),
 )
 for old, new in literal_fixes:
  out = out.replace(old, new)
 return out
 

def bnfn(self):                               # re.sub callback around bn()
 """Return bn() applied to the text matched by the match object `self`."""
 matched_text = self.group()
 return bn(matched_text)

def bnfn_cosmetic(self):                      # re.sub callback around bn_cosmetic()
 """Return bn_cosmetic() applied to the text matched by the match
 object `self`."""
 matched_text = self.group()
 return bn_cosmetic(matched_text)


# SUBSTITUTIONS :

if __name__ == "__main__": 

  # Read text --- treating the whole content as a SINGLE string.
  # The input handle is not needed afterwards, so release it right away.
  try:
    text = fi.read()
  finally:
    fi.close()

  # Mode detection: any foreign-mode marker selects fnmode; otherwise the
  # text is handled in bengali mode (announced only if an @fn is present).
  fnmode = ""
  if fnmod_delim.search(text):
    fnmode = "True"
    print("fnmode selected.")
  elif bnmod_delim.search(text):
    fnmode = "False"
    print("bnmode selected.")

# Now translate the text 
# The delimiters are handled differently in BNMODE and FNMODE.
# Hence, the delimiter substitution and transliteration 
#        are to be performed inside the loops.

  if fnmode == "True":                        # case: FOREIGN MODE: START
    # Manipulating delimiters
    text = fnmod_delim.sub("@@@", text)
    # first do the roman transliteration, if any:
    # odd-numbered pieces lie between a pair of @b2e markers
    sn_list = sndelim.split(text)
    for i in range(1, len(sn_list), 2):
      sn_list[i] = b2e(sn_list[i]).strip()
    text = "".join(sn_list)
    # odd-numbered pieces lie between a pair of @@@ markers
    big_list = bndelim.split(text)
    for i in range(1, len(big_list), 2):
      # `dollar` keeps its delimiters, so every 4th piece is outside $...$
      smal_list = dollar.split(big_list[i])
      for j in range(0, len(smal_list), 4):
        text = smal_list[j]
        if Ancient_bng == "True":             # for --ancient option
          text = text.replace("y", "{}")
        if cosmetix == "False":               # for --transparent option
          text = bn(text)
        else:
          text = bn_cosmetic(text)
        smal_list[j] = text
      # join once, after all pieces of this region have been converted
      big_list[i] = "".join(smal_list).strip()
    text = "".join(big_list)                  # FOREIGN MODE: END
  else:                                       # case: BNMODE: START
    # Manipulating delimiters: the @@@ added after each @b2e marks the
    # transliterated piece as one that must not be converted again below
    text = sndelim.sub("@b2e @@@ ", text)
    text = bnmod_delim.sub("@@@", text)
    # first do the roman transliteration, if any
    sn_list = sndelim.split(text)
    for i in range(1, len(sn_list), 2):
      sn_list[i] = b2e(sn_list[i]).strip()
    text = "".join(sn_list)
    # `fndelim` keeps its delimiters, so every 4th piece is Bengali text
    big_list = fndelim.split(text)
    for i in range(0, len(big_list), 4):
      text = big_list[i]
      if Ancient_bng == "True":               # for --ancient option
        text = nonltx_word.sub(y_blank_ancient, text)
      if cosmetix == "False":                 # for --transparent option
        text = nonltx_word.sub(bnfn, text)
      else:
        text = nonltx_word.sub(bnfn_cosmetic, text)
      big_list[i] = text.strip("\n")
    text = "".join(big_list)                  # BNMODE: END

  # get rid of the internal delimiter @@@ (both modes)
  text = text.replace("@@@", "")

# write in the output file, and make sure it is flushed and closed
  try:
    o_file.write(text)
  finally:
    o_file.close()

  timing.finish()

# on standard output
  time_taken = float(timing.micro()) / 1000000 
  print("Processed %s in %ss. Output written in %s." % (input_filename, time_taken, out_file_name))