#! /usr/bin/env python
# proc_pubs.py

# Read ~/tx/resume/input/pubs.tex and generate html files

# Note: First part is almost identical to ~/tx/res/input/rev_pubs.py

import sys
import time
import re
# For regular expression handling, esp. re.sub

#===============================================
# Main routine
#===============================================

def main():

  global if_aip,fp,fl

  # Announce operation
  print 'Reading ~/tx/resume/input/pubs.tex...'

  f=open('/home/dhv/tx/resume/input/pubs-tagged.tex','r')
  infile=f.read()
  f.close()

  # Remove spaces in otherwise blank lines
  infile=re.sub('\n +\n','\n\n',infile)
  # Two or more blank lines become just two
  infile=re.sub('\n\n+','\n\n',infile)

  # Convert string-file into list of pubs
  publist=infile.split('\n\n')
  # Strip final \n from last entry for consistency
  if publist[-1][-1]=='\n':
    publist[-1]=publist[-1][0:-1]
  num=len(publist)

  # Note that entry zero is really just the %%% comment block
  print 'Entries found = ',num-1

  # Announce operation
  print 'Making ~/pu/pubs/pub_list.html and pub_list_local.html...'

  # Reverse the list, make index list, and add \n's
  ind=[]
  list=[]
  for j in reversed(range(1,num)):
    list+=[publist[j]+'\n']
    ind+=[j]
  n=len(list)

  # Open two files to write
  fp=open('pub_list.html','w')
  fl=open('pub_list_local.html','w')

  # Print headers
  header=get_text('header')
  fp.write(header)
  fl.write(header)

  no_tag_list=[]

  # Now step through the entries and parse each

  for j in range(n):
    # write header for new entry
    nnn='%d' % ind[j]
    nnt='   '+nnn
    str='<a name="'+nnn+'">\n<p><li value="'+nnt[-3:]+'">\n'
    fp.write(str)
    fl.write(str)

    entry=list[j]
    mtext=''
    if_aip=False
    lines=entry.split('\n')

    # first line should be id line
    if lines[0].startswith('%id '):
      parts=lines[0].split(' ')
      lenp=len(parts)
      if lenp < 2:
        print '*** id line has no number'
        print lines[0]
        print '*** HALTING EXECUTION ***'
        sys.exit(1)
      if int(parts[1]) != ind[j]:
        print '*** id entry number does not match for entry'
        print lines[0]
        print '*** HALTING EXECUTION ***'
        sys.exit(1)
      if lenp == 3:
        paper_id=parts[2]
      else:
        paper_id=''
        trailer=''
        if ind[j]>150: no_tag_list+=[ind[j]]
      lines=lines[1:]
    else:
      print '*** Error: entry does not start with id tag ***'
      print '\n'.join(lines)
      print '*** HALTING EXECUTION ***'
      sys.exit(1)
    for line in lines:
      line=line+'\n'
      if line.startswith('%html '):
        line=line[6:]
      if line.startswith('%break'):
        mtext=mtext+'<br>\n'
        parse_latex(mtext)
        mtext=''
      elif line.startswith('%link '):
        parse_latex(mtext)
        mtext=''
        parse_links(line[6:])
      elif len(line)>1 and not line.startswith('%'):
        mtext=mtext+line
    mtext=mtext+'\n'
    parse_latex(mtext)
    if paper_id!='':
      fl.write('&nbsp&nbsp<em><font color="green">'+paper_id+'</font></em>\n')

  # Print footers
  footer=get_text('footer')
  fp.write(footer)
  fl.write(footer)

  print 'Missing tags for entries above 150:'
  tt=''
  nnn=len(no_tag_list)
  for j in range(nnn):
    tt+='%4d' % no_tag_list[nnn-j-1]
    if j%15 == 14: tt+='\n'
  print tt

  # Announce operation
  print 'Done.'

  return

#===============================================
# Parse latex and print
#===============================================
def parse_latex(text):
  # Convert a chunk of latex source to html and write it to both
  # output files (fp and fl).  Author lines (text before an opening
  # ``) are additionally run through parse_auth for abbreviation.
  global if_aip,fp,fl
  if len(text)==0:
    return

  # accents and diacriticals
  text=text.replace('\\\'e','&eacute;')
  text=text.replace('\\\'a','&aacute;')
  text=text.replace('\\\'I','&Iacute;')
  text=text.replace('\\\'E','&Eacute;')
  text=text.replace('\\~n','&ntilde;')
  text=text.replace('\\\'{\\i}','&iacute;')
  text=text.replace('\\"o','&ouml;')

  # latex constructs that get replaced by a simple space
  text=text.replace('\\-','')
  text=text.replace('\\ ',' ')
  text=text.replace('\\break',' ')
  text=text.replace('\\thinspace',' ')
  text=text.replace('.~','. ')

  # special math constructs in their entirety
  text=text.replace('$^\\circ$',' degree')
  text=text.replace("$'$",'&prime;')
  text=text.replace("$''$",'&Prime;')    # suspicious ...
  text=text.replace('$\\beta$','&beta;')
  text=text.replace('$^{\\rm th}$','th')
  text=text.replace('$\\bar1$','[1-bar]')

  # now do replacements that require regular expression handling
  # hints:
  #   r'string' is a raw string (not parsed by Python before passing)
  #   ? makes .* the search 'not greedy' (as short as possible)

  text=re.sub(r'{\\bf (.*?)}',r'\1',text)
  text=re.sub(r'\$_([0-9,u-z])\$',r'<font size="-2">\1</font>',text)
  text=re.sub(r'\$_{(.*?)}\$',r'<font size="-2">\1</font>',text)
  text=re.sub(r'\$([A-Z]*?)\$',r'\1',text)

  # The inline (?s) flag makes . match newlines too (same as re.DOTALL).
  # NOTE: the previous code passed re.DOTALL as the 4th positional
  # argument of re.sub, but that argument is 'count', not 'flags' --
  # so DOTALL was never in effect and at most 16 (==re.DOTALL)
  # substitutions were made.  The inline flag works on all versions.
  text=re.sub(r'(?s){\\sl (.*?)}',r'<em>\1</em>',text)
  text=re.sub(r'(?s){\\it (.*?)}',r'<em>\1</em>',text)

  # clean up remaining math constructs
  text=text.replace('$Z_2$','Z<font size="-2">2</font>')
  # warn if any are left
  if '$' in text:
    print('*** Note $ in:  '+text)

  # finally, do special processing on author names

  skip_authors=False      # Set True for debug and to compare old style
  if skip_authors:
    text=text.replace('``','"')
    text=text.replace("''",'"')
  else:

    [head,x,tail]=text.partition('``')
    #   If above fails, it leaves '' in 2nd and 3rd entries of tuple

    # call routine for parsing author names
    if x!='': text=parse_auth(head)+'"'+tail.replace("''",'"')

    text=text.replace('``','"')  # in case there was another open quote

  fp.write(text)
  fl.write(text)
  return

#===============================================
# Parse author names and abbreviate
#===============================================
def parse_auth(text):
  # Abbreviate first/middle names in an author list and re-wrap the
  # result onto lines of at most 72 characters.  'See ...'
  # cross-references are returned untouched.
  if text.startswith('See'):
    return(text)

  # Flatten to a single space-separated line; re-wrapped below.
  flat=re.sub(' +',' ',text.replace('\n',' ')).strip(', ')
  # Split the author list on ', and' / ' and' / ','
  names=re.split(', and | and |,',flat)

  # Middle names abbreviated to an initial vs. kept in full
  middle_abbrev=['Seok','Jie','Wook','Nyung','Luigi','Nihat']
  middle_save=['De','Di','Banaszak','Cargill']

  out=''
  line=''
  na=len(names)   # number of names in the author list
  for ia,raw in enumerate(names):
    name=raw.strip(' ')
    pieces=name.split(' ')
    nn=len(pieces)   # number of parts to one author name
    if nn <= 1:
      print('*** Author name too short ***')
      print('%s %s' % (name,pieces))
      sys.exit(1)
    if nn > 4:
      print('*** Author name too long ***')
      print('%s %s' % (name,pieces))
      sys.exit(1)
    aname=''
    for j,part in enumerate(pieces):
      # Note: re.match anchors at the beginning of the string
      if re.match(r'[A-Z].*\.$',part) is not None:
        aname+=part              # already an initial like 'Q.' -- keep
      elif j==0:
        aname+=part[0]+'.'       # first name -> initial
      elif j==nn-1:
        aname+=' '+part          # surname kept in full
      elif part in middle_save:
        aname+=' '+part
      elif part in middle_abbrev:
        aname+=part[0]+'.'
      else:
        # issue warning if this middle name not recognized
        print('*** unrecognized middle name:  '+part)
        aname+=part[0]+'.'
    # aname is now abbreviated author name (no commas etc.)
    if na > 1 and ia==na-1:
      aname='and '+aname
    if len(line)+len(aname) >= 72:
      out+=line[:-1]+'\n'
      line=''
    if na==2 and ia==0:
      line+=aname+' '
    else:
      line+=aname+', '
  if line != '':
    out+=line[:-1]+'\n'
  return(out)

#===============================================
# Parse links and print
#===============================================
def parse_links(in_text):
  # Take the body of a '%link (...)' line, format each comma-separated
  # item via parse_link, and write the results wrapped in <em>(...)</em>
  # to the public (fp) and local (fl) output files.
  global if_aip,fp,fl
  items=in_text.strip(' ()\n').split(',')
  if len(items)==0:
    return
  text_pub=''
  text_loc=''
  for raw in items:
    item=raw.strip()
    text_pub+=parse_link(True,item)
    text_loc+=parse_link(False,item)
  # drop the trailing ',\n' left by the last item
  text_pub=text_pub.strip(', \n')
  text_loc=text_loc.strip(', \n')
  if len(text_pub)>0:
    fp.write('<em>('+text_pub+')</em>\n')
  if len(text_loc)>0:
    fl.write('<em>('+text_loc+')</em>\n')
  return

#===============================================
# Parse one link
#===============================================
def parse_link(if_public,text):
  # Convert one link item (e.g. 'xs-0705.1234-xe') into an html anchor
  # string ending in ',\n', or '' if the link is suppressed.
  #   if_public=True :  For pub_list.html
  #   if_public=False:  For pub_list_local.html
  global if_aip,fp,fl

  show_untagged=False   # debug: report items carrying no recognized tag

  # Strip a surrounding <t>s-...-<t>e tag pair, remembering the letter
  kind=''
  for tag in ['j','c','x','p','r','z','v']:
    if text.startswith(tag+'s-') and text.endswith('-'+tag+'e'):
      kind=tag
      text=text[3:-3]

  prefix=''
  suffix=''
  if kind == 'j':
    # remember AIP journals so a later 'c' link uses the .html copy
    if text.startswith('AP') or text.startswith('JA'):
      if_aip=True
    if if_public:
      prefix='<a href="http://'
      suffix='">journal link</a>'
      text=re.sub('^PR','publish.aps.org/abstract/PR',text)
      text=re.sub('^RM','publish.aps.org/abstract/RM',text)
      text=re.sub('^AP','link.aip.org/link/?AP',text)
      text=re.sub('^JA','link.aip.org/link/?JA',text)
      text=re.sub('^DOI','dx.doi.org/',text)
    else:
      text='journal link'
  elif kind == 'c':
    prefix='<a href="local_copy/'
    if if_public and if_aip:
      suffix='.html">local copy</a>'
    else:
      suffix='.pdf">local copy</a>'
  elif kind == 'x':
    if if_public:
      prefix='<a href="http://arxiv.org/abs/'
      suffix='/index.html">cond-mat archive</a>'
    else:
      prefix='<a href="archive/'
      suffix='/pap.pdf">cond-mat archive</a>'
  elif kind == 'p':
    prefix='<a href="local_preprint/'
    suffix='.html">local preprint</a>'
  elif kind == 'r':
    if if_public:
      text='<a href="tools/request.html">request article</a>'
    else:
      prefix='<a href="archive/'
      suffix='/pap.pdf">archive copy</a>'
  elif kind == 'z':
    prefix='<a href="supp/'
    suffix='">local copy of supplement</a>'
  elif kind == 'v':
    if if_public:
      text=''
    else:
      prefix='<a href="local_copy/'
      suffix='.pdf">private local copy</a>'
  elif show_untagged:
    if if_public: print('Untagged:  '+text)   # Not necessarily bad
  if text != '':
    text=prefix+text+suffix+',\n'
  return(text)

#===============================================
# Provide header and footer text
#===============================================
def get_text(tag):
  # Return the fixed html header or footer text for the publication
  # pages.  tag is 'header' or 'footer'; the header includes the
  # current timestamp.

  if tag == 'header':

    text=''.join([
      '<html>\n<body background="gifs/GrayDotted.gif">\n',
      '<TITLE>Vanderbilt Publication List</TITLE>\n',
      '<H3>Publication List for David Vanderbilt</H3>\n\n<p>\n',
      'A link to the electronic <em>journal copy</em> is given if possible.  Where\n',
      '<a href="tools/copyright.html">copyright policy</a> allows, a link to a\n',
      '<em>local copy</em> of the electronic journal article may also appear.  If\n',
      'these are unavailable, a link to a\n',
      '<em><a href="http://arxiv.org/archive/cond-mat">cond-mat archive</a>\n',
      'preprint</em> or to a\n',
      '<em>local preprint</em>\n',
      'may be given.\n\n<p>\n',
      'Updated '+time.asctime(time.localtime(time.time()))+'\n\n<ol>\n',
    ])

  elif tag == 'footer':

    text=''.join([
      '<a name="end">\n</ol>\n\n',
      '<p>My PhD Thesis, "A Theoretical Study of Defects in',
      ' Amorphous Semiconductors"\n(MIT, 1981) is available as a\n',
      '<a href="local_preprint/dv_thesis.pdf">Scanned PDF (7MB)</a>.\n',
      '</p>\n\n<p>\n\n',
      'In case you cannot download an article, try sending me\n',
      '<A HREF = "mailto:dhv@physics.rutgers.edu">email</a>; I might be\n',
      'able to send it to you.\n<p>\n',
      'Also please notify me of broken links above.\n\n',
      '</body>\n</html>\n',
    ])

  return(text)

#===============================================
# Run only when executed as a script, not on import
if __name__ == '__main__':
  main()
#===============================================
