#! /usr/bin/env python
# proc_pubs.py

# Read ~/tx/resume/input/pubs.tex and generate html files

# Note: First part is almost identical to ~/tx/res/input/rev_pubs.py

import sys
import time
import re
# For regular expression handling, esp. re.sub

#===============================================
# Main routine
#===============================================

def main():

  global if_aip,fp,fl

  # Announce operation
  print 'Reading ~/tx/resume/input/pubs.tex...'

  f=open('/home/dhv/tx/resume/input/pubs-tagged.tex','r')
  infile=f.read()
  f.close()

  # Remove spaces in otherwise blank lines
  infile=re.sub('\n +\n','\n\n',infile)
  # Two or more blank lines become just two
  infile=re.sub('\n\n+','\n\n',infile)

  # Convert string-file into list of pubs
  publist=infile.split('\n\n')
  # Strip final \n from last entry for consistency
  if publist[-1][-1]=='\n':
    publist[-1]=publist[-1][0:-1]
  num=len(publist)

  # Note that entry zero is really just the %%% comment block
  print 'Entries found = ',num-1

  # Announce operation
  print 'Making ~/pu/pubs/pub_list.html and pub_list_local.html...'

  # Reverse the list, make index list, and add \n's
  ind=[]
  list=[]
  for j in reversed(range(1,num)):
    list+=[publist[j]+'\n']
    ind+=[j]
  n=len(list)

  # Open two files to write
  fp=open('pub_list.html','w')
  fl=open('pub_list_local.html','w')

  # Print headers
  header=get_text('header')
  fp.write(header)
  fl.write(header)

  no_tag_list=[]

  # Now step through the entries and parse each

  for j in range(n):
    # write header for new entry
    nnn='%d' % ind[j]
    nnt='   '+nnn
    str='<a name="'+nnn+'">\n<p><li value="'+nnt[-3:]+'">\n'
    fp.write(str)
    fl.write(str)

    entry=list[j]
    mtext=''
    if_aip=False
    lines=entry.split('\n')

    # first line should be id line
    if lines[0].startswith('%id '):
      parts=lines[0].split(' ')
      lenp=len(parts)
      if lenp < 2:
        print '*** id line has no number'
        print lines[0]
        print '*** HALTING EXECUTION ***'
        sys.exit(1)
      if int(parts[1]) != ind[j]:
        print '*** id entry number does not match for entry'
        print lines[0]
        print '*** HALTING EXECUTION ***'
        sys.exit(1)
      if lenp == 3:
        paper_id=parts[2]
      else:
        paper_id=''
        trailer=''
        if ind[j]>150: no_tag_list+=[ind[j]]
      lines=lines[1:]
    else:
      print '*** Error: entry does not start with id tag ***'
      print '\n'.join(lines)
      print '*** HALTING EXECUTION ***'
      sys.exit(1)
    for line in lines:
      line=line+'\n'
      if line.startswith('%html '):
        line=line[6:]
      if line.startswith('%break'):
        mtext=mtext+'<br>\n'
        parse_latex(mtext)
        mtext=''
      elif line.startswith('%link '):
        parse_latex(mtext)
        mtext=''
        parse_links(line[6:])
      elif len(line)>1 and not line.startswith('%'):
        mtext=mtext+line
    mtext=mtext+'\n'
    parse_latex(mtext)
    if paper_id!='':
      fl.write('&nbsp&nbsp<em><font color="green">'+paper_id+'</font></em>\n')

  # Print footers
  footer=get_text('footer')
  fp.write(footer)
  fl.write(footer)

  print 'Missing tags for entries above 150:'
  tt=''
  nnn=len(no_tag_list)
  for j in range(nnn):
    tt+='%4d' % no_tag_list[nnn-j-1]
    if j%15 == 14: tt+='\n'
  print tt

  # Announce operation
  print 'Done.'

  return

#===============================================
# Parse latex and print
#===============================================
def parse_latex(text):
  # Convert a chunk of latex source to html and write it to both
  # output files (fp and fl).  Author lines (text before an opening
  # ``) are additionally run through parse_auth for abbreviation.
  global if_aip,fp,fl
  if len(text)==0:
    return

  # accents and diacriticals
  text=text.replace('\\\'e','&eacute;')
  text=text.replace('\\\'a','&aacute;')
  text=text.replace('\\\'I','&Iacute;')
  text=text.replace('\\\'E','&Eacute;')
  text=text.replace('\\~n','&ntilde;')
  text=text.replace('\\\'{\\i}','&iacute;')
  text=text.replace('\\"o','&ouml;')

  # latex constructs that get replaced by a simple space
  text=text.replace('\\-','')
  text=text.replace('\\ ',' ')
  text=text.replace('\\break',' ')
  text=text.replace('\\thinspace',' ')
  text=text.replace('.~','. ')

  # special math constructs in their entirety
  text=text.replace('$^\\circ$',' degree')
  text=text.replace("$'$",'&prime;')
  text=text.replace("$''$",'&Prime;')    # suspicious ...
  text=text.replace('$\\beta$','&beta;')
  text=text.replace('$^{\\rm th}$','th')
  text=text.replace('$\\bar1$','[1-bar]')

  # now do replacements that require regular expression handling
  # hints:
  #   r'string' is a raw string (not parsed by Python before passing)
  #   ? makes .* the search 'not greedy' (as short as possible)

  text=re.sub(r'{\\bf (.*?)}',r'\1',text)
  text=re.sub(r'\$_([0-9,u-z])\$',r'<font size="-2">\1</font>',text)
  text=re.sub(r'\$_{(.*?)}\$',r'<font size="-2">\1</font>',text)
  text=re.sub(r'\$([A-Z]*?)\$',r'\1',text)

  # The inline (?s) flag makes . match newlines too (same as re.DOTALL).
  # NOTE: the previous code passed re.DOTALL as the 4th positional
  # argument of re.sub, but that argument is 'count', not 'flags' --
  # so DOTALL was never in effect and at most 16 (==re.DOTALL)
  # substitutions were made.  The inline flag works on all versions.
  text=re.sub(r'(?s){\\sl (.*?)}',r'<em>\1</em>',text)
  text=re.sub(r'(?s){\\it (.*?)}',r'<em>\1</em>',text)

  # clean up remaining math constructs
  text=text.replace('$Z_2$','Z<font size="-2">2</font>')
  # warn if any are left
  if '$' in text:
    print('*** Note $ in:  '+text)

  # finally, do special processing on author names

  skip_authors=False      # Set True for debug and to compare old style
  if skip_authors:
    text=text.replace('``','"')
    text=text.replace("''",'"')
  else:

    [head,x,tail]=text.partition('``')
    #   If above fails, it leaves '' in 2nd and 3rd entries of tuple

    # call routine for parsing author names
    if x!='': text=parse_auth(head)+'"'+tail.replace("''",'"')

    text=text.replace('``','"')  # in case there was another open quote

  fp.write(text)
  fl.write(text)
  return

#===============================================
# Parse author names and abbreviate
#===============================================
def parse_auth(text):
  # Abbreviate first/middle names in an author list and re-wrap the
  # result onto lines of at most 72 characters.  'See ...'
  # cross-references are returned untouched.
  if text.startswith('See'):
    return(text)

  # Flatten to a single space-separated line; re-wrapped below.
  flat=re.sub(' +',' ',text.replace('\n',' ')).strip(', ')
  # Split the author list on ', and' / ' and' / ','
  names=re.split(', and | and |,',flat)

  # Middle names abbreviated to an initial vs. kept in full
  middle_abbrev=['Seok','Jie','Wook','Nyung','Luigi','Nihat']
  middle_save=['De','Di','Banaszak','Cargill']

  out=''
  line=''
  na=len(names)   # number of names in the author list
  for ia,raw in enumerate(names):
    name=raw.strip(' ')
    pieces=name.split(' ')
    nn=len(pieces)   # number of parts to one author name
    if nn <= 1:
      print('*** Author name too short ***')
      print('%s %s' % (name,pieces))
      sys.exit(1)
    if nn > 4:
      print('*** Author name too long ***')
      print('%s %s' % (name,pieces))
      sys.exit(1)
    aname=''
    for j,part in enumerate(pieces):
      # Note: re.match anchors at the beginning of the string
      if re.match(r'[A-Z].*\.$',part) is not None:
        aname+=part              # already an initial like 'Q.' -- keep
      elif j==0:
        aname+=part[0]+'.'       # first name -> initial
      elif j==nn-1:
        aname+=' '+part          # surname kept in full
      elif part in middle_save:
        aname+=' '+part
      elif part in middle_abbrev:
        aname+=part[0]+'.'
      else:
        # issue warning if this middle name not recognized
        print('*** unrecognized middle name:  '+part)
        aname+=part[0]+'.'
    # aname is now abbreviated author name (no commas etc.)
    if na > 1 and ia==na-1:
      aname='and '+aname
    if len(line)+len(aname) >= 72:
      out+=line[:-1]+'\n'
      line=''
    if na==2 and ia==0:
      line+=aname+' '
    else:
      line+=aname+', '
  if line != '':
    out+=line[:-1]+'\n'
  return(out)

#===============================================
# Parse links and print
#===============================================
def parse_links(in_text):
  # Take the body of a '%link (...)' line, format each comma-separated
  # item via parse_link, and write the results wrapped in <em>(...)</em>
  # to the public (fp) and local (fl) output files.
  global if_aip,fp,fl
  items=in_text.strip(' ()\n').split(',')
  if len(items)==0:
    return
  text_pub=''
  text_loc=''
  for raw in items:
    item=raw.strip()
    text_pub+=parse_link(True,item)
    text_loc+=parse_link(False,item)
  # drop the trailing ',\n' left by the last item
  text_pub=text_pub.strip(', \n')
  text_loc=text_loc.strip(', \n')
  if len(text_pub)>0:
    fp.write('<em>('+text_pub+')</em>\n')
  if len(text_loc)>0:
    fl.write('<em>('+text_loc+')</em>\n')
  return

#===============================================
# Parse one link
#===============================================
def parse_link(if_public,text):
  # Convert one link item (e.g. 'xs-0705.1234-xe') into an html anchor
  # string ending in ',\n', or '' if the link is suppressed.
  #   if_public=True :  For pub_list.html
  #   if_public=False:  For pub_list_local.html
  global if_aip,fp,fl

  show_untagged=False   # debug: report items carrying no recognized tag

  # Strip a surrounding <t>s-...-<t>e tag pair, remembering the letter
  kind=''
  for tag in ['j','c','x','p','r','z','v']:
    if text.startswith(tag+'s-') and text.endswith('-'+tag+'e'):
      kind=tag
      text=text[3:-3]

  prefix=''
  suffix=''
  if kind == 'j':
    # remember AIP journals so a later 'c' link uses the .html copy
    if text.startswith('AP') or text.startswith('JA'):
      if_aip=True
    if if_public:
      prefix='<a href="http://'
      suffix='">journal link</a>'
      text=re.sub('^PR','publish.aps.org/abstract/PR',text)
      text=re.sub('^RM','publish.aps.org/abstract/RM',text)
      text=re.sub('^AP','link.aip.org/link/?AP',text)
      text=re.sub('^JA','link.aip.org/link/?JA',text)
      text=re.sub('^DOI','dx.doi.org/',text)
    else:
      text='journal link'
  elif kind == 'c':
    prefix='<a href="local_copy/'
    if if_public and if_aip:
      suffix='.html">local copy</a>'
    else:
      suffix='.pdf">local copy</a>'
  elif kind == 'x':
    if if_public:
      prefix='<a href="http://arxiv.org/abs/'
      suffix='/index.html">cond-mat archive</a>'
    else:
      prefix='<a href="archive/'
      suffix='/pap.pdf">cond-mat archive</a>'
  elif kind == 'p':
    prefix='<a href="local_preprint/'
    suffix='.html">local preprint</a>'
  elif kind == 'r':
    if if_public:
      text='<a href="tools/request.html">request article</a>'
    else:
      prefix='<a href="archive/'
      suffix='/pap.pdf">archive copy</a>'
  elif kind == 'z':
    prefix='<a href="supp/'
    suffix='">local copy of supplement</a>'
  elif kind == 'v':
    if if_public:
      text=''
    else:
      prefix='<a href="local_copy/'
      suffix='.pdf">private local copy</a>'
  elif show_untagged:
    if if_public: print('Untagged:  '+text)   # Not necessarily bad
  if text != '':
    text=prefix+text+suffix+',\n'
  return(text)

#===============================================
# Provide header and footer text
#===============================================
def get_text(tag):
  # Return the fixed html header or footer text for the publication
  # pages.  tag is 'header' or 'footer'; the header includes the
  # current timestamp.

  if tag == 'header':

    text=''.join([
      '<html>\n<body background="gifs/GrayDotted.gif">\n',
      '<TITLE>Vanderbilt Publication List</TITLE>\n',
      '<H3>Publication List for David Vanderbilt</H3>\n\n<p>\n',
      'A link to the electronic <em>journal copy</em> is given if possible.  Where\n',
      '<a href="tools/copyright.html">copyright policy</a> allows, a link to a\n',
      '<em>local copy</em> of the electronic journal article may also appear.  If\n',
      'these are unavailable, a link to a\n',
      '<em><a href="http://arxiv.org/archive/cond-mat">cond-mat archive</a>\n',
      'preprint</em> or to a\n',
      '<em>local preprint</em>\n',
      'may be given.\n\n<p>\n',
      'Updated '+time.asctime(time.localtime(time.time()))+'\n\n<ol>\n',
    ])

  elif tag == 'footer':

    text=''.join([
      '<a name="end">\n</ol>\n\n',
      '<p>My PhD Thesis, "A Theoretical Study of Defects in',
      ' Amorphous Semiconductors"\n(MIT, 1981) is available as a\n',
      '<a href="local_preprint/dv_thesis.pdf">Scanned PDF (7MB)</a>.\n',
      '</p>\n\n<p>\n\n',
      'In case you cannot download an article, try sending me\n',
      '<A HREF = "mailto:dhv@physics.rutgers.edu">email</a>; I might be\n',
      'able to send it to you.\n<p>\n',
      'Also please notify me of broken links above.\n\n',
      '</body>\n</html>\n',
    ])

  return(text)

#===============================================
# Run only when executed as a script, not on import
if __name__ == '__main__':
  main()
#===============================================
