#! /usr/bin/env python3
# make_pubs_3.py
from __future__ import print_function # python3 style print

# Read ~/tx/resume/input/pubs.tex and generate html files

# Note: First part is almost identical to ~/tx/res/input/rev_pubs.py

import sys
import time
import re
# For regular expression handling, esp. re.sub

#===============================================
# Main routine
#===============================================

def main():
  """Read ~/tx/resume/input/pubs.tex and generate two HTML publication
  lists in the current directory: pub_list.html (public) and
  pub_list_local.html (local version that additionally shows paper ids,
  bibtex tags, and private links)."""

  global if_aip,fp,fl,paper_id

  # Read the whole LaTeX source; entries are separated by blank lines
  with open('/home/dhv/tx/resume/input/pubs.tex','r') as f:
    infile=f.read()

  # Remove spaces in otherwise blank lines
  infile=re.sub('\n +\n','\n\n',infile)
  # Two or more blank lines become just two
  infile=re.sub('\n\n+','\n\n',infile)

  # Convert string-file into list of pubs
  publist=infile.split('\n\n')
  # Strip final \n from last entry for consistency
  if publist[-1][-1]=='\n':
    publist[-1]=publist[-1][0:-1]
  num=len(publist)

  # Note that entry zero is really just the %%% comment block
  print(num-1,'entries found in ~/tx/resume/input/pubs.tex...')

  # Reverse the list (newest entry first), build the parallel index list
  # of original entry numbers, and restore a trailing \n on each entry
  ind=[]
  entries=[]
  for j in reversed(range(1,num)):
    entries+=[publist[j]+'\n']
    ind+=[j]
  n=len(entries)

  # Open two files to write
  fp=open('pub_list.html','w')
  fl=open('pub_list_local.html','w')

  # Print headers
  header=get_text('header')
  fp.write(header)
  fl.write(header)

  no_tag_list=[]   # entry numbers above 120 that lack a paper_id tag

  # Now step through the entries and parse each

  for j in range(n):
    # write anchor + numbered <li> header for new entry
    nnn='%d' % ind[j]
    nnt='   '+nnn
    entry_head='<a name="'+nnn+'">\n<p><li value="'+nnt[-3:]+'">\n'
    fp.write(entry_head)
    fl.write(entry_head)

    entry=entries[j]
    mtext=''        # accumulates LaTeX text until %break/%link flushes it
    if_aip=False    # set True by parse_link when an AIP journal is seen
    lines=entry.split('\n')

    # first line should be an id line: "%id NUMBER [paper_id [bib_tag]]"
    if lines[0].startswith('%id '):
      line=re.sub(' +',' ',lines[0])  # Remove dup spaces
      parts=line.split(' ')
      lenp=len(parts)
      if lenp < 2:
        print('*** id line has no number')
        print(lines[0])
        print('*** HALTING EXECUTION ***')
        sys.exit(1)
      if int(parts[1]) != ind[j]:
        print('*** id entry number does not match for entry')
        print(lines[0])
        print('*** HALTING EXECUTION ***')
        sys.exit(1)
      if lenp == 2:
        paper_id=''
        if ind[j]>120: no_tag_list+=[ind[j]]
      if lenp >= 3:
        paper_id=parts[2]
      if lenp >= 4:
        bib_tag=parts[3]
      lines=lines[1:]
    else:
      print('*** Error: entry does not start with id tag ***')
      print('\n'.join(lines))
      print('*** HALTING EXECUTION ***')
      sys.exit(1)

    # Process the remaining lines of the entry
    for line in lines:
      line=line+'\n'
      if line.startswith('%html '):
        line=line[6:]            # pass raw HTML through unchanged
      if line.startswith('%priv '):
        # mark private text; parse_latex routes it to the local file only
        line='LOC:<font color="green">'+line[6:-1]+'</font>:LOC\n'
      if line.startswith('%break'):
        mtext=mtext+'<br>\n'
        parse_latex(mtext)
        mtext=''
      elif line.startswith('%link '):
        parse_latex(mtext)
        mtext=''
        parse_links(line[6:])
      else:
        comment=line.startswith('%') or line.startswith('\\null')
        if len(line)>1 and not comment:
          mtext=mtext+line
    mtext=mtext+'\n'
    parse_latex(mtext)

    # local file additionally shows the paper_id and (if any) bibtex tag
    if paper_id!='':
      fl.write('&nbsp&nbsp<em><font color="green">'+paper_id+'</font></em>\n')
    if lenp >= 4:
      fl.write('&nbsp&nbsp<em><font color="magenta">'+bib_tag+'</font></em>\n')

  # Print footers and close the output files
  footer=get_text('footer')
  fp.write(footer)
  fl.write(footer)
  fp.close()
  fl.close()

  # Report entries above 120 that are missing tags, newest first
  nnn=len(no_tag_list)
  if nnn>0:
    tt='Later missing tags: '   # above 120
    for j in range(nnn):
      tt+='%4d' % no_tag_list[nnn-j-1]
      if j%15 == 14: tt+='\n'
    print(tt)

  # Announce operation
  print('Done.')

  return

#===============================================
# Parse latex and print
#===============================================
def parse_latex(text):
  """Convert one chunk of LaTeX-flavored publication text to HTML and
  write it to both output files (globals fp and fl).

  Text bracketed by LOC:...:LOC (produced from %priv lines) is written
  only to the local file fl.  If the chunk contains a title-opening ``
  quote, everything before it is treated as an author list and is
  abbreviated via parse_auth()."""
  global if_aip,fp,fl,paper_id
  if len(text)==0:
    return

  # accents and diacriticals
  text=text.replace('\\\'e','&eacute;')
  text=text.replace('\\`e','&egrave;')
  text=text.replace('\\\'a','&aacute;')
  text=text.replace('\\\'I','&Iacute;')
  text=text.replace('\\\'E','&Eacute;')
  text=text.replace('\\\'c','&cacute;')
  text=text.replace('\\~n','&ntilde;')
  text=text.replace('\\\'{\\i}','&iacute;')
  text=text.replace('\\"o','&ouml;')
  text=text.replace('\\"u','&uuml;')
  text=text.replace('\\"a','&auml;')

  # latex constructs that get replaced by a simple space (or dropped)
  text=text.replace('\\-','')
  text=text.replace('\\ ',' ')
  text=text.replace('\\break',' ')
  text=text.replace('\\thinspace',' ')
  text=text.replace('.~','. ')

  # special math constructs in their entirety
  text=text.replace('$^\\circ$',' degree')
  text=text.replace("$'$",'&prime;')
  text=text.replace("$''$",'&Prime;')    # suspicious ...
  text=text.replace('$\\beta$','&beta;')
  text=text.replace('$^{\\rm th}$','th')
  text=text.replace('$\\bar1$','[1-bar]')

  # now do replacements that require regular expression handling
  # hints:
  #   r'string' is a raw string (not parsed by Python before passing)
  #   ? makes .* the search 'not greedy' (as short as possible)

  text=re.sub(r'{\\bf (.*?)}',r'\1',text)
  text=re.sub(r'\$_([0-9,d,u-z])\$',r'<font size="-2">\1</font>',text)
  text=re.sub(r'\$_{(.*?)}\$',r'<font size="-2">\1</font>',text)
  text=re.sub(r'\$([A-Z]*?)\$',r'\1',text)

  # DOTALL means that . includes the possibility of a newline, so
  # \sl/\it constructs may span lines.
  # BUGFIX: re.DOTALL was previously passed positionally, where re.sub
  # interprets it as the 'count' argument (16), not as a flag -- the
  # flag was silently ignored and substitutions were capped at 16.
  text=re.sub(r'{\\sl (.*?)}',r'<em>\1</em>',text,flags=re.DOTALL)
  text=re.sub(r'{\\it (.*?)}',r'<em>\1</em>',text,flags=re.DOTALL)

  # clean up remaining math constructs
  text=text.replace('$Z_2$','Z<font size="-2">2</font>')
  text=text.replace('$g$','<em>g</em>')
  # warn if any are left
  if '$' in text:
    print('*** Note $ in:  '+text)

  # finally, do special processing on author names

  skip_authors=False      # Set True for debug and to compare old style
  if skip_authors:
    text=text.replace('``','"')
    text=text.replace("''",'"')
  else:

    [head,x,tail]=text.partition('``')
    #   If above fails, it leaves '' in 2nd and 3rd entries of tuple

    # call routine for parsing author names
    if x!='': text=parse_auth(head)+'"'+tail.replace("''",'"')

    text=text.replace('``','"')  # in case there was another open quote

  # Now see if there is '%priv' (local) text for special treatment:
  # strip LOC spans from the public copy, unwrap them for the local copy
  fp.write(re.sub(r'LOC:(.*?):LOC\n','',text))
  fl.write(re.sub(r'LOC:(.*?):LOC\n','\\1\n',text))

  return

#===============================================
# Parse author names and abbreviate
#===============================================
def parse_auth(text):
  """Abbreviate the author-list portion of an entry.

  First names (and recognized middle names) are reduced to initials;
  last names and a few particles ('De', 'Van', ...) are kept in full.
  The result is re-wrapped at 72 characters.  Text beginning with 'See'
  is passed through untouched."""
  if text.startswith('See'): return(text)

  # Normalize whitespace, then split the list on ', and ' / ' and ' / ','
  flat=re.sub(' +',' ',text.replace('\n',' ')).strip(', ')
  names=re.split(', and | and |,',flat)

  # Middle names that get abbreviated to an initial vs. kept whole
  abbrev_middles=('Seok','Jie','Wook','Nyung','Luigi','Nihat',
     'Young','Andrei','Lawrence','Joon','Anil','Hight','Gilad','Roy')
  kept_middles=('De','de','Van','Di','Banaszak','Cargill')

  total=len(names)   # number of names in the author list
  result=''
  line=''
  for idx,raw in enumerate(names):
    name=raw.strip(' ').replace("&Eacute;amonn","Eamonn")  # rare special case
    parts=name.split(' ')
    count=len(parts)   # number of parts to one author name
    if count <= 1:
      print('*** Author name too short ***')
      print(name, parts)
      sys.exit(1)
    elif count > 4:
      print('*** Author name too long ***')
      print(name, parts)
      sys.exit(1)
    last=count-1
    abbrev=''
    for pos,part in enumerate(parts):
      # re.match anchors at the start; an already-abbreviated piece
      # like "J." is kept as-is
      if re.match(r'[A-Z].*\.$',part):
        abbrev+=part
      elif pos==0:        # First name: each hyphenated piece -> initial
        abbrev+=''.join(piece[0]+'.' for piece in part.split('-'))
      elif pos==last:     # Last name
        abbrev+=' '+part
      elif part in kept_middles:   # Middle name(s) kept whole
        abbrev+=' '+part
      elif part in abbrev_middles:
        abbrev+=part[0]+'.'
      else:
        # issue warning if this middle name not recognized
        print('*** unrecognized middle name:  '+part)
        abbrev+=part[0]+'.'
    # abbrev is now the abbreviated author name (no commas etc.)
    if total > 1 and idx==total-1:
      abbrev='and '+abbrev
    if len(line)+len(abbrev) >= 72:   # wrap the output line
      result+=line[:-1]+'\n'
      line=''
    if total==2 and idx==0:
      line+=abbrev+' '     # two authors: no comma after the first
    else:
      line+=abbrev+', '
  if line != '':
    result+=line[:-1]+'\n'
  return(result)

#===============================================
# Parse links and print
#===============================================
def parse_links(in_text):
  """Split a %link line into comma-separated tokens, convert each to an
  HTML anchor via parse_link, and write the public links to fp and the
  local links to fl (each list wrapped in <em>(...)</em>)."""
  global if_aip,fp,fl,paper_id
  items=in_text.strip(' ()\n').split(',')
  if len(items)==0:
    return
  public_parts=[]
  local_parts=[]
  for raw in items:
    # parse_link may set if_aip, so keep the public/local call order
    token=raw.strip()
    public_parts.append(parse_link(True,token))
    local_parts.append(parse_link(False,token))
  text_p=''.join(public_parts).strip(', \n')
  text_l=''.join(local_parts).strip(', \n ')
  if len(text_p)>0:
    fp.write('<em>('+text_p+')</em>\n')
  if len(text_l)>0:
    fl.write('<em>('+text_l+')</em>\n')
  return

#===============================================
# Parse one link
#===============================================
def parse_link(if_public,text):
  global if_aip,fp,fl,paper_id
  # Convert one link token (from a %link line) into an HTML anchor string.
  # if_public=True :  For pub_list.html
  # if_public=False:  For pub_list_local.html
  # Tokens are wrapped as <t>s-BODY-<t>e where <t> is a one-letter tag
  # from taglist (eg, js-...-je); untagged tokens are returned unchanged.
  # Side effects: may set global if_aip; reads global paper_id.
  # Returns '' when the link is suppressed for this audience, otherwise
  # a finished '<a href=...>...</a>,\n' fragment.

  show_untagged=True      # NOTE: currently unused
  taglist = ['c','j','k','p','r','v','w','x','y','z']

  # Strip the tag wrapper and remember its letter in 'type'
  type=None
  for tag in taglist:
    tys=tag+'s-'      # eg, cs-
    tye='-'+tag+'e'   # eg, -ce
    if text == tys+tye[1:]:   # eg, cs-ce (empty body: use paper_id)
      text=tys+paper_id+tye   # eg, cs-paper_id-ce
    if text.startswith(tys) and text.endswith(tye):
      type=tag
      text=text[3:-3]
      break           # terminate for loop

  if type == None:  # Tag not found
    if if_public: print('Untagged:  '+text)   # Not necessarily bad
    return(text)

  # Now start tag processing; aa and zz become the anchor prefix/suffix
  aa=''
  zz=''
  if type == 'j' or type == 'k':
    # journal link: 'j' uses http, 'k' uses https
    if text.startswith('AP') or text.startswith('JA') or text.startswith('scitat'):
      if_aip=True     # AIP journal: affects type 'c' local-copy suffix
    if type == 'j':
      aa='<a href=\"http://'
    if type == 'k':
      aa='<a href=\"https://'
    zz='">journal link</a>'
    jtag=text[0:3]
    if jtag == 'PRL' or jtag == 'PRB' or jtag == 'RMP' or jtag == 'PRM':
      # APS journals given as JRN/vVOL/pPAGE: rebuild as a doi-style URL
      jref=text.split('/')
      if len(jref) != 3:
        print('Error: len(jref) =', len(jref))
        sys.exit(1)
      if jref[0] == 'PRB':
        jrn='PhysRevB'
      elif jref[0] == 'PRL':
        jrn='PhysRevLett'
      elif jref[0] == 'PRM':
        jrn='PhysRevMaterials'
      elif jref[0] == 'RMP':
        jrn='RevModPhys'
      else:
        print('Error jrn')
        sys.exit(1)
      if jref[1][0] != 'v':
        print('Error: should be volume number')
        sys.exit(1)
      page=jref[2]
      # page prefix: 'e'/'p' plain page number, 'l' -> Letter ('L' prefix)
      if page[0] == 'e' or page[0] == 'p':
        tpage=page[1:]
      elif page[0] == 'l':
        tpage='L'+page[1:]
      else:
        print('Error: should be page number')
        sys.exit(1)
      text='link.aps.org/doi/10.1103/'
      text=text+jrn+'.'+jref[1][1:]+'.'+tpage
    # older abbreviated forms expanded to full host/path prefixes
    text=re.sub('^PR','publish.aps.org/abstract/PR',text)
    text=re.sub('^RM','publish.aps.org/abstract/RM',text)
    text=re.sub('^AP','link.aip.org/link/?AP',text)
    text=re.sub('^JA','link.aip.org/link/?JA',text)
    text=re.sub('^DOI','dx.doi.org/',text)
  elif type == 'c':
    # local copy of the published article
    aa='<a href=\"local_copy/'
    if if_public and if_aip:
      zz='.html">local copy</a>'
    else:
      zz='.pdf">local copy</a>'
    if text == '':
      text=paper_id
  elif type == 'x':
    # cond-mat (arXiv) link; local list points at the archived pdf instead
    if if_public:
      aa='<a href="http://arxiv.org/abs/'
      zz='/index.html">cond-mat archive</a>'
    else:
      aa='<a href="archive/'
      zz='/pap.pdf">cond-mat archive</a>'
  elif type == 'w':
    # private submitted copy: suppressed in the public list
    if if_public:
      text=''
    else:
      aa='<a href="archive/'
      zz='/pap.pdf">private submitted copy</a>'
  elif type == 'p':
    aa='<a href="local_preprint/'
    zz='.html">local preprint</a>'
  elif type == 'r':
    # public readers get the request form; local list links the archive copy
    if if_public:
      text='<a href="tools/request.html">request article</a>'
    else:
      aa='<a href="archive/'
      zz='/pap.pdf">archive copy</a>'
    if text == '':
      text=paper_id
  elif type == 'z':
    # supplement stored under supp/<paper_id>/
    if text != 'index.html':
      text = 'supp.pdf'
    aa='<a href="supp/'+paper_id+'/'
    zz='">local copy of supplement</a>'
  elif type == 'y':
    # private supplement: suppressed in the public list
    if if_public:
      text=''
    else:
      if text != 'index.html':
        text = 'supp.pdf'
      aa='<a href="supp/'+paper_id+'/'
      zz='">private copy of supplement</a>'
  elif type == 'v':
    # private local copy: suppressed in the public list
    if if_public:
      text=''
    else:
      aa='<a href="local_copy/'
      zz='.pdf">private local copy</a>'
  if text != '': text=aa+text+zz+',\n'
  return(text)

#===============================================
# Provide header and footer text
#===============================================
def get_text(tag):
  """Return the boilerplate HTML for the page 'header' or 'footer'.

  The header includes a timestamp of when the list was generated."""

  if tag == 'header':
    pieces=[
      '<html>\n<body background="gifs/GrayDotted.gif">\n',
      '<TITLE>Vanderbilt Publication List</TITLE>\n',
      '<H3>Publication List for David Vanderbilt</H3>\n\n<p>\n',
      'A link to the electronic <em>journal copy</em> is given if possible.  Where\n',
      '<a href="tools/copyright.html">copyright policy</a> allows, a link to a\n',
      '<em>local copy</em> of the electronic journal article may also appear.  If\n',
      'these are unavailable, a link to a\n',
      '<em><a href="http://arxiv.org/archive/cond-mat">cond-mat archive</a>\n',
      'preprint</em> or to a\n',
      '<em>local preprint</em>\n',
      'may be given.\n\n<p>\n',
      'See also <a href="https://scholar.google.com/citations?user=uY1warIAAAAJ&hl=en&oi=ao">Google Scholar profile</a>.\n\n<p>\n',
      'Updated '+time.asctime(time.localtime(time.time()))+':\n\n<ol>\n',
    ]

  elif tag == 'footer':
    pieces=[
      '<a name="end">\n</ol>\n\n',
      '<p>My PhD Thesis, "A Theoretical Study of Defects in',
      ' Amorphous Semiconductors"\n(MIT, 1981) is available as a\n',
      '<a href="local_preprint/dv_thesis.pdf">Scanned PDF (7MB)</a>.\n',
      '</p>\n\n<p>\n\n',
      'In case you cannot download an article, try sending me\n',
      '<A HREF = "mailto:dhv@physics.rutgers.edu">email</a>; I might be\n',
      'able to send it to you.\n<p>\n',
      'Also please notify me of broken links above.\n\n',
      '</body>\n</html>\n',
    ]

  return(''.join(pieces))

#===============================================
# Script entry point: runs immediately at load time (no __main__ guard).
main()
#===============================================
