#!/usr/bin/python
# This is a second-cut code-to-HTML converter.
#
# Unlike any of the other modules I've looked at, it makes a rudimentary attempt
# to cross-reference variables (eventually, it will need to be two-pass).
# This replaces the overburdened pile of regexps with the native tokenize lib.
# Future directions include more literate-programming-oriented features, like
# taking block comments [like this] and formatting them more smartly, or perhaps
# even extracting the __doc__ value. Perhaps even having a sidebar with notes
# that are somehow tagged as less-inline than normal comments.
#
# Copyright 2003 Mark W. Eichin <eichin@thok.org> The Herd Of Kittens
#
import sys
import tokenize
import re
from htmlentitydefs import entitydefs

# Build a character -> HTML entity table: start with an identity mapping
# for every byte, then overlay the single-character entities.
invdefs = {}
for i in range(0, 256):
    c = "%c" % i
    invdefs[c] = c
for k, v in entitydefs.items():
    if len(v) == 1:
        invdefs[v] = "&%s;" % k

def make_one_url(s):
    # wrap anything that looks like an absolute http URL in an anchor
    if s.startswith("http://"):
        return '<a href="%s">%s</a>' % (s, s)
    return s

def urlify(s):
    # the capturing group keeps the URLs themselves in the split result
    parts = re.split(r"(http://[^\s&]*)", s)
    return "".join([make_one_url(p) for p in parts])

def httpquote(s):
    # escape each character through the entity table built above
    return "".join([invdefs[v] for v in s])

def span_token(tag, txt):
    return """<span class="python-%s">%s</span>""" % (tag, txt)

def link_target_wrap(s, name):
    return '<a name="%s">%s</a>' % (name, s)

def link_ref_wrap(s, name):
    return '<a href="#%s">%s</a>' % (name, s)

# should these be in formatter? probably
known_names = {}
known_imports = {}
known_aliases = {}

class formatter:
    # Methods named after tokenize token types (COMMENT, STRING, NAME, ...)
    # are looked up by process_token via getattr, so each one serves as the
    # handler for that token type.
    def __init__(self, outstream):
        # we may not need this anymore
        self.st = {"string": ("outside", None)}
        self.outstream = outstream
        self.next_name = None
        self.lastcol = 0
        self.lastrow = 1
        self.indent_list = [""]
        self.please_indent = 1

    def emit(self, txt):
        self.outstream.write(txt)

    def COMMENT(self, tstring):
        self.emit(span_token("comment", urlify(httpquote(tstring))))

    def NL(self, tstring):
        self.emit(tstring)
        self.please_indent = 1
        self.next_name = None

    def NEWLINE(self, tstring):
        self.emit(tstring)
        self.please_indent = 1
        self.next_name = None

    def passthrough(self, tstring):
        self.emit(tstring)
        self.next_name = None
    OP = passthrough
    NUMBER = passthrough
    ENDMARKER = passthrough

    def do_indent(self):
        # self.emit("".join(self.indent_list))
        self.emit(self.indent_list[-1])  # use the top of the stack
        self.please_indent = None

    def INDENT(self, tstring):
        self.indent_list.append(tstring)  # push
        self.next_name = None
        self.do_indent()

    def DEDENT(self, tstring):
        self.indent_list = self.indent_list[:-1]  # pop
        self.next_name = None

    def STRING(self, tstring):
        self.emit(span_token("quote", httpquote(tstring)))
        self.next_name = None

    def NAME(self, tstring):
        if self.next_name:
            return self.next_name(tstring)
        try:
            fn = getattr(self, "NAME_%s" % tstring)
        except AttributeError:
            # try other context stuff?
            self.emit(tstring)
            return
        return fn(tstring)
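    # The NAME_<keyword> naming scheme above is the extension point: getattr
    # finds a handler named after the keyword, so supporting another keyword
    # is just one more method. A minimal sketch, assuming we wanted "def" to
    # get the same "verb" styling (NAME_def is hypothetical, not part of this
    # file):
    #
    #   def NAME_def(self, tstring):
    #       self.emit(span_token("verb", tstring))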
    def NAME_import(self, tstring):
        self.next_name = self.import_pkg_name
        self.emit(span_token("verb", tstring))

    def NAME_from(self, tstring):
        self.next_name = self.import_pkg_name_from
        self.emit(span_token("verb", tstring))

    def import_pkg_name(self, tstring):
        # not the right test
        if tstring.isalnum():
            known_imports[tstring] = 1
            self.emit(span_token("import", link_target_wrap(tstring, tstring)))
        else:
            self.emit(tstring)
        self.next_name = None

    def import_pkg_name_from(self, tstring):
        # identical to import_pkg_name for now
        # not the right test
        if tstring.isalnum():
            known_imports[tstring] = 1
            self.emit(span_token("import", link_target_wrap(tstring, tstring)))
        else:
            self.emit(tstring)
        self.next_name = None

    # need a way to insert missing whitespace
    # possibly less kludgily than counting offsets.
    # maybe we just need to use the offsets to mark things up?
    def process_token(self, tk):
        ttype, tstring, spos, epos, line = tk
        # print tokenize.tok_name[ttype], tk
        # dispatch on token type, emit appropriately, have a fallback
        # add a summary of non-specially-handled ones
        # handle the space: the tokenizer discards whitespace, so reconstruct
        # it from the gap between the last end position and this start position
        thisrow, thiscol = spos
        # print spos, epos
        if thisrow > self.lastrow:
            self.outstream.write("\n" * (thisrow - self.lastrow - 1))
            self.lastcol = 0
        if thiscol > self.lastcol:
            # print >>self.outstream, [thisrow, thiscol, self.lastrow, self.lastcol],
            self.outstream.write(" " * (thiscol - self.lastcol))
            self.please_indent = None
        self.lastrow, self.lastcol = epos
        try:
            fn = getattr(self, tokenize.tok_name[ttype])
        except AttributeError:
            print >>sys.stderr, "No match!", tokenize.tok_name[ttype], tstring
            return
        if ttype != tokenize.DEDENT and ttype != tokenize.INDENT and self.please_indent:
            self.do_indent()
        fn(tstring)

def format_stream(instream, outstream):
    print >>outstream, """<pre class="python-src">"""
    fmt = formatter(outstream)
    for tk in tokenize.generate_tokens(instream.readline):
        fmt.process_token(tk)
    print >>outstream, """</pre>"""

CSS = """<style type="text/css" media="screen">
<!--
span.python-verb { color: cyan; }
span.python-comment { color: red; }
span.python-quote { color: orange; }
span.python-token { color: blue; }
span.python-name { color: green; }
span.python-import { color: green; }
span.python-alias { color: green; }
span.python-noun { color: brown; }
pre.python-src { color: grey; background: black; }
h1 { color: yellow; background: grey; }
body { color: yellow; background: grey; }
-->
</style>
"""

if __name__ == "__main__":
    progname, filename = sys.argv
    f = open(filename)
    print """<html><head><title>%s</title></head>""" % filename
    print CSS
    print """<body>%s<h1>%s</h1>""" % (
        '<a href="index.html">[back]</a>',
        '<a href="%s">%s</a>' % (filename, filename))
    format_stream(f, sys.stdout)
    print """</body></html>"""
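# A minimal usage sketch, run under Python 2 (the script and input file names
# here are hypothetical):
#
#   python pyhtmlize.py example.py > example.html
#
# The output is a standalone page: the CSS block up top, then the source
# inside <pre class="python-src">, with e.g. comments wrapped as
# <span class="python-comment">...</span> and imported module names wrapped
# as both <a name=...> targets and spans.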