#!/usr/bin/python
# This is a second-cut code-to-HTML converter.
#
# Unlike any of the other modules I've looked at, it makes a rudimentary attempt
# to cross-reference variables (eventually, it will need to be two-pass).
# This replaces the overburdened pile of regexps with the native tokenize lib.
# Future directions include more literate-programming-oriented features, like
# taking block comments [like this] and formatting them more smartly, or perhaps
# even extracting the __doc__ value. Perhaps even having a sidebar with notes
# that are somehow tagged as less-inline than normal comments.
#
# Copyright 2003 Mark W. Eichin <eichin@thok.org> The Herd Of Kittens
#
import sys
import tokenize
import re
from htmlentitydefs import entitydefs

# Build a character -> HTML entity table: start with an identity mapping
# for every byte, then overlay the single-character entities.
invdefs = {}
for i in range(0, 256):
    c = "%c" % i
    invdefs[c] = c
for k, v in entitydefs.items():
    if len(v) == 1:
        invdefs[v] = "&%s;" % k

def make_one_url(s):
    # wrap anything that looks like an absolute http URL in an anchor
    if s.startswith("http://"):
        return '<a href="%s">%s</a>' % (s, s)
    return s

def urlify(s):
    # the capturing group keeps the URLs themselves in the split result
    parts = re.split(r"(http://[^\s&]*)", s)
    return "".join([make_one_url(p) for p in parts])

def httpquote(s):
    # escape each character through the entity table built above
    return "".join([invdefs[v] for v in s])

def span_token(tag, txt):
    return """<span class="python-%s">%s</span>""" % (tag, txt)

def link_target_wrap(s, name):
    return '<a name="%s">%s</a>' % (name, s)

def link_ref_wrap(s, name):
    return '<a href="#%s">%s</a>' % (name, s)

# should these be in formatter? probably
known_names = {}
known_imports = {}
known_aliases = {}

class formatter:
    # Methods named after tokenize token types (COMMENT, STRING, NAME, ...)
    # are looked up by process_token via getattr, so each one serves as the
    # handler for that token type.
    def __init__(self, outstream):
        # we may not need this anymore
        self.st = {"string": ("outside", None)}
        self.outstream = outstream
        self.next_name = None
        self.lastcol = 0
        self.lastrow = 1
        self.indent_list = [""]
        self.please_indent = 1

    def emit(self, txt):
        self.outstream.write(txt)

    def COMMENT(self, tstring):
        self.emit(span_token("comment", urlify(httpquote(tstring))))

    def NL(self, tstring):
        self.emit(tstring)
        self.please_indent = 1
        self.next_name = None

    def NEWLINE(self, tstring):
        self.emit(tstring)
        self.please_indent = 1
        self.next_name = None

    def passthrough(self, tstring):
        self.emit(tstring)
        self.next_name = None
    OP = passthrough
    NUMBER = passthrough
    ENDMARKER = passthrough

    def do_indent(self):
        # self.emit("".join(self.indent_list))
        self.emit(self.indent_list[-1])  # use the top of the stack
        self.please_indent = None

    def INDENT(self, tstring):
        self.indent_list.append(tstring)  # push
        self.next_name = None
        self.do_indent()

    def DEDENT(self, tstring):
        self.indent_list = self.indent_list[:-1]  # pop
        self.next_name = None

    def STRING(self, tstring):
        self.emit(span_token("quote", httpquote(tstring)))
        self.next_name = None

    def NAME(self, tstring):
        if self.next_name:
            return self.next_name(tstring)
        try:
            fn = getattr(self, "NAME_%s" % tstring)
        except AttributeError:
            # try other context stuff?
            self.emit(tstring)
            return
        return fn(tstring)
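    # The NAME_<keyword> naming scheme above is the extension point: getattr
    # finds a handler named after the keyword, so supporting another keyword
    # is just one more method. A minimal sketch, assuming we wanted "def" to
    # get the same "verb" styling (NAME_def is hypothetical, not part of this
    # file):
    #
    #   def NAME_def(self, tstring):
    #       self.emit(span_token("verb", tstring))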
    def NAME_import(self, tstring):
        self.next_name = self.import_pkg_name
        self.emit(span_token("verb", tstring))

    def NAME_from(self, tstring):
        self.next_name = self.import_pkg_name_from
        self.emit(span_token("verb", tstring))

    def import_pkg_name(self, tstring):
        # not the right test
        if tstring.isalnum():
            known_imports[tstring] = 1
            self.emit(span_token("import", link_target_wrap(tstring, tstring)))
        else:
            self.emit(tstring)
        self.next_name = None

    def import_pkg_name_from(self, tstring):
        # identical to import_pkg_name for now
        # not the right test
        if tstring.isalnum():
            known_imports[tstring] = 1
            self.emit(span_token("import", link_target_wrap(tstring, tstring)))
        else:
            self.emit(tstring)
        self.next_name = None

    # need a way to insert missing whitespace
    # possibly less kludgily than counting offsets.
    # maybe we just need to use the offsets to mark things up?
    def process_token(self, tk):
        ttype, tstring, spos, epos, line = tk
        # print tokenize.tok_name[ttype], tk
        # dispatch on token type, emit appropriately, have a fallback
        # add a summary of non-specially-handled ones
        # handle the space: the tokenizer discards whitespace, so reconstruct
        # it from the gap between the last end position and this start position
        thisrow, thiscol = spos
        # print spos, epos
        if thisrow > self.lastrow:
            self.outstream.write("\n" * (thisrow - self.lastrow - 1))
            self.lastcol = 0
        if thiscol > self.lastcol:
            # print >>self.outstream, [thisrow, thiscol, self.lastrow, self.lastcol],
            self.outstream.write(" " * (thiscol - self.lastcol))
            self.please_indent = None
        self.lastrow, self.lastcol = epos
        try:
            fn = getattr(self, tokenize.tok_name[ttype])
        except AttributeError:
            print >>sys.stderr, "No match!", tokenize.tok_name[ttype], tstring
            return
        if ttype != tokenize.DEDENT and ttype != tokenize.INDENT and self.please_indent:
            self.do_indent()
        fn(tstring)

def format_stream(instream, outstream):
    print >>outstream, """<pre class="python-src">"""
    fmt = formatter(outstream)
    for tk in tokenize.generate_tokens(instream.readline):
        fmt.process_token(tk)
    print >>outstream, """</pre>"""

CSS = """<style type="text/css" media="screen">
<!--
span.python-verb { color: cyan; }
span.python-comment { color: red; }
span.python-quote { color: orange; }
span.python-token { color: blue; }
span.python-name { color: green; }
span.python-import { color: green; }
span.python-alias { color: green; }
span.python-noun { color: brown; }
pre.python-src { color: grey; background: black; }
h1 { color: yellow; background: grey; }
body { color: yellow; background: grey; }
-->
</style>
"""

if __name__ == "__main__":
    progname, filename = sys.argv
    f = open(filename)
    print """<html><head><title>%s</title></head>""" % filename
    print CSS
    print """<body>%s<h1>%s</h1>""" % (
        '<a href="index.html">[back]</a>',
        '<a href="%s">%s</a>' % (filename, filename))
    format_stream(f, sys.stdout)
    print """</body></html>"""
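# A minimal usage sketch, run under Python 2 (the script and input file names
# here are hypothetical):
#
#   python pyhtmlize.py example.py > example.html
#
# The output is a standalone page: the CSS block up top, then the source
# inside <pre class="python-src">, with e.g. comments wrapped as
# <span class="python-comment">...</span> and imported module names wrapped
# as both <a name=...> targets and spans.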