# fakeparse.py

#!/usr/bin/python

# This is a basic first-cut code to html converter.
# Unlike any of the other modules I've looked at, it makes a rudimentary attempt
# to cross-reference variables (eventually, it will need to be two-pass.)
# Note that it doesn't use any python-internal tools to do the parsing, just
# code and regexps; thus there are some holes.  However, this gets me off the
# ground far enough to start putting blobs of python code up on the web site.
#
# Future directions include more literate-programming oriented features, like
# taking block comments [like this] and formatting them more smartly, or perhaps
# even extracting the __doc__ value.  Perhaps even having a sidebar with notes
# that are somehow tagged as less-inline than normal comments.
#
# For now, though, this is enough to put in Makefiles and check in for the web site.
#
# Copyright 2003 Mark W. Eichin <eichin@thok.org> The Herd Of Kittens
#
import re
import sys


# Statement keywords; format_line wraps one of these in a "verb" span when it
# is the first word of a line.
verbs = [
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "exec",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "pass",
    "print",
    "raise",
    "return",
    "try",
    "while",
    "yield",
]

# -- single token, outdent --
# break
# continue
# pass
# return 

# -- single token with colon, indent --
# else:
# except:
# finally:
# try:

# -- token EXPR --
# assert EXPR
# del EXPR
# exec EXPR

# -- token EXPR, outdent --
# raise EXPR
# return EXPR
# yield EXPR

# -- token EXPR colon, indent --
# if EXPR:
# elif EXPR:
# while EXPR:
# except TYPE:
# class NAME:

# -- something complex, indent --
# class NAME(BASECLASS):
# def NAME(ARGS):
# except TYPE, VAR:
# for NAME in EXPR:

# -- something else --
# from NAME import COMMANAMES
# global COMMANAMES
# import COMMANAMES
# print >> FD, EXPRS
# print EXPRS


def link_target_wrap(s, name):
    """Wrap the fragment *s* in a named anchor, ``<a name="NAME">S</a>``.

    Neither argument is escaped here; both are inserted as given.
    """
    anchor = '<a name="%s">%s</a>'
    return anchor % (name, s)
def link_ref_wrap(s, name):
    """Wrap the fragment *s* in a same-page link, ``<a href="#NAME">S</a>``.

    Neither argument is escaped here; both are inserted as given.
    """
    link = '<a href="#%s">%s</a>'
    return link % (name, s)

# Function names recorded by handle_def (name -> 1); txt_process turns later
# mentions of them into links.
known_names = dict()

def handle_def(stream, line, state):
    """Emit a ``def`` line with the keyword highlighted and the name anchored.

    stream -- object with a write() method receiving the HTML
    line   -- the source line, already stripped of its indentation
    state  -- shared formatter state, passed through to emit_rest

    The function name is recorded in known_names and written as a link
    target.  Everything after the name (argument list, colon, and anything
    trailing such as a comment or a one-line body) goes through emit_rest so
    it is HTML-escaped and nothing on the line is lost.  Lines that do not
    look like ``def NAME(ARGS):`` are handed to emit_rest unchanged.
    """
    # The argument group is only used to recognise the line; nested
    # parentheses in the argument list make the match fail and fall back.
    # \w+ (rather than \S*) keeps empty or non-identifier text out of
    # known_names, whose keys are later used to build regexes.
    m = re.match(r"(?P<def>def)(?P<ws>\s+)(?P<name>\w+)(?P<args>\([^)]*\):)", line)
    if not m:
        return emit_rest(stream, line, state)
    g = m.groupdict()
    stream.write(span_token("verb", g["def"]))
    stream.write(g["ws"])
    known_names[g["name"]] = 1
    stream.write(span_token("name", link_target_wrap(g["name"], g["name"])))
    # Arguments plus whatever follows the colon.
    emit_rest(stream, line[m.end("name"):], state)

def handle_class(stream, line, state):
    """Emit a ``class`` line with the keyword highlighted and the name anchored.

    stream -- object with a write() method receiving the HTML
    line   -- the source line, already stripped of its indentation
    state  -- shared formatter state, passed through to emit_rest

    Mirrors handle_def: the class name is recorded in known_names and written
    as a link target; the rest of the line (base classes, colon, trailing
    text) goes through emit_rest.  Lines that do not start with
    ``class NAME`` are handed to emit_rest unchanged.
    """
    m = re.match(r"(?P<cls>class)(?P<ws>\s+)(?P<name>\w+)(?P<rest>.*)", line)
    if not m:
        return emit_rest(stream, line, state)
    g = m.groupdict()
    stream.write(span_token("verb", g["cls"]))
    stream.write(g["ws"])
    known_names[g["name"]] = 1
    stream.write(span_token("name", link_target_wrap(g["name"], g["name"])))
    emit_rest(stream, g["rest"], state)

# Names recorded by handle_import (name -> 1): module names go in
# known_imports, names listed after "from X import" go in known_aliases.
known_imports, known_aliases = dict(), dict()

def _emit_import_names(stream, names, table, tag, state):
    """Write the name list of an import statement, registering each name.

    names -- the text after the ``import`` keyword
    table -- dict that receives each listed name (name -> 1)
    tag   -- span class used for those names

    Separators (commas, whitespace, parentheses, ``*``) are written back
    escaped but otherwise untouched, so spacing is preserved.  ``as`` is
    written as a keyword and the name after it is recorded in known_aliases.
    A trailing ``#`` comment is passed to emit_rest instead of being parsed.
    """
    # An import list cannot contain a string, so "#" always starts a comment.
    cut = names.find("#")
    if cut < 0:
        tail = ""
    else:
        names, tail = names[:cut], names[cut:]
    after_as = False
    # With a capturing group re.split alternates: separator, name, separator...
    for idx, piece in enumerate(re.split(r"([A-Za-z_][\w.]*)", names)):
        if idx % 2 == 0:
            stream.write(httpquote(piece))
        elif piece == "as":
            stream.write(span_token("verb", piece))
            after_as = True
        elif after_as:
            known_aliases[piece] = 1
            stream.write(span_token("alias", link_target_wrap(piece, piece)))
            after_as = False
        else:
            table[piece] = 1
            stream.write(span_token(tag, link_target_wrap(piece, piece)))
    if tail:
        emit_rest(stream, tail, state)


def handle_import(stream, line, state):
    """Emit an ``import ...`` or ``from ... import ...`` line.

    stream -- object with a write() method receiving the HTML
    line   -- the source line, already stripped of its indentation
    state  -- shared formatter state, passed through to emit_rest

    Keywords are written as verbs.  For ``import`` the listed names are
    recorded in known_imports; for ``from M import ...`` the module M is
    recorded in known_imports and the listed names in known_aliases.  Every
    recorded name is written as a link target.  Lines matching neither form
    are handed to emit_rest unchanged.
    """
    m = re.match(r"(?P<imp>import)(?P<ws>\s+)(?P<names>.*)", line)
    if m:
        stream.write(span_token("verb", m.group("imp")))
        stream.write(m.group("ws"))
        _emit_import_names(stream, m.group("names"), known_imports, "import", state)
        return
    # One pattern for the whole statement; \b after "import" also accepts
    # "import(a, b)" and "import *" without requiring a following space.
    m = re.match(
        r"(?P<imp>from)(?P<ws>\s+)(?P<name>\S+)(?P<ws1>\s+)(?P<imp2>import)\b(?P<rest>.*)",
        line)
    if not m:
        return emit_rest(stream, line, state)
    g = m.groupdict()
    stream.write(span_token("verb", g["imp"]))
    stream.write(g["ws"])
    module = g["name"]
    if re.match(r"[A-Za-z_][\w.]*$", module):
        known_imports[module] = 1
        stream.write(span_token("import", link_target_wrap(module, module)))
    else:
        # e.g. a relative "." -- not usable as a cross-reference key.
        stream.write(httpquote(module))
    stream.write(g["ws1"])
    stream.write(span_token("verb", g["imp2"]))
    _emit_import_names(stream, g["rest"], known_aliases, "alias", state)
        
# Variable names recorded by handle_assignment on first assignment (name -> 1).
known_variables = dict()

# there needs to be a way to reset scoping, probably with levels set in state
# or maybe stuffing the whole thing in state
def handle_assignment(stream, line, state):
    """Emit an assignment line, cross-referencing the assigned variable.

    stream -- object with a write() method receiving the HTML
    line   -- the source line, already stripped of its indentation
    state  -- shared formatter state, passed through to emit_rest

    The leading identifier of the left-hand side is written as a "noun": a
    link target the first time it is seen (and recorded in known_variables),
    a link back to that target afterwards.  The remainder of the left-hand
    side and the right-hand side go through emit_rest.  Lines that are not
    recognised as an assignment are handed to emit_rest unchanged.
    """
    # The look-around keeps ==, <=, >= and != from being read as "=".
    m = re.match(r"(?P<lhs>\S+)(?P<eq>\s*(?<![<>!=])=(?!=)\s*)(?P<rest>.*)", line)
    if not m:
        return emit_rest(stream, line, state)
    g = m.groupdict()
    # An "=" inside a still-open parenthesis is a keyword argument of a call,
    # not an assignment.
    if g["lhs"].count("(") > g["lhs"].count(")"):
        return emit_rest(stream, line, state)
    # needs to handle multiple LHS variables...
    mm = re.match(r"(?P<v>\w*)(?P<rest>.*)", g["lhs"])
    gg = mm.groupdict()
    vname = gg["v"]
    if not vname:
        # The left side does not start with an identifier (e.g. "(a,b)").
        # Recording "" would make later name regexes match everywhere.
        return emit_rest(stream, line, state)
    if vname in known_variables:
        stream.write(span_token("noun", link_ref_wrap(vname, vname)))
    else:
        known_variables[vname] = 1
        stream.write(span_token("noun", link_target_wrap(vname, vname)))
    # Attribute/subscript part of the target; escaped via emit_rest.
    emit_rest(stream, gg["rest"], state)
    stream.write(g["eq"])
    emit_rest(stream, g["rest"], state)
    
# Leading keywords whose whole line is passed to a dedicated handler by
# format_line; each handler is called as handler(stream, line, state).
special_verbs = dict([
    ("def", handle_def),
    ("class", handle_class),
    ("import", handle_import),
    ("from", handle_import),
])

# Words that txt_process wraps in a "token" span wherever they appear outside
# a string.
exprs = [
    "and",
    "in",
    "is",
    "lambda",
    "not",
    "or",
    # and some that should be handled better
    "for",
    "return",
    "len",
]

from htmlentitydefs import entitydefs

# invdefs maps a single character to the text httpquote emits for it.  Every
# character code 0..255 starts out mapping to itself; each single-character
# value in entitydefs is then replaced by its "&name;" entity reference.
invdefs = dict([(chr(code), chr(code)) for code in range(256)])
invdefs.update(dict([(char, "&%s;" % entity)
                     for entity, char in entitydefs.items()
                     if len(char) == 1]))

def make_one_url(s):
    """Return *s* as a link to itself if it starts with ``http://``.

    Any other string is returned unchanged.
    """
    if not s.startswith("http://"):
        return s
    link = '<a href="%s">%s</a>'
    return link % (s, s)

def urlify(s):
    """Turn every http:// or https:// URL in *s* into a link to itself.

    *s* is text that has already been HTML-escaped and may already contain
    generated markup.  A URL therefore ends at whitespace, at ``&`` (the
    start of an entity such as ``&quot;``), at ``<`` or ``>`` (the start of
    markup), or at a quote character, so that closing quotes and tags next
    to a URL are not swallowed into the href.  Text without URLs is returned
    unchanged.
    """
    def _link(match):
        url = match.group(0)
        return '<a href="%s">%s</a>' % (url, url)

    # A function replacement avoids backslash processing of the URL text.
    return re.sub(r"https?://[^\s&<>'\"]*", _link, s)

def httpquote(s):
    """Return *s* with each character replaced by its invdefs entry.

    Characters that have a named entity in invdefs (for example ``<`` and
    ``&``) come out as ``&name;``.  A character with no entry in invdefs is
    passed through unchanged instead of raising KeyError.
    """
    return "".join([invdefs.get(ch, ch) for ch in s])

# break into tokens: quote, triplequote, tick, other
# backslash-foo yields other.
# Token tags produced by string_munge_one_token.
munge_other, munge_quote, munge_3quote, munge_tick = (
    "munge_other",
    "munge_quote",
    "munge_3quote",
    "munge_tick",
)

# The tags that stand for a string delimiter.
mquotes = [munge_quote, munge_3quote, munge_tick]

def check_token(s, mungetag, match):
    """Split the prefix *match* off *s* if *s* starts with it.

    Returns (mungetag, prefix, remainder) on success, None otherwise.
    """
    if not s.startswith(match):
        return None
    cut = len(match)
    return (mungetag, s[:cut], s[cut:])

def string_munge_one_token(s):
    """Split the first token off *s*.

    Returns (tag, token, remainder).  *s* is expected to be HTML-escaped
    already, so a double quote may appear either raw or as its escaped form.
    Tokens are, in order of precedence:

    - a backslash together with the character after it (munge_other), so an
      escaped quote is never seen as a string delimiter
    - a single quote (munge_tick)
    - a triple or single double quote, raw or escaped (munge_3quote /
      munge_quote)
    - a run of ordinary text (munge_other); a run may begin with ``&`` so an
      entity stays in one piece, and it stops before the next backslash,
      quote or ``&``
    """
    # strip the quoted char too (deal with newline case later)
    if s.startswith("\\"):
        return (munge_other, s[:2], s[2:])
    # Order matters: the triple quote must be tried before the single one.
    delimiters = (
        (munge_tick, "'"),
        (munge_3quote, '"""'),
        (munge_quote, '"'),
        (munge_3quote, httpquote('"""')),
        (munge_quote, httpquote('"')),
    )
    for tag, delim in delimiters:
        hit = check_token(s, tag, delim)
        if hit:
            return hit
    # Raw string on purpose: the character class has to exclude the backslash
    # itself, otherwise a backslash in the middle of a run is swallowed and
    # the quote it escapes closes the string early.
    m = re.match(r"&?[^\\'\"&]*", s)
    # Every start character not handled above yields a non-empty run; "or 1"
    # only guarantees that the caller's loop always makes progress.
    cut = len(m.group(0)) or 1
    return (munge_other, s[:cut], s[cut:])

def string_munge(s):
    """Tokenize *s* into a list of [tag, piece] pairs.

    Pieces are taken off the front with string_munge_one_token until nothing
    is left; an empty *s* gives an empty list.
    """
    tokens = []
    remaining = s
    while remaining:
        tag, piece, remaining = string_munge_one_token(remaining)
        tokens.append([tag, piece])
    return tokens

def span_mark(t):
    """Return *t* preceded by the opening tag of a "python-quote" span."""
    opener = """<span class="python-quote">%s"""
    return opener % t
def span_end(t):
    """Return *t* followed by a closing ``</span>`` tag."""
    closer = """%s</span>"""
    return closer % t

def span_token(tag, str):
    """Wrap *str* in ``<span class="python-TAG">...</span>``.

    The second parameter keeps its original name even though it hides the
    builtin, so existing callers are unaffected.
    """
    template = """<span class="python-%s">%s</span>"""
    return template % (tag, str)
def txt_process(s):
    """Highlight and cross-reference known words in a chunk of plain text.

    *s* is a piece of escaped source text that lies outside any string.
    ``None`` and the words in exprs are wrapped in a "token" span; names
    recorded in known_names, known_imports, known_aliases and known_variables
    are wrapped in their span and linked back to their definition.  When a
    word is in several groups the first group in that order wins.

    All words are replaced in a single pass, so the markup inserted for one
    word is never scanned again for other words.
    """
    # word -> replacement markup; setdefault keeps the earliest category.
    markup = {"None": span_token("token", "None")}
    for word in exprs:
        markup.setdefault(word, span_token("token", word))
    tables = (
        (known_names, "name"),
        (known_imports, "import"),
        (known_aliases, "alias"),
        (known_variables, "noun"),
    )
    for table, tag in tables:
        for word in table:
            if word:  # an empty key would match at every word boundary
                markup.setdefault(word, span_token(tag, link_ref_wrap(word, word)))
    # Longest first, so a dotted name is preferred over its own prefix.
    words = sorted(markup, key=len, reverse=True)
    # re.escape: names are data, not regex syntax.  (?<!&) keeps entity names
    # such as the "lt" in "&lt;" from being taken for identifiers.
    pattern = r"(?<!&)\b(?:%s)\b" % "|".join([re.escape(w) for w in words])
    return re.sub(pattern, lambda m: markup[m.group(0)], s)

def string_span_munge(s, st):
    """Mark up *s*, tracking whether we are inside a string literal.

    s  -- escaped source text
    st -- (state, ticktype) carried over from the previous call; state is
          "outside" or "inside", ticktype is the tag of the delimiter that
          opened the current string

    Outside a string, ordinary text goes through txt_process and any
    delimiter opens a quote span.  Inside a string, only a delimiter with the
    same tag as the opening one closes the span; everything else is copied
    as is.  Returns (state, ticktype, html).  Raises Exception if a token is
    processed while state is neither "outside" nor "inside".
    """
    state, ticktype = st
    pieces = []
    for tag, text in string_munge(s):
        if state == "outside":
            if tag != "munge_other":
                # A delimiter: the string starts here.
                state, ticktype = "inside", tag
                pieces.append(span_mark(text))
            else:
                pieces.append(txt_process(text))
            continue
        if state != "inside":
            raise Exception("bugcheck %s" % state)
        if tag == ticktype:
            # Matching delimiter: the string ends here.
            state = "outside"
            pieces.append(span_end(text))
        else:
            pieces.append(text)
    return (state, ticktype, "".join(pieces))

def emit_rest(stream, s, state):
    """Escape, highlight and write *s*, updating the string-tracking state.

    The (state, ticktype) pair stored under state["string"] is fed to
    string_span_munge and replaced by the pair it returns, so a string that
    is still open carries over to the next call.  The resulting HTML is
    passed through urlify and written to *stream*.
    """
    escaped = httpquote(s)
    where, quote_tag, html = string_span_munge(escaped, state["string"])
    state["string"] = (where, quote_tag)
    stream.write(urlify(html))

def format_line(s, stream, state):
    """Write one source line as HTML, followed by a newline.

    s      -- the raw source line; a trailing line terminator is ignored
    stream -- object with a write() method receiving the HTML
    state  -- shared formatter state; state["string"] holds the
              (state, ticktype) pair maintained by emit_rest

    The leading whitespace is wrapped in a "python-indent" span.  The rest of
    the line is then handled as, in order: the continuation of a multi-line
    string, a comment, a line for one of the special_verbs handlers, a line
    starting with one of the verbs, an assignment, or plain text.
    """
    # Drop the terminator first; otherwise \s* swallows the newline of a
    # blank line and the line ends up in the output twice.
    s = s.rstrip("\r\n")
    m = re.match(r"^(\s*)(.*)$", s)
    indent, line = m.groups()
    stream.write("""<span class="python-indent">%s</span>""" % indent)
    if state["string"][0] == "inside":
        # Still inside a multi-line string: nothing on this line is a comment
        # or a statement until the string is closed.
        emit_rest(stream, line, state)
        stream.write("\n")
        return
    if line.startswith("#"):
        stream.write(span_token("comment", urlify(httpquote(line))))
        stream.write("\n")
        return
    pieces = line.split(" ", 1)
    first_word = pieces[0]
    rest = None
    if len(pieces) == 2:
        rest = pieces[1]
    # "else:", "try:" and friends carry their colon on the first word.
    keyword, colon = first_word, ""
    if keyword.endswith(":"):
        keyword, colon = keyword[:-1], ":"
    if first_word in special_verbs:
        special_verbs[first_word](stream, line, state)
    elif keyword in verbs:
        stream.write(span_token("verb", keyword))
        stream.write(colon)
        if rest is not None:
            stream.write(" ")
            emit_rest(stream, rest, state)
    elif "=" in line:
        handle_assignment(stream, line, state)
    else:
        emit_rest(stream, line, state)
    stream.write("\n")

def format_stream(instream, outstream):
    """Convert everything read from *instream* to HTML on *outstream*.

    The output is one ``<pre class="python-src">`` element.  Lines are read
    with readline() until it returns an empty value and each is passed to
    format_line; all of them share one state dict that starts outside any
    string.
    """
    outstream.write("""<pre class="python-src">""")
    outstream.write("\n")
    shared = {"string": ("outside", None)}
    line = instream.readline()
    while line:
        format_line(line, outstream, shared)
        line = instream.readline()
    outstream.write("""</pre>""")
    outstream.write("\n")

#H1, H2, H3  { text-align: center; }
#.gutter td { font-family: sans-serif; text-align: right; }
#td.gutter { background: #c0d8c0;  }
#BODY { background: #ffffd8; }
#P, TD { color: black; }

# Inline stylesheet printed into the generated page by the __main__ block.
# It colours the python-* span classes and the surrounding <pre>, <h1> and
# <body>; there is no rule for python-indent.
CSS = """<style type="text/css" media=screen>
  <!-- 
    span.python-verb { color: cyan; }
    span.python-comment { color: red; }
    span.python-quote { color: orange; }
    span.python-token { color: blue; }
    span.python-name { color: green; }
    span.python-import { color: green; }
    span.python-alias { color: green; }
    span.python-noun { color: brown; }
    pre.python-src { color: grey; background: black; }
    h1 { color: yellow; background: grey; }
    body { color: yellow; background: grey; }
  -->
</style>
"""

# Command line entry point: "fakeparse.py SOURCEFILE" writes a complete HTML
# page for SOURCEFILE to standard output.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Say what is expected instead of failing on tuple unpacking.
        sys.stderr.write("usage: %s SOURCEFILE\n" % sys.argv[0])
        sys.exit(2)
    src_name = sys.argv[1]
    # The file name ends up in HTML text and in an attribute value.
    shown = httpquote(src_name)
    f = open(src_name)
    try:
        sys.stdout.write("""<html><head><title>%s</title></head>""" % shown)
        sys.stdout.write("\n")
        sys.stdout.write(CSS)
        sys.stdout.write("\n")
        sys.stdout.write("""<body>%s<h1>%s</h1>""" % (
            '<a href="index.html">[back]</a>',
            '<a href="%s">%s</a>' % (shown, shown)))
        sys.stdout.write("\n")
        format_stream(f, sys.stdout)
        sys.stdout.write("""</body></html>""")
        sys.stdout.write("\n")
    finally:
        # Close the input even if formatting fails part-way.
        f.close()