#!/usr/bin/python
"""Tiny local HTTP server that records browser "clips" and URL history.

Endpoints (see PageServerRequestHandler):

    GET  /tags            newline-separated tags read from the stash
    GET  /previous_urls   URLs of the most recent /url_save_all snapshot
    POST /url_seen        append (EventType, SaveURL) headers to the page log
    POST /url_save_all    append the request body to ~/.urllog
    POST /clip_this       store a base64-encoded HTML clip in the stash

The code is written to run unchanged on Python 2.6+/2.7 and on Python 3.
"""

import base64
import datetime
import os
import string
import sys
import time
import urllib

try:  # Python 2 module names
    import BaseHTTPServer
    import SocketServer
    import urlparse
    from urllib import unquote as _unquote
except ImportError:  # Python 3 module names, bound to the same identifiers
    import http.server as BaseHTTPServer
    import socketserver as SocketServer
    import urllib.parse as urlparse
    from urllib.parse import unquote as _unquote

# unicode on Python 2, str on Python 3
_text_type = type(u"")

# string.maketrans on Python 2, str.maketrans on Python 3
_maketrans = getattr(str, "maketrans", None) or string.maketrans

# Request headers that describe a clip, in the order they are echoed.
_CLIP_HEADERS = ['ClipURL', 'ClipCategory', 'ClipSelection', 'ClipTitle',
                 'ClipReferrer']


def _to_bytes(value):
    """Return *value* as a byte string (text is encoded as UTF-8)."""
    if not isinstance(value, (bytes, _text_type)):
        value = str(value)
    if isinstance(value, bytes):
        return value
    return value.encode("utf-8")


class PageServerRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Request handler; expects ``server.logfile`` and ``server.clipstash``."""

    def _read_body(self):
        """Read the request body; a missing Content-Length means no body."""
        length = int(self.headers.get("Content-Length") or 0)
        if length <= 0:
            # a negative length would make rfile.read() block until EOF
            return b""
        return self.rfile.read(length)

    def _send_ok(self):
        """Send a bare ``200 OK`` status line and finish the headers."""
        self.send_response(200, "OK")
        self.end_headers()

    def log_event(self, kind, url):
        """Append one ``kind url`` line to the server's page log."""
        with open(self.server.logfile, "a") as log:
            log.write("%s %s\n" % (kind, url))

    def do_GET(self):
        """Serve /tags and /previous_urls; other paths get an empty 200."""
        # Build the body before committing to a status line, so a failure
        # here does not follow an already-sent "200 OK".
        lines = None
        if self.path == "/tags":
            lines = self.server.clipstash.tags()
        elif self.path == "/previous_urls":
            lines = previous_urls()
        self._send_ok()
        if lines is not None:
            self.wfile.write(_to_bytes("\n".join(lines) + "\n"))

    def do_POST(self):
        """Dispatch POST requests by path; unknown paths get a 404."""
        if self.path == "/url_seen":
            return self.do_POST_url_seen()
        if self.path == "/url_save_all":
            return self.do_POST_url_save_all()
        if self.path == "/clip_this":
            return self.do_POST_clip_this()
        self.send_response(404, "Not Found")
        self.end_headers()
        self.wfile.write(_to_bytes("%s is bogus" % self.path))

    def do_POST_clip_this(self):
        """Record a clip: metadata in Clip* headers, base64 HTML in the body."""
        # Absent headers are stored as None; Stash.record copes with that.
        info = dict((k, self.headers.get(k)) for k in _CLIP_HEADERS)
        info["ClipHTML"] = base64.b64decode(self._read_body())
        self._send_ok()
        print("GOT STUFF:")
        for k in _CLIP_HEADERS:
            print("%s %s" % (k, info[k]))
        ### now just have to stash it somewhere...
        self.server.clipstash.record(info)
        ### could use pyosd.display() but it can't do "middle" or "center"
        os.system("echo '%s' | osd_cat -p middle -f '-*-new century schoolbook-bold-i-*-*-24-*-*-*-*-*-*-*' -A center -c orange -l 2 -s 1 &" % "Recorded")

    def do_POST_url_seen(self):
        """Log the (EventType, SaveURL) header pair to the page log."""
        url = self.headers.get("SaveURL")
        event = self.headers.get("EventType")
        self._send_ok()
        self.log_event(event, url)

    def do_POST_url_save_all(self):
        """Append the posted URL list to the URL log as a dated snapshot."""
        # test with:
        #  echo http://www.thok.org/ | curl -d@- http://localhost:3382/url_save_all
        urls = self._read_body()
        # One open/close per snapshot; the "--- <time>" line is the marker
        # that previous_urls() uses to find the latest snapshot.
        with open(urllog_file, "ab") as log:
            log.write(_to_bytes("--- %s\n" % time.ctime()))
            log.write(urls)
            log.write(b"\n")
        self._send_ok()

    def log_request(self, code=None, size=None):
        # this is just to make the default per-request logs shut up
        pass


urllog_file = os.path.expanduser("~/.urllog")


def previous_urls():
    """Return the URLs of the most recent snapshot in the URL log.

    The log is scanned forward; every ``--- <timestamp>`` line starts a new
    snapshot and discards what was collected so far.  A missing or
    unreadable log yields an empty list.
    """
    try:
        with open(urllog_file) as log:
            lines = log.readlines()
    except IOError:
        lines = []
    urls = []
    asof = ""
    for line in lines:
        if not line.strip():
            continue
        if line.startswith("undefined"):
            line = line.replace("undefined", "", 1)
        if line.startswith("--- "):
            asof = line.strip().replace("--- ", "", 1)
            urls = []
            continue
        urls.append(line.strip())
    print("%s urls retrieved as of %s" % (len(urls), asof))
    return urls


class ForkingHTTPServer(SocketServer.ForkingMixIn, BaseHTTPServer.HTTPServer):
    """HTTP server that handles each request in a forked child process."""
    pass


def deepmkdir(d):
    """Create directory *d* and any missing parents; no-op if it exists."""
    if not d or os.path.isdir(d):
        return
    try:
        os.makedirs(d)
    except OSError:
        # Another forked request handler may have created it meanwhile.
        if not os.path.isdir(d):
            raise


def crunch_url(url):
    """Reduce *url* to a short ``host_page`` token for use in a file name."""
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url.lower())
    host = netloc.replace("www.", "").replace(".com", "")
    page = path.split("/")[-1].replace(".html", "").replace(".htm", "")
    return host + "_" + page


squash_whitespace = _maketrans(string.whitespace, " " * len(string.whitespace))


def line_safe_unquote(s):
    """URL-unquote *s* and turn every whitespace character into a space.

    given encodeURIComponent, make something more
    greppable but still rfc822-safe
    """
    if not s:
        return ""
    return _unquote(s).translate(squash_whitespace)


# Punctuation that is kept as-is in file names; everything else in
# string.punctuation, and all whitespace, is squashed to ".".
dotpunct = set("+-=@_%")
badpunct = set(string.punctuation)
squash_chars = "".join(sorted(badpunct - dotpunct)) + string.whitespace
squash_punctuation = _maketrans(squash_chars, "." * len(squash_chars))


def crunch_words(txt):
    """Turn URL-encoded text into a short (<= 45 chars) file-name fragment."""
    txt = txt.replace("%20", "_")
    txt = txt.replace("%0A", "_")
    txt = _unquote(txt)
    txt = txt.translate(squash_punctuation)
    # collapse runs of separators, but don't actually try too hard
    txt = txt.replace("..", ".")
    txt = txt.replace("..", ".")
    txt = txt.replace("__", "_")
    txt = txt.replace("__", "_")
    txt = txt.replace("_._", ".")
    # consider a split_words -> SplitWords transform here
    return txt[:45]


class Stash:
    """Directory tree of recorded clips, rooted at *basedir*."""

    def __init__(self, basedir):
        self.base = basedir

    def tags(self):
        """Return the lines of ``<base>/tags`` with trailing space removed."""
        with open(os.path.join(self.base, "tags")) as tagfile:
            return [t.rstrip() for t in tagfile.readlines()]

    def record(self, info):
        """Write *info* to ``<base>/YYYY-MM/YYYY-MM-DD/<stamp>.<...>.clip``.

        *info* maps the Clip* header names to their values, plus
        ``ClipHTML`` for the clip body.  Entries that are missing, None or
        empty are left out of the file name.  The file holds rfc822-ish
        ``Key: value`` lines, a blank line, then the HTML.
        """
        instant = time.time()
        when = datetime.date.fromtimestamp(instant)
        thisdir = os.path.join(self.base,
                               when.strftime("%Y-%m"),
                               when.strftime("%Y-%m-%d"))
        deepmkdir(thisdir)

        parts = [time.strftime("%Y%m%d-%H%M%S", time.localtime(instant))]
        category = info.get("ClipCategory")
        if category:
            # the category comes straight from a request header; keep it
            # from introducing extra path components
            parts.append(category.replace("/", "_"))
        url = info.get("ClipURL")
        if url:
            parts.append(crunch_url(url))
        selection = info.get("ClipSelection")
        title = info.get("ClipTitle")
        if selection:
            parts.append(crunch_words(selection))
        elif title:
            parts.append(crunch_words(title))
        parts.append("clip")
        this_item_path = os.path.join(thisdir, ".".join(parts))

        # marshal info into it, rfc822-ish; binary mode because the HTML
        # body arrives as decoded bytes
        with open(this_item_path, "wb") as item:
            for k, v in info.items():
                if k == "ClipHTML":
                    continue
                if k == "ClipSelection":
                    v = line_safe_unquote(v)
                item.write(_to_bytes("%s: %s\n" % (k, v)))
            item.write(b"\n")
            item.write(_to_bytes(info.get("ClipHTML", "")))
            item.write(b"\n")


if __name__ == "__main__":
    server = ForkingHTTPServer(("localhost", 3382), PageServerRequestHandler)
    server.logfile = os.path.join(os.path.dirname(sys.argv[0]), "pagelog")
    server.clipstash = Stash(os.path.expanduser("~/stufflog"))
    server.serve_forever()