#!/usr/bin/python

# supposedly this is enough for it to work in 2.1, but I don't test it there.
from __future__ import nested_scopes

import urllib
import re
import shelve
import sys
import os
import traceback
import time
import rfc822

urllib.URLopener.version = "thok.org-comick.py-low-bandwidth-change-monitor/0.9"

def get_content(url):
    u = urllib.urlopen(url)
    s = u.read()
    u.close()
    return s

def page_feeder_notify(link):
    try:
        uo = urllib.URLopener()
        uo.addheader('url', link)
        u = uo.open("http://localhost:3383/push_url", data="")
        u.read()
        u.close()
    except Exception, e:
        print >> sys.stderr, "feeder whine:", e, "on", link

#
# Inspired by Jarno Virtanen's article on Python Owns Us, at
# http://www.hole.fi/jajvirta/weblog/20030928T2101.html
# This is structured a little differently, using urllib instead of urllib2;
# we need FancyURLopener to get redirects, but we don't want
# the http_error_default change because it loses the status, so
# we just roll that back...
#
class MyFancyURLopener(urllib.FancyURLopener):
    http_error_default = urllib.URLopener.http_error_default

def get_changed_content(url, etag=None, lastmod=None):
    uo = MyFancyURLopener()
    if etag:
        uo.addheader("If-None-Match", etag)
    if lastmod:
        uo.addheader("If-Modified-Since", lastmod)
    try:
        u = uo.open(url)
    except IOError, e:
        if e[0] == "http error" and e[1] == 304:
            return None
        raise
    if u.headers.has_key("ETag"):
        etag = u.headers["ETag"]
    if u.headers.has_key("Last-Modified"):
        lastmod = u.headers["Last-Modified"]
    s = u.read()
    u.close()
    return (s, etag, lastmod)
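# A sketch of the conditional-GET contract above (hypothetical URL, purely
# illustrative): pass back the etag/lastmod saved from the previous fetch;
# getting None back means HTTP 304, i.e. nothing changed and nothing needs
# re-parsing.
#   result = get_changed_content("http://example.com/", etag, lastmod)
#   if result:
#       body, etag, lastmod = result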
img_pat = re.compile("""<img[^>]*src=['"]([^"'>]*)['"][^>]*>""",
                     re.IGNORECASE)  # "

def get_img(s):
    return re.findall(img_pat, s)

def check_img_start(s, prefix):
    targ = filter(lambda s: s.startswith(prefix), get_img(s))
    if not targ:
        return None
    # a later image is likely a second comic
    return targ[-1]

def check_start(s, u, arg):
    return check_img_start(s, arg)

# def check_somethingpositive(s, u):
#     return check_img_start(s, "arch/")

# def check_sinfest(s, u):
#     return check_img_start(s, "/comics/")

def urlbase(u):
    ht, path = urllib.splittype(u)
    host, path = urllib.splithost(path)
    slash = path.rfind("/")
    if slash > -1:
        path = path[0:slash]
    return "%s://%s%s" % (ht, host, path)

def urlroot(u):
    ht, path = urllib.splittype(u)
    host, path = urllib.splithost(path)
    return "%s://%s" % (ht, host)

def check_maybe(s, u, arg):
    if "keenspace.com" in u:
        r = old_check_maybe(s, u.replace("keenspace", "comicgenesis"), arg)
        if r:
            return r
    return old_check_maybe(s, u, arg)

def old_check_maybe(s, u, arg):
    r = check_img_start(s, "/comics/")
    if r: return r
    r = check_img_start(s, "./comics/")
    if r: return r
    r = check_img_start(s, "../comics/")  # questionablecontent.net
    if r: return r
    r = check_img_start(s, "comics/")
    if r: return r
    r = check_img_start(s, "Comics/")  # missmab
    if r: return r
    r = check_img_start(s, "/comix/")
    if r: return r
    r = check_img_start(s, "strips/")
    if r: return r
    r = check_img_start(s, "/strips/")
    if r: return r
    r = check_img_start(s, "/hstrips/")  # new dieselsweeties
    if r: return r
    r = check_img_start(s, "arch/")
    if r: return r
    r = check_img_start(s, "/archive/")
    if r: return r
    r = check_img_start(s, "/arch/")  # new somethingpositive
    if r: return r
    r = check_img_start(s, "archive/")
    if r: return r
    r = check_img_start(s, "%s/comics/" % urlbase(u))
    if r: return r
    r = check_img_start(s, "%s/arch/" % urlbase(u))
    if r: return r
    r = check_img_start(s, "%s/strips/" % urlbase(u))
    if r: return r
    r = check_img_start(s, "active/")
    if r: return r
    r = check_img_start(s, "/active/")  # furrymilitia
    if r: return r
    r = check_img_start(s, "%s/active/" % urlbase(u))  # badlydrawnkitties
    if r: return r
    r = check_img_start(s, "%s/comics/" % u)  # new badlydrawnkitties
    if r: return r
    r = check_img_start(s, "%s/images/comics/" % urlbase(u))  # new sluggy
    if r: return r
    r = check_img_start(s, "archives/strips/")  # for putf
    if r: return r
    r = check_img_start(s, "/Cartoons/")  # for daybyday
    if r: return r
    r = check_img_start(s, "/images/comics/")  # hello-cthulhu
    if r: return r
    r = check_img_start(s, "images/comics/")  # hello-cthulhu revised
    if r: return r
    r = check_img_start(s, "%s/images/strips/" % urlbase(u))  # realmofatland
    if r: return r
    r = check_img_start(s, "images/strips/")  # newer realmofatland
    if r: return r
    r = check_img_start(s, "imgs/comics/")  # jinwicked
    if r: return r
    # radioactivepanda: /Assets/Finished%20Comics/Strip-0083.jpg
    r = check_img_start(s, "/Assets/Finished")
    if r: return r
    # partiallyclips: http://www.partiallyclips.com/storage/20050714_Researchers_lg.png
    r = check_img_start(s, "%s/storage" % urlroot(u))
    if r: return r
    r = check_img_start(s, "/guest/comics/")  # crfh during guest weekends
    if r: return r
    r = check_img_start(s, "manga/")  # Miracle of Science
    if r: return r
    print "nothing worked, evaluate", get_img(s)
    return r

def check_userfriendly(s, u, arg):
    # http://www.userfriendly.org/cartoons/archives/03sep/xuf005935.gif
    return check_img_start(s, "%s/cartoons/archives/" % urlroot(u))

# def check_drfun_week(s, u):
#     return check_img_start(s, "Dr-Fun/inline/thumbs/")

# def check_faans(s, u):
#     # http://faans.com/images/2003/995goodbye.jpg
#     return check_img_start(s, "%s/images/2" % urlbase(u))

def check_start_url(s, u, arg):
    return check_img_start(s, arg % urlbase(u))

# def check_pennyarcade(s, u):
#     # images/2003/20030915l.gif
#     return check_img_start(s, "images/2")

bruno_pad = re.compile("\\d{4}(sketch)?pics/")

def check_bruno(s, u, arg):
    targ = filter(lambda s: re.match(bruno_pad, s), get_img(s))
    if not targ:
        return None
    # a later image is likely a second comic
    return targ[-1]

alt_pat = re.compile('<img[^>]*src="([^">]*)"[^>]*alt="([^">]*)"[^>]*>',
                     re.IGNORECASE + re.MULTILINE)

def check_img_alt(s, a):
    targ = [src for src, alt in re.findall(alt_pat, s) if alt.startswith(a)]
    if not targ:
        return None
    # a later image is likely a second comic
    return targ[-1]

def check_alt(s, u, arg):
    return check_img_alt(s, arg)
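# Every checker used in the sites table (below) takes (page source, page url,
# arg) and returns a "tag" string identifying the current strip, usually an
# <img> src, or None.  A sketch of the two underlying helpers, run against a
# hypothetical scrap of html:
#   html = '<img src="comics/strip-001.png" alt="Today: a new strip">'
#   check_img_start(html, "comics/")   # -> "comics/strip-001.png"
#   check_img_alt(html, "Today")       # -> "comics/strip-001.png"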
# ozyandmillie was just a check_alt again, but now there are spurious newlines
# in long elements... as in, alt=\n"Today..." and the browser loses them too.
# Since it's stuck for a week, we kludge it...
def check_ozyandmillie(s, u, arg):
    return check_alt(s.replace("\n", ""), u, arg)

# def check_krakow(s, u, arg):
#     return check_img_start(s, "%s/comicpro/strips/" % urlbase(u))

# def check_dieselsweeties(s, u):
#     return check_img_start(s, "http://images.clango.org/strips/")

def check_re(pattern, s):
    m = re.search(pattern, s)
    if not m:
        return None
    return m.group(1)

def check_regexp(s, u, arg):
    return check_re(arg, s)

# <B>LATEST COMIC</B>: <A HREF="/202.html">#202 - Too Much of a Good Thing IV</A>
# def check_sexylosers(s, u):
#     return check_re('<B>LATEST COMIC</B>: <A HREF="([^"]*)">', s)

def check_sexylosers_fan(s, u):
    return check_re('<B>LATEST FAN ART</B>: <A HREF = "([^"]*)">', s)

def check_sexylosers_guest(s, u):
    return check_re('<B>LATEST GUEST COMIC</B>: <A HREF="([^"]*)">', s)

title_pat = re.compile("<title>([^<]*)</title>", re.IGNORECASE + re.MULTILINE)

# def check_title(s, u):
#     return check_re(title_pat, s)

# def check_redmeat(s, u):
#     return check_re('<a href="([^"]*)">previous</a>', s)

def check_helen(s, u, arg):
    nextpage = check_re('<frame name="main" src="([^"]*)" ', s)
    if not nextpage:
        return None
    framepage = get_content("%s%s" % (urlroot(u), nextpage))
    return check_img_start(framepage, "http://www.tmsfeatures.com/")

# def check_marilith(s, u):
#     return check_re("<img src='([^']*)'", s)
#
# def check_minimumsecurity(s, u):
#     return check_re('<a href="([^"]*)">', s)

# def check_ponju(s, u):
#     # 'piggyhunter030809.jpg'
#     return check_img_start(s, "piggyhunter")

# def check_zark(s, u):
#     return check_img_start(s, "../pages")

# def check_sluggy(s, u):
#     # "maybe" *used* to work, but that was possibly while pics was down
#     return check_img_start(s, "http://pics")

# def check_gadgeteer(s, u):
#     return check_re(re.compile('past reviews.*?<a href="([^"]*)"', re.DOTALL), s)

angryflower_pat = re.compile('<img src="newest[^/]*href="([^"]*)"', re.MULTILINE)

# new form: [url, fn, arg] where arg is usually an re
sites = [
    ["http://www.sinfest.net/", check_maybe, None],
    ["http://somethingpositive.net/index.html", check_maybe, None],
    ["http://www.somethingpositive.net/newgolddream/", check_maybe, None],
    ["http://www.sluggy.com/", check_maybe, None],
    # ["http://ram.purrsia.com/fwf/", check_maybe, None],
    ["http://www.jadephoenix.org/fwf/", check_maybe, None],
    # ["http://www.furwillfly.com/", check_maybe, None],
    ["http://freefall.purrsia.com/default.htm", check_start, "/ff"],
    # ["http://loserzcomic.keenspace.com/", check_maybe, None],
    # moved to:
    ["http://loserz.scribblekid.org/", check_maybe, None],
    # "all gone" as of [eichin:20041212T0340-05]
    # ["http://www.radcomics.com/", check_maybe, None],
    ["http://www.queenofwands.net/", check_maybe, None],
    ["http://commanderkitty.com/", check_maybe, None],
    ["http://www.userfriendly.org/static/", check_userfriendly, None],
    ["http://www.goats.com/", check_maybe, None],
    ["http://www.megagamerz.com/", check_maybe, None],
    ["http://www.brunostrip.com/bruno.html", check_bruno, None],
    ["http://www.ibiblio.org/Dave/this-week.html", check_start,
     "Dr-Fun/inline/thumbs/"],
    ["http://www.ozyandmillie.org/", check_ozyandmillie, "Today's cartoon"],
    ["http://www.clanofthecats.com/", check_maybe, None],
    ["http://www.wanderingones.com/", check_maybe, None],
    ["http://lcd.keenspace.com/", check_maybe, None],
    # someday figure out how to handle frames at the outer level...
    # ["http://www.comicspage.com/helen/index.html", check_helen, None],
    ["http://www.tmsfeatures.com/tmsfeatures/subcategory.jsp?custid=67&catid=1242",
     check_start, "http://www.tmsfeatures.com/"],
    ["http://www.soaprope.com/", check_maybe, None],
    ["http://www.ubersoft.net/", check_maybe, None],
    ["http://www.gpf-comics.com/", check_maybe, None],
    ["http://www.errantstory.com/", check_maybe, None],
    ["http://www.wigu.com/", check_maybe, None],
    # <div class="DateHeader">Friday, November 19, 2004</div>
    # so there's an xml-ish way to say that, hmmm
    ["http://www.wigu.com/overcompensating/", check_regexp,
     'date-header">([^<]*)</div'],
    ["http://strangedaze.keenspace.com/", check_maybe, None],
    ["http://www.nukees.com/", check_maybe, None],
    ["http://jackiesfridge.keenspace.com/", check_maybe, None],
    ["http://www.schlockmercenary.com/", check_maybe, None],
    ["http://nsitmc.keenspace.com/latest.html", check_maybe, None],
    # faans is commercial now
    # ["http://faans.com/", check_start_url, "%s/images/2"],
    ["http://www.sheldoncomics.com/comics/sheldon/index.html", check_maybe, None],
    ["http://flem.keenspace.com/", check_maybe, None],
    # http://sexylosers.com/egg-redirect.html is supposed to work, try it?
    ["http://sexylosers.com/", check_regexp,
     '<B>LATEST COMIC</B>: <A HREF="([^"]*)">'],
    ["http://www.ghastlycomic.com/", check_maybe, None],
    # ["http://www.washingtonpost.com/wp-dyn/style/columns/missmanners/", check_maybe, None],
    # ["http://www.dieselsweeties.com/", check_start, "http://images.clango.org/strips/"],
    ["http://www.dieselsweeties.com/", check_maybe, None],
    ["http://www.redmeat.com/redmeat/current/index.html", check_regexp,
     '<a href="([^"]*)">previous</a>'],
    ["http://www.joeaverage.org/", check_maybe, None],
    ["http://tonjasteele.keenspace.com/", check_maybe, None],
    ["http://www.choppingblock.org/", check_maybe, None],
    ["http://www.roadwaffles.com/", check_regexp, '<img src=(comics/rw.*\.gif)>'],
    # ["http://www.eightland.com/", check_maybe, None],
    ["http://www.minimumsecurity.net/toons/index.htm", check_regexp,
     '<a href="([^"]*)">'],
    # ["http://oddjobs.keenspace.com/", check_maybe, None],
    # ["http://www.krakowstudios.com/", check_start_url, "%s/comicpro/strips/"],
    ["http://www.krakowstudios.com/", check_maybe, None],
    # ["http://marilith.com/", check_regexp, "<img src='([^']*)'"],
    ["http://marilith.com/", check_maybe, None],
    ["http://www.ponju.com/PiggyHunter/comic.php", check_start, "piggyhunter"],
    # dragon-tails ended 2004-11-04.
    # ["http://www.dragon-tails.com/", check_maybe, None],
    ["http://umlauthouse.keenspace.com/", check_maybe, None],
    ["http://umlauthouse.comicgenesis.com/", check_maybe, None],
    ["http://www.nuklearpower.com/latest.php", check_maybe, None],
    # ["http://www.polymercitychronicles.com/", check_maybe, None],
    ["http://www.polymercitychronicles.com/", check_alt, "[current strip]"],
    ["http://www.scarygoround.com/", check_maybe, None],
    # ["http://www.writheandshine.com/index2.html", check_maybe, None],
    ["http://pillarsoffaith.keenspace.com/", check_regexp, title_pat],
    ["http://webmarines.keenspace.com/", check_maybe, None],
    ["http://www.mrbang.net/", check_maybe, None],
    # ["http://www.avalonhigh.com/", check_maybe, None],  # ended
    ["http://www.zark.com/front/azpages.html", check_start, "../pages"],
    ["http://www.catandgirl.com/", check_maybe, None],
    # http://www.penny-arcade.com/images/2005/20051207h.jpg
    # ["http://www.penny-arcade.com/comic", check_start, "/images"],
    # move to check_maybe?
    ["http://www.penny-arcade.com/comic", check_regexp, title_pat],
    ["http://www.the-gadgeteer.com/", check_regexp,
     re.compile('past reviews.*?<a href="([^"]*)"', re.DOTALL)],
    ["http://wapsisquare.com/", check_maybe, None],
    # ["http://jack.keenspace.com/", check_maybe, None],
    # jack moved to pholph
    ["http://www.pholph.com/", check_maybe, None],
    ["http://www.pvponline.com/", check_maybe, None],
    # ["http://www.fusiond.com/", check_maybe, None],  # points to a generic page?
    # ["http://fusiond.digitalcrap.net/", check_maybe, None],  # also dead
    ["http://www.antiheroforhire.com/", check_maybe, None],
    # movie-comics ended, should drop from here
    ## ["http://www.movie-comics.com/comic.php", check_maybe, None],
    # ["http://www.gushi.org/~whitestorm/rdt/index.html", check_maybe, None],
    # ["http://www.dangerousthings.net", check_regexp, re.compile('<a title="Current comic" href="([^"]*)">', re.DOTALL)],
    # ended, and the characters got distributed out to other comics :-)
    # ["http://www.dangerousthings.net", check_regexp, "(Current\s*Comic\s*:\s*[^<]*)<"],
    ["http://www.partiallyclips.com/pages/current.php", check_maybe, None],
    ["http://www.missmab.com/", check_maybe, None],
    ["http://bhag.sackofjustice.com/", check_maybe, None],
    # ["http://conscrew.keenspace.com/", check_maybe, None],
    ["http://www.conscrew.com/index.php", check_maybe, None],
    ["http://wicket.conscrew.com/index.php", check_maybe, None],
    ["http://www.dominic-deegan.com/", check_maybe, None],
    ["http://www.coffeebrain.com/comic/", check_start, "images/pages"],
    ["http://cdc.keenspace.com/", check_maybe, None],
    # the graphicsmash version died but keenspace came back, thanks to
    # something-positive for noticing
    # ["http://www.graphicsmash.com/series.php?name=lifeonforbez&view=current", check_regexp, "<img src=http://www.webcomicsnation.com/~graphicsmashers/ccuasay/([^ >]*)[^>]*>"],
    # ["http://www.furrymilitia.net/comicdefault.aspx", check_maybe, None],
    # ["http://www.furrymilitia.net/betterdays/", check_regexp, '<img src=([^>]*)>'],  # bad html on page
    ["http://www.jaynaylor.com/betterdays/", check_regexp, '<img src=([^>]*)>'],  # bad html on page
    ["http://www.badlydrawnkitties.com/", check_maybe, None],
    ["http://www.ok-cancel.com/", check_maybe, None],
    # ["http://twolumps.keenspace.com/", check_maybe, None],
    ["http://www.twolumps.net/", check_maybe, None],
    ["http://crfh.net/", check_maybe, None],
    ["http://www.itswalky.com/", check_maybe, None],
    # sadly, gone, some search-squatter has it
    # ["http://www.w00t-comic.net/", check_regexp, "<big>(.*)</big>"],
    # oh, still dead but archived:
    ["http://usrbinw00t.keenspace.com/", check_maybe, None],
    ["http://www.sorethumbsonline.com/", check_maybe, None],
    ["http://www.nasa.gov/multimedia/imagegallery/index.html", check_regexp,
     '<a href="/multimedia/imagegallery/image_feature_(.*).html">'],
    ["http://www.sdss.org/iotw/iotw.html", check_regexp, "<center>(.*)</center>"],
    ["http://ares.nrl.navy.mil/sungrazer/recent.html", check_regexp,
     "<b>([^<]+)</b></font></td>"],
    # ["http://antwrp.gsfc.nasa.gov/apod/", check_regexp, " Explanation: ([\0-\377]*) Tomorrow"],
    # or put an re.DOTALL in check_regexp
    # doesn't work, because we're not quoting the match...
    # ["http://antwrp.gsfc.nasa.gov/apod/", check_regexp, '<IMG SRC="([^"]*)"'],
    ["http://antwrp.gsfc.nasa.gov/apod/", check_start, "image/"],
    ["http://lorebrandcomics.com/", check_alt, "[Lore:"],
    # putf is now over, and hitting it gives a randomly chosen comic
    # ["http://www.accendi.net/putf/", check_maybe, None],
    # ["http://ohmygods.timerift.net/", check_maybe, None],
    ["http://ohmygods.timerift.net/", check_regexp,
     '#BeginEditable "day" -->([^<]*)<'],
    ["http://www.daybydaycartoon.com/Default.aspx", check_maybe, None],
    ["http://www.vanvonhunter.com/index.html", check_maybe, None],
    ["http://www.drunkduck.com/Elijah_and_Azuu/", check_regexp, title_pat],
    ["http://www.drunkduck.com/The_Whovian_Observer/", check_regexp, title_pat],
    ["http://www.asofterworld.com/", check_start, ""],
    ["http://underpower.non-essential.com/", check_maybe, None],
    ["http://www.littledee.net/", check_bruno, None],
    ["http://www.vigilanteho.com/", check_maybe, None],
    # ["http://www.pvcomics.com/atland/", check_start, "http://www.pvcomics.com/comics/atland/"],
    ["http://www.realmofatland.com/", check_maybe, None],
    ["http://www.questionablecontent.net/", check_maybe, None],
    ["http://www.hello-cthulhu.com/", check_maybe, None],
    # there's other stuff there, but Metroid is gone
    # ["http://www.bobandgeorge.com/Fancomics/Metroid/Metroid.html", check_start, "Metroid"],
    ["http://www.petprofessional.net/", check_maybe, None],
    ["http://www.library-of-rain.com/botmaker/index.php", check_alt, "Strip"],
    ["http://www.radioactivepanda.com/", check_maybe, None],
    ["http://crap.jinwicked.com/", check_maybe, None],
    ["http://www.unshelved.com/", check_maybe, None],
    ["http://devilspanties.keenspot.com/", check_maybe, None],
    # mostly for the boston jokes
    ["http://www.evil-comic.com/index.html", check_maybe, None],
    # via schlockmercenary
    ["http://angryflower.com/", check_regexp, angryflower_pat],
    ["http://www.bugbash.net/", check_maybe, None],
    ["http://crossroads.keenspace.com/", check_maybe, None],
    ["http://www.galactanet.com/comic/index.htm", check_regexp,
     '<img\s*src\s*=\s*"\s*(Strip.*\....)">'],
    ["http://www.project-apollo.net/mos/index.html", check_maybe, None],
    ["http://www.candicomics.com/", check_maybe, None],
    ["http://www.elisalandis.com/", check_maybe, None],
    ["http://www.ctrlaltdel-online.com/comic.php", check_maybe, None],
    ["http://www.starslipcrisis.com/", check_maybe, None],
    ]
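# Adding a site is just another [url, check-function, arg] row; a sketch,
# with a hypothetical comic, purely illustrative:
#   ["http://example.com/comic/", check_start, "strips/"],
# check_maybe with arg None is the usual first guess; fall back to an
# explicit check_start prefix or a check_regexp pattern when it prints
# "nothing worked".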
rss_header = """<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="feed.xsl"?>
<rss version="0.91">
<channel>
<title>Comick Update</title>
<lastBuildDate>%s</lastBuildDate>
<description>webcomics that have changed</description>
<link>http://www.thok.org/intranet/python/comics/comick.py.html</link>
"""

rss_header_old = rss_header.replace(
    '<?xml-stylesheet type="text/xsl" href="feed.xsl"?>\n', "")

rss_item = """<item>
<title>%s</title>
<link>%s</link>
<description>%s</description>
<guid isPermaLink="false">%s</guid>
</item>
"""

rss_footer = """</channel>
</rss>
"""

time_day = 24*60*60
rest_time = time_day / 4
check_rest_time = 1*60*60

import md5

def make_guid(s):
    return md5.new(s).hexdigest()

class rssfile:
    def __init__(self, name):
        self.olditems = ""
        self.ref = {}
        self.dsc = {}
        self.order = []
        if os.path.isfile(name):
            f = open(name, "r")
            s = f.read()
            f.close()
            # handle one-rev-back rss header, for add-stylesheet case
            # (the header templates double as loose regexps here)
            if re.match(rss_header_old % "[^<]*?", s):
                s = re.sub(rss_header_old % "[^<]*?", "", s, 1)
            else:
                s = re.sub(rss_header % "[^<]*?", "", s, 1)
            s = s.replace(rss_footer, "", 1)
            self.itemsplit(s)
        self.tmpname = "%s~" % name
        self.realname = name
        self.rssfile = open(self.tmpname, "w")
        self.rssfile.write(rss_header % rfc822.formatdate())

    def write_item_raw(self, title, link, desc):
        self.rssfile.write(rss_item % (entity_quote(title),
                                       entity_quote(link),
                                       entity_quote(desc),
                                       make_guid(desc)))
        self.rssfile.flush()

    def write_item(self, title, link, desc):
        self.write_item_raw(title, link, desc)
        if link in self.ref.keys():
            del self.ref[link]
        page_feeder_notify(link)

    def close(self):
        self.oldwrite()
        self.rssfile.write(rss_footer)
        os.rename(self.tmpname, self.realname)

    itemre = re.compile("\n".join(["<item>",
                                   "<title>(?P<title>[^<]*)</title>",
                                   "<link>(?P<link>[^<]*)</link>",
                                   "<description>(?P<description>[^<]*)</description>",
                                   '<guid( isPermaLink="false")?>(?P<guid>[^<]*)</guid>',
                                   "</item>"]),
                        re.MULTILINE | re.I)

    def itemsplit(self, s):
        for title, link, desc, dummy, guid in re.findall(self.itemre, s):
            # key everything on the unquoted link, so that write_item (which
            # gets real links) and oldwrite agree on what has been seen
            link = entity_unquote(link)
            if link not in self.order:
                self.ref[link] = entity_unquote(title)
                self.dsc[link] = entity_unquote(desc)
                self.order.append(link)

    def oldwrite(self):
        for link in self.order:
            if link in self.ref.keys():
                self.write_item_raw(self.ref[link], link, self.dsc[link])

verbose = None

def entity_quote(s):
    return s.replace("&", "&amp;")  # add lt/gt later

def entity_unquote(s):
    return s.replace("&amp;", "&")  # add lt/gt later

def fmt_time(t):
    # one good use for APL reduce-floor...
    sign = ""
    if t < 0:
        sign = "-"
        t = -t
    s = t % 60
    t -= s
    m = (t / 60) % 60
    t -= m * 60
    h = (t / 60 / 60) % 24
    t -= h * 60 * 60
    d = (t / 60 / 60 / 24)
    if d:
        return "%s%dd+%02dh%02dm%02ds" % (sign, d, h, m, s)
    return "%s%02dh%02dm%02ds" % (sign, h, m, s)
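# a couple of worked examples of the formatting above:
#   fmt_time(90061)   # -> "1d+01h01m01s"
#   fmt_time(-3600)   # -> "-01h00m00s"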
def process_db(dbname):
    sh = shelve.open(dbname)
    rf = rssfile("%s.rss" % dbname)
    total_comics = 0
    failed_comics = 0
    changed_comics = 0
    nextcheck = time.time() + rest_time + check_rest_time
    for u, checkproc, arg in sites:
        total_comics += 1
        lastgot = 0
        lastcheck = 0
        lastetag = None
        lastmodtime = None
        arghash = {}
        if sh.has_key(u):
            arghash = sh[u]
            lastgot = arghash["last-changed"]
            tag = arghash["tag"]
            lastcheck = arghash["last-queried"]
            if arghash.has_key("etag"):
                lastetag = arghash["etag"]
            if arghash.has_key("last-modtime"):
                lastmodtime = arghash["last-modtime"]
        now = time.time()
        if lastgot + rest_time > now:
            if verbose:
                print u, "not stale yet"
            nextcheck = min(lastgot + rest_time, nextcheck)
            continue
        if lastcheck + check_rest_time > now:
            if verbose:
                print u, "poked recently"
            nextcheck = min(lastcheck + check_rest_time, nextcheck)
            continue
        # assume any not-poked will trigger sooner than any poked
        # not true in several cases, though a second run will be right
        try:
            sch = get_changed_content(u, lastetag, lastmodtime)
        except IOError, e:
            print "fetch", u, "failed:", e.args
            failed_comics += 1
            continue
        except KeyboardInterrupt:
            print "Processing", u, "interrupted, saving current values"
            break
        except:
            print "fetch", u, "failed!", repr(traceback.format_tb(sys.exc_info()[2]))
            continue
        if not sch:
            if verbose:
                print u, "unfetched: etag or last-mod still current"
            arghash["last-queried"] = now
            nextcheck = min(now + check_rest_time, nextcheck)
            sh[u] = arghash
            continue
        s, lastetag, lastmodtime = sch
        newtag = checkproc(s, u, arg)
        if verbose:
            print u
        if not newtag:
            print u, "not handled"
            failed_comics += 1
            continue
        arghash["tag"] = newtag
        arghash["last-queried"] = now
        arghash["etag"] = lastetag
        arghash["last-modtime"] = lastmodtime
        if not sh.has_key(u):
            arghash["last-changed"] = now
            nextcheck = min(now + rest_time, nextcheck)
            rf.write_item("first time: %s" % newtag, u,
                          "%s: first time %s @ %s" % (u, entity_quote(newtag), now))
            changed_comics += 1
        elif tag != newtag:
            print "tag for", u, "changed from", tag, "to", newtag
            arghash["last-changed"] = now
            nextcheck = min(now + rest_time, nextcheck)
            rf.write_item(newtag, u,
                          "%s: %s changed to %s @ %s" % (u, entity_quote(tag),
                                                         entity_quote(newtag), now))
            changed_comics += 1
        # otherwise, last-changed stays
        sh[u] = arghash
    sh.close()
    rf.close()
    checkwait = nextcheck - time.time()
    print "%d changed, %d failed (out of %d total) [wait %s until %s]" % (
        changed_comics, failed_comics, total_comics,
        fmt_time(checkwait), time.ctime(nextcheck))

def scan_db(dbname):
    sh = shelve.open(dbname)
    for u in sh.keys():
        print u, sh[u]
    sh.close()

def summary_db(dbname):
    sitekeys = [i[0] for i in sites]
    sh = shelve.open(dbname)
    etagscount = 0
    lastmodcount = 0
    keycount = 0
    for u in sh.keys():
        args = sh[u]
        if args.has_key("etag") and args["etag"]:
            etagscount += 1
        if args.has_key("last-modtime") and args["last-modtime"]:
            lastmodcount += 1
        if u in sitekeys:
            del sitekeys[sitekeys.index(u)]
        else:
            print u, "not in sites"
        keycount += 1
    sh.close()
    for u in sitekeys:
        print u, "in sites, not in db"
    print "etags found:", etagscount
    print "times found:", lastmodcount
    print "total found:", keycount

def fix_db1(dbname):
    sh = shelve.open(dbname)
    now = time.time()
    for u in sh.keys():
        when, tag = sh[u]
        sh[u] = (when, tag, now)
    sh.close()

def fix_db2(dbname):
    sh = shelve.open(dbname)
    now = time.time()
    for u in sh.keys():
        when, tag, now = sh[u]
        sh[u] = {"when": when, "tag": tag, "now": now}
    sh.close()

def fix_db3(dbname):
    sh = shelve.open(dbname)
    now = time.time()
    for u in sh.keys():
        args = sh[u]
        sh[u] = {"last-changed": args["when"],
                 "tag": args["tag"],
                 "last-queried": args["now"]}
    sh.close()

import pprint

def diag_db(dbname):
    sh = shelve.open(dbname)
    print "Url fragment:"
    ufrag = sys.stdin.readline()
    for u, checkproc, arg in sites:
        if u.lower().find(ufrag.rstrip()) > -1:
            print "Checking:", u, checkproc, arg
            if sh.has_key(u):
                arghash = sh[u]
                print "Old args:", pprint.pformat(arghash)
                print "age:", (time.time() - arghash["last-changed"]) / (24*60*60), "days"
            sch = get_changed_content(u, None, None)
            s, lastetag, lastmodtime = sch
            newtag = checkproc(s, u, arg)
            print "New tag:", newtag
    sh.close()

def show_cruft_db(dbname):
    sh = shelve.open(dbname)
    for u, checkproc, arg in sites:
        if sh.has_key(u):
            arghash = sh[u]
            age = time.time() - arghash["last-changed"]
            if age / (24*60*60) > 30:
                print fmt_time(age), "\t", u
    sh.close()

import socket

if __name__ == "__main__":
    socket.setdefaulttimeout(15)
    try:
        progname, dbname, verb = sys.argv
    except ValueError:
        sys.exit(sys.argv[0] +
                 " dbname {update|scan|fix|summary|diag|cruft}"
                 " - you probably want comdb update")
    {"update": process_db,
     "scan": scan_db,
     "fix": fix_db3,
     "summary": summary_db,
     "diag": diag_db,
     "cruft": show_cruft_db,
     }[verb.lower()](dbname)
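# Typical invocations (the shelve db, e.g. "comdb", is created on first use,
# and the feed lands next to it as "<dbname>.rss"):
#   python comick.py comdb update    # poll sites, update comdb and comdb.rss
#   python comick.py comdb summary   # compare db contents against sites
#   python comick.py comdb cruft     # list sites unchanged for over 30 days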