import sys
import os
import re
import urllib
import StringIO

from lxml import etree
from dateutil import parser as dateparser

from baseHandler import *

# URL prefix to which the requested URI is appended to build the page that
# fetch_changes() downloads and scrapes.
baseuri = "http://web.archive.org/web/*sa_/"

# Module-level lxml HTML parser handed to etree.parse() in fetch_changes().
parser = etree.HTMLParser()


class IArchiveHandler(BaseProxyHandler):
    """Proxy handler that scrapes a web.archive.org listing page for a URI."""

    def fetch_changes(self, req, requri, dt=None):
        """Return the list of archived versions found for ``requri``.

        The page at ``baseuri + requri`` is downloaded, parsed as HTML, and
        the single ``<tr bgcolor="#EBEBEB">`` row is searched for anchors.

        Args:
            req: Request object; only forwarded to ``self.error``.
            requri: URI whose archived versions are looked up.
            dt: Unused by this implementation.

        Returns:
            A list of ``(datetime, href)`` tuples, one per usable anchor.
            On failure, whatever ``self.error(...)`` returns (status 404 when
            the page cannot be retrieved or does not contain exactly one
            matching row, status 500 when it cannot be parsed). ``None`` if
            the parsed document is falsy.
        """
        uri = baseuri + requri

        # The read happens inside the try so that a failure mid-transfer is
        # reported the same way as a failure to connect, and the finally
        # guarantees the handle is closed either way.  ``except Exception``
        # (rather than a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        try:
            fh = urllib.urlopen(uri)
            try:
                data = fh.read()
            finally:
                fh.close()
        except Exception:
            return self.error("Couldn't retrieve data from %s" % uri,
                              req, status=404)

        try:
            dom = etree.parse(StringIO.StringIO(data), parser)
        except Exception:
            return self.error("Couldn't parse data from %s" % uri,
                              req, status=500)
        if not dom:
            return None

        rows = dom.xpath('//tr[@bgcolor="#EBEBEB"]')
        if len(rows) != 1:
            return self.error("URL (%s) doesn't exist in IA" % uri,
                              req, status=404)
        tr = rows[0]

        changes = []
        for a in tr.xpath('.//a'):
            # Only anchors followed by trailing text are recorded.
            if not a.tail:
                continue
            loc = a.attrib.get('href')
            # An anchor without text or without an href cannot yield a
            # (date, location) pair; skip it rather than abort the listing.
            if a.text is None or loc is None:
                continue
            try:
                dtobj = dateparser.parse(a.text)
            except (ValueError, OverflowError):
                # Anchor text is not a recognisable date; skip this entry.
                continue
            changes.append((dtobj, loc))
        return changes


def handler(req):
    """Entry point: run ``basehandler`` with an ``IArchiveHandler('ia')``.

    ``basehandler`` is not defined in this file; presumably it comes from the
    ``baseHandler`` star import -- confirm there.
    """
    hdlr = IArchiveHandler('ia')
    return basehandler(req, hdlr)