# Version 3: Rewrite to use Wikimedia API, rather than HTML scraping

import sys, os, re
import time
import urllib2
import StringIO
from lxml import etree
from dateutil import parser as dateparser
import pprint

from baseHandler import BaseProxyHandler, basehandler


class WikiHandler(BaseProxyHandler):

    def __init__(self, d):
        BaseProxyHandler.__init__(self, d)
        # Headers sent with every API request; Wikipedia rejects requests
        # that arrive without a User-Agent.
        self.hdrs = {
            'Host': 'en.wikipedia.org',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Proxy-Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; '
                          'en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2'}

    def fetch_dom(self, wikiuri, req):
        # Fetch an API URI (following redirects) and return the parsed XML
        # root element, or an error response if the fetch or parse fails.
        try:
            ureq = urllib2.Request(wikiuri, headers=self.hdrs)
            hdlr = urllib2.HTTPRedirectHandler()
            opener = urllib2.build_opener(hdlr)
            fh = opener.open(ureq)
        except Exception, e:
            return self.error("Couldn't retrieve Wikipedia data from %s" % wikiuri,
                              req, status=404)
        data = fh.read()
        fh.close()
        try:
            dom = etree.parse(StringIO.StringIO(data))
        except etree.XMLSyntaxError:
            return self.error("Response from Wikipedia (%s) not parsable" % wikiuri,
                              req, status=500)
        return dom.getroot()

    def fetch_changes(self, req, requri, dt=None):
        # Strip the "http://en.wikipedia.org/wiki/" prefix (29 characters)
        # to recover the article title.
        title = requri[29:]
        url = ("http://en.wikipedia.org/w/api.php?format=xml&action=query"
               "&prop=revisions&meta=siteinfo&rvprop=timestamp|ids"
               "&rvlimit=500&redirects=1&titles=")
        base = "http://en.wikipedia.org/w/index.php?oldid="
        changes = []
        dom = self.fetch_dom(url + title, req)
        while dom is not None:
            # Collect a (timestamp, permalink) pair for each revision
            # in this batch.
            for r in dom.xpath('//rev'):
                changes.append((dateparser.parse(r.attrib['timestamp']),
                                base + r.attrib['revid']))
            # The API caps each response at rvlimit revisions; follow the
            # query-continue token to page through the remainder.
            cont = dom.xpath('/api/query-continue/revisions/@rvstartid')
            if cont:
                dom = self.fetch_dom(url + title + "&rvstartid=" + cont[0], req)
            else:
                dom = None
        changes.sort()
        return changes


def handler(req):
    # Entry point: delegate request handling to the shared basehandler,
    # dispatching to a WikiHandler instance.
    hdlr = WikiHandler('wiki')
    return basehandler(req, hdlr)
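
# For reference, the XML that the paging loop in fetch_changes consumes looks
# roughly like the following. This is an illustrative sketch based on the
# pre-2015 MediaWiki "query-continue" continuation format that the xpaths
# above target; the ids and timestamps shown are placeholders, not real data:
#
#   <api>
#     <query>
#       <pages>
#         <page pageid="..." title="...">
#           <revisions>
#             <rev revid="123456" timestamp="2009-07-29T12:00:00Z"/>
#             ...
#           </revisions>
#         </page>
#       </pages>
#     </query>
#     <query-continue>
#       <revisions rvstartid="123455"/>
#     </query-continue>
#   </api>
#
# When the <query-continue> element is absent, every revision has been
# retrieved and the loop terminates.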