try:
    # mod_python only exists when running under Apache; tolerating the
    # ImportError lets the module be imported from the shell for debugging.
    import cgitb
    from mod_python import apache
    from mod_python.util import FieldStorage
except ImportError:
    # called outside of apache for debugging
    pass

import os
from dateutil import parser as dateparser
from foresite import *
from foresite import conneg
from foresite.utils import now, namespaces
from rdflib import Namespace

try:
    import json
except ImportError:
    # Python < 2.6 fallback
    import simplejson as json

# URL path prefix -> serializer for TimeMap representations.
srlzHash = {
    'rdf/': RdfLibSerializer('pretty-xml'),
    'rdfxml/': RdfLibSerializer('xml'),
    'nt/': RdfLibSerializer('nt'),
    'turtle/': RdfLibSerializer('turtle'),
    'n3/': RdfLibSerializer('n3'),
    'atom/': AtomSerializer(),
    'html/': RdfLibSerializer('rdfa'),
    'json/': RdfLibSerializer('pretty-json'),
    'rdfjson/': RdfLibSerializer('json'),
}
srlzHash['json/'].mimeType = 'application/json'
srlzHash['rdfjson/'].mimeType = 'application/rdf+json'

# Reverse map (mime type -> path prefix), used for Accept-header conneg.
mimeHash = {}
for (k, v) in srlzHash.items():
    mimeHash[v.mimeType] = k
mimeStr = ', '.join(mimeHash.keys())
mimeList = conneg.parse(mimeStr)

namespaces['mem'] = Namespace('http://mementoweb.org/terms/tb/')


class BaseProxyHandler:
    """Base class for Memento proxy handlers.

    Subclasses implement fetch_changes() against a particular archive;
    this class provides the URL dispatch and the timebundle / timemap /
    timegate / event endpoints.
    """

    def __init__(self, d):
        # d: the URL directory under which this proxy is mounted.
        if d[0] != '/':
            d = '/' + d
        self.urldir = d
        # Change this to your own hostname
        self.host = "http://mementoproxy.lanl.gov"

    def send(self, data, req, status=302, ct="text/plain"):
        """Write a complete response: status, content type, length, body."""
        req.status = status
        req.content_type = ct
        req.content_length = len(data)
        req.send_http_header()
        if isinstance(data, unicode):
            req.write(data.encode('utf-8'))
        else:
            req.write(data)

    def error(self, data, req, status=400, ct="text/plain"):
        """Send an error response; returns None so callers can `return`."""
        self.send(data, req, status, ct)
        return None

    def fetch_changes(self, req, requri, dt=None):
        """Per-proxy hook: return a list of (datetime, URI) 2-tuples for requri.

        This is what to implement per proxy.
        """
        raise NotImplementedError()

    def handle_event(self, req):
        """Generate a Simile Timeline event stream in either XML or JSON."""
        urlOffset = len(self.urldir) + len('/event/')
        requri = req.uri[urlOffset:]
        if requri.startswith('xml/'):
            asXml = 1
            requri = requri[4:]
        elif requri.startswith('json/'):
            asXml = 0
            requri = requri[5:]
        else:
            asXml = 1
        if not requri.startswith('http://'):
            requri = "http://" + requri
        # Tolerate a None result so an empty stream is sent rather than crashing.
        changes = self.fetch_changes(req, requri) or []
        if asXml:
            xml = ['<data>']
        else:
            jsonl = {'dateTimeFormat': 'iso8601', 'events': []}
        for c in changes:
            created = c[0].isoformat()
            loc = c[1]
            if asXml:
                # Escape ampersands for XML (was a no-op replace('&', '&')).
                loc = loc.replace('&', '&amp;')
                # NOTE(review): the original format string had 3 args for 2
                # slots (always a TypeError); reconstructed per the Timeline
                # XML event-source format -- confirm against a working client.
                xml.append('<event start="%s" title="%s">%s</event>'
                           % (created, loc, loc))
            else:
                jsonl['events'].append(
                    {'start': created,
                     'link': loc,
                     'description': '<a href="%s">%s</a>' % (loc, loc)})
        if asXml:
            xml.append('</data>')
            self.send('\n'.join(xml), req, status=200, ct='text/xml')
        else:
            data = json.dumps(jsonl)
            self.send(data, req, status=200, ct='application/json')

    def handle_aggr(self, req):
        """Redirect /timebundle/ to the conneg-selected /timemap/ flavor (303)."""
        urlOffset = len(self.urldir) + len('/timebundle/')
        requri = req.uri[urlOffset:]
        if not requri.startswith('http://'):
            requri = "http://" + requri
        try:
            wanted = req.headers_in['Accept']
        except KeyError:
            wanted = 'application/rdf+xml'
        mts = conneg.parse(wanted)
        mt = conneg.best(mts, mimeList)
        if not mt:
            which = 'rdf/'
        else:
            which = mimeHash[str(mt)]
        location = self.urldir + '/timemap/%s%s' % (which, requri)
        req.err_headers_out['Location'] = location
        req.err_headers_out['Vary'] = "Accept"
        return self.send('', req, status=303)

    def handle_rem(self, req):
        """Generate the TimeMap (ORE Resource Map) for a resource."""
        urlOffset = len(self.urldir) + len('/timemap/')
        requri = req.uri[urlOffset:]
        srlz = None
        for (k, v) in srlzHash.items():
            if requri.startswith(k):
                srlz = v
                requri = requri[len(k):]
                break
        if not srlz:
            # unknown rem serialization, bail out
            return self.error('404: Unknown time map serialization', req,
                              status=404)
        if not requri.startswith('http://'):
            requri = "http://" + requri
        changes = self.fetch_changes(req, requri)
        if not changes:
            return self.error('404: No history for resource', req, status=404)
        aggr = Aggregation(self.host + self.urldir + '/timebundle/' + requri)
        rem = aggr.register_serialization(srlz, self.host + req.uri)
        rem._rdf.type = namespaces['mem']['TimeMap']
        aggr._dc.title = 'Memento Time Bundle for ' + requri
        aggr._rdf.type = namespaces['mem']['TimeBundle']
        # add base resource
        ar = AggregatedResource(requri)
        ar._rdf.type = namespaces['mem']['OriginalResource']
        aggr.add_resource(ar)
        # and its timegate
        ar = AggregatedResource(self.host + self.urldir + '/timegate/' + requri)
        ar._rdf.type = namespaces['mem']['TimeGate']
        aggr.add_resource(ar)
        # one Memento per archived change
        for c in changes:
            dtobj = c[0]
            loc = c[1]
            ar = AggregatedResource(loc)
            ar._dcterms.created = dtobj.isoformat()
            ar._rdf.type = namespaces['mem']['Memento']
            aggr.add_resource(ar)
        rd = rem.get_serialization()
        return self.send(rd.data, req, status=200, ct=srlz.mimeType)

    def handle_dt(self, req):
        """Handle /timegate/ datetime negotiation: 302 to the closest Memento."""
        nowd = now()
        current = dateparser.parse(nowd)
        try:
            reqdate = req.headers_in['x-accept-datetime']
        except KeyError:
            reqdate = nowd
        # strip off silly {} characters
        if reqdate.startswith("{"):
            reqdate = reqdate[1:]
        if reqdate.endswith("}"):
            reqdate = reqdate[:-1]
        # /xxx/timegate/(URL) -- use unparsed_uri to include ?bla
        urlOffset = len(self.urldir) + len('/timegate/')
        requri = req.unparsed_uri[urlOffset:]
        if not requri.startswith('http://'):
            requri = "http://" + requri
        req.err_headers_out['Vary'] = 'negotiate,X-Accept-Datetime'
        # now check info
        try:
            wanted = dateparser.parse(reqdate)
            if (wanted.tzinfo is None
                    or wanted.tzinfo.utcoffset(wanted) is None):
                # Naive date: reparse the STRING with an explicit zone.
                # (Bug fix: original called dateparser.parse(wanted) on the
                # datetime object, which always raised and produced a 400.)
                reqdate += " GMT"
                wanted = dateparser.parse(reqdate)
        except (ValueError, TypeError, OverflowError):
            return self.error("400: Couldn't parse %s" % reqdate, req,
                              status=400)
        if wanted > current:
            return self.error("406: Requested date in future", req, status=406)
        # now look for all collections in which URL might appear
        changes = self.fetch_changes(req, requri, dt=wanted)
        if changes is None:
            # subclass has already sent its own response
            return None
        elif not changes:
            return self.error(
                '406: Could not find any Mementos for resource: %s' % requri,
                req, status=406)
        # setup headers
        ore = self.host + self.urldir + '/timebundle/' + requri
        req.err_headers_out['Link'] = '<%s>;rel="%s"' % (ore, 'aggregation')
        fmtstr = "{%a, %d %b %Y %H:%M:%S GMT}"
        dt1 = changes[0][0].strftime(fmtstr)
        dtn = current.strftime(fmtstr)
        req.err_headers_out['X-Archive-Interval'] = "%s - %s" % (dt1, dtn)
        req.err_headers_out['TCN'] = 'choice'
        if wanted == current:
            # no explicit datetime: return the most recent Memento
            req.err_headers_out['Location'] = changes[-1][1]
            return self.send('', req, status=302)
        # now find the Memento closest to the requested datetime
        for c in range(len(changes)):
            this = changes[c]
            if wanted < this[0] or c == len(changes) - 1:
                if c:
                    prev = changes[c - 1]
                    tdelta1 = prev[0] - wanted
                    tdelta2 = this[0] - wanted
                    if abs(tdelta1) < abs(tdelta2):
                        loc = prev[1]
                        # Alternate before 'prev', if one exists.  Guard with
                        # c >= 2: the original indexed changes[c-2], which with
                        # c == 1 silently wraps to changes[-1] (wrong memento)
                        # instead of raising.
                        if c >= 2:
                            alts = ['{"%s" 0.8 {dt "%s"}}'
                                    % (changes[c - 2][1],
                                       changes[c - 2][0].strftime(fmtstr))]
                        else:
                            alts = []
                        alts.append('{"%s" 0.8 {dt "%s"}}'
                                    % (this[1], this[0].strftime(fmtstr)))
                    else:
                        loc = this[1]
                        alts = ['{"%s" 0.8 {dt "%s"}}'
                                % (prev[1], prev[0].strftime(fmtstr))]
                        try:
                            # NOTE(review): 'datetime' here vs 'dt' above looks
                            # inconsistent -- confirm which token TCN clients
                            # expect before normalizing.
                            alts.append('{"%s" 0.8 {datetime "%s"}}'
                                        % (changes[c + 1][1],
                                           changes[c + 1][0].strftime(fmtstr)))
                        except IndexError:
                            pass
                    req.err_headers_out['Alternates'] = ','.join(alts)
                    req.err_headers_out['Location'] = loc
                    return self.send('', req, status=302)
                else:
                    return self.error(
                        '406: Requested date before first archived copy',
                        req, status=406)
        # after last archived copy, and not wanting most recent
        return self.error(
            '406: Requested date after last archived copy: %s vs %r'
            % (wanted, changes), req, status=406)

    def handle(self, req):
        """Dispatch on the path component immediately after self.urldir."""
        urlOffset = len(self.urldir)
        path = req.uri[urlOffset:]
        if path.startswith('/timebundle/'):
            return self.handle_aggr(req)
        elif path.startswith('/timemap/'):
            return self.handle_rem(req)
        elif path.startswith('/event/'):
            return self.handle_event(req)
        elif path.startswith('/timegate/'):
            return self.handle_dt(req)
        else:
            return self.error("404: Unknown proxy command", req, status=404)


def basehandler(req, hdlr):
    """mod_python entry point: run hdlr.handle(req), rendering any
    traceback into the response via cgitb instead of a bare 500."""
    os.chdir('/home/web/mementoproxy')
    try:
        hdlr.handle(req)
    except Exception:
        req.content_type = "text/html"
        cgitb.Hook(file=req).handle()
    return apache.OK