Diffstat (limited to 'WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py')
-rw-r--r--  WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py  758
1 files changed, 0 insertions, 758 deletions
diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py
deleted file mode 100644
index 1b80e2b..0000000
--- a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py
+++ /dev/null
@@ -1,758 +0,0 @@
-"""HTTP related handlers.
-
-Note that some other HTTP handlers live in more specific modules: _auth.py,
-_gzip.py, etc.
-
-
-Copyright 2002-2006 John J Lee <jjl@pobox.com>
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD or ZPL 2.1 licenses (see the file
-COPYING.txt included with the distribution).
-
-"""
-
-import time, htmlentitydefs, logging, socket, \
-       urllib2, urllib, httplib, sgmllib
-from urllib2 import URLError, HTTPError, BaseHandler
-from cStringIO import StringIO
-
-from _clientcookie import CookieJar
-from _headersutil import is_html
-from _html import unescape, unescape_charref
-from _request import Request
-from _response import closeable_response, response_seek_wrapper
-import _rfc3986
-import _sockettimeout
-
-debug = logging.getLogger("mechanize").debug
-debug_robots = logging.getLogger("mechanize.robots").debug
-
-# monkeypatch urllib2.HTTPError to show URL
-## def urllib2_str(self):
-## return 'HTTP Error %s: %s (%s)' % (
-## self.code, self.msg, self.geturl())
-## urllib2.HTTPError.__str__ = urllib2_str
-
-
-CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
-DEFAULT_ENCODING = 'latin-1'
-
-
-try:
-    socket._fileobject("fake socket", close=True)
-except TypeError:
-    # python <= 2.4
-    create_readline_wrapper = socket._fileobject
-else:
-    def create_readline_wrapper(fh):
-        return socket._fileobject(fh, close=True)
-
-
-# This adds "refresh" to the list of redirectables and provides a redirection
-# algorithm that doesn't go into a loop in the presence of cookies
-# (Python 2.4 has this new algorithm, 2.3 doesn't).
-class HTTPRedirectHandler(BaseHandler):
-    # maximum number of redirections to any single URL
-    # this is needed because of the state that cookies introduce
-    max_repeats = 4
-    # maximum total number of redirections (regardless of URL) before
-    # assuming we're in a loop
-    max_redirections = 10
-
-    # Implementation notes:
-
-    # To avoid the server sending us into an infinite loop, the request
-    # object needs to track what URLs we have already seen. Do this by
-    # adding a handler-specific attribute to the Request object. The value
-    # of the dict is used to count the number of times the same URL has
-    # been visited. This is needed because visiting the same URL twice
-    # does not necessarily imply a loop, thanks to state introduced by
-    # cookies.
-
-    # Always unhandled redirection codes:
-    # 300 Multiple Choices: should not handle this here.
-    # 304 Not Modified: no need to handle here: only of interest to caches
-    #     that do conditional GETs
-    # 305 Use Proxy: probably not worth dealing with here
-    # 306 Unused: what was this for in previous versions of the protocol?
-
-    def redirect_request(self, newurl, req, fp, code, msg, headers):
-        """Return a Request or None in response to a redirect.
-
-        This is called by the http_error_30x methods when a redirection
-        response is received. If a redirection should take place, return a
-        new Request to allow http_error_30x to perform the redirect;
-        otherwise, return None to indicate that an HTTPError should be
-        raised.
-
-        """
-        if code in (301, 302, 303, "refresh") or \
-           (code == 307 and not req.has_data()):
-            # Strictly (according to RFC 2616), 301 or 302 in response to
-            # a POST MUST NOT cause a redirection without confirmation
-            # from the user (of urllib2, in this case). In practice,
-            # essentially all clients do redirect in this case, so we do
-            # the same.
-            # XXX refresh redirections really should be visits; tricky to
-            # fix, so this will wait until post-stable release
-            new = Request(newurl,
-                          headers=req.headers,
-                          origin_req_host=req.get_origin_req_host(),
-                          unverifiable=True,
-                          visit=False,
-                          )
-            new._origin_req = getattr(req, "_origin_req", req)
-            return new
-        else:
-            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
-
-    def http_error_302(self, req, fp, code, msg, headers):
-        # Some servers (incorrectly) return multiple Location headers
-        # (and the same probably goes for URI). Use the first header.
-        if headers.has_key('location'):
-            newurl = headers.getheaders('location')[0]
-        elif headers.has_key('uri'):
-            newurl = headers.getheaders('uri')[0]
-        else:
-            return
-        newurl = _rfc3986.clean_url(newurl, "latin-1")
-        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
-
-        # XXX Probably want to forget about the state of the current
-        # request, although that might interact poorly with other
-        # handlers that also use handler-specific request attributes
-        new = self.redirect_request(newurl, req, fp, code, msg, headers)
-        if new is None:
-            return
-
-        # loop detection
-        # .redirect_dict has a key url if url was previously visited.
-        if hasattr(req, 'redirect_dict'):
-            visited = new.redirect_dict = req.redirect_dict
-            if (visited.get(newurl, 0) >= self.max_repeats or
-                len(visited) >= self.max_redirections):
-                raise HTTPError(req.get_full_url(), code,
-                                self.inf_msg + msg, headers, fp)
-        else:
-            visited = new.redirect_dict = req.redirect_dict = {}
-        visited[newurl] = visited.get(newurl, 0) + 1
-
-        # Don't close the fp until we are sure that we won't use it
-        # with HTTPError.
-        fp.read()
-        fp.close()
-
-        return self.parent.open(new)
-
-    http_error_301 = http_error_303 = http_error_307 = http_error_302
-    http_error_refresh = http_error_302
-
-    inf_msg = "The HTTP server returned a redirect error that would " \
-              "lead to an infinite loop.\n" \
-              "The last 30x error message was:\n"
-
-
-# XXX would self.reset() work, instead of raising this exception?
-class EndOfHeadError(Exception): pass
-class AbstractHeadParser:
-    # only these elements are allowed in or before HEAD of document
-    head_elems = ("html", "head",
-                  "title", "base",
-                  "script", "style", "meta", "link", "object")
-    _entitydefs = htmlentitydefs.name2codepoint
-    _encoding = DEFAULT_ENCODING
-
-    def __init__(self):
-        self.http_equiv = []
-
-    def start_meta(self, attrs):
-        http_equiv = content = None
-        for key, value in attrs:
-            if key == "http-equiv":
-                http_equiv = self.unescape_attr_if_required(value)
-            elif key == "content":
-                content = self.unescape_attr_if_required(value)
-        if http_equiv is not None and content is not None:
-            self.http_equiv.append((http_equiv, content))
-
-    def end_head(self):
-        raise EndOfHeadError()
-
-    def handle_entityref(self, name):
-        #debug("%s", name)
-        self.handle_data(unescape(
-            '&%s;' % name, self._entitydefs, self._encoding))
-
-    def handle_charref(self, name):
-        #debug("%s", name)
-        self.handle_data(unescape_charref(name, self._encoding))
-
-    def unescape_attr(self, name):
-        #debug("%s", name)
-        return unescape(name, self._entitydefs, self._encoding)
-
-    def unescape_attrs(self, attrs):
-        #debug("%s", attrs)
-        escaped_attrs = {}
-        for key, val in attrs.items():
-            escaped_attrs[key] = self.unescape_attr(val)
-        return escaped_attrs
-
-    def unknown_entityref(self, ref):
-        self.handle_data("&%s;" % ref)
-
-    def unknown_charref(self, ref):
-        self.handle_data("&#%s;" % ref)
-
-
-try:
-    import HTMLParser
-except ImportError:
-    pass
-else:
-    class XHTMLCompatibleHeadParser(AbstractHeadParser,
-                                    HTMLParser.HTMLParser):
-        def __init__(self):
-            HTMLParser.HTMLParser.__init__(self)
-            AbstractHeadParser.__init__(self)
-
-        def handle_starttag(self, tag, attrs):
-            if tag not in self.head_elems:
-                raise EndOfHeadError()
-            try:
-                method = getattr(self, 'start_' + tag)
-            except AttributeError:
-                try:
-                    method = getattr(self, 'do_' + tag)
-                except AttributeError:
-                    pass # unknown tag
-                else:
-                    method(attrs)
-            else:
-                method(attrs)
-
-        def handle_endtag(self, tag):
-            if tag not in self.head_elems:
-                raise EndOfHeadError()
-            try:
-                method = getattr(self, 'end_' + tag)
-            except AttributeError:
-                pass # unknown tag
-            else:
-                method()
-
-        def unescape(self, name):
-            # Use the entitydefs passed into constructor, not
-            # HTMLParser.HTMLParser's entitydefs.
-            return self.unescape_attr(name)
-
-        def unescape_attr_if_required(self, name):
-            return name # HTMLParser.HTMLParser already did it
-
-class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
-
-    def _not_called(self):
-        assert False
-
-    def __init__(self):
-        sgmllib.SGMLParser.__init__(self)
-        AbstractHeadParser.__init__(self)
-
-    def handle_starttag(self, tag, method, attrs):
-        if tag not in self.head_elems:
-            raise EndOfHeadError()
-        if tag == "meta":
-            method(attrs)
-
-    def unknown_starttag(self, tag, attrs):
-        self.handle_starttag(tag, self._not_called, attrs)
-
-    def handle_endtag(self, tag, method):
-        if tag in self.head_elems:
-            method()
-        else:
-            raise EndOfHeadError()
-
-    def unescape_attr_if_required(self, name):
-        return self.unescape_attr(name)
-
-def parse_head(fileobj, parser):
-    """Return a list of key, value pairs."""
-    while 1:
-        data = fileobj.read(CHUNK)
-        try:
-            parser.feed(data)
-        except EndOfHeadError:
-            break
-        if len(data) != CHUNK:
-            # this should only happen if there is no HTML body, or if
-            # CHUNK is big
-            break
-    return parser.http_equiv
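
A usage sketch for parse_head with the SGML-based HeadParser defined above (Python 2; the HTML snippet is made up, and StringIO is the module's existing cStringIO import):

    html = StringIO('<html><head>'
                    '<meta http-equiv="Refresh" content="5; url=/next">'
                    '</head><body>ignored</body></html>')
    print parse_head(html, HeadParser())
    # -> [('Refresh', '5; url=/next')]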
-
-class HTTPEquivProcessor(BaseHandler):
- """Append META HTTP-EQUIV headers to regular HTTP headers."""
-
- handler_order = 300 # before handlers that look at HTTP headers
-
- def __init__(self, head_parser_class=HeadParser,
- i_want_broken_xhtml_support=False,
- ):
- self.head_parser_class = head_parser_class
- self._allow_xhtml = i_want_broken_xhtml_support
-
- def http_response(self, request, response):
- if not hasattr(response, "seek"):
- response = response_seek_wrapper(response)
- http_message = response.info()
- url = response.geturl()
- ct_hdrs = http_message.getheaders("content-type")
- if is_html(ct_hdrs, url, self._allow_xhtml):
- try:
- try:
- html_headers = parse_head(response,
- self.head_parser_class())
- finally:
- response.seek(0)
- except (HTMLParser.HTMLParseError,
- sgmllib.SGMLParseError):
- pass
- else:
- for hdr, val in html_headers:
- # add a header
- http_message.dict[hdr.lower()] = val
- text = hdr + ": " + val
- for line in text.split("\n"):
- http_message.headers.append(line + "\n")
- return response
-
- https_response = http_response
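
Wiring the processor into an opener makes the extracted META headers visible to later handlers and to callers; a hedged sketch (urllib2's build_opener accepts *_response processors like this one; the URL is a placeholder):

    import urllib2
    opener = urllib2.build_opener(HTTPEquivProcessor())
    response = opener.open("http://example.com/")  # placeholder URL
    # A <meta http-equiv="Content-Type" ...> in the page's HEAD now
    # shows up alongside the real HTTP headers:
    print response.info().getheader("content-type")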
-
-class HTTPCookieProcessor(BaseHandler):
-    """Handle HTTP cookies.
-
-    Public attributes:
-
-    cookiejar: CookieJar instance
-
-    """
-    def __init__(self, cookiejar=None):
-        if cookiejar is None:
-            cookiejar = CookieJar()
-        self.cookiejar = cookiejar
-
-    def http_request(self, request):
-        self.cookiejar.add_cookie_header(request)
-        return request
-
-    def http_response(self, request, response):
-        self.cookiejar.extract_cookies(response, request)
-        return response
-
-    https_request = http_request
-    https_response = http_response
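
A minimal round-trip sketch (Python 2, placeholder URL): the same jar adds Cookie headers to outgoing requests and harvests Set-Cookie headers from responses, which is the point of sharing it:

    import urllib2
    jar = CookieJar()
    opener = urllib2.build_opener(HTTPCookieProcessor(jar))
    opener.open("http://example.com/")  # placeholder URL
    for cookie in jar:  # cookies extracted from the response, if any
        print cookie.name, cookie.value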
-
-try:
-    import robotparser
-except ImportError:
-    pass
-else:
-    class MechanizeRobotFileParser(robotparser.RobotFileParser):
-
-        def __init__(self, url='', opener=None):
-            robotparser.RobotFileParser.__init__(self, url)
-            self._opener = opener
-            self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT
-
-        def set_opener(self, opener=None):
-            import _opener
-            if opener is None:
-                opener = _opener.OpenerDirector()
-            self._opener = opener
-
-        def set_timeout(self, timeout):
-            self._timeout = timeout
-
-        def read(self):
-            """Reads the robots.txt URL and feeds it to the parser."""
-            if self._opener is None:
-                self.set_opener()
-            req = Request(self.url, unverifiable=True, visit=False,
-                          timeout=self._timeout)
-            try:
-                f = self._opener.open(req)
-            except HTTPError, f:
-                pass
-            except (IOError, socket.error, OSError), exc:
-                debug_robots("ignoring error opening %r: %s" %
-                             (self.url, exc))
-                return
-            lines = []
-            line = f.readline()
-            while line:
-                lines.append(line.strip())
-                line = f.readline()
-            status = f.code
-            if status == 401 or status == 403:
-                self.disallow_all = True
-                debug_robots("disallow all")
-            elif status >= 400:
-                self.allow_all = True
-                debug_robots("allow all")
-            elif status == 200 and lines:
-                debug_robots("parse lines")
-                self.parse(lines)
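
Standalone use mirrors the stdlib robotparser interface; a sketch, assuming some `opener` object with an open(request) method exists (HTTPRobotRulesProcessor below normally supplies its parent opener via set_opener):

    rfp = MechanizeRobotFileParser("http://example.com/robots.txt")
    rfp.set_opener(opener)  # `opener` is assumed; any .open(request) object
    rfp.read()  # 401/403 -> disallow all; other >= 400 -> allow all
    print rfp.can_fetch("my-agent/1.0", "http://example.com/private/")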
-
-    class RobotExclusionError(urllib2.HTTPError):
-        def __init__(self, request, *args):
-            apply(urllib2.HTTPError.__init__, (self,)+args)
-            self.request = request
-
-    class HTTPRobotRulesProcessor(BaseHandler):
-        # before redirections, after everything else
-        handler_order = 800
-
-        try:
-            from httplib import HTTPMessage
-        except:
-            from mimetools import Message
-            http_response_class = Message
-        else:
-            http_response_class = HTTPMessage
-
-        def __init__(self, rfp_class=MechanizeRobotFileParser):
-            self.rfp_class = rfp_class
-            self.rfp = None
-            self._host = None
-
-        def http_request(self, request):
-            scheme = request.get_type()
-            if scheme not in ["http", "https"]:
-                # robots exclusion only applies to HTTP
-                return request
-
-            if request.get_selector() == "/robots.txt":
-                # /robots.txt is always OK to fetch
-                return request
-
-            host = request.get_host()
-
-            # robots.txt requests don't need to be allowed by robots.txt :-)
-            origin_req = getattr(request, "_origin_req", None)
-            if (origin_req is not None and
-                origin_req.get_selector() == "/robots.txt" and
-                origin_req.get_host() == host
-                ):
-                return request
-
-            if host != self._host:
-                self.rfp = self.rfp_class()
-                try:
-                    self.rfp.set_opener(self.parent)
-                except AttributeError:
-                    debug("%r instance does not support set_opener" %
-                          self.rfp.__class__)
-                self.rfp.set_url(scheme+"://"+host+"/robots.txt")
-                self.rfp.set_timeout(request.timeout)
-                self.rfp.read()
-                self._host = host
-
-            ua = request.get_header("User-agent", "")
-            if self.rfp.can_fetch(ua, request.get_full_url()):
-                return request
-            else:
-                # XXX This should really have raised URLError. Too late now...
-                msg = "request disallowed by robots.txt"
-                raise RobotExclusionError(
-                    request,
-                    request.get_full_url(),
-                    403, msg,
-                    self.http_response_class(StringIO()), StringIO(msg))
-
-        https_request = http_request
-
-class HTTPRefererProcessor(BaseHandler):
-    """Add Referer header to requests.
-
-    This only makes sense if each RefererProcessor is used for a single
-    chain of requests (so, for example, using a single
-    HTTPRefererProcessor to fetch a series of URLs extracted from a single
-    page will break).
-
-    There's a proper implementation of this in mechanize.Browser.
-
-    """
-    def __init__(self):
-        self.referer = None
-
-    def http_request(self, request):
-        if ((self.referer is not None) and
-            not request.has_header("Referer")):
-            request.add_unredirected_header("Referer", self.referer)
-        return request
-
-    def http_response(self, request, response):
-        self.referer = response.geturl()
-        return response
-
-    https_request = http_request
-    https_response = http_response
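
The single-chain limitation is easy to demonstrate: the processor remembers only the last response URL and stamps it on whatever request comes next. A sketch (Python 2, placeholder URLs):

    import urllib2
    opener = urllib2.build_opener(HTTPRefererProcessor())
    opener.open("http://example.com/a")
    opener.open("http://example.com/b")  # sent with Referer: .../a
    opener.open("http://example.com/c")  # sent with Referer: .../b, not .../a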
-
-
-def clean_refresh_url(url):
-    # e.g. Firefox 1.5 does (something like) this
-    if ((url.startswith('"') and url.endswith('"')) or
-        (url.startswith("'") and url.endswith("'"))):
-        url = url[1:-1]
-    return _rfc3986.clean_url(url, "latin-1") # XXX encoding
-
-def parse_refresh_header(refresh):
-    """
-    >>> parse_refresh_header("1; url=http://example.com/")
-    (1.0, 'http://example.com/')
-    >>> parse_refresh_header("1; url='http://example.com/'")
-    (1.0, 'http://example.com/')
-    >>> parse_refresh_header("1")
-    (1.0, None)
-    >>> parse_refresh_header("blah")
-    Traceback (most recent call last):
-    ValueError: invalid literal for float(): blah
-
-    """
-
-    ii = refresh.find(";")
-    if ii != -1:
-        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
-        jj = newurl_spec.find("=")
-        key = None
-        if jj != -1:
-            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
-            newurl = clean_refresh_url(newurl)
-        if key is None or key.strip().lower() != "url":
-            raise ValueError()
-    else:
-        pause, newurl = float(refresh), None
-    return pause, newurl
-
-class HTTPRefreshProcessor(BaseHandler):
-    """Perform HTTP Refresh redirections.
-
-    Note that if a non-200 HTTP code has occurred (for example, a 30x
-    redirect), this processor will do nothing.
-
-    By default, only zero-time Refresh headers are redirected. Use the
-    max_time attribute / constructor argument to allow Refresh with longer
-    pauses. Use the honor_time attribute / constructor argument to control
-    whether the requested pause is honoured (with a time.sleep()) or
-    skipped in favour of immediate redirection.
-
-    Public attributes:
-
-    max_time: see above
-    honor_time: see above
-
-    """
-    handler_order = 1000
-
-    def __init__(self, max_time=0, honor_time=True):
-        self.max_time = max_time
-        self.honor_time = honor_time
-        self._sleep = time.sleep
-
-    def http_response(self, request, response):
-        code, msg, hdrs = response.code, response.msg, response.info()
-
-        if code == 200 and hdrs.has_key("refresh"):
-            refresh = hdrs.getheaders("refresh")[0]
-            try:
-                pause, newurl = parse_refresh_header(refresh)
-            except ValueError:
-                debug("bad Refresh header: %r" % refresh)
-                return response
-
-            if newurl is None:
-                newurl = response.geturl()
-            if (self.max_time is None) or (pause <= self.max_time):
-                if pause > 1E-3 and self.honor_time:
-                    self._sleep(pause)
-                hdrs["location"] = newurl
-                # hardcoded http is NOT a bug
-                response = self.parent.error(
-                    "http", request, response,
-                    "refresh", msg, hdrs)
-            else:
-                debug("Refresh header ignored: %r" % refresh)
-
-        return response
-
-    https_response = http_response
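
A configuration sketch (values are illustrative):

    # Follow Refresh headers requesting pauses of up to 10 seconds,
    # but skip the sleep itself.
    processor = HTTPRefreshProcessor(max_time=10, honor_time=False)
    # The redirect is routed through parent.error("http", ..., "refresh",
    # ...), so the opener must also contain the HTTPRedirectHandler
    # defined above, which maps "refresh" to http_error_refresh.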
-
-class HTTPErrorProcessor(BaseHandler):
-    """Process HTTP error responses.
-
-    The purpose of this handler is to allow other response processors a
-    look-in by removing the call to parent.error() from
-    AbstractHTTPHandler.
-
-    For non-200 error codes, this just passes the job on to the
-    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
-    method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an
-    HTTPError if no other handler handles the error.
-
-    """
-    handler_order = 1000 # after all other processors
-
-    def http_response(self, request, response):
-        code, msg, hdrs = response.code, response.msg, response.info()
-
-        if code != 200:
-            # hardcoded http is NOT a bug
-            response = self.parent.error(
-                "http", request, response, code, msg, hdrs)
-
-        return response
-
-    https_response = http_response
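
For callers the net effect is the familiar urllib2 contract: anything other than a 200 surfaces as an HTTPError unless some <proto>_error_<code> handler intervenes first. A sketch (Python 2, placeholder URL):

    import urllib2
    try:
        urllib2.urlopen("http://example.com/missing")  # placeholder URL
    except urllib2.HTTPError, err:
        print err.code  # e.g. 404, raised by the default error handler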
-
-
-class HTTPDefaultErrorHandler(BaseHandler):
-    def http_error_default(self, req, fp, code, msg, hdrs):
-        # why these error methods took the code, msg, headers args in the first
-        # place rather than a response object, I don't know, but to avoid
-        # multiple wrapping, we're discarding them
-
-        if isinstance(fp, urllib2.HTTPError):
-            response = fp
-        else:
-            response = urllib2.HTTPError(
-                req.get_full_url(), code, msg, hdrs, fp)
-        assert code == response.code
-        assert msg == response.msg
-        assert hdrs == response.hdrs
-        raise response
-
-
-class AbstractHTTPHandler(BaseHandler):
-
-    def __init__(self, debuglevel=0):
-        self._debuglevel = debuglevel
-
-    def set_http_debuglevel(self, level):
-        self._debuglevel = level
-
-    def do_request_(self, request):
-        host = request.get_host()
-        if not host:
-            raise URLError('no host given')
-
-        if request.has_data(): # POST
-            data = request.get_data()
-            if not request.has_header('Content-type'):
-                request.add_unredirected_header(
-                    'Content-type',
-                    'application/x-www-form-urlencoded')
-            if not request.has_header('Content-length'):
-                request.add_unredirected_header(
-                    'Content-length', '%d' % len(data))
-
-        scheme, sel = urllib.splittype(request.get_selector())
-        sel_host, sel_path = urllib.splithost(sel)
-        if not request.has_header('Host'):
-            request.add_unredirected_header('Host', sel_host or host)
-        for name, value in self.parent.addheaders:
-            name = name.capitalize()
-            if not request.has_header(name):
-                request.add_unredirected_header(name, value)
-
-        return request
-
-    def do_open(self, http_class, req):
-        """Return an addinfourl object for the request, using http_class.
-
-        http_class must implement the HTTPConnection API from httplib.
-        The addinfourl return value is a file-like object. It also
-        has methods and attributes including:
-            - info(): return a mimetools.Message object for the headers
-            - geturl(): return the original request URL
-            - code: HTTP status code
-        """
-        host_port = req.get_host()
-        if not host_port:
-            raise URLError('no host given')
-
-        try:
-            h = http_class(host_port, timeout=req.timeout)
-        except TypeError:
-            # Python < 2.6, no per-connection timeout support
-            h = http_class(host_port)
-        h.set_debuglevel(self._debuglevel)
-
-        headers = dict(req.headers)
-        headers.update(req.unredirected_hdrs)
-        # We want to make an HTTP/1.1 request, but the addinfourl
-        # class isn't prepared to deal with a persistent connection.
-        # It will try to read all remaining data from the socket,
-        # which will block while the server waits for the next request.
-        # So make sure the connection gets closed after the (only)
-        # request.
-        headers["Connection"] = "close"
-        headers = dict(
-            [(name.title(), val) for name, val in headers.items()])
-        try:
-            h.request(req.get_method(), req.get_selector(), req.data, headers)
-            r = h.getresponse()
-        except socket.error, err: # XXX what error?
-            raise URLError(err)
-
-        # Pick apart the HTTPResponse object to get the addinfourl
-        # object initialized properly.
-
-        # Wrap the HTTPResponse object in socket's file object adapter
-        # for Windows. That adapter calls recv(), so delegate recv()
-        # to read(). This weird wrapping allows the returned object to
-        # have readline() and readlines() methods.
-
-        # XXX It might be better to extract the read buffering code
-        # out of socket._fileobject() and into a base class.
-
-        r.recv = r.read
-        fp = create_readline_wrapper(r)
-
-        resp = closeable_response(fp, r.msg, req.get_full_url(),
-                                  r.status, r.reason)
-        return resp
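
The recv-aliasing trick at the end of do_open stands on its own; a minimal sketch of the same technique against a bare httplib connection (Python 2, placeholder host):

    conn = httplib.HTTPConnection("example.com")  # placeholder host
    conn.request("GET", "/", None, {"Connection": "close"})
    r = conn.getresponse()
    r.recv = r.read  # the file-object adapter calls recv(), so alias it
    fp = create_readline_wrapper(r)
    print fp.readline()  # the response body now supports readline()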
-
-
-class HTTPHandler(AbstractHTTPHandler):
-    def http_open(self, req):
-        return self.do_open(httplib.HTTPConnection, req)
-
-    http_request = AbstractHTTPHandler.do_request_
-
-if hasattr(httplib, 'HTTPS'):
-
-    class HTTPSConnectionFactory:
-        def __init__(self, key_file, cert_file):
-            self._key_file = key_file
-            self._cert_file = cert_file
-        def __call__(self, hostport):
-            return httplib.HTTPSConnection(
-                hostport,
-                key_file=self._key_file, cert_file=self._cert_file)
-
-    class HTTPSHandler(AbstractHTTPHandler):
-        def __init__(self, client_cert_manager=None):
-            AbstractHTTPHandler.__init__(self)
-            self.client_cert_manager = client_cert_manager
-
-        def https_open(self, req):
-            if self.client_cert_manager is not None:
-                key_file, cert_file = self.client_cert_manager.find_key_cert(
-                    req.get_full_url())
-                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
-            else:
-                conn_factory = httplib.HTTPSConnection
-            return self.do_open(conn_factory, req)
-
-        https_request = AbstractHTTPHandler.do_request_
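
Any object with a find_key_cert(url) method returning a (key_file, cert_file) pair can serve as client_cert_manager; the trivial manager below is hypothetical, not part of mechanize:

    class OneCertManager:
        # hypothetical manager: the same PEM files for every URL
        def __init__(self, key_file, cert_file):
            self._pair = (key_file, cert_file)
        def find_key_cert(self, url):
            return self._pair

    handler = HTTPSHandler(
        client_cert_manager=OneCertManager("client.key", "client.crt"))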