diff options
author | Ben Murdoch <benm@google.com> | 2010-05-11 18:35:50 +0100 |
---|---|---|
committer | Ben Murdoch <benm@google.com> | 2010-05-14 10:23:05 +0100 |
commit | 21939df44de1705786c545cd1bf519d47250322d (patch) | |
tree | ef56c310f5c0cdc379c2abb2e212308a3281ce20 /WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled | |
parent | 4ff1d8891d520763f17675827154340c7c740f90 (diff) | |
download | external_webkit-21939df44de1705786c545cd1bf519d47250322d.zip external_webkit-21939df44de1705786c545cd1bf519d47250322d.tar.gz external_webkit-21939df44de1705786c545cd1bf519d47250322d.tar.bz2 |
Merge Webkit at r58956: Initial merge by Git.
Change-Id: I1d9fb60ea2c3f2ddc04c17a871acdb39353be228
Diffstat (limited to 'WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled')
43 files changed, 17801 insertions, 0 deletions
diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/.mechanize.url b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/.mechanize.url new file mode 100644 index 0000000..4186aee --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/.mechanize.url @@ -0,0 +1 @@ +http://pypi.python.org/packages/source/m/mechanize/mechanize-0.1.11.zip
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/.pep8.py.url b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/.pep8.py.url new file mode 100644 index 0000000..0fb1ef6 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/.pep8.py.url @@ -0,0 +1 @@ +http://pypi.python.org/packages/source/p/pep8/pep8-0.5.0.tar.gz#md5=512a818af9979290cd619cce8e9c2e2b
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/README b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/README new file mode 100644 index 0000000..1d68cf3 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/README @@ -0,0 +1,2 @@ +This directory is auto-generated by WebKit and is safe to delete. +It contains needed third-party Python packages automatically downloaded from the web.
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/__init__.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/__init__.py new file mode 100644 index 0000000..c1e4c6d --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/__init__.py @@ -0,0 +1 @@ +# This file is required for Python to search this directory for modules. diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/.ClientForm.py.url b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/.ClientForm.py.url new file mode 100644 index 0000000..c723abf --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/.ClientForm.py.url @@ -0,0 +1 @@ +http://pypi.python.org/packages/source/C/ClientForm/ClientForm-0.2.10.zip
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/ClientForm.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/ClientForm.py new file mode 100644 index 0000000..a622de7 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/ClientForm.py @@ -0,0 +1,3401 @@ +"""HTML form handling for web clients. + +ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It has developed from a port of Gisle +Aas' Perl module HTML::Form, from the libwww-perl library, but the +interface is not the same. + +The most useful docstring is the one for HTMLForm. + +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2007 John J. Lee <jjl@pobox.com> +Copyright 2005 Gary Poster +Copyright 2005 Zope Corporation +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +# XXX +# Remove parser testing hack +# safeUrl()-ize action +# Switch to unicode throughout (would be 0.3.x) +# See Wichert Akkerman's 2004-01-22 message to c.l.py. +# Add charset parameter to Content-type headers? How to find value?? +# Add some more functional tests +# Especially single and multiple file upload on the internet. +# Does file upload work when name is missing? Sourceforge tracker form +# doesn't like it. Check standards, and test with Apache. Test +# binary upload with Apache. 
+# mailto submission & enctype text/plain +# I'm not going to fix this unless somebody tells me what real servers +# that want this encoding actually expect: If enctype is +# application/x-www-form-urlencoded and there's a FILE control present. +# Strictly, it should be 'name=data' (see HTML 4.01 spec., section +# 17.13.2), but I send "name=" ATM. What about multiple file upload?? + +# Would be nice, but I'm not going to do it myself: +# ------------------------------------------------- +# Maybe a 0.4.x? +# Replace by_label etc. with moniker / selector concept. Allows, eg., +# a choice between selection by value / id / label / element +# contents. Or choice between matching labels exactly or by +# substring. Etc. +# Remove deprecated methods. +# ...what else? +# Work on DOMForm. +# XForms? Don't know if there's a need here. + +__all__ = ['AmbiguityError', 'CheckboxControl', 'Control', + 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm', + 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl', + 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label', + 'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile', + 'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl', + 'RadioControl', 'ScalarControl', 'SelectControl', + 'SubmitButtonControl', 'SubmitControl', 'TextControl', + 'TextareaControl', 'XHTMLCompatibleFormParser'] + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +try: + import logging + import inspect +except ImportError: + def debug(msg, *args, **kwds): + pass +else: + _logger = logging.getLogger("ClientForm") + OPTIMIZATION_HACK = True + + def debug(msg, *args, **kwds): + if OPTIMIZATION_HACK: + return + + caller_name = inspect.stack()[1][3] + extended_msg = '%%s %s' % msg + extended_args = (caller_name,)+args + debug = _logger.debug(extended_msg, *extended_args, **kwds) + + def _show_debug_messages(): + 
global OPTIMIZATION_HACK + OPTIMIZATION_HACK = False + _logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(logging.DEBUG) + _logger.addHandler(handler) + +import sys, urllib, urllib2, types, mimetools, copy, urlparse, \ + htmlentitydefs, re, random +from cStringIO import StringIO + +import sgmllib +# monkeypatch to fix http://www.python.org/sf/803422 :-( +sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") + +# HTMLParser.HTMLParser is recent, so live without it if it's not available +# (also, sgmllib.SGMLParser is much more tolerant of bad HTML) +try: + import HTMLParser +except ImportError: + HAVE_MODULE_HTMLPARSER = False +else: + HAVE_MODULE_HTMLPARSER = True + +try: + import warnings +except ImportError: + def deprecation(message, stack_offset=0): + pass +else: + def deprecation(message, stack_offset=0): + warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset) + +VERSION = "0.2.10" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +DEFAULT_ENCODING = "latin-1" + +class Missing: pass + +_compress_re = re.compile(r"\s+") +def compress_text(text): return _compress_re.sub(" ", text.strip()) + +def normalize_line_endings(text): + return re.sub(r"(?:(?<!\r)\n)|(?:\r(?!\n))", "\r\n", text) + + +# This version of urlencode is from my Python 1.5.2 back-port of the +# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence +# of pairs instead of a mapping -- the 2.0 version only accepts a mapping. +def urlencode(query,doseq=False,): + """Encode a sequence of two-element tuples or dictionary into a URL query \ +string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. 
+ """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... + try: + # non-sequence items should not work with len() + x = len(query) + # non-empty strings will fail this + if len(query) and type(query[0]) != types.TupleType: + raise TypeError() + # zero-length sequences of all types will get here and succeed, + # but that's a minor nit - since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty,va,tb = sys.exc_info() + raise TypeError("not a valid non-string sequence or mapping " + "object", tb) + + l = [] + if not doseq: + # preserve old behavior + for k, v in query: + k = urllib.quote_plus(str(k)) + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + for k, v in query: + k = urllib.quote_plus(str(k)) + if type(v) == types.StringType: + v = urllib.quote_plus(v) + l.append(k + '=' + v) + elif type(v) == types.UnicodeType: + # is there a reasonable way to convert to ASCII? + # encode generates a string, but "replace" or "ignore" + # lose information and "strict" can raise UnicodeError + v = urllib.quote_plus(v.encode("ASCII","replace")) + l.append(k + '=' + v) + else: + try: + # is this a sufficient test for sequence-ness? 
+ x = len(v) + except TypeError: + # not a sequence + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + l.append(k + '=' + urllib.quote_plus(str(elt))) + return '&'.join(l) + +def unescape(data, entities, encoding=DEFAULT_ENCODING): + if data is None or "&" not in data: + return data + + def replace_entities(match, entities=entities, encoding=encoding): + ent = match.group() + if ent[1] == "#": + return unescape_charref(ent[2:-1], encoding) + + repl = entities.get(ent) + if repl is not None: + if type(repl) != type(""): + try: + repl = repl.encode(encoding) + except UnicodeError: + repl = ent + else: + repl = ent + + return repl + + return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) + +def unescape_charref(data, encoding): + name, base = data, 10 + if name.startswith("x"): + name, base= name[1:], 16 + uc = unichr(int(name, base)) + if encoding is None: + return uc + else: + try: + repl = uc.encode(encoding) + except UnicodeError: + repl = "&#%s;" % data + return repl + +def get_entitydefs(): + import htmlentitydefs + from codecs import latin_1_decode + entitydefs = {} + try: + htmlentitydefs.name2codepoint + except AttributeError: + entitydefs = {} + for name, char in htmlentitydefs.entitydefs.items(): + uc = latin_1_decode(char)[0] + if uc.startswith("&#") and uc.endswith(";"): + uc = unescape_charref(uc[2:-1], None) + entitydefs["&%s;" % name] = uc + else: + for name, codepoint in htmlentitydefs.name2codepoint.items(): + entitydefs["&%s;" % name] = unichr(codepoint) + return entitydefs + + +def issequence(x): + try: + x[0] + except (TypeError, KeyError): + return False + except IndexError: + pass + return True + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +def choose_boundary(): + """Return a string usable as a multipart boundary.""" + # follow IE and firefox + nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2]) + return "-"*27 + nonce + +# 
This cut-n-pasted MimeWriter from standard library is here so can add +# to HTTP headers rather than message body when appropriate. It also uses +# \r\n in place of \n. This is a bit nasty. +class MimeWriter: + + """Generic MIME writer. + + Methods: + + __init__() + addheader() + flushheaders() + startbody() + startmultipartbody() + nextpart() + lastpart() + + A MIME writer is much more primitive than a MIME parser. It + doesn't seek around on the output file, and it doesn't use large + amounts of buffer space, so you have to write the parts in the + order they should occur on the output file. It does buffer the + headers you add, allowing you to rearrange their order. + + General usage is: + + f = <open the output file> + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... + + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... + + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! + + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. 
+ + - Passing a keyword argument 'prefix=<flag>' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + lines = value.split("\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = "".join(lines) + # 2.2 urllib2 doesn't normalize header case + self._http_hdrs.append((key.capitalize(), value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + lines[i].strip() + value = "\r\n".join(lines) + "\r\n" + line = key.title() + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=%s' % (name, value) + self.addheader("Content-Type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class LocateError(ValueError): pass +class AmbiguityError(LocateError): pass +class ControlNotFoundError(LocateError): pass +class ItemNotFoundError(LocateError): pass + +class ItemCountError(ValueError): pass + +# for backwards compatibility, ParseError derives from exceptions that were +# raised by versions of ClientForm <= 0.2.5 +if HAVE_MODULE_HTMLPARSER: + SGMLLIB_PARSEERROR = sgmllib.SGMLParseError + class ParseError(sgmllib.SGMLParseError, + HTMLParser.HTMLParseError, + ): + pass +else: + if hasattr(sgmllib, "SGMLParseError"): + SGMLLIB_PARSEERROR = sgmllib.SGMLParseError + class ParseError(sgmllib.SGMLParseError): + pass + else: + SGMLLIB_PARSEERROR = RuntimeError + class ParseError(RuntimeError): + pass + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances on completion.""" + # thanks to Moshe Zadka for an example of sgmllib/htmllib usage + def __init__(self, entitydefs=None, 
encoding=DEFAULT_ENCODING): + if entitydefs is None: + entitydefs = get_entitydefs() + self._entitydefs = entitydefs + self._encoding = encoding + + self.base = None + self.forms = [] + self.labels = [] + self._current_label = None + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + # forms[0] will contain all controls that are outside of any form + # self._global_form is an alias for self.forms[0] + self._global_form = None + self.start_form([]) + self.end_form() + self._current_form = self._global_form = self.forms[0] + + def do_base(self, attrs): + debug("%s", attrs) + for key, value in attrs: + if key == "href": + self.base = self.unescape_attr_if_required(value) + + def end_body(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is not self._global_form: + self.end_form() + + def start_form(self, attrs): + debug("%s", attrs) + if self._current_form is not self._global_form: + raise ParseError("nested FORMs") + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = self.unescape_attr_if_required(value) + elif key == "action": + action = self.unescape_attr_if_required(value) + elif key == "method": + method = self.unescape_attr_if_required(value.upper()) + elif key == "enctype": + enctype = self.unescape_attr_if_required(value.lower()) + d[key] = self.unescape_attr_if_required(value) + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + debug("") + if self._current_label is not None: + self.end_label() + if self._current_form is self._global_form: + raise ParseError("end of FORM before start") + self.forms.append(self._current_form) + self._current_form = self._global_form + + def start_select(self, attrs): + debug("%s", attrs) + if self._select is not None: + raise ParseError("nested 
SELECTs") + if self._textarea is not None: + raise ParseError("SELECT inside TEXTAREA") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._select = d + self._add_label(d) + + self._append_select_control({"__select": d}) + + def end_select(self): + debug("") + if self._select is None: + raise ParseError("end of SELECT before start") + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTGROUP outside of SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._optgroup = d + + def end_optgroup(self): + debug("") + if self._optgroup is None: + raise ParseError("end of OPTGROUP before start") + self._optgroup = None + + def _start_option(self, attrs): + debug("%s", attrs) + if self._select is None: + raise ParseError("OPTION outside of SELECT") + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + debug("") + if self._option is None: + raise ParseError("end of OPTION before start") + + contents = self._option.get("contents", "").strip() + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + debug("%s", attrs) + controls = self._current_form[2] + name = self._select.get("name") + 
controls.append(("select", name, attrs)) + + def start_textarea(self, attrs): + debug("%s", attrs) + if self._textarea is not None: + raise ParseError("nested TEXTAREAs") + if self._select is not None: + raise ParseError("TEXTAREA inside SELECT") + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + self._add_label(d) + + self._textarea = d + + def end_textarea(self): + debug("") + if self._textarea is None: + raise ParseError("end of TEXTAREA before start") + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def start_label(self, attrs): + debug("%s", attrs) + if self._current_label: + self.end_label() + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + taken = bool(d.get("for")) # empty id is invalid + d["__text"] = "" + d["__taken"] = taken + if taken: + self.labels.append(d) + self._current_label = d + + def end_label(self): + debug("") + label = self._current_label + if label is None: + # something is ugly in the HTML, but we're ignoring it + return + self._current_label = None + # if it is staying around, it is True in all cases + del label["__taken"] + + def _add_label(self, d): + #debug("%s", d) + if self._current_label is not None: + if not self._current_label["__taken"]: + self._current_label["__taken"] = True + d["__label"] = self._current_label + + def handle_data(self, data): + debug("%s", data) + + if self._option is not None: + # self._option is a dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + data = normalize_line_endings(data) + # not if within option or textarea + elif self._current_label is not 
None: + map = self._current_label + key = "__text" + else: + return + + if data and not map.has_key(key): + # according to + # http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break + # immediately after start tags or immediately before end tags must + # be ignored, but real browsers only ignore a line break after a + # start tag, so we'll do that. + if data[0:2] == "\r\n": + data = data[2:] + elif data[0:1] in ["\n", "\r"]: + data = data[1:] + map[key] = data + else: + map[key] = map[key] + data + + def do_button(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # e.g. type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + self._add_label(d) + controls.append((type, name, d)) + + def do_input(self, attrs): + debug("%s", attrs) + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + self._add_label(d) + controls.append((type, name, d)) + + def do_isindex(self, attrs): + debug("%s", attrs) + d = {} + for key, val in attrs: + d[key] = self.unescape_attr_if_required(val) + controls = self._current_form[2] + + self._add_label(d) + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + + def handle_entityref(self, name): + #debug("%s", name) + self.handle_data(unescape( + '&%s;' % name, self._entitydefs, self._encoding)) + + def handle_charref(self, name): + #debug("%s", name) + self.handle_data(unescape_charref(name, self._encoding)) + + def unescape_attr(self, name): + #debug("%s", name) + return unescape(name, self._entitydefs, 
self._encoding) + + def unescape_attrs(self, attrs): + #debug("%s", attrs) + escaped_attrs = {} + for key, val in attrs.items(): + try: + val.items + except AttributeError: + escaped_attrs[key] = self.unescape_attr(val) + else: + # e.g. "__select" -- yuck! + escaped_attrs[key] = self.unescape_attrs(val) + return escaped_attrs + + def unknown_entityref(self, ref): self.handle_data("&%s;" % ref) + def unknown_charref(self, ref): self.handle_data("&#%s;" % ref) + + +if not HAVE_MODULE_HTMLPARSER: + class XHTMLCompatibleFormParser: + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + raise ValueError("HTMLParser could not be imported") +else: + class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser): + """Good for XHTML, bad for tolerance of incorrect HTML.""" + # thanks to Michael Howitz for this! + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + HTMLParser.HTMLParser.feed(self, data) + except HTMLParser.HTMLParseError, exc: + raise ParseError(exc) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, "start_" + tag) + except AttributeError: + try: + method = getattr(self, "do_" + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, "end_" + tag) + except AttributeError: + pass # unknown tag + else: + method() + + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + + def unescape_attr_if_required(self, name): + return name # HTMLParser.HTMLParser already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + + def close(self): + HTMLParser.HTMLParser.close(self) + self.end_body() + + +class _AbstractSgmllibParser(_AbstractFormParser): + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + if sys.version_info[:2] >= (2,5): + # we override this attr to decode hex charrefs + entity_or_charref = re.compile( + '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)') + def convert_entityref(self, name): + return unescape("&%s;" % name, self._entitydefs, self._encoding) + def convert_charref(self, name): + return unescape_charref("%s" % name, self._encoding) + def unescape_attr_if_required(self, name): + return name # sgmllib already did it + def unescape_attrs_if_required(self, attrs): + return attrs # ditto + else: + def unescape_attr_if_required(self, name): + return self.unescape_attr(name) + def unescape_attrs_if_required(self, attrs): + return self.unescape_attrs(attrs) + + +class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser): + """Good for tolerance of incorrect HTML, bad for XHTML.""" + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + sgmllib.SGMLParser.__init__(self) + _AbstractFormParser.__init__(self, entitydefs, encoding) + + def feed(self, data): + try: + sgmllib.SGMLParser.feed(self, data) + except SGMLLIB_PARSEERROR, exc: + raise ParseError(exc) + + def close(self): + sgmllib.SGMLParser.close(self) + self.end_body() + + +# sigh, must support mechanize by allowing dynamic creation of classes based on +# its bundled copy of BeautifulSoup (which was necessary because of dependency +# problems) + +def _create_bs_classes(bs, + icbinbs, + ): + class _AbstractBSFormParser(_AbstractSgmllibParser): + bs_base_class = None + def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): + 
_AbstractFormParser.__init__(self, entitydefs, encoding) + self.bs_base_class.__init__(self) + def handle_data(self, data): + _AbstractFormParser.handle_data(self, data) + self.bs_base_class.handle_data(self, data) + def feed(self, data): + try: + self.bs_base_class.feed(self, data) + except SGMLLIB_PARSEERROR, exc: + raise ParseError(exc) + def close(self): + self.bs_base_class.close(self) + self.end_body() + + class RobustFormParser(_AbstractBSFormParser, bs): + """Tries to be highly tolerant of incorrect HTML.""" + pass + RobustFormParser.bs_base_class = bs + class NestingRobustFormParser(_AbstractBSFormParser, icbinbs): + """Tries to be highly tolerant of incorrect HTML. + + Different from RobustFormParser in that it more often guesses nesting + above missing end tags (see BeautifulSoup docs). + + """ + pass + NestingRobustFormParser.bs_base_class = icbinbs + + return RobustFormParser, NestingRobustFormParser + +try: + if sys.version_info[:2] < (2, 2): + raise ImportError # BeautifulSoup uses generators + import BeautifulSoup +except ImportError: + pass +else: + RobustFormParser, NestingRobustFormParser = _create_bs_classes( + BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup + ) + __all__ += ['RobustFormParser', 'NestingRobustFormParser'] + + +#FormParser = XHTMLCompatibleFormParser # testing hack +#FormParser = RobustFormParser # testing hack + + +def ParseResponseEx(response, + select_default=False, + form_parser_class=FormParser, + request_class=urllib2.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseResponse, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. 
Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(response, response.geturl(), + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseFileEx(file, base_uri, + select_default=False, + form_parser_class=FormParser, + request_class=urllib2.Request, + entitydefs=None, + encoding=DEFAULT_ENCODING, + + # private + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + """Identical to ParseFile, except that: + + 1. The returned list contains an extra item. The first form in the list + contains all controls not contained in any FORM element. + + 2. The arguments ignore_errors and backwards_compat have been removed. + + 3. Backwards-compatibility mode (backwards_compat=True) is not available. + """ + return _ParseFileEx(file, base_uri, + select_default, + False, + form_parser_class, + request_class, + entitydefs, + False, + encoding, + _urljoin=_urljoin, + _urlparse=_urlparse, + _urlunparse=_urlunparse, + ) + +def ParseResponse(response, *args, **kwds): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of urllib2.urlopen can be conveniently passed to this + function as the response parameter. + + ClientForm.ParseError is raised on parse errors. 
+ + response: file-like object (supporting read() method) with a method + geturl(), returning the URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + form_parser_class: class to instantiate and use to pass + request_class: class to return from .click() method (default is + urllib2.Request) + entitydefs: mapping like {"&": "&", ...} containing HTML entity + definitions (a sensible default is used) + encoding: character encoding used for encoding numeric character references + when matching link text. ClientForm does not attempt to find the encoding + in a META HTTP-EQUIV attribute in the document itself (mechanize, for + example, does do that and will pass the correct value to ClientForm using + this parameter). + + backwards_compat: boolean that determines whether the returned HTMLForm + objects are backwards-compatible with old code. If backwards_compat is + true: + + - ClientForm 0.1 code will continue to work as before. + + - Label searches that do not specify a nr (number or count) will always + get the first match, even if other controls match. If + backwards_compat is False, label searches that have ambiguous results + will raise an AmbiguityError. + + - Item label matching is done by strict string comparison rather than + substring matching. + + - De-selecting individual list items is allowed even if the Item is + disabled. + + The backwards_compat argument will be deprecated in a future release. + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. 
The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses + HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses + sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML. + Note that HTMLParser is only available in Python 2.2 and later. You can + pass your own class in here as a hack to work around bad HTML, but at your + own risk: there is no well-defined interface. + + """ + return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:] + +def ParseFile(file, base_uri, *args, **kwds): + """Parse HTML and return a list of HTMLForm instances. + + ClientForm.ParseError is raised on parse errors. + + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the URI of the document (note that the base URI used to submit + the form will be that given in the BASE element if present, not that of + the document) + + For the other arguments and further details, see ParseResponse.__doc__. 
+ + """ + return _ParseFileEx(file, base_uri, *args, **kwds)[1:] + +def _ParseFileEx(file, base_uri, + select_default=False, + ignore_errors=False, + form_parser_class=FormParser, + request_class=urllib2.Request, + entitydefs=None, + backwards_compat=True, + encoding=DEFAULT_ENCODING, + _urljoin=urlparse.urljoin, + _urlparse=urlparse.urlparse, + _urlunparse=urlparse.urlunparse, + ): + if backwards_compat: + deprecation("operating in backwards-compatibility mode", 1) + fp = form_parser_class(entitydefs, encoding) + while 1: + data = file.read(CHUNK) + try: + fp.feed(data) + except ParseError, e: + e.base_uri = base_uri + raise + if len(data) != CHUNK: break + fp.close() + if fp.base is not None: + # HTML BASE element takes precedence over document URI + base_uri = fp.base + labels = [] # Label(label) for label in fp.labels] + id_to_labels = {} + for l in fp.labels: + label = Label(l) + labels.append(label) + for_id = l["for"] + coll = id_to_labels.get(for_id) + if coll is None: + id_to_labels[for_id] = [label] + else: + coll.append(label) + forms = [] + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = _urljoin(base_uri, action) + # would be nice to make HTMLForm class (form builder) pluggable + form = HTMLForm( + action, method, enctype, name, attrs, request_class, + forms, labels, id_to_labels, backwards_compat) + form._urlparse = _urlparse + form._urlunparse = _urlunparse + for ii in range(len(controls)): + type, name, attrs = controls[ii] + # index=ii*10 allows ImageControl to return multiple ordered pairs + form.new_control( + type, name, attrs, select_default=select_default, index=ii*10) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class Label: + def __init__(self, attrs): + self.id = attrs.get("for") + self._text = attrs.get("__text").strip() + self._ctext = compress_text(self._text) + self.attrs = attrs + self._backwards_compat = False # maintained by 
HTMLForm + + def __getattr__(self, name): + if name == "text": + if self._backwards_compat: + return self._text + else: + return self._ctext + return getattr(Label, name) + + def __setattr__(self, name, value): + if name == "text": + # don't see any need for this, so make it read-only + raise AttributeError("text attribute is read-only") + self.__dict__[name] = value + + def __str__(self): + return "<Label(id=%r, text=%r)>" % (self.id, self.text) + + +def _get_label(attrs): + text = attrs.get("__label") + if text is not None: + return Label(text) + else: + return None + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. The Controls in an HTMLForm + are accessed using the HTMLForm.find_control method or the + HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions. If you use those functions, you can ignore the + rest of this paragraph. A Control is only properly initialised after the + fixup method has been called. In fact, this is only strictly necessary for + ListControl instances. This is necessary because ListControls are built up + from ListControls each containing only a single item, and their initial + value(s) can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by 'greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + 'successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. 
+ + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. + + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. + + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs, index=None): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + self._form = form + form.controls.append(self) + + def fixup(self): + pass + + def is_of_kind(self, kind): + raise NotImplementedError() + + def clear(self): + raise NotImplementedError() + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. + """ + return [(k, v) for (i, k, v) in self._totally_ordered_pairs()] + + def _totally_ordered_pairs(self): + """Return list of (key, value, index) tuples. 
+ + Like pairs, but allows preserving correct ordering even where several + controls are involved. + + """ + raise NotImplementedError() + + def _write_mime_data(self, mw, name, value): + """Write data for a subitem of this control to a MimeWriter.""" + # called by HTMLForm + mw2 = mw.nextpart() + mw2.addheader("Content-Disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + def get_labels(self): + """Return all labels (Label instances) for this control. + + If the control was surrounded by a <label> tag, that will be the first + label; all other labels, connected by 'for' and 'id', are in the order + that appear in the HTML. + + """ + res = [] + if self._label: + res.append(self._label) + if self.id: + res.extend(self._form._id_to_labels.get(self.id, ())) + return res + + +#--------------------------------------------------- +class ScalarControl(Control): + """Control whose value is not restricted to one of a prescribed set. + + Some ScalarControls don't accept any value attribute. Otherwise, takes a + single value, which must be string-like. 
+ + Additional read-only public attribute: + + attrs: dictionary mapping the names of original HTML attributes of the + control to their values + + """ + def __init__(self, type, name, attrs, index=None): + self._index = index + self._label = _get_label(attrs) + self.__dict__["type"] = type.lower() + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = attrs.has_key("disabled") + self.readonly = attrs.has_key("readonly") + self.id = attrs.get("id") + + self.attrs = attrs.copy() + + self._clicked = False + + self._urlparse = urlparse.urlparse + self._urlunparse = urlparse.urlunparse + + def __getattr__(self, name): + if name == "value": + return self.__dict__["_value"] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if not isstringlike(value): + raise TypeError("must assign a string") + elif self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + elif self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + self.__dict__["_value"] = value + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _totally_ordered_pairs(self): + name = self.name + value = self.value + if name is None or value is None or self.disabled: + return [] + return [(self._index, name, value)] + + def clear(self): + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self.__dict__["_value"] = None + + def __str__(self): + name = self.name + value = self.value + if name is None: name = "<None>" + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + 
+#--------------------------------------------------- +class TextControl(ScalarControl): + """Textual input control. + + Covers: + + INPUT/TEXT + INPUT/PASSWORD + INPUT/HIDDEN + TEXTAREA + + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + if self.type == "hidden": self.readonly = True + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind == "text" + +#--------------------------------------------------- +class FileControl(ScalarControl): + """File upload with INPUT TYPE=FILE. + + The value attribute of a FileControl is always None. Use add_file instead. + + Additional public method: add_file + + """ + + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + self._value = None + self._upload_data = [] + + def is_of_kind(self, kind): return kind == "file" + + def clear(self): + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._upload_data = [] + + def __setattr__(self, name, value): + if name in ("value", "name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def add_file(self, file_object, content_type=None, filename=None): + if not hasattr(file_object, "read"): + raise TypeError("file-like object must have read method") + if content_type is not None and not isstringlike(content_type): + raise TypeError("content type must be None or string-like") + if filename is not None and not isstringlike(filename): + raise TypeError("filename must be None or string-like") + if content_type is None: + content_type = "application/octet-stream" + self._upload_data.append((file_object, content_type, filename)) + + def _totally_ordered_pairs(self): + # XXX should it be successful even if unnamed? 
+ if self.name is None or self.disabled: + return [] + return [(self._index, self.name, "")] + + def _write_mime_data(self, mw, _name, _value): + # called by HTMLForm + # assert _name == self.name and _value == '' + if len(self._upload_data) < 2: + if len(self._upload_data) == 0: + file_object = StringIO() + content_type = "application/octet-stream" + filename = "" + else: + file_object, content_type, filename = self._upload_data[0] + if filename is None: + filename = "" + mw2 = mw.nextpart() + fn_part = '; filename="%s"' % filename + disp = 'form-data; name="%s"%s' % (self.name, fn_part) + mw2.addheader("Content-Disposition", disp, prefix=1) + fh = mw2.startbody(content_type, prefix=0) + fh.write(file_object.read()) + else: + # multiple files + mw2 = mw.nextpart() + disp = 'form-data; name="%s"' % self.name + mw2.addheader("Content-Disposition", disp, prefix=1) + fh = mw2.startmultipartbody("mixed", prefix=0) + for file_object, content_type, filename in self._upload_data: + mw3 = mw2.nextpart() + if filename is None: + filename = "" + fn_part = '; filename="%s"' % filename + disp = "file%s" % fn_part + mw3.addheader("Content-Disposition", disp, prefix=1) + fh2 = mw3.startbody(content_type, prefix=0) + fh2.write(file_object.read()) + mw2.lastpart() + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + if not self._upload_data: + value = "<No files added>" + else: + value = [] + for file, ctype, filename in self._upload_data: + if filename is None: + value.append("<Unnamed file>") + else: + value.append(filename) + value = ", ".join(value) + + info = [] + if self.disabled: info.append("disabled") + if self.readonly: info.append("readonly") + info = ", ".join(info) + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class IsindexControl(ScalarControl): + """ISINDEX control. + + ISINDEX is the odd-one-out of HTML form controls. 
In fact, it isn't really + part of regular HTML forms at all, and predates it. You're only allowed + one ISINDEX per HTML document. ISINDEX and regular form submission are + mutually exclusive -- either submit a form, or the ISINDEX. + + Having said this, since ISINDEX controls may appear in forms (which is + probably bad HTML), ParseFile / ParseResponse will include them in the + HTMLForm instances it returns. You can set the ISINDEX's value, as with + any other control (but note that ISINDEX controls have no name, so you'll + need to use the type argument of set_value!). When you submit the form, + the ISINDEX will not be successful (ie., no data will get returned to the + server as a result of its presence), unless you click on the ISINDEX + control, in which case the ISINDEX gets submitted instead of the form: + + form.set_value("my isindex value", type="isindex") + urllib2.urlopen(form.click(type="isindex")) + + ISINDEX elements outside of FORMs are ignored. If you want to submit one + by hand, do it like so: + + url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value")) + result = urllib2.urlopen(url) + + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + if self._value is None: + self._value = "" + + def is_of_kind(self, kind): return kind in ["text", "clickable"] + + def _totally_ordered_pairs(self): + return [] + + def _click(self, form, coord, return_type, request_class=urllib2.Request): + # Relative URL for ISINDEX submission: instead of "foo=bar+baz", + # want "bar+baz". + # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is + # deprecated in 4.01, but it should still say how to submit it). + # Submission of ISINDEX is explained in the HTML 3.2 spec, though. 
+ parts = self._urlparse(form.action) + rest, (query, frag) = parts[:-2], parts[-2:] + parts = rest + (urllib.quote_plus(self.value), None) + url = self._urlunparse(parts) + req_data = url, None, [] + + if return_type == "pairs": + return [] + elif return_type == "request_data": + return req_data + else: + return request_class(url) + + def __str__(self): + value = self.value + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s)%s>" % (self.__class__.__name__, value, info) + + +#--------------------------------------------------- +class IgnoreControl(ScalarControl): + """Control that we're not interested in. + + Covers: + + INPUT/RESET + BUTTON/RESET + INPUT/BUTTON + BUTTON/BUTTON + + These controls are always unsuccessful, in the terminology of HTML 4 (ie. + they never require any information to be returned to the server). + + BUTTON/BUTTON is used to generate events for script embedded in HTML. + + The value attribute of IgnoreControl is always None. 
+ + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + self._value = None + + def is_of_kind(self, kind): return False + + def __setattr__(self, name, value): + if name == "value": + raise AttributeError( + "control '%s' is ignored, hence read-only" % self.name) + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + +#--------------------------------------------------- +# ListControls + +# helpers and subsidiary classes + +class Item: + def __init__(self, control, attrs, index=None): + label = _get_label(attrs) + self.__dict__.update({ + "name": attrs["value"], + "_labels": label and [label] or [], + "attrs": attrs, + "_control": control, + "disabled": attrs.has_key("disabled"), + "_selected": False, + "id": attrs.get("id"), + "_index": index, + }) + control.items.append(self) + + def get_labels(self): + """Return all labels (Label instances) for this item. + + For items that represent radio buttons or checkboxes, if the item was + surrounded by a <label> tag, that will be the first label; all other + labels, connected by 'for' and 'id', are in the order that appear in + the HTML. + + For items that represent select options, if the option had a label + attribute, that will be the first label. If the option has contents + (text within the option tags) and it is not the same as the label + attribute (if any), that will be a label. There is nothing in the + spec to my knowledge that makes an option with an id unable to be the + target of a label's for attribute, so those are included, if any, for + the sake of consistency and completeness. 
+ + """ + res = [] + res.extend(self._labels) + if self.id: + res.extend(self._control._form._id_to_labels.get(self.id, ())) + return res + + def __getattr__(self, name): + if name=="selected": + return self._selected + raise AttributeError(name) + + def __setattr__(self, name, value): + if name == "selected": + self._control._set_selected_state(self, value) + elif name == "disabled": + self.__dict__["disabled"] = bool(value) + else: + raise AttributeError(name) + + def __str__(self): + res = self.name + if self.selected: + res = "*" + res + if self.disabled: + res = "(%s)" % res + return res + + def __repr__(self): + # XXX appending the attrs without distinguishing them from name and id + # is silly + attrs = [("name", self.name), ("id", self.id)]+self.attrs.items() + return "<%s %s>" % ( + self.__class__.__name__, + " ".join(["%s=%r" % (k, v) for k, v in attrs]) + ) + +def disambiguate(items, nr, **kwds): + msgs = [] + for key, value in kwds.items(): + msgs.append("%s=%r" % (key, value)) + msg = " ".join(msgs) + if not items: + raise ItemNotFoundError(msg) + if nr is None: + if len(items) > 1: + raise AmbiguityError(msg) + nr = 0 + if len(items) <= nr: + raise ItemNotFoundError(msg) + return items[nr] + +class ListControl(Control): + """Control representing a sequence of items. + + The value attribute of a ListControl represents the successful list items + in the control. The successful list items are those that are selected and + not disabled. + + ListControl implements both list controls that take a length-1 value + (single-selection) and those that take length >1 values + (multiple-selection). + + ListControls accept sequence values only. Some controls only accept + sequences of length 0 or 1 (RADIO, and single-selection SELECT). + In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes + and multiple-selection SELECTs (those having the "multiple" HTML attribute) + accept sequences of any length. 
+ + Note the following mistake: + + control.value = some_value + assert control.value == some_value # not necessarily true + + The reason for this is that the value attribute always gives the list items + in the order they were listed in the HTML. + + ListControl items can also be referred to by their labels instead of names. + Use the label argument to .get(), and the .set_value_by_label(), + .get_value_by_label() methods. + + Note that, rather confusingly, though SELECT controls are represented in + HTML by SELECT elements (which contain OPTION elements, representing + individual list items), CHECKBOXes and RADIOs are not represented by *any* + element. Instead, those controls are represented by a collection of INPUT + elements. For example, this is a SELECT control, named "control1": + + <select name="control1"> + <option>foo</option> + <option value="1">bar</option> + </select> + + and this is a CHECKBOX control, named "control2": + + <input type="checkbox" name="control2" value="foo" id="cbe1"> + <input type="checkbox" name="control2" value="bar" id="cbe2"> + + The id attribute of a CHECKBOX or RADIO ListControl is always that of its + first element (for example, "cbe1" above). + + + Additional read-only public attribute: multiple. + + """ + + # ListControls are built up by the parser from their component items by + # creating one ListControl per item, consolidating them into a single + # master ListControl held by the HTMLForm: + + # -User calls form.new_control(...) + # -Form creates Control, and calls control.add_to_form(self). + # -Control looks for a Control with the same name and type in the form, + # and if it finds one, merges itself with that control by calling + # control.merge_control(self). The first Control added to the form, of + # a particular name and type, is the only one that survives in the + # form. + # -Form calls control.fixup for all its controls. ListControls in the + # form know they can now safely pick their default values. 
+ + # To create a ListControl without an HTMLForm, use: + + # control.merge_control(new_control) + + # (actually, it's much easier just to use ParseFile) + + _label = None + + def __init__(self, type, name, attrs={}, select_default=False, + called_as_base_class=False, index=None): + """ + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present + + """ + if not called_as_base_class: + raise NotImplementedError() + + self.__dict__["type"] = type.lower() + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = False + self.readonly = False + self.id = attrs.get("id") + self._closed = False + + # As Controls are merged in with .merge_control(), self.attrs will + # refer to each Control in turn -- always the most recently merged + # control. Each merged-in Control instance corresponds to a single + # list item: see ListControl.__doc__. + self.items = [] + self._form = None + + self._select_default = select_default + self._clicked = False + + def clear(self): + self.value = [] + + def is_of_kind(self, kind): + if kind == "list": + return True + elif kind == "multilist": + return bool(self.multiple) + elif kind == "singlelist": + return not self.multiple + else: + return False + + def get_items(self, name=None, label=None, id=None, + exclude_disabled=False): + """Return matching items by name or label. 
+ + For argument docs, see the docstring for .get() + + """ + if name is not None and not isstringlike(name): + raise TypeError("item name must be string-like") + if label is not None and not isstringlike(label): + raise TypeError("item label must be string-like") + if id is not None and not isstringlike(id): + raise TypeError("item id must be string-like") + items = [] # order is important + compat = self._form.backwards_compat + for o in self.items: + if exclude_disabled and o.disabled: + continue + if name is not None and o.name != name: + continue + if label is not None: + for l in o.get_labels(): + if ((compat and l.text == label) or + (not compat and l.text.find(label) > -1)): + break + else: + continue + if id is not None and o.id != id: + continue + items.append(o) + return items + + def get(self, name=None, label=None, id=None, nr=None, + exclude_disabled=False): + """Return item by name or label, disambiguating if necessary with nr. + + All arguments must be passed by name, with the exception of 'name', + which may be used as a positional argument. + + If name is specified, then the item must have the indicated name. + + If label is specified, then the item must have a label whose + whitespace-compressed, stripped, text substring-matches the indicated + label string (eg. label="please choose" will match + " Do please choose an item "). + + If id is specified, then the item must have the indicated id. + + nr is an optional 0-based index of the items matching the query. + + If nr is the default None value and more than item is found, raises + AmbiguityError (unless the HTMLForm instance's backwards_compat + attribute is true). + + If no item is found, or if items are found but nr is specified and not + found, raises ItemNotFoundError. + + Optionally excludes disabled items. 
+ + """ + if nr is None and self._form.backwards_compat: + nr = 0 # :-/ + items = self.get_items(name, label, id, exclude_disabled) + return disambiguate(items, nr, name=name, label=label, id=id) + + def _get(self, name, by_label=False, nr=None, exclude_disabled=False): + # strictly for use by deprecated methods + if by_label: + name, label = None, name + else: + name, label = name, None + return self.get(name, label, nr, exclude_disabled) + + def toggle(self, name, by_label=False, nr=None): + """Deprecated: given a name or label and optional disambiguating index + nr, toggle the matching item's selection. + + Selecting items follows the behavior described in the docstring of the + 'get' method. + + if the item is disabled, or this control is disabled or readonly, + raise AttributeError. + + """ + deprecation( + "item = control.get(...); item.selected = not item.selected") + o = self._get(name, by_label, nr) + self._set_selected_state(o, not o.selected) + + def set(self, selected, name, by_label=False, nr=None): + """Deprecated: given a name or label and optional disambiguating index + nr, set the matching item's selection to the bool value of selected. + + Selecting items follows the behavior described in the docstring of the + 'get' method. + + if the item is disabled, or this control is disabled or readonly, + raise AttributeError. 
+ + """ + deprecation( + "control.get(...).selected = <boolean>") + self._set_selected_state(self._get(name, by_label, nr), selected) + + def _set_selected_state(self, item, action): + # action: + # bool False: off + # bool True: on + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + action == bool(action) + compat = self._form.backwards_compat + if not compat and item.disabled: + raise AttributeError("item is disabled") + else: + if compat and item.disabled and action: + raise AttributeError("item is disabled") + if self.multiple: + item.__dict__["_selected"] = action + else: + if not action: + item.__dict__["_selected"] = False + else: + for o in self.items: + o.__dict__["_selected"] = False + item.__dict__["_selected"] = True + + def toggle_single(self, by_label=None): + """Deprecated: toggle the selection of the single item in this control. + + Raises ItemCountError if the control does not contain only one item. + + by_label argument is ignored, and included only for backwards + compatibility. + + """ + deprecation( + "control.items[0].selected = not control.items[0].selected") + if len(self.items) != 1: + raise ItemCountError( + "'%s' is not a single-item control" % self.name) + item = self.items[0] + self._set_selected_state(item, not item.selected) + + def set_single(self, selected, by_label=None): + """Deprecated: set the selection of the single item in this control. + + Raises ItemCountError if the control does not contain only one item. + + by_label argument is ignored, and included only for backwards + compatibility. 
+ + """ + deprecation( + "control.items[0].selected = <boolean>") + if len(self.items) != 1: + raise ItemCountError( + "'%s' is not a single-item control" % self.name) + self._set_selected_state(self.items[0], selected) + + def get_item_disabled(self, name, by_label=False, nr=None): + """Get disabled state of named list item in a ListControl.""" + deprecation( + "control.get(...).disabled") + return self._get(name, by_label, nr).disabled + + def set_item_disabled(self, disabled, name, by_label=False, nr=None): + """Set disabled state of named list item in a ListControl. + + disabled: boolean disabled state + + """ + deprecation( + "control.get(...).disabled = <boolean>") + self._get(name, by_label, nr).disabled = disabled + + def set_all_items_disabled(self, disabled): + """Set disabled state of all list items in a ListControl. + + disabled: boolean disabled state + + """ + for o in self.items: + o.disabled = disabled + + def get_item_attrs(self, name, by_label=False, nr=None): + """Return dictionary of HTML attributes for a single ListControl item. + + The HTML element types that describe list items are: OPTION for SELECT + controls, INPUT for the rest. These elements have HTML attributes that + you may occasionally want to know about -- for example, the "alt" HTML + attribute gives a text string describing the item (graphical browsers + usually display this as a tooltip). + + The returned dictionary maps HTML attribute names to values. The names + and values are taken from the original HTML. 
+ + """ + deprecation( + "control.get(...).attrs") + return self._get(name, by_label, nr).attrs + + def close_control(self): + self._closed = True + + def add_to_form(self, form): + assert self._form is None or form == self._form, ( + "can't add control to more than one form") + self._form = form + if self.name is None: + # always count nameless elements as separate controls + Control.add_to_form(self, form) + else: + for ii in range(len(form.controls)-1, -1, -1): + control = form.controls[ii] + if control.name == self.name and control.type == self.type: + if control._closed: + Control.add_to_form(self, form) + else: + control.merge_control(self) + break + else: + Control.add_to_form(self, form) + + def merge_control(self, control): + assert bool(control.multiple) == bool(self.multiple) + # usually, isinstance(control, self.__class__) + self.items.extend(control.items) + + def fixup(self): + """ + ListControls are built up from component list items (which are also + ListControls) during parsing. This method should be called after all + items have been added. See ListControl.__doc__ for the reason this is + required. + + """ + # Need to set default selection where no item was indicated as being + # selected by the HTML: + + # CHECKBOX: + # Nothing should be selected. + # SELECT/single, SELECT/multiple and RADIO: + # RFC 1866 (HTML 2.0): says first item should be selected. + # W3C HTML 4.01 Specification: says that client behaviour is + # undefined in this case. For RADIO, exactly one must be selected, + # though which one is undefined. + # Both Netscape and Microsoft Internet Explorer (IE) choose first + # item for SELECT/single. However, both IE5 and Mozilla (both 1.0 + # and Firebird 0.6) leave all items unselected for RADIO and + # SELECT/multiple. + + # Since both Netscape and IE all choose the first item for + # SELECT/single, we do the same. 
OTOH, both Netscape and IE + # leave SELECT/multiple with nothing selected, in violation of RFC 1866 + # (but not in violation of the W3C HTML 4 standard); the same is true + # of RADIO (which *is* in violation of the HTML 4 standard). We follow + # RFC 1866 if the _select_default attribute is set, and Netscape and IE + # otherwise. RFC 1866 and HTML 4 are always violated insofar as you + # can deselect all items in a RadioControl. + + for o in self.items: + # set items' controls to self, now that we've merged + o.__dict__["_control"] = self + + def __getattr__(self, name): + if name == "value": + compat = self._form.backwards_compat + if self.name is None: + return [] + return [o.name for o in self.items if o.selected and + (not o.disabled or compat)] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + if self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + self._set_value(value) + elif name in ("name", "type", "multiple"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def _set_value(self, value): + if value is None or isstringlike(value): + raise TypeError("ListControl, must set a sequence") + if not value: + compat = self._form.backwards_compat + for o in self.items: + if not o.disabled or compat: + o.selected = False + elif self.multiple: + self._multiple_set_value(value) + elif len(value) > 1: + raise ItemCountError( + "single selection list, must set sequence of " + "length 0 or 1") + else: + self._single_set_value(value) + + def _get_items(self, name, target=1): + all_items = self.get_items(name) + items = [o for o in all_items if not o.disabled] + if len(items) < target: + if len(all_items) < target: + raise ItemNotFoundError( + "insufficient items with name %r" % name) + else: + 
raise AttributeError( + "insufficient non-disabled items with name %s" % name) + on = [] + off = [] + for o in items: + if o.selected: + on.append(o) + else: + off.append(o) + return on, off + + def _single_set_value(self, value): + assert len(value) == 1 + on, off = self._get_items(value[0]) + assert len(on) <= 1 + if not on: + off[0].selected = True + + def _multiple_set_value(self, value): + compat = self._form.backwards_compat + turn_on = [] # transactional-ish + turn_off = [item for item in self.items if + item.selected and (not item.disabled or compat)] + names = {} + for nn in value: + if nn in names.keys(): + names[nn] += 1 + else: + names[nn] = 1 + for name, count in names.items(): + on, off = self._get_items(name, count) + for i in range(count): + if on: + item = on[0] + del on[0] + del turn_off[turn_off.index(item)] + else: + item = off[0] + del off[0] + turn_on.append(item) + for item in turn_off: + item.selected = False + for item in turn_on: + item.selected = True + + def set_value_by_label(self, value): + """Set the value of control by item labels. + + value is expected to be an iterable of strings that are substrings of + the item labels that should be selected. Before substring matching is + performed, the original label text is whitespace-compressed + (consecutive whitespace characters are converted to a single space + character) and leading and trailing whitespace is stripped. Ambiguous + labels are accepted without complaint if the form's backwards_compat is + True; otherwise, it will not complain as long as all ambiguous labels + share the same item name (e.g. OPTION value). 
+ + """ + if isstringlike(value): + raise TypeError(value) + if not self.multiple and len(value) > 1: + raise ItemCountError( + "single selection list, must set sequence of " + "length 0 or 1") + items = [] + for nn in value: + found = self.get_items(label=nn) + if len(found) > 1: + if not self._form.backwards_compat: + # ambiguous labels are fine as long as item names (e.g. + # OPTION values) are same + opt_name = found[0].name + if [o for o in found[1:] if o.name != opt_name]: + raise AmbiguityError(nn) + else: + # OK, we'll guess :-( Assume first available item. + found = found[:1] + for o in found: + # For the multiple-item case, we could try to be smarter, + # saving them up and trying to resolve, but that's too much. + if self._form.backwards_compat or o not in items: + items.append(o) + break + else: # all of them are used + raise ItemNotFoundError(nn) + # now we have all the items that should be on + # let's just turn everything off and then back on. + self.value = [] + for o in items: + o.selected = True + + def get_value_by_label(self): + """Return the value of the control as given by normalized labels.""" + res = [] + compat = self._form.backwards_compat + for o in self.items: + if (not o.disabled or compat) and o.selected: + for l in o.get_labels(): + if l.text: + res.append(l.text) + break + else: + res.append(None) + return res + + def possible_items(self, by_label=False): + """Deprecated: return the names or labels of all possible items. + + Includes disabled items, which may be misleading for some use cases. 
+ + """ + deprecation( + "[item.name for item in self.items]") + if by_label: + res = [] + for o in self.items: + for l in o.get_labels(): + if l.text: + res.append(l.text) + break + else: + res.append(None) + return res + return [o.name for o in self.items] + + def _totally_ordered_pairs(self): + if self.disabled or self.name is None: + return [] + else: + return [(o._index, self.name, o.name) for o in self.items + if o.selected and not o.disabled] + + def __str__(self): + name = self.name + if name is None: name = "<None>" + + display = [str(o) for o in self.items] + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = ", ".join(infos) + if info: info = " (%s)" % info + + return "<%s(%s=[%s])%s>" % (self.__class__.__name__, + name, ", ".join(display), info) + + +class RadioControl(ListControl): + """ + Covers: + + INPUT/RADIO + + """ + def __init__(self, type, name, attrs, select_default=False, index=None): + attrs.setdefault("value", "on") + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True, index=index) + self.__dict__["multiple"] = False + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("checked") + + def fixup(self): + ListControl.fixup(self) + found = [o for o in self.items if o.selected and not o.disabled] + if not found: + if self._select_default: + for o in self.items: + if not o.disabled: + o.selected = True + break + else: + # Ensure only one item selected. Choose the last one, + # following IE and Firefox. 
+ for o in found[:-1]: + o.selected = False + + def get_labels(self): + return [] + +class CheckboxControl(ListControl): + """ + Covers: + + INPUT/CHECKBOX + + """ + def __init__(self, type, name, attrs, select_default=False, index=None): + attrs.setdefault("value", "on") + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True, index=index) + self.__dict__["multiple"] = True + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("checked") + + def get_labels(self): + return [] + + +class SelectControl(ListControl): + """ + Covers: + + SELECT (and OPTION) + + + OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance. + + SELECT control values and labels are subject to some messy defaulting + rules. For example, if the HTML representation of the control is: + + <SELECT name=year> + <OPTION value=0 label="2002">current year</OPTION> + <OPTION value=1>2001</OPTION> + <OPTION>2000</OPTION> + </SELECT> + + The items, in order, have labels "2002", "2001" and "2000", whereas their + names (the OPTION values) are "0", "1" and "2000" respectively. Note that + the value of the last OPTION in this example defaults to its contents, as + specified by RFC 1866, as do the labels of the second and third OPTIONs. + + The OPTION labels are sometimes more meaningful than the OPTION values, + which can make for more maintainable code. + + Additional read-only public attribute: attrs + + The attrs attribute is a dictionary of the original HTML attributes of the + SELECT element. Other ListControls do not have this attribute, because in + other cases the control as a whole does not correspond to any single HTML + element. control.get(...).attrs may be used as usual to get at the HTML + attributes of the HTML elements corresponding to individual list items (for + SELECT controls, these are OPTION elements). 
+ + Another special case is that the Item.attrs dictionaries have a special key + "contents" which does not correspond to any real HTML attribute, but rather + contains the contents of the OPTION element: + + <OPTION>this bit</OPTION> + + """ + # HTML attributes here are treated slightly differently from other list + # controls: + # -The SELECT HTML attributes dictionary is stuffed into the OPTION + # HTML attributes dictionary under the "__select" key. + # -The content of each OPTION element is stored under the special + # "contents" key of the dictionary. + # After all this, the dictionary is passed to the SelectControl constructor + # as the attrs argument, as usual. However: + # -The first SelectControl constructed when building up a SELECT control + # has a constructor attrs argument containing only the __select key -- so + # this SelectControl represents an empty SELECT control. + # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and + # the __select dictionary containing the SELECT HTML-attributes. + + def __init__(self, type, name, attrs, select_default=False, index=None): + # fish out the SELECT HTML attributes from the OPTION HTML attributes + # dictionary + self.attrs = attrs["__select"].copy() + self.__dict__["_label"] = _get_label(self.attrs) + self.__dict__["id"] = self.attrs.get("id") + self.__dict__["multiple"] = self.attrs.has_key("multiple") + # the majority of the contents, label, and value dance already happened + contents = attrs.get("contents") + attrs = attrs.copy() + del attrs["__select"] + + ListControl.__init__(self, type, name, self.attrs, select_default, + called_as_base_class=True, index=index) + self.disabled = self.attrs.has_key("disabled") + self.readonly = self.attrs.has_key("readonly") + if attrs.has_key("value"): + # otherwise it is a marker 'select started' token + o = Item(self, attrs, index) + o.__dict__["_selected"] = attrs.has_key("selected") + # add 'label' label and contents label, if different. 
If both are + # provided, the 'label' label is used for display in HTML + # 4.0-compliant browsers (and any lower spec? not sure) while the + # contents are used for display in older or less-compliant + # browsers. We make label objects for both, if the values are + # different. + label = attrs.get("label") + if label: + o._labels.append(Label({"__text": label})) + if contents and contents != label: + o._labels.append(Label({"__text": contents})) + elif contents: + o._labels.append(Label({"__text": contents})) + + def fixup(self): + ListControl.fixup(self) + # Firefox doesn't exclude disabled items from those considered here + # (i.e. from 'found', for both branches of the if below). Note that + # IE6 doesn't support the disabled attribute on OPTIONs at all. + found = [o for o in self.items if o.selected] + if not found: + if not self.multiple or self._select_default: + for o in self.items: + if not o.disabled: + was_disabled = self.disabled + self.disabled = False + try: + o.selected = True + finally: + o.disabled = was_disabled + break + elif not self.multiple: + # Ensure only one item selected. Choose the last one, + # following IE and Firefox. + for o in found[:-1]: + o.selected = False + + +#--------------------------------------------------- +class SubmitControl(ScalarControl): + """ + Covers: + + INPUT/SUBMIT + BUTTON/SUBMIT + + """ + def __init__(self, type, name, attrs, index=None): + ScalarControl.__init__(self, type, name, attrs, index) + # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it + # blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem + # to define this. 
+ if self.value is None: self.value = "" + self.readonly = True + + def get_labels(self): + res = [] + if self.value: + res.append(Label({"__text": self.value})) + res.extend(ScalarControl.get_labels(self)) + return res + + def is_of_kind(self, kind): return kind == "clickable" + + def _click(self, form, coord, return_type, request_class=urllib2.Request): + self._clicked = coord + r = form._switch_click(return_type, request_class) + self._clicked = False + return r + + def _totally_ordered_pairs(self): + if not self._clicked: + return [] + return ScalarControl._totally_ordered_pairs(self) + + +#--------------------------------------------------- +class ImageControl(SubmitControl): + """ + Covers: + + INPUT/IMAGE + + Coordinates are specified using one of the HTMLForm.click* methods. + + """ + def __init__(self, type, name, attrs, index=None): + SubmitControl.__init__(self, type, name, attrs, index) + self.readonly = False + + def _totally_ordered_pairs(self): + clicked = self._clicked + if self.disabled or not clicked: + return [] + name = self.name + if name is None: return [] + pairs = [ + (self._index, "%s.x" % name, str(clicked[0])), + (self._index+1, "%s.y" % name, str(clicked[1])), + ] + value = self._value + if value: + pairs.append((self._index+2, name, value)) + return pairs + + get_labels = ScalarControl.get_labels + +# aliases, just to make str(control) and str(form) clearer +class PasswordControl(TextControl): pass +class HiddenControl(TextControl): pass +class TextareaControl(TextControl): pass +class SubmitButtonControl(SubmitControl): pass + + +def is_listcontrol(control): return control.is_of_kind("list") + + +class HTMLForm: + """Represents a single HTML <form> ... </form> element. + + A form consists of a sequence of controls that usually have names, and + which can take on various values. The values of the various types of + controls represent variously: text, zero-or-one-of-many or many-of-many + choices, and files to be uploaded. 
Some controls can be clicked on to + submit the form, and clickable controls' values sometimes include the + coordinates of the click. + + Forms can be filled in with data to be returned to the server, and then + submitted, using the click method to generate a request object suitable for + passing to urllib2.urlopen (or the click_request_data or click_pairs + methods if you're not using urllib2). + + import ClientForm + forms = ClientForm.ParseFile(html, base_uri) + form = forms[0] + + form["query"] = "Python" + form.find_control("nr_results").get("lots").selected = True + + response = urllib2.urlopen(form.click()) + + Usually, HTMLForm instances are not created directly. Instead, the + ParseFile or ParseResponse factory functions are used. If you do construct + HTMLForm objects yourself, however, note that an HTMLForm instance is only + properly initialised after the fixup method has been called (ParseFile and + ParseResponse do this for you). See ListControl.__doc__ for the reason + this is required. + + Indexing a form (form["control_name"]) returns the named Control's value + attribute. Assignment to a form index (form["control_name"] = something) + is equivalent to assignment to the named Control's value attribute. If you + need to be more specific than just supplying the control's name, use the + set_value and get_value methods. + + ListControl values are lists of item names (specifically, the names of the + items that are selected and not disabled, and hence are "successful" -- ie. + cause data to be returned to the server). The list item's name is the + value of the corresponding HTML element's"value" attribute. + + Example: + + <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT> + <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT> + + defines a CHECKBOX control with name "cheeses" which has two items, named + "leicester" and "cheddar". 
+ + Another example: + + <SELECT name="more_cheeses"> + <OPTION>1</OPTION> + <OPTION value="2" label="CHEDDAR">cheddar</OPTION> + </SELECT> + + defines a SELECT control with name "more_cheeses" which has two items, + named "1" and "2" (because the OPTION element's value HTML attribute + defaults to the element contents -- see SelectControl.__doc__ for more on + these defaulting rules). + + To select, deselect or otherwise manipulate individual list items, use the + HTMLForm.find_control() and ListControl.get() methods. To set the whole + value, do as for any other control: use indexing or the set_/get_value + methods. + + Example: + + # select *only* the item named "cheddar" + form["cheeses"] = ["cheddar"] + # select "cheddar", leave other items unaffected + form.find_control("cheeses").get("cheddar").selected = True + + Some controls (RADIO and SELECT without the multiple attribute) can only + have zero or one items selected at a time. Some controls (CHECKBOX and + SELECT with the multiple attribute) can have multiple items selected at a + time. To set the whole value of a ListControl, assign a sequence to a form + index: + + form["cheeses"] = ["cheddar", "leicester"] + + If the ListControl is not multiple-selection, the assigned list must be of + length one. + + To check if a control has an item, if an item is selected, or if an item is + successful (selected and not disabled), respectively: + + "cheddar" in [item.name for item in form.find_control("cheeses").items] + "cheddar" in [item.name for item in form.find_control("cheeses").items and + item.selected] + "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses")) + + Note that some list items may be disabled (see below). + + Note the following mistake: + + form[control_name] = control_value + assert form[control_name] == control_value # not necessarily true + + The reason for this is that form[control_name] always gives the list items + in the order they were listed in the HTML. 
+ + List items (hence list values, too) can be referred to in terms of list + item labels rather than list item names using the appropriate label + arguments. Note that each item may have several labels. + + The question of default values of OPTION contents, labels and values is + somewhat complicated: see SelectControl.__doc__ and + ListControl.get_item_attrs.__doc__ if you think you need to know. + + Controls can be disabled or readonly. In either case, the control's value + cannot be changed until you clear those flags (see example below). + Disabled is the state typically represented by browsers by 'greying out' a + control. Disabled controls are not 'successful' -- they don't cause data + to get returned to the server. Readonly controls usually appear in + browsers as read-only text boxes. Readonly controls are successful. List + items can also be disabled. Attempts to select or deselect disabled items + fail with AttributeError. + + If a lot of controls are readonly, it can be useful to do this: + + form.set_all_readonly(False) + + To clear a control's value attribute, so that it is not successful (until a + value is subsequently set): + + form.clear("cheeses") + + More examples: + + control = form.find_control("cheeses") + control.disabled = False + control.readonly = False + control.get("gruyere").disabled = True + control.items[0].selected = True + + See the various Control classes for further documentation. Many methods + take name, type, kind, id, label and nr arguments to specify the control to + be operated on: see HTMLForm.find_control.__doc__. + + ControlNotFoundError (subclass of ValueError) is raised if the specified + control can't be found. This includes occasions where a non-ListControl + is found, but the method (set, for example) requires a ListControl. + ItemNotFoundError (subclass of ValueError) is raised if a list item can't + be found. 
ItemCountError (subclass of ValueError) is raised if an attempt + is made to select more than one item and the control doesn't allow that, or + set/get_single are called and the control contains more than one item. + AttributeError is raised if a control or item is readonly or disabled and + an attempt is made to alter its value. + + Security note: Remember that any passwords you store in HTMLForm instances + will be saved to disk in the clear if you pickle them (directly or + indirectly). The simplest solution to this is to avoid pickling HTMLForm + objects. You could also pickle before filling in any password, or just set + the password to "" before pickling. + + + Public attributes: + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form (None if no name was specified) + attrs: dictionary mapping original HTML form attributes to their values + + controls: list of Control instances; do not alter this list + (instead, call form.new_control to make a Control and add it to the + form, or control.add_to_form if you already have a Control instance) + + + + Methods for form filling: + ------------------------- + + Most of the these methods have very similar arguments. See + HTMLForm.find_control.__doc__ for details of the name, type, kind, label + and nr arguments. 
+ + def find_control(self, + name=None, type=None, kind=None, id=None, predicate=None, + nr=None, label=None) + + get_value(name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None) + set_value(value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None) + + clear_all() + clear(name=None, type=None, kind=None, id=None, nr=None, label=None) + + set_all_readonly(readonly) + + + Method applying only to FileControls: + + add_file(file_object, + content_type="application/octet-stream", filename=None, + name=None, id=None, nr=None, label=None) + + + Methods applying only to clickable controls: + + click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None) + click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1), + label=None) + click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None) + + """ + + type2class = { + "text": TextControl, + "password": PasswordControl, + "hidden": HiddenControl, + "textarea": TextareaControl, + + "isindex": IsindexControl, + + "file": FileControl, + + "button": IgnoreControl, + "buttonbutton": IgnoreControl, + "reset": IgnoreControl, + "resetbutton": IgnoreControl, + + "submit": SubmitControl, + "submitbutton": SubmitButtonControl, + "image": ImageControl, + + "radio": RadioControl, + "checkbox": CheckboxControl, + "select": SelectControl, + } + +#--------------------------------------------------- +# Initialisation. Use ParseResponse / ParseFile instead. + + def __init__(self, action, method="GET", + enctype="application/x-www-form-urlencoded", + name=None, attrs=None, + request_class=urllib2.Request, + forms=None, labels=None, id_to_labels=None, + backwards_compat=True): + """ + In the usual case, use ParseResponse (or ParseFile) to create new + HTMLForm objects. 
+ + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form + attrs: dictionary mapping original HTML form attributes to their values + + """ + self.action = action + self.method = method + self.enctype = enctype + self.name = name + if attrs is not None: + self.attrs = attrs.copy() + else: + self.attrs = {} + self.controls = [] + self._request_class = request_class + + # these attributes are used by zope.testbrowser + self._forms = forms # this is a semi-public API! + self._labels = labels # this is a semi-public API! + self._id_to_labels = id_to_labels # this is a semi-public API! + + self.backwards_compat = backwards_compat # note __setattr__ + + self._urlunparse = urlparse.urlunparse + self._urlparse = urlparse.urlparse + + def __getattr__(self, name): + if name == "backwards_compat": + return self._backwards_compat + return getattr(HTMLForm, name) + + def __setattr__(self, name, value): + # yuck + if name == "backwards_compat": + name = "_backwards_compat" + value = bool(value) + for cc in self.controls: + try: + items = cc.items + except AttributeError: + continue + else: + for ii in items: + for ll in ii.get_labels(): + ll._backwards_compat = value + self.__dict__[name] = value + + def new_control(self, type, name, attrs, + ignore_unknown=False, select_default=False, index=None): + """Adds a new control to the form. + + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + Note that controls representing lists of items are built up from + controls holding only a single list item. See ListControl.__doc__ for + further information. 
+ + type: type of control (see Control.__doc__ for a list) + attrs: HTML attributes of control + ignore_unknown: if true, use a dummy Control instance for controls of + unknown type; otherwise, use a TextControl + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present (this defaulting happens when the HTMLForm.fixup method is + called) + index: index of corresponding element in HTML (see + MoreFormTests.test_interspersed_controls for motivation) + + """ + type = type.lower() + klass = self.type2class.get(type) + if klass is None: + if ignore_unknown: + klass = IgnoreControl + else: + klass = TextControl + + a = attrs.copy() + if issubclass(klass, ListControl): + control = klass(type, name, a, select_default, index) + else: + control = klass(type, name, a, index) + + if type == "select" and len(attrs) == 1: + for ii in range(len(self.controls)-1, -1, -1): + ctl = self.controls[ii] + if ctl.type == "select": + ctl.close_control() + break + + control.add_to_form(self) + control._urlparse = self._urlparse + control._urlunparse = self._urlunparse + + def fixup(self): + """Normalise form after all controls have been added. + + This is usually called by ParseFile and ParseResponse. Don't call it + youself unless you're building your own Control instances. + + This method should only be called once, after all controls have been + added to the form. + + """ + for control in self.controls: + control.fixup() + self.backwards_compat = self._backwards_compat + +#--------------------------------------------------- + def __str__(self): + header = "%s%s %s %s" % ( + (self.name and self.name+" " or ""), + self.method, self.action, self.enctype) + rep = [header] + for control in self.controls: + rep.append(" %s" % str(control)) + return "<%s>" % "\n".join(rep) + +#--------------------------------------------------- +# Form-filling methods. 
+ + def __getitem__(self, name): + return self.find_control(name).value + def __contains__(self, name): + return bool(self.find_control(name)) + def __setitem__(self, name, value): + control = self.find_control(name) + try: + control.value = value + except AttributeError, e: + raise ValueError(str(e)) + + def get_value(self, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None): + """Return value of control. + + If only name and value arguments are supplied, equivalent to + + form[name] + + """ + if by_label: + deprecation("form.get_value_by_label(...)") + c = self.find_control(name, type, kind, id, label=label, nr=nr) + if by_label: + try: + meth = c.get_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + return meth() + else: + return c.value + def set_value(self, value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, # by_label is deprecated + label=None): + """Set value of control. + + If only name and value arguments are supplied, equivalent to + + form[name] = value + + """ + if by_label: + deprecation("form.get_value_by_label(...)") + c = self.find_control(name, type, kind, id, label=label, nr=nr) + if by_label: + try: + meth = c.set_value_by_label + except AttributeError: + raise NotImplementedError( + "control '%s' does not yet support by_label" % c.name) + else: + meth(value) + else: + c.value = value + def get_value_by_label( + self, name=None, type=None, kind=None, id=None, label=None, nr=None): + """ + + All arguments should be passed by name. + + """ + c = self.find_control(name, type, kind, id, label=label, nr=nr) + return c.get_value_by_label() + + def set_value_by_label( + self, value, + name=None, type=None, kind=None, id=None, label=None, nr=None): + """ + + All arguments should be passed by name. 
+ + """ + c = self.find_control(name, type, kind, id, label=label, nr=nr) + c.set_value_by_label(value) + + def set_all_readonly(self, readonly): + for control in self.controls: + control.readonly = bool(readonly) + + def clear_all(self): + """Clear the value attributes of all controls in the form. + + See HTMLForm.clear.__doc__. + + """ + for control in self.controls: + control.clear() + + def clear(self, + name=None, type=None, kind=None, id=None, nr=None, label=None): + """Clear the value attribute of a control. + + As a result, the affected control will not be successful until a value + is subsequently set. AttributeError is raised on readonly controls. + + """ + c = self.find_control(name, type, kind, id, label=label, nr=nr) + c.clear() + + +#--------------------------------------------------- +# Form-filling methods applying only to ListControls. + + def possible_items(self, # deprecated + name=None, type=None, kind=None, id=None, + nr=None, by_label=False, label=None): + """Return a list of all values that the specified control can take.""" + c = self._find_list_control(name, type, kind, id, label, nr) + return c.possible_items(by_label) + + def set(self, selected, item_name, # deprecated + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, label=None): + """Select / deselect named list item. + + selected: boolean selected state + + """ + self._find_list_control(name, type, kind, id, label, nr).set( + selected, item_name, by_label) + def toggle(self, item_name, # deprecated + name=None, type=None, kind=None, id=None, nr=None, + by_label=False, label=None): + """Toggle selected state of named list item.""" + self._find_list_control(name, type, kind, id, label, nr).toggle( + item_name, by_label) + + def set_single(self, selected, # deprecated + name=None, type=None, kind=None, id=None, + nr=None, by_label=None, label=None): + """Select / deselect list item in a control having only one item. 
+ + If the control has multiple list items, ItemCountError is raised. + + This is just a convenience method, so you don't need to know the item's + name -- the item name in these single-item controls is usually + something meaningless like "1" or "on". + + For example, if a checkbox has a single item named "on", the following + two calls are equivalent: + + control.toggle("on") + control.toggle_single() + + """ # by_label ignored and deprecated + self._find_list_control( + name, type, kind, id, label, nr).set_single(selected) + def toggle_single(self, name=None, type=None, kind=None, id=None, + nr=None, by_label=None, label=None): # deprecated + """Toggle selected state of list item in control having only one item. + + The rest is as for HTMLForm.set_single.__doc__. + + """ # by_label ignored and deprecated + self._find_list_control(name, type, kind, id, label, nr).toggle_single() + +#--------------------------------------------------- +# Form-filling method applying only to FileControls. + + def add_file(self, file_object, content_type=None, filename=None, + name=None, id=None, nr=None, label=None): + """Add a file to be uploaded. + + file_object: file-like object (with read method) from which to read + data to upload + content_type: MIME content type of data to upload + filename: filename to pass to server + + If filename is None, no filename is sent to the server. + + If content_type is None, the content type is guessed based on the + filename and the data from read from the file object. + + XXX + At the moment, guessed content type is always application/octet-stream. + Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and + plain text. 
+ + Note the following useful HTML attributes of file upload controls (see + HTML 4.01 spec, section 17): + + accept: comma-separated list of content types that the server will + handle correctly; you can use this to filter out non-conforming files + size: XXX IIRC, this is indicative of whether form wants multiple or + single files + maxlength: XXX hint of max content length in bytes? + + """ + self.find_control(name, "file", id=id, label=label, nr=nr).add_file( + file_object, content_type, filename) + +#--------------------------------------------------- +# Form submission methods, applying only to clickable controls. + + def click(self, name=None, type=None, id=None, nr=0, coord=(1,1), + request_class=urllib2.Request, + label=None): + """Return request that would result from clicking on a control. + + The request object is a urllib2.Request instance, which you can pass to + urllib2.urlopen (or ClientCookie.urlopen). + + Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and + IMAGEs) can be clicked. + + Will click on the first clickable control, subject to the name, type + and nr arguments (as for find_control). If no name, type, id or number + is specified and there are no clickable controls, a request will be + returned for the form in its current, un-clicked, state. + + IndexError is raised if any of name, type, id or nr is specified but no + matching control is found. ValueError is raised if the HTMLForm has an + enctype attribute that is not recognised. + + You can optionally specify a coordinate to click at, which only makes a + difference if you clicked on an image. + + """ + return self._click(name, type, id, label, nr, coord, "request", + self._request_class) + + def click_request_data(self, + name=None, type=None, id=None, + nr=0, coord=(1,1), + request_class=urllib2.Request, + label=None): + """As for click method, but return a tuple (url, data, headers). + + You can use this data to send a request to the server. 
This is useful + if you're using httplib or urllib rather than urllib2. Otherwise, use + the click method. + + # Untested. Have to subclass to add headers, I think -- so use urllib2 + # instead! + import urllib + url, data, hdrs = form.click_request_data() + r = urllib.urlopen(url, data) + + # Untested. I don't know of any reason to use httplib -- you can get + # just as much control with urllib2. + import httplib, urlparse + url, data, hdrs = form.click_request_data() + tup = urlparse(url) + host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:]) + conn = httplib.HTTPConnection(host) + if data: + httplib.request("POST", path, data, hdrs) + else: + httplib.request("GET", path, headers=hdrs) + r = conn.getresponse() + + """ + return self._click(name, type, id, label, nr, coord, "request_data", + self._request_class) + + def click_pairs(self, name=None, type=None, id=None, + nr=0, coord=(1,1), + label=None): + """As for click_request_data, but returns a list of (key, value) pairs. + + You can use this list as an argument to ClientForm.urlencode. This is + usually only useful if you're using httplib or urllib rather than + urllib2 or ClientCookie. It may also be useful if you want to manually + tweak the keys and/or values, but this should not be necessary. + Otherwise, use the click method. + + Note that this method is only useful for forms of MIME type + x-www-form-urlencoded. In particular, it does not return the + information required for file upload. If you need file upload and are + not using urllib2, use click_request_data. + + Also note that Python 2.0's urllib.urlencode is slightly broken: it + only accepts a mapping, not a sequence of pairs, as an argument. This + messes up any ordering in the argument. Use ClientForm.urlencode + instead. 
+ + """ + return self._click(name, type, id, label, nr, coord, "pairs", + self._request_class) + +#--------------------------------------------------- + + def find_control(self, + name=None, type=None, kind=None, id=None, + predicate=None, nr=None, + label=None): + """Locate and return some specific control within the form. + + At least one of the name, type, kind, predicate and nr arguments must + be supplied. If no matching control is found, ControlNotFoundError is + raised. + + If name is specified, then the control must have the indicated name. + + If type is specified then the control must have the specified type (in + addition to the types possible for <input> HTML tags: "text", + "password", "hidden", "submit", "image", "button", "radio", "checkbox", + "file" we also have "reset", "buttonbutton", "submitbutton", + "resetbutton", "textarea", "select" and "isindex"). + + If kind is specified, then the control must fall into the specified + group, each of which satisfies a particular interface. The types are + "text", "list", "multilist", "singlelist", "clickable" and "file". + + If id is specified, then the control must have the indicated id. + + If predicate is specified, then the control must match that function. + The predicate function is passed the control as its single argument, + and should return a boolean value indicating whether the control + matched. + + nr, if supplied, is the sequence number of the control (where 0 is the + first). Note that control 0 is the first control matching all the + other arguments (if supplied); it is not necessarily the first control + in the form. If no nr is supplied, AmbiguityError is raised if + multiple controls match the other arguments (unless the + .backwards-compat attribute is true). + + If label is specified, then the control must have this label. Note + that radio controls and checkboxes never have labels: their items do. 
+ + """ + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (label is None) and (predicate is None) and + (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + return self._find_control(name, type, kind, id, label, predicate, nr) + +#--------------------------------------------------- +# Private methods. + + def _find_list_control(self, + name=None, type=None, kind=None, id=None, + label=None, nr=None): + if ((name is None) and (type is None) and (kind is None) and + (id is None) and (label is None) and (nr is None)): + raise ValueError( + "at least one argument must be supplied to specify control") + + return self._find_control(name, type, kind, id, label, + is_listcontrol, nr) + + def _find_control(self, name, type, kind, id, label, predicate, nr): + if ((name is not None) and (name is not Missing) and + not isstringlike(name)): + raise TypeError("control name must be string-like") + if (type is not None) and not isstringlike(type): + raise TypeError("control type must be string-like") + if (kind is not None) and not isstringlike(kind): + raise TypeError("control kind must be string-like") + if (id is not None) and not isstringlike(id): + raise TypeError("control id must be string-like") + if (label is not None) and not isstringlike(label): + raise TypeError("control label must be string-like") + if (predicate is not None) and not callable(predicate): + raise TypeError("control predicate must be callable") + if (nr is not None) and nr < 0: + raise ValueError("control number must be a positive integer") + + orig_nr = nr + found = None + ambiguous = False + if nr is None and self.backwards_compat: + nr = 0 + + for control in self.controls: + if ((name is not None and name != control.name) and + (name is not Missing or control.name is not None)): + continue + if type is not None and type != control.type: + continue + if kind is not None and not control.is_of_kind(kind): + continue + if id is 
not None and id != control.id: + continue + if predicate and not predicate(control): + continue + if label: + for l in control.get_labels(): + if l.text.find(label) > -1: + break + else: + continue + if nr is not None: + if nr == 0: + return control # early exit: unambiguous due to nr + nr -= 1 + continue + if found: + ambiguous = True + break + found = control + + if found and not ambiguous: + return found + + description = [] + if name is not None: description.append("name %s" % repr(name)) + if type is not None: description.append("type '%s'" % type) + if kind is not None: description.append("kind '%s'" % kind) + if id is not None: description.append("id '%s'" % id) + if label is not None: description.append("label '%s'" % label) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr: description.append("nr %d" % orig_nr) + description = ", ".join(description) + + if ambiguous: + raise AmbiguityError("more than one control matching "+description) + elif not found: + raise ControlNotFoundError("no control matching "+description) + assert False + + def _click(self, name, type, id, label, nr, coord, return_type, + request_class=urllib2.Request): + try: + control = self._find_control( + name, type, "clickable", id, label, None, nr) + except ControlNotFoundError: + if ((name is not None) or (type is not None) or (id is not None) or + (nr != 0)): + raise + # no clickable controls, but no control was explicitly requested, + # so return state without clicking any control + return self._switch_click(return_type, request_class) + else: + return control._click(self, coord, return_type, request_class) + + def _pairs(self): + """Return sequence of (key, value) pairs suitable for urlencoding.""" + return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()] + + + def _pairs_and_controls(self): + """Return sequence of (index, key, value, control_index) + of totally ordered pairs suitable for urlencoding. 
+ + control_index is the index of the control in self.controls + """ + pairs = [] + for control_index in range(len(self.controls)): + control = self.controls[control_index] + for ii, key, val in control._totally_ordered_pairs(): + pairs.append((ii, key, val, control_index)) + + # stable sort by ONLY first item in tuple + pairs.sort() + + return pairs + + def _request_data(self): + """Return a tuple (url, data, headers).""" + method = self.method.upper() + #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action) + parts = self._urlparse(self.action) + rest, (query, frag) = parts[:-2], parts[-2:] + + if method == "GET": + if self.enctype != "application/x-www-form-urlencoded": + raise ValueError( + "unknown GET form encoding type '%s'" % self.enctype) + parts = rest + (urlencode(self._pairs()), None) + uri = self._urlunparse(parts) + return uri, None, [] + elif method == "POST": + parts = rest + (query, None) + uri = self._urlunparse(parts) + if self.enctype == "application/x-www-form-urlencoded": + return (uri, urlencode(self._pairs()), + [("Content-Type", self.enctype)]) + elif self.enctype == "multipart/form-data": + data = StringIO() + http_hdrs = [] + mw = MimeWriter(data, http_hdrs) + f = mw.startmultipartbody("form-data", add_to_http_hdrs=True, + prefix=0) + for ii, k, v, control_index in self._pairs_and_controls(): + self.controls[control_index]._write_mime_data(mw, k, v) + mw.lastpart() + return uri, data.getvalue(), http_hdrs + else: + raise ValueError( + "unknown POST form encoding type '%s'" % self.enctype) + else: + raise ValueError("Unknown method '%s'" % method) + + def _switch_click(self, return_type, request_class=urllib2.Request): + # This is called by HTMLForm and clickable Controls to hide switching + # on return_type. 
+ if return_type == "pairs": + return self._pairs() + elif return_type == "request_data": + return self._request_data() + else: + req_data = self._request_data() + req = request_class(req_data[0], req_data[1]) + for key, val in req_data[2]: + add_hdr = req.add_header + if key.lower() == "content-type": + try: + add_hdr = req.add_unredirected_header + except AttributeError: + # pre-2.4 and not using ClientCookie + pass + add_hdr(key, val) + return req diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/__init__.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/__init__.py new file mode 100644 index 0000000..c1e4c6d --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/clientform/__init__.py @@ -0,0 +1 @@ +# This file is required for Python to search this directory for modules. diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/.ircbot.py.url b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/.ircbot.py.url new file mode 100644 index 0000000..f34e243 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/.ircbot.py.url @@ -0,0 +1 @@ +http://iweb.dl.sourceforge.net/project/python-irclib/python-irclib/0.4.8/python-irclib-0.4.8.zip
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/.irclib.py.url b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/.irclib.py.url new file mode 100644 index 0000000..f34e243 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/.irclib.py.url @@ -0,0 +1 @@ +http://iweb.dl.sourceforge.net/project/python-irclib/python-irclib/0.4.8/python-irclib-0.4.8.zip
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/__init__.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/__init__.py new file mode 100644 index 0000000..c1e4c6d --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/__init__.py @@ -0,0 +1 @@ +# This file is required for Python to search this directory for modules. diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/ircbot.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/ircbot.py new file mode 100644 index 0000000..6f29a65 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/ircbot.py @@ -0,0 +1,438 @@ +# Copyright (C) 1999--2002 Joel Rosdahl +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Joel Rosdahl <joel@rosdahl.net> +# +# $Id: ircbot.py,v 1.23 2008/09/11 07:38:30 keltus Exp $ + +"""ircbot -- Simple IRC bot library. + +This module contains a single-server IRC bot class that can be used to +write simpler bots. 
+""" + +import sys +from UserDict import UserDict + +from irclib import SimpleIRCClient +from irclib import nm_to_n, irc_lower, all_events +from irclib import parse_channel_modes, is_channel +from irclib import ServerConnectionError + +class SingleServerIRCBot(SimpleIRCClient): + """A single-server IRC bot class. + + The bot tries to reconnect if it is disconnected. + + The bot keeps track of the channels it has joined, the other + clients that are present in the channels and which of those that + have operator or voice modes. The "database" is kept in the + self.channels attribute, which is an IRCDict of Channels. + """ + def __init__(self, server_list, nickname, realname, reconnection_interval=60): + """Constructor for SingleServerIRCBot objects. + + Arguments: + + server_list -- A list of tuples (server, port) that + defines which servers the bot should try to + connect to. + + nickname -- The bot's nickname. + + realname -- The bot's realname. + + reconnection_interval -- How long the bot should wait + before trying to reconnect. + + dcc_connections -- A list of initiated/accepted DCC + connections. 
+ """ + + SimpleIRCClient.__init__(self) + self.channels = IRCDict() + self.server_list = server_list + if not reconnection_interval or reconnection_interval < 0: + reconnection_interval = 2**31 + self.reconnection_interval = reconnection_interval + + self._nickname = nickname + self._realname = realname + for i in ["disconnect", "join", "kick", "mode", + "namreply", "nick", "part", "quit"]: + self.connection.add_global_handler(i, + getattr(self, "_on_" + i), + -10) + def _connected_checker(self): + """[Internal]""" + if not self.connection.is_connected(): + self.connection.execute_delayed(self.reconnection_interval, + self._connected_checker) + self.jump_server() + + def _connect(self): + """[Internal]""" + password = None + if len(self.server_list[0]) > 2: + password = self.server_list[0][2] + try: + self.connect(self.server_list[0][0], + self.server_list[0][1], + self._nickname, + password, + ircname=self._realname) + except ServerConnectionError: + pass + + def _on_disconnect(self, c, e): + """[Internal]""" + self.channels = IRCDict() + self.connection.execute_delayed(self.reconnection_interval, + self._connected_checker) + + def _on_join(self, c, e): + """[Internal]""" + ch = e.target() + nick = nm_to_n(e.source()) + if nick == c.get_nickname(): + self.channels[ch] = Channel() + self.channels[ch].add_user(nick) + + def _on_kick(self, c, e): + """[Internal]""" + nick = e.arguments()[0] + channel = e.target() + + if nick == c.get_nickname(): + del self.channels[channel] + else: + self.channels[channel].remove_user(nick) + + def _on_mode(self, c, e): + """[Internal]""" + modes = parse_channel_modes(" ".join(e.arguments())) + t = e.target() + if is_channel(t): + ch = self.channels[t] + for mode in modes: + if mode[0] == "+": + f = ch.set_mode + else: + f = ch.clear_mode + f(mode[1], mode[2]) + else: + # Mode on self... 
XXX + pass + + def _on_namreply(self, c, e): + """[Internal]""" + + # e.arguments()[0] == "@" for secret channels, + # "*" for private channels, + # "=" for others (public channels) + # e.arguments()[1] == channel + # e.arguments()[2] == nick list + + ch = e.arguments()[1] + for nick in e.arguments()[2].split(): + if nick[0] == "@": + nick = nick[1:] + self.channels[ch].set_mode("o", nick) + elif nick[0] == "+": + nick = nick[1:] + self.channels[ch].set_mode("v", nick) + self.channels[ch].add_user(nick) + + def _on_nick(self, c, e): + """[Internal]""" + before = nm_to_n(e.source()) + after = e.target() + for ch in self.channels.values(): + if ch.has_user(before): + ch.change_nick(before, after) + + def _on_part(self, c, e): + """[Internal]""" + nick = nm_to_n(e.source()) + channel = e.target() + + if nick == c.get_nickname(): + del self.channels[channel] + else: + self.channels[channel].remove_user(nick) + + def _on_quit(self, c, e): + """[Internal]""" + nick = nm_to_n(e.source()) + for ch in self.channels.values(): + if ch.has_user(nick): + ch.remove_user(nick) + + def die(self, msg="Bye, cruel world!"): + """Let the bot die. + + Arguments: + + msg -- Quit message. + """ + + self.connection.disconnect(msg) + sys.exit(0) + + def disconnect(self, msg="I'll be back!"): + """Disconnect the bot. + + The bot will try to reconnect after a while. + + Arguments: + + msg -- Quit message. + """ + self.connection.disconnect(msg) + + def get_version(self): + """Returns the bot version. + + Used when answering a CTCP VERSION request. + """ + return "ircbot.py by Joel Rosdahl <joel@rosdahl.net>" + + def jump_server(self, msg="Changing servers"): + """Connect to a new server, possibly disconnecting from the current. + + The bot will skip to next server in the server_list each time + jump_server is called. 
+ """ + if self.connection.is_connected(): + self.connection.disconnect(msg) + + self.server_list.append(self.server_list.pop(0)) + self._connect() + + def on_ctcp(self, c, e): + """Default handler for ctcp events. + + Replies to VERSION and PING requests and relays DCC requests + to the on_dccchat method. + """ + if e.arguments()[0] == "VERSION": + c.ctcp_reply(nm_to_n(e.source()), + "VERSION " + self.get_version()) + elif e.arguments()[0] == "PING": + if len(e.arguments()) > 1: + c.ctcp_reply(nm_to_n(e.source()), + "PING " + e.arguments()[1]) + elif e.arguments()[0] == "DCC" and e.arguments()[1].split(" ", 1)[0] == "CHAT": + self.on_dccchat(c, e) + + def on_dccchat(self, c, e): + pass + + def start(self): + """Start the bot.""" + self._connect() + SimpleIRCClient.start(self) + + +class IRCDict: + """A dictionary suitable for storing IRC-related things. + + Dictionary keys a and b are considered equal if and only if + irc_lower(a) == irc_lower(b) + + Otherwise, it should behave exactly as a normal dictionary. 
+ """ + + def __init__(self, dict=None): + self.data = {} + self.canon_keys = {} # Canonical keys + if dict is not None: + self.update(dict) + def __repr__(self): + return repr(self.data) + def __cmp__(self, dict): + if isinstance(dict, IRCDict): + return cmp(self.data, dict.data) + else: + return cmp(self.data, dict) + def __len__(self): + return len(self.data) + def __getitem__(self, key): + return self.data[self.canon_keys[irc_lower(key)]] + def __setitem__(self, key, item): + if key in self: + del self[key] + self.data[key] = item + self.canon_keys[irc_lower(key)] = key + def __delitem__(self, key): + ck = irc_lower(key) + del self.data[self.canon_keys[ck]] + del self.canon_keys[ck] + def __iter__(self): + return iter(self.data) + def __contains__(self, key): + return self.has_key(key) + def clear(self): + self.data.clear() + self.canon_keys.clear() + def copy(self): + if self.__class__ is UserDict: + return UserDict(self.data) + import copy + return copy.copy(self) + def keys(self): + return self.data.keys() + def items(self): + return self.data.items() + def values(self): + return self.data.values() + def has_key(self, key): + return irc_lower(key) in self.canon_keys + def update(self, dict): + for k, v in dict.items(): + self.data[k] = v + def get(self, key, failobj=None): + return self.data.get(key, failobj) + + +class Channel: + """A class for keeping information about an IRC channel. + + This class can be improved a lot. 
+ """ + + def __init__(self): + self.userdict = IRCDict() + self.operdict = IRCDict() + self.voiceddict = IRCDict() + self.modes = {} + + def users(self): + """Returns an unsorted list of the channel's users.""" + return self.userdict.keys() + + def opers(self): + """Returns an unsorted list of the channel's operators.""" + return self.operdict.keys() + + def voiced(self): + """Returns an unsorted list of the persons that have voice + mode set in the channel.""" + return self.voiceddict.keys() + + def has_user(self, nick): + """Check whether the channel has a user.""" + return nick in self.userdict + + def is_oper(self, nick): + """Check whether a user has operator status in the channel.""" + return nick in self.operdict + + def is_voiced(self, nick): + """Check whether a user has voice mode set in the channel.""" + return nick in self.voiceddict + + def add_user(self, nick): + self.userdict[nick] = 1 + + def remove_user(self, nick): + for d in self.userdict, self.operdict, self.voiceddict: + if nick in d: + del d[nick] + + def change_nick(self, before, after): + self.userdict[after] = 1 + del self.userdict[before] + if before in self.operdict: + self.operdict[after] = 1 + del self.operdict[before] + if before in self.voiceddict: + self.voiceddict[after] = 1 + del self.voiceddict[before] + + def set_mode(self, mode, value=None): + """Set mode on the channel. + + Arguments: + + mode -- The mode (a single-character string). + + value -- Value + """ + if mode == "o": + self.operdict[value] = 1 + elif mode == "v": + self.voiceddict[value] = 1 + else: + self.modes[mode] = value + + def clear_mode(self, mode, value=None): + """Clear mode on the channel. + + Arguments: + + mode -- The mode (a single-character string). 
+ + value -- Value + """ + try: + if mode == "o": + del self.operdict[value] + elif mode == "v": + del self.voiceddict[value] + else: + del self.modes[mode] + except KeyError: + pass + + def has_mode(self, mode): + return mode in self.modes + + def is_moderated(self): + return self.has_mode("m") + + def is_secret(self): + return self.has_mode("s") + + def is_protected(self): + return self.has_mode("p") + + def has_topic_lock(self): + return self.has_mode("t") + + def is_invite_only(self): + return self.has_mode("i") + + def has_allow_external_messages(self): + return self.has_mode("n") + + def has_limit(self): + return self.has_mode("l") + + def limit(self): + if self.has_limit(): + return self.modes[l] + else: + return None + + def has_key(self): + return self.has_mode("k") + + def key(self): + if self.has_key(): + return self.modes["k"] + else: + return None diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/irclib.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/irclib.py new file mode 100644 index 0000000..5f7141c --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/irc/irclib.py @@ -0,0 +1,1560 @@ +# Copyright (C) 1999--2002 Joel Rosdahl +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# keltus <keltus@users.sourceforge.net> +# +# $Id: irclib.py,v 1.47 2008/09/25 22:00:59 keltus Exp $ + +"""irclib -- Internet Relay Chat (IRC) protocol client library. + +This library is intended to encapsulate the IRC protocol at a quite +low level. It provides an event-driven IRC client framework. It has +a fairly thorough support for the basic IRC protocol, CTCP, DCC chat, +but DCC file transfers is not yet supported. + +In order to understand how to make an IRC client, I'm afraid you more +or less must understand the IRC specifications. They are available +here: [IRC specifications]. + +The main features of the IRC client framework are: + + * Abstraction of the IRC protocol. + * Handles multiple simultaneous IRC server connections. + * Handles server PONGing transparently. + * Messages to the IRC server are done by calling methods on an IRC + connection object. + * Messages from an IRC server triggers events, which can be caught + by event handlers. + * Reading from and writing to IRC server sockets are normally done + by an internal select() loop, but the select()ing may be done by + an external main loop. + * Functions can be registered to execute at specified times by the + event-loop. + * Decodes CTCP tagging correctly (hopefully); I haven't seen any + other IRC client implementation that handles the CTCP + specification subtilties. + * A kind of simple, single-server, object-oriented IRC client class + that dispatches events to instance methods is included. + +Current limitations: + + * The IRC protocol shines through the abstraction a bit too much. + * Data is not written asynchronously to the server, i.e. the write() + may block if the TCP buffers are stuffed. + * There are no support for DCC file transfers. 
+ * The author haven't even read RFC 2810, 2811, 2812 and 2813. + * Like most projects, documentation is lacking... + +.. [IRC specifications] http://www.irchelp.org/irchelp/rfc/ +""" + +import bisect +import re +import select +import socket +import string +import sys +import time +import types + +VERSION = 0, 4, 8 +DEBUG = 0 + +# TODO +# ---- +# (maybe) thread safety +# (maybe) color parser convenience functions +# documentation (including all event types) +# (maybe) add awareness of different types of ircds +# send data asynchronously to the server (and DCC connections) +# (maybe) automatically close unused, passive DCC connections after a while + +# NOTES +# ----- +# connection.quit() only sends QUIT to the server. +# ERROR from the server triggers the error event and the disconnect event. +# dropping of the connection triggers the disconnect event. + +class IRCError(Exception): + """Represents an IRC exception.""" + pass + + +class IRC: + """Class that handles one or several IRC server connections. + + When an IRC object has been instantiated, it can be used to create + Connection objects that represent the IRC connections. The + responsibility of the IRC object is to provide an event-driven + framework for the connections and to keep the connections alive. + It runs a select loop to poll each connection's TCP socket and + hands over the sockets with incoming data for processing by the + corresponding connection. + + The methods of most interest for an IRC client writer are server, + add_global_handler, remove_global_handler, execute_at, + execute_delayed, process_once and process_forever. + + Here is an example: + + irc = irclib.IRC() + server = irc.server() + server.connect(\"irc.some.where\", 6667, \"my_nickname\") + server.privmsg(\"a_nickname\", \"Hi there!\") + irc.process_forever() + + This will connect to the IRC server irc.some.where on port 6667 + using the nickname my_nickname and send the message \"Hi there!\" + to the nickname a_nickname. 
+ """ + + def __init__(self, fn_to_add_socket=None, + fn_to_remove_socket=None, + fn_to_add_timeout=None): + """Constructor for IRC objects. + + Optional arguments are fn_to_add_socket, fn_to_remove_socket + and fn_to_add_timeout. The first two specify functions that + will be called with a socket object as argument when the IRC + object wants to be notified (or stop being notified) of data + coming on a new socket. When new data arrives, the method + process_data should be called. Similarly, fn_to_add_timeout + is called with a number of seconds (a floating point number) + as first argument when the IRC object wants to receive a + notification (by calling the process_timeout method). So, if + e.g. the argument is 42.17, the object wants the + process_timeout method to be called after 42 seconds and 170 + milliseconds. + + The three arguments mainly exist to be able to use an external + main loop (for example Tkinter's or PyGTK's main app loop) + instead of calling the process_forever method. + + An alternative is to just call ServerConnection.process_once() + once in a while. + """ + + if fn_to_add_socket and fn_to_remove_socket: + self.fn_to_add_socket = fn_to_add_socket + self.fn_to_remove_socket = fn_to_remove_socket + else: + self.fn_to_add_socket = None + self.fn_to_remove_socket = None + + self.fn_to_add_timeout = fn_to_add_timeout + self.connections = [] + self.handlers = {} + self.delayed_commands = [] # list of tuples in the format (time, function, arguments) + + self.add_global_handler("ping", _ping_ponger, -42) + + def server(self): + """Creates and returns a ServerConnection object.""" + + c = ServerConnection(self) + self.connections.append(c) + return c + + def process_data(self, sockets): + """Called when there is more data to read on connection sockets. + + Arguments: + + sockets -- A list of socket objects. + + See documentation for IRC.__init__. 
+ """ + for s in sockets: + for c in self.connections: + if s == c._get_socket(): + c.process_data() + + def process_timeout(self): + """Called when a timeout notification is due. + + See documentation for IRC.__init__. + """ + t = time.time() + while self.delayed_commands: + if t >= self.delayed_commands[0][0]: + self.delayed_commands[0][1](*self.delayed_commands[0][2]) + del self.delayed_commands[0] + else: + break + + def process_once(self, timeout=0): + """Process data from connections once. + + Arguments: + + timeout -- How long the select() call should wait if no + data is available. + + This method should be called periodically to check and process + incoming data, if there are any. If that seems boring, look + at the process_forever method. + """ + sockets = map(lambda x: x._get_socket(), self.connections) + sockets = filter(lambda x: x != None, sockets) + if sockets: + (i, o, e) = select.select(sockets, [], [], timeout) + self.process_data(i) + else: + time.sleep(timeout) + self.process_timeout() + + def process_forever(self, timeout=0.2): + """Run an infinite loop, processing data from connections. + + This method repeatedly calls process_once. + + Arguments: + + timeout -- Parameter to pass to process_once. + """ + while 1: + self.process_once(timeout) + + def disconnect_all(self, message=""): + """Disconnects all connections.""" + for c in self.connections: + c.disconnect(message) + + def add_global_handler(self, event, handler, priority=0): + """Adds a global handler function for a specific event type. + + Arguments: + + event -- Event type (a string). Check the values of the + numeric_events dictionary in irclib.py for possible event + types. + + handler -- Callback function. + + priority -- A number (the lower number, the higher priority). + + The handler function is called whenever the specified event is + triggered in any of the connections. See documentation for + the Event class. 
+ + The handler functions are called in priority order (lowest + number is highest priority). If a handler function returns + \"NO MORE\", no more handlers will be called. + """ + if not event in self.handlers: + self.handlers[event] = [] + bisect.insort(self.handlers[event], ((priority, handler))) + + def remove_global_handler(self, event, handler): + """Removes a global handler function. + + Arguments: + + event -- Event type (a string). + + handler -- Callback function. + + Returns 1 on success, otherwise 0. + """ + if not event in self.handlers: + return 0 + for h in self.handlers[event]: + if handler == h[1]: + self.handlers[event].remove(h) + return 1 + + def execute_at(self, at, function, arguments=()): + """Execute a function at a specified time. + + Arguments: + + at -- Execute at this time (standard \"time_t\" time). + + function -- Function to call. + + arguments -- Arguments to give the function. + """ + self.execute_delayed(at-time.time(), function, arguments) + + def execute_delayed(self, delay, function, arguments=()): + """Execute a function after a specified time. + + Arguments: + + delay -- How many seconds to wait. + + function -- Function to call. + + arguments -- Arguments to give the function. + """ + bisect.insort(self.delayed_commands, (delay+time.time(), function, arguments)) + if self.fn_to_add_timeout: + self.fn_to_add_timeout(delay) + + def dcc(self, dcctype="chat"): + """Creates and returns a DCCConnection object. + + Arguments: + + dcctype -- "chat" for DCC CHAT connections or "raw" for + DCC SEND (or other DCC types). If "chat", + incoming data will be split in newline-separated + chunks. If "raw", incoming data is not touched. 
+ """ + c = DCCConnection(self, dcctype) + self.connections.append(c) + return c + + def _handle_event(self, connection, event): + """[Internal]""" + h = self.handlers + for handler in h.get("all_events", []) + h.get(event.eventtype(), []): + if handler[1](connection, event) == "NO MORE": + return + + def _remove_connection(self, connection): + """[Internal]""" + self.connections.remove(connection) + if self.fn_to_remove_socket: + self.fn_to_remove_socket(connection._get_socket()) + +_rfc_1459_command_regexp = re.compile("^(:(?P<prefix>[^ ]+) +)?(?P<command>[^ ]+)( *(?P<argument> .+))?") + +class Connection: + """Base class for IRC connections. + + Must be overridden. + """ + def __init__(self, irclibobj): + self.irclibobj = irclibobj + + def _get_socket(): + raise IRCError, "Not overridden" + + ############################## + ### Convenience wrappers. + + def execute_at(self, at, function, arguments=()): + self.irclibobj.execute_at(at, function, arguments) + + def execute_delayed(self, delay, function, arguments=()): + self.irclibobj.execute_delayed(delay, function, arguments) + + +class ServerConnectionError(IRCError): + pass + +class ServerNotConnectedError(ServerConnectionError): + pass + + +# Huh!? Crrrrazy EFNet doesn't follow the RFC: their ircd seems to +# use \n as message separator! :P +_linesep_regexp = re.compile("\r?\n") + +class ServerConnection(Connection): + """This class represents an IRC server connection. + + ServerConnection objects are instantiated by calling the server + method on an IRC object. + """ + + def __init__(self, irclibobj): + Connection.__init__(self, irclibobj) + self.connected = 0 # Not connected yet. + self.socket = None + self.ssl = None + + def connect(self, server, port, nickname, password=None, username=None, + ircname=None, localaddress="", localport=0, ssl=False, ipv6=False): + """Connect/reconnect to a server. + + Arguments: + + server -- Server name. + + port -- Port number. + + nickname -- The nickname. 
+ + password -- Password (if any). + + username -- The username. + + ircname -- The IRC name ("realname"). + + localaddress -- Bind the connection to a specific local IP address. + + localport -- Bind the connection to a specific local port. + + ssl -- Enable support for ssl. + + ipv6 -- Enable support for ipv6. + + This function can be called to reconnect a closed connection. + + Returns the ServerConnection object. + """ + if self.connected: + self.disconnect("Changing servers") + + self.previous_buffer = "" + self.handlers = {} + self.real_server_name = "" + self.real_nickname = nickname + self.server = server + self.port = port + self.nickname = nickname + self.username = username or nickname + self.ircname = ircname or nickname + self.password = password + self.localaddress = localaddress + self.localport = localport + self.localhost = socket.gethostname() + if ipv6: + self.socket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) + else: + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + self.socket.bind((self.localaddress, self.localport)) + self.socket.connect((self.server, self.port)) + if ssl: + self.ssl = socket.ssl(self.socket) + except socket.error, x: + self.socket.close() + self.socket = None + raise ServerConnectionError, "Couldn't connect to socket: %s" % x + self.connected = 1 + if self.irclibobj.fn_to_add_socket: + self.irclibobj.fn_to_add_socket(self.socket) + + # Log on... + if self.password: + self.pass_(self.password) + self.nick(self.nickname) + self.user(self.username, self.ircname) + return self + + def close(self): + """Close the connection. + + This method closes the connection permanently; after it has + been called, the object is unusable. + """ + + self.disconnect("Closing object") + self.irclibobj._remove_connection(self) + + def _get_socket(self): + """[Internal]""" + return self.socket + + def get_server_name(self): + """Get the (real) server name. 
+ + This method returns the (real) server name, or, more + specifically, what the server calls itself. + """ + + if self.real_server_name: + return self.real_server_name + else: + return "" + + def get_nickname(self): + """Get the (real) nick name. + + This method returns the (real) nickname. The library keeps + track of nick changes, so it might not be the nick name that + was passed to the connect() method. """ + + return self.real_nickname + + def process_data(self): + """[Internal]""" + + try: + if self.ssl: + new_data = self.ssl.read(2**14) + else: + new_data = self.socket.recv(2**14) + except socket.error, x: + # The server hung up. + self.disconnect("Connection reset by peer") + return + if not new_data: + # Read nothing: connection must be down. + self.disconnect("Connection reset by peer") + return + + lines = _linesep_regexp.split(self.previous_buffer + new_data) + + # Save the last, unfinished line. + self.previous_buffer = lines.pop() + + for line in lines: + if DEBUG: + print "FROM SERVER:", line + + if not line: + continue + + prefix = None + command = None + arguments = None + self._handle_event(Event("all_raw_messages", + self.get_server_name(), + None, + [line])) + + m = _rfc_1459_command_regexp.match(line) + if m.group("prefix"): + prefix = m.group("prefix") + if not self.real_server_name: + self.real_server_name = prefix + + if m.group("command"): + command = m.group("command").lower() + + if m.group("argument"): + a = m.group("argument").split(" :", 1) + arguments = a[0].split() + if len(a) == 2: + arguments.append(a[1]) + + # Translate numerics into more readable strings. + if command in numeric_events: + command = numeric_events[command] + + if command == "nick": + if nm_to_n(prefix) == self.real_nickname: + self.real_nickname = arguments[0] + elif command == "welcome": + # Record the nickname in case the client changed nick + # in a nicknameinuse callback. 
+ self.real_nickname = arguments[0] + + if command in ["privmsg", "notice"]: + target, message = arguments[0], arguments[1] + messages = _ctcp_dequote(message) + + if command == "privmsg": + if is_channel(target): + command = "pubmsg" + else: + if is_channel(target): + command = "pubnotice" + else: + command = "privnotice" + + for m in messages: + if type(m) is types.TupleType: + if command in ["privmsg", "pubmsg"]: + command = "ctcp" + else: + command = "ctcpreply" + + m = list(m) + if DEBUG: + print "command: %s, source: %s, target: %s, arguments: %s" % ( + command, prefix, target, m) + self._handle_event(Event(command, prefix, target, m)) + if command == "ctcp" and m[0] == "ACTION": + self._handle_event(Event("action", prefix, target, m[1:])) + else: + if DEBUG: + print "command: %s, source: %s, target: %s, arguments: %s" % ( + command, prefix, target, [m]) + self._handle_event(Event(command, prefix, target, [m])) + else: + target = None + + if command == "quit": + arguments = [arguments[0]] + elif command == "ping": + target = arguments[0] + else: + target = arguments[0] + arguments = arguments[1:] + + if command == "mode": + if not is_channel(target): + command = "umode" + + if DEBUG: + print "command: %s, source: %s, target: %s, arguments: %s" % ( + command, prefix, target, arguments) + self._handle_event(Event(command, prefix, target, arguments)) + + def _handle_event(self, event): + """[Internal]""" + self.irclibobj._handle_event(self, event) + if event.eventtype() in self.handlers: + for fn in self.handlers[event.eventtype()]: + fn(self, event) + + def is_connected(self): + """Return connection status. + + Returns true if connected, otherwise false. + """ + return self.connected + + def add_global_handler(self, *args): + """Add global handler. + + See documentation for IRC.add_global_handler. + """ + self.irclibobj.add_global_handler(*args) + + def remove_global_handler(self, *args): + """Remove global handler. 
+ + See documentation for IRC.remove_global_handler. + """ + self.irclibobj.remove_global_handler(*args) + + def action(self, target, action): + """Send a CTCP ACTION command.""" + self.ctcp("ACTION", target, action) + + def admin(self, server=""): + """Send an ADMIN command.""" + self.send_raw(" ".join(["ADMIN", server]).strip()) + + def ctcp(self, ctcptype, target, parameter=""): + """Send a CTCP command.""" + ctcptype = ctcptype.upper() + self.privmsg(target, "\001%s%s\001" % (ctcptype, parameter and (" " + parameter) or "")) + + def ctcp_reply(self, target, parameter): + """Send a CTCP REPLY command.""" + self.notice(target, "\001%s\001" % parameter) + + def disconnect(self, message=""): + """Hang up the connection. + + Arguments: + + message -- Quit message. + """ + if not self.connected: + return + + self.connected = 0 + + self.quit(message) + + try: + self.socket.close() + except socket.error, x: + pass + self.socket = None + self._handle_event(Event("disconnect", self.server, "", [message])) + + def globops(self, text): + """Send a GLOBOPS command.""" + self.send_raw("GLOBOPS :" + text) + + def info(self, server=""): + """Send an INFO command.""" + self.send_raw(" ".join(["INFO", server]).strip()) + + def invite(self, nick, channel): + """Send an INVITE command.""" + self.send_raw(" ".join(["INVITE", nick, channel]).strip()) + + def ison(self, nicks): + """Send an ISON command. + + Arguments: + + nicks -- List of nicks. 
+ """ + self.send_raw("ISON " + " ".join(nicks)) + + def join(self, channel, key=""): + """Send a JOIN command.""" + self.send_raw("JOIN %s%s" % (channel, (key and (" " + key)))) + + def kick(self, channel, nick, comment=""): + """Send a KICK command.""" + self.send_raw("KICK %s %s%s" % (channel, nick, (comment and (" :" + comment)))) + + def links(self, remote_server="", server_mask=""): + """Send a LINKS command.""" + command = "LINKS" + if remote_server: + command = command + " " + remote_server + if server_mask: + command = command + " " + server_mask + self.send_raw(command) + + def list(self, channels=None, server=""): + """Send a LIST command.""" + command = "LIST" + if channels: + command = command + " " + ",".join(channels) + if server: + command = command + " " + server + self.send_raw(command) + + def lusers(self, server=""): + """Send a LUSERS command.""" + self.send_raw("LUSERS" + (server and (" " + server))) + + def mode(self, target, command): + """Send a MODE command.""" + self.send_raw("MODE %s %s" % (target, command)) + + def motd(self, server=""): + """Send an MOTD command.""" + self.send_raw("MOTD" + (server and (" " + server))) + + def names(self, channels=None): + """Send a NAMES command.""" + self.send_raw("NAMES" + (channels and (" " + ",".join(channels)) or "")) + + def nick(self, newnick): + """Send a NICK command.""" + self.send_raw("NICK " + newnick) + + def notice(self, target, text): + """Send a NOTICE command.""" + # Should limit len(text) here! 
+ self.send_raw("NOTICE %s :%s" % (target, text)) + + def oper(self, nick, password): + """Send an OPER command.""" + self.send_raw("OPER %s %s" % (nick, password)) + + def part(self, channels, message=""): + """Send a PART command.""" + if type(channels) == types.StringType: + self.send_raw("PART " + channels + (message and (" " + message))) + else: + self.send_raw("PART " + ",".join(channels) + (message and (" " + message))) + + def pass_(self, password): + """Send a PASS command.""" + self.send_raw("PASS " + password) + + def ping(self, target, target2=""): + """Send a PING command.""" + self.send_raw("PING %s%s" % (target, target2 and (" " + target2))) + + def pong(self, target, target2=""): + """Send a PONG command.""" + self.send_raw("PONG %s%s" % (target, target2 and (" " + target2))) + + def privmsg(self, target, text): + """Send a PRIVMSG command.""" + # Should limit len(text) here! + self.send_raw("PRIVMSG %s :%s" % (target, text)) + + def privmsg_many(self, targets, text): + """Send a PRIVMSG command to multiple targets.""" + # Should limit len(text) here! + self.send_raw("PRIVMSG %s :%s" % (",".join(targets), text)) + + def quit(self, message=""): + """Send a QUIT command.""" + # Note that many IRC servers don't use your QUIT message + # unless you've been connected for at least 5 minutes! + self.send_raw("QUIT" + (message and (" :" + message))) + + def send_raw(self, string): + """Send raw string to the server. + + The string will be padded with appropriate CR LF. + """ + if self.socket is None: + raise ServerNotConnectedError, "Not connected." + try: + if self.ssl: + self.ssl.write(string + "\r\n") + else: + self.socket.send(string + "\r\n") + if DEBUG: + print "TO SERVER:", string + except socket.error, x: + # Ouch! 
+ self.disconnect("Connection reset by peer.") + + def squit(self, server, comment=""): + """Send an SQUIT command.""" + self.send_raw("SQUIT %s%s" % (server, comment and (" :" + comment))) + + def stats(self, statstype, server=""): + """Send a STATS command.""" + self.send_raw("STATS %s%s" % (statstype, server and (" " + server))) + + def time(self, server=""): + """Send a TIME command.""" + self.send_raw("TIME" + (server and (" " + server))) + + def topic(self, channel, new_topic=None): + """Send a TOPIC command.""" + if new_topic is None: + self.send_raw("TOPIC " + channel) + else: + self.send_raw("TOPIC %s :%s" % (channel, new_topic)) + + def trace(self, target=""): + """Send a TRACE command.""" + self.send_raw("TRACE" + (target and (" " + target))) + + def user(self, username, realname): + """Send a USER command.""" + self.send_raw("USER %s 0 * :%s" % (username, realname)) + + def userhost(self, nicks): + """Send a USERHOST command.""" + self.send_raw("USERHOST " + ",".join(nicks)) + + def users(self, server=""): + """Send a USERS command.""" + self.send_raw("USERS" + (server and (" " + server))) + + def version(self, server=""): + """Send a VERSION command.""" + self.send_raw("VERSION" + (server and (" " + server))) + + def wallops(self, text): + """Send a WALLOPS command.""" + self.send_raw("WALLOPS :" + text) + + def who(self, target="", op=""): + """Send a WHO command.""" + self.send_raw("WHO%s%s" % (target and (" " + target), op and (" o"))) + + def whois(self, targets): + """Send a WHOIS command.""" + self.send_raw("WHOIS " + ",".join(targets)) + + def whowas(self, nick, max="", server=""): + """Send a WHOWAS command.""" + self.send_raw("WHOWAS %s%s%s" % (nick, + max and (" " + max), + server and (" " + server))) + +class DCCConnectionError(IRCError): + pass + + +class DCCConnection(Connection): + """This class represents a DCC connection. + + DCCConnection objects are instantiated by calling the dcc + method on an IRC object. 
+ """ + def __init__(self, irclibobj, dcctype): + Connection.__init__(self, irclibobj) + self.connected = 0 + self.passive = 0 + self.dcctype = dcctype + self.peeraddress = None + self.peerport = None + + def connect(self, address, port): + """Connect/reconnect to a DCC peer. + + Arguments: + address -- Host/IP address of the peer. + + port -- The port number to connect to. + + Returns the DCCConnection object. + """ + self.peeraddress = socket.gethostbyname(address) + self.peerport = port + self.socket = None + self.previous_buffer = "" + self.handlers = {} + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.passive = 0 + try: + self.socket.connect((self.peeraddress, self.peerport)) + except socket.error, x: + raise DCCConnectionError, "Couldn't connect to socket: %s" % x + self.connected = 1 + if self.irclibobj.fn_to_add_socket: + self.irclibobj.fn_to_add_socket(self.socket) + return self + + def listen(self): + """Wait for a connection/reconnection from a DCC peer. + + Returns the DCCConnection object. + + The local IP address and port are available as + self.localaddress and self.localport. After connection from a + peer, the peer address and port are available as + self.peeraddress and self.peerport. + """ + self.previous_buffer = "" + self.handlers = {} + self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.passive = 1 + try: + self.socket.bind((socket.gethostbyname(socket.gethostname()), 0)) + self.localaddress, self.localport = self.socket.getsockname() + self.socket.listen(10) + except socket.error, x: + raise DCCConnectionError, "Couldn't bind socket: %s" % x + return self + + def disconnect(self, message=""): + """Hang up the connection and close the object. + + Arguments: + + message -- Quit message. 
+ """ + if not self.connected: + return + + self.connected = 0 + try: + self.socket.close() + except socket.error, x: + pass + self.socket = None + self.irclibobj._handle_event( + self, + Event("dcc_disconnect", self.peeraddress, "", [message])) + self.irclibobj._remove_connection(self) + + def process_data(self): + """[Internal]""" + + if self.passive and not self.connected: + conn, (self.peeraddress, self.peerport) = self.socket.accept() + self.socket.close() + self.socket = conn + self.connected = 1 + if DEBUG: + print "DCC connection from %s:%d" % ( + self.peeraddress, self.peerport) + self.irclibobj._handle_event( + self, + Event("dcc_connect", self.peeraddress, None, None)) + return + + try: + new_data = self.socket.recv(2**14) + except socket.error, x: + # The server hung up. + self.disconnect("Connection reset by peer") + return + if not new_data: + # Read nothing: connection must be down. + self.disconnect("Connection reset by peer") + return + + if self.dcctype == "chat": + # The specification says lines are terminated with LF, but + # it seems safer to handle CR LF terminations too. + chunks = _linesep_regexp.split(self.previous_buffer + new_data) + + # Save the last, unfinished line. + self.previous_buffer = chunks[-1] + if len(self.previous_buffer) > 2**14: + # Bad peer! Naughty peer! + self.disconnect() + return + chunks = chunks[:-1] + else: + chunks = [new_data] + + command = "dccmsg" + prefix = self.peeraddress + target = None + for chunk in chunks: + if DEBUG: + print "FROM PEER:", chunk + arguments = [chunk] + if DEBUG: + print "command: %s, source: %s, target: %s, arguments: %s" % ( + command, prefix, target, arguments) + self.irclibobj._handle_event( + self, + Event(command, prefix, target, arguments)) + + def _get_socket(self): + """[Internal]""" + return self.socket + + def privmsg(self, string): + """Send data to DCC peer. + + The string will be padded with appropriate LF if it's a DCC + CHAT session. 
+ """ + try: + self.socket.send(string) + if self.dcctype == "chat": + self.socket.send("\n") + if DEBUG: + print "TO PEER: %s\n" % string + except socket.error, x: + # Ouch! + self.disconnect("Connection reset by peer.") + +class SimpleIRCClient: + """A simple single-server IRC client class. + + This is an example of an object-oriented wrapper of the IRC + framework. A real IRC client can be made by subclassing this + class and adding appropriate methods. + + The method on_join will be called when a "join" event is created + (which is done when the server sends a JOIN messsage/command), + on_privmsg will be called for "privmsg" events, and so on. The + handler methods get two arguments: the connection object (same as + self.connection) and the event object. + + Instance attributes that can be used by sub classes: + + ircobj -- The IRC instance. + + connection -- The ServerConnection instance. + + dcc_connections -- A list of DCCConnection instances. + """ + def __init__(self): + self.ircobj = IRC() + self.connection = self.ircobj.server() + self.dcc_connections = [] + self.ircobj.add_global_handler("all_events", self._dispatcher, -10) + self.ircobj.add_global_handler("dcc_disconnect", self._dcc_disconnect, -10) + + def _dispatcher(self, c, e): + """[Internal]""" + m = "on_" + e.eventtype() + if hasattr(self, m): + getattr(self, m)(c, e) + + def _dcc_disconnect(self, c, e): + self.dcc_connections.remove(c) + + def connect(self, server, port, nickname, password=None, username=None, + ircname=None, localaddress="", localport=0, ssl=False, ipv6=False): + """Connect/reconnect to a server. + + Arguments: + + server -- Server name. + + port -- Port number. + + nickname -- The nickname. + + password -- Password (if any). + + username -- The username. + + ircname -- The IRC name. + + localaddress -- Bind the connection to a specific local IP address. + + localport -- Bind the connection to a specific local port. + + ssl -- Enable support for ssl. 
+ + ipv6 -- Enable support for ipv6. + + This function can be called to reconnect a closed connection. + """ + self.connection.connect(server, port, nickname, + password, username, ircname, + localaddress, localport, ssl, ipv6) + + def dcc_connect(self, address, port, dcctype="chat"): + """Connect to a DCC peer. + + Arguments: + + address -- IP address of the peer. + + port -- Port to connect to. + + Returns a DCCConnection instance. + """ + dcc = self.ircobj.dcc(dcctype) + self.dcc_connections.append(dcc) + dcc.connect(address, port) + return dcc + + def dcc_listen(self, dcctype="chat"): + """Listen for connections from a DCC peer. + + Returns a DCCConnection instance. + """ + dcc = self.ircobj.dcc(dcctype) + self.dcc_connections.append(dcc) + dcc.listen() + return dcc + + def start(self): + """Start the IRC client.""" + self.ircobj.process_forever() + + +class Event: + """Class representing an IRC event.""" + def __init__(self, eventtype, source, target, arguments=None): + """Constructor of Event objects. + + Arguments: + + eventtype -- A string describing the event. + + source -- The originator of the event (a nick mask or a server). + + target -- The target of the event (a nick or a channel). + + arguments -- Any event specific arguments. 
+ """ + self._eventtype = eventtype + self._source = source + self._target = target + if arguments: + self._arguments = arguments + else: + self._arguments = [] + + def eventtype(self): + """Get the event type.""" + return self._eventtype + + def source(self): + """Get the event source.""" + return self._source + + def target(self): + """Get the event target.""" + return self._target + + def arguments(self): + """Get the event arguments.""" + return self._arguments + +_LOW_LEVEL_QUOTE = "\020" +_CTCP_LEVEL_QUOTE = "\134" +_CTCP_DELIMITER = "\001" + +_low_level_mapping = { + "0": "\000", + "n": "\n", + "r": "\r", + _LOW_LEVEL_QUOTE: _LOW_LEVEL_QUOTE +} + +_low_level_regexp = re.compile(_LOW_LEVEL_QUOTE + "(.)") + +def mask_matches(nick, mask): + """Check if a nick matches a mask. + + Returns true if the nick matches, otherwise false. + """ + nick = irc_lower(nick) + mask = irc_lower(mask) + mask = mask.replace("\\", "\\\\") + for ch in ".$|[](){}+": + mask = mask.replace(ch, "\\" + ch) + mask = mask.replace("?", ".") + mask = mask.replace("*", ".*") + r = re.compile(mask, re.IGNORECASE) + return r.match(nick) + +_special = "-[]\\`^{}" +nick_characters = string.ascii_letters + string.digits + _special +_ircstring_translation = string.maketrans(string.ascii_uppercase + "[]\\^", + string.ascii_lowercase + "{}|~") + +def irc_lower(s): + """Returns a lowercased string. + + The definition of lowercased comes from the IRC specification (RFC + 1459). + """ + return s.translate(_ircstring_translation) + +def _ctcp_dequote(message): + """[Internal] Dequote a message according to CTCP specifications. + + The function returns a list where each element can be either a + string (normal message) or a tuple of one or two strings (tagged + messages). If a tuple has only one element (ie is a singleton), + that element is the tag; otherwise the tuple has two elements: the + tag and the data. + + Arguments: + + message -- The message to be decoded. 
+ """ + + def _low_level_replace(match_obj): + ch = match_obj.group(1) + + # If low_level_mapping doesn't have the character as key, we + # should just return the character. + return _low_level_mapping.get(ch, ch) + + if _LOW_LEVEL_QUOTE in message: + # Yup, there was a quote. Release the dequoter, man! + message = _low_level_regexp.sub(_low_level_replace, message) + + if _CTCP_DELIMITER not in message: + return [message] + else: + # Split it into parts. (Does any IRC client actually *use* + # CTCP stacking like this?) + chunks = message.split(_CTCP_DELIMITER) + + messages = [] + i = 0 + while i < len(chunks)-1: + # Add message if it's non-empty. + if len(chunks[i]) > 0: + messages.append(chunks[i]) + + if i < len(chunks)-2: + # Aye! CTCP tagged data ahead! + messages.append(tuple(chunks[i+1].split(" ", 1))) + + i = i + 2 + + if len(chunks) % 2 == 0: + # Hey, a lonely _CTCP_DELIMITER at the end! This means + # that the last chunk, including the delimiter, is a + # normal message! (This is according to the CTCP + # specification.) + messages.append(_CTCP_DELIMITER + chunks[-1]) + + return messages + +def is_channel(string): + """Check if a string is a channel name. + + Returns true if the argument is a channel name, otherwise false. + """ + return string and string[0] in "#&+!" + +def ip_numstr_to_quad(num): + """Convert an IP number as an integer given in ASCII + representation (e.g. '3232235521') to an IP address string + (e.g. '192.168.0.1').""" + n = long(num) + p = map(str, map(int, [n >> 24 & 0xFF, n >> 16 & 0xFF, + n >> 8 & 0xFF, n & 0xFF])) + return ".".join(p) + +def ip_quad_to_numstr(quad): + """Convert an IP address string (e.g. '192.168.0.1') to an IP + number as an integer given in ASCII representation + (e.g. '3232235521').""" + p = map(long, quad.split(".")) + s = str((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]) + if s[-1] == "L": + s = s[:-1] + return s + +def nm_to_n(s): + """Get the nick part of a nickmask. 
+ + (The source of an Event is a nickmask.) + """ + return s.split("!")[0] + +def nm_to_uh(s): + """Get the userhost part of a nickmask. + + (The source of an Event is a nickmask.) + """ + return s.split("!")[1] + +def nm_to_h(s): + """Get the host part of a nickmask. + + (The source of an Event is a nickmask.) + """ + return s.split("@")[1] + +def nm_to_u(s): + """Get the user part of a nickmask. + + (The source of an Event is a nickmask.) + """ + s = s.split("!")[1] + return s.split("@")[0] + +def parse_nick_modes(mode_string): + """Parse a nick mode string. + + The function returns a list of lists with three members: sign, + mode and argument. The sign is \"+\" or \"-\". The argument is + always None. + + Example: + + >>> irclib.parse_nick_modes(\"+ab-c\") + [['+', 'a', None], ['+', 'b', None], ['-', 'c', None]] + """ + + return _parse_modes(mode_string, "") + +def parse_channel_modes(mode_string): + """Parse a channel mode string. + + The function returns a list of lists with three members: sign, + mode and argument. The sign is \"+\" or \"-\". The argument is + None if mode isn't one of \"b\", \"k\", \"l\", \"v\" or \"o\". + + Example: + + >>> irclib.parse_channel_modes(\"+ab-c foo\") + [['+', 'a', None], ['+', 'b', 'foo'], ['-', 'c', None]] + """ + + return _parse_modes(mode_string, "bklvo") + +def _parse_modes(mode_string, unary_modes=""): + """[Internal]""" + modes = [] + arg_count = 0 + + # State variable. 
+ sign = "" + + a = mode_string.split() + if len(a) == 0: + return [] + else: + mode_part, args = a[0], a[1:] + + if mode_part[0] not in "+-": + return [] + for ch in mode_part: + if ch in "+-": + sign = ch + elif ch == " ": + collecting_arguments = 1 + elif ch in unary_modes: + if len(args) >= arg_count + 1: + modes.append([sign, ch, args[arg_count]]) + arg_count = arg_count + 1 + else: + modes.append([sign, ch, None]) + else: + modes.append([sign, ch, None]) + return modes + +def _ping_ponger(connection, event): + """[Internal]""" + connection.pong(event.target()) + +# Numeric table mostly stolen from the Perl IRC module (Net::IRC). +numeric_events = { + "001": "welcome", + "002": "yourhost", + "003": "created", + "004": "myinfo", + "005": "featurelist", # XXX + "200": "tracelink", + "201": "traceconnecting", + "202": "tracehandshake", + "203": "traceunknown", + "204": "traceoperator", + "205": "traceuser", + "206": "traceserver", + "207": "traceservice", + "208": "tracenewtype", + "209": "traceclass", + "210": "tracereconnect", + "211": "statslinkinfo", + "212": "statscommands", + "213": "statscline", + "214": "statsnline", + "215": "statsiline", + "216": "statskline", + "217": "statsqline", + "218": "statsyline", + "219": "endofstats", + "221": "umodeis", + "231": "serviceinfo", + "232": "endofservices", + "233": "service", + "234": "servlist", + "235": "servlistend", + "241": "statslline", + "242": "statsuptime", + "243": "statsoline", + "244": "statshline", + "250": "luserconns", + "251": "luserclient", + "252": "luserop", + "253": "luserunknown", + "254": "luserchannels", + "255": "luserme", + "256": "adminme", + "257": "adminloc1", + "258": "adminloc2", + "259": "adminemail", + "261": "tracelog", + "262": "endoftrace", + "263": "tryagain", + "265": "n_local", + "266": "n_global", + "300": "none", + "301": "away", + "302": "userhost", + "303": "ison", + "305": "unaway", + "306": "nowaway", + "311": "whoisuser", + "312": "whoisserver", + "313": 
"whoisoperator", + "314": "whowasuser", + "315": "endofwho", + "316": "whoischanop", + "317": "whoisidle", + "318": "endofwhois", + "319": "whoischannels", + "321": "liststart", + "322": "list", + "323": "listend", + "324": "channelmodeis", + "329": "channelcreate", + "331": "notopic", + "332": "currenttopic", + "333": "topicinfo", + "341": "inviting", + "342": "summoning", + "346": "invitelist", + "347": "endofinvitelist", + "348": "exceptlist", + "349": "endofexceptlist", + "351": "version", + "352": "whoreply", + "353": "namreply", + "361": "killdone", + "362": "closing", + "363": "closeend", + "364": "links", + "365": "endoflinks", + "366": "endofnames", + "367": "banlist", + "368": "endofbanlist", + "369": "endofwhowas", + "371": "info", + "372": "motd", + "373": "infostart", + "374": "endofinfo", + "375": "motdstart", + "376": "endofmotd", + "377": "motd2", # 1997-10-16 -- tkil + "381": "youreoper", + "382": "rehashing", + "384": "myportis", + "391": "time", + "392": "usersstart", + "393": "users", + "394": "endofusers", + "395": "nousers", + "401": "nosuchnick", + "402": "nosuchserver", + "403": "nosuchchannel", + "404": "cannotsendtochan", + "405": "toomanychannels", + "406": "wasnosuchnick", + "407": "toomanytargets", + "409": "noorigin", + "411": "norecipient", + "412": "notexttosend", + "413": "notoplevel", + "414": "wildtoplevel", + "421": "unknowncommand", + "422": "nomotd", + "423": "noadmininfo", + "424": "fileerror", + "431": "nonicknamegiven", + "432": "erroneusnickname", # Thiss iz how its speld in thee RFC. 
+ "433": "nicknameinuse", + "436": "nickcollision", + "437": "unavailresource", # "Nick temporally unavailable" + "441": "usernotinchannel", + "442": "notonchannel", + "443": "useronchannel", + "444": "nologin", + "445": "summondisabled", + "446": "usersdisabled", + "451": "notregistered", + "461": "needmoreparams", + "462": "alreadyregistered", + "463": "nopermforhost", + "464": "passwdmismatch", + "465": "yourebannedcreep", # I love this one... + "466": "youwillbebanned", + "467": "keyset", + "471": "channelisfull", + "472": "unknownmode", + "473": "inviteonlychan", + "474": "bannedfromchan", + "475": "badchannelkey", + "476": "badchanmask", + "477": "nochanmodes", # "Channel doesn't support modes" + "478": "banlistfull", + "481": "noprivileges", + "482": "chanoprivsneeded", + "483": "cantkillserver", + "484": "restricted", # Connection is restricted + "485": "uniqopprivsneeded", + "491": "nooperhost", + "492": "noservicehost", + "501": "umodeunknownflag", + "502": "usersdontmatch", +} + +generated_events = [ + # Generated events + "dcc_connect", + "dcc_disconnect", + "dccmsg", + "disconnect", + "ctcp", + "ctcpreply", +] + +protocol_events = [ + # IRC protocol events + "error", + "join", + "kick", + "mode", + "part", + "ping", + "privmsg", + "privnotice", + "pubmsg", + "pubnotice", + "quit", + "invite", + "pong", +] + +all_events = generated_events + protocol_events + numeric_events.values() diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/__init__.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/__init__.py new file mode 100644 index 0000000..4bb20aa --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/__init__.py @@ -0,0 +1,140 @@ +__all__ = [ + 'AbstractBasicAuthHandler', + 'AbstractDigestAuthHandler', + 'BaseHandler', + 'Browser', + 'BrowserStateError', + 'CacheFTPHandler', + 'ContentTooShortError', + 'Cookie', + 'CookieJar', + 'CookiePolicy', + 'DefaultCookiePolicy', + 
__all__ = [
    'AbstractBasicAuthHandler',
    'AbstractDigestAuthHandler',
    'BaseHandler',
    'Browser',
    'BrowserStateError',
    'CacheFTPHandler',
    'ContentTooShortError',
    'Cookie',
    'CookieJar',
    'CookiePolicy',
    'DefaultCookiePolicy',
    'DefaultFactory',
    'FTPHandler',
    'Factory',
    'FileCookieJar',
    'FileHandler',
    'FormNotFoundError',
    'FormsFactory',
    'HTTPBasicAuthHandler',
    'HTTPCookieProcessor',
    'HTTPDefaultErrorHandler',
    'HTTPDigestAuthHandler',
    'HTTPEquivProcessor',
    'HTTPError',
    'HTTPErrorProcessor',
    'HTTPHandler',
    'HTTPPasswordMgr',
    'HTTPPasswordMgrWithDefaultRealm',
    'HTTPProxyPasswordMgr',
    'HTTPRedirectDebugProcessor',
    'HTTPRedirectHandler',
    'HTTPRefererProcessor',
    'HTTPRefreshProcessor',
    'HTTPRequestUpgradeProcessor',
    'HTTPResponseDebugProcessor',
    'HTTPRobotRulesProcessor',
    'HTTPSClientCertMgr',
    'HTTPSHandler',
    'HeadParser',
    'History',
    'LWPCookieJar',
    'Link',
    'LinkNotFoundError',
    'LinksFactory',
    'LoadError',
    'MSIECookieJar',
    'MozillaCookieJar',
    'OpenerDirector',
    'OpenerFactory',
    'ParseError',
    'ProxyBasicAuthHandler',
    'ProxyDigestAuthHandler',
    'ProxyHandler',
    'Request',
    'ResponseUpgradeProcessor',
    'RobotExclusionError',
    'RobustFactory',
    'RobustFormsFactory',
    'RobustLinksFactory',
    'RobustTitleFactory',
    'SeekableProcessor',
    'SeekableResponseOpener',
    'TitleFactory',
    'URLError',
    'USE_BARE_EXCEPT',
    'UnknownHandler',
    'UserAgent',
    'UserAgentBase',
    'XHTMLCompatibleHeadParser',
    '__version__',
    'build_opener',
    'install_opener',
    'lwp_cookie_str',
    'make_response',
    'request_host',
    'response_seek_wrapper',  # XXX deprecate in public interface?
    # BUG FIX: a comma was missing after 'seek_wrapped_response', so Python
    # silently concatenated it with 'str2time' into the bogus name
    # 'seek_wrapped_responsestr2time', dropping BOTH names from __all__.
    'seek_wrapped_response',  # XXX should probably use this internally in place of response_seek_wrapper()
    'str2time',
    'urlopen',
    'urlretrieve']

import logging
import sys

from _mechanize import __version__

# high-level stateful browser-style interface
from _mechanize import \
     Browser, History, \
     BrowserStateError, LinkNotFoundError, FormNotFoundError

# configurable URL-opener interface
from _useragent import UserAgentBase, UserAgent
from _html import \
     ParseError, \
     Link, \
     Factory, DefaultFactory, RobustFactory, \
     FormsFactory, LinksFactory, TitleFactory, \
     RobustFormsFactory, RobustLinksFactory, RobustTitleFactory

# urllib2 work-alike interface (part from mechanize, part from urllib2)
# This is a superset of the urllib2 interface.
from _urllib2 import *

# misc
from _opener import ContentTooShortError, OpenerFactory, urlretrieve
from _util import http2time as str2time
from _response import \
     response_seek_wrapper, seek_wrapped_response, make_response
from _http import HeadParser
try:
    from _http import XHTMLCompatibleHeadParser
except ImportError:
    pass

# cookies
from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
     CookieJar, FileCookieJar, LoadError, request_host_lc as request_host, \
     effective_request_host
from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
# 2.4 raises SyntaxError due to generator / try/finally use
if sys.version_info[:2] > (2,4):
    try:
        import sqlite3
    except ImportError:
        pass
    else:
        from _firefox3cookiejar import Firefox3CookieJar
from _mozillacookiejar import MozillaCookieJar
from _msiecookiejar import MSIECookieJar

# If you hate the idea of turning bugs into warnings, do:
# import mechanize; mechanize.USE_BARE_EXCEPT = False
USE_BARE_EXCEPT = True

logger = logging.getLogger("mechanize")
# Use == rather than "is": logging.NOTSET is the int 0, and identity
# comparison against an int only works via CPython's small-int caching.
if logger.level == logging.NOTSET:
    logger.setLevel(logging.CRITICAL)
del logger
"""HTTP Authentication and Proxy support.

All but HTTPProxyPasswordMgr come from Python 2.5.


Copyright 2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import base64
import copy
import os
import posixpath
import random
import re
import time
import urlparse

# hashlib appeared in Python 2.5; fall back to the old md5/sha modules so the
# same sha1_digest/md5_digest helpers exist either way.
try:
    import hashlib
except ImportError:
    import md5
    import sha
    def sha1_digest(bytes):
        return sha.new(bytes).hexdigest()
    def md5_digest(bytes):
        return md5.new(bytes).hexdigest()
else:
    def sha1_digest(bytes):
        return hashlib.sha1(bytes).hexdigest()
    def md5_digest(bytes):
        return hashlib.md5(bytes).hexdigest()

from urllib2 import BaseHandler, HTTPError, parse_keqv_list, parse_http_list
from urllib import getproxies, unquote, splittype, splituser, splitpasswd, \
     splitport


def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.2.1
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport

class ProxyHandler(BaseHandler):
    """urllib2-style handler that reroutes requests through configured proxies.

    One <scheme>_open method is synthesized per proxy mapping entry, so the
    OpenerDirector dispatches matching requests here.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies: mapping of URL scheme -> proxy URL/authority; defaults to
        # the environment's proxy settings (urllib.getproxies()).
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # Bind url/type/meth as lambda defaults so each synthesized
            # <scheme>_open captures its own values, not the loop's last.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at the proxy; add Basic proxy credentials if present."""
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.encodestring(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
class HTTPPasswordMgr:
    """Store of (user, password) keyed by realm and reduced URI prefixes."""

    def __init__(self):
        # passwd: realm -> {tuple of reduced URIs -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        # Store under both the default-port and as-given reductions so later
        # lookups match whether or not the port was spelled out.
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.iteritems():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize a port-less authority to scheme's default port.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False


class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Like HTTPPasswordMgr, but falls back to credentials stored under
    realm None when no realm-specific match exists."""

    def find_user_password(self, realm, authuri):
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)


class AbstractBasicAuthHandler:
    """Shared logic for HTTP and proxy Basic authentication retries."""

    # Matches e.g. 'Basic realm="foo"' in a (Proxy-/WWW-)Authenticate header.
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-issue req with a Basic Authorization header, if we have creds."""
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.encodestring(raw).strip()
            # If this exact header was already sent and we got here anyway,
            # the credentials were rejected: stop instead of looping.
            if req.headers.get(self.auth_header, None) == auth:
                return None
            newreq = copy.copy(req)
            newreq.add_header(self.auth_header, auth)
            newreq.visit = False
            return self.parent.open(newreq)
        else:
            return None


class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handles 401 responses using Basic authentication."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Handles 407 responses using Basic authentication against a proxy."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)


def randombytes(n):
    """Return n random bytes."""
    # Use /dev/urandom if it is available.  Fall back to random module
    # if not.  It might be worthwhile to extend this function to use
    # other platform-specific mechanisms for getting random bytes.
    if os.path.exists("/dev/urandom"):
        f = open("/dev/urandom")
        s = f.read(n)
        f.close()
        return s
    else:
        L = [chr(random.randrange(0, 256)) for i in range(n)]
        return "".join(L)

class AbstractDigestAuthHandler:
    """Shared logic for HTTP and proxy Digest authentication retries."""
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        # retried: consecutive failed attempts; nonce_count: nc for qop=auth.
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue req with a Digest Authorization header built from auth."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            # Same header already sent and rejected: give up.
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            newreq = copy.copy(req)
            newreq.add_unredirected_header(self.auth_header, auth_val)
            newreq.visit = False
            return self.parent.open(newreq)

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = sha1_digest("%s:%s:%s:%s" % (self.nonce_count, nonce,
                                           time.ctime(), randombytes(8)))
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization field value, or None on failure."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            # NOTE(review): if qop is present but not "auth", respdig (and
            # ncvalue/cnonce below) are never bound, so building "base"
            # raises NameError -- confirm whether that failure is intended.
            pass

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (hash, key-digest) callables for the challenge algorithm."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = md5_digest
        elif algorithm == 'SHA':
            H = sha1_digest
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None


class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Handles 407 responses using Digest authentication against a proxy."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry


# XXX ugly implementation, should probably not bother deriving
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
    # has default realm and host/port
    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence; None means "any URI".
        if uri is None or isinstance(uri, basestring):
            uris = [uri]
        else:
            uris = uri
        passwd_by_domain = self.passwd.setdefault(realm, {})
        for uri in uris:
            for default_port in True, False:
                reduced_uri = self.reduce_uri(uri, default_port)
                passwd_by_domain[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        attempts = [(realm, authuri), (None, authuri)]
        # bleh, want default realm to take precedence over default
        # URI/authority, hence this outer loop
        for default_uri in False, True:
            for realm, authuri in attempts:
                authinfo_by_domain = self.passwd.get(realm, {})
                for default_port in True, False:
                    reduced_authuri = self.reduce_uri(authuri, default_port)
                    for uri, authinfo in authinfo_by_domain.iteritems():
                        if uri is None and not default_uri:
                            continue
                        if self.is_suburi(uri, reduced_authuri):
                            return authinfo
                user, password = None, None

            if user is not None:
                break
        return user, password

    def reduce_uri(self, uri, default_port=True):
        # None (wildcard URI) passes through unreduced.
        if uri is None:
            return None
        return HTTPPasswordMgr.reduce_uri(self, uri, default_port)

    def is_suburi(self, base, test):
        if base is None:
            # default to the proxy's host/port
            hostport, path = test
            base = (hostport, "/")
        return HTTPPasswordMgr.is_suburi(self, base, test)


class HTTPSClientCertMgr(HTTPPasswordMgr):
    """Maps URIs to (key_file, cert_file) pairs for HTTPS client certs."""
    # implementation inheritance: this is not a proper subclass
    def add_key_cert(self, uri, key_file, cert_file):
        self.add_password(None, uri, key_file, cert_file)
    def find_key_cert(self, authuri):
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
v2.1.1
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
into a tree representation. It provides methods and Pythonic idioms
that make it easy to search and modify the tree.

A well-formed XML/HTML document will yield a well-formed data
structure. An ill-formed XML/HTML document will yield a
correspondingly ill-formed data structure. If your document is only
locally well-formed, you can use this library to find and process the
well-formed part of it. The BeautifulSoup class has heuristics for
obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup has no external dependencies. It works with Python 2.2
and up.

Beautiful Soup defines classes for four different parsing strategies:

 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
   language that kind of looks like XML.

 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
   or invalid.

 * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
   that trips up BeautifulSoup.

 * BeautifulSOAP, for making it easier to parse XML documents that use
   lots of subelements containing a single string, where you'd prefer
   they put that string into an attribute (such as SOAP messages).

You can subclass BeautifulStoneSoup or BeautifulSoup to create a
parsing strategy specific to an XML schema or a particular bizarre
HTML document. Typically your subclass would just override
SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
""" #"
from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "2.1.1"
__date__ = "$Date: 2004/10/18 00:14:20 $"
__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
__license__ = "PSF"

from sgmllib import SGMLParser, SGMLParseError
import types
import re
import sgmllib

#This code makes Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

class NullType(object):

    """Similar to NoneType with a corresponding singleton instance
    'Null' that, unlike None, accepts any message and returns itself.

    Examples:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    """

    # Every operation on Null yields Null (or a harmless no-op), so chained
    # navigation expressions never raise.
    def __new__(cls): return Null
    def __call__(self, *args, **kwargs): return Null
##    def __getstate__(self, *args): return Null
    def __getattr__(self, attr): return Null
    def __getitem__(self, item): return Null
    def __setattr__(self, attr, value): pass
    def __setitem__(self, item, value): pass
    def __len__(self): return 0
    # FIXME: is this a python bug? otherwise ``for x in Null: pass``
    # never terminates...
    def __iter__(self): return iter([])
    def __contains__(self, item): return False
    def __repr__(self): return "Null"
# The singleton; object.__new__ bypasses NullType.__new__ (which returns Null
# and would otherwise recurse before Null exists).
Null = object.__new__(NullType)
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # NOTE(review): the attrs={} defaults below are shared mutable defaults;
    # the methods only read them, so this is safe as written.

    def setup(self, parent=Null, previous=Null):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._first(self.fetchPrevious, name, attrs, text)

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)
    firstPrevious = findPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        r = Null
        l = self.fetchParents(name, attrs, 1)
        if l:
            r = l[0]
        return r
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    #These methods do the real heavy lifting.

    def _first(self, method, name, attrs, text):
        # Returns the first match from a fetch* method, or Null if none.
        r = Null
        l = method(name, attrs, text, 1)
        if l:
            r = l[0]
        return r

    def _fetch(self, name, attrs, text, limit, generator):
        "Iterates over a generator looking for things that match."
        # A bare (non-mapping) attrs value is shorthand for the CSS class.
        if not hasattr(attrs, 'items'):
            attrs = {'class' : attrs}

        results = []
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            found = None
            if isinstance(i, Tag):
                # Tags match on name + attributes, but only when no text
                # criterion was given; text criteria match text nodes only.
                if not text:
                    if not name or self._matches(i, name):
                        match = True
                        for attr, matchAgainst in attrs.items():
                            check = i.get(attr)
                            if not self._matches(check, matchAgainst):
                                match = False
                                break
                        if match:
                            found = i
            elif text:
                if self._matches(i, text):
                    found = i
            if found:
                results.append(found)
            if limit and len(results) >= limit:
                break
        return results

    #Generators that can be used to navigate starting from both
    #NavigableTexts and Tags.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def _matches(self, chunk, howToMatch):
        # Decide whether chunk satisfies a criterion that may be a string,
        # regexp object, callable, list, or mapping.  isList is a module-level
        # helper defined elsewhere in this file.
        #print 'looking for %s in %s' % (howToMatch, chunk)
        #
        # If given a list of items, return true if the list contains a
        # text element that matches.
        if isList(chunk) and not isinstance(chunk, Tag):
            for tag in chunk:
                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if isinstance(chunk, Tag):
            #Custom match methods take the tag as an argument, but all other
            #ways of matching match the tag name as a string
            chunk = chunk.name
        #Now we know that chunk is a string
        if not isinstance(chunk, basestring):
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        #It's just a string
        return str(howToMatch) == chunk

class NavigableText(PageElement):
    """Base for text nodes; navigable like a Tag but holds only text."""

    def __getattr__(self, attr):
        "For backwards compatibility, text.string gives you text"
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

class NavigableString(str, NavigableText):
    pass

class NavigableUnicodeString(unicode, NavigableText):
    pass
+ return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + fetch() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.fetch, args, kwargs) + + def __getattr__(self, tag): + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.first(tag[:-3]) + elif tag.find('__') != 0: + return self.first(tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. 
Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self): + """Renders this tag as a string.""" + return str(self) + + def __unicode__(self): + return self.__str__(1) + + def __str__(self, needUnicode=None, showStructureIndent=None): + """Returns a string or Unicode representation of this tag and + its contents. + + NOTE: since Python's HTML parser consumes whitespace, this + method is not certain to reproduce the whitespace present in + the original string.""" + + attrs = [] + if self.attrs: + for key, val in self.attrs: + attrs.append('%s="%s"' % (key, val)) + close = '' + closeTag = '' + if self.isSelfClosing(): + close = ' /' + else: + closeTag = '</%s>' % self.name + indentIncrement = None + if showStructureIndent != None: + indentIncrement = showStructureIndent + if not self.hidden: + indentIncrement += 1 + contents = self.renderContents(indentIncrement, needUnicode=needUnicode) + if showStructureIndent: + space = '\n%s' % (' ' * showStructureIndent) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if showStructureIndent: + s.append(space) + s.append('<%s%s%s>' % (self.name, attributeString, close)) + s.append(contents) + if closeTag and showStructureIndent != None: + s.append(space) + s.append(closeTag) + s = ''.join(s) + isUnicode = type(s) == types.UnicodeType + if needUnicode and not isUnicode: + s = unicode(s) + elif isUnicode and needUnicode==False: + s = str(s) + return s + + def prettify(self, needUnicode=None): + return 
self.__str__(needUnicode, showStructureIndent=True) + + def renderContents(self, showStructureIndent=None, needUnicode=None): + """Renders the contents of this tag as a (possibly Unicode) + string.""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType: + text = unicode(c) + elif isinstance(c, Tag): + s.append(c.__str__(needUnicode, showStructureIndent)) + elif needUnicode: + text = unicode(c) + else: + text = str(c) + if text: + if showStructureIndent != None: + if text[-1] == '\n': + text = text[:-1] + s.append(text) + return ''.join(s) + + #Soup methods + + def firstText(self, text, recursive=True): + """Convenience method to retrieve the first piece of text matching the + given criteria. 'text' can be a string, a regular expression object, + a callable that takes a string and returns whether or not the + string 'matches', etc.""" + return self.first(recursive=recursive, text=text) + + def fetchText(self, text, recursive=True, limit=None): + """Convenience method to retrieve all pieces of text matching the + given criteria. 'text' can be a string, a regular expression object, + a callable that takes a string and returns whether or not the + string 'matches', etc.""" + return self.fetch(recursive=recursive, text=text, limit=limit) + + def first(self, name=None, attrs={}, recursive=True, text=None): + """Return only the first child of this + Tag matching the given criteria.""" + r = Null + l = self.fetch(name, attrs, recursive, text, 1) + if l: + r = l[0] + return r + findChild = first + + def fetch(self, name=None, attrs={}, recursive=True, text=None, + limit=None): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. 
+ + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._fetch(name, attrs, text, limit, generator) + fetchChildren = fetch + + #Utility methods + + def isSelfClosing(self): + """Returns true iff this is a self-closing tag as defined in the HTML + standard. + + TODO: This is specific to BeautifulSoup and its subclasses, but it's + used by __str__""" + return self.name in BeautifulSoup.SELF_CLOSING_TAGS + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.contents.append(tag) + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def childGenerator(self): + for i in range(0, len(self.contents)): + yield self.contents[i] + raise StopIteration + + def recursiveChildGenerator(self): + stack = [(self, 0)] + while stack: + tag, start = stack.pop() + if isinstance(tag, Tag): + for i in range(start, len(tag.contents)): + a = tag.contents[i] + yield a + if isinstance(a, Tag) and tag.contents: + if i < len(tag.contents) - 1: + stack.append((tag, i+1)) + stack.append((a, 0)) + break + raise StopIteration + + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return hasattr(l, '__iter__') \ + or (type(l) in (types.ListType, types.TupleType)) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. 
+ Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out + of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +class BeautifulStoneSoup(Tag, SGMLParser): + + """This class contains the basic parser and fetch code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "<foo><bar></foo>" actually means + "<foo><bar></bar></foo>". + + [Another possible explanation is "<foo><bar /></foo>", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + + #As a public service we will by default silently replace MS smart quotes + #and similar characters with their HTML or ASCII equivalents. + MS_CHARS = { '\x80' : '€', + '\x81' : ' ', + '\x82' : '‚', + '\x83' : 'ƒ', + '\x84' : '„', + '\x85' : '…', + '\x86' : '†', + '\x87' : '‡', + '\x88' : '⁁', + '\x89' : '%', + '\x8A' : 'Š', + '\x8B' : '<', + '\x8C' : 'Œ', + '\x8D' : '?', + '\x8E' : 'Z', + '\x8F' : '?', + '\x90' : '?', + '\x91' : '‘', + '\x92' : '’', + '\x93' : '“', + '\x94' : '”', + '\x95' : '•', + '\x96' : '–', + '\x97' : '—', + '\x98' : '˜', + '\x99' : '™', + '\x9a' : 'š', + '\x9b' : '>', + '\x9c' : 'œ', + '\x9d' : '?', + '\x9e' : 'z', + '\x9f' : 'Ÿ',} + + PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda(x):x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda(x):'<!' 
+ x.group(1) + '>'), + (re.compile("([\x80-\x9f])"), + lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1))) + ] + + ROOT_TAG_NAME = '[document]' + + def __init__(self, text=None, avoidParserProblems=True, + initialTextIsEverything=True): + """Initialize this as the 'root tag' and feed in any text to + the parser. + + NOTE about avoidParserProblems: sgmllib will process most bad + HTML, and BeautifulSoup has tricks for dealing with some HTML + that kills sgmllib, but Beautiful Soup can nonetheless choke + or lose data if your data uses self-closing tags or + declarations incorrectly. By default, Beautiful Soup sanitizes + its input to avoid the vast majority of these problems. The + problems are relatively rare, even in bad HTML, so feel free + to pass in False to avoidParserProblems if they don't apply to + you, and you'll get better performance. The only reason I have + this turned on by default is so I don't get so many tech + support questions. + + The two most common instances of invalid HTML that will choke + sgmllib are fixed by the default parser massage techniques: + + <br/> (No space between name of closing tag and tag close) + <! --Comment--> (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + Tag.__init__(self, self.ROOT_TAG_NAME) + if avoidParserProblems \ + and not isList(avoidParserProblems): + avoidParserProblems = self.PARSER_MASSAGE + self.avoidParserProblems = avoidParserProblems + SGMLParser.__init__(self) + self.quoteStack = [] + self.hidden = 1 + self.reset() + if hasattr(text, 'read'): + #It's a file-type object. 
+ text = text.read() + if text: + self.feed(text) + if initialTextIsEverything: + self.done() + + def __getattr__(self, methodName): + """This method routes method call requests to either the SGMLParser + superclass or the Tag superclass, depending on the method name.""" + if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ + or methodName.find('do_') == 0: + return SGMLParser.__getattr__(self, methodName) + elif methodName.find('__') != 0: + return Tag.__getattr__(self, methodName) + else: + raise AttributeError + + def feed(self, text): + if self.avoidParserProblems: + for fix, m in self.avoidParserProblems: + text = fix.sub(m, text) + SGMLParser.feed(self, text) + + def done(self): + """Called when you're done parsing, so that the unclosed tags can be + correctly processed.""" + self.endData() #NEW + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + SGMLParser.reset(self) + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableText): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self): + currentData = ''.join(self.currentData) + if currentData: + if not currentData.strip(): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + c = NavigableString + if type(currentData) == types.UnicodeType: + c = NavigableUnicodeString + o = c(currentData) + 
o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + self.currentData = [] + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: + <p>Foo<b>Bar<p> should pop to 'p', not 'b'. + <p>Foo<table>Bar<p> should pop to 'table', not 'p'. + <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'. + <p>Foo<b>Bar<p> should pop to 'p', not 'b'. + + <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. + <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' + <td><tr><td> *<td>* should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. 
+ popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s" % name + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + self.endData() + if not name in self.SELF_CLOSING_TAGS and not selfClosing: + self._smartPop(name) + tag = Tag(name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or name in self.SELF_CLOSING_TAGS: + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + + def unknown_endtag(self, name): + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print "</%s> is not real!" % name + self.handle_data('</%s>' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + self.currentData.append(data) + + def handle_pi(self, text): + "Propagate processing instructions right through." + self.handle_data("<?%s>" % text) + + def handle_comment(self, text): + "Propagate comments right through." + self.handle_data("<!--%s-->" % text) + + def handle_charref(self, ref): + "Propagate char refs right through." 
+ self.handle_data('&#%s;' % ref) + + def handle_entityref(self, ref): + "Propagate entity refs right through." + self.handle_data('&%s;' % ref) + + def handle_decl(self, data): + "Propagate DOCTYPEs and the like right through." + self.handle_data('<!%s>' % data) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as regular data.""" + j = None + if self.rawdata[i:i+9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + self.handle_data(self.rawdata[i+9:k]) + j = k+3 + else: + try: + j = SGMLParser.parse_declaration(self, i) + except SGMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a <p> tag should implicitly close the previous <p> tag. + + <p>Para1<p>Para2 + should be transformed into: + <p>Para1</p><p>Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a <blockquote> tag should _not_ implicitly close the previous + <blockquote> tag. + + Alice said: <blockquote>Bob said: <blockquote>Blah + should NOT be transformed into: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a <tr> tag should + implicitly close the previous <tr> tag within the same <table>, + but not close a <tr> tag in another table. 
+ + <table><tr>Blah<tr>Blah + should be transformed into: + <table><tr>Blah</tr><tr>Blah + but, + <tr>Blah<table><tr>Blah + should NOT be transformed into + <tr>Blah<table></tr><tr>Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup before writing your own + subclass.""" + + SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + QUOTE_TAGS = {'script': None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. 
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + <b>Foo<b>Bar</b></b> + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "<b>Foo<b>Bar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '</b></b>' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close (eg.) a 'b' + tag than to actually use nested 'b' tags, and the BeautifulSoup + class handles the common case. This class handles the + not-co-common case: where you can't believe someone wrote what + they did, but it's valid HTML and BeautifulSoup screwed up by + assuming it wouldn't be. 
+ + If this doesn't do what you need, try subclassing this class or + BeautifulSoup, and providing your own list of NESTABLE_TAGS.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class BeautifulSOAP(BeautifulStoneSoup): + """This class will push a tag with only a single string child into + the tag's parent as an attribute. The attribute's name is the tag + name, and the value is the string child. An example should give + the flavor of the change: + + <foo><bar>baz</bar></foo> + => + <foo bar="baz"><bar>baz</bar></foo> + + You can then access fooTag['bar'] instead of fooTag.barTag.string. + + This is, of course, useful for scraping structures that tend to + use subelements instead of attributes, such as SOAP messages. Note + that it modifies its input, so don't print the modified version + out. + + I'm not sure how many people really want to use this class; let me + know if you do. Mainly I like the name.""" + + def popTag(self): + if len(self.tagStack) > 1: + tag = self.tagStack[-1] + parent = self.tagStack[-2] + parent._getAttrMap() + if (isinstance(tag, Tag) and len(tag.contents) == 1 and + isinstance(tag.contents[0], NavigableText) and + not parent.attrMap.has_key(tag.name)): + parent[tag.name] = tag.contents[0] + BeautifulStoneSoup.popTag(self) + +#Enterprise class names! It has come to our attention that some people +#think the names of the Beautiful Soup parser classes are too silly +#and "unprofessional" for use in enterprise screen-scraping. We feel +#your pain! 
For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisitude, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): + pass +class RobustHTMLParser(BeautifulSoup): + pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): + pass +class SimplifyingSOAPParser(BeautifulSOAP): + pass + +### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulStoneSoup(sys.stdin.read()) + print soup.prettify() diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_clientcookie.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_clientcookie.py new file mode 100644 index 0000000..caeb82b --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_clientcookie.py @@ -0,0 +1,1707 @@ +"""HTTP cookie handling for web clients. + +This module originally developed from my port of Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + + CookieJar____ + / \ \ + FileCookieJar \ \ + / | \ \ \ + MozillaCookieJar | LWPCookieJar \ \ + | | \ + | ---MSIEBase | \ + | / | | \ + | / MSIEDBCookieJar BSDDBCookieJar + |/ + MSIECookieJar + +Comments to John J Lee <jjl@pobox.com>. + + +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) +Copyright 2002-2003 Johnny Lee (original MSIE Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import sys, re, copy, time, urllib, types, logging +try: + import threading + _threading = threading; del threading +except ImportError: + import dummy_threading + _threading = dummy_threading; del dummy_threading + +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") +DEFAULT_HTTP_PORT = "80" + +from _headersutil import split_header_words, parse_ns_headers +from _util import isstringlike +import _rfc3986 + +debug = logging.getLogger("mechanize.cookies").debug + + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. + import mechanize, warnings + if not mechanize.USE_BARE_EXCEPT: + raise + unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + # swallowed an exception + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + msg = f.getvalue() + warnings.warn("mechanize bug!\n%s" % msg, stacklevel=2) + + +IPV4_RE = re.compile(r"\.\d+$") +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. Which RFC is HDN defined in, if any (for + # the purposes of RFC 2965)? + # For the current implementation, what about IPv6? Remember to look + # at other uses of IPV4_RE also, if change this. + return not (IPV4_RE.search(text) or + text == "" or + text[0] == "." or text[-1] == ".") + +def domain_match(A, B): + """Return True if domain A domain-matches domain B, according to RFC 2965. + + A and B may be host domain names or IP addresses. + + RFC 2965, section 1: + + Host names can be specified either as an IP address or a HDN string. + Sometimes we compare one host name with another. (Such comparisons SHALL + be case-insensitive.) 
Host A's name domain-matches host B's if + + * their host name strings string-compare equal; or + + * A is a HDN string and has the form NB, where N is a non-empty + name string, B has the form .B', and B' is a HDN string. (So, + x.y.com domain-matches .Y.com but not Y.com.) + + Note that domain-match is not a commutative operation: a.b.c.com + domain-matches .c.com, but not the reverse. + + """ + # Note that, if A or B are IP addresses, the only relevant part of the + # definition of the domain-match algorithm is the direct string-compare. + A = A.lower() + B = B.lower() + if A == B: + return True + if not is_HDN(A): + return False + i = A.rfind(B) + has_form_nb = not (i == -1 or i == 0) + return ( + has_form_nb and + B.startswith(".") and + is_HDN(B[1:]) + ) + +def liberal_is_HDN(text): + """Return True if text is a sort-of-like a host domain name. + + For accepting/blocking domains. + + """ + return not IPV4_RE.search(text) + +def user_domain_match(A, B): + """For blocking/accepting domains. + + A and B may be host domain names or IP addresses. + + """ + A = A.lower() + B = B.lower() + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = B.startswith(".") + if initial_dot and A.endswith(B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. 
+ + """ + url = request.get_full_url() + host = _rfc3986.urlsplit(url)[1] + if host is None: + host = request.get_header("Host", "") + # remove port, if present + return cut_port_re.sub("", host, 1) + +def request_host_lc(request): + return request_host(request).lower() + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name).""" + erhn = req_host = request_host(request) + if req_host.find(".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def eff_request_host_lc(request): + req_host, erhn = eff_request_host(request) + return req_host.lower(), erhn.lower() + +def effective_request_host(request): + """Return the effective request-host, as defined by RFC 2965.""" + return eff_request_host(request)[1] + +def request_path(request): + """request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + path, query, frag = _rfc3986.urlsplit(url)[2:] + path = escape_path(path) + req_path = _rfc3986.urlunsplit((None, None, path, query, frag)) + if not req_path.startswith("/"): + req_path = "/"+req_path + return req_path + +def request_port(request): + host = request.get_host() + i = host.find(':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + debug("nonnumeric port: '%s'", port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +def request_is_unverifiable(request): + try: + return request.is_unverifiable() + except AttributeError: + if hasattr(request, "unverifiable"): + return request.unverifiable + else: + raise + +# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't +# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 
+HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" +ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +def uppercase_escaped_char(match): + return "%%%s" % match.group(1).upper() +def escape_path(path): + """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" + # There's no knowing what character encoding was used to create URLs + # containing %-escapes, but since we have to pick one to escape invalid + # path characters, we pick UTF-8, as recommended in the HTML 4.0 + # specification: + # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 + # And here, kind of: draft-fielding-uri-rfc2396bis-03 + # (And in draft IRI specification: draft-duerst-iri-05) + # (And here, for new URI schemes: RFC 2718) + if isinstance(path, types.UnicodeType): + path = path.encode("utf-8") + path = urllib.quote(path, HTTP_PATH_SAFE) + path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. + + >>> reach("www.acme.com") + '.acme.com' + >>> reach("acme.com") + 'acme.com' + >>> reach("acme.local") + '.local' + + """ + i = h.find(".") + if i >= 0: + #a = h[:i] # this line is only here to show what a is + b = h[i+1:] + i = b.find(".") + if is_HDN(h) and (i >= 0 or b == "local"): + return "."+b + return h + +def is_third_party(request): + """ + + RFC 2965, section 3.3.6: + + An unverifiable transaction is to a third-party host if its request- + host U does not domain-match the reach R of the request-host O in the + origin transaction. 
+ + """ + req_host = request_host_lc(request) + # the origin request's request-host was stuffed into request by + # _urllib2_support.AbstractHTTPHandler + return not domain_match(req_host, reach(request.origin_req_host)) + + +class Cookie: + """HTTP Cookie. + + This class represents both Netscape and RFC 2965 cookies. + + This is deliberately a very simple class. It just holds attributes. It's + possible to construct Cookie instances that don't comply with the cookie + standards. CookieJar.make_cookies is the factory function for Cookie + objects -- it deals with cookie parsing, supplying defaults, and + normalising to the representation used in this class. CookiePolicy is + responsible for checking them to see whether they should be accepted from + and returned to the server. + + version: integer; + name: string; + value: string (may be None); + port: string; None indicates no attribute was supplied (eg. "Port", rather + than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list + string (eg. "80,8080") + port_specified: boolean; true if a value was supplied with the Port + cookie-attribute + domain: string; + domain_specified: boolean; true if Domain was explicitly set + domain_initial_dot: boolean; true if Domain as set in HTTP header by server + started with a dot (yes, this really is necessary!) 
+ path: string; + path_specified: boolean; true if Path was explicitly set + secure: boolean; true if should only be returned over secure connection + expires: integer; seconds since epoch (RFC 2965 cookies should calculate + this value from the Max-Age attribute) + discard: boolean, true if this is a session cookie; (if no expires value, + this should be true) + comment: string; + comment_url: string; + rfc2109: boolean; true if cookie arrived in a Set-Cookie: (not + Set-Cookie2:) header, but had a version cookie-attribute of 1 + rest: mapping of other cookie-attributes + + Note that the port may be present in the headers, but unspecified ("Port" + rather than"Port=80", for example); if this is the case, port is None. + + """ + + def __init__(self, version, name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest, + rfc2109=False, + ): + + if version is not None: version = int(version) + if expires is not None: expires = int(expires) + if port is None and port_specified is True: + raise ValueError("if port is None, port_specified must be false") + + self.version = version + self.name = name + self.value = value + self.port = port + self.port_specified = port_specified + # normalise case, as per RFC 2965 section 3.3.3 + self.domain = domain.lower() + self.domain_specified = domain_specified + # Sigh. We need to know whether the domain given in the + # cookie-attribute had an initial dot, in order to follow RFC 2965 + # (as clarified in draft errata). Needed for the returned $Domain + # value. 
+ self.domain_initial_dot = domain_initial_dot + self.path = path + self.path_specified = path_specified + self.secure = secure + self.expires = expires + self.discard = discard + self.comment = comment + self.comment_url = comment_url + self.rfc2109 = rfc2109 + + self._rest = copy.copy(rest) + + def has_nonstandard_attr(self, name): + return self._rest.has_key(name) + def get_nonstandard_attr(self, name, default=None): + return self._rest.get(name, default) + def set_nonstandard_attr(self, name, value): + self._rest[name] = value + def nonstandard_attr_keys(self): + return self._rest.keys() + + def is_expired(self, now=None): + if now is None: now = time.time() + return (self.expires is not None) and (self.expires <= now) + + def __str__(self): + if self.port is None: p = "" + else: p = ":"+self.port + limit = self.domain + p + self.path + if self.value is not None: + namevalue = "%s=%s" % (self.name, self.value) + else: + namevalue = self.name + return "<Cookie %s for %s>" % (namevalue, limit) + + def __repr__(self): + args = [] + for name in ["version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url", + ]: + attr = getattr(self, name) + args.append("%s=%s" % (name, repr(attr))) + args.append("rest=%s" % repr(self._rest)) + args.append("rfc2109=%s" % repr(self.rfc2109)) + return "Cookie(%s)" % ", ".join(args) + + +class CookiePolicy: + """Defines which cookies get accepted from and returned to server. + + May also modify cookies. + + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customised policy. + + As well as implementing set_ok and return_ok, implementations of this + interface must also supply the following attributes, indicating which + protocols should be used, and how. 
These can be read and set at any time, + though whether that makes complete sense from the protocol point of view is + doubtful. + + Public attributes: + + netscape: implement netscape protocol + rfc2965: implement RFC 2965 protocol + rfc2109_as_netscape: + WARNING: This argument will change or go away if is not accepted into + the Python standard library in this form! + If true, treat RFC 2109 cookies as though they were Netscape cookies. The + default is for this attribute to be None, which means treat 2109 cookies + as RFC 2965 cookies unless RFC 2965 handling is switched off (which it is, + by default), and as Netscape cookies otherwise. + hide_cookie2: don't add Cookie2 header to requests (the presence of + this header indicates to the server that we understand RFC 2965 + cookies) + + """ + def set_ok(self, cookie, request): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.extract_cookies.__doc__ + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server. + + cookie: mechanize.Cookie object + request: object implementing the interface defined by + CookieJar.add_cookie_header.__doc__ + + """ + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. + + This is here as an optimization, to remove the need for checking every + cookie with a particular domain (which may involve reading many files). + The default implementations of domain_return_ok and path_return_ok + (return True) leave all the work to return_ok. + + If domain_return_ok returns true for the cookie domain, path_return_ok + is called for the cookie path. 
Otherwise, path_return_ok and return_ok + are never called for that cookie domain. If path_return_ok returns + true, return_ok is called with the Cookie object itself for a full + check. Otherwise, return_ok is never called for that cookie path. + + Note that domain_return_ok is called for every *cookie* domain, not + just for the *request* domain. For example, the function might be + called with both ".acme.com" and "www.acme.com" if the request domain + is "www.acme.com". The same goes for path_return_ok. + + For argument documentation, see the docstring for return_ok. + + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + + See the docstring for domain_return_ok. + + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies. + + Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is + switched off by default. + + The easiest way to provide your own policy is to override this class and + call its methods in your overriden implementations before adding your own + additional checks. + + import mechanize + class MyCookiePolicy(mechanize.DefaultCookiePolicy): + def set_ok(self, cookie, request): + if not mechanize.DefaultCookiePolicy.set_ok( + self, cookie, request): + return False + if i_dont_want_to_store_this_cookie(): + return False + return True + + In addition to the features required to implement the CookiePolicy + interface, this class allows you to block and allow domains from setting + and receiving cookies. There are also some strictness switches that allow + you to tighten up the rather loose Netscape protocol rules a little bit (at + the cost of blocking some benign cookies). + + A domain blacklist and whitelist is provided (both off by default). 
Only + domains not in the blacklist and present in the whitelist (if the whitelist + is active) participate in cookie setting and returning. Use the + blocked_domains constructor argument, and blocked_domains and + set_blocked_domains methods (and the corresponding argument and methods for + allowed_domains). If you set a whitelist, you can turn it off again by + setting it to None. + + Domains in block or allow lists that do not start with a dot must + string-compare equal. For example, "acme.com" matches a blacklist entry of + "acme.com", but "www.acme.com" does not. Domains that do start with a dot + are matched by more specific domains too. For example, both "www.acme.com" + and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does + not). IP addresses are an exception, and must match exactly. For example, + if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is + blocked, but 193.168.1.2 is not. + + Additional Public Attributes: + + General strictness switches + + strict_domain: don't allow sites to set two-component domains with + country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc. + This is far from perfect and isn't guaranteed to work! + + RFC 2965 protocol strictness switches + + strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable + transactions (usually, an unverifiable transaction is one resulting from + a redirect or an image hosted on another site); if this is false, cookies + are NEVER blocked on the basis of verifiability + + Netscape protocol strictness switches + + strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions + even to Netscape cookies + strict_ns_domain: flags indicating how strict to be with domain-matching + rules for Netscape cookies: + DomainStrictNoDots: when setting cookies, host prefix must not contain a + dot (eg. 
www.foo.bar.com can't set a cookie for .bar.com, because + www.foo contains a dot) + DomainStrictNonDomain: cookies that did not explicitly specify a Domain + cookie-attribute can only be returned to a domain that string-compares + equal to the domain that set the cookie (eg. rockets.acme.com won't + be returned cookies from acme.com that had no Domain cookie-attribute) + DomainRFC2965Match: when setting cookies, require a full RFC 2965 + domain-match + DomainLiberal and DomainStrict are the most useful combinations of the + above flags, for convenience + strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that + have names starting with '$' + strict_ns_set_path: don't allow setting cookies whose path doesn't + path-match request URI + + """ + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=False, + # WARNING: this argument will change or go away if is not + # accepted into the Python standard library in this form! + # default, ie. treat 2109 as netscape iff not rfc2965 + rfc2109_as_netscape=None, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False, + ): + """ + Constructor arguments should be used as keyword arguments only. + + blocked_domains: sequence of domain names that we never accept cookies + from, nor return cookies to + allowed_domains: if not None, this is a sequence of the only domains + for which we accept and return cookies + + For other arguments, see CookiePolicy.__doc__ and + DefaultCookiePolicy.__doc__.. 
+ + """ + self.netscape = netscape + self.rfc2965 = rfc2965 + self.rfc2109_as_netscape = rfc2109_as_netscape + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request): + """ + If you override set_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). 
+ + """ + debug(" - checking cookie %s", cookie) + + assert cookie.name is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + + return True + + def set_ok_version(self, cookie, request): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + debug(" Set-Cookie2 without version attribute (%s)", cookie) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. 
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_countrycode_domain(self, cookie, request): + """Return False if explicit cookie domain is not acceptable. + + Called by set_ok_domain, for convenience of overriding by + subclasses. + + """ + if cookie.domain_specified and self.strict_domain: + domain = cookie.domain + # since domain was specified, we know that: + assert domain.startswith(".") + if domain.count(".") == 2: + # domain like .foo.bar + i = domain.rfind(".") + tld = domain[i+1:] + sld = domain[1:i] + if (sld.lower() in [ + "co", "ac", + "com", "edu", "org", "net", "gov", "mil", "int", + "aero", "biz", "cat", "coop", "info", "jobs", "mobi", + "museum", "name", "pro", "travel", + ] and + len(tld) == 2): + # domain like .co.uk + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list", cookie.domain) + return False + if not self.set_ok_countrycode_domain(cookie, request): + debug(" country-code second level domain %s", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + if domain.startswith("."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (undotted_domain.find(".") >= 0) + if not embedded_dots and domain != ".local": + 
debug(" non-local domain %s contains no embedded dot", + domain) + return False + if cookie.version == 0: + if (not erhn.endswith(domain) and + (not erhn.startswith(".") and + not ("."+erhn).endswith(domain))): + debug(" effective request-host %s (even with added " + "initial dot) does not end end with %s", + erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + debug(" effective request-host %s does not domain-match " + "%s", erhn, domain) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + host_prefix = req_host[:-len(domain)] + if (host_prefix.find(".") >= 0 and + not IPV4_RE.search(req_host)): + debug(" host prefix %s for domain %s contains a dot", + host_prefix, domain) + return False + return True + + def set_ok_port(self, cookie, request): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in cookie.port.split(","): + try: + int(p) + except ValueError: + debug(" bad port %s (not numeric)", p) + return False + if p == req_port: + break + else: + debug(" request port (%s) not found in %s", + req_port, cookie.port) + return False + return True + + def return_ok(self, cookie, request): + """ + If you override return_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to return). + + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. 
+ debug(" - checking cookie %s", cookie) + + for n in ("version", "verifiability", "secure", "expires", "port", + "domain"): + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request): + return False + return True + + def return_ok_version(self, cookie, request): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request): + if request_is_unverifiable(request) and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in cookie.port.split(","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s", + req_port, cookie.port) + return False + return True + + def return_ok_domain(self, cookie, request): + req_host, erhn = eff_request_host_lc(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not 
string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s", erhn, domain) + return False + if cookie.version == 0 and not ("."+erhn).endswith(domain): + debug(" request-host %s does not match Netscape cookie domain " + "%s", req_host, domain) + return False + return True + + def domain_return_ok(self, domain, request): + # Liberal check of domain. This is here as an optimization to avoid + # having to load lots of MSIE cookie files unless necessary. + + # Munge req_host and erhn to always start with a dot, so as to err on + # the side of letting cookies through. + dotted_req_host, dotted_erhn = eff_request_host_lc(request) + if not dotted_req_host.startswith("."): + dotted_req_host = "."+dotted_req_host + if not dotted_erhn.startswith("."): + dotted_erhn = "."+dotted_erhn + if not (dotted_req_host.endswith(domain) or + dotted_erhn.endswith(domain)): + #debug(" request domain %s does not match cookie domain %s", + # req_host, domain) + return False + + if self.is_blocked(domain): + debug(" domain %s is in user block-list", domain) + return False + if self.is_not_allowed(domain): + debug(" domain %s is not in user allow-list", domain) + return False + + return True + + def path_return_ok(self, path, request): + debug("- checking cookie path=%s", path) + req_path = request_path(request) + if not req_path.startswith(path): + debug(" %s does not path-match %s", req_path, path) + return False + return True + + +def vals_sorted_by_key(adict): + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +class MappingIterator: + """Iterates over nested mapping, depth-first, in sorted order by key.""" + def __init__(self, mapping): + self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack + + def __iter__(self): return self + + def next(self): + # this is hairy because of lack of generators + while 1: + 
try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try mechanize.urlopen(). + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. + + CookieJar supports the iterator protocol: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + add_cookie_header(request) + extract_cookies(response, request) + get_policy() + set_policy(policy) + cookies_for_request(request) + make_cookies(response, request) + set_cookie_if_ok(cookie, request) + set_cookie(cookie) + clear_session_cookies() + clear_expired_cookies() + clear(domain=None, path=None, name=None) + + Public attributes + + policy: CookiePolicy object + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + def __init__(self, policy=None): + """ + See CookieJar.__doc__ for argument documentation. 
+ + """ + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def get_policy(self): + return self._policy + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def cookies_for_request(self, request): + """Return a list of cookies to be returned to server. + + The returned list of cookie instances is sorted in the order they + should appear in the Cookie: header for return to the server. + + See add_cookie_header.__doc__ for the interface required of the + request argument. + + New in version 0.1.10 + + """ + self._policy._now = self._now = int(time.time()) + cookies = self._cookies_for_request(request) + # add cookies in order of most specific (i.e. 
longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + # this method still exists (alongside cookies_for_request) because it + # is part of an implied protected interface for subclasses of cookiejar + # XXX document that implied interface, or provide another way of + # implementing cookiejars than subclassing + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + The $Version attribute is also added when appropriate (currently only + once per request). + + >>> jar = CookieJar() + >>> ns_cookie = Cookie(0, "foo", '"bar"', None, False, + ... "example.com", False, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([ns_cookie]) + ['foo="bar"'] + >>> rfc2965_cookie = Cookie(1, "foo", "bar", None, False, + ... ".example.com", True, False, + ... "/", False, False, None, True, + ... None, None, {}) + >>> jar._cookie_attrs([rfc2965_cookie]) + ['$Version=1', 'foo=bar', '$Domain="example.com"'] + + """ + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... 
+ version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if ((cookie.value is not None) and + self.non_word_re.search(cookie.value) and version > 0): + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.value is None: + attrs.append(cookie.name) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if cookie.domain.startswith("."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + domain.startswith(".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request): + """Add correct Cookie: header to request (urllib2.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a urllib2.Request instance) must support + the methods get_full_url, get_host, is_unverifiable, get_type, + has_header, get_header, header_items and add_unredirected_header, as + documented by urllib2, and the port attribute (the port number). + Actually, RequestUpgradeProcessor will automatically upgrade your + Request object to one with has_header, get_header, header_items and + add_unredirected_header, if it lacks those methods, for compatibility + with pre-2.4 versions of urllib2. 
+ + """ + debug("add_cookie_header") + cookies = self.cookies_for_request(request) + + attrs = self._cookie_attrs(cookies) + if attrs: + if not request.has_header("Cookie"): + request.add_unredirected_header("Cookie", "; ".join(attrs)) + + # if necessary, advertise that we know RFC 2965 + if self._policy.rfc2965 and not self._policy.hide_cookie2: + for cookie in cookies: + if cookie.version != 1 and not request.has_header("Cookie2"): + request.add_unredirected_header("Cookie2", '$Version="1"') + break + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). 
+ max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if standard.has_key(k): + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + if v is None: + debug(" missing value for max-age attribute") + bad_cookie = True + break + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. 
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: + try: + version = int(version) + except ValueError: + return None # invalid version, ignore cookie + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host_lc(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but 
has no value: default to request port. + # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def _process_rfc2109_cookies(self, cookies): + if self._policy.rfc2109_as_netscape is None: + rfc2109_as_netscape = not self._policy.rfc2965 + else: + rfc2109_as_netscape = self._policy.rfc2109_as_netscape + for cookie in cookies: + if cookie.version == 1: + cookie.rfc2109 = True + if rfc2109_as_netscape: + # treat 2109 cookies as Netscape cookies rather than + # as RFC2965 cookies + cookie.version = 0 + + def _make_cookies(self, response, request): + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + # RFC 2109 and Netscape cookies + 
ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + self._process_rfc2109_cookies(ns_cookies) + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interface required of the + response and request arguments. + + """ + self._policy._now = self._now = int(time.time()) + return [cookie for cookie in self._make_cookies(response, request) + if cookie.expires is None or not cookie.expires <= self._now] + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so. + + cookie: mechanize.Cookie instance + request: see extract_cookies.__doc__ for the required interface + + """ + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. 
+ + cookie: mechanize.Cookie instance + """ + c = self._cookies + if not c.has_key(cookie.domain): c[cookie.domain] = {} + c2 = c[cookie.domain] + if not c2.has_key(cookie.path): c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + + def extract_cookies(self, response, request): + """Extract cookies from response, where allowable given the request. + + Look for allowable Set-Cookie: and Set-Cookie2: headers in the response + object passed as argument. Any of these headers that are found are + used to update the state of the object (subject to the policy.set_ok + method's approval). + + The response object (usually be the result of a call to + mechanize.urlopen, or similar) should support an info method, which + returns a mimetools.Message object (in fact, the 'mimetools.Message + object' may be any object that provides a getheaders method). + + The request object (usually a urllib2.Request instance) must support + the methods get_full_url, get_type, get_host, and is_unverifiable, as + documented by urllib2, and the port attribute (the port number). The + request is used to set default values for cookie-attributes as well as + for checking that the cookie is OK to be set. + + """ + debug("extract_cookies: %s", response.info()) + self._policy._now = self._now = int(time.time()) + + for cookie in self._make_cookies(response, request): + if cookie.expires is not None and cookie.expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + self.clear(cookie.domain, cookie.path, cookie.name) + except KeyError: + pass + debug("Expiring cookie, domain='%s', path='%s', name='%s'", + cookie.domain, cookie.path, cookie.name) + elif self._policy.set_ok(cookie, request): + debug(" setting cookie: %s", cookie) + self.set_cookie(cookie) + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. 
+ + Invoking this method without arguments will clear all cookies. If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self._cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self._cookies[domain][path] + elif domain is not None: + del self._cookies[domain] + else: + self._cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Discards all cookies held by object which had either no Max-Age or + Expires cookie-attribute or an explicit Discard cookie-attribute, or + which otherwise have ended up with a true discard attribute. For + interactive browsers, the end of a session usually corresponds to + closing the browser window. + + Note that the save method won't save session cookies anyway, unless you + ask otherwise by passing a true ignore_discard argument. + + """ + for cookie in self: + if cookie.discard: + self.clear(cookie.domain, cookie.path, cookie.name) + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the save + method won't save expired cookies anyway (unless you ask otherwise by + passing a true ignore_expires argument). 
+ + """ + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file. + + Additional methods + + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + + Additional public attributes + + filename: filename for loading and saving cookies + + Additional public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + + """ + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See FileCookieJar.__doc__ for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. 
+ + """ + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + """ + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Arguments are as for .save(). + + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or LoadError will be raised. This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. 
+ + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_debug.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_debug.py new file mode 100644 index 0000000..596b114 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_debug.py @@ -0,0 +1,28 @@ +import logging + +from urllib2 import BaseHandler +from _response import response_seek_wrapper + + +class HTTPResponseDebugProcessor(BaseHandler): + handler_order = 900 # before redirections, after everything else + + def http_response(self, request, response): + if not hasattr(response, "seek"): + response = response_seek_wrapper(response) + info = logging.getLogger("mechanize.http_responses").info + try: + info(response.read()) + finally: + response.seek(0) + info("*****************************************************") + return response + + https_response = http_response + +class HTTPRedirectDebugProcessor(BaseHandler): + def http_request(self, request): + if hasattr(request, "redirect_dict"): + info = logging.getLogger("mechanize.http_redirects").info + info("redirecting to %s", request.get_full_url()) + return request diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_file.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_file.py new file mode 100644 index 0000000..db662a8 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_file.py @@ -0,0 +1,60 @@ +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +import mimetools +import os +import socket +import urllib +from urllib2 import BaseHandler, URLError + + 
+class FileHandler(BaseHandler): + # Use local file or FTP depending on form of URL + def file_open(self, req): + url = req.get_selector() + if url[:2] == '//' and url[2:3] != '/': + req.type = 'ftp' + return self.parent.open(req) + else: + return self.open_local_file(req) + + # names for the localhost + names = None + def get_names(self): + if FileHandler.names is None: + try: + FileHandler.names = (socket.gethostbyname('localhost'), + socket.gethostbyname(socket.gethostname())) + except socket.gaierror: + FileHandler.names = (socket.gethostbyname('localhost'),) + return FileHandler.names + + # not entirely sure what the rules are here + def open_local_file(self, req): + try: + import email.utils as emailutils + except ImportError: + import email.Utils as emailutils + import mimetypes + host = req.get_host() + file = req.get_selector() + localfile = urllib.url2pathname(file) + try: + stats = os.stat(localfile) + size = stats.st_size + modified = emailutils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(file)[0] + headers = mimetools.Message(StringIO( + 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified))) + if host: + host, port = urllib.splitport(host) + if not host or \ + (not port and socket.gethostbyname(host) in self.get_names()): + return urllib.addinfourl(open(localfile, 'rb'), + headers, 'file:'+file) + except OSError, msg: + # urllib2 users shouldn't expect OSErrors coming from urlopen() + raise URLError(msg) + raise URLError('file not on local host') diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_firefox3cookiejar.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_firefox3cookiejar.py new file mode 100644 index 0000000..34fe979 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_firefox3cookiejar.py @@ -0,0 +1,249 @@ +"""Firefox 3 "cookies.sqlite" cookie persistence. 
+ +Copyright 2008 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import logging +import time +import sqlite3 + +from _clientcookie import CookieJar, Cookie, MappingIterator +from _util import isstringlike, experimental +debug = logging.getLogger("mechanize.cookies").debug + + +class Firefox3CookieJar(CookieJar): + + """Firefox 3 cookie jar. + + The cookies are stored in Firefox 3's "cookies.sqlite" format. + + Constructor arguments: + + filename: filename of cookies.sqlite (typically found at the top level + of a firefox profile directory) + autoconnect: as a convenience, connect to the SQLite cookies database at + Firefox3CookieJar construction time (default True) + policy: an object satisfying the mechanize.CookiePolicy interface + + Note that this is NOT a FileCookieJar, and there are no .load(), + .save() or .restore() methods. The database is in sync with the + cookiejar object's state after each public method call. + + Following Firefox's own behaviour, session cookies are never saved to + the database. + + The file is created, and an sqlite database written to it, if it does + not already exist. The moz_cookies database table is created if it does + not already exist. 
+ """ + + # XXX + # handle DatabaseError exceptions + # add a FileCookieJar (explicit .save() / .revert() / .load() methods) + + def __init__(self, filename, autoconnect=True, policy=None): + experimental("Firefox3CookieJar is experimental code") + CookieJar.__init__(self, policy) + if filename is not None and not isstringlike(filename): + raise ValueError("filename must be string-like") + self.filename = filename + self._conn = None + if autoconnect: + self.connect() + + def connect(self): + self._conn = sqlite3.connect(self.filename) + self._conn.isolation_level = "DEFERRED" + self._create_table_if_necessary() + + def close(self): + self._conn.close() + + def _transaction(self, func): + try: + cur = self._conn.cursor() + try: + result = func(cur) + finally: + cur.close() + except: + self._conn.rollback() + raise + else: + self._conn.commit() + return result + + def _execute(self, query, params=()): + return self._transaction(lambda cur: cur.execute(query, params)) + + def _query(self, query, params=()): + # XXX should we bother with a transaction? 
+ cur = self._conn.cursor() + try: + cur.execute(query, params) + for row in cur.fetchall(): + yield row + finally: + cur.close() + + def _create_table_if_necessary(self): + self._execute("""\ +CREATE TABLE IF NOT EXISTS moz_cookies (id INTEGER PRIMARY KEY, name TEXT, + value TEXT, host TEXT, path TEXT,expiry INTEGER, + lastAccessed INTEGER, isSecure INTEGER, isHttpOnly INTEGER)""") + + def _cookie_from_row(self, row): + (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) = row + + version = 0 + domain = domain.encode("ascii", "ignore") + path = path.encode("ascii", "ignore") + name = name.encode("ascii", "ignore") + value = value.encode("ascii", "ignore") + secure = bool(secure) + + # last_accessed isn't a cookie attribute, so isn't added to rest + rest = {} + if http_only: + rest["HttpOnly"] = None + + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + domain_specified = initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + return Cookie(version, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + rest) + + def clear(self, domain=None, path=None, name=None): + CookieJar.clear(self, domain, path, name) + where_parts = [] + sql_params = [] + if domain is not None: + where_parts.append("host = ?") + sql_params.append(domain) + if path is not None: + where_parts.append("path = ?") + sql_params.append(path) + if name is not None: + where_parts.append("name = ?") + sql_params.append(name) + where = " AND ".join(where_parts) + if where: + where = " WHERE " + where + def clear(cur): + cur.execute("DELETE FROM moz_cookies%s" % where, + tuple(sql_params)) + self._transaction(clear) + + def _row_from_cookie(self, cookie, cur): + expires = cookie.expires + if cookie.discard: + expires = "" + + domain = unicode(cookie.domain) + path = unicode(cookie.path) + name = unicode(cookie.name) + value = 
unicode(cookie.value) + secure = bool(int(cookie.secure)) + + if value is None: + value = name + name = "" + + last_accessed = int(time.time()) + http_only = cookie.has_nonstandard_attr("HttpOnly") + + query = cur.execute("""SELECT MAX(id) + 1 from moz_cookies""") + pk = query.fetchone()[0] + if pk is None: + pk = 1 + + return (pk, name, value, domain, path, expires, + last_accessed, secure, http_only) + + def set_cookie(self, cookie): + if cookie.discard: + CookieJar.set_cookie(self, cookie) + return + + def set_cookie(cur): + # XXX + # is this RFC 2965-correct? + # could this do an UPDATE instead? + row = self._row_from_cookie(cookie, cur) + name, unused, domain, path = row[1:5] + cur.execute("""\ +DELETE FROM moz_cookies WHERE host = ? AND path = ? AND name = ?""", + (domain, path, name)) + cur.execute("""\ +INSERT INTO moz_cookies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) +""", row) + self._transaction(set_cookie) + + def __iter__(self): + # session (non-persistent) cookies + for cookie in MappingIterator(self._cookies): + yield cookie + # persistent cookies + for row in self._query("""\ +SELECT * FROM moz_cookies ORDER BY name, path, host"""): + yield self._cookie_from_row(row) + + def _cookies_for_request(self, request): + session_cookies = CookieJar._cookies_for_request(self, request) + def get_cookies(cur): + query = cur.execute("SELECT host from moz_cookies") + domains = [row[0] for row in query.fetchmany()] + cookies = [] + for domain in domains: + cookies += self._persistent_cookies_for_domain(domain, + request, cur) + return cookies + persistent_coookies = self._transaction(get_cookies) + return session_cookies + persistent_coookies + + def _persistent_cookies_for_domain(self, domain, request, cur): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + query = cur.execute("""\ +SELECT * from moz_cookies WHERE host = ? 
ORDER BY path""", + (domain,)) + cookies = [self._cookie_from_row(row) for row in query.fetchmany()] + last_path = None + r = [] + for cookie in cookies: + if (cookie.path != last_path and + not self._policy.path_return_ok(cookie.path, request)): + last_path = cookie.path + continue + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + r.append(cookie) + return r diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_gzip.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_gzip.py new file mode 100644 index 0000000..26c2743 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_gzip.py @@ -0,0 +1,103 @@ +import urllib2 +from cStringIO import StringIO +import _response + +# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library +class GzipConsumer: + + def __init__(self, consumer): + self.__consumer = consumer + self.__decoder = None + self.__data = "" + + def __getattr__(self, key): + return getattr(self.__consumer, key) + + def feed(self, data): + if self.__decoder is None: + # check if we have a full gzip header + data = self.__data + data + try: + i = 10 + flag = ord(data[3]) + if flag & 4: # extra + x = ord(data[i]) + 256*ord(data[i+1]) + i = i + 2 + x + if flag & 8: # filename + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 16: # comment + while ord(data[i]): + i = i + 1 + i = i + 1 + if flag & 2: # crc + i = i + 2 + if len(data) < i: + raise IndexError("not enough data") + if data[:3] != "\x1f\x8b\x08": + raise IOError("invalid gzip data") + data = data[i:] + except IndexError: + self.__data = data + return # need more data + import zlib + self.__data = "" + self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS) + data = self.__decoder.decompress(data) + if data: + self.__consumer.feed(data) + + def close(self): + if self.__decoder: + data = self.__decoder.flush() + if data: + 
self.__consumer.feed(data) + self.__consumer.close() + + +# -------------------------------------------------------------------- + +# the rest of this module is John Lee's stupid code, not +# Fredrik's nice code :-) + +class stupid_gzip_consumer: + def __init__(self): self.data = [] + def feed(self, data): self.data.append(data) + +class stupid_gzip_wrapper(_response.closeable_response): + def __init__(self, response): + self._response = response + + c = stupid_gzip_consumer() + gzc = GzipConsumer(c) + gzc.feed(response.read()) + self.__data = StringIO("".join(c.data)) + + def read(self, size=-1): + return self.__data.read(size) + def readline(self, size=-1): + return self.__data.readline(size) + def readlines(self, sizehint=-1): + return self.__data.readlines(sizehint) + + def __getattr__(self, name): + # delegate unknown methods/attributes + return getattr(self._response, name) + +class HTTPGzipProcessor(urllib2.BaseHandler): + handler_order = 200 # response processing before HTTPEquivProcessor + + def http_request(self, request): + request.add_header("Accept-Encoding", "gzip") + return request + + def http_response(self, request, response): + # post-process response + enc_hdrs = response.info().getheaders("Content-encoding") + for enc_hdr in enc_hdrs: + if ("gzip" in enc_hdr) or ("compress" in enc_hdr): + return stupid_gzip_wrapper(response) + return response + + https_response = http_response diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_headersutil.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_headersutil.py new file mode 100644 index 0000000..49ba5de --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_headersutil.py @@ -0,0 +1,232 @@ +"""Utility functions for HTTP header value parsing and construction. + +Copyright 1997-1998, Gisle Aas +Copyright 2002-2006, John J. 
Lee + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import os, re +from types import StringType +from types import UnicodeType +STRING_TYPES = StringType, UnicodeType + +from _util import http2time +import _rfc3986 + +def is_html(ct_headers, url, allow_xhtml=False): + """ + ct_headers: Sequence of Content-Type headers + url: Response URL + + """ + if not ct_headers: + # guess + ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1] + html_exts = [".htm", ".html"] + if allow_xhtml: + html_exts += [".xhtml"] + return ext in html_exts + # use first header + ct = split_header_words(ct_headers)[0][0][0] + html_types = ["text/html"] + if allow_xhtml: + html_types += [ + "text/xhtml", "text/xml", + "application/xml", "application/xhtml+xml", + ] + return ct in html_types + +def unmatched(match): + """Return unmatched part of re.Match object.""" + start, end = match.span(0) + return match.string[:start]+match.string[end:] + +token_re = re.compile(r"^\s*([^=\s;,]+)") +quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +value_re = re.compile(r"^\s*=\s*([^\s;,]*)") +escape_re = re.compile(r"\\(.)") +def split_header_words(header_values): + r"""Parse header values into a list of lists containing key,value pairs. + + The function knows how to deal with ",", ";" and "=" as well as quoted + values after "=". A list of space separated tokens are parsed as if they + were separated by ";". + + If the header_values passed as argument contains multiple values, then they + are treated as if they were a single value separated by comma ",". + + This means that this function is useful for parsing header fields that + follow this syntax (BNF as from the HTTP/1.1 specification, but we relax + the requirement for tokens). 
+ + headers = #header + header = (token | parameter) *( [";"] (token | parameter)) + + token = 1*<any CHAR except CTLs or separators> + separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + + quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) + qdtext = <any TEXT except <">> + quoted-pair = "\" CHAR + + parameter = attribute "=" value + attribute = token + value = token | quoted-string + + Each header is represented by a list of key/value pairs. The value for a + simple token (not part of a parameter) is None. Syntactically incorrect + headers will not necessarily be parsed as you would want. + + This is easier to describe with some examples: + + >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) + [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] + >>> split_header_words(['text/html; charset="iso-8859-1"']) + [[('text/html', None), ('charset', 'iso-8859-1')]] + >>> split_header_words([r'Basic realm="\"foo\bar\""']) + [[('Basic', None), ('realm', '"foobar"')]] + + """ + assert type(header_values) not in STRING_TYPES + result = [] + for text in header_values: + orig_text = text + pairs = [] + while text: + m = token_re.search(text) + if m: + text = unmatched(m) + name = m.group(1) + m = quoted_value_re.search(text) + if m: # quoted value + text = unmatched(m) + value = m.group(1) + value = escape_re.sub(r"\1", value) + else: + m = value_re.search(text) + if m: # unquoted value + text = unmatched(m) + value = m.group(1) + value = value.rstrip() + else: + # no value, a lone token + value = None + pairs.append((name, value)) + elif text.lstrip().startswith(","): + # concatenated headers, as per RFC 2616 section 4.2 + text = text.lstrip()[1:] + if pairs: result.append(pairs) + pairs = [] + else: + # skip junk + non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) + assert nr_junk_chars > 0, ( + "split_header_words bug: '%s', '%s', %s" % + 
                # (continuation of split_header_words, whose definition starts
                # before this chunk: report the junk we skipped, then retry)
                (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

# matches the two characters that must be backslash-escaped inside a
# quoted-string: double-quote and backslash
join_escape_re = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                # quote the value unless it is a plain word token
                if not re.search(r"^\w+$", v):
                    v = join_escape_re.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                if k is None:  # Netscape cookies may have no name
                    k = v
                else:
                    k = "%s=%s" % (k, v)
            attr.append(k)
        # pairs within one list are ";"-joined, lists are ","-joined
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

def strip_quotes(text):
    # Remove one leading and/or one trailing double-quote, if present.
    # Note: each end is stripped independently, so unbalanced quotes
    # ('"foo' or 'foo"') are also stripped.
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]
    return text

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        params = re.split(r";\s*", ns_header)
        for ii in range(len(params)):
            param = params[ii]
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                # boolean attribute such as "secure"
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                # ii == 0 is the cookie name=value itself; only lower-case
                # and post-process the *attribute* names that follow it
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    v = strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch
                    v = http2time(strip_quotes(v))  # None if invalid
            # appended for every parameter, including the ii == 0 name/value
            pairs.append((k, v))

        if pairs:
            if not version_set:
                # absent Version attribute means a Netscape (version 0) cookie
                pairs.append(("version", "0"))
            result.append(pairs)

    return result


def _test():
    # run this module's doctests
    import doctest, _headersutil
    return doctest.testmod(_headersutil)

if __name__ == "__main__":
    _test()
diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_html.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_html.py
new file mode 100644
index 0000000..5da0815
--- /dev/null
+++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_html.py
@@ -0,0 +1,631 @@
"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).
"""

import re, copy, htmlentitydefs
import sgmllib, ClientForm

import _request
from _headersutil import split_header_words, is_html as _is_html
import _rfc3986

# fallback character encoding when none can be determined from the response
DEFAULT_ENCODING = "latin-1"

# collapses runs of whitespace to a single space
COMPRESS_RE = re.compile(r"\s+")


# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError): pass


class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    Each __call__ yields the items cached from earlier calls first, then
    continues consuming the (single, shared) underlying iterator, caching
    new items as they are produced.
    """

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        cache = self._cache
        for item in cache:
            yield item
        for item in self._iterator:
            cache.append(item)
            yield item


class EncodingFinder:
    # Determines the character encoding of a response from its
    # Content-Type header(s), falling back to a default.
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding
    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    # first charset parameter found wins
                    return v
        return self._default_encoding

class ResponseTypeFinder:
    # Decides whether a response contains an HTML document.
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml
    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)


# idea for this argument-processing trick is from Peter Otten
class Args:
    # Attribute-style access to a dict of keyword arguments; unknown
    # keys fall back to class attributes.
    def __init__(self, args_map):
        self.dictionary = dict(args_map)
    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    # capture this function's keyword arguments as an Args bundle
    return Args(locals())


class Link:
    # A hyperlink extracted from a document: keeps the raw URL, the URL
    # resolved against base_url, the link text, tag name and attributes.
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
    def __cmp__(self, other):
        # Python 2 comparison hook: 0 when url/text/tag/attrs all match,
        # -1 otherwise (equality-only; ordering is not meaningful)
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0
    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)


class LinksFactory:
    # Extracts Link objects from a response using a pull parser
    # (sgmllib-based by default).

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            # tag name -> attribute holding the URL
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url,
                     encoding):
        # store the response, its base URL and encoding for a later .links()
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            # Py2: dict.keys() returns a list, so list concatenation works here
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    # <base href=...> changes the base URL for later links
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                name = attrs.get("name")
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for eg.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            # re-raise parser errors as this package's ParseError
            raise ParseError(exc)

class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        # a new response invalidates the previously computed global form
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        # Parse the stored response; ParseResponseEx returns the global
        # form first, followed by the document's FORM elements.
        import ClientForm
        encoding = self.encoding
        try:
            forms = ClientForm.ParseResponseEx(
                self._response,
                select_default=self.select_default,
                form_parser_class=self.form_parser_class,
                request_class=self.request_class,
                encoding=encoding,
                _urljoin=_rfc3986.urljoin,
                _urlparse=_rfc3986.urlsplit,
                _urlunparse=_rfc3986.urlunsplit,
                )
        except ClientForm.ParseError, exc:
            raise ParseError(exc)
        # stash the global form; return only the real FORM elements
        self.global_form = forms[0]
        return forms[1:]

class TitleFactory:
    # Extracts the <title> text of an HTML response.
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        # Collect tokens until </title>, unescaping entity and character
        # references, then collapse whitespace.
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                # nested markup inside <title> is kept verbatim
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def
    title(self):  # ('def' keyword falls at the end of the previous chunk)
        # Return the page title text, or None if there is no <title> tag.
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise ParseError(exc)


def unescape(data, entities, encoding):
    # Replace HTML entity and character references in data, encoding the
    # replacements with the given encoding.  Py2-only: uses unichr().
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            # numeric character reference
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    # not representable in target encoding: keep entity as-is
                    repl = ent
        else:
            # unknown entity name: leave untouched
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)

def unescape_charref(data, encoding):
    # Decode a numeric character reference body ("65" or "x41").
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            # not representable: re-emit as a character reference
            repl = "&#%s;" % data
        return repl


# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")

class MechanizeBs(_beautifulsoup.BeautifulSoup):
    # BeautifulSoup subclass that unescapes entity/char references using
    # this module's encoding-aware helpers.
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    # NOTE: lambda(x) tuple-parameter syntax below is Python 2 only
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda(x):x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda(x):'<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def handle_entityref(self, ref):
        t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def unescape_attrs(self, attrs):
        # return a new (key, unescaped-value) list; does not mutate attrs
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs

class RobustLinksFactory:
    # BeautifulSoup-based counterpart of LinksFactory, for bad HTML.

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            # tag name -> attribute holding the URL
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        # Walk the soup tree yielding Link objects for URL-bearing tags.
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # NOTE(review): 'gen' is assigned but never used — dead statement;
        # a second, independent generator is iterated below
        gen = bs.recursiveChildGenerator()
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    # <base href=...> changes the base URL for later links
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow
                    # _pullparser's weird behaviour rigidly:
                    # "" for an empty <a>, None for other empty tags
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)


class RobustFormsFactory(FormsFactory):
    # FormsFactory variant defaulting to the BeautifulSoup-backed
    # RobustFormParser.
    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    # Extracts the page title from an already-built soup tree.
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        # Return the whitespace-compressed inner HTML of the first <title>
        # tag, or None if there is none.
        import _beautifulsoup
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # attributes computed lazily by __getattr__ and cached on the instance;
    # set_response() deletes them so they are recomputed per response
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    # NOTE(review): the default finder instances below are evaluated once at
    # class-definition time and shared by all Factory instances that do not
    # pass their own — harmless only if the finders stay stateless
    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
        determined (or guessed) from the response.  You should turn on
        HTTP-EQUIV handling if you want the best chance of getting this right
        without resorting to this default.  The default value of this
        parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        # drop cached lazy attributes so they are recomputed for the new
        # response
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # lazy computation of encoding / is_html / title / global_form;
        # results are assigned to the instance, so __getattr__ is only hit
        # once per attribute per response
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # forms() sets .global_form as a side effect
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
        return self._links_genf()

class DefaultFactory(Factory):
    """Based on sgmllib."""
    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        # each sub-factory gets its own shallow copy of the response
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)

class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """
    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        # builds the soup eagerly (reads the whole body) and shares it
        # between the links and title factories
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)
diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py
new file mode 100644
index 0000000..1b80e2b
--- /dev/null
+++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_http.py
@@ -0,0 +1,758 @@
"""HTTP related handlers.
Note that some other HTTP handlers live in more specific modules: _auth.py,
_gzip.py, etc.


Copyright 2002-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import time, htmlentitydefs, logging, socket, \
     urllib2, urllib, httplib, sgmllib
from urllib2 import URLError, HTTPError, BaseHandler
from cStringIO import StringIO

from _clientcookie import CookieJar
from _headersutil import is_html
from _html import unescape, unescape_charref
from _request import Request
from _response import closeable_response, response_seek_wrapper
import _rfc3986
import _sockettimeout

debug = logging.getLogger("mechanize").debug
debug_robots = logging.getLogger("mechanize.robots").debug

# monkeypatch urllib2.HTTPError to show URL
## def urllib2_str(self):
##     return 'HTTP Error %s: %s (%s)' % (
##         self.code, self.msg, self.geturl())
## urllib2.HTTPError.__str__ = urllib2_str


CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'


# Probe whether socket._fileobject accepts the close keyword (added after
# Python 2.4); fall back to using the class directly on older versions.
try:
    socket._fileobject("fake socket", close=True)
except TypeError:
    # python <= 2.4
    create_readline_wrapper = socket._fileobject
else:
    def create_readline_wrapper(fh):
        return socket._fileobject(fh, close=True)


# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute to the Request object.  The value
    # of the dict is used to count the number of times the same URL has
    # been visited.  This is needed because visiting the same URL twice
    # does not necessarily imply a loop, thanks to state introduced by
    # cookies.

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not handle this here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in the previous versions of protocol??

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        raised.

        """
        # "refresh" is a pseudo-code used for Refresh-header redirections
        if code in (301, 302, 303, "refresh") or \
           (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.
            # XXX really refresh redirections should be visiting; tricky to
            #  fix, so this will wait until post-stable release
            new = Request(newurl,
                          headers=req.headers,
                          origin_req_host=req.get_origin_req_host(),
                          unverifiable=True,
                          visit=False,
                          )
            # remember the request that started the redirect chain
            new._origin_req = getattr(req, "_origin_req", req)
            return new
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        else:
            # no redirect target given: let the response pass through
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    # all other redirect codes (and Refresh) share the 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
    # Shared logic for the two HEAD parsers below: accumulates
    # (http-equiv, content) pairs from <meta> tags and aborts parsing
    # (via EndOfHeadError) once the document HEAD is over.
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        # record <meta http-equiv=... content=...> pairs
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        # leave unknown references in the data stream untouched
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        self.handle_data("&#%s;" % ref)


try:
    import HTMLParser
except ImportError:
    pass
else:
    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            # any non-HEAD element means the HEAD is over
            if tag not in self.head_elems:
                raise EndOfHeadError()
            # dispatch to start_<tag> or do_<tag>, if defined
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass  # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
    # sgmllib-based HEAD parser; SGMLParser dispatches tag handlers itself,
    # passing the bound method in, hence the different handler signatures.

    def _not_called(self):
        # placeholder handler for unknown start tags; must never run
        assert False

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == "meta":
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        # sgmllib does not unescape attributes itself
        return self.unescape_attr(name)

def parse_head(fileobj, parser):
    """Return a list of key, value pairs."""
    # feed CHUNK-sized pieces until the parser signals end-of-HEAD or the
    # input runs short
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # CHUNK is big
            break
    return parser.http_equiv

class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self,
                 head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 ):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        # needs a seekable response: the HEAD is read, then rewound
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            # NOTE(review): if the module-level "import HTMLParser" above
            # failed (ImportError swallowed), referencing
            # HTMLParser.HTMLParseError here raises NameError — TODO confirm
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response

    https_response = http_response

class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        # outgoing: attach Cookie headers from the jar
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # incoming: store Set-Cookie headers in the jar
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

# the robots.txt machinery is only defined when robotparser is available
try:
    import robotparser
except ImportError:
    pass
else:
    class MechanizeRobotFileParser(robotparser.RobotFileParser):
        # RobotFileParser that fetches robots.txt through a mechanize
        # opener (with timeout support) instead of plain urllib.

        def __init__(self, url='', opener=None):
            robotparser.RobotFileParser.__init__(self, url)
            self._opener = opener
            self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT

        def set_opener(self, opener=None):
            import _opener
            if opener is None:
                opener = _opener.OpenerDirector()
            self._opener = opener

        def set_timeout(self, timeout):
            self._timeout = timeout

        def read(self):
            """Reads the robots.txt URL and feeds it to the
            parser."""
            if self._opener is None:
                self.set_opener()
            req = Request(self.url, unverifiable=True, visit=False,
                          timeout=self._timeout)
            try:
                f = self._opener.open(req)
            except HTTPError, f:
                # Py2 idiom: bind the HTTPError itself as the response
                # object f (it has .readline()/.code) and continue
                pass
            except (IOError, socket.error, OSError), exc:
                debug_robots("ignoring error opening %r: %s" %
                             (self.url, exc))
                return
            lines = []
            line = f.readline()
            while line:
                lines.append(line.strip())
                line = f.readline()
            status = f.code
            if status == 401 or status == 403:
                # access to robots.txt denied: disallow everything
                self.disallow_all = True
                debug_robots("disallow all")
            elif status >= 400:
                # robots.txt unavailable: allow everything
                self.allow_all = True
                debug_robots("allow all")
            elif status == 200 and lines:
                debug_robots("parse lines")
                self.parse(lines)

    class RobotExclusionError(urllib2.HTTPError):
        # HTTPError raised when a request is disallowed by robots.txt;
        # keeps a reference to the offending request
        def __init__(self, request, *args):
            apply(urllib2.HTTPError.__init__, (self,)+args)  # Py2 apply()
            self.request = request

    class HTTPRobotRulesProcessor(BaseHandler):
        # before redirections, after everything else
        handler_order = 800

        # pick the richest available response-header class
        try:
            from httplib import HTTPMessage
        except:
            from mimetools import Message
            http_response_class = Message
        else:
            http_response_class = HTTPMessage

        def __init__(self, rfp_class=MechanizeRobotFileParser):
            self.rfp_class = rfp_class
            self.rfp = None
            self._host = None

        def http_request(self, request):
            # Veto (by raising RobotExclusionError) any request that the
            # target host's robots.txt disallows.
            scheme = request.get_type()
            if scheme not in ["http", "https"]:
                # robots exclusion only applies to HTTP
                return request

            if request.get_selector() == "/robots.txt":
                # /robots.txt is always OK to fetch
                return request

            host = request.get_host()

            # robots.txt requests don't need to be allowed by robots.txt :-)
            origin_req = getattr(request, "_origin_req", None)
            if (origin_req is not None and
                origin_req.get_selector() == "/robots.txt" and
                origin_req.get_host() == host
                ):
                return request

            if host != self._host:
                # new host: fetch and cache its robots.txt rules
                self.rfp = self.rfp_class()
                try:
                    self.rfp.set_opener(self.parent)
                except AttributeError:
                    debug("%r instance does not support set_opener" %
                          self.rfp.__class__)
                self.rfp.set_url(scheme+"://"+host+"/robots.txt")
                self.rfp.set_timeout(request.timeout)
                self.rfp.read()
                self._host = host

            ua = request.get_header("User-agent", "")
            if self.rfp.can_fetch(ua, request.get_full_url()):
                return request
            else:
                # XXX This should really have raised URLError.  Too late now...
                msg = "request disallowed by robots.txt"
                raise RobotExclusionError(
                    request,
                    request.get_full_url(),
                    403, msg,
                    self.http_response_class(StringIO()), StringIO(msg))

        https_request = http_request

class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        # only add Referer if one was recorded and none is set already
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        # the URL just fetched becomes the Referer for the next request
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response


def clean_refresh_url(url):
    # e.g.
Firefox 1.5 does (something like) this + if ((url.startswith('"') and url.endswith('"')) or + (url.startswith("'") and url.endswith("'"))): + url = url[1:-1] + return _rfc3986.clean_url(url, "latin-1") # XXX encoding + +def parse_refresh_header(refresh): + """ + >>> parse_refresh_header("1; url=http://example.com/") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1; url='http://example.com/'") + (1.0, 'http://example.com/') + >>> parse_refresh_header("1") + (1.0, None) + >>> parse_refresh_header("blah") + Traceback (most recent call last): + ValueError: invalid literal for float(): blah + + """ + + ii = refresh.find(";") + if ii != -1: + pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:] + jj = newurl_spec.find("=") + key = None + if jj != -1: + key, newurl = newurl_spec[:jj], newurl_spec[jj+1:] + newurl = clean_refresh_url(newurl) + if key is None or key.strip().lower() != "url": + raise ValueError() + else: + pause, newurl = float(refresh), None + return pause, newurl + +class HTTPRefreshProcessor(BaseHandler): + """Perform HTTP Refresh redirections. + + Note that if a non-200 HTTP code has occurred (for example, a 30x + redirect), this processor will do nothing. + + By default, only zero-time Refresh headers are redirected. Use the + max_time attribute / constructor argument to allow Refresh with longer + pauses. Use the honor_time attribute / constructor argument to control + whether the requested pause is honoured (with a time.sleep()) or + skipped in favour of immediate redirection. 
+ + Public attributes: + + max_time: see above + honor_time: see above + + """ + handler_order = 1000 + + def __init__(self, max_time=0, honor_time=True): + self.max_time = max_time + self.honor_time = honor_time + self._sleep = time.sleep + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code == 200 and hdrs.has_key("refresh"): + refresh = hdrs.getheaders("refresh")[0] + try: + pause, newurl = parse_refresh_header(refresh) + except ValueError: + debug("bad Refresh header: %r" % refresh) + return response + + if newurl is None: + newurl = response.geturl() + if (self.max_time is None) or (pause <= self.max_time): + if pause > 1E-3 and self.honor_time: + self._sleep(pause) + hdrs["location"] = newurl + # hardcoded http is NOT a bug + response = self.parent.error( + "http", request, response, + "refresh", msg, hdrs) + else: + debug("Refresh header ignored: %r" % refresh) + + return response + + https_response = http_response + +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses. + + The purpose of this handler is to to allow other response processors a + look-in by removing the call to parent.error() from + AbstractHTTPHandler. + + For non-200 error codes, this just passes the job on to the + Handler.<proto>_error_<code> methods, via the OpenerDirector.error + method. Eventually, urllib2.HTTPDefaultErrorHandler will raise an + HTTPError if no other handler handles the error. 
+ + """ + handler_order = 1000 # after all other processors + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code != 200: + # hardcoded http is NOT a bug + response = self.parent.error( + "http", request, response, code, msg, hdrs) + + return response + + https_response = http_response + + +class HTTPDefaultErrorHandler(BaseHandler): + def http_error_default(self, req, fp, code, msg, hdrs): + # why these error methods took the code, msg, headers args in the first + # place rather than a response object, I don't know, but to avoid + # multiple wrapping, we're discarding them + + if isinstance(fp, urllib2.HTTPError): + response = fp + else: + response = urllib2.HTTPError( + req.get_full_url(), code, msg, hdrs, fp) + assert code == response.code + assert msg == response.msg + assert hdrs == response.hdrs + raise response + + +class AbstractHTTPHandler(BaseHandler): + + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request_(self, request): + host = request.get_host() + if not host: + raise URLError('no host given') + + if request.has_data(): # POST + data = request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if not request.has_header('Content-length'): + request.add_unredirected_header( + 'Content-length', '%d' % len(data)) + + scheme, sel = urllib.splittype(request.get_selector()) + sel_host, sel_path = urllib.splithost(sel) + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host or host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + + def do_open(self, http_class, req): + """Return an addinfourl object for the request, using http_class. 
+ + http_class must implement the HTTPConnection API from httplib. + The addinfourl return value is a file-like object. It also + has methods and attributes including: + - info(): return a mimetools.Message object for the headers + - geturl(): return the original request URL + - code: HTTP status code + """ + host_port = req.get_host() + if not host_port: + raise URLError('no host given') + + try: + h = http_class(host_port, timeout=req.timeout) + except TypeError: + # Python < 2.6, no per-connection timeout support + h = http_class(host_port) + h.set_debuglevel(self._debuglevel) + + headers = dict(req.headers) + headers.update(req.unredirected_hdrs) + # We want to make an HTTP/1.1 request, but the addinfourl + # class isn't prepared to deal with a persistent connection. + # It will try to read all remaining data from the socket, + # which will block while the server waits for the next request. + # So make sure the connection gets closed after the (only) + # request. + headers["Connection"] = "close" + headers = dict( + [(name.title(), val) for name, val in headers.items()]) + try: + h.request(req.get_method(), req.get_selector(), req.data, headers) + r = h.getresponse() + except socket.error, err: # XXX what error? + raise URLError(err) + + # Pick apart the HTTPResponse object to get the addinfourl + # object initialized properly. + + # Wrap the HTTPResponse object in socket's file object adapter + # for Windows. That adapter calls recv(), so delegate recv() + # to read(). This weird wrapping allows the returned object to + # have readline() and readlines() methods. + + # XXX It might be better to extract the read buffering code + # out of socket._fileobject() and into a base class. 
+ + r.recv = r.read + fp = create_readline_wrapper(r) + + resp = closeable_response(fp, r.msg, req.get_full_url(), + r.status, r.reason) + return resp + + +class HTTPHandler(AbstractHTTPHandler): + def http_open(self, req): + return self.do_open(httplib.HTTPConnection, req) + + http_request = AbstractHTTPHandler.do_request_ + +if hasattr(httplib, 'HTTPS'): + + class HTTPSConnectionFactory: + def __init__(self, key_file, cert_file): + self._key_file = key_file + self._cert_file = cert_file + def __call__(self, hostport): + return httplib.HTTPSConnection( + hostport, + key_file=self._key_file, cert_file=self._cert_file) + + class HTTPSHandler(AbstractHTTPHandler): + def __init__(self, client_cert_manager=None): + AbstractHTTPHandler.__init__(self) + self.client_cert_manager = client_cert_manager + + def https_open(self, req): + if self.client_cert_manager is not None: + key_file, cert_file = self.client_cert_manager.find_key_cert( + req.get_full_url()) + conn_factory = HTTPSConnectionFactory(key_file, cert_file) + else: + conn_factory = httplib.HTTPSConnection + return self.do_open(conn_factory, req) + + https_request = AbstractHTTPHandler.do_request_ diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_lwpcookiejar.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_lwpcookiejar.py new file mode 100644 index 0000000..f8d49cf --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_lwpcookiejar.py @@ -0,0 +1,185 @@ +"""Load / save to libwww-perl (LWP) format files. + +Actually, the format is slightly extended from that used by LWP's +(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information +not recorded by LWP. + +It uses the version string "2.0", though really there isn't an LWP Cookies +2.0 format. This indicates that there is extra information in here +(domain_dot and port_spec) while still being compatible with libwww-perl, +I hope. 
+ +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import time, re, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +from _headersutil import join_header_words, split_header_words +from _util import iso2time, time2isoz + +debug = logging.getLogger("mechanize").debug + + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in an the LWP cookie file format. + + Actually, the format is extended a bit -- see module docstring. + + """ + h = [(cookie.name, cookie.value), + ("path", cookie.path), + ("domain", cookie.domain)] + if cookie.port is not None: h.append(("port", cookie.port)) + if cookie.path_specified: h.append(("path_spec", None)) + if cookie.port_specified: h.append(("port_spec", None)) + if cookie.domain_initial_dot: h.append(("domain_dot", None)) + if cookie.secure: h.append(("secure", None)) + if cookie.expires: h.append(("expires", + time2isoz(float(cookie.expires)))) + if cookie.discard: h.append(("discard", None)) + if cookie.comment: h.append(("comment", cookie.comment)) + if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) + if cookie.rfc2109: h.append(("rfc2109", None)) + + keys = cookie.nonstandard_attr_keys() + keys.sort() + for k in keys: + h.append((k, str(cookie.get_nonstandard_attr(k)))) + + h.append(("version", str(cookie.version))) + + return join_header_words([h]) + +class LWPCookieJar(FileCookieJar): + """ + The LWPCookieJar saves a sequence of"Set-Cookie3" lines. + "Set-Cookie3" is the format used by the libwww-perl libary, not known + to be compatible with any browser, but which is easy to read and + doesn't lose information about RFC 2965 cookies. 
+ + Additional methods + + as_lwp_str(ignore_discard=True, ignore_expired=True) + + """ + + magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" + + def as_lwp_str(self, ignore_discard=True, ignore_expires=True): + """Return cookies as a string of "\n"-separated "Set-Cookie3" headers. + + ignore_discard and ignore_expires: see docstring for FileCookieJar.save + + """ + now = time.time() + r = [] + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard", cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired", cookie.name) + continue + r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) + return "\n".join(r+[""]) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + debug("Saving LWP cookies file") + # There really isn't an LWP Cookies 2.0 format, but this indicates + # that there is extra information in here (domain_dot and + # port_spec) while still being compatible with libwww-perl, I hope. 
+ f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(ignore_discard, ignore_expires)) + finally: + f.close() + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not re.search(self.magic_re, magic): + msg = "%s does not seem to contain cookies" % filename + raise LoadError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard", "rfc2109") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not line.startswith(header): + continue + line = line[len(header):].strip() + + for data in split_header_words([line]): + name, value = data[0] + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = k.lower() + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = domain.startswith(".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest, + h("rfc2109"), + ) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except: + reraise_unmasked_exceptions((IOError,)) + raise LoadError("invalid Set-Cookie3 format file %s" % filename) + diff --git 
a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_mechanize.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_mechanize.py new file mode 100644 index 0000000..ad729c9 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_mechanize.py @@ -0,0 +1,676 @@ +"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize. + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> +Copyright 2003 Andy Lester (original Perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import urllib2, copy, re, os, urllib + + +from _html import DefaultFactory +import _response +import _request +import _rfc3986 +import _sockettimeout +from _useragent import UserAgentBase + +__version__ = (0, 1, 11, None, None) # 0.1.11 + +class BrowserStateError(Exception): pass +class LinkNotFoundError(Exception): pass +class FormNotFoundError(Exception): pass + + +def sanepathname2url(path): + urlpath = urllib.pathname2url(path) + if os.name == "nt" and urlpath.startswith("///"): + urlpath = urlpath[2:] + # XXX don't ask me about the mac... + return urlpath + + +class History: + """ + + Though this will become public, the implied interface is not yet stable. + + """ + def __init__(self): + self._history = [] # LIFO + def add(self, request, response): + self._history.append((request, response)) + def back(self, n, _response): + response = _response # XXX move Browser._response into this class? 
+ while n > 0 or response is None: + try: + request, response = self._history.pop() + except IndexError: + raise BrowserStateError("already at start of history") + n -= 1 + return request, response + def clear(self): + del self._history[:] + def close(self): + for request, response in self._history: + if response is not None: + response.close() + del self._history[:] + + +class HTTPRefererProcessor(urllib2.BaseHandler): + def http_request(self, request): + # See RFC 2616 14.36. The only times we know the source of the + # request URI has a URI associated with it are redirect, and + # Browser.click() / Browser.submit() / Browser.follow_link(). + # Otherwise, it's the user's job to add any Referer header before + # .open()ing. + if hasattr(request, "redirect_dict"): + request = self.parent._add_referer_header( + request, origin_request=False) + return request + + https_request = http_request + + +class Browser(UserAgentBase): + """Browser-like class with support for history, forms and links. + + BrowserStateError is raised whenever the browser is in the wrong state to + complete the requested operation - eg., when .back() is called when the + browser history is empty, or when .follow_link() is called when the current + response does not contain HTML data. + + Public attributes: + + request: current request (mechanize.Request or urllib2.Request) + form: currently selected form (see .select_form()) + + """ + + handler_classes = copy.copy(UserAgentBase.handler_classes) + handler_classes["_referer"] = HTTPRefererProcessor + default_features = copy.copy(UserAgentBase.default_features) + default_features.append("_referer") + + def __init__(self, + factory=None, + history=None, + request_class=None, + ): + """ + + Only named arguments should be passed to this constructor. + + factory: object implementing the mechanize.Factory interface. + history: object implementing the mechanize.History interface. Note + this interface is still experimental and may change in future. 
+ request_class: Request class to use. Defaults to mechanize.Request + by default for Pythons older than 2.4, urllib2.Request otherwise. + + The Factory and History objects passed in are 'owned' by the Browser, + so they should not be shared across Browsers. In particular, + factory.set_response() should not be called except by the owning + Browser itself. + + Note that the supplied factory's request_class is overridden by this + constructor, to ensure only one Request class is used. + + """ + self._handle_referer = True + + if history is None: + history = History() + self._history = history + + if request_class is None: + if not hasattr(urllib2.Request, "add_unredirected_header"): + request_class = _request.Request + else: + request_class = urllib2.Request # Python >= 2.4 + + if factory is None: + factory = DefaultFactory() + factory.set_request_class(request_class) + self._factory = factory + self.request_class = request_class + + self.request = None + self._set_response(None, False) + + # do this last to avoid __getattr__ problems + UserAgentBase.__init__(self) + + def close(self): + UserAgentBase.close(self) + if self._response is not None: + self._response.close() + if self._history is not None: + self._history.close() + self._history = None + + # make use after .close easy to spot + self.form = None + self.request = self._response = None + self.request = self.response = self.set_response = None + self.geturl = self.reload = self.back = None + self.clear_history = self.set_cookie = self.links = self.forms = None + self.viewing_html = self.encoding = self.title = None + self.select_form = self.click = self.submit = self.click_link = None + self.follow_link = self.find_link = None + + def set_handle_referer(self, handle): + """Set whether to add Referer header to each request.""" + self._set_handler("_referer", handle) + self._handle_referer = bool(handle) + + def _add_referer_header(self, request, origin_request=True): + if self.request is None: + return 
request + scheme = request.get_type() + original_scheme = self.request.get_type() + if scheme not in ["http", "https"]: + return request + if not origin_request and not self.request.has_header("Referer"): + return request + + if (self._handle_referer and + original_scheme in ["http", "https"] and + not (original_scheme == "https" and scheme != "https")): + # strip URL fragment (RFC 2616 14.36) + parts = _rfc3986.urlsplit(self.request.get_full_url()) + parts = parts[:-1]+(None,) + referer = _rfc3986.urlunsplit(parts) + request.add_unredirected_header("Referer", referer) + return request + + def open_novisit(self, url, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + """Open a URL without visiting it. + + Browser state (including request, response, history, forms and links) + is left unchanged by calling this function. + + The interface is the same as for .open(). + + This is useful for things like fetching images. + + See also .retrieve(). + + """ + return self._mech_open(url, data, visit=False, timeout=timeout) + + def open(self, url, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return self._mech_open(url, data, timeout=timeout) + + def _mech_open(self, url, data=None, update_history=True, visit=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + try: + url.get_full_url + except AttributeError: + # string URL -- convert to absolute URL if required + scheme, authority = _rfc3986.urlsplit(url)[:2] + if scheme is None: + # relative URL + if self._response is None: + raise BrowserStateError( + "can't fetch relative reference: " + "not viewing any document") + url = _rfc3986.urljoin(self._response.geturl(), url) + + request = self._request(url, data, visit, timeout) + visit = request.visit + if visit is None: + visit = True + + if visit: + self._visit_request(request, update_history) + + success = True + try: + response = UserAgentBase.open(self, request, data) + except urllib2.HTTPError, error: + success = False + if error.fp is 
None: # not a response + raise + response = error +## except (IOError, socket.error, OSError), error: +## # Yes, urllib2 really does raise all these :-(( +## # See test_urllib2.py for examples of socket.gaierror and OSError, +## # plus note that FTPHandler raises IOError. +## # XXX I don't seem to have an example of exactly socket.error being +## # raised, only socket.gaierror... +## # I don't want to start fixing these here, though, since this is a +## # subclass of OpenerDirector, and it would break old code. Even in +## # Python core, a fix would need some backwards-compat. hack to be +## # acceptable. +## raise + + if visit: + self._set_response(response, False) + response = copy.copy(self._response) + elif response is not None: + response = _response.upgrade_response(response) + + if not success: + raise response + return response + + def __str__(self): + text = [] + text.append("<%s " % self.__class__.__name__) + if self._response: + text.append("visiting %s" % self._response.geturl()) + else: + text.append("(not visiting a URL)") + if self.form: + text.append("\n selected form:\n %s\n" % str(self.form)) + text.append(">") + return "".join(text) + + def response(self): + """Return a copy of the current response. + + The returned object has the same interface as the object returned by + .open() (or urllib2.urlopen()). + + """ + return copy.copy(self._response) + + def open_local_file(self, filename): + path = sanepathname2url(os.path.abspath(filename)) + url = 'file://'+path + return self.open(url) + + def set_response(self, response): + """Replace current response with (a copy of) response. + + response may be None. + + This is intended mostly for HTML-preprocessing. 
+ """ + self._set_response(response, True) + + def _set_response(self, response, close_current): + # sanity check, necessary but far from sufficient + if not (response is None or + (hasattr(response, "info") and hasattr(response, "geturl") and + hasattr(response, "read") + ) + ): + raise ValueError("not a response object") + + self.form = None + if response is not None: + response = _response.upgrade_response(response) + if close_current and self._response is not None: + self._response.close() + self._response = response + self._factory.set_response(response) + + def visit_response(self, response, request=None): + """Visit the response, as if it had been .open()ed. + + Unlike .set_response(), this updates history rather than replacing the + current response. + """ + if request is None: + request = _request.Request(response.geturl()) + self._visit_request(request, True) + self._set_response(response, False) + + def _visit_request(self, request, update_history): + if self._response is not None: + self._response.close() + if self.request is not None and update_history: + self._history.add(self.request, self._response) + self._response = None + # we want self.request to be assigned even if UserAgentBase.open + # fails + self.request = request + + def geturl(self): + """Get URL of current document.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._response.geturl() + + def reload(self): + """Reload current document, and return response object.""" + if self.request is None: + raise BrowserStateError("no URL has yet been .open()ed") + if self._response is not None: + self._response.close() + return self._mech_open(self.request, update_history=False) + + def back(self, n=1): + """Go back n steps in history, and return response object. 
+ + n: go back this number of steps (default 1 step) + + """ + if self._response is not None: + self._response.close() + self.request, response = self._history.back(n, self._response) + self.set_response(response) + if not response.read_complete: + return self.reload() + return copy.copy(response) + + def clear_history(self): + self._history.clear() + + def set_cookie(self, cookie_string): + """Request to set a cookie. + + Note that it is NOT necessary to call this method under ordinary + circumstances: cookie handling is normally entirely automatic. The + intended use case is rather to simulate the setting of a cookie by + client script in a web page (e.g. JavaScript). In that case, use of + this method is necessary because mechanize currently does not support + JavaScript, VBScript, etc. + + The cookie is added in the same way as if it had arrived with the + current response, as a result of the current request. This means that, + for example, if it is not appropriate to set the cookie based on the + current request, no cookie will be set. + + The cookie will be returned automatically with subsequent responses + made by the Browser instance whenever that's appropriate. + + cookie_string should be a valid value of the Set-Cookie header. + + For example: + + browser.set_cookie( + "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT") + + Currently, this method does not allow for adding RFC 2986 cookies. + This limitation will be lifted if anybody requests it. 
+ + """ + if self._response is None: + raise BrowserStateError("not viewing any document") + if self.request.get_type() not in ["http", "https"]: + raise BrowserStateError("can't set cookie for non-HTTP/HTTPS " + "transactions") + cookiejar = self._ua_handlers["_cookies"].cookiejar + response = self.response() # copy + headers = response.info() + headers["Set-cookie"] = cookie_string + cookiejar.extract_cookies(response, self.request) + + def links(self, **kwds): + """Return iterable over links (mechanize.Link objects).""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + links = self._factory.links() + if kwds: + return self._filter_links(links, **kwds) + else: + return links + + def forms(self): + """Return iterable over forms. + + The returned form objects implement the ClientForm.HTMLForm interface. + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.forms() + + def global_form(self): + """Return the global form object, or None if the factory implementation + did not supply one. + + The "global" form object contains all controls that are not descendants + of any FORM element. + + The returned form object implements the ClientForm.HTMLForm interface. + + This is a separate method since the global form is not regarded as part + of the sequence of forms in the document -- mostly for + backwards-compatibility. + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.global_form + + def viewing_html(self): + """Return whether the current response contains HTML data.""" + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._factory.is_html + + def encoding(self): + if self._response is None: + raise BrowserStateError("not viewing any document") + return self._factory.encoding + + def title(self): + r"""Return title, or None if there is no title element in the document. 
+ + Treatment of any tag children of attempts to follow Firefox and IE + (currently, tags are preserved). + + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + return self._factory.title + + def select_form(self, name=None, predicate=None, nr=None): + """Select an HTML form for input. + + This is a bit like giving a form the "input focus" in a browser. + + If a form is selected, the Browser object supports the HTMLForm + interface, so you can call methods like .set_value(), .set(), and + .click(). + + Another way to select a form is to assign to the .form attribute. The + form assigned should be one of the objects returned by the .forms() + method. + + At least one of the name, predicate and nr arguments must be supplied. + If no matching form is found, mechanize.FormNotFoundError is raised. + + If name is specified, then the form must have the indicated name. + + If predicate is specified, then the form must match that function. The + predicate function is passed the HTMLForm as its single argument, and + should return a boolean value indicating whether the form matched. + + nr, if supplied, is the sequence number of the form (where 0 is the + first). Note that control 0 is the first form matching all the other + arguments (if supplied); it is not necessarily the first control in the + form. The "global form" (consisting of all form controls not contained + in any FORM element) is considered not to be part of this sequence and + to have no name, so will not be matched unless both name and nr are + None. 
+ + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + if (name is None) and (predicate is None) and (nr is None): + raise ValueError( + "at least one argument must be supplied to specify form") + + global_form = self._factory.global_form + if nr is None and name is None and \ + predicate is not None and predicate(global_form): + self.form = global_form + return + + orig_nr = nr + for form in self.forms(): + if name is not None and name != form.name: + continue + if predicate is not None and not predicate(form): + continue + if nr: + nr -= 1 + continue + self.form = form + break # success + else: + # failure + description = [] + if name is not None: description.append("name '%s'" % name) + if predicate is not None: + description.append("predicate %s" % predicate) + if orig_nr is not None: description.append("nr %d" % orig_nr) + description = ", ".join(description) + raise FormNotFoundError("no form matching "+description) + + def click(self, *args, **kwds): + """See ClientForm.HTMLForm.click for documentation.""" + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + request = self.form.click(*args, **kwds) + return self._add_referer_header(request) + + def submit(self, *args, **kwds): + """Submit current form. + + Arguments are as for ClientForm.HTMLForm.click(). + + Return value is same as for Browser.open(). + + """ + return self.open(self.click(*args, **kwds)) + + def click_link(self, link=None, **kwds): + """Find a link and return a Request object for it. + + Arguments are as for .find_link(), except that a link may be supplied + as the first argument. 
+ + """ + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + if not link: + link = self.find_link(**kwds) + else: + if kwds: + raise ValueError( + "either pass a Link, or keyword arguments, not both") + request = self.request_class(link.absolute_url) + return self._add_referer_header(request) + + def follow_link(self, link=None, **kwds): + """Find a link and .open() it. + + Arguments are as for .click_link(). + + Return value is same as for Browser.open(). + + """ + return self.open(self.click_link(link, **kwds)) + + def find_link(self, **kwds): + """Find a link in current page. + + Links are returned as mechanize.Link objects. + + # Return third link that .search()-matches the regexp "python" + # (by ".search()-matches", I mean that the regular expression method + # .search() is used, rather than .match()). + find_link(text_regex=re.compile("python"), nr=2) + + # Return first http link in the current page that points to somewhere + # on python.org whose link text (after tags have been removed) is + # exactly "monty python". + find_link(text="monty python", + url_regex=re.compile("http.*python.org")) + + # Return first link with exactly three HTML attributes. + find_link(predicate=lambda link: len(link.attrs) == 3) + + Links include anchors (<a>), image maps (<area>), and frames (<frame>, + <iframe>). + + All arguments must be passed by keyword, not position. Zero or more + arguments may be supplied. In order to find a link, all arguments + supplied must match. + + If a matching link is not found, mechanize.LinkNotFoundError is raised. + + text: link text between link tags: eg. <a href="blah">this bit</a> (as + returned by pullparser.get_compressed_text(), ie. 
without tags but + with opening tags "textified" as per the pullparser docs) must compare + equal to this argument, if supplied + text_regex: link text between tag (as defined above) must match the + regular expression object or regular expression string passed as this + argument, if supplied + name, name_regex: as for text and text_regex, but matched against the + name HTML attribute of the link tag + url, url_regex: as for text and text_regex, but matched against the + URL of the link tag (note this matches against Link.url, which is a + relative or absolute URL according to how it was written in the HTML) + tag: element name of opening tag, eg. "a" + predicate: a function taking a Link object as its single argument, + returning a boolean result, indicating whether the links + nr: matches the nth link that matches all other criteria (default 0) + + """ + try: + return self._filter_links(self._factory.links(), **kwds).next() + except StopIteration: + raise LinkNotFoundError() + + def __getattr__(self, name): + # pass through ClientForm / DOMForm methods and attributes + form = self.__dict__.get("form") + if form is None: + raise AttributeError( + "%s instance has no attribute %s (perhaps you forgot to " + ".select_form()?)" % (self.__class__, name)) + return getattr(form, name) + + def _filter_links(self, links, + text=None, text_regex=None, + name=None, name_regex=None, + url=None, url_regex=None, + tag=None, + predicate=None, + nr=0 + ): + if not self.viewing_html(): + raise BrowserStateError("not viewing HTML") + + found_links = [] + orig_nr = nr + + for link in links: + if url is not None and url != link.url: + continue + if url_regex is not None and not re.search(url_regex, link.url): + continue + if (text is not None and + (link.text is None or text != link.text)): + continue + if (text_regex is not None and + (link.text is None or not re.search(text_regex, link.text))): + continue + if name is not None and name != dict(link.attrs).get("name"): + continue + 
if name_regex is not None: + link_name = dict(link.attrs).get("name") + if link_name is None or not re.search(name_regex, link_name): + continue + if tag is not None and tag != link.tag: + continue + if predicate is not None and not predicate(link): + continue + if nr: + nr -= 1 + continue + yield link + nr = orig_nr diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_mozillacookiejar.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_mozillacookiejar.py new file mode 100644 index 0000000..51e81bb --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_mozillacookiejar.py @@ -0,0 +1,161 @@ +"""Mozilla / Netscape cookie loading / saving. + +Copyright 2002-2006 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import re, time, logging + +from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError +debug = logging.getLogger("ClientCookie").debug + + +class MozillaCookieJar(FileCookieJar): + """ + + WARNING: you may want to backup your browser's cookies file if you use + this class to save cookies. I *think* it works, but there have been + bugs in the past! + + This class differs from CookieJar only in the format it uses to save and + load cookies to and from a file. This class uses the Mozilla/Netscape + `cookies.txt' format. lynx uses this file format, too. + + Don't expect cookies saved while the browser is running to be noticed by + the browser (in fact, Mozilla on unix will overwrite your saved cookies if + you change them on disk while it's running; on Windows, you probably can't + save at all while the browser is running). 
+ + Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to + Netscape cookies on saving. + + In particular, the cookie version and port number information is lost, + together with information about whether or not Path, Port and Discard were + specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the + domain as set in the HTTP header started with a dot (yes, I'm aware some + domains in Netscape files start with a dot and some don't -- trust me, you + really don't want to know any more about this). + + Note that though Mozilla and Netscape use the same format, they use + slightly different headers. The class saves cookies using the Netscape + header by default (Mozilla can cope with that). + + """ + magic_re = "#( Netscape)? HTTP Cookie File" + header = """\ + # Netscape HTTP Cookie File + # http://www.netscape.com/newsref/std/cookie_spec.html + # This is a generated file! Do not edit. + +""" + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + now = time.time() + + magic = f.readline() + if not re.search(self.magic_re, magic): + f.close() + raise LoadError( + "%s does not look like a Netscape format cookies file" % + filename) + + try: + while 1: + line = f.readline() + if line == "": break + + # last field may be absent, so keep any trailing tab + if line.endswith("\n"): line = line[:-1] + + # skip comments and blank lines XXX what is $ for? 
+ if (line.strip().startswith("#") or + line.strip().startswith("$") or + line.strip() == ""): + continue + + domain, domain_specified, path, secure, expires, name, value = \ + line.split("\t", 6) + secure = (secure == "TRUE") + domain_specified = (domain_specified == "TRUE") + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + if domain_specified != initial_dot: + raise LoadError("domain and domain specified flag don't " + "match in %s: %s" % (filename, line)) + + discard = False + if expires == "": + expires = None + discard = True + + # assume path_specified is false + c = Cookie(0, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + {}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + except: + reraise_unmasked_exceptions((IOError, LoadError)) + raise LoadError("invalid Netscape format file %s: %s" % + (filename, line)) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + debug("Saving Netscape cookies.txt file") + f.write(self.header) + now = time.time() + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard", cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired", cookie.name) + continue + if cookie.secure: secure = "TRUE" + else: secure = "FALSE" + if cookie.domain.startswith("."): initial_dot = "TRUE" + else: initial_dot = "FALSE" + if cookie.expires is not None: + expires = str(cookie.expires) + else: + expires = "" + if cookie.value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas cookielib regards it as a + # cookie with 
no value. + name = "" + value = cookie.name + else: + name = cookie.name + value = cookie.value + f.write( + "\t".join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, value])+ + "\n") + finally: + f.close() diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_msiecookiejar.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_msiecookiejar.py new file mode 100644 index 0000000..1057811 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_msiecookiejar.py @@ -0,0 +1,388 @@ +"""Microsoft Internet Explorer cookie loading on Windows. + +Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code) +Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +# XXX names and comments are not great here + +import os, re, time, struct, logging +if os.name == "nt": + import _winreg + +from _clientcookie import FileCookieJar, CookieJar, Cookie, \ + MISSING_FILENAME_TEXT, LoadError + +debug = logging.getLogger("mechanize").debug + + +def regload(path, leaf): + key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0, + _winreg.KEY_ALL_ACCESS) + try: + value = _winreg.QueryValueEx(key, leaf)[0] + except WindowsError: + value = None + return value + +WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME + +def epoch_time_offset_from_win32_filetime(filetime): + """Convert from win32 filetime to seconds-since-epoch value. + + MSIE stores create and expire times as Win32 FILETIME, which is 64 + bits of 100 nanosecond intervals since Jan 01 1601. + + mechanize expects time in 32-bit value expressed in seconds since the + epoch (Jan 01 1970). 
+ + """ + if filetime < WIN32_EPOCH: + raise ValueError("filetime (%d) is before epoch (%d)" % + (filetime, WIN32_EPOCH)) + + return divmod((filetime - WIN32_EPOCH), 10000000L)[0] + +def binary_to_char(c): return "%02X" % ord(c) +def binary_to_str(d): return "".join(map(binary_to_char, list(d))) + +class MSIEBase: + magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*") + padding = "\x0d\xf0\xad\x0b" + + msie_domain_re = re.compile(r"^([^/]+)(/.*)$") + cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?" + "(.+\@[\x21-\xFF]+\.txt)") + + # path under HKEY_CURRENT_USER from which to get location of index.dat + reg_path = r"software\microsoft\windows" \ + r"\currentversion\explorer\shell folders" + reg_key = "Cookies" + + def __init__(self): + self._delayload_domains = {} + + def _delayload_domain(self, domain): + # if necessary, lazily load cookies for this domain + delayload_info = self._delayload_domains.get(domain) + if delayload_info is not None: + cookie_file, ignore_discard, ignore_expires = delayload_info + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except (LoadError, IOError): + debug("error reading cookie file, skipping: %s", cookie_file) + else: + del self._delayload_domains[domain] + + def _load_cookies_from_file(self, filename): + debug("Loading MSIE cookies file: %s", filename) + cookies = [] + + cookies_fh = open(filename) + + try: + while 1: + key = cookies_fh.readline() + if key == "": break + + rl = cookies_fh.readline + def getlong(rl=rl): return long(rl().rstrip()) + def getstr(rl=rl): return rl().rstrip() + + key = key.rstrip() + value = getstr() + domain_path = getstr() + flags = getlong() # 0x2000 bit is for secure I think + lo_expire = getlong() + hi_expire = getlong() + lo_create = getlong() + hi_create = getlong() + sep = getstr() + + if "" in (key, value, domain_path, flags, hi_expire, lo_expire, + hi_create, lo_create, sep) or (sep != "*"): + break + + m = self.msie_domain_re.search(domain_path) + if 
m: + domain = m.group(1) + path = m.group(2) + + cookies.append({"KEY": key, "VALUE": value, + "DOMAIN": domain, "PATH": path, + "FLAGS": flags, "HIXP": hi_expire, + "LOXP": lo_expire, "HICREATE": hi_create, + "LOCREATE": lo_create}) + finally: + cookies_fh.close() + + return cookies + + def load_cookie_data(self, filename, + ignore_discard=False, ignore_expires=False): + """Load cookies from file containing actual cookie data. + + Old cookies are kept unless overwritten by newly loaded ones. + + You should not call this method if the delayload attribute is set. + + I think each of these files contain all cookies for one user, domain, + and path. + + filename: file containing cookies -- usually found in a file like + C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt + + """ + now = int(time.time()) + + cookie_data = self._load_cookies_from_file(filename) + + for cookie in cookie_data: + flags = cookie["FLAGS"] + secure = ((flags & 0x2000) != 0) + filetime = (cookie["HIXP"] << 32) + cookie["LOXP"] + expires = epoch_time_offset_from_win32_filetime(filetime) + if expires < now: + discard = True + else: + discard = False + domain = cookie["DOMAIN"] + initial_dot = domain.startswith(".") + if initial_dot: + domain_specified = True + else: + # MSIE 5 does not record whether the domain cookie-attribute + # was specified. + # Assuming it wasn't is conservative, because with strict + # domain matching this will match less frequently; with regular + # Netscape tail-matching, this will match at exactly the same + # times that domain_specified = True would. It also means we + # don't have to prepend a dot to achieve consistency with our + # own & Mozilla's domain-munging scheme. + domain_specified = False + + # assume path_specified is false + # XXX is there other stuff in here? -- eg. comment, commentURL? 
+ c = Cookie(0, + cookie["KEY"], cookie["VALUE"], + None, False, + domain, domain_specified, initial_dot, + cookie["PATH"], False, + secure, + expires, + discard, + None, + None, + {"flags": flags}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + CookieJar.set_cookie(self, c) + + def load_from_registry(self, ignore_discard=False, ignore_expires=False, + username=None): + """ + username: only required on win9x + + """ + cookies_dir = regload(self.reg_path, self.reg_key) + filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT")) + self.load(filename, ignore_discard, ignore_expires, username) + + def _really_load(self, index, filename, ignore_discard, ignore_expires, + username): + now = int(time.time()) + + if username is None: + username = os.environ['USERNAME'].lower() + + cookie_dir = os.path.dirname(filename) + + data = index.read(256) + if len(data) != 256: + raise LoadError("%s file is too short" % filename) + + # Cookies' index.dat file starts with 32 bytes of signature + # followed by an offset to the first record, stored as a little- + # endian DWORD. + sig, size, data = data[:32], data[32:36], data[36:] + size = struct.unpack("<L", size)[0] + + # check that sig is valid + if not self.magic_re.match(sig) or size != 0x4000: + raise LoadError("%s ['%s' %s] does not seem to contain cookies" % + (str(filename), sig, size)) + + # skip to start of first record + index.seek(size, 0) + + sector = 128 # size of sector in bytes + + while 1: + data = "" + + # Cookies are usually in two contiguous sectors, so read in two + # sectors and adjust if not a Cookie. + to_read = 2 * sector + d = index.read(to_read) + if len(d) != to_read: + break + data = data + d + + # Each record starts with a 4-byte signature and a count + # (little-endian DWORD) of sectors for the record. 
+ sig, size, data = data[:4], data[4:8], data[8:] + size = struct.unpack("<L", size)[0] + + to_read = (size - 2) * sector + +## from urllib import quote +## print "data", quote(data) +## print "sig", quote(sig) +## print "size in sectors", size +## print "size in bytes", size*sector +## print "size in units of 16 bytes", (size*sector) / 16 +## print "size to read in bytes", to_read +## print + + if sig != "URL ": + assert sig in ("HASH", "LEAK", \ + self.padding, "\x00\x00\x00\x00"), \ + "unrecognized MSIE index.dat record: %s" % \ + binary_to_str(sig) + if sig == "\x00\x00\x00\x00": + # assume we've got all the cookies, and stop + break + if sig == self.padding: + continue + # skip the rest of this record + assert to_read >= 0 + if size != 2: + assert to_read != 0 + index.seek(to_read, 1) + continue + + # read in rest of record if necessary + if size > 2: + more_data = index.read(to_read) + if len(more_data) != to_read: break + data = data + more_data + + cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username + + "(%s\@[\x21-\xFF]+\.txt)" % username) + m = re.search(cookie_re, data, re.I) + if m: + cookie_file = os.path.join(cookie_dir, m.group(2)) + if not self.delayload: + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except (LoadError, IOError): + debug("error reading cookie file, skipping: %s", + cookie_file) + else: + domain = m.group(1) + i = domain.find("/") + if i != -1: + domain = domain[:i] + + self._delayload_domains[domain] = ( + cookie_file, ignore_discard, ignore_expires) + + +class MSIECookieJar(MSIEBase, FileCookieJar): + """FileCookieJar that reads from the Windows MSIE cookies database. + + MSIECookieJar can read the cookie files of Microsoft Internet Explorer + (MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP and + Windows 98. Other configurations may also work, but are untested. Saving + cookies in MSIE format is NOT supported. 
If you save cookies, they'll be + in the usual Set-Cookie3 format, which you can read back in using an + instance of the plain old CookieJar class. Don't save using the same + filename that you loaded cookies from, because you may succeed in + clobbering your MSIE cookies index file! + + You should be able to have LWP share Internet Explorer's cookies like + this (note you need to supply a username to load_from_registry if you're on + Windows 9x or Windows ME): + + cj = MSIECookieJar(delayload=1) + # find cookies index file in registry and load cookies from it + cj.load_from_registry() + opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) + response = opener.open("http://example.com/") + + Iterating over a delayloaded MSIECookieJar instance will not cause any + cookies to be read from disk. To force reading of all cookies from disk, + call read_all_cookies. Note that the following methods iterate over self: + clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__ + and as_string. 
+ + Additional methods: + + load_from_registry(ignore_discard=False, ignore_expires=False, + username=None) + load_cookie_data(filename, ignore_discard=False, ignore_expires=False) + read_all_cookies() + + """ + def __init__(self, filename=None, delayload=False, policy=None): + MSIEBase.__init__(self) + FileCookieJar.__init__(self, filename, delayload, policy) + + def set_cookie(self, cookie): + if self.delayload: + self._delayload_domain(cookie.domain) + CookieJar.set_cookie(self, cookie) + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + domains = self._cookies.copy() + domains.update(self._delayload_domains) + domains = domains.keys() + + cookies = [] + for domain in domains: + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookies_for_domain(self, domain, request): + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + if self.delayload: + self._delayload_domain(domain) + return CookieJar._cookies_for_domain(self, domain, request) + + def read_all_cookies(self): + """Eagerly read in all cookies.""" + if self.delayload: + for domain in self._delayload_domains.keys(): + self._delayload_domain(domain) + + def load(self, filename, ignore_discard=False, ignore_expires=False, + username=None): + """Load cookies from an MSIE 'index.dat' cookies index file. 
+ + filename: full path to cookie index file + username: only required on win9x + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + index = open(filename, "rb") + + try: + self._really_load(index, filename, ignore_discard, ignore_expires, + username) + finally: + index.close() diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_opener.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_opener.py new file mode 100644 index 0000000..d94eacf --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_opener.py @@ -0,0 +1,436 @@ +"""Integration with Python standard library module urllib2: OpenerDirector +class. + +Copyright 2004-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). 
+ +""" + +import os, urllib2, bisect, httplib, types, tempfile +try: + import threading as _threading +except ImportError: + import dummy_threading as _threading +try: + set +except NameError: + import sets + set = sets.Set + +import _file +import _http +from _request import Request +import _response +import _rfc3986 +import _sockettimeout +import _upgrade +from _util import isstringlike + + +class ContentTooShortError(urllib2.URLError): + def __init__(self, reason, result): + urllib2.URLError.__init__(self, reason) + self.result = result + + +def set_request_attr(req, name, value, default): + try: + getattr(req, name) + except AttributeError: + setattr(req, name, default) + if value is not default: + setattr(req, name, value) + + +class OpenerDirector(urllib2.OpenerDirector): + def __init__(self): + urllib2.OpenerDirector.__init__(self) + # really none of these are (sanely) public -- the lack of initial + # underscore on some is just due to following urllib2 + self.process_response = {} + self.process_request = {} + self._any_request = {} + self._any_response = {} + self._handler_index_valid = True + self._tempfiles = [] + + def add_handler(self, handler): + if handler in self.handlers: + return + # XXX why does self.handlers need to be sorted? 
+ bisect.insort(self.handlers, handler) + handler.add_parent(self) + self._handler_index_valid = False + + def _maybe_reindex_handlers(self): + if self._handler_index_valid: + return + + handle_error = {} + handle_open = {} + process_request = {} + process_response = {} + any_request = set() + any_response = set() + unwanted = [] + + for handler in self.handlers: + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + if meth == "any_request": + any_request.add(handler) + added = True + continue + elif meth == "any_response": + any_response.add(handler) + added = True + continue + + ii = meth.find("_") + scheme = meth[:ii] + condition = meth[ii+1:] + + if condition.startswith("error"): + jj = meth[ii+1:].find("_") + ii + 1 + kind = meth[jj+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = handle_error.setdefault(scheme, {}) + elif condition == "open": + kind = scheme + lookup = handle_open + elif condition == "request": + kind = scheme + lookup = process_request + elif condition == "response": + kind = scheme + lookup = process_response + else: + continue + + lookup.setdefault(kind, set()).add(handler) + added = True + + if not added: + unwanted.append(handler) + + for handler in unwanted: + self.handlers.remove(handler) + + # sort indexed methods + # XXX could be cleaned up + for lookup in [process_request, process_response]: + for scheme, handlers in lookup.iteritems(): + lookup[scheme] = handlers + for scheme, lookup in handle_error.iteritems(): + for code, handlers in lookup.iteritems(): + handlers = list(handlers) + handlers.sort() + lookup[code] = handlers + for scheme, handlers in handle_open.iteritems(): + handlers = list(handlers) + handlers.sort() + handle_open[scheme] = handlers + + # cache the indexes + self.handle_error = handle_error + self.handle_open = handle_open + self.process_request = process_request + self.process_response = 
process_response + self._any_request = any_request + self._any_response = any_response + + def _request(self, url_or_req, data, visit, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + if isstringlike(url_or_req): + req = Request(url_or_req, data, visit=visit, timeout=timeout) + else: + # already a urllib2.Request or mechanize.Request instance + req = url_or_req + if data is not None: + req.add_data(data) + # XXX yuck + set_request_attr(req, "visit", visit, None) + set_request_attr(req, "timeout", timeout, + _sockettimeout._GLOBAL_DEFAULT_TIMEOUT) + return req + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + req = self._request(fullurl, data, None, timeout) + req_scheme = req.get_type() + + self._maybe_reindex_handlers() + + # pre-process request + # XXX should we allow a Processor to change the URL scheme + # of the request? + request_processors = set(self.process_request.get(req_scheme, [])) + request_processors.update(self._any_request) + request_processors = list(request_processors) + request_processors.sort() + for processor in request_processors: + for meth_name in ["any_request", req_scheme+"_request"]: + meth = getattr(processor, meth_name, None) + if meth: + req = meth(req) + + # In Python >= 2.4, .open() supports processors already, so we must + # call ._open() instead. 
+ urlopen = getattr(urllib2.OpenerDirector, "_open", + urllib2.OpenerDirector.open) + response = urlopen(self, req, data) + + # post-process response + response_processors = set(self.process_response.get(req_scheme, [])) + response_processors.update(self._any_response) + response_processors = list(response_processors) + response_processors.sort() + for processor in response_processors: + for meth_name in ["any_response", req_scheme+"_response"]: + meth = getattr(processor, meth_name, None) + if meth: + response = meth(req, response) + + return response + + def error(self, proto, *args): + if proto in ['http', 'https']: + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = apply(self._call_chain, args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return apply(self._call_chain, args) + + BLOCK_SIZE = 1024*8 + def retrieve(self, fullurl, filename=None, reporthook=None, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + """Returns (filename, headers). + + For remote objects, the default filename will refer to a temporary + file. Temporary files are removed when the OpenerDirector.close() + method is called. + + For file: URLs, at present the returned filename is None. This may + change in future. + + If the actual number of bytes read is less than indicated by the + Content-Length header, raises ContentTooShortError (a URLError + subclass). The exception's .result attribute contains the (filename, + headers) that would have been returned. 
+ + """ + req = self._request(fullurl, data, False, timeout) + scheme = req.get_type() + fp = self.open(req) + headers = fp.info() + if filename is None and scheme == 'file': + # XXX req.get_selector() seems broken here, return None, + # pending sanity :-/ + return None, headers + #return urllib.url2pathname(req.get_selector()), headers + if filename: + tfp = open(filename, 'wb') + else: + path = _rfc3986.urlsplit(req.get_full_url())[2] + suffix = os.path.splitext(path)[1] + fd, filename = tempfile.mkstemp(suffix) + self._tempfiles.append(filename) + tfp = os.fdopen(fd, 'wb') + + result = filename, headers + bs = self.BLOCK_SIZE + size = -1 + read = 0 + blocknum = 0 + if reporthook: + if "content-length" in headers: + size = int(headers["Content-Length"]) + reporthook(blocknum, bs, size) + while 1: + block = fp.read(bs) + if block == "": + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + fp.close() + tfp.close() + del fp + del tfp + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: " + "got only %i out of %i bytes" % (read, size), + result + ) + + return result + + def close(self): + urllib2.OpenerDirector.close(self) + + # make it very obvious this object is no longer supposed to be used + self.open = self.error = self.retrieve = self.add_handler = None + + if self._tempfiles: + for filename in self._tempfiles: + try: + os.unlink(filename) + except OSError: + pass + del self._tempfiles[:] + + +def wrapped_open(urlopen, process_response_object, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + success = True + try: + response = urlopen(fullurl, data, timeout) + except urllib2.HTTPError, error: + success = False + if error.fp is None: # not a response + raise + response = error + + if response is not None: + response = process_response_object(response) + + if not success: 
+ raise response + return response + +class ResponseProcessingOpener(OpenerDirector): + + def open(self, fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + def bound_open(fullurl, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + return OpenerDirector.open(self, fullurl, data, timeout) + return wrapped_open( + bound_open, self.process_response_object, fullurl, data, timeout) + + def process_response_object(self, response): + return response + + +class SeekableResponseOpener(ResponseProcessingOpener): + def process_response_object(self, response): + return _response.seek_wrapped_response(response) + + +class OpenerFactory: + """This class's interface is quite likely to change.""" + + default_classes = [ + # handlers + urllib2.ProxyHandler, + urllib2.UnknownHandler, + _http.HTTPHandler, # derived from new AbstractHTTPHandler + _http.HTTPDefaultErrorHandler, + _http.HTTPRedirectHandler, # bugfixed + urllib2.FTPHandler, + _file.FileHandler, + # processors + _upgrade.HTTPRequestUpgradeProcessor, + _http.HTTPCookieProcessor, + _http.HTTPErrorProcessor, + ] + if hasattr(httplib, 'HTTPS'): + default_classes.append(_http.HTTPSHandler) + handlers = [] + replacement_handlers = [] + + def __init__(self, klass=OpenerDirector): + self.klass = klass + + def build_opener(self, *handlers): + """Create an opener object from a list of handlers and processors. + + The opener will use several default handlers and processors, including + support for HTTP and FTP. + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. 
+ + """ + opener = self.klass() + default_classes = list(self.default_classes) + skip = [] + for klass in default_classes: + for check in handlers: + if type(check) == types.ClassType: + if issubclass(check, klass): + skip.append(klass) + elif type(check) == types.InstanceType: + if isinstance(check, klass): + skip.append(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + for h in handlers: + if type(h) == types.ClassType: + h = h() + opener.add_handler(h) + + return opener + + +build_opener = OpenerFactory().build_opener + +_opener = None +urlopen_lock = _threading.Lock() +def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.open(url, data, timeout) + +def urlretrieve(url, filename=None, reporthook=None, data=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.retrieve(url, filename, reporthook, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_pullparser.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_pullparser.py new file mode 100644 index 0000000..4d8d9d3 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_pullparser.py @@ -0,0 +1,390 @@ +"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. + +Examples + +This program extracts all links from a document. 
It will print one +line for each link, containing the URL and the textual description +between the <A>...</A> tags: + +import pullparser, sys +f = file(sys.argv[1]) +p = pullparser.PullParser(f) +for token in p.tags("a"): + if token.type == "endtag": continue + url = dict(token.attrs).get("href", "-") + text = p.get_compressed_text(endat=("endtag", "a")) + print "%s\t%s" % (url, text) + +This program extracts the <TITLE> from the document: + +import pullparser, sys +f = file(sys.argv[1]) +p = pullparser.PullParser(f) +if p.get_tag("title"): + title = p.get_compressed_text() + print "Title: %s" % title + + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> +Copyright 1998-2001 Gisle Aas (original libwww-perl code) + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses. + +""" + +import re, htmlentitydefs +import sgmllib, HTMLParser +from xml.sax import saxutils + +from _html import unescape, unescape_charref + + +class NoMoreTokensError(Exception): pass + +class Token: + """Represents an HTML tag, declaration, processing instruction etc. + + Behaves as both a tuple-like object (ie. iterable) and has attributes + .type, .data and .attrs. 
+ + >>> t = Token("starttag", "a", [("href", "http://www.python.org/")]) + >>> t == ("starttag", "a", [("href", "http://www.python.org/")]) + True + >>> (t.type, t.data) == ("starttag", "a") + True + >>> t.attrs == [("href", "http://www.python.org/")] + True + + Public attributes + + type: one of "starttag", "endtag", "startendtag", "charref", "entityref", + "data", "comment", "decl", "pi", after the corresponding methods of + HTMLParser.HTMLParser + data: For a tag, the tag name; otherwise, the relevant data carried by the + tag, as a string + attrs: list of (name, value) pairs representing HTML attributes + (or None if token does not represent an opening tag) + + """ + def __init__(self, type, data, attrs=None): + self.type = type + self.data = data + self.attrs = attrs + def __iter__(self): + return iter((self.type, self.data, self.attrs)) + def __eq__(self, other): + type, data, attrs = other + if (self.type == type and + self.data == data and + self.attrs == attrs): + return True + else: + return False + def __ne__(self, other): return not self.__eq__(other) + def __repr__(self): + args = ", ".join(map(repr, [self.type, self.data, self.attrs])) + return self.__class__.__name__+"(%s)" % args + + def __str__(self): + """ + >>> print Token("starttag", "br") + <br> + >>> print Token("starttag", "a", + ... 
[("href", "http://www.python.org/"), ("alt", '"foo"')]) + <a href="http://www.python.org/" alt='"foo"'> + >>> print Token("startendtag", "br") + <br /> + >>> print Token("startendtag", "br", [("spam", "eggs")]) + <br spam="eggs" /> + >>> print Token("endtag", "p") + </p> + >>> print Token("charref", "38") + & + >>> print Token("entityref", "amp") + & + >>> print Token("data", "foo\\nbar") + foo + bar + >>> print Token("comment", "Life is a bowl\\nof cherries.") + <!--Life is a bowl + of cherries.--> + >>> print Token("decl", "decl") + <!decl> + >>> print Token("pi", "pi") + <?pi> + """ + if self.attrs is not None: + attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for + k, v in self.attrs]) + else: + attrs = "" + if self.type == "starttag": + return "<%s%s>" % (self.data, attrs) + elif self.type == "startendtag": + return "<%s%s />" % (self.data, attrs) + elif self.type == "endtag": + return "</%s>" % self.data + elif self.type == "charref": + return "&#%s;" % self.data + elif self.type == "entityref": + return "&%s;" % self.data + elif self.type == "data": + return self.data + elif self.type == "comment": + return "<!--%s-->" % self.data + elif self.type == "decl": + return "<!%s>" % self.data + elif self.type == "pi": + return "<?%s>" % self.data + assert False + + +def iter_until_exception(fn, exception, *args, **kwds): + while 1: + try: + yield fn(*args, **kwds) + except exception: + raise StopIteration + + +class _AbstractParser: + chunk = 1024 + compress_re = re.compile(r"\s+") + def __init__(self, fh, textify={"img": "alt", "applet": "alt"}, + encoding="ascii", entitydefs=None): + """ + fh: file-like object (only a .read() method is required) from which to + read HTML to be parsed + textify: mapping used by .get_text() and .get_compressed_text() methods + to represent opening tags as text + encoding: encoding used to encode numeric character references by + .get_text() and .get_compressed_text() ("ascii" by default) + + entitydefs: mapping like {"amp": 
"&", ...} containing HTML entity + definitions (a sensible default is used). This is used to unescape + entities in .get_text() (and .get_compressed_text()) and attribute + values. If the encoding can not represent the character, the entity + reference is left unescaped. Note that entity references (both + numeric - e.g. { or ઼ - and non-numeric - e.g. &) are + unescaped in attribute values and the return value of .get_text(), but + not in data outside of tags. Instead, entity references outside of + tags are represented as tokens. This is a bit odd, it's true :-/ + + If the element name of an opening tag matches a key in the textify + mapping then that tag is converted to text. The corresponding value is + used to specify which tag attribute to obtain the text from. textify + maps from element names to either: + + - an HTML attribute name, in which case the HTML attribute value is + used as its text value along with the element name in square + brackets (eg."alt text goes here[IMG]", or, if the alt attribute + were missing, just "[IMG]") + - a callable object (eg. a function) which takes a Token and returns + the string to be used as its text value + + If textify has no key for an element name, nothing is substituted for + the opening tag. + + Public attributes: + + encoding and textify: see above + + """ + self._fh = fh + self._tokenstack = [] # FIFO + self.textify = textify + self.encoding = encoding + if entitydefs is None: + entitydefs = htmlentitydefs.name2codepoint + self._entitydefs = entitydefs + + def __iter__(self): return self + + def tags(self, *names): + return iter_until_exception(self.get_tag, NoMoreTokensError, *names) + + def tokens(self, *tokentypes): + return iter_until_exception(self.get_token, NoMoreTokensError, + *tokentypes) + + def next(self): + try: + return self.get_token() + except NoMoreTokensError: + raise StopIteration() + + def get_token(self, *tokentypes): + """Pop the next Token object from the stack of parsed tokens. 
+ + If arguments are given, they are taken to be token types in which the + caller is interested: tokens representing other elements will be + skipped. Element names must be given in lower case. + + Raises NoMoreTokensError. + + """ + while 1: + while self._tokenstack: + token = self._tokenstack.pop(0) + if tokentypes: + if token.type in tokentypes: + return token + else: + return token + data = self._fh.read(self.chunk) + if not data: + raise NoMoreTokensError() + self.feed(data) + + def unget_token(self, token): + """Push a Token back onto the stack.""" + self._tokenstack.insert(0, token) + + def get_tag(self, *names): + """Return the next Token that represents an opening or closing tag. + + If arguments are given, they are taken to be element names in which the + caller is interested: tags representing other elements will be skipped. + Element names must be given in lower case. + + Raises NoMoreTokensError. + + """ + while 1: + tok = self.get_token() + if tok.type not in ["starttag", "endtag", "startendtag"]: + continue + if names: + if tok.data in names: + return tok + else: + return tok + + def get_text(self, endat=None): + """Get some text. + + endat: stop reading text at this tag (the tag is included in the + returned text); endtag is a tuple (type, name) where type is + "starttag", "endtag" or "startendtag", and name is the element name of + the tag (element names must be given in lower case) + + If endat is not given, .get_text() will stop at the next opening or + closing tag, or when there are no more tokens (no exception is raised). + Note that .get_text() includes the text representation (if any) of the + opening tag, but pushes the opening tag back onto the stack. As a + result, if you want to call .get_text() again, you need to call + .get_tag() first (unless you want an empty string returned when you + next call .get_text()). 
+ + Entity references are translated using the value of the entitydefs + constructor argument (a mapping from names to characters like that + provided by the standard module htmlentitydefs). Named entity + references that are not in this mapping are left unchanged. + + The textify attribute is used to translate opening tags into text: see + the class docstring. + + """ + text = [] + tok = None + while 1: + try: + tok = self.get_token() + except NoMoreTokensError: + # unget last token (not the one we just failed to get) + if tok: self.unget_token(tok) + break + if tok.type == "data": + text.append(tok.data) + elif tok.type == "entityref": + t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding) + text.append(t) + elif tok.type == "charref": + t = unescape_charref(tok.data, self.encoding) + text.append(t) + elif tok.type in ["starttag", "endtag", "startendtag"]: + tag_name = tok.data + if tok.type in ["starttag", "startendtag"]: + alt = self.textify.get(tag_name) + if alt is not None: + if callable(alt): + text.append(alt(tok)) + elif tok.attrs is not None: + for k, v in tok.attrs: + if k == alt: + text.append(v) + text.append("[%s]" % tag_name.upper()) + if endat is None or endat == (tok.type, tag_name): + self.unget_token(tok) + break + return "".join(text) + + def get_compressed_text(self, *args, **kwds): + """ + As .get_text(), but collapses each group of contiguous whitespace to a + single space character, and removes all initial and trailing + whitespace. 
+ + """ + text = self.get_text(*args, **kwds) + text = text.strip() + return self.compress_re.sub(" ", text) + + def handle_startendtag(self, tag, attrs): + self._tokenstack.append(Token("startendtag", tag, attrs)) + def handle_starttag(self, tag, attrs): + self._tokenstack.append(Token("starttag", tag, attrs)) + def handle_endtag(self, tag): + self._tokenstack.append(Token("endtag", tag)) + def handle_charref(self, name): + self._tokenstack.append(Token("charref", name)) + def handle_entityref(self, name): + self._tokenstack.append(Token("entityref", name)) + def handle_data(self, data): + self._tokenstack.append(Token("data", data)) + def handle_comment(self, data): + self._tokenstack.append(Token("comment", data)) + def handle_decl(self, decl): + self._tokenstack.append(Token("decl", decl)) + def unknown_decl(self, data): + # XXX should this call self.error instead? + #self.error("unknown declaration: " + `data`) + self._tokenstack.append(Token("decl", data)) + def handle_pi(self, data): + self._tokenstack.append(Token("pi", data)) + + def unescape_attr(self, name): + return unescape(name, self._entitydefs, self.encoding) + def unescape_attrs(self, attrs): + escaped_attrs = [] + for key, val in attrs: + escaped_attrs.append((key, self.unescape_attr(val))) + return escaped_attrs + +class PullParser(_AbstractParser, HTMLParser.HTMLParser): + def __init__(self, *args, **kwds): + HTMLParser.HTMLParser.__init__(self) + _AbstractParser.__init__(self, *args, **kwds) + def unescape(self, name): + # Use the entitydefs passed into constructor, not + # HTMLParser.HTMLParser's entitydefs. 
+ return self.unescape_attr(name) + +class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser): + def __init__(self, *args, **kwds): + sgmllib.SGMLParser.__init__(self) + _AbstractParser.__init__(self, *args, **kwds) + def unknown_starttag(self, tag, attrs): + attrs = self.unescape_attrs(attrs) + self._tokenstack.append(Token("starttag", tag, attrs)) + def unknown_endtag(self, tag): + self._tokenstack.append(Token("endtag", tag)) + + +def _test(): + import doctest, _pullparser + return doctest.testmod(_pullparser) + +if __name__ == "__main__": + _test() diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_request.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_request.py new file mode 100644 index 0000000..7824441 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_request.py @@ -0,0 +1,87 @@ +"""Integration with Python standard library module urllib2: Request class. + +Copyright 2004-2006 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file +COPYING.txt included with the distribution). + +""" + +import urllib2, urllib, logging + +from _clientcookie import request_host_lc +import _rfc3986 +import _sockettimeout + +warn = logging.getLogger("mechanize").warning + + +class Request(urllib2.Request): + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False, visit=None, + timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT): + # In mechanize 0.2, the interpretation of a unicode url argument will + # change: A unicode url argument will be interpreted as an IRI, and a + # bytestring as a URI. For now, we accept unicode or bytestring. 
We + # don't insist that the value is always a URI (specifically, must only + # contain characters which are legal), because that might break working + # code (who knows what bytes some servers want to see, especially with + # browser plugins for internationalised URIs). + if not _rfc3986.is_clean_uri(url): + warn("url argument is not a URI " + "(contains illegal characters) %r" % url) + urllib2.Request.__init__(self, url, data, headers) + self.selector = None + self.unredirected_hdrs = {} + self.visit = visit + self.timeout = timeout + + # All the terminology below comes from RFC 2965. + self.unverifiable = unverifiable + # Set request-host of origin transaction. + # The origin request-host is needed in order to decide whether + # unverifiable sub-requests (automatic redirects, images embedded + # in HTML, etc.) are to third-party hosts. If they are, the + # resulting transactions might need to be conducted with cookies + # turned off. + if origin_req_host is None: + origin_req_host = request_host_lc(self) + self.origin_req_host = origin_req_host + + def get_selector(self): + return urllib.splittag(self.__r_host)[0] + + def get_origin_req_host(self): + return self.origin_req_host + + def is_unverifiable(self): + return self.unverifiable + + def add_unredirected_header(self, key, val): + """Add a header that will not be added to a redirected request.""" + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + """True iff request has named header (regular or unredirected).""" + return (header_name in self.headers or + header_name in self.unredirected_hdrs) + + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return hdrs.items() + + def __str__(self): + return "<Request for %s>" % self.get_full_url() + + def get_method(self): + if self.has_data(): 
+ return "POST" + else: + return "GET" diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_response.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_response.py new file mode 100644 index 0000000..fad9b57 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_response.py @@ -0,0 +1,527 @@ +"""Response classes. + +The seek_wrapper code is not used if you're using UserAgent with +.set_seekable_responses(False), or if you're using the urllib2-level interface +without SeekableProcessor or HTTPEquivProcessor. Class closeable_response is +instantiated by some handlers (AbstractHTTPHandler), but the closeable_response +interface is only depended upon by Browser-level code. Function +upgrade_response is only used if you're using Browser or +ResponseUpgradeProcessor. + + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). + +""" + +import copy, mimetools +from cStringIO import StringIO +import urllib2 + + +def len_of_seekable(file_): + # this function exists because evaluation of len(file_.getvalue()) on every + # .read() from seek_wrapper would be O(N**2) in number of .read()s + pos = file_.tell() + file_.seek(0, 2) # to end + try: + return file_.tell() + finally: + file_.seek(pos) + + +# XXX Andrew Dalke kindly sent me a similar class in response to my request on +# comp.lang.python, which I then proceeded to lose. I wrote this class +# instead, but I think he's released his code publicly since, could pinch the +# tests from it, at least... + +# For testing seek_wrapper invariant (note that +# test_urllib2.HandlerTest.test_seekable is expected to fail when this +# invariant checking is turned on). 
The invariant checking is done by module +# ipdc, which is available here: +# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834 +## from ipdbc import ContractBase +## class seek_wrapper(ContractBase): +class seek_wrapper: + """Adds a seek method to a file object. + + This is only designed for seeking on readonly file-like objects. + + Wrapped file-like object must have a read method. The readline method is + only supported if that method is present on the wrapped object. The + readlines method is always supported. xreadlines and iteration are + supported only for Python 2.2 and above. + + Public attributes: + + wrapped: the wrapped file object + is_closed: true iff .close() has been called + + WARNING: All other attributes of the wrapped object (ie. those that are not + one of wrapped, read, readline, readlines, xreadlines, __iter__ and next) + are passed through unaltered, which may or may not make sense for your + particular file object. + + """ + # General strategy is to check that cache is full enough, then delegate to + # the cache (self.__cache, which is a cStringIO.StringIO instance). A seek + # position (self.__pos) is maintained independently of the cache, in order + # that a single cache may be shared between multiple seek_wrapper objects. + # Copying using module copy shares the cache in this way. + + def __init__(self, wrapped): + self.wrapped = wrapped + self.__read_complete_state = [False] + self.__is_closed_state = [False] + self.__have_readline = hasattr(self.wrapped, "readline") + self.__cache = StringIO() + self.__pos = 0 # seek position + + def invariant(self): + # The end of the cache is always at the same place as the end of the + # wrapped file (though the .tell() method is not required to be present + # on wrapped file). 
+ return self.wrapped.tell() == len(self.__cache.getvalue()) + + def close(self): + self.wrapped.close() + self.is_closed = True + + def __getattr__(self, name): + if name == "is_closed": + return self.__is_closed_state[0] + elif name == "read_complete": + return self.__read_complete_state[0] + + wrapped = self.__dict__.get("wrapped") + if wrapped: + return getattr(wrapped, name) + + return getattr(self.__class__, name) + + def __setattr__(self, name, value): + if name == "is_closed": + self.__is_closed_state[0] = bool(value) + elif name == "read_complete": + if not self.is_closed: + self.__read_complete_state[0] = bool(value) + else: + self.__dict__[name] = value + + def seek(self, offset, whence=0): + assert whence in [0,1,2] + + # how much data, if any, do we need to read? + if whence == 2: # 2: relative to end of *wrapped* file + if offset < 0: raise ValueError("negative seek offset") + # since we don't know yet where the end of that file is, we must + # read everything + to_read = None + else: + if whence == 0: # 0: absolute + if offset < 0: raise ValueError("negative seek offset") + dest = offset + else: # 1: relative to current position + pos = self.__pos + if pos < offset: + raise ValueError("seek to before start of file") + dest = pos + offset + end = len_of_seekable(self.__cache) + to_read = dest - end + if to_read < 0: + to_read = 0 + + if to_read != 0: + self.__cache.seek(0, 2) + if to_read is None: + assert whence == 2 + self.__cache.write(self.wrapped.read()) + self.read_complete = True + self.__pos = self.__cache.tell() - offset + else: + data = self.wrapped.read(to_read) + if not data: + self.read_complete = True + else: + self.__cache.write(data) + # Don't raise an exception even if we've seek()ed past the end + # of .wrapped, since fseek() doesn't complain in that case. + # Also like fseek(), pretend we have seek()ed past the end, + # i.e. 
not: + #self.__pos = self.__cache.tell() + # but rather: + self.__pos = dest + else: + self.__pos = dest + + def tell(self): + return self.__pos + + def __copy__(self): + cpy = self.__class__(self.wrapped) + cpy.__cache = self.__cache + cpy.__read_complete_state = self.__read_complete_state + cpy.__is_closed_state = self.__is_closed_state + return cpy + + def get_data(self): + pos = self.__pos + try: + self.seek(0) + return self.read(-1) + finally: + self.__pos = pos + + def read(self, size=-1): + pos = self.__pos + end = len_of_seekable(self.__cache) + available = end - pos + + # enough data already cached? + if size <= available and size != -1: + self.__cache.seek(pos) + self.__pos = pos+size + return self.__cache.read(size) + + # no, so read sufficient data from wrapped file and cache it + self.__cache.seek(0, 2) + if size == -1: + self.__cache.write(self.wrapped.read()) + self.read_complete = True + else: + to_read = size - available + assert to_read > 0 + data = self.wrapped.read(to_read) + if not data: + self.read_complete = True + else: + self.__cache.write(data) + self.__cache.seek(pos) + + data = self.__cache.read(size) + self.__pos = self.__cache.tell() + assert self.__pos == pos + len(data) + return data + + def readline(self, size=-1): + if not self.__have_readline: + raise NotImplementedError("no readline method on wrapped object") + + # line we're about to read might not be complete in the cache, so + # read another line first + pos = self.__pos + self.__cache.seek(0, 2) + data = self.wrapped.readline() + if not data: + self.read_complete = True + else: + self.__cache.write(data) + self.__cache.seek(pos) + + data = self.__cache.readline() + if size != -1: + r = data[:size] + self.__pos = pos+size + else: + r = data + self.__pos = pos+len(data) + return r + + def readlines(self, sizehint=-1): + pos = self.__pos + self.__cache.seek(0, 2) + self.__cache.write(self.wrapped.read()) + self.read_complete = True + self.__cache.seek(pos) + data = 
self.__cache.readlines(sizehint) + self.__pos = self.__cache.tell() + return data + + def __iter__(self): return self + def next(self): + line = self.readline() + if line == "": raise StopIteration + return line + + xreadlines = __iter__ + + def __repr__(self): + return ("<%s at %s whose wrapped object = %r>" % + (self.__class__.__name__, hex(abs(id(self))), self.wrapped)) + + +class response_seek_wrapper(seek_wrapper): + + """ + Supports copying response objects and setting response body data. + + """ + + def __init__(self, wrapped): + seek_wrapper.__init__(self, wrapped) + self._headers = self.wrapped.info() + + def __copy__(self): + cpy = seek_wrapper.__copy__(self) + # copy headers from delegate + cpy._headers = copy.copy(self.info()) + return cpy + + # Note that .info() and .geturl() (the only two urllib2 response methods + # that are not implemented by seek_wrapper) must be here explicitly rather + # than by seek_wrapper's __getattr__ delegation) so that the nasty + # dynamically-created HTTPError classes in get_seek_wrapper_class() get the + # wrapped object's implementation, and not HTTPError's. + + def info(self): + return self._headers + + def geturl(self): + return self.wrapped.geturl() + + def set_data(self, data): + self.seek(0) + self.read() + self.close() + cache = self._seek_wrapper__cache = StringIO() + cache.write(data) + self.seek(0) + + +class eoffile: + # file-like object that always claims to be at end-of-file... + def read(self, size=-1): return "" + def readline(self, size=-1): return "" + def __iter__(self): return self + def next(self): return "" + def close(self): pass + +class eofresponse(eoffile): + def __init__(self, url, headers, code, msg): + self._url = url + self._headers = headers + self.code = code + self.msg = msg + def geturl(self): return self._url + def info(self): return self._headers + + +class closeable_response: + """Avoids unnecessarily clobbering urllib.addinfourl methods on .close(). 
+ + Only supports responses returned by mechanize.HTTPHandler. + + After .close(), the following methods are supported: + + .read() + .readline() + .info() + .geturl() + .__iter__() + .next() + .close() + + and the following attributes are supported: + + .code + .msg + + Also supports pickling (but the stdlib currently does something to prevent + it: http://python.org/sf/1144636). + + """ + # presence of this attr indicates is useable after .close() + closeable_response = None + + def __init__(self, fp, headers, url, code, msg): + self._set_fp(fp) + self._headers = headers + self._url = url + self.code = code + self.msg = msg + + def _set_fp(self, fp): + self.fp = fp + self.read = self.fp.read + self.readline = self.fp.readline + if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines + if hasattr(self.fp, "fileno"): + self.fileno = self.fp.fileno + else: + self.fileno = lambda: None + self.__iter__ = self.fp.__iter__ + self.next = self.fp.next + + def __repr__(self): + return '<%s at %s whose fp = %r>' % ( + self.__class__.__name__, hex(abs(id(self))), self.fp) + + def info(self): + return self._headers + + def geturl(self): + return self._url + + def close(self): + wrapped = self.fp + wrapped.close() + new_wrapped = eofresponse( + self._url, self._headers, self.code, self.msg) + self._set_fp(new_wrapped) + + def __getstate__(self): + # There are three obvious options here: + # 1. truncate + # 2. read to end + # 3. close socket, pickle state including read position, then open + # again on unpickle and use Range header + # XXXX um, 4. refuse to pickle unless .close()d. This is better, + # actually ("errors should never pass silently"). Pickling doesn't + # work anyway ATM, because of http://python.org/sf/1144636 so fix + # this later + + # 2 breaks pickle protocol, because one expects the original object + # to be left unscathed by pickling. 3 is too complicated and + # surprising (and too much work ;-) to happen in a sane __getstate__. + # So we do 1. 
+ + state = self.__dict__.copy() + new_wrapped = eofresponse( + self._url, self._headers, self.code, self.msg) + state["wrapped"] = new_wrapped + return state + +def test_response(data='test data', headers=[], + url="http://example.com/", code=200, msg="OK"): + return make_response(data, headers, url, code, msg) + +def test_html_response(data='test data', headers=[], + url="http://example.com/", code=200, msg="OK"): + headers += [("Content-type", "text/html")] + return make_response(data, headers, url, code, msg) + +def make_response(data, headers, url, code, msg): + """Convenient factory for objects implementing response interface. + + data: string containing response body data + headers: sequence of (name, value) pairs + url: URL of response + code: integer response code (e.g. 200) + msg: string response code message (e.g. "OK") + + """ + mime_headers = make_headers(headers) + r = closeable_response(StringIO(data), mime_headers, url, code, msg) + return response_seek_wrapper(r) + + +def make_headers(headers): + """ + headers: sequence of (name, value) pairs + """ + hdr_text = [] + for name_value in headers: + hdr_text.append("%s: %s" % name_value) + return mimetools.Message(StringIO("\n".join(hdr_text))) + + +# Rest of this module is especially horrible, but needed, at least until fork +# urllib2. Even then, may want to preseve urllib2 compatibility. 
+ +def get_seek_wrapper_class(response): + # in order to wrap response objects that are also exceptions, we must + # dynamically subclass the exception :-((( + if (isinstance(response, urllib2.HTTPError) and + not hasattr(response, "seek")): + if response.__class__.__module__ == "__builtin__": + exc_class_name = response.__class__.__name__ + else: + exc_class_name = "%s.%s" % ( + response.__class__.__module__, response.__class__.__name__) + + class httperror_seek_wrapper(response_seek_wrapper, response.__class__): + # this only derives from HTTPError in order to be a subclass -- + # the HTTPError behaviour comes from delegation + + _exc_class_name = exc_class_name + + def __init__(self, wrapped): + response_seek_wrapper.__init__(self, wrapped) + # be compatible with undocumented HTTPError attributes :-( + self.hdrs = wrapped.info() + self.filename = wrapped.geturl() + + def __repr__(self): + return ( + "<%s (%s instance) at %s " + "whose wrapped object = %r>" % ( + self.__class__.__name__, self._exc_class_name, + hex(abs(id(self))), self.wrapped) + ) + wrapper_class = httperror_seek_wrapper + else: + wrapper_class = response_seek_wrapper + return wrapper_class + +def seek_wrapped_response(response): + """Return a copy of response that supports seekable response interface. + + Accepts responses from both mechanize and urllib2 handlers. + + Copes with both oridinary response instances and HTTPError instances (which + can't be simply wrapped due to the requirement of preserving the exception + base class). + """ + if not hasattr(response, "seek"): + wrapper_class = get_seek_wrapper_class(response) + response = wrapper_class(response) + assert hasattr(response, "get_data") + return response + +def upgrade_response(response): + """Return a copy of response that supports Browser response interface. 
+ + Browser response interface is that of "seekable responses" + (response_seek_wrapper), plus the requirement that responses must be + useable after .close() (closeable_response). + + Accepts responses from both mechanize and urllib2 handlers. + + Copes with both ordinary response instances and HTTPError instances (which + can't be simply wrapped due to the requirement of preserving the exception + base class). + """ + wrapper_class = get_seek_wrapper_class(response) + if hasattr(response, "closeable_response"): + if not hasattr(response, "seek"): + response = wrapper_class(response) + assert hasattr(response, "get_data") + return copy.copy(response) + + # a urllib2 handler constructed the response, i.e. the response is an + # urllib.addinfourl or a urllib2.HTTPError, instead of a + # _Util.closeable_response as returned by e.g. mechanize.HTTPHandler + try: + code = response.code + except AttributeError: + code = None + try: + msg = response.msg + except AttributeError: + msg = None + + # may have already-.read() data from .seek() cache + data = None + get_data = getattr(response, "get_data", None) + if get_data: + data = get_data() + + response = closeable_response( + response.fp, response.info(), response.geturl(), code, msg) + response = wrapper_class(response) + if data: + response.set_data(data) + return response diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_rfc3986.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_rfc3986.py new file mode 100644 index 0000000..1bb5021 --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_rfc3986.py @@ -0,0 +1,241 @@ +"""RFC 3986 URI parsing and relative reference resolution / absolutization. + +(aka splitting and joining) + +Copyright 2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
+ +""" + +# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM. + +import re, urllib + +## def chr_range(a, b): +## return "".join(map(chr, range(ord(a), ord(b)+1))) + +## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" +## "abcdefghijklmnopqrstuvwxyz" +## "0123456789" +## "-_.~") +## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]" +## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%' +# this re matches any character that's not in URI_CHARS +BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]") + + +def clean_url(url, encoding): + # percent-encode illegal URI characters + # Trying to come up with test cases for this gave me a headache, revisit + # when do switch to unicode. + # Somebody else's comments (lost the attribution): +## - IE will return you the url in the encoding you send it +## - Mozilla/Firefox will send you latin-1 if there's no non latin-1 +## characters in your link. It will send you utf-8 however if there are... + if type(url) == type(""): + url = url.decode(encoding, "replace") + url = url.strip() + # for second param to urllib.quote(), we want URI_CHARS, minus the + # 'always_safe' characters that urllib.quote() never percent-encodes + return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~") + +def is_clean_uri(uri): + """ + >>> is_clean_uri("ABC!") + True + >>> is_clean_uri(u"ABC!") + True + >>> is_clean_uri("ABC|") + False + >>> is_clean_uri(u"ABC|") + False + >>> is_clean_uri("http://example.com/0") + True + >>> is_clean_uri(u"http://example.com/0") + True + """ + # note module re treats bytestrings as through they were decoded as latin-1 + # so this function accepts both unicode and bytestrings + return not bool(BAD_URI_CHARS_RE.search(uri)) + + +SPLIT_MATCH = re.compile( + r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match +def urlsplit(absolute_uri): + """Return scheme, authority, path, query, fragment.""" + match = SPLIT_MATCH(absolute_uri) + if match: + g = match.groups() + 
def urlunsplit(parts):
    """Reassemble a URI from (scheme, authority, path, query, fragment)."""
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces.append(scheme + ":")
    if authority is not None:
        pieces.append("//" + authority)
    pieces.append(path)
    if query is not None:
        pieces.append("?" + query)
    if fragment is not None:
        pieces.append("#" + fragment)
    return "".join(pieces)


def urljoin(base_uri, uri_reference):
    """Resolve uri_reference relative to base_uri (RFC 3986 section 5)."""
    base_parts = urlsplit(base_uri)
    ref_parts = urlsplit(uri_reference)
    return urlunsplit(urljoin_parts(base_parts, ref_parts))


# (A commented-out posixpath-based translation was removed here: it did not
# match the RFC algorithm implemented below.)

def urljoin_parts(base_parts, reference_parts):
    """RFC 3986 section 5.3 'transform references', on split tuples."""
    scheme, authority, path, query, fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # A reference with the base's own scheme is treated as schemeless.
    if rscheme == scheme:
        rscheme = None

    if rscheme is not None:
        return rscheme, rauthority, remove_dot_segments(rpath), rquery, rfragment
    if rauthority is not None:
        return scheme, rauthority, remove_dot_segments(rpath), rquery, rfragment
    if rpath == "":
        # Empty path: keep base path; query only overrides when present.
        if rquery is not None:
            tquery = rquery
        else:
            tquery = query
        return scheme, authority, path, tquery, rfragment
    if rpath.startswith("/"):
        tpath = remove_dot_segments(rpath)
    else:
        tpath = remove_dot_segments(merge(authority, path, rpath))
    return scheme, authority, tpath, rquery, rfragment
# (A commented-out prototype of remove_dot_segments was removed here; the
# implementation below is the literal RFC 3986 section 5.2.4 algorithm.)

def remove_dot_segments(path):
    """Remove '.' and '..' segments from path (RFC 3986 section 5.2.4)."""
    out = []
    while path:
        # A: drop a leading "../" or "./"
        if path.startswith("../"):
            path = path[3:]
            continue
        if path.startswith("./"):
            path = path[2:]
            continue
        # B: collapse "/./" (or trailing "/.") to "/"
        if path.startswith("/./"):
            path = path[2:]
            continue
        if path == "/.":
            path = "/"
            continue
        # C: "/../" (or trailing "/..") pops the previous output segment
        if path.startswith("/../"):
            path = path[3:]
            if out:
                out.pop()
            continue
        if path == "/..":
            path = "/"
            if out:
                out.pop()
            continue
        # D: a bare "." or ".." is dropped entirely
        if path == ".":
            path = path[1:]
            continue
        if path == "..":
            path = path[2:]
            continue
        # E: move the first segment (including any leading "/") to output
        begin = 0
        if path.startswith("/"):
            begin = 1
        cut = path.find("/", begin)
        if cut < 0:
            cut = None
        out.append(path[:cut])
        if cut is None:
            break
        path = path[cut:]
    return "".join(out)


def merge(base_authority, base_path, ref_path):
    """Merge a relative-path reference with the base path (RFC 3986 5.3)."""
    # NOTE: Roy Fielding's sample Perl implementation ignores base_authority
    # despite the RFC's wording; the parameter is kept but unused here.
    if base_path == "":
        return "/" + ref_path
    cut = base_path.rfind("/")
    if cut >= 0:
        return base_path[:cut + 1] + ref_path
    return ref_path
if __name__ == "__main__":
    import doctest
    doctest.testmod()

# ---- mechanize/_seek.py ----
from urllib2 import BaseHandler
from _util import deprecation
from _response import response_seek_wrapper


class SeekableProcessor(BaseHandler):
    """Deprecated: Make responses seekable."""

    def __init__(self):
        deprecation(
            "See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")

    def any_response(self, request, response):
        # Leave already-seekable responses untouched.
        if hasattr(response, "seek"):
            return response
        return response_seek_wrapper(response)

# ---- mechanize/_sockettimeout.py ----
import socket

try:
    # Python >= 2.6 exposes the sentinel the socket module uses itself.
    _GLOBAL_DEFAULT_TIMEOUT = socket._GLOBAL_DEFAULT_TIMEOUT
except AttributeError:
    # Older Pythons: fall back to a private unique sentinel object.
    _GLOBAL_DEFAULT_TIMEOUT = object()

# ---- mechanize/_testcase.py ----
import shutil
import tempfile
import unittest


class SetupStack(object):
    """Collects teardown callbacks and runs them in reverse order."""

    def __init__(self):
        self._on_teardown = []

    def add_teardown(self, teardown):
        self._on_teardown.append(teardown)

    def tear_down(self):
        # LIFO order, mirroring nested resource acquisition.
        for func in self._on_teardown[::-1]:
            func()
class TearDownConvenience(object):
    """Owns a SetupStack unless one is supplied by the caller."""

    def __init__(self, setup_stack=None):
        self._own_setup_stack = setup_stack is None
        if setup_stack is None:
            setup_stack = SetupStack()
        self._setup_stack = setup_stack

    # only call this convenience method if no setup_stack was supplied to c'tor
    def tear_down(self):
        assert self._own_setup_stack
        self._setup_stack.tear_down()


class TempDirMaker(TearDownConvenience):

    def make_temp_dir(self):
        """Create a temp dir whose removal is registered as a teardown."""
        temp_dir = tempfile.mkdtemp(prefix="tmp-%s-" % self.__class__.__name__)
        def remove_temp_dir():
            shutil.rmtree(temp_dir)
        self._setup_stack.add_teardown(remove_temp_dir)
        return temp_dir


class MonkeyPatcher(TearDownConvenience):

    def monkey_patch(self, obj, name, value):
        """Replace obj.<name> with value; restoration is a registered teardown."""
        original_value = getattr(obj, name)
        setattr(obj, name, value)
        def undo_patch():
            setattr(obj, name, original_value)
        self._setup_stack.add_teardown(undo_patch)


class TestCase(unittest.TestCase):
    """unittest.TestCase with teardown-stack conveniences."""

    def setUp(self):
        self._setup_stack = SetupStack()

    def tearDown(self):
        self._setup_stack.tear_down()

    def make_temp_dir(self, *args, **kwds):
        return TempDirMaker(self._setup_stack).make_temp_dir(*args, **kwds)

    def monkey_patch(self, *args, **kwds):
        return MonkeyPatcher(self._setup_stack).monkey_patch(*args, **kwds)

    def assert_contains(self, container, containee):
        self.assertTrue(containee in container, "%r not in %r" %
                        (containee, container))

    def assert_less_than(self, got, expected):
        self.assertTrue(got < expected, "%r >= %r" %
                        (got, expected))

# ---- mechanize/_upgrade.py ----
from urllib2 import BaseHandler

from _request import Request
from _response import upgrade_response
from _util import deprecation


class HTTPRequestUpgradeProcessor(BaseHandler):
    """Upgrade plain urllib2.Request objects to this module's Request."""
    handler_order = 0  # before anything else

    def http_request(self, request):
        # A mechanize Request already has add_unredirected_header.
        if not hasattr(request, "add_unredirected_header"):
            upgraded = Request(request.get_full_url(), request.data,
                               request.headers)
            # Copy optional attributes only when the source defines them.
            for attr in ("origin_req_host", "unverifiable", "visit"):
                try:
                    setattr(upgraded, attr, getattr(request, attr))
                except AttributeError:
                    pass
            request = upgraded
        return request

    https_request = http_request


class ResponseUpgradeProcessor(BaseHandler):
    """Deprecated: upgrade responses to be .close()able without becoming unusable."""
    handler_order = 0  # before anything else

    def __init__(self):
        deprecation(
            "See http://wwwsearch.sourceforge.net/mechanize/doc.html#seekable")

    def any_response(self, request, response):
        if not hasattr(response, 'closeable_response'):
            response = upgrade_response(response)
        return response

# ---- mechanize/_urllib2.py ----
# urllib2 work-alike interface
# ...from urllib2...
+from urllib2 import \ + URLError, \ + HTTPError, \ + BaseHandler, \ + UnknownHandler, \ + FTPHandler, \ + CacheFTPHandler +# ...and from mechanize +from _auth import \ + HTTPPasswordMgr, \ + HTTPPasswordMgrWithDefaultRealm, \ + AbstractBasicAuthHandler, \ + AbstractDigestAuthHandler, \ + HTTPProxyPasswordMgr, \ + ProxyHandler, \ + ProxyBasicAuthHandler, \ + ProxyDigestAuthHandler, \ + HTTPBasicAuthHandler, \ + HTTPDigestAuthHandler, \ + HTTPSClientCertMgr +from _debug import \ + HTTPResponseDebugProcessor, \ + HTTPRedirectDebugProcessor +from _file import \ + FileHandler +# crap ATM +## from _gzip import \ +## HTTPGzipProcessor +from _http import \ + HTTPHandler, \ + HTTPDefaultErrorHandler, \ + HTTPRedirectHandler, \ + HTTPEquivProcessor, \ + HTTPCookieProcessor, \ + HTTPRefererProcessor, \ + HTTPRefreshProcessor, \ + HTTPErrorProcessor, \ + HTTPRobotRulesProcessor, \ + RobotExclusionError +import httplib +if hasattr(httplib, 'HTTPS'): + from _http import HTTPSHandler +del httplib +from _opener import OpenerDirector, \ + SeekableResponseOpener, \ + build_opener, install_opener, urlopen +from _request import \ + Request +from _seek import \ + SeekableProcessor +from _upgrade import \ + HTTPRequestUpgradeProcessor, \ + ResponseUpgradeProcessor diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_useragent.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_useragent.py new file mode 100644 index 0000000..723f87c --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_useragent.py @@ -0,0 +1,352 @@ +"""Convenient HTTP UserAgent class. + +This is a subclass of urllib2.OpenerDirector. + + +Copyright 2003-2006 John J. Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt +included with the distribution). 
"""Convenient HTTP UserAgent class.

This is a subclass of urllib2.OpenerDirector.


Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import warnings

import _auth
import _gzip
import _opener
import _response
import _sockettimeout
import _urllib2


class UserAgentBase(_opener.OpenerDirector):
    """Convenient user-agent class.

    Do not use .add_handler() to add a handler for something already dealt with
    by this code.

    The only reason at present for the distinction between UserAgent and
    UserAgentBase is so that classes that depend on .seek()able responses
    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
    UserAgent exposes a .set_seekable_responses() method that allows switching
    off the adding of a .seek() method to responses.

    Public attributes:

    addheaders: list of (name, value) pairs specifying headers to send with
    every request, unless they are overridden in the Request instance.

    >>> ua = UserAgentBase()
    >>> ua.addheaders = [
    ...     ("User-agent", "Mozilla/5.0 (compatible)"),
    ...     ("From", "responsible.person@example.com")]

    """

    handler_classes = {
        # scheme handlers
        "http": _urllib2.HTTPHandler,
        # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
        "ftp": _urllib2.FTPHandler,
        "file": _urllib2.FileHandler,

        # other handlers
        "_unknown": _urllib2.UnknownHandler,
        # HTTP{S,}Handler depend on HTTPErrorProcessor too
        "_http_error": _urllib2.HTTPErrorProcessor,
        "_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
        "_http_default_error": _urllib2.HTTPDefaultErrorHandler,

        # feature handlers
        "_basicauth": _urllib2.HTTPBasicAuthHandler,
        "_digestauth": _urllib2.HTTPDigestAuthHandler,
        "_redirect": _urllib2.HTTPRedirectHandler,
        "_cookies": _urllib2.HTTPCookieProcessor,
        "_refresh": _urllib2.HTTPRefreshProcessor,
        "_equiv": _urllib2.HTTPEquivProcessor,
        "_proxy": _urllib2.ProxyHandler,
        "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
        "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
        "_robots": _urllib2.HTTPRobotRulesProcessor,
        "_gzip": _gzip.HTTPGzipProcessor,  # experimental!

        # debug handlers
        "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
        "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
        }

    default_schemes = ["http", "ftp", "file"]
    default_others = ["_unknown", "_http_error", "_http_request_upgrade",
                      "_http_default_error",
                      ]
    default_features = ["_redirect", "_cookies",
                        "_refresh", "_equiv",
                        "_basicauth", "_digestauth",
                        "_proxy", "_proxy_basicauth", "_proxy_digestauth",
                        "_robots",
                        ]
    if hasattr(_urllib2, 'HTTPSHandler'):
        handler_classes["https"] = _urllib2.HTTPSHandler
        default_schemes.append("https")

    def __init__(self):
        _opener.OpenerDirector.__init__(self)

        # Instantiate and register one handler per default scheme/feature.
        ua_handlers = self._ua_handlers = {}
        for scheme in (self.default_schemes +
                       self.default_others +
                       self.default_features):
            klass = self.handler_classes[scheme]
            ua_handlers[scheme] = klass()
        for handler in ua_handlers.itervalues():
            self.add_handler(handler)

        # Yuck.
        # Ensure correct default constructor args were passed to
        # HTTPRefreshProcessor and HTTPEquivProcessor.
        if "_refresh" in ua_handlers:
            self.set_handle_refresh(True)
        if "_equiv" in ua_handlers:
            self.set_handle_equiv(True)
        # Ensure default password managers are installed.
        pm = ppm = None
        if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
            pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
        if ("_proxy_basicauth" in ua_handlers or
            "_proxy_digestauth" in ua_handlers):
            ppm = _auth.HTTPProxyPasswordMgr()
        self.set_password_manager(pm)
        self.set_proxy_password_manager(ppm)
        # set default certificate manager
        if "https" in ua_handlers:
            cm = _urllib2.HTTPSClientCertMgr()
            self.set_client_cert_manager(cm)

    def close(self):
        _opener.OpenerDirector.close(self)
        self._ua_handlers = None

    # (Commented-out set_timeout / connection-cache stubs removed; see
    # upstream mechanize history if resurrecting them.)

    def set_handled_schemes(self, schemes):
        """Set sequence of URL scheme (protocol) strings.

        For example: ua.set_handled_schemes(["http", "ftp"])

        If this fails (with ValueError) because you've passed an unknown
        scheme, the set of handled schemes will not be changed.

        """
        # want is used as a set (dict keys); values are ignored.
        want = {}
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("not a scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                # BUG FIX: the '%s' placeholder was never interpolated, so the
                # error message always read literally "unknown scheme '%s'".
                raise ValueError("unknown scheme '%s'" % scheme)
            want[scheme] = None

        # get rid of scheme handlers we don't want
        for scheme, oldhandler in self._ua_handlers.items():
            if scheme.startswith("_"): continue  # not a scheme handler
            if scheme not in want:
                self._replace_handler(scheme, None)
            else:
                del want[scheme]  # already got it
        # add the scheme handlers that are missing
        for scheme in want.keys():
            self._set_handler(scheme, True)

    def set_cookiejar(self, cookiejar):
        """Set a mechanize.CookieJar, or None."""
        self._set_handler("_cookies", obj=cookiejar)

    # XXX could use Greg Stein's httpx for some of this instead?
    # or httplib2??
    def set_proxies(self, proxies):
        """Set a dictionary mapping URL scheme to proxy specification, or None.

        e.g. {"http": "joe:password@myproxy.example.com:3128",
              "ftp": "proxy.example.com"}

        """
        self._set_handler("_proxy", obj=proxies)

    def add_password(self, url, user, password, realm=None):
        self._password_manager.add_password(realm, url, user, password)

    def add_proxy_password(self, user, password, hostport=None, realm=None):
        self._proxy_password_manager.add_password(
            realm, hostport, user, password)

    def add_client_certificate(self, url, key_file, cert_file):
        """Add an SSL client certificate, for HTTPS client auth.

        key_file and cert_file must be filenames of the key and certificate
        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
        12) file to PEM format:

        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
        openssl pkcs12 -nocerts -in cert.p12 -out key.pem


        Note that client certificate password input is very inflexible ATM.
        At the moment this seems to be console only, which is presumably the
        default behaviour of libopenssl.  In future mechanize may support
        third-party libraries that (I assume) allow more options here.

        """
        self._client_cert_manager.add_key_cert(url, key_file, cert_file)

    # the following are rarely useful -- use add_password / add_proxy_password
    # instead
    def set_password_manager(self, password_manager):
        """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
        self._password_manager = password_manager
        self._set_handler("_basicauth", obj=password_manager)
        self._set_handler("_digestauth", obj=password_manager)

    def set_proxy_password_manager(self, password_manager):
        """Set a mechanize.HTTPProxyPasswordMgr, or None."""
        self._proxy_password_manager = password_manager
        self._set_handler("_proxy_basicauth", obj=password_manager)
        self._set_handler("_proxy_digestauth", obj=password_manager)

    def set_client_cert_manager(self, cert_manager):
        """Set a mechanize.HTTPClientCertMgr, or None."""
        self._client_cert_manager = cert_manager
        handler = self._ua_handlers["https"]
        handler.client_cert_manager = cert_manager

    # these methods all take a boolean parameter
    def set_handle_robots(self, handle):
        """Set whether to observe rules from robots.txt."""
        self._set_handler("_robots", handle)

    def set_handle_redirect(self, handle):
        """Set whether to handle HTTP 30x redirections."""
        self._set_handler("_redirect", handle)

    def set_handle_refresh(self, handle, max_time=None, honor_time=True):
        """Set whether to handle HTTP Refresh headers."""
        self._set_handler("_refresh", handle, constructor_kwds=
                          {"max_time": max_time, "honor_time": honor_time})

    def set_handle_equiv(self, handle, head_parser_class=None):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        if head_parser_class is not None:
            constructor_kwds = {"head_parser_class": head_parser_class}
        else:
            constructor_kwds = {}
        self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)

    def set_handle_gzip(self, handle):
        """Handle gzip transfer encoding.

        """
        if handle:
            warnings.warn(
                "gzip transfer encoding is experimental!", stacklevel=2)
        self._set_handler("_gzip", handle)

    def set_debug_redirects(self, handle):
        """Log information about HTTP redirects (including refreshes).

        Logging is performed using module logging.  The logger name is
        "mechanize.http_redirects".  To actually print some debug output,
        eg:

        import sys, logging
        logger = logging.getLogger("mechanize.http_redirects")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        Other logger names relevant to this module:

        "mechanize.http_responses"
        "mechanize.cookies" (or "cookielib" if running Python 2.4)

        To turn on everything:

        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        """
        self._set_handler("_debug_redirect", handle)

    def set_debug_responses(self, handle):
        """Log HTTP response bodies.

        See docstring for .set_debug_redirects() for details of logging.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        self._set_handler("_debug_response_body", handle)

    def set_debug_http(self, handle):
        """Print HTTP headers to sys.stdout."""
        level = int(bool(handle))
        for scheme in "http", "https":
            h = self._ua_handlers.get(scheme)
            if h is not None:
                h.set_http_debuglevel(level)

    def _set_handler(self, name, handle=None, obj=None,
                     constructor_args=(), constructor_kwds={}):
        # Install, replace, or remove the named handler.  The mutable default
        # for constructor_kwds is safe because it is only ever read.
        if handle is None:
            handle = obj is not None
        if handle:
            handler_class = self.handler_classes[name]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class(
                    *constructor_args, **constructor_kwds)
        else:
            newhandler = None
        self._replace_handler(name, newhandler)

    def _replace_handler(self, name, newhandler=None):
        # first, if handler was previously added, remove it
        if name is not None:
            handler = self._ua_handlers.get(name)
            if handler:
                try:
                    self.handlers.remove(handler)
                except ValueError:
                    pass
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)
            self._ua_handlers[name] = newhandler


class UserAgent(UserAgentBase):
    """UserAgentBase with switchable .seek()able responses."""

    def __init__(self):
        UserAgentBase.__init__(self)
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        if self._seekable:
            def bound_open(fullurl, data=None,
                           timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
                return UserAgentBase.open(self, fullurl, data, timeout)
            response = _opener.wrapped_open(
                bound_open, _response.seek_wrapped_response, fullurl, data,
                timeout)
        else:
            # BUG FIX: the timeout argument was previously dropped on this
            # branch (only the seekable path forwarded it).
            response = UserAgentBase.open(self, fullurl, data, timeout)
        return response

# diff --git _util.py (next file in the original dump)
# b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/mechanize/_util.py
"""Utility functions and date/time routines.

Copyright 2002-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import re, time, warnings
from calendar import timegm


class ExperimentalWarning(UserWarning):
    """Warning category for experimental mechanize features."""


def experimental(message):
    warnings.warn(message, ExperimentalWarning, stacklevel=3)

def hide_experimental_warnings():
    warnings.filterwarnings("ignore", category=ExperimentalWarning)

def reset_experimental_warnings():
    warnings.filterwarnings("default", category=ExperimentalWarning)


def deprecation(message):
    warnings.warn(message, DeprecationWarning, stacklevel=3)

def hide_deprecations():
    warnings.filterwarnings("ignore", category=DeprecationWarning)

def reset_deprecations():
    warnings.filterwarnings("default", category=DeprecationWarning)


def isstringlike(x):
    """Return True if x supports concatenation with a native string."""
    try:
        x + ""
    # Was a bare except; narrowed so e.g. KeyboardInterrupt is not swallowed.
    except Exception:
        return False
    else:
        return True


# Date/time conversion routines for formats used by the HTTP protocol.

EPOCH = 1970

def my_timegm(tt):
    """Like calendar.timegm(tt), but return None for out-of-range fields."""
    year, month, mday, hour, min, sec = tt[:6]
    if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and
        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
        return timegm(tt)
    else:
        return None

days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months_lower = [month.lower() for month in months]


def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, min, sec)

def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
        days[wday], mday, months[mon-1], year, hour, min, sec)


UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")

def offset_from_tz_string(tz):
    """Return the offset in seconds for timezone string tz, or None.

    Accepts the UTC aliases and numeric offsets like "-0800" or "+01:00".
    """
    offset = None
    # was UTC_ZONES.has_key(tz): deprecated and removed in Python 3
    if tz in UTC_ZONES:
        offset = 0
    else:
        m = timezone_re.search(tz)
        if m:
            offset = 3600 * int(m.group(2))
            if m.group(3):
                offset = offset + 60 * int(m.group(3))
            if m.group(1) == '-':
                offset = -offset
    return offset

def _str2time(day, mon, yr, hr, min, sec, tz):
    """Assemble parsed date fields into seconds since epoch, or None."""
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = months_lower.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year: pick the century that puts the 2-digit year
        # within 50 years of today
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = my_timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t


strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
                       r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
wkday_re = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)

def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = strict_re.search(text)
    if m:
        g = m.groups()
        mon = months_lower.index(g[1].lower()) + 1
        # float() for seconds preserved from the original implementation
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return my_timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = wkday_re.sub("", text, 1)  # Useless weekday

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = loose_http_re.search(text)
    if m is not None:
        day, mon, yr, hr, min, sec, tz = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)


# raw string (was a plain string with regex escapes like \d)
iso_re = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
    (?:
          (?:\s+|[-:Tt])  # separator before clock
       (\d\d?):?(\d\d)    # hour:min
       (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
    )?                    # optional clock
       \s*
    ([-+]?\d\d?:?(:?\d\d)?
     |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
       \s*$""", re.X)
# NOTE: the (:?\d\d) group above looks like a typo for (?:\d\d) but is
# deliberately preserved -- iso2time unpacks exactly 8 groups.

def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = iso_re.search(text)
    if m is not None:
        # XXX there's an extra bit of the timezone I'm ignoring here: is
        #   this the right thing to do?
        yr, mon, day, hr, min, sec, tz, _ = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)

# ---- pep8.py (header and license continue in the original dump) ----
#!/usr/bin/python
# pep8.py - Check Python source code formatting, according to PEP 8
# Copyright (C) 2006 Johann C. Rocholl <johann@rocholl.net>
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +Check Python source code formatting, according to PEP 8: +http://www.python.org/dev/peps/pep-0008/ + +For usage and a list of options, try this: +$ python pep8.py -h + +This program and its regression test suite live here: +http://github.com/jcrocholl/pep8 + +Groups of errors and warnings: +E errors +W warnings +100 indentation +200 whitespace +300 blank lines +400 imports +500 line length +600 deprecation +700 statements + +You can add checks to this program by writing plugins. Each plugin is +a simple function that is called for each line of source code, either +physical or logical. + +Physical line: +- Raw line of text from the input file. + +Logical line: +- Multi-line statements converted to a single line. +- Stripped left and right. +- Contents of strings replaced with 'xxx' of same length. +- Comments removed. + +The check function requests physical or logical lines by the name of +the first argument: + +def maximum_line_length(physical_line) +def extraneous_whitespace(logical_line) +def blank_lines(logical_line, blank_lines, indent_level, line_number) + +The last example above demonstrates how check plugins can request +additional information with extra arguments. All attributes of the +Checker object are available. 
Some examples: + +lines: a list of the raw lines from the input file +tokens: the tokens that contribute to this logical line +line_number: line number in the input file +blank_lines: blank lines before this one +indent_char: first indentation character in this file (' ' or '\t') +indent_level: indentation (with tabs expanded to multiples of 8) +previous_indent_level: indentation on previous line +previous_logical: previous logical line + +The docstring of each check function shall be the relevant part of +text from PEP 8. It is printed if the user enables --show-pep8. +Several docstrings contain examples directly from the PEP 8 document. + +Okay: spam(ham[1], {eggs: 2}) +E201: spam( ham[1], {eggs: 2}) + +These examples are verified automatically when pep8.py is run with the +--doctest option. You can add examples for your own check functions. +The format is simple: "Okay" or error/warning code followed by colon +and space, the rest of the line is example source code. If you put 'r' +before the docstring, you can use \n for newline, \t for tab and \s +for space. 
+ +""" + +__version__ = '0.5.0' + +import os +import sys +import re +import time +import inspect +import tokenize +from optparse import OptionParser +from keyword import iskeyword +from fnmatch import fnmatch + +DEFAULT_EXCLUDE = '.svn,CVS,.bzr,.hg,.git' +DEFAULT_IGNORE = ['E24'] + +INDENT_REGEX = re.compile(r'([ \t]*)') +RAISE_COMMA_REGEX = re.compile(r'raise\s+\w+\s*(,)') +SELFTEST_REGEX = re.compile(r'(Okay|[EW]\d{3}):\s(.*)') +ERRORCODE_REGEX = re.compile(r'[EW]\d{3}') +E301NOT_REGEX = re.compile(r'class |def |u?r?["\']') + +WHITESPACE = ' \t' + +BINARY_OPERATORS = ['**=', '*=', '+=', '-=', '!=', '<>', + '%=', '^=', '&=', '|=', '==', '/=', '//=', '>=', '<=', '>>=', '<<=', + '%', '^', '&', '|', '=', '/', '//', '>', '<', '>>', '<<'] +UNARY_OPERATORS = ['**', '*', '+', '-'] +OPERATORS = BINARY_OPERATORS + UNARY_OPERATORS + +options = None +args = None + + +############################################################################## +# Plugins (check functions) for physical lines +############################################################################## + + +def tabs_or_spaces(physical_line, indent_char): + r""" + Never mix tabs and spaces. + + The most popular way of indenting Python is with spaces only. The + second-most popular way is with tabs only. Code indented with a mixture + of tabs and spaces should be converted to using spaces exclusively. When + invoking the Python command line interpreter with the -t option, it issues + warnings about code that illegally mixes tabs and spaces. When using -tt + these warnings become errors. These options are highly recommended! + + Okay: if a == 0:\n a = 1\n b = 1 + E101: if a == 0:\n a = 1\n\tb = 1 + """ + indent = INDENT_REGEX.match(physical_line).group(1) + for offset, char in enumerate(indent): + if char != indent_char: + return offset, "E101 indentation contains mixed spaces and tabs" + + +def tabs_obsolete(physical_line): + r""" + For new projects, spaces-only are strongly recommended over tabs. 
Most + editors have features that make this easy to do. + + Okay: if True:\n return + W191: if True:\n\treturn + """ + indent = INDENT_REGEX.match(physical_line).group(1) + if indent.count('\t'): + return indent.index('\t'), "W191 indentation contains tabs" + + +def trailing_whitespace(physical_line): + """ + JCR: Trailing whitespace is superfluous. + + Okay: spam(1) + W291: spam(1)\s + """ + physical_line = physical_line.rstrip('\n') # chr(10), newline + physical_line = physical_line.rstrip('\r') # chr(13), carriage return + physical_line = physical_line.rstrip('\x0c') # chr(12), form feed, ^L + stripped = physical_line.rstrip() + if physical_line != stripped: + return len(stripped), "W291 trailing whitespace" + + +def trailing_blank_lines(physical_line, lines, line_number): + r""" + JCR: Trailing blank lines are superfluous. + + Okay: spam(1) + W391: spam(1)\n + """ + if physical_line.strip() == '' and line_number == len(lines): + return 0, "W391 blank line at end of file" + + +def missing_newline(physical_line): + """ + JCR: The last line should have a newline. + """ + if physical_line.rstrip() == physical_line: + return len(physical_line), "W292 no newline at end of file" + + +def maximum_line_length(physical_line): + """ + Limit all lines to a maximum of 79 characters. + + There are still many devices around that are limited to 80 character + lines; plus, limiting windows to 80 characters makes it possible to have + several windows side-by-side. The default wrapping on such devices looks + ugly. Therefore, please limit all lines to a maximum of 79 characters. + For flowing long blocks of text (docstrings or comments), limiting the + length to 72 characters is recommended. 
+ """ + length = len(physical_line.rstrip()) + if length > 79: + return 79, "E501 line too long (%d characters)" % length + + +############################################################################## +# Plugins (check functions) for logical lines +############################################################################## + + +def blank_lines(logical_line, blank_lines, indent_level, line_number, + previous_logical, blank_lines_before_comment): + r""" + Separate top-level function and class definitions with two blank lines. + + Method definitions inside a class are separated by a single blank line. + + Extra blank lines may be used (sparingly) to separate groups of related + functions. Blank lines may be omitted between a bunch of related + one-liners (e.g. a set of dummy implementations). + + Use blank lines in functions, sparingly, to indicate logical sections. + + Okay: def a():\n pass\n\n\ndef b():\n pass + Okay: def a():\n pass\n\n\n# Foo\n# Bar\n\ndef b():\n pass + + E301: class Foo:\n b = 0\n def bar():\n pass + E302: def a():\n pass\n\ndef b(n):\n pass + E303: def a():\n pass\n\n\n\ndef b(n):\n pass + E303: def a():\n\n\n\n pass + E304: @decorator\n\ndef a():\n pass + """ + if line_number == 1: + return # Don't expect blank lines before the first line + max_blank_lines = max(blank_lines, blank_lines_before_comment) + if previous_logical.startswith('@'): + if max_blank_lines: + return 0, "E304 blank lines found after function decorator" + elif max_blank_lines > 2 or (indent_level and max_blank_lines == 2): + return 0, "E303 too many blank lines (%d)" % max_blank_lines + elif (logical_line.startswith('def ') or + logical_line.startswith('class ') or + logical_line.startswith('@')): + if indent_level: + if not (max_blank_lines or E301NOT_REGEX.match(previous_logical)): + return 0, "E301 expected 1 blank line, found 0" + elif max_blank_lines != 2: + return 0, "E302 expected 2 blank lines, found %d" % max_blank_lines + + +def 
extraneous_whitespace(logical_line): + """ + Avoid extraneous whitespace in the following situations: + + - Immediately inside parentheses, brackets or braces. + + - Immediately before a comma, semicolon, or colon. + + Okay: spam(ham[1], {eggs: 2}) + E201: spam( ham[1], {eggs: 2}) + E201: spam(ham[ 1], {eggs: 2}) + E201: spam(ham[1], { eggs: 2}) + E202: spam(ham[1], {eggs: 2} ) + E202: spam(ham[1 ], {eggs: 2}) + E202: spam(ham[1], {eggs: 2 }) + + E203: if x == 4: print x, y; x, y = y , x + E203: if x == 4: print x, y ; x, y = y, x + E203: if x == 4 : print x, y; x, y = y, x + """ + line = logical_line + for char in '([{': + found = line.find(char + ' ') + if found > -1: + return found + 1, "E201 whitespace after '%s'" % char + for char in '}])': + found = line.find(' ' + char) + if found > -1 and line[found - 1] != ',': + return found, "E202 whitespace before '%s'" % char + for char in ',;:': + found = line.find(' ' + char) + if found > -1: + return found, "E203 whitespace before '%s'" % char + + +def missing_whitespace(logical_line): + """ + JCR: Each comma, semicolon or colon should be followed by whitespace. + + Okay: [a, b] + Okay: (3,) + Okay: a[1:4] + Okay: a[:4] + Okay: a[1:] + Okay: a[1:4:2] + E231: ['a','b'] + E231: foo(bar,baz) + """ + line = logical_line + for index in range(len(line) - 1): + char = line[index] + if char in ',;:' and line[index + 1] not in WHITESPACE: + before = line[:index] + if char == ':' and before.count('[') > before.count(']'): + continue # Slice syntax, no space required + if char == ',' and line[index + 1] == ')': + continue # Allow tuple with only one element: (3,) + return index, "E231 missing whitespace after '%s'" % char + + +def indentation(logical_line, previous_logical, indent_char, + indent_level, previous_indent_level): + r""" + Use 4 spaces per indentation level. + + For really old code that you don't want to mess up, you can continue to + use 8-space tabs. 
+ + Okay: a = 1 + Okay: if a == 0:\n a = 1 + E111: a = 1 + + Okay: for item in items:\n pass + E112: for item in items:\npass + + Okay: a = 1\nb = 2 + E113: a = 1\n b = 2 + """ + if indent_char == ' ' and indent_level % 4: + return 0, "E111 indentation is not a multiple of four" + indent_expect = previous_logical.endswith(':') + if indent_expect and indent_level <= previous_indent_level: + return 0, "E112 expected an indented block" + if indent_level > previous_indent_level and not indent_expect: + return 0, "E113 unexpected indentation" + + +def whitespace_before_parameters(logical_line, tokens): + """ + Avoid extraneous whitespace in the following situations: + + - Immediately before the open parenthesis that starts the argument + list of a function call. + + - Immediately before the open parenthesis that starts an indexing or + slicing. + + Okay: spam(1) + E211: spam (1) + + Okay: dict['key'] = list[index] + E211: dict ['key'] = list[index] + E211: dict['key'] = list [index] + """ + prev_type = tokens[0][0] + prev_text = tokens[0][1] + prev_end = tokens[0][3] + for index in range(1, len(tokens)): + token_type, text, start, end, line = tokens[index] + if (token_type == tokenize.OP and + text in '([' and + start != prev_end and + prev_type == tokenize.NAME and + (index < 2 or tokens[index - 2][1] != 'class') and + (not iskeyword(prev_text))): + return prev_end, "E211 whitespace before '%s'" % text + prev_type = token_type + prev_text = text + prev_end = end + + +def whitespace_around_operator(logical_line): + """ + Avoid extraneous whitespace in the following situations: + + - More than one space around an assignment (or other) operator to + align it with another. 
+ + Okay: a = 12 + 3 + E221: a = 4 + 5 + E222: a = 4 + 5 + E223: a = 4\t+ 5 + E224: a = 4 +\t5 + """ + line = logical_line + for operator in OPERATORS: + found = line.find(' ' + operator) + if found > -1: + return found, "E221 multiple spaces before operator" + found = line.find(operator + ' ') + if found > -1: + return found, "E222 multiple spaces after operator" + found = line.find('\t' + operator) + if found > -1: + return found, "E223 tab before operator" + found = line.find(operator + '\t') + if found > -1: + return found, "E224 tab after operator" + + +def missing_whitespace_around_operator(logical_line, tokens): + r""" + - Always surround these binary operators with a single space on + either side: assignment (=), augmented assignment (+=, -= etc.), + comparisons (==, <, >, !=, <>, <=, >=, in, not in, is, is not), + Booleans (and, or, not). + + - Use spaces around arithmetic operators. + + Okay: i = i + 1 + Okay: submitted += 1 + Okay: x = x * 2 - 1 + Okay: hypot2 = x * x + y * y + Okay: c = (a + b) * (a - b) + Okay: foo(bar, key='word', *args, **kwargs) + Okay: baz(**kwargs) + Okay: negative = -1 + Okay: spam(-1) + Okay: alpha[:-i] + Okay: if not -5 < x < +5:\n pass + Okay: lambda *args, **kw: (args, kw) + + E225: i=i+1 + E225: submitted +=1 + E225: x = x*2 - 1 + E225: hypot2 = x*x + y*y + E225: c = (a+b) * (a-b) + E225: c = alpha -4 + E225: z = x **y + """ + parens = 0 + need_space = False + prev_type = tokenize.OP + prev_text = prev_end = None + for token_type, text, start, end, line in tokens: + if token_type in (tokenize.NL, tokenize.NEWLINE, tokenize.ERRORTOKEN): + # ERRORTOKEN is triggered by backticks in Python 3000 + continue + if text in ('(', 'lambda'): + parens += 1 + elif text == ')': + parens -= 1 + if need_space: + if start == prev_end: + return prev_end, "E225 missing whitespace around operator" + need_space = False + elif token_type == tokenize.OP: + if text == '=' and parens: + # Allow keyword args or defaults: foo(bar=None). 
+ pass + elif text in BINARY_OPERATORS: + need_space = True + elif text in UNARY_OPERATORS: + if ((prev_type != tokenize.OP or prev_text in '}])') and not + (prev_type == tokenize.NAME and iskeyword(prev_text))): + # Allow unary operators: -123, -x, +1. + # Allow argument unpacking: foo(*args, **kwargs). + need_space = True + if need_space and start == prev_end: + return prev_end, "E225 missing whitespace around operator" + prev_type = token_type + prev_text = text + prev_end = end + + +def whitespace_around_comma(logical_line): + """ + Avoid extraneous whitespace in the following situations: + + - More than one space around an assignment (or other) operator to + align it with another. + + JCR: This should also be applied around comma etc. + Note: these checks are disabled by default + + Okay: a = (1, 2) + E241: a = (1, 2) + E242: a = (1,\t2) + """ + line = logical_line + for separator in ',;:': + found = line.find(separator + ' ') + if found > -1: + return found + 1, "E241 multiple spaces after '%s'" % separator + found = line.find(separator + '\t') + if found > -1: + return found + 1, "E242 tab after '%s'" % separator + + +def whitespace_around_named_parameter_equals(logical_line): + """ + Don't use spaces around the '=' sign when used to indicate a + keyword argument or a default parameter value. 
+ + Okay: def complex(real, imag=0.0): + Okay: return magic(r=real, i=imag) + Okay: boolean(a == b) + Okay: boolean(a != b) + Okay: boolean(a <= b) + Okay: boolean(a >= b) + + E251: def complex(real, imag = 0.0): + E251: return magic(r = real, i = imag) + """ + parens = 0 + window = ' ' + equal_ok = ['==', '!=', '<=', '>='] + + for pos, c in enumerate(logical_line): + window = window[1:] + c + if parens: + if window[0] in WHITESPACE and window[1] == '=': + if window[1:] not in equal_ok: + issue = "E251 no spaces around keyword / parameter equals" + return pos, issue + if window[2] in WHITESPACE and window[1] == '=': + if window[:2] not in equal_ok: + issue = "E251 no spaces around keyword / parameter equals" + return pos, issue + if c == '(': + parens += 1 + elif c == ')': + parens -= 1 + + +def whitespace_before_inline_comment(logical_line, tokens): + """ + Separate inline comments by at least two spaces. + + An inline comment is a comment on the same line as a statement. Inline + comments should be separated by at least two spaces from the statement. + They should start with a # and a single space. + + Okay: x = x + 1 # Increment x + Okay: x = x + 1 # Increment x + E261: x = x + 1 # Increment x + E262: x = x + 1 #Increment x + E262: x = x + 1 # Increment x + """ + prev_end = (0, 0) + for token_type, text, start, end, line in tokens: + if token_type == tokenize.NL: + continue + if token_type == tokenize.COMMENT: + if not line[:start[1]].strip(): + continue + if prev_end[0] == start[0] and start[1] < prev_end[1] + 2: + return (prev_end, + "E261 at least two spaces before inline comment") + if (len(text) > 1 and text.startswith('# ') + or not text.startswith('# ')): + return start, "E262 inline comment should start with '# '" + else: + prev_end = end + + +def imports_on_separate_lines(logical_line): + r""" + Imports should usually be on separate lines. 
+ + Okay: import os\nimport sys + E401: import sys, os + + Okay: from subprocess import Popen, PIPE + Okay: from myclas import MyClass + Okay: from foo.bar.yourclass import YourClass + Okay: import myclass + Okay: import foo.bar.yourclass + """ + line = logical_line + if line.startswith('import '): + found = line.find(',') + if found > -1: + return found, "E401 multiple imports on one line" + + +def compound_statements(logical_line): + r""" + Compound statements (multiple statements on the same line) are + generally discouraged. + + While sometimes it's okay to put an if/for/while with a small body + on the same line, never do this for multi-clause statements. Also + avoid folding such long lines! + + Okay: if foo == 'blah':\n do_blah_thing() + Okay: do_one() + Okay: do_two() + Okay: do_three() + + E701: if foo == 'blah': do_blah_thing() + E701: for x in lst: total += x + E701: while t < 10: t = delay() + E701: if foo == 'blah': do_blah_thing() + E701: else: do_non_blah_thing() + E701: try: something() + E701: finally: cleanup() + E701: if foo == 'blah': one(); two(); three() + + E702: do_one(); do_two(); do_three() + """ + line = logical_line + found = line.find(':') + if -1 < found < len(line) - 1: + before = line[:found] + if (before.count('{') <= before.count('}') and # {'a': 1} (dict) + before.count('[') <= before.count(']') and # [1:2] (slice) + not re.search(r'\blambda\b', before)): # lambda x: x + return found, "E701 multiple statements on one line (colon)" + found = line.find(';') + if -1 < found: + return found, "E702 multiple statements on one line (semicolon)" + + +def python_3000_has_key(logical_line): + """ + The {}.has_key() method will be removed in the future version of + Python. 
Use the 'in' operation instead, like: + d = {"a": 1, "b": 2} + if "b" in d: + print d["b"] + """ + pos = logical_line.find('.has_key(') + if pos > -1: + return pos, "W601 .has_key() is deprecated, use 'in'" + + +def python_3000_raise_comma(logical_line): + """ + When raising an exception, use "raise ValueError('message')" + instead of the older form "raise ValueError, 'message'". + + The paren-using form is preferred because when the exception arguments + are long or include string formatting, you don't need to use line + continuation characters thanks to the containing parentheses. The older + form will be removed in Python 3000. + """ + match = RAISE_COMMA_REGEX.match(logical_line) + if match: + return match.start(1), "W602 deprecated form of raising exception" + + +def python_3000_not_equal(logical_line): + """ + != can also be written <>, but this is an obsolete usage kept for + backwards compatibility only. New code should always use !=. + The older syntax is removed in Python 3000. + """ + pos = logical_line.find('<>') + if pos > -1: + return pos, "W603 '<>' is deprecated, use '!='" + + +def python_3000_backticks(logical_line): + """ + Backticks are removed in Python 3000. + Use repr() instead. + """ + pos = logical_line.find('`') + if pos > -1: + return pos, "W604 backticks are deprecated, use 'repr()'" + + +############################################################################## +# Helper functions +############################################################################## + + +def expand_indent(line): + """ + Return the amount of indentation. + Tabs are expanded to the next multiple of 8. 
+ + >>> expand_indent(' ') + 4 + >>> expand_indent('\\t') + 8 + >>> expand_indent(' \\t') + 8 + >>> expand_indent(' \\t') + 8 + >>> expand_indent(' \\t') + 16 + """ + result = 0 + for char in line: + if char == '\t': + result = result // 8 * 8 + 8 + elif char == ' ': + result += 1 + else: + break + return result + + +def mute_string(text): + """ + Replace contents with 'xxx' to prevent syntax matching. + + >>> mute_string('"abc"') + '"xxx"' + >>> mute_string("'''abc'''") + "'''xxx'''" + >>> mute_string("r'abc'") + "r'xxx'" + """ + start = 1 + end = len(text) - 1 + # String modifiers (e.g. u or r) + if text.endswith('"'): + start += text.index('"') + elif text.endswith("'"): + start += text.index("'") + # Triple quotes + if text.endswith('"""') or text.endswith("'''"): + start += 2 + end -= 2 + return text[:start] + 'x' * (end - start) + text[end:] + + +def message(text): + """Print a message.""" + # print >> sys.stderr, options.prog + ': ' + text + # print >> sys.stderr, text + print(text) + + +############################################################################## +# Framework to run all checks +############################################################################## + + +def find_checks(argument_name): + """ + Find all globally visible functions where the first argument name + starts with argument_name. + """ + checks = [] + for name, function in globals().items(): + if not inspect.isfunction(function): + continue + args = inspect.getargspec(function)[0] + if args and args[0].startswith(argument_name): + codes = ERRORCODE_REGEX.findall(inspect.getdoc(function) or '') + for code in codes or ['']: + if not code or not ignore_code(code): + checks.append((name, function, args)) + break + checks.sort() + return checks + + +class Checker(object): + """ + Load a Python source file, tokenize it, check coding style. 
+ """ + + def __init__(self, filename): + if filename: + self.filename = filename + try: + self.lines = open(filename).readlines() + except UnicodeDecodeError: + # Errors may occur with non-UTF8 files in Python 3000 + self.lines = open(filename, errors='replace').readlines() + else: + self.filename = 'stdin' + self.lines = [] + options.counters['physical lines'] = \ + options.counters.get('physical lines', 0) + len(self.lines) + + def readline(self): + """ + Get the next line from the input buffer. + """ + self.line_number += 1 + if self.line_number > len(self.lines): + return '' + return self.lines[self.line_number - 1] + + def readline_check_physical(self): + """ + Check and return the next physical line. This method can be + used to feed tokenize.generate_tokens. + """ + line = self.readline() + if line: + self.check_physical(line) + return line + + def run_check(self, check, argument_names): + """ + Run a check plugin. + """ + arguments = [] + for name in argument_names: + arguments.append(getattr(self, name)) + return check(*arguments) + + def check_physical(self, line): + """ + Run all physical checks on a raw input line. + """ + self.physical_line = line + if self.indent_char is None and len(line) and line[0] in ' \t': + self.indent_char = line[0] + for name, check, argument_names in options.physical_checks: + result = self.run_check(check, argument_names) + if result is not None: + offset, text = result + self.report_error(self.line_number, offset, text, check) + + def build_tokens_line(self): + """ + Build a logical line from tokens. 
+ """ + self.mapping = [] + logical = [] + length = 0 + previous = None + for token in self.tokens: + token_type, text = token[0:2] + if token_type in (tokenize.COMMENT, tokenize.NL, + tokenize.INDENT, tokenize.DEDENT, + tokenize.NEWLINE): + continue + if token_type == tokenize.STRING: + text = mute_string(text) + if previous: + end_line, end = previous[3] + start_line, start = token[2] + if end_line != start_line: # different row + if self.lines[end_line - 1][end - 1] not in '{[(': + logical.append(' ') + length += 1 + elif end != start: # different column + fill = self.lines[end_line - 1][end:start] + logical.append(fill) + length += len(fill) + self.mapping.append((length, token)) + logical.append(text) + length += len(text) + previous = token + self.logical_line = ''.join(logical) + assert self.logical_line.lstrip() == self.logical_line + assert self.logical_line.rstrip() == self.logical_line + + def check_logical(self): + """ + Build a line from tokens and run all logical checks on it. 
+ """ + options.counters['logical lines'] = \ + options.counters.get('logical lines', 0) + 1 + self.build_tokens_line() + first_line = self.lines[self.mapping[0][1][2][0] - 1] + indent = first_line[:self.mapping[0][1][2][1]] + self.previous_indent_level = self.indent_level + self.indent_level = expand_indent(indent) + if options.verbose >= 2: + print(self.logical_line[:80].rstrip()) + for name, check, argument_names in options.logical_checks: + if options.verbose >= 3: + print(' ', name) + result = self.run_check(check, argument_names) + if result is not None: + offset, text = result + if isinstance(offset, tuple): + original_number, original_offset = offset + else: + for token_offset, token in self.mapping: + if offset >= token_offset: + original_number = token[2][0] + original_offset = (token[2][1] + + offset - token_offset) + self.report_error(original_number, original_offset, + text, check) + self.previous_logical = self.logical_line + + def check_all(self): + """ + Run all checks on the input file. + """ + self.file_errors = 0 + self.line_number = 0 + self.indent_char = None + self.indent_level = 0 + self.previous_logical = '' + self.blank_lines = 0 + self.blank_lines_before_comment = 0 + self.tokens = [] + parens = 0 + for token in tokenize.generate_tokens(self.readline_check_physical): + # print(tokenize.tok_name[token[0]], repr(token)) + self.tokens.append(token) + token_type, text = token[0:2] + if token_type == tokenize.OP and text in '([{': + parens += 1 + if token_type == tokenize.OP and text in '}])': + parens -= 1 + if token_type == tokenize.NEWLINE and not parens: + self.check_logical() + self.blank_lines = 0 + self.blank_lines_before_comment = 0 + self.tokens = [] + if token_type == tokenize.NL and not parens: + if len(self.tokens) <= 1: + # The physical line contains only this token. 
+ self.blank_lines += 1 + self.tokens = [] + if token_type == tokenize.COMMENT: + source_line = token[4] + token_start = token[2][1] + if source_line[:token_start].strip() == '': + self.blank_lines_before_comment = max(self.blank_lines, + self.blank_lines_before_comment) + self.blank_lines = 0 + if text.endswith('\n') and not parens: + # The comment also ends a physical line. This works around + # Python < 2.6 behaviour, which does not generate NL after + # a comment which is on a line by itself. + self.tokens = [] + return self.file_errors + + def report_error(self, line_number, offset, text, check): + """ + Report an error, according to options. + """ + if options.quiet == 1 and not self.file_errors: + message(self.filename) + self.file_errors += 1 + code = text[:4] + options.counters[code] = options.counters.get(code, 0) + 1 + options.messages[code] = text[5:] + if options.quiet: + return + if options.testsuite: + basename = os.path.basename(self.filename) + if basename[:4] != code: + return # Don't care about other errors or warnings + if 'not' not in basename: + return # Don't print the expected error message + if ignore_code(code): + return + if options.counters[code] == 1 or options.repeat: + message("%s:%s:%d: %s" % + (self.filename, line_number, offset + 1, text)) + if options.show_source: + line = self.lines[line_number - 1] + message(line.rstrip()) + message(' ' * offset + '^') + if options.show_pep8: + message(check.__doc__.lstrip('\n').rstrip()) + + +def input_file(filename): + """ + Run all checks on a Python source file. 
+ """ + if excluded(filename): + return {} + if options.verbose: + message('checking ' + filename) + files_counter_before = options.counters.get('files', 0) + if options.testsuite: # Keep showing errors for multiple tests + options.counters = {} + options.counters['files'] = files_counter_before + 1 + errors = Checker(filename).check_all() + if options.testsuite: # Check if the expected error was found + basename = os.path.basename(filename) + code = basename[:4] + count = options.counters.get(code, 0) + if count == 0 and 'not' not in basename: + message("%s: error %s not found" % (filename, code)) + + +def input_dir(dirname): + """ + Check all Python source files in this directory and all subdirectories. + """ + dirname = dirname.rstrip('/') + if excluded(dirname): + return + for root, dirs, files in os.walk(dirname): + if options.verbose: + message('directory ' + root) + options.counters['directories'] = \ + options.counters.get('directories', 0) + 1 + dirs.sort() + for subdir in dirs: + if excluded(subdir): + dirs.remove(subdir) + files.sort() + for filename in files: + if filename_match(filename): + input_file(os.path.join(root, filename)) + + +def excluded(filename): + """ + Check if options.exclude contains a pattern that matches filename. + """ + basename = os.path.basename(filename) + for pattern in options.exclude: + if fnmatch(basename, pattern): + # print basename, 'excluded because it matches', pattern + return True + + +def filename_match(filename): + """ + Check if options.filename contains a pattern that matches filename. + If options.filename is unspecified, this always returns True. + """ + if not options.filename: + return True + for pattern in options.filename: + if fnmatch(filename, pattern): + return True + + +def ignore_code(code): + """ + Check if options.ignore contains a prefix of the error code. + If options.select contains a prefix of the error code, do not ignore it. 
+ """ + for select in options.select: + if code.startswith(select): + return False + for ignore in options.ignore: + if code.startswith(ignore): + return True + + +def get_error_statistics(): + """Get error statistics.""" + return get_statistics("E") + + +def get_warning_statistics(): + """Get warning statistics.""" + return get_statistics("W") + + +def get_statistics(prefix=''): + """ + Get statistics for message codes that start with the prefix. + + prefix='' matches all errors and warnings + prefix='E' matches all errors + prefix='W' matches all warnings + prefix='E4' matches all errors that have to do with imports + """ + stats = [] + keys = list(options.messages.keys()) + keys.sort() + for key in keys: + if key.startswith(prefix): + stats.append('%-7s %s %s' % + (options.counters[key], key, options.messages[key])) + return stats + + +def get_count(prefix=''): + """Return the total count of errors and warnings.""" + keys = list(options.messages.keys()) + count = 0 + for key in keys: + if key.startswith(prefix): + count += options.counters[key] + return count + + +def print_statistics(prefix=''): + """Print overall statistics (number of errors and warnings).""" + for line in get_statistics(prefix): + print(line) + + +def print_benchmark(elapsed): + """ + Print benchmark numbers. + """ + print('%-7.2f %s' % (elapsed, 'seconds elapsed')) + keys = ['directories', 'files', + 'logical lines', 'physical lines'] + for key in keys: + if key in options.counters: + print('%-7d %s per second (%d total)' % ( + options.counters[key] / elapsed, key, + options.counters[key])) + + +def selftest(): + """ + Test all check functions with test cases in docstrings. 
+ """ + count_passed = 0 + count_failed = 0 + checks = options.physical_checks + options.logical_checks + for name, check, argument_names in checks: + for line in check.__doc__.splitlines(): + line = line.lstrip() + match = SELFTEST_REGEX.match(line) + if match is None: + continue + code, source = match.groups() + checker = Checker(None) + for part in source.split(r'\n'): + part = part.replace(r'\t', '\t') + part = part.replace(r'\s', ' ') + checker.lines.append(part + '\n') + options.quiet = 2 + options.counters = {} + checker.check_all() + error = None + if code == 'Okay': + if len(options.counters) > 1: + codes = [key for key in options.counters.keys() + if key != 'logical lines'] + error = "incorrectly found %s" % ', '.join(codes) + elif options.counters.get(code, 0) == 0: + error = "failed to find %s" % code + if not error: + count_passed += 1 + else: + count_failed += 1 + if len(checker.lines) == 1: + print("pep8.py: %s: %s" % + (error, checker.lines[0].rstrip())) + else: + print("pep8.py: %s:" % error) + for line in checker.lines: + print(line.rstrip()) + if options.verbose: + print("%d passed and %d failed." % (count_passed, count_failed)) + if count_failed: + print("Test failed.") + else: + print("Test passed.") + + +def process_options(arglist=None): + """ + Process options passed either via arglist or via command line args. 
+ """ + global options, args + parser = OptionParser(version=__version__, + usage="%prog [options] input ...") + parser.add_option('-v', '--verbose', default=0, action='count', + help="print status messages, or debug with -vv") + parser.add_option('-q', '--quiet', default=0, action='count', + help="report only file names, or nothing with -qq") + parser.add_option('-r', '--repeat', action='store_true', + help="show all occurrences of the same error") + parser.add_option('--exclude', metavar='patterns', default=DEFAULT_EXCLUDE, + help="exclude files or directories which match these " + "comma separated patterns (default: %s)" % + DEFAULT_EXCLUDE) + parser.add_option('--filename', metavar='patterns', default='*.py', + help="when parsing directories, only check filenames " + "matching these comma separated patterns (default: " + "*.py)") + parser.add_option('--select', metavar='errors', default='', + help="select errors and warnings (e.g. E,W6)") + parser.add_option('--ignore', metavar='errors', default='', + help="skip errors and warnings (e.g. 
E4,W)") + parser.add_option('--show-source', action='store_true', + help="show source code for each error") + parser.add_option('--show-pep8', action='store_true', + help="show text of PEP 8 for each error") + parser.add_option('--statistics', action='store_true', + help="count errors and warnings") + parser.add_option('--count', action='store_true', + help="print total number of errors and warnings " + "to standard error and set exit code to 1 if " + "total is not null") + parser.add_option('--benchmark', action='store_true', + help="measure processing speed") + parser.add_option('--testsuite', metavar='dir', + help="run regression tests from dir") + parser.add_option('--doctest', action='store_true', + help="run doctest on myself") + options, args = parser.parse_args(arglist) + if options.testsuite: + args.append(options.testsuite) + if len(args) == 0 and not options.doctest: + parser.error('input not specified') + options.prog = os.path.basename(sys.argv[0]) + options.exclude = options.exclude.split(',') + for index in range(len(options.exclude)): + options.exclude[index] = options.exclude[index].rstrip('/') + if options.filename: + options.filename = options.filename.split(',') + if options.select: + options.select = options.select.split(',') + else: + options.select = [] + if options.ignore: + options.ignore = options.ignore.split(',') + elif options.select: + # Ignore all checks which are not explicitly selected + options.ignore = [''] + elif options.testsuite or options.doctest: + # For doctest and testsuite, all checks are required + options.ignore = [] + else: + # The default choice: ignore controversial checks + options.ignore = DEFAULT_IGNORE + options.physical_checks = find_checks('physical_line') + options.logical_checks = find_checks('logical_line') + options.counters = {} + options.messages = {} + return options, args + + +def _main(): + """ + Parse options and run checks on Python source. 
+ """ + options, args = process_options() + if options.doctest: + import doctest + doctest.testmod(verbose=options.verbose) + selftest() + start_time = time.time() + for path in args: + if os.path.isdir(path): + input_dir(path) + else: + input_file(path) + elapsed = time.time() - start_time + if options.statistics: + print_statistics() + if options.benchmark: + print_benchmark(elapsed) + if options.count: + count = get_count() + if count: + sys.stderr.write(str(count) + '\n') + sys.exit(1) + + +if __name__ == '__main__': + _main() diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/.upload.py.url b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/.upload.py.url new file mode 100644 index 0000000..8098dbc --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/.upload.py.url @@ -0,0 +1 @@ +http://webkit-rietveld.googlecode.com/svn/trunk/static/upload.py
\ No newline at end of file diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/__init__.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/__init__.py new file mode 100644 index 0000000..c1e4c6d --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/__init__.py @@ -0,0 +1 @@ +# This file is required for Python to search this directory for modules. diff --git a/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/upload.py b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/upload.py new file mode 100755 index 0000000..e91060f --- /dev/null +++ b/WebKitTools/Scripts/webkitpy/thirdparty/autoinstalled/rietveld/upload.py @@ -0,0 +1,1702 @@ +#!/usr/bin/env python +# +# Copyright 2007 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tool for uploading diffs from a version control system to the codereview app. + +Usage summary: upload.py [options] [-- diff_options] + +Diff options are passed to the diff command of the underlying system. + +Supported version control systems: + Git + Mercurial + Subversion + +It is important for Git/Mercurial users to specify a tree/node/branch to diff +against by using the '--rev' option. +""" +# This code is derived from appcfg.py in the App Engine SDK (open source), +# and from ASPN recipe #146306. 
+ +import ConfigParser +import cookielib +import fnmatch +import getpass +import logging +import mimetypes +import optparse +import os +import re +import socket +import subprocess +import sys +import urllib +import urllib2 +import urlparse + +# The md5 module was deprecated in Python 2.5. +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + +try: + import readline +except ImportError: + pass + +# The logging verbosity: +# 0: Errors only. +# 1: Status messages. +# 2: Info logs. +# 3: Debug logs. +verbosity = 1 + +# Max size of patch or base file. +MAX_UPLOAD_SIZE = 900 * 1024 + +# Constants for version control names. Used by GuessVCSName. +VCS_GIT = "Git" +VCS_MERCURIAL = "Mercurial" +VCS_SUBVERSION = "Subversion" +VCS_UNKNOWN = "Unknown" + +# whitelist for non-binary filetypes which do not start with "text/" +# .mm (Objective-C) shows up as application/x-freemind on my Linux box. +TEXT_MIMETYPES = ['application/javascript', 'application/x-javascript', + 'application/xml', 'application/x-freemind'] + +VCS_ABBREVIATIONS = { + VCS_MERCURIAL.lower(): VCS_MERCURIAL, + "hg": VCS_MERCURIAL, + VCS_SUBVERSION.lower(): VCS_SUBVERSION, + "svn": VCS_SUBVERSION, + VCS_GIT.lower(): VCS_GIT, +} + +# The result of parsing Subversion's [auto-props] setting. +svn_auto_props_map = None + +def GetEmail(prompt): + """Prompts the user for their email address and returns it. + + The last used email address is saved to a file and offered up as a suggestion + to the user. If the user presses enter without typing in anything the last + used email address is used. If the user enters a new address, it is saved + for next time we prompt. 
+ + """ + last_email_file_name = os.path.expanduser("~/.last_codereview_email_address") + last_email = "" + if os.path.exists(last_email_file_name): + try: + last_email_file = open(last_email_file_name, "r") + last_email = last_email_file.readline().strip("\n") + last_email_file.close() + prompt += " [%s]" % last_email + except IOError, e: + pass + email = raw_input(prompt + ": ").strip() + if email: + try: + last_email_file = open(last_email_file_name, "w") + last_email_file.write(email) + last_email_file.close() + except IOError, e: + pass + else: + email = last_email + return email + + +def StatusUpdate(msg): + """Print a status message to stdout. + + If 'verbosity' is greater than 0, print the message. + + Args: + msg: The string to print. + """ + if verbosity > 0: + print msg + + +def ErrorExit(msg): + """Print an error message to stderr and exit.""" + print >>sys.stderr, msg + sys.exit(1) + + +class ClientLoginError(urllib2.HTTPError): + """Raised to indicate there was an error authenticating with ClientLogin.""" + + def __init__(self, url, code, msg, headers, args): + urllib2.HTTPError.__init__(self, url, code, msg, headers, None) + self.args = args + self.reason = args["Error"] + + +class AbstractRpcServer(object): + """Provides a common interface for a simple RPC server.""" + + def __init__(self, host, auth_function, host_override=None, extra_headers={}, + save_cookies=False): + """Creates a new HttpRpcServer. + + Args: + host: The host to send requests to. + auth_function: A function that takes no arguments and returns an + (email, password) tuple when called. Will be called if authentication + is required. + host_override: The host header to send to the server (defaults to host). + extra_headers: A dict of extra headers to append to every request. + save_cookies: If True, save the authentication cookies to local disk. + If False, use an in-memory cookiejar instead. Subclasses must + implement this functionality. Defaults to False. 
+ """ + self.host = host + self.host_override = host_override + self.auth_function = auth_function + self.authenticated = False + self.extra_headers = extra_headers + self.save_cookies = save_cookies + self.opener = self._GetOpener() + if self.host_override: + logging.info("Server: %s; Host: %s", self.host, self.host_override) + else: + logging.info("Server: %s", self.host) + + def _GetOpener(self): + """Returns an OpenerDirector for making HTTP requests. + + Returns: + A urllib2.OpenerDirector object. + """ + raise NotImplementedError() + + def _CreateRequest(self, url, data=None): + """Creates a new urllib request.""" + logging.debug("Creating request for: '%s' with payload:\n%s", url, data) + req = urllib2.Request(url, data=data) + if self.host_override: + req.add_header("Host", self.host_override) + for key, value in self.extra_headers.iteritems(): + req.add_header(key, value) + return req + + def _GetAuthToken(self, email, password): + """Uses ClientLogin to authenticate the user, returning an auth token. + + Args: + email: The user's email address + password: The user's password + + Raises: + ClientLoginError: If there was an error authenticating with ClientLogin. + HTTPError: If there was some other form of HTTP error. + + Returns: + The authentication token returned by ClientLogin. + """ + account_type = "GOOGLE" + if self.host.endswith(".google.com"): + # Needed for use inside Google. 
+ account_type = "HOSTED" + req = self._CreateRequest( + url="https://www.google.com/accounts/ClientLogin", + data=urllib.urlencode({ + "Email": email, + "Passwd": password, + "service": "ah", + "source": "rietveld-codereview-upload", + "accountType": account_type, + }), + ) + try: + response = self.opener.open(req) + response_body = response.read() + response_dict = dict(x.split("=") + for x in response_body.split("\n") if x) + return response_dict["Auth"] + except urllib2.HTTPError, e: + if e.code == 403: + body = e.read() + response_dict = dict(x.split("=", 1) for x in body.split("\n") if x) + raise ClientLoginError(req.get_full_url(), e.code, e.msg, + e.headers, response_dict) + else: + raise + + def _GetAuthCookie(self, auth_token): + """Fetches authentication cookies for an authentication token. + + Args: + auth_token: The authentication token returned by ClientLogin. + + Raises: + HTTPError: If there was an error fetching the authentication cookies. + """ + # This is a dummy value to allow us to identify when we're successful. + continue_location = "http://localhost/" + args = {"continue": continue_location, "auth": auth_token} + req = self._CreateRequest("http://%s/_ah/login?%s" % + (self.host, urllib.urlencode(args))) + try: + response = self.opener.open(req) + except urllib2.HTTPError, e: + response = e + if (response.code != 302 or + response.info()["location"] != continue_location): + raise urllib2.HTTPError(req.get_full_url(), response.code, response.msg, + response.headers, response.fp) + self.authenticated = True + + def _Authenticate(self): + """Authenticates the user. + + The authentication process works as follows: + 1) We get a username and password from the user + 2) We use ClientLogin to obtain an AUTH token for the user + (see http://code.google.com/apis/accounts/AuthForInstalledApps.html). + 3) We pass the auth token to /_ah/login on the server to obtain an + authentication cookie. 
If login was successful, it tries to redirect + us to the URL we provided. + + If we attempt to access the upload API without first obtaining an + authentication cookie, it returns a 401 response (or a 302) and + directs us to authenticate ourselves with ClientLogin. + """ + for i in range(3): + credentials = self.auth_function() + try: + auth_token = self._GetAuthToken(credentials[0], credentials[1]) + except ClientLoginError, e: + if e.reason == "BadAuthentication": + print >>sys.stderr, "Invalid username or password." + continue + if e.reason == "CaptchaRequired": + print >>sys.stderr, ( + "Please go to\n" + "https://www.google.com/accounts/DisplayUnlockCaptcha\n" + "and verify you are a human. Then try again.") + break + if e.reason == "NotVerified": + print >>sys.stderr, "Account not verified." + break + if e.reason == "TermsNotAgreed": + print >>sys.stderr, "User has not agreed to TOS." + break + if e.reason == "AccountDeleted": + print >>sys.stderr, "The user account has been deleted." + break + if e.reason == "AccountDisabled": + print >>sys.stderr, "The user account has been disabled." + break + if e.reason == "ServiceDisabled": + print >>sys.stderr, ("The user's access to the service has been " + "disabled.") + break + if e.reason == "ServiceUnavailable": + print >>sys.stderr, "The service is not available; try again later." + break + raise + self._GetAuthCookie(auth_token) + return + + def Send(self, request_path, payload=None, + content_type="application/octet-stream", + timeout=None, + **kwargs): + """Sends an RPC and returns the response. + + Args: + request_path: The path to send the request to, eg /api/appversion/create. + payload: The body of the request, or None to send an empty request. + content_type: The Content-Type header to use. + timeout: timeout in seconds; default None i.e. no timeout. + (Note: for large requests on OS X, the timeout doesn't work right.) + kwargs: Any keyword arguments are converted into query string parameters. 
+ + Returns: + The response body, as a string. + """ + # TODO: Don't require authentication. Let the server say + # whether it is necessary. + if not self.authenticated: + self._Authenticate() + + old_timeout = socket.getdefaulttimeout() + socket.setdefaulttimeout(timeout) + try: + tries = 0 + while True: + tries += 1 + args = dict(kwargs) + url = "http://%s%s" % (self.host, request_path) + if args: + url += "?" + urllib.urlencode(args) + req = self._CreateRequest(url=url, data=payload) + req.add_header("Content-Type", content_type) + try: + f = self.opener.open(req) + response = f.read() + f.close() + return response + except urllib2.HTTPError, e: + if tries > 3: + raise + elif e.code == 401 or e.code == 302: + self._Authenticate() +## elif e.code >= 500 and e.code < 600: +## # Server Error - try again. +## continue + else: + raise + finally: + socket.setdefaulttimeout(old_timeout) + + +class HttpRpcServer(AbstractRpcServer): + """Provides a simplified RPC-style interface for HTTP requests.""" + + def _Authenticate(self): + """Save the cookie jar after authentication.""" + super(HttpRpcServer, self)._Authenticate() + if self.save_cookies: + StatusUpdate("Saving authentication cookies to %s" % self.cookie_file) + self.cookie_jar.save() + + def _GetOpener(self): + """Returns an OpenerDirector that supports cookies and ignores redirects. + + Returns: + A urllib2.OpenerDirector object. 
+ """ + opener = urllib2.OpenerDirector() + opener.add_handler(urllib2.ProxyHandler()) + opener.add_handler(urllib2.UnknownHandler()) + opener.add_handler(urllib2.HTTPHandler()) + opener.add_handler(urllib2.HTTPDefaultErrorHandler()) + opener.add_handler(urllib2.HTTPSHandler()) + opener.add_handler(urllib2.HTTPErrorProcessor()) + if self.save_cookies: + self.cookie_file = os.path.expanduser("~/.codereview_upload_cookies") + self.cookie_jar = cookielib.MozillaCookieJar(self.cookie_file) + if os.path.exists(self.cookie_file): + try: + self.cookie_jar.load() + self.authenticated = True + StatusUpdate("Loaded authentication cookies from %s" % + self.cookie_file) + except (cookielib.LoadError, IOError): + # Failed to load cookies - just ignore them. + pass + else: + # Create an empty cookie file with mode 600 + fd = os.open(self.cookie_file, os.O_CREAT, 0600) + os.close(fd) + # Always chmod the cookie file + os.chmod(self.cookie_file, 0600) + else: + # Don't save cookies across runs of update.py. 
+ self.cookie_jar = cookielib.CookieJar() + opener.add_handler(urllib2.HTTPCookieProcessor(self.cookie_jar)) + return opener + + +parser = optparse.OptionParser(usage="%prog [options] [-- diff_options]") +parser.add_option("-y", "--assume_yes", action="store_true", + dest="assume_yes", default=False, + help="Assume that the answer to yes/no questions is 'yes'.") +# Logging +group = parser.add_option_group("Logging options") +group.add_option("-q", "--quiet", action="store_const", const=0, + dest="verbose", help="Print errors only.") +group.add_option("-v", "--verbose", action="store_const", const=2, + dest="verbose", default=1, + help="Print info level logs (default).") +group.add_option("--noisy", action="store_const", const=3, + dest="verbose", help="Print all logs.") +# Review server +group = parser.add_option_group("Review server options") +group.add_option("-s", "--server", action="store", dest="server", + default="codereview.appspot.com", + metavar="SERVER", + help=("The server to upload to. The format is host[:port]. " + "Defaults to '%default'.")) +group.add_option("-e", "--email", action="store", dest="email", + metavar="EMAIL", default=None, + help="The username to use. 
Will prompt if omitted.") +group.add_option("-H", "--host", action="store", dest="host", + metavar="HOST", default=None, + help="Overrides the Host header sent with all RPCs.") +group.add_option("--no_cookies", action="store_false", + dest="save_cookies", default=True, + help="Do not save authentication cookies to local disk.") +# Issue +group = parser.add_option_group("Issue options") +group.add_option("-d", "--description", action="store", dest="description", + metavar="DESCRIPTION", default=None, + help="Optional description when creating an issue.") +group.add_option("-f", "--description_file", action="store", + dest="description_file", metavar="DESCRIPTION_FILE", + default=None, + help="Optional path of a file that contains " + "the description when creating an issue.") +group.add_option("-r", "--reviewers", action="store", dest="reviewers", + metavar="REVIEWERS", default=None, + help="Add reviewers (comma separated email addresses).") +group.add_option("--cc", action="store", dest="cc", + metavar="CC", default=None, + help="Add CC (comma separated email addresses).") +group.add_option("--private", action="store_true", dest="private", + default=False, + help="Make the issue restricted to reviewers and those CCed") +# Upload options +group = parser.add_option_group("Patch options") +group.add_option("-m", "--message", action="store", dest="message", + metavar="MESSAGE", default=None, + help="A message to identify the patch. " + "Will prompt if omitted.") +group.add_option("-i", "--issue", type="int", action="store", + metavar="ISSUE", default=None, + help="Issue number to which to add. Defaults to new issue.") +group.add_option("--base_url", action="store", dest="base_url", default=None, + help="Base repository URL (listed as \"Base URL\" when " + "viewing issue). 
If omitted, will be guessed automatically " + "for SVN repos and left blank for others.") +group.add_option("--download_base", action="store_true", + dest="download_base", default=False, + help="Base files will be downloaded by the server " + "(side-by-side diffs may not work on files with CRs).") +group.add_option("--rev", action="store", dest="revision", + metavar="REV", default=None, + help="Base revision/branch/tree to diff against. Use " + "rev1:rev2 range to review already committed changeset.") +group.add_option("--send_mail", action="store_true", + dest="send_mail", default=False, + help="Send notification email to reviewers.") +group.add_option("--vcs", action="store", dest="vcs", + metavar="VCS", default=None, + help=("Version control system (optional, usually upload.py " + "already guesses the right VCS).")) +group.add_option("--emulate_svn_auto_props", action="store_true", + dest="emulate_svn_auto_props", default=False, + help=("Emulate Subversion's auto properties feature.")) + + +def GetRpcServer(server, email=None, host_override=None, save_cookies=True): + """Returns an instance of an AbstractRpcServer. + + Args: + server: String containing the review server URL. + email: String containing user's email address. + host_override: If not None, string containing an alternate hostname to use + in the host header. + save_cookies: Whether authentication cookies should be saved to disk. + + Returns: + A new AbstractRpcServer, on which RPC calls can be made. + """ + + rpc_server_class = HttpRpcServer + + def GetUserCredentials(): + """Prompts the user for a username and password.""" + if email is None: + email = GetEmail("Email (login for uploading to %s)" % server) + password = getpass.getpass("Password for %s: " % email) + return (email, password) + + # If this is the dev_appserver, use fake authentication. 
+ host = (host_override or server).lower() + if host == "localhost" or host.startswith("localhost:"): + if email is None: + email = "test@example.com" + logging.info("Using debug user %s. Override with --email" % email) + server = rpc_server_class( + server, + lambda: (email, "password"), + host_override=host_override, + extra_headers={"Cookie": + 'dev_appserver_login="%s:False"' % email}, + save_cookies=save_cookies) + # Don't try to talk to ClientLogin. + server.authenticated = True + return server + + return rpc_server_class(server, + GetUserCredentials, + host_override=host_override, + save_cookies=save_cookies) + + +def EncodeMultipartFormData(fields, files): + """Encode form fields for multipart/form-data. + + Args: + fields: A sequence of (name, value) elements for regular form fields. + files: A sequence of (name, filename, value) elements for data to be + uploaded as files. + Returns: + (content_type, body) ready for httplib.HTTP instance. + + Source: + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 + """ + BOUNDARY = '-M-A-G-I-C---B-O-U-N-D-A-R-Y-' + CRLF = '\r\n' + lines = [] + for (key, value) in fields: + lines.append('--' + BOUNDARY) + lines.append('Content-Disposition: form-data; name="%s"' % key) + lines.append('') + lines.append(value) + for (key, filename, value) in files: + lines.append('--' + BOUNDARY) + lines.append('Content-Disposition: form-data; name="%s"; filename="%s"' % + (key, filename)) + lines.append('Content-Type: %s' % GetContentType(filename)) + lines.append('') + lines.append(value) + lines.append('--' + BOUNDARY + '--') + lines.append('') + body = CRLF.join(lines) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def GetContentType(filename): + """Helper to guess the content-type from the filename.""" + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + + +# Use a shell for subcommands on Windows to get a PATH search. 
+use_shell = sys.platform.startswith("win") + +def RunShellWithReturnCode(command, print_output=False, + universal_newlines=True, + env=os.environ): + """Executes a command and returns the output from stdout and the return code. + + Args: + command: Command to execute. + print_output: If True, the output is printed to stdout. + If False, both stdout and stderr are ignored. + universal_newlines: Use universal_newlines flag (default: True). + + Returns: + Tuple (output, return code) + """ + logging.info("Running %s", command) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + shell=use_shell, universal_newlines=universal_newlines, + env=env) + if print_output: + output_array = [] + while True: + line = p.stdout.readline() + if not line: + break + print line.strip("\n") + output_array.append(line) + output = "".join(output_array) + else: + output = p.stdout.read() + p.wait() + errout = p.stderr.read() + if print_output and errout: + print >>sys.stderr, errout + p.stdout.close() + p.stderr.close() + return output, p.returncode + + +def RunShell(command, silent_ok=False, universal_newlines=True, + print_output=False, env=os.environ): + data, retcode = RunShellWithReturnCode(command, print_output, + universal_newlines, env) + if retcode: + ErrorExit("Got error status from %s:\n%s" % (command, data)) + if not silent_ok and not data: + ErrorExit("No output from %s" % command) + return data + + +class VersionControlSystem(object): + """Abstract base class providing an interface to the VCS.""" + + def __init__(self, options): + """Constructor. + + Args: + options: Command line options. + """ + self.options = options + + def GenerateDiff(self, args): + """Return the current diff as a string. + + Args: + args: Extra arguments to pass to the diff command. 
+ """ + raise NotImplementedError( + "abstract method -- subclass %s must override" % self.__class__) + + def GetUnknownFiles(self): + """Return a list of files unknown to the VCS.""" + raise NotImplementedError( + "abstract method -- subclass %s must override" % self.__class__) + + def CheckForUnknownFiles(self): + """Show an "are you sure?" prompt if there are unknown files.""" + unknown_files = self.GetUnknownFiles() + if unknown_files: + print "The following files are not added to version control:" + for line in unknown_files: + print line + prompt = "Are you sure to continue?(y/N) " + answer = raw_input(prompt).strip() + if answer != "y": + ErrorExit("User aborted") + + def GetBaseFile(self, filename): + """Get the content of the upstream version of a file. + + Returns: + A tuple (base_content, new_content, is_binary, status) + base_content: The contents of the base file. + new_content: For text files, this is empty. For binary files, this is + the contents of the new file, since the diff output won't contain + information to reconstruct the current file. + is_binary: True iff the file is binary. + status: The status of the file. + """ + + raise NotImplementedError( + "abstract method -- subclass %s must override" % self.__class__) + + + def GetBaseFiles(self, diff): + """Helper that calls GetBase file for each file in the patch. + + Returns: + A dictionary that maps from filename to GetBaseFile's tuple. Filenames + are retrieved based on lines that start with "Index:" or + "Property changes on:". + """ + files = {} + for line in diff.splitlines(True): + if line.startswith('Index:') or line.startswith('Property changes on:'): + unused, filename = line.split(':', 1) + # On Windows if a file has property changes its filename uses '\' + # instead of '/'. 
+ filename = filename.strip().replace('\\', '/') + files[filename] = self.GetBaseFile(filename) + return files + + + def UploadBaseFiles(self, issue, rpc_server, patch_list, patchset, options, + files): + """Uploads the base files (and if necessary, the current ones as well).""" + + def UploadFile(filename, file_id, content, is_binary, status, is_base): + """Uploads a file to the server.""" + file_too_large = False + if is_base: + type = "base" + else: + type = "current" + if len(content) > MAX_UPLOAD_SIZE: + print ("Not uploading the %s file for %s because it's too large." % + (type, filename)) + file_too_large = True + content = "" + checksum = md5(content).hexdigest() + if options.verbose > 0 and not file_too_large: + print "Uploading %s file for %s" % (type, filename) + url = "/%d/upload_content/%d/%d" % (int(issue), int(patchset), file_id) + form_fields = [("filename", filename), + ("status", status), + ("checksum", checksum), + ("is_binary", str(is_binary)), + ("is_current", str(not is_base)), + ] + if file_too_large: + form_fields.append(("file_too_large", "1")) + if options.email: + form_fields.append(("user", options.email)) + ctype, body = EncodeMultipartFormData(form_fields, + [("data", filename, content)]) + response_body = rpc_server.Send(url, body, + content_type=ctype) + if not response_body.startswith("OK"): + StatusUpdate(" --> %s" % response_body) + sys.exit(1) + + patches = dict() + [patches.setdefault(v, k) for k, v in patch_list] + for filename in patches.keys(): + base_content, new_content, is_binary, status = files[filename] + file_id_str = patches.get(filename) + if file_id_str.find("nobase") != -1: + base_content = None + file_id_str = file_id_str[file_id_str.rfind("_") + 1:] + file_id = int(file_id_str) + if base_content != None: + UploadFile(filename, file_id, base_content, is_binary, status, True) + if new_content != None: + UploadFile(filename, file_id, new_content, is_binary, status, False) + + def IsImage(self, filename): + """Returns 
    true if the filename has an image extension."""
    mimetype = mimetypes.guess_type(filename)[0]
    if not mimetype:
      return False
    return mimetype.startswith("image/")

  def IsBinary(self, filename):
    """Returns true if the guessed mimetype isn't in the text group."""
    mimetype = mimetypes.guess_type(filename)[0]
    if not mimetype:
      return False  # e.g. README, "real" binaries usually have an extension
    # special case for text files which don't start with text/
    if mimetype in TEXT_MIMETYPES:
      return False
    return not mimetype.startswith("text/")


class SubversionVCS(VersionControlSystem):
  """Implementation of the VersionControlSystem interface for Subversion."""

  def __init__(self, options):
    super(SubversionVCS, self).__init__(options)
    if self.options.revision:
      # --rev may be a single revision ("N") or a range ("N:M").
      match = re.match(r"(\d+)(:(\d+))?", self.options.revision)
      if not match:
        ErrorExit("Invalid Subversion revision %s." % self.options.revision)
      self.rev_start = match.group(1)
      # rev_end stays None when no ":M" part was given.
      self.rev_end = match.group(3)
    else:
      self.rev_start = self.rev_end = None
    # Cache output from "svn list -r REVNO dirname".
    # Keys: dirname, Values: 2-tuple (output for start rev and end rev).
    self.svnls_cache = {}
    # Base URL is required to fetch files deleted in an older revision.
    # Result is cached to not guess it over and over again in GetBaseFile().
    required = self.options.download_base or self.options.revision is not None
    self.svn_base = self._GuessBase(required)

  def GuessBase(self, required):
    """Wrapper for _GuessBase; returns the base URL cached by __init__."""
    return self.svn_base

  def _GuessBase(self, required):
    """Returns the SVN base URL.

    Args:
      required: If true, exits if the url can't be guessed, otherwise None is
        returned.
    """
    info = RunShell(["svn", "info"])
    for line in info.splitlines():
      words = line.split()
      if len(words) == 2 and words[0] == "URL:":
        url = words[1]
        scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
        # Strip any "user@" prefix so the guessed base URL is anonymous.
        username, netloc = urllib.splituser(netloc)
        if username:
          logging.info("Removed username from base URL")
        if netloc.endswith("svn.python.org"):
          if netloc == "svn.python.org":
            if path.startswith("/projects/"):
              path = path[9:]
          elif netloc != "pythondev@svn.python.org":
            ErrorExit("Unrecognized Python URL: %s" % url)
          base = "http://svn.python.org/view/*checkout*%s/" % path
          logging.info("Guessed Python base = %s", base)
        elif netloc.endswith("svn.collab.net"):
          if path.startswith("/repos/"):
            path = path[6:]
          base = "http://svn.collab.net/viewvc/*checkout*%s/" % path
          logging.info("Guessed CollabNet base = %s", base)
        elif netloc.endswith(".googlecode.com"):
          path = path + "/"
          base = urlparse.urlunparse(("http", netloc, path, params,
                                      query, fragment))
          logging.info("Guessed Google Code base = %s", base)
        else:
          path = path + "/"
          base = urlparse.urlunparse((scheme, netloc, path, params,
                                      query, fragment))
          logging.info("Guessed base = %s", base)
        return base
    if required:
      ErrorExit("Can't find URL in output from svn info")
    return None

  def GenerateDiff(self, args):
    """Returns the output of "svn diff", exiting if it contains no patches."""
    cmd = ["svn", "diff"]
    if self.options.revision:
      cmd += ["-r", self.options.revision]
    cmd.extend(args)
    data = RunShell(cmd)
    count = 0
    for line in data.splitlines():
      if line.startswith("Index:") or line.startswith("Property changes on:"):
        count += 1
        logging.info(line)
    if not count:
      ErrorExit("No valid patches found in output from svn diff")
    return data

  def _CollapseKeywords(self, content, keyword_str):
    """Collapses SVN keywords."""
    # svn cat translates keywords but svn diff doesn't. As a result of this
    # behavior patching.PatchChunks() fails with a chunk mismatch error.
    # This part was originally written by the Review Board development team
    # who had the same problem (http://reviews.review-board.org/r/276/).
    # Mapping of keywords to known aliases
    svn_keywords = {
      # Standard keywords
      'Date':                ['Date', 'LastChangedDate'],
      'Revision':            ['Revision', 'LastChangedRevision', 'Rev'],
      'Author':              ['Author', 'LastChangedBy'],
      'HeadURL':             ['HeadURL', 'URL'],
      'Id':                  ['Id'],

      # Aliases
      'LastChangedDate':     ['LastChangedDate', 'Date'],
      'LastChangedRevision': ['LastChangedRevision', 'Rev', 'Revision'],
      'LastChangedBy':       ['LastChangedBy', 'Author'],
      'URL':                 ['URL', 'HeadURL'],
    }

    def repl(m):
      # Keep fixed-length keywords ("$Kw:: ... $") padded so offsets match.
      if m.group(2):
        return "$%s::%s$" % (m.group(1), " " * len(m.group(3)))
      return "$%s$" % m.group(1)
    keywords = [keyword
                for name in keyword_str.split(" ")
                for keyword in svn_keywords.get(name, [])]
    return re.sub(r"\$(%s):(:?)([^\$]+)\$" % '|'.join(keywords), repl, content)

  def GetUnknownFiles(self):
    """Returns the full "svn status" lines for unversioned ("?") entries."""
    status = RunShell(["svn", "status", "--ignore-externals"], silent_ok=True)
    unknown_files = []
    for line in status.split("\n"):
      if line and line[0] == "?":
        unknown_files.append(line)
    return unknown_files

  def ReadFile(self, filename):
    """Returns the contents of a file."""
    file = open(filename, 'rb')
    result = ""
    try:
      result = file.read()
    finally:
      file.close()
    return result

  def GetStatus(self, filename):
    """Returns the status of a file."""
    if not self.options.revision:
      status = RunShell(["svn", "status", "--ignore-externals", filename])
      if not status:
        ErrorExit("svn status returned no output for %s" % filename)
      status_lines = status.splitlines()
      # If file is in a cl, the output will begin with
      # "\n--- Changelist 'cl_name':\n". See
      # http://svn.collab.net/repos/svn/trunk/notes/changelist-design.txt
      if (len(status_lines) == 3 and
          not status_lines[0] and
          status_lines[1].startswith("--- Changelist")):
        status = status_lines[2]
      else:
        status = status_lines[0]
    # If we have a revision to diff against we need to run "svn list"
    # for the old and the new revision and compare the results to get
    # the correct status for a file.
    else:
      dirname, relfilename = os.path.split(filename)
      if dirname not in self.svnls_cache:
        cmd = ["svn", "list", "-r", self.rev_start, dirname or "."]
        out, returncode = RunShellWithReturnCode(cmd)
        if returncode:
          ErrorExit("Failed to get status for %s." % filename)
        old_files = out.splitlines()
        args = ["svn", "list"]
        if self.rev_end:
          args += ["-r", self.rev_end]
        cmd = args + [dirname or "."]
        out, returncode = RunShellWithReturnCode(cmd)
        if returncode:
          ErrorExit("Failed to run command %s" % cmd)
        self.svnls_cache[dirname] = (old_files, out.splitlines())
      old_files, new_files = self.svnls_cache[dirname]
      # Synthesize an svn-status-style letter from the two directory listings.
      if relfilename in old_files and relfilename not in new_files:
        status = "D   "
      elif relfilename in old_files and relfilename in new_files:
        status = "M   "
      else:
        status = "A   "
    return status

  def GetBaseFile(self, filename):
    """Returns (base_content, new_content, is_binary, status) for filename."""
    status = self.GetStatus(filename)
    base_content = None
    new_content = None

    # If a file is copied its status will be "A  +", which signifies
    # "addition-with-history". See "svn st" for more information. We need to
    # upload the original file or else diff parsing will fail if the file was
    # edited.
    if status[0] == "A" and status[3] != "+":
      # We'll need to upload the new content if we're adding a binary file
      # since diff's output won't contain it.
      mimetype = RunShell(["svn", "propget", "svn:mime-type", filename],
                          silent_ok=True)
      base_content = ""
      is_binary = bool(mimetype) and not mimetype.startswith("text/")
      if is_binary and self.IsImage(filename):
        new_content = self.ReadFile(filename)
    elif (status[0] in ("M", "D", "R") or
          (status[0] == "A" and status[3] == "+") or  # Copied file.
          (status[0] == " " and status[1] == "M")):  # Property change.
      args = []
      if self.options.revision:
        url = "%s/%s@%s" % (self.svn_base, filename, self.rev_start)
      else:
        # Don't change filename, it's needed later.
        url = filename
        args += ["-r", "BASE"]
      cmd = ["svn"] + args + ["propget", "svn:mime-type", url]
      mimetype, returncode = RunShellWithReturnCode(cmd)
      if returncode:
        # File does not exist in the requested revision.
        # Reset mimetype, it contains an error message.
        mimetype = ""
      get_base = False
      is_binary = bool(mimetype) and not mimetype.startswith("text/")
      if status[0] == " ":
        # Empty base content just to force an upload.
        base_content = ""
      elif is_binary:
        if self.IsImage(filename):
          get_base = True
          if status[0] == "M":
            if not self.rev_end:
              new_content = self.ReadFile(filename)
            else:
              url = "%s/%s@%s" % (self.svn_base, filename, self.rev_end)
              new_content = RunShell(["svn", "cat", url],
                                     universal_newlines=True, silent_ok=True)
        else:
          base_content = ""
      else:
        get_base = True

      if get_base:
        if is_binary:
          universal_newlines = False
        else:
          universal_newlines = True
        if self.rev_start:
          # "svn cat -r REV delete_file.txt" doesn't work. cat requires
          # the full URL with "@REV" appended instead of using "-r" option.
          url = "%s/%s@%s" % (self.svn_base, filename, self.rev_start)
          base_content = RunShell(["svn", "cat", url],
                                  universal_newlines=universal_newlines,
                                  silent_ok=True)
        else:
          base_content = RunShell(["svn", "cat", filename],
                                  universal_newlines=universal_newlines,
                                  silent_ok=True)
        if not is_binary:
          args = []
          if self.rev_start:
            url = "%s/%s@%s" % (self.svn_base, filename, self.rev_start)
          else:
            url = filename
            args += ["-r", "BASE"]
          cmd = ["svn"] + args + ["propget", "svn:keywords", url]
          keywords, returncode = RunShellWithReturnCode(cmd)
          if keywords and not returncode:
            # Collapse expanded keywords so the diff applies cleanly.
            base_content = self._CollapseKeywords(base_content, keywords)
    else:
      StatusUpdate("svn status returned unexpected output: %s" % status)
      sys.exit(1)
    return base_content, new_content, is_binary, status[0:5]


class GitVCS(VersionControlSystem):
  """Implementation of the VersionControlSystem interface for Git."""

  def __init__(self, options):
    super(GitVCS, self).__init__(options)
    # Map of filename -> (hash before, hash after) of base file.
    # Hashes for "no such file" are represented as None.
    self.hashes = {}
    # Map of new filename -> old filename for renames.
    self.renames = {}

  def GenerateDiff(self, extra_args):
    """Returns "git diff" output converted to svn-style "Index:" form."""
    # This is more complicated than svn's GenerateDiff because we must convert
    # the diff output to include an svn-style "Index:" line as well as record
    # the hashes of the files, so we can upload them along with our diff.

    # Special hash used by git to indicate "no such content".
    NULL_HASH = "0"*40

    extra_args = extra_args[:]
    if self.options.revision:
      extra_args = [self.options.revision] + extra_args

    # --no-ext-diff is broken in some versions of Git, so try to work around
    # this by overriding the environment (but there is still a problem if the
    # git config key "diff.external" is used).
    env = os.environ.copy()
    if 'GIT_EXTERNAL_DIFF' in env: del env['GIT_EXTERNAL_DIFF']
    gitdiff = RunShell(["git", "diff", "--no-ext-diff", "--full-index", "-M"]
                       + extra_args, env=env)

    def IsFileNew(filename):
      # A file is new when its "before" hash is None (no base content).
      return filename in self.hashes and self.hashes[filename][0] is None

    def AddSubversionPropertyChange(filename):
      """Add svn's property change information into the patch if given file is
      new file.

      We use Subversion's auto-props setting to retrieve its property.
      See http://svnbook.red-bean.com/en/1.1/ch07.html#svn-ch-7-sect-1.3.2 for
      Subversion's [auto-props] setting.
      """
      if self.options.emulate_svn_auto_props and IsFileNew(filename):
        svnprops = GetSubversionPropertyChanges(filename)
        if svnprops:
          svndiff.append("\n" + svnprops + "\n")

    svndiff = []
    filecount = 0
    filename = None
    for line in gitdiff.splitlines():
      match = re.match(r"diff --git a/(.*) b/(.*)$", line)
      if match:
        # Add auto property here for previously seen file.
        if filename is not None:
          AddSubversionPropertyChange(filename)
        filecount += 1
        # Intentionally use the "after" filename so we can show renames.
        filename = match.group(2)
        svndiff.append("Index: %s\n" % filename)
        if match.group(1) != match.group(2):
          self.renames[match.group(2)] = match.group(1)
      else:
        # The "index" line in a git diff looks like this (long hashes elided):
        # index 82c0d44..b2cee3f 100755
        # We want to save the left hash, as that identifies the base file.
        match = re.match(r"index (\w+)\.\.(\w+)", line)
        if match:
          before, after = (match.group(1), match.group(2))
          if before == NULL_HASH:
            before = None
          if after == NULL_HASH:
            after = None
          self.hashes[filename] = (before, after)
      svndiff.append(line + "\n")
    if not filecount:
      ErrorExit("No valid patches found in output from git diff")
    # Add auto property for the last seen file.
    assert filename is not None
    AddSubversionPropertyChange(filename)
    return "".join(svndiff)

  def GetUnknownFiles(self):
    """Returns files git does not track (untracked, not ignored)."""
    status = RunShell(["git", "ls-files", "--exclude-standard", "--others"],
                      silent_ok=True)
    return status.splitlines()

  def GetFileContent(self, file_hash, is_binary):
    """Returns the content of a file identified by its git hash."""
    data, retcode = RunShellWithReturnCode(["git", "show", file_hash],
                                           universal_newlines=not is_binary)
    if retcode:
      ErrorExit("Got error status from 'git show %s'" % file_hash)
    return data

  def GetBaseFile(self, filename):
    """Returns (base_content, new_content, is_binary, status) for filename."""
    hash_before, hash_after = self.hashes.get(filename, (None, None))
    base_content = None
    new_content = None
    is_binary = self.IsBinary(filename)
    status = None

    if filename in self.renames:
      status = "A +"  # Match svn attribute name for renames.
      if filename not in self.hashes:
        # If a rename doesn't change the content, we never get a hash.
        base_content = RunShell(["git", "show", "HEAD:" + filename])
    elif not hash_before:
      status = "A"
      base_content = ""
    elif not hash_after:
      status = "D"
    else:
      status = "M"

    is_image = self.IsImage(filename)

    # Grab the before/after content if we need it.
    # We should include file contents if it's text or it's an image.
    if not is_binary or is_image:
      # Grab the base content if we don't have it already.
      if base_content is None and hash_before:
        base_content = self.GetFileContent(hash_before, is_binary)
      # Only include the "after" file if it's an image; otherwise
      # it is reconstructed from the diff.
+ if is_image and hash_after: + new_content = self.GetFileContent(hash_after, is_binary) + + return (base_content, new_content, is_binary, status) + + +class MercurialVCS(VersionControlSystem): + """Implementation of the VersionControlSystem interface for Mercurial.""" + + def __init__(self, options, repo_dir): + super(MercurialVCS, self).__init__(options) + # Absolute path to repository (we can be in a subdir) + self.repo_dir = os.path.normpath(repo_dir) + # Compute the subdir + cwd = os.path.normpath(os.getcwd()) + assert cwd.startswith(self.repo_dir) + self.subdir = cwd[len(self.repo_dir):].lstrip(r"\/") + if self.options.revision: + self.base_rev = self.options.revision + else: + self.base_rev = RunShell(["hg", "parent", "-q"]).split(':')[1].strip() + + def _GetRelPath(self, filename): + """Get relative path of a file according to the current directory, + given its logical path in the repo.""" + assert filename.startswith(self.subdir), (filename, self.subdir) + return filename[len(self.subdir):].lstrip(r"\/") + + def GenerateDiff(self, extra_args): + # If no file specified, restrict to the current subdir + extra_args = extra_args or ["."] + cmd = ["hg", "diff", "--git", "-r", self.base_rev] + extra_args + data = RunShell(cmd, silent_ok=True) + svndiff = [] + filecount = 0 + for line in data.splitlines(): + m = re.match("diff --git a/(\S+) b/(\S+)", line) + if m: + # Modify line to make it look like as it comes from svn diff. + # With this modification no changes on the server side are required + # to make upload.py work with Mercurial repos. + # NOTE: for proper handling of moved/copied files, we have to use + # the second filename. 
+ filename = m.group(2) + svndiff.append("Index: %s" % filename) + svndiff.append("=" * 67) + filecount += 1 + logging.info(line) + else: + svndiff.append(line) + if not filecount: + ErrorExit("No valid patches found in output from hg diff") + return "\n".join(svndiff) + "\n" + + def GetUnknownFiles(self): + """Return a list of files unknown to the VCS.""" + args = [] + status = RunShell(["hg", "status", "--rev", self.base_rev, "-u", "."], + silent_ok=True) + unknown_files = [] + for line in status.splitlines(): + st, fn = line.split(" ", 1) + if st == "?": + unknown_files.append(fn) + return unknown_files + + def GetBaseFile(self, filename): + # "hg status" and "hg cat" both take a path relative to the current subdir + # rather than to the repo root, but "hg diff" has given us the full path + # to the repo root. + base_content = "" + new_content = None + is_binary = False + oldrelpath = relpath = self._GetRelPath(filename) + # "hg status -C" returns two lines for moved/copied files, one otherwise + out = RunShell(["hg", "status", "-C", "--rev", self.base_rev, relpath]) + out = out.splitlines() + # HACK: strip error message about missing file/directory if it isn't in + # the working copy + if out[0].startswith('%s: ' % relpath): + out = out[1:] + if len(out) > 1: + # Moved/copied => considered as modified, use old filename to + # retrieve base contents + oldrelpath = out[1].strip() + status = "M" + else: + status, _ = out[0].split(' ', 1) + if ":" in self.base_rev: + base_rev = self.base_rev.split(":", 1)[0] + else: + base_rev = self.base_rev + if status != "A": + base_content = RunShell(["hg", "cat", "-r", base_rev, oldrelpath], + silent_ok=True) + is_binary = "\0" in base_content # Mercurial's heuristic + if status != "R": + new_content = open(relpath, "rb").read() + is_binary = is_binary or "\0" in new_content + if is_binary and base_content: + # Fetch again without converting newlines + base_content = RunShell(["hg", "cat", "-r", base_rev, oldrelpath], + 
                              silent_ok=True, universal_newlines=False)
    if not is_binary or not self.IsImage(relpath):
      new_content = None
    return base_content, new_content, is_binary, status


# NOTE: The SplitPatch function is duplicated in engine.py, keep them in sync.
def SplitPatch(data):
  """Splits a patch into separate pieces for each file.

  Args:
    data: A string containing the output of svn diff.

  Returns:
    A list of 2-tuple (filename, text) where text is the svn diff output
      pertaining to filename.
  """
  patches = []
  filename = None
  diff = []
  for line in data.splitlines(True):
    new_filename = None
    if line.startswith('Index:'):
      unused, new_filename = line.split(':', 1)
      new_filename = new_filename.strip()
    elif line.startswith('Property changes on:'):
      unused, temp_filename = line.split(':', 1)
      # When a file is modified, paths use '/' between directories, however
      # when a property is modified '\' is used on Windows. Make them the same
      # otherwise the file shows up twice.
      temp_filename = temp_filename.strip().replace('\\', '/')
      if temp_filename != filename:
        # File has property changes but no modifications, create a new diff.
        new_filename = temp_filename
    if new_filename:
      # Starting a new file: flush the accumulated diff for the previous one.
      if filename and diff:
        patches.append((filename, ''.join(diff)))
      filename = new_filename
      diff = [line]
      continue
    if diff is not None:
      diff.append(line)
  if filename and diff:
    patches.append((filename, ''.join(diff)))
  return patches


def UploadSeparatePatches(issue, rpc_server, patchset, data, options):
  """Uploads a separate patch for each file in the diff output.

  Returns a list of [patch_key, filename] for each file.
  """
  patches = SplitPatch(data)
  rv = []
  for patch in patches:
    if len(patch[1]) > MAX_UPLOAD_SIZE:
      print ("Not uploading the patch for " + patch[0] +
             " because the file is too large.")
      continue
    form_fields = [("filename", patch[0])]
    if not options.download_base:
      form_fields.append(("content_upload", "1"))
    files = [("data", "data.diff", patch[1])]
    ctype, body = EncodeMultipartFormData(form_fields, files)
    url = "/%d/upload_patch/%d" % (int(issue), int(patchset))
    print "Uploading patch for " + patch[0]
    response_body = rpc_server.Send(url, body, content_type=ctype)
    lines = response_body.splitlines()
    if not lines or lines[0] != "OK":
      StatusUpdate("  --> %s" % response_body)
      sys.exit(1)
    # On success the server replies "OK" followed by the patch key.
    rv.append([lines[1], patch[0]])
  return rv


def GuessVCSName():
  """Helper to guess the version control system.

  This examines the current directory, guesses which VersionControlSystem
  we're using, and returns a string indicating which VCS is detected.

  Returns:
    A pair (vcs, output).  vcs is a string indicating which VCS was detected
    and is one of VCS_GIT, VCS_MERCURIAL, VCS_SUBVERSION, or VCS_UNKNOWN.
    output is a string containing any interesting output from the vcs
    detection routine, or None if there is nothing interesting.
  """
  # Mercurial has a command to get the base directory of a repository
  # Try running it, but don't die if we don't have hg installed.
  # NOTE: we try Mercurial first as it can sit on top of an SVN working copy.
  try:
    out, returncode = RunShellWithReturnCode(["hg", "root"])
    if returncode == 0:
      return (VCS_MERCURIAL, out.strip())
  except OSError, (errno, message):
    if errno != 2:  # ENOENT -- they don't have hg installed.
      raise

  # Subversion has a .svn in all working directories.
  if os.path.isdir('.svn'):
    logging.info("Guessed VCS = Subversion")
    return (VCS_SUBVERSION, None)

  # Git has a command to test if you're in a git tree.
  # Try running it, but don't die if we don't have git installed.
  try:
    out, returncode = RunShellWithReturnCode(["git", "rev-parse",
                                              "--is-inside-work-tree"])
    if returncode == 0:
      return (VCS_GIT, None)
  except OSError, (errno, message):
    if errno != 2:  # ENOENT -- they don't have git installed.
      raise

  return (VCS_UNKNOWN, None)


def GuessVCS(options):
  """Helper to guess the version control system.

  This verifies any user-specified VersionControlSystem (by command line
  or environment variable).  If the user didn't specify one, this examines
  the current directory, guesses which VersionControlSystem we're using,
  and returns an instance of the appropriate class.  Exit with an error
  if we can't figure it out.

  Returns:
    A VersionControlSystem instance. Exits if the VCS can't be guessed.
  """
  vcs = options.vcs
  if not vcs:
    vcs = os.environ.get("CODEREVIEW_VCS")
  if vcs:
    v = VCS_ABBREVIATIONS.get(vcs.lower())
    if v is None:
      ErrorExit("Unknown version control system %r specified." % vcs)
    (vcs, extra_output) = (v, None)
  else:
    (vcs, extra_output) = GuessVCSName()

  if vcs == VCS_MERCURIAL:
    if extra_output is None:
      extra_output = RunShell(["hg", "root"]).strip()
    return MercurialVCS(options, extra_output)
  elif vcs == VCS_SUBVERSION:
    return SubversionVCS(options)
  elif vcs == VCS_GIT:
    return GitVCS(options)

  ErrorExit(("Could not guess version control system. "
             "Are you in a working copy directory?"))


def CheckReviewer(reviewer):
  """Validate a reviewer -- either a nickname or an email address.

  Args:
    reviewer: A nickname or an email address.

  Calls ErrorExit() if it is an invalid email address.
  """
  if "@" not in reviewer:
    return  # Assume nickname
  parts = reviewer.split("@")
  if len(parts) > 2:
    ErrorExit("Invalid email address: %r" % reviewer)
  assert len(parts) == 2
  if "." not in parts[1]:
    ErrorExit("Invalid email address: %r" % reviewer)


def LoadSubversionAutoProperties():
  """Returns the content of [auto-props] section of Subversion's config file as
  a dictionary.

  Returns:
    A dictionary whose key-value pair corresponds to the [auto-props] section's
      key-value pair.
    In following cases, returns empty dictionary:
      - config file doesn't exist, or
      - 'enable-auto-props' is not set to 'true-like-value' in [miscellany].
  """
  # TODO(hayato): Windows users might use a different path for the config file.
  subversion_config = os.path.expanduser("~/.subversion/config")
  if not os.path.exists(subversion_config):
    return {}
  config = ConfigParser.ConfigParser()
  config.read(subversion_config)
  if (config.has_section("miscellany") and
      config.has_option("miscellany", "enable-auto-props") and
      config.getboolean("miscellany", "enable-auto-props") and
      config.has_section("auto-props")):
    props = {}
    for file_pattern in config.options("auto-props"):
      props[file_pattern] = ParseSubversionPropertyValues(
        config.get("auto-props", file_pattern))
    return props
  else:
    return {}


def ParseSubversionPropertyValues(props):
  """Parse the given property value which comes from [auto-props] section and
  returns a list whose element is a (svn_prop_key, svn_prop_value) pair.

  See the following doctest for example.

  >>> ParseSubversionPropertyValues('svn:eol-style=LF')
  [('svn:eol-style', 'LF')]
  >>> ParseSubversionPropertyValues('svn:mime-type=image/jpeg')
  [('svn:mime-type', 'image/jpeg')]
  >>> ParseSubversionPropertyValues('svn:eol-style=LF;svn:executable')
  [('svn:eol-style', 'LF'), ('svn:executable', '*')]
  """
  key_value_pairs = []
  for prop in props.split(";"):
    key_value = prop.split("=")
    assert len(key_value) <= 2
    if len(key_value) == 1:
      # If value is not given, use '*' as a Subversion's convention.
      key_value_pairs.append((key_value[0], "*"))
    else:
      key_value_pairs.append((key_value[0], key_value[1]))
  return key_value_pairs


def GetSubversionPropertyChanges(filename):
  """Return a Subversion's 'Property changes on ...' string, which is used in
  the patch file.

  Args:
    filename: filename whose property might be set by [auto-props] config.

  Returns:
    A string like 'Property changes on |filename| ...' if given |filename|
      matches any entries in [auto-props] section. None, otherwise.
  """
  global svn_auto_props_map
  if svn_auto_props_map is None:
    # Lazily load and cache the user's [auto-props] configuration.
    svn_auto_props_map = LoadSubversionAutoProperties()

  all_props = []
  for file_pattern, props in svn_auto_props_map.items():
    if fnmatch.fnmatch(filename, file_pattern):
      all_props.extend(props)
  if all_props:
    return FormatSubversionPropertyChanges(filename, all_props)
  return None


def FormatSubversionPropertyChanges(filename, props):
  """Returns Subversion's 'Property changes on ...' strings using given filename
  and properties.

  Args:
    filename: filename
    props: A list whose element is a (svn_prop_key, svn_prop_value) pair.

  Returns:
    A string which can be used in the patch file for Subversion.

  See the following doctest for example.

  >>> print FormatSubversionPropertyChanges('foo.cc', [('svn:eol-style', 'LF')])
  Property changes on: foo.cc
  ___________________________________________________________________
  Added: svn:eol-style
     + LF
  <BLANKLINE>
  """
  prop_changes_lines = [
    "Property changes on: %s" % filename,
    "___________________________________________________________________"]
  for key, value in props:
    prop_changes_lines.append("Added: " + key)
    prop_changes_lines.append("   + " + value)
  return "\n".join(prop_changes_lines) + "\n"


def RealMain(argv, data=None):
  """The real main function.

  Args:
    argv: Command line arguments.
    data: Diff contents.
If None (default) the diff is generated by + the VersionControlSystem implementation returned by GuessVCS(). + + Returns: + A 2-tuple (issue id, patchset id). + The patchset id is None if the base files are not uploaded by this + script (applies only to SVN checkouts). + """ + logging.basicConfig(format=("%(asctime).19s %(levelname)s %(filename)s:" + "%(lineno)s %(message)s ")) + os.environ['LC_ALL'] = 'C' + options, args = parser.parse_args(argv[1:]) + global verbosity + verbosity = options.verbose + if verbosity >= 3: + logging.getLogger().setLevel(logging.DEBUG) + elif verbosity >= 2: + logging.getLogger().setLevel(logging.INFO) + + vcs = GuessVCS(options) + + base = options.base_url + if isinstance(vcs, SubversionVCS): + # Guessing the base field is only supported for Subversion. + # Note: Fetching base files may become deprecated in future releases. + guessed_base = vcs.GuessBase(options.download_base) + if base: + if guessed_base and base != guessed_base: + print "Using base URL \"%s\" from --base_url instead of \"%s\"" % \ + (base, guessed_base) + else: + base = guessed_base + + if not base and options.download_base: + options.download_base = True + logging.info("Enabled upload of base file") + if not options.assume_yes: + vcs.CheckForUnknownFiles() + if data is None: + data = vcs.GenerateDiff(args) + files = vcs.GetBaseFiles(data) + if verbosity >= 1: + print "Upload server:", options.server, "(change with -s/--server)" + if options.issue: + prompt = "Message describing this patch set: " + else: + prompt = "New issue subject: " + message = options.message or raw_input(prompt).strip() + if not message: + ErrorExit("A non-empty message is required") + rpc_server = GetRpcServer(options.server, + options.email, + options.host, + options.save_cookies) + form_fields = [("subject", message)] + if base: + form_fields.append(("base", base)) + if options.issue: + form_fields.append(("issue", str(options.issue))) + if options.email: + form_fields.append(("user", 
options.email)) + if options.reviewers: + for reviewer in options.reviewers.split(','): + CheckReviewer(reviewer) + form_fields.append(("reviewers", options.reviewers)) + if options.cc: + for cc in options.cc.split(','): + CheckReviewer(cc) + form_fields.append(("cc", options.cc)) + description = options.description + if options.description_file: + if options.description: + ErrorExit("Can't specify description and description_file") + file = open(options.description_file, 'r') + description = file.read() + file.close() + if description: + form_fields.append(("description", description)) + # Send a hash of all the base file so the server can determine if a copy + # already exists in an earlier patchset. + base_hashes = "" + for file, info in files.iteritems(): + if not info[0] is None: + checksum = md5(info[0]).hexdigest() + if base_hashes: + base_hashes += "|" + base_hashes += checksum + ":" + file + form_fields.append(("base_hashes", base_hashes)) + if options.private: + if options.issue: + print "Warning: Private flag ignored when updating an existing issue." + else: + form_fields.append(("private", "1")) + # If we're uploading base files, don't send the email before the uploads, so + # that it contains the file status. + if options.send_mail and options.download_base: + form_fields.append(("send_mail", "1")) + if not options.download_base: + form_fields.append(("content_upload", "1")) + if len(data) > MAX_UPLOAD_SIZE: + print "Patch is large, so uploading file patches separately." 
+ uploaded_diff_file = [] + form_fields.append(("separate_patches", "1")) + else: + uploaded_diff_file = [("data", "data.diff", data)] + ctype, body = EncodeMultipartFormData(form_fields, uploaded_diff_file) + response_body = rpc_server.Send("/upload", body, content_type=ctype) + patchset = None + if not options.download_base or not uploaded_diff_file: + lines = response_body.splitlines() + if len(lines) >= 2: + msg = lines[0] + patchset = lines[1].strip() + patches = [x.split(" ", 1) for x in lines[2:]] + else: + msg = response_body + else: + msg = response_body + StatusUpdate(msg) + if not response_body.startswith("Issue created.") and \ + not response_body.startswith("Issue updated."): + sys.exit(0) + issue = msg[msg.rfind("/")+1:] + + if not uploaded_diff_file: + result = UploadSeparatePatches(issue, rpc_server, patchset, data, options) + if not options.download_base: + patches = result + + if not options.download_base: + vcs.UploadBaseFiles(issue, rpc_server, patches, patchset, options, files) + if options.send_mail: + rpc_server.Send("/" + issue + "/mail", payload="") + return issue, patchset + + +def main(): + try: + RealMain(sys.argv) + except KeyboardInterrupt: + print + StatusUpdate("Interrupted.") + sys.exit(1) + + +if __name__ == "__main__": + main() |