diff options
Diffstat (limited to 'src/org/apache/http/message/BasicTokenIterator.java')
-rw-r--r-- | src/org/apache/http/message/BasicTokenIterator.java | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/src/org/apache/http/message/BasicTokenIterator.java b/src/org/apache/http/message/BasicTokenIterator.java new file mode 100644 index 0000000..5fbf5ba --- /dev/null +++ b/src/org/apache/http/message/BasicTokenIterator.java @@ -0,0 +1,429 @@ +/* + * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $ + * $Revision: 602520 $ + * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $ + * + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + * + */ + +package org.apache.http.message; + +import java.util.NoSuchElementException; + +import org.apache.http.HeaderIterator; +import org.apache.http.ParseException; +import org.apache.http.TokenIterator; + +/** + * Basic implementation of a {@link TokenIterator}. + * This implementation parses <tt>#token<tt> sequences as + * defined by RFC 2616, section 2. + * It extends that definition somewhat beyond US-ASCII. + * + * @version $Revision: 602520 $ + */ +public class BasicTokenIterator implements TokenIterator { + + /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */ + // the order of the characters here is adjusted to put the + // most likely candidates at the beginning of the collection + public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t"; + + + /** The iterator from which to obtain the next header. */ + protected final HeaderIterator headerIt; + + /** + * The value of the current header. + * This is the header value that includes {@link #currentToken}. + * Undefined if the iteration is over. + */ + protected String currentHeader; + + /** + * The token to be returned by the next call to {@link #currentToken}. + * <code>null</code> if the iteration is over. + */ + protected String currentToken; + + /** + * The position after {@link #currentToken} in {@link #currentHeader}. + * Undefined if the iteration is over. + */ + protected int searchPos; + + + /** + * Creates a new instance of {@link BasicTokenIterator}. + * + * @param headerIterator the iterator for the headers to tokenize + */ + public BasicTokenIterator(final HeaderIterator headerIterator) { + if (headerIterator == null) { + throw new IllegalArgumentException + ("Header iterator must not be null."); + } + + this.headerIt = headerIterator; + this.searchPos = findNext(-1); + } + + + // non-javadoc, see interface TokenIterator + public boolean hasNext() { + return (this.currentToken != null); + } + + + /** + * Obtains the next token from this iteration. + * + * @return the next token in this iteration + * + * @throws NoSuchElementException if the iteration is already over + * @throws ParseException if an invalid header value is encountered + */ + public String nextToken() + throws NoSuchElementException, ParseException { + + if (this.currentToken == null) { + throw new NoSuchElementException("Iteration already finished."); + } + + final String result = this.currentToken; + // updates currentToken, may trigger ParseException: + this.searchPos = findNext(this.searchPos); + + return result; + } + + + /** + * Returns the next token. + * Same as {@link #nextToken}, but with generic return type. + * + * @return the next token in this iteration + * + * @throws NoSuchElementException if there are no more tokens + * @throws ParseException if an invalid header value is encountered + */ + public final Object next() + throws NoSuchElementException, ParseException { + return nextToken(); + } + + + /** + * Removing tokens is not supported. + * + * @throws UnsupportedOperationException always + */ + public final void remove() + throws UnsupportedOperationException { + + throw new UnsupportedOperationException + ("Removing tokens is not supported."); + } + + + /** + * Determines the next token. + * If found, the token is stored in {@link #currentToken}. + * The return value indicates the position after the token + * in {@link #currentHeader}. If necessary, the next header + * will be obtained from {@link #headerIt}. + * If not found, {@link #currentToken} is set to <code>null</code>. + * + * @param from the position in the current header at which to + * start the search, -1 to search in the first header + * + * @return the position after the found token in the current header, or + * negative if there was no next token + * + * @throws ParseException if an invalid header value is encountered + */ + protected int findNext(int from) + throws ParseException { + + if (from < 0) { + // called from the constructor, initialize the first header + if (!this.headerIt.hasNext()) { + return -1; + } + this.currentHeader = this.headerIt.nextHeader().getValue(); + from = 0; + } else { + // called after a token, make sure there is a separator + from = findTokenSeparator(from); + } + + int start = findTokenStart(from); + if (start < 0) { + this.currentToken = null; + return -1; // nothing found + } + + int end = findTokenEnd(start); + this.currentToken = createToken(this.currentHeader, start, end); + return end; + } + + + /** + * Creates a new token to be returned. + * Called from {@link #findNext findNext} after the token is identified. + * The default implementation simply calls + * {@link java.lang.String#substring String.substring}. + * <br/> + * If header values are significantly longer than tokens, and some + * tokens are permanently referenced by the application, there can + * be problems with garbage collection. A substring will hold a + * reference to the full characters of the original string and + * therefore occupies more memory than might be expected. + * To avoid this, override this method and create a new string + * instead of a substring. + * + * @param value the full header value from which to create a token + * @param start the index of the first token character + * @param end the index after the last token character + * + * @return a string representing the token identified by the arguments + */ + protected String createToken(String value, int start, int end) { + return value.substring(start, end); + } + + + /** + * Determines the starting position of the next token. + * This method will iterate over headers if necessary. + * + * @param from the position in the current header at which to + * start the search + * + * @return the position of the token start in the current header, + * negative if no token start could be found + */ + protected int findTokenStart(int from) { + if (from < 0) { + throw new IllegalArgumentException + ("Search position must not be negative: " + from); + } + + boolean found = false; + while (!found && (this.currentHeader != null)) { + + final int to = this.currentHeader.length(); + while (!found && (from < to)) { + + final char ch = this.currentHeader.charAt(from); + if (isTokenSeparator(ch) || isWhitespace(ch)) { + // whitspace and token separators are skipped + from++; + } else if (isTokenChar(this.currentHeader.charAt(from))) { + // found the start of a token + found = true; + } else { + throw new ParseException + ("Invalid character before token (pos " + from + + "): " + this.currentHeader); + } + } + if (!found) { + if (this.headerIt.hasNext()) { + this.currentHeader = this.headerIt.nextHeader().getValue(); + from = 0; + } else { + this.currentHeader = null; + } + } + } // while headers + + return found ? from : -1; + } + + + /** + * Determines the position of the next token separator. + * Because of multi-header joining rules, the end of a + * header value is a token separator. This method does + * therefore not need to iterate over headers. + * + * @param from the position in the current header at which to + * start the search + * + * @return the position of a token separator in the current header, + * or at the end + * + * @throws ParseException + * if a new token is found before a token separator. + * RFC 2616, section 2.1 explicitly requires a comma between + * tokens for <tt>#</tt>. + */ + protected int findTokenSeparator(int from) { + if (from < 0) { + throw new IllegalArgumentException + ("Search position must not be negative: " + from); + } + + boolean found = false; + final int to = this.currentHeader.length(); + while (!found && (from < to)) { + final char ch = this.currentHeader.charAt(from); + if (isTokenSeparator(ch)) { + found = true; + } else if (isWhitespace(ch)) { + from++; + } else if (isTokenChar(ch)) { + throw new ParseException + ("Tokens without separator (pos " + from + + "): " + this.currentHeader); + } else { + throw new ParseException + ("Invalid character after token (pos " + from + + "): " + this.currentHeader); + } + } + + return from; + } + + + /** + * Determines the ending position of the current token. + * This method will not leave the current header value, + * since the end of the header value is a token boundary. + * + * @param from the position of the first character of the token + * + * @return the position after the last character of the token. + * The behavior is undefined if <code>from</code> does not + * point to a token character in the current header value. + */ + protected int findTokenEnd(int from) { + if (from < 0) { + throw new IllegalArgumentException + ("Token start position must not be negative: " + from); + } + + final int to = this.currentHeader.length(); + int end = from+1; + while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) { + end++; + } + + return end; + } + + + /** + * Checks whether a character is a token separator. + * RFC 2616, section 2.1 defines comma as the separator for + * <tt>#token</tt> sequences. The end of a header value will + * also separate tokens, but that is not a character check. + * + * @param ch the character to check + * + * @return <code>true</code> if the character is a token separator, + * <code>false</code> otherwise + */ + protected boolean isTokenSeparator(char ch) { + return (ch == ','); + } + + + /** + * Checks whether a character is a whitespace character. + * RFC 2616, section 2.2 defines space and horizontal tab as whitespace. + * The optional preceeding line break is irrelevant, since header + * continuation is handled transparently when parsing messages. + * + * @param ch the character to check + * + * @return <code>true</code> if the character is whitespace, + * <code>false</code> otherwise + */ + protected boolean isWhitespace(char ch) { + + // we do not use Character.isWhitspace(ch) here, since that allows + // many control characters which are not whitespace as per RFC 2616 + return ((ch == '\t') || Character.isSpaceChar(ch)); + } + + + /** + * Checks whether a character is a valid token character. + * Whitespace, control characters, and HTTP separators are not + * valid token characters. The HTTP specification (RFC 2616, section 2.2) + * defines tokens only for the US-ASCII character set, this + * method extends the definition to other character sets. + * + * @param ch the character to check + * + * @return <code>true</code> if the character is a valid token start, + * <code>false</code> otherwise + */ + protected boolean isTokenChar(char ch) { + + // common sense extension of ALPHA + DIGIT + if (Character.isLetterOrDigit(ch)) + return true; + + // common sense extension of CTL + if (Character.isISOControl(ch)) + return false; + + // no common sense extension for this + if (isHttpSeparator(ch)) + return false; + + // RFC 2616, section 2.2 defines a token character as + // "any CHAR except CTLs or separators". The controls + // and separators are included in the checks above. + // This will yield unexpected results for Unicode format characters. + // If that is a problem, overwrite isHttpSeparator(char) to filter + // out the false positives. + return true; + } + + + /** + * Checks whether a character is an HTTP separator. + * The implementation in this class checks only for the HTTP separators + * defined in RFC 2616, section 2.2. If you need to detect other + * separators beyond the US-ASCII character set, override this method. + * + * @param ch the character to check + * + * @return <code>true</code> if the character is an HTTP separator + */ + protected boolean isHttpSeparator(char ch) { + return (HTTP_SEPARATORS.indexOf(ch) >= 0); + } + + +} // class BasicTokenIterator + |