summaryrefslogtreecommitdiffstats
path: root/src/org/apache/http/message/BasicTokenIterator.java
blob: 5fbf5ba77a5eae70fe0800396598261f6c468f04 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
/*
 * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
 * $Revision: 602520 $
 * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $
 *
 * ====================================================================
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */

package org.apache.http.message;

import java.util.NoSuchElementException;

import org.apache.http.HeaderIterator;
import org.apache.http.ParseException;
import org.apache.http.TokenIterator;

/**
 * Basic implementation of a {@link TokenIterator}.
 * This implementation parses <tt>#token</tt> sequences as
 * defined by RFC 2616, section 2.
 * It extends that definition somewhat beyond US-ASCII.
 * <p>
 * The iterator always looks one token ahead: the token that will be
 * returned next is already parsed and stored in {@link #currentToken}.
 * Consequently, a {@link ParseException} caused by a malformed header
 * value can be raised by the constructor or by the call to
 * {@link #nextToken} that returns the token <i>preceding</i> the
 * malformed input.
 * </p>
 * 
 * @version $Revision: 602520 $
 */
public class BasicTokenIterator implements TokenIterator {

    /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
    // the order of the characters here is adjusted to put the
    // most likely candidates at the beginning of the collection
    public static final String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";


    /** The iterator from which to obtain the next header. */
    protected final HeaderIterator headerIt;

    /**
     * The value of the current header.
     * This is the header value that includes {@link #currentToken}.
     * Undefined if the iteration is over.
     */
    protected String currentHeader;

    /**
     * The token to be returned by the next call to {@link #nextToken}.
     * <code>null</code> if the iteration is over.
     */
    protected String currentToken;

    /**
     * The position after {@link #currentToken} in {@link #currentHeader}.
     * Undefined if the iteration is over.
     */
    protected int searchPos;


    /**
     * Creates a new instance of {@link BasicTokenIterator}.
     * The first token, if any, is located immediately.
     *
     * @param headerIterator    the iterator for the headers to tokenize
     *
     * @throws IllegalArgumentException if the argument is <code>null</code>
     * @throws ParseException   if an invalid header value is encountered
     *                          while looking for the first token
     */
    public BasicTokenIterator(final HeaderIterator headerIterator) {
        if (headerIterator == null) {
            throw new IllegalArgumentException
                ("Header iterator must not be null.");
        }

        this.headerIt = headerIterator;
        // NOTE: findNext is overridable and runs before any subclass
        // constructor body; overriding implementations must not rely
        // on subclass state being initialized at this point.
        this.searchPos = findNext(-1);
    }


    // non-javadoc, see interface TokenIterator
    public boolean hasNext() {
        return (this.currentToken != null);
    }


    /**
     * Obtains the next token from this iteration.
     * Returns the token that was located previously and then
     * looks ahead for the token after that.
     *
     * @return  the next token in this iteration
     *
     * @throws NoSuchElementException   if the iteration is already over
     * @throws ParseException   if an invalid header value is encountered
     */
    public String nextToken()
        throws NoSuchElementException, ParseException {

        if (this.currentToken == null) {
            throw new NoSuchElementException("Iteration already finished.");
        }

        final String result = this.currentToken;
        // updates currentToken, may trigger ParseException:
        this.searchPos = findNext(this.searchPos);

        return result;
    }


    /**
     * Returns the next token.
     * Same as {@link #nextToken}, but with generic return type.
     *
     * @return  the next token in this iteration
     *
     * @throws NoSuchElementException   if there are no more tokens
     * @throws ParseException   if an invalid header value is encountered
     */
    public final Object next()
        throws NoSuchElementException, ParseException {
        return nextToken();
    }


    /**
     * Removing tokens is not supported.
     *
     * @throws UnsupportedOperationException    always
     */
    public final void remove()
        throws UnsupportedOperationException {

        throw new UnsupportedOperationException
            ("Removing tokens is not supported.");
    }


    /**
     * Determines the next token.
     * If found, the token is stored in {@link #currentToken}.
     * The return value indicates the position after the token
     * in {@link #currentHeader}. If necessary, the next header
     * will be obtained from {@link #headerIt}.
     * If not found, {@link #currentToken} is set to <code>null</code>.
     *
     * @param from      the position in the current header at which to
     *                  start the search, -1 to search in the first header
     *
     * @return  the position after the found token in the current header, or
     *          negative if there was no next token
     *
     * @throws ParseException   if an invalid header value is encountered
     */
    protected int findNext(int from)
        throws ParseException {

        if (from < 0) {
            // called from the constructor, initialize the first header
            if (!this.headerIt.hasNext()) {
                // honor the documented contract: no token, no currentToken
                this.currentToken = null;
                return -1;
            }
            this.currentHeader = this.headerIt.nextHeader().getValue();
            from = 0;
        } else {
            // called after a token, make sure there is a separator
            from = findTokenSeparator(from);
        }

        final int start = findTokenStart(from);
        if (start < 0) {
            this.currentToken = null;
            return -1; // nothing found
        }

        final int end = findTokenEnd(start);
        this.currentToken = createToken(this.currentHeader, start, end);
        return end;
    }


    /**
     * Creates a new token to be returned.
     * Called from {@link #findNext findNext} after the token is identified.
     * The default implementation simply calls
     * {@link java.lang.String#substring String.substring}.
     * <br/>
     * If header values are significantly longer than tokens, and some
     * tokens are permanently referenced by the application, there can
     * be problems with garbage collection. Depending on the JVM's
     * string implementation, a substring may hold a reference to the
     * full characters of the original string and therefore occupy
     * more memory than might be expected.
     * To avoid this, override this method and create a new string
     * instead of a substring.
     *
     * @param value     the full header value from which to create a token
     * @param start     the index of the first token character
     * @param end       the index after the last token character
     *
     * @return  a string representing the token identified by the arguments
     */
    protected String createToken(String value, int start, int end) {
        return value.substring(start, end);
    }


    /**
     * Determines the starting position of the next token.
     * This method will iterate over headers if necessary.
     * Whitespace and token separators in front of the token are skipped.
     * Headers without a value (<code>null</code>) are skipped in the same
     * way as headers with an empty value. When no further token exists,
     * {@link #currentHeader} is set to <code>null</code>.
     *
     * @param from      the position in the current header at which to
     *                  start the search
     *
     * @return  the position of the token start in the current header,
     *          negative if no token start could be found
     *
     * @throws IllegalArgumentException if <code>from</code> is negative
     * @throws ParseException
     *         if a character is encountered that is neither whitespace,
     *         nor a token separator, nor a valid token character
     */
    protected int findTokenStart(int from) {
        if (from < 0) {
            throw new IllegalArgumentException
                ("Search position must not be negative: " + from);
        }

        boolean found = false;
        boolean exhausted = false;
        while (!found && !exhausted) {

            // a null header value carries no tokens, move on to the next
            if (this.currentHeader != null) {
                final int to = this.currentHeader.length();
                while (!found && (from < to)) {

                    final char ch = this.currentHeader.charAt(from);
                    if (isTokenSeparator(ch) || isWhitespace(ch)) {
                        // whitespace and token separators are skipped
                        from++;
                    } else if (isTokenChar(ch)) {
                        // found the start of a token
                        found = true;
                    } else {
                        throw new ParseException
                            ("Invalid character before token (pos " + from +
                             "): " + this.currentHeader);
                    }
                }
            }

            if (!found) {
                if (this.headerIt.hasNext()) {
                    this.currentHeader = this.headerIt.nextHeader().getValue();
                    from = 0;
                } else {
                    // no more headers: the iteration is over
                    this.currentHeader = null;
                    exhausted = true;
                }
            }
        } // while headers

        return found ? from : -1;
    }


    /**
     * Determines the position of the next token separator.
     * Because of multi-header joining rules, the end of a
     * header value is a token separator. This method does
     * therefore not need to iterate over headers.
     * Only whitespace may appear between the search position
     * and the separator.
     *
     * @param from      the position in the current header at which to
     *                  start the search
     *
     * @return  the position of a token separator in the current header,
     *          or at the end
     *
     * @throws IllegalArgumentException if <code>from</code> is negative
     * @throws ParseException
     *         if a new token is found before a token separator.
     *         RFC 2616, section 2.1 explicitly requires a comma between
     *         tokens for <tt>#</tt>.
     */
    protected int findTokenSeparator(int from) {
        if (from < 0) {
            throw new IllegalArgumentException
                ("Search position must not be negative: " + from);
        }

        boolean found = false;
        final int to = this.currentHeader.length();
        while (!found && (from < to)) {
            final char ch = this.currentHeader.charAt(from);
            if (isTokenSeparator(ch)) {
                found = true;
            } else if (isWhitespace(ch)) {
                from++;
            } else if (isTokenChar(ch)) {
                throw new ParseException
                    ("Tokens without separator (pos " + from +
                     "): " + this.currentHeader);
            } else {
                throw new ParseException
                    ("Invalid character after token (pos " + from +
                     "): " + this.currentHeader);
            }
        }

        return from;
    }


    /**
     * Determines the ending position of the current token.
     * This method will not leave the current header value,
     * since the end of the header value is a token boundary.
     *
     * @param from      the position of the first character of the token
     *
     * @return  the position after the last character of the token.
     *          The behavior is undefined if <code>from</code> does not
     *          point to a token character in the current header value.
     *
     * @throws IllegalArgumentException if <code>from</code> is negative
     */
    protected int findTokenEnd(int from) {
        if (from < 0) {
            throw new IllegalArgumentException
                ("Token start position must not be negative: " + from);
        }

        final int to = this.currentHeader.length();
        // the character at 'from' is the token start, begin behind it
        int end = from+1;
        while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
            end++;
        }

        return end;
    }


    /**
     * Checks whether a character is a token separator.
     * RFC 2616, section 2.1 defines comma as the separator for
     * <tt>#token</tt> sequences. The end of a header value will
     * also separate tokens, but that is not a character check.
     *
     * @param ch        the character to check
     *
     * @return  <code>true</code> if the character is a token separator,
     *          <code>false</code> otherwise
     */
    protected boolean isTokenSeparator(char ch) {
        return (ch == ',');
    }


    /**
     * Checks whether a character is a whitespace character.
     * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
     * This implementation additionally accepts every character for which
     * {@link Character#isSpaceChar(char) Character.isSpaceChar} returns
     * <code>true</code>, which extends the definition beyond US-ASCII.
     * The optional preceding line break is irrelevant, since header
     * continuation is handled transparently when parsing messages.
     *
     * @param ch        the character to check
     *
     * @return  <code>true</code> if the character is whitespace,
     *          <code>false</code> otherwise
     */
    protected boolean isWhitespace(char ch) {

        // we do not use Character.isWhitespace(ch) here, since that allows
        // many control characters which are not whitespace as per RFC 2616
        return ((ch == '\t') || Character.isSpaceChar(ch));
    }


    /**
     * Checks whether a character is a valid token character.
     * Whitespace, control characters, and HTTP separators are not
     * valid token characters. The HTTP specification (RFC 2616, section 2.2)
     * defines tokens only for the US-ASCII character set, this
     * method extends the definition to other character sets.
     *
     * @param ch        the character to check
     *
     * @return  <code>true</code> if the character is a valid token start,
     *          <code>false</code> otherwise
     */
    protected boolean isTokenChar(char ch) {

        // common sense extension of ALPHA + DIGIT
        if (Character.isLetterOrDigit(ch)) {
            return true;
        }

        // common sense extension of CTL
        if (Character.isISOControl(ch)) {
            return false;
        }

        // no common sense extension for this
        if (isHttpSeparator(ch)) {
            return false;
        }

        // RFC 2616, section 2.2 defines a token character as
        // "any CHAR except CTLs or separators". The controls
        // and separators are included in the checks above.
        // This will yield unexpected results for Unicode format characters.
        // If that is a problem, overwrite isHttpSeparator(char) to filter
        // out the false positives.
        return true;
    }


    /**
     * Checks whether a character is an HTTP separator.
     * The implementation in this class checks only for the HTTP separators
     * defined in RFC 2616, section 2.2. If you need to detect other
     * separators beyond the US-ASCII character set, override this method.
     *
     * @param ch        the character to check
     *
     * @return  <code>true</code> if the character is an HTTP separator
     */
    protected boolean isHttpSeparator(char ch) {
        return (HTTP_SEPARATORS.indexOf(ch) >= 0);
    }


} // class BasicTokenIterator