summaryrefslogtreecommitdiffstats
path: root/simple/simple-http/src/main/java/org/simpleframework/http/parse/QueryParser.java
blob: 56b67880bd67143b184e504b67cfb2f3b87bc137 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
/*
 * QueryParser.java December 2002
 *
 * Copyright (C) 2002, Niall Gallagher <niallg@users.sf.net>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 * implied. See the License for the specific language governing 
 * permissions and limitations under the License.
 */
  
package org.simpleframework.http.parse;

import org.simpleframework.common.parse.MapParser;
import org.simpleframework.http.Query;

import java.net.URLEncoder;
import java.util.Set;

/**
 * The <code>QueryParser</code> is used to parse data encoded in 
 * the <code>application/x-www-form-urlencoded</code> MIME type. It 
 * is also used to parse a query string from a HTTP URL, see RFC 2616.
 * The parsed parameters are available through the various methods of
 * the <code>org.simpleframework.http.Query</code> interface. The
 * syntax of the parsed parameters is described below in BNF.
 * <pre>
 *
 *    params  = *(pair [ "&amp;" params])
 *    pair    = name [ "=" value ]
 *    name    = *(text | escaped)
 *    value   = *(text | escaped)
 *    escaped = % HEX HEX
 *
 * </pre> 
 * This will consume all data found as a name or value, if the data 
 * is a "+" character then it is replaced with a space character.
 * This regards only "=", "&amp;", and "%" as having special values.
 * The "=" character delimits the name from the value and the "&amp;"
 * delimits the name value pair. A name that is not followed by an
 * "=" is stored with an empty value. The "%" character represents 
 * the start of an escaped sequence, which consists of two hex digits.
 * Escaped sequences are decoded as UTF-8 where possible, otherwise
 * each escaped octet is taken as a single character. A "%" that is
 * not followed by two hex digits is kept as a literal character.
 * <p>
 * Decoding is done in place within the character buffer inherited 
 * from the parser, so no additional storage is needed for the 
 * decoded names and values.
 *
 * @author Niall Gallagher
 */
public class QueryParser extends MapParser<String> implements Query {

   /**
    * Marks the region of the buffer holding the decoded name.
    */
   private final Token name;
   
   /**
    * Marks the region of the buffer holding the decoded value.
    */
   private final Token value;

   /**
    * Constructor for the <code>QueryParser</code>. This creates
    * an instance that can be use to parse HTML form data and URL
    * query strings encoded as application/x-www-form-urlencoded.
    * The parsed parameters are made available through the interface
    * <code>org.simpleframework.http.Query</code>.    
    */
   public QueryParser(){
      this.name = new Token();
      this.value = new Token();
   }
   
   /**
    * Constructor for the <code>QueryParser</code>. This creates
    * an instance that can be use to parse HTML form data and URL
    * query strings encoded as application/x-www-form-urlencoded.
    * The parsed parameters are made available through the interface
    * <code>org.simpleframework.http.Query</code>.    
    *
    * @param text this is the text to parse for the parameters
    */
   public QueryParser(String text){
      this();
      parse(text);
   }

   /**
    * This extracts an integer parameter for the named value. If the 
    * named parameter does not exist this will return a zero value. 
    * If however the parameter exists but is not in the format of a 
    * decimal integer value then this will throw an exception.
    *
    * @param name the name of the parameter value to retrieve
    *
    * @return this returns the named parameter value as an integer   
    *
    * @throws NumberFormatException if the value is not an integer
    */
   public int getInteger(Object name) {
      String value = get(name);

      if(value == null) {
         return 0;
      }
      return Integer.parseInt(value);
   }

   /**
    * This extracts a float parameter for the named value. If the 
    * named parameter does not exist this will return a zero value. 
    * If however the parameter exists but is not in the format of a 
    * floating point number then this will throw an exception.
    *
    * @param name the name of the parameter value to retrieve
    *
    * @return this returns the named parameter value as a float   
    *
    * @throws NumberFormatException if the value is not a float
    */
   public float getFloat(Object name) {
      String value = get(name);

      if(value == null) {
         return 0.0f;
      }
      return Float.parseFloat(value);
   }

   /**
    * This extracts a boolean parameter for the named value. If the
    * named parameter does not exist this will return false otherwise
    * the value is evaluated. The result is true only if the value
    * is equal, ignoring case, to <code>true</code>, any other value
    * results in false.
    * 
    * @param name the name of the parameter value to retrieve
    *
    * @return this returns the named parameter value as a boolean
    */
   public boolean getBoolean(Object name) {
      String value = get(name);

      return value != null && Boolean.parseBoolean(value);
   }

   
   /**
    * This initializes the parser so that it can be used several
    * times. This clears any previous parameters extracted and
    * resets the tokens and the read offset. This ensures that 
    * when the next <code>parse(String)</code> is invoked the 
    * status of the <code>Query</code> is empty.
    */
   protected void init(){
      all.clear();
      map.clear();
      name.len = 0;
      value.len = 0;
      off = 0;
   }

   /**
    * This performs the actual parsing of the parameter text. The
    * parameters parsed from this are taken as "name=value" pairs.
    * Multiple pairs within the text are separated by an "&amp;".
    * Each pair that has a name is inserted as a parameter.
    */
   protected void parse() {            
      param();
      while(skip("&")){         
         param();
      }
   }

   /**
    * This method adds the current name and value as a parameter so 
    * that the next name and value can be collected. A pair with an
    * empty name is discarded. Once added the <code>Token</code> 
    * objects are set to have zero length so they can be reused to 
    * collect further values, this also ensures a name that has no 
    * value is paired with an empty string.
    */
   private void insert(){
      if(name.len > 0){
         insert(name,value);
      }
      name.len = 0;
      value.len = 0;
   }

   /**
    * This will add the given name and value to the parameters.
    * The tokens are converted to strings and handed to the 
    * <code>put</code> method, which determines how a repeated 
    * name is treated.
    *
    * @param name this is the name of the value to be inserted
    * @param value this is the value of a that is to be inserted
    */
   private void insert(Token name, Token value){
      put(name.toString(), value.toString());
   }
   
   /** 
    * This parses a single parameter from the buffer. It reads the
    * name, then if an "=" follows it reads the value, and finally
    * it inserts the pair. If there is no "=" then the name is
    * inserted with an empty value.
    */
   private void param() {
      name(); 
      if(skip("=")){ /* in case of error*/
         value();
      }
      insert();      
   }    
   
   /**
    * This extracts the name of the parameter from the character 
    * buffer. The name of a parameter is defined as a set of 
    * chars including escape sequences. The decoded characters
    * are compacted in place, starting from the offset at which 
    * the name begins. The name ends when an equals character, 
    * "=", or an ampersand, "&amp;", is encountered, or when the
    * end of the buffer is reached. Ending on the ampersand means
    * that a name without a value, as in <code>a&amp;b=c</code>, 
    * is not merged with the pair that follows it.
    */
   private void name(){  
      int mark = off;
      int pos = off;

      while(off < count){         
         if(buf[off]=='%'){ /* escaped */         
            escape();
         }else if(buf[off]=='=' || buf[off]=='&') {
            break; /* unescaped delimiter ends the name */
         }else if(buf[off]=='+'){
            buf[off] = ' ';
         }
         buf[pos++] = buf[off++];         
      }          
      name.len = pos - mark;
      name.off = mark;
   }

   /**
    * This extracts a parameter value from the character buffer. 
    * The parameter value consists of a sequence of chars and some
    * escape sequences. The decoded characters are compacted in 
    * place so that the name and values can be paired. The end of 
    * the value is determined as the end of the buffer or an 
    * ampersand, an "=" within the value is taken literally.
    */
   private void value(){
      int mark = off;
      int pos = off;

      while(off < count){         
         if(buf[off]=='%'){ /* escaped */         
            escape();
         }else if(buf[off]=='+'){
            buf[off] = ' ';
         }else if(buf[off]=='&'){
            break;
         }
         buf[pos++] = buf[off++];
      }              
      value.len = pos - mark;
      value.off = mark; 
   }
   
   /** 
    * This converts an encountered escaped sequence, that is all
    * embedded hexadecimal characters into native characters. This 
    * does not take any characters from the stream it just prepares 
    * the buffer so that the decoded character is the next one to 
    * be read. The escaped sequence is interpreted as UTF-8, and if
    * that fails the single escaped octet is used as the character.
    * <p>
    * If the "%" is not followed by two hexadecimal characters then
    * the buffer and offset are left alone, so that the "%" is read
    * as a literal character by the caller.
    */
   private void escape() {
      int peek = peek(off);

      if(peek >= 0 && !unicode(peek)) {
         binary(peek);
      }
   }

   /**
    * This is used when the escaped sequence can not be decoded as
    * UTF-8. It moves the read offset to the last character of the
    * <code>"%" HEX HEX</code> sequence and stores the octet there,
    * so that the next character read from the buffer is the octet.
    * A negative peek, which represents a malformed escape, leaves 
    * the buffer unchanged rather than storing a bogus character.
    *
    * @param peek this is the first escaped character from the URI
    *
    * @return currently this implementation always returns true 
    */
   private boolean binary(int peek) {
      if(peek >= 0 && off + 2 < count) {
         off += 2;
         buf[off] = bits(peek);
      }
      return true;
   }

   /**
    * This method determines, using a peek character, whether the
    * sequence of escaped characters within the URI is in UTF-8. 
    * The high order bits of the first octet determine how many
    * trailing octets are expected, from none for a seven bit
    * character up to five for the longest form. If a character
    * can be successfully decoded it will be the next character
    * read from the buffer.
    * 
    * @param peek this is the first escaped character from the URI
    *
    * @return this returns true if a UTF-8 character is decoded 
    */
   private boolean unicode(int peek) {
      if((peek & 0x80) == 0x00){
         return unicode(peek, 0);
      }
      if((peek & 0xe0) == 0xc0){
         return unicode(peek & 0x1f, 1);
      }
      if((peek & 0xf0) == 0xe0){
         return unicode(peek & 0x0f, 2);
      }
      if((peek & 0xf8) == 0xf0){
         return unicode(peek & 0x07, 3);
      }
      if((peek & 0xfc) == 0xf8){
         return unicode(peek & 0x03, 4);
      }
      if((peek & 0xfe) == 0xfc){
         return unicode(peek & 0x01, 5);
      }
      return false;
   }

   /**
    * This method will decode the specified amount of escaped 
    * characters from the URI and convert them into a character.
    * If there are not enough characters within the buffer for
    * the trailing escapes then this will return false and leave 
    * the buffer alone.   
    * <p>
    * The number of characters left is determined from the first
    * UTF-8 octet, and because this is a URI there must be that 
    * number of <code>"%" HEX HEX</code> sequences left. 
    *
    * @param peek contains the bits read from the first UTF octet
    * @param more this specifies the number of UTF octets left
    *
    * @return this returns true if a UTF-8 character is decoded
    */
   private boolean unicode(int peek, int more) {
      if(off + more * 3 >= count) {
         return false;
      }
      return unicode(peek,more,off);
   }

   /**
    * This will decode the specified amount of trailing UTF-8 bits
    * from the URI. The trailing bits are those following the first 
    * UTF-8 octet, which specifies the length, in octets, of the 
    * sequence. The trailing octets are of the form 10xxxxxx, for
    * each of these octets only the last six bits are valid bits. 
    * So a conversion is basically an accumulation of these.
    * <p>
    * If at any point during the accumulation of the UTF-8 bits
    * there is a parsing error, then parsing is aborted and false
    * is returned, as a result the buffer is left unchanged.
    * <p>
    * On success the decoded character is written over the hex
    * digits of the last escape and the read offset is moved to
    * it. A supplementary code point is written as a surrogate 
    * pair over both hex digits, so that the two characters are 
    * read one after the other. Values beyond the Unicode range 
    * are truncated by <code>bits</code>.
    *
    * @param peek bytes that have been accumulated from the URI
    * @param more this specifies the number of UTF octets left
    * @param pos this specifies the position the parsing begins
    *
    * @return this returns true if a UTF-8 character is decoded
    */
   private boolean unicode(int peek, int more, int pos) {
      while(more-- > 0) {
         if(buf[pos] == '%'){ 
            int next = pos + 3;
            int hex = peek(next);

            if((hex & 0xc0) == 0x80){
               peek = (peek<<6)|(hex&0x3f);
               pos = next;
               continue;
            }
         }
         return false;
      }
      if(pos + 2 < count) {
         if(Character.isSupplementaryCodePoint(peek)) {
            char[] pair = Character.toChars(peek);

            /* both hex digit slots are free, use them for the pair */
            off = pos + 1;
            buf[off] = pair[0];
            buf[off + 1] = pair[1];
         } else {
            off = pos + 2;
            buf[off] = bits(peek);
         }
      }
      return true;
   }

   /**
    * Converts the decoded value to a single Java character. This
    * is a narrowing conversion, so only the low order sixteen 
    * bits of the value are kept.
    *
    * @param data this is the decoded value to be converted
    *
    * @return this returns the low sixteen bits as a character
    */
   private char bits(int data) {
      return (char)data;
   }     

   /** 
    * This will return the escape expression specified from the URI
    * as an integer value of the hexadecimal sequence. This does
    * not make any changes to the buffer it simply checks to see if
    * the characters at the position specified are an escaped set 
    * characters of the form <code>"%" HEX HEX</code>, if so, then
    * it will convert that hexadecimal string in to an integer 
    * value. If the position does not hold a "%", if fewer than two
    * characters follow it, or if they are not hexadecimal, then
    * this returns -1.
    *
    * @param pos this is the position the expression starts from
    *
    * @return the integer value of the hexadecimal expression
    */
   private int peek(int pos) {
      if(buf[pos] != '%'){
         return -1;
      }
      if(count <= pos + 2) {
         return -1;
      }
      return convert(buf[pos + 1], buf[pos + 2]);
   }

   /**
    * This will convert the two hexadecimal characters to a real
    * integer value, which is returned. This requires characters
    * within the range of 'A' to 'F' and 'a' to 'f', and also 
    * the digits '0' to '9'. If the characters are not within
    * the range specified then this returns -1. 
    * 
    * @param high this is the high four bits within the integer
    * @param low this is the low four bits within the integer
    *  
    * @return this returns the integer value of the conversion 
    */
   private int convert(char high, char low) {
      if(hex(high) && hex(low)){
         int top = Character.digit(high, 16);
         int bottom = Character.digit(low, 16);

         return (top << 4) | bottom;
      }
      return -1;
   }

   /** 
    * This is used to determine whether a char is a hexadecimal
    * <code>char</code> or not. A hexadecimal character is considered 
    * to be a character within the range of <code>0 - 9</code> and 
    * between <code>a - f</code> and <code>A - F</code>. This will 
    * return <code>true</code> if the character is in this range.
    *
    * @param ch this is the character which is to be determined here
    *
    * @return true if the character given has a hexadecimal value
    */
   private boolean hex(char ch) {            
      if(ch >= '0' && ch <= '9') {    
         return true;
      }
      if(ch >= 'a' && ch <= 'f') {   
         return true;
      }
      return ch >= 'A' && ch <= 'F';
   }

   /**
    * This <code>encode</code> method will escape the text that
    * is provided using UTF-8 escape sequences. This is used so 
    * that the parameter pairs can be transferred over HTTP/1.1 
    * using the ISO-8859-1 character set. This is a best effort
    * conversion, if the text can not be encoded for any reason 
    * then the text is returned as it was given.
    *
    * @param text this is the text that is to be escaped
    *
    * @return the text with % HEX HEX UTF-8 escape sequences
    */ 
   private String encode(String text) {
      try {           
         return URLEncoder.encode(text, "UTF-8");           
      }catch(Exception e){
         return text; /* deliberate fallback to the raw text */
      }         
   }

   /**
    * This <code>encode</code> method will escape the name=value
    * pair provided using the UTF-8 character set. This method
    * will ensure that the parameters are encoded in such a way
    * that they can be transferred via HTTP in ISO-8859-1.
    *
    * @param name this is the name of that is to be escaped
    * @param value this is the value that is to be escaped
    *
    * @return the pair with % HEX HEX UTF-8 escape sequences
    */ 
   private String encode(String name, String value) {
      return encode(name) + "=" + encode(value);           
   }
   
   /**
    * This <code>toString</code> method is used to compose an string
    * in the <code>application/x-www-form-urlencoded</code> MIME type.
    * This will encode the names specified in the <code>Set</code>,
    * in the iteration order of the set, pairing each name with the
    * value acquired from <code>get</code> for that name. Each pair 
    * is converted into UTF-8 escape sequences and the pairs are 
    * separated with an "&amp;" character.
    * 
    * @param set this is the set of parameter names to be encoded
    * 
    * @return returns a HTTP parameter encoding for the pairs
    */ 
   public String toString(Set set) {
      StringBuilder text = new StringBuilder();
      
      for(Object entry : set){
         String name = entry.toString();
         String value = get(name);
         
         /* an encoded pair always has an "=", so a non-empty */
         /* builder means at least one pair precedes this one */
         if(text.length() > 0) {
            text.append('&');                 
         }              
         text.append(encode(name, value));
      }  
      return text.toString();    
   }

   /**
    * This <code>toString</code> method is used to compose an string
    * in the <code>application/x-www-form-urlencoded</code> MIME type.
    * This will iterate over all names that have been added to this
    * object, either during parsing, or during use of the instance.
    * Each name=value pair acquired is converted into a UTF-8 escape
    * sequence so that the parameters can be sent in the ISO-8859-1
    * format required via the HTTP/1.1 specification RFC 2616.
    * 
    * @return returns a HTTP parameter encoding for the pairs, or
    * an empty string if there are no parameters
    */ 
   @Override
   public String toString() {
      if(map.size() > 0) {      
         return toString(map.keySet());
      }
      return "";      
   }
   
   /**
    * This is used to mark regions within the buffer that represent
    * a valid token for either the name of a parameter or its value.
    * No characters are copied, the token requires only two integer 
    * values, the start offset and the length. As it reads from the
    * buffer of the enclosing parser it is only valid until that 
    * region of the buffer is changed.
    */
   private class Token {
      
      /**
       * This represents the number of characters in the token.
       */
      public int len;

      /**
       * This represents the start offset within the buffer.
       */  
      public int off;

      /**
       * In order to represent the <code>Token</code> as a value
       * that can be used this converts it to a <code>String</code>.
       * If the length of the token is less than or equal to zero
       * this will return an empty string for the value.
       *
       * @return this returns a value representing the token
       */
      @Override
      public String toString() {      
         if(len <= 0) {
            return "";
         }
         return new String(buf,off,len);
      }
   }
}