1 files changed, 636 insertions, 0 deletions
diff --git a/simple/simple-http/src/main/java/org/simpleframework/http/parse/QueryParser.java b/simple/simple-http/src/main/java/org/simpleframework/http/parse/QueryParser.java
new file mode 100644
index 0000000..56b6788
--- /dev/null
+++ b/simple/simple-http/src/main/java/org/simpleframework/http/parse/QueryParser.java
@@ -0,0 +1,636 @@
+/*
+ * QueryParser.java December 2002
+ *
+ * Copyright (C) 2002, Niall Gallagher <niallg@users.sf.net>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
+ * implied. See the License for the specific language governing 
+ * permissions and limitations under the License.
+ */
+  
+package org.simpleframework.http.parse;
+
+import org.simpleframework.common.parse.MapParser;
+import org.simpleframework.http.Query;
+
+import java.net.URLEncoder;
+import java.util.Set;
+
+/**
+ * The <code>ParameterParser</code> is used to parse data encoded in 
+ * the <code>application/x-www-form-urlencoded</code> MIME type. It 
+ * is also used to parse a query string from a HTTP URL, see RFC 2616.
+ * The parsed parameters are available through the various methods of
+ * the <code>org.simpleframework.http.net.Query</code> interface. The
+ * syntax of the parsed parameters is described below in BNF.
+ * <pre>
+ *
+ *    params  = *(pair [ "&amp;" params])
+ *    pair    = name "=" value
+ *    name    = *(text | escaped)
+ *    value   = *(text | escaped)
+ *    escaped = % HEX HEX
+ *
+ * </pre> 
+ * This will consume all data found as a name or value, if the data 
+ * is a "+" character then it is replaced with a space character.
+ * This regards only "=", "&amp;", and "%" as having special values.
+ * The "=" character delimits the name from the value and the "&amp;"
+ * delimits the name value pair. The "%" character represents the 
+ * start of an escaped sequence, which consists of two hex digits.
+ * All escaped sequences are converted to its character value. 
+ *
+ * @author Niall Gallagher
+ */
+public class QueryParser extends MapParser<String> implements Query {
+
+   /**
+    * Used to accumulate the characters for the parameter name.
+    */
+   private Token name;
+   
+   /**
+    * Used to accumulate the characters for the parameter value.
+    */
+   private Token value;
+
+   /**
+    * Constructor for the <code>ParameterParser</code>. This creates
+    * an instance that can be use to parse HTML form data and URL
+    * query strings encoded as application/x-www-form-urlencoded.
+    * The parsed parameters are made available through the interface
+    * <code>org.simpleframework.util.net.Query</code>.    
+    */
+   public QueryParser(){
+      this.name = new Token();
+      this.value = new Token();
+   }
+   
+   /**
+    * Constructor for the <code>ParameterParser</code>. This creates
+    * an instance that can be use to parse HTML form data and URL
+    * query strings encoded as application/x-www-form-urlencoded.
+    * The parsed parameters are made available through the interface
+    * <code>org.simpleframework.util.net.Query</code>.    
+    *
+    * @param text this is the text to parse for the parameters
+    */
+   public QueryParser(String text){
+      this();
+      parse(text);
+   }
+
+   /**
+    * This extracts an integer parameter for the named value. If the 
+    * named parameter does not exist this will return a zero value. 
+    * If however the parameter exists but is not in the format of a 
+    * decimal integer value then this will throw an exception.
+    *
+    * @param name the name of the parameter value to retrieve
+    *
+    * @return this returns the named parameter value as an integer   
+    */
+   public int getInteger(Object name) {
+      String value = get(name);
+
+      if(value != null) {
+         return Integer.parseInt(value);      
+      }
+      return 0;
+   }
+
+   /**
+    * This extracts a float parameter for the named value. If the 
+    * named parameter does not exist this will return a zero value. 
+    * If however the parameter exists but is not in the format of a 
+    * floating point number then this will throw an exception.
+    *
+    * @param name the name of the parameter value to retrieve
+    *
+    * @return this returns the named parameter value as a float   
+    */
+   public float getFloat(Object name) {
+      String value = get(name);
+
+      if(value != null) {
+         return Float.parseFloat(value);      
+      }
+      return 0.0f;
+   }
+
+   /**
+    * This extracts a boolean parameter for the named value. If the
+    * named parameter does not exist this will return false otherwise
+    * the value is evaluated. If it is either <code>true</code> or 
+    * <code>false</code> then those boolean values are returned.
+    * 
+    * @param name the name of the parameter value to retrieve
+    *
+    * @return this returns the named parameter value as an float
+    */
+   public boolean getBoolean(Object name) {
+      Boolean flag = Boolean.FALSE;           
+      String value = get(name);
+           
+      if(value != null) {         
+         flag = Boolean.valueOf(value);
+      }
+      return flag.booleanValue();
+   }
+
+   
+   /**
+    * This initializes the parser so that it can be used several
+    * times. This clears any previous parameters extracted. This
+    * ensures that when the next <code>parse(String)</code> is
+    * invoked the status of the <code>Query</code> is empty.
+    */
+   protected void init(){
+      all.clear();
+      map.clear();
+      name.len = 0;
+      value.len = 0;
+      off = 0;
+   }
+
+   /**
+    * This performs the actual parsing of the parameter text. The
+    * parameters parsed from this are taken as "name=value" pairs.
+    * Multiple pairs within the text are separated by an "&amp;".
+    * This will parse and insert all parameters into a hashtable.
+    */
+   protected void parse() {            
+      param();
+      while(skip("&")){         
+         param();
+      }
+   }
+
+   /**
+    * This method adds the name and value to a map so that the next
+    * name and value can be collected. The name and value are added
+    * to the map as string objects. Once added to the map the
+    * <code>Token</code> objects are set to have zero length so they
+    * can be reused to collect further values. This will add the 
+    * values to the map as an array of type string. This is done so
+    * that if there are multiple values that they can be stored. 
+    */
+   private void insert(){
+      if(name.len > 0){
+         insert(name,value);
+      }
+      name.len = 0;
+      value.len = 0;
+   }
+
+   /**
+    * This will add the given name and value to the parameters map.
+    * If any previous value of the given name has been inserted
+    * into the map then this will overwrite that value. This is
+    * used to ensure that the string value is inserted to the map.
+    *
+    * @param name this is the name of the value to be inserted
+    * @param value this is the value of a that is to be inserted
+    */
+   private void insert(Token name, Token value){
+      put(name.toString(), value.toString());
+   }
+   
+   /** 
+    * This is an expression that is defined by RFC 2396 it is used
+    * in the definition of a segment expression. This is basically
+    * a list of chars with escaped sequences. 
+    * <p>
+    * This method has to ensure that no escaped chars go unchecked. 
+    * This ensures that the read offset does not go out of bounds 
+    * and consequently throw an out of bounds exception.  
+    */
+   private void param() {
+      name(); 
+      if(skip("=")){ /* in case of error*/
+         value();
+      }
+      insert();      
+   }    
+   
+   /**
+    * This extracts the name of the parameter from the character 
+    * buffer. The name of a parameter is defined as a set of 
+    * chars including escape sequences. This will extract the
+    * parameter name and buffer the chars. The name ends when a
+    * equals character, "=", is encountered.
+    */
+   private void name(){  
+      int mark = off;
+      int pos = off;
+
+      while(off < count){         
+         if(buf[off]=='%'){ /* escaped */         
+            escape();
+         }else if(buf[off]=='=') {
+            break;
+         }else if(buf[off]=='+'){
+            buf[off] = ' ';
+         }
+         buf[pos++] = buf[off++];         
+      }          
+      name.len = pos - mark;
+      name.off = mark;
+   }
+
+   /**
+    * This extracts a parameter value from a path segment. The
+    * parameter value consists of a sequence of chars and some
+    * escape sequences. The parameter value is buffered so that
+    * the name and values can be paired. The end of the value 
+    * is determined as the end of the buffer or an ampersand.
+    */
+   private void value(){
+      int mark = off;
+      int pos = off;
+
+      while(off < count){         
+         if(buf[off]=='%'){ /* escaped */         
+            escape();
+         }else if(buf[off]=='+'){
+            buf[off] = ' ';
+         }else if(buf[off]=='&'){
+            break;
+         }
+         buf[pos++] = buf[off++];
+      }              
+      value.len = pos - mark;
+      value.off = mark; 
+   }
+   
+   /** 
+    * This converts an encountered escaped sequence, that is all
+    * embedded hexidecimal characters into a native UCS character 
+    * value. This does not take any characters from the stream it 
+    * just prepares the buffer with the correct byte. The escaped 
+    * sequence within the URI will be interpreded as UTF-8.
+    * <p>
+    * This will leave the next character to read from the buffer 
+    * as the character encoded from the URI. If there is a fully 
+    * valid escaped sequence, that is <code>"%" HEX HEX</code>.
+    * This decodes the escaped sequence using UTF-8 encoding, all
+    * encoded sequences should be in UCS-2 to fit in a Java char.
+    */
+   private void escape() {
+      int peek = peek(off);
+
+      if(!unicode(peek)) {
+         binary(peek);
+      }
+   }
+
+   /**
+    * This method determines, using a peek character, whether the
+    * sequence of escaped characters within the URI is binary data.
+    * If the data within the escaped sequence is binary then this
+    * will ensure that the next character read from the URI is the
+    * binary octet. This is used strictly for backward compatible
+    * parsing of URI strings, binary data should never appear.
+    *
+    * @param peek this is the first escaped character from the URI
+    *
+    * @return currently this implementation always returns true 
+    */
+   private boolean binary(int peek) {
+      if(off + 2 < count) {
+         off += 2;
+         buf[off] =bits(peek);
+      }
+      return true;
+   }
+
+   /**
+    * This method determines, using a peek character, whether the
+    * sequence of escaped characters within the URI is in UTF-8. If
+    * a UTF-8 character can be successfully decoded from the URI it
+    * will be the next character read from the buffer. This can 
+    * check for both UCS-2 and UCS-4 characters. However, because
+    * the Java <code>char</code> can only hold UCS-2, the UCS-4
+    * characters will have only the low order octets stored.
+    * <p> 
+    * The WWW Consortium provides a reference implementation of a
+    * UTF-8 decoding for Java, in this the low order octets in the
+    * UCS-4 sequence are used for the character. So, in the
+    * absence of a defined behaviour, the W3C behaviour is assumed.
+    * 
+    * @param peek this is the first escaped character from the URI
+    *
+    * @return this returns true if a UTF-8 character is decoded 
+    */
+   private boolean unicode(int peek) {
+      if((peek & 0x80) == 0x00){
+         return unicode(peek, 0);
+      }
+      if((peek & 0xe0) == 0xc0){
+         return unicode(peek & 0x1f, 1);
+      }
+      if((peek & 0xf0) == 0xe0){
+         return unicode(peek & 0x0f, 2);
+      }
+      if((peek & 0xf8) == 0xf0){
+         return unicode(peek & 0x07, 3);
+      }
+      if((peek & 0xfc) == 0xf8){
+         return unicode(peek & 0x03, 4);
+      }
+      if((peek & 0xfe) == 0xfc){
+         return unicode(peek & 0x01, 5);
+      }
+      return false;
+   }
+
+   /**
+    * This method will decode the specified amount of escaped 
+    * characters from the URI and convert them into a single Java
+    * UCS-2 character. If there are not enough characters within
+    * the URI then this will return false and leave the URI alone.   
+    * <p>
+    * The number of characters left is determined from the first
+    * UTF-8 octet, as specified in RFC 2279, and because this is 
+    * a URI there must that number of <code>"%" HEX HEX</code>
+    * sequences left. If successful the next character read is 
+    * the UTF-8 sequence decoded into a native UCS-2 character.
+    *
+    * @param peek contains the bits read from the first UTF octet
+    * @param more this specifies the number of UTF octets left
+    *
+    * @return this returns true if a UTF-8 character is decoded
+    */
+   private boolean unicode(int peek, int more) {
+      if(off + more * 3 >= count) {
+         return false;
+      }
+      return unicode(peek,more,off);
+   }
+
+   /**
+    * This will decode the specified amount of trailing UTF-8 bits
+    * from the URI. The trailing bits are those following the first 
+    * UTF-8 octet, which specifies the length, in octets, of the 
+    * sequence. The trailing octets are of the form 10xxxxxx, for
+    * each of these octets only the last six bits are valid UCS
+    * bits. So a conversion is basically an accumulation of these.
+    * <p>
+    * If at any point during the accumulation of the UTF-8 bits
+    * there is a parsing error, then parsing is aborted an false
+    * is returned, as a result the URI is left unchanged.
+    *
+    * @param peek bytes that have been accumulated fron the URI
+    * @param more this specifies the number of UTF octets left
+    * @param pos this specifies the position the parsing begins
+    *
+    * @return this returns true if a UTF-8 character is decoded
+    */
+   private boolean unicode(int peek, int more, int pos) {
+      while(more-- > 0) {
+         if(buf[pos] == '%'){ 
+            int next = pos + 3;
+            int hex = peek(next);
+
+            if((hex & 0xc0) == 0x80){
+               peek = (peek<<6)|(hex&0x3f);
+               pos = next;
+               continue;
+            }
+         }
+         return false;
+      }
+      if(pos + 2 < count) {
+         off = pos + 2;
+         buf[off]= bits(peek);
+      }
+      return true;
+   }
+
+   /**
+    * Defines behaviour for UCS-2 versus UCS-4 conversion from four
+    * octets. The UTF-8 encoding scheme enables UCS-4 characters to
+    * be encoded and decodeded. However, Java supports the 16-bit
+    * UCS-2 character set, and so the 32-bit UCS-4 character set is
+    * not compatable. This basically decides what to do with UCS-4.
+    *
+    * @param data up to four octets to be converted to UCS-2 format
+    *
+    * @return this returns a native UCS-2 character from the int
+    */
+   private char bits(int data) {
+      return (char)data;
+   }     
+
+   /** 
+    * This will return the escape expression specified from the URI
+    * as an integer value of the hexadecimal sequence. This does
+    * not make any changes to the buffer it simply checks to see if
+    * the characters at the position specified are an escaped set 
+    * characters of the form <code>"%" HEX HEX</code>, if so, then
+    * it will convert that hexadecimal string  in to an integer 
+    * value, or -1 if the expression is not hexadecimal.
+    *
+    * @param pos this is the position the expression starts from
+    *
+    * @return the integer value of the hexadecimal expression
+    */
+   private int peek(int pos) {
+      if(buf[pos] == '%'){
+         if(count <= pos + 2) {
+            return -1;
+         }
+         char high = buf[pos + 1];
+         char low = buf[pos + 2];
+  
+         return convert(high, low);
+      }
+      return -1;
+   }
+
+   /**
+    * This will convert the two hexidecimal characters to a real
+    * integer value, which is returned. This requires characters
+    * within the range of 'A' to 'F' and 'a' to 'f', and also 
+    * the digits '0' to '9'. The characters encoded using the
+    * ISO-8859-1 encoding scheme, if the characters are not with
+    * in the range specified then this returns -1. 
+    * 
+    * @param high this is the high four bits within the integer
+    * @param low this is the low four bits within the integer
+    *  
+    * @return this returns the indeger value of the conversion 
+    */
+   private int convert(char high, char low) {
+      int hex = 0x00;
+   
+      if(hex(high) && hex(low)){
+         if('A' <= high && high <= 'F'){
+            high -= 'A' - 'a';
+         }
+         if(high >= 'a') {
+            hex ^= (high-'a')+10;      
+         } else {
+            hex ^= high -'0';
+         }
+         hex <<= 4;
+
+         if('A' <= low && low <= 'F') {
+            low -= 'A' - 'a';
+         }
+         if(low >= 'a') {
+            hex ^= (low-'a')+10;      
+         } else {
+            hex ^= low-'0';
+         }    
+         return hex;
+      }
+      return -1;
+   }
+
+   /** 
+    * This is used to determine whether a char is a hexadecimal
+    * <code>char</code> or not. A hexadecimal character is considered 
+    * to be a character within the range of <code>0 - 9</code> and 
+    * between <code>a - f</code> and <code>A - F</code>. This will 
+    * return <code>true</code> if the character is in this range.
+    *
+    * @param ch this is the character which is to be determined here
+    *
+    * @return true if the character given has a hexadecimal value
+    */
+   private boolean hex(char ch) {            
+      if(ch >= '0' && ch <= '9') {    
+         return true;
+      } else if(ch >='a' && ch <= 'f') {   
+         return true;
+      } else if(ch >= 'A' && ch <= 'F') {
+         return true;
+      }
+      return false;
+   }
+
+   /**
+    * This <code>encode</code> method will escape the text that
+    * is provided. This is used to that the parameter pairs can
+    * be encoded in such a way that it can be transferred over
+    * HTTP/1.1 using the ISO-8859-1 character set.
+    *
+    * @param text this is the text that is to be escaped
+    *
+    * @return the text with % HEX HEX UTF-8 escape sequences
+    */ 
+   private String encode(String text) {
+      try {           
+         return URLEncoder.encode(text, "UTF-8");           
+      }catch(Exception e){
+         return text;              
+      }         
+   }
+
+   /**
+    * This <code>encode</code> method will escape the name=value
+    * pair provided using the UTF-8 character set. This method
+    * will ensure that the parameters are encoded in such a way
+    * that they can be transferred via HTTP in ISO-8859-1.
+    *
+    * @param name this is the name of that is to be escaped
+    * @param value this is the value that is to be escaped
+    *
+    * @return the pair with % HEX HEX UTF-8 escape sequences
+    */ 
+   private String encode(String name, String value) {
+      return encode(name) + "=" + encode(value);           
+   }
+   
+   /**
+    * This <code>toString</code> method is used to compose an string
+    * in the <code>application/x-www-form-urlencoded</code> MIME type.
+    * This will encode the tokens specified in the <code>Set</code>.
+    * Each name=value pair acquired is converted into a UTF-8 escape
+    * sequence so that the parameters can be sent in the IS0-8859-1
+    * format required via the HTTP/1.1 specification RFC 2616.
+    * 
+    * @param set this is the set of parameters to be encoded
+    * 
+    * @return returns a HTTP parameter encoding for the pairs
+    */ 
+   public String toString(Set set) {
+      Object[] list = set.toArray();
+      String text = "";
+      
+      for(int i = 0; i < list.length; i++){
+         String name = list[i].toString();
+         String value = get(name);
+         
+         if(i > 0) {
+            text += "&";                 
+         }              
+         text += encode(name, value);
+      }  
+      return text;    
+   }
+
+   /**
+    * This <code>toString</code> method is used to compose an string
+    * in the <code>application/x-www-form-urlencoded</code> MIME type.
+    * This will iterate over all tokens that have been added to this
+    * object, either during parsing, or during use of the instance.
+    * Each name=value pair acquired is converted into a UTF-8 escape
+    * sequence so that the parameters can be sent in the IS0-8859-1
+    * format required via the HTTP/1.1 specification RFC 2616.
+    * 
+    * @return returns a HTTP parameter encoding for the pairs
+    */ 
+   public String toString() {
+      Set set = map.keySet();
+   
+      if(map.size() > 0) {      
+         return toString(set);
+      }
+      return "";      
+   }
+   
+   /**
+    * This is used to mark regions within the buffer that represent
+    * a valid token for either the name of a parameter or its value.
+    * This is used as an alternative to the <code>ParseBuffer</code>
+    * which requires memory to be allocated for storing the data
+    * read from the buffer. This requires only two integer values.
+    */
+   private class Token {
+      
+      /**
+       * This represents the number of characters in the token.
+       */
+      public int len;
+
+      /**
+       * This represents the start offset within the buffer.
+       */  
+      public int off;
+
+      /**
+       * In order to represent the <code>Token</code> as a value
+       * that can be used this converts it to a <code>String</code>.
+       * If the length of the token is less than or equal to zero
+       * this will return and empty string for the value.
+       *
+       * @return this returns a value representing the token
+       */
+      public String toString() {      
+         if(len <= 0) {
+            return "";
+         }
+         return new String(buf,off,len);
+      }
+   }
+}