View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.myfaces.shared_orchestra.renderkit.html.util;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.OutputStreamWriter;
24  import java.io.Writer;
25  
/**
 * Converts Strings so that they can be used within HTML-Code.
 * <p>
 * Two families of static helpers are offered: the <code>encode</code> overloads escape
 * text for use as HTML character data / attribute values, and
 * {@link #encodeURIAtributte(String, String)} escapes a URI that is going to be written
 * into an HTML attribute (<code>href</code>, <code>src</code>, ...).
 * </p>
 */
public abstract class HTMLEncoder
{
    /** Upper-case hexadecimal digits used to build percent-encoded octets. */
    private static final String HEX_CHARSET = "0123456789ABCDEF";

    /** Charset used for the scheme/authority/path part of a URI (RFC 3986 Section 3.2.2). */
    private static final String UTF8 = "UTF-8";

    /** First UTF-16 high (leading) surrogate. */
    private static final char MIN_HIGH_SURROGATE = '\uD800';

    /** Last UTF-16 high (leading) surrogate. */
    private static final char MAX_HIGH_SURROGATE = '\uDBFF';

    /** First UTF-16 low (trailing) surrogate. */
    private static final char MIN_LOW_SURROGATE = '\uDC00';

    /** Last UTF-16 low (trailing) surrogate. */
    private static final char MAX_LOW_SURROGATE = '\uDFFF';

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where encodeNewline is
     * false and both encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the string to convert, may be null
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string)
    {
        return encode(string, false, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where both
     * encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;'s
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string, boolean encodeNewline)
    {
        return encode(string, encodeNewline, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where encodeNonLatin
     * is true.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;'s
     * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;#160;'s
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
    {
        return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
    }

    /**
     * Encodes the given string, so that it can be used within a html page.
     * <p>
     * The characters <code>"</code>, <code>&amp;</code>, <code>&lt;</code> and <code>&gt;</code>
     * are always replaced by their entities. If nothing has to be replaced the very same
     * String instance is returned.
     * </p>
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;'s
     * @param encodeSubsequentBlanksToNbsp if true a blank at the beginning of the string or
     *        directly after another blank is converted to &amp;#160;
     * @param encodeNonLatin if true encode non-latin characters (everything from 0x80 upwards)
     *        as named entities or numeric character references; a UTF-16 surrogate pair is
     *        written as one reference to the code point it represents
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string,
                                 boolean encodeNewline,
                                 boolean encodeSubsequentBlanksToNbsp,
                                 boolean encodeNonLatin)
    {
        if (string == null)
        {
            return "";
        }

        StringBuffer sb = null;    //create later on demand
        final int length = string.length();
        for (int i = 0; i < length; ++i)
        {
            String app = null;
            int skip = 0;          //additional chars consumed by this iteration
            final char c = string.charAt(i);
            switch (c)
            {
                case '"': app = "&quot;"; break;    //"
                case '&': app = "&amp;"; break;     //&
                case '<': app = "&lt;"; break;      //<
                case '>': app = "&gt;"; break;      //>
                case ' ':
                    if (encodeSubsequentBlanksToNbsp &&
                        (i == 0 || string.charAt(i - 1) == ' '))
                    {
                        //Space at beginning or after another space
                        app = "&#160;";
                    }
                    break;
                case '\n':
                    if (encodeNewline)
                    {
                        app = "<br/>";
                    }
                    break;

                default:
                    if (encodeNonLatin)
                    {
                        if (isSurrogatePairAt(string, i))
                        {
                            // A numeric character reference must name the code point;
                            // references to the individual surrogates are not valid HTML.
                            app = "&#" + toCodePoint(c, string.charAt(i + 1)) + ";";
                            skip = 1;
                        }
                        else
                        {
                            app = encodeNonLatinChar(c);
                        }
                    }
                    break;
            }

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuffer(string.substring(0, i));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }
            i += skip;
        }

        if (sb == null)
        {
            return string;
        }
        return sb.toString();
    }

    /**
     * Returns the entity or numeric character reference for a single non-latin char.
     *
     * @param c the char to inspect
     * @return the replacement, or null if <code>c</code> is below 0x80 and needs none
     */
    private static String encodeNonLatinChar(char c)
    {
        switch (c)
        {
            //german umlauts
            case '\u00E4' : return "&auml;";
            case '\u00C4' : return "&Auml;";
            case '\u00F6' : return "&ouml;";
            case '\u00D6' : return "&Ouml;";
            case '\u00FC' : return "&uuml;";
            case '\u00DC' : return "&Uuml;";
            case '\u00DF' : return "&szlig;";

            //misc
            //0x80 is sometimes used for the euro symbol; it is not mapped to &euro; here
            //and is written as a numeric reference like any other char >= 0x80.
            case '\u20AC': return "&euro;";
            case '\u00AB': return "&laquo;";
            case '\u00BB': return "&raquo;";
            case '\u00A0': return "&#160;";

            default :
                if (((int)c) >= 0x80)
                {
                    //encode all non basic latin characters
                    return "&#" + ((int)c) + ";";
                }
                return null;
        }
    }

    /**
     * Encode an URI, escaping or percent-encoding all required characters and
     * following the rules mentioned on RFC 3986.
     * <p>
     * Everything up to the first <code>?</code> or <code>#</code> is percent-encoded using
     * UTF-8; everything after it is percent-encoded using <code>characterEncoding</code>
     * and additionally has bare <code>&amp;</code> replaced by <code>&amp;amp;</code>.
     * A <code>%</code> that already starts a percent-encoded octet (<code>%</code> followed
     * by two hexadecimal digits, upper or lower case) is left untouched so that input is
     * not encoded twice. If nothing has to be replaced the very same String instance is
     * returned.
     * </p>
     *
     * @param string the URI to encode, must not be null
     * @param characterEncoding the document character encoding, used for the query and
     *        fragment part; if the encoding is not supported, characters above 0x7F in
     *        that part are left unescaped
     * @return the encoded URI
     * @throws IOException declared for API compatibility
     */
    public static String encodeURIAtributte(final String string, final String characterEncoding)
        throws IOException
    {
        // These are the guidelines taken into account by this algorithm:
        //
        // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
        //
        // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
        // space       = <US-ASCII coded character 20 hexadecimal>
        // delims      = "<" | ">" | "#" | "%" | <">
        //               %3C   %3E   %23   %25   %22
        // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
        //               %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
        //
        // ".... Data corresponding to excluded characters must be escaped in order to
        // be properly represented within a URI....."
        //
        // RFC 3986 Section 3.  Syntax Components
        //
        // "... The generic URI syntax consists of a hierarchical sequence of
        // components referred to as the scheme, authority, path, query, and
        // fragment.
        //
        //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
        //
        //   hier-part   = "//" authority path-abempty
        //               / path-absolute
        //               / path-rootless
        //               / path-empty
        // ...."
        //
        // RFC 3986 Section 2.2:
        // Reserved characters (should not be percent-encoded)
        // reserved    = gen-delims / sub-delims
        // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
        //               %3A   %2F   %3F   %23   %5B   %5D   %40
        // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
        //
        // Note that the chars "[" and "]" are listed as "to be escaped" in RFC 2396,
        // but RFC 3986 appendix D (Changes from RFC 2396) says about these chars
        // (used for IPv6): "...those rules were redefined to directly specify the
        // characters allowed....". The characters moved from the excluded list to the
        // reserved list are: "[" / "]" / "#"
        //
        // RFC 3986 Section 2.3:
        // "... for consistency, percent-encoded octets in the ranges of ALPHA
        // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
        // underscore (%5F), or tilde (%7E) should not be created by URI
        // producers...."
        //
        // RFC 3986 Section 3.2.2.  Host
        //
        // host = IP-literal / IPv4address / reg-name
        //
        // The reg-name syntax allows percent-encoded octets in order to
        // represent non-ASCII registered names in a uniform way that is
        // independent of the underlying name resolution technology.  Non-ASCII
        // characters must first be encoded according to UTF-8 [STD63], and then
        // each octet of the corresponding UTF-8 sequence must be percent-
        // encoded to be represented as URI characters.  URI producing
        // applications must not use percent-encoding in host unless it is used
        // to represent a UTF-8 character sequence.
        //
        // RFC 3986 Section 3.4 Query
        //         query       = *( pchar / "/" / "?" )
        //
        // "...  However, as query components are often used to carry identifying information
        // in the form of "key=value" pairs and one frequently used value is a reference to
        // another URI, it is sometimes better for usability to avoid percent-encoding those
        // characters....."
        //
        // RFC 3986 Section 2.5 Identifying Data (applies to the query section)
        //
        // When a new URI scheme defines a component that represents textual
        // data consisting of characters from the Universal Character Set [UCS],
        // the data should first be encoded as octets according to the UTF-8
        // character encoding [STD63]; then only those octets that do not
        // correspond to characters in the unreserved set should be percent-
        // encoded.  For example, the character A would be represented as "A",
        // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
        // as "%C3%80", and the character KATAKANA LETTER A would be represented
        // as "%E3%82%A2".
        //
        // RFC 3986 Section 3.5 Fragment
        //         fragment    = *( pchar / "/" / "?" )
        //
        // Note that the fragment follows the same rules as the query.
        //
        // Based on these extracts the strategy applied by this method is:
        //
        // On scheme ":" hier-part, escape or percent-encode:
        //
        // - From %00 to %20,
        // - <"> %22, "%" %25 (encoding "%" risks duplicate encoding, so it is only
        //                     encoded when it does not already start a percent-encoded
        //                     octet)
        // - "<" %3C, ">" %3E
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (characters above %FF should not be used in this
        //   part of an URI, but it is preferred to encode them rather than omit them).
        //
        // The remaining characters must not be encoded.
        //
        // Characters after ? or # are percent-encoded, but only the necessary ones:
        //
        // - From %00 to %20 (' ' could be encoded as +, but %20 also works, so %20 is kept)
        // - <"> %22, "%" %25 (same duplicate-encoding rule as above)
        // - "<" %3C, ">" %3E,
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F ad infinitum (each character as many bytes as necessary; a single
        //   character may need 2, 3 or more bytes. This data is encoded translating from
        //   the document character encoding to percent encoding, because these values
        //   could be retrieved through httpRequest.getParameter(), which uses the current
        //   character encoding to decode values)
        //
        // "&" is written as "&amp;" because this link is inside an html page, and
        // a bare & is invalid in this context.

        StringBuffer sb = null;    //create later on demand
        final int length = string.length();
        for (int i = 0; i < length; ++i)
        {
            String app = null;
            int skip = 0;          //additional chars consumed by this iteration
            boolean endLoop = false;
            final char c = string.charAt(i);

            if (isExcludedUriChar(c))
            {
                // The percent encoding on this part should be done using UTF-8 charset
                // as RFC 3986 Section 3.2.2 says.
                // Also there is a reference on
                // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
                // that recommends use of UTF-8 instead of the document character encoding.
                // Jetty sets UTF-8 by default (see http://jira.codehaus.org/browse/JETTY-113)
                if (isSurrogatePairAt(string, i))
                {
                    // Both halves have to reach the charset encoder together,
                    // otherwise the supplementary character cannot be encoded.
                    app = percentEncodeNonUsAsciiCharacter(string.substring(i, i + 2), UTF8);
                    if (app != null)
                    {
                        skip = 1;
                    }
                }
                else
                {
                    app = percentEncode(c, UTF8);
                }
            }
            else if (c == '%')
            {
                if (!isPercentEncodedOctetAt(string, i))
                {
                    app = percentEncode(c, UTF8);
                }
                // else: do not percent encode, because it could be already encoded
                // and we don't want to encode it twice
            }
            else if (c == '?' || c == '#')
            {
                if (i + 1 < length)
                {
                    // The remaining part of the URI is data that should be encoded
                    // using the document character encoding.
                    app = c + encodeURIQuery(string.substring(i + 1), characterEncoding);
                    endLoop = true;
                }
            }
            // else: no encoding, char will be added below.

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuffer(string.substring(0, i));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }

            if (endLoop)
            {
                break;
            }
            i += skip;
        }

        if (sb == null)
        {
            return string;
        }
        return sb.toString();
    }

    /**
     * Tells whether a char always has to be percent-encoded inside a URI: controls and
     * space (up to 0x20), DEL and everything above (from 0x7F), and the delimiters
     * <code>" &lt; &gt; \ ^ ` { | }</code>.
     */
    private static boolean isExcludedUriChar(char c)
    {
        return (c <= (char)0x20) || (c >= (char)0x7F) ||
                c == '"' || c == '<' ||
                c == '>' || c == '\\' || c == '^' || c == '`' ||
                c == '{' || c == '|' || c == '}';
    }

    /**
     * Tells whether a char is a hexadecimal digit as defined by RFC 3986 Section 2.1
     * (<code>0-9</code>, <code>A-F</code> and, being case-insensitive, <code>a-f</code>).
     */
    private static boolean isHexDigit(char c)
    {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /**
     * Tells whether the <code>%</code> at <code>index</code> is followed by two hexadecimal
     * digits, i.e. whether it already starts a percent-encoded octet.
     */
    private static boolean isPercentEncodedOctetAt(String string, int index)
    {
        return index + 2 < string.length()
                && isHexDigit(string.charAt(index + 1))
                && isHexDigit(string.charAt(index + 2));
    }

    /**
     * Tells whether a UTF-16 high surrogate at <code>index</code> is directly followed by a
     * low surrogate, i.e. whether both chars together form one supplementary character.
     */
    private static boolean isSurrogatePairAt(String string, int index)
    {
        if (index + 1 >= string.length())
        {
            return false;
        }
        final char high = string.charAt(index);
        final char low = string.charAt(index + 1);
        return high >= MIN_HIGH_SURROGATE && high <= MAX_HIGH_SURROGATE
                && low >= MIN_LOW_SURROGATE && low <= MAX_LOW_SURROGATE;
    }

    /**
     * Combines a surrogate pair into the code point it represents.
     */
    private static int toCodePoint(char high, char low)
    {
        return ((high - MIN_HIGH_SURROGATE) << 10) + (low - MIN_LOW_SURROGATE) + 0x10000;
    }

    /**
     * Percent-encode a single char, converting it to bytes using the specified
     * characterEncoding when it is outside the US-ASCII range.
     *
     * @param c the char to encode
     * @param characterEncoding name of the charset used for chars above 0x7F
     * @return the percent-encoded form, or null if the char is above 0x7F and could not be
     *         converted with the given encoding
     */
    private static String percentEncode(char c, String characterEncoding)
    {
        if (c > (char)0x007F)
        {
            //percent encode in the proper encoding to be consistent
            return percentEncodeNonUsAsciiCharacter(String.valueOf(c), characterEncoding);
        }
        //percent encode US-ASCII char (0x00-0x7F range)
        return "%" + HEX_CHARSET.charAt((c >> 0x4) % 0x10) + HEX_CHARSET.charAt(c % 0x10);
    }

    /**
     * Percent-encode every byte that the given text occupies in the given encoding.
     *
     * @param text one character, given as a single char or as a surrogate pair
     * @param characterEncoding name of the charset used to obtain the bytes
     * @return one <code>%XX</code> triplet per byte, or null if the conversion failed
     *         (for example because the encoding is not supported)
     */
    private static String percentEncodeNonUsAsciiCharacter(String text, String characterEncoding)
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
        try
        {
            OutputStreamWriter writer = new OutputStreamWriter(baos, characterEncoding);
            writer.write(text);
            writer.flush();
        }
        catch (IOException e)
        {
            // Best effort: the caller leaves the character untouched in this case.
            return null;
        }

        final byte[] byteArray = baos.toByteArray();
        StringBuffer builder = new StringBuffer(byteArray.length * 3);
        for (int i = 0; i < byteArray.length; i++)
        {
            final int octet = byteArray[i] & 0xFF;
            builder.append('%');
            builder.append(HEX_CHARSET.charAt(octet >> 0x4));
            builder.append(HEX_CHARSET.charAt(octet % 0x10));
        }
        return builder.toString();
    }

    /**
     * Encode the query (or fragment) part using the document charset encoding provided.
     * <p>
     * The following is escaped:
     * </p>
     * <ul>
     * <li>From %00 to %20 (' ' could be encoded as +, but %20 also works, so %20 is kept)</li>
     * <li><code>"</code> %22, <code>&lt;</code> %3C, <code>&gt;</code> %3E,
     *     <code>\</code> %5C, <code>^</code> %5E, <code>`</code> %60,
     *     <code>{</code> %7B, <code>|</code> %7C, <code>}</code> %7D</li>
     * <li><code>%</code> %25, unless it already starts a percent-encoded octet</li>
     * <li>From %7F ad infinitum, each character as many bytes as it needs in the document
     *     character encoding</li>
     * <li><code>&amp;</code> becomes <code>&amp;amp;</code> (unless it already starts
     *     <code>&amp;amp;</code>), because the link is written inside an html page where a
     *     bare &amp; is invalid</li>
     * </ul>
     *
     * @param string the part of the URI after the first <code>?</code> or <code>#</code>
     * @param characterEncoding the document character encoding
     * @return the encoded text; the same instance if nothing had to be replaced
     */
    private static String encodeURIQuery(final String string, final String characterEncoding)
    {
        StringBuffer sb = null;    //create later on demand
        final int length = string.length();
        for (int i = 0; i < length; ++i)
        {
            String app = null;
            int skip = 0;          //additional chars consumed by this iteration
            final char c = string.charAt(i);

            if (isExcludedUriChar(c))
            {
                // This part is data, so it is percent-encoded with the document
                // character encoding (the one used later to decode request parameters).
                if (isSurrogatePairAt(string, i))
                {
                    app = percentEncodeNonUsAsciiCharacter(string.substring(i, i + 2), characterEncoding);
                    if (app != null)
                    {
                        skip = 1;
                    }
                }
                else
                {
                    app = percentEncode(c, characterEncoding);
                }
            }
            else if (c == '%')
            {
                if (!isPercentEncodedOctetAt(string, i))
                {
                    app = percentEncode(c, characterEncoding);
                }
                // else: do not percent encode, because it could be already encoded
            }
            else if (c == '&')
            {
                if (!string.startsWith("amp;", i + 1))
                {
                    app = "&amp;";
                }
                // else: already written as &amp;, skip
            }
            // else: no encoding, char will be added below.

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuffer(string.substring(0, i));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }
            i += skip;
        }

        if (sb == null)
        {
            return string;
        }
        return sb.toString();
    }
}