1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 package org.apache.myfaces.shared_orchestra.renderkit.html.util; 20 21 import java.io.ByteArrayOutputStream; 22 import java.io.IOException; 23 import java.io.OutputStreamWriter; 24 import java.io.Writer; 25 26 /** 27 * Converts Strings so that they can be used within HTML-Code. 28 */ 29 public abstract class HTMLEncoder 30 { 31 /** 32 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 33 */ 34 public static String encode (String string) 35 { 36 return encode(string, false, true); 37 } 38 39 /** 40 * Variant of {@link #encode} where encodeNbsp is true. 41 */ 42 public static String encode (String string, boolean encodeNewline) 43 { 44 return encode(string, encodeNewline, true); 45 } 46 47 /** 48 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 49 */ 50 public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) 51 { 52 return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true); 53 } 54 55 /** 56 * Encodes the given string, so that it can be used within a html page. 57 * @param string the string to convert 58 * @param encodeNewline if true newline characters are converted to <br>'s 59 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s 60 * @param encodeNonLatin if true encode non-latin characters as numeric character references 61 */ 62 public static String encode (String string, 63 boolean encodeNewline, 64 boolean encodeSubsequentBlanksToNbsp, 65 boolean encodeNonLatin) 66 { 67 if (string == null) 68 { 69 return ""; 70 } 71 72 StringBuffer sb = null; //create later on demand 73 String app; 74 char c; 75 for (int i = 0; i < string.length (); ++i) 76 { 77 app = null; 78 c = string.charAt(i); 79 switch (c) 80 { 81 case '"': app = """; break; //" 82 case '&': app = "&"; break; //& 83 case '<': app = "<"; break; //< 84 case '>': app = ">"; break; //> 85 case ' ': 86 if (encodeSubsequentBlanksToNbsp && 87 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' '))) 88 { 89 //Space at beginning or after another space 90 app = " "; 91 } 92 break; 93 case '\n': 94 if (encodeNewline) 95 { 96 app = "<br/>"; 97 } 98 break; 99 100 101 default: 102 if (encodeNonLatin) switch(c) { 103 //german umlauts 104 case '\u00E4' : app = "ä"; break; 105 case '\u00C4' : app = "Ä"; break; 106 case '\u00F6' : app = "ö"; break; 107 case '\u00D6' : app = "Ö"; break; 108 case '\u00FC' : app = "ü"; break; 109 case '\u00DC' : app = "Ü"; break; 110 case '\u00DF' : app = "ß"; break; 111 112 //misc 113 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 114 case '\u20AC': app = "€"; break; 115 case '\u00AB': app = "«"; break; 116 case '\u00BB': app = "»"; break; 117 case '\u00A0': app = " "; break; 118 119 default : 120 if (((int)c) >= 0x80) 121 { 122 //encode all non basic latin characters 123 app = "&#" + ((int)c) + ";"; 124 } 125 break; 126 } 127 break; 128 } 129 if (app != null) 130 { 131 if (sb == null) 132 { 133 sb = new StringBuffer(string.substring(0, i)); 134 } 135 sb.append(app); 136 } else { 137 if (sb != null) 138 { 139 sb.append(c); 140 } 141 } 142 } 143 144 if (sb == null) 145 { 146 return string; 147 } 148 else 149 { 150 return sb.toString(); 151 } 152 } 153 154 155 private static final String HEX_CHARSET = "0123456789ABCDEF"; 156 157 private static final String UTF8 = "UTF-8"; 158 159 /** 160 * Encode an URI, escaping or percent-encoding all required characters and 161 * following the rules mentioned on RFC 3986. 162 * 163 * @param string 164 * @param encodeNonLatin 165 * @return 166 * @throws IOException 167 */ 168 public static String encodeURIAtributte(final String string, final String characterEncoding) 169 throws IOException 170 { 171 StringBuffer sb = null; //create later on demand 172 String app; 173 char c; 174 boolean endLoop = false; 175 for (int i = 0; i < string.length (); ++i) 176 { 177 app = null; 178 c = string.charAt(i); 179 180 // This are the guidelines to be taken into account by this algorithm to encode: 181 182 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters 183 // 184 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> 185 // space = <US-ASCII coded character 20 hexadecimal> 186 // delims = "<" | ">" | "#" | "%" | <"> 187 // %3C %3E %23 %25 %22 188 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" 189 // %7D %7B %7C %5C %5E %5B %5D %60 190 // 191 // ".... Data corresponding to excluded characters must be escaped in order to 192 // be properly represented within a URI....." 193 194 // RFC 3986 Section 3. Syntax Components 195 // 196 // "... The generic URI syntax consists of a hierarchical sequence of 197 // components referred to as the scheme, authority, path, query, and 198 // fragment. 199 // 200 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 201 // 202 // hier-part = "//" authority path-abempty 203 // / path-absolute 204 // / path-rootless 205 // / path-empty 206 // ...." 207 208 // RFC 3986 Section 2.2: 209 // Reserved characters (should not be percent-encoded) 210 // reserved = gen-delims / sub-delims 211 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 212 // %3A %2F %3F %23 %5B %5D %40 213 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 214 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D 215 216 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, 217 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 218 // "...those rules were redefined to directly specify the characters allowed...." 219 // There is also other characters moved from excluded list to reserved: 220 // "[" / "]" / "#" 221 222 // RFC 3986 Section 2.3: 223 // "... for consistency, percent-encoded octets in the ranges of ALPHA 224 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), 225 // underscore (%5F), or tilde (%7E) should not be created by URI 226 // producers...." 227 228 // RFC 3986 Section 3.2.2. Host 229 230 // host = IP-literal / IPv4address / reg-name 231 232 // The reg-name syntax allows percent-encoded octets in order to 233 // represent non-ASCII registered names in a uniform way that is 234 // independent of the underlying name resolution technology. Non-ASCII 235 // characters must first be encoded according to UTF-8 [STD63], and then 236 // each octet of the corresponding UTF-8 sequence must be percent- 237 // encoded to be represented as URI characters. URI producing 238 // applications must not use percent-encoding in host unless it is used 239 // to represent a UTF-8 character sequence. 240 241 // RFC 3986 Section 3.4 Query 242 // query = *( pchar / "/" / "?" ) 243 // 244 // "... However, as query components are often used to carry identifying information 245 // in the form of "key=value" pairs and one frequently used value is a reference to 246 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." 247 // 248 // RFC 3986 Section 2.5 Identifying Data (Apply to query section) 249 // 250 // When a new URI scheme defines a component that represents textual 251 // data consisting of characters from the Universal Character Set [UCS], 252 // the data should first be encoded as octets according to the UTF-8 253 // character encoding [STD63]; then only those octets that do not 254 // correspond to characters in the unreserved set should be percent- 255 // encoded. For example, the character A would be represented as "A", 256 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented 257 // as "%C3%80", and the character KATAKANA LETTER A would be represented 258 // as "%E3%82%A2". 259 // 260 // RFC 3986 Section 3.5 Fragment 261 // fragment = *( pchar / "/" / "?" ) 262 // 263 // Note that follows the same as query 264 265 // Based on the extracts the strategy to apply on this method is: 266 // 267 // On scheme ":" hier-part 268 // 269 // Escape or percent encode chars inside : 270 // 271 // - From %00 to %20, 272 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 273 // duplicate encoding, encode it when we are sure 274 // that there are not encoded twice) 275 // - "<" %3C, ">" %3E 276 // - "\" %5C, "^" %5E, "`" %60 277 // - "{" %7B, "|" %7C, "}" %7D 278 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this 279 // part of an URI, but it is preferred to encode it that omit it). 280 // 281 // The remaining characters must not be encoded 282 // 283 // Characters after ? or # should be percent encoding but only the necessary ones: 284 // 285 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 286 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 287 // duplicate encoding, encode it when we are sure 288 // that there are not encoded twice) 289 // - "<" %3C, ">" %3E, 290 // - "\" %5C, "^" %5E, "`" %60 291 // - "{" %7B, "|" %7C, "}" %7D 292 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 293 // that a single char should contain 2,3 or more bytes!. This data should be encoded 294 // translating from the document character encoding to percent encoding, because this values 295 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding 296 // for decode values) 297 // 298 // "&" should be encoded as "&" because this link is inside an html page, and 299 // put only & is invalid in this context. 300 301 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 302 c == '"' || c == '<' || 303 c == '>' || c == '\\' || c == '^' || c == '`' || 304 c == '{' || c == '|' || c == '}') 305 { 306 // The percent encoding on this part should be done using UTF-8 charset 307 // as RFC 3986 Section 3.2.2 says. 308 // Also there is a reference on 309 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars 310 // that recommend use of UTF-8 instead the document character encoding. 311 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) 312 app = percentEncode(c, "UTF-8"); 313 } 314 else if (c == '%') 315 { 316 if (i + 2 < string.length()) 317 { 318 char c1 = string.charAt(i+1); 319 char c2 = string.charAt(i+2); 320 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && 321 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) 322 { 323 // do not percent encode, because it could be already encoded 324 // and we don't want encode it twice 325 } 326 else 327 { 328 app = percentEncode(c, UTF8); 329 } 330 } 331 else 332 { 333 app = percentEncode(c, UTF8); 334 } 335 } 336 else if (c == '?' || c == '#') 337 { 338 if (i+1 < string.length()) 339 { 340 // The remaining part of the URI are data that should be encoded 341 // using the document character encoding. 342 app = c + encodeURIQuery(string.substring(i+1), characterEncoding); 343 endLoop = true; 344 } 345 } 346 else 347 { 348 //No encoding, just do nothing, char will be added later. 349 } 350 351 if (app != null) 352 { 353 if (sb == null) 354 { 355 sb = new StringBuffer(string.substring(0, i)); 356 } 357 sb.append(app); 358 } else { 359 if (sb != null) 360 { 361 sb.append(c); 362 } 363 } 364 if (endLoop) 365 { 366 break; 367 } 368 } 369 if (sb == null) 370 { 371 return string; 372 } 373 else 374 { 375 return sb.toString(); 376 } 377 } 378 379 /** 380 * Encode a unicode char value in percentEncode, decoding its bytes using a specified 381 * characterEncoding. 382 * 383 * @param c 384 * @param characterEncoding 385 * @return 386 */ 387 private static String percentEncode(char c, String characterEncoding) 388 { 389 String app = null; 390 if (c > (char)((short)0x007F)) 391 { 392 //percent encode in the proper encoding to be consistent 393 app = percentEncodeNonUsAsciiCharacter(c, characterEncoding); 394 } 395 else 396 { 397 //percent encode US-ASCII char (0x00-0x7F range) 398 app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); 399 } 400 return app; 401 } 402 403 private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding) 404 { 405 ByteArrayOutputStream baos = new ByteArrayOutputStream(10); 406 StringBuffer builder = new StringBuffer(); 407 try 408 { 409 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); 410 writer.write(c); 411 writer.flush(); 412 } 413 catch(IOException e) 414 { 415 baos.reset(); 416 return null; 417 } 418 419 byte [] byteArray = baos.toByteArray(); 420 for (int i=0; i < byteArray.length; i++) 421 { 422 builder.append('%'); 423 builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 424 builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 425 } 426 427 return builder.toString(); 428 } 429 430 /** 431 * Encode the query part using the document charset encoding provided. 432 * 433 * 434 * @param string 435 * @param characterEncoding 436 * @return 437 */ 438 private static String encodeURIQuery(final String string, final String characterEncoding) 439 { 440 StringBuffer sb = null; //create later on demand 441 String app; 442 char c; 443 boolean endLoop = false; 444 for (int i = 0; i < string.length (); ++i) 445 { 446 app = null; 447 c = string.charAt(i); 448 449 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 450 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so we make easier and omit this one) 451 // - "<" %3C, ">" %3E, 452 // - "\" %5C, "^" %5E, "`" %60 453 // - "{" %7B, "|" %7C, "}" %7D 454 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 455 // that a single char should contain 2,3 or more bytes!. This data should be encoded translating from the document 456 // character encoding to percent encoding) 457 // 458 // "&" should be encoded as "&" because this link is inside an html page, and 459 // put & is invalid in this context 460 461 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 462 c == '"' || c == '<' || 463 c == '>' || c == '\\' || c == '^' || c == '`' || 464 c == '{' || c == '|' || c == '}') 465 { 466 // The percent encoding on this part should be done using UTF-8 charset 467 // as RFC 3986 Section 3.2.2 says 468 app = percentEncode(c, characterEncoding); 469 } 470 else if (c == '%') 471 { 472 if (i + 2 < string.length()) 473 { 474 char c1 = string.charAt(i+1); 475 char c2 = string.charAt(i+2); 476 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && 477 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) 478 { 479 // do not percent encode, because it could be already encoded 480 } 481 else 482 { 483 app = percentEncode(c, characterEncoding); 484 } 485 } 486 else 487 { 488 app = percentEncode(c, characterEncoding); 489 } 490 } 491 else if (c == '&') 492 { 493 if (i+4 < string.length() ) 494 { 495 if ('a' == string.charAt(i+1) && 496 'm' == string.charAt(i+2) && 497 'p' == string.charAt(i+3) && 498 ';' == string.charAt(i+4)) 499 { 500 //Skip 501 } 502 else 503 { 504 app = "&"; 505 } 506 } 507 else 508 { 509 app = "&"; 510 } 511 } 512 else 513 { 514 //No encoding, just do nothing, char will be added later. 515 } 516 517 if (app != null) 518 { 519 if (sb == null) 520 { 521 sb = new StringBuffer(string.substring(0, i)); 522 } 523 sb.append(app); 524 } else { 525 if (sb != null) 526 { 527 sb.append(c); 528 } 529 } 530 if (endLoop) 531 { 532 break; 533 } 534 } 535 if (sb == null) 536 { 537 return string; 538 } 539 else 540 { 541 return sb.toString(); 542 } 543 } 544 }