001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.fileupload.util.mime; 018 019import java.io.ByteArrayOutputStream; 020import java.io.IOException; 021import java.io.UnsupportedEncodingException; 022import java.util.HashMap; 023import java.util.Locale; 024import java.util.Map; 025 026/** 027 * Utility class to decode MIME texts. 028 * 029 * @since 1.3 030 */ 031public final class MimeUtility { 032 033 /** 034 * The {@code US-ASCII} charset identifier constant. 035 */ 036 private static final String US_ASCII_CHARSET = "US-ASCII"; 037 038 /** 039 * The marker to indicate text is encoded with BASE64 algorithm. 040 */ 041 private static final String BASE64_ENCODING_MARKER = "B"; 042 043 /** 044 * The marker to indicate text is encoded with QuotedPrintable algorithm. 045 */ 046 private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q"; 047 048 /** 049 * If the text contains any encoded tokens, those tokens will be marked with "=?". 050 */ 051 private static final String ENCODED_TOKEN_MARKER = "=?"; 052 053 /** 054 * If the text contains any encoded tokens, those tokens will terminate with "=?". 055 */ 056 private static final String ENCODED_TOKEN_FINISHER = "?="; 057 058 /** 059 * The linear whitespace chars sequence. 060 */ 061 private static final String LINEAR_WHITESPACE = " \t\r\n"; 062 063 /** 064 * Mappings between MIME and Java charset. 065 */ 066 private static final Map<String, String> MIME2JAVA = new HashMap<String, String>(); 067 068 static { 069 MIME2JAVA.put("iso-2022-cn", "ISO2022CN"); 070 MIME2JAVA.put("iso-2022-kr", "ISO2022KR"); 071 MIME2JAVA.put("utf-8", "UTF8"); 072 MIME2JAVA.put("utf8", "UTF8"); 073 MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP"); 074 MIME2JAVA.put("ja_jp.eucjp", "EUCJIS"); 075 MIME2JAVA.put("euc-kr", "KSC5601"); 076 MIME2JAVA.put("euckr", "KSC5601"); 077 MIME2JAVA.put("us-ascii", "ISO-8859-1"); 078 MIME2JAVA.put("x-us-ascii", "ISO-8859-1"); 079 } 080 081 /** 082 * Hidden constructor, this class must not be instantiated. 083 */ 084 private MimeUtility() { 085 // do nothing 086 } 087 088 /** 089 * Decode a string of text obtained from a mail header into 090 * its proper form. The text generally will consist of a 091 * string of tokens, some of which may be encoded using 092 * base64 encoding. 093 * 094 * @param text The text to decode. 095 * 096 * @return The decoded text string. 097 * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported. 098 */ 099 public static String decodeText(String text) throws UnsupportedEncodingException { 100 // if the text contains any encoded tokens, those tokens will be marked with "=?". If the 101 // source string doesn't contain that sequent, no decoding is required. 102 if (text.indexOf(ENCODED_TOKEN_MARKER) < 0) { 103 return text; 104 } 105 106 int offset = 0; 107 int endOffset = text.length(); 108 109 int startWhiteSpace = -1; 110 int endWhiteSpace = -1; 111 112 StringBuilder decodedText = new StringBuilder(text.length()); 113 114 boolean previousTokenEncoded = false; 115 116 while (offset < endOffset) { 117 char ch = text.charAt(offset); 118 119 // is this a whitespace character? 120 if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found 121 startWhiteSpace = offset; 122 while (offset < endOffset) { 123 // step over the white space characters. 124 ch = text.charAt(offset); 125 if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found 126 offset++; 127 } else { 128 // record the location of the first non lwsp and drop down to process the 129 // token characters. 130 endWhiteSpace = offset; 131 break; 132 } 133 } 134 } else { 135 // we have a word token. We need to scan over the word and then try to parse it. 136 int wordStart = offset; 137 138 while (offset < endOffset) { 139 // step over the non white space characters. 140 ch = text.charAt(offset); 141 if (LINEAR_WHITESPACE.indexOf(ch) == -1) { // not white space 142 offset++; 143 } else { 144 break; 145 } 146 147 //NB: Trailing whitespace on these header strings will just be discarded. 148 } 149 // pull out the word token. 150 String word = text.substring(wordStart, offset); 151 // is the token encoded? decode the word 152 if (word.startsWith(ENCODED_TOKEN_MARKER)) { 153 try { 154 // if this gives a parsing failure, treat it like a non-encoded word. 155 String decodedWord = decodeWord(word); 156 157 // are any whitespace characters significant? Append 'em if we've got 'em. 158 if (!previousTokenEncoded && startWhiteSpace != -1) { 159 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 160 startWhiteSpace = -1; 161 } 162 // this is definitely a decoded token. 163 previousTokenEncoded = true; 164 // and add this to the text. 165 decodedText.append(decodedWord); 166 // we continue parsing from here...we allow parsing errors to fall through 167 // and get handled as normal text. 168 continue; 169 170 } catch (ParseException e) { 171 // just ignore it, skip to next word 172 } 173 } 174 // this is a normal token, so it doesn't matter what the previous token was. Add the white space 175 // if we have it. 176 if (startWhiteSpace != -1) { 177 decodedText.append(text.substring(startWhiteSpace, endWhiteSpace)); 178 startWhiteSpace = -1; 179 } 180 // this is not a decoded token. 181 previousTokenEncoded = false; 182 decodedText.append(word); 183 } 184 } 185 186 return decodedText.toString(); 187 } 188 189 /** 190 * Parse a string using the RFC 2047 rules for an "encoded-word" 191 * type. This encoding has the syntax: 192 * 193 * encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" 194 * 195 * @param word The possibly encoded word value. 196 * 197 * @return The decoded word. 198 * @throws ParseException 199 * @throws UnsupportedEncodingException 200 */ 201 private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException { 202 // encoded words start with the characters "=?". If this not an encoded word, we throw a 203 // ParseException for the caller. 204 205 if (!word.startsWith(ENCODED_TOKEN_MARKER)) { 206 throw new ParseException("Invalid RFC 2047 encoded-word: " + word); 207 } 208 209 int charsetPos = word.indexOf('?', 2); 210 if (charsetPos == -1) { 211 throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word); 212 } 213 214 // pull out the character set information (this is the MIME name at this point). 215 String charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH); 216 217 // now pull out the encoding token the same way. 218 int encodingPos = word.indexOf('?', charsetPos + 1); 219 if (encodingPos == -1) { 220 throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word); 221 } 222 223 String encoding = word.substring(charsetPos + 1, encodingPos); 224 225 // and finally the encoded text. 226 int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1); 227 if (encodedTextPos == -1) { 228 throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word); 229 } 230 231 String encodedText = word.substring(encodingPos + 1, encodedTextPos); 232 233 // seems a bit silly to encode a null string, but easy to deal with. 234 if (encodedText.length() == 0) { 235 return ""; 236 } 237 238 try { 239 // the decoder writes directly to an output stream. 240 ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length()); 241 242 byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET); 243 244 // Base64 encoded? 245 if (encoding.equals(BASE64_ENCODING_MARKER)) { 246 Base64Decoder.decode(encodedData, out); 247 } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable. 248 QuotedPrintableDecoder.decode(encodedData, out); 249 } else { 250 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding); 251 } 252 // get the decoded byte data and convert into a string. 253 byte[] decodedData = out.toByteArray(); 254 return new String(decodedData, javaCharset(charset)); 255 } catch (IOException e) { 256 throw new UnsupportedEncodingException("Invalid RFC 2047 encoding"); 257 } 258 } 259 260 /** 261 * Translate a MIME standard character set name into the Java 262 * equivalent. 263 * 264 * @param charset The MIME standard name. 265 * 266 * @return The Java equivalent for this name. 267 */ 268 private static String javaCharset(String charset) { 269 // nothing in, nothing out. 270 if (charset == null) { 271 return null; 272 } 273 274 String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH)); 275 // if there is no mapping, then the original name is used. Many of the MIME character set 276 // names map directly back into Java. The reverse isn't necessarily true. 277 if (mappedCharset == null) { 278 return charset; 279 } 280 return mappedCharset; 281 } 282 283}