001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.fileupload.util.mime;
018
019import java.io.ByteArrayOutputStream;
020import java.io.IOException;
021import java.io.UnsupportedEncodingException;
022import java.util.HashMap;
023import java.util.Locale;
024import java.util.Map;
025
026/**
027 * Utility class to decode MIME texts.
028 *
029 * @since 1.3
030 */
031public final class MimeUtility {
032
033    /**
034     * The {@code US-ASCII} charset identifier constant.
035     */
036    private static final String US_ASCII_CHARSET = "US-ASCII";
037
038    /**
039     * The marker to indicate text is encoded with BASE64 algorithm.
040     */
041    private static final String BASE64_ENCODING_MARKER = "B";
042
043    /**
044     * The marker to indicate text is encoded with QuotedPrintable algorithm.
045     */
046    private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
047
048    /**
049     * If the text contains any encoded tokens, those tokens will be marked with "=?".
050     */
051    private static final String ENCODED_TOKEN_MARKER = "=?";
052
053    /**
054     * If the text contains any encoded tokens, those tokens will terminate with "=?".
055     */
056    private static final String ENCODED_TOKEN_FINISHER = "?=";
057
058    /**
059     * The linear whitespace chars sequence.
060     */
061    private static final String LINEAR_WHITESPACE = " \t\r\n";
062
063    /**
064     * Mappings between MIME and Java charset.
065     */
066    private static final Map<String, String> MIME2JAVA = new HashMap<String, String>();
067
068    static {
069        MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
070        MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
071        MIME2JAVA.put("utf-8", "UTF8");
072        MIME2JAVA.put("utf8", "UTF8");
073        MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
074        MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
075        MIME2JAVA.put("euc-kr", "KSC5601");
076        MIME2JAVA.put("euckr", "KSC5601");
077        MIME2JAVA.put("us-ascii", "ISO-8859-1");
078        MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
079    }
080
081    /**
082     * Hidden constructor, this class must not be instantiated.
083     */
084    private MimeUtility() {
085        // do nothing
086    }
087
088    /**
089     * Decode a string of text obtained from a mail header into
090     * its proper form.  The text generally will consist of a
091     * string of tokens, some of which may be encoded using
092     * base64 encoding.
093     *
094     * @param text   The text to decode.
095     *
096     * @return The decoded text string.
097     * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
098     */
099    public static String decodeText(String text) throws UnsupportedEncodingException {
100        // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
101        // source string doesn't contain that sequent, no decoding is required.
102        if (text.indexOf(ENCODED_TOKEN_MARKER) < 0) {
103            return text;
104        }
105
106        int offset = 0;
107        int endOffset = text.length();
108
109        int startWhiteSpace = -1;
110        int endWhiteSpace = -1;
111
112        StringBuilder decodedText = new StringBuilder(text.length());
113
114        boolean previousTokenEncoded = false;
115
116        while (offset < endOffset) {
117            char ch = text.charAt(offset);
118
119            // is this a whitespace character?
120            if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
121                startWhiteSpace = offset;
122                while (offset < endOffset) {
123                    // step over the white space characters.
124                    ch = text.charAt(offset);
125                    if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
126                        offset++;
127                    } else {
128                        // record the location of the first non lwsp and drop down to process the
129                        // token characters.
130                        endWhiteSpace = offset;
131                        break;
132                    }
133                }
134            } else {
135                // we have a word token.  We need to scan over the word and then try to parse it.
136                int wordStart = offset;
137
138                while (offset < endOffset) {
139                    // step over the non white space characters.
140                    ch = text.charAt(offset);
141                    if (LINEAR_WHITESPACE.indexOf(ch) == -1) { // not white space
142                        offset++;
143                    } else {
144                        break;
145                    }
146
147                    //NB:  Trailing whitespace on these header strings will just be discarded.
148                }
149                // pull out the word token.
150                String word = text.substring(wordStart, offset);
151                // is the token encoded?  decode the word
152                if (word.startsWith(ENCODED_TOKEN_MARKER)) {
153                    try {
154                        // if this gives a parsing failure, treat it like a non-encoded word.
155                        String decodedWord = decodeWord(word);
156
157                        // are any whitespace characters significant?  Append 'em if we've got 'em.
158                        if (!previousTokenEncoded && startWhiteSpace != -1) {
159                            decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
160                            startWhiteSpace = -1;
161                        }
162                        // this is definitely a decoded token.
163                        previousTokenEncoded = true;
164                        // and add this to the text.
165                        decodedText.append(decodedWord);
166                        // we continue parsing from here...we allow parsing errors to fall through
167                        // and get handled as normal text.
168                        continue;
169
170                    } catch (ParseException e) {
171                        // just ignore it, skip to next word
172                    }
173                }
174                // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
175                // if we have it.
176                if (startWhiteSpace != -1) {
177                    decodedText.append(text.substring(startWhiteSpace, endWhiteSpace));
178                    startWhiteSpace = -1;
179                }
180                // this is not a decoded token.
181                previousTokenEncoded = false;
182                decodedText.append(word);
183            }
184        }
185
186        return decodedText.toString();
187    }
188
189    /**
190     * Parse a string using the RFC 2047 rules for an "encoded-word"
191     * type.  This encoding has the syntax:
192     *
193     * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
194     *
195     * @param word   The possibly encoded word value.
196     *
197     * @return The decoded word.
198     * @throws ParseException
199     * @throws UnsupportedEncodingException
200     */
201    private static String decodeWord(String word) throws ParseException, UnsupportedEncodingException {
202        // encoded words start with the characters "=?".  If this not an encoded word, we throw a
203        // ParseException for the caller.
204
205        if (!word.startsWith(ENCODED_TOKEN_MARKER)) {
206            throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
207        }
208
209        int charsetPos = word.indexOf('?', 2);
210        if (charsetPos == -1) {
211            throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
212        }
213
214        // pull out the character set information (this is the MIME name at this point).
215        String charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH);
216
217        // now pull out the encoding token the same way.
218        int encodingPos = word.indexOf('?', charsetPos + 1);
219        if (encodingPos == -1) {
220            throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
221        }
222
223        String encoding = word.substring(charsetPos + 1, encodingPos);
224
225        // and finally the encoded text.
226        int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
227        if (encodedTextPos == -1) {
228            throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
229        }
230
231        String encodedText = word.substring(encodingPos + 1, encodedTextPos);
232
233        // seems a bit silly to encode a null string, but easy to deal with.
234        if (encodedText.length() == 0) {
235            return "";
236        }
237
238        try {
239            // the decoder writes directly to an output stream.
240            ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
241
242            byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET);
243
244            // Base64 encoded?
245            if (encoding.equals(BASE64_ENCODING_MARKER)) {
246                Base64Decoder.decode(encodedData, out);
247            } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
248                QuotedPrintableDecoder.decode(encodedData, out);
249            } else {
250                throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
251            }
252            // get the decoded byte data and convert into a string.
253            byte[] decodedData = out.toByteArray();
254            return new String(decodedData, javaCharset(charset));
255        } catch (IOException e) {
256            throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
257        }
258    }
259
260    /**
261     * Translate a MIME standard character set name into the Java
262     * equivalent.
263     *
264     * @param charset The MIME standard name.
265     *
266     * @return The Java equivalent for this name.
267     */
268    private static String javaCharset(String charset) {
269        // nothing in, nothing out.
270        if (charset == null) {
271            return null;
272        }
273
274        String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
275        // if there is no mapping, then the original name is used.  Many of the MIME character set
276        // names map directly back into Java.  The reverse isn't necessarily true.
277        if (mappedCharset == null) {
278            return charset;
279        }
280        return mappedCharset;
281    }
282
283}