001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.fileupload.util.mime;
018
019import java.io.ByteArrayOutputStream;
020import java.io.UnsupportedEncodingException;
021import java.util.Base64;
022import java.util.HashMap;
023import java.util.Locale;
024import java.util.Map;
025
026/**
027 * Utility class to decode MIME texts.
028 *
029 * @since 1.3
030 */
031public final class MimeUtility {
032
033    /**
034     * The {@code US-ASCII} charset identifier constant.
035     */
036    private static final String US_ASCII_CHARSET = "US-ASCII";
037
038    /**
039     * The marker to indicate text is encoded with BASE64 algorithm.
040     */
041    private static final String BASE64_ENCODING_MARKER = "B";
042
043    /**
044     * The marker to indicate text is encoded with QuotedPrintable algorithm.
045     */
046    private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
047
048    /**
049     * If the text contains any encoded tokens, those tokens will be marked with "=?".
050     */
051    private static final String ENCODED_TOKEN_MARKER = "=?";
052
053    /**
054     * If the text contains any encoded tokens, those tokens will terminate with "=?".
055     */
056    private static final String ENCODED_TOKEN_FINISHER = "?=";
057
058    /**
059     * The linear whitespace chars sequence.
060     */
061    private static final String LINEAR_WHITESPACE = " \t\r\n";
062
063    /**
064     * Mappings between MIME and Java charset.
065     */
066    private static final Map<String, String> MIME2JAVA = new HashMap<>();
067
068    static {
069        MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
070        MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
071        MIME2JAVA.put("utf-8", "UTF8");
072        MIME2JAVA.put("utf8", "UTF8");
073        MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
074        MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
075        MIME2JAVA.put("euc-kr", "KSC5601");
076        MIME2JAVA.put("euckr", "KSC5601");
077        MIME2JAVA.put("us-ascii", "ISO-8859-1");
078        MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
079    }
080
081    /**
082     * Decode a string of text obtained from a mail header into
083     * its proper form.  The text generally will consist of a
084     * string of tokens, some of which may be encoded using
085     * base64 encoding.
086     *
087     * @param text   The text to decode.
088     * @return The decoded text string.
089     * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
090     */
091    public static String decodeText(final String text) throws UnsupportedEncodingException {
092        // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
093        // source string doesn't contain that sequent, no decoding is required.
094        if (!text.contains(ENCODED_TOKEN_MARKER)) {
095            return text;
096        }
097
098        int offset = 0;
099        final int endOffset = text.length();
100
101        int startWhiteSpace = -1;
102        int endWhiteSpace = -1;
103
104        final StringBuilder decodedText = new StringBuilder(text.length());
105
106        boolean previousTokenEncoded = false;
107
108        while (offset < endOffset) {
109            char ch = text.charAt(offset);
110
111            // is this a whitespace character?
112            if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
113                startWhiteSpace = offset;
114                while (offset < endOffset) {
115                    // step over the white space characters.
116                    ch = text.charAt(offset);
117                    if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
118                        // record the location of the first non lwsp and drop down to process the
119                        // token characters.
120                        endWhiteSpace = offset;
121                        break;
122                    }
123                    offset++;
124                }
125            } else {
126                // we have a word token.  We need to scan over the word and then try to parse it.
127                final int wordStart = offset;
128
129                while (offset < endOffset) {
130                    // step over the non white space characters.
131                    ch = text.charAt(offset);
132                    if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
133                        break;
134                    }
135                    offset++;
136
137                    //NB:  Trailing whitespace on these header strings will just be discarded.
138                }
139                // pull out the word token.
140                final String word = text.substring(wordStart, offset);
141                // is the token encoded?  decode the word
142                if (word.startsWith(ENCODED_TOKEN_MARKER)) {
143                    try {
144                        // if this gives a parsing failure, treat it like a non-encoded word.
145                        final String decodedWord = decodeWord(word);
146
147                        // are any whitespace characters significant?  Append 'em if we've got 'em.
148                        if (!previousTokenEncoded && startWhiteSpace != -1) {
149                            decodedText.append(text, startWhiteSpace, endWhiteSpace);
150                            startWhiteSpace = -1;
151                        }
152                        // this is definitely a decoded token.
153                        previousTokenEncoded = true;
154                        // and add this to the text.
155                        decodedText.append(decodedWord);
156                        // we continue parsing from here...we allow parsing errors to fall through
157                        // and get handled as normal text.
158                        continue;
159
160                    } catch (final ParseException e) {
161                        // just ignore it, skip to next word
162                    }
163                }
164                // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
165                // if we have it.
166                if (startWhiteSpace != -1) {
167                    decodedText.append(text, startWhiteSpace, endWhiteSpace);
168                    startWhiteSpace = -1;
169                }
170                // this is not a decoded token.
171                previousTokenEncoded = false;
172                decodedText.append(word);
173            }
174        }
175
176        return decodedText.toString();
177    }
178
179    /**
180     * Parse a string using the RFC 2047 rules for an "encoded-word"
181     * type.  This encoding has the syntax:
182     *
183     * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
184     *
185     * @param word   The possibly encoded word value.
186     * @return The decoded word.
187     * @throws ParseException in case of a parse error of the RFC 2047
188     * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found
189     */
190    private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
191        // encoded words start with the characters "=?".  If this not an encoded word, we throw a
192        // ParseException for the caller.
193
194        if (!word.startsWith(ENCODED_TOKEN_MARKER)) {
195            throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
196        }
197
198        final int charsetPos = word.indexOf('?', 2);
199        if (charsetPos == -1) {
200            throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
201        }
202
203        // pull out the character set information (this is the MIME name at this point).
204        final String charset = word.substring(2, charsetPos).toLowerCase(Locale.ROOT);
205
206        // now pull out the encoding token the same way.
207        final int encodingPos = word.indexOf('?', charsetPos + 1);
208        if (encodingPos == -1) {
209            throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
210        }
211
212        final String encoding = word.substring(charsetPos + 1, encodingPos);
213
214        // and finally the encoded text.
215        final int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
216        if (encodedTextPos == -1) {
217            throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
218        }
219
220        final String encodedText = word.substring(encodingPos + 1, encodedTextPos);
221
222        // seems a bit silly to encode a null string, but easy to deal with.
223        if (encodedText.isEmpty()) {
224            return "";
225        }
226
227        try {
228            // the decoder writes directly to an output stream.
229            final ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
230
231            final byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET);
232
233            // Base64 encoded?
234            if (encoding.equals(BASE64_ENCODING_MARKER)) {
235                out.write(Base64.getDecoder().decode(encodedData));
236            } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
237                QuotedPrintableDecoder.decode(encodedData, out);
238            } else {
239                throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
240            }
241            // get the decoded byte data and convert into a string.
242            final byte[] decodedData = out.toByteArray();
243            return new String(decodedData, javaCharset(charset));
244        } catch (final Exception e) {
245            throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
246        }
247    }
248
249    /**
250     * Translate a MIME standard character set name into the Java
251     * equivalent.
252     *
253     * @param charset The MIME standard name.
254     * @return The Java equivalent for this name.
255     */
256    private static String javaCharset(final String charset) {
257        // nothing in, nothing out.
258        if (charset == null) {
259            return null;
260        }
261
262        final String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ROOT));
263        // if there is no mapping, then the original name is used.  Many of the MIME character set
264        // names map directly back into Java.  The reverse isn't necessarily true.
265        if (mappedCharset == null) {
266            return charset;
267        }
268        return mappedCharset;
269    }
270
271    /**
272     * Hidden constructor, this class must not be instantiated.
273     */
274    private MimeUtility() {
275        // do nothing
276    }
277
278}