84UInt8 SequenceLengthTable[256] =
86 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
99 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
100 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
101 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
107 const char* p = utf8str;
113 if (c < 0x80 || c > 0xBF)
131 return static_cast<UInt16
>(c);
145 const char* p = utf8char;
146 const UInt32 c0 =
static_cast<UInt8>(p[0]);
147 const UInt32 bad = 0xFFFFFFFF;
148 switch (SequenceLengthTable[c0])
161 const UInt32 c1 =
static_cast<UInt8>(p[1]);
162 return ((c0 & 0x1fu) << 6) | (c1 & 0x3fu);
167 if (p[1] ==
'\0' || p[2] ==
'\0')
171 const UInt32 c1 =
static_cast<UInt8>(p[1]);
172 const UInt32 c2 =
static_cast<UInt8>(p[2]);
173 return ((c0 & 0x0fu) << 12) | ((c1 & 0x3fu) << 6) | (c2 & 0x3fu);
178 if (p[1] ==
'\0' || p[2] ==
'\0' || p[3] ==
'\0')
183 const UInt32 c1 =
static_cast<UInt8>(p[1]);
184 const UInt32 c2 =
static_cast<UInt8>(p[2]);
185 const UInt32 c3 =
static_cast<UInt8>(p[3]);
187 return ((c0 & 0x03u) << 18) | ((c1 & 0x3fu) << 12) | ((c2 & 0x3fu) << 6) | (c3 & 0x3fu);
209 if (ucs4char < 0x80u)
212 sb +=
static_cast<char>(
static_cast<UInt8>(ucs4char));
214 else if (ucs4char < 0x800u)
216 sb +=
static_cast<char>(
static_cast<UInt8>(0xc0u | (ucs4char >> 6)));
217 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
219 else if (ucs4char < 0x10000u)
221 sb +=
static_cast<char>(
static_cast<UInt8>(0xe0u | (ucs4char >> 12)));
222 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
223 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
227 sb +=
static_cast<char>(
static_cast<UInt8>(0xf0u | (ucs4char >> 18)));
228 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 12) & 0x3fu)));
229 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
230 sb +=
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
239 if (ucs4char < 0x80u)
242 p[0] =
static_cast<char>(
static_cast<UInt8>(ucs4char));
244 else if (ucs4char < 0x800u)
246 p[0] =
static_cast<char>(
static_cast<UInt8>(0xc0u | (ucs4char >> 6)));
247 p[1] =
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
249 else if (ucs4char < 0x10000u)
251 p[0] =
static_cast<char>(
static_cast<UInt8>(0xe0u | (ucs4char >> 12)));
252 p[1] =
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
253 p[2] =
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
257 p[0] =
static_cast<char>(
static_cast<UInt8>(0xf0u | (ucs4char >> 18)));
258 p[1] =
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 12) & 0x3fu)));
259 p[2] =
static_cast<char>(
static_cast<UInt8>(0x80u | ((ucs4char >> 6) & 0x3fu)));
260 p[3] =
static_cast<char>(
static_cast<UInt8>(0x80u | (ucs4char & 0x3fu)));
// Converts the UTF-8 bytes of `input` into 16-bit values.
//
// The visible statements take the byte range from input.c_str() up to
// input.c_str() + input.length(), look up the current lead byte in
// SequenceLengthTable, and append decoded values to `rval`. Each failure path
// shown either raises InvalidUTF8Exception through BLOCXX_THROW or appends
// UCS2ReplacementChar (0xFFFD).
//
// NOTE(review): the declaration of `rval`, the loop, the case labels and the
// test that chooses between throwing and substituting are not visible in this
// excerpt; presumably `throwException` makes that choice -- confirm.
265Array<UInt16> StringToUCS2Common(
const String& input,
bool throwException)
// Value appended on the non-throwing failure paths below.
270 const UInt16 UCS2ReplacementChar = 0xFFFD;
271 const char* begin = input.c_str();
272 const char* end = begin + input.length();
274 const char* p = begin;
// Current byte, converted through UInt8 so it is never negative; its table
// entry drives the switch.
277 const UInt32 c0 =
static_cast<UInt8>(p[0]);
278 switch (SequenceLengthTable[c0])
// The message carries the table entry for c0, the whole input, and the text
// remaining at p.
293 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
294 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
298 rval.push_back(UCS2ReplacementChar);
// Two-byte form: low 5 bits of c0 followed by the low 6 bits of c1.
302 const UInt32 c1 =
static_cast<UInt8>(p[1]);
303 rval.push_back(((c0 & 0x1fu) << 6) | (c1 & 0x3fu));
// A NUL in either of the next two bytes leads to the failure handling that
// follows, before those bytes are combined further down.
310 if (p[1] ==
'\0' || p[2] ==
'\0')
314 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
315 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
319 rval.push_back(UCS2ReplacementChar);
// Three-byte form: low 4 bits of c0, then 6 bits each from c1 and c2.
324 const UInt32 c1 =
static_cast<UInt8>(p[1]);
325 const UInt32 c2 =
static_cast<UInt8>(p[2]);
326 rval.push_back(((c0 & 0x0fu) << 12) | ((c1 & 0x3fu) << 6) | (c2 & 0x3fu));
// NOTE(review): two further failure paths follow; the conditions that reach
// them are not visible in this excerpt.
335 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
336 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
340 rval.push_back(UCS2ReplacementChar);
350 BLOCXX_THROW(InvalidUTF8Exception, Format(
"Length: %1, input = %2, p = %3",
351 static_cast<int>(SequenceLengthTable[c0]), input.c_str(), p).c_str());
355 rval.push_back(UCS2ReplacementChar);
369 return StringToUCS2Common(input,
false);
375 return StringToUCS2Common(input,
true);
384 size_t numchars = inputLength/2;
386 for (
size_t i = 0; i < numchars; ++i)
388 UCS4toUTF8(
reinterpret_cast<const UInt16*
>(input)[i], sb);
// Classifies a code point against the thresholds 0x80, 0x800 and 0x10000.
// NOTE(review): the return statements are not visible in this excerpt.
// transformInPlace uses the result as a byte count, so this presumably
// returns the UTF-8 encoded length of `ucs4char` (1 to 4) -- confirm.
416int UTF8CharLen(UInt32 ucs4char)
// Value fits in 7 bits.
418 if (ucs4char < 0x80u)
// Value fits in 11 bits.
422 else if (ucs4char < 0x800u)
// Value fits in 16 bits.
426 else if (ucs4char < 0x10000u)
// Applies `transformer` to the characters of the buffer at `input`.
// `transformer` is called with a UInt32 code point and its result is used as
// the replacement code point.
// NOTE(review): the loop, the decode and encode calls, the comparison of the
// two lengths computed below and the return statements are not visible in
// this excerpt, so the meaning of the bool result cannot be confirmed here.
436template <
typename TransformT>
437bool transformInPlace(
char* input, TransformT transformer)
// Starts at the beginning of the same buffer; presumably the write position.
440 char* output = input;
// Presumably the decoder's failure sentinel -- confirm against the decoder.
444 if (ucs4char == 0xFFFFFFFF)
450 UInt32 newUcs4Char = transformer(ucs4char);
// Byte length of the original sequence, taken from its lead byte.
452 const UInt32 c0 =
static_cast<UInt8>(p[0]);
453 int prevCharLen = SequenceLengthTable[c0];
// Length reported by UTF8CharLen for the transformed code point.
454 int newCharLen = UTF8CharLen(newUcs4Char);
// Step past the bytes belonging to the transformed character.
470 output += newCharLen;
// Builds and returns a new String from the NUL-terminated text at `input`,
// using `transformer` (the call itself is not visible in this excerpt).
// NOTE(review): the loop, the decode call and the statements that append to
// `rval` are not visible here; presumably each decoded code point is passed
// through `transformer` and re-encoded into `rval` -- confirm.
476template <
typename TransformT>
477String transform(
const char* input, TransformT transformer)
// The buffer is constructed with the input's byte length as its argument.
479 StringBuffer rval(strlen(input));
480 const char* p = input;
// Presumably the decoder's failure sentinel -- confirm against the decoder.
484 if (ucs4char == 0xFFFFFFFF)
// Byte length of the current sequence, taken from its lead byte.
493 const UInt32 c0 =
static_cast<UInt8>(p[0]);
494 int prevCharLen = SequenceLengthTable[c0];
497 return rval.releaseString();
508const CaseMapping lowerMappings[] =
1266const CaseMapping upperMappings[] =
2033const CaseMapping*
const lowerMappingsEnd = lowerMappings +
2034 (
sizeof(lowerMappings)/
sizeof(lowerMappings[0]));
2036const CaseMapping*
const upperMappingsEnd = upperMappings +
2037 (
sizeof(upperMappings)/
sizeof(upperMappings[0]));
2039struct MappingOrdering
2041 bool operator()(
const CaseMapping& x,
const CaseMapping& y)
2043 return x.codePoint < y.codePoint;
2049 Transformer(
const CaseMapping*
const begin,
const CaseMapping*
const end)
// Searches the CaseMapping entries in [m_begin, m_end) for code point `in`.
// NOTE(review): the statements that produce the return value are not visible
// in this excerpt; presumably `in` is returned unchanged when no entry
// matches, and the matching entry's mapped value otherwise -- confirm.
2055 UInt32 operator()(UInt32 in)
const
// Probe value for the search; MappingOrdering compares codePoint only, so the
// second initializer is a placeholder.
2057 CaseMapping val = { in, 0 };
// Binary search: relies on the range being sorted by codePoint.
2058 const CaseMapping* i = std::lower_bound(m_begin, m_end, val, MappingOrdering());
// True when the range holds no entry whose codePoint equals `in`.
2059 if (i == m_end || i->codePoint != in)
2068 const CaseMapping*
const m_begin;
2069 const CaseMapping*
const m_end;
2077 return transformInPlace(input, Transformer(upperMappings, upperMappingsEnd));
2083 return transform(input, Transformer(upperMappings, upperMappingsEnd));
2089 return transformInPlace(input, Transformer(lowerMappings, lowerMappingsEnd));
2095 return transform(input, Transformer(lowerMappings, lowerMappingsEnd));