Boost.Locale
utf.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0.
5 // https://www.boost.org/LICENSE_1_0.txt
6 
7 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
8 #define BOOST_LOCALE_UTF_HPP_INCLUDED
9 
10 #include <boost/locale/config.hpp>
11 #include <boost/cstdint.hpp>
12 
13 namespace boost {
14 namespace locale {
20 namespace utf {
// Branch-prediction hints used by the decoders below.
// When the compiler defines __GNUC__ the condition is routed through
// __builtin_expect; otherwise both macros expand to the bare condition.
#if defined(__GNUC__)
#define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#define BOOST_LOCALE_LIKELY(x)   (x)
#define BOOST_LOCALE_UNLIKELY(x) (x)
#endif
29 
/// \brief The integral type that can hold a Unicode code point
typedef uint32_t code_point;

/// \brief Special constant that defines illegal code point
static const code_point illegal = 0xFFFFFFFFu;

/// \brief Special constant that defines incomplete code point
static const code_point incomplete = 0xFFFFFFFEu;

/// \brief The function checks if \a v is a valid code point
///
/// \param v value to test
/// \return false when \a v is above 0x10FFFF or lies in the surrogate
///         range 0xD800..0xDFFF, true otherwise
///
/// Declared inline because it is defined in a header.
inline bool is_valid_codepoint(code_point v)
{
    if(v>0x10FFFF)
        return false;
    if(0xD800 <=v && v<= 0xDFFF) // surrogates
        return false;
    return true;
}
57 
58  #ifdef BOOST_LOCALE_DOXYGEN
59  template<typename CharType,int size=sizeof(CharType)>
63  struct utf_traits {
67  typedef CharType char_type;
82  template<typename Iterator>
83  static code_point decode(Iterator &p,Iterator e);
84 
92  static const int max_width;
99  static int width(code_point value);
100 
106  static int trail_length(char_type c);
110  static bool is_trail(char_type c);
114  static bool is_lead(char_type c);
115 
126  template<typename Iterator>
127  static Iterator encode(code_point value,Iterator out);
133  template<typename Iterator>
134  static code_point decode_valid(Iterator &p);
135  };
136 
137  #else
138 
139  template<typename CharType,int size=sizeof(CharType)>
140  struct utf_traits;
141 
142  template<typename CharType>
143  struct utf_traits<CharType,1> {
144 
145  typedef CharType char_type;
146 
147  static int trail_length(char_type ci)
148  {
149  unsigned char c = ci;
150  if(c < 128)
151  return 0;
152  if(BOOST_LOCALE_UNLIKELY(c < 194))
153  return -1;
154  if(c < 224)
155  return 1;
156  if(c < 240)
157  return 2;
158  if(BOOST_LOCALE_LIKELY(c <=244))
159  return 3;
160  return -1;
161  }
162 
163  static const int max_width = 4;
164 
165  static int width(code_point value)
166  {
167  if(value <=0x7F) {
168  return 1;
169  }
170  else if(value <=0x7FF) {
171  return 2;
172  }
173  else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
174  return 3;
175  }
176  else {
177  return 4;
178  }
179  }
180 
181  static bool is_trail(char_type ci)
182  {
183  unsigned char c=ci;
184  return (c & 0xC0)==0x80;
185  }
186 
187  static bool is_lead(char_type ci)
188  {
189  return !is_trail(ci);
190  }
191 
192  template<typename Iterator>
193  static code_point decode(Iterator &p,Iterator e)
194  {
195  if(BOOST_LOCALE_UNLIKELY(p==e))
196  return incomplete;
197 
198  unsigned char lead = *p++;
199 
200  // First byte is fully validated here
201  int trail_size = trail_length(lead);
202 
203  if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
204  return illegal;
205 
206  //
207  // Ok as only ASCII may be of size = 0
208  // also optimize for ASCII text
209  //
210  if(trail_size == 0)
211  return lead;
212 
213  code_point c = lead & ((1 << (6-trail_size))-1);
214 
215  // Read the rest
216  unsigned char tmp;
217  switch(trail_size) {
218  case 3:
219  if(BOOST_LOCALE_UNLIKELY(p==e))
220  return incomplete;
221  tmp = *p++;
222  if (!is_trail(tmp))
223  return illegal;
224  c = (c << 6) | ( tmp & 0x3F);
225  BOOST_FALLTHROUGH;
226  case 2:
227  if(BOOST_LOCALE_UNLIKELY(p==e))
228  return incomplete;
229  tmp = *p++;
230  if (!is_trail(tmp))
231  return illegal;
232  c = (c << 6) | ( tmp & 0x3F);
233  BOOST_FALLTHROUGH;
234  case 1:
235  if(BOOST_LOCALE_UNLIKELY(p==e))
236  return incomplete;
237  tmp = *p++;
238  if (!is_trail(tmp))
239  return illegal;
240  c = (c << 6) | ( tmp & 0x3F);
241  }
242 
243  // Check code point validity: no surrogates and
244  // valid range
245  if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
246  return illegal;
247 
248  // make sure it is the most compact representation
249  if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
250  return illegal;
251 
252  return c;
253 
254  }
255 
256  template<typename Iterator>
257  static code_point decode_valid(Iterator &p)
258  {
259  unsigned char lead = *p++;
260  if(lead < 192)
261  return lead;
262 
263  int trail_size;
264 
265  if(lead < 224)
266  trail_size = 1;
267  else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
268  trail_size = 2;
269  else
270  trail_size = 3;
271 
272  code_point c = lead & ((1 << (6-trail_size))-1);
273 
274  switch(trail_size) {
275  case 3:
276  c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
277  BOOST_FALLTHROUGH;
278  case 2:
279  c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
280  BOOST_FALLTHROUGH;
281  case 1:
282  c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
283  }
284 
285  return c;
286  }
287 
288 
289 
290  template<typename Iterator>
291  static Iterator encode(code_point value,Iterator out)
292  {
293  if(value <= 0x7F) {
294  *out++ = static_cast<char_type>(value);
295  }
296  else if(value <= 0x7FF) {
297  *out++ = static_cast<char_type>((value >> 6) | 0xC0);
298  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
299  }
300  else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
301  *out++ = static_cast<char_type>((value >> 12) | 0xE0);
302  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
303  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
304  }
305  else {
306  *out++ = static_cast<char_type>((value >> 18) | 0xF0);
307  *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
308  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
309  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
310  }
311  return out;
312  }
313  }; // utf8
314 
315  template<typename CharType>
316  struct utf_traits<CharType,2> {
317  typedef CharType char_type;
318 
319  // See RFC 2781
320  static bool is_first_surrogate(uint16_t x)
321  {
322  return 0xD800 <=x && x<= 0xDBFF;
323  }
324  static bool is_second_surrogate(uint16_t x)
325  {
326  return 0xDC00 <=x && x<= 0xDFFF;
327  }
328  static code_point combine_surrogate(uint16_t w1,uint16_t w2)
329  {
330  return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
331  }
332  static int trail_length(char_type c)
333  {
334  if(is_first_surrogate(c))
335  return 1;
336  if(is_second_surrogate(c))
337  return -1;
338  return 0;
339  }
343  static bool is_trail(char_type c)
344  {
345  return is_second_surrogate(c);
346  }
350  static bool is_lead(char_type c)
351  {
352  return !is_second_surrogate(c);
353  }
354 
355  template<typename It>
356  static code_point decode(It &current,It last)
357  {
358  if(BOOST_LOCALE_UNLIKELY(current == last))
359  return incomplete;
360  uint16_t w1=*current++;
361  if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
362  return w1;
363  }
364  if(w1 > 0xDBFF)
365  return illegal;
366  if(current==last)
367  return incomplete;
368  uint16_t w2=*current++;
369  if(w2 < 0xDC00 || 0xDFFF < w2)
370  return illegal;
371  return combine_surrogate(w1,w2);
372  }
373  template<typename It>
374  static code_point decode_valid(It &current)
375  {
376  uint16_t w1=*current++;
377  if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
378  return w1;
379  }
380  uint16_t w2=*current++;
381  return combine_surrogate(w1,w2);
382  }
383 
384  static const int max_width = 2;
385  static int width(code_point u)
386  {
387  return u>=0x10000 ? 2 : 1;
388  }
389  template<typename It>
390  static It encode(code_point u,It out)
391  {
392  if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
393  *out++ = static_cast<char_type>(u);
394  }
395  else {
396  u -= 0x10000;
397  *out++ = static_cast<char_type>(0xD800 | (u>>10));
398  *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
399  }
400  return out;
401  }
402  }; // utf16;
403 
404 
405  template<typename CharType>
406  struct utf_traits<CharType,4> {
407  typedef CharType char_type;
408  static int trail_length(char_type c)
409  {
410  if(is_valid_codepoint(c))
411  return 0;
412  return -1;
413  }
414  static bool is_trail(char_type /*c*/)
415  {
416  return false;
417  }
418  static bool is_lead(char_type /*c*/)
419  {
420  return true;
421  }
422 
423  template<typename It>
424  static code_point decode_valid(It &current)
425  {
426  return *current++;
427  }
428 
429  template<typename It>
430  static code_point decode(It &current,It last)
431  {
432  if(BOOST_LOCALE_UNLIKELY(current == last))
434  code_point c=*current++;
435  if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
437  return c;
438  }
439  static const int max_width = 1;
440  static int width(code_point /*u*/)
441  {
442  return 1;
443  }
444  template<typename It>
445  static It encode(code_point u,It out)
446  {
447  *out++ = static_cast<char_type>(u);
448  return out;
449  }
450 
451  }; // utf32
452 
453  #endif
454 
455 
456 } // utf
457 } // locale
458 } // boost
459 
460 
461 #endif
462 
463 
static code_point decode(Iterator &p, Iterator e)
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:49
static Iterator encode(code_point value, Iterator out)
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:44
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:34
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:39
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:63
static int width(code_point value)
CharType char_type
Definition: utf.hpp:67
static int trail_length(char_type c)
static bool is_lead(char_type c)
static code_point decode_valid(Iterator &p)
static bool is_trail(char_type c)
static const int max_width
Definition: utf.hpp:92