Boost.Locale
generic_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0.
5 // https://www.boost.org/LICENSE_1_0.txt
6 
7 #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
8 #define BOOST_LOCALE_GENERIC_CODECVT_HPP
9 
10 #include <boost/locale/utf.hpp>
11 #include <boost/cstdint.hpp>
12 #include <boost/static_assert.hpp>
13 #include <locale>
14 
15 namespace boost {
16 namespace locale {
17 
18 #ifndef BOOST_LOCALE_DOXYGEN
19 //
20 // Make sure that mbstate_t can hold the 16 bits of a UTF-16 sequence
21 //
22 BOOST_STATIC_ASSERT(sizeof(std::mbstate_t)>=2);
23 #endif
24 
25 #if defined(_MSC_VER) && _MSC_VER < 1700
26 // Up to MSVC 11 (2012), do_length is non-standard: it counts wide characters instead of narrow ones and does not change mbstate
27 #define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
28 #endif
29 
34 public:
41  };
42 };
43 
/// \brief Generic codecvt facet, declared here and defined only through partial specializations.
///
/// \tparam CharType    character type of the wide side of the conversion
/// \tparam CodecvtImpl implementation class that the specializations downcast to
/// \tparam CharSize    selects the specialization; defaults to sizeof(CharType)
template<typename CharType,typename CodecvtImpl,int CharSize=sizeof(CharType)>
class generic_codecvt;
140 
149 template<typename CharType,typename CodecvtImpl>
150 class generic_codecvt<CharType,CodecvtImpl,2> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
151 {
152 public:
153 
154  typedef CharType uchar;
155 
156  generic_codecvt(size_t refs = 0) :
157  std::codecvt<CharType,char,std::mbstate_t>(refs)
158  {
159  }
160  CodecvtImpl const &implementation() const
161  {
162  return *static_cast<CodecvtImpl const *>(this);
163  }
164 
165 protected:
166 
167 
168  std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const BOOST_OVERRIDE
169  {
170  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&s);
171 #ifdef DEBUG_CODECVT
172  std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
173 #endif
174  if(state != 0)
175  return std::codecvt_base::error;
176  next=from;
177  return std::codecvt_base::ok;
178  }
179  int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE
180  {
181  return 0;
182  }
183  int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE
184  {
185  return implementation().max_encoding_length();
186  }
187  bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE
188  {
189  return false;
190  }
191 
192  int
193  do_length( std::mbstate_t
194  #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
195  const
196  #endif
197  &std_state,
198  char const *from,
199  char const *from_end,
200  size_t max) const BOOST_OVERRIDE
201  {
202  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
203  char const *save_from = from;
204  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
205  #else
206  size_t save_max = max;
207  boost::uint16_t state = *reinterpret_cast<boost::uint16_t const *>(&std_state);
208  #endif
209 
210  typename CodecvtImpl::state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
211  while(max > 0 && from < from_end){
212  char const *prev_from = from;
213  boost::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
215  from = prev_from;
216  break;
217  }
218  max --;
219  if(ch > 0xFFFF) {
220  if(state == 0) {
221  from = prev_from;
222  state = 1;
223  }
224  else {
225  state = 0;
226  }
227  }
228  }
229  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
230  return static_cast<int>(from - save_from);
231  #else
232  return static_cast<int>(save_max - max);
233  #endif
234  }
235 
236 
237  std::codecvt_base::result
238  do_in( std::mbstate_t &std_state,
239  char const *from,
240  char const *from_end,
241  char const *&from_next,
242  uchar *to,
243  uchar *to_end,
244  uchar *&to_next) const BOOST_OVERRIDE
245  {
246  std::codecvt_base::result r=std::codecvt_base::ok;
247 
248  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
249  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
250  //
251  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd
252  // and first pair is written, but no input consumed
253  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
254  typename CodecvtImpl::state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
255  while(to < to_end && from < from_end)
256  {
257 #ifdef DEBUG_CODECVT
258  std::cout << "Entering IN--------------\n";
259  std::cout << "State " << std::hex << state << std::endl;
260  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
261 #endif
262  char const *from_saved = from;
263 
264  uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
265 
267  from = from_saved;
268  r=std::codecvt_base::error;
269  break;
270  }
272  from = from_saved;
273  r=std::codecvt_base::partial;
274  break;
275  }
276  // Normal codepoints go direcly to stream
277  if(ch <= 0xFFFF) {
278  *to++ = static_cast<uchar>(ch);
279  }
280  else {
281  // for other codepoints we do following
282  //
283  // 1. We can't consume our input as we may find ourselfs
284  // in state where all input consumed but not all output written,i.e. only
285  // 1st pair is written
286  // 2. We only write first pair and mark this in the state, we also revert back
287  // the from pointer in order to make sure this codepoint would be read
288  // once again and then we would consume our input together with writing
289  // second surrogate pair
290  ch-=0x10000;
291  boost::uint16_t w1 = static_cast<boost::uint16_t>(0xD800 | (ch >> 10));
292  boost::uint16_t w2 = static_cast<boost::uint16_t>(0xDC00 | (ch & 0x3FF));
293  if(state == 0) {
294  from = from_saved;
295  *to++ = w1;
296  state = 1;
297  }
298  else {
299  *to++ = w2;
300  state = 0;
301  }
302  }
303  }
304  from_next=from;
305  to_next=to;
306  if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
307  r = std::codecvt_base::partial;
308 #ifdef DEBUG_CODECVT
309  std::cout << "Returning ";
310  switch(r) {
311  case std::codecvt_base::ok:
312  std::cout << "ok\n";
313  break;
314  case std::codecvt_base::partial:
315  std::cout << "partial\n";
316  break;
317  case std::codecvt_base::error:
318  std::cout << "error\n";
319  break;
320  default:
321  std::cout << "other\n";
322  break;
323  }
324  std::cout << "State " << std::hex << state << std::endl;
325  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
326 #endif
327  return r;
328  }
329 
330  std::codecvt_base::result
331  do_out( std::mbstate_t &std_state,
332  uchar const *from,
333  uchar const *from_end,
334  uchar const *&from_next,
335  char *to,
336  char *to_end,
337  char *&to_next) const BOOST_OVERRIDE
338  {
339  std::codecvt_base::result r=std::codecvt_base::ok;
340  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
341  // according to standard. We assume that sizeof(mbstate_t) >=2 in order
342  // to be able to store first observed surrogate pair
343  //
344  // State: state!=0 - a first surrogate pair was observerd (state = first pair),
345  // we expect the second one to come and then zero the state
347  boost::uint16_t &state = *reinterpret_cast<boost::uint16_t *>(&std_state);
348  typename CodecvtImpl::state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
349  while(to < to_end && from < from_end)
350  {
351 #ifdef DEBUG_CODECVT
352  std::cout << "Entering OUT --------------\n";
353  std::cout << "State " << std::hex << state << std::endl;
354  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
355 #endif
356  boost::uint32_t ch=0;
357  if(state != 0) {
358  // if the state indicates that 1st surrogate pair was written
359  // we should make sure that the second one that comes is actually
360  // second surrogate
361  boost::uint16_t w1 = state;
362  boost::uint16_t w2 = *from;
363  // we don't forward from as writing may fail to incomplete or
364  // partial conversion
365  if(0xDC00 <= w2 && w2<=0xDFFF) {
366  boost::uint16_t vh = w1 - 0xD800;
367  boost::uint16_t vl = w2 - 0xDC00;
368  ch=((uint32_t(vh) << 10) | vl) + 0x10000;
369  }
370  else {
371  // Invalid surrogate
372  r=std::codecvt_base::error;
373  break;
374  }
375  }
376  else {
377  ch = *from;
378  if(0xD800 <= ch && ch<=0xDBFF) {
379  // if this is a first surrogate pair we put
380  // it into the state and consume it, note we don't
381  // go forward as it should be illegal so we increase
382  // the from pointer manually
383  state = static_cast<uint16_t>(ch);
384  from++;
385  continue;
386  }
387  else if(0xDC00 <= ch && ch<=0xDFFF) {
388  // if we observe second surrogate pair and
389  // first only may be expected we should break from the loop with error
390  // as it is illegal input
391  r=std::codecvt_base::error;
392  break;
393  }
394  }
396  r=std::codecvt_base::error;
397  break;
398  }
399  boost::uint32_t len = implementation().from_unicode(cvt_state,ch,to,to_end);
400  if(len == boost::locale::utf::incomplete) {
401  r=std::codecvt_base::partial;
402  break;
403  }
404  else if(len == boost::locale::utf::illegal) {
405  r=std::codecvt_base::error;
406  break;
407  }
408  else
409  to+= len;
410  state = 0;
411  from++;
412  }
413  from_next=from;
414  to_next=to;
415  if(r==std::codecvt_base::ok && from!=from_end)
416  r = std::codecvt_base::partial;
417 #ifdef DEBUG_CODECVT
418  std::cout << "Returning ";
419  switch(r) {
420  case std::codecvt_base::ok:
421  std::cout << "ok\n";
422  break;
423  case std::codecvt_base::partial:
424  std::cout << "partial\n";
425  break;
426  case std::codecvt_base::error:
427  std::cout << "error\n";
428  break;
429  default:
430  std::cout << "other\n";
431  break;
432  }
433  std::cout << "State " << std::hex << state << std::endl;
434  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
435 #endif
436  return r;
437  }
438 
439 };
440 
447 template<typename CharType,typename CodecvtImpl>
448 class generic_codecvt<CharType,CodecvtImpl,4> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
449 {
450 public:
451  typedef CharType uchar;
452 
453  generic_codecvt(size_t refs = 0) :
454  std::codecvt<CharType,char,std::mbstate_t>(refs)
455  {
456  }
457 
458  CodecvtImpl const &implementation() const
459  {
460  return *static_cast<CodecvtImpl const *>(this);
461  }
462 
463 protected:
464 
465  std::codecvt_base::result do_unshift(std::mbstate_t &/*s*/,char *from,char * /*to*/,char *&next) const BOOST_OVERRIDE
466  {
467  next=from;
468  return std::codecvt_base::ok;
469  }
470  int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE
471  {
472  return 0;
473  }
474  int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE
475  {
476  return implementation().max_encoding_length();
477  }
478  bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE
479  {
480  return false;
481  }
482 
483  int
484  do_length( std::mbstate_t
485  #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
486  const
487  #endif
488  &/*state*/,
489  char const *from,
490  char const *from_end,
491  size_t max) const BOOST_OVERRIDE
492  {
493  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
494  char const *start_from = from;
495  #else
496  size_t save_max = max;
497  #endif
498  typename CodecvtImpl::state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
499  while(max > 0 && from < from_end){
500  char const *save_from = from;
501  boost::uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
503  from = save_from;
504  break;
505  }
506  max--;
507  }
508  #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
509  return from - start_from;
510  #else
511  return save_max - max;
512  #endif
513  }
514 
515 
516  std::codecvt_base::result
517  do_in( std::mbstate_t &/*state*/,
518  char const *from,
519  char const *from_end,
520  char const *&from_next,
521  uchar *to,
522  uchar *to_end,
523  uchar *&to_next) const BOOST_OVERRIDE
524  {
525  std::codecvt_base::result r=std::codecvt_base::ok;
526 
527  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
528  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
529  //
530  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd
531  // and first pair is written, but no input consumed
532  typedef typename CodecvtImpl::state_type state_type;
533  state_type cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
534  while(to < to_end && from < from_end)
535  {
536 #ifdef DEBUG_CODECVT
537  std::cout << "Entering IN--------------\n";
538  std::cout << "State " << std::hex << state << std::endl;
539  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
540 #endif
541  char const *from_saved = from;
542 
543  uint32_t ch=implementation().to_unicode(cvt_state,from,from_end);
544 
546  r=std::codecvt_base::error;
547  from = from_saved;
548  break;
549  }
551  r=std::codecvt_base::partial;
552  from=from_saved;
553  break;
554  }
555  *to++=ch;
556  }
557  from_next=from;
558  to_next=to;
559  if(r == std::codecvt_base::ok && from!=from_end)
560  r = std::codecvt_base::partial;
561 #ifdef DEBUG_CODECVT
562  std::cout << "Returning ";
563  switch(r) {
564  case std::codecvt_base::ok:
565  std::cout << "ok\n";
566  break;
567  case std::codecvt_base::partial:
568  std::cout << "partial\n";
569  break;
570  case std::codecvt_base::error:
571  std::cout << "error\n";
572  break;
573  default:
574  std::cout << "other\n";
575  break;
576  }
577  std::cout << "State " << std::hex << state << std::endl;
578  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
579 #endif
580  return r;
581  }
582 
583  std::codecvt_base::result
584  do_out( std::mbstate_t &/*std_state*/,
585  uchar const *from,
586  uchar const *from_end,
587  uchar const *&from_next,
588  char *to,
589  char *to_end,
590  char *&to_next) const BOOST_OVERRIDE
591  {
592  std::codecvt_base::result r=std::codecvt_base::ok;
593  typedef typename CodecvtImpl::state_type state_type;
594  state_type cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
595  while(to < to_end && from < from_end)
596  {
597 #ifdef DEBUG_CODECVT
598  std::cout << "Entering OUT --------------\n";
599  std::cout << "State " << std::hex << state << std::endl;
600  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
601 #endif
602  boost::uint32_t ch=0;
603  ch = *from;
605  r=std::codecvt_base::error;
606  break;
607  }
608  boost::uint32_t len = implementation().from_unicode(cvt_state,ch,to,to_end);
609  if(len == boost::locale::utf::incomplete) {
610  r=std::codecvt_base::partial;
611  break;
612  }
613  else if(len == boost::locale::utf::illegal) {
614  r=std::codecvt_base::error;
615  break;
616  }
617  to+=len;
618  from++;
619  }
620  from_next=from;
621  to_next=to;
622  if(r==std::codecvt_base::ok && from!=from_end)
623  r = std::codecvt_base::partial;
624 #ifdef DEBUG_CODECVT
625  std::cout << "Returning ";
626  switch(r) {
627  case std::codecvt_base::ok:
628  std::cout << "ok\n";
629  break;
630  case std::codecvt_base::partial:
631  std::cout << "partial\n";
632  break;
633  case std::codecvt_base::error:
634  std::cout << "error\n";
635  break;
636  default:
637  std::cout << "other\n";
638  break;
639  }
640  std::cout << "State " << std::hex << state << std::endl;
641  std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
642 #endif
643  return r;
644  }
645 };
646 
647 
648 template<typename CharType,typename CodecvtImpl>
649 class generic_codecvt<CharType,CodecvtImpl,1> : public std::codecvt<CharType,char,std::mbstate_t>, public generic_codecvt_base
650 {
651 public:
652  typedef CharType uchar;
653 
654  CodecvtImpl const &implementation() const
655  {
656  return *static_cast<CodecvtImpl const *>(this);
657  }
658 
659  generic_codecvt(size_t refs = 0) : std::codecvt<char,char,std::mbstate_t>(refs)
660  {
661  }
662 };
663 
664 } // locale
665 } // namespace boost
666 
667 #endif
668 
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:49
The state would be used by to_unicode functions.
Definition: generic_codecvt.hpp:39
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:44
initial_convertion_state
Definition: generic_codecvt.hpp:38
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:39
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.hpp:33
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t,...
Definition: generic_codecvt.hpp:139
The state would be used by from_unicode functions.
Definition: generic_codecvt.hpp:40