Boost.Nowide
utf8_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 //
8 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
9 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10 
#include <boost/nowide/detail/utf.hpp>
#include <boost/nowide/replacement.hpp>
#include <boost/cstdint.hpp>
#include <boost/static_assert.hpp>
#include <locale>
16 
17 namespace boost {
18 namespace nowide {
19 
// Make sure that std::mbstate_t is large enough to hold one 16-bit UTF-16 code unit
21  BOOST_STATIC_ASSERT(sizeof(std::mbstate_t) >= 2);
22  namespace detail {
/// Copies exactly two bytes from @p src to @p dst.
///
/// Written by hand so that <cstring> does not have to be included just for std::memcpy.
inline void copy_uint16_t(void* dst, const void* src)
{
    const unsigned char* in = static_cast<const unsigned char*>(src);
    unsigned char* out = static_cast<unsigned char*>(dst);
    for(int i = 0; i < 2; ++i)
        out[i] = in[i];
}
31  inline boost::uint16_t read_state(const std::mbstate_t& src)
32  {
33  boost::uint16_t dst;
34  copy_uint16_t(&dst, &src);
35  return dst;
36  }
37  inline void write_state(std::mbstate_t& dst, const boost::uint16_t src)
38  {
39  copy_uint16_t(&dst, &src);
40  }
41  } // namespace detail
42 
43 #if defined _MSC_VER && _MSC_VER < 1700
// Before MSVC 11 (VS2012) do_length is non-standard: it counts wide characters instead of
// narrow ones and takes the mbstate by const reference, so it cannot update the state.
45 #define BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
46 #endif
47 
54  template<typename CharType, int CharSize = sizeof(CharType)>
55  class utf8_codecvt;
56 
58  template<typename CharType>
59  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
60  {
61  public:
62  BOOST_STATIC_ASSERT_MSG(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
63 
64  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
65  {}
66 
67  protected:
68  typedef CharType uchar;
69 
70  virtual std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const
71  {
72  if(detail::read_state(s) != 0)
73  return std::codecvt_base::error;
74  next = from;
75  return std::codecvt_base::ok;
76  }
77  virtual int do_encoding() const throw()
78  {
79  return 0;
80  }
81  virtual int do_max_length() const throw()
82  {
83  return 4;
84  }
85  virtual bool do_always_noconv() const throw()
86  {
87  return false;
88  }
89 
90  virtual int do_length(std::mbstate_t
91 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
92  const
93 #endif
94  & std_state,
95  const char* from,
96  const char* from_end,
97  size_t max) const
98  {
99  boost::uint16_t state = detail::read_state(std_state);
100 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
101  const char* save_from = from;
102 #else
103  size_t save_max = max;
104 #endif
105  while(max > 0 && from < from_end)
106  {
107  const char* prev_from = from;
108  boost::uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
109  if(ch == detail::utf::illegal)
110  {
112  } else if(ch == detail::utf::incomplete)
113  {
114  from = prev_from;
115  break;
116  }
117  max--;
118  if(ch > 0xFFFF)
119  {
120  if(state == 0)
121  {
122  from = prev_from;
123  state = 1;
124  } else
125  {
126  state = 0;
127  }
128  }
129  }
130 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
131  detail::write_state(std_state, state);
132  return static_cast<int>(from - save_from);
133 #else
134  return static_cast<int>(save_max - max);
135 #endif
136  }
137 
138  virtual std::codecvt_base::result do_in(std::mbstate_t& std_state,
139  const char* from,
140  const char* from_end,
141  const char*& from_next,
142  uchar* to,
143  uchar* to_end,
144  uchar*& to_next) const
145  {
146  std::codecvt_base::result r = std::codecvt_base::ok;
147 
148  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
149  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
150  //
151  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
152  // and first pair is written, but no input consumed
153  boost::uint16_t state = detail::read_state(std_state);
154  while(to < to_end && from < from_end)
155  {
156  const char* from_saved = from;
157 
158  uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
159 
160  if(ch == detail::utf::illegal)
161  {
163  } else if(ch == detail::utf::incomplete)
164  {
165  from = from_saved;
166  r = std::codecvt_base::partial;
167  break;
168  }
169  // Normal codepoints go directly to stream
170  if(ch <= 0xFFFF)
171  {
172  *to++ = static_cast<CharType>(ch);
173  } else
174  {
175  // for other codepoints we do following
176  //
177  // 1. We can't consume our input as we may find ourself
178  // in state where all input consumed but not all output written,i.e. only
179  // 1st pair is written
180  // 2. We only write first pair and mark this in the state, we also revert back
181  // the from pointer in order to make sure this codepoint would be read
182  // once again and then we would consume our input together with writing
183  // second surrogate pair
184  ch -= 0x10000;
185  boost::uint16_t vh = static_cast<boost::uint16_t>(ch >> 10);
186  boost::uint16_t vl = ch & 0x3FF;
187  boost::uint16_t w1 = vh + 0xD800;
188  boost::uint16_t w2 = vl + 0xDC00;
189  if(state == 0)
190  {
191  from = from_saved;
192  *to++ = static_cast<CharType>(w1);
193  state = 1;
194  } else
195  {
196  *to++ = static_cast<CharType>(w2);
197  state = 0;
198  }
199  }
200  }
201  from_next = from;
202  to_next = to;
203  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
204  r = std::codecvt_base::partial;
205  detail::write_state(std_state, state);
206  return r;
207  }
208 
209  virtual std::codecvt_base::result do_out(std::mbstate_t& std_state,
210  const uchar* from,
211  const uchar* from_end,
212  const uchar*& from_next,
213  char* to,
214  char* to_end,
215  char*& to_next) const
216  {
217  std::codecvt_base::result r = std::codecvt_base::ok;
218  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
219  // according to standard. We assume that sizeof(mbstate_t) >=2 in order
220  // to be able to store first observed surrogate pair
221  //
222  // State: state!=0 - a first surrogate pair was observed (state = first pair),
223  // we expect the second one to come and then zero the state
225  boost::uint16_t state = detail::read_state(std_state);
226  while(to < to_end && from < from_end)
227  {
228  boost::uint32_t ch = 0;
229  if(state != 0)
230  {
231  // if the state indicates that 1st surrogate pair was written
232  // we should make sure that the second one that comes is actually
233  // second surrogate
234  boost::uint16_t w1 = state;
235  boost::uint16_t w2 = *from;
236  // we don't forward from as writing may fail to incomplete or
237  // partial conversion
238  if(0xDC00 <= w2 && w2 <= 0xDFFF)
239  {
240  boost::uint16_t vh = w1 - 0xD800;
241  boost::uint16_t vl = w2 - 0xDC00;
242  ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
243  } else
244  {
246  }
247  } else
248  {
249  ch = *from;
250  if(0xD800 <= ch && ch <= 0xDBFF)
251  {
252  // if this is a first surrogate pair we put
253  // it into the state and consume it, note we don't
254  // go forward as it should be illegal so we increase
255  // the from pointer manually
256  state = static_cast<boost::uint16_t>(ch);
257  from++;
258  continue;
259  } else if(0xDC00 <= ch && ch <= 0xDFFF)
260  {
261  // if we observe second surrogate pair and
262  // first only may be expected we should break from the loop with error
263  // as it is illegal input
265  }
266  }
267  if(!detail::utf::is_valid_codepoint(ch))
268  {
269  r = std::codecvt_base::error;
270  break;
271  }
272  int len = detail::utf::utf_traits<char>::width(ch);
273  if(to_end - to < len)
274  {
275  r = std::codecvt_base::partial;
276  break;
277  }
278  to = detail::utf::utf_traits<char>::encode(ch, to);
279  state = 0;
280  from++;
281  }
282  from_next = from;
283  to_next = to;
284  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
285  r = std::codecvt_base::partial;
286  detail::write_state(std_state, state);
287  return r;
288  }
289  };
290 
292  template<typename CharType>
293  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
294  {
295  public:
296  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
297  {}
298 
299  protected:
300  typedef CharType uchar;
301 
302  virtual std::codecvt_base::result do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const
303  {
304  next = from;
305  return std::codecvt_base::ok;
306  }
307  virtual int do_encoding() const throw()
308  {
309  return 0;
310  }
311  virtual int do_max_length() const throw()
312  {
313  return 4;
314  }
315  virtual bool do_always_noconv() const throw()
316  {
317  return false;
318  }
319 
320  virtual int do_length(std::mbstate_t
321 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
322  const
323 #endif
324  & /*state*/,
325  const char* from,
326  const char* from_end,
327  size_t max) const
328  {
329 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
330  const char* start_from = from;
331 #else
332  size_t save_max = max;
333 #endif
334 
335  while(max > 0 && from < from_end)
336  {
337  const char* save_from = from;
338  boost::uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
339  if(ch == detail::utf::incomplete)
340  {
341  from = save_from;
342  break;
343  } else if(ch == detail::utf::illegal)
344  {
346  }
347  max--;
348  }
349 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
350  return from - start_from;
351 #else
352  return save_max - max;
353 #endif
354  }
355 
356  virtual std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
357  const char* from,
358  const char* from_end,
359  const char*& from_next,
360  uchar* to,
361  uchar* to_end,
362  uchar*& to_next) const
363  {
364  std::codecvt_base::result r = std::codecvt_base::ok;
365 
366  while(to < to_end && from < from_end)
367  {
368  const char* from_saved = from;
369 
370  uint32_t ch = detail::utf::utf_traits<char>::decode(from, from_end);
371 
372  if(ch == detail::utf::illegal)
373  {
375  } else if(ch == detail::utf::incomplete)
376  {
377  r = std::codecvt_base::partial;
378  from = from_saved;
379  break;
380  }
381  *to++ = ch;
382  }
383  from_next = from;
384  to_next = to;
385  if(r == std::codecvt_base::ok && from != from_end)
386  r = std::codecvt_base::partial;
387  return r;
388  }
389 
390  virtual std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
391  const uchar* from,
392  const uchar* from_end,
393  const uchar*& from_next,
394  char* to,
395  char* to_end,
396  char*& to_next) const
397  {
398  std::codecvt_base::result r = std::codecvt_base::ok;
399  while(to < to_end && from < from_end)
400  {
401  boost::uint32_t ch = 0;
402  ch = *from;
403  if(!detail::utf::is_valid_codepoint(ch))
404  {
406  }
407  int len = detail::utf::utf_traits<char>::width(ch);
408  if(to_end - to < len)
409  {
410  r = std::codecvt_base::partial;
411  break;
412  }
413  to = detail::utf::utf_traits<char>::encode(ch, to);
414  from++;
415  }
416  from_next = from;
417  to_next = to;
418  if(r == std::codecvt_base::ok && from != from_end)
419  r = std::codecvt_base::partial;
420  return r;
421  }
422  };
423 
424 } // namespace nowide
425 } // namespace boost
426 
427 #endif
Definition: utf8_codecvt.hpp:55
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:16