utf8_codecvt_facet.hpp 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
  2. // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
  3. // Distributed under the Boost Software License, Version 1.0. (See accompany-
  4. // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #ifndef BOOST_UTF8_CODECVT_FACET_HPP
  6. #define BOOST_UTF8_CODECVT_FACET_HPP
  7. // MS compatible compilers support #pragma once
  8. #if defined(_MSC_VER) && (_MSC_VER >= 1020)
  9. # pragma once
  10. #endif
  11. /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  12. // utf8_codecvt_facet.hpp
  13. // This header defines class utf8_codecvt_facet, derived from
  14. // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
  15. // files into wchar_t strings in the application.
  16. //
  17. // The header is NOT STANDALONE, and is not to be included by the USER.
  18. // There are at least two libraries which want to use this functionality, and
  19. // we want to avoid code duplication. It would be possible to create utf8
  20. // library, but:
  21. // - this requires review process first
  22. // - in the case when linking a library which uses utf8
  23. // (say 'program_options'), user should also link to the utf8 library.
  24. // This seems inconvenient, and asking a user to link to an unreviewed
  25. // library is strange.
  26. // Until the above points are fixed, a library which wants to use utf8 must:
  27. // - include this header in one of its headers or sources
  28. // - include the corresponding boost/detail/utf8_codecvt_facet.ipp file in one
  29. // of its sources
  30. // - before including either file, the library must define
  31. // - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
  32. // - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
  33. // declaration.
  34. // - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
  35. // symbols.
  36. //
  37. // For example, program_options library might contain:
  38. // #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character>
  39. // namespace boost { namespace program_options {
  40. // #define BOOST_UTF8_END_NAMESPACE }}
  41. // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL
  42. // #include <boost/detail/utf8_codecvt_facet.ipp>
  43. //
  44. // Essentially, each library will have its own copy of utf8 code, in
  45. // different namespaces.
  46. // Note:(Robert Ramey). I have made the following alterations in the original
  47. // code.
  48. // a) Rendered utf8_codecvt<wchar_t, char> without using templates
  49. // b) Move longer functions outside class definition to prevent inlining
  50. // and make code smaller
  51. // c) added on a derived class to permit translation to/from current
  52. // locale to utf8
  53. // See http://www.boost.org for updates, documentation, and revision history.
  54. // archives stored as text - note these are templated on the basic
  55. // stream templates to accommodate wide (and other?) kind of characters
  56. //
  57. // note the fact that on libraries without wide characters, ostream
  58. // is not a specialization of basic_ostream which in fact is not defined
  59. // in such cases. So we can't use basic_ostream<OStream::char_type> but rather
  60. // use two template parameters
  61. //
  62. // utf8_codecvt_facet
  63. // This is an implementation of a std::codecvt facet for translating
  64. // from UTF-8 externally to UCS-4. Note that this is not tied to
  65. // any specific types in order to allow customization on platforms
  66. // where wchar_t is not big enough.
  67. //
  68. // NOTES: The current implementation jumps through some unpleasant hoops in
  69. // order to deal with signed character types. As a std::codecvt_base::result,
  70. // it is necessary for the ExternType to be convertible to unsigned char.
  71. // I chose not to tie the extern_type explicitly to char. But if any combination
  72. // of types other than <wchar_t,char_t> is used, then std::codecvt must be
  73. // specialized on those types for this to work.
  74. #include <locale>
  75. #include <cwchar> // for mbstate_t
  76. #include <cstddef> // for std::size_t
  77. #include <boost/config.hpp>
  78. #include <boost/detail/workaround.hpp>
  #ifdef BOOST_NO_STDC_NAMESPACE
  // When BOOST_NO_STDC_NAMESPACE is defined, pull the global-namespace
  // mbstate_t and size_t into std so the std::-qualified spellings used
  // below resolve.
  namespace std {
      using ::mbstate_t;
      using ::size_t;
  } // namespace std
  #endif // BOOST_NO_STDC_NAMESPACE
  85. // maximum length of a multibyte string
  86. #define MB_LENGTH_MAX 8
  87. BOOST_UTF8_BEGIN_NAMESPACE
  88. //----------------------------------------------------------------------------//
  89. // //
  90. // utf8_codecvt_facet //
  91. // //
  92. // See utf8_codecvt_facet.ipp for the implementation. //
  93. //----------------------------------------------------------------------------//
  94. #ifndef BOOST_UTF8_DECL
  95. #define BOOST_UTF8_DECL
  96. #endif
  97. struct BOOST_SYMBOL_VISIBLE utf8_codecvt_facet :
  98. public std::codecvt<wchar_t, char, std::mbstate_t>
  99. {
  100. public:
  101. BOOST_UTF8_DECL explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
  102. : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
  103. {}
  104. virtual ~utf8_codecvt_facet(){}
  105. protected:
  106. BOOST_UTF8_DECL virtual std::codecvt_base::result do_in(
  107. std::mbstate_t& state,
  108. const char * from,
  109. const char * from_end,
  110. const char * & from_next,
  111. wchar_t * to,
  112. wchar_t * to_end,
  113. wchar_t*& to_next
  114. ) const;
  115. BOOST_UTF8_DECL virtual std::codecvt_base::result do_out(
  116. std::mbstate_t & state,
  117. const wchar_t * from,
  118. const wchar_t * from_end,
  119. const wchar_t* & from_next,
  120. char * to,
  121. char * to_end,
  122. char * & to_next
  123. ) const;
  124. bool invalid_continuing_octet(unsigned char octet_1) const {
  125. return (octet_1 < 0x80|| 0xbf< octet_1);
  126. }
  127. bool invalid_leading_octet(unsigned char octet_1) const {
  128. return (0x7f < octet_1 && octet_1 < 0xc0) ||
  129. (octet_1 > 0xfd);
  130. }
  131. // continuing octets = octets except for the leading octet
  132. static unsigned int get_cont_octet_count(unsigned char lead_octet) {
  133. return get_octet_count(lead_octet) - 1;
  134. }
  135. BOOST_UTF8_DECL static unsigned int get_octet_count(unsigned char lead_octet);
  136. // How many "continuing octets" will be needed for this word
  137. // == total octets - 1.
  138. BOOST_UTF8_DECL int get_cont_octet_out_count(wchar_t word) const ;
  139. virtual bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW {
  140. return false;
  141. }
  142. // UTF-8 isn't really stateful since we rewind on partial conversions
  143. virtual std::codecvt_base::result do_unshift(
  144. std::mbstate_t&,
  145. char * from,
  146. char * /*to*/,
  147. char * & next
  148. ) const {
  149. next = from;
  150. return ok;
  151. }
  152. virtual int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW {
  153. const int variable_byte_external_encoding=0;
  154. return variable_byte_external_encoding;
  155. }
  156. // How many char objects can I process to get <= max_limit
  157. // wchar_t objects?
  158. BOOST_UTF8_DECL virtual int do_length(
  159. const std::mbstate_t &,
  160. const char * from,
  161. const char * from_end,
  162. std::size_t max_limit
  163. ) const
  164. #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
  165. throw()
  166. #endif
  167. ;
  168. virtual int do_length(
  169. std::mbstate_t & s,
  170. const char * from,
  171. const char * from_end,
  172. std::size_t max_limit
  173. ) const
  174. #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
  175. throw()
  176. #endif
  177. {
  178. return do_length(
  179. const_cast<const std::mbstate_t &>(s),
  180. from,
  181. from_end,
  182. max_limit
  183. );
  184. }
  185. // Largest possible value do_length(state,from,from_end,1) could return.
  186. virtual int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW {
  187. return 6; // largest UTF-8 encoding of a UCS-4 character
  188. }
  189. };
  190. BOOST_UTF8_END_NAMESPACE
  191. #endif // BOOST_UTF8_CODECVT_FACET_HPP