util.hpp 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #ifndef BOOST_LOCALE_UTIL_HPP
  9. #define BOOST_LOCALE_UTIL_HPP
  10. #include <locale>
  11. #include <typeinfo>
  12. #include <boost/cstdint.hpp>
  13. #include <boost/locale/utf.hpp>
  14. #include <boost/locale/generator.hpp>
  15. #include <boost/assert.hpp>
  16. #include <vector>
  17. namespace boost {
  18. namespace locale {
  19. ///
  20. /// \brief This namespace provides various utility function useful for Boost.Locale backends
  21. /// implementations
  22. ///
  23. namespace util {
  24. ///
  25. /// \brief Return default system locale name in POSIX format.
  26. ///
  27. /// This function tries to detect the locale using the LC_CTYPE, LC_ALL and LANG environment
  28. /// variables, in this order; if all of them are unset, on POSIX platforms it returns "C".
  29. ///
  30. /// On Windows, in addition to checking the above environment variables, this function
  31. /// tries to create the locale name from the ISO-639 language and ISO-3166 country codes defined
  32. /// for the user default locale.
  33. /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8; otherwise, if the system
  34. /// locale supports an ANSI code-page it uses that ANSI encoding, like windows-1252, and it falls back
  35. /// to UTF-8 encoding if an ANSI code-page is not available.
  36. ///
  37. BOOST_LOCALE_DECL
  38. std::string get_system_locale(bool use_utf8_on_windows = false);
  39. ///
  40. /// \brief Installs information facet to locale \a in based on locale name \a name
  41. ///
  42. /// This function installs boost::locale::info facet into the locale \a in and returns
  43. /// newly created locale.
  44. ///
  45. /// Note: all information is based only on parsing of string \a name;
  46. ///
  47. /// The name has the following format: language[_COUNTRY][.encoding][\@variant]
  48. /// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166
  49. /// country identifier like "US" or "RU", encoding is a character set name
  50. /// like UTF-8 or ISO-8859-1, and variant is a backend specific variant like \c euro or
  51. /// calendar=hebrew.
  52. ///
  53. /// If some parameters are missing they are specified as blanks, default encoding
  54. /// is assumed to be US-ASCII and missing language is assumed to be "C"
  55. ///
  56. BOOST_LOCALE_DECL
  57. std::locale create_info(std::locale const &in,std::string const &name);
  58. ///
  59. /// \brief This class represents a simple stateless converter to and from UCS-4 for
  60. /// each single code point
  61. ///
  62. /// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding
  63. /// to encoding supported by this converter
  64. ///
  65. /// Please note, this converter should be fully stateless. Fully stateless means it should
  66. /// never assume that it is called in any specific order on the text. Even if the
  67. /// encoding itself seems to be stateless like windows-1255 or shift-jis, some
  68. /// encoders (most notably iconv) can actually compose several code-points into one or
  69. /// decompose them in case composite characters are found. So be very careful when implementing
  70. /// these converters for a certain character set.
  71. ///
  72. class base_converter {
  73. public:
  74. ///
  75. /// This value should be returned when an illegal input sequence or code-point is observed:
  76. /// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates
  77. /// or an invalid UTF-8 sequence is found
  78. ///
  79. static const uint32_t illegal=utf::illegal;
  80. ///
  81. /// This value is returned in following cases: The of incomplete input sequence was found or
  82. /// insufficient output buffer was provided so complete output could not be written.
  83. ///
  84. static const uint32_t incomplete=utf::incomplete;
  85. virtual ~base_converter()
  86. {
  87. }
  88. ///
  89. /// Return the maximal length that one Unicode code-point can be converted to, for example
  90. /// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1
  91. ///
  92. virtual int max_len() const
  93. {
  94. return 1;
  95. }
  96. ///
  97. /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe.
  98. ///
  99. /// Rule of thumb: if this class' implementation uses simple tables that are unchanged
  100. /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for
  101. /// independent to_unicode, from_unicode calls, you may set it to true, otherwise,
  102. /// for example if you use iconv_t descriptor or UConverter as conversion object return false,
  103. /// and this object will be cloned for each use.
  104. ///
  105. virtual bool is_thread_safe() const
  106. {
  107. return false;
  108. }
  109. ///
  110. /// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false
  111. ///
  112. virtual base_converter *clone() const
  113. {
  114. BOOST_ASSERT(typeid(*this)==typeid(base_converter));
  115. return new base_converter();
  116. }
  117. ///
  118. /// Convert a single character starting at begin and ending at most at end to Unicode code-point.
  119. ///
  120. /// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a code_point_end <= \a end
  121. /// it is converted to its Unicode code point equivalent, \a begin is set to \a code_point_end
  122. ///
  123. /// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a code_point_end > \a end
  124. /// and [\a begin, \a code_point_end) would be valid input sequence, then \a incomplete is returned begin stays unchanged, for example
  125. /// for UTF-8 conversion a *begin = 0xc2, \a begin +1 = \a end is such situation.
  126. ///
  127. /// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a code_point_end <= \a end
  128. /// that is illegal for this encoding, \a illegal is returned and begin stays unchanged. For example if *begin = 0xFF and begin < end
  129. /// for UTF-8, then \a illegal is returned.
  130. ///
  131. ///
  132. virtual uint32_t to_unicode(char const *&begin,char const *end)
  133. {
  134. if(begin == end)
  135. return incomplete;
  136. unsigned char cp = *begin;
  137. if(cp <= 0x7F) {
  138. begin++;
  139. return cp;
  140. }
  141. return illegal;
  142. }
  143. ///
  144. /// Convert a single code-point \a u into encoding and store it in [begin,end) range.
  145. ///
  146. /// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set,
  147. /// \a illegal should be returned
  148. ///
  149. /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then
  150. ///
  151. /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned
  152. /// -# If end - begin < N, incomplete is returned, it is unspecified what would be
  153. /// stored in bytes in range [begin,end)
  154. virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
  155. {
  156. if(begin==end)
  157. return incomplete;
  158. if(u >= 0x80)
  159. return illegal;
  160. *begin = static_cast<char>(u);
  161. return 1;
  162. }
  163. };
  164. ///
  165. /// This function creates a \a base_converter that can be used for conversion between UTF-8 and
  166. /// unicode code points
  167. ///
  168. BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter();
  169. ///
  170. /// This function creates a \a base_converter that can be used for conversion between single byte
  171. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  172. ///
  173. /// If \a encoding is not supported, empty pointer is returned. You should check if
  174. /// std::auto_ptr<base_converter>::get() != 0
  175. ///
  176. BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding);
  177. ///
  178. /// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new
  179. /// facet.
  180. ///
  181. /// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter.
  182. /// If \a cvt is a null pointer, a conversion that always fails is used: it fails on the first input or output character.
  183. ///
  184. /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join
  185. /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware
  186. /// of wide encoding type
  187. ///
  188. BOOST_LOCALE_DECL
  189. std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type);
  190. ///
  191. /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return
  192. /// new locale that is based on \a in and uses new facet.
  193. ///
  194. BOOST_LOCALE_DECL
  195. std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type);
  196. ///
  197. /// This function installs codecvt that can be used for conversion between single byte
  198. /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points,
  199. ///
  200. /// Throws boost::locale::conv::invalid_charset_error if the character set is not supported or isn't a single byte character
  201. /// set
  202. BOOST_LOCALE_DECL
  203. std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type);
  204. } // util
  205. } // locale
  206. } // boost
  207. #endif
  208. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4