ucsdet.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. /*
  2. **********************************************************************
  3. * Copyright (C) 2005-2013, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. **********************************************************************
  6. * file name: ucsdet.h
  7. * encoding: US-ASCII
  8. * indentation:4
  9. *
  10. * created on: 2005Aug04
  11. * created by: Andy Heninger
  12. *
  13. * ICU Character Set Detection, API for C
  14. *
  15. * Draft version 18 Oct 2005
  16. *
  17. */
  18. #ifndef __UCSDET_H
  19. #define __UCSDET_H
  20. #include "unicode/utypes.h"
  21. #if !UCONFIG_NO_CONVERSION
  22. #include "unicode/localpointer.h"
  23. #include "unicode/uenum.h"
  24. /**
  25. * \file
  26. * \brief C API: Charset Detection API
  27. *
  28. * This API provides a facility for detecting the
  29. * charset or encoding of character data in an unknown text format.
  30. * The input data can be from an array of bytes.
  31. * <p>
  32. * Character set detection is at best an imprecise operation. The detection
  33. * process will attempt to identify the charset that best matches the characteristics
  34. * of the byte data, but the process is partly statistical in nature, and
  35. * the results cannot be guaranteed to always be correct.
  36. * <p>
  37. * For best accuracy in charset detection, the input data should be primarily
  38. * in a single language, and a minimum of a few hundred bytes worth of plain text
  39. * in the language are needed. The detection process will attempt to
  40. * ignore html or xml style markup that could otherwise obscure the content.
  41. */
  42. struct UCharsetDetector;
  43. /**
  44. * Opaque structure representing a charset detector.
  45. * @stable ICU 3.6
  46. */
  47. typedef struct UCharsetDetector UCharsetDetector;
  48. struct UCharsetMatch;
  49. /**
  50. * Opaque structure representing a match that was identified
  51. * from a charset detection operation.
  52. * @stable ICU 3.6
  53. */
  54. typedef struct UCharsetMatch UCharsetMatch;
  55. /**
  56. * Open a charset detector.
  57. *
  58. * @param status Any error conditions occurring during the open
  59. * operation are reported back in this variable.
  60. * @return the newly opened charset detector.
  61. * @stable ICU 3.6
  62. */
  63. U_STABLE UCharsetDetector * U_EXPORT2
  64. ucsdet_open(UErrorCode *status);
  65. /**
  66. * Close a charset detector. All storage and any other resources
  67. * owned by this charset detector will be released. Failure to
  68. * close a charset detector when finished with it can result in
  69. * memory leaks in the application.
  70. *
  71. * @param ucsd The charset detector to be closed.
  72. * @stable ICU 3.6
  73. */
  74. U_STABLE void U_EXPORT2
  75. ucsdet_close(UCharsetDetector *ucsd);
  76. #if U_SHOW_CPLUSPLUS_API
  77. U_NAMESPACE_BEGIN
  78. /**
  79. * \class LocalUCharsetDetectorPointer
  80. * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
  81. * For most methods see the LocalPointerBase base class.
  82. *
  83. * @see LocalPointerBase
  84. * @see LocalPointer
  85. * @stable ICU 4.4
  86. */
  87. U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
  88. U_NAMESPACE_END
  89. #endif
  90. /**
  91. * Set the input byte data whose charset is to be detected.
  92. *
  93. * Ownership of the input text byte array remains with the caller.
  94. * The input string must not be altered or deleted until the charset
  95. * detector is either closed or reset to refer to different input text.
  96. *
  97. * @param ucsd the charset detector to be used.
  98. * @param textIn the input text of unknown encoding.
  99. * @param len the length of the input text, or -1 if the text
  100. * is NUL terminated.
  101. * @param status any error conditions are reported back in this variable.
  102. *
  103. * @stable ICU 3.6
  104. */
  105. U_STABLE void U_EXPORT2
  106. ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
  107. /** Set the declared encoding for charset detection.
  108. * The declared encoding of an input text is an encoding obtained
  109. * by the user from an http header or xml declaration or similar source that
  110. * can be provided as an additional hint to the charset detector.
  111. *
  112. * How and whether the declared encoding will be used during the
  113. * detection process is TBD.
  114. *
  115. * @param ucsd the charset detector to be used.
  116. * @param encoding an encoding for the current data obtained from
  117. * a header or declaration or other source outside
  118. * of the byte data itself.
  119. * @param length the length of the encoding name, or -1 if the name string
  120. * is NUL terminated.
  121. * @param status any error conditions are reported back in this variable.
  122. *
  123. * @stable ICU 3.6
  124. */
  125. U_STABLE void U_EXPORT2
  126. ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
  127. /**
  128. * Return the charset that best matches the supplied input data.
  129. *
  130. * Note though, that because the detection
  131. * only looks at the start of the input data,
  132. * there is a possibility that the returned charset will fail to handle
  133. * the full set of input data.
  134. * <p>
  135. * The returned UCharsetMatch object is owned by the UCharsetDetector.
  136. * It will remain valid until the detector input is reset, or until
  137. * the detector is closed.
  138. * <p>
  139. * The function will fail if
  140. * <ul>
  141. * <li>no charset appears to match the data.</li>
  142. * <li>no input text has been provided</li>
  143. * </ul>
  144. *
  145. * @param ucsd the charset detector to be used.
  146. * @param status any error conditions are reported back in this variable.
  147. * @return a UCharsetMatch representing the best matching charset,
  148. * or NULL if no charset matches the byte data.
  149. *
  150. * @stable ICU 3.6
  151. */
  152. U_STABLE const UCharsetMatch * U_EXPORT2
  153. ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
  154. /**
  155. * Find all charset matches that appear to be consistent with the input,
  156. * returning an array of results. The results are ordered with the
  157. * best quality match first.
  158. *
  159. * Because the detection only looks at a limited amount of the
  160. * input byte data, some of the returned charsets may fail to handle
  161. * all of the input data.
  162. * <p>
  163. * The returned UCharsetMatch objects are owned by the UCharsetDetector.
  164. * They will remain valid until the detector is closed or modified.
  165. *
  166. * <p>
  167. * Return an error if
  168. * <ul>
  169. * <li>no charsets appear to match the input data.</li>
  170. * <li>no input text has been provided</li>
  171. * </ul>
  172. *
  173. * @param ucsd the charset detector to be used.
  174. * @param matchesFound pointer to a variable that will be set to the
  175. * number of charsets identified that are consistent with
  176. * the input data. Output only.
  177. * @param status any error conditions are reported back in this variable.
  178. * @return A pointer to an array of pointers to UCharsetMatch objects.
  179. * This array, and the UCharsetMatch instances to which it refers,
  180. * are owned by the UCharsetDetector, and will remain valid until
  181. * the detector is closed or modified.
  182. * @stable ICU 3.6
  183. */
  184. U_STABLE const UCharsetMatch ** U_EXPORT2
  185. ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
  186. /**
  187. * Get the name of the charset represented by a UCharsetMatch.
  188. *
  189. * The storage for the returned name string is owned by the
  190. * UCharsetMatch, and will remain valid while the UCharsetMatch
  191. * is valid.
  192. *
  193. * The name returned is suitable for use with the ICU conversion APIs.
  194. *
  195. * @param ucsm The charset match object.
  196. * @param status Any error conditions are reported back in this variable.
  197. * @return The name of the matching charset.
  198. *
  199. * @stable ICU 3.6
  200. */
  201. U_STABLE const char * U_EXPORT2
  202. ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
  203. /**
  204. * Get a confidence number for the quality of the match of the byte
  205. * data with the charset. Confidence numbers range from zero to 100,
  206. * with 100 representing complete confidence and zero representing
  207. * no confidence.
  208. *
  209. * The confidence values are somewhat arbitrary. They define an
  210. * ordering within the results for any single detection operation
  211. * but are not generally comparable between the results for different input.
  212. *
  213. * A confidence value of ten does have a general meaning - it is used
  214. * for charsets that can represent the input data, but for which there
  215. * is no other indication that suggests that the charset is the correct one.
  216. * Pure 7 bit ASCII data, for example, is compatible with a
  217. * great many charsets, most of which will appear as possible matches
  218. * with a confidence of 10.
  219. *
  220. * @param ucsm The charset match object.
  221. * @param status Any error conditions are reported back in this variable.
  222. * @return A confidence number for the charset match.
  223. *
  224. * @stable ICU 3.6
  225. */
  226. U_STABLE int32_t U_EXPORT2
  227. ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
  228. /**
  229. * Get the RFC 3066 code for the language of the input data.
  230. *
  231. * The Charset Detection service is intended primarily for detecting
  232. * charsets, not language. For some, but not all, charsets, a language is
  233. * identified as a byproduct of the detection process, and that is what
  234. * is returned by this function.
  235. *
  236. * CAUTION:
  237. * 1. Language information is not available for input data encoded in
  238. * all charsets. In particular, no language is identified
  239. * for UTF-8 input data.
  240. *
  241. * 2. Closely related languages may sometimes be confused.
  242. *
  243. * If more accurate language detection is required, a linguistic
  244. * analysis package should be used.
  245. *
  246. * The storage for the returned name string is owned by the
  247. * UCharsetMatch, and will remain valid while the UCharsetMatch
  248. * is valid.
  249. *
  250. * @param ucsm The charset match object.
  251. * @param status Any error conditions are reported back in this variable.
  252. * @return The RFC 3066 code for the language of the input data, or
  253. * an empty string if the language could not be determined.
  254. *
  255. * @stable ICU 3.6
  256. */
  257. U_STABLE const char * U_EXPORT2
  258. ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
  259. /**
  260. * Get the entire input text as a UChar string, placing it into
  261. * a caller-supplied buffer. A terminating
  262. * NUL character will be appended to the buffer if space is available.
  263. *
  264. * The number of UChars in the output string, not including the terminating
  265. * NUL, is returned.
  266. *
  267. * If the supplied buffer is smaller than required to hold the output,
  268. * the contents of the buffer are undefined. The full output string length
  269. * (in UChars) is returned as always, and can be used to allocate a buffer
  270. * of the correct size.
  271. *
  272. *
  273. * @param ucsm The charset match object.
  274. * @param buf A UChar buffer to be filled with the converted text data.
  275. * @param cap The capacity of the buffer in UChars.
  276. * @param status Any error conditions are reported back in this variable.
  277. * @return The number of UChars in the output string.
  278. *
  279. * @stable ICU 3.6
  280. */
  281. U_STABLE int32_t U_EXPORT2
  282. ucsdet_getUChars(const UCharsetMatch *ucsm,
  283. UChar *buf, int32_t cap, UErrorCode *status);
  284. /**
  285. * Get an iterator over the set of all detectable charsets -
  286. * over the charsets that are known to the charset detection
  287. * service.
  288. *
  289. * The returned UEnumeration provides access to the names of
  290. * the charsets.
  291. *
  292. * <p>
  293. * The state of the Charset detector that is passed in does not
  294. * affect the result of this function, but requiring a valid, open
  295. * charset detector as a parameter ensures that the charset detection
  296. * service has been safely initialized and that the required detection
  297. * data is available.
  298. *
  299. * <p>
  300. * <b>Note:</b> Multiple different charset encodings in the same family may use
  301. * a single shared name in this implementation. For example, this function returns
  302. * an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
  303. * (Windows Latin 1). However, the actual detection result could be "windows-1252"
  304. * when the input data matches Latin 1 code points with any points only available
  305. * in "windows-1252".
  306. *
  307. * @param ucsd a Charset detector.
  308. * @param status Any error conditions are reported back in this variable.
  309. * @return an iterator providing access to the detectable charset names.
  310. * @stable ICU 3.6
  311. */
  312. U_STABLE UEnumeration * U_EXPORT2
  313. ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
  314. /**
  315. * Test whether input filtering is enabled for this charset detector.
  316. * Input filtering removes text that appears to be HTML or xml
  317. * markup from the input before applying the code page detection
  318. * heuristics.
  319. *
  320. * @param ucsd The charset detector to check.
  321. * @return TRUE if filtering is enabled.
  322. * @stable ICU 3.6
  323. */
  324. U_STABLE UBool U_EXPORT2
  325. ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
  326. /**
  327. * Enable filtering of input text. If filtering is enabled,
  328. * text within angle brackets ("<" and ">") will be removed
  329. * before detection, which will remove most HTML or xml markup.
  330. *
  331. * @param ucsd the charset detector to be modified.
  332. * @param filter <code>TRUE</code> to enable input text filtering.
  333. * @return The previous setting.
  334. *
  335. * @stable ICU 3.6
  336. */
  337. U_STABLE UBool U_EXPORT2
  338. ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
  339. #ifndef U_HIDE_INTERNAL_API
  340. /**
  341. * Get an iterator over the set of detectable charsets -
  342. * over the charsets that are enabled by the specified charset detector.
  343. *
  344. * The returned UEnumeration provides access to the names of
  345. * the charsets.
  346. *
  347. * @param ucsd a Charset detector.
  348. * @param status Any error conditions are reported back in this variable.
  349. * @return an iterator providing access to the detectable charset names by
  350. * the specified charset detector.
  351. * @internal
  352. */
  353. U_INTERNAL UEnumeration * U_EXPORT2
  354. ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
  355. /**
  356. * Enable or disable an individual charset encoding.
  357. * The name of the charset encoding must be one of the names returned by
  358. * ucsdet_getAllDetectableCharsets().
  359. *
  360. * @param ucsd a Charset detector.
  361. * @param encoding the name of the charset encoding.
  362. * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
  363. * charset encoding.
  364. * @param status receives the return status. When the name of charset encoding
  365. * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
  366. * @internal
  367. */
  368. U_INTERNAL void U_EXPORT2
  369. ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
  370. #endif /* U_HIDE_INTERNAL_API */
  371. #endif
  372. #endif /* __UCSDET_H */