grapheme_util.c 9.8 KB


  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 5 |
  4. +----------------------------------------------------------------------+
  5. | This source file is subject to version 3.01 of the PHP license, |
  6. | that is bundled with this package in the file LICENSE, and is |
  7. | available through the world-wide-web at the following url: |
  8. | http://www.php.net/license/3_01.txt |
  9. | If you did not receive a copy of the PHP license and are unable to |
  10. | obtain it through the world-wide-web, please send a note to |
  11. | license@php.net so we can mail you a copy immediately. |
  12. +----------------------------------------------------------------------+
  13. | Author: Ed Batutis <ed@batutis.com> |
  14. +----------------------------------------------------------------------+
  15. */
  16. /* {{{ includes */
  17. #ifdef HAVE_CONFIG_H
  18. #include "config.h"
  19. #endif
  20. #include <php.h>
  21. #include "grapheme.h"
  22. #include "grapheme_util.h"
  23. #include "intl_common.h"
  24. #include <unicode/utypes.h>
  25. #include <unicode/ucol.h>
  26. #include <unicode/ustring.h>
  27. #include <unicode/ubrk.h>
  28. #include <unicode/usearch.h>
  29. #include "ext/standard/php_string.h"
  30. ZEND_EXTERN_MODULE_GLOBALS( intl )
  31. /* }}} */
  32. /* {{{ grapheme_close_global_iterator - clean up */
  33. void
  34. grapheme_close_global_iterator( TSRMLS_D )
  35. {
  36. UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
  37. if ( NULL != global_break_iterator ) {
  38. ubrk_close(global_break_iterator);
  39. }
  40. }
  41. /* }}} */
  42. /* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */
  /**
   * Compute a byte-wise substring of an ASCII string without copying.
   *
   * @param str          source buffer
   * @param str_len      number of bytes in str
   * @param f            "from": start position; a negative value counts back
   *                     from the end of the string
   * @param l            "length": number of bytes wanted; a negative value means
   *                     "stop that many bytes before the end"; only honoured
   *                     when argc > 2
   * @param argc         number of arguments the caller received; when it is 2
   *                     or less the length defaults to str_len
   * @param sub_str      out: pointer into str at the start of the substring, or
   *                     NULL when the requested range is invalid
   * @param sub_str_len  out: substring length in bytes (0 when *sub_str is NULL)
   *
   * All range checks are written so that no intermediate value can overflow,
   * even for INT_MIN / INT_MAX arguments.
   */
  void grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len)
  {
      *sub_str = NULL;
      *sub_str_len = 0;

      /* a negative length can never describe a valid range */
      if (str_len < 0) {
          return;
      }

      if (argc > 2) {
          /* compare against -str_len instead of negating l: -INT_MIN overflows */
          if (l < -str_len) {
              return;
          }
          if (l > str_len) {
              l = str_len;
          }
      } else {
          l = str_len;
      }

      /* same trick for f: "f < -str_len" is "-f > str_len" without the negation */
      if (f > str_len || f < -str_len) {
          return;
      }

      /* l is within [-str_len, -1] here, so l + str_len cannot overflow */
      if (l < 0 && (l + str_len) < f) {
          return;
      }

      /* if "from" position is negative, count start position from the end
       * of the string; f >= -str_len was checked above, so the result is >= 0
       */
      if (f < 0) {
          f = str_len + f;
      }

      /* if "length" position is negative, set it to the length
       * needed to stop that many chars from the end of the string
       */
      if (l < 0) {
          l = (str_len - f) + l;
          if (l < 0) {
              l = 0;
          }
      }

      if (f >= str_len) {
          return;
      }

      /* clamp to the end of the string; "l > str_len - f" avoids computing f + l */
      if (l > str_len - f) {
          l = str_len - f;
      }

      *sub_str = str + f;
      *sub_str_len = l;
  }
  89. /* }}} */
  /*
   * Error-exit helper used by grapheme_strpos_utf16().
   *
   * When `status` holds an ICU failure code, the macro passes it to
   * intl_error_set_code() and `error` to intl_error_set_custom_msg(), then
   * releases whichever of the enclosing function's locals `uhaystack`,
   * `uneedle`, `bi` and `src` are non-NULL and returns -1 from the enclosing
   * function. It can therefore only be used inside a function that declares
   * those four locals and returns an integer.
   *
   * The body is wrapped in do { } while (0) so the macro expands to exactly one
   * statement and stays safe inside an unbraced if/else.
   */
  #define STRPOS_CHECK_STATUS(status, error) \
      do { \
          if ( U_FAILURE( (status) ) ) { \
              intl_error_set_code( NULL, (status) TSRMLS_CC ); \
              intl_error_set_custom_msg( NULL, (error), 0 TSRMLS_CC ); \
              if (uhaystack) { \
                  efree( uhaystack ); \
              } \
              if (uneedle) { \
                  efree( uneedle ); \
              } \
              if (bi) { \
                  ubrk_close (bi); \
              } \
              if (src) { \
                  usearch_close(src); \
              } \
              return -1; \
          } \
      } while (0)
  /* {{{ grapheme_strpos_utf16 - strpos/strrpos (first or last match, per 'last') using utf16 */
  109. int grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last TSRMLS_DC)
  110. {
  111. UChar *uhaystack = NULL, *uneedle = NULL;
  112. int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
  113. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  114. UBreakIterator* bi = NULL;
  115. UErrorCode status;
  116. UStringSearch* src = NULL;
  117. UCollator *coll;
  118. if(puchar_pos) {
  119. *puchar_pos = -1;
  120. }
  121. /* convert the strings to UTF-16. */
  122. status = U_ZERO_ERROR;
  123. intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
  124. STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
  125. status = U_ZERO_ERROR;
  126. intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
  127. STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
  128. /* get a pointer to the haystack taking into account the offset */
  129. status = U_ZERO_ERROR;
  130. bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
  131. STRPOS_CHECK_STATUS(status, "Failed to get iterator");
  132. status = U_ZERO_ERROR;
  133. ubrk_setText(bi, uhaystack, uhaystack_len, &status);
  134. STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
  135. status = U_ZERO_ERROR;
  136. src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
  137. STRPOS_CHECK_STATUS(status, "Error creating search object");
  138. if(f_ignore_case) {
  139. coll = usearch_getCollator(src);
  140. status = U_ZERO_ERROR;
  141. ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
  142. STRPOS_CHECK_STATUS(status, "Error setting collation strength");
  143. usearch_reset(src);
  144. }
  145. if(offset != 0) {
  146. offset_pos = grapheme_get_haystack_offset(bi, offset);
  147. if(offset_pos == -1) {
  148. status = U_ILLEGAL_ARGUMENT_ERROR;
  149. STRPOS_CHECK_STATUS(status, "Invalid search offset");
  150. }
  151. status = U_ZERO_ERROR;
  152. usearch_setOffset(src, offset_pos, &status);
  153. STRPOS_CHECK_STATUS(status, "Invalid search offset");
  154. }
  155. if(last) {
  156. char_pos = usearch_last(src, &status);
  157. if(char_pos < offset_pos) {
  158. /* last one is beyound our start offset */
  159. char_pos = USEARCH_DONE;
  160. }
  161. } else {
  162. char_pos = usearch_next(src, &status);
  163. }
  164. STRPOS_CHECK_STATUS(status, "Error looking up string");
  165. if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
  166. ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
  167. if(puchar_pos) {
  168. *puchar_pos = char_pos;
  169. }
  170. } else {
  171. ret_pos = -1;
  172. }
  173. if (uhaystack) {
  174. efree( uhaystack );
  175. }
  176. if (uneedle) {
  177. efree( uneedle );
  178. }
  179. ubrk_close (bi);
  180. usearch_close (src);
  181. return ret_pos;
  182. }
  183. /* }}} */
  184. /* {{{ grapheme_ascii_check: ASCII check */
  /**
   * Check whether a byte buffer can be handled as plain ASCII.
   *
   * A buffer qualifies when every one of its first `len` bytes is <= 0x7f and
   * it contains no "\r\n" pair (CR LF forms a single grapheme cluster, so a
   * byte-per-grapheme shortcut would be wrong for it).
   *
   * Only bytes inside [day, day + len) are examined; the byte following the
   * buffer is never read, so the buffer does not need to be NUL-terminated.
   *
   * @param day  buffer to inspect
   * @param len  number of bytes to inspect; when it is <= 0 nothing is read
   * @return len when all inspected bytes qualify, -1 otherwise
   */
  int grapheme_ascii_check(const unsigned char *day, int32_t len)
  {
      int32_t i;

      for (i = 0; i < len; i++) {
          if (day[i] > 0x7f) {
              return -1;
          }
          /* peek at the next byte only while it is still inside the buffer */
          if (day[i] == '\r' && i + 1 < len && day[i + 1] == '\n') {
              return -1;
          }
      }

      return len;
  }
  194. /* }}} */
  195. /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
  196. int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
  197. {
  198. unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
  199. UErrorCode status = U_ZERO_ERROR;
  200. int ret_len, pos;
  201. UBreakIterator* bi;
  202. bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
  203. if( U_FAILURE(status) ) {
  204. return -1;
  205. }
  206. ubrk_setText(bi, text, text_length, &status);
  207. pos = 0;
  208. for ( ret_len = 0; pos != UBRK_DONE; ) {
  209. pos = ubrk_next(bi);
  210. if ( pos != UBRK_DONE ) {
  211. if ( NULL != boundary_array && ret_len < boundary_array_len ) {
  212. boundary_array[ret_len] = pos;
  213. }
  214. ret_len++;
  215. }
  216. }
  217. ubrk_close(bi);
  218. return ret_len;
  219. }
  220. /* }}} */
  221. /* {{{ grapheme_count_graphemes */
  222. int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
  223. {
  224. int ret_len = 0;
  225. int pos = 0;
  226. UErrorCode status = U_ZERO_ERROR;
  227. ubrk_setText(bi, string, string_len, &status);
  228. do {
  229. pos = ubrk_next(bi);
  230. if ( UBRK_DONE != pos ) {
  231. ret_len++;
  232. }
  233. } while ( UBRK_DONE != pos );
  234. return ret_len;
  235. }
  236. /* }}} */
  237. /* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
  238. int grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
  239. {
  240. int32_t pos;
  241. int32_t (*iter_op)(UBreakIterator* bi);
  242. int iter_incr;
  243. if ( 0 == offset ) {
  244. return 0;
  245. }
  246. if ( offset < 0 ) {
  247. iter_op = ubrk_previous;
  248. ubrk_last(bi); /* one past the end */
  249. iter_incr = 1;
  250. }
  251. else {
  252. iter_op = ubrk_next;
  253. iter_incr = -1;
  254. }
  255. pos = 0;
  256. while ( pos != UBRK_DONE && offset != 0 ) {
  257. pos = iter_op(bi);
  258. if ( UBRK_DONE != pos ) {
  259. offset += iter_incr;
  260. }
  261. }
  262. if ( offset != 0 ) {
  263. return -1;
  264. }
  265. return pos;
  266. }
  267. /* }}} */
  268. /* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
  /**
   * Find the last occurrence of needle in haystack, byte-wise (ASCII only).
   *
   * Works on integer indices rather than pointers so that out-of-range
   * arguments (needle longer than haystack, offset beyond either end) simply
   * yield "not found" instead of forming pointers outside the buffer.
   *
   * @param haystack      buffer to search in
   * @param haystack_len  number of bytes in haystack
   * @param needle        bytes to search for
   * @param needle_len    number of bytes in needle
   * @param offset        >= 0: matches must start at or after this index;
   *                      <  0: matches must start at or before
   *                      haystack_len + offset (but never so late that the
   *                      needle would run past the end of haystack)
   * @return index of the last qualifying match, or -1 when there is none
   */
  int32_t
  grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset)
  {
      int32_t first; /* lowest index a match may start at */
      int32_t last;  /* highest index a match may start at */
      int32_t i;

      if (haystack_len < 0 || needle_len < 0 || needle_len > haystack_len) {
          return -1;
      }

      if (offset >= 0) {
          first = offset;
          last = haystack_len - needle_len;
      } else {
          first = 0;
          /* "offset > -needle_len" is "needle_len > -offset" without negating
           * offset, which would overflow for INT32_MIN */
          if (offset > -needle_len) {
              last = haystack_len - needle_len;
          } else {
              last = haystack_len + offset;
          }
      }

      if (needle_len == 1) {
          /* Single character search can shortcut memcmps */
          for (i = last; i >= first; i--) {
              if (haystack[i] == *needle) {
                  return i;
              }
          }
          return -1;
      }

      for (i = last; i >= first; i--) {
          if (memcmp(haystack + i, needle, (size_t) needle_len) == 0) {
              return i;
          }
      }

      return -1;
  }
  302. /* }}} */
  303. /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
  304. UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC )
  305. {
  306. int32_t buffer_size;
  307. UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
  308. if ( NULL == global_break_iterator ) {
  309. global_break_iterator = ubrk_open(UBRK_CHARACTER,
  310. NULL, /* icu default locale - locale has no effect on this iterator */
  311. NULL, /* text not set in global iterator */
  312. 0, /* text length = 0 */
  313. status);
  314. INTL_G(grapheme_iterator) = global_break_iterator;
  315. }
  316. buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
  317. return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
  318. }
  319. /* }}} */
  320. /*
  321. * Local variables:
  322. * tab-width: 4
  323. * c-basic-offset: 4
  324. * End:
  325. * vim600: fdm=marker
  326. * vim: noet sw=4 ts=4
  327. */