cyr_convert.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. /*
  2. +----------------------------------------------------------------------+
  3. | PHP Version 5 |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 1997-2016 The PHP Group |
  6. +----------------------------------------------------------------------+
  7. | This source file is subject to version 3.01 of the PHP license, |
  8. | that is bundled with this package in the file LICENSE, and is |
  9. | available through the world-wide-web at the following url: |
  10. | http://www.php.net/license/3_01.txt |
  11. | If you did not receive a copy of the PHP license and are unable to |
  12. | obtain it through the world-wide-web, please send a note to |
  13. | license@php.net so we can mail you a copy immediately. |
  14. +----------------------------------------------------------------------+
  15. | Author: Kirill Maximov <kir@rus.net> |
  16. +----------------------------------------------------------------------+
  17. */
  18. /* $Id$ */
  19. #include <stdlib.h>
  20. #ifdef HAVE_UNISTD_H
  21. #include <unistd.h>
  22. #endif
  23. #include <string.h>
  24. #include <errno.h>
  25. #include "php.h"
  26. #include "cyr_convert.h"
  27. #include <stdio.h>
  28. /*****************************************************************************
  29. * These are the code tables for different Cyrillic charsets (relative to koi8-r).
  30. * Each table has 512 entries; only byte values 128-255 differ from the identity mapping.
  31. * The first 256 entries convert from the corresponding charset to koi8-r,
  32. * the second 256 entries perform the reverse conversion, from koi8-r to the charset.
  33. *
  34. * Here we have the following tables:
  35. * _cyr_win1251 - for windows-1251 charset
  36. * _cyr_iso88595 - for iso8859-5 charset
  37. * _cyr_cp866 - for x-cp866 charset
  38. * _cyr_mac - for x-mac-cyrillic charset
  39. *
  40. *****************************************************************************/
  /* A conversion table holds 512 bytes. php_convert_cyr_string() reads entries
   * 0-255 (table[byte]) when the table belongs to the source charset, and
   * entries 256-511 (table[byte + 256]) when it belongs to the destination
   * charset; koi8-r itself is the intermediate form and has no table. */
  41. typedef unsigned char _cyr_charset_table[512];
  42. /* {{{ static const _cyr_charset_table _cyr_win1251, _cyr_cp866, _cyr_iso88595, _cyr_mac
  43. */
  /* windows-1251. Entries 0-255: windows-1251 byte -> koi8-r byte. */
  44. static const _cyr_charset_table _cyr_win1251 = {
  45. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  46. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  47. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  49. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  50. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  51. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  52. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  53. 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
  54. 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
  55. 154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
  56. 46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
  57. 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  58. 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  59. 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  60. 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  /* Entries 256-511: koi8-r byte -> windows-1251 byte. */
  61. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  62. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  63. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  64. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  65. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  66. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  67. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  68. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  69. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  70. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  71. 32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
  72. 32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
  73. 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
  74. 239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
  75. 222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
  76. 207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
  77. },
  /* x-cp866. Entries 0-255: cp866 byte -> koi8-r byte. */
  78. _cyr_cp866 = {
  79. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  80. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  81. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  82. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  83. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  84. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  85. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  86. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  87. 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  88. 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  89. 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  90. 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
  91. 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
  92. 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
  93. 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  94. 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
  /* Entries 256-511: koi8-r byte -> cp866 byte. */
  95. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  96. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  97. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  98. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  99. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  100. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  101. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  102. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  103. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  104. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  105. 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
  106. 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
  107. 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
  108. 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
  109. 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
  110. 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
  111. },
  /* iso8859-5. Entries 0-255: iso8859-5 byte -> koi8-r byte. */
  112. _cyr_iso88595 = {
  113. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  114. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  115. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  116. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  117. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  118. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  119. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  120. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  121. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  122. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  123. 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  124. 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  125. 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  126. 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  127. 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  128. 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  /* Entries 256-511: koi8-r byte -> iso8859-5 byte. */
  129. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  130. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  131. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  132. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  133. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  134. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  135. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  136. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  137. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  138. 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  139. 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
  140. 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
  141. 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
  142. 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
  143. 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
  144. 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
  145. },
  /* x-mac-cyrillic. Entries 0-255: mac-cyrillic byte -> koi8-r byte. */
  146. _cyr_mac = {
  147. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  148. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  149. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  150. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  151. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  152. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  153. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  154. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  155. 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  156. 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  157. 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  158. 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  159. 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  160. 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
  161. 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  162. 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
  /* Entries 256-511: koi8-r byte -> mac-cyrillic byte. */
  163. 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  164. 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  165. 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  166. 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  167. 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  168. 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  169. 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  170. 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  171. 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  172. 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  173. 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
  174. 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
  175. 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
  176. 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
  177. 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
  178. 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
  179. };
  180. /* }}} */
  181. /* {{{ static char * php_convert_cyr_string(unsigned char *str, int length, char from, char to TSRMLS_DC)
  182. * This is the function that performs real in-place conversion of the string
  183. * between charsets.
  184. * Parameters:
  185. * str - string to be converted
  186. * from,to - one-symbol label of source and destination charset
  187. * The following symbols are used as labels:
  188. * k - koi8-r
  189. * w - windows-1251
  190. * i - iso8859-5
  191. * a - x-cp866
  192. * d - x-cp866
  193. * m - x-mac-cyrillic
  194. *****************************************************************************/
  195. static char * php_convert_cyr_string(unsigned char *str, int length, char from, char to TSRMLS_DC)
  196. {
  197. const unsigned char *from_table, *to_table;
  198. unsigned char tmp;
  199. int i;
  200. from_table = NULL;
  201. to_table = NULL;
  202. switch (toupper((int)(unsigned char)from))
  203. {
  204. case 'W':
  205. from_table = _cyr_win1251;
  206. break;
  207. case 'A':
  208. case 'D':
  209. from_table = _cyr_cp866;
  210. break;
  211. case 'I':
  212. from_table = _cyr_iso88595;
  213. break;
  214. case 'M':
  215. from_table = _cyr_mac;
  216. break;
  217. case 'K':
  218. break;
  219. default:
  220. php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown source charset: %c", from);
  221. break;
  222. }
  223. switch (toupper((int)(unsigned char)to))
  224. {
  225. case 'W':
  226. to_table = _cyr_win1251;
  227. break;
  228. case 'A':
  229. case 'D':
  230. to_table = _cyr_cp866;
  231. break;
  232. case 'I':
  233. to_table = _cyr_iso88595;
  234. break;
  235. case 'M':
  236. to_table = _cyr_mac;
  237. break;
  238. case 'K':
  239. break;
  240. default:
  241. php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown destination charset: %c", to);
  242. break;
  243. }
  244. if (!str)
  245. return (char *)str;
  246. for( i = 0; i<length; i++)
  247. {
  248. tmp = (from_table == NULL)? str[i] : from_table[ str[i] ];
  249. str[i] = (to_table == NULL) ? tmp : to_table[tmp + 256];
  250. }
  251. return (char *)str;
  252. }
  253. /* }}} */
  254. /* {{{ proto string convert_cyr_string(string str, string from, string to)
  255. Convert from one Cyrillic character set to another */
  256. PHP_FUNCTION(convert_cyr_string)
  257. {
  258. char *input, *fr_cs, *to_cs;
  259. int input_len, fr_cs_len, to_cs_len;
  260. unsigned char *str;
  261. if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sss", &input, &input_len, &fr_cs, &fr_cs_len, &to_cs, &to_cs_len) == FAILURE) {
  262. return;
  263. }
  264. str = (unsigned char*) estrndup(input, input_len);
  265. php_convert_cyr_string(str, input_len, fr_cs[0], to_cs[0] TSRMLS_CC);
  266. RETVAL_STRING((char *)str, 0);
  267. }
  268. /* }}} */
  269. /*
  270. * Local variables:
  271. * tab-width: 4
  272. * c-basic-offset: 4
  273. * End:
  274. * vim600: sw=4 ts=4 fdm=marker
  275. * vim<600: sw=4 ts=4
  276. */