gdkanji.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. /* gdkanji.c (Kanji code converter) */
  2. /* written by Masahito Yamaga (ma@yama-ga.com) */
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  6. #include "gd.h"
  7. #include "gdhelpers.h"
  8. #include <stdarg.h>
  9. #if defined(HAVE_ICONV_H) || defined(HAVE_ICONV)
  10. #include <iconv.h>
  11. #ifdef HAVE_ERRNO_H
  12. #include <errno.h>
  13. #endif
  14. #endif
  15. #if defined(HAVE_ICONV_H) && !defined(HAVE_ICONV)
  16. #define HAVE_ICONV 1
  17. #endif
  18. #define LIBNAME "any2eucjp()"
  19. #if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
  20. #ifndef SJISPRE
  21. #define SJISPRE 1
  22. #endif
  23. #endif
  24. #ifdef TRUE
  25. #undef TRUE
  26. #endif
  27. #ifdef FALSE
  28. #undef FALSE
  29. #endif
  30. #define TRUE 1
  31. #define FALSE 0
  32. #define NEW 1
  33. #define OLD 2
  34. #define ESCI 3
  35. #define NEC 4
  36. #define EUC 5
  37. #define SJIS 6
  38. #define EUCORSJIS 7
  39. #define ASCII 8
  40. #define NEWJISSTR "JIS7"
  41. #define OLDJISSTR "jis"
  42. #define EUCSTR "eucJP"
  43. #define SJISSTR "SJIS"
  44. #define ESC 27
  45. #define SS2 142
  /*
   * Trace helper.  When this file is compiled with DEBUG defined, the
   * printf-style message is written to stdout, prefixed with LIBNAME and
   * followed by a newline.  Without DEBUG the function does nothing.
   */
  static void
  debug (const char *format, ...)
  {
  #ifdef DEBUG
    va_list ap;

    fprintf (stdout, "%s: ", LIBNAME);

    va_start (ap, format);
    vfprintf (stdout, format, ap);
    va_end (ap);

    fprintf (stdout, "\n");
  #else
    /* Nothing to report in non-debug builds. */
    (void) format;
  #endif
  }
  58. static void
  59. error (const char *format,...)
  60. {
  61. va_list args;
  62. char *tmp;
  63. va_start(args, format);
  64. vspprintf(&tmp, 0, format, args);
  65. va_end(args);
  66. php_error_docref(NULL, E_WARNING, "%s: %s", LIBNAME, tmp);
  67. efree(tmp);
  68. }
  69. /* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */
  70. static int
  71. DetectKanjiCode (unsigned char *str)
  72. {
  73. static int whatcode = ASCII;
  74. int oldcode = ASCII;
  75. int c, i;
  76. char *lang = NULL;
  77. c = '\1';
  78. i = 0;
  79. if (whatcode != EUCORSJIS && whatcode != ASCII)
  80. {
  81. oldcode = whatcode;
  82. whatcode = ASCII;
  83. }
  84. while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
  85. {
  86. if ((c = str[i++]) != '\0')
  87. {
  88. if (c == ESC)
  89. {
  90. c = str[i++];
  91. if (c == '$')
  92. {
  93. c = str[i++];
  94. if (c == 'B')
  95. whatcode = NEW;
  96. else if (c == '@')
  97. whatcode = OLD;
  98. }
  99. else if (c == '(')
  100. {
  101. c = str[i++];
  102. if (c == 'I')
  103. whatcode = ESCI;
  104. }
  105. else if (c == 'K')
  106. whatcode = NEC;
  107. }
  108. else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
  109. whatcode = SJIS;
  110. else if (c == SS2)
  111. {
  112. c = str[i++];
  113. if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
  114. whatcode = SJIS;
  115. else if (c >= 161 && c <= 223)
  116. whatcode = EUCORSJIS;
  117. }
  118. else if (c >= 161 && c <= 223)
  119. {
  120. c = str[i++];
  121. if (c >= 240 && c <= 254)
  122. whatcode = EUC;
  123. else if (c >= 161 && c <= 223)
  124. whatcode = EUCORSJIS;
  125. else if (c >= 224 && c <= 239)
  126. {
  127. whatcode = EUCORSJIS;
  128. while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
  129. {
  130. if (c >= 129)
  131. {
  132. if (c <= 141 || (c >= 143 && c <= 159))
  133. whatcode = SJIS;
  134. else if (c >= 253 && c <= 254)
  135. whatcode = EUC;
  136. }
  137. c = str[i++];
  138. }
  139. }
  140. else if (c <= 159)
  141. whatcode = SJIS;
  142. }
  143. else if (c >= 240 && c <= 254)
  144. whatcode = EUC;
  145. else if (c >= 224 && c <= 239)
  146. {
  147. c = str[i++];
  148. if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
  149. whatcode = SJIS;
  150. else if (c >= 253 && c <= 254)
  151. whatcode = EUC;
  152. else if (c >= 161 && c <= 252)
  153. whatcode = EUCORSJIS;
  154. }
  155. }
  156. }
  157. #ifdef DEBUG
  158. if (whatcode == ASCII)
  159. debug ("Kanji code not included.");
  160. else if (whatcode == EUCORSJIS)
  161. debug ("Kanji code not detected.");
  162. else
  163. debug ("Kanji code detected at %d byte.", i);
  164. #endif
  165. if (whatcode == EUCORSJIS && oldcode != ASCII)
  166. whatcode = oldcode;
  167. if (whatcode == EUCORSJIS)
  168. {
  169. if (getenv ("LC_ALL"))
  170. lang = getenv ("LC_ALL");
  171. else if (getenv ("LC_CTYPE"))
  172. lang = getenv ("LC_CTYPE");
  173. else if (getenv ("LANG"))
  174. lang = getenv ("LANG");
  175. if (lang)
  176. {
  177. if (strcmp (lang, "ja_JP.SJIS") == 0 ||
  178. #ifdef hpux
  179. strcmp (lang, "japanese") == 0 ||
  180. #endif
  181. strcmp (lang, "ja_JP.mscode") == 0 ||
  182. strcmp (lang, "ja_JP.PCK") == 0)
  183. whatcode = SJIS;
  184. else if (strncmp (lang, "ja", 2) == 0)
  185. #ifdef SJISPRE
  186. whatcode = SJIS;
  187. #else
  188. whatcode = EUC;
  189. #endif
  190. }
  191. }
  192. if (whatcode == EUCORSJIS)
  193. #ifdef SJISPRE
  194. whatcode = SJIS;
  195. #else
  196. whatcode = EUC;
  197. #endif
  198. return whatcode;
  199. }
  /* SJIStoJIS() is sjis2jis() by Ken Lunde. */
  /*
   * Convert a Shift-JIS double-byte character to its JIS row/cell pair,
   * in place.
   *
   *   p1  in: SJIS lead byte   out: JIS first byte
   *   p2  in: SJIS trail byte  out: JIS second byte
   *
   * Only the low 8 bits of *p1 and *p2 are used to choose the offsets.
   * The row is computed with a multiplication rather than a left shift:
   * for a lead byte smaller than the row offset the intermediate value is
   * negative, and left-shifting a negative int is undefined behaviour in C.
   */
  static void
  SJIStoJIS (int *p1, int *p2)
  {
    const unsigned char c1 = (unsigned char) *p1;
    const unsigned char c2 = (unsigned char) *p2;
    /* Trail bytes below 159 belong to the first (odd) row of the pair. */
    const int adjust = c2 < 159;
    const int rowOffset = c1 < 160 ? 112 : 176;
    /* The trail-byte range skips 127, hence the extra 1 above it. */
    const int cellOffset = adjust ? (31 + (c2 > 127)) : 126;

    *p1 = ((c1 - rowOffset) * 2) - adjust;
    *p2 -= cellOffset;
  }
  /* han2zen() was derived from han2zen() written by Ken Lunde. */
  /* Half-width kana codes that can take a dakuten / handakuten mark. */
  #define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
  #define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)
  /*
   * Convert a half-width katakana code to the two bytes of the matching
   * full-width Shift-JIS character, in place.
   *
   *   p1  in: half-width kana code (161..223)   out: SJIS lead byte
   *   p2  in: 222 (dakuten), 223 (handakuten)   out: SJIS trail byte
   *           or anything else for "no mark"
   *
   * When *p2 is a mark and *p1 is a kana that can carry it, the voiced or
   * semi-voiced full-width character is produced.
   *
   * If *p1 is outside 161..223 there is no table entry for it; the function
   * then returns without touching *p1 or *p2 instead of indexing outside
   * the table.
   */
  static void
  han2zen (int *p1, int *p2)
  {
    /* Indexed by (half-width code - 161); each entry is {lead, trail}. */
    static const int mtable[][2] = {
      {129, 66}, {129, 117}, {129, 118}, {129, 65},
      {129, 69}, {131, 146}, {131, 64}, {131, 66},
      {131, 68}, {131, 70}, {131, 72}, {131, 131},
      {131, 133}, {131, 135}, {131, 98}, {129, 91},
      {131, 65}, {131, 67}, {131, 69}, {131, 71},
      {131, 73}, {131, 74}, {131, 76}, {131, 78},
      {131, 80}, {131, 82}, {131, 84}, {131, 86},
      {131, 88}, {131, 90}, {131, 92}, {131, 94},
      {131, 96}, {131, 99}, {131, 101}, {131, 103},
      {131, 105}, {131, 106}, {131, 107}, {131, 108},
      {131, 109}, {131, 110}, {131, 113}, {131, 116},
      {131, 119}, {131, 122}, {131, 125}, {131, 126},
      {131, 128}, {131, 129}, {131, 130}, {131, 132},
      {131, 134}, {131, 136}, {131, 137}, {131, 138},
      {131, 139}, {131, 140}, {131, 141}, {131, 143},
      {131, 147}, {129, 74}, {129, 75}
    };
    const int c = *p1;
    int daku;
    int handaku;

    /* No table entry: leave the caller's values alone. */
    if (c < 161 || c > 223)
      return;

    daku = (*p2 == 222 && IS_DAKU (c));		/* Daku-ten */
    handaku = (!daku && *p2 == 223 && IS_HANDAKU (c));	/* Han-daku-ten */

    *p1 = mtable[c - 161][0];
    *p2 = mtable[c - 161][1];

    if (daku)
      {
        /* The voiced form is the next code point for the K/S/T and H rows. */
        if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
          (*p2)++;
        /* Of the kana accepted by IS_DAKU only code 179 (trail 69) gets
           here; it maps to trail byte 148. */
        else if (*p2 == 131 || *p2 == 69)
          *p2 = 148;
      }
    else if (handaku && *p2 >= 110 && *p2 <= 122)
      {
        /* The semi-voiced form is two code points up in the H row. */
        (*p2) += 2;
      }
  }
  303. /* Recast strcpy to handle unsigned chars used below. */
  304. #define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))
  305. static void
  306. do_convert (unsigned char *to, unsigned char *from, const char *code)
  307. {
  308. #ifdef HAVE_ICONV
  309. iconv_t cd;
  310. size_t from_len, to_len;
  311. if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
  312. {
  313. error ("iconv_open() error");
  314. #ifdef HAVE_ERRNO_H
  315. if (errno == EINVAL)
  316. error ("invalid code specification: \"%s\" or \"%s\"",
  317. EUCSTR, code);
  318. #endif
  319. strcpy ((char *) to, (const char *) from);
  320. return;
  321. }
  322. from_len = strlen ((const char *) from) + 1;
  323. to_len = BUFSIZ;
  324. if ((int) iconv(cd, (char **) &from, &from_len, (char **) &to, &to_len) == -1)
  325. {
  326. #ifdef HAVE_ERRNO_H
  327. if (errno == EINVAL)
  328. error ("invalid end of input string");
  329. else if (errno == EILSEQ)
  330. error ("invalid code in input string");
  331. else if (errno == E2BIG)
  332. error ("output buffer overflow at do_convert()");
  333. else
  334. #endif
  335. error ("something happen");
  336. strcpy ((char *) to, (const char *) from);
  337. return;
  338. }
  339. if (iconv_close (cd) != 0)
  340. {
  341. error ("iconv_close() error");
  342. }
  343. #else
  344. int p1, p2, i, j;
  345. int jisx0208 = FALSE;
  346. int hankaku = FALSE;
  347. j = 0;
  348. if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
  349. {
  350. for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
  351. {
  352. if (from[i] == ESC)
  353. {
  354. i++;
  355. if (from[i] == '$')
  356. {
  357. jisx0208 = TRUE;
  358. hankaku = FALSE;
  359. i++;
  360. }
  361. else if (from[i] == '(')
  362. {
  363. jisx0208 = FALSE;
  364. i++;
  365. if (from[i] == 'I') /* Hankaku Kana */
  366. hankaku = TRUE;
  367. else
  368. hankaku = FALSE;
  369. }
  370. }
  371. else
  372. {
  373. if (jisx0208)
  374. to[j++] = from[i] + 128;
  375. else if (hankaku)
  376. {
  377. to[j++] = SS2;
  378. to[j++] = from[i] + 128;
  379. }
  380. else
  381. to[j++] = from[i];
  382. }
  383. }
  384. }
  385. else if (strcmp (code, SJISSTR) == 0)
  386. {
  387. for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
  388. {
  389. p1 = from[i];
  390. if (p1 < 127)
  391. to[j++] = p1;
  392. else if ((p1 >= 161) && (p1 <= 223))
  393. { /* Hankaku Kana */
  394. to[j++] = SS2;
  395. to[j++] = p1;
  396. }
  397. else
  398. {
  399. p2 = from[++i];
  400. SJIStoJIS (&p1, &p2);
  401. to[j++] = p1 + 128;
  402. to[j++] = p2 + 128;
  403. }
  404. }
  405. }
  406. else
  407. {
  408. error ("invalid code specification: \"%s\"", code);
  409. return;
  410. }
  411. if (j >= BUFSIZ)
  412. {
  413. error ("output buffer overflow at do_convert()");
  414. ustrcpy (to, from);
  415. }
  416. else
  417. to[j] = '\0';
  418. #endif /* HAVE_ICONV */
  419. }
  420. static int
  421. do_check_and_conv (unsigned char *to, unsigned char *from)
  422. {
  423. static unsigned char tmp[BUFSIZ];
  424. int p1, p2, i, j;
  425. int kanji = TRUE;
  426. switch (DetectKanjiCode (from))
  427. {
  428. case NEW:
  429. debug ("Kanji code is New JIS.");
  430. do_convert (tmp, from, NEWJISSTR);
  431. break;
  432. case OLD:
  433. debug ("Kanji code is Old JIS.");
  434. do_convert (tmp, from, OLDJISSTR);
  435. break;
  436. case ESCI:
  437. debug ("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
  438. do_convert (tmp, from, NEWJISSTR);
  439. break;
  440. case NEC:
  441. debug ("Kanji code is NEC Kanji.");
  442. error ("cannot convert NEC Kanji.");
  443. ustrcpy (tmp, from);
  444. kanji = FALSE;
  445. break;
  446. case EUC:
  447. debug ("Kanji code is EUC.");
  448. ustrcpy (tmp, from);
  449. break;
  450. case SJIS:
  451. debug ("Kanji code is SJIS.");
  452. do_convert (tmp, from, SJISSTR);
  453. break;
  454. case EUCORSJIS:
  455. debug ("Kanji code is EUC or SJIS.");
  456. ustrcpy (tmp, from);
  457. kanji = FALSE;
  458. break;
  459. case ASCII:
  460. debug ("This is ASCII string.");
  461. ustrcpy (tmp, from);
  462. kanji = FALSE;
  463. break;
  464. default:
  465. debug ("This string includes unknown code.");
  466. ustrcpy (tmp, from);
  467. kanji = FALSE;
  468. break;
  469. }
  470. /* Hankaku Kana ---> Zenkaku Kana */
  471. if (kanji)
  472. {
  473. j = 0;
  474. for (i = 0; tmp[i] != '\0' && j < BUFSIZ; i++)
  475. {
  476. if (tmp[i] == SS2)
  477. {
  478. p1 = tmp[++i];
  479. if (tmp[i + 1] == SS2)
  480. {
  481. p2 = tmp[i + 2];
  482. if (p2 == 222 || p2 == 223)
  483. i += 2;
  484. else
  485. p2 = 0;
  486. }
  487. else
  488. p2 = 0;
  489. han2zen (&p1, &p2);
  490. SJIStoJIS (&p1, &p2);
  491. to[j++] = p1 + 128;
  492. to[j++] = p2 + 128;
  493. }
  494. else
  495. to[j++] = tmp[i];
  496. }
  497. if (j >= BUFSIZ)
  498. {
  499. error ("output buffer overflow at Hankaku --> Zenkaku");
  500. ustrcpy (to, tmp);
  501. }
  502. else
  503. to[j] = '\0';
  504. }
  505. else
  506. ustrcpy (to, tmp);
  507. return kanji;
  508. }
  /*
   * Convert `src` (JIS, Shift-JIS, EUC-JP or ASCII; detected automatically)
   * to EUC-JP with full-width kana and store the result in `dest`.
   *
   *   dest      output buffer
   *   src       NUL-terminated input, must be shorter than BUFSIZ bytes
   *   dest_max  size of `dest` in bytes, at most BUFSIZ
   *
   * Returns the value of do_check_and_conv() (TRUE if the text was treated
   * as Kanji, FALSE if it was copied unchanged), or -1 on error.
   *
   * On the "output buffer overflow" error the unconverted input is copied
   * to `dest` when it fits; otherwise `dest` is set to the empty string
   * (or left alone when dest_max is 0).  `dest` is never written beyond
   * dest_max bytes.  On the other errors `dest` is not touched.
   */
  int
  any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
  {
    static unsigned char tmp_dest[BUFSIZ];
    int ret;

    if (dest == NULL || src == NULL)
      {
        error ("NULL argument");
        return -1;
      }
    if (strlen ((const char *) src) >= BUFSIZ)
      {
        error ("input string too large");
        return -1;
      }
    if (dest_max > BUFSIZ)
      {
        error ("invalid maximum size of destination\nit should be less than %d.", BUFSIZ);
        return -1;
      }

    ret = do_check_and_conv (tmp_dest, src);

    if (strlen ((const char *) tmp_dest) >= dest_max)
      {
        error ("output buffer overflow");
        /* Fall back to the raw input only if that fits either. */
        if (strlen ((const char *) src) < dest_max)
          ustrcpy (dest, src);
        else if (dest_max > 0)
          dest[0] = '\0';
        return -1;
      }

    ustrcpy (dest, tmp_dest);
    return ret;
  }
  #if 0
  /*
   * Length in bytes of `s` after conversion by any2eucjp().
   * Returns 0 when the scratch buffer cannot be allocated or the conversion
   * fails (the buffer contents are not defined in that case).
   */
  unsigned int
  strwidth (unsigned char *s)
  {
    unsigned char *t;
    unsigned int i = 0;

    t = (unsigned char *) gdMalloc (BUFSIZ);
    if (t == NULL)
      return 0;
    if (any2eucjp (t, s, BUFSIZ) != -1)
      i = (unsigned int) strlen ((const char *) t);
    gdFree (t);
    return i;
  }

  #ifdef DEBUG
  /*
   * Manual test driver: reads one line from stdin, prints its length before
   * and after conversion, then prints the converted bytes.
   */
  int
  main (void)
  {
    unsigned char input[BUFSIZ];
    unsigned char *output;
    unsigned char *str;
    int c, i = 0;

    /* Keep one byte for the terminator and stop at end of file as well. */
    while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
      input[i++] = (unsigned char) c;
    input[i] = '\0';

    printf ("input : %u bytes\n", (unsigned int) strlen ((const char *) input));
    printf ("output: %u bytes\n", strwidth (input));

    output = (unsigned char *) gdMalloc (BUFSIZ);
    if (output == NULL)
      return 1;
    if (any2eucjp (output, input, BUFSIZ) == -1)
      {
        gdFree (output);
        return 1;
      }

    str = output;
    while (*str != '\0')
      putchar (*(str++));
    putchar ('\n');

    gdFree (output);
    return 0;
  }
  #endif
  #endif