utf8.c 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. /* $OpenBSD$ */
  2. /*
  3. * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
  4. *
  5. * Permission to use, copy, modify, and distribute this software for any
  6. * purpose with or without fee is hereby granted, provided that the above
  7. * copyright notice and this permission notice appear in all copies.
  8. *
  9. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  12. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13. * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  14. * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  15. * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. */
  17. #include <sys/types.h>
  18. #include <stdlib.h>
  19. #include <string.h>
  20. #include <wchar.h>
  21. #include "tmux.h"
  22. static int utf8_width(wchar_t);
  23. /* Set a single character. */
  24. void
  25. utf8_set(struct utf8_data *ud, u_char ch)
  26. {
  27. u_int i;
  28. *ud->data = ch;
  29. ud->have = 1;
  30. ud->size = 1;
  31. ud->width = 1;
  32. for (i = ud->size; i < sizeof ud->data; i++)
  33. ud->data[i] = '\0';
  34. }
  35. /* Copy UTF-8 character. */
  36. void
  37. utf8_copy(struct utf8_data *to, const struct utf8_data *from)
  38. {
  39. u_int i;
  40. memcpy(to, from, sizeof *to);
  41. for (i = to->size; i < sizeof to->data; i++)
  42. to->data[i] = '\0';
  43. }
  44. /*
  45. * Open UTF-8 sequence.
  46. *
  47. * 11000010-11011111 C2-DF start of 2-byte sequence
  48. * 11100000-11101111 E0-EF start of 3-byte sequence
  49. * 11110000-11110100 F0-F4 start of 4-byte sequence
  50. */
  51. enum utf8_state
  52. utf8_open(struct utf8_data *ud, u_char ch)
  53. {
  54. memset(ud, 0, sizeof *ud);
  55. if (ch >= 0xc2 && ch <= 0xdf)
  56. ud->size = 2;
  57. else if (ch >= 0xe0 && ch <= 0xef)
  58. ud->size = 3;
  59. else if (ch >= 0xf0 && ch <= 0xf4)
  60. ud->size = 4;
  61. else
  62. return (UTF8_ERROR);
  63. utf8_append(ud, ch);
  64. return (UTF8_MORE);
  65. }
  66. /* Append character to UTF-8, closing if finished. */
  67. enum utf8_state
  68. utf8_append(struct utf8_data *ud, u_char ch)
  69. {
  70. wchar_t wc;
  71. int width;
  72. if (ud->have >= ud->size)
  73. fatalx("UTF-8 character overflow");
  74. if (ud->size > sizeof ud->data)
  75. fatalx("UTF-8 character size too large");
  76. if (ud->have != 0 && (ch & 0xc0) != 0x80)
  77. ud->width = 0xff;
  78. ud->data[ud->have++] = ch;
  79. if (ud->have != ud->size)
  80. return (UTF8_MORE);
  81. if (ud->width == 0xff)
  82. return (UTF8_ERROR);
  83. if (utf8_combine(ud, &wc) != UTF8_DONE)
  84. return (UTF8_ERROR);
  85. if ((width = utf8_width(wc)) < 0)
  86. return (UTF8_ERROR);
  87. ud->width = width;
  88. return (UTF8_DONE);
  89. }
  90. /* Get width of Unicode character. */
  91. static int
  92. utf8_width(wchar_t wc)
  93. {
  94. int width;
  95. width = wcwidth(wc);
  96. if (width < 0 || width > 0xff)
  97. return (-1);
  98. return (width);
  99. }
  100. /* Combine UTF-8 into Unicode. */
  101. enum utf8_state
  102. utf8_combine(const struct utf8_data *ud, wchar_t *wc)
  103. {
  104. switch (mbtowc(wc, ud->data, ud->size)) {
  105. case -1:
  106. mbtowc(NULL, NULL, MB_CUR_MAX);
  107. return (UTF8_ERROR);
  108. case 0:
  109. return (UTF8_ERROR);
  110. default:
  111. return (UTF8_DONE);
  112. }
  113. }
  114. /* Split Unicode into UTF-8. */
  115. enum utf8_state
  116. utf8_split(wchar_t wc, struct utf8_data *ud)
  117. {
  118. char s[MB_LEN_MAX];
  119. int slen;
  120. slen = wctomb(s, wc);
  121. if (slen <= 0 || slen > (int)sizeof ud->data)
  122. return (UTF8_ERROR);
  123. memcpy(ud->data, s, slen);
  124. ud->size = slen;
  125. ud->width = utf8_width(wc);
  126. return (UTF8_DONE);
  127. }
  128. /*
  129. * Encode len characters from src into dst, which is guaranteed to have four
  130. * bytes available for each character from src (for \abc or UTF-8) plus space
  131. * for \0.
  132. */
  133. int
  134. utf8_strvis(char *dst, const char *src, size_t len, int flag)
  135. {
  136. struct utf8_data ud;
  137. const char *start, *end;
  138. enum utf8_state more;
  139. size_t i;
  140. start = dst;
  141. end = src + len;
  142. while (src < end) {
  143. if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
  144. while (++src < end && more == UTF8_MORE)
  145. more = utf8_append(&ud, *src);
  146. if (more == UTF8_DONE) {
  147. /* UTF-8 character finished. */
  148. for (i = 0; i < ud.size; i++)
  149. *dst++ = ud.data[i];
  150. continue;
  151. }
  152. /* Not a complete, valid UTF-8 character. */
  153. src -= ud.have;
  154. }
  155. if (src < end - 1)
  156. dst = vis(dst, src[0], flag, src[1]);
  157. else if (src < end)
  158. dst = vis(dst, src[0], flag, '\0');
  159. src++;
  160. }
  161. *dst = '\0';
  162. return (dst - start);
  163. }
  164. /*
  165. * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
  166. * the returned string. Anything not valid printable ASCII or UTF-8 is
  167. * stripped.
  168. */
  169. char *
  170. utf8_sanitize(const char *src)
  171. {
  172. char *dst;
  173. size_t n;
  174. enum utf8_state more;
  175. struct utf8_data ud;
  176. u_int i;
  177. dst = NULL;
  178. n = 0;
  179. while (*src != '\0') {
  180. dst = xreallocarray(dst, n + 1, sizeof *dst);
  181. if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
  182. while (*++src != '\0' && more == UTF8_MORE)
  183. more = utf8_append(&ud, *src);
  184. if (more == UTF8_DONE) {
  185. dst = xreallocarray(dst, n + ud.width,
  186. sizeof *dst);
  187. for (i = 0; i < ud.width; i++)
  188. dst[n++] = '_';
  189. continue;
  190. }
  191. src -= ud.have;
  192. }
  193. if (*src > 0x1f && *src < 0x7f)
  194. dst[n++] = *src;
  195. else
  196. dst[n++] = '_';
  197. src++;
  198. }
  199. dst = xreallocarray(dst, n + 1, sizeof *dst);
  200. dst[n] = '\0';
  201. return (dst);
  202. }
  203. /*
  204. * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
  205. * Caller frees.
  206. */
  207. struct utf8_data *
  208. utf8_fromcstr(const char *src)
  209. {
  210. struct utf8_data *dst;
  211. size_t n;
  212. enum utf8_state more;
  213. dst = NULL;
  214. n = 0;
  215. while (*src != '\0') {
  216. dst = xreallocarray(dst, n + 1, sizeof *dst);
  217. if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
  218. while (*++src != '\0' && more == UTF8_MORE)
  219. more = utf8_append(&dst[n], *src);
  220. if (more == UTF8_DONE) {
  221. n++;
  222. continue;
  223. }
  224. src -= dst[n].have;
  225. }
  226. utf8_set(&dst[n], *src);
  227. n++;
  228. src++;
  229. }
  230. dst = xreallocarray(dst, n + 1, sizeof *dst);
  231. dst[n].size = 0;
  232. return (dst);
  233. }
  234. /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
  235. char *
  236. utf8_tocstr(struct utf8_data *src)
  237. {
  238. char *dst;
  239. size_t n;
  240. dst = NULL;
  241. n = 0;
  242. for(; src->size != 0; src++) {
  243. dst = xreallocarray(dst, n + src->size, 1);
  244. memcpy(dst + n, src->data, src->size);
  245. n += src->size;
  246. }
  247. dst = xreallocarray(dst, n + 1, 1);
  248. dst[n] = '\0';
  249. return (dst);
  250. }
  251. /* Get width of UTF-8 string. */
  252. u_int
  253. utf8_cstrwidth(const char *s)
  254. {
  255. struct utf8_data tmp;
  256. u_int width;
  257. enum utf8_state more;
  258. width = 0;
  259. while (*s != '\0') {
  260. if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
  261. while (*++s != '\0' && more == UTF8_MORE)
  262. more = utf8_append(&tmp, *s);
  263. if (more == UTF8_DONE) {
  264. width += tmp.width;
  265. continue;
  266. }
  267. s -= tmp.have;
  268. }
  269. if (*s > 0x1f && *s != 0x7f)
  270. width++;
  271. s++;
  272. }
  273. return (width);
  274. }
  275. /* Trim UTF-8 string to width. Caller frees. */
  276. char *
  277. utf8_trimcstr(const char *s, u_int width)
  278. {
  279. struct utf8_data *tmp, *next;
  280. char *out;
  281. u_int at;
  282. tmp = utf8_fromcstr(s);
  283. at = 0;
  284. for (next = tmp; next->size != 0; next++) {
  285. if (at + next->width > width) {
  286. next->size = 0;
  287. break;
  288. }
  289. at += next->width;
  290. }
  291. out = utf8_tocstr(tmp);
  292. free(tmp);
  293. return (out);
  294. }
  295. /* Trim UTF-8 string to width. Caller frees. */
  296. char *
  297. utf8_rtrimcstr(const char *s, u_int width)
  298. {
  299. struct utf8_data *tmp, *next, *end;
  300. char *out;
  301. u_int at;
  302. tmp = utf8_fromcstr(s);
  303. for (end = tmp; end->size != 0; end++)
  304. /* nothing */;
  305. if (end == tmp) {
  306. free(tmp);
  307. return (xstrdup(""));
  308. }
  309. next = end - 1;
  310. at = 0;
  311. for (;;)
  312. {
  313. if (at + next->width > width) {
  314. next++;
  315. break;
  316. }
  317. at += next->width;
  318. if (next == tmp)
  319. break;
  320. next--;
  321. }
  322. out = utf8_tocstr(next);
  323. free(tmp);
  324. return (out);
  325. }
  326. /* Pad UTF-8 string to width. Caller frees. */
  327. char *
  328. utf8_padcstr(const char *s, u_int width)
  329. {
  330. size_t slen;
  331. char *out;
  332. u_int n, i;
  333. n = utf8_cstrwidth(s);
  334. if (n >= width)
  335. return (xstrdup(s));
  336. slen = strlen(s);
  337. out = xmalloc(slen + 1 + (width - n));
  338. memcpy(out, s, slen);
  339. for (i = n; i < width; i++)
  340. out[slen++] = ' ';
  341. out[slen] = '\0';
  342. return (out);
  343. }