html_table_gen.php 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813
  1. <?php
  2. /*
  3. +----------------------------------------------------------------------+
  4. | PHP Version 7 |
  5. +----------------------------------------------------------------------+
  6. | Copyright (c) 1997-2018 The PHP Group |
  7. +----------------------------------------------------------------------+
  8. | This source file is subject to version 3.01 of the PHP license, |
  9. | that is bundled with this package in the file LICENSE, and is |
  10. | available through the world-wide-web at the following url: |
  11. | http://www.php.net/license/3_01.txt |
  12. | If you did not receive a copy of the PHP license and are unable to |
  13. | obtain it through the world-wide-web, please send a note to |
  14. | license@php.net so we can mail you a copy immediately. |
  15. +----------------------------------------------------------------------+
  16. | Authors: Gustavo Lopes <cataphract@php.net> |
  17. +----------------------------------------------------------------------+
  18. */
  19. /* This file prints to stdout the contents of ext/standard/html_tables.h */
  20. /* put together with glue; have patience */
  21. $t = <<<CODE
  22. /*
  23. +----------------------------------------------------------------------+
  24. | PHP Version 7 |
  25. +----------------------------------------------------------------------+
  26. | Copyright (c) 1997-%s The PHP Group |
  27. +----------------------------------------------------------------------+
  28. | This source file is subject to version 3.01 of the PHP license, |
  29. | that is bundled with this package in the file LICENSE, and is |
  30. | available through the world-wide-web at the following url: |
  31. | http://www.php.net/license/3_01.txt |
  32. | If you did not receive a copy of the PHP license and are unable to |
  33. | obtain it through the world-wide-web, please send a note to |
  34. | license@php.net so we can mail you a copy immediately. |
  35. +----------------------------------------------------------------------+
  36. */
  37. #ifndef HTML_TABLES_H
  38. #define HTML_TABLES_H
  39. /**************************************************************************
  40. ***************************************************************************
  41. ** THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT. **
  42. ***************************************************************************
  43. ** Please change html_tables/html_table_gen.php instead and then **
  44. ** run it in order to generate this file **
  45. ***************************************************************************
  46. **************************************************************************/
  47. enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
  48. cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
  49. cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
  50. cs_numelems /* used to count the number of charsets */
  51. };
  52. #define CHARSET_UNICODE_COMPAT(cs) ((cs) <= cs_8859_1)
  53. #define CHARSET_SINGLE_BYTE(cs) ((cs) > cs_utf_8 && (cs) < cs_big5)
  54. #define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5)
  55. static const struct {
  56. const char *codeset;
  57. uint32_t codeset_len;
  58. enum entity_charset charset;
  59. } charset_map[] = {
  60. { "ISO-8859-1", sizeof("ISO-8859-1")-1, cs_8859_1 },
  61. { "ISO8859-1", sizeof("ISO8859-1")-1, cs_8859_1 },
  62. { "ISO-8859-15", sizeof("ISO-8859-15")-1, cs_8859_15 },
  63. { "ISO8859-15", sizeof("ISO8859-15")-1, cs_8859_15 },
  64. { "utf-8", sizeof("utf-8")-1, cs_utf_8 },
  65. { "cp1252", sizeof("cp1252")-1, cs_cp1252 },
  66. { "Windows-1252", sizeof("Windows-1252")-1, cs_cp1252 },
  67. { "1252", sizeof("1252")-1, cs_cp1252 },
  68. { "BIG5", sizeof("BIG5")-1, cs_big5 },
  69. { "950", sizeof("950")-1, cs_big5 },
  70. { "GB2312", sizeof("GB2312")-1, cs_gb2312 },
  71. { "936", sizeof("936")-1, cs_gb2312 },
  72. { "BIG5-HKSCS", sizeof("BIG5-HKSCS")-1, cs_big5hkscs },
  73. { "Shift_JIS", sizeof("Shift_JIS")-1, cs_sjis },
  74. { "SJIS", sizeof("SJIS")-1, cs_sjis },
  75. { "932", sizeof("932")-1, cs_sjis },
  76. { "SJIS-win", sizeof("SJIS-win")-1, cs_sjis },
  77. { "CP932", sizeof("CP932")-1, cs_sjis },
  78. { "EUCJP", sizeof("EUCJP")-1, cs_eucjp },
  79. { "EUC-JP", sizeof("EUC-JP")-1, cs_eucjp },
  80. { "eucJP-win", sizeof("eucJP-win")-1, cs_eucjp },
  81. { "KOI8-R", sizeof("KOI8-R")-1, cs_koi8r },
  82. { "koi8-ru", sizeof("koi8-ru")-1, cs_koi8r },
  83. { "koi8r", sizeof("koi8r")-1, cs_koi8r },
  84. { "cp1251", sizeof("cp1251")-1, cs_cp1251 },
  85. { "Windows-1251", sizeof("Windows-1251")-1, cs_cp1251 },
  86. { "win-1251", sizeof("win-1251")-1, cs_cp1251 },
  87. { "iso8859-5", sizeof("iso8859-5")-1, cs_8859_5 },
  88. { "iso-8859-5", sizeof("iso-8859-5")-1, cs_8859_5 },
  89. { "cp866", sizeof("cp866")-1, cs_cp866 },
  90. { "866", sizeof("866")-1, cs_cp866 },
  91. { "ibm866", sizeof("ibm866")-1, cs_cp866 },
  92. { "MacRoman", sizeof("MacRoman")-1, cs_macroman }
  93. };
  94. /* longest entity name length excluding & and ; */
  95. #define LONGEST_ENTITY_LENGTH 31
  96. /* Definitions for mappings *to* Unicode.
  97. * The origin charset must have at most 256 code points.
  98. * The multi-byte encodings are not supported */
  99. typedef struct {
  100. unsigned short uni_cp[64];
  101. } enc_to_uni_stage2;
  102. typedef struct {
  103. const enc_to_uni_stage2 *inner[4];
  104. } enc_to_uni;
  105. /* bits 7-8 bits (only single bytes encodings supported )*/
  106. #define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
  107. /* bits 1-6 */
  108. #define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)
  109. CODE;
  110. echo sprintf($t, date("Y"));
  111. $encodings = array(
  112. array(
  113. "ident" => "iso88591",
  114. "enumid" => 1,
  115. "name" => "ISO-8859-1",
  116. "file" => "mappings/8859-1.TXT",
  117. ),
  118. array(
  119. "ident" => "iso88595",
  120. "enumid" => 5,
  121. "name" => "ISO-8859-5",
  122. "file" => "mappings/8859-5.TXT",
  123. ),
  124. array(
  125. "ident" => "iso885915",
  126. "enumid" => 3,
  127. "name" => "ISO-8859-15",
  128. "file" => "mappings/8859-15.TXT",
  129. ),
  130. array(
  131. "ident" => "win1252",
  132. "enumid" => 2,
  133. "enumident" => "cp1252",
  134. "name" => "Windows-1252",
  135. "file" => "mappings/CP1252.TXT",
  136. ),
  137. array(
  138. "ident" => "win1251",
  139. "enumid" => 4,
  140. "enumident" => "cp1252",
  141. "name" => "Windows-1251",
  142. "file" => "mappings/CP1251.TXT",
  143. ),
  144. array(
  145. "ident" => "koi8r",
  146. "enumid" => 8,
  147. "name" => "KOI8-R",
  148. "file" => "mappings/KOI8-R.TXT",
  149. ),
  150. array(
  151. "ident" => "cp866",
  152. "enumid" => 6,
  153. "name" => "CP-866",
  154. "file" => "mappings/CP866.TXT",
  155. ),
  156. array(
  157. "ident" => "macroman",
  158. "enumid" => 7,
  159. "name" => "MacRoman",
  160. "file" => "mappings/ROMAN.TXT",
  161. ),
  162. );
  163. $prevStage2 = array();
  164. foreach ($encodings as $e) {
  165. echo
  166. "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";
  167. /* process file */
  168. $map = array();
  169. $lines = explode("\n", file_get_contents($e{'file'}));
  170. foreach ($lines as $l) {
  171. if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
  172. $map[] = array($matches[1], $matches[2]);
  173. }
  174. $mappy = array();
  175. foreach ($map as $v) { $mappy[hexdec($v[0])] = hexdec($v[1]); }
  176. $mstable = array("ident" => $e['ident']);
  177. /* calculate two-stage tables */
  178. for ($i = 0; $i < 4; $i++) {
  179. for ($j = 0; $j < 64; $j++) {
  180. $cp = $i << 6 | $j;
  181. $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
  182. }
  183. }
  184. echo
  185. "/* {{{ Stage 2 tables for {$e['name']} */\n\n";
  186. $s2tables_idents = array();
  187. for ($i = 0; $i < 4; $i++) {
  188. if (($t = array_keys($prevStage2, $mstable[$i])) !== array()) {
  189. $s2tables_idents[$i] = $encodings[$t[0]/5]["ident"];
  190. continue;
  191. }
  192. $s2tables_idents[$i] = $e["ident"];
  193. echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$e['ident']}_".
  194. sprintf("%02X", $i << 6)." = { {\n";
  195. for ($j = 0; $j < 64; $j++) {
  196. if ($j == 0) echo "\t";
  197. elseif ($j % 6 == 0) echo "\n\t";
  198. else echo " ";
  199. if ($mstable[$i][$j] !== NULL)
  200. echo sprintf("0x%04X,", $mstable[$i][$j]);
  201. else
  202. echo "0xFFFF,"; /* special value; indicates no mapping */
  203. }
  204. echo "\n} };\n\n";
  205. $prevStage2[] = $mstable[$i];
  206. }
  207. echo
  208. "/* end of stage 2 tables for {$e['name']} }}} */\n\n";
  209. echo
  210. "/* {{{ Stage 1 table for {$e['name']} */\n";
  211. echo
  212. "static const enc_to_uni enc_to_uni_{$e['ident']} = { {
  213. \t&enc_to_uni_s2_{$s2tables_idents[0]}_00,
  214. \t&enc_to_uni_s2_{$s2tables_idents[1]}_40,
  215. \t&enc_to_uni_s2_{$s2tables_idents[2]}_80,
  216. \t&enc_to_uni_s2_{$s2tables_idents[3]}_C0 }
  217. };
  218. ";
  219. echo
  220. "/* end of stage 1 table for {$e['name']} }}} */\n\n";
  221. }
  222. $maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
  223. $a = range(0, $maxencnum);
  224. foreach ($encodings as $e) { $a[$e['enumid']] = $e['ident']; }
  225. echo
  226. "/* {{{ Index of tables for encoding conversion */
  227. static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";
  228. foreach ($a as $k => $v) {
  229. if (is_numeric($v))
  230. echo "\tNULL,\n";
  231. else
  232. echo "\t&enc_to_uni_$v,\n";
  233. }
  234. echo
  235. "};
  236. /* }}} */\n";
  237. $t = <<<CODE
  238. /* Definitions for mappings *from* Unicode */
  239. typedef struct {
  240. unsigned short un_code_point; /* we don't need bigger */
  241. unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
  242. } uni_to_enc;
  243. CODE;
  244. echo $t;
  245. $encodings = array(
  246. array(
  247. "ident" => "iso885915",
  248. "name" => "ISO-8859-15",
  249. "file" => "mappings/8859-15.TXT",
  250. "range" => array(0xA4, 0xBE),
  251. ),
  252. array(
  253. "ident" => "win1252",
  254. "name" => "Windows-1252",
  255. "file" => "mappings/CP1252.TXT",
  256. "range" => array(0x80, 0x9F),
  257. ),
  258. array(
  259. "ident" => "win1251",
  260. "name" => "Windows-1251",
  261. "file" => "mappings/CP1251.TXT",
  262. "range" => array(0x80, 0xFF),
  263. ),
  264. array(
  265. "ident" => "koi8r",
  266. "name" => "KOI8-R",
  267. "file" => "mappings/KOI8-R.TXT",
  268. "range" => array(0x80, 0xFF),
  269. ),
  270. array(
  271. "ident" => "cp866",
  272. "name" => "CP-866",
  273. "file" => "mappings/CP866.TXT",
  274. "range" => array(0x80, 0xFF),
  275. ),
  276. array(
  277. "ident" => "macroman",
  278. "name" => "MacRoman",
  279. "file" => "mappings/ROMAN.TXT",
  280. "range" => array(0x80, 0xFF),
  281. ),
  282. );
  283. foreach ($encodings as $e) {
  284. echo
  285. "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";
  286. /* process file */
  287. $map = array();
  288. $lines = explode("\n", file_get_contents($e{'file'}));
  289. foreach ($lines as $l) {
  290. if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
  291. $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
  292. }
  293. $mappy = array();
  294. foreach ($map as $v) {
  295. if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
  296. $mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
  297. }
  298. ksort($mappy);
  299. echo
  300. "static const uni_to_enc unimap_{$e['ident']}[] = {\n";
  301. foreach ($mappy as $k => $v) {
  302. echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
  303. $v[1], " */\n";
  304. }
  305. echo "};\n";
  306. echo
  307. "/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
  308. }
  309. $data = file_get_contents("ents_html5.txt");
  310. $pass2 = false;
  311. $name = "HTML5";
  312. $ident = "html5";
  313. again:
  314. $t = <<<'CODE'
  315. /* HTML 5 has many more named entities.
  316. * Some of them map to two unicode code points, not one.
  317. * We're going to use a three-stage table (with an extra one for the entities
  318. * with two code points). */
  319. #define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
  320. #define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
  321. #define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
  322. #define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))
  323. /* The default entity may be NULL. Binary search is still possible while
  324. is senseless as there are just two rows (see also find_entity_for_char()). */
  325. typedef union {
  326. struct {
  327. const char *default_entity;
  328. unsigned size; /* number of remaining entries in the table */
  329. unsigned short default_entity_len;
  330. } leading_entry;
  331. struct {
  332. const char *entity;
  333. unsigned second_cp; /* second code point */
  334. unsigned short entity_len;
  335. } normal_entry;
  336. } entity_multicodepoint_row;
  337. /* blocks of these should start at code points k where k % 0xFC0 == 0 */
  338. typedef struct {
  339. char ambiguous; /* if 0 look into entity */
  340. union {
  341. struct {
  342. const char *entity; /* may be NULL */
  343. unsigned short entity_len;
  344. } ent;
  345. const entity_multicodepoint_row *multicodepoint_table;
  346. } data;
  347. } entity_stage3_row;
  348. /* Calculate k & 0x3F Use as offset */
  349. typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */
  350. /* Calculate k & 0xFC0 >> 6. Use as offset */
  351. typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */
  352. /* For stage 1, Calculate k & 0xFFF000 >> 3*4.
  353. * If larger than 1D, we have no mapping. Otherwise lookup that index */
  354. typedef struct {
  355. const entity_stage1_row *ms_table;
  356. /* for tables with only basic entities, this member is to be accessed
  357. * directly for better performance: */
  358. const entity_stage3_row *table;
  359. } entity_table_opt;
  360. /* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */
  361. CODE;
  362. if (!$pass2)
  363. echo $t;
  364. $dp = array();
  365. foreach (explode("\n", $data) as $l) {
  366. if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
  367. //echo sprintf("\t{\"%-21s 1, 0x%05d},\n", $matches[1].",", $matches[2]);
  368. $dp[] = array($matches[1], $matches[2], $matches[3]);
  369. } else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
  370. $dp[] = array($matches[1], $matches[2]);
  371. }
  372. }
  373. $origdp = $dp;
  374. usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });
  375. $multicp_rows = array();
  376. foreach ($dp as $el) {
  377. if (count($el) == 3) {
  378. $multicp_rows[$el[1]] = array();
  379. }
  380. }
  381. foreach ($dp as $el) {
  382. if (key_exists($el[1], $multicp_rows)) {
  383. if (count($el) == 3)
  384. $multicp_rows[$el[1]][$el[2]] = $el[0];
  385. else
  386. $multicp_rows[$el[1]]["default"] = $el[0];
  387. }
  388. }
  389. if ($pass2 < 2)
  390. echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
  391. else
  392. echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";
  393. if (empty($multicp_rows))
  394. goto skip_multicp;
  395. ksort($multicp_rows);
  396. foreach ($multicp_rows as &$v) { ksort($v); }
  397. unset($v);
  398. echo
  399. "/* {{{ Start of double code point tables for $name */", "\n\n";
  400. foreach ($multicp_rows as $k => $v) {
  401. echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
  402. sprintf("%05s", $k), "[] = {", "\n";
  403. if (key_exists("default", $v)) {
  404. if ($v['default'] == 'GT') /* hack to make > translate to &gt; not GT; */
  405. $v['default'] = "gt";
  406. echo "\t{ {", sprintf("\"%-21s", $v["default"].'",'),
  407. "\t", sprintf("%02d", (count($v) - 1)), ",\t\t",
  408. sprintf("% 2d", strlen($v["default"])), '} },', "\n";
  409. } else {
  410. echo "\t{ {", sprintf("%-22s", 'NULL,'),
  411. "\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
  412. }
  413. unset($v["default"]);
  414. foreach ($v as $l => $w) {
  415. echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05s", $l), ",\t",
  416. sprintf("% 2d", strlen($w)), '} },', "\n";
  417. }
  418. echo "};\n";
  419. }
  420. echo "\n/* End of double code point tables }}} */", "\n\n";
  421. skip_multicp:
  422. if ($pass2 < 2)
  423. echo "/* {{{ Stage 3 Tables for $name */", "\n\n";
  424. $t = <<<CODE
  425. static const entity_stage3_row empty_stage3_table[] = {
  426. /* 64 elements */
  427. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  428. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  429. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  430. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  431. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  432. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  433. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  434. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  435. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  436. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  437. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  438. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  439. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  440. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  441. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  442. {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
  443. };
  444. CODE;
  445. if (!$pass2)
  446. echo $t;
  447. $mstable = array();
  448. foreach ($dp as $el) {
  449. $s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
  450. $s2 = (hexdec($el[1]) & 0xFC0) >> 6;
  451. $s3 = hexdec($el[1]) & 0x3F;
  452. if (key_exists($el[1], $multicp_rows)) {
  453. $mstable[$s1][$s2][$s3] = "";
  454. } else {
  455. $mstable[$s1][$s2][$s3] = $el[0];
  456. }
  457. }
  458. for ($i = 0; $i < 0x1E; $i++) {
  459. for ($k = 0; $k < 64; $k++) {
  460. $any3 = false;
  461. $col3 = array();
  462. for ($l = 0; $l < 64; $l++) {
  463. if (isset($mstable[$i][$k][$l])) {
  464. $any3 = true;
  465. $col3[$l] = $mstable[$i][$k][$l];
  466. } else {
  467. $col3[$l] = null;
  468. }
  469. }
  470. if ($any3) {
  471. echo "static const entity_stage3_row stage3_table_{$ident}_",
  472. sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
  473. foreach ($col3 as $y => $z) {
  474. if ($y == 0) echo "\t";
  475. elseif ($y % 4 == 0) echo "\n\t";
  476. else echo " ";
  477. if ($z === NULL)
  478. echo "{0, { {NULL, 0} } },";
  479. elseif ($z === "QUOT") /* hack to translate " into &quote;, not &QUOT; */
  480. echo "{0, { {\"quot\", 4} } },";
  481. elseif ($z !== "")
  482. echo "{0, { {\"$z\", ", strlen($z), "} } },";
  483. else
  484. echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
  485. ($i << 12) | ($k << 6) | $y ), ", 0} } },";
  486. }
  487. echo "\n};\n\n";
  488. }
  489. }
  490. }
  491. if ($pass2 < 2)
  492. echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
  493. if ($pass2 > 1)
  494. goto hashtables;
  495. echo
  496. "/* {{{ Stage 2 Tables for $name */", "\n\n";
  497. $t = <<<CODE
  498. static const entity_stage2_row empty_stage2_table[] = {
  499. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  500. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  501. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  502. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  503. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  504. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  505. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  506. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  507. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  508. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  509. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  510. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  511. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  512. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  513. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  514. empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
  515. };
  516. CODE;
  517. if (!$pass2)
  518. echo $t;
  519. for ($i = 0; $i < 0x1E; $i++) {
  520. $any = false;
  521. for ($k = 0; $k < 64; $k++) {
  522. if (isset($mstable[$i][$k]))
  523. $any = true;
  524. }
  525. if ($any) {
  526. echo "static const entity_stage2_row stage2_table_{$ident}_",
  527. sprintf("%02X000", $i), "[] = {\n";
  528. for ($k = 0; $k < 64; $k++) {
  529. if ($k == 0) echo "\t";
  530. elseif ($k % 4 == 0) echo "\n\t";
  531. else echo " ";
  532. if (isset($mstable[$i][$k])) {
  533. echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
  534. } else {
  535. echo "empty_stage3_table", ",";
  536. }
  537. }
  538. echo "\n};\n\n";
  539. }
  540. }
  541. echo
  542. "/* end of stage 2 tables for $name }}} */", "\n\n";
  543. echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
  544. for ($i = 0; $i < 0x1E; $i++) {
  545. if (isset($mstable[$i]))
  546. echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
  547. else
  548. echo "\tempty_stage2_table,\n";
  549. }
  550. echo "};\n\n";
  551. echo
  552. "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
  553. /* commented-out; this enabled binary search, which turned out to be
  554. * significantly slower than the hash tables for html 5 entities */
  555. //echo
  556. //"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
  557. //$t = <<<CODE
  558. //typedef struct {
  559. // const char *entity;
  560. // unsigned short entity_len;
  561. // unsigned int codepoint1;
  562. // unsigned int codepoint2;
  563. //} entity_cp_map;
  564. //
  565. //#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
  566. // ( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
  567. //
  568. //static const entity_cp_map html5_ent_cp_map[] = {
  569. //
  570. //CODE;
  571. //echo $t;
  572. //
  573. //$dp = $origdp;
  574. //usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
  575. // return $d==0?strcmp($a[0], $b[0]):$d; });
  576. //
  577. //$k = 0;
  578. //foreach ($dp as $o) {
  579. // if ($k == 0) echo "\t";
  580. // elseif ($k % 3 == 0) echo "\n\t";
  581. // else echo " ";
  582. // if (isset($o[2]))
  583. // echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
  584. // hexdec($o[1]), hexdec($o[2]));
  585. // else
  586. // echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
  587. // hexdec($o[1]));
  588. //
  589. // if (isset($o[2])) {
  590. // $entlen = strlen($o[0]) + 2;
  591. // $utf8len = strlen(
  592. // mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
  593. // if ($utf8len > $entlen*1.2) {
  594. // die("violated assumption for traverse_for_entities");
  595. // }
  596. // }
  597. //
  598. // $k++;
  599. //}
  600. //echo "\n};\n\n";
  601. //
  602. //echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
  603. //
  604. //echo
  605. //"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
  606. hashtables:
  607. echo
  608. "/* {{{ $name hash table for entity -> codepoint */", "\n\n";
  609. $t = <<<CODE
  610. typedef struct {
  611. const char *entity;
  612. unsigned short entity_len;
  613. unsigned int codepoint1;
  614. unsigned int codepoint2;
  615. } entity_cp_map;
  616. typedef const entity_cp_map *entity_ht_bucket;
  617. typedef struct {
  618. unsigned num_elems; /* power of 2 */
  619. const entity_ht_bucket *buckets; /* .num_elems elements */
  620. } entity_ht;
  621. static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };
  622. CODE;
  623. if (!$pass2)
  624. echo $t;
  625. function hashfun($str)
  626. {
  627. $hash = 5381;
  628. $nKeyLength = strlen($str);
  629. $pos = 0;
  630. for (; $nKeyLength > 0; $nKeyLength--) {
  631. $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos++]))
  632. & 0xFFFFFFFF;
  633. }
  634. return $hash;
  635. }
  636. $numelems = max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
  637. $mask = $numelems - 1;
  638. $hashes = array();
  639. foreach ($origdp as $e) {
  640. $hashes[hashfun($e[0]) & $mask][] = $e;
  641. if (isset($e[2])) {
  642. $entlen = strlen($e[0]) + 2;
  643. $utf8len = strlen(
  644. mb_convert_encoding("&#x{$e[1]};&#x{$e[2]};", "UTF-8", "HTML-ENTITIES"));
  645. if ($utf8len > $entlen*1.2) {
  646. die("violated assumption for traverse_for_entities");
  647. }
  648. }
  649. }
  650. for ($i = 0; $i < $numelems; $i++) {
  651. if (empty($hashes[$i]))
  652. continue;
  653. echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
  654. foreach ($hashes[$i] as $h) {
  655. if (isset($h[2])) {
  656. echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
  657. $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
  658. } else {
  659. echo sprintf(' {"%s", %d, 0x%05X, 0},',
  660. $h[0], strlen($h[0]), hexdec($h[1]));
  661. }
  662. }
  663. echo " {NULL, 0, 0, 0} };\n";
  664. }
  665. echo "\n";
  666. echo
  667. "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";
  668. for ($i = 0; $i < $numelems; $i++) {
  669. if ($i == 0) echo "\t";
  670. elseif ($i % 4 == 0) echo "\n\t";
  671. else echo " ";
  672. if (empty($hashes[$i]))
  673. echo "ht_bucket_empty,";
  674. else
  675. echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
  676. }
  677. echo "\n};\n\n";
  678. echo
  679. "static const entity_ht ent_ht_{$ident} = {
  680. ", sprintf("0x%X", $numelems), ",
  681. ht_buckets_{$ident}
  682. };\n\n";
  683. echo
  684. "/* end of $name hash table for entity -> codepoint }}} */\n\n";
  685. if (!$pass2) {
  686. $data = file_get_contents("ents_html401.txt");
  687. $pass2 = 1;
  688. $name = "HTML 4.01";
  689. $ident = "html4";
  690. goto again;
  691. } elseif ($pass2 == 1) {
  692. $data = file_get_contents("ents_basic.txt");
  693. $pass2 = 2;
  694. $name = "Basic entities (no apos)";
  695. $ident = "be_noapos";
  696. goto again;
  697. } elseif ($pass2 == 2) {
  698. $data = file_get_contents("ents_basic_apos.txt");
  699. $pass2 = 3;
  700. $name = "Basic entities (with apos)";
  701. $ident = "be_apos";
  702. goto again;
  703. }
  704. echo "#endif /* HTML_TABLES_H */\n";