123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813 |
- <?php
- $t = <<<CODE
- /*
- +----------------------------------------------------------------------+
- | PHP Version 7 |
- +----------------------------------------------------------------------+
- | Copyright (c) 1997-%s The PHP Group |
- +----------------------------------------------------------------------+
- | This source file is subject to version 3.01 of the PHP license, |
- | that is bundled with this package in the file LICENSE, and is |
- | available through the world-wide-web at the following url: |
- | http://www.php.net/license/3_01.txt |
- | If you did not receive a copy of the PHP license and are unable to |
- | obtain it through the world-wide-web, please send a note to |
- | license@php.net so we can mail you a copy immediately. |
- +----------------------------------------------------------------------+
- */
- #ifndef HTML_TABLES_H
- #define HTML_TABLES_H
- /**************************************************************************
- ***************************************************************************
- ** THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT. **
- ***************************************************************************
- ** Please change html_tables/html_table_gen.php instead and then **
- ** run it in order to generate this file **
- ***************************************************************************
- **************************************************************************/
- enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
- cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
- cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
- cs_numelems /* used to count the number of charsets */
- };
- #define CHARSET_UNICODE_COMPAT(cs) ((cs) <= cs_8859_1)
- #define CHARSET_SINGLE_BYTE(cs) ((cs) > cs_utf_8 && (cs) < cs_big5)
- #define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5)
- static const struct {
- const char *codeset;
- uint32_t codeset_len;
- enum entity_charset charset;
- } charset_map[] = {
- { "ISO-8859-1", sizeof("ISO-8859-1")-1, cs_8859_1 },
- { "ISO8859-1", sizeof("ISO8859-1")-1, cs_8859_1 },
- { "ISO-8859-15", sizeof("ISO-8859-15")-1, cs_8859_15 },
- { "ISO8859-15", sizeof("ISO8859-15")-1, cs_8859_15 },
- { "utf-8", sizeof("utf-8")-1, cs_utf_8 },
- { "cp1252", sizeof("cp1252")-1, cs_cp1252 },
- { "Windows-1252", sizeof("Windows-1252")-1, cs_cp1252 },
- { "1252", sizeof("1252")-1, cs_cp1252 },
- { "BIG5", sizeof("BIG5")-1, cs_big5 },
- { "950", sizeof("950")-1, cs_big5 },
- { "GB2312", sizeof("GB2312")-1, cs_gb2312 },
- { "936", sizeof("936")-1, cs_gb2312 },
- { "BIG5-HKSCS", sizeof("BIG5-HKSCS")-1, cs_big5hkscs },
- { "Shift_JIS", sizeof("Shift_JIS")-1, cs_sjis },
- { "SJIS", sizeof("SJIS")-1, cs_sjis },
- { "932", sizeof("932")-1, cs_sjis },
- { "SJIS-win", sizeof("SJIS-win")-1, cs_sjis },
- { "CP932", sizeof("CP932")-1, cs_sjis },
- { "EUCJP", sizeof("EUCJP")-1, cs_eucjp },
- { "EUC-JP", sizeof("EUC-JP")-1, cs_eucjp },
- { "eucJP-win", sizeof("eucJP-win")-1, cs_eucjp },
- { "KOI8-R", sizeof("KOI8-R")-1, cs_koi8r },
- { "koi8-ru", sizeof("koi8-ru")-1, cs_koi8r },
- { "koi8r", sizeof("koi8r")-1, cs_koi8r },
- { "cp1251", sizeof("cp1251")-1, cs_cp1251 },
- { "Windows-1251", sizeof("Windows-1251")-1, cs_cp1251 },
- { "win-1251", sizeof("win-1251")-1, cs_cp1251 },
- { "iso8859-5", sizeof("iso8859-5")-1, cs_8859_5 },
- { "iso-8859-5", sizeof("iso-8859-5")-1, cs_8859_5 },
- { "cp866", sizeof("cp866")-1, cs_cp866 },
- { "866", sizeof("866")-1, cs_cp866 },
- { "ibm866", sizeof("ibm866")-1, cs_cp866 },
- { "MacRoman", sizeof("MacRoman")-1, cs_macroman }
- };
- /* longest entity name length excluding & and ; */
- #define LONGEST_ENTITY_LENGTH 31
- /* Definitions for mappings *to* Unicode.
- * The origin charset must have at most 256 code points.
- * The multi-byte encodings are not supported */
- typedef struct {
- unsigned short uni_cp[64];
- } enc_to_uni_stage2;
- typedef struct {
- const enc_to_uni_stage2 *inner[4];
- } enc_to_uni;
- /* bits 7-8 bits (only single bytes encodings supported )*/
- #define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
- /* bits 1-6 */
- #define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)
- CODE;
- echo sprintf($t, date("Y"));
- $encodings = array(
- array(
- "ident" => "iso88591",
- "enumid" => 1,
- "name" => "ISO-8859-1",
- "file" => "mappings/8859-1.TXT",
- ),
- array(
- "ident" => "iso88595",
- "enumid" => 5,
- "name" => "ISO-8859-5",
- "file" => "mappings/8859-5.TXT",
- ),
- array(
- "ident" => "iso885915",
- "enumid" => 3,
- "name" => "ISO-8859-15",
- "file" => "mappings/8859-15.TXT",
- ),
- array(
- "ident" => "win1252",
- "enumid" => 2,
- "enumident" => "cp1252",
- "name" => "Windows-1252",
- "file" => "mappings/CP1252.TXT",
- ),
- array(
- "ident" => "win1251",
- "enumid" => 4,
- "enumident" => "cp1252",
- "name" => "Windows-1251",
- "file" => "mappings/CP1251.TXT",
- ),
- array(
- "ident" => "koi8r",
- "enumid" => 8,
- "name" => "KOI8-R",
- "file" => "mappings/KOI8-R.TXT",
- ),
- array(
- "ident" => "cp866",
- "enumid" => 6,
- "name" => "CP-866",
- "file" => "mappings/CP866.TXT",
- ),
- array(
- "ident" => "macroman",
- "enumid" => 7,
- "name" => "MacRoman",
- "file" => "mappings/ROMAN.TXT",
- ),
- );
- $prevStage2 = array();
- foreach ($encodings as $e) {
- echo
- "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";
-
- $map = array();
- $lines = explode("\n", file_get_contents($e{'file'}));
- foreach ($lines as $l) {
- if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
- $map[] = array($matches[1], $matches[2]);
- }
- $mappy = array();
- foreach ($map as $v) { $mappy[hexdec($v[0])] = hexdec($v[1]); }
- $mstable = array("ident" => $e['ident']);
-
- for ($i = 0; $i < 4; $i++) {
- for ($j = 0; $j < 64; $j++) {
- $cp = $i << 6 | $j;
- $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
- }
- }
- echo
- "/* {{{ Stage 2 tables for {$e['name']} */\n\n";
- $s2tables_idents = array();
- for ($i = 0; $i < 4; $i++) {
- if (($t = array_keys($prevStage2, $mstable[$i])) !== array()) {
- $s2tables_idents[$i] = $encodings[$t[0]/5]["ident"];
- continue;
- }
- $s2tables_idents[$i] = $e["ident"];
- echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$e['ident']}_".
- sprintf("%02X", $i << 6)." = { {\n";
- for ($j = 0; $j < 64; $j++) {
- if ($j == 0) echo "\t";
- elseif ($j % 6 == 0) echo "\n\t";
- else echo " ";
- if ($mstable[$i][$j] !== NULL)
- echo sprintf("0x%04X,", $mstable[$i][$j]);
- else
- echo "0xFFFF,";
- }
- echo "\n} };\n\n";
- $prevStage2[] = $mstable[$i];
- }
- echo
- "/* end of stage 2 tables for {$e['name']} }}} */\n\n";
- echo
- "/* {{{ Stage 1 table for {$e['name']} */\n";
- echo
- "static const enc_to_uni enc_to_uni_{$e['ident']} = { {
- \t&enc_to_uni_s2_{$s2tables_idents[0]}_00,
- \t&enc_to_uni_s2_{$s2tables_idents[1]}_40,
- \t&enc_to_uni_s2_{$s2tables_idents[2]}_80,
- \t&enc_to_uni_s2_{$s2tables_idents[3]}_C0 }
- };
- ";
- echo
- "/* end of stage 1 table for {$e['name']} }}} */\n\n";
- }
- $maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
- $a = range(0, $maxencnum);
- foreach ($encodings as $e) { $a[$e['enumid']] = $e['ident']; }
- echo
- "/* {{{ Index of tables for encoding conversion */
- static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";
- foreach ($a as $k => $v) {
- if (is_numeric($v))
- echo "\tNULL,\n";
- else
- echo "\t&enc_to_uni_$v,\n";
- }
- echo
- "};
- /* }}} */\n";
- $t = <<<CODE
- /* Definitions for mappings *from* Unicode */
- typedef struct {
- unsigned short un_code_point; /* we don't need bigger */
- unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
- } uni_to_enc;
- CODE;
- echo $t;
- $encodings = array(
- array(
- "ident" => "iso885915",
- "name" => "ISO-8859-15",
- "file" => "mappings/8859-15.TXT",
- "range" => array(0xA4, 0xBE),
- ),
- array(
- "ident" => "win1252",
- "name" => "Windows-1252",
- "file" => "mappings/CP1252.TXT",
- "range" => array(0x80, 0x9F),
- ),
- array(
- "ident" => "win1251",
- "name" => "Windows-1251",
- "file" => "mappings/CP1251.TXT",
- "range" => array(0x80, 0xFF),
- ),
- array(
- "ident" => "koi8r",
- "name" => "KOI8-R",
- "file" => "mappings/KOI8-R.TXT",
- "range" => array(0x80, 0xFF),
- ),
- array(
- "ident" => "cp866",
- "name" => "CP-866",
- "file" => "mappings/CP866.TXT",
- "range" => array(0x80, 0xFF),
- ),
- array(
- "ident" => "macroman",
- "name" => "MacRoman",
- "file" => "mappings/ROMAN.TXT",
- "range" => array(0x80, 0xFF),
- ),
- );
- foreach ($encodings as $e) {
- echo
- "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";
-
- $map = array();
- $lines = explode("\n", file_get_contents($e{'file'}));
- foreach ($lines as $l) {
- if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
- $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
- }
- $mappy = array();
- foreach ($map as $v) {
- if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
- $mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
- }
- ksort($mappy);
- echo
- "static const uni_to_enc unimap_{$e['ident']}[] = {\n";
- foreach ($mappy as $k => $v) {
- echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
- $v[1], " */\n";
- }
- echo "};\n";
- echo
- "/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
- }
- $data = file_get_contents("ents_html5.txt");
- $pass2 = false;
- $name = "HTML5";
- $ident = "html5";
- again:
- $t = <<<'CODE'
- /* HTML 5 has many more named entities.
- * Some of them map to two unicode code points, not one.
- * We're going to use a three-stage table (with an extra one for the entities
- * with two code points). */
- #define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
- #define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
- #define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
- #define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))
- /* The default entity may be NULL. Binary search is still possible while
- is senseless as there are just two rows (see also find_entity_for_char()). */
- typedef union {
- struct {
- const char *default_entity;
- unsigned size; /* number of remaining entries in the table */
- unsigned short default_entity_len;
- } leading_entry;
- struct {
- const char *entity;
- unsigned second_cp; /* second code point */
- unsigned short entity_len;
- } normal_entry;
- } entity_multicodepoint_row;
- /* blocks of these should start at code points k where k % 0xFC0 == 0 */
- typedef struct {
- char ambiguous; /* if 0 look into entity */
- union {
- struct {
- const char *entity; /* may be NULL */
- unsigned short entity_len;
- } ent;
- const entity_multicodepoint_row *multicodepoint_table;
- } data;
- } entity_stage3_row;
- /* Calculate k & 0x3F Use as offset */
- typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */
- /* Calculate k & 0xFC0 >> 6. Use as offset */
- typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */
- /* For stage 1, Calculate k & 0xFFF000 >> 3*4.
- * If larger than 1D, we have no mapping. Otherwise lookup that index */
- typedef struct {
- const entity_stage1_row *ms_table;
- /* for tables with only basic entities, this member is to be accessed
- * directly for better performance: */
- const entity_stage3_row *table;
- } entity_table_opt;
- /* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */
- CODE;
- if (!$pass2)
- echo $t;
- $dp = array();
- foreach (explode("\n", $data) as $l) {
- if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
-
- $dp[] = array($matches[1], $matches[2], $matches[3]);
- } else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
- $dp[] = array($matches[1], $matches[2]);
- }
- }
- $origdp = $dp;
- usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });
- $multicp_rows = array();
- foreach ($dp as $el) {
- if (count($el) == 3) {
- $multicp_rows[$el[1]] = array();
- }
- }
- foreach ($dp as $el) {
- if (key_exists($el[1], $multicp_rows)) {
- if (count($el) == 3)
- $multicp_rows[$el[1]][$el[2]] = $el[0];
- else
- $multicp_rows[$el[1]]["default"] = $el[0];
- }
- }
- if ($pass2 < 2)
- echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
- else
- echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";
- if (empty($multicp_rows))
- goto skip_multicp;
- ksort($multicp_rows);
- foreach ($multicp_rows as &$v) { ksort($v); }
- unset($v);
- echo
- "/* {{{ Start of double code point tables for $name */", "\n\n";
- foreach ($multicp_rows as $k => $v) {
- echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
- sprintf("%05s", $k), "[] = {", "\n";
- if (key_exists("default", $v)) {
- if ($v['default'] == 'GT')
- $v['default'] = "gt";
- echo "\t{ {", sprintf("\"%-21s", $v["default"].'",'),
- "\t", sprintf("%02d", (count($v) - 1)), ",\t\t",
- sprintf("% 2d", strlen($v["default"])), '} },', "\n";
- } else {
- echo "\t{ {", sprintf("%-22s", 'NULL,'),
- "\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
- }
- unset($v["default"]);
- foreach ($v as $l => $w) {
- echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05s", $l), ",\t",
- sprintf("% 2d", strlen($w)), '} },', "\n";
- }
- echo "};\n";
- }
- echo "\n/* End of double code point tables }}} */", "\n\n";
- skip_multicp:
- if ($pass2 < 2)
- echo "/* {{{ Stage 3 Tables for $name */", "\n\n";
- $t = <<<CODE
- static const entity_stage3_row empty_stage3_table[] = {
- /* 64 elements */
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
- };
- CODE;
- if (!$pass2)
- echo $t;
- $mstable = array();
- foreach ($dp as $el) {
- $s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
- $s2 = (hexdec($el[1]) & 0xFC0) >> 6;
- $s3 = hexdec($el[1]) & 0x3F;
- if (key_exists($el[1], $multicp_rows)) {
- $mstable[$s1][$s2][$s3] = "";
- } else {
- $mstable[$s1][$s2][$s3] = $el[0];
- }
- }
- for ($i = 0; $i < 0x1E; $i++) {
- for ($k = 0; $k < 64; $k++) {
- $any3 = false;
- $col3 = array();
- for ($l = 0; $l < 64; $l++) {
- if (isset($mstable[$i][$k][$l])) {
- $any3 = true;
- $col3[$l] = $mstable[$i][$k][$l];
- } else {
- $col3[$l] = null;
- }
- }
- if ($any3) {
- echo "static const entity_stage3_row stage3_table_{$ident}_",
- sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
- foreach ($col3 as $y => $z) {
- if ($y == 0) echo "\t";
- elseif ($y % 4 == 0) echo "\n\t";
- else echo " ";
- if ($z === NULL)
- echo "{0, { {NULL, 0} } },";
- elseif ($z === "QUOT")
- echo "{0, { {\"quot\", 4} } },";
- elseif ($z !== "")
- echo "{0, { {\"$z\", ", strlen($z), "} } },";
- else
- echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
- ($i << 12) | ($k << 6) | $y ), ", 0} } },";
- }
- echo "\n};\n\n";
- }
- }
- }
- if ($pass2 < 2)
- echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
- if ($pass2 > 1)
- goto hashtables;
- echo
- "/* {{{ Stage 2 Tables for $name */", "\n\n";
- $t = <<<CODE
- static const entity_stage2_row empty_stage2_table[] = {
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
- };
- CODE;
- if (!$pass2)
- echo $t;
- for ($i = 0; $i < 0x1E; $i++) {
- $any = false;
- for ($k = 0; $k < 64; $k++) {
- if (isset($mstable[$i][$k]))
- $any = true;
- }
- if ($any) {
- echo "static const entity_stage2_row stage2_table_{$ident}_",
- sprintf("%02X000", $i), "[] = {\n";
- for ($k = 0; $k < 64; $k++) {
- if ($k == 0) echo "\t";
- elseif ($k % 4 == 0) echo "\n\t";
- else echo " ";
- if (isset($mstable[$i][$k])) {
- echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
- } else {
- echo "empty_stage3_table", ",";
- }
- }
- echo "\n};\n\n";
- }
- }
- echo
- "/* end of stage 2 tables for $name }}} */", "\n\n";
- echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
- for ($i = 0; $i < 0x1E; $i++) {
- if (isset($mstable[$i]))
- echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
- else
- echo "\tempty_stage2_table,\n";
- }
- echo "};\n\n";
- echo
- "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
- hashtables:
- echo
- "/* {{{ $name hash table for entity -> codepoint */", "\n\n";
- $t = <<<CODE
- typedef struct {
- const char *entity;
- unsigned short entity_len;
- unsigned int codepoint1;
- unsigned int codepoint2;
- } entity_cp_map;
- typedef const entity_cp_map *entity_ht_bucket;
- typedef struct {
- unsigned num_elems; /* power of 2 */
- const entity_ht_bucket *buckets; /* .num_elems elements */
- } entity_ht;
- static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };
- CODE;
- if (!$pass2)
- echo $t;
- function hashfun($str)
- {
- $hash = 5381;
- $nKeyLength = strlen($str);
- $pos = 0;
- for (; $nKeyLength > 0; $nKeyLength--) {
- $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos++]))
- & 0xFFFFFFFF;
- }
- return $hash;
- }
- $numelems = max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
- $mask = $numelems - 1;
- $hashes = array();
- foreach ($origdp as $e) {
- $hashes[hashfun($e[0]) & $mask][] = $e;
- if (isset($e[2])) {
- $entlen = strlen($e[0]) + 2;
- $utf8len = strlen(
- mb_convert_encoding("&#x{$e[1]};&#x{$e[2]};", "UTF-8", "HTML-ENTITIES"));
- if ($utf8len > $entlen*1.2) {
- die("violated assumption for traverse_for_entities");
- }
- }
- }
- for ($i = 0; $i < $numelems; $i++) {
- if (empty($hashes[$i]))
- continue;
- echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
- foreach ($hashes[$i] as $h) {
- if (isset($h[2])) {
- echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
- $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
- } else {
- echo sprintf(' {"%s", %d, 0x%05X, 0},',
- $h[0], strlen($h[0]), hexdec($h[1]));
- }
- }
- echo " {NULL, 0, 0, 0} };\n";
- }
- echo "\n";
- echo
- "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";
- for ($i = 0; $i < $numelems; $i++) {
- if ($i == 0) echo "\t";
- elseif ($i % 4 == 0) echo "\n\t";
- else echo " ";
- if (empty($hashes[$i]))
- echo "ht_bucket_empty,";
- else
- echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
- }
- echo "\n};\n\n";
- echo
- "static const entity_ht ent_ht_{$ident} = {
- ", sprintf("0x%X", $numelems), ",
- ht_buckets_{$ident}
- };\n\n";
- echo
- "/* end of $name hash table for entity -> codepoint }}} */\n\n";
- if (!$pass2) {
- $data = file_get_contents("ents_html401.txt");
- $pass2 = 1;
- $name = "HTML 4.01";
- $ident = "html4";
- goto again;
- } elseif ($pass2 == 1) {
- $data = file_get_contents("ents_basic.txt");
- $pass2 = 2;
- $name = "Basic entities (no apos)";
- $ident = "be_noapos";
- goto again;
- } elseif ($pass2 == 2) {
- $data = file_get_contents("ents_basic_apos.txt");
- $pass2 = 3;
- $name = "Basic entities (with apos)";
- $ident = "be_apos";
- goto again;
- }
- echo "#endif /* HTML_TABLES_H */\n";
|