<?php
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-2010 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Gustavo Lopes <cataphract@php.net>                          |
   +----------------------------------------------------------------------+
*/

/* This file prints to stdout the contents of ext/standard/html_tables.h */
/* put together with glue; have patience */

/*
 * Overview of what is printed, in order:
 *   1. a fixed C preamble (charset enum, charset name map, helper macros);
 *   2. for each single-byte encoding, two-stage tables mapping bytes *to*
 *      Unicode, plus an index of those tables keyed by the "enumid" value;
 *   3. for a subset of those encodings, sorted tables mapping *from* Unicode;
 *   4. four passes (selected through $pass2 and the "again" label) over the
 *      entity lists ents_html5.txt, ents_html401.txt, ents_basic.txt and
 *      ents_basic_apos.txt, each emitting codepoint -> entity tables and an
 *      entity -> codepoint hash table.
 *
 * All input paths are relative to the current working directory.
 */

/**
 * Reads a whole input file, aborting the run if it cannot be read.
 *
 * Without this check a missing mapping file would only raise a warning and
 * the script would go on to print tables with no mappings in them.
 *
 * @param string $path path of the file to read
 * @return string the file contents
 */
function read_input_file($path)
{
	$contents = file_get_contents($path);
	if ($contents === false) {
		fwrite(STDERR, "html_table_gen: unable to read input file '$path'\n");
		exit(1);
	}
	return $contents;
}

/**
 * Returns the number of bytes UTF-8 needs to encode a code point.
 *
 * Used instead of mb_convert_encoding(..., "HTML-ENTITIES"), which is
 * deprecated and would print a notice into the generated header.
 *
 * @param int $cp Unicode code point
 * @return int 1 to 4
 */
function utf8_encoded_length($cp)
{
	if ($cp < 0x80)
		return 1;
	if ($cp < 0x800)
		return 2;
	if ($cp < 0x10000)
		return 3;
	return 4;
}

/**
 * Hashes an entity name: starts at 5381 and, for every byte, computes
 * hash * 33 + byte, truncated to 32 bits. The bucket tables printed below
 * are laid out according to this value.
 *
 * @param string $str entity name
 * @return int hash in the range 0..0xFFFFFFFF
 */
function hashfun($str)
{
	$hash = 5381;
	$nKeyLength = strlen($str);
	$pos = 0;

	for (; $nKeyLength > 0; $nKeyLength--) {
		$hash = (int)(((int)(((int)($hash << 5)) + $hash))
			+ ord($str[$pos++])) & 0xFFFFFFFF;
	}

	return $hash;
}

$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

/* \$Id$ */

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		cs_8859_1 },
	{ "ISO8859-1",		cs_8859_1 },
	{ "ISO-8859-15",	cs_8859_15 },
	{ "ISO8859-15",		cs_8859_15 },
	{ "utf-8",			cs_utf_8 },
	{ "cp1252",			cs_cp1252 },
	{ "Windows-1252",	cs_cp1252 },
	{ "1252",			cs_cp1252 },
	{ "BIG5",			cs_big5 },
	{ "950",			cs_big5 },
	{ "GB2312",			cs_gb2312 },
	{ "936",			cs_gb2312 },
	{ "BIG5-HKSCS",		cs_big5hkscs },
	{ "Shift_JIS",		cs_sjis },
	{ "SJIS",			cs_sjis },
	{ "932",			cs_sjis },
	{ "SJIS-win",		cs_sjis },
	{ "CP932",			cs_sjis },
	{ "EUCJP",			cs_eucjp },
	{ "EUC-JP",			cs_eucjp },
	{ "eucJP-win",		cs_eucjp },
	{ "KOI8-R",			cs_koi8r },
	{ "koi8-ru",		cs_koi8r },
	{ "koi8r",			cs_koi8r },
	{ "cp1251",			cs_cp1251 },
	{ "Windows-1251",	cs_cp1251 },
	{ "win-1251",		cs_cp1251 },
	{ "iso8859-5",		cs_8859_5 },
	{ "iso-8859-5",		cs_8859_5 },
	{ "cp866",			cs_cp866 },
	{ "866",			cs_cp866 },
	{ "ibm866",			cs_cp866 },
	{ "MacRoman",		cs_macroman },
	{ NULL }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
	unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
	const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)



CODE;

/* the single %s in the template receives the current year */
echo sprintf($t, date("Y"));

/* "enumid" is the slot of the encoding in the emitted enc_to_uni_index[] */
$encodings = array(
	array(
		"ident" => "iso88591",
		"enumid" => 1,
		"name" => "ISO-8859-1",
		"file" => "mappings/8859-1.TXT",
	),
	array(
		"ident" => "iso88595",
		"enumid" => 5,
		"name" => "ISO-8859-5",
		"file" => "mappings/8859-5.TXT",
	),
	array(
		"ident" => "iso885915",
		"enumid" => 3,
		"name" => "ISO-8859-15",
		"file" => "mappings/8859-15.TXT",
	),
	array(
		"ident" => "win1252",
		"enumid" => 2,
		"enumident" => "cp1252",
		"name" => "Windows-1252",
		"file" => "mappings/CP1252.TXT",
	),
	array(
		"ident" => "win1251",
		"enumid" => 4,
		"enumident" => "cp1251",
		"name" => "Windows-1251",
		"file" => "mappings/CP1251.TXT",
	),
	array(
		"ident" => "koi8r",
		"enumid" => 8,
		"name" => "KOI8-R",
		"file" => "mappings/KOI8-R.TXT",
	),
	array(
		"ident" => "cp866",
		"enumid" => 6,
		"name" => "CP-866",
		"file" => "mappings/CP866.TXT",
	),
	array(
		"ident" => "macroman",
		"enumid" => 7,
		"name" => "MacRoman",
		"file" => "mappings/ROMAN.TXT",
	),
);

/* Stage 2 tables already printed, and (same index) the C symbol each one was
 * printed under. An encoding whose 64-entry block is identical to an earlier
 * one points at that earlier symbol instead of printing a duplicate. */
$prevStage2 = array();
$prevStage2Names = array();

foreach ($encodings as $e) {
	echo "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

	/* process file */
	$map = array();
	$lines = explode("\n", read_input_file($e['file']));
	foreach ($lines as $l) {
		if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
			$map[] = array($matches[1], $matches[2]);
	}

	$mappy = array();
	foreach ($map as $v) {
		$mappy[hexdec($v[0])] = hexdec($v[1]);
	}

	$mstable = array("ident" => $e['ident']);
	/* calculate two-stage tables */
	for ($i = 0; $i < 4; $i++) {
		for ($j = 0; $j < 64; $j++) {
			$cp = $i << 6 | $j;
			$mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
		}
	}

	echo "/* {{{ Stage 2 tables for {$e['name']} */\n\n";

	$s2tables_names = array();
	for ($i = 0; $i < 4; $i++) {
		/* strict search: an unmapped (NULL) entry must not match 0x0000 */
		$known = array_search($mstable[$i], $prevStage2, true);
		if ($known !== false) {
			$s2tables_names[$i] = $prevStage2Names[$known];
			continue;
		}

		$s2tables_names[$i] = "enc_to_uni_s2_{$e['ident']}_" .
			sprintf("%02X", $i << 6);

		echo "static const enc_to_uni_stage2 {$s2tables_names[$i]} = { {\n";
		for ($j = 0; $j < 64; $j++) {
			if ($j == 0) echo "\t";
			elseif ($j % 6 == 0) echo "\n\t";
			else echo " ";
			if ($mstable[$i][$j] !== NULL)
				echo sprintf("0x%04X,", $mstable[$i][$j]);
			else
				echo "0xFFFF,"; /* special value; indicates no mapping */
		}
		echo "\n} };\n\n";

		$prevStage2[] = $mstable[$i];
		$prevStage2Names[] = $s2tables_names[$i];
	}

	echo "/* end of stage 2 tables for {$e['name']} }}} */\n\n";

	echo "/* {{{ Stage 1 table for {$e['name']} */\n";

	echo "static const enc_to_uni enc_to_uni_{$e['ident']} = { {\n",
		"\t&{$s2tables_names[0]},\n",
		"\t&{$s2tables_names[1]},\n",
		"\t&{$s2tables_names[2]},\n",
		"\t&{$s2tables_names[3]}\n",
		"} };\n";

	echo "/* end of stage 1 table for {$e['name']} }}} */\n\n";
}

/* Index: slot "enumid" points at that encoding's stage 1 table; every slot
 * that no encoding claims keeps its numeric placeholder and is printed as
 * NULL. */
$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
$a = range(0, $maxencnum);
foreach ($encodings as $e) {
	$a[$e['enumid']] = $e['ident'];
}

echo "/* {{{ Index of tables for encoding conversion */\n",
	"static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($a as $k => $v) {
	if (is_numeric($v))
		echo "\tNULL,\n";
	else
		echo "\t&enc_to_uni_$v,\n";
}

echo "};\n",
	"/* }}} */\n";

$t = <<<CODE

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/* "range" is the inclusive span of encoding bytes that get a reverse entry */
$encodings = array(
	array(
		"ident" => "iso885915",
		"name" => "ISO-8859-15",
		"file" => "mappings/8859-15.TXT",
		"range" => array(0xA4, 0xBE),
	),
	array(
		"ident" => "win1252",
		"name" => "Windows-1252",
		"file" => "mappings/CP1252.TXT",
		"range" => array(0x80, 0x9F),
	),
	array(
		"ident" => "win1251",
		"name" => "Windows-1251",
		"file" => "mappings/CP1251.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "koi8r",
		"name" => "KOI8-R",
		"file" => "mappings/KOI8-R.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "cp866",
		"name" => "CP-866",
		"file" => "mappings/CP866.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "macroman",
		"name" => "MacRoman",
		"file" => "mappings/ROMAN.TXT",
		"range" => array(0x80, 0xFF),
	),
);

foreach ($encodings as $e) {
	echo "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

	/* process file */
	$map = array();
	$lines = explode("\n", read_input_file($e['file']));
	foreach ($lines as $l) {
		if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
			$map[] = array($matches[1], $matches[2], rtrim($matches[3]));
	}

	$mappy = array();
	foreach ($map as $v) {
		if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
			$mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
	}
	/* rows are printed in ascending order of Unicode code point */
	ksort($mappy);

	echo "static const uni_to_enc unimap_{$e['ident']}[] = {\n";

	foreach ($mappy as $k => $v) {
		echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
			$v[1], " */\n";
	}
	echo "};\n";

	/* closes the fold opened above (this used to print a second "{{{") */
	echo "/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}

/* $pass2 selects the entity list being processed: false = HTML 5,
 * 1 = HTML 4.01, 2 = basic entities without apos, 3 = basic entities with
 * apos. The code at the very end of the script advances it and jumps back
 * to the "again" label. */
$data = read_input_file("ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

$t = <<<'CODE'

/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* Table should be organized with a leading row telling the size of
 * the table and the default entity (maybe NULL) and the rest being
 * normal rows ordered by code point so that we can do a binary search */
typedef union {
	struct {
		unsigned size; /* number of remaining entries in the table */
		const char *default_entity;
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		unsigned second_cp; /* second code point */
		const char *entity;
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
	echo $t;

/* Parse the entity list: each entry is array(name, cp1) or, for entities
 * that stand for two code points, array(name, cp1, cp2); the code points
 * are kept as the hexadecimal text read from the file. */
$dp = array();

foreach (explode("\n", $data) as $l) {
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
		//echo sprintf("\t{\"%-21s 1, 0x%05d},\n", $matches[1].",", $matches[2]);
		$dp[] = array($matches[1], $matches[2], $matches[3]);
	} else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1], $matches[2]);
	}
}

/* $origdp keeps file order for the hash tables; $dp is ordered by cp1 */
$origdp = $dp;

usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });

/* first code point => (second code point => entity name); the "default"
 * key holds the entity that stands for the first code point on its own */
$multicp_rows = array();
foreach ($dp as $el) {
	if (count($el) == 3) {
		$multicp_rows[$el[1]] = array();
	}
}

foreach ($dp as $el) {
	if (array_key_exists($el[1], $multicp_rows)) {
		if (count($el) == 3)
			$multicp_rows[$el[1]][$el[2]] = $el[0];
		else
			$multicp_rows[$el[1]]["default"] = $el[0];
	}
}

if ($pass2 < 2)
	echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
	echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

if (empty($multicp_rows))
	goto skip_multicp;

ksort($multicp_rows);
foreach ($multicp_rows as &$v) {
	ksort($v);
}
unset($v);

echo "/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
	/* formatted exactly like the references printed from the stage 3
	 * tables ("%05X"), so definition and reference always agree */
	echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
		sprintf("%05X", hexdec($k)), "[] = {", "\n";
	if (array_key_exists("default", $v)) {
		if ($v['default'] === 'GT') /* hack to make > translate to &gt; not &GT; */
			$v['default'] = "gt";
		echo "\t{ {", sprintf("%02d", count($v) - 1),
			",\t\t", sprintf("\"%-21s", $v["default"].'",'), "\t",
			sprintf("% 2d", strlen($v["default"])), '} },', "\n";
	} else {
		echo "\t{ {", sprintf("%02d", count($v)),
			",\t\t", sprintf("%-22s", 'NULL'), ",\t0} },\n";
	}
	unset($v["default"]);
	foreach ($v as $l => $w) {
		echo "\t{ {", sprintf("0x%05s", $l), ",\t",
			sprintf("\"%-21s", $w.'",'), "\t",
			sprintf("% 2d", strlen($w)), '} },', "\n";
	}
	echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:

if ($pass2 < 2)
	echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage3_row empty_stage3_table[] = {
	/* 64 elements */
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
};

CODE;

if (!$pass2)
	echo $t;

/* $mstable[stage1][stage2][stage3] holds the entity name for that code
 * point, or "" when the code point has a multi code point table instead */
$mstable = array();
foreach ($dp as $el) {
	$s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
	$s2 = (hexdec($el[1]) & 0xFC0) >> 6;
	$s3 = hexdec($el[1]) & 0x3F;
	if (array_key_exists($el[1], $multicp_rows)) {
		$mstable[$s1][$s2][$s3] = "";
	} else {
		$mstable[$s1][$s2][$s3] = $el[0];
	}
}

for ($i = 0; $i < 0x1E; $i++) {
	for ($k = 0; $k < 64; $k++) {
		$any3 = false;
		$col3 = array();
		for ($l = 0; $l < 64; $l++) {
			if (isset($mstable[$i][$k][$l])) {
				$any3 = true;
				$col3[$l] = $mstable[$i][$k][$l];
			} else {
				$col3[$l] = null;
			}
		}
		if ($any3) {
			echo "static const entity_stage3_row stage3_table_{$ident}_",
				sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
			foreach ($col3 as $y => $z) {
				if ($y == 0) echo "\t";
				elseif ($y % 4 == 0) echo "\n\t";
				else echo " ";
				if ($z === NULL)
					echo "{0, { {NULL, 0} } },";
				elseif ($z === "QUOT") /* hack to translate " into &quot;, not &QUOT; */
					echo "{0, { {\"quot\", 4} } },";
				elseif ($z !== "")
					echo "{0, { {\"$z\", ", strlen($z), "} } },";
				else
					echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
						($i << 12) | ($k << 6) | $y ), "} } },";
			}
			echo "\n};\n\n";
		}
	}
}

if ($pass2 < 2)
	echo "/* end of stage 3 Tables for $name }}} */", "\n\n";

/* the basic entity passes stop at the stage 3 tables */
if ($pass2 > 1)
	goto hashtables;

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage2_row empty_stage2_table[] = {
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
};

CODE;

if (!$pass2)
	echo $t;

for ($i = 0; $i < 0x1E; $i++) {
	$any = false;
	for ($k = 0; $k < 64; $k++) {
		if (isset($mstable[$i][$k]))
			$any = true;
	}
	if ($any) {
		echo "static const entity_stage2_row stage2_table_{$ident}_",
			sprintf("%02X000", $i), "[] = {\n";
		for ($k = 0; $k < 64; $k++) {
			if ($k == 0) echo "\t";
			elseif ($k % 4 == 0) echo "\n\t";
			else echo " ";
			if (isset($mstable[$i][$k])) {
				echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
			} else {
				echo "empty_stage3_table", ",";
			}
		}
		echo "\n};\n\n";
	}
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
	if (isset($mstable[$i]))
		echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
	else
		echo "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";

/* commented-out; this enabled binary search, which turned out to be
 * significantly slower than the hash tables for html 5 entities */
//echo
//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";

//$t = <<<CODE
//typedef struct {
//	const char *entity;
//	unsigned short entity_len;
//	unsigned int codepoint1;
//	unsigned int codepoint2;
//} entity_cp_map;
//
//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
//
//static const entity_cp_map html5_ent_cp_map[] = {
//
//CODE;
//echo $t;
//
//$dp = $origdp;
//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
//	return $d==0?strcmp($a[0], $b[0]):$d; });
//
//$k = 0;
//foreach ($dp as $o) {
//	if ($k == 0) echo "\t";
//	elseif ($k % 3 == 0) echo "\n\t";
//	else echo " ";
//	if (isset($o[2]))
//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
//			hexdec($o[1]), hexdec($o[2]));
//	else
//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
//			hexdec($o[1]));
//
//	if (isset($o[2])) {
//		$entlen = strlen($o[0]) + 2;
//		$utf8len = strlen(
//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
//		if ($utf8len > $entlen*1.2) {
//			die("violated assumption for traverse_for_entities");
//		}
//	}
//
//	$k++;
//}
//echo "\n};\n\n";
//
//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
//
//echo
//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";

hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */", "\n\n";

$t = <<<CODE
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };


CODE;

if (!$pass2)
	echo $t;

/* bucket count: smallest power of two >= 1.5 * number of entities, but at
 * least 16; being a power of two lets the index be taken with a bit mask */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;
$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;
	if (isset($e[2])) {
		/* a two code point entity, once decoded to UTF-8, must not be more
		 * than 1.2 times as long as "&name;" (name plus the 2 delimiters) */
		$entlen = strlen($e[0]) + 2;
		$utf8len = utf8_encoded_length(hexdec($e[1])) +
			utf8_encoded_length(hexdec($e[2]));
		if ($utf8len > $entlen*1.2) {
			die("violated assumption for traverse_for_entities");
		}
	}
}

for ($i = 0; $i < $numelems; $i++) {
	if (empty($hashes[$i]))
		continue;
	echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
	foreach ($hashes[$i] as $h) {
		if (isset($h[2])) {
			echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
				$h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
		} else {
			echo sprintf(' {"%s", %d, 0x%05X, 0},',
				$h[0], strlen($h[0]), hexdec($h[1]));
		}
	}
	/* every bucket is terminated by an all-zero row */
	echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

for ($i = 0; $i < $numelems; $i++) {
	if ($i == 0) echo "\t";
	elseif ($i % 4 == 0) echo "\n\t";
	else echo " ";
	if (empty($hashes[$i]))
		echo "ht_bucket_empty,";
	else
		echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo
"static const entity_ht ent_ht_{$ident} = {
	", sprintf("0x%X", $numelems), ",
	ht_buckets_{$ident}
};\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";

/* move on to the next entity list, if there is one left */
if (!$pass2) {
	$data = read_input_file("ents_html401.txt");
	$pass2 = 1;
	$name = "HTML 4.01";
	$ident = "html4";
	goto again;
} elseif ($pass2 == 1) {
	$data = read_input_file("ents_basic.txt");
	$pass2 = 2;
	$name = "Basic entities (no apos)";
	$ident = "be_noapos";
	goto again;
} elseif ($pass2 == 2) {
	$data = read_input_file("ents_basic_apos.txt");
	$pass2 = 3;
	$name = "Basic entities (with apos)";
	$ident = "be_apos";
	goto again;
}

echo "#endif /* HTML_TABLES_H */\n";