ucgendat.php 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735
  1. #!/usr/bin/env php
  2. <?php error_reporting(E_ALL);
  3. /**
  4. * This is based on the ucgendat.c file from the OpenLDAP project, licensed as
  5. * follows. This file is not necessary to build PHP. It's only necessary to
  6. * rebuild unicode_data.h and eaw_width.h from Unicode ucd files.
  7. *
  8. * Example usage:
  9. * php ucgendat.php path/to/Unicode/data/files
  10. */
  11. /* Copyright 1998-2007 The OpenLDAP Foundation.
  12. * All rights reserved.
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted only as authorized by the OpenLDAP
  16. * Public License.
  17. *
  18. * A copy of this license is available at
  19. * <http://www.OpenLDAP.org/license.html>.
  20. */
  21. /* Copyright 2001 Computing Research Labs, New Mexico State University
  22. *
  23. * Permission is hereby granted, free of charge, to any person obtaining a
  24. * copy of this software and associated documentation files (the "Software"),
  25. * to deal in the Software without restriction, including without limitation
  26. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  27. * and/or sell copies of the Software, and to permit persons to whom the
  28. * Software is furnished to do so, subject to the following conditions:
  29. *
  30. * The above copyright notice and this permission notice shall be included in
  31. * all copies or substantial portions of the Software.
  32. *
  33. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  34. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  35. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  36. * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  37. * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  38. * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  39. * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  40. */
  41. if ($argc < 2) {
  42. echo "Usage: php ucgendata.php ./datadir\n";
  43. echo "./datadir must contain:\n";
  44. echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
  45. return;
  46. }
  47. $dir = $argv[1];
  48. $unicodeDataFile = $dir . '/UnicodeData.txt';
  49. $caseFoldingFile = $dir . '/CaseFolding.txt';
  50. $specialCasingFile = $dir . '/SpecialCasing.txt';
  51. $derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
  52. $eastAsianWidthFile = $dir . '/EastAsianWidth.txt';
  53. $files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
  54. foreach ($files as $file) {
  55. if (!file_exists($file)) {
  56. echo "File $file does not exist.\n";
  57. return;
  58. }
  59. }
  60. $outputFile = __DIR__ . "/../unicode_data.h";
  61. $data = new UnicodeData;
  62. parseUnicodeData($data, file_get_contents($unicodeDataFile));
  63. parseCaseFolding($data, file_get_contents($caseFoldingFile));
  64. parseSpecialCasing($data, file_get_contents($specialCasingFile));
  65. parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
  66. file_put_contents($outputFile, generateData($data));
  67. $eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
  68. $eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
  69. file_put_contents($eawFile, generateEastAsianWidthData($eawData));
  70. class Range {
  71. public $start;
  72. public $end;
  73. public function __construct(int $start, int $end) {
  74. $this->start = $start;
  75. $this->end = $end;
  76. }
  77. }
  78. class UnicodeData {
  79. public $propIndexes;
  80. public $numProps;
  81. public $propRanges;
  82. public $caseMaps;
  83. public $extraCaseData;
  84. public function __construct() {
  85. /*
  86. * List of properties expected to be found in the Unicode Character Database.
  87. */
  88. $this->propIndexes = array_flip([
  89. "Mn", "Mc", "Me", "Nd", "Nl", "No",
  90. "Zs", "Zl", "Zp", "Cs", "Co", "Cn",
  91. "Lu", "Ll", "Lt", "Lm", "Lo", "Sm",
  92. "Sc", "Sk", "So", "L", "R", "EN",
  93. "ES", "ET", "AN", "CS", "B", "S",
  94. "WS", "ON", "AL",
  95. "C", "P", "Cased", "Case_Ignorable"
  96. ]);
  97. $this->numProps = count($this->propIndexes);
  98. $this->propRanges = array_fill(0, $this->numProps, []);
  99. $this->caseMaps = [
  100. 'upper' => [],
  101. 'lower' => [],
  102. 'title' => [],
  103. 'fold' => [],
  104. ];
  105. $this->extraCaseData = [];
  106. }
  107. function propToIndex(string $prop) : int {
  108. /* Deal with directionality codes introduced in Unicode 3.0. */
  109. if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) {
  110. /*
  111. * Mark all of these as Other Neutral to preserve compatibility with
  112. * older versions.
  113. */
  114. $prop = "ON";
  115. }
  116. /* Merge all punctuation into a single category for efficiency of access.
  117. * We're currently not interested in distinguishing different kinds of punctuation. */
  118. if (in_array($prop, ["Pc", "Pd", "Ps", "Pe", "Po", "Pi", "Pf"])) {
  119. $prop = "P";
  120. }
  121. /* Same for control. */
  122. if (in_array($prop, ["Cc", "Cf"])) {
  123. $prop = "C";
  124. }
  125. if (!isset($this->propIndexes[$prop])) {
  126. throw new Exception("Unknown property $prop");
  127. }
  128. return $this->propIndexes[$prop];
  129. }
  130. public function addProp(int $code, string $prop) {
  131. $propIdx = self::propToIndex($prop);
  132. // Check if this extends the last range
  133. $ranges = $this->propRanges[$propIdx];
  134. if (!empty($ranges)) {
  135. $lastRange = $ranges[count($ranges) - 1];
  136. if ($code === $lastRange->end + 1) {
  137. $lastRange->end++;
  138. return;
  139. }
  140. }
  141. $this->propRanges[$propIdx][] = new Range($code, $code);
  142. }
  143. public function addPropRange(int $startCode, int $endCode, string $prop) {
  144. $propIdx = self::propToIndex($prop);
  145. $this->propRanges[$propIdx][] = new Range($startCode, $endCode);
  146. }
  147. public function addCaseMapping(string $case, int $origCode, int $mappedCode) {
  148. $this->caseMaps[$case][$origCode] = $mappedCode;
  149. }
  150. public function compactRangeArray(array $ranges) : array {
  151. // Sort by start codepoint
  152. usort($ranges, function (Range $r1, Range $r2) {
  153. return $r1->start <=> $r2->start;
  154. });
  155. $lastRange = new Range(-1, -1);
  156. $newRanges = [];
  157. foreach ($ranges as $range) {
  158. if ($lastRange->end == -1) {
  159. $lastRange = $range;
  160. } else if ($range->start == $lastRange->end + 1) {
  161. $lastRange->end = $range->end;
  162. } else if ($range->start > $lastRange->end + 1) {
  163. $newRanges[] = $lastRange;
  164. $lastRange = $range;
  165. } else {
  166. throw new Exception(sprintf(
  167. "Overlapping ranges [%x, %x] and [%x, %x]",
  168. $lastRange->start, $lastRange->end,
  169. $range->start, $range->end
  170. ));
  171. }
  172. }
  173. if ($lastRange->end != -1) {
  174. $newRanges[] = $lastRange;
  175. }
  176. return $newRanges;
  177. }
  178. public function compactPropRanges() {
  179. foreach ($this->propRanges as &$ranges) {
  180. $ranges = $this->compactRangeArray($ranges);
  181. }
  182. }
  183. }
  184. function parseDataFile(string $input) {
  185. $lines = explode("\n", $input);
  186. foreach ($lines as $line) {
  187. // Strip comments
  188. if (false !== $hashPos = strpos($line, '#')) {
  189. $line = substr($line, 0, $hashPos);
  190. }
  191. // Skip empty lines
  192. $line = trim($line);
  193. if ($line === '') {
  194. continue;
  195. }
  196. $fields = array_map('trim', explode(';', $line));
  197. yield $fields;
  198. }
  199. }
  200. function parseUnicodeData(UnicodeData $data, string $input) : void {
  201. $lines = parseDataFile($input);
  202. foreach ($lines as $fields) {
  203. if (count($fields) != 15) {
  204. throw new Exception("Line does not contain 15 fields");
  205. }
  206. $code = intval($fields[0], 16);
  207. $name = $fields[1];
  208. if ($name === '') {
  209. throw new Exception("Empty name");
  210. }
  211. if ($name[0] === '<' && $name !== '<control>') {
  212. // This is a character range
  213. $lines->next();
  214. $nextFields = $lines->current();
  215. $nextCode = intval($nextFields[0], 16);
  216. $generalCategory = $fields[2];
  217. $data->addPropRange($code, $nextCode, $generalCategory);
  218. $bidiClass = $fields[4];
  219. $data->addPropRange($code, $nextCode, $bidiClass);
  220. continue;
  221. }
  222. $generalCategory = $fields[2];
  223. $data->addProp($code, $generalCategory);
  224. $bidiClass = $fields[4];
  225. $data->addProp($code, $bidiClass);
  226. $upperCase = intval($fields[12], 16);
  227. $lowerCase = intval($fields[13], 16);
  228. $titleCase = intval($fields[14], 16) ?: $upperCase;
  229. if ($upperCase) {
  230. $data->addCaseMapping('upper', $code, $upperCase);
  231. }
  232. if ($lowerCase) {
  233. $data->addCaseMapping('lower', $code, $lowerCase);
  234. }
  235. if ($titleCase) {
  236. $data->addCaseMapping('title', $code, $titleCase);
  237. }
  238. }
  239. }
  240. function parseCodes(string $strCodes) : array {
  241. $codes = [];
  242. foreach (explode(' ', $strCodes) as $strCode) {
  243. $codes[] = intval($strCode, 16);
  244. }
  245. return $codes;
  246. }
  247. function parseCaseFolding(UnicodeData $data, string $input) : void {
  248. foreach (parseDataFile($input) as $fields) {
  249. if (count($fields) != 4) {
  250. throw new Exception("Line does not contain 4 fields");
  251. }
  252. $code = intval($fields[0], 16);
  253. $status = $fields[1];
  254. if ($status == 'T') {
  255. // Use language-agnostic case folding
  256. continue;
  257. }
  258. if ($status == 'C' || $status == 'S') {
  259. $foldCode = intval($fields[2], 16);
  260. if (!isset($data->caseMaps['fold'][$code])) {
  261. $data->addCaseMapping('fold', $code, $foldCode);
  262. } else {
  263. // Add simple mapping to full mapping data
  264. assert(is_array($data->caseMaps['fold'][$code]));
  265. $data->caseMaps['fold'][$code][0] = $foldCode;
  266. }
  267. } else if ($status == 'F') {
  268. $foldCodes = parseCodes($fields[2]);
  269. $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code;
  270. $data->caseMaps['fold'][$code] = array_merge([$code], $foldCodes);
  271. } else {
  272. assert(0);
  273. }
  274. }
  275. }
  276. function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void {
  277. $simpleCaseCode = $data->caseMaps[$type][$code] ?? $code;
  278. if (count($caseCodes) == 1) {
  279. if ($caseCodes[0] != $simpleCaseCode) {
  280. throw new Exception("Simple case code in special casing does not match");
  281. }
  282. // Special case: If a title-case character maps to itself, we may still have to store it,
  283. // if there is a non-trivial upper-case mapping for it
  284. if ($type == 'title' && $code == $caseCodes[0]
  285. && ($data->caseMaps['upper'][$code] ?? $code) != $code) {
  286. $data->caseMaps['title'][$code] = $code;
  287. }
  288. return;
  289. }
  290. if (count($caseCodes) > 3) {
  291. throw new Exception("Special case mapping with more than 3 code points");
  292. }
  293. $data->caseMaps[$type][$code] = array_merge([$simpleCaseCode], $caseCodes);
  294. }
  295. function parseSpecialCasing(UnicodeData $data, string $input) : void {
  296. foreach (parseDataFile($input) as $fields) {
  297. if (count($fields) != 5 && count($fields) != 6) {
  298. throw new Exception("Line does not contain 5 or 6 fields");
  299. }
  300. $code = intval($fields[0], 16);
  301. $lower = parseCodes($fields[1]);
  302. $title = parseCodes($fields[2]);
  303. $upper = parseCodes($fields[3]);
  304. $cond = $fields[4];
  305. if ($cond) {
  306. // Only use unconditional mappings
  307. continue;
  308. }
  309. addSpecialCasing($data, 'lower', $code, $lower);
  310. addSpecialCasing($data, 'upper', $code, $upper);
  311. // Should happen last
  312. addSpecialCasing($data, 'title', $code, $title);
  313. }
  314. }
  315. function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
  316. foreach (parseDataFile($input) as $fields) {
  317. if (count($fields) != 2) {
  318. throw new Exception("Line does not contain 2 fields");
  319. }
  320. $property = $fields[1];
  321. if ($property != 'Cased' && $property != 'Case_Ignorable') {
  322. continue;
  323. }
  324. $range = explode('..', $fields[0]);
  325. if (count($range) == 2) {
  326. $data->addPropRange(intval($range[0], 16), intval($range[1], 16), $property);
  327. } else if (count($range) == 1) {
  328. $data->addProp(intval($range[0], 16), $property);
  329. } else {
  330. throw new Exception("Invalid range");
  331. }
  332. }
  333. }
  334. function parseEastAsianWidth(string $input) : array {
  335. $wideRanges = [];
  336. foreach (parseDataFile($input) as $fields) {
  337. if ($fields[1] == 'W' || $fields[1] == 'F') {
  338. if ($dotsPos = strpos($fields[0], '..')) {
  339. $startCode = intval(substr($fields[0], 0, $dotsPos), 16);
  340. $endCode = intval(substr($fields[0], $dotsPos + 2), 16);
  341. if (!empty($wideRanges)) {
  342. $lastRange = $wideRanges[count($wideRanges) - 1];
  343. if ($startCode == $lastRange->end + 1) {
  344. $lastRange->end = $endCode;
  345. continue;
  346. }
  347. }
  348. $wideRanges[] = new Range($startCode, $endCode);
  349. } else {
  350. $code = intval($fields[0], 16);
  351. if (!empty($wideRanges)) {
  352. $lastRange = $wideRanges[count($wideRanges) - 1];
  353. if ($code == $lastRange->end + 1) {
  354. $lastRange->end++;
  355. continue;
  356. }
  357. }
  358. $wideRanges[] = new Range($code, $code);
  359. }
  360. }
  361. }
  362. return $wideRanges;
  363. }
  364. function formatArray(array $values, int $width, string $format) : string {
  365. $result = '';
  366. $i = 0;
  367. $c = count($values);
  368. for ($i = 0; $i < $c; $i++) {
  369. if ($i != 0) {
  370. $result .= ',';
  371. }
  372. $result .= $i % $width == 0 ? "\n\t" : " ";
  373. $result .= sprintf($format, $values[$i]);
  374. }
  375. return $result;
  376. }
  377. function formatShortHexArray(array $values, int $width) : string {
  378. return formatArray($values, $width, "0x%04x");
  379. }
  380. function formatShortDecArray(array $values, int $width) : string {
  381. return formatArray($values, $width, "% 5d");
  382. }
  383. function formatIntArray(array $values, int $width) : string {
  384. return formatArray($values, $width, "0x%08x");
  385. }
  386. function generatePropData(UnicodeData $data) {
  387. $data->compactPropRanges();
  388. $propOffsets = [];
  389. $idx = 0;
  390. foreach ($data->propRanges as $ranges) {
  391. $num = count($ranges);
  392. $propOffsets[] = $idx;
  393. $idx += 2*$num;
  394. }
  395. // Add sentinel for binary search
  396. $propOffsets[] = $idx;
  397. // TODO ucgendat.c pads the prop offsets to the next multiple of 4
  398. // for rather dubious reasons of alignment. This should probably be
  399. // dropped
  400. while (count($propOffsets) % 4 != 0) {
  401. $propOffsets[] = 0;
  402. }
  403. $totalRanges = $idx;
  404. $result = "";
  405. $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n";
  406. $result .= "static const unsigned short _ucprop_offsets[] = {";
  407. $result .= formatShortHexArray($propOffsets, 8);
  408. $result .= "\n};\n\n";
  409. $values = [];
  410. foreach ($data->propRanges as $ranges) {
  411. foreach ($ranges as $range) {
  412. $values[] = $range->start;
  413. $values[] = $range->end;
  414. }
  415. }
  416. $result .= "static const unsigned int _ucprop_ranges[] = {";
  417. $result .= formatIntArray($values, 4);
  418. $result .= "\n};\n\n";
  419. return $result;
  420. }
  421. function flatten(array $array) {
  422. $result = [];
  423. foreach ($array as $arr) {
  424. foreach ($arr as $v) {
  425. $result[] = $v;
  426. }
  427. }
  428. return $result;
  429. }
  430. function prepareCaseData(UnicodeData $data) {
  431. // Don't store titlecase if it's the same as uppercase
  432. foreach ($data->caseMaps['title'] as $code => $titleCode) {
  433. if ($titleCode == ($data->caseMaps['upper'][$code] ?? $code)) {
  434. unset($data->caseMaps['title'][$code]);
  435. }
  436. }
  437. // Store full (multi-char) case mappings in a separate table and only
  438. // store an index into it
  439. foreach ($data->caseMaps as $type => $caseMap) {
  440. foreach ($caseMap as $code => $caseCode) {
  441. if (is_array($caseCode)) {
  442. // -1 because the first entry is the simple case mapping
  443. $len = count($caseCode) - 1;
  444. $idx = count($data->extraCaseData);
  445. $data->caseMaps[$type][$code] = ($len << 24) | $idx;
  446. foreach ($caseCode as $c) {
  447. $data->extraCaseData[] = $c;
  448. }
  449. }
  450. }
  451. }
  452. }
  453. function generateCaseMPH(string $name, array $map) {
  454. $prefix = "_uccase_" . $name;
  455. list($gTable, $table) = generateMPH($map, $fast = false);
  456. echo "$name: n=", count($table), ", g=", count($gTable), "\n";
  457. $result = "";
  458. $result .= "static const unsigned {$prefix}_g_size = " . count($gTable) . ";\n";
  459. $result .= "static const short {$prefix}_g[] = {";
  460. $result .= formatShortDecArray($gTable, 8);
  461. $result .= "\n};\n\n";
  462. $result .= "static const unsigned {$prefix}_table_size = " . count($table) . ";\n";
  463. $result .= "static const unsigned {$prefix}_table[] = {";
  464. $result .= formatIntArray(flatten($table), 4);
  465. $result .= "\n};\n\n";
  466. return $result;
  467. }
  468. function generateCaseData(UnicodeData $data) {
  469. prepareCaseData($data);
  470. $result = "";
  471. $result .= generateCaseMPH('upper', $data->caseMaps['upper']);
  472. $result .= generateCaseMPH('lower', $data->caseMaps['lower']);
  473. $result .= generateCaseMPH('title', $data->caseMaps['title']);
  474. $result .= generateCaseMPH('fold', $data->caseMaps['fold']);
  475. $result .= "static const unsigned _uccase_extra_table[] = {";
  476. $result .= formatIntArray($data->extraCaseData, 4);
  477. $result .= "\n};\n\n";
  478. return $result;
  479. }
  480. function generateData(UnicodeData $data) {
  481. $result = <<<'HEADER'
  482. /* This file was generated from a modified version of UCData's ucgendat.
  483. *
  484. * DO NOT EDIT THIS FILE!
  485. *
  486. * Instead, download the appropriate UnicodeData-x.x.x.txt and
  487. * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
  488. * and run ext/mbstring/ucgendat/ucgendat.php.
  489. *
  490. * More information can be found in the UCData package. Unfortunately,
  491. * the project's page doesn't seem to be live anymore, so you can use
  492. * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
  493. HEADER;
  494. $result .= "\n\n" . generatePropData($data);
  495. $result .= generateCaseData($data);
  496. return $result;
  497. }
  498. /*
  499. * Minimal Perfect Hash Generation
  500. *
  501. * Based on "Hash, displace, and compress" algorithm due to
  502. * Belazzougui, Botelho and Dietzfelbinger.
  503. *
  504. * Hash function based on https://stackoverflow.com/a/12996028/385378.
  505. * MPH implementation based on http://stevehanov.ca/blog/index.php?id=119.
  506. */
  507. function hashInt(int $d, int $x) {
  508. $x ^= $d;
  509. $x = (($x >> 16) ^ $x) * 0x45d9f3b;
  510. return $x & 0xffffffff;
  511. }
  512. function tryGenerateMPH(array $map, int $gSize) {
  513. $tableSize = count($map);
  514. $table = [];
  515. $gTable = array_fill(0, $gSize, 0x7fff);
  516. $buckets = [];
  517. foreach ($map as $k => $v) {
  518. $h = hashInt(0, $k) % $gSize;
  519. $buckets[$h][] = [$k, $v];
  520. }
  521. // Sort by descending number of collisions
  522. usort($buckets, function ($b1, $b2) {
  523. return -(count($b1) <=> count($b2));
  524. });
  525. foreach ($buckets as $bucket) {
  526. $collisions = count($bucket);
  527. if ($collisions <= 1) {
  528. continue;
  529. }
  530. // Try values of $d until all elements placed in different slots
  531. $d = 1;
  532. $i = 0;
  533. $used = [];
  534. while ($i < $collisions) {
  535. if ($d > 0x7fff) {
  536. return [];
  537. }
  538. list($k) = $bucket[$i];
  539. $slot = hashInt($d, $k) % $tableSize;
  540. if (isset($table[$slot]) || isset($used[$slot])) {
  541. $d++;
  542. $i = 0;
  543. $used = [];
  544. } else {
  545. $i++;
  546. $used[$slot] = true;
  547. }
  548. }
  549. $g = hashInt(0, $bucket[0][0]) % $gSize;
  550. $gTable[$g] = $d;
  551. foreach ($bucket as $elem) {
  552. $table[hashInt($d, $elem[0]) % $tableSize] = $elem;
  553. }
  554. }
  555. $freeSlots = [];
  556. for ($i = 0; $i < $tableSize; $i++) {
  557. if (!isset($table[$i])) {
  558. $freeSlots[] = $i;
  559. }
  560. }
  561. // For buckets with only one element, we directly store the index
  562. $freeIdx = 0;
  563. foreach ($buckets as $bucket) {
  564. if (count($bucket) != 1) {
  565. continue;
  566. }
  567. $elem = $bucket[0];
  568. $slot = $freeSlots[$freeIdx++];
  569. $table[$slot] = $elem;
  570. $g = hashInt(0, $elem[0]) % $gSize;
  571. $gTable[$g] = -$slot;
  572. }
  573. ksort($gTable);
  574. ksort($table);
  575. return [$gTable, $table];
  576. }
  577. function generateMPH(array $map, bool $fast) {
  578. if ($fast) {
  579. // Check size starting lambda=5.0 in 0.5 increments
  580. for ($lambda = 5.0;; $lambda -= 0.5) {
  581. $m = (int) (count($map) / $lambda);
  582. $tmpMph = tryGenerateMPH($map, $m);
  583. if (!empty($tmpMph)) {
  584. $mph = $tmpMph;
  585. break;
  586. }
  587. }
  588. } else {
  589. // Check all sizes starting lambda=7.0
  590. $m = (int) (count($map) / 7.0);
  591. for (;; $m++) {
  592. $tmpMph = tryGenerateMPH($map, $m);
  593. if (!empty($tmpMph)) {
  594. $mph = $tmpMph;
  595. break;
  596. }
  597. }
  598. }
  599. return $mph;
  600. }
  601. function generateEastAsianWidthData(array $wideRanges) {
  602. $result = <<<'HEADER'
  603. /* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
  604. *
  605. * DO NOT EDIT THIS FILE!
  606. *
  607. * East Asian Width table
  608. *
  609. * Some characters in East Asian languages are intended to be displayed in a space
  610. * which is roughly square. (This contrasts with others such as the Latin alphabet,
  611. * which are taller than they are wide.) To display these East Asian characters
  612. * properly, twice the horizontal space is used. This must be taken into account
  613. * when doing things like wrapping text to a specific width.
  614. *
  615. * Each pair of numbers in the below table is a range of Unicode codepoints
  616. * which should be displayed as double-width.
  617. */
  618. static const struct {
  619. int begin;
  620. int end;
  621. } mbfl_eaw_table[] = {
  622. HEADER;
  623. foreach ($wideRanges as $range) {
  624. $startCode = dechex($range->start);
  625. $endCode = dechex($range->end);
  626. $result .= "\t{ 0x{$startCode}, 0x{$endCode} },\n";
  627. }
  628. $result .= "};\n";
  629. return $result;
  630. }