uctest.php 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  <?php error_reporting(E_ALL);

  // Entry point: locate the three data files next to this script, make sure each one
  // exists and can actually be read, then run one test pass per file.
  $dir = __DIR__;
  $unicodeDataFile = $dir . '/UnicodeData.txt';
  $caseFoldingFile = $dir . '/CaseFolding.txt';
  $specialCasingFile = $dir . '/SpecialCasing.txt';
  $files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile];

  foreach ($files as $file) {
      if (!file_exists($file)) {
          echo "File $file does not exist.\n";
          return;
      }
  }

  // file_exists() is also true for directories and unreadable files. Without this check a
  // failed read (false) would be coerced to an empty string and the corresponding test
  // would silently run on no data at all.
  $contents = [];
  foreach ($files as $file) {
      $data = file_get_contents($file);
      if ($data === false) {
          echo "File $file could not be read.\n";
          return;
      }
      $contents[$file] = $data;
  }

  testUnicodeData($contents[$unicodeDataFile]);
  testCaseFolding($contents[$caseFoldingFile]);
  testSpecialCasing($contents[$specialCasingFile]);
  /**
   * Lazily parses a semicolon-separated data file.
   *
   * Everything from the first '#' on a line is treated as a comment and dropped,
   * lines that are empty after trimming are skipped, and every remaining line is
   * split on ';' with each field trimmed.
   *
   * @param string $input Complete text of the data file.
   * @return \Generator Yields one list of trimmed string fields per data line.
   */
  function parseDataFile(string $input) {
      foreach (explode("\n", $input) as $rawLine) {
          // strstr(..., true) returns the text before the first '#', or false if there is none.
          $beforeComment = strstr($rawLine, '#', true);
          $content = trim($beforeComment === false ? $rawLine : $beforeComment);
          if ($content === '') {
              continue;
          }

          $fields = [];
          foreach (explode(';', $content) as $field) {
              $fields[] = trim($field);
          }
          yield $fields;
      }
  }
  /**
   * Converts a whitespace-separated list of hexadecimal numbers into integers.
   *
   * Empty tokens (empty input, leading/trailing or repeated whitespace) are ignored
   * instead of being parsed as 0, so malformed spacing cannot inject a bogus
   * code point 0 into the result.
   *
   * @param string $strCodes Hexadecimal numbers separated by whitespace, e.g. "0053 0073".
   * @return int[] The parsed values in input order; empty when there are no tokens.
   */
  function parseCodes(string $strCodes) : array {
      $tokens = preg_split('/\s+/', $strCodes, -1, PREG_SPLIT_NO_EMPTY);
      if ($tokens === false) {
          // preg_split() only fails on a PCRE error; treat that as "no codes".
          return [];
      }

      $codes = [];
      foreach ($tokens as $token) {
          $codes[] = intval($token, 16);
      }
      return $codes;
  }
  /**
   * Checks one conversion done by mb_convert_case() against the expected code points.
   *
   * Prints a line of the form "<type>: <actual> != <expected>" when the converted
   * string differs from the expected one; prints nothing when they match.
   *
   * @param mixed $type     Mode passed as the second argument to mb_convert_case().
   * @param int   $origCode Code point of the single character to convert.
   * @param array $newCodes Code points of the expected result, in order.
   */
  function testCaseMap($type, int $origCode, array $newCodes) {
      $expected = implode('', array_map('mb_chr', $newCodes));
      $actual = mb_convert_case(mb_chr($origCode), $type);
      if ($actual === $expected) {
          return;
      }
      echo $type . ': ' . $actual . ' != ' . $expected . "\n";
  }
  /**
   * Checks a one-to-one case mapping via testCaseMap().
   *
   * @param mixed $type     Mode forwarded to testCaseMap().
   * @param int   $origCode Code point of the character to convert.
   * @param int   $newCode  Expected code point; 0 means the character is expected
   *                        to convert to itself.
   */
  function testSimpleCaseMap($type, int $origCode, int $newCode) {
      // A zero mapping falls back to the original code point.
      $expectedCode = $newCode ?: $origCode;
      testCaseMap($type, $origCode, [$expectedCode]);
  }
  /**
   * Checks the simple upper- and lower-case mappings of every entry in the given data.
   *
   * Each data line must have 15 fields: field 0 is the code point and fields 12 and 13
   * are its upper- and lower-case mappings, all in hexadecimal. An empty mapping field
   * parses to 0, which testSimpleCaseMap() treats as "maps to itself".
   *
   * @param string $input Full text of UnicodeData.txt.
   */
  function testUnicodeData(string $input) {
      foreach (parseDataFile($input) as $fields) {
          assert(count($fields) == 15);
          $code = intval($fields[0], 16);
          $upperCase = intval($fields[12], 16);
          $lowerCase = intval($fields[13], 16);
          testSimpleCaseMap(MB_CASE_UPPER_SIMPLE, $code, $upperCase);
          testSimpleCaseMap(MB_CASE_LOWER_SIMPLE, $code, $lowerCase);
          // Unfortunately MB_CASE_TITLE does not actually return the title case, even when passed
          // only a single character. It does ad-hoc magic based on the character class, so that
          // certain characters, such as roman numerals or circled characters will not be
          // title-cased. The title-case mapping (field 14, falling back to the upper-case
          // mapping) is therefore not checked with MB_CASE_TITLE_SIMPLE here.
      }
  }
  /**
   * Checks the case folding of every entry in the given data.
   *
   * Each data line must have 4 fields: field 0 is the code point (hexadecimal),
   * field 1 the status and field 2 the folded code point(s). Status "C" and "S"
   * entries are checked as single-code-point foldings with MB_CASE_FOLD_SIMPLE,
   * status "F" entries as code point sequences with MB_CASE_FOLD. Entries with
   * any other status are skipped.
   *
   * @param string $input Full text of CaseFolding.txt.
   */
  function testCaseFolding(string $input) {
      foreach (parseDataFile($input) as $fields) {
          assert(count($fields) == 4);
          $code = intval($fields[0], 16);
          switch ($fields[1]) {
              case 'C':
              case 'S':
                  testSimpleCaseMap(MB_CASE_FOLD_SIMPLE, $code, intval($fields[2], 16));
                  break;
              case 'F':
                  testCaseMap(MB_CASE_FOLD, $code, parseCodes($fields[2]));
                  break;
          }
      }
  }
  /**
   * Checks the unconditional lower/upper/title mappings of every entry in the given data.
   *
   * Each data line must have at least 5 fields: field 0 is the code point, fields 1, 2
   * and 3 are the lower-, title- and upper-case code point sequences (hexadecimal,
   * space separated) and field 4 is the condition. Entries with a non-empty condition
   * are skipped.
   *
   * @param string $input Full text of SpecialCasing.txt.
   */
  function testSpecialCasing(string $input) {
      foreach (parseDataFile($input) as $fields) {
          assert(count($fields) >= 5);
          if ($fields[4]) {
              // We don't support conditional mappings
              continue;
          }

          $code = intval($fields[0], 16);
          // Listed in the order the checks are run: lower, upper, title.
          $expectations = [
              [MB_CASE_LOWER, parseCodes($fields[1])],
              [MB_CASE_UPPER, parseCodes($fields[3])],
              [MB_CASE_TITLE, parseCodes($fields[2])],
          ];
          foreach ($expectations as $expectation) {
              testCaseMap($expectation[0], $code, $expectation[1]);
          }
      }
  }