uctest.php 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. #!/usr/bin/env php
  2. <?php error_reporting(E_ALL);
  // The three Unicode Character Database files are expected next to this script.
  $dir = __DIR__;
  $unicodeDataFile = $dir . '/UnicodeData.txt';
  $caseFoldingFile = $dir . '/CaseFolding.txt';
  $specialCasingFile = $dir . '/SpecialCasing.txt';
  $files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile];

  // Stop at the first missing data file: report it and end the script without
  // running any of the checks.
  foreach ($files as $file) {
      if (file_exists($file)) {
          continue;
      }

      echo "File $file does not exist.\n";
      return;
  }

  // Each file is read and checked in turn; the check functions print one line
  // per mismatch they find and nothing otherwise.
  testUnicodeData(file_get_contents($unicodeDataFile));
  testCaseFolding(file_get_contents($caseFoldingFile));
  testSpecialCasing(file_get_contents($specialCasingFile));
  /**
   * Lazily splits the text of a semicolon-separated data file into records.
   *
   * Everything from the first '#' on a line is treated as a comment and dropped.
   * Lines that are empty after trimming are skipped. Each remaining line is split
   * on ';' and every field is trimmed.
   *
   * @param string $input Complete file contents, lines separated by "\n".
   * @return Generator Yields one array of trimmed string fields per data line.
   */
  function parseDataFile(string $input) {
      foreach (explode("\n", $input) as $rawLine) {
          // Cut the line at the comment marker, if it has one.
          $commentStart = strpos($rawLine, '#');
          $content = ($commentStart === false)
              ? $rawLine
              : substr($rawLine, 0, $commentStart);

          $content = trim($content);
          if ($content !== '') {
              yield array_map('trim', explode(';', $content));
          }
      }
  }
  /**
   * Parses a space-separated list of hexadecimal code points.
   *
   * Empty tokens, which explode() produces for an empty string or for repeated
   * spaces, are skipped. Without this, intval('', 16) would turn each of them
   * into a bogus code point 0.
   *
   * @param string $strCodes Hexadecimal code points separated by single spaces,
   *                         e.g. "0053 0073".
   * @return int[] The code points as integers, in input order.
   */
  function parseCodes(string $strCodes) : array {
      $codes = [];
      foreach (explode(' ', $strCodes) as $strCode) {
          if ($strCode === '') {
              continue;
          }
          $codes[] = intval($strCode, 16);
      }
      return $codes;
  }
  /**
   * Checks that mb_convert_case() maps one code point to the expected sequence.
   *
   * Prints a "<type>: <actual> != <expected>" line when the strings differ and
   * prints nothing when they match.
   *
   * @param mixed $type     Case-conversion mode handed to mb_convert_case().
   * @param int   $origCode Code point of the character to convert.
   * @param array $newCodes Code points of the expected result, in order.
   */
  function testCaseMap($type, int $origCode, array $newCodes) {
      // Build the expected string from its code points.
      $newStr = implode('', array_map('mb_chr', $newCodes));
      $mbNewStr = mb_convert_case(mb_chr($origCode), $type);

      if ($mbNewStr === $newStr) {
          return;
      }

      echo "$type: $mbNewStr != $newStr\n";
  }
  /**
   * Checks a one-to-one case mapping through testCaseMap().
   *
   * A falsy $newCode (0) stands for "no mapping"; the character is then expected
   * to convert to itself.
   *
   * @param mixed $type     Case-conversion mode forwarded to testCaseMap().
   * @param int   $origCode Code point of the character to convert.
   * @param int   $newCode  Expected resulting code point, or 0 for none.
   */
  function testSimpleCaseMap($type, int $origCode, int $newCode) {
      testCaseMap($type, $origCode, [$newCode ?: $origCode]);
  }
  /**
   * Checks the simple upper- and lower-case mapping of every UnicodeData.txt entry.
   *
   * Each record must have 15 fields. Field 0 is the code point, field 12 the
   * upper-case mapping and field 13 the lower-case mapping, all hexadecimal.
   * An empty mapping field parses to 0, which testSimpleCaseMap() treats as
   * "maps to itself".
   *
   * @param string $input Contents of UnicodeData.txt.
   */
  function testUnicodeData(string $input) {
      foreach (parseDataFile($input) as $fields) {
          assert(count($fields) == 15);

          $code = intval($fields[0], 16);
          $upperCase = intval($fields[12], 16);
          $lowerCase = intval($fields[13], 16);

          testSimpleCaseMap(MB_CASE_UPPER_SIMPLE, $code, $upperCase);
          testSimpleCaseMap(MB_CASE_LOWER_SIMPLE, $code, $lowerCase);

          // The title-case mapping (field 14) is deliberately not checked.
          // Unfortunately MB_CASE_TITLE does not actually return the title case, even
          // when passed only a single character. It does ad-hoc magic based on the
          // character class, so that certain characters, such as roman numerals or
          // circled characters, will not be title-cased.
      }
  }
  /**
   * Checks case folding against the entries of CaseFolding.txt.
   *
   * Each record must have 4 fields: code point, status, mapping and a trailing
   * field. Status 'C' and 'S' entries are checked as one-to-one folds with
   * MB_CASE_FOLD_SIMPLE. Status 'F' entries are checked as multi-code-point
   * folds with MB_CASE_FOLD. Entries with any other status are ignored.
   *
   * @param string $input Contents of CaseFolding.txt.
   */
  function testCaseFolding(string $input) {
      foreach (parseDataFile($input) as $fields) {
          assert(count($fields) == 4);
          $code = intval($fields[0], 16);

          switch ($fields[1]) {
              case 'C':
              case 'S':
                  testSimpleCaseMap(MB_CASE_FOLD_SIMPLE, $code, intval($fields[2], 16));
                  break;

              case 'F':
                  testCaseMap(MB_CASE_FOLD, $code, parseCodes($fields[2]));
                  break;
          }
      }
  }
  /**
   * Checks the unconditional full case mappings listed in SpecialCasing.txt.
   *
   * Each record must have at least 5 fields: code point, lower, title and upper
   * mappings (space-separated hex code points), then a condition. Records with
   * a non-empty condition are skipped, because conditional mappings are not
   * supported.
   *
   * @param string $input Contents of SpecialCasing.txt.
   */
  function testSpecialCasing(string $input) {
      foreach (parseDataFile($input) as $fields) {
          assert(count($fields) >= 5);

          $code = intval($fields[0], 16);
          $expectations = [
              [MB_CASE_LOWER, parseCodes($fields[1])],
              [MB_CASE_TITLE, parseCodes($fields[2])],
              [MB_CASE_UPPER, parseCodes($fields[3])],
          ];

          // We don't support conditional mappings
          if (!$fields[4]) {
              // Run the checks in lower, upper, title order.
              foreach ([0, 2, 1] as $index) {
                  [$type, $expected] = $expectations[$index];
                  testCaseMap($type, $code, $expected);
              }
          }
      }
  }