/* cp_enc_map_gen.c -- prints the php_win32_cp_map code page table to stdout. */

  1. #include <stdio.h>
  2. #include <windows.h>
  3. struct cp {
  4. DWORD id;
  5. char *name;
  6. char *enc;
  7. char *desc;
  8. };
  9. static const struct cp cp_map[] = {
  10. { 37, "IBM037", "", "IBM EBCDIC US-Canada" },
  11. { 437, "IBM437", "", "OEM United States" },
  12. { 500, "IBM500", "", "IBM EBCDIC International" },
  13. { 708, "ASMO-708", "", "Arabic (ASMO 708)" },
  14. { 709, "", "", "Arabic (ASMO-449+, BCON V4)" },
  15. { 710, "", "", "Arabic - Transparent Arabic" },
  16. { 720, "DOS-720", "", "Arabic (Transparent ASMO); Arabic (DOS)" },
  17. { 737, "ibm737", "", "OEM Greek (formerly 437G); Greek (DOS)" },
  18. { 775, "ibm775", "", "OEM Baltic; Baltic (DOS)" },
  19. { 850, "ibm850", "850|CP850|IBM850|CSPC850MULTILINGUAL", "OEM Multilingual Latin 1; Western European (DOS)" },
  20. { 852, "ibm852", "", "OEM Latin 2; Central European (DOS)" },
  21. { 855, "IBM855", "", "OEM Cyrillic (primarily Russian)" },
  22. { 857, "ibm857", "", "OEM Turkish; Turkish (DOS)" },
  23. { 858, "IBM00858", "", "OEM Multilingual Latin 1 + Euro symbol" },
  24. { 860, "IBM860", "", "OEM Portuguese; Portuguese (DOS)" },
  25. { 861, "ibm861", "", "OEM Icelandic; Icelandic (DOS)" },
  26. { 862, "DOS-862", "862|CP862|IBM862|CSPC862LATINHEBREW", "OEM Hebrew; Hebrew (DOS)" },
  27. { 863, "IBM863", "", "OEM French Canadian; French Canadian (DOS)" },
  28. { 864, "IBM864", "", "OEM Arabic; Arabic (864)" },
  29. { 865, "IBM865", "", "OEM Nordic; Nordic (DOS)" },
  30. { 866, "cp866", "866|CP866|IBM866|CSIBM866", "OEM Russian; Cyrillic (DOS)" },
  31. { 869, "ibm869", "", "OEM Modern Greek; Greek, Modern (DOS)" },
  32. { 870, "IBM870", "", "IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2" },
  33. { 874, "windows-874", "CP874", "ANSI/OEM Thai (ISO 8859-11); Thai (Windows)" },
  34. { 875, "cp875", "", "IBM EBCDIC Greek Modern" },
  35. { 932, "shift_jis", "CP932|SHIFT_JIS|MS_KANJI|CSSHIFTJIS", "ANSI/OEM Japanese; Japanese (Shift-JIS)" },
  36. { 936, "gb2312", "GB2312|GBK|CP936|MS936|WINDOWS-936", "ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)" },
  37. { 949, "ks_c_5601-1987", "CP949|UHC", "ANSI/OEM Korean (Unified Hangul Code)" },
  38. { 950, "big5", "CP950|BIG-5", "ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)" },
  39. { 1026, "IBM1026", "", "IBM EBCDIC Turkish (Latin 5)" },
  40. { 1047, "IBM01047", "", "IBM EBCDIC Latin 1/Open System" },
  41. { 1140, "IBM01140", "", "IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)" },
  42. { 1141, "IBM01141", "", "IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)" },
  43. { 1142, "IBM01142", "", "IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)" },
  44. { 1143, "IBM01143", "", "IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)" },
  45. { 1144, "IBM01144", "", "IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)" },
  46. { 1145, "IBM01145", "", "IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)" },
  47. { 1146, "IBM01146", "", "IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)" },
  48. { 1147, "IBM01147", "", "IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)" },
  49. { 1148, "IBM01148", "", "IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)" },
  50. { 1149, "IBM01149", "", "IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)" },
  51. { 1200, "utf-16", "", "Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications" },
  52. { 1201, "unicodeFFFE", "", "Unicode UTF-16, big endian byte order; available only to managed applications" },
  53. { 1250, "windows-1250", "CP1250|MS-EE|WINDOWS-1250", "ANSI Central European; Central European (Windows)" },
  54. { 1251, "windows-1251", "CP1251|MS-CYRL|WINDOWS-1251", "ANSI Cyrillic; Cyrillic (Windows)" },
  55. { 1252, "windows-1252", "CP1252|MS-ANSI|WINDOWS-1252", "ANSI Latin 1; Western European (Windows)" },
  56. { 1253, "windows-1253", "CP1253|MS-GREEK|WINDOWS-1253", "ANSI Greek; Greek (Windows)" },
  57. { 1254, "windows-1254", "CP1254|MS-TURK|WINDOWS-1254", "ANSI Turkish; Turkish (Windows)" },
  58. { 1255, "windows-1255", "CP1255|MS-HEBR|WINDOWS-1255", "ANSI Hebrew; Hebrew (Windows)" },
  59. { 1256, "windows-1256", "CP1256|MS-ARAB|WINDOWS-1256", "ANSI Arabic; Arabic (Windows)" },
  60. { 1257, "windows-1257", "CP1257|WINBALTRIM|WINDOWS-1257", "ANSI Baltic; Baltic (Windows)" },
  61. { 1258, "windows-1258", "CP1258|WINDOWS-1258", "ANSI/OEM Vietnamese; Vietnamese (Windows)" },
  62. { 1361, "Johab", "CP1361|JOHAB", "Korean (Johab)" },
  63. { 10000, "macintosh", "MAC|MACINTOSH|MACROMAN|CSMACINTOSH", "MAC Roman; Western European (Mac)" },
  64. { 10001, "x-mac-japanese", "", "Japanese (Mac)" },
  65. { 10002, "x-mac-chinesetrad", "", "MAC Traditional Chinese (Big5); Chinese Traditional (Mac)" },
  66. { 10003, "x-mac-korean", "", "Korean (Mac)" },
  67. { 10004, "x-mac-arabic", "MACARABIC", "Arabic (Mac)" },
  68. { 10005, "x-mac-hebrew", "MACHEBREW", "Hebrew (Mac)" },
  69. { 10006, "x-mac-greek", "MACGREEK", "Greek (Mac)" },
  70. { 10007, "x-mac-cyrillic", "MACCYRILLIC", "Cyrillic (Mac)" },
  71. { 10008, "x-mac-chinesesimp", "", "MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)" },
  72. { 10010, "x-mac-romanian", "MACROMANIA", "Romanian (Mac)" },
  73. { 10017, "x-mac-ukrainian", "MACUKRAINE", "Ukrainian (Mac)" },
  74. { 10021, "x-mac-thai", "MACTHAI", "Thai (Mac)" },
  75. { 10029, "x-mac-ce", "MACCENTRALEUROPE", "MAC Latin 2; Central European (Mac)" },
  76. { 10079, "x-mac-icelandic", "MACICELAND", "Icelandic (Mac)" },
  77. { 10081, "x-mac-turkish", "MACTURKISH", "Turkish (Mac)" },
  78. { 10082, "x-mac-croatian", "MACCROATIAN", "Croatian (Mac)" },
  79. { 12000, "utf-32", "", "Unicode UTF-32, little endian byte order; available only to managed applications" },
  80. { 12001, "utf-32BE", "", "Unicode UTF-32, big endian byte order; available only to managed applications" },
  81. { 20000, "x-Chinese_CNS", "", "CNS Taiwan; Chinese Traditional (CNS)" },
  82. { 20001, "x-cp20001", "", "TCA Taiwan" },
  83. { 20002, "x_Chinese-Eten", "", "Eten Taiwan; Chinese Traditional (Eten)" },
  84. { 20003, "x-cp20003", "", "IBM5550 Taiwan" },
  85. { 20004, "x-cp20004", "", "TeleText Taiwan" },
  86. { 20005, "x-cp20005", "", "Wang Taiwan" },
  87. { 20105, "x-IA5", "", "IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)" },
  88. { 20106, "x-IA5-German", "", "IA5 German (7-bit)" },
  89. { 20107, "x-IA5-Swedish", "", "IA5 Swedish (7-bit)" },
  90. { 20108, "x-IA5-Norwegian", "", "IA5 Norwegian (7-bit)" },
  91. { 20127, "us-ascii", "", "US-ASCII (7-bit)" },
  92. { 20261, "x-cp20261", "", "T.61" },
  93. { 20269, "x-cp20269", "", "ISO 6937 Non-Spacing Accent" },
  94. { 20273, "IBM273", "", "IBM EBCDIC Germany" },
  95. { 20277, "IBM277", "", "IBM EBCDIC Denmark-Norway" },
  96. { 20278, "IBM278", "", "IBM EBCDIC Finland-Sweden" },
  97. { 20280, "IBM280", "", "IBM EBCDIC Italy" },
  98. { 20284, "IBM284", "", "IBM EBCDIC Latin America-Spain" },
  99. { 20285, "IBM285", "", "IBM EBCDIC United Kingdom" },
  100. { 20290, "IBM290", "", "IBM EBCDIC Japanese Katakana Extended" },
  101. { 20297, "IBM297", "", "IBM EBCDIC France" },
  102. { 20420, "IBM420", "", "IBM EBCDIC Arabic" },
  103. { 20423, "IBM423", "", "IBM EBCDIC Greek" },
  104. { 20424, "IBM424", "", "IBM EBCDIC Hebrew" },
  105. { 20833, "x-EBCDIC-KoreanExtended", "", "IBM EBCDIC Korean Extended" },
  106. { 20838, "IBM-Thai", "", "IBM EBCDIC Thai" },
  107. { 20866, "koi8-r", "KOI8-R|CSKOI8R", "Russian (KOI8-R); Cyrillic (KOI8-R)" },
  108. { 20871, "IBM871", "", "IBM EBCDIC Icelandic" },
  109. { 20880, "IBM880", "", "IBM EBCDIC Cyrillic Russian" },
  110. { 20905, "IBM905", "", "IBM EBCDIC Turkish" },
  111. { 20924, "IBM00924", "", "IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)" },
  112. { 20932, "EUC-JP", "EUC-JP|EUCJP|EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE|CSEUCPKDFMTJAPANESE", "Japanese (JIS 0208-1990 and 0212-1990)" },
  113. { 20936, "x-cp20936", "", "Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)" },
  114. { 20949, "x-cp20949", "", "Korean Wansung" },
  115. { 21025, "cp1025", "", "IBM EBCDIC Cyrillic Serbian-Bulgarian" },
  116. /*{ 21027, "", "", "(deprecated)" },*/
  117. { 21866, "koi8-u", "KOI8-U", "Ukrainian (KOI8-U); Cyrillic (KOI8-U)" },
  118. { 28591, "iso-8859-1", "CP819|IBM819|ISO-8859-1|ISO-IR-100|ISO8859-1|ISO_8859-1|ISO_8859-1:1987|L1|LATIN1|CSISOLATIN1", "ISO 8859-1 Latin 1; Western European (ISO)" },
  119. { 28592, "iso-8859-2", "ISO-8859-2|ISO-IR-101|ISO8859-2|ISO_8859-2|ISO_8859-2:1987|L2|LATIN2|CSISOLATIN2", "ISO 8859-2 Central European; Central European (ISO)" },
  120. { 28593, "iso-8859-3", "ISO-8859-3|ISO-IR-109|ISO8859-3|ISO_8859-3|ISO_8859-3:1988|L3|LATIN3|CSISOLATIN3", "ISO 8859-3 Latin 3" },
  121. { 28594, "iso-8859-4", "ISO-8859-4|ISO-IR-110|ISO8859-4|ISO_8859-4|ISO_8859-4:1988|L4|LATIN4|CSISOLATIN4", "ISO 8859-4 Baltic" },
  122. { 28595, "iso-8859-5", "CYRILLIC|ISO-8859-5|ISO-IR-144|ISO8859-5|ISO_8859-5|ISO_8859-5:1988|CSISOLATINCYRILLIC", "ISO 8859-5 Cyrillic" },
  123. { 28596, "iso-8859-6", "ARABIC|ASMO-708|ECMA-114|ISO-8859-6|ISO-IR-127|ISO8859-6|ISO_8859-6|ISO_8859-6:1987|CSISOLATINARABIC", "ISO 8859-6 Arabic" },
  124. { 28597, "iso-8859-7", "ECMA-118|ELOT_928|GREEK|GREEK8|ISO-8859-7|ISO-IR-126|ISO8859-7|ISO_8859-7|ISO_8859-7:1987|ISO_8859-7:2003|CSISOLATINGREEK", "ISO 8859-7 Greek" },
  125. { 28598, "iso-8859-8", "HEBREW|ISO-8859-8|ISO-IR-138|ISO8859-8|ISO_8859-8|ISO_8859-8:1988|CSISOLATINHEBREW", "ISO 8859-8 Hebrew; Hebrew (ISO-Visual)" },
  126. { 28599, "iso-8859-9", "ISO-8859-9|ISO-IR-148|ISO8859-9|ISO_8859-9|ISO_8859-9:1989|L5|LATIN5|CSISOLATIN5", "ISO 8859-9 Turkish" },
  127. { 28603, "iso-8859-13", "ISO-8859-13|ISO-IR-179|ISO8859-13|ISO_8859-13|L7|LATIN7", "ISO 8859-13 Estonian" },
  128. { 28605, "iso-8859-15", "ISO-8859-15|ISO-IR-203|ISO8859-15|ISO_8859-15|ISO_8859-15:1998|LATIN-9", "ISO 8859-15 Latin 9" },
  129. { 29001, "x-Europa", "", "Europa 3" },
  130. { 38598, "iso-8859-8-i", "", "ISO 8859-8 Hebrew; Hebrew (ISO-Logical)" },
  131. { 50220, "iso-2022-jp", "CP50220", "ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)" },
  132. { 50221, "csISO2022JP", "CP50221", "ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)" },
  133. { 50222, "iso-2022-jp", "ISO-2022-JP|CP50222", "ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)" },
  134. { 50225, "iso-2022-kr", "ISO-2022-KR|CSISO2022KR", "ISO 2022 Korean" },
  135. { 50227, "x-cp50227", "", "ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)" },
  136. { 50229, "x-cp50229", "", "ISO 2022 Traditional Chinese" },
  137. { 50930, "", "", "EBCDIC Japanese (Katakana) Extended" },
  138. { 50931, "", "", "EBCDIC US-Canada and Japanese" },
  139. { 50933, "", "", "EBCDIC Korean Extended and Korean" },
  140. { 50935, "", "", "EBCDIC Simplified Chinese Extended and Simplified Chinese" },
  141. { 50936, "", "", "EBCDIC Simplified Chinese" },
  142. { 50937, "", "", "EBCDIC US-Canada and Traditional Chinese" },
  143. { 50939, "", "", "EBCDIC Japanese (Latin) Extended and Japanese" },
  144. { 51932, "euc-jp", "", "EUC Japanese" },
  145. { 51936, "EUC-CN", "", "EUC Simplified Chinese; Chinese Simplified (EUC)" },
  146. { 51949, "euc-kr", "EUC-KR|EUCKR|CSEUCKR", "EUC Korean" },
  147. { 51950, "", "", "EUC Traditional Chinese" },
  148. { 52936, "hz-gb-2312", "HZ|HZ-GB-2312", "HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)" },
  149. { 54936, "GB18030", "GB18030|CSGB18030", "Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)" },
  150. { 57002, "x-iscii-de", "", "ISCII Devanagari" },
  151. { 57003, "x-iscii-be", "", "ISCII Bangla" },
  152. { 57004, "x-iscii-ta", "", "ISCII Tamil" },
  153. { 57005, "x-iscii-te", "", "ISCII Telugu" },
  154. { 57006, "x-iscii-as", "", "ISCII Assamese" },
  155. { 57007, "x-iscii-or", "", "ISCII Odia" },
  156. { 57008, "x-iscii-ka", "", "ISCII Kannada" },
  157. { 57009, "x-iscii-ma", "", "ISCII Malayalam" },
  158. { 57010, "x-iscii-gu", "", "ISCII Gujarati" },
  159. { 57011, "x-iscii-pa", "", "ISCII Punjabi" },
  160. { 65000, "utf-7", "UTF-7", "Unicode (UTF-7)" },
  161. { 65001, "utf-8", "UTF-8", "Unicode (UTF-8)" },
  162. { 0, NULL, NULL },
  163. };
  164. int
  165. main(int argc, char **argv)
  166. {
  167. DWORD cp;
  168. CPINFOEX info;
  169. struct cp *cur;
  170. int rnd = 0;
  171. /*if (argc < 2) {
  172. printf("Usage: cpinfoex cp_id\n");
  173. return 0;
  174. }
  175. cp = atoi(argv[1]);*/
  176. #if 0
  177. /* Ref:
  178. http://www.iana.org/assignments/character-sets/character-sets.xhtml
  179. https://msdn.microsoft.com/en-us/goglobal/bb964653
  180. http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
  181. */
  182. #endif
  183. /*
  184. struct php_win32_cp {
  185. DWORD id;
  186. DWORD to_w_fl;
  187. DWORD from_w_fl;
  188. DWORD char_size;
  189. char *name;
  190. char *enc;
  191. char *desc;
  192. };
  193. */
  194. /*printf("struct php_win32_cp {\n\tDWORD id;\n\tDWORD to_w_fl;\n\tDWORD from_w_fl;\n\tDWORD char_size;\n\tchar *name;\n\tchar *enc;\n\tchar *desc;\n};\n\n"); */
  195. printf("/* Autogenerated file. Update cp_enc_map_gen.c and regen like \n"
  196. " cp_enc_map_gen.exe > cp_enc_map.c \n*/\n\n");
  197. printf("static const struct php_win32_cp php_win32_cp_map[] = {");
  198. cur = &cp_map[0];
  199. #ifdef ORDER_IT
  200. while (rnd <= 2 && ++rnd && (cur = &cp_map[0]))
  201. #endif
  202. while (cur->desc != NULL) {
  203. if (!IsValidCodePage(cur->id)) {
  204. #ifdef ORDER_IT
  205. if (2 == rnd)
  206. #endif
  207. printf("\t/* %u is invalid */\n", cur->id);
  208. //printf("#if 0\n\t{ %u, 0, \"%s\", \"%s\" },\n#endif\n", cur->id, cur->name, cur->desc);
  209. } else if (GetCPInfoEx(cur->id, 0, &info)) {
  210. DWORD to_w_fl = 0, from_w_fl = 0;
  211. if (65001U == cur->id || 54936U == cur->id) {
  212. from_w_fl = WC_ERR_INVALID_CHARS;
  213. to_w_fl = MB_ERR_INVALID_CHARS;
  214. }
  215. //printf("\t{ %u, %u, \"%s\", \"%s\" },\n", cur->id, info.MaxCharSize, cur->name, cur->desc);
  216. if (!cur->enc[0]) {
  217. #ifdef ORDER_IT
  218. if (2 == rnd)
  219. #endif
  220. //printf("\t/* { %u, %u, \"%s\", NULL, \"%s\" }, */\n", info.CodePage, info.MaxCharSize, cur->name, info.CodePageName);
  221. printf("\t{ %u, %u, %u, %u, \"%s\", NULL, \"%s\" },\n", info.CodePage, to_w_fl, from_w_fl, info.MaxCharSize, cur->name, info.CodePageName);
  222. } else {
  223. #ifdef ORDER_IT
  224. if (1 == rnd)
  225. #endif
  226. printf("\t{ %u, %u, %u, %u, \"%s\", \"%s\", \"%s\" },\n", info.CodePage, to_w_fl, from_w_fl, info.MaxCharSize, cur->name, cur->enc, info.CodePageName);
  227. }
  228. }
  229. cur++;
  230. }
  231. printf("};\n\n");
  232. return 0;
  233. }