__init__.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. """ Standard "encodings" Package
  2. Standard Python encoding modules are stored in this package
  3. directory.
  4. Codec modules must have names corresponding to normalized encoding
  5. names as defined in the normalize_encoding() function below, e.g.
  6. 'utf-8' must be implemented by the module 'utf_8.py'.
  7. Each codec module must export the following interface:
  8. * getregentry() -> codecs.CodecInfo object
  9. The getregentry() API must return a CodecInfo object with encoder, decoder,
  10. incrementalencoder, incrementaldecoder, streamwriter and streamreader
  11. attributes which adhere to the Python Codec Interface Standard.
  12. In addition, a module may optionally also define the following
  13. APIs which are then used by the package's codec search function:
  14. * getaliases() -> sequence of encoding name strings to use as aliases
  15. Alias names returned by getaliases() must be normalized encoding
  16. names as defined by normalize_encoding().
  17. Written by Marc-Andre Lemburg (mal@lemburg.com).
  18. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  19. """#"
  20. import codecs
  21. from . import aliases
# Maps the encoding name exactly as passed to search_function() to its
# CodecInfo entry, or to None when no codec module could be found.
_cache = {}
# Sentinel default for _cache.get(): distinguishes "never looked up"
# from a cached None (a remembered miss).
_unknown = '--unknown--'
# Passed as ``fromlist`` to __import__() in search_function().
_import_tail = ['*']
# Alias table (normalized alias -> codec module name).  search_function()
# reads it and also adds the aliases reported by a codec's getaliases().
_aliases = aliases.aliases
  26. class CodecRegistryError(LookupError, SystemError):
  27. pass
  28. def normalize_encoding(encoding):
  29. """ Normalize an encoding name.
  30. Normalization works as follows: all non-alphanumeric
  31. characters except the dot used for Python package names are
  32. collapsed and replaced with a single underscore, e.g. ' -;#'
  33. becomes '_'. Leading and trailing underscores are removed.
  34. Note that encoding names should be ASCII only; if they do use
  35. non-ASCII characters, these must be Latin-1 compatible.
  36. """
  37. if isinstance(encoding, bytes):
  38. encoding = str(encoding, "ascii")
  39. chars = []
  40. punct = False
  41. for c in encoding:
  42. if c.isalnum() or c == '.':
  43. if punct and chars:
  44. chars.append('_')
  45. chars.append(c)
  46. punct = False
  47. else:
  48. punct = True
  49. return ''.join(chars)
  50. def search_function(encoding):
  51. # Cache lookup
  52. entry = _cache.get(encoding, _unknown)
  53. if entry is not _unknown:
  54. return entry
  55. # Import the module:
  56. #
  57. # First try to find an alias for the normalized encoding
  58. # name and lookup the module using the aliased name, then try to
  59. # lookup the module using the standard import scheme, i.e. first
  60. # try in the encodings package, then at top-level.
  61. #
  62. norm_encoding = normalize_encoding(encoding)
  63. aliased_encoding = _aliases.get(norm_encoding) or \
  64. _aliases.get(norm_encoding.replace('.', '_'))
  65. if aliased_encoding is not None:
  66. modnames = [aliased_encoding,
  67. norm_encoding]
  68. else:
  69. modnames = [norm_encoding]
  70. for modname in modnames:
  71. if not modname or '.' in modname:
  72. continue
  73. try:
  74. # Import is absolute to prevent the possibly malicious import of a
  75. # module with side-effects that is not in the 'encodings' package.
  76. mod = __import__('encodings.' + modname, fromlist=_import_tail,
  77. level=0)
  78. except ImportError:
  79. pass
  80. else:
  81. break
  82. else:
  83. mod = None
  84. try:
  85. getregentry = mod.getregentry
  86. except AttributeError:
  87. # Not a codec module
  88. mod = None
  89. if mod is None:
  90. # Cache misses
  91. _cache[encoding] = None
  92. return None
  93. # Now ask the module for the registry entry
  94. entry = getregentry()
  95. if not isinstance(entry, codecs.CodecInfo):
  96. if not 4 <= len(entry) <= 7:
  97. raise CodecRegistryError('module "%s" (%s) failed to register'
  98. % (mod.__name__, mod.__file__))
  99. if not callable(entry[0]) or not callable(entry[1]) or \
  100. (entry[2] is not None and not callable(entry[2])) or \
  101. (entry[3] is not None and not callable(entry[3])) or \
  102. (len(entry) > 4 and entry[4] is not None and not callable(entry[4])) or \
  103. (len(entry) > 5 and entry[5] is not None and not callable(entry[5])):
  104. raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
  105. % (mod.__name__, mod.__file__))
  106. if len(entry)<7 or entry[6] is None:
  107. entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
  108. entry = codecs.CodecInfo(*entry)
  109. # Cache the codec registry entry
  110. _cache[encoding] = entry
  111. # Register its aliases (without overwriting previously registered
  112. # aliases)
  113. try:
  114. codecaliases = mod.getaliases()
  115. except AttributeError:
  116. pass
  117. else:
  118. for alias in codecaliases:
  119. if alias not in _aliases:
  120. _aliases[alias] = modname
  121. # Return the registry entry
  122. return entry
# Register the search_function in the Python codec registry.  This runs
# as a side effect of importing the package.
codecs.register(search_function)