123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383 |
- """Implementation of JSONDecoder
- """
- import re
- import sys
- import struct
- from json import scanner
- try:
- from _json import scanstring as c_scanstring
- except ImportError:
- c_scanstring = None
- __all__ = ['JSONDecoder']
- FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
- def _floatconstants():
- nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
- inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
- return nan, inf, -inf
- NaN, PosInf, NegInf = _floatconstants()
- def linecol(doc, pos):
- lineno = doc.count('\n', 0, pos) + 1
- if lineno == 1:
- colno = pos + 1
- else:
- colno = pos - doc.rindex('\n', 0, pos)
- return lineno, colno
- def errmsg(msg, doc, pos, end=None):
- # Note that this function is called from _json
- lineno, colno = linecol(doc, pos)
- if end is None:
- fmt = '{0}: line {1} column {2} (char {3})'
- return fmt.format(msg, lineno, colno, pos)
- #fmt = '%s: line %d column %d (char %d)'
- #return fmt % (msg, lineno, colno, pos)
- endlineno, endcolno = linecol(doc, end)
- fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
- return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
- #fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
- #return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
- _CONSTANTS = {
- '-Infinity': NegInf,
- 'Infinity': PosInf,
- 'NaN': NaN,
- }
- STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
- BACKSLASH = {
- '"': u'"', '\\': u'\\', '/': u'/',
- 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
- }
- DEFAULT_ENCODING = "utf-8"
- def _decode_uXXXX(s, pos):
- esc = s[pos + 1:pos + 5]
- if len(esc) == 4 and esc[1] not in 'xX':
- try:
- return int(esc, 16)
- except ValueError:
- pass
- msg = "Invalid \\uXXXX escape"
- raise ValueError(errmsg(msg, s, pos))
- def py_scanstring(s, end, encoding=None, strict=True,
- _b=BACKSLASH, _m=STRINGCHUNK.match):
- """Scan the string s for a JSON string. End is the index of the
- character in s after the quote that started the JSON string.
- Unescapes all valid JSON string escape sequences and raises ValueError
- on attempt to decode an invalid string. If strict is False then literal
- control characters are allowed in the string.
- Returns a tuple of the decoded string and the index of the character in s
- after the end quote."""
- if encoding is None:
- encoding = DEFAULT_ENCODING
- chunks = []
- _append = chunks.append
- begin = end - 1
- while 1:
- chunk = _m(s, end)
- if chunk is None:
- raise ValueError(
- errmsg("Unterminated string starting at", s, begin))
- end = chunk.end()
- content, terminator = chunk.groups()
- # Content is contains zero or more unescaped string characters
- if content:
- if not isinstance(content, unicode):
- content = unicode(content, encoding)
- _append(content)
- # Terminator is the end of string, a literal control character,
- # or a backslash denoting that an escape sequence follows
- if terminator == '"':
- break
- elif terminator != '\\':
- if strict:
- #msg = "Invalid control character %r at" % (terminator,)
- msg = "Invalid control character {0!r} at".format(terminator)
- raise ValueError(errmsg(msg, s, end))
- else:
- _append(terminator)
- continue
- try:
- esc = s[end]
- except IndexError:
- raise ValueError(
- errmsg("Unterminated string starting at", s, begin))
- # If not a unicode escape sequence, must be in the lookup table
- if esc != 'u':
- try:
- char = _b[esc]
- except KeyError:
- msg = "Invalid \\escape: " + repr(esc)
- raise ValueError(errmsg(msg, s, end))
- end += 1
- else:
- # Unicode escape sequence
- uni = _decode_uXXXX(s, end)
- end += 5
- # Check for surrogate pair on UCS-4 systems
- if sys.maxunicode > 65535 and \
- 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
- uni2 = _decode_uXXXX(s, end + 1)
- if 0xdc00 <= uni2 <= 0xdfff:
- uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
- end += 6
- char = unichr(uni)
- # Append the unescaped character
- _append(char)
- return u''.join(chunks), end
- # Use speedup if available
- scanstring = c_scanstring or py_scanstring
- WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
- WHITESPACE_STR = ' \t\n\r'
- def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
- object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
- s, end = s_and_end
- pairs = []
- pairs_append = pairs.append
- # Use a slice to prevent IndexError from being raised, the following
- # check will raise a more specific ValueError if the string is empty
- nextchar = s[end:end + 1]
- # Normally we expect nextchar == '"'
- if nextchar != '"':
- if nextchar in _ws:
- end = _w(s, end).end()
- nextchar = s[end:end + 1]
- # Trivial empty object
- if nextchar == '}':
- if object_pairs_hook is not None:
- result = object_pairs_hook(pairs)
- return result, end + 1
- pairs = {}
- if object_hook is not None:
- pairs = object_hook(pairs)
- return pairs, end + 1
- elif nextchar != '"':
- raise ValueError(errmsg(
- "Expecting property name enclosed in double quotes", s, end))
- end += 1
- while True:
- key, end = scanstring(s, end, encoding, strict)
- # To skip some function call overhead we optimize the fast paths where
- # the JSON key separator is ": " or just ":".
- if s[end:end + 1] != ':':
- end = _w(s, end).end()
- if s[end:end + 1] != ':':
- raise ValueError(errmsg("Expecting ':' delimiter", s, end))
- end += 1
- try:
- if s[end] in _ws:
- end += 1
- if s[end] in _ws:
- end = _w(s, end + 1).end()
- except IndexError:
- pass
- try:
- value, end = scan_once(s, end)
- except StopIteration:
- raise ValueError(errmsg("Expecting object", s, end))
- pairs_append((key, value))
- try:
- nextchar = s[end]
- if nextchar in _ws:
- end = _w(s, end + 1).end()
- nextchar = s[end]
- except IndexError:
- nextchar = ''
- end += 1
- if nextchar == '}':
- break
- elif nextchar != ',':
- raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
- try:
- nextchar = s[end]
- if nextchar in _ws:
- end += 1
- nextchar = s[end]
- if nextchar in _ws:
- end = _w(s, end + 1).end()
- nextchar = s[end]
- except IndexError:
- nextchar = ''
- end += 1
- if nextchar != '"':
- raise ValueError(errmsg(
- "Expecting property name enclosed in double quotes", s, end - 1))
- if object_pairs_hook is not None:
- result = object_pairs_hook(pairs)
- return result, end
- pairs = dict(pairs)
- if object_hook is not None:
- pairs = object_hook(pairs)
- return pairs, end
- def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
- s, end = s_and_end
- values = []
- nextchar = s[end:end + 1]
- if nextchar in _ws:
- end = _w(s, end + 1).end()
- nextchar = s[end:end + 1]
- # Look-ahead for trivial empty array
- if nextchar == ']':
- return values, end + 1
- _append = values.append
- while True:
- try:
- value, end = scan_once(s, end)
- except StopIteration:
- raise ValueError(errmsg("Expecting object", s, end))
- _append(value)
- nextchar = s[end:end + 1]
- if nextchar in _ws:
- end = _w(s, end + 1).end()
- nextchar = s[end:end + 1]
- end += 1
- if nextchar == ']':
- break
- elif nextchar != ',':
- raise ValueError(errmsg("Expecting ',' delimiter", s, end))
- try:
- if s[end] in _ws:
- end += 1
- if s[end] in _ws:
- end = _w(s, end + 1).end()
- except IndexError:
- pass
- return values, end
- class JSONDecoder(object):
- """Simple JSON <http://json.org> decoder
- Performs the following translations in decoding by default:
- +---------------+-------------------+
- | JSON | Python |
- +===============+===================+
- | object | dict |
- +---------------+-------------------+
- | array | list |
- +---------------+-------------------+
- | string | unicode |
- +---------------+-------------------+
- | number (int) | int, long |
- +---------------+-------------------+
- | number (real) | float |
- +---------------+-------------------+
- | true | True |
- +---------------+-------------------+
- | false | False |
- +---------------+-------------------+
- | null | None |
- +---------------+-------------------+
- It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
- their corresponding ``float`` values, which is outside the JSON spec.
- """
- def __init__(self, encoding=None, object_hook=None, parse_float=None,
- parse_int=None, parse_constant=None, strict=True,
- object_pairs_hook=None):
- """``encoding`` determines the encoding used to interpret any ``str``
- objects decoded by this instance (utf-8 by default). It has no
- effect when decoding ``unicode`` objects.
- Note that currently only encodings that are a superset of ASCII work,
- strings of other encodings should be passed in as ``unicode``.
- ``object_hook``, if specified, will be called with the result
- of every JSON object decoded and its return value will be used in
- place of the given ``dict``. This can be used to provide custom
- deserializations (e.g. to support JSON-RPC class hinting).
- ``object_pairs_hook``, if specified will be called with the result of
- every JSON object decoded with an ordered list of pairs. The return
- value of ``object_pairs_hook`` will be used instead of the ``dict``.
- This feature can be used to implement custom decoders that rely on the
- order that the key and value pairs are decoded (for example,
- collections.OrderedDict will remember the order of insertion). If
- ``object_hook`` is also defined, the ``object_pairs_hook`` takes
- priority.
- ``parse_float``, if specified, will be called with the string
- of every JSON float to be decoded. By default this is equivalent to
- float(num_str). This can be used to use another datatype or parser
- for JSON floats (e.g. decimal.Decimal).
- ``parse_int``, if specified, will be called with the string
- of every JSON int to be decoded. By default this is equivalent to
- int(num_str). This can be used to use another datatype or parser
- for JSON integers (e.g. float).
- ``parse_constant``, if specified, will be called with one of the
- following strings: -Infinity, Infinity, NaN.
- This can be used to raise an exception if invalid JSON numbers
- are encountered.
- If ``strict`` is false (true is the default), then control
- characters will be allowed inside strings. Control characters in
- this context are those with character codes in the 0-31 range,
- including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.
- """
- self.encoding = encoding
- self.object_hook = object_hook
- self.object_pairs_hook = object_pairs_hook
- self.parse_float = parse_float or float
- self.parse_int = parse_int or int
- self.parse_constant = parse_constant or _CONSTANTS.__getitem__
- self.strict = strict
- self.parse_object = JSONObject
- self.parse_array = JSONArray
- self.parse_string = scanstring
- self.scan_once = scanner.make_scanner(self)
- def decode(self, s, _w=WHITESPACE.match):
- """Return the Python representation of ``s`` (a ``str`` or ``unicode``
- instance containing a JSON document)
- """
- obj, end = self.raw_decode(s, idx=_w(s, 0).end())
- end = _w(s, end).end()
- if end != len(s):
- raise ValueError(errmsg("Extra data", s, end, len(s)))
- return obj
- def raw_decode(self, s, idx=0):
- """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
- beginning with a JSON document) and return a 2-tuple of the Python
- representation and the index in ``s`` where the document ended.
- This can be used to decode a JSON document from a string that may
- have extraneous data at the end.
- """
- try:
- obj, end = self.scan_once(s, idx)
- except StopIteration:
- raise ValueError("No JSON object could be decoded")
- return obj, end
|