csv.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456
  1. """
  2. csv.py - read/write/investigate CSV files
  3. """
  4. import re
  5. from functools import reduce
  6. from _csv import Error, __version__, writer, reader, register_dialect, \
  7. unregister_dialect, get_dialect, list_dialects, \
  8. field_size_limit, \
  9. QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
  10. __doc__
  11. from _csv import Dialect as _Dialect
  12. try:
  13. from cStringIO import StringIO
  14. except ImportError:
  15. from StringIO import StringIO
  16. __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
  17. "Error", "Dialect", "__doc__", "excel", "excel_tab",
  18. "field_size_limit", "reader", "writer",
  19. "register_dialect", "get_dialect", "list_dialects", "Sniffer",
  20. "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
  21. class Dialect:
  22. """Describe an Excel dialect.
  23. This must be subclassed (see csv.excel). Valid attributes are:
  24. delimiter, quotechar, escapechar, doublequote, skipinitialspace,
  25. lineterminator, quoting.
  26. """
  27. _name = ""
  28. _valid = False
  29. # placeholders
  30. delimiter = None
  31. quotechar = None
  32. escapechar = None
  33. doublequote = None
  34. skipinitialspace = None
  35. lineterminator = None
  36. quoting = None
  37. def __init__(self):
  38. if self.__class__ != Dialect:
  39. self._valid = True
  40. self._validate()
  41. def _validate(self):
  42. try:
  43. _Dialect(self)
  44. except TypeError, e:
  45. # We do this for compatibility with py2.3
  46. raise Error(str(e))
  47. class excel(Dialect):
  48. """Describe the usual properties of Excel-generated CSV files."""
  49. delimiter = ','
  50. quotechar = '"'
  51. doublequote = True
  52. skipinitialspace = False
  53. lineterminator = '\r\n'
  54. quoting = QUOTE_MINIMAL
  55. register_dialect("excel", excel)
  56. class excel_tab(excel):
  57. """Describe the usual properties of Excel-generated TAB-delimited files."""
  58. delimiter = '\t'
  59. register_dialect("excel-tab", excel_tab)
  60. class DictReader:
  61. def __init__(self, f, fieldnames=None, restkey=None, restval=None,
  62. dialect="excel", *args, **kwds):
  63. self._fieldnames = fieldnames # list of keys for the dict
  64. self.restkey = restkey # key to catch long rows
  65. self.restval = restval # default value for short rows
  66. self.reader = reader(f, dialect, *args, **kwds)
  67. self.dialect = dialect
  68. self.line_num = 0
  69. def __iter__(self):
  70. return self
  71. @property
  72. def fieldnames(self):
  73. if self._fieldnames is None:
  74. try:
  75. self._fieldnames = self.reader.next()
  76. except StopIteration:
  77. pass
  78. self.line_num = self.reader.line_num
  79. return self._fieldnames
  80. # Issue 20004: Because DictReader is a classic class, this setter is
  81. # ignored. At this point in 2.7's lifecycle, it is too late to change the
  82. # base class for fear of breaking working code. If you want to change
  83. # fieldnames without overwriting the getter, set _fieldnames directly.
  84. @fieldnames.setter
  85. def fieldnames(self, value):
  86. self._fieldnames = value
  87. def next(self):
  88. if self.line_num == 0:
  89. # Used only for its side effect.
  90. self.fieldnames
  91. row = self.reader.next()
  92. self.line_num = self.reader.line_num
  93. # unlike the basic reader, we prefer not to return blanks,
  94. # because we will typically wind up with a dict full of None
  95. # values
  96. while row == []:
  97. row = self.reader.next()
  98. d = dict(zip(self.fieldnames, row))
  99. lf = len(self.fieldnames)
  100. lr = len(row)
  101. if lf < lr:
  102. d[self.restkey] = row[lf:]
  103. elif lf > lr:
  104. for key in self.fieldnames[lr:]:
  105. d[key] = self.restval
  106. return d
  107. class DictWriter:
  108. def __init__(self, f, fieldnames, restval="", extrasaction="raise",
  109. dialect="excel", *args, **kwds):
  110. self.fieldnames = fieldnames # list of keys for the dict
  111. self.restval = restval # for writing short dicts
  112. if extrasaction.lower() not in ("raise", "ignore"):
  113. raise ValueError, \
  114. ("extrasaction (%s) must be 'raise' or 'ignore'" %
  115. extrasaction)
  116. self.extrasaction = extrasaction
  117. self.writer = writer(f, dialect, *args, **kwds)
  118. def writeheader(self):
  119. header = dict(zip(self.fieldnames, self.fieldnames))
  120. self.writerow(header)
  121. def _dict_to_list(self, rowdict):
  122. if self.extrasaction == "raise":
  123. wrong_fields = [k for k in rowdict if k not in self.fieldnames]
  124. if wrong_fields:
  125. raise ValueError("dict contains fields not in fieldnames: "
  126. + ", ".join([repr(x) for x in wrong_fields]))
  127. return [rowdict.get(key, self.restval) for key in self.fieldnames]
  128. def writerow(self, rowdict):
  129. return self.writer.writerow(self._dict_to_list(rowdict))
  130. def writerows(self, rowdicts):
  131. rows = []
  132. for rowdict in rowdicts:
  133. rows.append(self._dict_to_list(rowdict))
  134. return self.writer.writerows(rows)
  135. # Guard Sniffer's type checking against builds that exclude complex()
  136. try:
  137. complex
  138. except NameError:
  139. complex = float
  140. class Sniffer:
  141. '''
  142. "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
  143. Returns a Dialect object.
  144. '''
  145. def __init__(self):
  146. # in case there is more than one possible delimiter
  147. self.preferred = [',', '\t', ';', ' ', ':']
  148. def sniff(self, sample, delimiters=None):
  149. """
  150. Returns a dialect (or None) corresponding to the sample
  151. """
  152. quotechar, doublequote, delimiter, skipinitialspace = \
  153. self._guess_quote_and_delimiter(sample, delimiters)
  154. if not delimiter:
  155. delimiter, skipinitialspace = self._guess_delimiter(sample,
  156. delimiters)
  157. if not delimiter:
  158. raise Error, "Could not determine delimiter"
  159. class dialect(Dialect):
  160. _name = "sniffed"
  161. lineterminator = '\r\n'
  162. quoting = QUOTE_MINIMAL
  163. # escapechar = ''
  164. dialect.doublequote = doublequote
  165. dialect.delimiter = delimiter
  166. # _csv.reader won't accept a quotechar of ''
  167. dialect.quotechar = quotechar or '"'
  168. dialect.skipinitialspace = skipinitialspace
  169. return dialect
  170. def _guess_quote_and_delimiter(self, data, delimiters):
  171. """
  172. Looks for text enclosed between two identical quotes
  173. (the probable quotechar) which are preceded and followed
  174. by the same character (the probable delimiter).
  175. For example:
  176. ,'some text',
  177. The quote with the most wins, same with the delimiter.
  178. If there is no quotechar the delimiter can't be determined
  179. this way.
  180. """
  181. matches = []
  182. for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
  183. '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
  184. '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
  185. '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
  186. regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
  187. matches = regexp.findall(data)
  188. if matches:
  189. break
  190. if not matches:
  191. # (quotechar, doublequote, delimiter, skipinitialspace)
  192. return ('', False, None, 0)
  193. quotes = {}
  194. delims = {}
  195. spaces = 0
  196. for m in matches:
  197. n = regexp.groupindex['quote'] - 1
  198. key = m[n]
  199. if key:
  200. quotes[key] = quotes.get(key, 0) + 1
  201. try:
  202. n = regexp.groupindex['delim'] - 1
  203. key = m[n]
  204. except KeyError:
  205. continue
  206. if key and (delimiters is None or key in delimiters):
  207. delims[key] = delims.get(key, 0) + 1
  208. try:
  209. n = regexp.groupindex['space'] - 1
  210. except KeyError:
  211. continue
  212. if m[n]:
  213. spaces += 1
  214. quotechar = reduce(lambda a, b, quotes = quotes:
  215. (quotes[a] > quotes[b]) and a or b, quotes.keys())
  216. if delims:
  217. delim = reduce(lambda a, b, delims = delims:
  218. (delims[a] > delims[b]) and a or b, delims.keys())
  219. skipinitialspace = delims[delim] == spaces
  220. if delim == '\n': # most likely a file with a single column
  221. delim = ''
  222. else:
  223. # there is *no* delimiter, it's a single column of quoted data
  224. delim = ''
  225. skipinitialspace = 0
  226. # if we see an extra quote between delimiters, we've got a
  227. # double quoted format
  228. dq_regexp = re.compile(
  229. r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
  230. {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
  231. if dq_regexp.search(data):
  232. doublequote = True
  233. else:
  234. doublequote = False
  235. return (quotechar, doublequote, delim, skipinitialspace)
  236. def _guess_delimiter(self, data, delimiters):
  237. """
  238. The delimiter /should/ occur the same number of times on
  239. each row. However, due to malformed data, it may not. We don't want
  240. an all or nothing approach, so we allow for small variations in this
  241. number.
  242. 1) build a table of the frequency of each character on every line.
  243. 2) build a table of frequencies of this frequency (meta-frequency?),
  244. e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
  245. 7 times in 2 rows'
  246. 3) use the mode of the meta-frequency to determine the /expected/
  247. frequency for that character
  248. 4) find out how often the character actually meets that goal
  249. 5) the character that best meets its goal is the delimiter
  250. For performance reasons, the data is evaluated in chunks, so it can
  251. try and evaluate the smallest portion of the data possible, evaluating
  252. additional chunks as necessary.
  253. """
  254. data = filter(None, data.split('\n'))
  255. ascii = [chr(c) for c in range(127)] # 7-bit ASCII
  256. # build frequency tables
  257. chunkLength = min(10, len(data))
  258. iteration = 0
  259. charFrequency = {}
  260. modes = {}
  261. delims = {}
  262. start, end = 0, min(chunkLength, len(data))
  263. while start < len(data):
  264. iteration += 1
  265. for line in data[start:end]:
  266. for char in ascii:
  267. metaFrequency = charFrequency.get(char, {})
  268. # must count even if frequency is 0
  269. freq = line.count(char)
  270. # value is the mode
  271. metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
  272. charFrequency[char] = metaFrequency
  273. for char in charFrequency.keys():
  274. items = charFrequency[char].items()
  275. if len(items) == 1 and items[0][0] == 0:
  276. continue
  277. # get the mode of the frequencies
  278. if len(items) > 1:
  279. modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
  280. items)
  281. # adjust the mode - subtract the sum of all
  282. # other frequencies
  283. items.remove(modes[char])
  284. modes[char] = (modes[char][0], modes[char][1]
  285. - reduce(lambda a, b: (0, a[1] + b[1]),
  286. items)[1])
  287. else:
  288. modes[char] = items[0]
  289. # build a list of possible delimiters
  290. modeList = modes.items()
  291. total = float(chunkLength * iteration)
  292. # (rows of consistent data) / (number of rows) = 100%
  293. consistency = 1.0
  294. # minimum consistency threshold
  295. threshold = 0.9
  296. while len(delims) == 0 and consistency >= threshold:
  297. for k, v in modeList:
  298. if v[0] > 0 and v[1] > 0:
  299. if ((v[1]/total) >= consistency and
  300. (delimiters is None or k in delimiters)):
  301. delims[k] = v
  302. consistency -= 0.01
  303. if len(delims) == 1:
  304. delim = delims.keys()[0]
  305. skipinitialspace = (data[0].count(delim) ==
  306. data[0].count("%c " % delim))
  307. return (delim, skipinitialspace)
  308. # analyze another chunkLength lines
  309. start = end
  310. end += chunkLength
  311. if not delims:
  312. return ('', 0)
  313. # if there's more than one, fall back to a 'preferred' list
  314. if len(delims) > 1:
  315. for d in self.preferred:
  316. if d in delims.keys():
  317. skipinitialspace = (data[0].count(d) ==
  318. data[0].count("%c " % d))
  319. return (d, skipinitialspace)
  320. # nothing else indicates a preference, pick the character that
  321. # dominates(?)
  322. items = [(v,k) for (k,v) in delims.items()]
  323. items.sort()
  324. delim = items[-1][1]
  325. skipinitialspace = (data[0].count(delim) ==
  326. data[0].count("%c " % delim))
  327. return (delim, skipinitialspace)
  328. def has_header(self, sample):
  329. # Creates a dictionary of types of data in each column. If any
  330. # column is of a single type (say, integers), *except* for the first
  331. # row, then the first row is presumed to be labels. If the type
  332. # can't be determined, it is assumed to be a string in which case
  333. # the length of the string is the determining factor: if all of the
  334. # rows except for the first are the same length, it's a header.
  335. # Finally, a 'vote' is taken at the end for each column, adding or
  336. # subtracting from the likelihood of the first row being a header.
  337. rdr = reader(StringIO(sample), self.sniff(sample))
  338. header = rdr.next() # assume first row is header
  339. columns = len(header)
  340. columnTypes = {}
  341. for i in range(columns): columnTypes[i] = None
  342. checked = 0
  343. for row in rdr:
  344. # arbitrary number of rows to check, to keep it sane
  345. if checked > 20:
  346. break
  347. checked += 1
  348. if len(row) != columns:
  349. continue # skip rows that have irregular number of columns
  350. for col in columnTypes.keys():
  351. for thisType in [int, long, float, complex]:
  352. try:
  353. thisType(row[col])
  354. break
  355. except (ValueError, OverflowError):
  356. pass
  357. else:
  358. # fallback to length of string
  359. thisType = len(row[col])
  360. # treat longs as ints
  361. if thisType == long:
  362. thisType = int
  363. if thisType != columnTypes[col]:
  364. if columnTypes[col] is None: # add new column type
  365. columnTypes[col] = thisType
  366. else:
  367. # type is inconsistent, remove column from
  368. # consideration
  369. del columnTypes[col]
  370. # finally, compare results against first row and "vote"
  371. # on whether it's a header
  372. hasHeader = 0
  373. for col, colType in columnTypes.items():
  374. if type(colType) == type(0): # it's a length
  375. if len(header[col]) != colType:
  376. hasHeader += 1
  377. else:
  378. hasHeader -= 1
  379. else: # attempt typecast
  380. try:
  381. colType(header[col])
  382. except (ValueError, TypeError):
  383. hasHeader += 1
  384. else:
  385. hasHeader -= 1
  386. return hasHeader > 0