# _parseaddr.py (15 KB)
# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""
  6. __all__ = [
  7. 'mktime_tz',
  8. 'parsedate',
  9. 'parsedate_tz',
  10. 'quote',
  11. ]
  12. import time, calendar
  13. SPACE = ' '
  14. EMPTYSTRING = ''
  15. COMMASPACE = ', '
  16. # Parse a date field
  17. _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
  18. 'aug', 'sep', 'oct', 'nov', 'dec',
  19. 'january', 'february', 'march', 'april', 'may', 'june', 'july',
  20. 'august', 'september', 'october', 'november', 'december']
  21. _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
  22. # The timezone table does not include the military time zones defined
  23. # in RFC822, other than Z. According to RFC1123, the description in
  24. # RFC822 gets the signs wrong, so we can't rely on any such time
  25. # zones. RFC1123 recommends that numeric timezone indicators be used
  26. # instead of timezone names.
  27. _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
  28. 'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
  29. 'EST': -500, 'EDT': -400, # Eastern
  30. 'CST': -600, 'CDT': -500, # Central
  31. 'MST': -700, 'MDT': -600, # Mountain
  32. 'PST': -800, 'PDT': -700 # Pacific
  33. }
  34. def parsedate_tz(data):
  35. """Convert a date string to a time tuple.
  36. Accounts for military timezones.
  37. """
  38. data = data.split()
  39. # The FWS after the comma after the day-of-week is optional, so search and
  40. # adjust for this.
  41. if data[0].endswith(',') or data[0].lower() in _daynames:
  42. # There's a dayname here. Skip it
  43. del data[0]
  44. else:
  45. i = data[0].rfind(',')
  46. if i >= 0:
  47. data[0] = data[0][i+1:]
  48. if len(data) == 3: # RFC 850 date, deprecated
  49. stuff = data[0].split('-')
  50. if len(stuff) == 3:
  51. data = stuff + data[1:]
  52. if len(data) == 4:
  53. s = data[3]
  54. i = s.find('+')
  55. if i > 0:
  56. data[3:] = [s[:i], s[i+1:]]
  57. else:
  58. data.append('') # Dummy tz
  59. if len(data) < 5:
  60. return None
  61. data = data[:5]
  62. [dd, mm, yy, tm, tz] = data
  63. mm = mm.lower()
  64. if mm not in _monthnames:
  65. dd, mm = mm, dd.lower()
  66. if mm not in _monthnames:
  67. return None
  68. mm = _monthnames.index(mm) + 1
  69. if mm > 12:
  70. mm -= 12
  71. if dd[-1] == ',':
  72. dd = dd[:-1]
  73. i = yy.find(':')
  74. if i > 0:
  75. yy, tm = tm, yy
  76. if yy[-1] == ',':
  77. yy = yy[:-1]
  78. if not yy[0].isdigit():
  79. yy, tz = tz, yy
  80. if tm[-1] == ',':
  81. tm = tm[:-1]
  82. tm = tm.split(':')
  83. if len(tm) == 2:
  84. [thh, tmm] = tm
  85. tss = '0'
  86. elif len(tm) == 3:
  87. [thh, tmm, tss] = tm
  88. else:
  89. return None
  90. try:
  91. yy = int(yy)
  92. dd = int(dd)
  93. thh = int(thh)
  94. tmm = int(tmm)
  95. tss = int(tss)
  96. except ValueError:
  97. return None
  98. # Check for a yy specified in two-digit format, then convert it to the
  99. # appropriate four-digit format, according to the POSIX standard. RFC 822
  100. # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
  101. # mandates a 4-digit yy. For more information, see the documentation for
  102. # the time module.
  103. if yy < 100:
  104. # The year is between 1969 and 1999 (inclusive).
  105. if yy > 68:
  106. yy += 1900
  107. # The year is between 2000 and 2068 (inclusive).
  108. else:
  109. yy += 2000
  110. tzoffset = None
  111. tz = tz.upper()
  112. if tz in _timezones:
  113. tzoffset = _timezones[tz]
  114. else:
  115. try:
  116. tzoffset = int(tz)
  117. except ValueError:
  118. pass
  119. # Convert a timezone offset into seconds ; -0500 -> -18000
  120. if tzoffset:
  121. if tzoffset < 0:
  122. tzsign = -1
  123. tzoffset = -tzoffset
  124. else:
  125. tzsign = 1
  126. tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
  127. # Daylight Saving Time flag is set to -1, since DST is unknown.
  128. return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
  129. def parsedate(data):
  130. """Convert a time string to a time tuple."""
  131. t = parsedate_tz(data)
  132. if isinstance(t, tuple):
  133. return t[:9]
  134. else:
  135. return t
  136. def mktime_tz(data):
  137. """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
  138. if data[9] is None:
  139. # No zone info, so localtime is better assumption than GMT
  140. return time.mktime(data[:8] + (-1,))
  141. else:
  142. t = calendar.timegm(data)
  143. return t - data[9]
  144. def quote(str):
  145. """Prepare string to be used in a quoted string.
  146. Turns backslash and double quote characters into quoted pairs. These
  147. are the only characters that need to be quoted inside a quoted string.
  148. Does not add the surrounding double quotes.
  149. """
  150. return str.replace('\\', '\\\\').replace('"', '\\"')
  151. class AddrlistClass:
  152. """Address parser class by Ben Escoto.
  153. To understand what this class does, it helps to have a copy of RFC 2822 in
  154. front of you.
  155. Note: this class interface is deprecated and may be removed in the future.
  156. Use rfc822.AddressList instead.
  157. """
  158. def __init__(self, field):
  159. """Initialize a new instance.
  160. `field' is an unparsed address header field, containing
  161. one or more addresses.
  162. """
  163. self.specials = '()<>@,:;.\"[]'
  164. self.pos = 0
  165. self.LWS = ' \t'
  166. self.CR = '\r\n'
  167. self.FWS = self.LWS + self.CR
  168. self.atomends = self.specials + self.LWS + self.CR
  169. # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
  170. # is obsolete syntax. RFC 2822 requires that we recognize obsolete
  171. # syntax, so allow dots in phrases.
  172. self.phraseends = self.atomends.replace('.', '')
  173. self.field = field
  174. self.commentlist = []
  175. def gotonext(self):
  176. """Parse up to the start of the next address."""
  177. while self.pos < len(self.field):
  178. if self.field[self.pos] in self.LWS + '\n\r':
  179. self.pos += 1
  180. elif self.field[self.pos] == '(':
  181. self.commentlist.append(self.getcomment())
  182. else:
  183. break
  184. def getaddrlist(self):
  185. """Parse all addresses.
  186. Returns a list containing all of the addresses.
  187. """
  188. result = []
  189. while self.pos < len(self.field):
  190. ad = self.getaddress()
  191. if ad:
  192. result += ad
  193. else:
  194. result.append(('', ''))
  195. return result
  196. def getaddress(self):
  197. """Parse the next address."""
  198. self.commentlist = []
  199. self.gotonext()
  200. oldpos = self.pos
  201. oldcl = self.commentlist
  202. plist = self.getphraselist()
  203. self.gotonext()
  204. returnlist = []
  205. if self.pos >= len(self.field):
  206. # Bad email address technically, no domain.
  207. if plist:
  208. returnlist = [(SPACE.join(self.commentlist), plist[0])]
  209. elif self.field[self.pos] in '.@':
  210. # email address is just an addrspec
  211. # this isn't very efficient since we start over
  212. self.pos = oldpos
  213. self.commentlist = oldcl
  214. addrspec = self.getaddrspec()
  215. returnlist = [(SPACE.join(self.commentlist), addrspec)]
  216. elif self.field[self.pos] == ':':
  217. # address is a group
  218. returnlist = []
  219. fieldlen = len(self.field)
  220. self.pos += 1
  221. while self.pos < len(self.field):
  222. self.gotonext()
  223. if self.pos < fieldlen and self.field[self.pos] == ';':
  224. self.pos += 1
  225. break
  226. returnlist = returnlist + self.getaddress()
  227. elif self.field[self.pos] == '<':
  228. # Address is a phrase then a route addr
  229. routeaddr = self.getrouteaddr()
  230. if self.commentlist:
  231. returnlist = [(SPACE.join(plist) + ' (' +
  232. ' '.join(self.commentlist) + ')', routeaddr)]
  233. else:
  234. returnlist = [(SPACE.join(plist), routeaddr)]
  235. else:
  236. if plist:
  237. returnlist = [(SPACE.join(self.commentlist), plist[0])]
  238. elif self.field[self.pos] in self.specials:
  239. self.pos += 1
  240. self.gotonext()
  241. if self.pos < len(self.field) and self.field[self.pos] == ',':
  242. self.pos += 1
  243. return returnlist
  244. def getrouteaddr(self):
  245. """Parse a route address (Return-path value).
  246. This method just skips all the route stuff and returns the addrspec.
  247. """
  248. if self.field[self.pos] != '<':
  249. return
  250. expectroute = False
  251. self.pos += 1
  252. self.gotonext()
  253. adlist = ''
  254. while self.pos < len(self.field):
  255. if expectroute:
  256. self.getdomain()
  257. expectroute = False
  258. elif self.field[self.pos] == '>':
  259. self.pos += 1
  260. break
  261. elif self.field[self.pos] == '@':
  262. self.pos += 1
  263. expectroute = True
  264. elif self.field[self.pos] == ':':
  265. self.pos += 1
  266. else:
  267. adlist = self.getaddrspec()
  268. self.pos += 1
  269. break
  270. self.gotonext()
  271. return adlist
  272. def getaddrspec(self):
  273. """Parse an RFC 2822 addr-spec."""
  274. aslist = []
  275. self.gotonext()
  276. while self.pos < len(self.field):
  277. if self.field[self.pos] == '.':
  278. aslist.append('.')
  279. self.pos += 1
  280. elif self.field[self.pos] == '"':
  281. aslist.append('"%s"' % quote(self.getquote()))
  282. elif self.field[self.pos] in self.atomends:
  283. break
  284. else:
  285. aslist.append(self.getatom())
  286. self.gotonext()
  287. if self.pos >= len(self.field) or self.field[self.pos] != '@':
  288. return EMPTYSTRING.join(aslist)
  289. aslist.append('@')
  290. self.pos += 1
  291. self.gotonext()
  292. return EMPTYSTRING.join(aslist) + self.getdomain()
  293. def getdomain(self):
  294. """Get the complete domain name from an address."""
  295. sdlist = []
  296. while self.pos < len(self.field):
  297. if self.field[self.pos] in self.LWS:
  298. self.pos += 1
  299. elif self.field[self.pos] == '(':
  300. self.commentlist.append(self.getcomment())
  301. elif self.field[self.pos] == '[':
  302. sdlist.append(self.getdomainliteral())
  303. elif self.field[self.pos] == '.':
  304. self.pos += 1
  305. sdlist.append('.')
  306. elif self.field[self.pos] in self.atomends:
  307. break
  308. else:
  309. sdlist.append(self.getatom())
  310. return EMPTYSTRING.join(sdlist)
  311. def getdelimited(self, beginchar, endchars, allowcomments=True):
  312. """Parse a header fragment delimited by special characters.
  313. `beginchar' is the start character for the fragment.
  314. If self is not looking at an instance of `beginchar' then
  315. getdelimited returns the empty string.
  316. `endchars' is a sequence of allowable end-delimiting characters.
  317. Parsing stops when one of these is encountered.
  318. If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
  319. within the parsed fragment.
  320. """
  321. if self.field[self.pos] != beginchar:
  322. return ''
  323. slist = ['']
  324. quote = False
  325. self.pos += 1
  326. while self.pos < len(self.field):
  327. if quote:
  328. slist.append(self.field[self.pos])
  329. quote = False
  330. elif self.field[self.pos] in endchars:
  331. self.pos += 1
  332. break
  333. elif allowcomments and self.field[self.pos] == '(':
  334. slist.append(self.getcomment())
  335. continue # have already advanced pos from getcomment
  336. elif self.field[self.pos] == '\\':
  337. quote = True
  338. else:
  339. slist.append(self.field[self.pos])
  340. self.pos += 1
  341. return EMPTYSTRING.join(slist)
  342. def getquote(self):
  343. """Get a quote-delimited fragment from self's field."""
  344. return self.getdelimited('"', '"\r', False)
  345. def getcomment(self):
  346. """Get a parenthesis-delimited fragment from self's field."""
  347. return self.getdelimited('(', ')\r', True)
  348. def getdomainliteral(self):
  349. """Parse an RFC 2822 domain-literal."""
  350. return '[%s]' % self.getdelimited('[', ']\r', False)
  351. def getatom(self, atomends=None):
  352. """Parse an RFC 2822 atom.
  353. Optional atomends specifies a different set of end token delimiters
  354. (the default is to use self.atomends). This is used e.g. in
  355. getphraselist() since phrase endings must not include the `.' (which
  356. is legal in phrases)."""
  357. atomlist = ['']
  358. if atomends is None:
  359. atomends = self.atomends
  360. while self.pos < len(self.field):
  361. if self.field[self.pos] in atomends:
  362. break
  363. else:
  364. atomlist.append(self.field[self.pos])
  365. self.pos += 1
  366. return EMPTYSTRING.join(atomlist)
  367. def getphraselist(self):
  368. """Parse a sequence of RFC 2822 phrases.
  369. A phrase is a sequence of words, which are in turn either RFC 2822
  370. atoms or quoted-strings. Phrases are canonicalized by squeezing all
  371. runs of continuous whitespace into one space.
  372. """
  373. plist = []
  374. while self.pos < len(self.field):
  375. if self.field[self.pos] in self.FWS:
  376. self.pos += 1
  377. elif self.field[self.pos] == '"':
  378. plist.append(self.getquote())
  379. elif self.field[self.pos] == '(':
  380. self.commentlist.append(self.getcomment())
  381. elif self.field[self.pos] in self.phraseends:
  382. break
  383. else:
  384. plist.append(self.getatom(self.phraseends))
  385. return plist
  386. class AddressList(AddrlistClass):
  387. """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
  388. def __init__(self, field):
  389. AddrlistClass.__init__(self, field)
  390. if field:
  391. self.addresslist = self.getaddrlist()
  392. else:
  393. self.addresslist = []
  394. def __len__(self):
  395. return len(self.addresslist)
  396. def __add__(self, other):
  397. # Set union
  398. newaddr = AddressList(None)
  399. newaddr.addresslist = self.addresslist[:]
  400. for x in other.addresslist:
  401. if not x in self.addresslist:
  402. newaddr.addresslist.append(x)
  403. return newaddr
  404. def __iadd__(self, other):
  405. # Set union, in-place
  406. for x in other.addresslist:
  407. if not x in self.addresslist:
  408. self.addresslist.append(x)
  409. return self
  410. def __sub__(self, other):
  411. # Set difference
  412. newaddr = AddressList(None)
  413. for x in self.addresslist:
  414. if not x in other.addresslist:
  415. newaddr.addresslist.append(x)
  416. return newaddr
  417. def __isub__(self, other):
  418. # Set difference, in-place
  419. for x in other.addresslist:
  420. if x in self.addresslist:
  421. self.addresslist.remove(x)
  422. return self
  423. def __getitem__(self, index):
  424. # Make indexing, slices, and 'in' work
  425. return self.addresslist[index]