robotparser.py

  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://www.robotstxt.org/norobots-rfc.txt
  8. """
  9. import urlparse
  10. import urllib
  11. __all__ = ["RobotFileParser"]

class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400 and self.errcode < 500:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        self.modified()
        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
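
    # Illustrative note (added, not part of the original module): parse()
    # consumes the file as a sequence of lines (read() passes them already
    # stripped).  A hypothetical robots.txt such as
    #
    #   User-agent: BadBot
    #   Disallow: /
    #
    #   User-agent: *
    #   Disallow: /private/
    #   Allow: /private/public-page.html
    #
    # produces one Entry for "BadBot" in self.entries, while the "*" group
    # is stored separately as self.default_entry and consulted last.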

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True

        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False

        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])

class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
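
# Illustrative sketch (added, not part of the original module): a RuleLine
# matches by simple prefix comparison, so a rule built from "Disallow: /private/"
# applies to every path below that prefix:
#
#   rule = RuleLine("/private/", False)
#   rule.applies_to("/private/index.html")   # -> True, rule applies (disallowed)
#   rule.applies_to("/public/index.html")    # -> False, rule does not apply
#   str(rule)                                # -> "Disallow: /private/"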

class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
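
# Illustrative sketch (added): Entry.applies_to() compares only the name token
# before the first "/", case-insensitively and by substring.  With a made-up
# agent string:
#
#   e = Entry()
#   e.useragents.append("figtree")
#   e.applies_to("FigTree/0.1 Robot libwww-perl/5.04")   # -> True
#   e.applies_to("Mozilla/5.0")                          # -> False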

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
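
# Usage sketch (added for illustration; the URL and the "MyCrawler" agent name
# below are placeholders, and the __main__ guard keeps this from running on
# import).  It shows the typical set_url()/read()/can_fetch() sequence.
if __name__ == "__main__":
    rp = RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    # Fetch and parse the remote robots.txt (requires network access).
    rp.read()
    # Ask whether the named crawler may fetch a given URL.
    print(rp.can_fetch("MyCrawler", "http://www.example.com/private/page.html"))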