  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://www.robotstxt.org/norobots-rfc.txt
  8. """
  9. import urllib.parse, urllib.request
  10. __all__ = ["RobotFileParser"]
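
# Typical use of RobotFileParser (an illustrative sketch, not part of the
# original module; the host name and agent string below are placeholders):
#
#     rp = RobotFileParser()
#     rp.set_url("http://www.example.com/robots.txt")
#     rp.read()
#     if rp.can_fetch("ExampleBot", "http://www.example.com/some/page.html"):
#         ...  # fetch the page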

class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())
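
    # Note on the error handling above (descriptive comment, not part of the
    # original module): a robots.txt URL answering 401 or 403 is treated as
    # "everything is disallowed", while any other 4xx client error is treated
    # as "no robots.txt exists", i.e. everything is allowed.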

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line need not be preceded by one or more blank
        lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
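
    # Example input for parse() (an illustrative sketch, not part of the
    # original module).  Each block becomes one Entry; the "*" block is
    # stored as the default entry rather than appended to self.entries:
    #
    #     User-agent: ExampleBot
    #     Disallow: /private/
    #
    #     User-agent: *
    #     Disallow: /tmp/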

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
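
    # Note (descriptive comment, not part of the original module): can_fetch()
    # strips the scheme and network location from the URL before matching, so
    # "http://www.example.com/private/page.html" is reduced to
    # "/private/page.html" before the entries are consulted.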

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])

class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path

class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False
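
    # Example (descriptive comment, not part of the original module): an entry
    # whose useragents list contains "ExampleBot" applies to the agent string
    # "ExampleBot/1.0 (+http://www.example.com/bot.html)", because only the
    # name token before the first "/" is compared, case-insensitively and by
    # substring.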

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
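

# Minimal self-test (an illustrative sketch, not part of the original module):
# parse an in-memory robots.txt and query it without touching the network.
# The agent names and URLs below are placeholders.
if __name__ == "__main__":
    rp = RobotFileParser()
    rp.parse([
        "User-agent: ExampleBot",
        "Disallow: /private/",
        "",
        "User-agent: *",
        "Disallow: /tmp/",
    ])
    print(rp.can_fetch("ExampleBot", "http://www.example.com/private/page.html"))  # False
    print(rp.can_fetch("ExampleBot", "http://www.example.com/public/page.html"))   # True
    print(rp.can_fetch("OtherBot", "http://www.example.com/tmp/scratch.txt"))      # False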