import unittest, StringIO, robotparser
from test import test_support
from urllib2 import urlopen, HTTPError

HAVE_HTTPS = True
try:
    from urllib2 import HTTPSHandler
except ImportError:
    HAVE_HTTPS = False


class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str


tests = unittest.TestSuite()


def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
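
# A minimal sketch of what each RobotTest() call below exercises (hedged
# illustration only, not part of the suite; the agent name and paths are
# made up):
#
#   parser = robotparser.RobotFileParser()
#   parser.parse(StringIO.StringIO("User-agent: *\nDisallow: /tmp/\n").readlines())
#   parser.can_fetch("somebot", "/tmp/x.html")   # -> False (matches Disallow)
#   parser.can_fetch("somebot", "/index.html")   # -> True  (no rule matches)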

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
#    "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /
User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
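
# A hedged illustration of the substring matching mentioned for tests 9/10:
# robotparser lower-cases both names and checks whether the User-agent value
# from robots.txt is contained in the client's agent string, so with the doc
# above a call like
#
#   parser.can_fetch("Googlebot-Mobile", "/something.jpg")
#
# hits the "Googlebot" entry first (its Disallow: / applies) and returns False.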

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /
User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")

# 13. Google also got the order wrong in #8. You need to specify the
#     URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")

# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path
User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query (issue #17403). Normalizing the url first.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)
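
# Note: the cases above are collected in the module-level `tests` suite; a
# minimal sketch for running only them (no network access), assuming the
# standard unittest text runner, would be:
#
#   unittest.TextTestRunner(verbosity=2).run(tests)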

class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    @unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
    @test_support.system_must_validate_cert
    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "https://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "https://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__ == '__main__':
    test_support.verbose = 1
    test_main()