Skip to content

bpo-32861: urllib.robotparser fix incomplete __str__ methods. #5711

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Lib/test/test_robotparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,32 @@ class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
"""

expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

def do_GET(self):
Expand Down
17 changes: 12 additions & 5 deletions Lib/urllib/robotparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,10 @@ def request_rate(self, useragent):
return self.default_entry.req_rate

def __str__(self):
return ''.join([str(entry) + "\n" for entry in self.entries])
entries = self.entries
if self.default_entry is not None:
entries = entries + [self.default_entry]
return '\n\n'.join(map(str, entries))


class RuleLine:
Expand Down Expand Up @@ -222,10 +225,14 @@ def __init__(self):
def __str__(self):
ret = []
for agent in self.useragents:
ret.extend(["User-agent: ", agent, "\n"])
for line in self.rulelines:
ret.extend([str(line), "\n"])
return ''.join(ret)
ret.append(f"User-agent: {agent}")
if self.delay is not None:
ret.append(f"Crawl-delay: {self.delay}")
if self.req_rate is not None:
rate = self.req_rate
ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
ret.extend(map(str, self.rulelines))
return '\n'.join(ret)

def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
The urllib.robotparser's ``__str__`` representation now includes wildcard
entries and the "Crawl-delay" and "Request-rate" fields. Also removes extra
newlines that were being appended to the end of the string. Patch by
Michael Lazar.