Improve whitespace encoding in converter module

xolox · xolox · commit 50cebecf76b1 · 2017-05-17T23:23:53.000+02:00
diff --git a/coloredlogs/converter.py b/coloredlogs/converter.py
@@ -81,14 +81,15 @@ def capture(command, encoding='UTF-8'):
     return u'\n'.join(clean_terminal_output(output))
 
 
-def convert(text, code=True):
+def convert(text, code=True, tabsize=4):
     """
     Convert text with ANSI escape sequences to HTML.
 
     :param text: The text with ANSI escape sequences (a string).
     :param code: Whether to wrap the returned HTML fragment in a
                  ``<code>...</code>`` element (a boolean, defaults
                  to :data:`True`).
+    :param tabsize: Refer to :func:`str.expandtabs()` for details.
     :returns: The text converted to HTML (a string).
     """
     output = []
@@ -116,31 +117,67 @@ def convert(text, code=True):
                     token = ''
         else:
             token = html_encode(token)
-            token = encode_whitespace(token)
         output.append(token)
     html = ''.join(output)
+    html = encode_whitespace(html, tabsize)
     if code:
         html = '<code>%s</code>' % html
     return html
 
 
-def encode_whitespace(text):
+def encode_whitespace(text, tabsize=4):
     """
     Encode whitespace so that web browsers properly render it.
 
     :param text: The plain text (a string).
+    :param tabsize: Refer to :func:`str.expandtabs()` for details.
     :returns: The text converted to HTML (a string).
 
     The purpose of this function is to encode whitespace in such a way that web
     browsers render the same whitespace regardless of whether 'preformatted'
     styling is used (by wrapping the text in a ``<pre>...</pre>`` element).
+
+    .. note:: While the string manipulation performed by this function is
+              specifically intended not to corrupt the HTML generated by
+              :func:`convert()` it definitely does have the potential to
+              corrupt HTML from other sources. You have been warned :-).
     """
+    # Convert Windows line endings (CR+LF) to UNIX line endings (LF).
     text = text.replace('\r\n', '\n')
+    # Convert UNIX line endings (LF) to HTML line endings (<br>).
     text = text.replace('\n', '<br>\n')
-    text = text.replace(' ', '&nbsp;')
+    # Convert tabs to spaces.
+    text = text.expandtabs(tabsize)
+    # Convert leading spaces (that is to say spaces at the start of the string
+    # and/or directly after a line ending) into non-breaking spaces, otherwise
+    # HTML rendering engines will simply ignore these spaces.
+    text = re.sub('^ +', encode_whitespace_cb, text, 0, re.MULTILINE)
+    # Convert runs of multiple spaces into non-breaking spaces to prevent HTML
+    # rendering engines from visually collapsing runs of spaces into a single
+    # space. We specifically don't replace single spaces for several reasons:
+    # 1. We'd break the HTML emitted by convert() by replacing spaces
+    #    inside HTML elements (for example the spaces that separate
+    #    element names from attribute names).
+    # 2. If every single space is replaced by a non-breaking space,
+    #    web browsers perform awkwardly unintuitive word wrapping.
+    # 3. The HTML output would be bloated for no good reason.
+    text = re.sub(' {2,}', encode_whitespace_cb, text)
     return text
 
 
+def encode_whitespace_cb(match):
+    """
+    Replace runs of multiple spaces with non-breaking spaces.
+
+    :param match: A regular expression match object.
+    :returns: The replacement string.
+
+    This function is used by :func:`encode_whitespace()` as a callback for
+    replacement using a regular expression pattern.
+    """
+    return '&nbsp;' * len(match.group(0))
+
+
 def html_encode(text):
     """
     Encode characters with a special meaning as HTML.
diff --git a/coloredlogs/tests.py b/coloredlogs/tests.py
@@ -343,7 +343,7 @@ def test_html_conversion(self):
         assert ansi_encoded_text == 'I like \x1b[1;34mbirds\x1b[0m - www.eelstheband.com'
         html_encoded_text = convert(ansi_encoded_text)
         assert html_encoded_text == (
-            '<code>I&nbsp;like&nbsp;<span style="font-weight:bold;color:blue">birds</span>&nbsp;-&nbsp;'
+            '<code>I like <span style="font-weight:bold;color:blue">birds</span> - '
             '<a href="http://www.eelstheband.com" style="color:inherit">www.eelstheband.com</a></code>'
         )
 

Original file line number	Diff line number	Diff line change
`@@ -343,7 +343,7 @@ def test_html_conversion(self):`
`343`	`343`	`assert ansi_encoded_text == 'I like \x1b[1;34mbirds\x1b[0m - www.eelstheband.com'`
`344`	`344`	`html_encoded_text = convert(ansi_encoded_text)`
`345`	`345`	`assert html_encoded_text == (`
`346`		`- '<code>I&nbsp;like&nbsp;<span style="font-weight:bold;color:blue">birds</span>&nbsp;-&nbsp;'`
	`346`	`+ '<code>I like <span style="font-weight:bold;color:blue">birds</span> - '`
`347`	`347`	`'<a href="http://www.eelstheband.com" style="color:inherit">www.eelstheband.com</a></code>'`
`348`	`348`	`)`
`349`	`349`