Skip to content

Commit 2837f6c

Browse files
timarmstrong authored and cloudera-hudson committed
IMPALA-2717: fix output of formatted unicode to non-TTY
The bug is that PrettyOutputFormatter.format() returned a unicode object, and Python cannot automatically write unicode objects to output streams that have no default encoding (such as a non-TTY). The fix is to encode the result as UTF-8 into a regular string, which can be written to any output device. This makes the output type consistent with DelimitedOutputFormatter.format().

Based on code by Marcell Szabo.

Testing:
Added a basic test. Played around in an interactive shell to make sure that unicode characters still work in interactive mode.

Change-Id: I9de641ecf767a2feef3b9f48b344ef2d55e17a7f
Reviewed-on: http://gerrit.cloudera.org:8080/9928
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
1 parent 3e32a4a commit 2837f6c

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

shell/impala_shell.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ class CmdStatus:
7070
ERROR = False
7171

7272
class ImpalaPrettyTable(prettytable.PrettyTable):
73-
"""Patched version of PrettyTable that TODO"""
73+
"""Patched version of PrettyTable with different unicode handling - instead of throwing
74+
exceptions when a character can't be converted to unicode, it is replaced with a
75+
placeholder character."""
7476
def _unicode(self, value):
7577
if not isinstance(value, basestring):
7678
value = str(value)

shell/shell_output.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,16 @@ def __init__(self, prettytable):
2828
self.prettytable = prettytable
2929

3030
def format(self, rows):
31+
"""Returns string containing UTF-8-encoded representation of the table data."""
3132
# Clear rows that already exist in the table.
3233
self.prettytable.clear_rows()
3334
try:
3435
map(self.prettytable.add_row, rows)
35-
return self.prettytable.get_string()
36+
# PrettyTable.get_string() converts UTF-8-encoded strs added via add_row() into
37+
# Python unicode strings. We need to convert it back to a UTF-8-encoded str for
38+
# output, since Python won't do the encoding automatically when outputting to a
39+
# non-terminal (see IMPALA-2717).
40+
return self.prettytable.get_string().encode('utf-8')
3641
except Exception, e:
3742
# beeswax returns each row as a tab separated string. If a string column
3843
# value in a row has tabs, it will break the row split. Default to displaying
@@ -53,6 +58,7 @@ def __init__(self, field_delim="\t"):
5358
raise ValueError, error_msg
5459

5560
def format(self, rows):
61+
"""Returns string containing UTF-8-encoded representation of the table data."""
5662
# csv.writer expects a file handle to the input.
5763
# cStringIO is used as the temporary buffer.
5864
temp_buffer = StringIO()

tests/shell/test_shell_commandline.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
DEFAULT_QUERY = 'select 1'
3434
QUERY_FILE_PATH = os.path.join(os.environ['IMPALA_HOME'], 'tests', 'shell')
3535

36+
RUSSIAN_CHARS = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, О, П, Р,"
37+
u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я")
3638

3739
@pytest.fixture
3840
def empty_table(unique_database, request):
@@ -405,12 +407,27 @@ def test_get_log_once(self, empty_table):
405407

406408
def test_international_characters(self):
407409
"""Sanity test to ensure that the shell can read international characters."""
408-
russian_chars = (u"А, Б, В, Г, Д, Е, Ё, Ж, З, И, Й, К, Л, М, Н, О, П, Р,"
409-
u"С, Т, У, Ф, Х, Ц,Ч, Ш, Щ, Ъ, Ы, Ь, Э, Ю, Я")
410-
args = """-B -q "select '%s'" """ % russian_chars
410+
args = """-B -q "select '%s'" """ % RUSSIAN_CHARS
411411
result = run_impala_shell_cmd(args.encode('utf-8'))
412412
assert 'UnicodeDecodeError' not in result.stderr
413-
assert russian_chars.encode('utf-8') in result.stdout
413+
assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
414+
415+
def test_international_characters_prettyprint(self):
416+
"""IMPALA-2717: ensure we can handle international characters in pretty-printed
417+
output"""
418+
args = """-q "select '%s'" """ % RUSSIAN_CHARS
419+
result = run_impala_shell_cmd(args.encode('utf-8'))
420+
assert 'UnicodeDecodeError' not in result.stderr
421+
assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
422+
423+
def test_international_characters_prettyprint_tabs(self):
424+
"""IMPALA-2717: ensure we can handle international characters in pretty-printed
425+
output when pretty-printing falls back to delimited output."""
426+
args = """-q "select '%s\\t'" """ % RUSSIAN_CHARS
427+
result = run_impala_shell_cmd(args.encode('utf-8'))
428+
assert 'Reverting to tab delimited text' in result.stderr
429+
assert 'UnicodeDecodeError' not in result.stderr
430+
assert RUSSIAN_CHARS.encode('utf-8') in result.stdout
414431

415432
@pytest.mark.execute_serially # This tests invalidates metadata, and must run serially
416433
def test_config_file(self):

0 commit comments

Comments
 (0)