Commit 873a877

Added option to not decode header text
1 parent 11c2f31 commit 873a877

3 files changed: +38 additions, -19 deletions


pandas/io/sas/sas7bdat.py

Lines changed: 29 additions & 11 deletions
@@ -54,20 +54,24 @@ class SAS7BDATReader(BaseIterator):
         with given number of lines.
     encoding : string, defaults to None
         String encoding.
-    convert_text : bool, deafaults to True
+    convert_text : bool, defaults to True
         If False, text variables are left as raw bytes.
+    convert_header_text : bool, defaults to True
+        If False, header text, including column names, are left as raw
+        bytes.
     """

     def __init__(self, path_or_buf, index=None, convert_dates=True,
                  blank_missing=True, chunksize=None, encoding=None,
-                 convert_text=True):
+                 convert_text=True, convert_header_text=True):

         self.index = index
         self.convert_dates = convert_dates
         self.blank_missing = blank_missing
         self.chunksize = chunksize
         self.encoding = encoding
         self.convert_text = convert_text
+        self.convert_header_text = convert_header_text

         self.compression = ""
         self.column_names_strings = []
@@ -143,10 +147,14 @@ def _get_properties(self):
         self.platform = "unknown"

         buf = self._read_bytes(const.dataset_offset, const.dataset_length)
-        self.name = buf.rstrip(b'\x00 ').decode()
+        self.name = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.name = self.name.decode(self.encoding)

         buf = self._read_bytes(const.file_type_offset, const.file_type_length)
-        self.file_type = buf.rstrip(b'\x00 ').decode()
+        self.file_type = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.file_type = self.file_type.decode(self.encoding)

         # Timestamp is epoch 01/01/1960
         epoch = pd.datetime(1960, 1, 1)
@@ -173,25 +181,33 @@ def _get_properties(self):

         buf = self._read_bytes(const.sas_release_offset + total_align,
                                const.sas_release_length)
-        self.sas_release = buf.rstrip(b'\x00 ').decode()
+        self.sas_release = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.sas_release = self.sas_release.decode(self.encoding)

         buf = self._read_bytes(const.sas_server_type_offset + total_align,
                                const.sas_server_type_length)
-        self.server_type = buf.rstrip(b'\x00 ').decode()
+        self.server_type = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.server_type = self.server_type.decode(self.encoding)

         buf = self._read_bytes(const.os_version_number_offset + total_align,
                                const.os_version_number_length)
-        self.os_version = buf.rstrip(b'\x00 ').decode()
+        self.os_version = buf.rstrip(b'\x00 ')
+        if self.convert_header_text:
+            self.os_version = self.os_version.decode(self.encoding)

         buf = self._read_bytes(const.os_name_offset + total_align,
                                const.os_name_length)
         buf = buf.rstrip(b'\x00 ')
         if len(buf) > 0:
-            self.os_name = buf.decode()
+            self.os_name = buf.decode(self.encoding)
         else:
             buf = self._read_bytes(const.os_maker_offset + total_align,
                                    const.os_maker_length)
-            self.os_name = buf.rstrip(b'\x00 ').decode()
+            self.os_name = buf.rstrip(b'\x00 ')
+            if self.convert_header_text:
+                self.os_name = self.os_name.decode(self.encoding)

     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset, width):
@@ -383,8 +399,10 @@ def _process_columntext_subheader(self, offset, length):
         text_block_size = self._read_int(offset, const.text_block_size_length)

         buf = self._read_bytes(offset, text_block_size)
-        self.column_names_strings.append(
-            buf[0:text_block_size].rstrip(b"\x00 ").decode(self.encoding))
+        cname = buf[0:text_block_size].rstrip(b"\x00 ")
+        if self.convert_header_text:
+            cname = cname.decode(self.encoding)
+        self.column_names_strings.append(cname)

         if len(self.column_names_strings) == 1:
             column_name = self.column_names_strings[0]
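
A minimal usage sketch of the new flag. Only the constructor arguments come from this diff; the file name is hypothetical and the read() call is assumed from the reader's existing iterator interface:

    from pandas.io.sas.sas7bdat import SAS7BDATReader

    # Header fields (dataset name, SAS release, column names, ...) stay as
    # raw bytes; data values are still decoded according to convert_text.
    rdr = SAS7BDATReader("example.sas7bdat",       # hypothetical file name
                         encoding="latin-1",
                         convert_header_text=False)
    df = rdr.read()     # assumed read() method of the reader
    print(rdr.name)     # bytes, e.g. b'EXAMPLE', since header text is not decoded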

pandas/io/sas/saslib.pyx

Lines changed: 8 additions & 7 deletions
@@ -11,7 +11,6 @@ cdef rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):

     cdef uint8_t control_byte
     cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
-
     cdef int rpos = 0
     cdef int ipos = 0
     cdef int i
@@ -106,7 +105,7 @@ cdef rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     if len(result) != result_length:
         print("RLE: %v != %v\n", (len(result), result_length))

-    return np.asarray(result).tostring()
+    return np.asarray(result)


 # rdc_decompress decompresses data using the Ross Data Compression algorithm:
@@ -122,7 +121,6 @@ cdef rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     cdef int ipos = 0
     cdef int rpos = 0
     cdef int k
-
     cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8)

     ii = -1
@@ -190,7 +188,7 @@ cdef rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
     if len(outbuff) != result_length:
         raise ValueError("RDC: %v != %v\n", len(outbuff), result_length)

-    return np.asarray(outbuff).tostring()
+    return np.asarray(outbuff)

 cdef decompress(object parser, int row_length, page):
     page = np.frombuffer(page, dtype=np.uint8)
@@ -292,10 +290,13 @@ cdef process_byte_array_with_data(object parser, int offset, int length):
         char[:] column_types = parser.column_types
         uint8_t[:, :] byte_chunk = parser._byte_chunk
         object[:, :] string_chunk = parser._string_chunk
+        np.ndarray[uint8_t, ndim=1] source
+        np.ndarray[uint8_t, ndim=1] raw_source = np.frombuffer(parser._cached_page[offset:offset+length], dtype=np.uint8)

-    source = parser._cached_page[offset:offset+length]
     if (parser.compression != "") and (length < parser.row_length):
-        source = decompress(parser, parser.row_length, source)
+        source = decompress(parser, parser.row_length, raw_source)
+    else:
+        source = raw_source

     s = 8 * parser._current_row_in_chunk_index
     js = 0
@@ -314,7 +315,7 @@ cdef process_byte_array_with_data(object parser, int offset, int length):
                 byte_chunk[jb, m + k] = source[start + k]
             jb += 1
         elif column_types[j] == b's':
-            string_chunk[js, parser._current_row_in_chunk_index] = source[start:(start+lngt)].rstrip()
+            string_chunk[js, parser._current_row_in_chunk_index] = source[start:(start+lngt)].tostring().rstrip()
             js += 1
         else:
             raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
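
With this change the decompression routines return uint8 ndarrays rather than byte strings, and string columns are converted back to bytes only at the point where they are stored, via tostring().rstrip(). A small plain-Python sketch of that buffer handling (not the Cython itself; the row layout below is made up):

    import numpy as np

    raw = b"ABC   XYZ123"                        # hypothetical fixed-width row
    source = np.frombuffer(raw, dtype=np.uint8)  # uint8 view, as in the diff

    start, lngt = 0, 6                           # hypothetical column offset/length
    value = source[start:start + lngt].tobytes().rstrip()
    # tobytes() is the current NumPy spelling; the .pyx uses the older tostring().
    assert value == b"ABC"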

pandas/io/tests/sas/test_sas7bdat.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def test_from_buffer(self):
             byts = open(fname, 'rb').read()
             buf = io.BytesIO(byts)
             df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
-            tm.assert_frame_equal(df, df0)
+            tm.assert_frame_equal(df, df0, check_exact=False)

     def test_from_iterator(self):
         for j in 0, 1:
