Skip to content

Commit 24d8cf3

Browse files
committed
initial sketch
1 parent 08fdcd8 commit 24d8cf3

27 files changed

+54721
-451
lines changed

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@
4444

4545
# define extension module
4646
ext_modules = cythonize([
47-
Extension('zarr.ext',
48-
sources=['zarr/ext.pyx'] + blosc_sources,
47+
Extension('zarr.blosc',
48+
sources=['zarr/blosc.pyx'] + blosc_sources,
4949
include_dirs=include_dirs,
5050
define_macros=define_macros,
5151
extra_compile_args=extra_compile_args,

zarr/__init__.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,3 @@
22
from __future__ import absolute_import, print_function, division
33

44

5-
import atexit
6-
import multiprocessing
7-
8-
9-
from zarr.create import empty, zeros, ones, full, array, open, empty_like, \
10-
zeros_like, ones_like, full_like, open_like
11-
from zarr.ext import blosc_version, init as _init, destroy as _destroy, \
12-
set_blosc_options
13-
from zarr import defaults
14-
from zarr import constants
15-
from zarr.version import version as __version__
16-
17-
18-
ncores = multiprocessing.cpu_count()
19-
_init()
20-
set_blosc_options(use_context=False, nthreads=ncores)
21-
atexit.register(_destroy)

zarr/array.py

Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import absolute_import, print_function, division
3+
4+
5+
from functools import reduce # TODO PY2 compatibility
6+
import operator
7+
import itertools
8+
import numpy as np
9+
10+
11+
from zarr.blosc import compress, decompress
12+
13+
14+
def _is_total_slice(item, shape):
15+
"""Determine whether `item` specifies a complete slice of array with the
16+
given `shape`. Used to optimise __setitem__ operations on the Chunk
17+
class."""
18+
19+
if item == Ellipsis:
20+
return True
21+
if item == slice(None):
22+
return True
23+
if isinstance(item, tuple):
24+
return all(
25+
(isinstance(s, slice) and
26+
((s == slice(None)) or (s.stop - s.start == l)))
27+
for s, l in zip(item, shape)
28+
)
29+
return False
30+
31+
32+
def _normalize_axis_selection(item, l):
33+
"""Convenience function to normalize a selection within a single axis
34+
of size `l`."""
35+
36+
if isinstance(item, int):
37+
if item < 0:
38+
# handle wraparound
39+
item = l + item
40+
if item > (l - 1) or item < 0:
41+
raise IndexError('index out of bounds: %s' % item)
42+
return item, item + 1
43+
44+
elif isinstance(item, slice):
45+
if item.step is not None and item.step != 1:
46+
raise NotImplementedError('slice with step not supported')
47+
start = 0 if item.start is None else item.start
48+
stop = l if item.stop is None else item.stop
49+
if start < 0:
50+
start = l + start
51+
if stop < 0:
52+
stop = l + stop
53+
if start < 0 or stop < 0:
54+
raise IndexError('index out of bounds: %s, %s' % (start, stop))
55+
if stop > l:
56+
stop = l
57+
return start, stop
58+
59+
else:
60+
raise ValueError('expected integer or slice, found: %r' % item)
61+
62+
63+
def _normalize_array_selection(item, shape):
64+
"""Convenience function to normalize a selection within an array with
65+
the given `shape`."""
66+
67+
# normalize item
68+
if isinstance(item, int):
69+
item = (item,)
70+
elif isinstance(item, slice):
71+
item = (item,)
72+
elif item == Ellipsis:
73+
item = (slice(None),)
74+
75+
# handle tuple of indices/slices
76+
if isinstance(item, tuple):
77+
78+
# determine start and stop indices for all axes
79+
selection = tuple(_normalize_axis_selection(i, l)
80+
for i, l in zip(item, shape))
81+
82+
# fill out selection if not completely specified
83+
if len(selection) < len(shape):
84+
selection += tuple((0, l) for l in shape[len(selection):])
85+
86+
return selection
87+
88+
else:
89+
raise ValueError('expected indices or slice, found: %r' % item)
90+
91+
92+
def _get_chunk_range(selection, chunks):
93+
"""Convenience function to get a range over all chunk indices,
94+
for iterating over chunks."""
95+
chunk_range = [range(start//l, int(np.ceil(stop/l)))
96+
for (start, stop), l in zip(selection, chunks)]
97+
return chunk_range
98+
99+
100+
class Array(object):
101+
102+
def __init__(self, store):
103+
self._store = store
104+
105+
# store configuration metadata
106+
self._shape = store.meta['shape']
107+
self._chunks = store.meta['chunks']
108+
self._dtype = store.meta['dtype']
109+
self._cname = store.meta['cname']
110+
# TODO check valid cname here?
111+
self._clevel = store.meta['clevel']
112+
# TODO check valid clevel here?
113+
self._shuffle = store.meta['shuffle']
114+
self._fill_value = store.meta['fill_value']
115+
116+
# store user-defined attributes
117+
self._attrs = store.attrs
118+
119+
@property
120+
def shape(self):
121+
return self._shape
122+
123+
@property
124+
def chunks(self):
125+
return self._chunks
126+
127+
@property
128+
def dtype(self):
129+
return self._dtype
130+
131+
@property
132+
def cname(self):
133+
return self._cname
134+
135+
@property
136+
def clevel(self):
137+
return self._clevel
138+
139+
@property
140+
def shuffle(self):
141+
return self._shuffle
142+
143+
@property
144+
def fill_value(self):
145+
return self._fill_value
146+
147+
@property
148+
def attrs(self):
149+
return self._attrs
150+
151+
@property
152+
def cbytes(self):
153+
# pass through
154+
return self._store.cbytes
155+
156+
# derived properties
157+
158+
@property
159+
def size(self):
160+
return reduce(operator.mul, self._shape)
161+
162+
@property
163+
def itemsize(self):
164+
return self._dtype.itemsize
165+
166+
@property
167+
def nbytes(self):
168+
return self.size * self.itemsize
169+
170+
# methods
171+
172+
def __getitem__(self, item):
173+
174+
# normalize selection
175+
selection = _normalize_array_selection(item, self._shape)
176+
177+
# determine output array shape
178+
out_shape = tuple(stop - start for start, stop in selection)
179+
180+
# setup output array
181+
out = np.empty(out_shape, dtype=self._dtype)
182+
183+
# determine indices of chunks overlapping the selection
184+
chunk_range = _get_chunk_range(selection, self._chunks)
185+
186+
# iterate over chunks in range
187+
for cidx in itertools.product(*chunk_range):
188+
189+
# determine chunk offset
190+
offset = [i * c for i, c in zip(cidx, self._chunks)]
191+
192+
# determine region within output array
193+
out_selection = tuple(
194+
slice(max(0, o - start), min(o + c - start, stop - start))
195+
for (start, stop), o, c, in zip(selection, offset, self._chunks)
196+
)
197+
198+
# determine region within chunk
199+
chunk_selection = tuple(
200+
slice(max(0, start - o), min(c, stop - o))
201+
for (start, stop), o, c in zip(selection, offset, self._chunks)
202+
)
203+
204+
# obtain the destination array as a view of the output array
205+
dest = out[out_selection]
206+
207+
# load chunk selection into output array
208+
self._chunk_getitem(cidx, chunk_selection, dest)
209+
210+
return out
211+
212+
def __array__(self):
213+
return self[:]
214+
215+
def __setitem__(self, key, value):
216+
217+
# normalize selection
218+
selection = _normalize_array_selection(key, self._shape)
219+
220+
# determine indices of chunks overlapping the selection
221+
chunk_range = _get_chunk_range(selection, self._chunks)
222+
223+
# iterate over chunks in range
224+
for cidx in itertools.product(*chunk_range):
225+
226+
# determine chunk offset
227+
offset = [i * c for i, c in zip(cidx, self._chunks)]
228+
229+
# determine required index range within chunk
230+
chunk_selection = tuple(
231+
slice(max(0, start - o), min(c, stop - o))
232+
for (start, stop), o, c in zip(selection, offset, self._chunks)
233+
)
234+
235+
if np.isscalar(value):
236+
237+
# put data
238+
self._chunk_setitem(cidx, chunk_selection, value)
239+
240+
else:
241+
# assume value is array-like
242+
243+
# determine index within value
244+
value_selection = tuple(
245+
slice(max(0, o - start), min(o + c - start, stop - start))
246+
for (start, stop), o, c, in zip(selection, offset, self._chunks)
247+
)
248+
249+
# put data
250+
self._chunk_setitem(cidx, chunk_selection, value[value_selection])
251+
252+
def _chunk_getitem(self, cidx, item, dest):
253+
254+
# override this in sub-classes, e.g., if need to use a lock
255+
256+
# obtain compressed data for chunk
257+
cdata = self._store.data[cidx]
258+
259+
if _is_total_slice(item, self._chunks) and dest.flags.c_contiguous:
260+
261+
# optimisation: we want the whole chunk, and the destination is
262+
# C contiguous, so we can decompress directly from the chunk
263+
# into the destination array
264+
decompress(cdata, dest, self._cname, self._clevel, self._shuffle)
265+
266+
else:
267+
268+
# decompress chunk
269+
chunk = np.empty(self._chunks, dtype=self._dtype)
270+
decompress(cdata, chunk, self._cname, self._clevel, self._shuffle)
271+
272+
# set data in output array
273+
# (split into two lines for profiling)
274+
tmp = chunk[item]
275+
dest[:] = tmp
276+
277+
def _chunk_setitem(self, cidx, key, value):
278+
279+
# override this in sub-classes, e.g., if need to use a lock
280+
281+
if _is_total_slice(key, self._chunks):
282+
283+
# optimisation: we are completely replacing the chunk, so no need
284+
# to access the existing chunk data
285+
286+
if np.isscalar(value):
287+
288+
# setup array filled with value
289+
chunk = np.empty(self._chunks, dtype=self._dtype)
290+
chunk.fill(value)
291+
292+
else:
293+
294+
# ensure array is C contiguous
295+
chunk = np.ascontiguousarray(value, dtype=self._dtype)
296+
297+
else:
298+
# partially replace the contents of this chunk
299+
300+
# obtain compressed data for chunk
301+
cdata = self._store.data[cidx]
302+
303+
# decompress
304+
chunk = np.empty(self._chunks, dtype=self._dtype)
305+
decompress(cdata, chunk, self._cname, self._clevel, self._shuffle)
306+
307+
# modify
308+
chunk[key] = value
309+
310+
# compress
311+
cdata = compress(chunk, self._cname, self._clevel, self._shuffle)
312+
313+
# store
314+
self._store.data[cidx] = cdata
315+
316+
def __repr__(self):
317+
# TODO
318+
pass
319+
320+
def __str__(self):
321+
# TODO
322+
pass
323+
324+
def resize(self, *args):
325+
# TODO
326+
pass
327+
328+
def append(self, data, axis=0):
329+
# TODO
330+
pass
331+
332+
# TODO
333+
334+
335+
class SynchronizedArray(Array):
336+
337+
def __init__(self, store, synchronizer):
338+
super(SynchronizedArray, self).__init__(store)
339+
self._synchronizer = synchronizer
340+
341+
# TODO

zarr/blosc.pyx

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import absolute_import, print_function, division
3+
4+
5+
import numpy as np
6+
cimport numpy as np
7+
8+
9+
def decompress(bytes cdata, np.ndarray array, bytes cname, int clevel,
10+
int shuffle):
11+
# TODO
12+
pass
13+
14+
15+
def compress(np.ndarray array, bytes cname, int clevel, int shuffle):
16+
# TODO
17+
pass

0 commit comments

Comments
 (0)