
Commit 048df33 (parent: b867166)

evaluate jsonpickle to serialize/deserialize objects

verdict: an order of magnitude (~8x on the results below) slower than 'custom' serialization

these are baseline results on @karlicoss desktop pc (initialization + hitting the cache):

    11.13s call     src/cachew/tests/test_cachew.py::test_many[1000000-False]
    5.61s  call     src/cachew/tests/test_cachew.py::test_many[500000-False]
    1.13s  call     src/cachew/tests/test_cachew.py::test_many[100000-True]

these are the results with jsonpickle.. not great:

    86.42s call     src/cachew/tests/test_cachew.py::test_many[1000000-False]
    44.08s call     src/cachew/tests/test_cachew.py::test_many[500000-False]
    8.78s  call     src/cachew/tests/test_cachew.py::test_many[100000-True]
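For context, a minimal micro-benchmark sketch (mine, not part of the commit) of the kind of per-object round-trip comparison behind this verdict. MyRecord and the helper functions are hypothetical stand-ins, and the manual path is far simpler than cachew's real NTBinder, so the gap it shows is only indicative:

import time
from typing import NamedTuple

import jsonpickle


class MyRecord(NamedTuple):
    # hypothetical stand-in for the nested TE2 records used by test_many
    value: int
    value2: int


def roundtrip_jsonpickle(obj: MyRecord) -> MyRecord:
    # serialize to a JSON string and back, as the patched cachew does per object
    return jsonpickle.decode(jsonpickle.encode(obj))


def roundtrip_manual(obj: MyRecord) -> MyRecord:
    # crude stand-in for the 'custom' serialization: flatten to a plain tuple and rebuild
    return MyRecord(*tuple(obj))


def bench(fn, n: int = 100_000) -> float:
    objs = [MyRecord(value=i, value2=i) for i in range(n)]
    start = time.perf_counter()
    for o in objs:
        fn(o)
    return time.perf_counter() - start


if __name__ == '__main__':
    print(f'jsonpickle roundtrip: {bench(roundtrip_jsonpickle):.2f}s')
    print(f'manual roundtrip    : {bench(roundtrip_manual):.2f}s')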

File tree

2 files changed: +31 −108 lines

src/cachew/__init__.py

Lines changed: 12 additions & 105 deletions
@@ -18,6 +18,8 @@
 import warnings
 
 
+import jsonpickle
+
 import appdirs
 
 import sqlalchemy
@@ -403,103 +405,6 @@ def make(tp: Type[NT], name: Optional[str]=None) -> 'NTBinder[NT]':
             fields=fields,
         )
 
-    @property
-    def columns(self) -> List[Column]:
-        return list(self.iter_columns())
-
-    # TODO not necessarily namedtuple? could be primitive type
-    def to_row(self, obj: NT) -> Tuple[Optional[Values], ...]:
-        return tuple(self._to_row(obj))
-
-    def from_row(self, row: Iterable[Any]) -> NT:
-        riter = iter(row)
-        res = self._from_row(riter)
-        remaining = list(islice(riter, 0, 1))
-        if len(remaining) != 0:
-            raise CachewException(f'unconsumed items in iterator {remaining}')
-        assert res is not None  # nosec # help mypy; top level will not be None
-        return res
-
-
-    def _to_row(self, obj) -> Iterator[Optional[Values]]:
-        if self.primitive:
-            yield obj
-        elif self.union is not None:
-            CachewUnion = self.union
-            (uf,) = self.fields
-            # TODO assert only one of them matches??
-            union = CachewUnion(**{
-                f.name: obj if isinstance(obj, f.type_) else None
-                for f in uf.fields
-            })
-            yield from uf._to_row(union)
-        else:
-            if self.optional:
-                is_none = obj is None
-                yield is_none
-            else:
-                is_none = False; assert obj is not None  # TODO hmm, that last assert is not very symmetric...
-
-            if is_none:
-                for _ in range(self.span - 1):
-                    yield None
-            else:
-                yield from chain.from_iterable(
-                    f._to_row(getattr(obj, f.name))
-                    for f in self.fields
-                )
-
-    def _from_row(self, row_iter):
-        if self.primitive:
-            return next(row_iter)
-        elif self.union is not None:
-            CachewUnion = self.union
-            (uf,) = self.fields
-            # TODO assert only one of them is not None?
-            union_params = [
-                r
-                for r in uf._from_row(row_iter) if r is not None
-            ]
-            assert len(union_params) == 1, union_params
-            return union_params[0]
-        else:
-            if self.optional:
-                is_none = next(row_iter)
-            else:
-                is_none = False
-
-            if is_none:
-                for _ in range(self.span - 1):
-                    x = next(row_iter)
-                    assert x is None, x  # huh. assert is kinda opposite of producing value
-                return None
-            else:
-                return self.type_(*(
-                    f._from_row(row_iter)
-                    for f in self.fields
-                ))
-
-    # TODO not sure if we want to allow optionals on top level?
-    def iter_columns(self) -> Iterator[Column]:
-        used_names: Set[str] = set()
-
-        def col(name: str, tp) -> Column:
-            while name in used_names:
-                name = '_' + name
-            used_names.add(name)
-            return Column(name, tp)
-
-        if self.primitive:
-            if self.name is None: raise AssertionError
-            yield col(self.name, PRIMITIVES[self.type_])
-        else:
-            prefix = '' if self.name is None else self.name + '_'
-            if self.optional:
-                yield col(f'_{prefix}is_null', sqlalchemy.Boolean)
-            for f in self.fields:
-                for c in f.iter_columns():
-                    yield col(f'{prefix}{c.name}', c.type)
-
     def __str__(self):
         lines = ['  ' * level + str(x.name) + ('?' if x.optional else '') + f' <span {x.span}>' for level, x in self.flatten()]
         return '\n'.join(lines)
@@ -562,9 +467,10 @@ def do_begin(conn):
 
         self.binder = NTBinder.make(tp=cls)
         # actual cache
-        self.table_cache     = Table('cache'    , self.meta, *self.binder.columns)
+        # FIXME change table definition
+        self.table_cache     = Table('cache'    , self.meta, Column('data', sqlalchemy.String))
         # temporary table, we use it to insert and then (atomically?) rename to the above table at the very end
-        self.table_cache_tmp = Table('cache_tmp', self.meta, *self.binder.columns)
+        self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.String))
 
     def __enter__(self) -> 'DbHelper':
         return self
@@ -882,7 +788,7 @@ def composite_hash(self, *args, **kwargs) -> Dict[str, Any]:
         }
         kwargs = {**defaults, **kwargs}
         binder = NTBinder.make(tp=self.cls_)
-        schema = str(binder.columns)  # todo not super nice, but works fine for now
+        schema = str('FIXME')  # todo not super nice, but works fine for now
         hash_parts = {
             'cachew'      : CACHEW_VERSION,
             'schema'      : schema,
@@ -996,8 +902,8 @@ def cachew_wrapper(
 
     def cached_items():
         rows = conn.execute(table_cache.select())
-        for row in rows:
-            yield binder.from_row(row)
+        for (js,) in rows:
+            yield jsonpickle.decode(js)
 
     if new_hash == old_hash:
         logger.debug('hash matched: loading from cache')
@@ -1107,7 +1013,7 @@ def flush() -> None:
                     dict(zip(column_names, row))
                     for row in chunk
                 ]
-                conn.execute(insert_into_table_cache_tmp, chunk_dict)
+                conn.execute(insert_into_table_cache_tmp, [{'data': c} for c in chunk])
                 chunk = []
 
         total_objects = 0
@@ -1118,8 +1024,9 @@ def flush() -> None:
             except GeneratorExit:
                 early_exit = True
                 return
-
-            chunk.append(binder.to_row(d))
+
+            js = jsonpickle.encode(d)
+            chunk.append(js)
             if len(chunk) >= chunk_by:
                 flush()
         flush()
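To make the new storage scheme concrete: a self-contained round-trip sketch (my illustration, not code from the commit) of the single String column holding one jsonpickle payload per row, assuming an in-memory SQLite database and a hypothetical Item type:

from typing import NamedTuple

import jsonpickle
import sqlalchemy
from sqlalchemy import Column, MetaData, Table, create_engine


class Item(NamedTuple):
    # hypothetical cached type
    value: int


engine = create_engine('sqlite://')  # in-memory database
meta = MetaData()
# mirrors the patched table definition: a single 'data' column per cached object
table_cache = Table('cache', meta, Column('data', sqlalchemy.String))
meta.create_all(engine)

with engine.connect() as conn:
    # write path: one jsonpickle-encoded string per object
    conn.execute(
        table_cache.insert(),
        [{'data': jsonpickle.encode(Item(value=i))} for i in range(3)],
    )
    # read path: decode each row back into the original object
    items = [jsonpickle.decode(js) for (js,) in conn.execute(table_cache.select())]
    assert items == [Item(value=0), Item(value=1), Item(value=2)]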

src/cachew/tests/test_cachew.py

Lines changed: 19 additions & 3 deletions
@@ -295,8 +295,9 @@ class TE2(NamedTuple):
 # e.g. -k 'test_many[500000-False]'
 # fmt: off
 @pytest.mark.parametrize('count,on_ci', [
-    (100000, True),
-    (500000, False),
+    (100_000, True),
+    (500_000, False),
+    (1_000_000, False),
 ])
 # fmt: on
 def test_many(count: int, on_ci: bool, tmp_path: Path) -> None:
@@ -317,7 +318,22 @@ def iter_data() -> Iterator[TE2]:
 
     assert ilen(iter_data()) == count  # initial
     assert ilen(iter_data()) == count  # hitting cache
-    assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)
+
+    # these are baseline results on @karlicoss desktop pc (initialization + hitting the cache)
+    # 11.13s call     src/cachew/tests/test_cachew.py::test_many[1000000-False]
+    # 5.61s call     src/cachew/tests/test_cachew.py::test_many[500000-False]
+    # 1.13s call     src/cachew/tests/test_cachew.py::test_many[100000-True]
+
+
+    # these are results with jsonpickle.. not great
+    # 86.42s call     src/cachew/tests/test_cachew.py::test_many[1000000-False]
+    # 44.08s call     src/cachew/tests/test_cachew.py::test_many[500000-False]
+    # 8.78s call     src/cachew/tests/test_cachew.py::test_many[100000-True]
+
+
+
+    # assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)
+
 
     # serializing to db
     # in-memory: 16 seconds
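For reference, a sketch of the record shapes the benchmark iterates over, reconstructed from the commented-out assert in the hunk above (the int field types are an assumption, and make_data is a hypothetical stand-in for the test's iter_data):

from typing import Iterator, NamedTuple


class UUU(NamedTuple):
    xx: int
    yy: int


class TE2(NamedTuple):
    value: int
    uuu: UUU
    value2: int


def make_data(count: int) -> Iterator[TE2]:
    # yields count nested records, so that the last one is
    # TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)
    for i in range(count):
        yield TE2(value=i, uuu=UUU(xx=i, yy=i), value2=i)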
