Skip to content

Commit f0706b1

Browse files
committed
PERF: value_counts_float64 #10821
1 parent 13cb1a7 commit f0706b1

File tree

4 files changed

+58
-1
lines changed

4 files changed

+58
-1
lines changed

Diff for: doc/source/whatsnew/v0.17.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,7 @@ Performance Improvements
585585
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
586586
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
587587
- Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`)
588-
588+
- 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`)
589589

590590
.. _whatsnew_0170.bug_fixes:
591591

Diff for: pandas/core/algorithms.py

+3
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
245245
elif com.is_integer_dtype(dtype):
246246
values = com._ensure_int64(values)
247247
keys, counts = htable.value_count_int64(values)
248+
elif com.is_float_dtype(dtype):
249+
values = com._ensure_float64(values)
250+
keys, counts = htable.value_count_float64(values, dropna)
248251

249252
else:
250253
values = com._ensure_object(values)

Diff for: pandas/hashtable.pyx

+45
Original file line numberDiff line numberDiff line change
@@ -866,7 +866,52 @@ cdef class Int64Factorizer:
866866
self.count = len(self.uniques)
867867
return labels
868868

869+
@cython.boundscheck(False)
cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dropna):
    """
    Tally every element of ``values`` into ``table`` (value -> count).

    When ``dropna`` is true, elements that do not compare equal to
    themselves (i.e. NaN) are skipped; otherwise they are tallied too.
    The table is resized for ``len(values)`` entries before the pass.
    """
    cdef:
        Py_ssize_t pos, n_values = len(values)
        float64_t item
        khiter_t slot
        int status = 0

    with nogil:
        kh_resize_float64(table, n_values)

        for pos in range(n_values):
            item = values[pos]
            if dropna and item != item:
                # NaN is the only float that differs from itself
                continue

            slot = kh_get_float64(table, item)
            if slot == table.n_buckets:
                # n_buckets is treated as the "key not found" marker:
                # first sighting, so insert the key with a count of one
                slot = kh_put_float64(table, item, &status)
                table.vals[slot] = 1
            else:
                table.vals[slot] += 1
889+
890+
@cython.boundscheck(False)
cpdef value_count_float64(float64_t[:] values, bint dropna):
    """
    Count the occurrences of each distinct value in a float64 buffer.

    Parameters
    ----------
    values : float64_t[:]
        Values to tally.
    dropna : bint
        Forwarded to ``build_count_table_float64``, which leaves out
        elements that do not compare equal to themselves (NaN) when set.

    Returns
    -------
    (keys, counts) : tuple of ndarray
        ``keys`` (float64) holds the distinct values and ``counts``
        (int64) the matching tallies, both in hash-table bucket order.
    """
    cdef:
        Py_ssize_t i = 0
        kh_float64_t *table
        float64_t[:] result_keys
        int64_t[:] result_counts
        # same iterator type as the table builder uses; n_buckets is
        # unsigned, so a plain ``int`` mixes signedness in the loop bound
        khiter_t k

    table = kh_init_float64()
    try:
        build_count_table_float64(values, table, dropna)

        result_keys = np.empty(table.n_occupied, dtype=np.float64)
        result_counts = np.zeros(table.n_occupied, dtype=np.int64)

        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_float64(table, k):
                    result_keys[i] = table.keys[k]
                    result_counts[i] = table.vals[k]
                    i += 1
    finally:
        # release the C hash table even if building the table or
        # allocating the result arrays raised
        kh_destroy_float64(table)

    return np.asarray(result_keys), np.asarray(result_counts)
870915

871916
@cython.boundscheck(False)
872917
cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table):

Diff for: vb_suite/groupby.py

+9
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,15 @@ def f():
194194
series_value_counts_strings = Benchmark('s.value_counts()', setup,
195195
start_date=datetime(2011, 10, 21))
196196

197+
# value_counts on float dtype
#
# Benchmarks ``s.value_counts()`` on a 100000-element Series built with
# np.random.randint(0, 1000, size=100000) and then cast to float.
198+
199+
setup = common_setup + """
200+
s = Series(np.random.randint(0, 1000, size=100000)).astype(float)
201+
"""
202+
203+
series_value_counts_float64 = Benchmark('s.value_counts()', setup,
204+
start_date=datetime(2015, 8, 17))
205+
197206
#----------------------------------------------------------------------
198207
# pivot_table
199208

0 commit comments

Comments
 (0)