Skip to content

Commit 487cee0

Browse files
committed
Preliminary GIL-release (nogil) implementation for Int64HashTable
1 parent ed234c5 commit 487cee0

File tree

2 files changed

+86
-86
lines changed

2 files changed

+86
-86
lines changed

pandas/hashtable.pyx

Lines changed: 78 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -49,18 +49,11 @@ def list_to_object_array(list obj):
4949

5050
cdef size_t _INIT_VEC_CAP = 32
5151

52-
cdef class ObjectVector:
52+
cdef class Vector:
5353

5454
cdef:
5555
size_t n, m
5656
ndarray ao
57-
PyObject **data
58-
59-
def __cinit__(self):
60-
self.n = 0
61-
self.m = _INIT_VEC_CAP
62-
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
63-
self.data = <PyObject**> self.ao.data
6457

6558
def __len__(self):
6659
return self.n
@@ -70,6 +63,18 @@ cdef class ObjectVector:
7063
self.m = self.n
7164
return self.ao
7265

66+
67+
cdef class ObjectVector(Vector):
68+
69+
cdef:
70+
PyObject **data
71+
72+
def __cinit__(self):
73+
self.n = 0
74+
self.m = _INIT_VEC_CAP
75+
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
76+
self.data = <PyObject**> self.ao.data
77+
7378
cdef inline append(self, object o):
7479
if self.n == self.m:
7580
self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -81,11 +86,9 @@ cdef class ObjectVector:
8186
self.n += 1
8287

8388

84-
cdef class Int64Vector:
89+
cdef class Int64Vector(Vector):
8590

8691
cdef:
87-
size_t n, m
88-
ndarray ao
8992
int64_t *data
9093

9194
def __cinit__(self):
@@ -94,28 +97,29 @@ cdef class Int64Vector:
9497
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.int64)
9598
self.data = <int64_t*> self.ao.data
9699

97-
def __len__(self):
98-
return self.n
100+
cdef inline uint8_t needs_resize(self) nogil:
101+
# if we need to resize
102+
return self.n == self.m
99103

100-
def to_array(self):
101-
self.ao.resize(self.n)
102-
self.m = self.n
103-
return self.ao
104+
cdef resize(self):
105+
self.m = max(self.m * 2, _INIT_VEC_CAP)
106+
self.ao.resize(self.m)
107+
self.data = <int64_t*> self.ao.data
104108

105-
cdef inline append(self, int64_t x):
106-
if self.n == self.m:
107-
self.m = max(self.m * 2, _INIT_VEC_CAP)
108-
self.ao.resize(self.m)
109-
self.data = <int64_t*> self.ao.data
109+
cdef inline void append(self, int64_t x) nogil:
110110

111-
self.data[self.n] = x
112-
self.n += 1
111+
with nogil:
112+
113+
if self.needs_resize():
114+
with gil:
115+
self.resize()
113116

114-
cdef class Float64Vector:
117+
self.data[self.n] = x
118+
self.n += 1
119+
120+
cdef class Float64Vector(Vector):
115121

116122
cdef:
117-
size_t n, m
118-
ndarray ao
119123
float64_t *data
120124

121125
def __cinit__(self):
@@ -124,14 +128,6 @@ cdef class Float64Vector:
124128
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
125129
self.data = <float64_t*> self.ao.data
126130

127-
def __len__(self):
128-
return self.n
129-
130-
def to_array(self):
131-
self.ao.resize(self.n)
132-
self.m = self.n
133-
return self.ao
134-
135131
cdef inline append(self, float64_t x):
136132
if self.n == self.m:
137133
self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -142,18 +138,17 @@ cdef class Float64Vector:
142138
self.n += 1
143139

144140

145-
cdef class HashTable:
146-
pass
147-
148-
149-
cdef class StringHashTable(HashTable):
141+
cdef class StringHashTable:
150142
cdef kh_str_t *table
151143

152144
def __cinit__(self, int size_hint=1):
153145
self.table = kh_init_str()
154146
if size_hint is not None:
155147
kh_resize_str(self.table, size_hint)
156148

149+
def __len__(self):
150+
return self.table.size
151+
157152
def __dealloc__(self):
158153
kh_destroy_str(self.table)
159154

@@ -256,7 +251,7 @@ cdef class StringHashTable(HashTable):
256251

257252
return reverse, labels
258253

259-
cdef class Int32HashTable(HashTable):
254+
cdef class Int32HashTable:
260255
cdef kh_int32_t *table
261256

262257
def __init__(self, size_hint=1):
@@ -266,6 +261,9 @@ cdef class Int32HashTable(HashTable):
266261
def __cinit__(self):
267262
self.table = kh_init_int32()
268263

264+
def __len__(self):
265+
return self.table.size
266+
269267
def __dealloc__(self):
270268
kh_destroy_int32(self.table)
271269

@@ -353,14 +351,16 @@ cdef class Int32HashTable(HashTable):
353351

354352
return reverse, labels
355353

356-
cdef class Int64HashTable: #(HashTable):
357-
# cdef kh_int64_t *table
354+
cdef class Int64HashTable:
358355

359356
def __cinit__(self, size_hint=1):
360357
self.table = kh_init_int64()
361358
if size_hint is not None:
362359
kh_resize_int64(self.table, size_hint)
363360

361+
def __len__(self):
362+
return self.table.size
363+
364364
def __dealloc__(self):
365365
kh_destroy_int64(self.table)
366366

@@ -369,9 +369,6 @@ cdef class Int64HashTable: #(HashTable):
369369
k = kh_get_int64(self.table, key)
370370
return k != self.table.n_buckets
371371

372-
def __len__(self):
373-
return self.table.size
374-
375372
cpdef get_item(self, int64_t val):
376373
cdef khiter_t k
377374
k = kh_get_int64(self.table, val)
@@ -446,6 +443,7 @@ cdef class Int64HashTable: #(HashTable):
446443
labels = self.get_labels(values, reverse, 0)
447444
return reverse, labels
448445

446+
@cython.boundscheck(False)
449447
def get_labels(self, ndarray[int64_t] values, Int64Vector uniques,
450448
Py_ssize_t count_prior, Py_ssize_t na_sentinel):
451449
cdef:
@@ -458,21 +456,23 @@ cdef class Int64HashTable: #(HashTable):
458456

459457
labels = np.empty(n, dtype=np.int64)
460458

461-
for i in range(n):
462-
val = values[i]
463-
k = kh_get_int64(self.table, val)
464-
if k != self.table.n_buckets:
465-
idx = self.table.vals[k]
466-
labels[i] = idx
467-
else:
468-
k = kh_put_int64(self.table, val, &ret)
469-
self.table.vals[k] = count
470-
uniques.append(val)
471-
labels[i] = count
472-
count += 1
459+
with nogil:
460+
for i in range(n):
461+
val = values[i]
462+
k = kh_get_int64(self.table, val)
463+
if k != self.table.n_buckets:
464+
idx = self.table.vals[k]
465+
labels[i] = idx
466+
else:
467+
k = kh_put_int64(self.table, val, &ret)
468+
self.table.vals[k] = count
469+
uniques.append(val)
470+
labels[i] = count
471+
count += 1
473472

474473
return labels
475474

475+
@cython.boundscheck(False)
476476
def get_labels_groupby(self, ndarray[int64_t] values):
477477
cdef:
478478
Py_ssize_t i, n = len(values)
@@ -485,24 +485,25 @@ cdef class Int64HashTable: #(HashTable):
485485

486486
labels = np.empty(n, dtype=np.int64)
487487

488-
for i in range(n):
489-
val = values[i]
490-
491-
# specific for groupby
492-
if val < 0:
493-
labels[i] = -1
494-
continue
495-
496-
k = kh_get_int64(self.table, val)
497-
if k != self.table.n_buckets:
498-
idx = self.table.vals[k]
499-
labels[i] = idx
500-
else:
501-
k = kh_put_int64(self.table, val, &ret)
502-
self.table.vals[k] = count
503-
uniques.append(val)
504-
labels[i] = count
505-
count += 1
488+
with nogil:
489+
for i in range(n):
490+
val = values[i]
491+
492+
# specific for groupby
493+
if val < 0:
494+
labels[i] = -1
495+
continue
496+
497+
k = kh_get_int64(self.table, val)
498+
if k != self.table.n_buckets:
499+
idx = self.table.vals[k]
500+
labels[i] = idx
501+
else:
502+
k = kh_put_int64(self.table, val, &ret)
503+
self.table.vals[k] = count
504+
uniques.append(val)
505+
labels[i] = count
506+
count += 1
506507

507508
arr_uniques = uniques.to_array()
508509

@@ -530,6 +531,7 @@ cdef class Int64HashTable: #(HashTable):
530531

531532

532533
cdef class Float64HashTable(HashTable):
534+
533535
def __cinit__(self, size_hint=1):
534536
self.table = kh_init_float64()
535537
if size_hint is not None:
@@ -658,7 +660,6 @@ cdef class Float64HashTable(HashTable):
658660
na_sentinel = object
659661

660662
cdef class PyObjectHashTable(HashTable):
661-
# cdef kh_pymap_t *table
662663

663664
def __init__(self, size_hint=1):
664665
self.table = kh_init_pymap()

pandas/src/khash.pxd

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -62,15 +62,15 @@ cdef extern from "khash_python.h":
6262
int64_t *keys
6363
size_t *vals
6464

65-
inline kh_int64_t* kh_init_int64()
66-
inline void kh_destroy_int64(kh_int64_t*)
67-
inline void kh_clear_int64(kh_int64_t*)
68-
inline khint_t kh_get_int64(kh_int64_t*, int64_t)
69-
inline void kh_resize_int64(kh_int64_t*, khint_t)
70-
inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*)
71-
inline void kh_del_int64(kh_int64_t*, khint_t)
65+
inline kh_int64_t* kh_init_int64() nogil
66+
inline void kh_destroy_int64(kh_int64_t*) nogil
67+
inline void kh_clear_int64(kh_int64_t*) nogil
68+
inline khint_t kh_get_int64(kh_int64_t*, int64_t) nogil
69+
inline void kh_resize_int64(kh_int64_t*, khint_t) nogil
70+
inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
71+
inline void kh_del_int64(kh_int64_t*, khint_t) nogil
7272

73-
bint kh_exist_int64(kh_int64_t*, khiter_t)
73+
bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
7474

7575
ctypedef struct kh_float64_t:
7676
khint_t n_buckets, size, n_occupied, upper_bound
@@ -121,4 +121,3 @@ cdef extern from "khash_python.h":
121121
inline void kh_del_strbox(kh_strbox_t*, khint_t)
122122

123123
bint kh_exist_strbox(kh_strbox_t*, khiter_t)
124-

0 commit comments

Comments
 (0)