Skip to content

Commit dbe4e0e

Browse files
committed
Add separate functions for return_inverse=False
1 parent 52ae84e commit dbe4e0e

File tree

1 file changed

+80
-33
lines changed

1 file changed

+80
-33
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 80 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -418,21 +418,35 @@ cdef class {{name}}HashTable(HashTable):
418418
return uniques.to_array(), np.asarray(labels)
419419
return uniques.to_array()
420420

421+
@cython.boundscheck(False)
def _unique_no_inverse(self, const {{dtype}}_t[:] values):
    """
    Return the entries of ``values`` that are not yet keys of this table.

    Each such entry is inserted into ``self.table`` and appended to the
    result in the order it is first encountered. Entries that are already
    keys of the table when they are visited (including keys present before
    this call) are skipped. No inverse/label array is built, which is the
    reason this path exists separately from ``_unique``.

    Parameters
    ----------
    values : const {{dtype}}_t[:]
        One-dimensional typed memoryview of the values to scan.

    Returns
    -------
    The result of ``uniques.to_array()`` for the collected values.
    """
    cdef:
        Py_ssize_t pos, n_values = len(values)
        int put_status = 0
        {{dtype}}_t item
        khiter_t slot
        {{name}}Vector uniques = {{name}}Vector()
        {{name}}VectorData *buf

    buf = uniques.data

    with nogil:
        for pos in range(n_values):
            item = values[pos]
            slot = kh_get_{{dtype}}(self.table, item)
            if slot != self.table.n_buckets:
                # lookup did not return the end-of-table sentinel:
                # the key already exists, nothing to record
                continue

            kh_put_{{dtype}}(self.table, item, &put_status)
            if needs_resize(buf):
                # resize() is a method on a Python-level object, so the
                # GIL has to be re-acquired around the call
                with gil:
                    uniques.resize()
            append_data_{{dtype}}(buf, item)

    return uniques.to_array()
443+
421444
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
    """
    Compute the unique entries of ``values``.

    Parameters
    ----------
    values : const {{dtype}}_t[:]
        Values to scan.
    return_inverse : bint, default False
        When False, dispatch to the dedicated ``_unique_no_inverse`` fast
        path. When True, dispatch to the general ``_unique`` with
        ``ignore_na=False`` and ``return_inverse=True``.

    Returns
    -------
    Whatever the selected helper returns.
    """
    if not return_inverse:
        return self._unique_no_inverse(values)
    return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
                        return_inverse=True)

428-
def _unique_no_inverse(self, const {{dtype}}_t[:] values):
429-
return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
430-
return_inverse=False)
431-
432-
def _unique_with_inverse(self, const {{dtype}}_t[:] values):
433-
return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
434-
return_inverse=True)
435-
436450
def factorize(self, {{dtype}}_t[:] values):
    """
    Run ``_unique`` on ``values`` with ``ignore_na=True`` and
    ``return_inverse=True``, collecting into a fresh ``{{name}}Vector``.

    Returns whatever ``_unique`` returns for those arguments.
    """
    return self._unique(
        values,
        uniques={{name}}Vector(),
        ignore_na=True,
        return_inverse=True,
    )
@@ -695,21 +709,46 @@ cdef class StringHashTable(HashTable):
695709
return uniques.to_array(), np.asarray(labels)
696710
return uniques.to_array()
697711

712+
@cython.boundscheck(False)
def _unique_no_inverse(self, ndarray[object] values):
    """
    Return the entries of ``values`` whose C string is not yet a key of
    this table.

    The work is done in three passes:

    1. with the GIL held, fetch a C string pointer for every element;
    2. without the GIL, probe/insert each pointer into ``self.table`` and
       record the positions of first occurrences;
    3. with the GIL held again, gather the original Python objects at
       those positions.

    No inverse/label array is built, which is the reason this path exists
    separately from ``_unique``.

    Parameters
    ----------
    values : ndarray[object]
        Objects to scan; each element is passed to ``util.get_c_string``.

    Returns
    -------
    The result of ``uniques.to_array()`` for the collected objects, in
    order of first occurrence.

    Raises
    ------
    MemoryError
        If the temporary pointer buffer cannot be allocated.
    """
    cdef:
        Py_ssize_t i, count = 0, n = len(values)
        int64_t[:] uindexer
        int ret = 0
        object val
        ObjectVector uniques
        khiter_t k
        const char *v
        const char **vecs

    # Allocate the Python-managed index buffer before the raw C buffer so
    # that a failure here cannot leak the malloc'ed memory.
    uindexer = np.empty(n, dtype=np.int64)

    vecs = <const char **>malloc(n * sizeof(char *))
    # malloc(0) is allowed to return NULL, so only treat NULL as an
    # allocation failure when memory was actually requested.
    if vecs == NULL and n > 0:
        raise MemoryError()

    try:
        # Pass 1: collect the C string pointers while holding the GIL.
        # NOTE(review): the pointers presumably reference memory owned by
        # the elements of ``values``; confirm against util.get_c_string.
        for i in range(n):
            val = values[i]
            vecs[i] = util.get_c_string(val)

        # Pass 2: hash-table work needs no Python objects, release the GIL.
        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k == self.table.n_buckets:
                    # lookup returned the end-of-table sentinel: new key
                    kh_put_str(self.table, v, &ret)
                    uindexer[count] = i
                    count += 1
    finally:
        # Release the pointer buffer on every exit path, including when
        # the string conversion above raises.
        free(vecs)

    # Pass 3: gather the Python objects at the first-occurrence positions.
    uniques = ObjectVector()
    for i in range(count):
        uniques.append(values[uindexer[i]])
    return uniques.to_array()
745+
698746
def unique(self, ndarray[object] values, bint return_inverse=False):
    """
    Compute the unique entries of ``values``.

    Parameters
    ----------
    values : ndarray[object]
        Objects to scan.
    return_inverse : bint, default False
        When False, dispatch to the dedicated ``_unique_no_inverse`` fast
        path. When True, dispatch to the general ``_unique`` with
        ``ignore_na=False`` and ``return_inverse=True``.

    Returns
    -------
    Whatever the selected helper returns.
    """
    if not return_inverse:
        return self._unique_no_inverse(values)
    return self._unique(values, uniques=ObjectVector(), ignore_na=False,
                        return_inverse=True)

705-
def _unique_no_inverse(self, ndarray[object] values):
706-
return self._unique(values, uniques=ObjectVector(), ignore_na=False,
707-
return_inverse=False)
708-
709-
def _unique_with_inverse(self, ndarray[object] values):
710-
return self._unique(values, uniques=ObjectVector(), ignore_na=False,
711-
return_inverse=True)
712-
713752
def factorize(self, ndarray[object] values):
    """
    Run ``_unique`` on ``values`` with ``ignore_na=True`` and
    ``return_inverse=True``, collecting into a fresh ``ObjectVector``.

    Returns whatever ``_unique`` returns for those arguments.
    """
    return self._unique(
        values,
        uniques=ObjectVector(),
        ignore_na=True,
        return_inverse=True,
    )
@@ -852,21 +891,29 @@ cdef class PyObjectHashTable(HashTable):
852891
return uniques.to_array(), np.asarray(labels)
853892
return uniques.to_array()
854893

894+
def _unique_no_inverse(self, ndarray[object] values):
    """
    Return the entries of ``values`` that are not yet keys of this table.

    Each such object is inserted into ``self.table`` and appended to the
    result in the order it is first encountered. Objects that are already
    keys of the table when they are visited (including keys present before
    this call) are skipped. No inverse/label array is built, which is the
    reason this path exists separately from ``_unique``.

    Parameters
    ----------
    values : ndarray[object]
        Objects to scan; ``hash()`` is called on every element.

    Returns
    -------
    The result of ``uniques.to_array()`` for the collected objects.
    """
    cdef:
        Py_ssize_t pos, n_values = len(values)
        int put_status = 0
        object item
        khiter_t slot
        ObjectVector uniques = ObjectVector()

    for pos in range(n_values):
        item = values[pos]
        # The result is discarded; presumably this is here so unhashable
        # objects raise before the table is touched -- TODO confirm.
        hash(item)
        slot = kh_get_pymap(self.table, <PyObject*>item)
        if slot != self.table.n_buckets:
            # lookup did not return the end-of-table sentinel:
            # the key already exists, nothing to record
            continue
        kh_put_pymap(self.table, <PyObject*>item, &put_status)
        uniques.append(item)

    return uniques.to_array()
910+
855911
def unique(self, ndarray[object] values, bint return_inverse=False):
    """
    Compute the unique entries of ``values``.

    Parameters
    ----------
    values : ndarray[object]
        Objects to scan.
    return_inverse : bint, default False
        When False, dispatch to the dedicated ``_unique_no_inverse`` fast
        path. When True, dispatch to the general ``_unique`` with
        ``ignore_na=False`` and ``return_inverse=True``.

    Returns
    -------
    Whatever the selected helper returns.
    """
    if not return_inverse:
        return self._unique_no_inverse(values)
    return self._unique(values, uniques=ObjectVector(), ignore_na=False,
                        return_inverse=True)

862-
def _unique_no_inverse(self, ndarray[object] values):
863-
return self._unique(values, uniques=ObjectVector(), ignore_na=False,
864-
return_inverse=False)
865-
866-
def _unique_with_inverse(self, ndarray[object] values):
867-
return self._unique(values, uniques=ObjectVector(), ignore_na=False,
868-
return_inverse=True)
869-
870917
def factorize(self, ndarray[object] values):
    """
    Run ``_unique`` on ``values`` with ``ignore_na=True`` and
    ``return_inverse=True``, collecting into a fresh ``ObjectVector``.

    Returns whatever ``_unique`` returns for those arguments.
    """
    return self._unique(
        values,
        uniques=ObjectVector(),
        ignore_na=True,
        return_inverse=True,
    )

0 commit comments

Comments
 (0)