@@ -418,21 +418,35 @@ cdef class {{name}}HashTable(HashTable):
418
418
return uniques.to_array(), np.asarray(labels)
419
419
return uniques.to_array()
420
420
421
@cython.boundscheck(False)
def _unique_no_inverse(self, const {{dtype}}_t[:] values):
    """
    Return the values for which the table lookup finds no existing entry,
    in order of first appearance, inserting each of them into the table.

    Dedicated code path for ``unique`` when no inverse is requested.
    """
    cdef:
        Py_ssize_t idx, length = len(values)
        int status = 0
        {{dtype}}_t item
        khiter_t slot
        {{name}}Vector found = {{name}}Vector()
        {{name}}VectorData *buf

    buf = found.data
    with nogil:
        for idx in range(length):
            item = values[idx]
            slot = kh_get_{{dtype}}(self.table, item)
            if slot != self.table.n_buckets:
                # lookup hit an existing entry; nothing new to record
                continue
            kh_put_{{dtype}}(self.table, item, &status)
            if needs_resize(buf):
                # resizing the vector is done while holding the GIL
                with gil:
                    found.resize()
            append_data_{{dtype}}(buf, item)
    return found.to_array()
443
+
421
444
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
    """
    Dispatch to ``_unique_no_inverse`` when no inverse is requested,
    otherwise to ``_unique`` with ``return_inverse=True``.
    """
    if not return_inverse:
        return self._unique_no_inverse(values)
    return self._unique(values, uniques={{name}}Vector(),
                        ignore_na=False, return_inverse=True)
427
449
428
- def _unique_no_inverse(self, const {{dtype}}_t[:] values):
429
- return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
430
- return_inverse=False)
431
-
432
- def _unique_with_inverse(self, const {{dtype}}_t[:] values):
433
- return self._unique(values, uniques={{name}}Vector(), ignore_na=False,
434
- return_inverse=True)
435
-
436
450
def factorize(self, {{dtype}}_t[:] values):
    """
    Call ``_unique`` on ``values`` with a fresh vector, ``ignore_na=True``
    and ``return_inverse=True``.
    """
    fresh = {{name}}Vector()
    return self._unique(values, uniques=fresh, ignore_na=True,
                        return_inverse=True)
@@ -695,21 +709,46 @@ cdef class StringHashTable(HashTable):
695
709
return uniques.to_array(), np.asarray(labels)
696
710
return uniques.to_array()
697
711
712
@cython.boundscheck(False)
def _unique_no_inverse(self, ndarray[object] values):
    """
    Return the elements of ``values`` for which the table lookup finds no
    existing entry, in order of first appearance, inserting each of them
    into the table.

    Dedicated code path for ``unique`` when no inverse is requested.

    Raises
    ------
    MemoryError
        If the temporary C-string pointer buffer cannot be allocated.
    """
    cdef:
        Py_ssize_t i, count = 0, n = len(values)
        int64_t[:] uindexer
        int ret = 0
        object val
        ObjectVector uniques = ObjectVector()
        khiter_t k
        const char *v
        const char **vecs

    vecs = <const char **> malloc(n * sizeof(char *))
    # malloc(0) is allowed to return NULL, so only treat NULL as a failure
    # when a non-empty buffer was requested
    if vecs == NULL and n > 0:
        raise MemoryError()

    # try/finally guarantees the buffer is released even when the string
    # conversion or the indexer allocation raises
    try:
        uindexer = np.empty(n, dtype=np.int64)

        # first pass (with the GIL): collect the C string of every element
        for i in range(n):
            val = values[i]
            vecs[i] = util.get_c_string(val)

        # second pass (without the GIL): remember the position of every
        # element whose lookup finds no entry, and insert it
        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k == self.table.n_buckets:
                    kh_put_str(self.table, v, &ret)
                    uindexer[count] = i
                    count += 1
    finally:
        free(vecs)

    # build the result from the original Python objects
    for i in range(count):
        uniques.append(values[uindexer[i]])
    return uniques.to_array()
745
+
698
746
def unique(self, ndarray[object] values, bint return_inverse=False):
    """
    Dispatch to ``_unique_no_inverse`` when no inverse is requested,
    otherwise to ``_unique`` with ``return_inverse=True``.
    """
    if not return_inverse:
        return self._unique_no_inverse(values)
    return self._unique(values, uniques=ObjectVector(),
                        ignore_na=False, return_inverse=True)
704
751
705
- def _unique_no_inverse(self, ndarray[object] values):
706
- return self._unique(values, uniques=ObjectVector(), ignore_na=False,
707
- return_inverse=False)
708
-
709
- def _unique_with_inverse(self, ndarray[object] values):
710
- return self._unique(values, uniques=ObjectVector(), ignore_na=False,
711
- return_inverse=True)
712
-
713
752
def factorize(self, ndarray[object] values):
    """
    Call ``_unique`` on ``values`` with a fresh ``ObjectVector``,
    ``ignore_na=True`` and ``return_inverse=True``.
    """
    fresh = ObjectVector()
    return self._unique(values, uniques=fresh, ignore_na=True,
                        return_inverse=True)
@@ -852,21 +891,29 @@ cdef class PyObjectHashTable(HashTable):
852
891
return uniques.to_array(), np.asarray(labels)
853
892
return uniques.to_array()
854
893
894
def _unique_no_inverse(self, ndarray[object] values):
    """
    Return the elements of ``values`` for which the table lookup finds no
    existing entry, in order of first appearance, inserting each of them
    into the table.

    Dedicated code path for ``unique`` when no inverse is requested.
    """
    cdef:
        Py_ssize_t idx, length = len(values)
        int status = 0
        object item
        khiter_t slot
        ObjectVector found = ObjectVector()

    for idx in range(length):
        item = values[idx]
        # result is discarded; presumably called so that unhashable
        # objects raise before the table is touched
        hash(item)
        slot = kh_get_pymap(self.table, <PyObject*>item)
        if slot != self.table.n_buckets:
            # lookup hit an existing entry; nothing new to record
            continue
        kh_put_pymap(self.table, <PyObject*>item, &status)
        found.append(item)
    return found.to_array()
910
+
855
911
def unique(self, ndarray[object] values, bint return_inverse=False):
    """
    Dispatch to ``_unique_no_inverse`` when no inverse is requested,
    otherwise to ``_unique`` with ``return_inverse=True``.
    """
    if not return_inverse:
        return self._unique_no_inverse(values)
    return self._unique(values, uniques=ObjectVector(),
                        ignore_na=False, return_inverse=True)
861
916
862
- def _unique_no_inverse(self, ndarray[object] values):
863
- return self._unique(values, uniques=ObjectVector(), ignore_na=False,
864
- return_inverse=False)
865
-
866
- def _unique_with_inverse(self, ndarray[object] values):
867
- return self._unique(values, uniques=ObjectVector(), ignore_na=False,
868
- return_inverse=True)
869
-
870
917
def factorize(self, ndarray[object] values):
    """
    Call ``_unique`` on ``values`` with a fresh ``ObjectVector``,
    ``ignore_na=True`` and ``return_inverse=True``.
    """
    fresh = ObjectVector()
    return self._unique(values, uniques=fresh, ignore_na=True,
                        return_inverse=True)
0 commit comments