Skip to content

Commit df6853a

Browse files
Factorize fix (#11)
* Updated for factorize * BUG: Fixed factorization rules * Updated requirements * Fixed old factorize test * Added to appveyor * Added numba to conda build recipe
1 parent 3e375c2 commit df6853a

File tree

9 files changed

+119
-23
lines changed

9 files changed

+119
-23
lines changed

.appveyor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ install:
7070

7171
# create our env
7272
- cmd: conda install -q -y conda-build anaconda-client
73-
- cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numpy pytest pytest-cov python-dateutil pytz six
73+
- cmd: conda create -q -n test-environment python=%PYTHON_VERSION% coverage cython flake8 hypothesis numba numpy pytest pytest-cov python-dateutil pytz six
7474
- cmd: activate test-environment
7575
- cmd: conda list -n test-environment
7676

ci/environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ dependencies:
77
- flake8
88
- ipython
99
- matplotlib
10+
- numba
1011
- numpy
1112
- numpydoc
1213
- pandas

ci/install-travis.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ conda install -q \
3232
cython \
3333
flake8 \
3434
hypothesis \
35+
numba \
3536
numpy \
3637
pytest \
3738
pytest-cov \

conda-recipes/cyberpandas/meta.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@ requirements:
1616
- setuptools >=3.3
1717

1818
run:
19+
- ipaddress # [py27]
20+
- numba
21+
- pandas
1922
- python
2023
- setuptools >=3.3
21-
- pandas
22-
- ipaddress # [py27]
2324

2425
test:
2526
imports:

cyberpandas/_utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Utilities for working with IP address data."""
22
import struct
33

4+
import numba
45
import six
56

67

@@ -31,3 +32,34 @@ def combine(hi, lo):
3132
# type: (int, int) -> int
3233
"""Combine the hi and lo bytes into the final ip address."""
3334
return (hi << 64) + lo
35+
36+
37+
@numba.jit(nopython=True)
def refactorize(arr, first_na, na_sentinel=-1):
    """
    Rewrite factorization codes *in place* so missing values use a sentinel.

    The code stored at ``arr[first_na]`` is taken to be the code that was
    handed out to missing values. Every occurrence of that code is
    overwritten with `na_sentinel`, and every larger code is reduced by
    one so that the remaining codes have no gap.

    Parameters
    ----------
    arr : ndarray
        First return value from :meth:`pandas.factorize`. Modified in place.
    first_na : int
        The index location of the first missing value.
    na_sentinel : int, default -1
        Code written for missing values.

    Returns
    -------
    ndarray
        The same `arr` object that was passed in.
    """
    # Explicit element loop on purpose: this function is numba-compiled.
    # A naive benchmark showed a ~285x speedup with numba on a
    # 10,000 element array.
    missing_code = arr[first_na]
    for pos in range(len(arr)):
        code = arr[pos]
        if code > missing_code:
            # Shift down to fill the hole left by the missing-value code.
            arr[pos] = code - 1
        elif code == missing_code:
            arr[pos] = na_sentinel
    return arr

cyberpandas/ip_array.py

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from ._accessor import (DelegatedMethod, DelegatedProperty,
1414
delegated_method)
15-
from ._utils import combine, pack, unpack
15+
from ._utils import combine, pack, unpack, refactorize
1616
from .common import _U8_MAX, _IPv4_MAX
1717
from .parser import _to_ipaddress_pyint, _as_ip_object
1818

@@ -69,6 +69,10 @@ def __init__(self, values):
6969
values = _to_ip_array(values) # TODO: avoid potential copy
7070
self.data = values
7171

72+
@classmethod
def _constructor_from_sequence(cls, scalars):
    """Build a new array of this class from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence
        Passed unchanged to the class constructor.

    Returns
    -------
    cls
        The result of ``cls(scalars)``.
    """
    instance = cls(scalars)
    return instance
75+
7276
# -------------------------------------------------------------------------
7377
# Pandas Interface
7478
# -------------------------------------------------------------------------
@@ -287,7 +291,7 @@ def equals(self, other):
287291

288292
def isna(self):
289293
ips = self.data
290-
return (ips['lo'] == 0) & (ips['lo'] - ips['hi'] == 0)
294+
return (ips['lo'] == 0) & (ips['hi'] == 0)
291295

292296
def argsort(self, axis=-1, kind='quicksort', order=None):
293297
return self.data.argsort()
@@ -460,16 +464,67 @@ def unique(self):
460464
data = self.data.take(np.sort(indices))
461465
return self._from_ndarray(data)
462466

463-
def factorize(self, sort=False):
464-
# XXX: Verify this, check for better algo
465-
uniques, indices, labels = np.unique(self.data,
466-
return_index=True,
467-
return_inverse=True)
468-
if not sort:
469-
# Unsort, since np.unique sorts
470-
uniques = self._from_ndarray(self.data.take(np.sort(indices)))
471-
labels = np.argsort(uniques.data).take(labels)
472-
return labels, uniques
467+
def factorize(self, na_sentinel=-1):
    """Encode this IPArray as integer labels plus its unique values.

    Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
    will dispatch to this method.

    Parameters
    ----------
    na_sentinel : int, default -1
        The value in `labels` used to mark missing values in `self`.

    Returns
    -------
    labels : ndarray
        An integer-type ndarray the same length as `self`. Each newly
        observed value in `self` is assigned the next integer. Missing
        values in `self` are assigned `na_sentinel`.
    uniques : IPArray
        The unique values in `self` in order of appearance, not including
        the missing value ``IPv4Address('0.0.0.0')``.

    See Also
    --------
    pandas.factorize, pandas.Series.factorize

    Examples
    --------
    >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
    >>> arr
    IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
             '0.0.0.2', '::1:0:0:0:1'])

    >>> labels, uniques = arr.factorize()
    >>> labels
    array([ 0,  0, -1,  1,  0,  2])

    Notice that `uniques` does not include the missing value.

    >>> uniques
    IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
    """
    # Strategy: let pandas factorize the raw records, then repair the
    # result. Two known drawbacks of going through pandas here:
    #   1. It requires casting to object (nothing we can do about it yet).
    #   2. pandas does not regard (0, 0) as missing, so the missing value
    #      gets an ordinary code; that is patched up below.
    labels, raw_uniques = pd.factorize(self.data)
    missing = self.isna()
    has_missing = missing.any()

    if has_missing:
        # argmax on a boolean mask gives the position of the first True.
        # `refactorize` rewrites `labels` in place.
        refactorize(labels, missing.argmax(), na_sentinel=na_sentinel)

    # `raw_uniques` is an ndarray of tuples; convert to our record type
    # first, then wrap it in an array of the same class as `self`.
    uniques = type(self)(raw_uniques.astype(self.dtype._record_type))
    if has_missing:
        # The missing value itself was counted as a unique; drop it.
        uniques = uniques[~uniques.isna()]
    return labels, uniques
473528

474529

475530
# -----

cyberpandas/test_interface.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,16 @@ def data_missing_for_sorting():
3838
return ip.IPArray([2 ** 64 + 1, 0, 1])
3939

4040

41+
@pytest.fixture
def data_for_grouping():
    """IPArray with repeated values and missing entries, for grouping tests.

    The layout is ``[b, b, NA, NA, a, a, b, c]`` where ``0`` is the
    missing value.
    """
    # NOTE(review): pandas' base extension fixture is believed to document
    # this layout with A < B < C, whereas here b (1) < a (2**32 + 1).
    # Confirm whether any consuming test depends on that ordering.
    na = 0
    b = 1
    a = 2 ** 32 + 1
    c = 2 ** 32 + 10
    values = [b, b, na, na, a, a, b, c]
    return ip.IPArray(values)
49+
50+
4151
@pytest.fixture
4252
def na_cmp():
4353
"""Binary operator for comparing NA values.

cyberpandas/test_ip.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -290,15 +290,10 @@ def test_unique():
290290
tm.assert_numpy_array_equal(result, expected)
291291

292292

293-
@pytest.mark.parametrize('sort', [
294-
pytest.param(True, marks=pytest.mark.xfail(reason="Upstream sort_values")),
295-
False
296-
])
297-
def test_factorize(sort):
293+
def test_factorize():
298294
arr = ip.IPArray([3, 3, 1, 2, 3, _U8_MAX + 1])
299-
labels, uniques = arr.factorize(sort=sort)
300-
expected_labels, expected_uniques = pd.factorize(arr.astype(object),
301-
sort=sort)
295+
labels, uniques = arr.factorize()
296+
expected_labels, expected_uniques = pd.factorize(arr.astype(object))
302297

303298
assert isinstance(uniques, ip.IPArray)
304299

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@
2323
packages=find_packages(),
2424
install_requires=[
2525
'pandas>=0.23.0.dev0',
26+
'numba',
2627
]
2728
)

0 commit comments

Comments
 (0)