Skip to content

Commit 0c38792

Browse files
committed
support CategoricalIndex
raise KeyError when accessing invalid elements; setting elements not in the categories is equivalent to .append() (which coerces to an Index)
1 parent 8d2818e commit 0c38792

File tree

10 files changed

+610
-141
lines changed

10 files changed

+610
-141
lines changed

pandas/core/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pandas.core.categorical import Categorical
99
from pandas.core.groupby import Grouper
1010
from pandas.core.format import set_eng_float_format
11-
from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex
11+
from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex
1212

1313
from pandas.core.series import Series, TimeSeries
1414
from pandas.core.frame import DataFrame

pandas/core/categorical.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import pandas.core.common as com
1515
from pandas.util.decorators import cache_readonly
1616

17-
from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull,
17+
from pandas.core.common import (CategoricalDtype, ABCSeries, ABCCategoricalIndex, isnull, notnull,
1818
is_categorical_dtype, is_integer_dtype, is_object_dtype,
1919
_possibly_infer_to_datetimelike, get_dtype_kinds,
2020
is_list_like, is_sequence, is_null_slice, is_bool,
@@ -79,7 +79,7 @@ def f(self, other):
7979

8080
def maybe_to_categorical(array):
    """
    Unwrap a Series or CategoricalIndex to its underlying values.

    Parameters
    ----------
    array : object
        Any object. Only instances of ``ABCSeries`` or
        ``ABCCategoricalIndex`` are unwrapped.

    Returns
    -------
    object
        ``array.values`` when ``array`` is a Series or a CategoricalIndex,
        otherwise ``array`` itself, untouched.
    """
    if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
        return array.values
    return array
8585

@@ -302,11 +302,31 @@ def copy(self):
302302
return Categorical(values=self._codes.copy(),categories=self.categories,
303303
name=self.name, ordered=self.ordered, fastpath=True)
304304

305+
def astype(self, dtype):
    """
    Coerce this Categorical to another dtype.

    Parameters
    ----------
    dtype : dtype-like
        Only categorical and object targets are supported.

    Returns
    -------
    Categorical or ndarray
        ``self`` (the same object, not a copy) for a categorical dtype,
        ``np.array(self)`` for an object dtype.

    Raises
    ------
    TypeError
        For every other target dtype.
    """
    if is_categorical_dtype(dtype):
        # already categorical: hand back this very object
        return self

    if is_object_dtype(dtype):
        return np.array(self)

    raise TypeError('Astype a Categorical to anything other than '
                    'categorical or object is not supported')
314+
305315
@cache_readonly
def ndim(self):
    """Number of dimensions of the Categorical, as reported by ``self._codes``."""
    codes = self._codes
    return codes.ndim
return self._codes.ndim
309319

320+
@cache_readonly
def size(self):
    """Number of elements, i.e. ``len(self)``; cached after first access."""
    n_elements = len(self)
    return n_elements
324+
325+
@cache_readonly
def itemsize(self):
    """Size of a single category, delegated to ``self.categories.itemsize``."""
    categories = self.categories
    return categories.itemsize
329+
310330
def reshape(self, new_shape, **kwargs):
    """
    Compatibility shim for ``.reshape``.

    ``new_shape`` and any keyword arguments are accepted but ignored; the
    object itself is returned unchanged.
    """
    return self

pandas/core/common.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def _check(cls, inst):
7272
ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",))
7373
ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",))
7474
ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",))
75+
ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",))
7576
ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",))
7677
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
7778
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
@@ -2438,9 +2439,26 @@ def _get_dtype_type(arr_or_dtype):
24382439
return np.dtype(arr_or_dtype).type
24392440
elif isinstance(arr_or_dtype, CategoricalDtype):
24402441
return CategoricalDtypeType
2442+
elif isinstance(arr_or_dtype, compat.string_types):
2443+
if is_categorical_dtype(arr_or_dtype):
2444+
return CategoricalDtypeType
2445+
return _get_dtype_type(np.dtype(arr_or_dtype))
24412446
return arr_or_dtype.dtype.type
24422447

24432448

2449+
def is_dtypes_equal(source, target):
    """
    Return whether two dtype-likes resolve to the same dtype type.

    Parameters
    ----------
    source, target : object
        Anything accepted by ``_get_dtype_type``.

    Returns
    -------
    The result of comparing the two resolved dtype types with ``==``, or
    ``False`` when that comparison itself raises.
    """
    source = _get_dtype_type(source)
    target = _get_dtype_type(target)

    try:
        return source == target
    except Exception:
        # invalid comparison
        # object == category will hit this
        #
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate.
        return False
2461+
24442462
def is_any_int_dtype(arr_or_dtype):
    """
    Return True when the dtype type resolved by ``_get_dtype_type`` for
    ``arr_or_dtype`` is a subclass of ``np.integer``.
    """
    return issubclass(_get_dtype_type(arr_or_dtype), np.integer)

pandas/core/index.py

Lines changed: 215 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import datetime
33
import warnings
44
import operator
5+
56
from functools import partial
67
from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map
78
from pandas import compat
@@ -16,9 +17,9 @@
1617
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs
1718
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
1819
deprecate)
19-
from pandas.core.common import isnull, array_equivalent
2020
import pandas.core.common as com
21-
from pandas.core.common import (_values_from_object, is_float, is_integer,
21+
from pandas.core.common import (isnull, array_equivalent,
22+
_values_from_object, is_float, is_integer, is_categorical_dtype,
2223
ABCSeries, _ensure_object, _ensure_int64, is_bool_indexer,
2324
is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype)
2425
from pandas.core.config import get_option
@@ -163,6 +164,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
163164
return Float64Index(data, copy=copy, dtype=dtype, name=name)
164165
elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
165166
subarr = data.astype('object')
167+
elif is_categorical_dtype(data):
168+
return CategoricalIndex(data, copy=copy, name=name, **kwargs)
166169
else:
167170
subarr = com._asarray_tuplesafe(data, dtype=object)
168171

@@ -171,6 +174,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False,
171174
if copy:
172175
subarr = subarr.copy()
173176

177+
elif is_categorical_dtype(data):
178+
return CategoricalIndex(data, copy=copy, name=name, **kwargs)
174179
elif hasattr(data, '__array__'):
175180
return Index(np.asarray(data), dtype=dtype, copy=copy, name=name,
176181
**kwargs)
@@ -626,6 +631,9 @@ def is_numeric(self):
626631
def is_object(self):
    """Return whether this index's dtype compares equal to ``np.object_``."""
    dtype = self.dtype
    return dtype == np.object_
628633

634+
def is_categorical(self):
    """Return True when ``self.inferred_type`` is ``'categorical'``."""
    # direct equality instead of membership in a one-element list literal
    return self.inferred_type == 'categorical'
636+
629637
def is_mixed(self):
    """Return whether the substring ``'mixed'`` occurs in ``self.inferred_type``."""
    inferred = self.inferred_type
    return 'mixed' in inferred
631639

@@ -1045,10 +1053,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
10451053

10461054
from pandas.core.format import format_array
10471055

1048-
if values.dtype == np.object_:
1056+
if com.is_categorical_dtype(values.dtype):
1057+
values = np.array(values)
1058+
elif com.is_object_dtype(values.dtype):
10491059
values = lib.maybe_convert_objects(values, safe=1)
10501060

1051-
if values.dtype == np.object_:
1061+
if com.is_object_dtype(values.dtype):
10521062
result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))
10531063
for x in values]
10541064

@@ -1543,7 +1553,7 @@ def get_indexer(self, target, method=None, limit=None):
15431553
if pself is not self or ptarget is not target:
15441554
return pself.get_indexer(ptarget, method=method, limit=limit)
15451555

1546-
if self.dtype != target.dtype:
1556+
if not com.is_dtypes_equal(self.dtype,target.dtype):
15471557
this = self.astype(object)
15481558
target = target.astype(object)
15491559
return this.get_indexer(target, method=method, limit=limit)
@@ -2511,6 +2521,206 @@ def invalid_op(self, other=None):
25112521
Index._add_numeric_methods_disabled()
25122522
Index._add_logical_methods()
25132523

2524+
class CategoricalIndex(Index):
    """
    Immutable Index implementing an ordered, sliceable set. CategoricalIndex
    represents a sparsely populated Index with an underlying Categorical.

    Parameters
    ----------
    data : array-like (1-dimensional)
        A Categorical, a CategoricalIndex, or anything the Categorical
        constructor accepts. ``None`` and scalars are rejected.
    categories : optional categories for the CategoricalIndex
        When given, they are applied to the data via ``set_categories``
        (or passed to the Categorical constructor for raw data).
    dtype : object, optional
        Accepted for signature compatibility; not used by the constructor.
    copy : bool
        Make a copy of the underlying Categorical
    name : object
        Name to be stored in the index
    fastpath : bool
        Skip all validation and coercion and go straight to ``_simple_new``.
    ordered : None
        Must be left as None; any other value raises ValueError because the
        underlying Categorical is always made ordered.
    """

    _typ = 'categoricalindex'
    _engine_type = _index.Int64Engine
    _attributes = ['name', 'categories']

    def __new__(cls, data=None, categories=None, dtype=None, copy=False,
                name=None, fastpath=False, ordered=None, **kwargs):

        def create_categorical(data=data, categories=categories):
            # apply the requested categories (if any), then force ordering
            if categories is not None:
                data = data.set_categories(categories)
            if not data.ordered:
                data = data.as_ordered()
            return data

        if fastpath:
            return cls._simple_new(data, name=name)

        if ordered is not None:
            raise ValueError("CategoricalIndex are by definition ordered")

        if isinstance(data, com.ABCCategorical):
            data = create_categorical(data, categories)
        elif data is None or np.isscalar(data):
            cls._scalar_data_error(data)
        elif isinstance(data, CategoricalIndex):
            # unwrap to the backing Categorical before normalising it
            data = create_categorical(data._data, categories)
        else:
            from pandas.core.categorical import Categorical
            data = Categorical(data, categories=categories, ordered=True)

        if copy:
            data = data.copy()

        return cls._simple_new(data, name=name)

    @classmethod
    def _simple_new(cls, values, name=None, categories=None, **kwargs):
        """
        Build an instance around ``values`` without going through ``__new__``.

        Non-Categorical ``values`` are wrapped in an ordered Categorical;
        an existing Categorical only has ``categories`` applied when given.
        Extra keyword arguments are set as attributes on the result.
        """
        result = object.__new__(cls)

        if not isinstance(values, com.ABCCategorical):
            from pandas.core.categorical import Categorical
            values = Categorical(values, categories=categories, ordered=True)
        elif categories is not None:
            values = values.set_categories(categories)

        result._data = values
        result.name = name
        for k, v in compat.iteritems(kwargs):
            setattr(result, k, v)

        result._reset_identity()
        return result

    def equals(self, other):
        """
        Determines if two CategoricalIndex objects contain the same elements.

        Returns True when ``other`` is this very index (``self.is_(other)``),
        False when ``other`` is not a CategoricalIndex, and otherwise whether
        the element-wise comparison of the two underlying Categoricals is
        True everywhere. A comparison that raises counts as "not equal".
        """
        if self.is_(other):
            return True

        if not isinstance(other, CategoricalIndex):
            return False

        try:
            return (self._data == other._data).all()
        except Exception:
            # the comparison itself failed, so treat the two as unequal;
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed
            return False

    @property
    def inferred_type(self):
        return 'categorical'

    @property
    def values(self):
        """ return the underlying data, which is a Categorical """
        return self._data

    @property
    def codes(self):
        """ the codes of the underlying Categorical """
        return self._data.codes

    @property
    def categories(self):
        """ the categories of the underlying Categorical """
        return self._data.categories

    def __contains__(self, key):
        # hash() is called only for its side effect: it raises for an
        # unhashable key before the categories lookup is attempted
        hash(key)
        return key in self.categories

    def __array__(self, result=None):
        """ the array interface, return my values """
        return np.array(self._data)

    def _array_values(self):
        return self.values

    def argsort(self, *args, **kwargs):
        """ delegate to the underlying Categorical's argsort """
        return self.values.argsort(*args, **kwargs)

    @cache_readonly
    def _engine(self):

        # we are going to look things up with the codes themselves
        return self._engine_type(lambda: self.codes.astype('i8'), len(self))

    @cache_readonly
    def is_unique(self):
        return not self.duplicated().any()

    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
    def duplicated(self, take_last=False):
        # duplicates are detected on the integer codes, not on the labels
        from pandas.hashtable import duplicated_int64
        return duplicated_int64(self.codes.astype('i8'), take_last)

    def get_loc(self, key, method=None):
        """
        Get integer location for requested label

        Parameters
        ----------
        key : label
        method : {None}
            * default: exact matches only.
            Accepted for signature compatibility; not used here.

        Returns
        -------
        loc : the first element of the engine's ``get_indexer_non_unique``
            result for the code of ``key``

        Raises
        ------
        KeyError
            If ``key`` is not one of the categories.
        """
        tcodes = self.categories.get_indexer([key])
        if tcodes[0] == -1:
            # carry the offending key so the failure is diagnosable
            raise KeyError(key)
        return self._engine.get_indexer_non_unique(tcodes)[0]

    def get_indexer(self, target, method=None, limit=None):
        """
        Compute an indexer for ``target`` against this index.

        The target labels are first translated to codes through
        ``self.categories.get_indexer`` and those codes are then looked up
        in the code-based engine.

        Parameters
        ----------
        target : index-like
            Converted with ``_ensure_index``. If it is a CategoricalIndex,
            its ``categories`` are used as the target.
            NOTE(review): this looks up the target's categories, not its
            values -- confirm that is intended.
        method : None
            Fill methods are not supported: after normalisation by
            ``com._clean_reindex_fill_method``, 'pad', 'backfill' and
            'nearest' raise NotImplementedError.
        limit : object, optional
            Accepted for signature compatibility; not used here.

        Returns
        -------
        indexer : the first element of the engine's
            ``get_indexer_non_unique`` result, passed through
            ``com._ensure_platform_int``

        Raises
        ------
        NotImplementedError
            If a fill method is requested.
        """
        method = com._clean_reindex_fill_method(method)
        target = _ensure_index(target)

        if isinstance(target, CategoricalIndex):
            target = target.categories

        if method == 'pad' or method == 'backfill':
            raise NotImplementedError("method='pad' and method='backfill' not implemented yet "
                                      'for CategoricalIndex')
        elif method == 'nearest':
            raise NotImplementedError("method='nearest' not implemented yet "
                                      'for CategoricalIndex')

        tcodes = self.categories.get_indexer(target)
        indexer = self._engine.get_indexer_non_unique(tcodes)[0]

        return com._ensure_platform_int(indexer)
2720+
2721+
2722+
CategoricalIndex._add_numeric_methods_disabled()
2723+
CategoricalIndex._add_logical_methods_disabled()
25142724

25152725
class NumericIndex(Index):
25162726
"""

pandas/core/series.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2605,8 +2605,9 @@ def _try_cast(arr, take_fast_path):
26052605

26062606
# GH #846
26072607
if isinstance(data, (np.ndarray, Index, Series)):
2608-
subarr = np.array(data, copy=False)
2608+
26092609
if dtype is not None:
2610+
subarr = np.array(data, copy=False)
26102611

26112612
# possibility of nan -> garbage
26122613
if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype):

pandas/tests/test_categorical.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2658,6 +2658,14 @@ def cmp(a,b):
26582658
self.assertRaises(TypeError, lambda : invalid(s))
26592659

26602660

2661+
def test_astype_categorical(self):
    """astype round-trips for 'category' and 'object'; other dtypes raise."""
    cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])

    # categorical -> categorical leaves the values unchanged
    as_category = cat.astype('category')
    tm.assert_categorical_equal(cat, as_category)

    # categorical -> object matches a plain ndarray conversion
    as_object = cat.astype('object')
    tm.assert_almost_equal(np.array(cat), as_object)

    # any other target dtype is rejected
    self.assertRaises(TypeError, cat.astype, float)
2668+
26612669
def test_to_records(self):
26622670

26632671
# GH8626

0 commit comments

Comments
 (0)