Skip to content

Commit 9245c3d

Browse files
POC: ArrayManager -- array-based data manager for columnar store
1 parent b45327f commit 9245c3d

File tree

5 files changed

+590
-9
lines changed

5 files changed

+590
-9
lines changed

pandas/core/frame.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,7 @@
127127
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
128128
from pandas.core.indexes.period import PeriodIndex
129129
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
130-
from pandas.core.internals import BlockManager
130+
from pandas.core.internals import ArrayManager, BlockManager
131131
from pandas.core.internals.construction import (
132132
arrays_to_mgr,
133133
dataclasses_to_dicts,
@@ -445,6 +445,7 @@ def __init__(
445445
columns: Optional[Axes] = None,
446446
dtype: Optional[Dtype] = None,
447447
copy: bool = False,
448+
manager: str = "array",
448449
):
449450
if data is None:
450451
data = {}
@@ -454,7 +455,7 @@ def __init__(
454455
if isinstance(data, DataFrame):
455456
data = data._mgr
456457

457-
if isinstance(data, BlockManager):
458+
if isinstance(data, (BlockManager, ArrayManager)):
458459
if index is None and columns is None and dtype is None and copy is False:
459460
# GH#33357 fastpath
460461
NDFrame.__init__(self, data)
@@ -561,6 +562,11 @@ def __init__(
561562
values, index, columns, dtype=values.dtype, copy=False
562563
)
563564

565+
if manager == "array" and not isinstance(mgr, ArrayManager):
566+
# TODO proper initialization
567+
df = DataFrame(mgr, manager="block")
568+
arrays = [arr.copy() for arr in df._iter_column_arrays()]
569+
mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
564570
NDFrame.__init__(self, mgr)
565571

566572
# ----------------------------------------------------------------------
@@ -635,6 +641,8 @@ def _is_homogeneous_type(self) -> bool:
635641
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
636642
False
637643
"""
644+
if isinstance(self._mgr, ArrayManager):
645+
return False
638646
if self._mgr.any_extension_types:
639647
return len({block.dtype for block in self._mgr.blocks}) == 1
640648
else:

pandas/core/generic.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@
9696
from pandas.core.indexes.datetimes import DatetimeIndex
9797
from pandas.core.indexes.period import Period, PeriodIndex
9898
import pandas.core.indexing as indexing
99-
from pandas.core.internals import BlockManager
99+
from pandas.core.internals import ArrayManager, BlockManager
100100
from pandas.core.missing import find_valid_index
101101
from pandas.core.ops import _align_method_FRAME
102102
from pandas.core.shared_docs import _shared_docs
@@ -192,7 +192,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
192192
_deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"])
193193
_metadata: List[str] = []
194194
_is_copy = None
195-
_mgr: BlockManager
195+
_mgr: Union[BlockManager, ArrayManager]
196196
_attrs: Dict[Optional[Hashable], Any]
197197
_typ: str
198198

@@ -201,7 +201,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
201201

202202
def __init__(
203203
self,
204-
data: BlockManager,
204+
data: Union[BlockManager, ArrayManager],
205205
copy: bool = False,
206206
attrs: Optional[Mapping[Optional[Hashable], Any]] = None,
207207
):
@@ -217,7 +217,9 @@ def __init__(
217217
object.__setattr__(self, "_attrs", attrs)
218218

219219
@classmethod
220-
def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager:
220+
def _init_mgr(
221+
cls, mgr, axes, dtype=None, copy: bool = False
222+
) -> Union[BlockManager, ArrayManager]:
221223
""" passed a manager and a axes dict """
222224
for a, axe in axes.items():
223225
if axe is not None:
@@ -5242,6 +5244,8 @@ def _protect_consolidate(self, f):
52425244
Consolidate _mgr -- if the blocks have changed, then clear the
52435245
cache
52445246
"""
5247+
if isinstance(self._mgr, ArrayManager):
5248+
return f()
52455249
blocks_before = len(self._mgr.blocks)
52465250
result = f()
52475251
if len(self._mgr.blocks) != blocks_before:

pandas/core/internals/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
from pandas.core.internals.concat import concatenate_block_managers
1717
from pandas.core.internals.managers import (
1818
BlockManager,
19+
ArrayManager,
1920
SingleBlockManager,
2021
create_block_manager_from_arrays,
2122
create_block_manager_from_blocks,

pandas/core/internals/concat.py

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from collections import defaultdict
22
import copy
3+
import itertools
34
from typing import List
45

56
import numpy as np
@@ -25,7 +26,7 @@
2526
import pandas.core.algorithms as algos
2627
from pandas.core.arrays import ExtensionArray
2728
from pandas.core.internals.blocks import make_block
28-
from pandas.core.internals.managers import BlockManager
29+
from pandas.core.internals.managers import ArrayManager, BlockManager
2930

3031

3132
def concatenate_block_managers(
@@ -45,6 +46,23 @@ def concatenate_block_managers(
4546
-------
4647
BlockManager
4748
"""
49+
# breakpoint()
50+
51+
if isinstance(mgrs_indexers[0][0], ArrayManager):
52+
53+
if concat_axis == 1:
54+
# TODO for now only fastpath without indexers
55+
mgrs = [t[0] for t in mgrs_indexers]
56+
arrays = [
57+
np.concatenate([mgrs[i].arrays[j] for i in range(len(mgrs))])
58+
for j in range(len(mgrs[0].arrays))
59+
]
60+
return ArrayManager(arrays, [axes[1], axes[0]])
61+
elif concat_axis == 0:
62+
mgrs = [t[0] for t in mgrs_indexers]
63+
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
64+
return ArrayManager(arrays, [axes[1], axes[0]])
65+
4866
concat_plans = [
4967
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
5068
]

0 commit comments

Comments
 (0)