Commit fb496f1

Fix test declarations, some impl bugs remain
Signed-off-by: Vasily Litvinov <[email protected]>
1 parent 1bd80f3 commit fb496f1

File tree

3 files changed: +210 -2 lines changed

pandas/core/exchange/dataframe.py
pandas/core/exchange/dataframe_protocol.py
pandas/tests/exchange/test_impl.py

pandas/core/exchange/dataframe.py

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@ def __init__(
         self._nan_as_null = nan_as_null
         self._allow_copy = allow_copy
 
+    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
+        return PandasDataFrameXchg(self._df, nan_as_null, allow_copy)
+
     @property
     def metadata(self):
         # `index` isn't a regular column, and the protocol doesn't support row
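For illustration only (not part of the commit): a minimal sketch of how the added re-wrapping method behaves from the caller's side, assuming a development build where pd.DataFrame.__dataframe__ is already available, as the tests below assume.

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]})

# df.__dataframe__() returns the PandasDataFrameXchg wrapper; the method added
# above lets that wrapper itself be re-wrapped, e.g. with copies disallowed.
xchg = df.__dataframe__()
xchg_strict = xchg.__dataframe__(allow_copy=False)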

pandas/core/exchange/dataframe_protocol.py

Lines changed: 16 additions & 2 deletions
@@ -93,6 +93,15 @@ class ColumnBuffers(TypedDict):
     offsets: Optional[Tuple["Buffer", Any]]
 
 
+class CategoricalDescription(TypedDict):
+    # whether the ordering of dictionary indices is semantically meaningful
+    is_ordered: bool
+    # whether a dictionary-style mapping of categorical values to other objects exists
+    is_dictionary: bool
+    # Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical.
+    mapping: Optional[dict]
+
+
 class Buffer(ABC):
     """
     Data in the buffer is guaranteed to be contiguous in memory.
@@ -250,15 +259,15 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
 
     @property
     @abstractmethod
-    def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]:
+    def describe_categorical(self) -> CategoricalDescription:
         """
         If the dtype is categorical, there are two options:
         - There are only values in the data buffer.
         - There is a separate dictionary-style encoding for categorical values.
 
         Raises TypeError if the dtype is not categorical
 
-        Returns the description on how to interpret the data buffer:
+        Returns the dictionary with description on how to interpret the data buffer:
         - "is_ordered" : bool, whether the ordering of dictionary indices is
           semantically meaningful.
         - "is_dictionary" : bool, whether a dictionary-style mapping of
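For illustration only (not part of the commit): a small sketch of what the new return type means for consuming code, assuming a column implementation that already follows the updated protocol; the sample frame and column name "A" are made up for the example.

import pandas as pd

df = pd.DataFrame({"A": pd.Categorical(list("testdata"), ordered=True)})
col = df.__dataframe__().get_column_by_name("A")

# Previously the description was a positional tuple:
#     is_ordered, is_dictionary, mapping = col.describe_categorical
# With CategoricalDescription it is a dict accessed by key instead.
desc = col.describe_categorical
if desc["is_dictionary"]:
    mapping = desc["mapping"]  # mapping of dictionary indices to category values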
@@ -367,6 +376,11 @@ class DataFrame(ABC):
 
     version = 0  # version of the protocol
 
+    @abstractmethod
+    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
+        """Construct a new exchange object, potentially changing the parameters."""
+        pass
+
     @property
     @abstractmethod
     def metadata(self) -> Dict[str, Any]:
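Again for illustration only: because __dataframe__ is now abstract on the protocol's DataFrame ABC, every exchange-object implementation has to provide it. A hypothetical third-party implementer might satisfy it as sketched below (class and attribute names are invented; the ABC's other abstract members are omitted for brevity).

from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg


class SomeLibDataFrameXchg(DataFrameXchg):  # hypothetical implementation
    def __init__(self, native_df, nan_as_null=False, allow_copy=True):
        self._native_df = native_df
        self._nan_as_null = nan_as_null
        self._allow_copy = allow_copy

    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
        # Re-wrap the same underlying data with the requested parameters.
        return SomeLibDataFrameXchg(self._native_df, nan_as_null, allow_copy)

    # ... num_columns, num_rows, column_names, get_column_by_name, etc. go here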

pandas/tests/exchange/test_impl.py

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
+import pandas as pd
+import numpy as np
+import pytest
+import random
+
+from pandas.testing import assert_frame_equal
+from pandas.core.exchange.dataframe_protocol import DtypeKind, ColumnNullType
+from pandas.core.exchange.from_dataframe import from_dataframe
+
+test_data_categorical = {
+    "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
+    "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
+}
+
+NCOLS, NROWS = 100, 200
+
+int_data = {
+    "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [
+        random.randint(0, 100) for _ in range(NROWS)
+    ]
+    for i in range(NCOLS)
+}
+
+bool_data = {
+    "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [
+        random.choice([True, False]) for _ in range(NROWS)
+    ]
+    for i in range(NCOLS)
+}
+
+float_data = {
+    "col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): [
+        random.random() for _ in range(NROWS)
+    ]
+    for i in range(NCOLS)
+}
+
+string_data = {
+    "separator data": [
+        "abC|DeF,Hik",
+        "234,3245.67",
+        "gSaf,qWer|Gre",
+        "asd3,4sad|",
+        np.NaN,
+    ]
+}
+
+
+@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
+def test_categorical_dtype(data):
+    df = pd.DataFrame({"A": (test_data_categorical[data[0]])})
+
+    col = df.__dataframe__().get_column_by_name("A")
+    assert col.dtype[0] == DtypeKind.CATEGORICAL
+    assert col.null_count == 0
+    assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
+    assert col.num_chunks() == 1
+    assert col.describe_categorical == {
+        "is_ordered": data[1],
+        "is_dictionary": True,
+        "mapping": {4: "s", 2: "d", 3: "e", 1: "t"},
+    }
+
+    assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+
+
+@pytest.mark.parametrize("data", [int_data, float_data, bool_data])
+def test_dataframe(data):
+    df = pd.DataFrame(data)
+
+    df2 = df.__dataframe__()
+
+    assert df2._allow_copy is True
+    assert df2.num_columns() == NCOLS
+    assert df2.num_rows() == NROWS
+
+    assert list(df2.column_names()) == list(data.keys())
+
+    assert_frame_equal(
+        from_dataframe(df2.select_columns((0, 2))),
+        from_dataframe(df2.select_columns_by_name(("col33", "col35"))),
+    )
+    assert_frame_equal(
+        from_dataframe(df2.select_columns((0, 2))),
+        from_dataframe(df2.select_columns_by_name(("col33", "col35"))),
+    )
+
+
+def test_missing_from_masked():
+    df = pd.DataFrame(
+        {
+            "x": np.array([1, 2, 3, 4, 0]),
+            "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
+            "z": np.array([True, False, True, True, True]),
+        }
+    )
+
+    df2 = df.__dataframe__()
+
+    # for col_name in df.columns:
+    #     assert convert_column_to_array(df2.get_column_by_name(col_name) == df[col_name].tolist()
+    #     assert df[col_name].dtype == convert_column_to_array(df2.get_column_by_name(col_name)).dtype
+
+    rng = np.random.RandomState(42)
+    dict_null = {col: rng.randint(low=0, high=len(df)) for col in df.columns}
+    for col, num_nulls in dict_null.items():
+        null_idx = df.index[
+            rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
+        ]
+        df.loc[null_idx, col] = None
+
+    df2 = df.__dataframe__()
+
+    assert df2.get_column_by_name("x").null_count == dict_null["x"]
+    assert df2.get_column_by_name("y").null_count == dict_null["y"]
+    assert df2.get_column_by_name("z").null_count == dict_null["z"]
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
+        {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
+        {
+            "x": np.array([True, True, False]),
+            "y": np.array([1, 2, 0]),
+            "z": np.array([9.2, 10.5, 11.8]),
+        },
+    ],
+)
+def test_mixed_data(data):
+    df = pd.DataFrame(data)
+    df2 = df.__dataframe__()
+
+    for col_name in df.columns:
+        assert df2.get_column_by_name(col_name).null_count == 0
+
+
+def test_mixed_missing():
+    df = pd.DataFrame(
+        {
+            "x": np.array([True, None, False, None, True]),
+            "y": np.array([None, 2, None, 1, 2]),
+            "z": np.array([9.2, 10.5, None, 11.8, None]),
+        }
+    )
+
+    df2 = df.__dataframe__()
+
+    for col_name in df.columns:
+        assert df2.get_column_by_name(col_name).null_count == 2
+
+
+def test_select_columns_error():
+    df = pd.DataFrame(int_data)
+
+    df2 = df.__dataframe__()
+
+    with pytest.raises(ValueError):
+        assert from_dataframe(df2.select_columns(np.array([0, 2]))) == from_dataframe(
+            df2.select_columns_by_name(("col33", "col35"))
+        )
+
+
+def test_select_columns_by_name_error():
+    df = pd.DataFrame(int_data)
+
+    df2 = df.__dataframe__()
+
+    with pytest.raises(ValueError):
+        assert from_dataframe(
+            df2.select_columns_by_name(np.array(["col33", "col35"]))
+        ) == from_dataframe(df2.select_columns((0, 2)))
+
+
+def test_string():
+    test_str_data = string_data["separator data"] + [""]
+    df = pd.DataFrame({"A": test_str_data})
+    col = df.__dataframe__().get_column_by_name("A")
+
+    assert col.size == 6
+    assert col.null_count == 1
+    assert col.dtype[0] == DtypeKind.STRING
+    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
+
+    df_sliced = df[1:]
+    col = df_sliced.__dataframe__().get_column_by_name("A")
+    assert col.size == 5
+    assert col.null_count == 1
+    assert col.dtype[0] == DtypeKind.STRING
+    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)