Align spec with existing implementations

vnlitvinov · rgommers · commit 2b35e5d71c6d · 2022-07-28T14:12:01.000+02:00
Signed-off-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
@@ -1,7 +1,11 @@
-from typing import Tuple, Optional, Dict, Any, Iterable, Sequence
+from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict
 import enum
+from abc import ABC, abstractmethod
+
 
 class DlpackDeviceType(enum.IntEnum):
+    """Integer enum for device type codes matching DLPack."""
+
     CPU = 1
     CUDA = 2
     CPU_PINNED = 3
@@ -11,7 +15,29 @@ class DlpackDeviceType(enum.IntEnum):
     VPI = 9
     ROCM = 10
 
+
 class DtypeKind(enum.IntEnum):
+    """
+    Integer enum for data types.
+
+    Attributes
+    ----------
+    INT : int
+        Matches the signed integer data type.
+    UINT : int
+        Matches the unsigned integer data type.
+    FLOAT : int
+        Matches the floating point data type.
+    BOOL : int
+        Matches the boolean data type.
+    STRING : int
+        Matches the string data type (UTF-8 encoded).
+    DATETIME : int
+        Matches the datetime data type.
+    CATEGORICAL : int
+        Matches the categorical data type.
+    """
+
     INT = 0
     UINT = 1
     FLOAT = 2
@@ -20,14 +46,48 @@ class DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
-class ColumnNullType:
+
+class ColumnNullType(enum.IntEnum):
+    """
+    Integer enum for null type representation.
+
+    Attributes
+    ----------
+    NON_NULLABLE : int
+        Non-nullable column.
+    USE_NAN : int
+        Use explicit float NaN/NaT value.
+    USE_SENTINEL : int
+        Sentinel value besides NaN/NaT.
+    USE_BITMASK : int
+        A bit is set/unset to mark a null at the corresponding position.
+    USE_BYTEMASK : int
+        A byte is set/unset to mark a null at the corresponding position.
+    """
+
     NON_NULLABLE = 0
     USE_NAN = 1
     USE_SENTINEL = 2
     USE_BITMASK = 3
     USE_BYTEMASK = 4
 
-class Buffer:
+
+class ColumnBuffers(TypedDict):
+    data: Tuple["Buffer", Any] # first element is a buffer containing the column data;
+                               # second element is the data buffer's associated dtype
+    validity: Optional[Tuple["Buffer", Any]] # first element is a buffer containing mask values
+                                             # indicating missing data and second element is
+                                             # the mask value buffer's associated dtype.
+                                             # None if the null representation is not a bit or byte mask
+    offsets: Optional[Tuple["Buffer", Any]] # first element is a buffer containing the
+                                            # offset values for variable-size binary data
+                                            # (e.g., variable-length strings) and
+                                            # second element is the offsets buffer's associated dtype.
+                                            # None if the data buffer does not have
+                                            # an associated offsets buffer
+
+
+class Buffer(ABC):
     """
     Data in the buffer is guaranteed to be contiguous in memory.
 
@@ -43,19 +103,22 @@ class Buffer:
     """
 
     @property
+    @abstractmethod
     def bufsize(self) -> int:
         """
         Buffer size in bytes.
         """
         pass
 
     @property
+    @abstractmethod
     def ptr(self) -> int:
         """
         Pointer to start of the buffer as an integer.
         """
         pass
 
+    @abstractmethod
     def __dlpack__(self):
         """
         Produce DLPack capsule (see array API standard).
@@ -70,18 +133,17 @@ def __dlpack__(self):
         """
         raise NotImplementedError("__dlpack__")
 
-    def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]:
+    @abstractmethod
+    def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
         """
         Device type and device ID for where the data in the buffer resides.
-
         Uses device type codes matching DLPack.
-
         Note: must be implemented even if ``__dlpack__`` is not.
         """
         pass
 
 
-class Column:
+class Column(ABC):
     """
     A column object, with only the methods and properties required by the
     interchange protocol defined.
@@ -123,10 +185,10 @@ class Column:
 
     Note: this Column object can only be produced by ``__dataframe__``, so
           doesn't need its own version or ``__column__`` protocol.
-
     """
 
     @property
+    @abstractmethod
     def size(self) -> Optional[int]:
         """
         Size of the column, in elements.
@@ -137,6 +199,7 @@ def size(self) -> Optional[int]:
         pass
 
     @property
+    @abstractmethod
     def offset(self) -> int:
         """
         Offset of first element.
@@ -148,6 +211,7 @@ def offset(self) -> int:
         pass
 
     @property
+    @abstractmethod
     def dtype(self) -> Tuple[DtypeKind, int, str, str]:
         """
         Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
@@ -158,7 +222,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
         Endianness : current only native endianness (``=``) is supported
 
         Notes:
-
             - Kind specifiers are aligned with DLPack where possible (hence the
               jump to 20, leave enough room for future extension)
             - Masks must be specified as boolean with either bit width 1 (for bit
@@ -180,17 +243,16 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
         pass
 
     @property
+    @abstractmethod
     def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
         """
         If the dtype is categorical, there are two options:
-
         - There are only values in the data buffer.
         - There is a separate non-categorical Column encoding categorical values.
 
-        Raises RuntimeError if the dtype is not categorical
-
-        Content of returned dict:
+        Raises TypeError if the dtype is not categorical
 
+        Returns a description of how to interpret the data buffer:
             - "is_ordered" : bool, whether the ordering of dictionary indices is
                              semantically meaningful.
             - "is_dictionary" : bool, whether a mapping of
@@ -204,6 +266,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
         pass
 
     @property
+    @abstractmethod
     def describe_null(self) -> Tuple[ColumnNullType, Any]:
         """
         Return the missing value (or "null") representation the column dtype
@@ -216,6 +279,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
         pass
 
     @property
+    @abstractmethod
     def null_count(self) -> Optional[int]:
         """
         Number of null elements, if known.
@@ -225,18 +289,21 @@ def null_count(self) -> Optional[int]:
         pass
 
     @property
+    @abstractmethod
     def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the column. See `DataFrame.metadata` for more details.
         """
         pass
 
+    @abstractmethod
     def num_chunks(self) -> int:
         """
         Return the number of chunks the column consists of.
         """
         pass
 
+    @abstractmethod
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
         """
         Return an iterator yielding the chunks.
@@ -245,7 +312,8 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
         """
         pass
 
-    def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]:
+    @abstractmethod
+    def get_buffers(self) -> ColumnBuffers:
         """
         Return a dictionary containing the underlying buffers.
 
@@ -276,7 +344,7 @@ def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
 #        pass
 
 
-class DataFrame:
+class DataFrame(ABC):
     """
     A data frame class, with only the methods required by the interchange
     protocol defined.
@@ -290,29 +358,11 @@ class DataFrame:
     ``__dataframe__`` method of a public data frame class in a library adhering
     to the dataframe interchange protocol specification.
     """
-    def __dataframe__(self, nan_as_null : bool = False,
-                      allow_copy : bool = True) -> dict:
-        """
-        Produces a dictionary object following the dataframe protocol specification.
 
-        ``nan_as_null`` is a keyword intended for the consumer to tell the
-        producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
-        It is intended for cases where the consumer does not support the bit
-        mask or byte mask that is the producer's native representation.
-
-        ``allow_copy`` is a keyword that defines whether or not the library is
-        allowed to make a copy of the data. For example, copying data would be
-        necessary if a library supports strided buffers, given that this protocol
-        specifies contiguous buffers.
-        """
-        self._nan_as_null = nan_as_null
-        self._allow_zero_zopy = allow_copy
-        return {
-            "dataframe": self,  # DataFrame object adhering to the protocol
-            "version": 0        # Version number of the protocol
-        }
+    version = 0 # version of the protocol
 
     @property
+    @abstractmethod
     def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the data frame, as a dictionary with string keys. The
@@ -325,12 +375,14 @@ def metadata(self) -> Dict[str, Any]:
         """
         pass
 
+    @abstractmethod
     def num_columns(self) -> int:
         """
         Return the number of columns in the DataFrame.
         """
         pass
 
+    @abstractmethod
     def num_rows(self) -> Optional[int]:
         # TODO: not happy with Optional, but need to flag it may be expensive
         #       why include it if it may be None - what do we expect consumers
@@ -340,48 +392,56 @@ def num_rows(self) -> Optional[int]:
         """
         pass
 
+    @abstractmethod
     def num_chunks(self) -> int:
         """
         Return the number of chunks the DataFrame consists of.
         """
         pass
 
+    @abstractmethod
     def column_names(self) -> Iterable[str]:
         """
         Return an iterator yielding the column names.
         """
         pass
 
+    @abstractmethod
     def get_column(self, i: int) -> Column:
         """
         Return the column at the indicated position.
         """
         pass
 
+    @abstractmethod
     def get_column_by_name(self, name: str) -> Column:
         """
         Return the column whose name is the indicated name.
         """
         pass
 
+    @abstractmethod
     def get_columns(self) -> Iterable[Column]:
         """
         Return an iterator yielding the columns.
         """
         pass
 
+    @abstractmethod
     def select_columns(self, indices: Sequence[int]) -> "DataFrame":
         """
         Create a new DataFrame by selecting a subset of columns by index.
         """
         pass
 
+    @abstractmethod
     def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame":
         """
         Create a new DataFrame by selecting a subset of columns by name.
         """
         pass
 
+    @abstractmethod
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]:
         """
         Return an iterator yielding the chunks.