Skip to content

Commit 2b35e5d

Browse files
vnlitvinov authored and rgommers committed
Align spec with existing implementations
Signed-off-by: Vasily Litvinov <[email protected]>
1 parent 8eab8a2 commit 2b35e5d

File tree

1 file changed

+95
-35
lines changed

1 file changed

+95
-35
lines changed

protocol/dataframe_protocol.py

Lines changed: 95 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
from typing import Tuple, Optional, Dict, Any, Iterable, Sequence
1+
from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict
22
import enum
3+
from abc import ABC, abstractmethod
4+
35

46
class DlpackDeviceType(enum.IntEnum):
7+
"""Integer enum for device type codes matching DLPack."""
8+
59
CPU = 1
610
CUDA = 2
711
CPU_PINNED = 3
@@ -11,7 +15,29 @@ class DlpackDeviceType(enum.IntEnum):
1115
VPI = 9
1216
ROCM = 10
1317

18+
1419
class DtypeKind(enum.IntEnum):
20+
"""
21+
Integer enum for data types.
22+
23+
Attributes
24+
----------
25+
INT : int
26+
Matches to signed integer data type.
27+
UINT : int
28+
Matches to unsigned integer data type.
29+
FLOAT : int
30+
Matches to floating point data type.
31+
BOOL : int
32+
Matches to boolean data type.
33+
STRING : int
34+
Matches to string data type (UTF-8 encoded).
35+
DATETIME : int
36+
Matches to datetime data type.
37+
CATEGORICAL : int
38+
Matches to categorical data type.
39+
"""
40+
1541
INT = 0
1642
UINT = 1
1743
FLOAT = 2
@@ -20,14 +46,48 @@ class DtypeKind(enum.IntEnum):
2046
DATETIME = 22
2147
CATEGORICAL = 23
2248

23-
class ColumnNullType:
49+
50+
class ColumnNullType(enum.IntEnum):
51+
"""
52+
Integer enum for null type representation.
53+
54+
Attributes
55+
----------
56+
NON_NULLABLE : int
57+
Non-nullable column.
58+
USE_NAN : int
59+
Use explicit float NaN/NaT value.
60+
USE_SENTINEL : int
61+
Sentinel value besides NaN/NaT.
62+
USE_BITMASK : int
63+
A bit is set/unset to represent a null at a certain position.
64+
USE_BYTEMASK : int
65+
A byte is set/unset to represent a null at a certain position.
66+
"""
67+
2468
NON_NULLABLE = 0
2569
USE_NAN = 1
2670
USE_SENTINEL = 2
2771
USE_BITMASK = 3
2872
USE_BYTEMASK = 4
2973

30-
class Buffer:
74+
75+
class ColumnBuffers(TypedDict):
76+
data: Tuple["Buffer", Any] # first element is a buffer containing the column data;
77+
# second element is the data buffer's associated dtype
78+
validity: Optional[Tuple["Buffer", Any]] # first element is a buffer containing mask values
79+
# indicating missing data and second element is
80+
# the mask value buffer's associated dtype.
81+
# None if the null representation is not a bit or byte mask
82+
offsets: Optional[Tuple["Buffer", Any]] # first element is a buffer containing the
83+
# offset values for variable-size binary data
84+
# (e.g., variable-length strings) and
85+
# second element is the offsets buffer's associated dtype.
86+
# None if the data buffer does not have
87+
# an associated offsets buffer
88+
89+
90+
class Buffer(ABC):
3191
"""
3292
Data in the buffer is guaranteed to be contiguous in memory.
3393
@@ -43,19 +103,22 @@ class Buffer:
43103
"""
44104

45105
@property
106+
@abstractmethod
46107
def bufsize(self) -> int:
47108
"""
48109
Buffer size in bytes.
49110
"""
50111
pass
51112

52113
@property
114+
@abstractmethod
53115
def ptr(self) -> int:
54116
"""
55117
Pointer to start of the buffer as an integer.
56118
"""
57119
pass
58120

121+
@abstractmethod
59122
def __dlpack__(self):
60123
"""
61124
Produce DLPack capsule (see array API standard).
@@ -70,18 +133,17 @@ def __dlpack__(self):
70133
"""
71134
raise NotImplementedError("__dlpack__")
72135

73-
def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]:
136+
@abstractmethod
137+
def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
74138
"""
75139
Device type and device ID for where the data in the buffer resides.
76-
77140
Uses device type codes matching DLPack.
78-
79141
Note: must be implemented even if ``__dlpack__`` is not.
80142
"""
81143
pass
82144

83145

84-
class Column:
146+
class Column(ABC):
85147
"""
86148
A column object, with only the methods and properties required by the
87149
interchange protocol defined.
@@ -123,10 +185,10 @@ class Column:
123185
124186
Note: this Column object can only be produced by ``__dataframe__``, so
125187
doesn't need its own version or ``__column__`` protocol.
126-
127188
"""
128189

129190
@property
191+
@abstractmethod
130192
def size(self) -> Optional[int]:
131193
"""
132194
Size of the column, in elements.
@@ -137,6 +199,7 @@ def size(self) -> Optional[int]:
137199
pass
138200

139201
@property
202+
@abstractmethod
140203
def offset(self) -> int:
141204
"""
142205
Offset of first element.
@@ -148,6 +211,7 @@ def offset(self) -> int:
148211
pass
149212

150213
@property
214+
@abstractmethod
151215
def dtype(self) -> Tuple[DtypeKind, int, str, str]:
152216
"""
153217
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
@@ -158,7 +222,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
158222
Endianness : currently only native endianness (``=``) is supported
159223
160224
Notes:
161-
162225
- Kind specifiers are aligned with DLPack where possible (hence the
163226
jump to 20, leaving enough room for future extension)
164227
- Masks must be specified as boolean with either bit width 1 (for bit
@@ -180,17 +243,16 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
180243
pass
181244

182245
@property
246+
@abstractmethod
183247
def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
184248
"""
185249
If the dtype is categorical, there are two options:
186-
187250
- There are only values in the data buffer.
188251
- There is a separate non-categorical Column encoding categorical values.
189252
190-
Raises RuntimeError if the dtype is not categorical
191-
192-
Content of returned dict:
253+
Raises TypeError if the dtype is not categorical
193254
255+
Returns a description of how to interpret the data buffer:
194256
- "is_ordered" : bool, whether the ordering of dictionary indices is
195257
semantically meaningful.
196258
- "is_dictionary" : bool, whether a mapping of
@@ -204,6 +266,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
204266
pass
205267

206268
@property
269+
@abstractmethod
207270
def describe_null(self) -> Tuple[ColumnNullType, Any]:
208271
"""
209272
Return the missing value (or "null") representation the column dtype
@@ -216,6 +279,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
216279
pass
217280

218281
@property
282+
@abstractmethod
219283
def null_count(self) -> Optional[int]:
220284
"""
221285
Number of null elements, if known.
@@ -225,18 +289,21 @@ def null_count(self) -> Optional[int]:
225289
pass
226290

227291
@property
292+
@abstractmethod
228293
def metadata(self) -> Dict[str, Any]:
229294
"""
230295
The metadata for the column. See `DataFrame.metadata` for more details.
231296
"""
232297
pass
233298

299+
@abstractmethod
234300
def num_chunks(self) -> int:
235301
"""
236302
Return the number of chunks the column consists of.
237303
"""
238304
pass
239305

306+
@abstractmethod
240307
def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
241308
"""
242309
Return an iterator yielding the chunks.
@@ -245,7 +312,8 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
245312
"""
246313
pass
247314

248-
def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]:
315+
@abstractmethod
316+
def get_buffers(self) -> ColumnBuffers:
249317
"""
250318
Return a dictionary containing the underlying buffers.
251319
@@ -276,7 +344,7 @@ def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
276344
# pass
277345

278346

279-
class DataFrame:
347+
class DataFrame(ABC):
280348
"""
281349
A data frame class, with only the methods required by the interchange
282350
protocol defined.
@@ -290,29 +358,11 @@ class DataFrame:
290358
``__dataframe__`` method of a public data frame class in a library adhering
291359
to the dataframe interchange protocol specification.
292360
"""
293-
def __dataframe__(self, nan_as_null : bool = False,
294-
allow_copy : bool = True) -> dict:
295-
"""
296-
Produces a dictionary object following the dataframe protocol specification.
297361

298-
``nan_as_null`` is a keyword intended for the consumer to tell the
299-
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
300-
It is intended for cases where the consumer does not support the bit
301-
mask or byte mask that is the producer's native representation.
302-
303-
``allow_copy`` is a keyword that defines whether or not the library is
304-
allowed to make a copy of the data. For example, copying data would be
305-
necessary if a library supports strided buffers, given that this protocol
306-
specifies contiguous buffers.
307-
"""
308-
self._nan_as_null = nan_as_null
309-
self._allow_zero_zopy = allow_copy
310-
return {
311-
"dataframe": self, # DataFrame object adhering to the protocol
312-
"version": 0 # Version number of the protocol
313-
}
362+
version = 0 # version of the protocol
314363

315364
@property
365+
@abstractmethod
316366
def metadata(self) -> Dict[str, Any]:
317367
"""
318368
The metadata for the data frame, as a dictionary with string keys. The
@@ -325,12 +375,14 @@ def metadata(self) -> Dict[str, Any]:
325375
"""
326376
pass
327377

378+
@abstractmethod
328379
def num_columns(self) -> int:
329380
"""
330381
Return the number of columns in the DataFrame.
331382
"""
332383
pass
333384

385+
@abstractmethod
334386
def num_rows(self) -> Optional[int]:
335387
# TODO: not happy with Optional, but need to flag it may be expensive
336388
# why include it if it may be None - what do we expect consumers
@@ -340,48 +392,56 @@ def num_rows(self) -> Optional[int]:
340392
"""
341393
pass
342394

395+
@abstractmethod
343396
def num_chunks(self) -> int:
344397
"""
345398
Return the number of chunks the DataFrame consists of.
346399
"""
347400
pass
348401

402+
@abstractmethod
349403
def column_names(self) -> Iterable[str]:
350404
"""
351405
Return an iterator yielding the column names.
352406
"""
353407
pass
354408

409+
@abstractmethod
355410
def get_column(self, i: int) -> Column:
356411
"""
357412
Return the column at the indicated position.
358413
"""
359414
pass
360415

416+
@abstractmethod
361417
def get_column_by_name(self, name: str) -> Column:
362418
"""
363419
Return the column whose name is the indicated name.
364420
"""
365421
pass
366422

423+
@abstractmethod
367424
def get_columns(self) -> Iterable[Column]:
368425
"""
369426
Return an iterator yielding the columns.
370427
"""
371428
pass
372429

430+
@abstractmethod
373431
def select_columns(self, indices: Sequence[int]) -> "DataFrame":
374432
"""
375433
Create a new DataFrame by selecting a subset of columns by index.
376434
"""
377435
pass
378436

437+
@abstractmethod
379438
def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame":
380439
"""
381440
Create a new DataFrame by selecting a subset of columns by name.
382441
"""
383442
pass
384443

444+
@abstractmethod
385445
def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]:
386446
"""
387447
Return an iterator yielding the chunks.

0 commit comments

Comments
 (0)