Commit ffa1544

Align spec with existing implementations
Signed-off-by: Vasily Litvinov <[email protected]>
1 parent ad298a7 commit ffa1544

File tree

1 file changed: +96 -36 lines changed


protocol/dataframe_protocol.py

Lines changed: 96 additions & 36 deletions
@@ -1,7 +1,11 @@
-from typing import Tuple, Optional, Dict, Any, Iterable, Sequence
+from typing import Tuple, Optional, Dict, Any, Iterable, Sequence, TypedDict
 import enum
+from abc import ABC, abstractmethod
+
 
 class DlpackDeviceType(enum.IntEnum):
+    """Integer enum for device type codes matching DLPack."""
+
     CPU = 1
     CUDA = 2
     CPU_PINNED = 3
@@ -11,7 +15,29 @@ class DlpackDeviceType(enum.IntEnum):
     VPI = 9
     ROCM = 10
 
+
 class DtypeKind(enum.IntEnum):
+    """
+    Integer enum for data types.
+
+    Attributes
+    ----------
+    INT : int
+        Matches to signed integer data type.
+    UINT : int
+        Matches to unsigned integer data type.
+    FLOAT : int
+        Matches to floating point data type.
+    BOOL : int
+        Matches to boolean data type.
+    STRING : int
+        Matches to string data type (UTF-8 encoded).
+    DATETIME : int
+        Matches to datetime data type.
+    CATEGORICAL : int
+        Matches to categorical data type.
+    """
+
     INT = 0
     UINT = 1
     FLOAT = 2
@@ -20,14 +46,48 @@ class DtypeKind(enum.IntEnum):
     DATETIME = 22
     CATEGORICAL = 23
 
-class ColumnNullType:
+
+class ColumnNullType(enum.IntEnum):
+    """
+    Integer enum for null type representation.
+
+    Attributes
+    ----------
+    NON_NULLABLE : int
+        Non-nullable column.
+    USE_NAN : int
+        Use explicit float NaN/NaT value.
+    USE_SENTINEL : int
+        Sentinel value besides NaN/NaT.
+    USE_BITMASK : int
+        The bit is set/unset representing a null on a certain position.
+    USE_BYTEMASK : int
+        The byte is set/unset representing a null on a certain position.
+    """
+
     NON_NULLABLE = 0
     USE_NAN = 1
     USE_SENTINEL = 2
     USE_BITMASK = 3
     USE_BYTEMASK = 4
 
-class Buffer:
+
+class ColumnBuffers(TypedDict):
+    data: Tuple["Buffer", Any]  # first element is a buffer containing the column data;
+                                # second element is the data buffer's associated dtype
+    validity: Optional[Tuple["Buffer", Any]]  # first element is a buffer containing mask values
+                                              # indicating missing data and second element is
+                                              # the mask value buffer's associated dtype.
+                                              # None if the null representation is not a bit or byte mask
+    offsets: Optional[Tuple["Buffer", Any]]  # first element is a buffer containing the
+                                             # offset values for variable-size binary data
+                                             # (e.g., variable-length strings) and
+                                             # second element is the offsets buffer's associated dtype.
+                                             # None if the data buffer does not have
+                                             # an associated offsets buffer
+
+
+class Buffer(ABC):
     """
     Data in the buffer is guaranteed to be contiguous in memory.
 
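The ``ColumnBuffers`` TypedDict added above pins down the shape of what ``Column.get_buffers()`` returns. As a rough illustration only (not taken from this commit; the buffer arguments are assumed to be ``Buffer`` objects, and the format strings assume Arrow-style codes used by existing implementations), a nullable variable-length string column could be described like this:

def describe_string_column(data_buf, mask_buf, offsets_buf) -> ColumnBuffers:
    # Illustrative sketch: each dtype tuple is (kind, bit-width, format string, endianness).
    return {
        "data": (data_buf, (DtypeKind.STRING, 8, "u", "=")),      # UTF-8 encoded bytes
        "validity": (mask_buf, (DtypeKind.BOOL, 1, "b", "=")),    # 1-bit validity mask
        "offsets": (offsets_buf, (DtypeKind.INT, 64, "l", "=")),  # int64 offsets into the data
    }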
@@ -43,19 +103,22 @@ class Buffer:
     """
 
     @property
+    @abstractmethod
     def bufsize(self) -> int:
         """
         Buffer size in bytes.
         """
         pass
 
     @property
+    @abstractmethod
     def ptr(self) -> int:
         """
         Pointer to start of the buffer as an integer.
         """
         pass
 
+    @abstractmethod
     def __dlpack__(self):
         """
         Produce DLPack capsule (see array API standard).
@@ -70,18 +133,17 @@ def __dlpack__(self):
         """
         raise NotImplementedError("__dlpack__")
 
-    def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]:
+    @abstractmethod
+    def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
         """
         Device type and device ID for where the data in the buffer resides.
-
         Uses device type codes matching DLPack.
-
         Note: must be implemented even if ``__dlpack__`` is not.
         """
         pass
 
 
-class Column:
+class Column(ABC):
     """
     A column object, with only the methods and properties required by the
     interchange protocol defined.
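With ``Buffer`` now an ABC, a producer has to implement ``bufsize``, ``ptr``, ``__dlpack__`` and ``__dlpack_device__``. A minimal sketch of a CPU buffer wrapping a contiguous NumPy array, with hypothetical names and assuming NumPy >= 1.22 for ``ndarray.__dlpack__`` (an illustration, not part of this commit):

from typing import Optional, Tuple

import numpy as np


class NumpyBuffer(Buffer):
    """Hypothetical Buffer implementation backed by a contiguous 1-D NumPy array."""

    def __init__(self, array: np.ndarray) -> None:
        if not array.flags["C_CONTIGUOUS"]:
            raise ValueError("the protocol requires contiguous buffers")
        self._array = array

    @property
    def bufsize(self) -> int:
        return self._array.nbytes  # buffer size in bytes

    @property
    def ptr(self) -> int:
        return self._array.__array_interface__["data"][0]  # start address as an integer

    def __dlpack__(self):
        return self._array.__dlpack__()  # DLPack capsule, available in NumPy >= 1.22

    def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
        return (DlpackDeviceType.CPU, None)  # plain CPU memory, no device id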
@@ -123,10 +185,10 @@ class Column:
 
     Note: this Column object can only be produced by ``__dataframe__``, so
           doesn't need its own version or ``__column__`` protocol.
-
     """
 
     @property
+    @abstractmethod
     def size(self) -> Optional[int]:
         """
         Size of the column, in elements.
@@ -137,6 +199,7 @@ def size(self) -> Optional[int]:
         pass
 
     @property
+    @abstractmethod
     def offset(self) -> int:
         """
         Offset of first element.
@@ -148,6 +211,7 @@ def offset(self) -> int:
         pass
 
     @property
+    @abstractmethod
     def dtype(self) -> Tuple[DtypeKind, int, str, str]:
         """
         Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
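For concreteness (illustrative values, not taken from this commit): existing implementations typically fill the format-string slot with Apache Arrow C Data Interface style codes, so ``dtype`` tuples could look roughly like:

int64_dtype = (DtypeKind.INT, 64, "l", "=")         # 64-bit signed integer, native endianness
float32_dtype = (DtypeKind.FLOAT, 32, "f", "=")     # 32-bit float
bool_bitmask_dtype = (DtypeKind.BOOL, 1, "b", "=")  # 1-bit boolean, as used for validity masks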
@@ -158,7 +222,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
         Endianness : current only native endianness (``=``) is supported
 
         Notes:
-
         - Kind specifiers are aligned with DLPack where possible (hence the
           jump to 20, leave enough room for future extension)
         - Masks must be specified as boolean with either bit width 1 (for bit
@@ -180,17 +243,16 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
         pass
 
     @property
-    def describe_categorical(self) -> Dict[bool, bool, Optional[dict]]:
+    @abstractmethod
+    def describe_categorical(self) -> Tuple[bool, bool, Optional[dict]]:
         """
         If the dtype is categorical, there are two options:
-
         - There are only values in the data buffer.
         - There is a separate dictionary-style encoding for categorical values.
 
-        Raises RuntimeError if the dtype is not categorical
-
-        Content of returned dict:
+        Raises TypeError if the dtype is not categorical
 
+        Returns the description on how to interpret the data buffer:
         - "is_ordered" : bool, whether the ordering of dictionary indices is
           semantically meaningful.
         - "is_dictionary" : bool, whether a dictionary-style mapping of
@@ -203,6 +265,7 @@ def describe_categorical(self) -> Dict[bool, bool, Optional[dict]]:
         pass
 
     @property
+    @abstractmethod
     def describe_null(self) -> Tuple[ColumnNullType, Any]:
         """
         Return the missing value (or "null") representation the column dtype
@@ -215,6 +278,7 @@
         pass
 
     @property
+    @abstractmethod
     def null_count(self) -> Optional[int]:
         """
         Number of null elements, if known.
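Together with the new ``ColumnNullType`` enum, ``describe_null`` pairs the null kind with the value that marks a null. Some plausible return values (illustrative, not taken from this commit):

(ColumnNullType.NON_NULLABLE, None)  # the column cannot contain nulls
(ColumnNullType.USE_NAN, None)       # nulls are encoded as NaN/NaT in the data itself
(ColumnNullType.USE_SENTINEL, -1)    # the sentinel value -1 marks a null
(ColumnNullType.USE_BITMASK, 0)      # a 0 bit in the validity buffer marks a null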
@@ -224,18 +288,21 @@ def null_count(self) -> Optional[int]:
         pass
 
     @property
+    @abstractmethod
     def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the column. See `DataFrame.metadata` for more details.
         """
         pass
 
+    @abstractmethod
     def num_chunks(self) -> int:
         """
         Return the number of chunks the column consists of.
         """
         pass
 
+    @abstractmethod
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
         """
         Return an iterator yielding the chunks.
@@ -244,7 +311,8 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
         """
         pass
 
-    def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]:
+    @abstractmethod
+    def get_buffers(self) -> ColumnBuffers:
         """
         Return a dictionary containing the underlying buffers.
 
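With ``get_buffers`` now typed as returning ``ColumnBuffers``, a consumer can walk a column chunk by chunk and inspect the raw buffers. A rough consumer-side sketch (a hypothetical helper, assuming the classes defined in this file are importable):

def dump_column(col: Column) -> None:
    """Print basic buffer information for every chunk of a protocol column."""
    for chunk in col.get_chunks():
        bufs = chunk.get_buffers()
        data_buf, data_dtype = bufs["data"]
        print("data:", data_dtype, data_buf.bufsize, hex(data_buf.ptr))
        if bufs["validity"] is not None:
            mask_buf, mask_dtype = bufs["validity"]
            print("validity:", mask_dtype, mask_buf.bufsize)
        if bufs["offsets"] is not None:
            offsets_buf, offsets_dtype = bufs["offsets"]
            print("offsets:", offsets_dtype, offsets_buf.bufsize)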
@@ -275,7 +343,7 @@ def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
 #        pass
 
 
-class DataFrame:
+class DataFrame(ABC):
     """
     A data frame class, with only the methods required by the interchange
     protocol defined.
@@ -289,29 +357,11 @@ class DataFrame:
     ``__dataframe__`` method of a public data frame class in a library adhering
     to the dataframe interchange protocol specification.
     """
-    def __dataframe__(self, nan_as_null : bool = False,
-                      allow_copy : bool = True) -> dict:
-        """
-        Produces a dictionary object following the dataframe protocol specification.
 
-        ``nan_as_null`` is a keyword intended for the consumer to tell the
-        producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
-        It is intended for cases where the consumer does not support the bit
-        mask or byte mask that is the producer's native representation.
-
-        ``allow_copy`` is a keyword that defines whether or not the library is
-        allowed to make a copy of the data. For example, copying data would be
-        necessary if a library supports strided buffers, given that this protocol
-        specifies contiguous buffers.
-        """
-        self._nan_as_null = nan_as_null
-        self._allow_zero_zopy = allow_copy
-        return {
-            "dataframe": self, # DataFrame object adhering to the protocol
-            "version": 0 # Version number of the protocol
-        }
+    version = 0 # version of the protocol
 
     @property
+    @abstractmethod
     def metadata(self) -> Dict[str, Any]:
         """
         The metadata for the data frame, as a dictionary with string keys. The
@@ -324,12 +374,14 @@ def metadata(self) -> Dict[str, Any]:
         """
         pass
 
+    @abstractmethod
     def num_columns(self) -> int:
         """
         Return the number of columns in the DataFrame.
         """
         pass
 
+    @abstractmethod
     def num_rows(self) -> Optional[int]:
         # TODO: not happy with Optional, but need to flag it may be expensive
         # why include it if it may be None - what do we expect consumers
@@ -339,48 +391,56 @@ def num_rows(self) -> Optional[int]:
         """
         pass
 
+    @abstractmethod
     def num_chunks(self) -> int:
         """
         Return the number of chunks the DataFrame consists of.
         """
         pass
 
+    @abstractmethod
     def column_names(self) -> Iterable[str]:
         """
         Return an iterator yielding the column names.
         """
         pass
 
+    @abstractmethod
    def get_column(self, i: int) -> Column:
         """
         Return the column at the indicated position.
         """
         pass
 
+    @abstractmethod
     def get_column_by_name(self, name: str) -> Column:
         """
         Return the column whose name is the indicated name.
         """
         pass
 
+    @abstractmethod
     def get_columns(self) -> Iterable[Column]:
         """
         Return an iterator yielding the columns.
         """
         pass
 
+    @abstractmethod
     def select_columns(self, indices: Sequence[int]) -> "DataFrame":
         """
         Create a new DataFrame by selecting a subset of columns by index.
         """
         pass
 
+    @abstractmethod
     def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame":
         """
         Create a new DataFrame by selecting a subset of columns by name.
         """
         pass
 
+    @abstractmethod
     def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["DataFrame"]:
         """
         Return an iterator yielding the chunks.
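Dropping ``__dataframe__`` from the spec's ``DataFrame`` class and adding the class-level ``version`` attribute matches how existing implementations are laid out: per the docstring above, the protocol object is returned from the ``__dataframe__`` method of the library's public data frame class. A sketch with hypothetical names (not part of the spec or this commit):

class PublicDataFrame:
    """Hypothetical public data frame class in a producing library."""

    def __init__(self, protocol_df: DataFrame) -> None:
        self._protocol_df = protocol_df  # an object implementing the spec's DataFrame ABC

    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> DataFrame:
        # A real implementation would thread nan_as_null/allow_copy through to the
        # protocol object; the key point is that the protocol DataFrame itself is
        # returned, with the protocol version read from DataFrame.version instead
        # of from a wrapping dict.
        return self._protocol_df


# Consumer side: the version is a class attribute rather than a dict entry.
def protocol_version(public_df: PublicDataFrame) -> int:
    return public_df.__dataframe__().version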
