1
- from typing import Tuple , Optional , Dict , Any , Iterable , Sequence
1
+ from typing import Tuple , Optional , Dict , Any , Iterable , Sequence , TypedDict
2
2
import enum
3
+ from abc import ABC , abstractmethod
4
+
3
5
4
6
class DlpackDeviceType (enum .IntEnum ):
7
+ """Integer enum for device type codes matching DLPack."""
8
+
5
9
CPU = 1
6
10
CUDA = 2
7
11
CPU_PINNED = 3
@@ -11,7 +15,29 @@ class DlpackDeviceType(enum.IntEnum):
11
15
VPI = 9
12
16
ROCM = 10
13
17
18
+
14
19
class DtypeKind (enum .IntEnum ):
20
+ """
21
+ Integer enum for data types.
22
+
23
+ Attributes
24
+ ----------
25
+ INT : int
26
+ Matches a signed integer data type.
27
+ UINT : int
28
+ Matches an unsigned integer data type.
29
+ FLOAT : int
30
+ Matches a floating point data type.
31
+ BOOL : int
32
+ Matches a boolean data type.
33
+ STRING : int
34
+ Matches a string data type (UTF-8 encoded).
35
+ DATETIME : int
36
+ Matches a datetime data type.
37
+ CATEGORICAL : int
38
+ Matches a categorical data type.
39
+ """
40
+
15
41
INT = 0
16
42
UINT = 1
17
43
FLOAT = 2
@@ -20,14 +46,48 @@ class DtypeKind(enum.IntEnum):
20
46
DATETIME = 22
21
47
CATEGORICAL = 23
22
48
23
- class ColumnNullType :
49
+
50
+ class ColumnNullType (enum .IntEnum ):
51
+ """
52
+ Integer enum for null type representation.
53
+
54
+ Attributes
55
+ ----------
56
+ NON_NULLABLE : int
57
+ Non-nullable column.
58
+ USE_NAN : int
59
+ Use explicit float NaN/NaT value.
60
+ USE_SENTINEL : int
61
+ Sentinel value besides NaN/NaT.
62
+ USE_BITMASK : int
63
+ A bit is set/unset to represent a null at a given position.
64
+ USE_BYTEMASK : int
65
+ A byte is set/unset to represent a null at a given position.
66
+ """
67
+
24
68
NON_NULLABLE = 0
25
69
USE_NAN = 1
26
70
USE_SENTINEL = 2
27
71
USE_BITMASK = 3
28
72
USE_BYTEMASK = 4
29
73
30
- class Buffer :
74
+
75
+ class ColumnBuffers (TypedDict ):
76
+ data : Tuple ["Buffer" , Any ] # first element is a buffer containing the column data;
77
+ # second element is the data buffer's associated dtype
78
+ validity : Optional [Tuple ["Buffer" , Any ]] # first element is a buffer containing mask values
79
+ # indicating missing data and second element is
80
+ # the mask value buffer's associated dtype.
81
+ # None if the null representation is not a bit or byte mask
82
+ offsets : Optional [Tuple ["Buffer" , Any ]] # first element is a buffer containing the
83
+ # offset values for variable-size binary data
84
+ # (e.g., variable-length strings) and
85
+ # second element is the offsets buffer's associated dtype.
86
+ # None if the data buffer does not have
87
+ # an associated offsets buffer
88
+
89
+
90
+ class Buffer (ABC ):
31
91
"""
32
92
Data in the buffer is guaranteed to be contiguous in memory.
33
93
@@ -43,19 +103,22 @@ class Buffer:
43
103
"""
44
104
45
105
@property
106
+ @abstractmethod
46
107
def bufsize (self ) -> int :
47
108
"""
48
109
Buffer size in bytes.
49
110
"""
50
111
pass
51
112
52
113
@property
114
+ @abstractmethod
53
115
def ptr (self ) -> int :
54
116
"""
55
117
Pointer to start of the buffer as an integer.
56
118
"""
57
119
pass
58
120
121
+ @abstractmethod
59
122
def __dlpack__ (self ):
60
123
"""
61
124
Produce DLPack capsule (see array API standard).
@@ -70,18 +133,17 @@ def __dlpack__(self):
70
133
"""
71
134
raise NotImplementedError ("__dlpack__" )
72
135
73
- def __dlpack_device__ (self ) -> Tuple [DlpackDeviceType , int ]:
136
+ @abstractmethod
137
+ def __dlpack_device__ (self ) -> Tuple [DlpackDeviceType , Optional [int ]]:
74
138
"""
75
139
Device type and device ID for where the data in the buffer resides.
76
-
77
140
Uses device type codes matching DLPack.
78
-
79
141
Note: must be implemented even if ``__dlpack__`` is not.
80
142
"""
81
143
pass
82
144
83
145
84
- class Column :
146
+ class Column ( ABC ) :
85
147
"""
86
148
A column object, with only the methods and properties required by the
87
149
interchange protocol defined.
@@ -123,10 +185,10 @@ class Column:
123
185
124
186
Note: this Column object can only be produced by ``__dataframe__``, so
125
187
doesn't need its own version or ``__column__`` protocol.
126
-
127
188
"""
128
189
129
190
@property
191
+ @abstractmethod
130
192
def size (self ) -> Optional [int ]:
131
193
"""
132
194
Size of the column, in elements.
@@ -137,6 +199,7 @@ def size(self) -> Optional[int]:
137
199
pass
138
200
139
201
@property
202
+ @abstractmethod
140
203
def offset (self ) -> int :
141
204
"""
142
205
Offset of first element.
@@ -148,6 +211,7 @@ def offset(self) -> int:
148
211
pass
149
212
150
213
@property
214
+ @abstractmethod
151
215
def dtype (self ) -> Tuple [DtypeKind , int , str , str ]:
152
216
"""
153
217
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
@@ -158,7 +222,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
158
222
Endianness : currently only native endianness (``=``) is supported
159
223
160
224
Notes:
161
-
162
225
- Kind specifiers are aligned with DLPack where possible (hence the
163
226
jump to 20, leave enough room for future extension)
164
227
- Masks must be specified as boolean with either bit width 1 (for bit
@@ -180,17 +243,16 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
180
243
pass
181
244
182
245
@property
183
- def describe_categorical (self ) -> Dict [bool , bool , Optional [dict ]]:
246
+ @abstractmethod
247
+ def describe_categorical (self ) -> Tuple [bool , bool , Optional [dict ]]:
184
248
"""
185
249
If the dtype is categorical, there are two options:
186
-
187
250
- There are only values in the data buffer.
188
251
- There is a separate dictionary-style encoding for categorical values.
189
252
190
- Raises RuntimeError if the dtype is not categorical
191
-
192
- Content of returned dict:
253
+ Raises TypeError if the dtype is not categorical
193
254
255
+ Returns a description of how to interpret the data buffer:
194
256
- "is_ordered" : bool, whether the ordering of dictionary indices is
195
257
semantically meaningful.
196
258
- "is_dictionary" : bool, whether a dictionary-style mapping of
@@ -203,6 +265,7 @@ def describe_categorical(self) -> Dict[bool, bool, Optional[dict]]:
203
265
pass
204
266
205
267
@property
268
+ @abstractmethod
206
269
def describe_null (self ) -> Tuple [ColumnNullType , Any ]:
207
270
"""
208
271
Return the missing value (or "null") representation the column dtype
@@ -215,6 +278,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
215
278
pass
216
279
217
280
@property
281
+ @abstractmethod
218
282
def null_count (self ) -> Optional [int ]:
219
283
"""
220
284
Number of null elements, if known.
@@ -224,18 +288,21 @@ def null_count(self) -> Optional[int]:
224
288
pass
225
289
226
290
@property
291
+ @abstractmethod
227
292
def metadata (self ) -> Dict [str , Any ]:
228
293
"""
229
294
The metadata for the column. See `DataFrame.metadata` for more details.
230
295
"""
231
296
pass
232
297
298
+ @abstractmethod
233
299
def num_chunks (self ) -> int :
234
300
"""
235
301
Return the number of chunks the column consists of.
236
302
"""
237
303
pass
238
304
305
+ @abstractmethod
239
306
def get_chunks (self , n_chunks : Optional [int ] = None ) -> Iterable ["Column" ]:
240
307
"""
241
308
Return an iterator yielding the chunks.
@@ -244,7 +311,8 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
244
311
"""
245
312
pass
246
313
247
- def get_buffers (self ) -> Dict [Tuple [Buffer , Any ], Optional [Tuple [Buffer , Any ]], Optional [Tuple [Buffer , Any ]]]:
314
+ @abstractmethod
315
+ def get_buffers (self ) -> ColumnBuffers :
248
316
"""
249
317
Return a dictionary containing the underlying buffers.
250
318
@@ -275,7 +343,7 @@ def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
275
343
# pass
276
344
277
345
278
- class DataFrame :
346
+ class DataFrame ( ABC ) :
279
347
"""
280
348
A data frame class, with only the methods required by the interchange
281
349
protocol defined.
@@ -289,29 +357,11 @@ class DataFrame:
289
357
``__dataframe__`` method of a public data frame class in a library adhering
290
358
to the dataframe interchange protocol specification.
291
359
"""
292
- def __dataframe__ (self , nan_as_null : bool = False ,
293
- allow_copy : bool = True ) -> dict :
294
- """
295
- Produces a dictionary object following the dataframe protocol specification.
296
360
297
- ``nan_as_null`` is a keyword intended for the consumer to tell the
298
- producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
299
- It is intended for cases where the consumer does not support the bit
300
- mask or byte mask that is the producer's native representation.
301
-
302
- ``allow_copy`` is a keyword that defines whether or not the library is
303
- allowed to make a copy of the data. For example, copying data would be
304
- necessary if a library supports strided buffers, given that this protocol
305
- specifies contiguous buffers.
306
- """
307
- self ._nan_as_null = nan_as_null
308
- self ._allow_zero_zopy = allow_copy
309
- return {
310
- "dataframe" : self , # DataFrame object adhering to the protocol
311
- "version" : 0 # Version number of the protocol
312
- }
361
+ version = 0 # version of the protocol
313
362
314
363
@property
364
+ @abstractmethod
315
365
def metadata (self ) -> Dict [str , Any ]:
316
366
"""
317
367
The metadata for the data frame, as a dictionary with string keys. The
@@ -324,12 +374,14 @@ def metadata(self) -> Dict[str, Any]:
324
374
"""
325
375
pass
326
376
377
+ @abstractmethod
327
378
def num_columns (self ) -> int :
328
379
"""
329
380
Return the number of columns in the DataFrame.
330
381
"""
331
382
pass
332
383
384
+ @abstractmethod
333
385
def num_rows (self ) -> Optional [int ]:
334
386
# TODO: not happy with Optional, but need to flag it may be expensive
335
387
# why include it if it may be None - what do we expect consumers
@@ -339,48 +391,56 @@ def num_rows(self) -> Optional[int]:
339
391
"""
340
392
pass
341
393
394
+ @abstractmethod
342
395
def num_chunks (self ) -> int :
343
396
"""
344
397
Return the number of chunks the DataFrame consists of.
345
398
"""
346
399
pass
347
400
401
+ @abstractmethod
348
402
def column_names (self ) -> Iterable [str ]:
349
403
"""
350
404
Return an iterator yielding the column names.
351
405
"""
352
406
pass
353
407
408
+ @abstractmethod
354
409
def get_column (self , i : int ) -> Column :
355
410
"""
356
411
Return the column at the indicated position.
357
412
"""
358
413
pass
359
414
415
+ @abstractmethod
360
416
def get_column_by_name (self , name : str ) -> Column :
361
417
"""
362
418
Return the column whose name is the indicated name.
363
419
"""
364
420
pass
365
421
422
+ @abstractmethod
366
423
def get_columns (self ) -> Iterable [Column ]:
367
424
"""
368
425
Return an iterator yielding the columns.
369
426
"""
370
427
pass
371
428
429
+ @abstractmethod
372
430
def select_columns (self , indices : Sequence [int ]) -> "DataFrame" :
373
431
"""
374
432
Create a new DataFrame by selecting a subset of columns by index.
375
433
"""
376
434
pass
377
435
436
+ @abstractmethod
378
437
def select_columns_by_name (self , names : Sequence [str ]) -> "DataFrame" :
379
438
"""
380
439
Create a new DataFrame by selecting a subset of columns by name.
381
440
"""
382
441
pass
383
442
443
+ @abstractmethod
384
444
def get_chunks (self , n_chunks : Optional [int ] = None ) -> Iterable ["DataFrame" ]:
385
445
"""
386
446
Return an iterator yielding the chunks.
0 commit comments