1
- from typing import Tuple , Optional , Dict , Any , Iterable , Sequence
1
+ from typing import Tuple , Optional , Dict , Any , Iterable , Sequence , TypedDict
2
2
import enum
3
+ from abc import ABC , abstractmethod
4
+
3
5
4
6
class DlpackDeviceType (enum .IntEnum ):
7
+ """Integer enum for device type codes matching DLPack."""
8
+
5
9
CPU = 1
6
10
CUDA = 2
7
11
CPU_PINNED = 3
@@ -11,7 +15,29 @@ class DlpackDeviceType(enum.IntEnum):
11
15
VPI = 9
12
16
ROCM = 10
13
17
18
+
14
19
class DtypeKind (enum .IntEnum ):
20
+ """
21
+ Integer enum for data types.
22
+
23
+ Attributes
24
+ ----------
25
+ INT : int
26
+ Matches a signed integer data type.
27
+ UINT : int
28
+ Matches an unsigned integer data type.
29
+ FLOAT : int
30
+ Matches a floating point data type.
31
+ BOOL : int
32
+ Matches a boolean data type.
33
+ STRING : int
34
+ Matches a string data type (UTF-8 encoded).
35
+ DATETIME : int
36
+ Matches a datetime data type.
37
+ CATEGORICAL : int
38
+ Matches a categorical data type.
39
+ """
40
+
15
41
INT = 0
16
42
UINT = 1
17
43
FLOAT = 2
@@ -20,14 +46,48 @@ class DtypeKind(enum.IntEnum):
20
46
DATETIME = 22
21
47
CATEGORICAL = 23
22
48
23
- class ColumnNullType :
49
+
50
+ class ColumnNullType (enum .IntEnum ):
51
+ """
52
+ Integer enum for null type representation.
53
+
54
+ Attributes
55
+ ----------
56
+ NON_NULLABLE : int
57
+ Non-nullable column.
58
+ USE_NAN : int
59
+ Use explicit float NaN/NaT value.
60
+ USE_SENTINEL : int
61
+ Sentinel value besides NaN/NaT.
62
+ USE_BITMASK : int
63
+ A bit is set/unset to represent a null at a given position.
64
+ USE_BYTEMASK : int
65
+ A byte is set/unset to represent a null at a given position.
66
+ """
67
+
24
68
NON_NULLABLE = 0
25
69
USE_NAN = 1
26
70
USE_SENTINEL = 2
27
71
USE_BITMASK = 3
28
72
USE_BYTEMASK = 4
29
73
30
- class Buffer :
74
+
75
+ class ColumnBuffers (TypedDict ):
76
+ data : Tuple ["Buffer" , Any ] # first element is a buffer containing the column data;
77
+ # second element is the data buffer's associated dtype
78
+ validity : Optional [Tuple ["Buffer" , Any ]] # first element is a buffer containing mask values
79
+ # indicating missing data and second element is
80
+ # the mask value buffer's associated dtype.
81
+ # None if the null representation is not a bit or byte mask
82
+ offsets : Optional [Tuple ["Buffer" , Any ]] # first element is a buffer containing the
83
+ # offset values for variable-size binary data
84
+ # (e.g., variable-length strings) and
85
+ # second element is the offsets buffer's associated dtype.
86
+ # None if the data buffer does not have
87
+ # an associated offsets buffer
88
+
89
+
90
+ class Buffer (ABC ):
31
91
"""
32
92
Data in the buffer is guaranteed to be contiguous in memory.
33
93
@@ -43,19 +103,22 @@ class Buffer:
43
103
"""
44
104
45
105
@property
106
+ @abstractmethod
46
107
def bufsize (self ) -> int :
47
108
"""
48
109
Buffer size in bytes.
49
110
"""
50
111
pass
51
112
52
113
@property
114
+ @abstractmethod
53
115
def ptr (self ) -> int :
54
116
"""
55
117
Pointer to start of the buffer as an integer.
56
118
"""
57
119
pass
58
120
121
+ @abstractmethod
59
122
def __dlpack__ (self ):
60
123
"""
61
124
Produce DLPack capsule (see array API standard).
@@ -70,18 +133,17 @@ def __dlpack__(self):
70
133
"""
71
134
raise NotImplementedError ("__dlpack__" )
72
135
73
- def __dlpack_device__ (self ) -> Tuple [DlpackDeviceType , int ]:
136
+ @abstractmethod
137
+ def __dlpack_device__ (self ) -> Tuple [DlpackDeviceType , Optional [int ]]:
74
138
"""
75
139
Device type and device ID for where the data in the buffer resides.
76
-
77
140
Uses device type codes matching DLPack.
78
-
79
141
Note: must be implemented even if ``__dlpack__`` is not.
80
142
"""
81
143
pass
82
144
83
145
84
- class Column :
146
+ class Column ( ABC ) :
85
147
"""
86
148
A column object, with only the methods and properties required by the
87
149
interchange protocol defined.
@@ -123,10 +185,10 @@ class Column:
123
185
124
186
Note: this Column object can only be produced by ``__dataframe__``, so
125
187
doesn't need its own version or ``__column__`` protocol.
126
-
127
188
"""
128
189
129
190
@property
191
+ @abstractmethod
130
192
def size (self ) -> Optional [int ]:
131
193
"""
132
194
Size of the column, in elements.
@@ -137,6 +199,7 @@ def size(self) -> Optional[int]:
137
199
pass
138
200
139
201
@property
202
+ @abstractmethod
140
203
def offset (self ) -> int :
141
204
"""
142
205
Offset of first element.
@@ -148,6 +211,7 @@ def offset(self) -> int:
148
211
pass
149
212
150
213
@property
214
+ @abstractmethod
151
215
def dtype (self ) -> Tuple [DtypeKind , int , str , str ]:
152
216
"""
153
217
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
@@ -158,7 +222,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
158
222
Endianness : currently only native endianness (``=``) is supported
159
223
160
224
Notes:
161
-
162
225
- Kind specifiers are aligned with DLPack where possible (hence the
163
226
jump to 20, leave enough room for future extension)
164
227
- Masks must be specified as boolean with either bit width 1 (for bit
@@ -180,17 +243,16 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
180
243
pass
181
244
182
245
@property
246
+ @abstractmethod
183
247
def describe_categorical (self ) -> dict [bool , bool , Optional [Column ]]:
184
248
"""
185
249
If the dtype is categorical, there are two options:
186
-
187
250
- There are only values in the data buffer.
188
251
- There is a separate non-categorical Column encoding categorical values.
189
252
190
- Raises RuntimeError if the dtype is not categorical
191
-
192
- Content of returned dict:
253
+ Raises TypeError if the dtype is not categorical
193
254
255
+ Returns a description of how to interpret the data buffer:
194
256
- "is_ordered" : bool, whether the ordering of dictionary indices is
195
257
semantically meaningful.
196
258
- "is_dictionary" : bool, whether a mapping of
@@ -204,6 +266,7 @@ def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
204
266
pass
205
267
206
268
@property
269
+ @abstractmethod
207
270
def describe_null (self ) -> Tuple [ColumnNullType , Any ]:
208
271
"""
209
272
Return the missing value (or "null") representation the column dtype
@@ -216,6 +279,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
216
279
pass
217
280
218
281
@property
282
+ @abstractmethod
219
283
def null_count (self ) -> Optional [int ]:
220
284
"""
221
285
Number of null elements, if known.
@@ -225,18 +289,21 @@ def null_count(self) -> Optional[int]:
225
289
pass
226
290
227
291
@property
292
+ @abstractmethod
228
293
def metadata (self ) -> Dict [str , Any ]:
229
294
"""
230
295
The metadata for the column. See `DataFrame.metadata` for more details.
231
296
"""
232
297
pass
233
298
299
+ @abstractmethod
234
300
def num_chunks (self ) -> int :
235
301
"""
236
302
Return the number of chunks the column consists of.
237
303
"""
238
304
pass
239
305
306
+ @abstractmethod
240
307
def get_chunks (self , n_chunks : Optional [int ] = None ) -> Iterable ["Column" ]:
241
308
"""
242
309
Return an iterator yielding the chunks.
@@ -245,7 +312,8 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable["Column"]:
245
312
"""
246
313
pass
247
314
248
- def get_buffers (self ) -> Dict [Tuple [Buffer , Any ], Optional [Tuple [Buffer , Any ]], Optional [Tuple [Buffer , Any ]]]:
315
+ @abstractmethod
316
+ def get_buffers (self ) -> ColumnBuffers :
249
317
"""
250
318
Return a dictionary containing the underlying buffers.
251
319
@@ -276,7 +344,7 @@ def get_buffers(self) -> Dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
276
344
# pass
277
345
278
346
279
- class DataFrame :
347
+ class DataFrame ( ABC ) :
280
348
"""
281
349
A data frame class, with only the methods required by the interchange
282
350
protocol defined.
@@ -290,29 +358,11 @@ class DataFrame:
290
358
``__dataframe__`` method of a public data frame class in a library adhering
291
359
to the dataframe interchange protocol specification.
292
360
"""
293
- def __dataframe__ (self , nan_as_null : bool = False ,
294
- allow_copy : bool = True ) -> dict :
295
- """
296
- Produces a dictionary object following the dataframe protocol specification.
297
361
298
- ``nan_as_null`` is a keyword intended for the consumer to tell the
299
- producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
300
- It is intended for cases where the consumer does not support the bit
301
- mask or byte mask that is the producer's native representation.
302
-
303
- ``allow_copy`` is a keyword that defines whether or not the library is
304
- allowed to make a copy of the data. For example, copying data would be
305
- necessary if a library supports strided buffers, given that this protocol
306
- specifies contiguous buffers.
307
- """
308
- self ._nan_as_null = nan_as_null
309
- self ._allow_zero_zopy = allow_copy
310
- return {
311
- "dataframe" : self , # DataFrame object adhering to the protocol
312
- "version" : 0 # Version number of the protocol
313
- }
362
+ version = 0 # version of the protocol
314
363
315
364
@property
365
+ @abstractmethod
316
366
def metadata (self ) -> Dict [str , Any ]:
317
367
"""
318
368
The metadata for the data frame, as a dictionary with string keys. The
@@ -325,12 +375,14 @@ def metadata(self) -> Dict[str, Any]:
325
375
"""
326
376
pass
327
377
378
+ @abstractmethod
328
379
def num_columns (self ) -> int :
329
380
"""
330
381
Return the number of columns in the DataFrame.
331
382
"""
332
383
pass
333
384
385
+ @abstractmethod
334
386
def num_rows (self ) -> Optional [int ]:
335
387
# TODO: not happy with Optional, but need to flag it may be expensive
336
388
# why include it if it may be None - what do we expect consumers
@@ -340,48 +392,56 @@ def num_rows(self) -> Optional[int]:
340
392
"""
341
393
pass
342
394
395
+ @abstractmethod
343
396
def num_chunks (self ) -> int :
344
397
"""
345
398
Return the number of chunks the DataFrame consists of.
346
399
"""
347
400
pass
348
401
402
+ @abstractmethod
349
403
def column_names (self ) -> Iterable [str ]:
350
404
"""
351
405
Return an iterator yielding the column names.
352
406
"""
353
407
pass
354
408
409
+ @abstractmethod
355
410
def get_column (self , i : int ) -> Column :
356
411
"""
357
412
Return the column at the indicated position.
358
413
"""
359
414
pass
360
415
416
+ @abstractmethod
361
417
def get_column_by_name (self , name : str ) -> Column :
362
418
"""
363
419
Return the column whose name is the indicated name.
364
420
"""
365
421
pass
366
422
423
+ @abstractmethod
367
424
def get_columns (self ) -> Iterable [Column ]:
368
425
"""
369
426
Return an iterator yielding the columns.
370
427
"""
371
428
pass
372
429
430
+ @abstractmethod
373
431
def select_columns (self , indices : Sequence [int ]) -> "DataFrame" :
374
432
"""
375
433
Create a new DataFrame by selecting a subset of columns by index.
376
434
"""
377
435
pass
378
436
437
+ @abstractmethod
379
438
def select_columns_by_name (self , names : Sequence [str ]) -> "DataFrame" :
380
439
"""
381
440
Create a new DataFrame by selecting a subset of columns by name.
382
441
"""
383
442
pass
384
443
444
+ @abstractmethod
385
445
def get_chunks (self , n_chunks : Optional [int ] = None ) -> Iterable ["DataFrame" ]:
386
446
"""
387
447
Return an iterator yielding the chunks.
0 commit comments