Skip to content

Commit 1496bc8

Browse files
phofl authored and yehoshuadimarsky committed
ENH: Add defaultdict support for dtype in read_csv (pandas-dev#46051)
1 parent 406ab9f commit 1496bc8

File tree

7 files changed

+70
-3
lines changed

7 files changed

+70
-3
lines changed

doc/source/user_guide/io.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,11 @@ dtype : Type name or dict of column -> type, default ``None``
186186
(unsupported with ``engine='python'``). Use ``str`` or ``object`` together
187187
with suitable ``na_values`` settings to preserve and
188188
not interpret dtype.
189+
.. versionadded:: 1.5.0
190+
191+
Support for defaultdict was added. Specify a defaultdict as input where
192+
the default determines the dtype of the columns which are not explicitly
193+
listed.
189194
engine : {``'c'``, ``'python'``, ``'pyarrow'``}
190195
Parser engine to use. The C and pyarrow engines are faster, while the python engine
191196
is currently more feature-complete. Multithreading is currently only supported by

doc/source/whatsnew/v1.5.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ Other enhancements
3939
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
4040
- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
4141
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
42+
- :func:`read_csv` now accepts a ``defaultdict`` for the ``dtype`` parameter; its default value determines the dtype of columns that are not explicitly listed (:issue:`41574`)
4243
- :meth:`DataFrame.rolling` and :meth:`Series.rolling` now support a ``step`` parameter with fixed-length windows (:issue:`15354`)
4344
- Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
4445
- Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)

pandas/_libs/parsers.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) 2012, Lambda Foundry, Inc.
22
# See LICENSE for the license
33
from base64 import decode
4+
from collections import defaultdict
45
from csv import (
56
QUOTE_MINIMAL,
67
QUOTE_NONE,
@@ -964,6 +965,8 @@ cdef class TextReader:
964965

965966
results = {}
966967
nused = 0
968+
is_default_dict_dtype = isinstance(self.dtype, defaultdict)
969+
967970
for i in range(self.table_width):
968971
if i < self.leading_cols:
969972
# Pass through leading columns always
@@ -994,6 +997,8 @@ cdef class TextReader:
994997
col_dtype = self.dtype[name]
995998
elif i in self.dtype:
996999
col_dtype = self.dtype[i]
1000+
elif is_default_dict_dtype:
1001+
col_dtype = self.dtype[name]
9971002
else:
9981003
if self.dtype.names:
9991004
# structured array

pandas/io/parsers/base_parser.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,12 +466,16 @@ def _clean_mapping(self, mapping):
466466
if not isinstance(mapping, dict):
467467
return mapping
468468
clean = {}
469+
# for mypy
470+
assert self.orig_names is not None
471+
469472
for col, v in mapping.items():
470-
# for mypy
471-
assert self.orig_names is not None
472473
if isinstance(col, int) and col not in self.orig_names:
473474
col = self.orig_names[col]
474475
clean[col] = v
476+
if isinstance(mapping, defaultdict):
477+
remaining_cols = set(self.orig_names) - set(clean.keys())
478+
clean.update({col: mapping[col] for col in remaining_cols})
475479
return clean
476480

477481
@final

pandas/io/parsers/c_parser_wrapper.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from collections import defaultdict
34
from typing import (
45
Hashable,
56
Mapping,
@@ -415,7 +416,14 @@ def ensure_dtype_objs(
415416
Ensure we have either None, a dtype object, or a dictionary mapping to
416417
dtype objects.
417418
"""
418-
if isinstance(dtype, dict):
419+
if isinstance(dtype, defaultdict):
420+
# "None" not callable [misc]
421+
default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc]
422+
dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
423+
for key in dtype.keys():
424+
dtype_converted[key] = pandas_dtype(dtype[key])
425+
return dtype_converted
426+
elif isinstance(dtype, dict):
419427
return {k: pandas_dtype(dtype[k]) for k in dtype}
420428
elif dtype is not None:
421429
return pandas_dtype(dtype)

pandas/io/parsers/readers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,12 @@
168168
to preserve and not interpret dtype.
169169
If converters are specified, they will be applied INSTEAD
170170
of dtype conversion.
171+
172+
.. versionadded:: 1.5.0
173+
174+
Support for defaultdict was added. Specify a defaultdict as input where
175+
the default determines the dtype of the columns which are not explicitly
176+
listed.
171177
engine : {{'c', 'python', 'pyarrow'}}, optional
172178
Parser engine to use. The C and pyarrow engines are faster, while the python engine
173179
is currently more feature-complete. Multithreading is currently only supported by

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Tests dtype specification during parsing
33
for all of the parsers defined in parsers.py
44
"""
5+
from collections import defaultdict
56
from io import StringIO
67

78
import numpy as np
@@ -343,3 +344,40 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
343344
)
344345
actual = parser.read_csv(StringIO(data), dtype=dtype)
345346
tm.assert_frame_equal(actual, expected)
347+
348+
349+
@pytest.mark.parametrize("default", ["float", "float64"])
350+
def test_dtypes_defaultdict(all_parsers, default):
351+
# GH#41574
352+
data = """a,b
353+
1,2
354+
"""
355+
dtype = defaultdict(lambda: default, a="int64")
356+
parser = all_parsers
357+
result = parser.read_csv(StringIO(data), dtype=dtype)
358+
expected = DataFrame({"a": [1], "b": 2.0})
359+
tm.assert_frame_equal(result, expected)
360+
361+
362+
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
363+
# GH#41574
364+
data = """a,b,a,b,b.1
365+
1,2,3,4,5
366+
"""
367+
dtype = defaultdict(lambda: "float64", a="int64")
368+
dtype["b.1"] = "int64"
369+
parser = all_parsers
370+
result = parser.read_csv(StringIO(data), dtype=dtype)
371+
expected = DataFrame({"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]})
372+
tm.assert_frame_equal(result, expected)
373+
374+
375+
def test_dtypes_defaultdict_invalid(all_parsers):
376+
# GH#41574
377+
data = """a,b
378+
1,2
379+
"""
380+
dtype = defaultdict(lambda: "invalid_dtype", a="int64")
381+
parser = all_parsers
382+
with pytest.raises(TypeError, match="not understood"):
383+
parser.read_csv(StringIO(data), dtype=dtype)

0 commit comments

Comments
 (0)