Skip to content

Commit 025b5dc

Browse files
heckeop authored and jreback committed
BUG: Raise ValueError if a column index in usecols is out of bounds. … (#25686)
1 parent 1d4c89f commit 025b5dc

File tree

3 files changed

+37
-2
lines changed

3 files changed

+37
-2
lines changed

doc/source/whatsnew/v0.25.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ I/O
353353
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
354354
- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
355355
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
356-
356+
- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
357357

358358
Plotting
359359
^^^^^^^^

pandas/io/parsers.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1899,6 +1899,12 @@ def __init__(self, src, **kwds):
18991899
not set(usecols).issubset(self.orig_names)):
19001900
_validate_usecols_names(usecols, self.orig_names)
19011901

1902+
# GH 25623
1903+
# validate that column indices in usecols are not out of bounds
1904+
elif self.usecols_dtype == 'integer':
1905+
indices = lrange(self._reader.table_width)
1906+
_validate_usecols_names(usecols, indices)
1907+
19021908
if len(self.names) > len(usecols):
19031909
self.names = [n for i, n in enumerate(self.names)
19041910
if (i in usecols or n in usecols)]
@@ -2202,7 +2208,8 @@ def __init__(self, f, **kwds):
22022208
self.skipinitialspace = kwds['skipinitialspace']
22032209
self.lineterminator = kwds['lineterminator']
22042210
self.quoting = kwds['quoting']
2205-
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
2211+
self.usecols, self.usecols_dtype = _validate_usecols_arg(
2212+
kwds['usecols'])
22062213
self.skip_blank_lines = kwds['skip_blank_lines']
22072214

22082215
self.warn_bad_lines = kwds['warn_bad_lines']
@@ -2592,6 +2599,13 @@ def _infer_columns(self):
25922599
if clear_buffer:
25932600
self._clear_buffer()
25942601

2602+
# GH 25623
2603+
# validate that column indices in usecols are not out of bounds
2604+
if self.usecols_dtype == 'integer':
2605+
for col in columns:
2606+
indices = lrange(len(col))
2607+
_validate_usecols_names(self.usecols, indices)
2608+
25952609
if names is not None:
25962610
if ((self.usecols is not None and
25972611
len(names) != len(self.usecols)) or
@@ -2627,6 +2641,11 @@ def _infer_columns(self):
26272641
ncols = len(line)
26282642
num_original_columns = ncols
26292643

2644+
# GH 25623
2645+
# validate that column indices in usecols are not out of bounds
2646+
if self.usecols_dtype == 'integer':
2647+
_validate_usecols_names(self.usecols, lrange(ncols))
2648+
26302649
if not names:
26312650
if self.prefix:
26322651
columns = [['{prefix}{idx}'.format(

pandas/tests/io/parser/test_usecols.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,22 @@
2121
"expected but not found: {0}")
2222

2323

24+
@pytest.mark.parametrize("names,usecols,missing", [
25+
(None, [0, 3], r"\[3\]"),
26+
(["a", "b", "c"], [0, -1, 2], r"\[-1\]"),
27+
(None, [3], r"\[3\]"),
28+
(["a"], [3], r"\[3\]")
29+
])
30+
def test_usecols_out_of_bounds(all_parsers, names, usecols, missing):
31+
# See gh-25623
32+
data = "a,b,c\n1,2,3\n4,5,6"
33+
parser = all_parsers
34+
35+
mssg = _msg_validate_usecols_names.format(missing)
36+
with pytest.raises(ValueError, match=mssg):
37+
parser.read_csv(StringIO(data), usecols=usecols, names=names)
38+
39+
2440
def test_raise_on_mixed_dtype_usecols(all_parsers):
2541
# See gh-12678
2642
data = """a,b,c

0 commit comments

Comments
 (0)