BUG: Raise ValueError if a column index in usecols is out of bounds. … (#25686)

heckeop · jreback · commit 025b5dcbcf27 · 2019-03-30T13:36:22.000-04:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -353,7 +353,7 @@ I/O
 - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
 - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
 - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
-
+- Bug in :func:`read_csv` that would not raise a ``ValueError`` when a column index in ``usecols`` was out of bounds (:issue:`25623`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1899,6 +1899,12 @@ def __init__(self, src, **kwds):
                     not set(usecols).issubset(self.orig_names)):
                 _validate_usecols_names(usecols, self.orig_names)
 
+            # GH 25623
+            # validate that column indices in usecols are not out of bounds
+            elif self.usecols_dtype == 'integer':
+                indices = lrange(self._reader.table_width)
+                _validate_usecols_names(usecols, indices)
+
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
@@ -2202,7 +2208,8 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
-        self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
+        self.usecols, self.usecols_dtype = _validate_usecols_arg(
+            kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
         self.warn_bad_lines = kwds['warn_bad_lines']
@@ -2592,6 +2599,13 @@ def _infer_columns(self):
             if clear_buffer:
                 self._clear_buffer()
 
+            # GH 25623
+            # validate that column indices in usecols are not out of bounds
+            if self.usecols_dtype == 'integer':
+                for col in columns:
+                    indices = lrange(len(col))
+                    _validate_usecols_names(self.usecols, indices)
+
             if names is not None:
                 if ((self.usecols is not None and
                      len(names) != len(self.usecols)) or
@@ -2627,6 +2641,11 @@ def _infer_columns(self):
             ncols = len(line)
             num_original_columns = ncols
 
+            # GH 25623
+            # validate that column indices in usecols are not out of bounds
+            if self.usecols_dtype == 'integer':
+                _validate_usecols_names(self.usecols, lrange(ncols))
+
             if not names:
                 if self.prefix:
                     columns = [['{prefix}{idx}'.format(
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
@@ -21,6 +21,22 @@
                                "expected but not found: {0}")
 
 
+@pytest.mark.parametrize("names,usecols,missing", [
+    (None, [0, 3], r"\[3\]"),
+    (["a", "b", "c"], [0, -1, 2], r"\[-1\]"),
+    (None, [3], r"\[3\]"),
+    (["a"], [3], r"\[3\]")
+])
+def test_usecols_out_of_bounds(all_parsers, names, usecols, missing):
+    # See gh-25623
+    data = "a,b,c\n1,2,3\n4,5,6"
+    parser = all_parsers
+
+    mssg = _msg_validate_usecols_names.format(missing)
+    with pytest.raises(ValueError, match=mssg):
+        parser.read_csv(StringIO(data), usecols=usecols, names=names)
+
+
 def test_raise_on_mixed_dtype_usecols(all_parsers):
     # See gh-12678
     data = """a,b,c