BUG: Check that values for "nrows" and "chunksize" are valid

toobaz · toobaz · commit b21fdcf1dfe7 · 2017-03-21T23:53:07.000+01:00
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -815,6 +815,7 @@ Bug Fixes
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
 - Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
+- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
 - Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -344,24 +344,34 @@
 """ % (_parser_params % (_fwf_widths, ''))
 
 
-def _validate_nrows(nrows):
+def _validate_integer(name, val, min_val=0):
     """
-    Checks whether the 'nrows' parameter for parsing is either
+    Checks whether the 'name' parameter for parsing is either
     an integer OR float that can SAFELY be cast to an integer
     without losing accuracy. Raises a ValueError if that is
     not the case.
+
+    Parameters
+    ----------
+    name : string
+        Parameter name (used for error reporting)
+    val : int or float
+        The value to check
+    min_val : int
+        Minimum allowed value (val < min_val will result in a ValueError)
     """
-    msg = "'nrows' must be an integer"
+    msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
+                                                               min_val=min_val)
 
-    if nrows is not None:
-        if is_float(nrows):
-            if int(nrows) != nrows:
+    if val is not None:
+        if is_float(val):
+            if int(val) != val:
                 raise ValueError(msg)
-            nrows = int(nrows)
-        elif not is_integer(nrows):
+            val = int(val)
+        elif not (is_integer(val) and val >= min_val):
             raise ValueError(msg)
 
-    return nrows
+    return val
 
 
 def _read(filepath_or_buffer, kwds):
@@ -383,8 +393,8 @@ def _read(filepath_or_buffer, kwds):
 
     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get('iterator', False)
-    chunksize = kwds.get('chunksize', None)
-    nrows = _validate_nrows(kwds.get('nrows', None))
+    chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
+    nrows = _validate_integer('nrows', kwds.get('nrows', None))
 
     # Create the parser.
     parser = TextFileReader(filepath_or_buffer, **kwds)
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -384,14 +384,17 @@ def test_read_nrows(self):
         df = self.read_csv(StringIO(self.data1), nrows=3.0)
         tm.assert_frame_equal(df, expected)
 
-        msg = "must be an integer"
+        msg = r"'nrows' must be an integer >=0"
 
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows=1.2)
 
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows='foo')
 
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), nrows=-1)
+
     def test_read_chunksize(self):
         reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
         df = self.read_csv(StringIO(self.data1), index_col=0)
@@ -402,6 +405,18 @@ def test_read_chunksize(self):
         tm.assert_frame_equal(chunks[1], df[2:4])
         tm.assert_frame_equal(chunks[2], df[4:])
 
+        # with invalid chunksize value:
+        msg = r"'chunksize' must be an integer >=1"
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize=1.3)
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize='foo')
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize=0)
+
     def test_read_chunksize_and_nrows(self):
 
         # gh-15755