Skip to content

Commit b21fdcf

Browse files
committed
BUG: Check that values for "nrows" and "chunksize" are valid
1 parent 1c9d46a commit b21fdcf

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,7 @@ Bug Fixes
815815
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
816816
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
817817
- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
818+
- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
818819
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
819820
- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
820821
- Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)

pandas/io/parsers.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -344,24 +344,34 @@
344344
""" % (_parser_params % (_fwf_widths, ''))
345345

346346

347-
def _validate_nrows(nrows):
347+
def _validate_integer(name, val, min_val=0):
348348
"""
349-
Checks whether the 'nrows' parameter for parsing is either
349+
Checks whether the 'name' parameter for parsing is either
350350
an integer OR float that can SAFELY be cast to an integer
351351
without losing accuracy. Raises a ValueError if that is
352352
not the case.
353+
354+
Parameters
355+
----------
356+
name : string
357+
Parameter name (used for error reporting)
358+
val : int or float
359+
The value to check
360+
min_val : int
361+
Minimum allowed value (val < min_val will result in a ValueError)
353362
"""
354-
msg = "'nrows' must be an integer"
363+
msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
364+
min_val=min_val)
355365

356-
if nrows is not None:
357-
if is_float(nrows):
358-
if int(nrows) != nrows:
366+
if val is not None:
367+
if is_float(val):
368+
if int(val) != val:
359369
raise ValueError(msg)
360-
nrows = int(nrows)
361-
elif not is_integer(nrows):
370+
val = int(val)
371+
elif not (is_integer(val) and val >= min_val):
362372
raise ValueError(msg)
363373

364-
return nrows
374+
return val
365375

366376

367377
def _read(filepath_or_buffer, kwds):
@@ -383,8 +393,8 @@ def _read(filepath_or_buffer, kwds):
383393

384394
# Extract some of the arguments (pass chunksize on).
385395
iterator = kwds.get('iterator', False)
386-
chunksize = kwds.get('chunksize', None)
387-
nrows = _validate_nrows(kwds.get('nrows', None))
396+
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
397+
nrows = _validate_integer('nrows', kwds.get('nrows', None))
388398

389399
# Create the parser.
390400
parser = TextFileReader(filepath_or_buffer, **kwds)

pandas/tests/io/parser/common.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,14 +384,17 @@ def test_read_nrows(self):
384384
df = self.read_csv(StringIO(self.data1), nrows=3.0)
385385
tm.assert_frame_equal(df, expected)
386386

387-
msg = "must be an integer"
387+
msg = r"'nrows' must be an integer >=0"
388388

389389
with tm.assertRaisesRegexp(ValueError, msg):
390390
self.read_csv(StringIO(self.data1), nrows=1.2)
391391

392392
with tm.assertRaisesRegexp(ValueError, msg):
393393
self.read_csv(StringIO(self.data1), nrows='foo')
394394

395+
with tm.assertRaisesRegexp(ValueError, msg):
396+
self.read_csv(StringIO(self.data1), nrows=-1)
397+
395398
def test_read_chunksize(self):
396399
reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
397400
df = self.read_csv(StringIO(self.data1), index_col=0)
@@ -402,6 +405,18 @@ def test_read_chunksize(self):
402405
tm.assert_frame_equal(chunks[1], df[2:4])
403406
tm.assert_frame_equal(chunks[2], df[4:])
404407

408+
# with invalid chunksize value:
409+
msg = r"'chunksize' must be an integer >=1"
410+
411+
with tm.assertRaisesRegexp(ValueError, msg):
412+
self.read_csv(StringIO(self.data1), chunksize=1.3)
413+
414+
with tm.assertRaisesRegexp(ValueError, msg):
415+
self.read_csv(StringIO(self.data1), chunksize='foo')
416+
417+
with tm.assertRaisesRegexp(ValueError, msg):
418+
self.read_csv(StringIO(self.data1), chunksize=0)
419+
405420
def test_read_chunksize_and_nrows(self):
406421

407422
# gh-15755

0 commit comments

Comments
 (0)