pandas-dev · nikoskaragiannakis · Mar 20, 2018 · Mar 20, 2018 · Mar 20, 2018 · Mar 20, 2018
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -984,6 +984,8 @@ I/O
 - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)
 - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`)
 - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
+- Bug in :func:`read_excel` and :class:`TextReader` where ``np.nan`` was converted to the string ``'nan'`` when ``dtype=str``; it is now converted to an empty string (:issue:`20377`)
+
 
 Plotting
 ^^^^^^^^

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -465,7 +465,8 @@ cpdef ndarray[object] astype_unicode(ndarray arr):
     for i in range(n):
         # we can use the unsafe version because we know `result` is mutable
         # since it was created from `np.empty`
-        util.set_value_at_unsafe(result, i, unicode(arr[i]))
+        arr_i = arr[i]
+        util.set_value_at_unsafe(result, i, unicode(arr_i) if arr_i is not np.nan else '')
 
     return result
 
@@ -478,7 +479,8 @@ cpdef ndarray[object] astype_str(ndarray arr):
     for i in range(n):
         # we can use the unsafe version because we know `result` is mutable
         # since it was created from `np.empty`
-        util.set_value_at_unsafe(result, i, str(arr[i]))
+        arr_i = arr[i]
+        util.set_value_at_unsafe(result, i, str(arr_i) if arr_i is not np.nan else '')
 
     return result
 

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1217,17 +1217,28 @@ cdef class TextReader:
                 return result, 0
 
             # treat as a regular string parsing
-            return self._string_convert(i, start, end, na_filter,
-                                        na_hashset)
+            res, na_count = self._string_convert(i, start, end, na_filter,
+                                                 na_hashset)
+
+            for i in range(len(res)):
+                if res[i] is np.nan:
+                    res[i] = ''
+            return res, na_count
+
         elif dtype.kind == 'U':
             width = dtype.itemsize
             if width > 0:
                 raise TypeError("the dtype %s is not "
                                 "supported for parsing" % dtype)
 
             # unicode variable width
-            return self._string_convert(i, start, end, na_filter,
-                                        na_hashset)
+            res, na_count = self._string_convert(i, start, end, na_filter,
+                                                 na_hashset)
+            for i in range(len(res)):
+                if res[i] is np.nan:
+                    res[i] = ''
+            return res, na_count
+
         elif is_categorical_dtype(dtype):
             # TODO: I suspect that _categorical_convert could be
             # optimized when dtype is an instance of CategoricalDtype

diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -361,6 +361,35 @@ def test_reader_dtype(self, ext):
         with pytest.raises(ValueError):
             actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
 
+    def test_reader_dtype_str(self, ext):
+        # GH 20377: with dtype=str a missing cell reads as '' (not 'nan')
+        basename = 'testdtype'
+        actual = self.get_exceldf(basename, ext)
+
+        expected = DataFrame({
+            'a': [1, 2, 3, 4],
+            'b': [2.5, 3.5, 4.5, 5.5],
+            'c': [1, 2, 3, 4],
+            'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
+                columns=['a', 'b', 'c', 'd'])
+
+        tm.assert_frame_equal(actual, expected)
+
+        actual = self.get_exceldf(basename, ext,
+                                  dtype={'a': 'float64',
+                                         'b': 'float32',
+                                         'c': str,
+                                         'd': str})
+
+        expected['a'] = expected['a'].astype('float64')
+        expected['b'] = expected['b'].astype('float32')
+        expected['c'] = ['001', '002', '003', '004']
+        expected['d'] = ['1', '2', '', '4']
+        tm.assert_frame_equal(actual, expected)
+
+        with pytest.raises(ValueError):
+            actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+
     def test_reading_all_sheets(self, ext):
         # Test reading all sheetnames by setting sheetname to None,
         # Ensure a dict is returned.

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
@@ -142,7 +142,7 @@ def test_astype_datetime64tz(self):
                                                 tm.rands(1000)]),
                                         Series([string.digits * 10,
                                                 tm.rands(63),
-                                                tm.rands(64), nan, 1.0])])
+                                                tm.rands(64), '', 1.0])])
     def test_astype_str_map(self, dtype, series):
         # see gh-4405
         result = series.astype(dtype)

diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
@@ -162,6 +162,16 @@ def test_to_csv_compression(self, compression):
                                                    index_col=0,
                                                    squeeze=True))
 
+    def test_from_csv_dtype_str(self):
+        # GH20377: NaN written to CSV reads back as '' when dtype=str
+        s = Series([1, 2, np.nan, 4], index=['A', 'B', 'C', 'D'],
+                   name='X')
+        with ensure_clean() as filename:
+            s.to_csv(filename, header=True)
+            rs = pd.read_csv(filename, dtype=str)
+            expected = Series(['1.0', '2.0', '', '4.0'], name=s.name)
+            assert_series_equal(rs.X, expected)
+
 
 class TestSeriesIO(TestData):