pandas-dev · WillAyd · Mar 12, 2020 · Mar 8, 2020 · Mar 8, 2020 · Mar 8, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -302,6 +302,7 @@ I/O
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
+- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
 
 
 Plotting

diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h
@@ -34,6 +34,10 @@ int floatify(PyObject *str, double *result, int *maybe_int) {
         data = PyBytes_AS_STRING(str);
     } else if (PyUnicode_Check(str)) {
         tmp = PyUnicode_AsUTF8String(str);
+        if (tmp == NULL) {
+            PyErr_SetString(PyExc_TypeError, "UTF8 decode error");
+            return -1;
+        }
         data = PyBytes_AS_STRING(tmp);
     } else {
         PyErr_SetString(PyExc_TypeError, "Invalid object type");

diff --git a/pandas/tests/io/data/excel/high_surrogate.xlsx b/pandas/tests/io/data/excel/high_surrogate.xlsx
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1044,3 +1044,9 @@ def test_excel_read_binary(self, engine, read_ext):
 
         actual = pd.read_excel(data, engine=engine)
         tm.assert_frame_equal(expected, actual)
+
+    def test_excel_high_surrogate(self, read_ext):
+        # GH 23809
+
+        # should not produce a segmentation violation
+        pd.read_excel("high_surrogate.xlsx")