pandas-dev · jreback · Jul 3, 2019 · Feb 27, 2019 · Feb 28, 2019 · Feb 28, 2019
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
@@ -15,6 +15,7 @@ dependencies:
   - nomkl
   - numexpr
   - numpy=1.15.*
+  - odfpy
   - openpyxl
   - pandas-gbq
   # https://github.com/pydata/pandas-gbq/issues/271

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -41,6 +41,7 @@ Other Enhancements
 - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
 - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`)
 - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`)
+- :func:`pandas.io.excel.read_excel` now supports reading OpenDocument tables. Specify ``engine='odf'`` to enable. (:issue:`9070`)
 
 .. _whatsnew_0250.api_breaking:
 

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -780,9 +780,11 @@ class ExcelFile:
     """
 
     from pandas.io.excel._xlrd import _XlrdReader
+    from pandas.io.excel._odfreader import ODFReader
 
     _engines = {
         'xlrd': _XlrdReader,
+        'odf': ODFReader,
     }
 
     def __init__(self, io, engine=None):

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -0,0 +1,164 @@
+import pandas as pd
+
+from pandas.io.parsers import TextParser
+
+
+class ODFReader:
+    """Read tables out of OpenDocument formatted files.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str or file-like
+        Path to the document, or an open readable stream; it is passed
+        straight to ``odf.opendocument.load``.
+    """
+    def __init__(self, filepath_or_buffer):
+        # odfpy is optional, so it is imported only when a reader is built.
+        try:
+            from odf.opendocument import load as document_load
+            from odf.table import Table
+        except ImportError:
+            raise ImportError("Install odfpy >= 1.3 for OpenDocument support")
+
+        self.filepath_or_buffer = filepath_or_buffer
+        self.document = document_load(filepath_or_buffer)
+        self.tables = self.document.getElementsByType(Table)
+
+    @property
+    def sheet_names(self):
+        """Return a list of sheet names present in the document"""
+        from odf.namespaces import TABLENS
+        return [t.attributes[(TABLENS, 'name')] for t in self.tables]
+
+    def get_sheet_by_index(self, index):
+        """Return the table at position ``index``; IndexError if absent."""
+        return self.tables[index]
+
+    def get_sheet_by_name(self, name):
+        """Return the table called ``name``; ValueError if absent."""
+        return self.tables[self.sheet_names.index(name)]
+
+    def _get_sheet(self, name):
+        """Given a sheet name or index, return the root ODF Table node.
+
+        Raises ValueError if ``name`` is neither a string nor an integer.
+        """
+        if isinstance(name, str):
+            return self.get_sheet_by_name(name)
+        if isinstance(name, int):
+            return self.get_sheet_by_index(name)
+        raise ValueError(
+            'Unrecognized sheet identifier type {}. Please use '
+            'a string or integer'.format(type(name)))
+
+    def parse(self, sheet_name=0, **kwds):
+        """Read one sheet and parse it with TextParser using ``kwds``."""
+        tree = self._get_sheet(sheet_name)
+        data = self.get_sheet_data(tree, convert_float=False)
+        return TextParser(data, **kwds).read()
+
+    def get_sheet_data(self, sheet, convert_float):
+        """Parse an ODF Table into a rectangular list of lists.
+
+        Runs of empty cells and rows are buffered and only written out as
+        ``None`` once a non-empty cell or row follows, so trailing blanks
+        are not emitted. ``convert_float`` is currently unused.
+        """
+        from odf.table import TableCell, TableRow
+
+        table = []
+        empty_rows = 0
+        max_row_len = 0
+        for sheet_row in sheet.getElementsByType(TableRow):
+            empty_cells = 0
+            table_row = []
+            for sheet_cell in sheet_row.getElementsByType(TableCell):
+                value = self._get_cell_value(sheet_cell)
+                column_repeat = self._get_cell_repeat(sheet_cell)
+
+                if len(sheet_cell.childNodes) == 0:
+                    # Buffer blanks; they only count if data follows.
+                    empty_cells += column_repeat
+                else:
+                    table_row.extend([None] * empty_cells)
+                    empty_cells = 0
+                    table_row.extend([value] * column_repeat)
+
+            max_row_len = max(max_row_len, len(table_row))
+
+            row_repeat = self._get_row_repeat(sheet_row)
+            if self._is_empty_row(sheet_row):
+                empty_rows += row_repeat
+            else:
+                # Flush pending blank rows, then emit this row once per
+                # repeat. Every row is a distinct list because rows are
+                # padded in place below.
+                table.extend([None] for _ in range(empty_rows))
+                empty_rows = 0
+                table.extend(list(table_row) for _ in range(row_repeat))
+
+        # Make our table square: pad short rows with None up to the
+        # widest row seen.
+        for row in table:
+            row.extend([None] * (max_row_len - len(row)))
+
+        return table
+
+    def _get_row_repeat(self, row):
+        """Return number of times this row was repeated
+
+        Repeating an empty row appeared to be a common way
+        of representing sparse rows in the table.
+        """
+        from odf.namespaces import TABLENS
+        key = (TABLENS, 'number-rows-repeated')
+        return int(row.attributes.get(key, 1))
+
+    def _get_cell_repeat(self, cell):
+        """Return number of times this cell was repeated (default 1)."""
+        from odf.namespaces import TABLENS
+        key = (TABLENS, 'number-columns-repeated')
+        return int(cell.attributes.get(key, 1))
+
+    def _is_empty_row(self, row):
+        """Return True when no child of ``row`` has child nodes."""
+        return all(len(column.childNodes) == 0 for column in row.childNodes)
+
+    def _get_cell_value(self, cell):
+        """Convert an ODF table cell into a Python/pandas scalar.
+
+        Dispatches on the cell's ``office:value-type`` attribute; cells
+        without one yield None and unknown types raise ValueError.
+        """
+        from odf.namespaces import OFFICENS
+        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
+        if cell_type is None:
+            return None
+        if cell_type == 'string':
+            return str(cell)
+        if cell_type == 'boolean':
+            # The flag is stored in office:boolean-value as the text
+            # 'true' or 'false', so compare instead of calling bool().
+            flag = cell.attributes.get((OFFICENS, 'boolean-value'))
+            return flag == 'true'
+        if cell_type in ('float', 'percentage', 'currency'):
+            return float(cell.attributes.get((OFFICENS, 'value')))
+        if cell_type == 'date':
+            return pd.Timestamp(cell.attributes.get((OFFICENS, 'date-value')))
+        if cell_type == 'time':
+            # Durations need a day component added before parsing.
+            cell_value = cell.attributes.get((OFFICENS, 'time-value'))
+            return pandas_isoduration_compatibility(cell_value)
+        raise ValueError('Unrecognized type {}'.format(cell_type))
+
+
+def pandas_isoduration_compatibility(duration):
+    """Parse an ISO 8601 duration that may lack a day component.
+
+    LibreOffice writes durations such as PT3H45M0S, while the pandas
+    Timedelta parser requires a day field, so ``P0D`` is spliced in.
+    Workaround for https://github.com/pandas-dev/pandas/issues/25422
+    """
+    lacks_days = duration.startswith('PT')
+    iso_text = 'P0DT' + duration[2:] if lacks_days else duration
+    return pd.Timedelta(iso_text)
diff --git a/pandas/tests/io/data/blank-row-repeat.ods b/pandas/tests/io/data/blank-row-repeat.ods
diff --git a/pandas/tests/io/data/datatypes.ods b/pandas/tests/io/data/datatypes.ods
diff --git a/pandas/tests/io/data/headers.ods b/pandas/tests/io/data/headers.ods
diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/invalid_value_type.ods
diff --git a/pandas/tests/io/data/lowerdiagonal.ods b/pandas/tests/io/data/lowerdiagonal.ods
diff --git a/pandas/tests/io/data/runlengthencoding.ods b/pandas/tests/io/data/runlengthencoding.ods
diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/writertable.odt
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -2556,3 +2556,128 @@ def test_excelwriter_fspath(self):
         with tm.ensure_clean('foo.xlsx') as path:
             writer = ExcelWriter(path)
             assert os.fspath(writer) == str(path)
+
+
+@td.skip_if_no('odf')
+class TestODFReader(SharedItems):
+    def test_get_sheet(self):
+        """The sample workbook exposes exactly one sheet, 'Sheet1'."""
+        from pandas.io.excel._odfreader import ODFReader
+
+        reader = ODFReader(os.path.join(self.dirpath, 'datatypes.ods'))
+        names = reader.sheet_names
+        assert len(names) == 1
+        assert names == ['Sheet1']
+
+    def test_get_sheet_raises(self):
+        """Bad identifiers, unknown names and bad indexes all raise."""
+        from pandas.io.excel._odfreader import ODFReader
+
+        reader = ODFReader(os.path.join(self.dirpath, 'datatypes.ods'))
+        # neither a string nor an integer
+        with pytest.raises(ValueError):
+            reader._get_sheet(3.14)
+        # no sheet has this name
+        with pytest.raises(ValueError):
+            reader.get_sheet_by_name("Invalid Sheet 77")
+        # no table exists at this index
+        with pytest.raises(IndexError):
+            reader.get_sheet_by_index(-33)
+
+    def test_read_types(self):
+        """Each supported ODF value type maps to the expected scalar."""
+        rows = [
+            [1.0],
+            [1.25],
+            ['a'],
+            [pd.Timestamp(2003, 1, 2)],
+            [False],
+            [0.35],
+            [pd.Timedelta(hours=3, minutes=45),
+             pd.Timedelta(hours=17, minutes=53),
+             pd.Timedelta(hours=14, minutes=8)],
+            # though what should the value of a hyperlink be?
+            ['UBERON:0002101']]
+        result = self.get_exceldf(
+            'datatypes', '.ods', header=None, engine='odf')
+        tm.assert_equal(result, DataFrame(rows))
+
+    def test_read_invalid_types_raises(self):
+        # invalid_value_type.ods was produced by manually editing the
+        # content.xml file it contains
+        expected_msg = "Unrecognized type awesome_new_type"
+        with pytest.raises(ValueError, match=expected_msg):
+            self.get_exceldf(
+                'invalid_value_type', '.ods', header=None, engine='odf')
+
+    def test_read_lower_diagonal(self):
+        # A ragged, lower-triangular sheet must come back square:
+        # 1
+        # 2 3
+        # 4 5 6
+        # 7 8 9 10
+        options = dict(index_col=None, header=None, engine='odf')
+        frame = self.get_exceldf(
+            'lowerdiagonal', '.ods', 'Sheet1', **options)
+
+        n_rows, n_cols = frame.shape
+        assert (n_rows, n_cols) == (4, 4)
+
+    def test_read_headers(self):
+        """Named columns match the expected frame; blank ones are null."""
+        frame = self.get_exceldf(
+            'headers', '.ods', 'Sheet1', index_col=0, engine='odf')
+
+        # The two empty columns of the sheet are left out of ``data``.
+        data = OrderedDict()
+        data["Header"] = ["Row 1", "Row 2"]
+        data["Column 1"] = [1.0, 2.0]
+        data["Column 2"] = [3.0, 4.0]
+        data["Column 4"] = [7.0, 8.0]
+        data["Column 6"] = [11.0, 12.0]
+        expected = DataFrame.from_dict(data).set_index("Header")
+
+        named = ["Column 1", "Column 2", "Column 4", "Column 6"]
+        tm.assert_equal(frame[named], expected)
+        for blank in (None, 'None.1'):
+            for value in frame[blank]:
+                assert pd.isnull(value)
+
+    def test_read_writer_table(self):
+        # Tables embedded in a text OpenDocument file (.odt) are read too.
+        frame = self.get_exceldf(
+            'writertable', '.odt', 'Table1', index_col=0, engine='odf')
+        assert frame.shape == (3, 3)
+
+        data = OrderedDict()
+        data["Header"] = ["Row 1", "Row 2", "Row 3"]
+        data["Column 1"] = [1.0, 2.0, 3.0]
+        data["Unnamed: 2"] = [nan, nan, nan]
+        data["Column 3"] = [7.0, 8.0, 9.0]
+        expected = DataFrame.from_dict(data).set_index("Header")
+
+        named = ["Column 1", "Column 3"]
+        tm.assert_equal(frame[named], expected[named])
+
+        # make sure pandas gives a name to the unnamed column
+        unnamed = frame["Unnamed: 2"]
+        for position in range(3):
+            assert pd.isnull(unnamed[position])
+
+    def test_blank_row_repeat(self):
+        """Blank rows inside the data come back as null values."""
+        frame = self.get_exceldf(
+            'blank-row-repeat', '.ods', 'Value', engine='odf')
+        assert frame.shape == (14, 2)
+        values = frame['value']
+        assert values[7] == 9.0 and pd.isnull(values[8])
+        assert not pd.isnull(values[11])
+
+    def test_runlengthencoding(self):
+        frame = self.get_exceldf(
+            'runlengthencoding', '.ods', 'Sheet1', header=None, engine='odf')
+        assert frame.shape == (5, 3)
+        # check by column, not by row.
+        first, *others = (list(frame[col]) for col in range(3))
+        assert first == [1.0, 1.0, 2.0, 2.0, 2.0]
+        assert others == [[1.0, 2.0, 2.0, 2.0, 2.0]] * 2