-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Class to read OpenDocument Tables #25427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 53 commits
479e639
8be4b67
77d9033
47b2ffb
0fa2ac9
e6e2365
1bbf284
d5c7ec0
691f1e9
93c2b66
394c4bd
b149d84
19587b3
60a5bc1
1fef008
7148995
5db1a0b
83c0243
735e2b4
8302fd7
d0df3bd
47597c9
9e1799a
d5c60ab
39cfecf
8a9a66c
fd7663f
3bcc1b7
15e69eb
65615cd
9584753
9dc34f4
5e32f6d
6360c07
4227268
80607b0
43f7160
4da0445
cbbc653
1227216
696ed5d
49fff9f
7b08304
4d97d84
59cdf0b
98d3ca7
fb48d8d
6576af9
4dc1b51
8ce45b4
f9f88b0
3e0d758
ff28993
7396ad6
5a440a4
250a3d3
d7e7d05
93adedb
62a37e7
13fb76f
fb6c5ee
5c839f4
4026fc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
from typing import Dict, List | ||
|
||
from pandas.compat._optional import import_optional_dependency | ||
|
||
import pandas as pd | ||
from pandas._typing import FilePathOrBuffer, Scalar | ||
|
||
from pandas.io.excel._base import _BaseExcelReader | ||
|
||
|
||
class _ODFReader(_BaseExcelReader):
    """Read tables out of OpenDocument formatted files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like
        Path to the document to be parsed, or an open readable stream.
    """

    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
        # Check for the optional ``odf`` dependency before touching the file.
        import_optional_dependency("odf")
        super().__init__(filepath_or_buffer)

    @property
    def _workbook_class(self):
        """Return the ``odf.opendocument.OpenDocument`` class."""
        # NOTE(review): presumably consulted by _BaseExcelReader to recognise
        # an already-loaded workbook -- confirm against the base class.
        from odf.opendocument import OpenDocument
        return OpenDocument

    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
        """Load the document with ``odf.opendocument.load`` and return it."""
        from odf.opendocument import load
        return load(filepath_or_buffer)

    @property
    def sheet_names(self) -> List[str]:
        """Return a list of sheet names present in the document."""
        from odf.table import Table

        tables = self.book.getElementsByType(Table)
        return [t.getAttribute("name") for t in tables]

    def get_sheet_by_index(self, index: int):
        """Return the table at position ``index`` among the document tables.

        Raises
        ------
        IndexError
            If ``index`` is out of range for the tables found.
        """
        from odf.table import Table

        tables = self.book.getElementsByType(Table)
        return tables[index]

    def get_sheet_by_name(self, name: str):
        """Return the first table whose ``name`` attribute equals ``name``.

        Raises
        ------
        ValueError
            If no table with that name exists in the document.
        """
        from odf.table import Table

        tables = self.book.getElementsByType(Table)

        for table in tables:
            if table.getAttribute("name") == name:
                return table

        # The placeholder is a named field, so the value has to be passed by
        # keyword; passing it positionally raised KeyError instead.
        raise ValueError("sheet {name} not found".format(name=name))

    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
        """Parse an ODF Table into a list of lists.

        Parameters
        ----------
        sheet : odf table element
            The table to read rows and cells from.
        convert_float : bool
            Forwarded to ``_get_cell_value``; integral floats become ``int``.

        Returns
        -------
        list of list of scalar
            One inner list per row. Rows are padded with ``''`` so that all
            of them have the length of the longest row. Trailing empty rows
            are dropped.

        Raises
        ------
        NotImplementedError
            If a vertically merged cell is found outside the first column.
        """
        from odf.table import TableCell, TableRow

        sheet_rows = sheet.getElementsByType(TableRow)
        table = []  # type: List[List[Scalar]]
        empty_rows = 0
        max_row_len = 0
        row_spans = {}  # type: Dict[int, int]

        for sheet_row in sheet_rows:
            sheet_cells = sheet_row.getElementsByType(TableCell)
            empty_cells = 0
            table_row = []  # type: List[Scalar]

            for j, sheet_cell in enumerate(sheet_cells):
                # Handle vertically merged cells; only works with first column
                if row_spans.get(j, 0) > 1:
                    table_row.append('')
                    row_spans[j] = row_spans[j] - 1

                value = self._get_cell_value(sheet_cell, convert_float)
                column_repeat = self._get_column_repeat(sheet_cell)
                column_span = self._get_column_span(sheet_cell)
                row_span = self._get_row_span(sheet_cell)

                if row_span > 1:
                    if j > 0:
                        raise NotImplementedError(
                            "The odf reader only supports vertical cell "
                            "merging in the initial column")
                    row_spans[j] = row_span

                if len(sheet_cell.childNodes) == 0:
                    # Defer empty cells: they are only written out when a
                    # non-empty cell follows, so trailing blanks are dropped.
                    empty_cells += column_repeat
                else:
                    if empty_cells > 0:
                        table_row.extend([''] * empty_cells)
                        empty_cells = 0
                    table_row.extend([value] * column_repeat)

                # horizontally merged cells should only show first value
                if column_span > 1:
                    table_row.extend([''] * (column_span - 1))

            max_row_len = max(max_row_len, len(table_row))

            row_repeat = self._get_row_repeat(sheet_row)
            if self._is_empty_row(sheet_row):
                # Same deferral as for cells: trailing empty rows are dropped.
                empty_rows += row_repeat
            else:
                if empty_rows > 0:
                    # add blank rows to our table; each one is its own list
                    # so that later in-place edits cannot leak between rows
                    table.extend([''] for _ in range(empty_rows))
                    empty_rows = 0
                # Repeated rows get independent copies for the same reason.
                table.extend(list(table_row) for _ in range(row_repeat))

        # Make our table square
        for row in table:
            if len(row) < max_row_len:
                row.extend([''] * (max_row_len - len(row)))

        return table

    def _get_row_repeat(self, row) -> int:
        """Return number of times this row was repeated.

        Repeating an empty row appeared to be a common way
        of representing sparse rows in the table.
        """
        from odf.namespaces import TABLENS

        return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1))

    def _get_column_repeat(self, cell) -> int:
        """Return number of times this cell was repeated (1 if unset)."""
        from odf.namespaces import TABLENS

        return int(cell.attributes.get(
            (TABLENS, 'number-columns-repeated'), 1))

    def _get_row_span(self, cell) -> int:
        """For handling cells merged vertically."""
        from odf.namespaces import TABLENS

        return int(cell.attributes.get((TABLENS, 'number-rows-spanned'), 1))

    def _get_column_span(self, cell) -> int:
        """For handling cells merged horizontally."""
        from odf.namespaces import TABLENS

        return int(cell.attributes.get(
            (TABLENS, 'number-columns-spanned'), 1))

    def _is_empty_row(self, row) -> bool:
        """Return True when no child of ``row`` has child nodes of its own."""
        return all(len(column.childNodes) == 0 for column in row.childNodes)

    def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
        """Convert an ODF cell into a Python scalar based on its value-type.

        Parameters
        ----------
        cell : odf table cell element
        convert_float : bool
            When True, floats holding an integral value are returned as int.

        Returns
        -------
        scalar
            ``bool`` for boolean cells, ``''`` for cells without a value
            type, ``int``/``float``/``str`` for float cells, ``float`` for
            percentage and currency cells, ``str`` for string cells, the
            result of ``pd.to_datetime`` for date cells and a
            ``datetime.time`` for time cells.

        Raises
        ------
        ValueError
            If the cell declares a value-type that is not handled here.
        """
        from odf.namespaces import OFFICENS

        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
        if cell_type == 'boolean':
            return str(cell) == "TRUE"
        if cell_type is None:
            return ''  # compat with xlrd
        elif cell_type == 'float':
            # GH5394
            cell_value = float(cell.attributes.get((OFFICENS, 'value')))

            # NA handling: zero-valued cells return their displayed text.
            # The former extra test ``str(cell) != cell_value`` compared a
            # str with a float and was therefore always true.
            if cell_value == 0.:
                return str(cell)

            if convert_float:
                val = int(cell_value)
                if val == cell_value:
                    return val
            return cell_value
        elif cell_type == 'percentage':
            cell_value = cell.attributes.get((OFFICENS, 'value'))
            return float(cell_value)
        elif cell_type == 'string':
            return str(cell)
        elif cell_type == 'currency':
            cell_value = cell.attributes.get((OFFICENS, 'value'))
            return float(cell_value)
        elif cell_type == 'date':
            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
            return pd.to_datetime(cell_value)
        elif cell_type == 'time':
            return pd.to_datetime(str(cell)).time()
        else:
            raise ValueError('Unrecognized type {}'.format(cell_type))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import functools | ||
|
||
import numpy as np | ||
import pytest | ||
|
||
import pandas as pd | ||
import pandas.util.testing as tm | ||
|
||
pytest.importorskip("odf") | ||
|
||
|
||
@pytest.fixture(autouse=True)
def cd_and_set_engine(monkeypatch, datapath):
    """Bind ``pd.read_excel`` to the odf engine and enter the data directory.

    Applied automatically to every test in this module.
    """
    odf_read_excel = functools.partial(pd.read_excel, engine="odf")
    monkeypatch.setattr(pd, 'read_excel', odf_read_excel)
    data_dir = datapath("io", "data")
    monkeypatch.chdir(data_dir)
|
||
|
||
def test_read_invalid_types_raises():
    """An unknown cell value-type must surface as a ValueError."""
    # invalid_value_type.ods was produced by manually editing the
    # content.xml file it contains
    expected_msg = "Unrecognized type awesome_new_type"
    with pytest.raises(ValueError, match=expected_msg):
        pd.read_excel("invalid_value_type.ods")
|
||
|
||
def test_read_writer_table():
    """Tables inside a text OpenDocument file (.odt) can be read as well."""
    expected = pd.DataFrame(
        [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]],
        index=pd.Index(["Row 1", "Row 2", "Row 3"], name="Header"),
        columns=["Column 1", "Unnamed: 2", "Column 3"],
    )

    result = pd.read_excel("writertable.odt", 'Table1', index_col=0)

    tm.assert_frame_equal(result, expected)
|
||
|
||
def test_raises_repeated_rows_not_in_col_0():
    """Vertically merged cells outside the first column are rejected."""
    expected_msg = "merging in the initial column"
    with pytest.raises(NotImplementedError, match=expected_msg):
        pd.read_excel("raising_repeats.ods")
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,9 +33,21 @@ def ignore_xlrd_time_clock_warning(): | |
|
||
@pytest.fixture(params=[ | ||
# Add any engines to test here | ||
pytest.param('xlrd', marks=td.skip_if_no('xlrd')), | ||
pytest.param('openpyxl', marks=td.skip_if_no('openpyxl')), | ||
pytest.param(None, marks=td.skip_if_no('xlrd')), | ||
# When defusedxml is installed it triggers deprecation warnings for | ||
# xlrd and openpyxl, so catch those here | ||
pytest.param('xlrd', marks=[ | ||
td.skip_if_no('xlrd'), | ||
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), | ||
]), | ||
pytest.param('openpyxl', marks=[ | ||
td.skip_if_no('openpyxl'), | ||
pytest.mark.filterwarnings("ignore:.*html argument"), | ||
]), | ||
pytest.param(None, marks=[ | ||
td.skip_if_no('xlrd'), | ||
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), | ||
]), | ||
pytest.param("odf", marks=td.skip_if_no("odf")), | ||
]) | ||
def engine(request): | ||
""" | ||
|
@@ -53,6 +65,11 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): | |
""" | ||
if engine == 'openpyxl' and read_ext == '.xls': | ||
pytest.skip() | ||
if engine == 'odf' and read_ext != '.ods': | ||
pytest.skip() | ||
if read_ext == ".ods" and engine != "odf": | ||
pytest.skip() | ||
|
||
func = partial(pd.read_excel, engine=engine) | ||
monkeypatch.chdir(datapath("io", "data")) | ||
monkeypatch.setattr(pd, 'read_excel', func) | ||
|
@@ -62,14 +79,16 @@ def test_usecols_int(self, read_ext, df_ref): | |
|
||
# usecols as int | ||
with tm.assert_produces_warning(FutureWarning, | ||
check_stacklevel=False): | ||
check_stacklevel=False, | ||
raise_on_extra_warnings=False): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
with ignore_xlrd_time_clock_warning(): | ||
df1 = pd.read_excel("test1" + read_ext, "Sheet1", | ||
index_col=0, usecols=3) | ||
|
||
# usecols as int | ||
with tm.assert_produces_warning(FutureWarning, | ||
check_stacklevel=False): | ||
check_stacklevel=False, | ||
raise_on_extra_warnings=False): | ||
with ignore_xlrd_time_clock_warning(): | ||
df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], | ||
index_col=0, usecols=3) | ||
|
@@ -439,6 +458,9 @@ def test_bad_engine_raises(self, read_ext): | |
|
||
@tm.network | ||
def test_read_from_http_url(self, read_ext): | ||
if read_ext == '.ods': # TODO: remove once on master | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test only works when the file is available on master, so we have to merge first and then can try again. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems like this code is still hanging around; I'll aim to address it when I tackle #29439. |
||
pytest.skip() | ||
|
||
url = ('https://raw.github.com/pandas-dev/pandas/master/' | ||
'pandas/tests/io/data/test1' + read_ext) | ||
url_table = pd.read_excel(url) | ||
|
@@ -736,6 +758,10 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): | |
""" | ||
Change directory and set engine for ExcelFile objects. | ||
""" | ||
if engine == 'odf' and read_ext != '.ods': | ||
pytest.skip() | ||
if read_ext == ".ods" and engine != "odf": | ||
pytest.skip() | ||
if engine == 'openpyxl' and read_ext == '.xls': | ||
pytest.skip() | ||
|
||
|
@@ -802,7 +828,8 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): | |
df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) | ||
tm.assert_frame_equal(df3, df1.iloc[:-1]) | ||
|
||
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): | ||
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, | ||
raise_on_extra_warnings=False): | ||
with pd.ExcelFile('test1' + read_ext) as excel: | ||
df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you put
enable='odf'
in double back ticks