-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Class to read OpenDocument Tables #25427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
479e639
8be4b67
77d9033
47b2ffb
0fa2ac9
e6e2365
1bbf284
d5c7ec0
691f1e9
93c2b66
394c4bd
b149d84
19587b3
60a5bc1
1fef008
7148995
5db1a0b
83c0243
735e2b4
8302fd7
d0df3bd
47597c9
9e1799a
d5c60ab
39cfecf
8a9a66c
fd7663f
3bcc1b7
15e69eb
65615cd
9584753
9dc34f4
5e32f6d
6360c07
4227268
80607b0
43f7160
4da0445
cbbc653
1227216
696ed5d
49fff9f
7b08304
4d97d84
59cdf0b
98d3ca7
fb48d8d
6576af9
4dc1b51
8ce45b4
f9f88b0
3e0d758
ff28993
7396ad6
5a440a4
250a3d3
d7e7d05
93adedb
62a37e7
13fb76f
fb6c5ee
5c839f4
4026fc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
import pandas as pd | ||
|
||
from pandas.io.parsers import TextParser | ||
|
||
|
||
class ODFReader:
    """Read tables out of OpenDocument formatted files.

    Parameters
    ----------
    filepath_or_buffer : str or file-like
        Path to be parsed, or an open readable stream.  It is passed
        unchanged to ``odf.opendocument.load``.

    Raises
    ------
    ImportError
        If the ``odf`` package (odfpy) cannot be imported.
    """

    def __init__(self, filepath_or_buffer):
        # odfpy is an optional dependency, so it is imported lazily.
        try:
            from odf.opendocument import load as document_load
            from odf.table import Table
        except ImportError:
            raise ImportError("Install odfpy >= 1.3 for OpenDocument support")

        self.filepath_or_buffer = filepath_or_buffer
        self.document = document_load(filepath_or_buffer)
        self.tables = self.document.getElementsByType(Table)

    @property
    def sheet_names(self):
        """Return a list of sheet names present in the document."""
        from odf.namespaces import TABLENS
        return [t.attributes[(TABLENS, 'name')] for t in self.tables]

    def get_sheet_by_index(self, index):
        """Return the table at position ``index``.

        Raises ``IndexError`` when ``index`` is out of range.
        """
        return self.tables[index]

    def get_sheet_by_name(self, name):
        """Return the table whose name equals ``name``.

        Raises ``ValueError`` when no table has that name.
        """
        return self.tables[self.sheet_names.index(name)]

    def _get_sheet(self, name):
        """Given a sheet name or index, return the root ODF Table node.

        Raises
        ------
        ValueError
            If ``name`` is neither a string nor an integer.
        """
        if isinstance(name, str):
            return self.get_sheet_by_name(name)
        if isinstance(name, int):
            return self.get_sheet_by_index(name)
        # The two literals are concatenated, so the separating space must
        # be part of one of them.
        raise ValueError(
            'Unrecognized sheet identifier type {}. Please use '
            'a string or integer'.format(type(name)))

    def parse(self, sheet_name=0, **kwds):
        """Parse one sheet and return the result of ``TextParser.read()``.

        Parameters
        ----------
        sheet_name : str or int, default 0
            Name or position of the table to read.
        **kwds
            Forwarded unchanged to ``TextParser``.
        """
        # NOTE(review): reviewers asked for this method to come from the
        # shared Excel reader base class instead of being defined here.
        # The author reported that the base class relies on knowing the
        # row count up front, which for ODF requires parsing the table.
        tree = self._get_sheet(sheet_name)
        data = self.get_sheet_data(tree, convert_float=False)
        parser = TextParser(data, **kwds)
        return parser.read()

    def get_sheet_data(self, sheet, convert_float):
        """Parse an ODF Table into a list of lists.

        Parameters
        ----------
        sheet : ODF Table node
            The table to convert, as returned by ``_get_sheet``.
        convert_float : bool
            Accepted for interface compatibility; this implementation
            does not use it.

        Returns
        -------
        list of list
            One list per row, all padded with ``None`` to the length of
            the longest row.  Trailing empty rows and trailing empty
            cells are not materialised.
        """
        from odf.table import TableRow

        table = []
        pending_blank_rows = 0
        max_row_len = 0
        for sheet_row in sheet.getElementsByType(TableRow):
            table_row = self._get_row_values(sheet_row)
            max_row_len = max(max_row_len, len(table_row))

            row_repeat = self._get_row_repeat(sheet_row)
            if self._is_empty_row(sheet_row):
                # Only emit blank rows once a later row proves they are
                # not just trailing padding.
                pending_blank_rows += row_repeat
                continue

            # Each blank row gets its own list so that padding one row
            # can never affect another.
            table.extend([None] for _ in range(pending_blank_rows))
            pending_blank_rows = 0
            # A populated row may be run-length encoded as well; emit it
            # as many times as it is repeated.
            table.extend(list(table_row) for _ in range(row_repeat))

        # Make our table square.
        for row in table:
            row.extend([None] * (max_row_len - len(row)))

        return table

    def _get_row_values(self, sheet_row):
        """Return the cell values of one row, expanding repeated cells."""
        from odf.table import TableCell

        values = []
        pending_empty_cells = 0
        for sheet_cell in sheet_row.getElementsByType(TableCell):
            value = self._get_cell_value(sheet_cell)
            column_repeat = self._get_cell_repeat(sheet_cell)

            if len(sheet_cell.childNodes) == 0:
                # Defer empty cells; they are dropped if nothing follows.
                pending_empty_cells += column_repeat
                continue

            values.extend([None] * pending_empty_cells)
            pending_empty_cells = 0
            values.extend([value] * column_repeat)
        return values

    def _get_repeat(self, node, attribute):
        """Return the integer value of a table repeat attribute, or 1."""
        from odf.namespaces import TABLENS
        repeat = node.attributes.get((TABLENS, attribute))
        return 1 if repeat is None else int(repeat)

    def _get_row_repeat(self, row):
        """Return number of times this row was repeated.

        Repeating an empty row appeared to be a common way
        of representing sparse rows in the table.
        """
        return self._get_repeat(row, 'number-rows-repeated')

    def _get_cell_repeat(self, cell):
        """Return number of times this cell was repeated."""
        return self._get_repeat(cell, 'number-columns-repeated')

    def _is_empty_row(self, row):
        """Return True when no child of ``row`` has child nodes itself."""
        return not any(len(column.childNodes) > 0
                       for column in row.childNodes)

    def _get_cell_value(self, cell):
        """Convert an ODF cell into a Python/pandas value.

        Returns ``None`` for cells without a value type.

        Raises
        ------
        ValueError
            If the cell declares a value type that is not handled here.
        """
        from odf.namespaces import OFFICENS

        attributes = cell.attributes
        cell_type = attributes.get((OFFICENS, 'value-type'))
        if cell_type is None:
            return None
        if cell_type == 'boolean':
            # The attribute holds the text "true" or "false"; calling
            # bool() on it would make any non-empty string True.
            return attributes.get((OFFICENS, 'boolean-value')) == 'true'
        if cell_type in ('float', 'percentage', 'currency'):
            return float(attributes.get((OFFICENS, 'value')))
        if cell_type == 'string':
            return str(cell)
        if cell_type == 'date':
            return pd.Timestamp(attributes.get((OFFICENS, 'date-value')))
        if cell_type == 'time':
            return pandas_isoduration_compatibility(
                attributes.get((OFFICENS, 'time-value')))
        raise ValueError('Unrecognized type {}'.format(cell_type))
|
||
|
||
def pandas_isoduration_compatibility(duration):
    """Build a ``pd.Timedelta`` from an ISO 8601 duration string.

    LibreOffice writes durations without a day component, for example
    ``PT3H45M0S``, while the pandas Timedelta parser requires one.  A
    zero-day component is inserted for such strings; any other string
    is handed to ``pd.Timedelta`` unchanged.

    Workaround for https://github.com/pandas-dev/pandas/issues/25422
    """
    lacks_day_component = duration.startswith('PT')
    iso_text = 'P0DT' + duration[2:] if lacks_day_component else duration
    return pd.Timedelta(iso_text)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2556,3 +2556,128 @@ def test_excelwriter_fspath(self): | |
with tm.ensure_clean('foo.xlsx') as path: | ||
writer = ExcelWriter(path) | ||
assert os.fspath(writer) == str(path) | ||
|
||
|
||
@td.skip_if_no('odf')
class TestODFReader(SharedItems):
    """Tests for reading OpenDocument files with the ``odf`` engine."""

    # NOTE(review): a reviewer asked for master to be merged before
    # continuing, because the excel test cases had just been reorganised.

    def test_get_sheet(self):
        """The fixture workbook exposes exactly one sheet, ``Sheet1``."""
        from pandas.io.excel._odfreader import ODFReader

        reader = ODFReader(os.path.join(self.dirpath, 'datatypes.ods'))
        names = reader.sheet_names

        assert len(names) == 1
        assert names == ['Sheet1']

    def test_get_sheet_raises(self):
        """Bad sheet identifiers raise the documented exception types."""
        from pandas.io.excel._odfreader import ODFReader

        reader = ODFReader(os.path.join(self.dirpath, 'datatypes.ods'))

        # Neither a string nor an integer.
        with pytest.raises(ValueError):
            reader._get_sheet(3.14)

        # Unknown sheet name.
        with pytest.raises(ValueError):
            reader.get_sheet_by_name("Invalid Sheet 77")

        # Index out of range.
        with pytest.raises(IndexError):
            reader.get_sheet_by_index(-33)

    def test_read_types(self):
        """Each supported cell type is converted to the expected value."""
        result = self.get_exceldf(
            'datatypes', '.ods', header=None, engine='odf')

        time_row = [pd.Timedelta(hours=3, minutes=45),
                    pd.Timedelta(hours=17, minutes=53),
                    pd.Timedelta(hours=14, minutes=8)]
        expected = DataFrame(
            [[1.0],
             [1.25],
             ['a'],
             [pd.Timestamp(2003, 1, 2)],
             [False],
             [0.35],
             time_row,
             # though what should the value of a hyperlink be?
             ['UBERON:0002101']])
        tm.assert_equal(result, expected)

    def test_read_invalid_types_raises(self):
        """An unknown ``value-type`` is reported with a ValueError."""
        # the invalid_value_type.ods required manually editing
        # of the included content.xml file
        expected_message = "Unrecognized type awesome_new_type"
        with pytest.raises(ValueError, match=expected_message):
            self.get_exceldf(
                'invalid_value_type', '.ods', header=None, engine='odf')

    def test_read_lower_diagonal(self):
        """Rows of increasing length are padded into a square frame."""
        # Make sure we can parse:
        # 1
        # 2 3
        # 4 5 6
        # 7 8 9 10
        result = self.get_exceldf(
            'lowerdiagonal', '.ods', 'Sheet1',
            index_col=None, header=None, engine='odf')

        assert result.shape == (4, 4)

    def test_read_headers(self):
        """Header row and index column are honoured; gaps read as null."""
        result = self.get_exceldf(
            'headers', '.ods', 'Sheet1', index_col=0, engine='odf')

        expected = DataFrame.from_dict(OrderedDict([
            ("Header", ["Row 1", "Row 2"]),
            ("Column 1", [1.0, 2.0]),
            ("Column 2", [3.0, 4.0]),
            # Empty Column
            ("Column 4", [7.0, 8.0]),
            # Empty Column 2
            ("Column 6", [11.0, 12.0])]))
        expected.set_index("Header", inplace=True)

        named_columns = ["Column 1", "Column 2", "Column 4", "Column 6"]
        tm.assert_equal(result[named_columns], expected)

        # The two empty columns must contain only null values.
        for empty_name in [None, 'None.1']:
            for cell in result[empty_name]:
                assert pd.isnull(cell)

    def test_read_writer_table(self):
        """Tables can also be read from a text document (.odt)."""
        result = self.get_exceldf(
            'writertable', '.odt', 'Table1', index_col=0, engine='odf')

        assert result.shape == (3, 3)

        expected = DataFrame.from_dict(OrderedDict([
            ("Header", ["Row 1", "Row 2", "Row 3"]),
            ("Column 1", [1.0, 2.0, 3.0]),
            ("Unnamed: 2", [nan, nan, nan]),
            ("Column 3", [7.0, 8.0, 9.0])]))
        expected.set_index("Header", inplace=True)

        named_columns = ["Column 1", "Column 3"]
        tm.assert_equal(result[named_columns], expected[named_columns])

        # make sure pandas gives a name to the unnamed column
        unnamed = result["Unnamed: 2"]
        for position in range(3):
            assert pd.isnull(unnamed[position])

    def test_blank_row_repeat(self):
        """Repeated blank rows are expanded in place."""
        result = self.get_exceldf(
            'blank-row-repeat', '.ods', 'Value', engine='odf')

        assert result.shape == (14, 2)

        values = result['value']
        assert values[7] == 9.0
        assert pd.isnull(values[8])
        assert not pd.isnull(values[11])

    def test_runlengthencoding(self):
        """Run-length encoded cells and rows are fully expanded."""
        result = self.get_exceldf(
            'runlengthencoding', '.ods', 'Sheet1', header=None, engine='odf')

        assert result.shape == (5, 3)
        # check by column, not by row.
        assert list(result[0]) == [1.0, 1.0, 2.0, 2.0, 2.0]
        assert list(result[1]) == [1.0, 2.0, 2.0, 2.0, 2.0]
        assert list(result[2]) == [1.0, 2.0, 2.0, 2.0, 2.0]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you put `enable='odf'` in double backticks?