Description
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
Code Sample, a copy-pastable example
import pandas as pd
import numpy as np
from scipy import sparse
X = sparse.random(100, 100, density=0.2, format="csr")
df = pd.DataFrame({"a": np.arange(100)})
df["X_sum"] = X.sum(axis=1)
df
Before 1.3.0, this worked fine. As of 1.3.0, displaying df fails with:
traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/local/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.8/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in __repr__(self)
993 else:
994 width = None
--> 995 self.to_string(
996 buf=buf,
997 max_rows=max_rows,
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width, max_colwidth, encoding)
1129 decimal=decimal,
1130 )
-> 1131 return fmt.DataFrameRenderer(formatter).to_string(
1132 buf=buf,
1133 encoding=encoding,
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in to_string(self, buf, encoding, line_width)
1051
1052 string_formatter = StringFormatter(self.fmt, line_width=line_width)
-> 1053 string = string_formatter.to_string()
1054 return save_to_buffer(string, buf=buf, encoding=encoding)
1055
/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in to_string(self)
23
24 def to_string(self) -> str:
---> 25 text = self._get_string_representation()
26 if self.fmt.should_show_dimensions:
27 text = "".join([text, self.fmt.dimensions_info])
/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in _get_string_representation(self)
38 return self._empty_info_line
39
---> 40 strcols = self._get_strcols()
41
42 if self.line_width is None:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/string.py in _get_strcols(self)
29
30 def _get_strcols(self) -> list[list[str]]:
---> 31 strcols = self.fmt.get_strcols()
32 if self.fmt.is_truncated:
33 strcols = self._insert_dot_separators(strcols)
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_strcols(self)
538 Render a DataFrame to a list of columns (as lists of strings).
539 """
--> 540 strcols = self._get_strcols_without_index()
541
542 if self.index:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _get_strcols_without_index(self)
802 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
803 )
--> 804 fmt_values = self.format_col(i)
805 fmt_values = _make_fixed_width(
806 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_col(self, i)
816 frame = self.tr_frame
817 formatter = self._get_formatter(i)
--> 818 return format_array(
819 frame.iloc[:, i]._values,
820 formatter,
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1238 )
1239
-> 1240 return fmt_obj.get_result()
1241
1242
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result(self)
1269
1270 def get_result(self) -> list[str]:
-> 1271 fmt_values = self._format_strings()
1272 return _make_fixed_width(fmt_values, self.justify)
1273
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _format_strings(self)
1516
1517 def _format_strings(self) -> list[str]:
-> 1518 return list(self.get_result_as_array())
1519
1520
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result_as_array(self)
1480 float_format = lambda value: self.float_format % value
1481
-> 1482 formatted_values = format_values_with(float_format)
1483
1484 if not self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_values_with(float_format)
1454 values = self.values
1455 is_complex = is_complex_dtype(values)
-> 1456 values = format_with_na_rep(values, formatter, na_rep)
1457
1458 if self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_with_na_rep(values, formatter, na_rep)
1425 mask = isna(values)
1426 formatted = np.array(
-> 1427 [
1428 formatter(val) if not m else na_rep
1429 for val, m in zip(values.ravel(), mask.ravel())
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
1426 formatted = np.array(
1427 [
-> 1428 formatter(val) if not m else na_rep
1429 for val, m in zip(values.ravel(), mask.ravel())
1430 ]
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _repr_html_(self)
1045 decimal=".",
1046 )
-> 1047 return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
1048 else:
1049 return None
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in to_html(self, buf, encoding, classes, notebook, border, table_id, render_links)
1027 render_links=render_links,
1028 )
-> 1029 string = html_formatter.to_string()
1030 return save_to_buffer(string, buf=buf, encoding=encoding)
1031
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in to_string(self)
70
71 def to_string(self) -> str:
---> 72 lines = self.render()
73 if any(isinstance(x, str) for x in lines):
74 lines = [str(x) for x in lines]
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in render(self)
619 self.write("<div>")
620 self.write_style()
--> 621 super().render()
622 self.write("</div>")
623 return self.elements
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in render(self)
76
77 def render(self) -> list[str]:
---> 78 self._write_table()
79
80 if self.should_show_dimensions:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _write_table(self, indent)
246 self._write_header(indent + self.indent_delta)
247
--> 248 self._write_body(indent + self.indent_delta)
249
250 self.write("</table>", indent)
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _write_body(self, indent)
393 def _write_body(self, indent: int) -> None:
394 self.write("<tbody>", indent)
--> 395 fmt_values = self._get_formatted_values()
396
397 # write values
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in _get_formatted_values(self)
583
584 def _get_formatted_values(self) -> dict[int, list[str]]:
--> 585 return {i: self.fmt.format_col(i) for i in range(self.ncols)}
586
587 def _get_columns_formatted_values(self) -> list[str]:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/html.py in <dictcomp>(.0)
583
584 def _get_formatted_values(self) -> dict[int, list[str]]:
--> 585 return {i: self.fmt.format_col(i) for i in range(self.ncols)}
586
587 def _get_columns_formatted_values(self) -> list[str]:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_col(self, i)
816 frame = self.tr_frame
817 formatter = self._get_formatter(i)
--> 818 return format_array(
819 frame.iloc[:, i]._values,
820 formatter,
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1238 )
1239
-> 1240 return fmt_obj.get_result()
1241
1242
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result(self)
1269
1270 def get_result(self) -> list[str]:
-> 1271 fmt_values = self._format_strings()
1272 return _make_fixed_width(fmt_values, self.justify)
1273
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in _format_strings(self)
1516
1517 def _format_strings(self) -> list[str]:
-> 1518 return list(self.get_result_as_array())
1519
1520
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in get_result_as_array(self)
1480 float_format = lambda value: self.float_format % value
1481
-> 1482 formatted_values = format_values_with(float_format)
1483
1484 if not self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_values_with(float_format)
1454 values = self.values
1455 is_complex = is_complex_dtype(values)
-> 1456 values = format_with_na_rep(values, formatter, na_rep)
1457
1458 if self.fixed_width:
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in format_with_na_rep(values, formatter, na_rep)
1425 mask = isna(values)
1426 formatted = np.array(
-> 1427 [
1428 formatter(val) if not m else na_rep
1429 for val, m in zip(values.ravel(), mask.ravel())
/usr/local/lib/python3.8/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
1426 formatted = np.array(
1427 [
-> 1428 formatter(val) if not m else na_rep
1429 for val, m in zip(values.ravel(), mask.ravel())
1430 ]
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I discovered this new behaviour because our tests started failing. The code responsible was:
df["sum"] = X.sum(axis=1)
df["log1p_sum"] = np.log1p(df["sum"])
failing with:
traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-24-4ada647d56a0> in <module>
1 df["sum"] = X.sum(axis=1)
----> 2 df["log1p_sum"] = np.log1p(df["sum"])
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
3605 else:
3606 # set column
-> 3607 self._set_item(key, value)
3608
3609 def _setitem_slice(self, key: slice, value):
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _set_item(self, key, value)
3777 ensure homogeneity.
3778 """
-> 3779 value = self._sanitize_column(value)
3780
3781 if (
/usr/local/lib/python3.8/site-packages/pandas/core/frame.py in _sanitize_column(self, value)
4502
4503 if is_list_like(value):
-> 4504 com.require_length_match(value, self.index)
4505 return sanitize_array(value, self.index, copy=True, allow_2d=True)
4506
/usr/local/lib/python3.8/site-packages/pandas/core/common.py in require_length_match(data, index)
525 """
526 if len(data) != len(index):
--> 527 raise ValueError(
528 "Length of values "
529 f"({len(data)}) "
ValueError: Length of values (1) does not match length of index (100)
Problem description
This problem is triggered because the result of `X.sum(axis=1)`, when `X` is a scipy sparse matrix, is not a 1-D numpy `ndarray` but an `np.matrix` with one column (shape `(100, 1)` in the example above). This used to be handled by pandas, but now isn't.
This is a problem because it's a behaviour change that breaks existing code. As far as I can tell from the release notes, this was not an intentional behaviour change. It does look like some things around column assignment did change, and I imagine that assigning with deprecated numpy types was not considered.
Expected Output
I would expect this to not error, and for this to pass: np.testing.assert_array_equal(df["X_sum"], np.ravel(X.sum(axis=1)))
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit : f00ed8f47020034e752baf0250483053340971b0
python : 3.8.10.final.0
python-bits : 64
OS : Darwin
OS-release : 20.5.0
Version : Darwin Kernel Version 20.5.0: Sat May 8 05:10:33 PDT 2021; root:xnu-7195.121.3~9/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.3.0
numpy : 1.21.0
pytz : 2020.1
dateutil : 2.8.1
pip : 21.1.3
setuptools : 56.0.0
Cython : 0.29.23
pytest : 6.2.4
hypothesis : None
sphinx : 4.0.2
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.6.3
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.23.1
pandas_datareader: None
bs4 : 4.9.3
bottleneck : None
fsspec : 2021.06.0
fastparquet : 0.4.1
gcsfs : None
matplotlib : 3.4.2
numexpr : 2.7.2
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 4.0.1
pyxlsb : None
s3fs : 0.4.2
scipy : 1.7.0
sqlalchemy : 1.3.18
tables : 3.6.1
tabulate : 0.8.7
xarray : 0.18.2
xlrd : 1.2.0
xlwt : None
numba : 0.53.1