Skip to content

ENH: Adding new pandas option for integer formatting, Issue#57177 #58519

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,7 @@ Other enhancements
- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Added new option ``display.integer_format`` to display integers in a :class:`DataFrame` with a ``','`` or ``'_'`` thousands separator (:issue:`57177`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
Expand Down
20 changes: 17 additions & 3 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,12 @@ def use_numba_cb(key: str) -> None:
df.info() is called. Valid values True,False,'deep'
"""

# Description text for the ``integer_format`` display option.
pc_integer_format_doc = """
: str or None
    Thousands separator used when displaying integer values in a DataFrame.
    Valid values are None (no separator, the default), ',' (comma) and
    '_' (underscore).
"""


def table_schema_cb(key: str) -> None:
from pandas.io.formats.printing import enable_data_resource_formatter
Expand Down Expand Up @@ -388,6 +394,12 @@ def is_terminal() -> bool:
cf.register_option(
"max_dir_items", 100, pc_max_dir_items, validator=is_nonnegative_int
)
# Register the "integer_format" option with a default of None.
# NOTE(review): this validator only checks the type (None or str); it does not
# restrict the string to a particular set of separators - confirm whether
# invalid strings should be rejected here instead of at formatting time.
cf.register_option(
"integer_format",
None,
pc_integer_format_doc,
validator=is_instance_factory((type(None), str)),
)

tc_sim_interactive_doc = """
: boolean
Expand All @@ -412,9 +424,11 @@ def is_terminal() -> bool:
"copy_on_write",
# Get the default from an environment variable, if set, otherwise defaults
# to False. This environment variable can be set for testing.
"warn"
if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
(
"warn"
if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1"
),
copy_on_write_doc,
validator=is_one_of_factory([True, False, "warn"]),
)
Expand Down
36 changes: 27 additions & 9 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ def __init__(
formatters: FormattersType | None = None,
justify: str | None = None,
float_format: FloatFormatType | None = None,
integer_format: str | None = None,
sparsify: bool | None = None,
index_names: bool = True,
max_rows: int | None = None,
Expand All @@ -453,6 +454,7 @@ def __init__(
self.formatters = self._initialize_formatters(formatters)
self.justify = self._initialize_justify(justify)
self.float_format = float_format
self.integer_format = integer_format
self.sparsify = self._initialize_sparsify(sparsify)
self.show_index_names = index_names
self.decimal = decimal
Expand Down Expand Up @@ -756,6 +758,7 @@ def format_col(self, i: int) -> list[str]:
frame.iloc[:, i]._values,
formatter,
float_format=self.float_format,
integer_format=self.integer_format,
na_rep=self.na_rep,
space=self.col_space.get(frame.columns[i]),
decimal=self.decimal,
Expand Down Expand Up @@ -789,9 +792,11 @@ def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
fmt_columns = columns._format_flat(include_name=False)
str_columns = [
[
" " + x
if not self._get_formatter(i) and is_numeric_dtype(dtype)
else x
(
" " + x
if not self._get_formatter(i) and is_numeric_dtype(dtype)
else x
)
]
for i, (x, dtype) in enumerate(zip(fmt_columns, self.frame.dtypes))
]
Expand Down Expand Up @@ -1063,6 +1068,7 @@ def format_array(
values: ArrayLike,
formatter: Callable | None,
float_format: FloatFormatType | None = None,
integer_format: str | None = None,
na_rep: str = "NaN",
digits: int | None = None,
space: str | int | None = None,
Expand Down Expand Up @@ -1124,6 +1130,9 @@ def format_array(
if float_format is None:
float_format = get_option("display.float_format")

if integer_format is None:
integer_format = get_option("display.integer_format")

if digits is None:
digits = get_option("display.precision")

Expand All @@ -1132,6 +1141,7 @@ def format_array(
digits=digits,
na_rep=na_rep,
float_format=float_format,
integer_format=integer_format,
formatter=formatter,
space=space,
justify=justify,
Expand All @@ -1153,6 +1163,7 @@ def __init__(
na_rep: str = "NaN",
space: str | int = 12,
float_format: FloatFormatType | None = None,
integer_format: str | None = None,
justify: str = "right",
decimal: str = ".",
quoting: int | None = None,
Expand All @@ -1166,6 +1177,7 @@ def __init__(
self.space = space
self.formatter = formatter
self.float_format = float_format
self.integer_format = integer_format
self.justify = justify
self.decimal = decimal
self.quoting = quoting
Expand Down Expand Up @@ -1455,13 +1467,19 @@ def _format_strings(self) -> list[str]:

class _IntArrayFormatter(_GenericArrayFormatter):
    """Array formatter for integer values with an optional thousands separator."""

    def _format_strings(self) -> list[str]:
        """
        Render ``self.values`` as a list of strings.

        ``self.integer_format`` selects the thousands separator: ``","``,
        ``"_"`` or ``None`` for no separator. A leading space is reserved for
        the sign unless ``self.leading_space`` is ``False``. When
        ``self.formatter`` is set, it is applied to each value instead of the
        built-in format.

        Returns
        -------
        list of str
            One formatted string per element of ``self.values``.

        Raises
        ------
        ValueError
            If ``self.integer_format`` is not ``","``, ``"_"`` or ``None``.
        """
        if self.integer_format not in (",", "_", None):
            raise ValueError("integer_format must be one of ',','_', or None")

        # Build the format spec in a local instead of overwriting
        # self.integer_format, so the attribute keeps the value it was given.
        separator = self.integer_format or ""
        sign = "" if self.leading_space is False else " "
        spec = f"{sign}{separator}"

        formatter = self.formatter or (lambda x: format(x, spec))
        return [formatter(x) for x in self.values]


class _Datetime64Formatter(_GenericArrayFormatter):
Expand Down
66 changes: 66 additions & 0 deletions pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2259,3 +2259,69 @@ def test_filepath_or_buffer_bad_arg_raises(float_frame, method):
msg = "buf is not a file name and it has no write method"
with pytest.raises(TypeError, match=msg):
getattr(float_frame, method)(buf=object())


class TestIntArrayFormatter:
    """Check how ``display.integer_format`` affects the repr of a DataFrame."""

    def _assert_int_repr(self, separator, int_strings):
        """
        Compare the repr produced under ``separator`` with a reference frame.

        The reference frame holds already-rendered strings, so its repr does
        not depend on the option being tested.
        """
        with option_context("display.integer_format", separator):
            actual = repr(
                DataFrame({"A": [1000, 20000, 30], "B": [4.1, 50000.2, 600.0]})
            )
            reference = DataFrame(
                {"A": int_strings, "B": ["4.1", "50000.2", "600.0"]}
            )
            expected = repr(reference)

            assert actual == expected

    def test_format_comma(self):
        # "," groups the digits of column A; float column B is unchanged
        self._assert_int_repr(",", ["1,000", "20,000", "30"])

    def test_format_underscore(self):
        # "_" groups the digits of column A; float column B is unchanged
        self._assert_int_repr("_", ["1_000", "20_000", "30"])

    def test_format_empty(self):
        # None means no separator at all
        self._assert_int_repr(None, ["1000", "20000", "30"])

    def test_format_invalid_to_none(self):
        msg = "integer_format must be one of ',','_', or None"
        with option_context("display.integer_format", "."):
            frame = DataFrame({"A": [1000, 20000, 30], "B": [4.1, 50000.2, 600.0]})

            # An unsupported separator is only rejected when the frame is rendered
            with pytest.raises(ValueError, match=msg):
                repr(frame)

        # With the option set to None again, integers render without a separator
        self._assert_int_repr(None, ["1000", "20000", "30"])