Skip to content

Commit 1fe00b5

Browse files
NicolasHugogrisel
authored andcommitted
[MRG + 1] Fix pprint ellipsis (scikit-learn#13436)
1 parent 77b73d6 commit 1fe00b5

File tree

2 files changed

+108
-15
lines changed

2 files changed

+108
-15
lines changed

sklearn/base.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from collections import defaultdict
99
import platform
1010
import inspect
11+
import re
1112

1213
import numpy as np
1314

@@ -233,10 +234,13 @@ def set_params(self, **params):
233234

234235
return self
235236

236-
def __repr__(self):
237+
def __repr__(self, N_CHAR_MAX=700):
238+
# N_CHAR_MAX is the (approximate) maximum number of non-blank
239+
# characters to render. We pass it as an optional parameter to ease
240+
# the tests.
241+
237242
from .utils._pprint import _EstimatorPrettyPrinter
238243

239-
N_CHAR_MAX = 700 # number of non-whitespace or newline chars
240244
N_MAX_ELEMENTS_TO_SHOW = 30 # number of elements to show in sequences
241245

242246
# use ellipsis for sequences with a lot of elements
@@ -246,10 +250,37 @@ def __repr__(self):
246250

247251
repr_ = pp.pformat(self)
248252

249-
# Use bruteforce ellipsis if string is very long
250-
if len(''.join(repr_.split())) > N_CHAR_MAX: # check non-blank chars
251-
lim = N_CHAR_MAX // 2
252-
repr_ = repr_[:lim] + '...' + repr_[-lim:]
253+
# Use bruteforce ellipsis when there are a lot of non-blank characters
254+
n_nonblank = len(''.join(repr_.split()))
255+
if n_nonblank > N_CHAR_MAX:
256+
lim = N_CHAR_MAX // 2 # apprx number of chars to keep on both ends
257+
regex = r'^(\s*\S){%d}' % lim
258+
# The regex '^(\s*\S){%d}' % n
259+
# matches from the start of the string until the nth non-blank
260+
# character:
261+
# - ^ matches the start of string
262+
# - (pattern){n} matches n repetitions of pattern
263+
# - \s*\S matches a non-blank char following zero or more blanks
264+
left_lim = re.match(regex, repr_).end()
265+
right_lim = re.match(regex, repr_[::-1]).end()
266+
267+
if '\n' in repr_[left_lim:-right_lim]:
268+
# The left side and right side aren't on the same line.
269+
# To avoid weird cuts, e.g.:
270+
# categoric...ore',
271+
# we need to start the right side with an appropriate newline
272+
# character so that it renders properly as:
273+
# categoric...
274+
# handle_unknown='ignore',
275+
# so we add [^\n]*\n which matches until the next \n
276+
regex += r'[^\n]*\n'
277+
right_lim = re.match(regex, repr_[::-1]).end()
278+
279+
ellipsis = '...'
280+
if left_lim + len(ellipsis) < len(repr_) - right_lim:
281+
# Only add ellipsis if it results in a shorter repr
282+
repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:]
283+
253284
return repr_
254285

255286
def __getstate__(self):

sklearn/utils/tests/test_pprint.py

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -459,16 +459,78 @@ def test_n_max_elements_to_show():
459459
assert pp.pformat(gs) == expected
460460

461461

462-
def test_length_constraint():
463-
# When repr is still too long, use bruteforce ellipsis
464-
# repr is a very long line so we don't check for equality here, just that
465-
# ellipsis has been done. It's not the ellipsis from before because the
466-
# number of elements in the dict is only 1.
467-
vocabulary = {0: 'hello' * 1000}
468-
vectorizer = CountVectorizer(vocabulary=vocabulary)
469-
repr_ = vectorizer.__repr__()
470-
assert '...' in repr_
462+
def test_bruteforce_ellipsis():
463+
# Check that the bruteforce ellipsis (used when the number of non-blank
464+
# characters exceeds N_CHAR_MAX) renders correctly.
465+
466+
lr = LogisticRegression()
467+
468+
# test when the left and right side of the ellipsis aren't on the same
469+
# line.
470+
expected = """
471+
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
472+
in...
473+
multi_class='warn', n_jobs=None, penalty='l2',
474+
random_state=None, solver='warn', tol=0.0001, verbose=0,
475+
warm_start=False)"""
476+
477+
expected = expected[1:] # remove first \n
478+
assert expected == lr.__repr__(N_CHAR_MAX=150)
479+
480+
# test with very small N_CHAR_MAX
481+
# Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid
482+
# weird reprs we still keep the whole line of the right part (after the
483+
# ellipsis).
484+
expected = """
485+
Lo...
486+
warm_start=False)"""
487+
488+
expected = expected[1:] # remove first \n
489+
assert expected == lr.__repr__(N_CHAR_MAX=4)
490+
491+
# test with N_CHAR_MAX == number of non-blank characters: In this case we
492+
# don't want ellipsis
493+
full_repr = lr.__repr__(N_CHAR_MAX=float('inf'))
494+
n_nonblank = len(''.join(full_repr.split()))
495+
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
496+
assert '...' not in full_repr
497+
498+
# test with N_CHAR_MAX == number of non-blank characters - 10: the left and
499+
# right side of the ellispsis are on different lines. In this case we
500+
# want to expend the whole line of the right side
501+
expected = """
502+
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
503+
intercept_scaling=1, l1_ratio=None, max_i...
504+
multi_class='warn', n_jobs=None, penalty='l2',
505+
random_state=None, solver='warn', tol=0.0001, verbose=0,
506+
warm_start=False)"""
507+
expected = expected[1:] # remove first \n
508+
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)
509+
510+
# test with N_CHAR_MAX == number of non-blank characters - 10: the left and
511+
# right side of the ellispsis are on the same line. In this case we don't
512+
# want to expend the whole line of the right side, just add the ellispsis
513+
# between the 2 sides.
514+
expected = """
515+
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
516+
intercept_scaling=1, l1_ratio=None, max_iter...,
517+
multi_class='warn', n_jobs=None, penalty='l2',
518+
random_state=None, solver='warn', tol=0.0001, verbose=0,
519+
warm_start=False)"""
520+
expected = expected[1:] # remove first \n
521+
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)
471522

523+
# test with N_CHAR_MAX == number of non-blank characters - 2: the left and
524+
# right side of the ellispsis are on the same line, but adding the ellipsis
525+
# would actually make the repr longer. So we don't add the ellipsis.
526+
expected = """
527+
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
528+
intercept_scaling=1, l1_ratio=None, max_iter=100,
529+
multi_class='warn', n_jobs=None, penalty='l2',
530+
random_state=None, solver='warn', tol=0.0001, verbose=0,
531+
warm_start=False)"""
532+
expected = expected[1:] # remove first \n
533+
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)
472534

473535
def test_builtin_prettyprinter():
474536
# non regression test than ensures we can still use the builtin

0 commit comments

Comments
 (0)