[MRG + 1] Fix pprint ellipsis (scikit-learn#13436)

NicolasHug · ogrisel · commit 1fe00b58949a · 2019-04-18T11:15:41.000+02:00
diff --git a/sklearn/base.py b/sklearn/base.py
@@ -8,6 +8,7 @@
 from collections import defaultdict
 import platform
 import inspect
+import re
 
 import numpy as np
 
@@ -233,10 +234,13 @@ def set_params(self, **params):
 
         return self
 
-    def __repr__(self):
+    def __repr__(self, N_CHAR_MAX=700):
+        # N_CHAR_MAX is the (approximate) maximum number of non-blank
+        # characters to render. We pass it as an optional parameter to ease
+        # the tests.
+
         from .utils._pprint import _EstimatorPrettyPrinter
 
-        N_CHAR_MAX = 700  # number of non-whitespace or newline chars
         N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences
 
         # use ellipsis for sequences with a lot of elements
@@ -246,10 +250,37 @@ def __repr__(self):
 
         repr_ = pp.pformat(self)
 
-        # Use bruteforce ellipsis if string is very long
-        if len(''.join(repr_.split())) > N_CHAR_MAX:  # check non-blank chars
-            lim = N_CHAR_MAX // 2
-            repr_ = repr_[:lim] + '...' + repr_[-lim:]
+        # Use bruteforce ellipsis when there are a lot of non-blank characters
+        n_nonblank = len(''.join(repr_.split()))
+        if n_nonblank > N_CHAR_MAX:
+            lim = N_CHAR_MAX // 2  # apprx number of chars to keep on both ends
+            regex = r'^(\s*\S){%d}' % lim
+            # The regex '^(\s*\S){%d}' % n
+            # matches from the start of the string until the nth non-blank
+            # character:
+            # - ^ matches the start of string
+            # - (pattern){n} matches n repetitions of pattern
+            # - \s*\S matches a non-blank char following zero or more blanks
+            left_lim = re.match(regex, repr_).end()
+            right_lim = re.match(regex, repr_[::-1]).end()
+
+            if '\n' in repr_[left_lim:-right_lim]:
+                # The left side and right side aren't on the same line.
+                # To avoid weird cuts, e.g.:
+                # categoric...ore',
+                # we need to start the right side with an appropriate newline
+                # character so that it renders properly as:
+                # categoric...
+                # handle_unknown='ignore',
+                # so we add [^\n]*\n which matches until the next \n
+                regex += r'[^\n]*\n'
+                right_lim = re.match(regex, repr_[::-1]).end()
+
+            ellipsis = '...'
+            if left_lim + len(ellipsis) < len(repr_) - right_lim:
+                # Only add ellipsis if it results in a shorter repr
+                repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:]
+
         return repr_
 
     def __getstate__(self):
diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py
@@ -459,16 +459,78 @@ def test_n_max_elements_to_show():
     assert  pp.pformat(gs) == expected
 
 
-def test_length_constraint():
-    # When repr is still too long, use bruteforce ellipsis
-    # repr is a very long line so we don't check for equality here, just that
-    # ellipsis has been done. It's not the ellipsis from before because the
-    # number of elements in the dict is only 1.
-    vocabulary = {0: 'hello' * 1000}
-    vectorizer = CountVectorizer(vocabulary=vocabulary)
-    repr_ = vectorizer.__repr__()
-    assert '...' in repr_
+def test_bruteforce_ellipsis():
+    # Check that the bruteforce ellipsis (used when the number of non-blank
+    # characters exceeds N_CHAR_MAX) renders correctly.
+
+    lr = LogisticRegression()
+
+    # test when the left and right side of the ellipsis aren't on the same
+    # line.
+    expected = """
+LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+                   in...
+                   multi_class='warn', n_jobs=None, penalty='l2',
+                   random_state=None, solver='warn', tol=0.0001, verbose=0,
+                   warm_start=False)"""
+
+    expected = expected[1:]  # remove first \n
+    assert expected == lr.__repr__(N_CHAR_MAX=150)
+
+    # test with very small N_CHAR_MAX
+    # Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid
+    # weird reprs we still keep the whole line of the right part (after the
+    # ellipsis).
+    expected = """
+Lo...
+                   warm_start=False)"""
+
+    expected = expected[1:]  # remove first \n
+    assert expected == lr.__repr__(N_CHAR_MAX=4)
+
+    # test with N_CHAR_MAX == number of non-blank characters: In this case we
+    # don't want ellipsis
+    full_repr = lr.__repr__(N_CHAR_MAX=float('inf'))
+    n_nonblank = len(''.join(full_repr.split()))
+    assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
+    assert '...' not in full_repr
+
+    # test with N_CHAR_MAX == number of non-blank characters - 10: the left and
+    # right side of the ellispsis are on different lines. In this case we
+    # want to expend the whole line of the right side
+    expected = """
+LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+                   intercept_scaling=1, l1_ratio=None, max_i...
+                   multi_class='warn', n_jobs=None, penalty='l2',
+                   random_state=None, solver='warn', tol=0.0001, verbose=0,
+                   warm_start=False)"""
+    expected = expected[1:]  # remove first \n
+    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)
+
+    # test with N_CHAR_MAX == number of non-blank characters - 10: the left and
+    # right side of the ellispsis are on the same line. In this case we don't
+    # want to expend the whole line of the right side, just add the ellispsis
+    # between the 2 sides.
+    expected = """
+LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+                   intercept_scaling=1, l1_ratio=None, max_iter...,
+                   multi_class='warn', n_jobs=None, penalty='l2',
+                   random_state=None, solver='warn', tol=0.0001, verbose=0,
+                   warm_start=False)"""
+    expected = expected[1:]  # remove first \n
+    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)
 
+    # test with N_CHAR_MAX == number of non-blank characters - 2: the left and
+    # right side of the ellispsis are on the same line, but adding the ellipsis
+    # would actually make the repr longer. So we don't add the ellipsis.
+    expected = """
+LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
+                   intercept_scaling=1, l1_ratio=None, max_iter=100,
+                   multi_class='warn', n_jobs=None, penalty='l2',
+                   random_state=None, solver='warn', tol=0.0001, verbose=0,
+                   warm_start=False)"""
+    expected = expected[1:]  # remove first \n
+    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)
 
 def test_builtin_prettyprinter():
     # non regression test than ensures we can still use the builtin