pandas-dev · jtratner · Oct 30, 2013 · Oct 30, 2013 · Nov 17, 2014 · jorisvandenbossche
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4297,6 +4297,55 @@ def mode(self, axis=0, numeric_only=False):
         f = lambda s: s.mode()
         return data.apply(f, axis=axis)
 
+    def value_counts(self, axis=0, normalize=False, sort=True,
+                     ascending=False, bins=None, numeric_only=False):
+        """
+        Returns DataFrame containing counts of unique values. By default the
+        result is sorted in descending order so that the first row is the
+        most frequently-occurring value among *all* columns. Excludes NA
+        values. Counts are computed separately along axis (i.e., column/row).
+
+        Parameters
+        ----------
+        axis : {0, 1, 'index', 'columns'} (default 0)
+            0/'index' : get value_counts by column
+            1/'columns' : get value_counts by row
+        normalize : boolean, default False
+            If True then the DataFrame returned will contain the relative
+            frequencies of the unique values.
+        sort : boolean, default True
+            Sort by sum of counts across columns (if False, DataFrame will be
+            sorted by union of all the unique values found)
+        ascending : boolean, default False
+            Sort in ascending order
+        bins : integer or sequence of scalars, optional
+            Rather than count values, group them into half-open bins, a
+            convenience for pd.cut, only works with numeric data. If integer,
+            then creates bins based upon overall max and overall min of the
+            data being counted (after numeric_only filtering).
+        numeric_only : bool, default False
+            only apply to numeric columns.
+
+        Returns
+        -------
+        counts : DataFrame
+        """
+        data = self if not numeric_only else self._get_numeric_data()
+        if bins is not None and not com._is_sequence(bins):
+            # Integer bins: derive shared edges from the same data that gets
+            # counted, so columns dropped by numeric_only cannot skew them.
+            from pandas.tools.tile import _generate_bins
+            bins = _generate_bins(bins=bins, min_val=data.min().min(),
+                                  max_val=data.max().max())
+
+        f = lambda s: s.value_counts(normalize=normalize, bins=bins)
+        res = data.apply(f, axis=axis)
+        if sort:
+            order = res.sum(1).order(ascending=ascending).index
+            res = res.reindex(order)
+
+        return res
+
     def quantile(self, q=0.5, axis=0, numeric_only=True):
         """
         Return values at the given quantile over requested axis, a la

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -11253,6 +11253,39 @@ def test_count(self):
         expected = Series(0, index=[])
         assert_series_equal(result, expected)
 
+    def test_value_counts(self):
+        num_df = DataFrame({"A": [0, 5, 8, 10, 13], "B": [4, 16, 2, 30, 10]})
+        expected = DataFrame({"A": pd.Series([1, 1, 1, 1, 1],
+                                             index=[0, 5, 8, 10, 13]),
+                              "B": pd.Series([1, 1, 1, 1, 1],
+                                             index=[4, 16, 2, 30, 10])})
+        expected = expected.reindex([10, 30, 16, 13, 8, 5, 4, 2, 0])
+        assert_frame_equal(num_df.value_counts(), expected)
+        str_df = DataFrame({"A": ['a', 'a', 'a', 'c', 'd', 'e'],
+                            "B": ['e', 'c', 'd', 'x', 'y', 'a']})
+        actual = str_df.value_counts()
+        expected = DataFrame({"A": Series([3, 1, 1, 1], index=['a', 'e', 'd',
+                                                               'c']),
+                              "B": Series([1, 1, 1, 1, 1, 1],
+                                          index=['e', 'c', 'd', 'x', 'y',
+                                                 'a'])})
+        expected = expected.ix[expected.sum(1).order(ascending=False).index]
+        assert_frame_equal(actual, expected)
+
+        # finally, with bins
+
+        # Bin the numeric frame: the edges below span its values (0..30), so
+        # every observation lands in a bin. Cutting the string frame against
+        # numeric edges would not be a meaningful comparison.
+        bins = [-0.03, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30]
+        actual = num_df.value_counts(bins=bins)
+        expected = DataFrame({
+            "A": pd.cut(num_df["A"], bins=bins).value_counts(),
+            "B": pd.cut(num_df["B"], bins=bins).value_counts()
+        })
+        expected = expected.ix[expected.sum(1).order(ascending=False).index]
+        assert_frame_equal(actual, expected)
+
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)
 

diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -12,6 +12,60 @@
 
 import numpy as np
 
+def _generate_bins(x=None, bins=None, min_val=None, max_val=None, right=True):
+    """
+    Generate bin edges for cut.
+
+    If bins is a sequence it is used as the edges (must not decrease). If it
+    is an integer, that many equal-width bins are built over [min_val,
+    max_val] when both are passed, otherwise over the range of x as given by
+    nanops.nanmin/nanmax. Outer edges are nudged by 0.1% (see inline notes).
+
+    Raises ValueError for missing or inconsistent arguments, bins < 1, empty
+    x, or decreasing edges. Returns the edges as an ndarray.
+    """
+    if bins is None:
+        raise ValueError("bins cannot be None.")
+    if (min_val is None) != (max_val is None):
+        raise ValueError("Must pass *both* min_val and max_val")
+
+    # NOTE: this binning code is changed a bit from histogram for var(x) == 0
+    if not np.iterable(bins):
+        if np.isscalar(bins) and bins < 1:
+            raise ValueError("`bins` should be a positive integer.")
+        if min_val is not None:  # explicit range wins; x is ignored
+            rng = (min_val, max_val)
+        elif x is None:
+            raise ValueError("Must pass either min/max vals or array-like")
+        else:
+            try:  # for array-like
+                sz = x.size
+            except AttributeError:
+                x = np.asarray(x)
+                sz = x.size
+            if sz == 0:
+                raise ValueError('Cannot cut empty array')
+            rng = (nanops.nanmin(x), nanops.nanmax(x))
+        mn, mx = [mi + 0.0 for mi in rng]
+
+        if mn == mx:  # adjust end points before binning
+            # widen by magnitude so negative or zero values still give mn < mx
+            mn -= .001 * abs(mn) if mn != 0 else .001
+            mx += .001 * abs(mx) if mx != 0 else .001
+            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
+        else:  # adjust end points after binning
+            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
+            adj = (mx - mn) * 0.001  # 0.1% of the range
+            if right:
+                bins[0] -= adj
+            else:
+                bins[-1] += adj
+    else:
+        bins = np.asarray(bins)
+        if (np.diff(bins) < 0).any():
+            raise ValueError('bins must increase monotonically.')
+    return bins
+
 
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         include_lowest=False):
@@ -75,39 +129,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     >>> pd.cut(np.ones(5), 4, labels=False)
     array([1, 1, 1, 1, 1], dtype=int64)
     """
-    # NOTE: this binning code is changed a bit from histogram for var(x) == 0
-    if not np.iterable(bins):
-        if np.isscalar(bins) and bins < 1:
-            raise ValueError("`bins` should be a positive integer.")
-        try:  # for array-like
-            sz = x.size
-        except AttributeError:
-            x = np.asarray(x)
-            sz = x.size
-        if sz == 0:
-            raise ValueError('Cannot cut empty array')
-            # handle empty arrays. Can't determine range, so use 0-1.
-            # rng = (0, 1)
-        else:
-            rng = (nanops.nanmin(x), nanops.nanmax(x))
-        mn, mx = [mi + 0.0 for mi in rng]
+    if x is None:
+        raise TypeError("Must pass array-like as first argument, not None")
 
-        if mn == mx:  # adjust end points before binning
-            mn -= .001 * mn
-            mx += .001 * mx
-            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
-        else:  # adjust end points after binning
-            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
-            adj = (mx - mn) * 0.001  # 0.1% of the range
-            if right:
-                bins[0] -= adj
-            else:
-                bins[-1] += adj
-
-    else:
-        bins = np.asarray(bins)
-        if (np.diff(bins) < 0).any():
-            raise ValueError('bins must increase monotonically.')
+    bins = _generate_bins(x, bins, right=right)
 
     return _bins_to_cuts(x, bins, right=right, labels=labels,retbins=retbins, precision=precision,
                          include_lowest=include_lowest)