Merge branch 'ragrawal-master'

dukebody · dukebody · commit 2fc6286e2d60 · 2017-04-08T17:29:57.000+02:00
diff --git a/README.rst b/README.rst
@@ -103,6 +103,18 @@ Now that the transformation is trained, we confirm that it works on new data::
     array([[ 1.  ,  0.  ,  0.  ,  1.04]])
 
 
+Output feature names
+********************
+
+In certain cases, like when studying the feature importances for some model,
+we want to be able to associate the original features with the ones generated
+by the dataframe mapper. We can do so by inspecting the automatically
+generated ``transformed_names_`` attribute of the mapper after transformation::
+
+    >>> mapper.transformed_names_
+    ['pet_cat', 'pet_dog', 'pet_fish', 'children']
+
+
 Outputting a dataframe
 **********************
 
@@ -123,6 +135,9 @@ By default the output of the dataframe mapper is a numpy array. This is so becau
     6      1.0      0.0       0.0      1.04
     7      0.0      0.0       1.0      0.21
 
+The names for the columns are the same ones present in the ``transformed_names_``
+attribute.
+
 Note this does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.
 
 Transform Multiple Columns
@@ -252,6 +267,11 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
 Changelog
 ---------
 
+Development
+***********
+* Capture the generated output column names in the ``transformed_names_`` attribute (#78).
+
+
 1.3.0 (2017-01-21)
 ******************
 
@@ -308,5 +328,6 @@ Other contributors:
 * Jeremy Howard
 * Olivier Grisel
 * Paul Butler
+* Ritesh Agrawal
 * Vitaley Zaretskey
 * Zac Stewart
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
@@ -65,6 +65,8 @@ def __init__(self, features, default=False, sparse=False, df_out=False):
         self.default = _build_transformer(default)
         self.sparse = sparse
         self.df_out = df_out
+        self.transformed_names_ = []
+
         if (df_out and (sparse or default)):
             raise ValueError("Can not use df_out with sparse or default")
 
@@ -187,7 +189,7 @@ def transform(self, X):
         X       the data to transform
         """
         extracted = []
-        index = []
+        self.transformed_names_ = []
         for columns, transformers in self.features:
             # columns could be a string or list of
             # strings; we don't care because pandas
@@ -196,8 +198,8 @@ def transform(self, X):
             if transformers is not None:
                 Xt = transformers.transform(Xt)
             extracted.append(_handle_feature(Xt))
-            if self.df_out:
-                index = index + self.get_names(columns, transformers, Xt)
+
+            self.transformed_names_ += self.get_names(columns, transformers, Xt)
 
         # handle features not explicitly selected
         if self.default is not False:
@@ -206,7 +208,7 @@ def transform(self, X):
             if self.default is not None:
                 Xt = self.default.transform(Xt)
             extracted.append(_handle_feature(Xt))
-
+            self.transformed_names_ += self.get_names(unsel_cols, self.default, Xt)
 
         # combine the feature outputs into one array.
         # at this point we lose track of which features
@@ -227,4 +229,4 @@ def transform(self, X):
         if not self.df_out:
             return stacked
 
-        return pd.DataFrame(stacked, columns=index)
+        return pd.DataFrame(stacked, columns=self.transformed_names_)
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -96,6 +96,28 @@ def complex_dataframe():
                          'feat2': [1, 2, 3, 2, 3, 4]})
 
 
+def test_transformed_names_simple(simple_dataframe):
+    """
+    The `transformed_names_` attribute holds the original column name
+    when a single column is mapped with no transformer (``None``).
+    """
+    df = simple_dataframe
+    mapper = DataFrameMapper([('a', None)])
+    mapper.fit_transform(df)
+    assert mapper.transformed_names_ == ['a']
+
+
+def test_transformed_names_binarizer(complex_dataframe):
+    """
+    The `transformed_names_` attribute lists one generated name per output
+    column for a transformation that multiplies the number of columns.
+    """
+    df = complex_dataframe
+    mapper = DataFrameMapper([('target', LabelBinarizer())])
+    mapper.fit_transform(df)
+    assert mapper.transformed_names_ == ['target_a', 'target_b']
+
+
 def test_simple_df(simple_dataframe):
     """
     Get a dataframe from a simple mapped dataframe