Skip to content

Commit 2fc6286

Browse files
committed
Merge branch 'ragrawal-master'
2 parents c916211 + 79a79f1 commit 2fc6286

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

README.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,18 @@ Now that the transformation is trained, we confirm that it works on new data::
103103
array([[ 1. , 0. , 0. , 1.04]])
104104

105105

106+
Output feature names
107+
*********************
108+
109+
In certain cases, like when studying the feature importances for some model,
110+
we want to be able to associate the original features with the ones generated by
111+
the dataframe mapper. We can do so by inspecting the automatically generated
112+
``transformed_names_`` attribute of the mapper after transformation::
113+
114+
>>> mapper.transformed_names_
115+
['pet_cat', 'pet_dog', 'pet_fish', 'children']
116+
117+
106118
Outputting a dataframe
107119
**********************
108120

@@ -123,6 +135,9 @@ By default the output of the dataframe mapper is a numpy array. This is so becau
123135
6 1.0 0.0 0.0 1.04
124136
7 0.0 0.0 1.0 0.21
125137

138+
The names for the columns are the same ones present in the ``transformed_names_``
139+
attribute.
140+
126141
Note this does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.
127142

128143
Transform Multiple Columns
@@ -252,6 +267,11 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
252267
Changelog
253268
---------
254269

270+
Development
271+
***********
272+
* Capture the generated names of the output columns in the ``transformed_names_`` attribute (#78).
273+
274+
255275
1.3.0 (2017-01-21)
256276
******************
257277

@@ -308,5 +328,6 @@ Other contributors:
308328
* Jeremy Howard
309329
* Olivier Grisel
310330
* Paul Butler
331+
* Ritesh Agrawal
311332
* Vitaley Zaretskey
312333
* Zac Stewart

sklearn_pandas/dataframe_mapper.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ def __init__(self, features, default=False, sparse=False, df_out=False):
6565
self.default = _build_transformer(default)
6666
self.sparse = sparse
6767
self.df_out = df_out
68+
self.transformed_names_ = []
69+
6870
if (df_out and (sparse or default)):
6971
raise ValueError("Can not use df_out with sparse or default")
7072

@@ -187,7 +189,7 @@ def transform(self, X):
187189
X the data to transform
188190
"""
189191
extracted = []
190-
index = []
192+
self.transformed_names_ = []
191193
for columns, transformers in self.features:
192194
# columns could be a string or list of
193195
# strings; we don't care because pandas
@@ -196,8 +198,8 @@ def transform(self, X):
196198
if transformers is not None:
197199
Xt = transformers.transform(Xt)
198200
extracted.append(_handle_feature(Xt))
199-
if self.df_out:
200-
index = index + self.get_names(columns, transformers, Xt)
201+
202+
self.transformed_names_ += self.get_names(columns, transformers, Xt)
201203

202204
# handle features not explicitly selected
203205
if self.default is not False:
@@ -206,7 +208,7 @@ def transform(self, X):
206208
if self.default is not None:
207209
Xt = self.default.transform(Xt)
208210
extracted.append(_handle_feature(Xt))
209-
211+
self.transformed_names_ += self.get_names(unsel_cols, self.default, Xt)
210212

211213
# combine the feature outputs into one array.
212214
# at this point we lose track of which features
@@ -227,4 +229,4 @@ def transform(self, X):
227229
if not self.df_out:
228230
return stacked
229231

230-
return pd.DataFrame(stacked, columns=index)
232+
return pd.DataFrame(stacked, columns=self.transformed_names_)

tests/test_dataframe_mapper.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,28 @@ def complex_dataframe():
9696
'feat2': [1, 2, 3, 2, 3, 4]})
9797

9898

99+
def test_transformed_names_simple(simple_dataframe):
    """
    A pass-through mapping of a single column should expose that column's
    name unchanged in the `transformed_names_` attribute.
    """
    mapper = DataFrameMapper([('a', None)])
    mapper.fit_transform(simple_dataframe)
    assert mapper.transformed_names_ == ['a']
108+
109+
110+
def test_transformed_names_binarizer(complex_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for a transformation that multiplies the number of columns
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer())])
    mapper.fit_transform(df)
    # The comparison has to be asserted: as a bare expression its result
    # was silently discarded, so this test could never fail.
    # NOTE(review): the expected names depend on the classes present in the
    # fixture's 'target' column, which is not visible here -- confirm them.
    assert mapper.transformed_names_ == ['target_a', 'target_b']
119+
120+
99121
def test_simple_df(simple_dataframe):
100122
"""
101123
Get a dataframe from a simple mapped dataframe

0 commit comments

Comments
 (0)