Support data input as DataFrame in Clustergram. (#478)

mkcor · Shammamah Hossain · web-flow · commit 9523a2ce2b80 · 2020-02-21T15:01:35.000-05:00
* Write test for dataframe input of Clustergram

* Support data input as DataFrame in Clustergram

* Add integration test for Clustergram reading in dataframes

* Lint and standardize imports

* Log enhancement for upcoming release

* Update version number in package.

* Update autogenerated files.

* Update CHANGELOG with release date.

Co-authored-by: Shammamah Hossain <shammamah.hossain@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [0.4.7] - 2020-02-21
+### Added
+* [#478](https://github.com/plotly/dash-bio/pull/478) Added support of
+  DataFrame as Clustergram input.
+
 ## [0.4.6] - 2020-01-07
 ### Fixed
 * [#458](https://github.com/plotly/dash-bio/pull/458) Fixed reordering bug of
diff --git a/dash_bio/bundle.js b/dash_bio/bundle.js
diff --git a/dash_bio/component_factory/_clustergram.py b/dash_bio/component_factory/_clustergram.py
@@ -2,6 +2,7 @@
 from random import shuffle
 
 import numpy as np
+import pandas as pd
 import scipy
 import scipy.cluster.hierarchy as sch
 import scipy.spatial as scs
@@ -13,7 +14,7 @@
 
 # pylint: disable=assignment-from-no-return, no-self-use
 def Clustergram(
-        data=None,
+        data,
         generate_curves_dict=False,
         return_computed_traces=False,
         computed_traces=None,
@@ -49,7 +50,8 @@ def Clustergram(
 
 Keyword arguments:
 
-- data (ndarray; required): Matrix of observations as array of arrays
+- data (2D array-like; required): Matrix or table of observations (dropping
+    columns of non-numeric dtype).
 - generate_curves_dict (bool; default False): Whether or not to return a
     dictionary containing information about the cluster number
     associated with each curve number in the graph. (May be useful
@@ -200,7 +202,7 @@ class _Clustergram:
 
     def __init__(
             self,
-            data=None,
+            data,
             row_labels=None,
             column_labels=None,
             hidden_labels=None,
@@ -234,6 +236,9 @@ def __init__(
     See docstring of the `Clustergram` function, where the same keyword arguments (and a couple
     of other ones) are documented.
         """
+        if isinstance(data, pd.DataFrame):
+            data = data.select_dtypes('number')
+            data = data.values
         if hidden_labels is None:
             hidden_labels = []
         if color_threshold is None:
diff --git a/dash_bio/package-info.json b/dash_bio/package-info.json
@@ -1 +1 @@
-{"name": "dash_bio", "version": "0.4.6", "author": "The Plotly Team <dashbio@plot.ly>"}
+{"name": "dash_bio", "version": "0.4.7", "author": "The Plotly Team <dashbio@plot.ly>"}
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "dash-bio",
-  "version": "0.4.6",
+  "version": "0.4.7",
   "description": "Dash components for bioinformatics",
   "repository": {
     "type": "git",
diff --git a/tests/integration/test_clustergram.py b/tests/integration/test_clustergram.py
@@ -1,16 +1,16 @@
-import pandas
 import json
+import pandas as pd
 
 import dash
-import dash_bio
 import dash_html_components as html
+import dash_bio
 
 from common_features import nested_component_layout, \
     nested_component_app_callback
 
 _data = None
 
-_mtcars_data = pandas.read_csv(
+_mtcars_data = pd.read_csv(
     'tests/dashbio_demos/dash-clustergram/data/mtcars.tsv',
     delimiter='\t',
     skiprows=4
@@ -185,3 +185,30 @@ def test_dbcl005_row_annotations(dash_duo):
     # the annotation is the correct color
     dash_duo.wait_for_style_to_equal(
         'g.subplot.x6y6 g.plot g.lines > path', 'stroke', 'rgb(248, 62, 199)')
+
+
+def test_dbcl006_df_input_row_cluster(dash_duo):
+
+    app = dash.Dash(__name__)
+
+    # run the same test as dbcl002 (row clustering) where table of
+    # observations (data argument) is left as a DataFrame
+    assert isinstance(_mtcars_data, pd.DataFrame)
+    app.layout = html.Div(nested_component_layout(
+        dash_bio.Clustergram(
+            data=_mtcars_data
+        )
+    ))
+
+    nested_component_app_callback(
+        app,
+        dash_duo,
+        component=dash_bio.Clustergram,
+        component_data=_data,
+        test_prop_name='cluster',
+        test_prop_value='row',
+        prop_value_type='string'
+    )
+
+    assert len(dash_duo.find_elements('g.subplot.x2y2')) == 0
+    assert len(dash_duo.find_elements('g.subplot.x4y4')) == 1
diff --git a/tests/unit/test_clustergram.py b/tests/unit/test_clustergram.py
@@ -1,33 +1,51 @@
 import numpy as np
+import pandas as pd
 
 from dash_bio import Clustergram
 
+DATA = np.array(
+    [[1, 1, 1, 1],
+     [3, 3, 3, 3],
+     [1, 1, 1, 1],
+     [3, 3, 3, 3],
+     [1, 1, 1, 1],
+     [3, 3, 3, 3]]
+)
+CLUSTERED_DATA = np.array(
+    [[1, 1, 1, 1],
+     [1, 1, 1, 1],
+     [1, 1, 1, 1],
+     [3, 3, 3, 3],
+     [3, 3, 3, 3],
+     [3, 3, 3, 3]]
+)
+
 
 def test_cluster_rows():
     """Test that rows of 1's and 3's are properly clustered."""
-    data = np.array(
-        [[1, 1, 1, 1],
-         [3, 3, 3, 3],
-         [1, 1, 1, 1],
-         [3, 3, 3, 3],
-         [1, 1, 1, 1],
-         [3, 3, 3, 3]]
-    )
 
+    data = DATA
     _, _, curves_dict = Clustergram(
         data,
         generate_curves_dict=True,
         return_computed_traces=True,
         center_values=False
     )
+    clustered_data = CLUSTERED_DATA
+
+    assert np.array_equal(curves_dict['heatmap']['z'], clustered_data)
+
+
+def test_read_dataframe():
+    """Test that input data can be in a dataframe."""
 
-    clustered_data = np.array(
-        [[1, 1, 1, 1],
-         [1, 1, 1, 1],
-         [1, 1, 1, 1],
-         [3, 3, 3, 3],
-         [3, 3, 3, 3],
-         [3, 3, 3, 3]]
+    data = pd.DataFrame(DATA)
+    _, _, curves_dict = Clustergram(
+        data,
+        generate_curves_dict=True,
+        return_computed_traces=True,
+        center_values=False
     )
+    clustered_data = CLUSTERED_DATA
 
     assert np.array_equal(curves_dict['heatmap']['z'], clustered_data)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-{"name": "dash_bio", "version": "0.4.6", "author": "The Plotly Team <dashbio@plot.ly>"}`
	`1`	`+{"name": "dash_bio", "version": "0.4.7", "author": "The Plotly Team <dashbio@plot.ly>"}`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "dash-bio",`
`3`		`- "version": "0.4.6",`
	`3`	`+ "version": "0.4.7",`
`4`	`4`	`"description": "Dash components for bioinformatics",`
`5`	`5`	`"repository": {`
`6`	`6`	`"type": "git",`