ENH: add sparse op for int64 dtypes #13848

Closed · wants to merge 1 commit
28 changes: 23 additions & 5 deletions doc/source/whatsnew/v0.19.0.txt
@@ -307,6 +307,29 @@ Google BigQuery Enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).

.. _whatsnew_0190.sparse:

Sparse changes
~~~~~~~~~~~~~~

These changes allow pandas to handle sparse data with more dtypes, and work toward a smoother data-handling experience.

- Sparse data structures can now preserve ``dtype`` after arithmetic operations (:issue:`13848`)

.. ipython:: python

s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64)
s.dtype

s + 1


- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
- Bug in ``SparseDataFrame`` in which the ``dtype`` and ``fill_value`` of a passed ``SparseArray`` or ``SparseSeries`` were not respected (:issue:`13866`)
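The dtype preservation described above can be sketched with plain NumPy: a sparse vector stores only its non-fill values plus a fill value, and a scalar op touches just those, so an int64 input need never be upcast to float64. The helper below is a hypothetical illustration, not pandas API.

```python
import numpy as np

# Minimal sketch, not the pandas implementation: a sparse vector is
# modeled as (sp_values, fill_value); the helper name is hypothetical.
def sparse_add_scalar(sp_values, fill_value, scalar):
    # Only the stored values and the fill participate in the op, so an
    # int64 array stays int64 instead of being forced to float64.
    return sp_values + scalar, fill_value + scalar

sp_values = np.array([2, 1], dtype=np.int64)   # dense form: [0, 2, 0, 1]
vals, fill = sparse_add_scalar(sp_values, np.int64(0), 1)
print(vals.dtype, fill)   # int64 1
```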

.. _whatsnew_0190.enhancements.other:

Other enhancements
Expand Down Expand Up @@ -754,11 +777,6 @@ Bug Fixes
- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`)
- Bug in ``SparseDataFrame`` in which the ``dtype`` and ``fill_value`` of a passed ``SparseArray`` or ``SparseSeries`` were not respected (:issue:`13866`)
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`)
100 changes: 72 additions & 28 deletions pandas/sparse/array.py
@@ -48,16 +48,14 @@ def wrapper(self, other):
raise AssertionError("length mismatch: %d vs. %d" %
(len(self), len(other)))
if not isinstance(other, ABCSparseArray):
other = SparseArray(other, fill_value=self.fill_value)
if name[0] == 'r':
return _sparse_array_op(other, self, op, name[1:])
else:
return _sparse_array_op(self, other, op, name)
dtype = getattr(other, 'dtype', None)
other = SparseArray(other, fill_value=self.fill_value,
dtype=dtype)
return _sparse_array_op(self, other, op, name)
elif is_scalar(other):
new_fill_value = op(np.float64(self.fill_value), np.float64(other))

fill = op(_get_fill(self), np.asarray(other))
return _wrap_result(name, op(self.sp_values, other),
self.sp_index, new_fill_value)
self.sp_index, fill)
else: # pragma: no cover
raise TypeError('operation with %s not supported' % type(other))

@@ -67,33 +65,74 @@ def wrapper(self, other):
return wrapper


def _sparse_array_op(left, right, op, name):
if left.sp_index.equals(right.sp_index):
result = op(left.sp_values, right.sp_values)
result_index = left.sp_index
def _maybe_match_dtype(left, right):
if not hasattr(right, 'dtype'):
return left.dtype
elif left.dtype == right.dtype:
return getattr(left.dtype, '__name__', left.dtype)
else:
sparse_op = getattr(splib, 'sparse_%s' % name)
result, result_index = sparse_op(left.sp_values, left.sp_index,
left.fill_value, right.sp_values,
right.sp_index, right.fill_value)
# ToDo: to be supported after GH 667
raise NotImplementedError('dtypes must be identical')
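A self-contained version of this dtype-matching check, simplified from the diff above (it uses ``dtype.name`` for the kernel suffix, and mixed dtypes are rejected pending GH 667):

```python
import numpy as np

def maybe_match_dtype(left, right):
    # Simplified sketch of the check above: scalars (no .dtype) fall back
    # to the left dtype; matching array dtypes select the sparse kernel
    # suffix (e.g. 'int64'); mixed dtypes are not supported yet (GH 667).
    if not hasattr(right, 'dtype'):
        return left.dtype
    if left.dtype == right.dtype:
        return left.dtype.name
    raise NotImplementedError('dtypes must be identical')

print(maybe_match_dtype(np.array([1], dtype=np.int64),
                        np.array([2], dtype=np.int64)))  # int64
```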


def _get_fill(arr):
# coerce fill_value to arr dtype if possible
# int64 SparseArray can have NaN as fill_value if there is no missing
try:
fill_value = op(left.fill_value, right.fill_value)
except:
fill_value = nan
return _wrap_result(name, result, result_index, fill_value)
return np.asarray(arr.fill_value, dtype=arr.dtype)
except ValueError:
return np.asarray(arr.fill_value)
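The fill-value coercion above relies on NumPy raising ``ValueError`` when NaN cannot be cast to an integer dtype; a standalone sketch of just that behavior:

```python
import numpy as np

def get_fill(values, fill_value):
    # Coerce fill_value to the array's dtype when possible; an int64
    # sparse array may still carry NaN as its fill_value when nothing is
    # missing, and NaN cannot become int64, so fall back to the uncoerced
    # (float64) representation.
    try:
        return np.asarray(fill_value, dtype=values.dtype)
    except ValueError:
        return np.asarray(fill_value)

ints = np.array([1, 2], dtype=np.int64)
print(get_fill(ints, 0).dtype)        # int64
print(get_fill(ints, np.nan).dtype)   # float64
```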


def _wrap_result(name, data, sparse_index, fill_value):
def _sparse_array_op(left, right, op, name, series=False):

if series and is_integer_dtype(left) and is_integer_dtype(right):
# series coerces to float64 if result should have NaN/inf
if name in ('floordiv', 'mod') and (right.values == 0).any():
left = left.astype(np.float64)
right = right.astype(np.float64)
elif name in ('rfloordiv', 'rmod') and (left.values == 0).any():
left = left.astype(np.float64)
right = right.astype(np.float64)

dtype = _maybe_match_dtype(left, right)

if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
result = op(left.get_values(), right.get_values())

if left.sp_index.ngaps == 0:
index = left.sp_index
else:
index = right.sp_index
fill = op(_get_fill(left), _get_fill(right))
elif left.sp_index.equals(right.sp_index):
result = op(left.sp_values, right.sp_values)
index = left.sp_index
fill = op(_get_fill(left), _get_fill(right))
else:
if name[0] == 'r':
left, right = right, left
name = name[1:]

opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
sparse_op = getattr(splib, opname)

result, index, fill = sparse_op(left.sp_values, left.sp_index,
left.fill_value, right.sp_values,
right.sp_index, right.fill_value)
return _wrap_result(name, result, index, fill, dtype=result.dtype)
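The integer-coercion guard at the top of ``_sparse_array_op`` exists because floor division and modulo by zero need inf/NaN results, which int64 cannot represent; a sketch of just that rule:

```python
import numpy as np

# Sketch of the series=True coercion: integer floordiv/mod with a zero on
# the right-hand side must be able to produce inf/NaN, which int64 cannot
# hold, so both operands are upcast to float64 before the op runs.
left = np.array([1, 4], dtype=np.int64)
right = np.array([0, 2], dtype=np.int64)
if (right == 0).any():
    left = left.astype(np.float64)
    right = right.astype(np.float64)
with np.errstate(divide='ignore', invalid='ignore'):
    out = left // right
print(out.dtype)  # float64
```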


def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
""" wrap op result to have correct dtype """
if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
# ToDo: We can remove this condition when removing
# SparseArray's dtype default when closing GH 667
return SparseArray(data, sparse_index=sparse_index,
fill_value=fill_value,
dtype=np.bool)
else:
return SparseArray(data, sparse_index=sparse_index,
fill_value=fill_value)
dtype = np.bool
elif name == 'truediv':
dtype = np.float64
return SparseArray(data, sparse_index=sparse_index,
fill_value=fill_value, dtype=dtype)
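The dtype rules in ``_wrap_result`` reduce to a small table, sketched here as a standalone function (``np.bool_`` replaces the older ``np.bool`` alias so the sketch runs on current NumPy):

```python
import numpy as np

def result_dtype(name, computed_dtype):
    # Simplified version of the rule above: comparison ops are always
    # bool, true division is always float64, and everything else keeps
    # the dtype the sparse kernel computed.
    if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
        return np.dtype(np.bool_)
    if name == 'truediv':
        return np.dtype(np.float64)
    return np.dtype(computed_dtype)

print(result_dtype('eq', np.int64))       # bool
print(result_dtype('truediv', np.int64))  # float64
print(result_dtype('add', np.int64))      # int64
```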


class SparseArray(PandasObject, np.ndarray):
@@ -419,7 +458,12 @@ def astype(self, dtype=None):
dtype = np.dtype(dtype)
if dtype is not None and dtype not in (np.float_, float):
raise TypeError('Can only support floating point data for now')
return self.copy()

if self.dtype == dtype:
return self.copy()
else:
return self._simple_new(self.sp_values.astype(dtype),
self.sp_index, float(self.fill_value))
Contributor:
maybe we should coerce fill_value in the constructor based on the type of the values?

Member Author:
Yeah, it should be covered in #13849 (though I'm thinking of splitting it into smaller PRs...).

Because ``astype`` currently only supports coercion to float, this is a temporary workaround to avoid breaking other code.
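The new ``astype`` branch boils down to: identical target dtype, plain copy; otherwise convert the stored values and coerce ``fill_value`` to float (the temporary float-only restriction mentioned above). A standalone sketch with hypothetical names:

```python
import numpy as np

def astype_sketch(sp_values, fill_value, dtype):
    # Hypothetical helper mirroring the astype change: an identical target
    # dtype means a cheap copy; otherwise cast the stored values and
    # coerce fill_value to float (only float targets are supported yet).
    dtype = np.dtype(dtype)
    if sp_values.dtype == dtype:
        return sp_values.copy(), fill_value
    return sp_values.astype(dtype), float(fill_value)

vals, fill = astype_sketch(np.array([1, 2], dtype=np.int64), 0, np.float64)
print(vals.dtype, fill)   # float64 0.0
```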


def copy(self, deep=True):
"""
14 changes: 4 additions & 10 deletions pandas/sparse/series.py
@@ -57,16 +57,9 @@ def wrapper(self, other):
elif isinstance(other, DataFrame):
return NotImplemented
elif is_scalar(other):
if isnull(other) or isnull(self.fill_value):
new_fill_value = np.nan
else:
new_fill_value = op(np.float64(self.fill_value),
np.float64(other))

return self._constructor(op(self.sp_values, other),
new_values = op(self.values, other)
return self._constructor(new_values,
index=self.index,
sparse_index=self.sp_index,
fill_value=new_fill_value,
name=self.name)
else: # pragma: no cover
raise TypeError('operation with %s not supported' % type(other))
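The simplified scalar branch densifies first and lets the constructor re-derive sparsity, instead of hand-computing a new fill value through ``np.float64`` arithmetic as the removed code did; the effect on dtype can be sketched as:

```python
import numpy as np

# Sketch of the new scalar path: op(self.values, other) runs on the dense
# values, so an int64 series keeps its dtype instead of being forced
# through float64 fill-value arithmetic as before.
dense_values = np.array([0, 2, 0, 1], dtype=np.int64)   # self.values
new_values = dense_values + 1                           # op(self.values, other)
print(new_values.dtype)  # int64
```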
@@ -84,7 +77,8 @@ def _sparse_series_op(left, right, op, name):
new_index = left.index
new_name = _maybe_match_name(left, right)

result = _sparse_array_op(left, right, op, name)
result = _sparse_array_op(left.values, right.values, op, name,
series=True)
return left._constructor(result, index=new_index, name=new_name)

