Commit 4fce6d2

Merge pull request #163 from shoyer/encoding-fixes
BUG: fix encoding issues (array indexing now resets encoding)
2 parents: 71226fb + 667f26f
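Note on the fix: as the new regression test for #156 below suggests, indexing a dataset previously propagated each variable's `encoding` dict (on-disk settings such as `chunksizes`) to the result, even when the indexed shape no longer matched those settings, so writing the indexed dataset back out could fail. A minimal sketch of the failure mode, assuming the 0.1-era xray API used in the tests (`create_test_data`, `Dataset.indexed`, `Dataset.dump`); this is an illustration, not the reported repro:

    data = create_test_data()
    data['var2'].encoding.update({'zlib': True, 'chunksizes': (10, 10)})
    sliced = data.indexed(dim1=0)  # drops 'dim1' from 'var2'
    # before this commit, sliced['var2'] kept chunksizes=(10, 10), which
    # no longer matches its reduced shape; indexing now resets encoding,
    # so the write below round-trips cleanly
    sliced.dump('sliced.nc')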

File tree

3 files changed: +114, -90 lines

test/test_backends.py

Lines changed: 103 additions & 84 deletions
@@ -73,21 +73,21 @@ def test_write_store(self):

     def test_roundtrip_test_data(self):
         expected = create_test_data()
-        actual = self.roundtrip(expected)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetAllClose(expected, actual)

     def test_load_data(self):
         expected = create_test_data()

         @contextlib.contextmanager
         def assert_loads():
-            actual = self.roundtrip(expected)
-            for v in actual.variables.values():
-                self.assertFalse(v._in_memory())
-            yield actual
-            for v in actual.variables.values():
-                self.assertTrue(v._in_memory())
-            self.assertDatasetAllClose(expected, actual)
+            with self.roundtrip(expected) as actual:
+                for v in actual.variables.values():
+                    self.assertFalse(v._in_memory())
+                yield actual
+                for v in actual.variables.values():
+                    self.assertTrue(v._in_memory())
+                self.assertDatasetAllClose(expected, actual)

         with self.assertRaises(AssertionError):
             # make sure the contextmanager works!
@@ -102,42 +102,44 @@ def assert_loads():

     def test_roundtrip_None_variable(self):
         expected = Dataset({None: (('x', 'y'), [[0, 1], [2, 3]])})
-        actual = self.roundtrip(expected)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetAllClose(expected, actual)

     def test_roundtrip_string_data(self):
         expected = Dataset({'x': ('t', ['abc', 'def', np.nan],
                                   {}, {'_FillValue': ''})})
-        actual = self.roundtrip(expected)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetAllClose(expected, actual)

     def test_roundtrip_mask_and_scale(self):
         decoded = create_masked_and_scaled_data()
         encoded = create_encoded_masked_and_scaled_data()
-        self.assertDatasetAllClose(decoded, self.roundtrip(decoded))
-        self.assertDatasetAllClose(encoded,
-                                   self.roundtrip(decoded, decode_cf=False))
-        self.assertDatasetAllClose(decoded, self.roundtrip(encoded))
-        self.assertDatasetAllClose(encoded,
-                                   self.roundtrip(encoded, decode_cf=False))
+        with self.roundtrip(decoded) as actual:
+            self.assertDatasetAllClose(decoded, actual)
+        with self.roundtrip(decoded, decode_cf=False) as actual:
+            self.assertDatasetAllClose(encoded, actual)
+        with self.roundtrip(encoded) as actual:
+            self.assertDatasetAllClose(decoded, actual)
+        with self.roundtrip(encoded, decode_cf=False) as actual:
+            self.assertDatasetAllClose(encoded, actual)

     def test_roundtrip_example_1_netcdf(self):
         expected = open_example_dataset('example_1.nc')
-        actual = self.roundtrip(expected)
-        self.assertDatasetIdentical(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetIdentical(expected, actual)

     def test_orthogonal_indexing(self):
         in_memory = create_test_data()
-        on_disk = self.roundtrip(in_memory)
-        indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
-                    'dim3': np.arange(5)}
-        expected = in_memory.indexed(**indexers)
-        actual = on_disk.indexed(**indexers)
-        self.assertDatasetAllClose(expected, actual)
-        # do it twice, to make sure we're switched from orthogonal -> numpy
-        # when we cached the values
-        actual = on_disk.indexed(**indexers)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(in_memory) as on_disk:
+            indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
+                        'dim3': np.arange(5)}
+            expected = in_memory.indexed(**indexers)
+            actual = on_disk.indexed(**indexers)
+            self.assertDatasetAllClose(expected, actual)
+            # do it twice, to make sure we're switched from orthogonal -> numpy
+            # when we cached the values
+            actual = on_disk.indexed(**indexers)
+            self.assertDatasetAllClose(expected, actual)

     def test_pickle(self):
         on_disk = open_example_dataset('bears.nc')
@@ -162,24 +164,23 @@ def create_store(self):
         with create_tmp_file() as tmp_file:
             yield backends.NetCDF4DataStore(tmp_file, mode='w')

+    @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         with create_tmp_file() as tmp_file:
             data.dump(tmp_file)
-            roundtrip_data = open_dataset(tmp_file, **kwargs)
-        return roundtrip_data
+            yield open_dataset(tmp_file, **kwargs)

     def test_open_encodings(self):
         # Create a netCDF file with explicit time units
         # and make sure it makes it into the encodings
         # and survives a round trip
         with create_tmp_file() as tmp_file:
-            ds = nc4.Dataset(tmp_file, 'w')
-            ds.createDimension('time', size=10)
-            ds.createVariable('time', np.int32, dimensions=('time',))
-            units = 'days since 1999-01-01'
-            ds.variables['time'].setncattr('units', units)
-            ds.variables['time'][:] = np.arange(10) + 4
-            ds.close()
+            with nc4.Dataset(tmp_file, 'w') as ds:
+                ds.createDimension('time', size=10)
+                ds.createVariable('time', np.int32, dimensions=('time',))
+                units = 'days since 1999-01-01'
+                ds.variables['time'].setncattr('units', units)
+                ds.variables['time'][:] = np.arange(10) + 4

             expected = Dataset()

@@ -244,53 +245,53 @@ def test_dump_and_open_encodings(self):
         # and make sure it makes it into the encodings
         # and survives a round trip
         with create_tmp_file() as tmp_file:
-            ds = nc4.Dataset(tmp_file, 'w')
-            ds.createDimension('time', size=10)
-            ds.createVariable('time', np.int32, dimensions=('time',))
-            units = 'days since 1999-01-01'
-            ds.variables['time'].setncattr('units', units)
-            ds.variables['time'][:] = np.arange(10) + 4
-            ds.close()
+            with nc4.Dataset(tmp_file, 'w') as ds:
+                ds.createDimension('time', size=10)
+                ds.createVariable('time', np.int32, dimensions=('time',))
+                units = 'days since 1999-01-01'
+                ds.variables['time'].setncattr('units', units)
+                ds.variables['time'][:] = np.arange(10) + 4

             xray_dataset = open_dataset(tmp_file)

-            with create_tmp_file() as tmp_file:
-                xray_dataset.dump(tmp_file)
-
-                ds = nc4.Dataset(tmp_file, 'r')
+            with create_tmp_file() as tmp_file2:
+                xray_dataset.dump(tmp_file2)

-                self.assertEqual(ds.variables['time'].getncattr('units'), units)
-                self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)
-
-                ds.close()
+                with nc4.Dataset(tmp_file2, 'r') as ds:
+                    self.assertEqual(ds.variables['time'].getncattr('units'), units)
+                    self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)

     def test_compression_encoding(self):
         data = create_test_data()
         data['var2'].encoding.update({'zlib': True,
                                       'chunksizes': (10, 10),
                                       'least_significant_digit': 2})
-        actual = self.roundtrip(data)
-        for k, v in iteritems(data['var2'].encoding):
-            self.assertEqual(v, actual['var2'].encoding[k])
+        with self.roundtrip(data) as actual:
+            for k, v in iteritems(data['var2'].encoding):
+                self.assertEqual(v, actual['var2'].encoding[k])
+
+        # regression test for #156
+        expected = data.indexed(dim1=0)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetEqual(expected, actual)

     def test_mask_and_scale(self):
         with create_tmp_file() as tmp_file:
-            nc = nc4.Dataset(tmp_file, mode='w')
-            nc.createDimension('t', 5)
-            nc.createVariable('x', 'int16', ('t',), fill_value=-1)
-            v = nc.variables['x']
-            v.set_auto_maskandscale(False)
-            v.add_offset = 10
-            v.scale_factor = 0.1
-            v[:] = np.array([-1, -1, 0, 1, 2])
-            nc.close()
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('t', 5)
+                nc.createVariable('x', 'int16', ('t',), fill_value=-1)
+                v = nc.variables['x']
+                v.set_auto_maskandscale(False)
+                v.add_offset = 10
+                v.scale_factor = 0.1
+                v[:] = np.array([-1, -1, 0, 1, 2])

             # first make sure netCDF4 reads the masked and scaled data correctly
-            nc = nc4.Dataset(tmp_file, mode='r')
-            expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
-                                   mask=[True, True, False, False, False])
-            actual = nc.variables['x'][:]
-            self.assertArrayEqual(expected, actual)
+            with nc4.Dataset(tmp_file, mode='r') as nc:
+                expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
+                                       mask=[True, True, False, False, False])
+                actual = nc.variables['x'][:]
+                self.assertArrayEqual(expected, actual)

             # now check xray
             ds = open_dataset(tmp_file)
@@ -301,10 +302,9 @@ def test_0dimensional_variable(self):
         # This fix verifies our work-around to this netCDF4-python bug:
         # https://github.com/Unidata/netcdf4-python/pull/220
         with create_tmp_file() as tmp_file:
-            nc = nc4.Dataset(tmp_file, mode='w')
-            v = nc.createVariable('x', 'int16')
-            v[...] = 123
-            nc.close()
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                v = nc.createVariable('x', 'int16')
+                v[...] = 123

             ds = open_dataset(tmp_file)
             expected = Dataset({'x': ((), 123)})
@@ -314,17 +314,35 @@ def test_variable_len_strings(self):
         with create_tmp_file() as tmp_file:
             values = np.array(['foo', 'bar', 'baz'], dtype=object)

-            nc = nc4.Dataset(tmp_file, mode='w')
-            nc.createDimension('x', 3)
-            v = nc.createVariable('x', str, ('x',))
-            v[:] = values
-            nc.close()
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('x', 3)
+                v = nc.createVariable('x', str, ('x',))
+                v[:] = values

             expected = Dataset({'x': ('x', values)})
             for kwargs in [{}, {'decode_cf': True}]:
                 actual = open_dataset(tmp_file, **kwargs)
                 self.assertDatasetIdentical(expected, actual)

+    def test_roundtrip_character_array(self):
+        with create_tmp_file() as tmp_file:
+            values = np.array([['a', 'b', 'c'], ['d', 'e', 'f']], dtype='S')
+
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('x', 2)
+                nc.createDimension('string3', 3)
+                v = nc.createVariable('x', np.dtype('S1'), ('x', 'string3'))
+                v[:] = values
+
+            values = np.array(['abc', 'def'], dtype='S')
+            expected = Dataset({'x': ('x', values)})
+            actual = open_dataset(tmp_file)
+            self.assertDatasetIdentical(expected, actual)
+
+            # regression test for #157
+            with self.roundtrip(actual) as roundtripped:
+                self.assertDatasetIdentical(expected, roundtripped)
+

 @requires_netCDF4
 @requires_scipy
@@ -334,9 +352,10 @@ def create_store(self):
         fobj = BytesIO()
         yield backends.ScipyDataStore(fobj, 'w')

+    @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         serialized = data.dumps()
-        return open_dataset(BytesIO(serialized), **kwargs)
+        yield open_dataset(BytesIO(serialized), **kwargs)


 @requires_netCDF4
@@ -347,11 +366,11 @@ def create_store(self):
             yield backends.NetCDF4DataStore(tmp_file, mode='w',
                                             format='NETCDF3_CLASSIC')

+    @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         with create_tmp_file() as tmp_file:
             data.dump(tmp_file, format='NETCDF3_CLASSIC')
-            roundtrip_data = open_dataset(tmp_file, **kwargs)
-        return roundtrip_data
+            yield open_dataset(tmp_file, **kwargs)


 @requires_netCDF4
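Note on the test changes above: `roundtrip` previously returned a dataset after its `create_tmp_file()` block had already exited, so variables that load lazily could point at a deleted file; rewriting it as a context manager keeps the temporary file alive for the caller's `with` block. A minimal sketch of the pattern, reusing helper names from the diff (not the verbatim suite):

    import contextlib

    @contextlib.contextmanager
    def roundtrip(data, **kwargs):
        # yield inside the with block so the temporary file outlives
        # the open dataset, which may still read lazily from disk
        with create_tmp_file() as tmp_file:
            data.dump(tmp_file)
            yield open_dataset(tmp_file, **kwargs)

    # usage:
    # with roundtrip(expected) as actual:
    #     self.assertDatasetAllClose(expected, actual)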

xray/conventions.py

Lines changed: 6 additions & 1 deletion
@@ -374,6 +374,8 @@ def encode_cf_variable(var):
     # maintain dtype careful). This code makes a best effort attempt to
     # encode them into a dtype that NETCDF can handle by inspecting the
     # dtype of the first element.
+    # TODO: we should really check all elements here, because if the first
+    # value is missing (represented as np.nan), this is liable to fail
     dtype = np.array(data.reshape(-1)[0]).dtype
     # N.B. the "astype" call below will fail if data cannot be cast to the
     # type of its first element (which is probably the only sensible thing
@@ -406,7 +408,10 @@ def get_to(source, dest, k):
     if 'dtype' in encoding and encoding['dtype'].kind != 'O':
         if np.issubdtype(encoding['dtype'], int):
             data = data.round()
-        data = data.astype(encoding['dtype'])
+        if encoding['dtype'].kind == 'S' and encoding['dtype'].itemsize == 1:
+            data = string_to_char(data)
+            dimensions = dimensions + ('string%s' % data.shape[-1],)
+        data = np.asarray(data, dtype=encoding['dtype'])

     return xray.Variable(dimensions, data, attributes, encoding=encoding)
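The new `'S1'` branch stores fixed-width string data as a character array with an extra trailing `stringN` dimension (exercised by `test_roundtrip_character_array` above). A rough sketch of what a helper like `string_to_char` plausibly does, assuming it is implemented as a numpy dtype view; the actual definition is not part of this diff:

    import numpy as np

    def string_to_char_sketch(arr):
        # view fixed-width bytes (e.g. dtype 'S3') as single characters
        # ('S1'), adding one trailing dimension of that width
        return arr.reshape(arr.shape + (1,)).view('S1')

    x = np.array(['abc', 'def'], dtype='S3')
    string_to_char_sketch(x).shape  # (2, 3) -> the appended 'string3' dimension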

xray/variable.py

Lines changed: 5 additions & 5 deletions
@@ -314,24 +314,24 @@ def __getitem__(self, key):
         (including `Ellipsis`) and 1d arrays, each of which are applied
         orthogonally along their respective dimensions.

-        The difference not matter in most cases unless you are using numpy's
-        "fancy indexing," which can otherwise result in data arrays
-        with shapes is inconsistent (or just uninterpretable with) with the
+        The difference does not matter in most cases unless you are using
+        numpy's "fancy indexing," which can otherwise result in data arrays
+        whose shapes is inconsistent (or just uninterpretable with) with the
         variable's dimensions.

         If you really want to do indexing like `x[x > 0]`, manipulate the numpy
         array `x.values` directly.
         """
         key = indexing.expanded_indexer(key, self.ndim)
         dimensions = [dim for k, dim in zip(key, self.dimensions)
-                      if not isinstance(k, (int, np.integer))]
+                      if not isinstance(k, (int, np.integer))]
         values = self._data[key]
         # orthogonal indexing should ensure the dimensionality is consistent
         if hasattr(values, 'ndim'):
             assert values.ndim == len(dimensions), (values.ndim, len(dimensions))
         else:
             assert len(dimensions) == 0, len(dimensions)
-        return type(self)(dimensions, values, self.attrs, self.encoding)
+        return type(self)(dimensions, values, self.attrs)

     def __setitem__(self, key, value):
         """__setitem__ is overloaded to access the underlying numpy values with
