Commit 4fce6d2

Merge pull request #163 from shoyer/encoding-fixes
BUG: fix encoding issues (array indexing now resets encoding)
2 parents: 71226fb + 667f26f
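Note on the fix: as the new regression test for #156 below suggests, indexing a dataset previously propagated each variable's `encoding` dict (on-disk settings such as `chunksizes`) to the result, even when the indexed shape no longer matched those settings, so writing the indexed dataset back out could fail. A minimal sketch of the failure mode, assuming the 0.1-era xray API used in the tests (`create_test_data`, `Dataset.indexed`, `Dataset.dump`); this is an illustration, not the reported repro:

    data = create_test_data()
    data['var2'].encoding.update({'zlib': True, 'chunksizes': (10, 10)})
    sliced = data.indexed(dim1=0)  # drops 'dim1' from 'var2'
    # before this commit, sliced['var2'] kept chunksizes=(10, 10), which
    # no longer matches its reduced shape; indexing now resets encoding,
    # so the write below round-trips cleanly
    sliced.dump('sliced.nc')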

File tree

3 files changed: +114, -90 lines

test/test_backends.py

Lines changed: 103 additions & 84 deletions
@@ -73,21 +73,21 @@ def test_write_store(self):

     def test_roundtrip_test_data(self):
         expected = create_test_data()
-        actual = self.roundtrip(expected)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetAllClose(expected, actual)

     def test_load_data(self):
         expected = create_test_data()

         @contextlib.contextmanager
         def assert_loads():
-            actual = self.roundtrip(expected)
-            for v in actual.variables.values():
-                self.assertFalse(v._in_memory())
-            yield actual
-            for v in actual.variables.values():
-                self.assertTrue(v._in_memory())
-            self.assertDatasetAllClose(expected, actual)
+            with self.roundtrip(expected) as actual:
+                for v in actual.variables.values():
+                    self.assertFalse(v._in_memory())
+                yield actual
+                for v in actual.variables.values():
+                    self.assertTrue(v._in_memory())
+                self.assertDatasetAllClose(expected, actual)

         with self.assertRaises(AssertionError):
             # make sure the contextmanager works!
@@ -102,42 +102,44 @@ def assert_loads():

     def test_roundtrip_None_variable(self):
         expected = Dataset({None: (('x', 'y'), [[0, 1], [2, 3]])})
-        actual = self.roundtrip(expected)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetAllClose(expected, actual)

     def test_roundtrip_string_data(self):
         expected = Dataset({'x': ('t', ['abc', 'def', np.nan],
                                   {}, {'_FillValue': ''})})
-        actual = self.roundtrip(expected)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetAllClose(expected, actual)

     def test_roundtrip_mask_and_scale(self):
         decoded = create_masked_and_scaled_data()
         encoded = create_encoded_masked_and_scaled_data()
-        self.assertDatasetAllClose(decoded, self.roundtrip(decoded))
-        self.assertDatasetAllClose(encoded,
-                                   self.roundtrip(decoded, decode_cf=False))
-        self.assertDatasetAllClose(decoded, self.roundtrip(encoded))
-        self.assertDatasetAllClose(encoded,
-                                   self.roundtrip(encoded, decode_cf=False))
+        with self.roundtrip(decoded) as actual:
+            self.assertDatasetAllClose(decoded, actual)
+        with self.roundtrip(decoded, decode_cf=False) as actual:
+            self.assertDatasetAllClose(encoded, actual)
+        with self.roundtrip(encoded) as actual:
+            self.assertDatasetAllClose(decoded, actual)
+        with self.roundtrip(encoded, decode_cf=False) as actual:
+            self.assertDatasetAllClose(encoded, actual)

     def test_roundtrip_example_1_netcdf(self):
         expected = open_example_dataset('example_1.nc')
-        actual = self.roundtrip(expected)
-        self.assertDatasetIdentical(expected, actual)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetIdentical(expected, actual)

     def test_orthogonal_indexing(self):
         in_memory = create_test_data()
-        on_disk = self.roundtrip(in_memory)
-        indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
-                    'dim3': np.arange(5)}
-        expected = in_memory.indexed(**indexers)
-        actual = on_disk.indexed(**indexers)
-        self.assertDatasetAllClose(expected, actual)
-        # do it twice, to make sure we're switched from orthogonal -> numpy
-        # when we cached the values
-        actual = on_disk.indexed(**indexers)
-        self.assertDatasetAllClose(expected, actual)
+        with self.roundtrip(in_memory) as on_disk:
+            indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
+                        'dim3': np.arange(5)}
+            expected = in_memory.indexed(**indexers)
+            actual = on_disk.indexed(**indexers)
+            self.assertDatasetAllClose(expected, actual)
+            # do it twice, to make sure we're switched from orthogonal -> numpy
+            # when we cached the values
+            actual = on_disk.indexed(**indexers)
+            self.assertDatasetAllClose(expected, actual)

     def test_pickle(self):
         on_disk = open_example_dataset('bears.nc')
@@ -162,24 +164,23 @@ def create_store(self):
         with create_tmp_file() as tmp_file:
             yield backends.NetCDF4DataStore(tmp_file, mode='w')

+    @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         with create_tmp_file() as tmp_file:
             data.dump(tmp_file)
-            roundtrip_data = open_dataset(tmp_file, **kwargs)
-        return roundtrip_data
+            yield open_dataset(tmp_file, **kwargs)

     def test_open_encodings(self):
         # Create a netCDF file with explicit time units
         # and make sure it makes it into the encodings
         # and survives a round trip
         with create_tmp_file() as tmp_file:
-            ds = nc4.Dataset(tmp_file, 'w')
-            ds.createDimension('time', size=10)
-            ds.createVariable('time', np.int32, dimensions=('time',))
-            units = 'days since 1999-01-01'
-            ds.variables['time'].setncattr('units', units)
-            ds.variables['time'][:] = np.arange(10) + 4
-            ds.close()
+            with nc4.Dataset(tmp_file, 'w') as ds:
+                ds.createDimension('time', size=10)
+                ds.createVariable('time', np.int32, dimensions=('time',))
+                units = 'days since 1999-01-01'
+                ds.variables['time'].setncattr('units', units)
+                ds.variables['time'][:] = np.arange(10) + 4

             expected = Dataset()

@@ -244,53 +245,53 @@ def test_dump_and_open_encodings(self):
         # and make sure it makes it into the encodings
         # and survives a round trip
         with create_tmp_file() as tmp_file:
-            ds = nc4.Dataset(tmp_file, 'w')
-            ds.createDimension('time', size=10)
-            ds.createVariable('time', np.int32, dimensions=('time',))
-            units = 'days since 1999-01-01'
-            ds.variables['time'].setncattr('units', units)
-            ds.variables['time'][:] = np.arange(10) + 4
-            ds.close()
+            with nc4.Dataset(tmp_file, 'w') as ds:
+                ds.createDimension('time', size=10)
+                ds.createVariable('time', np.int32, dimensions=('time',))
+                units = 'days since 1999-01-01'
+                ds.variables['time'].setncattr('units', units)
+                ds.variables['time'][:] = np.arange(10) + 4

             xray_dataset = open_dataset(tmp_file)

-            with create_tmp_file() as tmp_file:
-                xray_dataset.dump(tmp_file)
-
-                ds = nc4.Dataset(tmp_file, 'r')
+            with create_tmp_file() as tmp_file2:
+                xray_dataset.dump(tmp_file2)

-                self.assertEqual(ds.variables['time'].getncattr('units'), units)
-                self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)
-
-                ds.close()
+                with nc4.Dataset(tmp_file2, 'r') as ds:
+                    self.assertEqual(ds.variables['time'].getncattr('units'), units)
+                    self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)

     def test_compression_encoding(self):
         data = create_test_data()
         data['var2'].encoding.update({'zlib': True,
                                       'chunksizes': (10, 10),
                                       'least_significant_digit': 2})
-        actual = self.roundtrip(data)
-        for k, v in iteritems(data['var2'].encoding):
-            self.assertEqual(v, actual['var2'].encoding[k])
+        with self.roundtrip(data) as actual:
+            for k, v in iteritems(data['var2'].encoding):
+                self.assertEqual(v, actual['var2'].encoding[k])
+
+        # regression test for #156
+        expected = data.indexed(dim1=0)
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetEqual(expected, actual)

     def test_mask_and_scale(self):
         with create_tmp_file() as tmp_file:
-            nc = nc4.Dataset(tmp_file, mode='w')
-            nc.createDimension('t', 5)
-            nc.createVariable('x', 'int16', ('t',), fill_value=-1)
-            v = nc.variables['x']
-            v.set_auto_maskandscale(False)
-            v.add_offset = 10
-            v.scale_factor = 0.1
-            v[:] = np.array([-1, -1, 0, 1, 2])
-            nc.close()
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('t', 5)
+                nc.createVariable('x', 'int16', ('t',), fill_value=-1)
+                v = nc.variables['x']
+                v.set_auto_maskandscale(False)
+                v.add_offset = 10
+                v.scale_factor = 0.1
+                v[:] = np.array([-1, -1, 0, 1, 2])

             # first make sure netCDF4 reads the masked and scaled data correctly
-            nc = nc4.Dataset(tmp_file, mode='r')
-            expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
-                                   mask=[True, True, False, False, False])
-            actual = nc.variables['x'][:]
-            self.assertArrayEqual(expected, actual)
+            with nc4.Dataset(tmp_file, mode='r') as nc:
+                expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
+                                       mask=[True, True, False, False, False])
+                actual = nc.variables['x'][:]
+                self.assertArrayEqual(expected, actual)

             # now check xray
             ds = open_dataset(tmp_file)
@@ -301,10 +302,9 @@ def test_0dimensional_variable(self):
         # This fix verifies our work-around to this netCDF4-python bug:
         # https://github.com/Unidata/netcdf4-python/pull/220
         with create_tmp_file() as tmp_file:
-            nc = nc4.Dataset(tmp_file, mode='w')
-            v = nc.createVariable('x', 'int16')
-            v[...] = 123
-            nc.close()
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                v = nc.createVariable('x', 'int16')
+                v[...] = 123

             ds = open_dataset(tmp_file)
             expected = Dataset({'x': ((), 123)})
@@ -314,17 +314,35 @@ def test_variable_len_strings(self):
         with create_tmp_file() as tmp_file:
             values = np.array(['foo', 'bar', 'baz'], dtype=object)

-            nc = nc4.Dataset(tmp_file, mode='w')
-            nc.createDimension('x', 3)
-            v = nc.createVariable('x', str, ('x',))
-            v[:] = values
-            nc.close()
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('x', 3)
+                v = nc.createVariable('x', str, ('x',))
+                v[:] = values

             expected = Dataset({'x': ('x', values)})
             for kwargs in [{}, {'decode_cf': True}]:
                 actual = open_dataset(tmp_file, **kwargs)
                 self.assertDatasetIdentical(expected, actual)

+    def test_roundtrip_character_array(self):
+        with create_tmp_file() as tmp_file:
+            values = np.array([['a', 'b', 'c'], ['d', 'e', 'f']], dtype='S')
+
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('x', 2)
+                nc.createDimension('string3', 3)
+                v = nc.createVariable('x', np.dtype('S1'), ('x', 'string3'))
+                v[:] = values
+
+            values = np.array(['abc', 'def'], dtype='S')
+            expected = Dataset({'x': ('x', values)})
+            actual = open_dataset(tmp_file)
+            self.assertDatasetIdentical(expected, actual)
+
+            # regression test for #157
+            with self.roundtrip(actual) as roundtripped:
+                self.assertDatasetIdentical(expected, roundtripped)
+

 @requires_netCDF4
 @requires_scipy
@@ -334,9 +352,10 @@ def create_store(self):
         fobj = BytesIO()
         yield backends.ScipyDataStore(fobj, 'w')

+    @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         serialized = data.dumps()
-        return open_dataset(BytesIO(serialized), **kwargs)
+        yield open_dataset(BytesIO(serialized), **kwargs)


 @requires_netCDF4
@@ -347,11 +366,11 @@ def create_store(self):
             yield backends.NetCDF4DataStore(tmp_file, mode='w',
                                             format='NETCDF3_CLASSIC')

+    @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         with create_tmp_file() as tmp_file:
             data.dump(tmp_file, format='NETCDF3_CLASSIC')
-            roundtrip_data = open_dataset(tmp_file, **kwargs)
-        return roundtrip_data
+            yield open_dataset(tmp_file, **kwargs)


 @requires_netCDF4
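Note on the test changes above: `roundtrip` previously returned a dataset after its `create_tmp_file()` block had already exited, so variables that load lazily could point at a deleted file; rewriting it as a context manager keeps the temporary file alive for the caller's `with` block. A minimal sketch of the pattern, reusing helper names from the diff (not the verbatim suite):

    import contextlib

    @contextlib.contextmanager
    def roundtrip(data, **kwargs):
        # yield inside the with block so the temporary file outlives
        # the open dataset, which may still read lazily from disk
        with create_tmp_file() as tmp_file:
            data.dump(tmp_file)
            yield open_dataset(tmp_file, **kwargs)

    # usage:
    # with roundtrip(expected) as actual:
    #     self.assertDatasetAllClose(expected, actual)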

xray/conventions.py

Lines changed: 6 additions & 1 deletion
@@ -374,6 +374,8 @@ def encode_cf_variable(var):
     # maintain dtype careful). This code makes a best effort attempt to
     # encode them into a dtype that NETCDF can handle by inspecting the
     # dtype of the first element.
+    # TODO: we should really check all elements here, because if the first
+    # value is missing (represented as np.nan), this is liable to fail
     dtype = np.array(data.reshape(-1)[0]).dtype
     # N.B. the "astype" call below will fail if data cannot be cast to the
     # type of its first element (which is probably the only sensible thing
@@ -406,7 +408,10 @@ def get_to(source, dest, k):
     if 'dtype' in encoding and encoding['dtype'].kind != 'O':
         if np.issubdtype(encoding['dtype'], int):
             data = data.round()
-        data = data.astype(encoding['dtype'])
+        if encoding['dtype'].kind == 'S' and encoding['dtype'].itemsize == 1:
+            data = string_to_char(data)
+            dimensions = dimensions + ('string%s' % data.shape[-1],)
+        data = np.asarray(data, dtype=encoding['dtype'])

     return xray.Variable(dimensions, data, attributes, encoding=encoding)
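The new `'S1'` branch stores fixed-width string data as a character array with an extra trailing `stringN` dimension (exercised by `test_roundtrip_character_array` above). A rough sketch of what a helper like `string_to_char` plausibly does, assuming it is implemented as a numpy dtype view; the actual definition is not part of this diff:

    import numpy as np

    def string_to_char_sketch(arr):
        # view fixed-width bytes (e.g. dtype 'S3') as single characters
        # ('S1'), adding one trailing dimension of that width
        return arr.reshape(arr.shape + (1,)).view('S1')

    x = np.array(['abc', 'def'], dtype='S3')
    string_to_char_sketch(x).shape  # (2, 3) -> the appended 'string3' dimension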

xray/variable.py

Lines changed: 5 additions & 5 deletions
@@ -314,24 +314,24 @@ def __getitem__(self, key):
         (including `Ellipsis`) and 1d arrays, each of which are applied
         orthogonally along their respective dimensions.

-        The difference not matter in most cases unless you are using numpy's
-        "fancy indexing," which can otherwise result in data arrays
-        with shapes is inconsistent (or just uninterpretable with) with the
+        The difference does not matter in most cases unless you are using
+        numpy's "fancy indexing," which can otherwise result in data arrays
+        whose shapes is inconsistent (or just uninterpretable with) with the
         variable's dimensions.

         If you really want to do indexing like `x[x > 0]`, manipulate the numpy
         array `x.values` directly.
         """
         key = indexing.expanded_indexer(key, self.ndim)
         dimensions = [dim for k, dim in zip(key, self.dimensions)
-                      if not isinstance(k, (int, np.integer))]
+                      if not isinstance(k, (int, np.integer))]
         values = self._data[key]
         # orthogonal indexing should ensure the dimensionality is consistent
         if hasattr(values, 'ndim'):
             assert values.ndim == len(dimensions), (values.ndim, len(dimensions))
         else:
             assert len(dimensions) == 0, len(dimensions)
-        return type(self)(dimensions, values, self.attrs, self.encoding)
+        return type(self)(dimensions, values, self.attrs)

     def __setitem__(self, key, value):
         """__setitem__ is overloaded to access the underlying numpy values with
