Merge pull request #313 into master.

Benjamin Moody · Benjamin Moody · commit 6cf52fb130a5 · 2022-03-28T17:14:54.000-04:00
The smooth_frames argument to rdrecord is meant to determine whether
the input signals are stored as {d|p}_signal (a homogeneous
two-dimensional array, requiring all signals to be resampled to the
same frequency) or e_{d|p}_signal (a list of one-dimensional arrays,
keeping each signal at its actual sampling frequency).

Previously, this only worked if you were trying to read signals with
multiple samples per frame.  If all signals in the record had only one
sample per frame, then the smooth_frames argument was ignored.

The application should not need to know in advance whether the record
contains multiple samples per frame.  If the application asks for a 2D
array (smooth_frames=True) it should get a 2D array, and if it asks for
a list of 1D arrays (smooth_frames=False) it should get a list of 1D
arrays.

Furthermore, simplify internal logic by removing the "frame smoothing"
functionality of _rd_segment, in favor of "smoothing" the result (if
desired) afterwards.  The smooth_frames function performs the same
computation as _rd_dat_signals did previously, but somewhat more
efficiently.
diff --git a/tests/test_record.py b/tests/test_record.py
@@ -26,19 +26,22 @@ def test_1a(self):
         Target file created with:
             rdsamp -r sample-data/test01_00s | cut -f 2- > record-1a
         """
-        record = wfdb.rdrecord('sample-data/test01_00s', physical=False)
+        record = wfdb.rdrecord('sample-data/test01_00s',
+                               physical=False, return_res=16)
         sig = record.d_signal
         sig_target = np.genfromtxt('tests/target-output/record-1a')
 
         # Compare data streaming from Physionet
-        record_pn = wfdb.rdrecord('test01_00s', physical=False,
-                                  pn_dir='macecgdb')
+        record_pn = wfdb.rdrecord('test01_00s', pn_dir='macecgdb',
+                                  physical=False, return_res=16)
 
         # Test file writing
-        record_2 = wfdb.rdrecord('sample-data/test01_00s', physical=False)
+        record_2 = wfdb.rdrecord('sample-data/test01_00s',
+                                 physical=False, return_res=16)
         record_2.sig_name = ['ECG_1', 'ECG_2', 'ECG_3', 'ECG_4']
         record_2.wrsamp()
-        record_write = wfdb.rdrecord('test01_00s', physical=False)
+        record_write = wfdb.rdrecord('test01_00s',
+                                     physical=False, return_res=16)
 
         assert np.array_equal(sig, sig_target)
         assert record.__eq__(record_pn)
@@ -75,24 +78,30 @@ def test_1b(self):
     def test_1c(self):
         """
         Format 16, byte offset, selected duration, selected channels,
-        digital.
+        digital, expanded format.
 
         Target file created with:
             rdsamp -r sample-data/a103l -f 80 -s 0 1 | cut -f 2- > record-1c
         """
         record = wfdb.rdrecord('sample-data/a103l',
-                               sampfrom=20000, channels=[0, 1], physical=False)
-        sig = record.d_signal
+                               sampfrom=20000, channels=[0, 1], physical=False,
+                               smooth_frames=False)
+        # convert expanded to uniform array
+        sig = np.zeros((record.sig_len, record.n_sig))
+        for i in range(record.n_sig):
+            sig[:,i] = record.e_d_signal[i]
+
         sig_target = np.genfromtxt('tests/target-output/record-1c')
 
         # Compare data streaming from Physionet
         record_pn = wfdb.rdrecord('a103l', pn_dir='challenge-2015/training',
                                   sampfrom=20000, channels=[0, 1],
-                                  physical=False)
+                                  physical=False, smooth_frames=False)
 
         # Test file writing
-        record.wrsamp()
-        record_write = wfdb.rdrecord('a103l', physical=False)
+        record.wrsamp(expanded=True)
+        record_write = wfdb.rdrecord('a103l', physical=False,
+                                     smooth_frames=False)
 
         assert np.array_equal(sig, sig_target)
         assert record.__eq__(record_pn)
diff --git a/wfdb/io/_signal.py b/wfdb/io/_signal.py
@@ -745,7 +745,7 @@ def calc_checksum(self, expanded=False):
 
         """
         if expanded:
-            cs = [int(np.sum(self.e_d_signal[ch]) % 65536) for ch in range(self.n_sig)]
+            cs = [int(np.sum(s) % 65536) for s in self.e_d_signal]
         else:
             cs = np.sum(self.d_signal, 0) % 65536
             cs = [int(c) for c in cs]
@@ -822,37 +822,61 @@ def smooth_frames(self, sigtype='physical'):
             if spf[ch] is None:
                 spf[ch] = 1
 
-        # Total samples per frame
-        tspf = sum(spf)
+        # The output data type should be the smallest type that can
+        # represent any input sample value.  The intermediate data type
+        # must be able to represent the sum of spf[ch] sample values.
 
         if sigtype == 'physical':
-            n_sig = len(self.e_p_signal)
-            sig_len = int(len(self.e_p_signal[0])/spf[0])
-            signal = np.zeros((sig_len, n_sig), dtype='float64')
-
-            for ch in range(n_sig):
-                if spf[ch] == 1:
-                    signal[:, ch] = self.e_p_signal[ch]
-                else:
-                    for frame in range(spf[ch]):
-                        signal[:, ch] += self.e_p_signal[ch][frame::spf[ch]]
-                    signal[:, ch] = signal[:, ch] / spf[ch]
-
+            expanded_signal = self.e_p_signal
+            intermediate_dtype = np.dtype('float64')
+            allowed_dtypes = [
+                np.dtype('float32'),
+                np.dtype('float64'),
+            ]
         elif sigtype == 'digital':
-            n_sig = len(self.e_d_signal)
-            sig_len = int(len(self.e_d_signal[0])/spf[0])
-            signal = np.zeros((sig_len, n_sig), dtype='int64')
-
-            for ch in range(n_sig):
-                if spf[ch] == 1:
-                    signal[:, ch] = self.e_d_signal[ch]
-                else:
-                    for frame in range(spf[ch]):
-                        signal[:, ch] += self.e_d_signal[ch][frame::spf[ch]]
-                    signal[:, ch] = signal[:, ch] / spf[ch]
+            expanded_signal = self.e_d_signal
+            intermediate_dtype = np.dtype('int64')
+            allowed_dtypes = [
+                np.dtype('int8'),
+                np.dtype('int16'),
+                np.dtype('int32'),
+                np.dtype('int64'),
+            ]
         else:
             raise ValueError("sigtype must be 'physical' or 'digital'")
 
+        n_sig = len(expanded_signal)
+        sig_len = len(expanded_signal[0]) // spf[0]
+        input_dtypes = set()
+        for ch in range(n_sig):
+            if len(expanded_signal[ch]) != sig_len * spf[ch]:
+                raise ValueError("length mismatch: signal %d has %d samples,"
+                                 " expected %dx%d"
+                                 % (ch, len(expanded_signal),
+                                    sig_len, spf[ch]))
+            input_dtypes.add(expanded_signal[ch].dtype)
+
+        for output_dtype in allowed_dtypes:
+            if all(dt <= output_dtype for dt in input_dtypes):
+                break
+
+        signal = np.empty((sig_len, n_sig), dtype=output_dtype)
+
+        # Large input arrays will be processed in chunks to avoid the need
+        # to allocate a single huge temporary array.
+        CHUNK_SIZE = 65536
+
+        for ch in range(n_sig):
+            if spf[ch] == 1:
+                signal[:, ch] = expanded_signal[ch]
+            else:
+                frames = expanded_signal[ch].reshape(-1, spf[ch])
+                for chunk_start in range(0, sig_len, CHUNK_SIZE):
+                    chunk_end = chunk_start + CHUNK_SIZE
+                    signal_sum = np.sum(frames[chunk_start:chunk_end, :],
+                                        axis=1, dtype=intermediate_dtype)
+                    signal[chunk_start:chunk_end, ch] = signal_sum / spf[ch]
+
         return signal
 
 
@@ -861,7 +885,7 @@ def smooth_frames(self, sigtype='physical'):
 
 def _rd_segment(file_name, dir_name, pn_dir, fmt, n_sig, sig_len, byte_offset,
                 samps_per_frame, skew, init_value, sampfrom, sampto, channels,
-                smooth_frames, ignore_skew, no_file=False, sig_data=None, return_res=64):
+                ignore_skew, no_file=False, sig_data=None, return_res=64):
     """
     Read the digital samples from a single segment record's associated
     dat file(s).
@@ -894,8 +918,6 @@ def _rd_segment(file_name, dir_name, pn_dir, fmt, n_sig, sig_len, byte_offset,
         The starting sample number to be read from the signals.
     sampto : int
         The final sample number to be read from the signals.
-    smooth_frames : bool
-        Whether to smooth channels with multiple samples/frame.
     ignore_skew : bool
         Used when reading records with at least one skewed signal.
         Specifies whether to apply the skew to align the signals in the
@@ -915,17 +937,14 @@ def _rd_segment(file_name, dir_name, pn_dir, fmt, n_sig, sig_len, byte_offset,
 
     Returns
     -------
-    signals : ndarray, list
-        The signals read from the dat file(s). A 2d numpy array is
-        returned if the signals have uniform samples/frame or if
-        `smooth_frames` is True. Otherwise a list of 1d numpy arrays
-        is returned.
+    signals : list
+        The signals read from the dat file(s). Each signal is returned as a
+        one-dimensional numpy array.
 
     Notes
     -----
-    'channels', 'sampfrom', 'sampto', 'smooth_frames', and 'ignore_skew'
-    are user desired input fields. All other parameters are
-    specifications of the segment.
+    'channels', 'sampfrom', 'sampto', and 'ignore_skew' are user desired
+    input fields. All other parameters are specifications of the segment.
 
     """
     # Check for valid inputs
@@ -989,69 +1008,38 @@ def _rd_segment(file_name, dir_name, pn_dir, fmt, n_sig, sig_len, byte_offset,
         r_w_channel[fn] = [c - min(datchannel[fn]) for c in w_channel[fn]]
         out_dat_channel[fn] = [channels.index(c) for c in w_channel[fn]]
 
-    # Signals with multiple samples/frame are smoothed, or all signals have 1 sample/frame.
-    # Return uniform numpy array
-    if smooth_frames or sum(samps_per_frame) == n_sig:
-        # Figure out the largest required dtype for the segment to minimize memory usage
-        max_dtype = _np_dtype(_fmt_res(fmt, max_res=True), discrete=True)
-        # Allocate signal array. Minimize dtype
-        signals = np.zeros([sampto-sampfrom, len(channels)], dtype=max_dtype)
-
-        # Read each wanted dat file and store signals
-        for fn in w_file_name:
-            datsignals = _rd_dat_signals(
-                file_name=fn,
-                dir_name=dir_name,
-                pn_dir=pn_dir,
-                fmt=w_fmt[fn],
-                n_sig=len(datchannel[fn]),
-                sig_len=sig_len,
-                byte_offset=w_byte_offset[fn],
-                samps_per_frame=w_samps_per_frame[fn],
-                skew=w_skew[fn],
-                init_value=w_init_value[fn],
-                sampfrom=sampfrom,
-                sampto=sampto,
-                smooth_frames=smooth_frames,
-                no_file=no_file,
-                sig_data=sig_data)
-            signals[:, out_dat_channel[fn]] = datsignals[:, r_w_channel[fn]]
-
     # Return each sample in signals with multiple samples/frame, without smoothing.
     # Return a list of numpy arrays for each signal.
-    else:
-        signals = [None] * len(channels)
-
-        for fn in w_file_name:
-            # Get the list of all signals contained in the dat file
-            datsignals = _rd_dat_signals(
-                file_name=fn,
-                dir_name=dir_name,
-                pn_dir=pn_dir,
-                fmt=w_fmt[fn],
-                n_sig=len(datchannel[fn]),
-                sig_len=sig_len,
-                byte_offset=w_byte_offset[fn],
-                samps_per_frame=w_samps_per_frame[fn],
-                skew=w_skew[fn],
-                init_value=w_init_value[fn],
-                sampfrom=sampfrom,
-                sampto=sampto,
-                smooth_frames=smooth_frames,
-                no_file=no_file,
-                sig_data=sig_data)
-
-            # Copy over the wanted signals
-            for cn in range(len(out_dat_channel[fn])):
-                signals[out_dat_channel[fn][cn]] = datsignals[r_w_channel[fn][cn]]
+    signals = [None] * len(channels)
+
+    for fn in w_file_name:
+        # Get the list of all signals contained in the dat file
+        datsignals = _rd_dat_signals(
+            file_name=fn,
+            dir_name=dir_name,
+            pn_dir=pn_dir,
+            fmt=w_fmt[fn],
+            n_sig=len(datchannel[fn]),
+            sig_len=sig_len,
+            byte_offset=w_byte_offset[fn],
+            samps_per_frame=w_samps_per_frame[fn],
+            skew=w_skew[fn],
+            init_value=w_init_value[fn],
+            sampfrom=sampfrom,
+            sampto=sampto,
+            no_file=no_file,
+            sig_data=sig_data)
+
+        # Copy over the wanted signals
+        for cn in range(len(out_dat_channel[fn])):
+            signals[out_dat_channel[fn][cn]] = datsignals[r_w_channel[fn][cn]]
 
     return signals
 
 
 def _rd_dat_signals(file_name, dir_name, pn_dir, fmt, n_sig, sig_len,
                     byte_offset, samps_per_frame, skew, init_value,
-                    sampfrom, sampto, smooth_frames,
-                    no_file=False, sig_data=None):
+                    sampfrom, sampto, no_file=False, sig_data=None):
     """
     Read all signals from a WFDB dat file.
 
@@ -1083,8 +1071,6 @@ def _rd_dat_signals(file_name, dir_name, pn_dir, fmt, n_sig, sig_len,
         The starting sample number to be read from the signals.
     sampto : int
         The final sample number to be read from the signals.
-    smooth_frames : bool
-        Whether to smooth channels with multiple samples/frame.
     no_file : bool, optional
         Used when using this function with just an array of signal data
         and no associated file to read the data from.
@@ -1095,16 +1081,13 @@ def _rd_dat_signals(file_name, dir_name, pn_dir, fmt, n_sig, sig_len,
     Returns
     -------
     signal : ndarray, list
-        The signals read from the dat file(s). A 2d numpy array is
-        returned if the signals have uniform samples/frame or if
-        `smooth_frames` is True. Otherwise a list of 1d numpy arrays
-        is returned.
+        The signals read from the dat file(s). Each signal is returned as a
+        one-dimensional numpy array.
 
     Notes
     -----
-    'channels', 'sampfrom', 'sampto', 'smooth_frames', and 'ignore_skew'
-    are user desired input fields. All other parameters are
-    specifications of the segment.
+    'channels', 'sampfrom', 'sampto', and 'ignore_skew' are user desired
+    input fields. All other parameters are specifications of the segment.
 
     """
     # Check for valid inputs
@@ -1206,46 +1189,18 @@ def _rd_dat_signals(file_name, dir_name, pn_dir, fmt, n_sig, sig_len,
     # At this point, dtype of sig_data is the minimum integer format
     # required for storing the final digital samples.
 
-    # No extra samples/frame. Obtain original uniform numpy array
-    if tsamps_per_frame == n_sig:
-        # Reshape into multiple channels
-        signal = sig_data.reshape(-1, n_sig)
-        # Skew the signal
-        signal = _skew_sig(signal, skew, n_sig, read_len, fmt, nan_replace)
-    # Extra frames present to be smoothed. Obtain averaged uniform numpy array
-    elif smooth_frames:
-        # Allocate memory for smoothed signal.
-        signal = np.zeros((int(len(sig_data) / tsamps_per_frame) , n_sig),
-                       dtype=sig_data.dtype)
-
-        # Transfer and average samples
-        for ch in range(n_sig):
-            if samps_per_frame[ch] == 1:
-                signal[:, ch] = sig_data[sum(([0] + samps_per_frame)[:ch + 1])::tsamps_per_frame]
-            else:
-                if ch == 0:
-                    startind = 0
-                else:
-                    startind = np.sum(samps_per_frame[:ch])
-                signal[:,ch] = [np.average(sig_data[ind:ind+samps_per_frame[ch]]) for ind in range(startind,len(sig_data),tsamps_per_frame)]
-        # Skew the signal
-        signal = _skew_sig(signal, skew, n_sig, read_len, fmt, nan_replace)
-
-    # Extra frames present without wanting smoothing. Return all
-    # expanded samples.
-    else:
-        # List of 1d numpy arrays
-        signal = []
-        # Transfer over samples
-        sig_frames = sig_data.reshape(-1, tsamps_per_frame)
-        ch_start = 0
-        for ch in range(n_sig):
-            ch_end = ch_start + samps_per_frame[ch]
-            ch_signal = sig_frames[:, ch_start:ch_end].reshape(-1)
-            signal.append(ch_signal)
-            ch_start = ch_end
-        # Skew the signal
-        signal = _skew_sig(signal, skew, n_sig, read_len, fmt, nan_replace, samps_per_frame)
+    # List of 1d numpy arrays
+    signal = []
+    # Transfer over samples
+    sig_frames = sig_data.reshape(-1, tsamps_per_frame)
+    ch_start = 0
+    for ch in range(n_sig):
+        ch_end = ch_start + samps_per_frame[ch]
+        ch_signal = sig_frames[:, ch_start:ch_end].reshape(-1)
+        signal.append(ch_signal)
+        ch_start = ch_end
+    # Skew the signal
+    signal = _skew_sig(signal, skew, n_sig, read_len, fmt, nan_replace, samps_per_frame)
 
     # Integrity check of signal shape after reading
     _check_sig_dims(signal, read_len, n_sig, samps_per_frame)
diff --git a/wfdb/io/record.py b/wfdb/io/record.py