Update detect_clearsky() #1708
base: main
Changes from 22 commits
@@ -579,9 +579,10 @@ def _calc_stats(data, samples_per_window, sample_interval, H):
     """
 
     data_mean = data.values[H].mean(axis=0)
-    data_mean = _to_centered_series(data_mean, data.index, samples_per_window)
+    data_mean = _to_centered_series(data_mean, data.index, samples_per_window,
+                                    H)
     data_max = data.values[H].max(axis=0)
-    data_max = _to_centered_series(data_max, data.index, samples_per_window)
+    data_max = _to_centered_series(data_max, data.index, samples_per_window, H)
     # shift to get forward difference, .diff() is backward difference instead
     data_diff = data.diff().shift(-1)
     data_slope = data_diff / sample_interval
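For readers unfamiliar with the windowing scheme, here is a minimal sketch (not part of the diff) of how the Hankel index matrix H used by _calc_stats turns a series into overlapping windows; the window length and data values are made up.

```python
import numpy as np
from scipy.linalg import hankel

samples_per_window = 3          # e.g. a 30-minute window of 10-minute data
n = 7                           # illustrative number of samples
# Each column of H holds the integer positions of one sliding window
H = hankel(np.arange(samples_per_window),
           np.arange(samples_per_window - 1, n))
# H == [[0 1 2 3 4]
#       [1 2 3 4 5]
#       [2 3 4 5 6]]
data = np.array([10., 12., 11., 13., 15., 14., 16.])
window_means = data[H].mean(axis=0)   # one mean per window (column)
print(window_means)                   # [11. 12. 13. 14. 15.]
```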
@@ -594,30 +595,48 @@ def _slope_nstd_windowed(slopes, data, H, samples_per_window, sample_interval):
     with np.errstate(divide='ignore', invalid='ignore'):
         nstd = slopes[H[:-1, ]].std(ddof=1, axis=0) \
             / data.values[H].mean(axis=0)
-    return _to_centered_series(nstd, data.index, samples_per_window)
+    return _to_centered_series(nstd, data.index, samples_per_window, H)
 
 
 def _max_diff_windowed(data, H, samples_per_window):
     raw = np.diff(data)
     raw = np.abs(raw[H[:-1, ]]).max(axis=0)
-    return _to_centered_series(raw, data.index, samples_per_window)
+    return _to_centered_series(raw, data.index, samples_per_window, H)
 
 
 def _line_length_windowed(data, H, samples_per_window,
                           sample_interval):
     raw = np.sqrt(np.diff(data)**2. + sample_interval**2.)
     raw = np.sum(raw[H[:-1, ]], axis=0)
-    return _to_centered_series(raw, data.index, samples_per_window)
+    return _to_centered_series(raw, data.index, samples_per_window, H)
 
 
-def _to_centered_series(vals, idx, samples_per_window):
-    vals = np.pad(vals, ((0, len(idx) - len(vals)),), mode='constant',
-                  constant_values=np.nan)
-    shift = samples_per_window // 2  # align = 'center' only
-    return pd.Series(index=idx, data=vals).shift(shift)
+def _to_centered_series(vals, idx, samples_per_window, H):
+    # Get center of interval using zero-indexing, round down to nearest
+    # index if there are an even number of rows
+    if samples_per_window % 2 == 0:
+        center_row = samples_per_window//2 - 1
+    else:
+        center_row = samples_per_window//2
+
+    try:
+        # Maintain tz that is stripped when idx is put in H
+        if idx.tz is not None:
+            c = pd.DatetimeIndex(idx.values[H][center_row, :],
+                                 tz='UTC').tz_convert(idx.tz)
+        else:
+            c = idx.values[H][center_row, :]
+    # If the index is a range
+    except AttributeError:
+        c = idx.values[H][center_row, :]
+
+    # Assign summary values for each interval to the indices of the center row
+    centered = pd.Series(index=idx, dtype='object')
+    centered.loc[c] = vals
+    return centered
 
 
-def _clear_sample_index(clear_windows, samples_per_window, align, H):
+def _clear_sample_index(clear_windows, samples_per_window, gaps, H, align):
     """
     Returns indices of clear samples in clear windows
     """
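To illustrate the new centering logic, a small sketch (assumptions: 10-minute data, a 4-sample window, made-up statistic values) that places one value per window at the window's center timestamp, in the same way the rewritten _to_centered_series does:

```python
import numpy as np
import pandas as pd
from scipy.linalg import hankel

samples_per_window = 4                      # even number of samples per window
idx = pd.date_range('2023-03-24 12:00', periods=7, freq='10min', tz='UTC')
H = hankel(np.arange(samples_per_window),
           np.arange(samples_per_window - 1, len(idx)))

# Center row of each window; for an even window length, round down
if samples_per_window % 2 == 0:
    center_row = samples_per_window // 2 - 1
else:
    center_row = samples_per_window // 2

# idx.values strips the tz, so re-localize as the PR does
center_times = pd.DatetimeIndex(idx.values[H][center_row, :], tz='UTC')

vals = np.arange(H.shape[1], dtype=float)   # one made-up statistic per window
centered = pd.Series(index=idx, dtype='object')
centered.loc[center_times] = vals
# The statistics land on 12:10, 12:20, 12:30 and 12:40; the remaining
# timestamps (12:00, 12:50, 13:00) stay NaN because no window is centered there
```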
@@ -635,16 +654,26 @@ def _clear_sample_index(clear_windows, samples_per_window, align, H):
     #     shift = - (samples_per_window // 2)
     # else:
     #     shift = 0
-    shift = -(samples_per_window // 2)
-    idx = clear_windows.shift(shift)
+
+    # Account for the row # on which the interval is centered not actually
+    # being in row samples_per_window // 2 if samples_per_window is even
+    if samples_per_window % 2 == 0:
+        shift = -(samples_per_window // 2 - 1)
+    else:
+        shift = -(samples_per_window // 2)
+    clear_cols = clear_windows.shift(shift)
     # drop rows at the end corresponding to windows past the end of data
-    idx = idx.drop(clear_windows.index[1 - samples_per_window:])
-    idx = idx.astype(bool)  # shift changed type to object
-    clear_samples = np.unique(H[:, idx])
+    clear_cols = clear_cols.drop(clear_windows.index[1 - samples_per_window:])
+    clear_cols = clear_cols.astype(bool)  # shift changed type to object
+    # Boolean mask for column indices of intervals with temporal gaps
+    gap_cols = [True if c not in gaps else False for c in range(0,
+                len(clear_windows) - (samples_per_window - 1))]
+    mask = np.logical_and(clear_cols, gap_cols)
+    clear_samples = np.unique(H[:, mask])
     return clear_samples
 
 
-def detect_clearsky(measured, clearsky, times=None, window_length=10,
+def detect_clearsky(measured, clear_sky, times=None, window_length=10,
                     mean_diff=75, max_diff=75,
                     lower_line_length=-5, upper_line_length=10,
                     var_diff=0.005, slope_dev=8, max_iterations=20,
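A short sketch (not from the PR; the clear/gap flags are made up) of how the new gaps argument removes whole windows from the clear-sample selection:

```python
import numpy as np
from scipy.linalg import hankel

samples_per_window = 3
H = hankel(np.arange(samples_per_window),
           np.arange(samples_per_window - 1, 7))     # 5 windows over 7 samples

# Pretend windows 0, 1 and 3 passed the clear-sky criteria...
clear_cols = np.array([True, True, False, True, False])
# ...but window 1 straddles a temporal gap, so it must not contribute samples
gaps = {1}
gap_cols = [c not in gaps for c in range(H.shape[1])]

mask = np.logical_and(clear_cols, gap_cols)          # only windows 0 and 3 remain
clear_samples = np.unique(H[:, mask])                # array([0, 1, 2, 3, 4, 5])
```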
@@ -723,8 +752,6 @@ def detect_clearsky(measured, clearsky, times=None, window_length=10,
     ------
     ValueError
         If measured is not a Series and times is not provided
-    NotImplementedError
-        If timestamps are not equally spaced
 
     References
     ----------
@@ -748,6 +775,11 @@ def detect_clearsky(measured, clearsky, times=None, window_length=10,
         * option to return individual test components and clearsky scaling
           parameter
         * uses centered windows (Matlab function uses left-aligned windows)
+
+    2023-03-24 - This algorithm now accepts data with skipped or missing
+    timestamps. The DatetimeIndex (either times or the index of measured)
+    provided must still be regular, i.e. the intervals between points are of
+    equal length except where data is missing.
     """
 
     if times is None:
Review comment: I feel like this
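Assuming this branch, usage with a gappy measurement series might look like the sketch below; the site, window length and the synthetic "measured" signal are made up for illustration.

```python
import numpy as np
import pandas as pd
import pvlib

loc = pvlib.location.Location(40.0, -105.0, tz='Etc/GMT+7')
# Regular 1-minute index with a 15-minute block of timestamps missing
times = pd.date_range('2023-03-24 10:00', '2023-03-24 14:00',
                      freq='1min', tz=loc.tz)
times = times.delete(np.arange(60, 75))

cs = loc.get_clearsky(times)['ghi']
measured = cs * 0.98                      # stand-in for a measured GHI signal

# With this PR the result is aligned to `times`; windows that straddle the
# missing block are excluded from the clear classification
clear_samples = pvlib.clearsky.detect_clearsky(measured, cs, window_length=10)
```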
@@ -765,10 +797,17 @@ def detect_clearsky(measured, clearsky, times=None, window_length=10,
     else:
         meas = measured
 
-    if not isinstance(clearsky, pd.Series):
-        clear = pd.Series(clearsky, index=times)
+    if not isinstance(clear_sky, pd.Series):
+        clear = pd.Series(clear_sky, index=times)
+    # This clause is designed to address cases where measured has missing time
+    # steps - if this is the case, clear should be set to have the same
+    # missing time intervals as measured. Not doing this may cause issues with
+    # arrays of different lengths when evaluating comparison criteria and
+    # when indexing the Hankel matrix to construct clear_samples
+    elif len(clear_sky.index) != len(times):
+        clear = pd.Series(clear_sky, index=times)
     else:
-        clear = clearsky
+        clear = clear_sky
 
     sample_interval, samples_per_window = \
         tools._get_sample_intervals(times, window_length)

Review comment (on the new elif branch): Untested, and I suspect this will fail -
Reply: Just added a test for this in test_clearsky.py
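For the Series case, the new elif branch relies on pandas realigning by label when a Series is rebuilt on a shorter index, as in this small sketch (index and values made up):

```python
import numpy as np
import pandas as pd

full_index = pd.date_range('2023-03-24 12:00', periods=6, freq='10min')
times = full_index.delete(3)                  # measured is missing 12:30
clear_sky = pd.Series(np.linspace(500., 550., 6), index=full_index)

# Rebuilding the Series on `times` drops the 12:30 value, so `clear` and the
# measured data end up with the same length and the same missing interval
clear = pd.Series(clear_sky, index=times)
print(len(clear) == len(times))               # True
```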
@@ -777,6 +816,19 @@ def detect_clearsky(measured, clearsky, times=None, window_length=10,
     H = hankel(np.arange(samples_per_window),
                np.arange(samples_per_window-1, len(times)))
 
+    # Identify intervals with missing indices
+    time_h = times.values[H]
+    # Get maximum time step (in minutes) between consecutive Timestamps
+    # for each column
+    time_h_diff_max = np.max(np.diff(time_h, axis=0) /
+                             np.timedelta64(1, '60s'), axis=0)
+    # Get column indices where max time step > sample_interval
+    gaps = np.ravel(np.argwhere(time_h_diff_max > sample_interval))
+    # Get column indices where at least one of the values is a NaN
+    gaps = set().union(*[
+        gaps, np.ravel(np.argwhere(np.isnan(meas
+                                            .values[H].mean(axis=0))))])
+
     # calculate measurement statistics
     meas_mean, meas_max, meas_slope_nstd, meas_slope = _calc_stats(
         meas, samples_per_window, sample_interval, H)
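The gap detection above can be pictured with a toy index (the timestamps and 10-minute interval are illustrative; np.timedelta64(1, 'm') is used here simply to express the steps in minutes):

```python
import numpy as np
import pandas as pd
from scipy.linalg import hankel

# 10-minute data with the 12:20 sample missing
times = pd.DatetimeIndex(['2023-03-24 12:00', '2023-03-24 12:10',
                          '2023-03-24 12:30', '2023-03-24 12:40',
                          '2023-03-24 12:50'])
sample_interval = 10                                  # minutes
samples_per_window = 3
H = hankel(np.arange(samples_per_window),
           np.arange(samples_per_window - 1, len(times)))

time_h = times.values[H]
# Largest step (in minutes) between consecutive timestamps in each window
time_h_diff_max = np.max(np.diff(time_h, axis=0) /
                         np.timedelta64(1, 'm'), axis=0)
gaps = np.ravel(np.argwhere(time_h_diff_max > sample_interval))
# time_h_diff_max is [20., 20., 10.], so windows 0 and 1, which straddle the
# missing 12:20 sample, are flagged as gaps
```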
@@ -802,26 +854,55 @@ def detect_clearsky(measured, clearsky, times=None, window_length=10,
         line_diff = meas_line_length - clear_line_length
         slope_max_diff = _max_diff_windowed(
             meas - scaled_clear, H, samples_per_window)
 
-        # evaluate comparison criteria
-        c1 = np.abs(meas_mean - alpha*clear_mean) < mean_diff
-        c2 = np.abs(meas_max - alpha*clear_max) < max_diff
-        c3 = (line_diff > lower_line_length) & (line_diff < upper_line_length)
+        # Condition 1
+        c1 = np.abs(meas_mean - alpha*clear_mean)
+        c1_where_nan = c1[c1.isna()].index
+        c1 = c1 < mean_diff
+        # Condition 2
+        c2 = np.abs(meas_max - alpha*clear_max)
+        c2_where_nan = c2[c2.isna()].index
+        c2 = c2 < max_diff
+        # Condition 3a & 3b
+        c3_where_nan = line_diff[line_diff.isna()].index
+        c3a = line_diff > lower_line_length
+        c3b = line_diff < upper_line_length
+        c3 = np.logical_and(c3a, c3b)
+        # Condition 4
+        c4_where_nan = meas_slope_nstd[meas_slope_nstd.isna()].index
         c4 = meas_slope_nstd < var_diff
+        # Condition 5
+        c5_where_nan = slope_max_diff[slope_max_diff.isna()].index
         c5 = slope_max_diff < slope_dev
-        c6 = (clear_mean != 0) & ~np.isnan(clear_mean)
-        clear_windows = c1 & c2 & c3 & c4 & c5 & c6
+        # Condition 6
+        c6 = clear_mean != 0
+        c6_where_nan = clear_mean[clear_mean.isna()].index
+
+        # np.logical_and() maintains NaNs
+        clear_windows = pd.Series(
+            index=times, data=np.logical_and.reduce([c1, c2, c3, c4, c5, c6]))
+        windows_where_nan = pd.DatetimeIndex(set().union(*[
+            c1_where_nan, c2_where_nan, c3_where_nan, c4_where_nan,
+            c5_where_nan, c6_where_nan]))
+
+        clear_windows[windows_where_nan] = np.nan
 
         # create array to return
-        clear_samples = np.full_like(meas, False, dtype='bool')
+        # dtype='bool' removed because it typecast NaNs to False values
+        clear_samples = np.full_like(meas, False)
         # find the samples contained in any window classified as clear
-        idx = _clear_sample_index(clear_windows, samples_per_window, 'center',
-                                  H)
+        idx = _clear_sample_index(clear_windows, samples_per_window, gaps, H,
+                                  'center')
         clear_samples[idx] = True
 
+        # Assign NaN to datapoints that were originally NaNs
+        where_nan = np.argwhere(np.isnan(meas.values))
+        clear_samples[where_nan] = np.nan
+
         # find a new alpha
         previous_alpha = alpha
-        clear_meas = meas[clear_samples]
-        clear_clear = clear[clear_samples]
+        clear_meas = meas[idx]
+        clear_clear = clear[idx]
 
         def rmse(alpha):
             return np.sqrt(np.mean((clear_meas - alpha*clear_clear)**2))
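The NaN bookkeeping in this loop can be reduced to a small standalone sketch (two made-up criteria instead of six): record where a criterion is NaN before the comparison silently turns NaN into False, then restore NaN on those windows after combining the criteria.

```python
import numpy as np
import pandas as pd

idx = pd.date_range('2023-03-24 12:00', periods=5, freq='10min')
# NaN marks windows whose statistics could not be computed (e.g. over a gap)
crit_a = pd.Series([1.0, np.nan, 3.0, 0.5, 2.0], index=idx)
crit_b = pd.Series([0.1, 0.2, np.nan, 0.1, 0.3], index=idx)

a_where_nan = crit_a[crit_a.isna()].index
b_where_nan = crit_b[crit_b.isna()].index
c_a = crit_a < 1.5                      # NaN < 1.5 evaluates to False here
c_b = crit_b < 0.25

combined = pd.Series(index=idx, data=np.logical_and.reduce([c_a, c_b]))
where_nan = pd.DatetimeIndex(set().union(*[a_where_nan, b_where_nan]))
combined[where_nan] = np.nan
# combined: True, NaN, NaN, True, False -- the two undecidable windows are
# reported as NaN instead of being silently classified as not clear
```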