# src/gwtransport/utils.py
1"""General utilities for the 1D groundwater transport model."""
3from __future__ import annotations
5import io
6from collections.abc import Sequence
7from datetime import date
8from pathlib import Path
10import numpy as np
11import numpy.typing as npt
12import pandas as pd
13import requests
14from scipy import interpolate
15from scipy.linalg import null_space
16from scipy.optimize import minimize
18cache_dir = Path(__file__).parent.parent.parent / "cache"


def linear_interpolate(x_ref, y_ref, x_query, left=None, right=None):
    """
    Linear interpolation on monotonically increasing data.

    Parameters
    ----------
    x_ref : array-like
        Reference vector with sorted x-values.
    y_ref : array-like
        Reference vector with y-values.
    x_query : array-like
        Query x-values. Array may have any shape.
    left : float, optional
        Value to return for x_query < x_ref[0].
        - If `left` is None, y_ref[0] is returned (clamped to the first value).
        - If `left` is a float, such as np.nan, that value is returned.
    right : float, optional
        Value to return for x_query > x_ref[-1].
        - If `right` is None, y_ref[-1] is returned (clamped to the last value).
        - If `right` is a float, such as np.nan, that value is returned.

    Returns
    -------
    array
        Interpolated y-values.
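
    Examples
    --------
    Minimal usage sketch (illustrative values):

    >>> linear_interpolate([0.0, 1.0, 2.0], [0.0, 10.0, 20.0], [0.5, 1.5])
    array([ 5., 15.])
    >>> linear_interpolate([0.0, 1.0, 2.0], [0.0, 10.0, 20.0], [-1.0], left=np.nan)
    array([nan])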
46 """
47 x_ref = np.asarray(x_ref)
48 y_ref = np.asarray(y_ref)
49 x_query = np.asarray(x_query)
51 # Find indices where x_query would be inserted in x_ref
52 idx_no_edges = np.searchsorted(x_ref, x_query)
54 idx = np.clip(idx_no_edges, 1, len(x_ref) - 1)
56 # Calculate interpolation weights
57 x0 = x_ref[idx - 1]
58 x1 = x_ref[idx]
59 y0 = y_ref[idx - 1]
60 y1 = y_ref[idx]
62 # Perform linear interpolation
63 weights = (x_query - x0) / (x1 - x0)
64 y_query = y0 + weights * (y1 - y0)
66 # Handle edge cases
67 if left is None:
68 y_query = np.where(x_query < x_ref[0], y_ref[0], y_query)
69 if right is None:
70 y_query = np.where(x_query > x_ref[-1], y_ref[-1], y_query)
71 if isinstance(left, float):
72 y_query = np.where(x_query < x_ref[0], left, y_query)
73 if isinstance(right, float):
74 y_query = np.where(x_query > x_ref[-1], right, y_query)
76 return y_query


def interp_series(series, index_new, **interp1d_kwargs):
    """
    Interpolate a pandas.Series to a new index.

    Parameters
    ----------
    series : pandas.Series
        Series to interpolate.
    index_new : pandas.DatetimeIndex
        New index to interpolate to.
    interp1d_kwargs : dict, optional
        Keyword arguments passed to scipy.interpolate.interp1d. Default is {}.

    Returns
    -------
    numpy.ndarray
        Interpolated values at index_new.
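
    Examples
    --------
    Minimal usage sketch (illustrative values):

    >>> s = pd.Series([0.0, 2.0], index=pd.DatetimeIndex(["2020-01-01", "2020-01-03"]))
    >>> interp_series(s, pd.DatetimeIndex(["2020-01-02"]))
    array([1.])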
96 """
97 series = series[series.index.notna() & series.notna()]
98 dt = (series.index - series.index[0]) / pd.to_timedelta(1, unit="D")
99 dt_interp = (index_new - series.index[0]) / pd.to_timedelta(1, unit="D")
100 interp_obj = interpolate.interp1d(dt, series.values, bounds_error=False, **interp1d_kwargs)
101 return interp_obj(dt_interp)


def diff(a, alignment="centered"):
    """Compute the cell widths for a given array of cell coordinates.

    If alignment is "centered", the coordinates are assumed to be centered in the cells.
    If alignment is "left", the coordinates are assumed to be at the left edge of the cells.
    If alignment is "right", the coordinates are assumed to be at the right edge of the cells.

    Parameters
    ----------
    a : array-like
        Input array of cell coordinates.
    alignment : str, optional
        Position of the coordinates within the cells: "centered" (default),
        "left", or "right".

    Returns
    -------
    array
        Array of cell widths, same length as `a`.
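
    Examples
    --------
    Minimal usage sketch (illustrative values):

    >>> diff(np.array([0.0, 1.0, 3.0]))
    array([1. , 1.5, 2. ])
    >>> diff(np.array([0.0, 1.0, 3.0]), alignment="left")
    array([1., 2., 2.])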
120 """
121 if alignment == "centered":
122 mid = a[:-1] + (a[1:] - a[:-1]) / 2
123 return np.concatenate((a[[1]] - a[[0]], mid[1:] - mid[:-1], a[[-1]] - a[[-2]]))
124 if alignment == "left":
125 return np.concatenate((a[1:] - a[:-1], a[[-1]] - a[[-2]]))
126 if alignment == "right":
127 return np.concatenate((a[[1]] - a[[0]], a[1:] - a[:-1]))
129 msg = f"Invalid alignment: {alignment}"
130 raise ValueError(msg)


def linear_average(  # noqa: C901
    x_data: Sequence[float] | npt.NDArray[np.float64],
    y_data: Sequence[float] | npt.NDArray[np.float64],
    x_edges: Sequence[float] | npt.NDArray[np.float64],
    extrapolate_method: str = "nan",
) -> npt.NDArray[np.float64]:
    """
    Compute the average value of a piecewise linear time series between specified x-edges.

    Parameters
    ----------
    x_data : array-like
        x-coordinates of the time series data points, must be in ascending order.
    y_data : array-like
        y-coordinates of the time series data points.
    x_edges : array-like
        x-coordinates of the integration edges. Can be 1D or 2D.
        - If 1D: shape (n_edges,), must be in ascending order.
        - If 2D: shape (n_series, n_edges), each row must be in ascending order.
    extrapolate_method : str, optional
        Method for handling extrapolation. Default is 'nan'.
        - 'outer': Extrapolate using the outermost data points.
        - 'nan': Extrapolate using np.nan.
        - 'raise': Raise an error for out-of-bounds values.

    Returns
    -------
    numpy.ndarray
        2D array of average values between consecutive pairs of x_edges.
        Shape is (n_series, n_bins) where n_bins = n_edges - 1.
        If x_edges is 1D, n_series = 1.

    Examples
    --------
    >>> x_data = [0, 1, 2, 3]
    >>> y_data = [0, 1, 1, 0]
    >>> x_edges = [0, 1.5, 3]
    >>> linear_average(x_data, y_data, x_edges)
    array([[0.66666667, 0.66666667]])

    >>> x_edges_2d = [[0, 1.5, 3], [0.5, 2, 3]]
    >>> linear_average(x_data, y_data, x_edges_2d)
    array([[0.66666667, 0.66666667],
           [0.91666667, 0.5       ]])
    """
    # Convert inputs to numpy arrays
    x_data = np.asarray(x_data, dtype=float)
    y_data = np.asarray(y_data, dtype=float)
    x_edges = np.asarray(x_edges, dtype=float)

    # Ensure x_edges is always 2D
    if x_edges.ndim == 1:
        x_edges = x_edges[np.newaxis, :]
    elif x_edges.ndim != 2:  # noqa: PLR2004
        msg = "x_edges must be 1D or 2D array"
        raise ValueError(msg)

    # Input validation
    if len(x_data) != len(y_data) or len(x_data) == 0:
        msg = "x_data and y_data must have the same length and be non-empty"
        raise ValueError(msg)
    if x_edges.shape[1] < 2:  # noqa: PLR2004
        msg = "x_edges must contain at least 2 values in each row"
        raise ValueError(msg)
    if not np.all(np.diff(x_data) >= 0):
        msg = "x_data must be in ascending order"
        raise ValueError(msg)
    if not np.all(np.diff(x_edges, axis=1) >= 0):
        msg = "x_edges must be in ascending order along each row"
        raise ValueError(msg)

    # Filter out NaN values
    show = ~np.isnan(x_data) & ~np.isnan(y_data)
    if show.sum() < 2:  # noqa: PLR2004
        if show.sum() == 1 and extrapolate_method == "outer":
            # For a single data point with outer extrapolation, use a constant value
            constant_value = y_data[show][0]
            return np.full(shape=(x_edges.shape[0], x_edges.shape[1] - 1), fill_value=constant_value)
        return np.full(shape=(x_edges.shape[0], x_edges.shape[1] - 1), fill_value=np.nan)

    x_data_clean = x_data[show]
    y_data_clean = y_data[show]

    # Handle extrapolation for all series at once (vectorized)
    if extrapolate_method == "outer":
        edges_processed = np.clip(x_edges, x_data_clean.min(), x_data_clean.max())
    elif extrapolate_method == "raise":
        if np.any(x_edges < x_data_clean.min()) or np.any(x_edges > x_data_clean.max()):
            msg = "x_edges must be within the range of x_data"
            raise ValueError(msg)
        edges_processed = x_edges.copy()
    else:  # nan method
        edges_processed = x_edges.copy()

    # Create a combined grid of all unique x points (data + all edges)
    all_unique_x = np.unique(np.concatenate([x_data_clean, edges_processed.ravel()]))

    # Interpolate y values at all unique x points once
    all_unique_y = np.interp(all_unique_x, x_data_clean, y_data_clean, left=np.nan, right=np.nan)

    # Compute cumulative integrals once using the trapezoidal rule
    dx = np.diff(all_unique_x)
    y_avg = (all_unique_y[:-1] + all_unique_y[1:]) / 2
    segment_integrals = dx * y_avg
    # Replace NaN values with 0 to avoid breaking the cumulative sum
    segment_integrals = np.nan_to_num(segment_integrals, nan=0.0)
    cumulative_integral = np.concatenate([[0], np.cumsum(segment_integrals)])

    # Vectorized computation for all series:
    # find indices of all edges in the combined grid
    edge_indices = np.searchsorted(all_unique_x, edges_processed)

    # Compute integral between consecutive edges for all series (vectorized)
    integral_values = cumulative_integral[edge_indices[:, 1:]] - cumulative_integral[edge_indices[:, :-1]]

    # Compute widths between consecutive edges for all series (vectorized)
    edge_widths = np.diff(edges_processed, axis=1)

    # Handle zero-width intervals (vectorized)
    zero_width_mask = edge_widths == 0
    result = np.zeros_like(edge_widths)

    # For non-zero width intervals, compute average = integral / width (vectorized)
    non_zero_mask = ~zero_width_mask
    result[non_zero_mask] = integral_values[non_zero_mask] / edge_widths[non_zero_mask]

    # For zero-width intervals, interpolate the y-value directly (vectorized)
    if np.any(zero_width_mask):
        zero_width_positions = edges_processed[:, :-1][zero_width_mask]
        result[zero_width_mask] = np.interp(zero_width_positions, x_data_clean, y_data_clean)

    # Handle extrapolation when the 'nan' method is used (vectorized)
    if extrapolate_method == "nan":
        # Set out-of-range bins to NaN
        bins_within_range = (x_edges[:, :-1] >= x_data_clean.min()) & (x_edges[:, 1:] <= x_data_clean.max())
        result[~bins_within_range] = np.nan

    return result


def partial_isin(bin_edges_in, bin_edges_out):
    """
    Calculate the fraction of each input bin that overlaps with each output bin.

    This function computes a matrix where element (i, j) represents the fraction
    of input bin i that overlaps with output bin j. The computation uses
    vectorized operations to avoid loops.

    Parameters
    ----------
    bin_edges_in : array_like
        1D array of input bin edges in ascending order. For n_in bins, there
        should be n_in+1 edges.
    bin_edges_out : array_like
        1D array of output bin edges in ascending order. For n_out bins, there
        should be n_out+1 edges.

    Returns
    -------
    overlap_matrix : numpy.ndarray
        Dense matrix of shape (n_in, n_out) where n_in is the number of input
        bins and n_out is the number of output bins. Each element (i, j)
        represents the fraction of input bin i that overlaps with output bin j.
        Values range from 0 (no overlap) to 1 (complete overlap).

    Notes
    -----
    - Both input arrays must be sorted in ascending order
    - The function leverages the sorted nature of both inputs for efficiency
    - Uses vectorized operations to handle large bin arrays efficiently
    - All overlaps sum to 1.0 for each input bin when output bins fully cover input range

    Examples
    --------
    >>> bin_edges_in = np.array([0, 10, 20, 30])
    >>> bin_edges_out = np.array([5, 15, 25])
    >>> partial_isin(bin_edges_in, bin_edges_out)
    array([[0.5, 0. ],
           [0.5, 0.5],
           [0. , 0.5]])
    """
    # Convert inputs to numpy arrays
    bin_edges_in = np.asarray(bin_edges_in, dtype=float)
    bin_edges_out = np.asarray(bin_edges_out, dtype=float)

    # Validate inputs
    if bin_edges_in.ndim != 1 or bin_edges_out.ndim != 1:
        msg = "Both bin_edges_in and bin_edges_out must be 1D arrays"
        raise ValueError(msg)
    if len(bin_edges_in) < 2 or len(bin_edges_out) < 2:  # noqa: PLR2004
        msg = "Both edge arrays must have at least 2 elements"
        raise ValueError(msg)

    # Check ascending order, ignoring NaN values
    diffs_in = np.diff(bin_edges_in)
    valid_diffs_in = ~np.isnan(diffs_in)
    if np.any(valid_diffs_in) and not np.all(diffs_in[valid_diffs_in] > 0):
        msg = "bin_edges_in must be in ascending order"
        raise ValueError(msg)

    diffs_out = np.diff(bin_edges_out)
    valid_diffs_out = ~np.isnan(diffs_out)
    if np.any(valid_diffs_out) and not np.all(diffs_out[valid_diffs_out] > 0):
        msg = "bin_edges_out must be in ascending order"
        raise ValueError(msg)

    # Build the matrix using a fully vectorized approach:
    # column/row vectors broadcast over all input-output bin combinations
    in_left = bin_edges_in[:-1, None]  # Shape: (n_bins_in, 1)
    in_right = bin_edges_in[1:, None]  # Shape: (n_bins_in, 1)
    in_width = np.diff(bin_edges_in)[:, None]  # Shape: (n_bins_in, 1)

    out_left = bin_edges_out[None, :-1]  # Shape: (1, n_bins_out)
    out_right = bin_edges_out[None, 1:]  # Shape: (1, n_bins_out)

    # Calculate overlaps for all combinations using broadcasting
    overlap_left = np.maximum(in_left, out_left)  # Shape: (n_bins_in, n_bins_out)
    overlap_right = np.minimum(in_right, out_right)  # Shape: (n_bins_in, n_bins_out)

    # Calculate overlap widths (zero where there is no overlap)
    overlap_widths = np.maximum(0, overlap_right - overlap_left)

    # Calculate fractions (NaN widths will result in NaN fractions)
    return overlap_widths / in_width


def time_bin_overlap(tedges, bin_tedges):
    """
    Calculate the fraction of each time bin that overlaps with each time range.

    This function computes an array where element (i, j) represents the fraction
    of time bin j that overlaps with time range i. The computation uses
    vectorized operations to avoid loops.

    Parameters
    ----------
    tedges : array_like
        1D array of time bin edges in ascending order. For n bins, there
        should be n+1 edges.
    bin_tedges : list of tuples
        List of tuples where each tuple contains (start_time, end_time)
        defining a time range.

    Returns
    -------
    overlap_array : numpy.ndarray
        Array of shape (len(bin_tedges), n_bins) where n_bins is the number of
        time bins. Each element (i, j) represents the fraction of time bin j
        that overlaps with time range i. Values range from 0 (no overlap) to
        1 (complete overlap).

    Notes
    -----
    - tedges must be sorted in ascending order
    - Uses vectorized operations to handle large arrays efficiently
    - Time ranges in bin_tedges can be in any order and can overlap

    Examples
    --------
    >>> tedges = np.array([0, 10, 20, 30])
    >>> bin_tedges = [(5, 15), (25, 35)]
    >>> time_bin_overlap(tedges, bin_tedges)
    array([[0.5, 0.5, 0. ],
           [0. , 0. , 0.5]])
    """
    # Convert inputs to numpy arrays
    tedges = np.asarray(tedges)
    bin_tedges_array = np.asarray(bin_tedges)

    # Validate inputs
    if tedges.ndim != 1:
        msg = "tedges must be a 1D array"
        raise ValueError(msg)
    if len(tedges) < 2:  # noqa: PLR2004
        msg = "tedges must have at least 2 elements"
        raise ValueError(msg)
    if bin_tedges_array.size == 0:
        msg = "bin_tedges must be non-empty"
        raise ValueError(msg)

    # Calculate overlaps for all combinations using broadcasting
    overlap_left = np.maximum(bin_tedges_array[:, [0]], tedges[None, :-1])
    overlap_right = np.minimum(bin_tedges_array[:, [1]], tedges[None, 1:])
    overlap_widths = np.maximum(0, overlap_right - overlap_left)

    # Calculate fractions (handle division by zero for zero-width bins)
    bin_width_bc = np.diff(tedges)[None, :]  # Shape: (1, n_bins)

    return np.divide(
        overlap_widths, bin_width_bc, out=np.zeros_like(overlap_widths, dtype=float), where=bin_width_bc != 0.0
    )


def generate_failed_coverage_badge():
    """Generate a badge indicating failed coverage."""
    from genbadge import Badge  # type: ignore  # noqa: PLC0415

    b = Badge(left_txt="coverage", right_txt="failed", color="red")
    b.write_to("coverage_failed.svg", use_shields=False)


def combine_bin_series(a, a_edges, b, b_edges, extrapolation=0.0):
    """
    Combine two binned series onto a common set of unique edges.

    This function takes two binned series (a and b) with their respective bin edges
    and creates new series (c and d) that are defined on a combined set of unique
    edges from both input edge arrays.

    Parameters
    ----------
    a : array-like
        Values for the first binned series.
    a_edges : array-like
        Bin edges for the first series. Must have len(a) + 1 elements.
    b : array-like
        Values for the second binned series.
    b_edges : array-like
        Bin edges for the second series. Must have len(b) + 1 elements.
    extrapolation : str or float, optional
        Method for handling combined bins that fall outside the original series ranges.
        - 'nearest': Use the nearest original bin value
        - float value (e.g., np.nan, 0.0): Fill with the specified value (default: 0.0)

    Returns
    -------
    c : numpy.ndarray
        Values from series a mapped to the combined edge structure.
    c_edges : numpy.ndarray
        Combined unique edges from a_edges and b_edges.
    d : numpy.ndarray
        Values from series b mapped to the combined edge structure.
    d_edges : numpy.ndarray
        Combined unique edges from a_edges and b_edges (same as c_edges).

    Notes
    -----
    The combined edges are created by taking the union of all unique values
    from both a_edges and b_edges, sorted in ascending order. The values
    are then broadcasted/repeated for each combined bin that falls within
    the original bin's range.
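
    Examples
    --------
    Minimal usage sketch (illustrative values):

    >>> c, c_edges, d, d_edges = combine_bin_series([1.0], [0, 2], [5.0, 6.0], [0, 1, 2])
    >>> c
    array([1., 1.])
    >>> d
    array([5., 6.])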
475 """
476 # Convert inputs to numpy arrays
477 a = np.asarray(a, dtype=float)
478 a_edges = np.asarray(a_edges, dtype=float)
479 b = np.asarray(b, dtype=float)
480 b_edges = np.asarray(b_edges, dtype=float)
482 # Validate inputs
483 if len(a_edges) != len(a) + 1:
484 msg = "a_edges must have len(a) + 1 elements"
485 raise ValueError(msg)
486 if len(b_edges) != len(b) + 1:
487 msg = "b_edges must have len(b) + 1 elements"
488 raise ValueError(msg)
490 # Create combined unique edges
491 combined_edges = np.unique(np.concatenate([a_edges, b_edges]))
493 # Initialize output arrays
494 c = np.zeros(len(combined_edges) - 1)
495 d = np.zeros(len(combined_edges) - 1)
497 # Vectorized mapping using searchsorted - find which original bin each combined bin belongs to
498 # For series a: find which original bin each combined bin center falls into
499 combined_bin_centers = (combined_edges[:-1] + combined_edges[1:]) / 2
500 a_bin_assignment = np.searchsorted(a_edges, combined_bin_centers, side="right") - 1
501 a_bin_assignment = np.clip(a_bin_assignment, 0, len(a) - 1)
503 # Handle extrapolation for series a
504 if extrapolation == "nearest":
505 # Assign all values using nearest neighbor (already clipped)
506 c[:] = a[a_bin_assignment]
507 else:
508 # Only assign values where the combined bin is completely within the original bin
509 a_valid_mask = (combined_edges[:-1] >= a_edges[a_bin_assignment]) & (
510 combined_edges[1:] <= a_edges[a_bin_assignment + 1]
511 )
512 c[a_valid_mask] = a[a_bin_assignment[a_valid_mask]]
513 # Fill out-of-range bins with extrapolation value
514 c[~a_valid_mask] = extrapolation
516 # Handle extrapolation for series b
517 b_bin_assignment = np.searchsorted(b_edges, combined_bin_centers, side="right") - 1
518 b_bin_assignment = np.clip(b_bin_assignment, 0, len(b) - 1)
520 if extrapolation == "nearest":
521 # Assign all values using nearest neighbor (already clipped)
522 d[:] = b[b_bin_assignment]
523 else:
524 # Only assign values where the combined bin is completely within the original bin
525 b_valid_mask = (combined_edges[:-1] >= b_edges[b_bin_assignment]) & (
526 combined_edges[1:] <= b_edges[b_bin_assignment + 1]
527 )
528 d[b_valid_mask] = b[b_bin_assignment[b_valid_mask]]
529 # Fill out-of-range bins with extrapolation value
530 d[~b_valid_mask] = extrapolation
532 # Return the combined series
533 c_edges = combined_edges
534 d_edges = combined_edges.copy()
536 return c, c_edges, d, d_edges


def compute_time_edges(tedges, tstart, tend, number_of_bins):
    """
    Compute time edges for binning data based on provided time parameters.

    This function creates a DatetimeIndex of time bin edges from one of three possible
    input formats: explicit edges, start times, or end times. The resulting edges
    define the boundaries of time intervals for data binning.

    Provide exactly one of tedges, tstart, or tend, and leave the others at None.

    Parameters
    ----------
    tedges : pandas.DatetimeIndex or None
        Explicit time edges for the bins. If provided, must have one more element
        than the number of bins (n_bins + 1). Takes precedence over tstart and tend.
    tstart : pandas.DatetimeIndex or None
        Start times for each bin. Must have the same number of elements as the
        number of bins. Used when tedges is None.
    tend : pandas.DatetimeIndex or None
        End times for each bin. Must have the same number of elements as the
        number of bins. Used when both tedges and tstart are None.
    number_of_bins : int
        The expected number of time bins. Used for validation against the provided
        time parameters.

    Returns
    -------
    pandas.DatetimeIndex
        Time edges defining the boundaries of the time bins. Has one more element
        than number_of_bins.

    Raises
    ------
    ValueError
        If tedges has incorrect length (not number_of_bins + 1).
        If tstart has incorrect length (not equal to number_of_bins).
        If tend has incorrect length (not equal to number_of_bins).
        If none of tedges, tstart, or tend are provided.

    Notes
    -----
    - When using tstart, the function assumes uniform spacing and extrapolates
      the final edge based on the spacing between the last two start times.
    - When using tend, the function assumes uniform spacing and extrapolates
      the first edge based on the spacing between the first two end times.
    - All input time data is converted to pandas.DatetimeIndex for consistency.
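
    Examples
    --------
    Minimal usage sketch (illustrative values; bin start times given, final edge extrapolated):

    >>> compute_time_edges(None, pd.DatetimeIndex(["2020-01-01", "2020-01-02"]), None, number_of_bins=2)
    DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)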
585 """
586 if tedges is not None:
587 tedges = pd.DatetimeIndex(tedges)
588 if number_of_bins != len(tedges) - 1:
589 msg = "tedges must have one more element than flow"
590 raise ValueError(msg)
592 elif tstart is not None:
593 # Assume the index refers to the time at the start of the measurement interval
594 tstart = pd.DatetimeIndex(tstart)
595 if number_of_bins != len(tstart):
596 msg = "tstart must have the same number of elements as flow"
597 raise ValueError(msg)
599 tedges = tstart.append(tstart[[-1]] + (tstart[-1] - tstart[-2]))
601 elif tend is not None:
602 # Assume the index refers to the time at the end of the measurement interval
603 tend = pd.DatetimeIndex(tend)
604 if number_of_bins != len(tend):
605 msg = "tend must have the same number of elements as flow"
606 raise ValueError(msg)
608 tedges = (tend[[0]] - (tend[1] - tend[0])).append(tend)
610 else:
611 msg = "Either provide tedges, tstart, and tend"
612 raise ValueError(msg)
613 return tedges


def get_soil_temperature(station_number: int = 260, *, interpolate_missing_values: bool = True) -> pd.DataFrame:
    """
    Download soil temperature data from the KNMI and return it as a pandas DataFrame.

    The data is available for the following KNMI weather stations:
    - 260: De Bilt, the Netherlands (from 1981)
    - 273: Marknesse, the Netherlands (from 1989)
    - 286: Nieuw Beerta, the Netherlands (from 1990)
    - 323: Wilhelminadorp, the Netherlands (from 1989)

    TB1 = soil temperature at 5 cm depth (degrees Celsius) at observation time
    TB2 = soil temperature at 10 cm depth (degrees Celsius) at observation time
    TB3 = soil temperature at 20 cm depth (degrees Celsius) at observation time
    TB4 = soil temperature at 50 cm depth (degrees Celsius) at observation time
    TB5 = soil temperature at 100 cm depth (degrees Celsius) at observation time
    TNB1 = minimum soil temperature at 5 cm depth over the past 6 hours (degrees Celsius)
    TNB2 = minimum soil temperature at 10 cm depth over the past 6 hours (degrees Celsius)
    TXB1 = maximum soil temperature at 5 cm depth over the past 6 hours (degrees Celsius)
    TXB2 = maximum soil temperature at 10 cm depth over the past 6 hours (degrees Celsius)

    Parameters
    ----------
    station_number : int, {260, 273, 286, 323}
        The KNMI station number for which to download soil temperature data.
        Default is 260 (De Bilt).
    interpolate_missing_values : bool, optional
        If True, missing values are interpolated and trailing NaN values are extrapolated
        with the previous value. If False, missing values remain as NaN. Default is True.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing soil temperature data in Celsius with a DatetimeIndex.
        Columns include TB1, TB2, TB3, TB4, TB5, TNB1, TNB2, TXB1, TXB2.

    Notes
    -----
    - KNMI: Royal Netherlands Meteorological Institute
    - The timeseries may contain NaN values for missing data.
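
    Examples
    --------
    Minimal usage sketch; this downloads data from the KNMI, so the doctest is skipped:

    >>> df = get_soil_temperature(260)  # doctest: +SKIP
    >>> df["TB3"].describe()  # doctest: +SKIP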
655 """
656 # File-based daily cache
657 cache_dir.mkdir(exist_ok=True)
659 today = date.today().isoformat() # noqa: DTZ011
660 cache_path = cache_dir / f"soil_temp_{station_number}_{interpolate_missing_values}_{today}.pkl"
662 # Check if cached file exists and is from today
663 if cache_path.exists():
664 return pd.read_pickle(cache_path) # noqa: S301
666 # Clean up old cache files to prevent disk bloat
667 for old_file in cache_dir.glob(f"soil_temp_{station_number}_{interpolate_missing_values}_*.pkl"):
668 old_file.unlink(missing_ok=True)
670 url = f"https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/bodemtemps/bodemtemps_{station_number}.zip"
672 dtypes = {
673 "YYYYMMDD": "int32",
674 "HH": "int8",
675 " TB1": "float32",
676 " TB3": "float32",
677 " TB2": "float32",
678 " TB4": "float32",
679 " TB5": "float32",
680 " TNB1": "float32",
681 " TNB2": "float32",
682 " TXB1": "float32",
683 " TXB2": "float32",
684 }
686 # Download the ZIP file
687 with requests.get(url, params={"download": "zip"}, timeout=10) as response:
688 response.raise_for_status()
690 df = pd.read_csv(
691 io.BytesIO(response.content),
692 compression="zip",
693 dtype=dtypes,
694 usecols=dtypes.keys(),
695 skiprows=16,
696 sep=",",
697 index_col=None,
698 na_values=[" "],
699 engine="c",
700 parse_dates=False,
701 )
703 df.index = pd.to_datetime(df["YYYYMMDD"].values, format=r"%Y%m%d").tz_localize("UTC") + pd.to_timedelta(
704 df["HH"].values, unit="h"
705 )
707 df.drop(columns=["YYYYMMDD", "HH"], inplace=True)
708 df.columns = df.columns.str.strip()
709 df /= 10.0
711 if interpolate_missing_values:
712 # Fill NaN values with interpolate linearly and then forward fill
713 df.interpolate(method="linear", inplace=True)
714 df.ffill(inplace=True)
716 # Save to cache for future use
717 df.to_pickle(cache_path)
718 return df


def solve_underdetermined_system(
    coefficient_matrix,
    rhs_vector,
    *,
    nullspace_objective="squared_differences",
    optimization_method="BFGS",
):
    """
    Solve an underdetermined linear system with nullspace regularization.

    For an underdetermined system Ax = b where A has more columns than rows,
    multiple solutions exist. This function computes a least-squares solution
    and then selects a specific solution from the nullspace based on a
    regularization objective.

    Parameters
    ----------
    coefficient_matrix : array_like
        Coefficient matrix of shape (m, n) where m < n (underdetermined).
        May contain NaN values in some rows, which will be excluded from the system.
    rhs_vector : array_like
        Right-hand side vector of length m. May contain NaN values corresponding
        to NaN rows in coefficient_matrix, which will be excluded from the system.
    nullspace_objective : str or callable, optional
        Objective function to minimize in the nullspace. Options:

        * "squared_differences" : Minimize sum of squared differences between
          adjacent elements: sum((x[i+1] - x[i])**2)
        * "summed_differences" : Minimize sum of absolute differences between
          adjacent elements: sum(|x[i+1] - x[i]|)
        * callable : Custom objective function with signature
          ``objective(coeffs, x_ls, nullspace_basis)`` where:

          - coeffs : optimization variables (nullspace coefficients)
          - x_ls : least-squares solution
          - nullspace_basis : nullspace basis matrix

        Default is "squared_differences".
    optimization_method : str, optional
        Optimization method passed to scipy.optimize.minimize.
        Default is "BFGS".

    Returns
    -------
    numpy.ndarray
        Solution vector that minimizes the specified nullspace objective.
        Has length n (number of columns in coefficient_matrix).

    Raises
    ------
    ValueError
        If optimization fails, if coefficient_matrix and rhs_vector have incompatible shapes,
        or if an unknown nullspace objective is specified.

    Notes
    -----
    The algorithm follows these steps:

    1. Remove rows with NaN values from both coefficient_matrix and rhs_vector
    2. Compute least-squares solution: x_ls = pinv(valid_matrix) @ valid_rhs
    3. Compute nullspace basis: N = null_space(valid_matrix)
    4. Find nullspace coefficients: coeffs = argmin objective(x_ls + N @ coeffs)
    5. Return final solution: x = x_ls + N @ coeffs

    For the built-in objectives:

    * "squared_differences" provides smooth solutions, minimizing rapid changes
    * "summed_differences" provides sparse solutions, promoting piecewise constant behavior

    Examples
    --------
    Basic usage with the default squared differences objective:

    >>> import numpy as np
    >>> from gwtransport.utils import solve_underdetermined_system
    >>>
    >>> # Create underdetermined system (2 equations, 4 unknowns)
    >>> matrix = np.array([[1, 2, 1, 0], [0, 1, 2, 1]])
    >>> rhs = np.array([3, 4])
    >>>
    >>> # Solve with squared differences regularization
    >>> x = solve_underdetermined_system(matrix, rhs)
    >>> print(f"Solution: {x}")
    >>> print(f"Residual: {np.linalg.norm(matrix @ x - rhs):.2e}")

    With the summed differences objective:

    >>> x_sparse = solve_underdetermined_system(
    ...     matrix, rhs, nullspace_objective="summed_differences"
    ... )

    With a custom objective function:

    >>> def custom_objective(coeffs, x_ls, nullspace_basis):
    ...     x = x_ls + nullspace_basis @ coeffs
    ...     return np.sum(x**2)  # Minimize L2 norm
    >>>
    >>> x_custom = solve_underdetermined_system(
    ...     matrix, rhs, nullspace_objective=custom_objective
    ... )

    Handling NaN values:

    >>> # System with missing data
    >>> matrix_nan = np.array([
    ...     [1, 2, 1, 0],
    ...     [np.nan, np.nan, np.nan, np.nan],
    ...     [0, 1, 2, 1],
    ... ])
    >>> rhs_nan = np.array([3, np.nan, 4])
    >>>
    >>> x_nan = solve_underdetermined_system(matrix_nan, rhs_nan)
    """
    matrix = np.asarray(coefficient_matrix)
    rhs = np.asarray(rhs_vector)

    if matrix.shape[0] != len(rhs):
        msg = f"coefficient_matrix has {matrix.shape[0]} rows but rhs_vector has {len(rhs)} elements"
        raise ValueError(msg)

    # Identify valid rows (no NaN values in either matrix or rhs)
    valid_rows = ~np.isnan(matrix).any(axis=1) & ~np.isnan(rhs)

    if not np.any(valid_rows):
        msg = "No valid rows found (all contain NaN values)"
        raise ValueError(msg)

    valid_matrix = matrix[valid_rows]
    valid_rhs = rhs[valid_rows]

    # Compute least-squares solution
    x_ls, *_ = np.linalg.lstsq(valid_matrix, valid_rhs, rcond=None)

    # Compute nullspace
    nullspace_basis = null_space(valid_matrix, rcond=None)
    nullrank = nullspace_basis.shape[1]

    if nullrank == 0:
        # System is determined; return the least-squares solution
        return x_ls

    # Optimize in the nullspace
    coeffs = _optimize_nullspace_coefficients(x_ls, nullspace_basis, nullspace_objective, optimization_method)

    return x_ls + nullspace_basis @ coeffs


def _optimize_nullspace_coefficients(x_ls, nullspace_basis, nullspace_objective, optimization_method):
    """Optimize coefficients in the nullspace to minimize the objective."""
    nullrank = nullspace_basis.shape[1]
    objective_func = _get_nullspace_objective_function(nullspace_objective)
    coeffs_0 = np.zeros(nullrank)

    # For stability, always start with squared differences if using summed differences
    if nullspace_objective == "summed_differences":
        res_init = minimize(
            _squared_differences_objective,
            x0=coeffs_0,
            args=(x_ls, nullspace_basis),
            method=optimization_method,
        )
        if not res_init.success:
            msg = f"Initial optimization failed: {res_init.message}"
            raise ValueError(msg)
        coeffs_0 = res_init.x

    # Final optimization with the target objective
    res = minimize(
        objective_func,
        x0=coeffs_0,
        args=(x_ls, nullspace_basis),
        method=optimization_method,
    )

    if not res.success:
        msg = f"Optimization failed: {res.message}"
        raise ValueError(msg)

    return res.x


def _squared_differences_objective(coeffs, x_ls, nullspace_basis):
    """Minimize sum of squared differences between adjacent elements."""
    x = x_ls + nullspace_basis @ coeffs
    return np.sum(np.square(x[1:] - x[:-1]))


def _summed_differences_objective(coeffs, x_ls, nullspace_basis):
    """Minimize sum of absolute differences between adjacent elements."""
    x = x_ls + nullspace_basis @ coeffs
    return np.sum(np.abs(x[1:] - x[:-1]))


def _get_nullspace_objective_function(nullspace_objective):
    """Get the objective function for nullspace optimization."""
    if nullspace_objective == "squared_differences":
        return _squared_differences_objective
    if nullspace_objective == "summed_differences":
        return _summed_differences_objective
    if callable(nullspace_objective):
        return nullspace_objective
    msg = f"Unknown nullspace objective: {nullspace_objective}"
    raise ValueError(msg)