Coverage for src/gwtransport/utils.py: 0%

281 statements  


1"""General utilities for the 1D groundwater transport model.""" 

2 

3from __future__ import annotations 

4 

5import io 

6from collections.abc import Sequence 

7from datetime import date 

8from pathlib import Path 

9 

10import numpy as np 

11import numpy.typing as npt 

12import pandas as pd 

13import requests 

14from scipy import interpolate 

15from scipy.linalg import null_space 

16from scipy.optimize import minimize 

17 

18cache_dir = Path(__file__).parent.parent.parent / "cache" 

19 

20 

def linear_interpolate(x_ref, y_ref, x_query, left=None, right=None):
    """
    Linear interpolation on monotonically increasing data.

    Parameters
    ----------
    x_ref : array-like
        Reference vector with sorted x-values.
    y_ref : array-like
        Reference vector with y-values.
    x_query : array-like
        Query x-values. Array may have any shape.
    left : float, optional
        Value to return for x_query < x_ref[0].
        - If `left` is None, y_ref[0] is returned.
        - If `left` is a float, such as np.nan, that value is returned.
    right : float, optional
        Value to return for x_query > x_ref[-1].
        - If `right` is None, y_ref[-1] is returned.
        - If `right` is a float, such as np.nan, that value is returned.

    Returns
    -------
    array
        Interpolated y-values.
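
    Examples
    --------
    A minimal sketch of typical usage:

    >>> linear_interpolate([0.0, 1.0, 2.0], [0.0, 10.0, 20.0], [0.5, 1.5])
    array([ 5., 15.])
    >>> linear_interpolate([0.0, 1.0, 2.0], [0.0, 10.0, 20.0], [-1.0], left=np.nan)
    array([nan])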

46 """ 

47 x_ref = np.asarray(x_ref) 

48 y_ref = np.asarray(y_ref) 

49 x_query = np.asarray(x_query) 

50 

51 # Find indices where x_query would be inserted in x_ref 

52 idx_no_edges = np.searchsorted(x_ref, x_query) 

53 

54 idx = np.clip(idx_no_edges, 1, len(x_ref) - 1) 

55 

56 # Calculate interpolation weights 

57 x0 = x_ref[idx - 1] 

58 x1 = x_ref[idx] 

59 y0 = y_ref[idx - 1] 

60 y1 = y_ref[idx] 

61 

62 # Perform linear interpolation 

63 weights = (x_query - x0) / (x1 - x0) 

64 y_query = y0 + weights * (y1 - y0) 

65 

66 # Handle edge cases 

67 if left is None: 

68 y_query = np.where(x_query < x_ref[0], y_ref[0], y_query) 

69 if right is None: 

70 y_query = np.where(x_query > x_ref[-1], y_ref[-1], y_query) 

71 if isinstance(left, float): 

72 y_query = np.where(x_query < x_ref[0], left, y_query) 

73 if isinstance(right, float): 

74 y_query = np.where(x_query > x_ref[-1], right, y_query) 

75 

76 return y_query 

77 

78 

def interp_series(series, index_new, **interp1d_kwargs):
    """
    Interpolate a pandas.Series to a new index.

    Parameters
    ----------
    series : pandas.Series
        Series to interpolate.
    index_new : pandas.DatetimeIndex
        New index to interpolate to.
    interp1d_kwargs : dict, optional
        Keyword arguments passed to scipy.interpolate.interp1d. Default is {}.

    Returns
    -------
    numpy.ndarray
        Interpolated values at `index_new`. Note that a plain array is
        returned, not a pandas.Series.
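
    Examples
    --------
    A minimal sketch with daily data:

    >>> s = pd.Series([0.0, 10.0], index=pd.to_datetime(["2020-01-01", "2020-01-03"]))
    >>> interp_series(s, pd.to_datetime(["2020-01-02"]))
    array([5.])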

96 """ 

97 series = series[series.index.notna() & series.notna()] 

98 dt = (series.index - series.index[0]) / pd.to_timedelta(1, unit="D") 

99 dt_interp = (index_new - series.index[0]) / pd.to_timedelta(1, unit="D") 

100 interp_obj = interpolate.interp1d(dt, series.values, bounds_error=False, **interp1d_kwargs) 

101 return interp_obj(dt_interp) 

102 

103 

def diff(a, alignment="centered"):
    """Compute the cell widths for a given array of cell coordinates.

    If alignment is "centered", the coordinates are assumed to be centered in the cells.
    If alignment is "left", the coordinates are assumed to be at the left edge of the cells.
    If alignment is "right", the coordinates are assumed to be at the right edge of the cells.

    Parameters
    ----------
    a : array-like
        Input array.
    alignment : {"centered", "left", "right"}, optional
        Position of the coordinates within their cells. Default is "centered".

    Returns
    -------
    array
        Array with differences between elements.
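
    Examples
    --------
    A minimal sketch for the "left" and "right" alignments:

    >>> diff(np.array([0, 1, 2, 4]), alignment="left")
    array([1, 1, 2, 2])
    >>> diff(np.array([0, 1, 2, 4]), alignment="right")
    array([1, 1, 1, 2])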

120 """ 

121 if alignment == "centered": 

122 mid = a[:-1] + (a[1:] - a[:-1]) / 2 

123 return np.concatenate((a[[1]] - a[[0]], mid[1:] - mid[:-1], a[[-1]] - a[[-2]])) 

124 if alignment == "left": 

125 return np.concatenate((a[1:] - a[:-1], a[[-1]] - a[[-2]])) 

126 if alignment == "right": 

127 return np.concatenate((a[[1]] - a[[0]], a[1:] - a[:-1])) 

128 

129 msg = f"Invalid alignment: {alignment}" 

130 raise ValueError(msg) 

131 

132 

def linear_average(  # noqa: C901
    x_data: Sequence[float] | npt.NDArray[np.float64],
    y_data: Sequence[float] | npt.NDArray[np.float64],
    x_edges: Sequence[float] | npt.NDArray[np.float64],
    extrapolate_method: str = "nan",
) -> npt.NDArray[np.float64]:
    """
    Compute the average value of a piecewise linear time series between specified x-edges.

    Parameters
    ----------
    x_data : array-like
        x-coordinates of the time series data points, must be in ascending order.
    y_data : array-like
        y-coordinates of the time series data points.
    x_edges : array-like
        x-coordinates of the integration edges. Can be 1D or 2D.
        - If 1D: shape (n_edges,), must be in ascending order.
        - If 2D: shape (n_series, n_edges), each row must be in ascending order.
    extrapolate_method : str, optional
        Method for handling extrapolation. Default is 'nan'.
        - 'outer': Extrapolate using the outermost data points.
        - 'nan': Extrapolate using np.nan.
        - 'raise': Raise an error for out-of-bounds values.

    Returns
    -------
    numpy.ndarray
        2D array of average values between consecutive pairs of x_edges.
        Shape is (n_series, n_bins) where n_bins = n_edges - 1.
        If x_edges is 1D, n_series = 1.

    Examples
    --------
    Values in the printed output are rounded for readability:

    >>> x_data = [0, 1, 2, 3]
    >>> y_data = [0, 1, 1, 0]
    >>> x_edges = [0, 1.5, 3]
    >>> linear_average(x_data, y_data, x_edges)
    array([[0.667, 0.667]])

    >>> x_edges_2d = [[0, 1.5, 3], [0.5, 2, 3]]
    >>> linear_average(x_data, y_data, x_edges_2d)
    array([[0.667, 0.667], [0.917, 0.5]])
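
    A sketch of the extrapolation behavior (illustrative):

    >>> linear_average([0, 1], [0.0, 1.0], [-1, 0.5], extrapolate_method="outer")
    array([[0.25]])
    >>> linear_average([0, 1], [0.0, 1.0], [-1, 0.5])  # default 'nan'
    array([[nan]])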

178 """ 

179 # Convert inputs to numpy arrays 

180 x_data = np.asarray(x_data, dtype=float) 

181 y_data = np.asarray(y_data, dtype=float) 

182 x_edges = np.asarray(x_edges, dtype=float) 

183 

184 # Ensure x_edges is always 2D 

185 if x_edges.ndim == 1: 

186 x_edges = x_edges[np.newaxis, :] 

187 elif x_edges.ndim != 2: # noqa: PLR2004 

188 msg = "x_edges must be 1D or 2D array" 

189 raise ValueError(msg) 

190 

191 # Input validation 

192 if len(x_data) != len(y_data) or len(x_data) == 0: 

193 msg = "x_data and y_data must have the same length and be non-empty" 

194 raise ValueError(msg) 

195 if x_edges.shape[1] < 2: # noqa: PLR2004 

196 msg = "x_edges must contain at least 2 values in each row" 

197 raise ValueError(msg) 

198 if not np.all(np.diff(x_data) >= 0): 

199 msg = "x_data must be in ascending order" 

200 raise ValueError(msg) 

201 if not np.all(np.diff(x_edges, axis=1) >= 0): 

202 msg = "x_edges must be in ascending order along each row" 

203 raise ValueError(msg) 

204 

205 # Filter out NaN values 

206 show = ~np.isnan(x_data) & ~np.isnan(y_data) 

207 if show.sum() < 2: # noqa: PLR2004 

208 if show.sum() == 1 and extrapolate_method == "outer": 

209 # For single data point with outer extrapolation, use constant value 

210 constant_value = y_data[show][0] 

211 return np.full(shape=(x_edges.shape[0], x_edges.shape[1] - 1), fill_value=constant_value) 

212 return np.full(shape=(x_edges.shape[0], x_edges.shape[1] - 1), fill_value=np.nan) 

213 

214 x_data_clean = x_data[show] 

215 y_data_clean = y_data[show] 

216 

217 # Handle extrapolation for all series at once (vectorized) 

218 if extrapolate_method == "outer": 

219 edges_processed = np.clip(x_edges, x_data_clean.min(), x_data_clean.max()) 

220 elif extrapolate_method == "raise": 

221 if np.any(x_edges < x_data_clean.min()) or np.any(x_edges > x_data_clean.max()): 

222 msg = "x_edges must be within the range of x_data" 

223 raise ValueError(msg) 

224 edges_processed = x_edges.copy() 

225 else: # nan method 

226 edges_processed = x_edges.copy() 

227 

228 # Create a combined grid of all unique x points (data + all edges) 

229 all_unique_x = np.unique(np.concatenate([x_data_clean, edges_processed.ravel()])) 

230 

231 # Interpolate y values at all unique x points once 

232 all_unique_y = np.interp(all_unique_x, x_data_clean, y_data_clean, left=np.nan, right=np.nan) 

233 

234 # Compute cumulative integrals once using trapezoidal rule 

235 dx = np.diff(all_unique_x) 

236 y_avg = (all_unique_y[:-1] + all_unique_y[1:]) / 2 

237 segment_integrals = dx * y_avg 

238 # Replace NaN values with 0 to avoid breaking cumulative sum 

239 segment_integrals = np.nan_to_num(segment_integrals, nan=0.0) 

240 cumulative_integral = np.concatenate([[0], np.cumsum(segment_integrals)]) 

241 

242 # Vectorized computation for all series 

243 # Find indices of all edges in the combined grid 

244 edge_indices = np.searchsorted(all_unique_x, edges_processed) 

245 

246 # Compute integral between consecutive edges for all series (vectorized) 

247 integral_values = cumulative_integral[edge_indices[:, 1:]] - cumulative_integral[edge_indices[:, :-1]] 

248 

249 # Compute widths between consecutive edges for all series (vectorized) 

250 edge_widths = np.diff(edges_processed, axis=1) 

251 

252 # Handle zero-width intervals (vectorized) 

253 zero_width_mask = edge_widths == 0 

254 result = np.zeros_like(edge_widths) 

255 

256 # For non-zero width intervals, compute average = integral / width (vectorized) 

257 non_zero_mask = ~zero_width_mask 

258 result[non_zero_mask] = integral_values[non_zero_mask] / edge_widths[non_zero_mask] 

259 

260 # For zero-width intervals, interpolate y-value directly (vectorized) 

261 if np.any(zero_width_mask): 

262 zero_width_positions = edges_processed[:, :-1][zero_width_mask] 

263 result[zero_width_mask] = np.interp(zero_width_positions, x_data_clean, y_data_clean) 

264 

265 # Handle extrapolation when 'nan' method is used (vectorized) 

266 if extrapolate_method == "nan": 

267 # Set out-of-range bins to NaN 

268 bins_within_range = (x_edges[:, :-1] >= x_data_clean.min()) & (x_edges[:, 1:] <= x_data_clean.max()) 

269 result[~bins_within_range] = np.nan 

270 

271 return result 

272 

273 

def partial_isin(bin_edges_in, bin_edges_out):
    """
    Calculate the fraction of each input bin that overlaps with each output bin.

    This function computes a matrix where element (i, j) represents the fraction
    of input bin i that overlaps with output bin j. The computation uses
    vectorized operations to avoid loops.

    Parameters
    ----------
    bin_edges_in : array_like
        1D array of input bin edges in ascending order. For n_in bins, there
        should be n_in+1 edges.
    bin_edges_out : array_like
        1D array of output bin edges in ascending order. For n_out bins, there
        should be n_out+1 edges.

    Returns
    -------
    overlap_matrix : numpy.ndarray
        Dense matrix of shape (n_in, n_out) where n_in is the number of input
        bins and n_out is the number of output bins. Each element (i, j)
        represents the fraction of input bin i that overlaps with output bin j.
        Values range from 0 (no overlap) to 1 (complete overlap).

    Notes
    -----
    - Both input arrays must be sorted in ascending order
    - The function leverages the sorted nature of both inputs for efficiency
    - Uses vectorized operations to handle large bin arrays efficiently
    - All overlaps sum to 1.0 for each input bin when output bins fully cover input range

    Examples
    --------
    >>> bin_edges_in = np.array([0, 10, 20, 30])
    >>> bin_edges_out = np.array([5, 15, 25])
    >>> partial_isin(bin_edges_in, bin_edges_out)
    array([[0.5, 0.0],
           [0.5, 0.5],
           [0.0, 0.5]])
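
    Row sums are 1 when the output bins span the input range (illustrative check):

    >>> partial_isin([0, 10, 20, 30], [0, 15, 30]).sum(axis=1)
    array([1., 1., 1.])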

314 """ 

315 # Convert inputs to numpy arrays 

316 bin_edges_in = np.asarray(bin_edges_in, dtype=float) 

317 bin_edges_out = np.asarray(bin_edges_out, dtype=float) 

318 

319 # Validate inputs 

320 if bin_edges_in.ndim != 1 or bin_edges_out.ndim != 1: 

321 msg = "Both bin_edges_in and bin_edges_out must be 1D arrays" 

322 raise ValueError(msg) 

323 if len(bin_edges_in) < 2 or len(bin_edges_out) < 2: # noqa: PLR2004 

324 msg = "Both edge arrays must have at least 2 elements" 

325 raise ValueError(msg) 

326 

327 # Check ascending order, ignoring NaN values 

328 diffs_in = np.diff(bin_edges_in) 

329 valid_diffs_in = ~np.isnan(diffs_in) 

330 if np.any(valid_diffs_in) and not np.all(diffs_in[valid_diffs_in] > 0): 

331 msg = "bin_edges_in must be in ascending order" 

332 raise ValueError(msg) 

333 

334 diffs_out = np.diff(bin_edges_out) 

335 valid_diffs_out = ~np.isnan(diffs_out) 

336 if np.any(valid_diffs_out) and not np.all(diffs_out[valid_diffs_out] > 0): 

337 msg = "bin_edges_out must be in ascending order" 

338 raise ValueError(msg) 

339 

340 # Build matrix using fully vectorized approach 

341 # Create meshgrids for all possible input-output bin combinations 

342 in_left = bin_edges_in[:-1, None] # Shape: (n_bins_in, 1) 

343 in_right = bin_edges_in[1:, None] # Shape: (n_bins_in, 1) 

344 in_width = np.diff(bin_edges_in)[:, None] # Shape: (n_bins_in, 1) 

345 

346 out_left = bin_edges_out[None, :-1] # Shape: (1, n_bins_out) 

347 out_right = bin_edges_out[None, 1:] # Shape: (1, n_bins_out) 

348 

349 # Calculate overlaps for all combinations using broadcasting 

350 overlap_left = np.maximum(in_left, out_left) # Shape: (n_bins_in, n_bins_out) 

351 overlap_right = np.minimum(in_right, out_right) # Shape: (n_bins_in, n_bins_out) 

352 

353 # Calculate overlap widths (zero where no overlap) 

354 overlap_widths = np.maximum(0, overlap_right - overlap_left) 

355 

356 # Calculate fractions (NaN widths will result in NaN fractions) 

357 return overlap_widths / in_width 

358 

359 

def time_bin_overlap(tedges, bin_tedges):
    """
    Calculate the fraction of each time bin that overlaps with each time range.

    This function computes an array where element (i, j) represents the fraction
    of time bin j that overlaps with time range i. The computation uses
    vectorized operations to avoid loops.

    Parameters
    ----------
    tedges : array_like
        1D array of time bin edges in ascending order. For n bins, there
        should be n+1 edges.
    bin_tedges : list of tuples
        List of tuples where each tuple contains (start_time, end_time)
        defining a time range.

    Returns
    -------
    overlap_array : numpy.ndarray
        Array of shape (len(bin_tedges), n_bins) where n_bins is the number of
        time bins. Each element (i, j) represents the fraction of time bin j
        that overlaps with time range i. Values range from 0 (no overlap) to
        1 (complete overlap).

    Notes
    -----
    - tedges must be sorted in ascending order
    - Uses vectorized operations to handle large arrays efficiently
    - Time ranges in bin_tedges can be in any order and can overlap

    Examples
    --------
    >>> tedges = np.array([0, 10, 20, 30])
    >>> bin_tedges = [(5, 15), (25, 35)]
    >>> time_bin_overlap(tedges, bin_tedges)
    array([[0.5, 0.5, 0.0],
           [0.0, 0.0, 0.5]])
    """
    # Convert inputs to numpy arrays
    tedges = np.asarray(tedges)
    bin_tedges_array = np.asarray(bin_tedges)

    # Validate inputs
    if tedges.ndim != 1:
        msg = "tedges must be a 1D array"
        raise ValueError(msg)
    if len(tedges) < 2:  # noqa: PLR2004
        msg = "tedges must have at least 2 elements"
        raise ValueError(msg)
    if bin_tedges_array.size == 0:
        msg = "bin_tedges must be non-empty"
        raise ValueError(msg)

    # Calculate overlaps for all combinations using broadcasting
    overlap_left = np.maximum(bin_tedges_array[:, [0]], tedges[None, :-1])
    overlap_right = np.minimum(bin_tedges_array[:, [1]], tedges[None, 1:])
    overlap_widths = np.maximum(0, overlap_right - overlap_left)

    # Calculate fractions (handle division by zero for zero-width bins)
    bin_width_bc = np.diff(tedges)[None, :]  # Shape: (1, n_bins)

    return np.divide(
        overlap_widths, bin_width_bc, out=np.zeros_like(overlap_widths, dtype=float), where=bin_width_bc != 0.0
    )


def generate_failed_coverage_badge():
    """Generate a badge indicating failed coverage."""
    from genbadge import Badge  # type: ignore # noqa: PLC0415

    b = Badge(left_txt="coverage", right_txt="failed", color="red")
    b.write_to("coverage_failed.svg", use_shields=False)


def combine_bin_series(a, a_edges, b, b_edges, extrapolation=0.0):
    """
    Combine two binned series onto a common set of unique edges.

    This function takes two binned series (a and b) with their respective bin edges
    and creates new series (c and d) that are defined on a combined set of unique
    edges from both input edge arrays.

    Parameters
    ----------
    a : array-like
        Values for the first binned series.
    a_edges : array-like
        Bin edges for the first series. Must have len(a) + 1 elements.
    b : array-like
        Values for the second binned series.
    b_edges : array-like
        Bin edges for the second series. Must have len(b) + 1 elements.
    extrapolation : str or float, optional
        Method for handling combined bins that fall outside the original series ranges.
        - 'nearest': Use the nearest original bin value
        - float value (e.g., np.nan, 0.0): Fill with the specified value (default: 0.0)

    Returns
    -------
    c : numpy.ndarray
        Values from series a mapped to the combined edge structure.
    c_edges : numpy.ndarray
        Combined unique edges from a_edges and b_edges.
    d : numpy.ndarray
        Values from series b mapped to the combined edge structure.
    d_edges : numpy.ndarray
        Combined unique edges from a_edges and b_edges (same as c_edges).

    Notes
    -----
    The combined edges are created by taking the union of all unique values
    from both a_edges and b_edges, sorted in ascending order. The values
    are then broadcast/repeated for each combined bin that falls within
    the original bin's range.
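
    Examples
    --------
    A minimal sketch; with the default ``extrapolation=0.0``, combined bins
    outside a series' range are filled with 0.0:

    >>> c, c_edges, d, d_edges = combine_bin_series(
    ...     [1.0, 2.0], [0.0, 1.0, 2.0], [10.0], [0.5, 1.5]
    ... )
    >>> c_edges
    array([0. , 0.5, 1. , 1.5, 2. ])
    >>> c
    array([1., 1., 2., 2.])
    >>> d
    array([ 0., 10., 10.,  0.])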

475 """ 

476 # Convert inputs to numpy arrays 

477 a = np.asarray(a, dtype=float) 

478 a_edges = np.asarray(a_edges, dtype=float) 

479 b = np.asarray(b, dtype=float) 

480 b_edges = np.asarray(b_edges, dtype=float) 

481 

482 # Validate inputs 

483 if len(a_edges) != len(a) + 1: 

484 msg = "a_edges must have len(a) + 1 elements" 

485 raise ValueError(msg) 

486 if len(b_edges) != len(b) + 1: 

487 msg = "b_edges must have len(b) + 1 elements" 

488 raise ValueError(msg) 

489 

490 # Create combined unique edges 

491 combined_edges = np.unique(np.concatenate([a_edges, b_edges])) 

492 

493 # Initialize output arrays 

494 c = np.zeros(len(combined_edges) - 1) 

495 d = np.zeros(len(combined_edges) - 1) 

496 

497 # Vectorized mapping using searchsorted - find which original bin each combined bin belongs to 

498 # For series a: find which original bin each combined bin center falls into 

499 combined_bin_centers = (combined_edges[:-1] + combined_edges[1:]) / 2 

500 a_bin_assignment = np.searchsorted(a_edges, combined_bin_centers, side="right") - 1 

501 a_bin_assignment = np.clip(a_bin_assignment, 0, len(a) - 1) 

502 

503 # Handle extrapolation for series a 

504 if extrapolation == "nearest": 

505 # Assign all values using nearest neighbor (already clipped) 

506 c[:] = a[a_bin_assignment] 

507 else: 

508 # Only assign values where the combined bin is completely within the original bin 

509 a_valid_mask = (combined_edges[:-1] >= a_edges[a_bin_assignment]) & ( 

510 combined_edges[1:] <= a_edges[a_bin_assignment + 1] 

511 ) 

512 c[a_valid_mask] = a[a_bin_assignment[a_valid_mask]] 

513 # Fill out-of-range bins with extrapolation value 

514 c[~a_valid_mask] = extrapolation 

515 

516 # Handle extrapolation for series b 

517 b_bin_assignment = np.searchsorted(b_edges, combined_bin_centers, side="right") - 1 

518 b_bin_assignment = np.clip(b_bin_assignment, 0, len(b) - 1) 

519 

520 if extrapolation == "nearest": 

521 # Assign all values using nearest neighbor (already clipped) 

522 d[:] = b[b_bin_assignment] 

523 else: 

524 # Only assign values where the combined bin is completely within the original bin 

525 b_valid_mask = (combined_edges[:-1] >= b_edges[b_bin_assignment]) & ( 

526 combined_edges[1:] <= b_edges[b_bin_assignment + 1] 

527 ) 

528 d[b_valid_mask] = b[b_bin_assignment[b_valid_mask]] 

529 # Fill out-of-range bins with extrapolation value 

530 d[~b_valid_mask] = extrapolation 

531 

532 # Return the combined series 

533 c_edges = combined_edges 

534 d_edges = combined_edges.copy() 

535 

536 return c, c_edges, d, d_edges 

537 

538 

def compute_time_edges(tedges, tstart, tend, number_of_bins):
    """
    Compute time edges for binning data based on provided time parameters.

    This function creates a DatetimeIndex of time bin edges from one of three possible
    input formats: explicit edges, start times, or end times. The resulting edges
    define the boundaries of time intervals for data binning.

    Provide either explicit time edges, or the start times, or the end times of
    the bins, and leave the other two arguments as None.

    Parameters
    ----------
    tedges : pandas.DatetimeIndex or None
        Explicit time edges for the bins. If provided, must have one more element
        than the number of bins (n_bins + 1). Takes precedence over tstart and tend.
    tstart : pandas.DatetimeIndex or None
        Start times for each bin. Must have the same number of elements as the
        number of bins. Used when tedges is None.
    tend : pandas.DatetimeIndex or None
        End times for each bin. Must have the same number of elements as the
        number of bins. Used when both tedges and tstart are None.
    number_of_bins : int
        The expected number of time bins. Used for validation against the provided
        time parameters.

    Returns
    -------
    pandas.DatetimeIndex
        Time edges defining the boundaries of the time bins. Has one more element
        than number_of_bins.

    Raises
    ------
    ValueError
        If tedges has incorrect length (not number_of_bins + 1).
        If tstart has incorrect length (not equal to number_of_bins).
        If tend has incorrect length (not equal to number_of_bins).
        If none of tedges, tstart, or tend are provided.

    Notes
    -----
    - When using tstart, the function assumes uniform spacing and extrapolates
      the final edge based on the spacing between the last two start times.
    - When using tend, the function assumes uniform spacing and extrapolates
      the first edge based on the spacing between the first two end times.
    - All input time data is converted to pandas.DatetimeIndex for consistency.
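
    Examples
    --------
    A minimal sketch using start times; the final edge is extrapolated from the
    spacing of the last two start times:

    >>> tstart = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])
    >>> edges = compute_time_edges(tedges=None, tstart=tstart, tend=None, number_of_bins=3)
    >>> edges[-1]
    Timestamp('2020-01-04 00:00:00')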

585 """ 

586 if tedges is not None: 

587 tedges = pd.DatetimeIndex(tedges) 

588 if number_of_bins != len(tedges) - 1: 

589 msg = "tedges must have one more element than flow" 

590 raise ValueError(msg) 

591 

592 elif tstart is not None: 

593 # Assume the index refers to the time at the start of the measurement interval 

594 tstart = pd.DatetimeIndex(tstart) 

595 if number_of_bins != len(tstart): 

596 msg = "tstart must have the same number of elements as flow" 

597 raise ValueError(msg) 

598 

599 tedges = tstart.append(tstart[[-1]] + (tstart[-1] - tstart[-2])) 

600 

601 elif tend is not None: 

602 # Assume the index refers to the time at the end of the measurement interval 

603 tend = pd.DatetimeIndex(tend) 

604 if number_of_bins != len(tend): 

605 msg = "tend must have the same number of elements as flow" 

606 raise ValueError(msg) 

607 

608 tedges = (tend[[0]] - (tend[1] - tend[0])).append(tend) 

609 

610 else: 

611 msg = "Either provide tedges, tstart, and tend" 

612 raise ValueError(msg) 

613 return tedges 

614 

615 

def get_soil_temperature(station_number: int = 260, *, interpolate_missing_values: bool = True) -> pd.DataFrame:
    """
    Download soil temperature data from the KNMI and return it as a pandas DataFrame.

    The data is available for the following KNMI weather stations:
    - 260: De Bilt, the Netherlands (from 1981)
    - 273: Marknesse, the Netherlands (from 1989)
    - 286: Nieuw Beerta, the Netherlands (from 1990)
    - 323: Wilhelminadorp, the Netherlands (from 1989)

    TB1 = soil temperature at 5 cm depth (degrees Celsius) at the time of observation
    TB2 = soil temperature at 10 cm depth (degrees Celsius) at the time of observation
    TB3 = soil temperature at 20 cm depth (degrees Celsius) at the time of observation
    TB4 = soil temperature at 50 cm depth (degrees Celsius) at the time of observation
    TB5 = soil temperature at 100 cm depth (degrees Celsius) at the time of observation
    TNB1 = minimum soil temperature at 5 cm depth over the preceding 6 hours (degrees Celsius)
    TNB2 = minimum soil temperature at 10 cm depth over the preceding 6 hours (degrees Celsius)
    TXB1 = maximum soil temperature at 5 cm depth over the preceding 6 hours (degrees Celsius)
    TXB2 = maximum soil temperature at 10 cm depth over the preceding 6 hours (degrees Celsius)

    Parameters
    ----------
    station_number : int, {260, 273, 286, 323}
        The KNMI station number for which to download soil temperature data.
        Default is 260 (De Bilt).
    interpolate_missing_values : bool, optional
        If True, missing values are interpolated and recent NaN values are extrapolated with the previous value.
        If False, missing values remain as NaN. Default is True.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing soil temperature data in Celsius with a DatetimeIndex.
        Columns include TB1, TB2, TB3, TB4, TB5, TNB1, TNB2, TXB1, TXB2.

    Notes
    -----
    - KNMI: Royal Netherlands Meteorological Institute
    - The timeseries may contain NaN values for missing data.
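
    Examples
    --------
    A sketch of typical usage (requires internet access, so skipped in doctests):

    >>> df = get_soil_temperature(260)  # doctest: +SKIP
    >>> df["TB5"].resample("D").mean()  # daily mean at 100 cm depth  # doctest: +SKIP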

655 """ 

656 # File-based daily cache 

657 cache_dir.mkdir(exist_ok=True) 

658 

659 today = date.today().isoformat() # noqa: DTZ011 

660 cache_path = cache_dir / f"soil_temp_{station_number}_{interpolate_missing_values}_{today}.pkl" 

661 

662 # Check if cached file exists and is from today 

663 if cache_path.exists(): 

664 return pd.read_pickle(cache_path) # noqa: S301 

665 

666 # Clean up old cache files to prevent disk bloat 

667 for old_file in cache_dir.glob(f"soil_temp_{station_number}_{interpolate_missing_values}_*.pkl"): 

668 old_file.unlink(missing_ok=True) 

669 

670 url = f"https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/bodemtemps/bodemtemps_{station_number}.zip" 

671 

672 dtypes = { 

673 "YYYYMMDD": "int32", 

674 "HH": "int8", 

675 " TB1": "float32", 

676 " TB3": "float32", 

677 " TB2": "float32", 

678 " TB4": "float32", 

679 " TB5": "float32", 

680 " TNB1": "float32", 

681 " TNB2": "float32", 

682 " TXB1": "float32", 

683 " TXB2": "float32", 

684 } 

685 

686 # Download the ZIP file 

687 with requests.get(url, params={"download": "zip"}, timeout=10) as response: 

688 response.raise_for_status() 

689 

690 df = pd.read_csv( 

691 io.BytesIO(response.content), 

692 compression="zip", 

693 dtype=dtypes, 

694 usecols=dtypes.keys(), 

695 skiprows=16, 

696 sep=",", 

697 index_col=None, 

698 na_values=[" "], 

699 engine="c", 

700 parse_dates=False, 

701 ) 

702 

703 df.index = pd.to_datetime(df["YYYYMMDD"].values, format=r"%Y%m%d").tz_localize("UTC") + pd.to_timedelta( 

704 df["HH"].values, unit="h" 

705 ) 

706 

707 df.drop(columns=["YYYYMMDD", "HH"], inplace=True) 

708 df.columns = df.columns.str.strip() 

709 df /= 10.0 

710 

711 if interpolate_missing_values: 

712 # Fill NaN values with interpolate linearly and then forward fill 

713 df.interpolate(method="linear", inplace=True) 

714 df.ffill(inplace=True) 

715 

716 # Save to cache for future use 

717 df.to_pickle(cache_path) 

718 return df 

719 

720 

def solve_underdetermined_system(
    coefficient_matrix,
    rhs_vector,
    *,
    nullspace_objective="squared_differences",
    optimization_method="BFGS",
):
    """
    Solve an underdetermined linear system with nullspace regularization.

    For an underdetermined system Ax = b where A has more columns than rows,
    multiple solutions exist. This function computes a least-squares solution
    and then selects a specific solution from the nullspace based on a
    regularization objective.

    Parameters
    ----------
    coefficient_matrix : array_like
        Coefficient matrix of shape (m, n) where m < n (underdetermined).
        May contain NaN values in some rows, which will be excluded from the system.
    rhs_vector : array_like
        Right-hand side vector of length m. May contain NaN values corresponding
        to NaN rows in coefficient_matrix, which will be excluded from the system.
    nullspace_objective : str or callable, optional
        Objective function to minimize in the nullspace. Options:

        * "squared_differences" : Minimize sum of squared differences between
          adjacent elements: sum((x[i+1] - x[i])**2)
        * "summed_differences" : Minimize sum of absolute differences between
          adjacent elements: sum(|x[i+1] - x[i]|)
        * callable : Custom objective function with signature
          ``objective(coeffs, x_ls, nullspace_basis)`` where:

          - coeffs : optimization variables (nullspace coefficients)
          - x_ls : least-squares solution
          - nullspace_basis : nullspace basis matrix

        Default is "squared_differences".
    optimization_method : str, optional
        Optimization method passed to scipy.optimize.minimize.
        Default is "BFGS".

    Returns
    -------
    numpy.ndarray
        Solution vector that minimizes the specified nullspace objective.
        Has length n (number of columns in coefficient_matrix).

    Raises
    ------
    ValueError
        If optimization fails, if coefficient_matrix and rhs_vector have incompatible shapes,
        or if an unknown nullspace objective is specified.

    Notes
    -----
    The algorithm follows these steps:

    1. Remove rows with NaN values from both coefficient_matrix and rhs_vector
    2. Compute least-squares solution: x_ls = pinv(valid_matrix) @ valid_rhs
    3. Compute nullspace basis: N = null_space(valid_matrix)
    4. Find nullspace coefficients: coeffs = argmin objective(x_ls + N @ coeffs)
    5. Return final solution: x = x_ls + N @ coeffs

    For the built-in objectives:

    * "squared_differences" provides smooth solutions, minimizing rapid changes
    * "summed_differences" provides sparse solutions, promoting piecewise constant behavior

    Examples
    --------
    Basic usage with default squared differences objective:

    >>> import numpy as np
    >>> from gwtransport.utils import solve_underdetermined_system
    >>>
    >>> # Create underdetermined system (2 equations, 4 unknowns)
    >>> matrix = np.array([[1, 2, 1, 0], [0, 1, 2, 1]])
    >>> rhs = np.array([3, 4])
    >>>
    >>> # Solve with squared differences regularization
    >>> x = solve_underdetermined_system(matrix, rhs)
    >>> print(f"Solution: {x}")
    >>> print(f"Residual: {np.linalg.norm(matrix @ x - rhs):.2e}")

    With summed differences objective:

    >>> x_sparse = solve_underdetermined_system(
    ...     matrix, rhs, nullspace_objective="summed_differences"
    ... )

    With custom objective function:

    >>> def custom_objective(coeffs, x_ls, nullspace_basis):
    ...     x = x_ls + nullspace_basis @ coeffs
    ...     return np.sum(x**2)  # Minimize L2 norm
    >>>
    >>> x_custom = solve_underdetermined_system(
    ...     matrix, rhs, nullspace_objective=custom_objective
    ... )

    Handling NaN values:

    >>> # System with missing data
    >>> matrix_nan = np.array([
    ...     [1, 2, 1, 0],
    ...     [np.nan, np.nan, np.nan, np.nan],
    ...     [0, 1, 2, 1],
    ... ])
    >>> rhs_nan = np.array([3, np.nan, 4])
    >>>
    >>> x_nan = solve_underdetermined_system(matrix_nan, rhs_nan)
    """
    matrix = np.asarray(coefficient_matrix)
    rhs = np.asarray(rhs_vector)

    if matrix.shape[0] != len(rhs):
        msg = f"coefficient_matrix has {matrix.shape[0]} rows but rhs_vector has {len(rhs)} elements"
        raise ValueError(msg)

    # Identify valid rows (no NaN values in either matrix or rhs)
    valid_rows = ~np.isnan(matrix).any(axis=1) & ~np.isnan(rhs)

    if not np.any(valid_rows):
        msg = "No valid rows found (all contain NaN values)"
        raise ValueError(msg)

    valid_matrix = matrix[valid_rows]
    valid_rhs = rhs[valid_rows]

    # Compute least-squares solution
    x_ls, *_ = np.linalg.lstsq(valid_matrix, valid_rhs, rcond=None)

    # Compute nullspace
    nullspace_basis = null_space(valid_matrix, rcond=None)
    nullrank = nullspace_basis.shape[1]

    if nullrank == 0:
        # System is fully determined; return the least-squares solution
        return x_ls

    # Optimize in the nullspace
    coeffs = _optimize_nullspace_coefficients(x_ls, nullspace_basis, nullspace_objective, optimization_method)

    return x_ls + nullspace_basis @ coeffs


def _optimize_nullspace_coefficients(x_ls, nullspace_basis, nullspace_objective, optimization_method):
    """Optimize coefficients in the nullspace to minimize the objective."""
    nullrank = nullspace_basis.shape[1]
    objective_func = _get_nullspace_objective_function(nullspace_objective)
    coeffs_0 = np.zeros(nullrank)

    # For stability, always start with squared differences if using summed differences
    if nullspace_objective == "summed_differences":
        res_init = minimize(
            _squared_differences_objective,
            x0=coeffs_0,
            args=(x_ls, nullspace_basis),
            method=optimization_method,
        )
        if not res_init.success:
            msg = f"Initial optimization failed: {res_init.message}"
            raise ValueError(msg)
        coeffs_0 = res_init.x

    # Final optimization with the target objective
    res = minimize(
        objective_func,
        x0=coeffs_0,
        args=(x_ls, nullspace_basis),
        method=optimization_method,
    )

    if not res.success:
        msg = f"Optimization failed: {res.message}"
        raise ValueError(msg)

    return res.x


def _squared_differences_objective(coeffs, x_ls, nullspace_basis):
    """Minimize sum of squared differences between adjacent elements."""
    x = x_ls + nullspace_basis @ coeffs
    return np.sum(np.square(x[1:] - x[:-1]))


def _summed_differences_objective(coeffs, x_ls, nullspace_basis):
    """Minimize sum of absolute differences between adjacent elements."""
    x = x_ls + nullspace_basis @ coeffs
    return np.sum(np.abs(x[1:] - x[:-1]))


def _get_nullspace_objective_function(nullspace_objective):
    """Get the objective function for nullspace optimization."""
    if nullspace_objective == "squared_differences":
        return _squared_differences_objective
    if nullspace_objective == "summed_differences":
        return _summed_differences_objective
    if callable(nullspace_objective):
        return nullspace_objective
    msg = f"Unknown nullspace objective: {nullspace_objective}"
    raise ValueError(msg)