Coverage for src / gwtransport / examples.py: 84%

61 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-04 18:51 +0000

1""" 

2Example Data Generation for Groundwater Transport Modeling. 

3 

4This module provides utilities to generate synthetic datasets for demonstrating 

5and testing groundwater transport models. It creates realistic flow patterns, 

6temperature/concentration time series, and deposition events suitable for testing 

7advection, diffusion, and deposition analysis functions. 

8 

9Available functions: 

10 

11- :func:`generate_example_data` - Generate comprehensive synthetic dataset with flow and 

12 temperature time series. Creates seasonal flow patterns with optional spill events, 

13 temperature data via synthetic sinusoidal patterns or real KNMI soil temperature, and 

14 extracted temperature computed through gamma-distributed pore volume transport. Returns 

15 DataFrame with flow, temp_infiltration, temp_extraction columns plus attrs containing 

16 generation parameters and aquifer properties. Temperature generation methods: 'synthetic' 

17 (seasonal sinusoidal pattern), 'constant' (constant temperature with noise), or 

18 'soil_temperature' (real data from KNMI station 260). 

19 

20- :func:`generate_example_deposition_timeseries` - Generate synthetic deposition time series 

21 for pathogen/contaminant deposition analysis. Combines baseline deposition, seasonal patterns, 

22 random noise, and episodic contamination events with exponential decay. Returns Series with 

23 deposition rates [ng/m²/day] and attrs containing generation parameters. Useful for testing 

24 extraction_to_deposition deconvolution and deposition_to_extraction convolution functions. 

25 

26This file is part of gwtransport which is released under AGPL-3.0 license. 

27See the ./LICENSE file or go to https://github.com/gwtransport/gwtransport/blob/main/LICENSE for full license details. 

28""" 

29 

30import numpy as np 

31import numpy.typing as npt 

32import pandas as pd 

33 

34from gwtransport.advection import gamma_infiltration_to_extraction 

35from gwtransport.gamma import mean_std_to_alpha_beta 

36from gwtransport.utils import compute_time_edges, get_soil_temperature 

37 

38 

39def generate_example_data( 

40 *, 

41 date_start: str = "2020-01-01", 

42 date_end: str = "2021-12-31", 

43 date_freq: str = "D", 

44 flow_mean: float = 100.0, # m3/day 

45 flow_amplitude: float = 30.0, # m3/day 

46 flow_noise: float = 10.0, # m3/day 

47 temp_infiltration_method: str = "synthetic", # Method for generating infiltration temperature 

48 temp_infiltration_mean: float = 12.0, # °C 

49 temp_infiltration_amplitude: float = 8.0, # °C 

50 temp_measurement_noise: float = 1.0, # °C 

51 aquifer_pore_volume_gamma_mean: float = 1000.0, # m3 

52 aquifer_pore_volume_gamma_std: float = 200.0, # m3 

53 aquifer_pore_volume_gamma_nbins: int = 250, # Discretization resolution 

54 retardation_factor: float = 1.0, 

55) -> tuple[pd.DataFrame, pd.DatetimeIndex]: 

56 """ 

57 Generate synthetic temperature and flow data for groundwater transport examples. 

58 

59 Parameters 

60 ---------- 

61 date_start, date_end : str 

62 Start and end dates for the generated time series (YYYY-MM-DD). 

63 date_freq : str, default "D" 

64 Frequency string for pandas.date_range (default 'D'). 

65 flow_mean : float, default 100.0 

66 Mean flow rate in m3/day 

67 flow_amplitude : float, default 30.0 

68 Seasonal amplitude of flow rate in m3/day 

69 flow_noise : float, default 10.0 

70 Random noise level for flow rate in m3/day 

71 temp_infiltration_method : str, default "synthetic" 

72 Method for generating infiltration temperature. Options: 

73 - "synthetic": Seasonal pattern. temp_infiltration_mean and temp_infiltration_amplitude define the pattern. temp_measurement_noise is applied. 

74 - "constant": Constant temperature equal to temp_infiltration_mean. temp_measurement_noise is still applied. 

75 - "soil_temperature": Use real soil temperature data from KNMI station 

76 temp_infiltration_mean : float, default 12.0 

77 Mean temperature of infiltrating water in °C 

78 temp_infiltration_amplitude : float, default 8.0 

79 Seasonal amplitude of infiltration temperature in °C (only used for "synthetic" method) 

80 temp_measurement_noise : float, default 1.0 

81 Random noise level for infiltration temperature in °C 

82 aquifer_pore_volume_gamma_mean : float, default 1000.0 

83 Mean pore volume of the aquifer gamma distribution in m3 

84 aquifer_pore_volume_gamma_std : float, default 200.0 

85 Standard deviation of aquifer pore volume gamma distribution in m3 

86 aquifer_pore_volume_gamma_nbins : int, default 250 

87 Number of bins to discretize the aquifer pore volume gamma distribution 

88 retardation_factor : float, default 1.0 

89 Retardation factor for temperature transport 

90 

91 Returns 

92 ------- 

93 tuple 

94 A tuple containing: 

95 - pandas.DataFrame: DataFrame with columns 'flow', 'temp_infiltration', 'temp_extraction' 

96 and metadata attributes for the aquifer parameters 

97 - pandas.DatetimeIndex: Time edges (tedges) used for the flow calculations 

98 

99 Raises 

100 ------ 

101 ValueError 

102 If temp_infiltration_method is not one of the supported methods 

103 """ 

104 # Create date range 

105 dates = pd.date_range(start=date_start, end=date_end, freq=date_freq).tz_localize("UTC") 

106 days = (dates - dates[0]).days.values 

107 

108 # Generate flow data with seasonal pattern (higher in winter) 

109 seasonal_flow = flow_mean + flow_amplitude * np.sin(2 * np.pi * days / 365 + np.pi) 

110 flow = seasonal_flow + np.random.normal(0, flow_noise, len(dates)) 

111 flow = np.maximum(flow, 5.0) # Ensure flow is not too small or negative 

112 

113 min_days_for_spills = 60 

114 if len(dates) > min_days_for_spills: # Only add spills for longer time series 

115 n_spills = np.random.randint(6, 16) 

116 for _ in range(n_spills): 

117 spill_start = np.random.randint(0, len(dates) - 30) 

118 spill_duration = np.random.randint(15, 45) 

119 spill_magnitude = np.random.uniform(2.0, 5.0) 

120 

121 flow[spill_start : spill_start + spill_duration] /= spill_magnitude 

122 

123 # Generate infiltration temperature 

124 if temp_infiltration_method == "synthetic": 

125 # Seasonal pattern with noise 

126 infiltration_temp = temp_infiltration_mean + temp_infiltration_amplitude * np.sin(2 * np.pi * days / 365) 

127 infiltration_temp += np.random.normal(0, temp_measurement_noise, len(dates)) 

128 elif temp_infiltration_method == "constant": 

129 # Constant temperature 

130 infiltration_temp = np.full(len(dates), temp_infiltration_mean) 

131 infiltration_temp += np.random.normal(0, temp_measurement_noise, len(dates)) 

132 elif temp_infiltration_method == "soil_temperature": 

133 # Use soil temperature data already includes measurement noise 

134 infiltration_temp = ( 

135 get_soil_temperature( 

136 station_number=260, # Example station number 

137 interpolate_missing_values=True, 

138 )["TB3"] 

139 .resample(date_freq) 

140 .mean()[dates] 

141 .values 

142 ) 

143 else: 

144 msg = f"Unknown temperature method: {temp_infiltration_method}" 

145 raise ValueError(msg) 

146 

147 # Compute tedges for the flow series 

148 tedges = compute_time_edges(tedges=None, tstart=None, tend=dates, number_of_bins=len(dates)) 

149 

150 temp_extraction = gamma_infiltration_to_extraction( 

151 cin=infiltration_temp, 

152 flow=flow, 

153 tedges=tedges, 

154 cout_tedges=tedges, 

155 mean=aquifer_pore_volume_gamma_mean, # Use mean pore volume 

156 std=aquifer_pore_volume_gamma_std, # Use standard deviation for heterogeneity 

157 n_bins=aquifer_pore_volume_gamma_nbins, # Discretization resolution 

158 retardation_factor=retardation_factor, 

159 ) 

160 

161 # Add some noise to represent measurement errors 

162 temp_extraction += np.random.normal(0, temp_measurement_noise, len(dates)) 

163 

164 # Create data frame 

165 alpha, beta = mean_std_to_alpha_beta(mean=aquifer_pore_volume_gamma_mean, std=aquifer_pore_volume_gamma_std) 

166 df = pd.DataFrame( 

167 data={"flow": flow, "temp_infiltration": infiltration_temp, "temp_extraction": temp_extraction}, 

168 index=dates, 

169 ) 

170 df.attrs = { 

171 "description": "Example data for groundwater transport modeling", 

172 "source": "Synthetic data generated by gwtransport.examples.generate_example_data", 

173 "aquifer_pore_volume_gamma_mean": aquifer_pore_volume_gamma_mean, 

174 "aquifer_pore_volume_gamma_std": aquifer_pore_volume_gamma_std, 

175 "aquifer_pore_volume_gamma_alpha": alpha, 

176 "aquifer_pore_volume_gamma_beta": beta, 

177 "aquifer_pore_volume_gamma_nbins": aquifer_pore_volume_gamma_nbins, 

178 "retardation_factor": retardation_factor, 

179 "date_start": date_start, 

180 "date_end": date_end, 

181 "date_freq": date_freq, 

182 "flow_mean": flow_mean, 

183 "flow_amplitude": flow_amplitude, 

184 "flow_noise": flow_noise, 

185 "temp_infiltration_method": temp_infiltration_method, 

186 "temp_infiltration_mean": temp_infiltration_mean, 

187 "temp_infiltration_amplitude": temp_infiltration_amplitude, 

188 "temp_measurement_noise": temp_measurement_noise, 

189 } 

190 return df, tedges 

191 

192 

193def generate_example_deposition_timeseries( 

194 *, 

195 date_start: str = "2018-01-01", 

196 date_end: str = "2023-12-31", 

197 freq: str = "D", 

198 base: float = 0.8, 

199 seasonal_amplitude: float = 0.3, 

200 noise_scale: float = 0.1, 

201 event_dates: npt.ArrayLike | pd.DatetimeIndex | None = None, 

202 event_magnitude: float = 3.0, 

203 event_duration: int = 30, 

204 event_decay_scale: float = 10.0, 

205 ensure_non_negative: bool = True, 

206) -> tuple[pd.Series, pd.DatetimeIndex]: 

207 """ 

208 Generate synthetic deposition timeseries for groundwater transport examples. 

209 

210 Parameters 

211 ---------- 

212 date_start, date_end : str 

213 Start and end dates for the generated time series (YYYY-MM-DD). 

214 freq : str 

215 Frequency string for pandas.date_range (default 'D'). 

216 base : float 

217 Baseline deposition rate (ng/m^2/day). 

218 seasonal_amplitude : float 

219 Amplitude of the annual seasonal sinusoidal pattern. 

220 noise_scale : float 

221 Standard deviation scale for Gaussian noise added to the signal. 

222 event_dates : list-like or None 

223 Dates (strings or pandas-compatible) at which to place episodic events. If None, 

224 a sensible default list is used. 

225 event_magnitude : float 

226 Peak magnitude multiplier for events. 

227 event_duration : int 

228 Duration of each event in days. 

229 event_decay_scale : float 

230 Decay scale used in the exponential decay for event time series. 

231 ensure_non_negative : bool 

232 If True, negative values are clipped to zero. 

233 

234 Returns 

235 ------- 

236 pandas.Series 

237 Time series of deposition values indexed by daily timestamps. 

238 """ 

239 # Create synthetic deposition time series - needs to match flow period 

240 dates = pd.date_range(date_start, date_end, freq=freq).tz_localize("UTC") 

241 n_dates = len(dates) 

242 tedges = compute_time_edges(tedges=None, tstart=None, tend=dates, number_of_bins=n_dates) 

243 

244 # Base deposition rate with seasonal and event patterns 

245 seasonal_pattern = seasonal_amplitude * np.sin(2 * np.pi * np.arange(n_dates) / 365.25) 

246 noise = noise_scale * np.random.normal(0, 1, n_dates) 

247 

248 # Default event dates if not provided 

249 if event_dates is None: 

250 event_dates = ["2020-06-15", "2021-03-20", "2021-09-10", "2022-07-05"] 

251 # Convert to DatetimeIndex - handles list, array, or DatetimeIndex input 

252 if isinstance(event_dates, pd.DatetimeIndex): 

253 event_dates_index = event_dates 

254 else: 

255 # Convert ArrayLike to list for pd.to_datetime 

256 event_dates_list = event_dates if isinstance(event_dates, list) else list(np.asarray(event_dates)) 

257 event_dates_index = pd.DatetimeIndex(pd.to_datetime(event_dates_list)) 

258 

259 event = np.zeros(n_dates) 

260 for event_date in event_dates_index: 

261 event_idx = dates.get_indexer([event_date], method="nearest")[0] 

262 event_indices = np.arange(event_idx, min(event_idx + event_duration, n_dates)) 

263 decay_pattern = event_magnitude * np.exp(-np.arange(len(event_indices)) / event_decay_scale) 

264 event[event_indices] += decay_pattern 

265 

266 # Combine all components 

267 total = base + seasonal_pattern + noise + event 

268 if ensure_non_negative: 

269 total = np.maximum(total, 0.0) 

270 

271 series = pd.Series(data=total, index=dates, name="deposition") 

272 series.attrs = { 

273 "description": "Example deposition time series for groundwater transport modeling", 

274 "source": "Synthetic data generated by gwtransport.examples.generate_example_deposition_timeseries", 

275 "base": base, 

276 "seasonal_amplitude": seasonal_amplitude, 

277 "noise_scale": noise_scale, 

278 "event_dates": [str(d.date()) for d in event_dates_index], 

279 "event_magnitude": event_magnitude, 

280 "event_duration": event_duration, 

281 "event_decay_scale": event_decay_scale, 

282 "date_start": date_start, 

283 "date_end": date_end, 

284 "date_freq": freq, 

285 } 

286 

287 # Create deposition series 

288 return series, tedges