Skip to content

DataManager

DataManager

Provide functionality for handling data processing and workflows.

Source code in pySWATPlus/data_manager.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
class DataManager:

    '''
    Provide functionality for handling data processing and workflows.
    '''

    def simulated_timeseries_df(
        self,
        sim_file: str | pathlib.Path,
        has_units: bool,
        begin_date: typing.Optional[str] = None,
        end_date: typing.Optional[str] = None,
        ref_day: typing.Optional[int] = None,
        ref_month: typing.Optional[int] = None,
        apply_filter: typing.Optional[dict[str, list[typing.Any]]] = None,
        usecols: typing.Optional[list[str]] = None,
        json_file: typing.Optional[str | pathlib.Path] = None
    ) -> pandas.DataFrame:
        '''
        Extract data from a simulation output file and return a time series `DataFrame`.
        A new `date` column is constructed using `datetime.date` objects from the `yr`, `mon`, and `day` columns.

        Args:
            sim_file (str | pathlib.Path): Path to the input file containing time series data generated by
                the method [`run_swat`](https://swat-model.github.io/pySWATPlus/api/txtinout_reader/#pySWATPlus.TxtinoutReader.run_swat).
                The file must contain `yr`, `mon`, and `day` columns.

            has_units (bool): If `True`, the third line of the input file contains column units.

            begin_date (str): Start date in `DD-Mon-YYYY` format (e.g., '01-Jan-2012'), inclusive.
                If `None` (default), the earliest available date is used.

            end_date (str): End date in `DD-Mon-YYYY` format (e.g., '31-Dec-2015'), inclusive.
                If `None` (default), the latest available date is used.

            ref_day (int): Reference day for monthly and yearly time series after filtering by `begin_date` and `end_date`.
                For example, `2012-01-31` and `2012-02-29` become `2012-01-15` and `2012-02-15` when `ref_day=15`.
                If `None` (default), the last day of the month or year is used, obtained from simulation.
                Not applicable to daily time series files (ending with `_day`).

            ref_month (int): Reference month for yearly time series after filtering by `begin_date` and `end_date`. For example,
                `2012-12-31` and `2013-12-31` become `2012-06-15` and `2013-06-15` when `ref_day=15` and `ref_month=6`.
                If `None` (default), the last month of the year is used, obtained from simulation.
                Not applicable to monthly time series files (ending with `_mon`).

            apply_filter (dict[str, list[Any]]): Dictionary mapping column names to lists of values for row filtering.
                If `None` (default), no filtering is applied.

            usecols (list[str]): Column names to include in the output. If `None` (default), all columns are used.

            json_file (str | pathlib.Path): Path to save the output `DataFrame` as a JSON file.
                If `None` (default), the DataFrame is not saved.

        Returns:
            Time series `DataFrame` with a new `date` column.

        Raises:
            ValueError: If required time columns are missing, date filtering removes all rows,
                `ref_day`/`ref_month` are used with an incompatible file, or a filter/usecols
                column is absent from the file.
            TypeError: If an `apply_filter` value is not a list.
        '''

        # Check input variables type
        validators._variable_origin_static_type(
            vars_types=typing.get_type_hints(
                obj=self.simulated_timeseries_df
            ),
            vars_values=locals()
        )

        # Absolute file path
        sim_file = pathlib.Path(sim_file).resolve()

        # DataFrame from input file
        # (row 0 is a banner line; row 2 holds column units when present)
        skiprows = [0, 2] if has_units else [0]
        df = utils._df_extract(
            input_file=sim_file,
            skiprows=skiprows
        )

        # DataFrame columns
        df_cols = list(df.columns)

        # Create date column from the yr/mon/day columns
        date_col = 'date'
        time_cols = ['yr', 'mon', 'day']
        missing_cols = [
            col for col in time_cols if col not in df_cols
        ]
        if len(missing_cols) > 0:
            raise ValueError(
                f'Missing required time series columns "{missing_cols}" in file "{sim_file.name}"'
            )
        df[date_col] = pandas.to_datetime(
            df[time_cols].rename(columns={'yr': 'year', 'mon': 'month'})
        ).dt.date

        # Filter DataFrame by date (both bounds inclusive)
        begin_dt = utils._date_str_to_object(begin_date) if begin_date is not None else df[date_col].iloc[0]
        end_dt = utils._date_str_to_object(end_date) if end_date is not None else df[date_col].iloc[-1]
        df = df.loc[df[date_col].between(begin_dt, end_dt)].reset_index(drop=True)

        # Fix reference day
        if ref_day is not None:
            if sim_file.stem.endswith(('_day', '_subday')):
                raise ValueError(
                    f'Parameter "ref_day" is not applicable for daily or sub-daily time series in file "{sim_file.name}" '
                    f'because it would assign the same day to all records within a month.'
                )
            df[date_col] = df[date_col].apply(
                lambda x: x.replace(day=ref_day)
            )

        # Fix reference month
        if ref_month is not None:
            if sim_file.stem.endswith('_mon'):
                raise ValueError(
                    f'Parameter "ref_month" is not applicable for monthly time series in file "{sim_file.name}" '
                    f'because it would assign the same month to all records within a year.'
                )
            df[date_col] = df[date_col].apply(
                lambda x: x.replace(month=ref_month)
            )

        # Check if filtering by date removed all rows
        if df.empty:
            raise ValueError(
                f'No data found between "{begin_date}" and "{end_date}" in file "{sim_file.name}"'
            )

        # Filter rows by dictionary criteria
        if apply_filter is not None:
            for col, val in apply_filter.items():
                if col not in df_cols:
                    raise ValueError(
                        f'Column "{col}" in apply_filter was not found in file "{sim_file.name}"'
                    )
                if not isinstance(val, list):
                    raise TypeError(
                        f'Column "{col}" in apply_filter for file "{sim_file.name}" must be a list, '
                        f'but got type "{type(val).__name__}"'
                    )
                df = df.loc[df[col].isin(val)]
                # Check if filtering removed all rows
                if df.empty:
                    raise ValueError(
                        f'Filtering by column "{col}" with values "{val}" returned no rows in "{sim_file.name}"'
                    )

        # Reset DataFrame index
        df = df.reset_index(
            drop=True
        )

        # Finalize columns for DataFrame
        if usecols is None:
            retain_cols = [date_col] + df_cols
        else:
            for col in usecols:
                if col not in df_cols:
                    raise ValueError(
                        f'Column "{col}" specified in "usecols" was not found in file "{sim_file.name}"'
                    )
            retain_cols = [date_col] + usecols

        # Output DataFrame
        df = df[retain_cols]

        # Save DataFrame
        if json_file is not None:
            json_file = pathlib.Path(json_file).resolve()
            # Raise error for invalid JSON file extension
            validators._json_extension(
                json_file=json_file
            )
            # Write a copy so the returned DataFrame keeps datetime.date objects
            copy_df = copy.deepcopy(
                x=df
            )
            copy_df[date_col] = copy_df[date_col].apply(lambda x: x.strftime('%d-%b-%Y'))
            copy_df.to_json(
                path_or_buf=json_file,
                orient='records',
                indent=4
            )

        return df

    def hru_stats_from_daily_simulation(
        self,
        sim_file: str | pathlib.Path,
        has_units: bool,
        gis_id: int,
        sim_col: str,
        output_dir: typing.Optional[str | pathlib.Path] = None
    ) -> dict[str, pandas.DataFrame]:
        '''
        Compute monthly and yearly statistical summaries for a Hydrological Response Unit (HRU) from daily simulation time series data.

        The method returns a dictionary containing two keys, `monthly` and `yearly`, whose values are `DataFrame` objects.
        Each `DataFrame` includes the following columns:

            - `date`: The first day of the corresponding month or year.
            - `min`: Minimum simulated value within the time window.
            - `max`: Maximum simulated value within the time window.
            - `mean`: Mean simulated value within the time window.
            - `std`: Standard deviation of simulated values within the time window.

        The statistics are computed using daily values between the first and last dates (both inclusive)
        of each month or year. The `date` column represents the first day of the corresponding period
        (e.g., 01-Jan-2012, 01-Feb-2012 for monthly; 01-Jan-2012 for yearly).

        If the first or last record in the input file does not align exactly with the start or end
        of a month or year, the statistics are computed for the available portion of that period.
        In such cases, the `date` column represents the first available date for that partial period.

        Args:
            sim_file (str | pathlib.Path): Path to the input file containing time series data generated by
                the method [`run_swat`](https://swat-model.github.io/pySWATPlus/api/txtinout_reader/#pySWATPlus.TxtinoutReader.run_swat).
                The file must contain `yr`, `mon`, and `day` columns.

            has_units (bool): If `True`, the third line of the input file contains column units.

            gis_id (int): Unique identifier for the Hydrological Response Unit (HRU) found in the `gis_id`.

            sim_col (str): Name of the column containing simulated values.

            output_dir (str | pathlib.Path): Directory path to save the computed results as two following JSON files.
                If `None` (default), the results are not saved.

                    - `statistics_monthly.json`: Contains the monthly statistical `DataFrame` (with `date` formatted as `DD-Mon-YYYY`).
                    - `statistics_yearly.json`: Contains the yearly statistical `DataFrame` (with `date` formatted as `DD-Mon-YYYY`).

        Returns:
            Dictionary with two keys:

                - `monthly`: `DataFrame` containing monthly statistics, with `date` as `datetime.date` objects.
                - `yearly`: `DataFrame` containing yearly statistics, with `date` as `datetime.date` objects.
        '''

        # Check input variables type
        validators._variable_origin_static_type(
            vars_types=typing.get_type_hints(
                obj=self.hru_stats_from_daily_simulation
            ),
            vars_values=locals()
        )

        # Check input file contains daily time series data
        sim_file = pathlib.Path(sim_file).resolve()
        if not sim_file.stem.endswith('_day'):
            raise ValueError(
                f'Statistical summary applies only to daily time series files ending with "_day"; received file name "{sim_file.stem}"'
            )

        # Validate directory path
        if output_dir is not None:
            validators._dir_path(
                input_dir=pathlib.Path(output_dir).resolve()
            )

        # Simulated DataFrame
        df = self.simulated_timeseries_df(
            sim_file=sim_file,
            has_units=has_units,
            apply_filter={
                'gis_id': [gis_id]
            },
            usecols=[sim_col]
        )

        # Frequency abbreviations (pandas offset aliases: month start, year start)
        freq_abb = {
            'monthly': 'MS',
            'yearly': 'YS'
        }

        # Date column
        date_col = 'date'

        # Start and end dates (loop-invariant, computed once)
        start_date = df[date_col].min()
        end_date = df[date_col].max()
        # Boundary one day past the last record so the half-open windows below
        # include end_date itself; without this, the final day (or the whole
        # final period when end_date falls on a period start) was excluded
        upper_bound = (pandas.Timestamp(end_date) + pandas.Timedelta(days=1)).date()

        output = {}
        # Iterate frequency
        for freq in freq_abb:
            # Period start days between the first and last simulated dates
            freq_day = pandas.date_range(
                start=start_date,
                end=end_date,
                freq=freq_abb[freq]
            )
            freq_day = pandas.Series(freq_day).dt.date
            # Prepend the first available date (possible partial first period)
            # and append the inclusive upper bound
            freq_day = pandas.Series(
                [start_date] + freq_day.tolist() + [upper_bound]
            )
            # Get unique dates in case of repetition of the start date
            freq_day = freq_day.unique()
            # Statistics per half-open window [lower, upper)
            freq_df = pandas.DataFrame()
            for idx, dates in enumerate(zip(freq_day[:-1], freq_day[1:])):
                idx_df = df[(df[date_col] >= dates[0]) & (df[date_col] < dates[1])]
                freq_df.loc[idx, 'date'] = dates[0]
                freq_df.loc[idx, 'max'] = idx_df[sim_col].max()
                freq_df.loc[idx, 'min'] = idx_df[sim_col].min()
                freq_df.loc[idx, 'mean'] = idx_df[sim_col].mean()
                freq_df.loc[idx, 'std'] = idx_df[sim_col].std()
            # Insert the DataFrame in the output dictionary
            output[freq] = freq_df
            # Save the DataFrame (dates serialized as DD-Mon-YYYY strings)
            if output_dir is not None:
                save_file = pathlib.Path(output_dir).resolve() / f'statistics_{freq}.json'
                copy_df = copy.deepcopy(freq_df)
                copy_df[date_col] = copy_df[date_col].apply(lambda x: x.strftime('%d-%b-%Y'))
                copy_df.to_json(
                    path_or_buf=save_file,
                    orient='records',
                    indent=4
                )

        return output

    def read_sensitive_dfs(
        self,
        sensim_file: str | pathlib.Path,
        df_name: str,
        add_problem: bool = False,
        add_sample: bool = False
    ) -> dict[str, typing.Any]:
        '''
        Read sensitivity simulation data generated by the method
        [`simulation_by_sample_parameters`](https://swat-model.github.io/pySWATPlus/api/sensitivity_analyzer/#pySWATPlus.SensitivityAnalyzer.simulation_by_sample_parameters),
        and return a dictionary mapping each scenario integer to its corresponding `DataFrame`.

        The returned dictionary may include the following keys:
        - `scenario` (default): A mapping between each scenario integer and its corresponding DataFrame.
        - `problem` (optional): The problem definition.
        - `sample` (optional): The sample list used in the sensitivity simulation.

        Args:
            sensim_file (str | pathlib.Path): Path to the `sensitivity_simulation.json` file generated by `simulation_by_sample_parameters`.

            df_name (str): Name of the `DataFrame` within `sensitivity_simulation.json`.

            add_problem (bool): If `True`, includes the problem definition in the output dictionary under the `problem` key. Defaults to `False`.

            add_sample (bool): If `True`, includes the sample list used in the simulation under the `sample` key. Defaults to `False`.

        Returns:
            A dictionary with the following keys:

                - `scenario` (default): A mapping between each scenario integer and its corresponding DataFrame.
                - `problem` (optional):  The definition dictionary passed to sampling.
                - `sample` (optional): The sample list used in the sensitivity simulation.
        '''

        # Check input variables type
        validators._variable_origin_static_type(
            vars_types=typing.get_type_hints(
                obj=self.read_sensitive_dfs
            ),
            vars_values=locals()
        )

        # Absolute file path
        sensim_file = pathlib.Path(sensim_file).resolve()

        # Sensitivity output data
        output = utils._sensitivity_output_retrieval(
            sensim_file=sensim_file,
            df_name=df_name,
            add_problem=add_problem,
            add_sample=add_sample
        )

        return output
hru_stats_from_daily_simulation(sim_file: str | pathlib.Path, has_units: bool, gis_id: int, sim_col: str, output_dir: typing.Optional[str | pathlib.Path] = None) -> dict[str, pandas.DataFrame]

Compute monthly and yearly statistical summaries for a Hydrological Response Unit (HRU) from daily simulation time series data.

The method returns a dictionary containing two keys, monthly and yearly, whose values are DataFrame objects. Each DataFrame includes the following columns:

- `date`: The first day of the corresponding month or year.
- `min`: Minimum simulated value within the time window.
- `max`: Maximum simulated value within the time window.
- `mean`: Mean simulated value within the time window.
- `std`: Standard deviation of simulated values within the time window.

The statistics are computed using daily values between the first and last dates (both inclusive) of each month or year. The date column represents the first day of the corresponding period (e.g., 01-Jan-2012, 01-Feb-2012 for monthly; 01-Jan-2012 for yearly).

If the first or last record in the input file does not align exactly with the start or end of a month or year, the statistics are computed for the available portion of that period. In such cases, the date column represents the first available date for that partial period.

Parameters:

Name Type Description Default
sim_file str | Path

Path to the input file containing time series data generated by the method run_swat. The file must contain yr, mon, and day columns.

required
has_units bool

If True, the third line of the input file contains column units.

required
gis_id int

Unique identifier for the Hydrological Response Unit (HRU) found in the gis_id.

required
sim_col str

Name of the column containing simulated values.

required
output_dir str | Path

Directory path to save the computed results as two following JSON files. If None (default), the results are not saved.

- `statistics_monthly.json`: Contains the monthly statistical `DataFrame` (with `date` formatted as `DD-Mon-YYYY`).
- `statistics_yearly.json`: Contains the yearly statistical `DataFrame` (with `date` formatted as `DD-Mon-YYYY`).
None

Returns:

Type Description
dict[str, DataFrame]

Dictionary with two keys:

  • monthly: DataFrame containing monthly statistics, with date as datetime.date objects.
  • yearly: DataFrame containing yearly statistics, with date as datetime.date objects.
Source code in pySWATPlus/data_manager.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
def hru_stats_from_daily_simulation(
    self,
    sim_file: str | pathlib.Path,
    has_units: bool,
    gis_id: int,
    sim_col: str,
    output_dir: typing.Optional[str | pathlib.Path] = None
) -> dict[str, pandas.DataFrame]:
    '''
    Compute monthly and yearly statistical summaries for a Hydrological Response Unit (HRU) from daily simulation time series data.

    The method returns a dictionary containing two keys, `monthly` and `yearly`, whose values are `DataFrame` objects.
    Each `DataFrame` includes the following columns:

        - `date`: The first day of the corresponding month or year.
        - `min`: Minimum simulated value within the time window.
        - `max`: Maximum simulated value within the time window.
        - `mean`: Mean simulated value within the time window.
        - `std`: Standard deviation of simulated values within the time window.

    The statistics are computed using daily values between the first and last dates (both inclusive)
    of each month or year. The `date` column represents the first day of the corresponding period
    (e.g., 01-Jan-2012, 01-Feb-2012 for monthly; 01-Jan-2012 for yearly).

    If the first or last record in the input file does not align exactly with the start or end
    of a month or year, the statistics are computed for the available portion of that period.
    In such cases, the `date` column represents the first available date for that partial period.

    Args:
        sim_file (str | pathlib.Path): Path to the input file containing time series data generated by
            the method [`run_swat`](https://swat-model.github.io/pySWATPlus/api/txtinout_reader/#pySWATPlus.TxtinoutReader.run_swat).
            The file must contain `yr`, `mon`, and `day` columns.

        has_units (bool): If `True`, the third line of the input file contains column units.

        gis_id (int): Unique identifier for the Hydrological Response Unit (HRU) found in the `gis_id`.

        sim_col (str): Name of the column containing simulated values.

        output_dir (str | pathlib.Path): Directory path to save the computed results as two following JSON files.
            If `None` (default), the results are not saved.

                - `statistics_monthly.json`: Contains the monthly statistical `DataFrame` (with `date` formatted as `DD-Mon-YYYY`).
                - `statistics_yearly.json`: Contains the yearly statistical `DataFrame` (with `date` formatted as `DD-Mon-YYYY`).

    Returns:
        Dictionary with two keys:

            - `monthly`: `DataFrame` containing monthly statistics, with `date` as `datetime.date` objects.
            - `yearly`: `DataFrame` containing yearly statistics, with `date` as `datetime.date` objects.
    '''

    # Check input variables type
    validators._variable_origin_static_type(
        vars_types=typing.get_type_hints(
            obj=self.hru_stats_from_daily_simulation
        ),
        vars_values=locals()
    )

    # Check input file contains daily time series data
    sim_file = pathlib.Path(sim_file).resolve()
    if not sim_file.stem.endswith('_day'):
        raise ValueError(
            f'Statistical summary applies only to daily time series files ending with "_day"; received file name "{sim_file.stem}"'
        )

    # Validate directory path
    if output_dir is not None:
        validators._dir_path(
            input_dir=pathlib.Path(output_dir).resolve()
        )

    # Simulated DataFrame
    df = self.simulated_timeseries_df(
        sim_file=sim_file,
        has_units=has_units,
        apply_filter={
            'gis_id': [gis_id]
        },
        usecols=[sim_col]
    )

    # Frequency abbreviations (pandas offset aliases: month start, year start)
    freq_abb = {
        'monthly': 'MS',
        'yearly': 'YS'
    }

    # Date column
    date_col = 'date'

    # Start and end dates (loop-invariant, computed once)
    start_date = df[date_col].min()
    end_date = df[date_col].max()
    # Boundary one day past the last record so the half-open windows below
    # include end_date itself; without this, the final day (or the whole
    # final period when end_date falls on a period start) was excluded
    upper_bound = (pandas.Timestamp(end_date) + pandas.Timedelta(days=1)).date()

    output = {}
    # Iterate frequency
    for freq in freq_abb:
        # Period start days between the first and last simulated dates
        freq_day = pandas.date_range(
            start=start_date,
            end=end_date,
            freq=freq_abb[freq]
        )
        freq_day = pandas.Series(freq_day).dt.date
        # Prepend the first available date (possible partial first period)
        # and append the inclusive upper bound
        freq_day = pandas.Series(
            [start_date] + freq_day.tolist() + [upper_bound]
        )
        # Get unique dates in case of repetition of the start date
        freq_day = freq_day.unique()
        # Statistics per half-open window [lower, upper)
        freq_df = pandas.DataFrame()
        for idx, dates in enumerate(zip(freq_day[:-1], freq_day[1:])):
            idx_df = df[(df[date_col] >= dates[0]) & (df[date_col] < dates[1])]
            freq_df.loc[idx, 'date'] = dates[0]
            freq_df.loc[idx, 'max'] = idx_df[sim_col].max()
            freq_df.loc[idx, 'min'] = idx_df[sim_col].min()
            freq_df.loc[idx, 'mean'] = idx_df[sim_col].mean()
            freq_df.loc[idx, 'std'] = idx_df[sim_col].std()
        # Insert the DataFrame in the output dictionary
        output[freq] = freq_df
        # Save the DataFrame (dates serialized as DD-Mon-YYYY strings)
        if output_dir is not None:
            save_file = pathlib.Path(output_dir).resolve() / f'statistics_{freq}.json'
            copy_df = copy.deepcopy(freq_df)
            copy_df[date_col] = copy_df[date_col].apply(lambda x: x.strftime('%d-%b-%Y'))
            copy_df.to_json(
                path_or_buf=save_file,
                orient='records',
                indent=4
            )

    return output

read_sensitive_dfs(sensim_file: str | pathlib.Path, df_name: str, add_problem: bool = False, add_sample: bool = False) -> dict[str, typing.Any]

Read sensitivity simulation data generated by the method simulation_by_sample_parameters, and return a dictionary mapping each scenario integer to its corresponding DataFrame.

The returned dictionary may include the following keys:

- `scenario` (default): A mapping between each scenario integer and its corresponding DataFrame.
- `problem` (optional): The problem definition.
- `sample` (optional): The sample list used in the sensitivity simulation.

Parameters:

Name Type Description Default
sensim_file str | Path

Path to the sensitivity_simulation.json file generated by simulation_by_sample_parameters.

required
df_name str

Name of the DataFrame within sensitivity_simulation.json.

required
add_problem bool

If True, includes the problem definition in the output dictionary under the problem key. Defaults to False.

False
add_sample bool

If True, includes the sample list used in the simulation under the sample key. Defaults to False.

False

Returns:

Type Description
dict[str, Any]

A dictionary with the following keys:

  • scenario (default): A mapping between each scenario integer and its corresponding DataFrame.
  • problem (optional): The definition dictionary passed to sampling.
  • sample (optional): The sample list used in the sensitivity simulation.
Source code in pySWATPlus/data_manager.py
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
def read_sensitive_dfs(
    self,
    sensim_file: str | pathlib.Path,
    df_name: str,
    add_problem: bool = False,
    add_sample: bool = False
) -> dict[str, typing.Any]:
    '''
    Read sensitivity simulation data produced by the method
    [`simulation_by_sample_parameters`](https://swat-model.github.io/pySWATPlus/api/sensitivity_analyzer/#pySWATPlus.SensitivityAnalyzer.simulation_by_sample_parameters)
    and return a dictionary that maps each scenario integer to its corresponding `DataFrame`.

    The returned dictionary may contain the following keys:
    - `scenario` (always present): mapping of scenario integer to `DataFrame`.
    - `problem` (optional): the problem definition passed to sampling.
    - `sample` (optional): the sample list used in the sensitivity simulation.

    Args:
        sensim_file (str | pathlib.Path): Path to the `sensitivity_simulation.json` file generated by `simulation_by_sample_parameters`.

        df_name (str): Name of the `DataFrame` stored within `sensitivity_simulation.json`.

        add_problem (bool): When `True`, the problem definition is included under the `problem` key. Defaults to `False`.

        add_sample (bool): When `True`, the sample list used in the simulation is included under the `sample` key. Defaults to `False`.

    Returns:
        A dictionary with a `scenario` key and, depending on the flags, the optional
        `problem` and `sample` keys described above.
    '''

    # Validate runtime argument types against the method's annotations
    # before any local variables are introduced (locals() must only
    # contain the parameters at this point).
    validators._variable_origin_static_type(
        vars_types=typing.get_type_hints(
            obj=self.read_sensitive_dfs
        ),
        vars_values=locals()
    )

    # Normalize the input path to an absolute location
    resolved_file = pathlib.Path(sensim_file).resolve()

    # Delegate retrieval of the sensitivity output to the utility helper
    # and hand its dictionary straight back to the caller.
    return utils._sensitivity_output_retrieval(
        sensim_file=resolved_file,
        df_name=df_name,
        add_problem=add_problem,
        add_sample=add_sample
    )

simulated_timeseries_df(sim_file: str | pathlib.Path, has_units: bool, begin_date: typing.Optional[str] = None, end_date: typing.Optional[str] = None, ref_day: typing.Optional[int] = None, ref_month: typing.Optional[int] = None, apply_filter: typing.Optional[dict[str, list[typing.Any]]] = None, usecols: typing.Optional[list[str]] = None, json_file: typing.Optional[str | pathlib.Path] = None) -> pandas.DataFrame

Extract data from a simulation output file and return a time series DataFrame. A new date column is constructed using datetime.date objects from the yr, mon, and day columns.

Parameters:

Name Type Description Default
sim_file str | Path

Path to the input file containing time series data generated by the method run_swat. The file must contain yr, mon, and day columns.

required
has_units bool

If True, the third line of the input file contains column units.

required
begin_date str

Start date in DD-Mon-YYYY format (e.g., '01-Jan-2012'), inclusive. If None (default), the earliest available date is used.

None
end_date str

End date in DD-Mon-YYYY format (e.g., '31-Dec-2015'), inclusive. If None (default), the latest available date is used.

None
ref_day int

Reference day for monthly and yearly time series after filtering by begin_date and end_date. For example, 2012-01-31 and 2012-02-29 become 2012-01-15 and 2012-02-15 when ref_day=15. If None (default), the last day of the month or year, as obtained from the simulation output, is used. Not applicable to daily time series files (ending with _day).

None
ref_month int

Reference month for yearly time series after filtering by begin_date and end_date. For example, 2012-12-31 and 2013-12-31 become 2012-06-15 and 2013-06-15 when ref_day=15 and ref_month=6. If None (default), the last month of the year is used, obtained from simulation. Not applicable to monthly time series files (ending with _mon).

None
apply_filter dict[str, list[Any]]

Dictionary mapping column names to lists of values for row filtering. If None (default), no filtering is applied.

None
usecols list[str]

Column names to include in the output. If None (default), all columns are used.

None
json_file str | Path

Path to save the output DataFrame as a JSON file. If None (default), the DataFrame is not saved.

None

Returns:

Type Description
DataFrame

Time series DataFrame with a new date column.

Source code in pySWATPlus/data_manager.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def simulated_timeseries_df(
    self,
    sim_file: str | pathlib.Path,
    has_units: bool,
    begin_date: typing.Optional[str] = None,
    end_date: typing.Optional[str] = None,
    ref_day: typing.Optional[int] = None,
    ref_month: typing.Optional[int] = None,
    apply_filter: typing.Optional[dict[str, list[typing.Any]]] = None,
    usecols: typing.Optional[list[str]] = None,
    json_file: typing.Optional[str | pathlib.Path] = None
) -> pandas.DataFrame:
    '''
    Extract data from a simulation output file and return a time series `DataFrame`.
    A new `date` column is constructed using `datetime.date` objects from the `yr`, `mon`, and `day` columns.

    Args:
        sim_file (str | pathlib.Path): Path to the input file containing time series data generated by
            the method [`run_swat`](https://swat-model.github.io/pySWATPlus/api/txtinout_reader/#pySWATPlus.TxtinoutReader.run_swat).
            The file must contain `yr`, `mon`, and `day` columns.

        has_units (bool): If `True`, the third line of the input file contains column units.

        begin_date (str): Start date in `DD-Mon-YYYY` format (e.g., '01-Jan-2012'), inclusive.
            If `None` (default), the earliest available date is used.

        end_date (str): End date in `DD-Mon-YYYY` format (e.g., '31-Dec-2015'), inclusive.
            If `None` (default), the latest available date is used.

        ref_day (int): Reference day for monthly and yearly time series after filtering by `begin_date` and `end_date`.
            For example, `2012-01-31` and `2012-02-29` become `2012-01-15` and `2012-02-15` when `ref_day=15`.
            If `None` (default), the last day of the month or year, as obtained from the simulation output, is used.
            Not applicable to daily time series files (ending with `_day`).

        ref_month (int): Reference month for yearly time series after filtering by `begin_date` and `end_date`. For example,
            `2012-12-31` and `2013-12-31` become `2012-06-15` and `2013-06-15` when `ref_day=15` and `ref_month=6`.
            If `None` (default), the last month of the year, as obtained from the simulation output, is used.
            Not applicable to monthly time series files (ending with `_mon`).

        apply_filter (dict[str, list[Any]]): Dictionary mapping column names to lists of values for row filtering.
            If `None` (default), no filtering is applied.

        usecols (list[str]): Column names to include in the output. If `None` (default), all columns are used.

        json_file (str | pathlib.Path): Path to save the output `DataFrame` as a JSON file.
            If `None` (default), the DataFrame is not saved.

    Returns:
        Time series `DataFrame` with a new `date` column.
    '''

    # Check input variables type
    validators._variable_origin_static_type(
        vars_types=typing.get_type_hints(
            obj=self.simulated_timeseries_df
        ),
        vars_values=locals()
    )

    # Absolute file path
    sim_file = pathlib.Path(sim_file).resolve()

    # DataFrame from input file
    # Row 0 is always skipped; row 2 additionally holds the units line
    # when has_units is True (row 1 is read as the header).
    skiprows = [0, 2] if has_units else [0]
    df = utils._df_extract(
        input_file=sim_file,
        skiprows=skiprows
    )

    # DataFrame columns (captured before the 'date' column is added, so
    # later validation of apply_filter/usecols refers to original columns only)
    df_cols = list(df.columns)

    # Create date column
    date_col = 'date'
    time_cols = ['yr', 'mon', 'day']
    missing_cols = [
        col for col in time_cols if col not in df_cols
    ]
    if len(missing_cols) > 0:
        raise ValueError(
            f'Missing required time series columns "{missing_cols}" in file "{sim_file.name}"'
        )
    # pandas.to_datetime assembles a datetime from 'year'/'month'/'day'
    # component columns; renaming maps the SWAT column names onto those keys.
    df[date_col] = pandas.to_datetime(
        df[time_cols].rename(columns={'yr': 'year', 'mon': 'month'})
    ).dt.date

    # Filter DataFrame by date (inclusive on both ends); defaults fall back
    # to the first/last simulated date.
    # NOTE(review): if the file contains no data rows, .iloc[0] raises
    # IndexError here before a descriptive message can be produced — confirm
    # whether utils._df_extract guarantees a non-empty frame.
    begin_dt = utils._date_str_to_object(begin_date) if begin_date is not None else df[date_col].iloc[0]
    end_dt = utils._date_str_to_object(end_date) if end_date is not None else df[date_col].iloc[-1]
    df = df.loc[df[date_col].between(begin_dt, end_dt)].reset_index(drop=True)

    # Fix reference day (rejected for daily/sub-daily files, where it would
    # collapse all records of a month onto one date)
    if ref_day is not None:
        if sim_file.stem.endswith(('_day', '_subday')):
            raise ValueError(
                f'Parameter "ref_day" is not applicable for daily or sub-daily time series in file "{sim_file.name}" '
                f'because it would assign the same day to all records within a month.'
            )
        # NOTE(review): date.replace raises ValueError for invalid combinations
        # (e.g. ref_day=31 on a February record) — confirm whether callers
        # are expected to pass only days valid for every month present.
        df[date_col] = df[date_col].apply(
            lambda x: x.replace(day=ref_day)
        )

    # Fix reference month (rejected for monthly files, where it would
    # collapse all records of a year onto one month)
    if ref_month is not None:
        if sim_file.stem.endswith('_mon'):
            raise ValueError(
                f'Parameter "ref_month" is not applicable for monthly time series in file "{sim_file.name}" '
                f'because it would assign the same month to all records within a year.'
            )
        df[date_col] = df[date_col].apply(
            lambda x: x.replace(month=ref_month)
        )

    # Check if filtering by date removed all rows
    # NOTE(review): this check runs after the ref_day/ref_month fixes; the
    # .apply calls above are no-ops on an empty frame, so behavior is safe,
    # but moving this check directly after the date filter would fail faster.
    if df.empty:
        raise ValueError(
            f'No data found between "{begin_date}" and "{end_date}" in file "{sim_file.name}"'
        )

    # Filter rows by dictionary criteria; each column filter is applied in
    # sequence and must leave at least one row
    if apply_filter is not None:
        for col, val in apply_filter.items():
            if col not in df_cols:
                raise ValueError(
                    f'Column "{col}" in apply_filter was not found in file "{sim_file.name}"'
                )
            if not isinstance(val, list):
                raise TypeError(
                    f'Column "{col}" in apply_filter for file "{sim_file.name}" must be a list, '
                    f'but got type "{type(val).__name__}"'
                )
            df = df.loc[df[col].isin(val)]
            # Check if filtering removed all rows
            if df.empty:
                raise ValueError(
                    f'Filtering by column "{col}" with values "{val}" returned no rows in "{sim_file.name}"'
                )

    # Reset DataFrame index
    df = df.reset_index(
        drop=True
    )

    # Finalize columns for DataFrame; the constructed 'date' column is always
    # placed first, followed by either all original columns or usecols
    if usecols is None:
        retain_cols = [date_col] + df_cols
    else:
        for col in usecols:
            if col not in df_cols:
                raise ValueError(
                    f'Column "{col}" specified in "usecols" was not found in file "{sim_file.name}"'
                )
        retain_cols = [date_col] + usecols

    # Output DataFrame
    df = df[retain_cols]

    # Save DataFrame
    if json_file is not None:
        json_file = pathlib.Path(json_file).resolve()
        # Raise error for invalid JSON file extension
        validators._json_extension(
            json_file=json_file
        )
        # Write DataFrame to the JSON file; a deep copy is serialized so the
        # returned DataFrame keeps datetime.date objects rather than strings
        copy_df = copy.deepcopy(
            x=df
        )
        copy_df[date_col] = copy_df[date_col].apply(lambda x: x.strftime('%d-%b-%Y'))
        copy_df.to_json(
            path_or_buf=json_file,
            orient='records',
            indent=4
        )

    return df