SWAP_analysis/tools.py at main · JoYvBa/SWAP_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 14:45:13 2025

@author: Jorrit Bakker
"""

import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

def cleanup_redox(file_path : str,
                  correction : float = 200,
                  rename : dict = None,
                  ):
    """ Takes an Excel sheet of SWAP sensor data and seperates it into two
    dataframes for redox and temperature that can be used for data analysis.

    Parameters
    ----------
    file_path : str
        Path to excel or csv file containing redox data.
    correction : float, optional
        The correction factor with regards to 3M KCl electrode reference for the redox data. Default is 200
    rename : dictionary, optional
        Dictionary for renaming the SWAP nodes. With initial name as key and new name as value. Default is none.

    Returns
    -------
    df_redox : pd.Dataframe
        A cleaned up dataframe containing the redox data from the input Excel sheet.
    df_temp : pd.Dataframe
        A cleaned up dataframe containing the soil temperature data from the input Excel sheet.
    """

    # Handle both Excel files and .csv (or .dat) files as input
    if file_path.endswith("xlsx") or file_path.endswith("xls"):
        dataframe = pd.read_excel(file_path, sheet_name = 0)
    else:
        try:
            dataframe = pd.read_csv(file_path, skiprows = [0], header = None)
        except:
            IOError("File type is not of the expected type or recognized. File should be a csv or Excel")

    drop_cols = ['batt_volt_Avg', 'RECORD', 'TIMESTAMP']

    # Locate the row containing sensor names and change column headers
    header = dataframe[dataframe.iloc[:,0] == "TIMESTAMP"].index[0]
    dataframe.columns = dataframe.iloc[header]
    # Locate the row where redox data starts, to remove any unnecessary rows above
    start_data = dataframe[dataframe.loc[:,"RECORD"] == "0"].index[0]
    dataframe = dataframe.drop(range(0,start_data))

    # Time column is set to datatime datatype and moved to first column of dataframe for ease of reading.
    dataframe['Time'] = pd.to_datetime(dataframe['TIMESTAMP'])
    move_col = dataframe.pop("Time")
    dataframe.insert(0, "Time", move_col)
    # Remove irrelevant columns
    dataframe = dataframe.drop(drop_cols, axis = 1)

    # Split dataframe into redox and temperature by selecting on the beginning of the column names.
    df_redox = dataframe.filter(regex='^redox')
    df_redox = pd.concat([dataframe['Time'], df_redox], axis = 1)
    df_temp = dataframe.filter(regex='^temp')
    df_temp = pd.concat([dataframe['Time'], df_temp], axis = 1)

    # Column names are changed to more concise names for ease of understanding.
    # Renaming dictionaries are located at the bottom of this script and can be passed to this function.
    if rename:
        df_redox.rename(columns = rename, inplace = True)
        df_temp.rename(columns = rename, inplace = True)

    # Set dataframe index to the date-time column for ease of indexing.
    df_redox = df_redox.set_index("Time")
    df_temp = df_temp.set_index("Time")
    # Set datatype to numerical for plotting and data analysis.
    df_redox = df_redox.apply(pd.to_numeric, errors='coerce') + correction
    df_temp = df_temp.apply(pd.to_numeric, errors='coerce')

    # Hourly data range from beginning to end date of dataframe, to include missing timeframe.
    date_range_redox = pd.date_range(df_redox.index[0], df_redox.index[-1], freq = "h")
    date_range_temp = pd.date_range(df_temp.index[0], df_temp.index[-1], freq = "h")

    # Fill any missing time with NaN values, to clarify which parts of data are missing.
    df_redox = df_redox.reindex(date_range_redox, fill_value=np.nan)
    df_temp = df_temp.reindex(date_range_temp, fill_value=np.nan)

    return(df_redox, df_temp)

def plot_redox(df_redox : pd.DataFrame(),
               redox_nodes : list,
               start_date : str,
               end_date : str,
               ylimit : tuple = None,
               ax = None,
               **kwargs
               ):
    """ Plot redox potential and optionally rainfall data for selected nodes in a timeframe.


    Parameters
    ----------
    df_redox : pd.DataFrame
        Dataframe with SWAP redox data
    redox_nodes : List
        List with nodes from the dataframe to be plotted
    start_date : str
        The start date of the data to be plotted, as "YYYY-MM-DD"
    end_date : str
        The end date of the data to be plotted, as "YYYY-MM-DD"
    ylimit_redox : tuple, optional
        Manually set y-axis range for the redox data, as (min_y, max_y). The default is None.
    ax : obj, optional
        Axes object of matplotlib, specify when adding this to a pre-exisiting plot object. Default is None.
    **kwargs :
        Keyword arguments for plt.plot()

    Returns
    -------
    fig : obj
        Figure object of matplotlib
    ax : obj
        Axes object of matplotlib
    """

    # Select date range of data
    mask = (df_redox.index > start_date) & (df_redox.index <= end_date)
    df_redox_plot = df_redox[mask]

    # Detect if an axis object is passed to the function, so data will be plotted to the right axis.
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
    else:
        fig = None

    # Plot redox potential measured for every input node
    for i, node in enumerate(redox_nodes):
        ax.plot(df_redox_plot.index, df_redox_plot[node],
                label = node,
                **kwargs)

    ax.set_xlabel("Time")
    ax.set_ylabel("Redox potential [mV]")

    # try-except structure to handle ylimits of wrong format
    if ylimit:
        try:
            ax.set_ylim(ylimit)
        except:
            print("ylimits_redox is not of the correct format and is thus ignored.")

    ax.set_xlabel('Date')
    ax.set_ylabel(r'Redox potential [$mV$]')

    # Modify axis ticks with the goal to make it more readable.
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%Y'))
    ax.tick_params(axis='x', rotation=0, pad=15)

    ax.xaxis.set_minor_locator(mdates.WeekdayLocator())
    ax.xaxis.set_minor_formatter(mdates.DateFormatter('%d'))
    ax.tick_params(axis='x', which='minor', length=4, width=1)

    # When part of pre-exisiting figure, only ax needs to be returned, otherwise, fig is generated inside this function so it has to be returned as well.
    if fig is None:
        return(ax)
    else:
        fig.tight_layout()
        return(fig, ax)

    return(fig, ax)


def plot_temp(df_temp : pd.DataFrame(),
              temp_nodes : list,
              start_date : str,
              end_date : str,
              mean : bool = False,
              ylimit : tuple = None,
              ax = None,
              **kwargs
              ):
    """ Plot temperature for a selection of nodes in a specified timeframe.


    Parameters
    ----------
    df_temp : pd.DataFrame
        Dataframe with temperature data
    temp_nodes : List
        List with nodes from the dataframe to be plotted
    start_date : str
        The start date of the data to be plotted, as "YYYY-MM-DD"
    end_date : str
        The end date of the data to be plotted, as "YYYY-MM-DD"
    mean : Bool, optional
        When set to True, plots the mean soil temperature of the given nodes. The default is False.
    ylimit : tuple, optional
        Manually set y-axis range for the redox data, as (min_y, max_y). The default is False.
    ax : obj, optional
        Axes object of matplotlib, specify when adding this to a pre-exisiting plot. Default is None.
    **kwargs :
        Keyword arguments for plt.plot()

    Returns
    -------
    fig : obj
        Figure object of matplotlib
    ax : obj
        Axes object of matplotlib

    """

    # Select date range of data
    mask = (df_temp.index > start_date) & (df_temp.index <= end_date)
    df_temp = df_temp[mask]

    # When plotting temperature and redox together, it is more clear to use the mean temperature.
    if mean:
        mean_temp = df_temp.loc[:,temp_nodes].mean(axis=1)
        df_temp['mean_temperature'] = mean_temp

    # Detect if an axis object is passed to the function, so data will be plotted to the right axis.
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
    else:
        fig = None

    # Plotting parameters are different when plotting the mean temperature versus every temperature node.
    if mean:
            ax.plot(df_temp.index, df_temp["mean_temperature"],
                    label = "Mean temperature",
                    **kwargs
                    )
    else:
        for i, node in enumerate(temp_nodes):
            ax.plot(df_temp.index, df_temp[node],
                    label = node,
                    **kwargs
                    )

    # try-except structure to handle ylimits of wrong format
    if ylimit:
        try:
            ax.set_ylim(ylimit)
        except:
            print("ylimits is not of the correct format and is thus ignored.")

    ax.set_xlabel('Date')
    ax.set_ylabel('Temperature (ºC)')

    # Modify axis ticks to make it more readable.
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b-%Y'))
    ax.tick_params(axis='x', rotation=0, pad=15)

    ax.xaxis.set_minor_locator(mdates.WeekdayLocator())
    ax.xaxis.set_minor_formatter(mdates.DateFormatter('%d'))
    ax.tick_params(axis='x', which='minor', length=4, width=1)

    # When part of pre-exisiting figure, only ax needs to be returned, otherwise, fig is generated inside this function so it has to be returned as well.
    if fig is None:
        return(ax)
    else:
        fig.tight_layout()
        return(fig, ax)


# Dictionary which renames each soil node for the Constructed Wetland pilot study
CW_rename = {
    'redox_raw_Avg(1)'  : 'CW1S1-1', 'redox_raw_Avg(2)'   : 'CW1S1-2', 'redox_raw_Avg(3)'   : 'CW1S1-3', 'redox_raw_Avg(4)'   : 'CW1S1-4',
    'redox_raw_Avg(5)'  : 'CW1S2-1', 'redox_raw_Avg(6)'   : 'CW1S2-2', 'redox_raw_Avg(7)'   : 'CW1S2-3', 'redox_raw_Avg(8)'   : 'CW1S2-4',
    'redox_raw_Avg(9)'  : 'CW1S3-1', 'redox_raw_Avg(10)'  : 'CW1S3-2', 'redox_raw_Avg(11)'  : 'CW1S3-3', 'redox_raw_Avg(12)'  : 'CW1S3-4',
    'redox_raw_Avg(13)' : 'CW1S4-1', 'redox_raw_Avg(14)'  : 'CW1S4-2', 'redox_raw_Avg(15)'  : 'CW1S4-3', 'redox_raw_Avg(16)'  : 'CW1S4-4',
    'redox_raw_Avg(17)' : 'CW2S1-1', 'redox_raw_Avg(18)'  : 'CW2S1-2', 'redox_raw_Avg(19)'  : 'CW2S1-3', 'redox_raw_Avg(20)'  : 'CW2S1-4',
    'redox_raw_Avg(21)' : 'CW2S2-1', 'redox_raw_Avg(22)'  : 'CW2S2-2', 'redox_raw_Avg(23)'  : 'CW2S2-3', 'redox_raw_Avg(24)'  : 'CW2S2-4',
    'redox_raw_Avg(25)' : 'CW2S3-1', 'redox_raw_Avg(26)'  : 'CW2S3-2', 'redox_raw_Avg(27)'  : 'CW2S3-3', 'redox_raw_Avg(28)'  : 'CW2S3-4',
    'redox_raw_Avg(29)' : 'CW2S4-1', 'redox_raw_Avg(30)'  : 'CW2S4-2', 'redox_raw_Avg(31)'  : 'CW2S4-3', 'redox_raw_Avg(32)'  : 'CW2S4-4',
    'redox_raw_Avg(33)' : 'CW3S1-1', 'redox_raw_Avg(34)'  : 'CW3S1-2', 'redox_raw_Avg(35)'  : 'CW3S1-3', 'redox_raw_Avg(36)'  : 'CW3S1-4',
    'redox_raw_Avg(37)' : 'CW3S2-1', 'redox_raw_Avg(38)'  : 'CW3S2-2', 'redox_raw_Avg(39)'  : 'CW3S2-3', 'redox_raw_Avg(40)'  : 'CW3S2-4',
    'redox_raw_Avg(41)' : 'CW3S3-1', 'redox_raw_Avg(42)'  : 'CW3S3-2', 'redox_raw_Avg(43)'  : 'CW3S3-3', 'redox_raw_Avg(44)'  : 'CW3S3-4',
    'redox_raw_Avg(45)' : 'CW3S4-1', 'redox_raw_Avg(46)'  : 'CW3S4-2', 'redox_raw_Avg(47)'  : 'CW3S4-3', 'redox_raw_Avg(48)'  : 'CW3S4-4',
    'temp_C_Avg(1)'     : 'CW1S1', 'temp_C_Avg(2)'        : 'CW1S2', 'temp_C_Avg(3)'        : 'CW1S3', 'temp_C_Avg(4)'        : 'CW1S4',
    'temp_C_Avg(5)'     : 'CW2S1', 'temp_C_Avg(6)'        : 'CW2S2', 'temp_C_Avg(7)'        : 'CW2S3', 'temp_C_Avg(8)'        : 'CW2S4',
    'temp_C_Avg(9)'     : 'CW3S1', 'temp_C_Avg(10)'       : 'CW3S2', 'temp_C_Avg(11)'       : 'CW3S3', 'temp_C_Avg(12)'       : 'CW3S4'
    }

# Dictionary with nodes for each wetland and depth in the Constructed Wetland pilot study
node_dictionary = {
    "CW1_20cm" : ["CW1S1-1", "CW1S2-1", "CW1S3-1", "CW1S4-1"],
    "CW1_40cm" : ["CW1S1-2", "CW1S2-2", "CW1S3-2", "CW1S4-2"],
    "CW1_60cm" : ["CW1S1-3", "CW1S2-3", "CW1S3-3", "CW1S4-3"],
    "CW1_80cm" : ["CW1S1-4", "CW1S2-4", "CW1S3-4", "CW1S4-4"],
    "CW2_20cm" : ["CW2S1-1", "CW2S2-1", "CW2S3-1", "CW2S4-1"],
    "CW2_40cm" : ["CW2S1-2", "CW2S2-2", "CW2S3-2", "CW2S4-2"],
    "CW2_60cm" : ["CW2S1-3", "CW2S2-3", "CW2S3-3", "CW2S4-3"],
    "CW2_80cm" : ["CW2S1-4", "CW2S2-4", "CW2S3-4", "CW2S4-4"],
    "CW3_20cm" : ["CW3S1-1", "CW3S2-1", "CW3S3-1", "CW3S4-1"],
    "CW3_40cm" : ["CW3S1-2", "CW3S2-2", "CW3S3-2", "CW3S4-2"],
    "CW3_60cm" : ["CW3S1-3", "CW3S2-3", "CW3S3-3", "CW3S4-3"],
    "CW3_80cm" : ["CW3S1-4", "CW3S2-4", "CW3S3-4", "CW3S4-4"]
    }