forked from inderpartap/trendcast
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_transform.py
More file actions
83 lines (57 loc) · 2.6 KB
/
data_transform.py
File metadata and controls
83 lines (57 loc) · 2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
from utils.utils import datapath
# given the retail data, create onehot-encoding of the dept wise quantity sold
# group by city to compress the data
def create_onehot(df):
df = df.groupby(['date', 'province', 'city', 'department']).sum().reset_index()
# set the initial values to 0
df = df.assign(**{'department1' : 0,'department2' : 0,'department3' : 0,'department4' : 0,
'department5' : 0,'department6' : 0,'department7' : 0,'department8' : 0,
'department9' : 0,'department10': 0,'department11': 0,'department12': 0,
'department13': 0,'department14': 0})
# for each value in department column, update the one-hot columns
def restructure(row):
dept_name = str(row['department'])
row[dept_name] = row['totalQuantity']
return row
df = df.apply(restructure, axis=1)
df = df.drop('department', axis=1)
df = df.groupby(['date', 'province', 'city']).sum().reset_index()
return df
# add the weather information to the retail dataset
def add_weather(city_df, weather_df):
provinces = city_df[['province', 'city']].drop_duplicates()
# right join to keep the weather information
result = pd.merge(city_df, weather_df,
how = 'right',
on = ['date', 'city'])
result = pd.merge(provinces, result, on = ['city']) # fill in missing province values
result = result.rename(columns={"province_x": "province"}).drop("province_y", axis=1)
return result
def fill_missing(df):
# drop columns with more than 80% null values
df = df.dropna(thresh=df.shape[0]*0.2, axis=1)
retailCols = list(df.columns)[:19]
weatherCols = list(df.columns)[20:]
result = pd.DataFrame()
# fill the missing transaction records as 0; ~0.05% of data
result[retailCols] = df[retailCols].fillna(value=0.0)
# fill missing weather records with surrounding data
result[weatherCols] = df[weatherCols].interpolate(method='nearest', axis=0).ffill().bfill()
return result
def main():
# load retail sales data
retail_data = pd.read_csv(datapath['retail'], index_col=0).drop(["category", "class", "style", "vendor"], axis=1)
retail_data['city'] = retail_data['city'].str.lower()
# load weather data
weather_data = pd.read_csv(datapath['weather']).rename(columns={"station_name": "city"}).drop("station_id", axis=1)
# create onehot encoding of the department column
city_data = create_onehot(retail_data)
# add weather information
combined_df = add_weather(city_data, weather_data)
# handle missing values
final_df = fill_missing(combined_df)
# store station information in CSV
final_df.to_csv(datapath['trendcast'], index=False)
if __name__=='__main__':
main()