-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocessing.py
More file actions
64 lines (51 loc) · 2.32 KB
/
processing.py
File metadata and controls
64 lines (51 loc) · 2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import os
def create_silver_datafile(country, filename):
    """Turn one raw "bronze" file into a cleaned "silver" CSV.

    Reads ``dataset/bronze/<country>/<filename>``, keeps the rows at
    positions 10 through 94 of the parsed frame, names the 15 comma-separated
    fields, drops the ``SUBDIVISION`` column, converts the remaining columns
    to numeric and writes the result to
    ``dataset/silver/<country>/<filename>`` (``;``-separated, UTF-8 with BOM).
    The output directory is created when it does not exist yet.

    Args:
        country: Name of the country sub-folder under ``dataset/bronze``.
        filename: Name of the file inside that sub-folder; the silver file
            is written under the same name.

    Raises:
        ValueError: If the comma split does not produce exactly 15 columns,
            or if a kept value cannot be converted to a number.
    """
    column_names = [
        "SUBDIVISION", "YEAR",
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
        "ANNUAL",
    ]
    output_columns = column_names[1:]  # everything except SUBDIVISION

    source_path = os.path.join("dataset/bronze", country, filename)
    target_dir = os.path.join("dataset/silver", country)
    target_path = os.path.join(target_dir, filename)

    # The file is parsed with ';' as delimiter and only column 0 is used;
    # the actual fields are obtained by splitting that column on ','.
    raw = pd.read_csv(source_path, header=None, delimiter=";")
    fields = raw[0].str.split(",", expand=True)

    # Keep rows 10..94 only. NOTE(review): presumably the rows outside this
    # window are header/footer text in the bronze files -- confirm.
    # The explicit copy lets us rename columns without touching `fields`
    # and without triggering pandas' SettingWithCopy warning.
    records = fields.iloc[10:95].copy()

    # Assign the new column names.
    records.columns = column_names
    records = records.reset_index(drop=True)

    # Drop the "SUBDIVISION" column, then make every remaining column numeric.
    records = records.drop(columns=["SUBDIVISION"])
    records = records.apply(pd.to_numeric)

    # to_csv does not create missing directories, so do it up front.
    os.makedirs(target_dir, exist_ok=True)
    records.to_csv(
        target_path,
        index=False,
        sep=";",
        columns=output_columns,
        encoding="utf-8-sig",  # For better Excel compatibility
    )
def create_gold_datafile(country, filename):
    """Add multi-month totals to a "silver" file and save it as "gold".

    Reads ``dataset/silver/<country>/<filename>`` (``;``-separated), converts
    every column to numeric, appends one column per month window listed
    below (the sum of the window's monthly columns) and writes the result to
    ``dataset/gold/<country>/<filename>`` (``;``-separated, UTF-8 with BOM).
    The output directory is created when it does not exist yet.

    Args:
        country: Name of the country sub-folder under ``dataset/silver``.
        filename: Name of the file inside that sub-folder; the gold file is
            written under the same name.

    Raises:
        KeyError: If a required monthly column is missing from the input.
        ValueError: If a value cannot be converted to a number.
    """
    base_columns = [
        "YEAR",
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
        "ANNUAL",
    ]
    # Derived column -> monthly columns that are added together, in the
    # order the derived columns appear in the output file.
    month_windows = [
        ("APR-MAY", ["APR", "MAY"]),
        ("JUL-SEP", ["JUL", "AUG", "SEP"]),
        ("MAR-MAY", ["MAR", "APR", "MAY"]),
        ("APR-JUN", ["APR", "MAY", "JUN"]),
        ("AUG-OCT", ["AUG", "SEP", "OCT"]),
        ("JUL-OCT", ["JUL", "AUG", "SEP", "OCT"]),
    ]

    source_path = os.path.join("dataset/silver", country, filename)
    target_dir = os.path.join("dataset/gold", country)
    target_path = os.path.join(target_dir, filename)

    # The silver writer emits a UTF-8 BOM; "utf-8-sig" strips it when present
    # and still reads plain UTF-8, so the first header is always "YEAR".
    table = pd.read_csv(source_path, delimiter=";", encoding="utf-8-sig")
    table = table.apply(pd.to_numeric)

    for label, months in month_windows:
        # Plain left-to-right addition so a missing month yields NaN for the
        # whole window instead of being silently skipped.
        total = table[months[0]]
        for month in months[1:]:
            total = total + table[month]
        table[label] = total

    # to_csv does not create missing directories, so do it up front.
    os.makedirs(target_dir, exist_ok=True)
    table.to_csv(
        target_path,
        index=False,
        sep=";",
        columns=base_columns + [label for label, _ in month_windows],
        encoding="utf-8-sig",  # For better Excel compatibility
    )
if __name__ == "__main__":
    # Script entry point: for every file found in each country's bronze
    # folder, build the silver file and then the gold file.

    # Root folder holding one sub-folder of raw files per country.
    folder_path = 'dataset/bronze/'
    countries = ["cameroon", "ethiopia", "nigeria", "rwanda"]

    for country in countries:
        for file in os.listdir(folder_path + country):
            bronze_datafile = folder_path + country + "/" + file
            # Skip sub-directories and anything else that is not a file.
            if not os.path.isfile(bronze_datafile):
                continue
            try:
                # Gold is derived from silver, so a silver failure must
                # also skip the gold step for this file.
                create_silver_datafile(country, file)
                create_gold_datafile(country, file)
            except Exception as exc:
                # Best effort: one bad file must not stop the whole run, but
                # report it instead of hiding the failure. Catching Exception
                # (not a bare except) lets Ctrl-C / SystemExit through.
                print("Skipping {}: {!r}".format(bronze_datafile, exc))

    print("Dataset pre-processing completed!")