Skip to content

Commit ff9ec8b

Browse files
committed
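Enhance data processing in pipeline.py: coerce the energy indicator columns ('Energy use per capita', 'Renewable energy consumption', 'Fossil fuel energy consumption') and the 'date' column to a numeric dtype with pd.to_numeric(errors='coerce'), so values that cannot be parsed become NaN, and update the calculation of regional averages to exclude non-numeric columns. Also remove the leftover debug code in process_data that wrote output/energy_data.csv and called exit() before any processing ran, and add energy_data.csv to .gitignore. This improves data integrity and prepares the DataFrame for further analysis.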
1 parent 381c712 commit ff9ec8b

File tree

2 files changed

+13
-6
lines changed

2 files changed

+13
-6
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
energy_data.csv

pipeline.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,12 @@ def fetch_world_bank_data():
6464
@task
6565
def process_data(df):
6666
"""Process and clean the energy data"""
67+
# Ensure numeric columns are float type
68+
numeric_columns = ['Energy use per capita', 'Renewable energy consumption', 'Fossil fuel energy consumption']
69+
for col in numeric_columns:
70+
df[col] = pd.to_numeric(df[col], errors='coerce')
71+
6772
# Calculate year-over-year changes
68-
# write df to csv in the output directory
69-
# create output directory if it doesn't exist
70-
os.makedirs('output', exist_ok=True)
71-
df.to_csv('output/energy_data.csv', index=False)
72-
exit()
7373
df['Renewable Growth'] = df.groupby('country')['Renewable energy consumption'].pct_change()
7474

7575
# Calculate energy transition score (higher renewable %, lower fossil fuel %)
@@ -83,7 +83,13 @@ def process_data(df):
8383

8484
# Calculate regional averages
8585
df['Region'] = df['country'].map(lambda x: get_region(x))
86-
regional_avg = df.groupby(['Region', 'date']).mean().reset_index()
86+
87+
# Ensure date is numeric for calculations
88+
df['date'] = pd.to_numeric(df['date'], errors='coerce')
89+
90+
# Calculate regional averages, excluding non-numeric columns
91+
numeric_cols = df.select_dtypes(include=[np.number]).columns
92+
regional_avg = df.groupby(['Region', 'date'])[numeric_cols].mean().reset_index()
8793

8894
return df, latest_data, regional_avg
8995

0 commit comments

Comments
 (0)