"""Split a CSV file into one file per (Cluster, Type, Indicator) combination."""

import os
import sys
from pathlib import Path

import pandas as pd

# Columns the input is partitioned on, outermost first.
_SPLIT_COLUMNS = ['Cluster', 'Type', 'Indicator']

# Characters replaced by '_' when a key value becomes part of a filename.
_FILENAME_TRANSLATION = str.maketrans({'/': '_', '\\': '_', ' ': '_'})


def _filename_part(value):
    """Return *value* as text with path separators and spaces replaced by '_'."""
    return str(value).translate(_FILENAME_TRANSLATION)


def _unique_filename(stem, used):
    """Return '<stem>.csv', or '<stem>_<n>.csv' if that name is already in *used*.

    The chosen name is added to *used* so later calls cannot reuse it.
    """
    candidate = f"{stem}.csv"
    suffix = 2
    while candidate in used:
        candidate = f"{stem}_{suffix}.csv"
        suffix += 1
    used.add(candidate)
    return candidate


def _iter_nested_groups(df, keys):
    """Yield (key_values, sub_frame) for every nested combination of *keys*.

    Groups are visited in order of first appearance at each level. Rows whose
    key is missing (NaN) are not yielded, because groupby drops them by default.
    """
    if not keys:
        yield (), df
        return
    head, *rest = keys
    for value, sub_frame in df.groupby(head, sort=False):
        for values, leaf in _iter_nested_groups(sub_frame, rest):
            yield (value, *values), leaf


def split_csv_hierarchically(input_file, output_dir='output'):
    """
    Split a CSV file by Cluster, then Type, then Indicator.

    One output file is written per (Cluster, Type, Indicator) combination.
    The three splitting columns are removed from the output and their values
    are encoded in the filename as ``<Cluster>_<Type>_<Indicator>.csv`` with
    '/', '\\' and spaces replaced by '_'. If two different combinations map to
    the same filename, the later one gets a numeric suffix (``_2``, ``_3``, ...)
    instead of overwriting the earlier file.

    Before splitting, column headers are stripped of surrounding whitespace,
    a 'Year' column (if present) is converted to nullable integers, and a
    'Value' column (if present) has commas and spaces removed and is converted
    to numbers; unparseable entries become empty.

    Rows with a missing Cluster, Type or Indicator cannot be assigned to a
    file; they are skipped and their count is reported.

    Args:
        input_file: Path to the input CSV file
        output_dir: Directory where output files will be saved

    Raises:
        ValueError: If a required splitting column is absent from the CSV.
        FileNotFoundError: If *input_file* does not exist.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # low_memory=False reads the file in one pass, avoiding mixed-dtype warnings
    print(f"Reading {input_file}...")
    df = pd.read_csv(input_file, low_memory=False)

    # Strip whitespace from all column headers
    df.columns = df.columns.str.strip()
    print("Stripped whitespace from column headers")

    if 'Year' in df.columns:
        # Nullable Int64 (not int64) so missing years stay empty, not 0 or NaN
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
        print("Converted 'Year' column to integers (preserving empty values)")

    if 'Value' in df.columns:
        # Drop thousands separators (commas, spaces) before numeric conversion
        cleaned = df['Value'].astype(str).str.replace(',', '', regex=False)
        cleaned = cleaned.str.replace(' ', '', regex=False)
        df['Value'] = pd.to_numeric(cleaned, errors='coerce')
        print("Cleaned and converted 'Value' column to numbers")

    # Verify required columns exist
    for col in _SPLIT_COLUMNS:
        if col not in df.columns:
            raise ValueError(
                f"Column '{col}' not found in CSV. "
                f"Available columns: {df.columns.tolist()}"
            )

    print(f"Found {len(df['Cluster'].unique())} unique clusters")

    # Rows with a missing key never match any group; report them rather than
    # dropping them silently.
    skipped = int(df[_SPLIT_COLUMNS].isna().any(axis=1).sum())
    if skipped:
        print(
            f"Warning: skipped {skipped} rows with missing "
            "Cluster/Type/Indicator values"
        )

    used_filenames = set()
    for key_values, group_df in _iter_nested_groups(df, _SPLIT_COLUMNS):
        # Remove the splitting columns; their values live in the filename
        output_df = group_df.drop(columns=_SPLIT_COLUMNS)

        stem = "_".join(_filename_part(value) for value in key_values)
        filename = _unique_filename(stem, used_filenames)
        if filename != f"{stem}.csv":
            print(
                f"Warning: filename '{stem}.csv' already used by another "
                f"group; writing '{filename}' instead"
            )

        output_path = os.path.join(output_dir, filename)
        output_df.to_csv(output_path, index=False)
        print(f"Created: {filename} ({len(output_df)} rows)")

    print(f"\nAll files saved to '{output_dir}' directory")


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Argument list without the program name; defaults to sys.argv[1:].

    Returns:
        0 on success, 1 on a usage error or any failure while splitting.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if not args:
        print(
            "Usage: python split_csv.py <input_file> [output_directory]",
            file=sys.stderr,
        )
        print("\nExamples:", file=sys.stderr)
        print("  python split_csv.py mydata.csv", file=sys.stderr)
        print("  python split_csv.py mydata.csv split_output", file=sys.stderr)
        print(
            "  python split_csv.py /path/to/mydata.csv /path/to/output",
            file=sys.stderr,
        )
        return 1

    input_csv = args[0]
    # Output directory is optional, defaults to 'output'
    output_dir = args[1] if len(args) > 1 else 'output'

    try:
        split_csv_hierarchically(input_csv, output_dir=output_dir)
    except FileNotFoundError:
        print(f"Error: File '{input_csv}' not found.", file=sys.stderr)
        return 1
    except Exception as e:  # top-level boundary: report and exit non-zero
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0


# Main execution
if __name__ == "__main__":
    sys.exit(main())