#df = pd.read_csv("data\Electric_Vehicle_Population_Data_fixed.csv", nrows=10) import pandas as pd import numpy as np import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import Dataset, DataLoader, TensorDataset from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.metrics import accuracy_score, classification_report, confusion_matrix import matplotlib.pyplot as plt import seaborn as sns import warnings warnings.filterwarnings('ignore') # Define a simple TabularModel class class TabularModel(nn.Module): def __init__(self, input_size, hidden_sizes, output_size, dropout_rate=0.2): super(TabularModel, self).__init__() layers = [] prev_size = input_size # Create hidden layers for hidden_size in hidden_sizes: layers.extend([ nn.Linear(prev_size, hidden_size), nn.BatchNorm1d(hidden_size), nn.ReLU(), nn.Dropout(dropout_rate) ]) prev_size = hidden_size # Output layer layers.append(nn.Linear(prev_size, output_size)) self.model = nn.Sequential(*layers) def forward(self, x): return self.model(x) # Data preprocessing function def preprocess_data(df, target_column, test_size=0.2): """ Preprocess tabular data for neural network training """ # Separate features and target X = df.drop(columns=[target_column]) y = df[target_column] # Handle categorical variables categorical_columns = X.select_dtypes(include=['object']).columns numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns # Encode categorical variables label_encoders = {} for col in categorical_columns: le = LabelEncoder() X[col] = le.fit_transform(X[col].astype(str)) label_encoders[col] = le # Scale numerical features scaler = StandardScaler() X[numerical_columns] = scaler.fit_transform(X[numerical_columns]) # Encode target variable if it's categorical target_encoder = None if y.dtype == 'object': target_encoder = LabelEncoder() y = target_encoder.fit_transform(y) # Split the data X_train, X_test, y_train, y_test = train_test_split( X.values, y.values, test_size=test_size, random_state=42, stratify=y ) return (X_train, X_test, y_train, y_test, scaler, label_encoders, target_encoder) # Training function def train_model(model, train_loader, val_loader, epochs=100, lr=0.001): """ Train the tabular model """ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10) train_losses = [] val_losses = [] for epoch in range(epochs): # Training phase model.train() train_loss = 0.0 for batch_idx, (data, target) in enumerate(train_loader): data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = criterion(output, target) loss.backward() optimizer.step() train_loss += loss.item() # Validation phase model.eval() val_loss = 0.0 with torch.no_grad(): for data, target in val_loader: data, target = data.to(device), target.to(device) output = model(data) val_loss += criterion(output, target).item() avg_train_loss = train_loss / len(train_loader) avg_val_loss = val_loss / len(val_loader) train_losses.append(avg_train_loss) val_losses.append(avg_val_loss) scheduler.step(avg_val_loss) if (epoch + 1) % 20 == 0: print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}') return train_losses, val_losses # Evaluation function def evaluate_model(model, test_loader, 
target_encoder=None): """ Evaluate the trained model """ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model.eval() all_predictions = [] all_targets = [] with torch.no_grad(): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) predictions = torch.argmax(output, dim=1) all_predictions.extend(predictions.cpu().numpy()) all_targets.extend(target.cpu().numpy()) # Convert back to original labels if target was encoded if target_encoder: all_predictions = target_encoder.inverse_transform(all_predictions) all_targets = target_encoder.inverse_transform(all_targets) accuracy = accuracy_score(all_targets, all_predictions) report = classification_report(all_targets, all_predictions) return accuracy, report, all_predictions, all_targets # Plotting function for training history def plot_training_history(train_losses, val_losses): """ Plot training and validation losses """ plt.figure(figsize=(12, 5)) plt.subplot(1, 2, 1) plt.plot(train_losses, label='Training Loss', color='blue') plt.plot(val_losses, label='Validation Loss', color='red') plt.xlabel('Epoch') plt.ylabel('Loss') plt.title('Training and Validation Loss') plt.legend() plt.grid(True) plt.subplot(1, 2, 2) plt.plot(train_losses, label='Training Loss', color='blue') plt.plot(val_losses, label='Validation Loss', color='red') plt.xlabel('Epoch') plt.ylabel('Loss (Log Scale)') plt.title('Training and Validation Loss (Log Scale)') plt.yscale('log') plt.legend() plt.grid(True) plt.tight_layout() plt.show() # Function to plot confusion matrix def plot_confusion_matrix(y_true, y_pred, labels=None): """ Plot confusion matrix """ cm = confusion_matrix(y_true, y_pred) plt.figure(figsize=(8, 6)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels) plt.xlabel('Predicted') plt.ylabel('Actual') plt.title('Confusion Matrix') plt.show() # Function to save model def save_model(model, filepath, scaler, label_encoders, target_encoder=None): """ Save the trained model and preprocessing objects """ torch.save({ 'model_state_dict': model.state_dict(), 'scaler': scaler, 'label_encoders': label_encoders, 'target_encoder': target_encoder }, filepath) print(f"Model saved to {filepath}") # Function to load model def load_model(filepath, input_size, hidden_sizes, output_size, dropout_rate=0.2): """ Load the trained model and preprocessing objects """ checkpoint = torch.load(filepath) model = TabularModel(input_size, hidden_sizes, output_size, dropout_rate) model.load_state_dict(checkpoint['model_state_dict']) return model, checkpoint['scaler'], checkpoint['label_encoders'], checkpoint['target_encoder'] # Main training pipeline def main(): # Load your CSV file # Replace 'electric_vehicles.csv' with your actual CSV file path #df = pd.read_csv('data\Electric_Vehicle_Population_Data_fixed.csv", nrows=10') df = pd.read_csv("Electric_Vehicle_Population.csv") # Data preprocessing for Electric Vehicle dataset print(f"Original dataset shape: {df.shape}") print(f"Columns: {list(df.columns)}") # Clean and prepare the data # Remove or handle missing values df = df.dropna(subset=['Make', 'Model', 'Electric Vehicle Type', 'Model Year']) # Extract useful features and create target variable # For this example, let's predict Electric Vehicle Type (BEV vs PHEV) df_clean = df.copy() # Clean numeric columns df_clean['Model Year'] = pd.to_numeric(df_clean['Model Year'], errors='coerce') df_clean['Electric Range'] = pd.to_numeric(df_clean['Electric Range'], errors='coerce') 
# Main training pipeline
def main():
    # Load your CSV file; replace the path below with your actual file path, e.g.
    # df = pd.read_csv("data/Electric_Vehicle_Population_Data_fixed.csv", nrows=10)
    df = pd.read_csv("Electric_Vehicle_Population.csv")

    # Data preprocessing for the Electric Vehicle dataset
    print(f"Original dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Clean and prepare the data: remove rows missing key fields
    df = df.dropna(subset=['Make', 'Model', 'Electric Vehicle Type', 'Model Year'])

    # Extract useful features and create the target variable.
    # For this example, we predict Electric Vehicle Type (BEV vs PHEV).
    df_clean = df.copy()

    # Clean numeric columns
    df_clean['Model Year'] = pd.to_numeric(df_clean['Model Year'], errors='coerce')
    df_clean['Electric Range'] = pd.to_numeric(df_clean['Electric Range'], errors='coerce')
    df_clean['Base MSRP'] = pd.to_numeric(df_clean['Base MSRP'], errors='coerce')
    df_clean['Legislative District'] = pd.to_numeric(df_clean['Legislative District'], errors='coerce')

    # Fill missing values
    df_clean['Electric Range'] = df_clean['Electric Range'].fillna(df_clean['Electric Range'].median())
    df_clean['Base MSRP'] = df_clean['Base MSRP'].fillna(df_clean['Base MSRP'].median())
    df_clean['Legislative District'] = df_clean['Legislative District'].fillna(0)

    # Create binary target: BEV (1) vs PHEV (0)
    df_clean['target'] = (df_clean['Electric Vehicle Type'] == 'Battery Electric Vehicle (BEV)').astype(int)

    # Select relevant features for training
    feature_columns = [
        'Model Year', 'Make', 'Model', 'Electric Range', 'Base MSRP',
        'Legislative District', 'County', 'State',
        'Clean Alternative Fuel Vehicle (CAFV) Eligibility'
    ]

    # Create the final dataset with the selected features
    df_final = df_clean[feature_columns + ['target']].copy()

    # Clean column names for easier handling
    df_final.columns = [
        'model_year', 'make', 'model', 'electric_range', 'base_msrp',
        'legislative_district', 'county', 'state', 'cafv_eligibility', 'target'
    ]

    # Handle categorical variables with too many categories:
    # keep only the top N categories for Make, Model, and County
    top_makes = df_final['make'].value_counts().head(10).index
    df_final['make'] = df_final['make'].apply(lambda x: x if x in top_makes else 'OTHER')

    top_models = df_final['model'].value_counts().head(15).index
    df_final['model'] = df_final['model'].apply(lambda x: x if x in top_models else 'OTHER')

    top_counties = df_final['county'].value_counts().head(20).index
    df_final['county'] = df_final['county'].apply(lambda x: x if x in top_counties else 'OTHER')

    # Drop any remaining rows with missing values
    df_final = df_final.dropna()
    df = df_final

    print(f"Processed dataset shape: {df.shape}")
    print("Target distribution:")
    print(f"BEV (1): {(df['target'] == 1).sum()}")
    print(f"PHEV (0): {(df['target'] == 0).sum()}")

    # Specify your target column name
    target_column = 'target'

    # Preprocess the data
    X_train, X_test, y_train, y_test, scaler, label_encoders, target_encoder = preprocess_data(
        df, target_column
    )

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.LongTensor(y_train)
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.LongTensor(y_test)

    # Create a validation split from the training data
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_tensor, y_train_tensor, test_size=0.2, random_state=42,
        stratify=y_train_tensor
    )

    # Create data loaders
    batch_size = 64
    train_dataset = TensorDataset(X_train_split, y_train_split)
    val_dataset = TensorDataset(X_val_split, y_val_split)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model parameters
    input_size = X_train.shape[1]
    hidden_sizes = [128, 64, 32]  # You can adjust these
    output_size = len(np.unique(y_train))

    # Create the model
    model = TabularModel(
        input_size=input_size,
        hidden_sizes=hidden_sizes,
        output_size=output_size,
        dropout_rate=0.3
    )

    print("\nModel architecture:")
    print(f"Input size: {input_size}")
    print(f"Hidden layers: {hidden_sizes}")
    print(f"Output size: {output_size}")
    print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")
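
    # Optional sketch (assumes the BEV/PHEV classes are imbalanced): class
    # weights could be computed and passed to nn.CrossEntropyLoss(weight=...).
    # Note that train_model as written does not expose a criterion argument,
    # so this is not wired in.
    #
    # from sklearn.utils.class_weight import compute_class_weight
    # class_weights = compute_class_weight('balanced',
    #                                      classes=np.unique(y_train), y=y_train)
    # class_weights = torch.FloatTensor(class_weights)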
    # Train the model
    print("\nStarting training...")
    epochs = 100
    learning_rate = 0.001

    train_losses, val_losses = train_model(
        model, train_loader, val_loader, epochs=epochs, lr=learning_rate
    )

    # Plot training history
    plot_training_history(train_losses, val_losses)

    # Evaluate the model
    print("\nEvaluating model on test set...")
    accuracy, report, predictions, targets = evaluate_model(model, test_loader, target_encoder)

    print(f"Test Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    # Plot confusion matrix (classes are 0 = PHEV, 1 = BEV)
    labels = ['PHEV', 'BEV'] if target_encoder is None else None
    plot_confusion_matrix(targets, predictions, labels)

    # Save the model
    model_filepath = 'ev_classifier_model.pth'
    save_model(model, model_filepath, scaler, label_encoders, target_encoder)

    print("\nTraining completed successfully!")
    print(f"Final test accuracy: {accuracy:.4f}")

    return model, scaler, label_encoders, target_encoder


# Function to make predictions on new data
def predict_new_data(model, new_data, scaler, label_encoders, target_encoder=None):
    """
    Make predictions on new data.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # Preprocess new data
    new_data_processed = new_data.copy()

    # Record the numerical columns BEFORE label encoding; encoding turns the
    # categorical columns numeric, which would otherwise hand the scaler the
    # wrong set of columns
    numerical_columns = new_data_processed.select_dtypes(include=['int64', 'float64']).columns

    # Apply label encoders
    for col, encoder in label_encoders.items():
        if col in new_data_processed.columns:
            # Map unseen categories to 'OTHER' (seen during training for
            # make/model/county; other columns may still raise on new values)
            new_data_processed[col] = new_data_processed[col].astype(str).apply(
                lambda x: x if x in encoder.classes_ else 'OTHER'
            )
            new_data_processed[col] = encoder.transform(new_data_processed[col])

    # Apply the scaler to the original numerical columns
    new_data_processed[numerical_columns] = scaler.transform(new_data_processed[numerical_columns])

    # Convert to tensor
    X_new = torch.FloatTensor(new_data_processed.values).to(device)

    # Make predictions
    with torch.no_grad():
        outputs = model(X_new)
        probabilities = torch.softmax(outputs, dim=1)
        predictions = torch.argmax(outputs, dim=1)

    # Convert back to original labels if needed
    if target_encoder:
        predictions = target_encoder.inverse_transform(predictions.cpu().numpy())
    else:
        predictions = predictions.cpu().numpy()

    return predictions, probabilities.cpu().numpy()


if __name__ == "__main__":
    # Run the main training pipeline
    model, scaler, label_encoders, target_encoder = main()

    # Example of how to use the trained model for predictions.
    # Uncomment and modify the following code to make predictions on new data.
    #
    # new_data = pd.DataFrame({
    #     'model_year': [2020, 2021, 2019],
    #     'make': ['TESLA', 'NISSAN', 'CHEVROLET'],
    #     'model': ['MODEL S', 'LEAF', 'BOLT EV'],
    #     'electric_range': [370, 150, 259],
    #     'base_msrp': [80000, 32000, 32000],
    #     'legislative_district': [43, 11, 36],
    #     'county': ['King', 'Snohomish', 'Pierce'],
    #     'state': ['WA', 'WA', 'WA'],
    #     'cafv_eligibility': ['Clean Alternative Fuel Vehicle Eligible',
    #                          'Clean Alternative Fuel Vehicle Eligible',
    #                          'Clean Alternative Fuel Vehicle Eligible']
    # })
    #
    # predictions, probabilities = predict_new_data(model, new_data, scaler, label_encoders, target_encoder)
    # print(f"Predictions: {predictions}")
    # print(f"Probabilities: {probabilities}")
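
    # Sketch: the outputs of predict_new_data could then be persisted for
    # inspection. The file and column names here are illustrative assumptions;
    # class 0 corresponds to PHEV and class 1 to BEV under the target above.
    #
    # pd.DataFrame({
    #     'prediction': predictions,
    #     'prob_phev': probabilities[:, 0],
    #     'prob_bev': probabilities[:, 1],
    # }).to_csv('ev_predictions.csv', index=False)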