import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Define a simple TabularModel class
class TabularModel(nn.Module):
def __init__(self, input_size, hidden_sizes, output_size, dropout_rate=0.2):
        super().__init__()
layers = []
prev_size = input_size
# Create hidden layers
for hidden_size in hidden_sizes:
layers.extend([
nn.Linear(prev_size, hidden_size),
nn.BatchNorm1d(hidden_size),
nn.ReLU(),
nn.Dropout(dropout_rate)
])
prev_size = hidden_size
# Output layer
layers.append(nn.Linear(prev_size, output_size))
self.model = nn.Sequential(*layers)
def forward(self, x):
return self.model(x)
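
# Minimal shape sanity check for TabularModel (hypothetical sizes, not part of
# the training pipeline):
#   m = TabularModel(input_size=9, hidden_sizes=[32, 16], output_size=2)
#   m.eval()  # eval mode; BatchNorm1d rejects a batch of size 1 in train mode
#   print(m(torch.randn(1, 9)).shape)  # -> torch.Size([1, 2])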
# Data preprocessing function
def preprocess_data(df, target_column, test_size=0.2):
"""
Preprocess tabular data for neural network training
"""
# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]
# Handle categorical variables
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
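    # Note: this dtype filter is exact, so float32/int32 columns would be
    # skipped; select_dtypes(include=[np.number]) is the broader alternative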
# Encode categorical variables
label_encoders = {}
for col in categorical_columns:
le = LabelEncoder()
X[col] = le.fit_transform(X[col].astype(str))
label_encoders[col] = le
# Scale numerical features
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])
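    # Caveat: the scaler and encoders here are fit on the full dataset before
    # the split, so test-set statistics leak into preprocessing; fitting them
    # on the training split only is the stricter approach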
# Encode target variable if it's categorical
target_encoder = None
if y.dtype == 'object':
target_encoder = LabelEncoder()
        y = target_encoder.fit_transform(y)  # fit_transform returns a numpy array
    # Split the data (np.asarray handles both the raw Series and the encoded
    # array; y.values would fail on the numpy-array case)
    X_train, X_test, y_train, y_test = train_test_split(
        X.values, np.asarray(y), test_size=test_size, random_state=42, stratify=y
    )
return (X_train, X_test, y_train, y_test, scaler, label_encoders, target_encoder)
# Training function
def train_model(model, train_loader, val_loader, epochs=100, lr=0.001):
"""
Train the tabular model
"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)
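    # ReduceLROnPlateau defaults: mode='min', factor=0.1, so the LR is cut 10x
    # after `patience` epochs without improvement in the validation loss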
train_losses = []
val_losses = []
for epoch in range(epochs):
# Training phase
model.train()
train_loss = 0.0
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
train_loss += loss.item()
# Validation phase
model.eval()
val_loss = 0.0
with torch.no_grad():
for data, target in val_loader:
data, target = data.to(device), target.to(device)
output = model(data)
val_loss += criterion(output, target).item()
avg_train_loss = train_loss / len(train_loader)
avg_val_loss = val_loss / len(val_loader)
train_losses.append(avg_train_loss)
val_losses.append(avg_val_loss)
scheduler.step(avg_val_loss)
if (epoch + 1) % 20 == 0:
print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')
return train_losses, val_losses
# Evaluation function
def evaluate_model(model, test_loader, target_encoder=None):
"""
Evaluate the trained model
"""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)  # make sure the model is on the same device as the batches
    model.eval()
all_predictions = []
all_targets = []
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
predictions = torch.argmax(output, dim=1)
all_predictions.extend(predictions.cpu().numpy())
all_targets.extend(target.cpu().numpy())
# Convert back to original labels if target was encoded
if target_encoder:
all_predictions = target_encoder.inverse_transform(all_predictions)
all_targets = target_encoder.inverse_transform(all_targets)
accuracy = accuracy_score(all_targets, all_predictions)
report = classification_report(all_targets, all_predictions)
return accuracy, report, all_predictions, all_targets
# Plotting function for training history
def plot_training_history(train_losses, val_losses):
"""
Plot training and validation losses
"""
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss', color='blue')
plt.plot(val_losses, label='Validation Loss', color='red')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(train_losses, label='Training Loss', color='blue')
plt.plot(val_losses, label='Validation Loss', color='red')
plt.xlabel('Epoch')
plt.ylabel('Loss (Log Scale)')
plt.title('Training and Validation Loss (Log Scale)')
plt.yscale('log')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels=None):
"""
Plot confusion matrix
"""
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# Function to save model
def save_model(model, filepath, scaler, label_encoders, target_encoder=None):
"""
Save the trained model and preprocessing objects
"""
torch.save({
'model_state_dict': model.state_dict(),
'scaler': scaler,
'label_encoders': label_encoders,
'target_encoder': target_encoder
}, filepath)
print(f"Model saved to {filepath}")
# Function to load model
def load_model(filepath, input_size, hidden_sizes, output_size, dropout_rate=0.2):
"""
Load the trained model and preprocessing objects
"""
    checkpoint = torch.load(filepath, weights_only=False)  # checkpoint stores sklearn objects, so a full unpickle is required (PyTorch >= 2.6 defaults to weights_only=True)
model = TabularModel(input_size, hidden_sizes, output_size, dropout_rate)
model.load_state_dict(checkpoint['model_state_dict'])
return model, checkpoint['scaler'], checkpoint['label_encoders'], checkpoint['target_encoder']
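
# Example call (hypothetical sizes; they must match the architecture used when
# the checkpoint was saved):
#   model, scaler, encoders, t_enc = load_model(
#       'ev_classifier_model.pth', input_size=9,
#       hidden_sizes=[128, 64, 32], output_size=2, dropout_rate=0.3)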
# Main training pipeline
def main():
    # Load the CSV file; replace "Electric_Vehicle_Population.csv" with your actual path
    # df = pd.read_csv("data/Electric_Vehicle_Population_Data_fixed.csv", nrows=10)  # quick smoke test on a small sample
    df = pd.read_csv("Electric_Vehicle_Population.csv")
# Data preprocessing for Electric Vehicle dataset
print(f"Original dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
# Clean and prepare the data
# Remove or handle missing values
df = df.dropna(subset=['Make', 'Model', 'Electric Vehicle Type', 'Model Year'])
# Extract useful features and create target variable
# For this example, let's predict Electric Vehicle Type (BEV vs PHEV)
df_clean = df.copy()
# Clean numeric columns
df_clean['Model Year'] = pd.to_numeric(df_clean['Model Year'], errors='coerce')
df_clean['Electric Range'] = pd.to_numeric(df_clean['Electric Range'], errors='coerce')
df_clean['Base MSRP'] = pd.to_numeric(df_clean['Base MSRP'], errors='coerce')
df_clean['Legislative District'] = pd.to_numeric(df_clean['Legislative District'], errors='coerce')
# Fill missing values
df_clean['Electric Range'] = df_clean['Electric Range'].fillna(df_clean['Electric Range'].median())
df_clean['Base MSRP'] = df_clean['Base MSRP'].fillna(df_clean['Base MSRP'].median())
df_clean['Legislative District'] = df_clean['Legislative District'].fillna(0)
# Create binary target: BEV vs PHEV
df_clean['target'] = (df_clean['Electric Vehicle Type'] == 'Battery Electric Vehicle (BEV)').astype(int)
# Select relevant features for training
feature_columns = [
'Model Year', 'Make', 'Model', 'Electric Range', 'Base MSRP',
'Legislative District', 'County', 'State', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility'
]
# Create final dataset with selected features
df_final = df_clean[feature_columns + ['target']].copy()
# Clean column names for easier handling
df_final.columns = [
'model_year', 'make', 'model', 'electric_range', 'base_msrp',
'legislative_district', 'county', 'state', 'cafv_eligibility', 'target'
]
# Handle categorical variables with too many categories
# Keep only top N categories for Make and Model
top_makes = df_final['make'].value_counts().head(10).index
df_final['make'] = df_final['make'].apply(lambda x: x if x in top_makes else 'OTHER')
top_models = df_final['model'].value_counts().head(15).index
df_final['model'] = df_final['model'].apply(lambda x: x if x in top_models else 'OTHER')
top_counties = df_final['county'].value_counts().head(20).index
df_final['county'] = df_final['county'].apply(lambda x: x if x in top_counties else 'OTHER')
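    # The 'OTHER' buckets above double as the fallback class for unseen
    # categories at inference time (see predict_new_data below)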
    # Drop any remaining rows with missing values
    df_final = df_final.dropna()
df = df_final
print(f"Processed dataset shape: {df.shape}")
print(f"Target distribution:")
print(f"BEV (1): {(df['target'] == 1).sum()}")
print(f"PHEV (0): {(df['target'] == 0).sum()}")
# Specify your target column name
target_column = 'target'
# Preprocess the data
X_train, X_test, y_train, y_test, scaler, label_encoders, target_encoder = preprocess_data(
df, target_column
)
# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)
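    # CrossEntropyLoss expects integer class indices, hence LongTensor targets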
# Create validation split from training data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
X_train_tensor, y_train_tensor, test_size=0.2, random_state=42, stratify=y_train_tensor
)
# Create data loaders
batch_size = 64
train_dataset = TensorDataset(X_train_split, y_train_split)
val_dataset = TensorDataset(X_val_split, y_val_split)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)  # drop_last avoids a stray size-1 batch, which BatchNorm1d rejects in train mode
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Model parameters
input_size = X_train.shape[1]
hidden_sizes = [128, 64, 32] # You can adjust these
output_size = len(np.unique(y_train))
# Create the model
model = TabularModel(
input_size=input_size,
hidden_sizes=hidden_sizes,
output_size=output_size,
dropout_rate=0.3
)
print(f"\nModel architecture:")
print(f"Input size: {input_size}")
print(f"Hidden layers: {hidden_sizes}")
print(f"Output size: {output_size}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")
# Train the model
print("\nStarting training...")
epochs = 100
learning_rate = 0.001
train_losses, val_losses = train_model(
model, train_loader, val_loader, epochs=epochs, lr=learning_rate
)
# Plot training history
plot_training_history(train_losses, val_losses)
# Evaluate the model
print("\nEvaluating model on test set...")
accuracy, report, predictions, targets = evaluate_model(model, test_loader, target_encoder)
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)
# Plot confusion matrix
labels = ['PHEV', 'BEV'] if target_encoder is None else None
plot_confusion_matrix(targets, predictions, labels)
# Save the model
model_filepath = 'ev_classifier_model.pth'
save_model(model, model_filepath, scaler, label_encoders, target_encoder)
print(f"\nTraining completed successfully!")
print(f"Final test accuracy: {accuracy:.4f}")
return model, scaler, label_encoders, target_encoder
# Function to make predictions on new data
def predict_new_data(model, new_data, scaler, label_encoders, target_encoder=None):
"""
Make predictions on new data
"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
    # Preprocess new data; column order must match the training feature order
    new_data_processed = new_data.copy()
    # Identify numerical columns BEFORE label encoding, otherwise the encoded
    # categorical columns (now integers) would be passed to the scaler as well
    numerical_columns = new_data_processed.select_dtypes(include=['int64', 'float64']).columns
    # Apply label encoders
    for col, encoder in label_encoders.items():
        if col in new_data_processed.columns:
            # Map unseen categories to 'OTHER'; this only works for columns whose
            # encoder actually saw 'OTHER' during fitting (make, model, county here)
            new_data_processed[col] = new_data_processed[col].apply(
                lambda x: x if x in encoder.classes_ else 'OTHER'
            )
            new_data_processed[col] = encoder.transform(new_data_processed[col].astype(str))
    # Apply the scaler to the original numerical columns only
    new_data_processed[numerical_columns] = scaler.transform(new_data_processed[numerical_columns])
# Convert to tensor
X_new = torch.FloatTensor(new_data_processed.values)
X_new = X_new.to(device)
# Make predictions
with torch.no_grad():
outputs = model(X_new)
probabilities = torch.softmax(outputs, dim=1)
predictions = torch.argmax(outputs, dim=1)
# Convert back to original labels if needed
if target_encoder:
predictions = target_encoder.inverse_transform(predictions.cpu().numpy())
else:
predictions = predictions.cpu().numpy()
return predictions, probabilities.cpu().numpy()
if __name__ == "__main__":
# Run the main training pipeline
model, scaler, label_encoders, target_encoder = main()
# Example of how to use the trained model for predictions
# Uncomment and modify the following code to make predictions on new data
# # Load new data for prediction
# new_data = pd.DataFrame({
# 'model_year': [2020, 2021, 2019],
# 'make': ['TESLA', 'NISSAN', 'CHEVROLET'],
# 'model': ['MODEL S', 'LEAF', 'BOLT EV'],
# 'electric_range': [370, 150, 259],
# 'base_msrp': [80000, 32000, 32000],
# 'legislative_district': [43, 11, 36],
# 'county': ['King', 'Snohomish', 'Pierce'],
# 'state': ['WA', 'WA', 'WA'],
# 'cafv_eligibility': ['Clean Alternative Fuel Vehicle Eligible',
# 'Clean Alternative Fuel Vehicle Eligible',
# 'Clean Alternative Fuel Vehicle Eligible']
# })
#
# predictions, probabilities = predict_new_data(model, new_data, scaler, label_encoders, target_encoder)
# print(f"Predictions: {predictions}")
# print(f"Probabilities: {probabilities}")