Slide 1: Introduction to Support Vector Machines
Support Vector Machines (SVM) are powerful supervised learning models used for classification and regression tasks. They work by finding the optimal hyperplane that separates different classes in a high-dimensional space. SVMs are particularly effective in handling non-linear decision boundaries and can work well with both small and large datasets.
import numpy as np
import matplotlib.pyplot as plt
# Generate sample data
np.random.seed(0)
X = np.random.randn(100, 2)
y = np.where(X[:, 0] + X[:, 1] > 0, 1, -1)
# Plot the data
plt.scatter(X[y==1, 0], X[y==1, 1], c='b', label='Class 1')
plt.scatter(X[y==-1, 0], X[y==-1, 1], c='r', label='Class -1')
plt.legend()
plt.title('Sample Data for SVM')
plt.show()
Slide 2: The Optimization Problem
The core of SVM is an optimization problem. We aim to find the hyperplane that maximizes the margin between classes while minimizing classification errors. This involves solving a quadratic programming problem with linear constraints.
def linear_kernel(x1, x2):
return np.dot(x1, x2)
def svm_optimization(X, y, kernel, C=1.0, tol=1e-3, max_passes=5):
m, n = X.shape
alphas = np.zeros(m)
b = 0
passes = 0
while passes < max_passes:
num_changed_alphas = 0
for i in range(m):
Ei = np.sum(alphas * y * kernel(X[i], X.T)) + b - y[i]
if (y[i]*Ei < -tol and alphas[i] < C) or (y[i]*Ei > tol and alphas[i] > 0):
j = np.random.choice([k for k in range(m) if k != i])
Ej = np.sum(alphas * y * kernel(X[j], X.T)) + b - y[j]
alpha_i_old, alpha_j_old = alphas[i], alphas[j]
L, H = max(0, alphas[j] - alphas[i]), min(C, C + alphas[j] - alphas[i]) if y[i] != y[j] else max(0, alphas[i] + alphas[j] - C), min(C, alphas[i] + alphas[j])
if L == H:
continue
eta = 2 * kernel(X[i], X[j]) - kernel(X[i], X[i]) - kernel(X[j], X[j])
if eta >= 0:
continue
alphas[j] -= y[j] * (Ei - Ej) / eta
alphas[j] = np.clip(alphas[j], L, H)
if abs(alphas[j] - alpha_j_old) < 1e-5:
continue
alphas[i] += y[i] * y[j] * (alpha_j_old - alphas[j])
b1 = b - Ei - y[i] * (alphas[i] - alpha_i_old) * kernel(X[i], X[i]) - y[j] * (alphas[j] - alpha_j_old) * kernel(X[i], X[j])
b2 = b - Ej - y[i] * (alphas[i] - alpha_i_old) * kernel(X[i], X[j]) - y[j] * (alphas[j] - alpha_j_old) * kernel(X[j], X[j])
if 0 < alphas[i] < C:
b = b1
elif 0 < alphas[j] < C:
b = b2
else:
b = (b1 + b2) / 2
num_changed_alphas += 1
if num_changed_alphas == 0:
passes += 1
else:
passes = 0
return alphas, b
Slide 3: Implementing the SVM Class
Let's create a Python class to encapsulate our SVM implementation. This class will handle the training process and make predictions on new data points.
class SVM:
def __init__(self, kernel=linear_kernel, C=1.0):
self.kernel = kernel
self.C = C
self.alphas = None
self.b = None
self.support_vectors = None
self.support_vector_labels = None
def fit(self, X, y):
self.X = X
self.y = y
self.alphas, self.b = svm_optimization(X, y, self.kernel, self.C)
support_vector_indices = np.where(self.alphas > 1e-5)[0]
self.support_vectors = X[support_vector_indices]
self.support_vector_labels = y[support_vector_indices]
self.alphas = self.alphas[support_vector_indices]
def predict(self, X):
y_pred = np.sum(self.alphas * self.support_vector_labels * self.kernel(self.support_vectors, X.T), axis=0) + self.b
return np.sign(y_pred)
# Usage example
svm = SVM()
svm.fit(X, y)
y_pred = svm.predict(X)
accuracy = np.mean(y_pred == y)
print(f"Accuracy: {accuracy:.2f}")
Slide 4: Visualizing the Decision Boundary
To better understand how our SVM works, let's visualize the decision boundary it creates. This will help us see how the algorithm separates different classes in the feature space.
def plot_decision_boundary(svm, X, y):
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
np.arange(y_min, y_max, 0.1))
Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)
plt.scatter(svm.support_vectors[:, 0], svm.support_vectors[:, 1],
s=80, facecolors='none', edgecolors='k')
plt.title('SVM Decision Boundary')
plt.show()
plot_decision_boundary(svm, X, y)
Slide 5: Implementing Non-linear Kernels
SVMs can handle non-linear decision boundaries by using kernel functions. Let's implement the Radial Basis Function (RBF) kernel, which is commonly used for non-linear classification problems.
def rbf_kernel(x1, x2, gamma=1.0):
return np.exp(-gamma * np.linalg.norm(x1[:, np.newaxis] - x2[np.newaxis, :], axis=2)**2)
# Generate non-linearly separable data
np.random.seed(0)
X = np.random.randn(200, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int) * 2 - 1
# Train SVM with RBF kernel
svm_rbf = SVM(kernel=lambda x1, x2: rbf_kernel(x1, x2, gamma=0.5), C=1.0)
svm_rbf.fit(X, y)
# Plot decision boundary
plot_decision_boundary(svm_rbf, X, y)
Slide 6: Handling Imbalanced Datasets
In real-world scenarios, we often encounter imbalanced datasets. Let's modify our SVM implementation to handle class imbalance by introducing class weights.
class WeightedSVM(SVM):
def __init__(self, kernel=linear_kernel, C=1.0, class_weight=None):
super().__init__(kernel, C)
self.class_weight = class_weight
def fit(self, X, y):
if self.class_weight is None:
sample_weights = np.ones(len(y))
else:
sample_weights = np.array([self.class_weight[label] for label in y])
self.X = X
self.y = y
self.alphas, self.b = svm_optimization(X, y, self.kernel, self.C * sample_weights)
support_vector_indices = np.where(self.alphas > 1e-5)[0]
self.support_vectors = X[support_vector_indices]
self.support_vector_labels = y[support_vector_indices]
self.alphas = self.alphas[support_vector_indices]
# Example usage with imbalanced data
np.random.seed(0)
X_imbalanced = np.random.randn(300, 2)
y_imbalanced = np.concatenate([np.ones(250), -np.ones(50)])
svm_weighted = WeightedSVM(class_weight={1: 1, -1: 5})
svm_weighted.fit(X_imbalanced, y_imbalanced)
plot_decision_boundary(svm_weighted, X_imbalanced, y_imbalanced)
Slide 7: Implementing Cross-Validation
To ensure our SVM model generalizes well, let's implement k-fold cross-validation. This technique helps us assess the model's performance and tune hyperparameters.
from sklearn.model_selection import KFold
def cross_validate_svm(X, y, kernel, C, k=5):
kf = KFold(n_splits=k, shuffle=True, random_state=42)
accuracies = []
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
svm = SVM(kernel=kernel, C=C)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy = np.mean(y_pred == y_test)
accuracies.append(accuracy)
return np.mean(accuracies), np.std(accuracies)
# Example usage
C_values = [0.1, 1, 10]
for C in C_values:
mean_acc, std_acc = cross_validate_svm(X, y, linear_kernel, C)
print(f"C = {C}: Accuracy = {mean_acc:.3f} ± {std_acc:.3f}")
Slide 8: Implementing Grid Search
To find the best hyperparameters for our SVM model, let's implement a simple grid search algorithm. This will help us optimize the model's performance.
def grid_search_svm(X, y, kernel, C_values, gamma_values=None):
best_params = {}
best_score = -np.inf
for C in C_values:
if gamma_values is not None:
for gamma in gamma_values:
current_kernel = lambda x1, x2: kernel(x1, x2, gamma=gamma)
score, _ = cross_validate_svm(X, y, current_kernel, C)
if score > best_score:
best_score = score
best_params = {'C': C, 'gamma': gamma}
else:
score, _ = cross_validate_svm(X, y, kernel, C)
if score > best_score:
best_score = score
best_params = {'C': C}
return best_params, best_score
# Example usage
C_values = [0.1, 1, 10]
gamma_values = [0.1, 1, 10]
best_params, best_score = grid_search_svm(X, y, rbf_kernel, C_values, gamma_values)
print(f"Best parameters: {best_params}")
print(f"Best score: {best_score:.3f}")
Slide 9: Real-life Example: Iris Flower Classification
Let's apply our SVM implementation to a real-world problem: classifying iris flowers based on their sepal and petal measurements.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load and preprocess the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train SVM with RBF kernel
svm_iris = SVM(kernel=lambda x1, x2: rbf_kernel(x1, x2, gamma=0.1), C=1.0)
svm_iris.fit(X_train, y_train)
# Evaluate the model
y_pred = svm_iris.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy on iris dataset: {accuracy:.3f}")
# Visualize decision boundary (for first two features)
plt.figure(figsize=(10, 8))
plot_decision_boundary(svm_iris, X_test[:, :2], y_test)
plt.title('SVM Decision Boundary for Iris Classification')
plt.show()
Slide 10: Real-life Example: Handwritten Digit Recognition
Another practical application of SVMs is handwritten digit recognition. Let's use our SVM implementation to classify digits from the MNIST dataset.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load and preprocess the digits dataset
digits = load_digits()
X, y = digits.data, digits.target
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train SVM with RBF kernel
svm_digits = SVM(kernel=lambda x1, x2: rbf_kernel(x1, x2, gamma=0.005), C=1.0)
svm_digits.fit(X_train, y_train)
# Evaluate the model
y_pred = svm_digits.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy on digits dataset: {accuracy:.3f}")
# Visualize some predictions
fig, axes = plt.subplots(2, 5, figsize=(12, 6))
for i, ax in enumerate(axes.flat):
ax.imshow(X_test[i].reshape(8, 8), cmap='gray')
ax.set_title(f"Pred: {y_pred[i]}, True: {y_test[i]}")
ax.axis('off')
plt.tight_layout()
plt.show()
Slide 11: Implementing Multi-class Classification
SVMs are binary classifiers, but we can extend them to handle multi-class problems using techniques like One-vs-Rest (OvR) or One-vs-One (OvO). Let's implement the One-vs-Rest approach for multi-class classification.
class MultiClassSVM:
def __init__(self, kernel=linear_kernel, C=1.0):
self.kernel = kernel
self.C = C
self.binary_classifiers = {}
def fit(self, X, y):
self.classes = np.unique(y)
for class_label in self.classes:
binary_y = np.where(y == class_label, 1, -1)
svm = SVM(kernel=self.kernel, C=self.C)
svm.fit(X, binary_y)
self.binary_classifiers[class_label] = svm
def predict(self, X):
predictions = np.zeros((X.shape[0], len(self.classes)))
for i, class_label in enumerate(self.classes):
predictions[:, i] = self.binary_classifiers[class_label].predict(X)
return self.classes[np.argmax(predictions, axis=1)]
# Example usage
multi_svm = MultiClassSVM(kernel=lambda x1, x2: rbf_kernel(x1, x2, gamma=0.1), C=1.0)
multi_svm.fit(X_train, y_train)
y_pred = multi_svm.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"Multi-class SVM accuracy: {accuracy:.3f}")
Slide 12: Handling Large Datasets: Stochastic Gradient Descent
For large datasets, the standard SVM optimization can be computationally expensive. We can use Stochastic Gradient Descent (SGD) to train SVMs more efficiently on large-scale problems.
class SGDSVM:
def __init__(self, kernel=linear_kernel, C=1.0, learning_rate=0.01, epochs=100):
self.kernel = kernel
self.C = C
self.learning_rate = learning_rate
self.epochs = epochs
self.w = None
self.b = 0
def fit(self, X, y):
n_samples, n_features = X.shape
self.w = np.zeros(n_features)
for _ in range(self.epochs):
for i in range(n_samples):
if y[i] * (np.dot(X[i], self.w) + self.b) < 1:
self.w = self.w - self.learning_rate * (self.w - self.C * y[i] * X[i])
self.b = self.b + self.learning_rate * self.C * y[i]
else:
self.w = self.w - self.learning_rate * self.w
def predict(self, X):
return np.sign(np.dot(X, self.w) + self.b)
# Example usage
sgd_svm = SGDSVM(C=1.0, learning_rate=0.01, epochs=100)
sgd_svm.fit(X_train, y_train)
y_pred = sgd_svm.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"SGD SVM accuracy: {accuracy:.3f}")
Slide 13: Implementing Feature Selection
Feature selection can improve SVM performance by identifying the most relevant features. Let's implement a simple feature selection method using recursive feature elimination (RFE).
def recursive_feature_elimination(X, y, n_features_to_select):
n_features = X.shape[1]
feature_ranks = np.ones(n_features)
while np.sum(feature_ranks > 0) > n_features_to_select:
svm = SVM()
svm.fit(X[:, feature_ranks > 0], y)
feature_importance = np.abs(svm.alphas * svm.support_vector_labels).dot(
svm.support_vectors
)
least_important = np.argmin(feature_importance)
feature_ranks[feature_ranks > 0][least_important] = 0
return feature_ranks > 0
# Example usage
n_features_to_select = 2
selected_features = recursive_feature_elimination(X, y, n_features_to_select)
X_selected = X[:, selected_features]
svm_rfe = SVM()
svm_rfe.fit(X_selected, y)
y_pred = svm_rfe.predict(X_selected)
accuracy = np.mean(y_pred == y)
print(f"SVM with RFE accuracy: {accuracy:.3f}")
Slide 14: Visualizing SVM Margins
To better understand how SVMs work, let's visualize the margins and support vectors for a simple 2D classification problem.
def plot_svm_margins(X, y, svm):
plt.figure(figsize=(10, 8))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr', alpha=0.8)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 100),
np.linspace(ylim[0], ylim[1], 100))
Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
plt.scatter(svm.support_vectors[:, 0], svm.support_vectors[:, 1], s=100, linewidth=1, facecolors='none', edgecolors='k')
plt.title('SVM Decision Boundary and Margins')
plt.show()
# Generate linearly separable data
np.random.seed(0)
X = np.random.randn(100, 2)
y = 2 * (X[:, 0] + X[:, 1] > 0) - 1
svm = SVM()
svm.fit(X, y)
plot_svm_margins(X, y, svm)
Slide 15: Additional Resources
For further exploration of Support Vector Machines and their implementation, consider the following resources:
- "Support Vector Machines for Classification and Regression" by Christopher J.C. Burges (1998) ArXiv: https://arxiv.org/abs/2303.02751
- "A Tutorial on Support Vector Machines for Pattern Recognition" by Christopher J.C. Burges (1998) ArXiv: https://arxiv.org/abs/2302.12193
- "Large-scale Machine Learning with Stochastic Gradient Descent" by Léon Bottou (2010) ArXiv: https://arxiv.org/abs/1102.1411
These papers provide in-depth discussions on SVM theory, optimization techniques, and practical implementations. They offer valuable insights for those looking to deepen their understanding of Support Vector Machines and their applications in machine learning.