-
Notifications
You must be signed in to change notification settings - Fork 0
/
KNearestNieghbour.py
80 lines (57 loc) · 2.26 KB
/
KNearestNieghbour.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# KNN Classifier
# To work with dataframes
import pandas as pd
# To perform numerical operations
import numpy as np
# To visualize data
import seaborn as sns
# To partition the data
from sklearn.model_selection import train_test_split
# importing the library of KNN
from sklearn.neighbors import KNeighborsClassifier
# Importing performance metrics - accuracy score & confusion matrix
from sklearn.metrics import accuracy_score,confusion_matrix
# Importing data: the CSV marks missing values with the string " ?"
data = pd.read_csv('income.csv', na_values=[" ?"])
# Data pre-processing
data.isnull().sum()  # per-column missing-value counts (inspection only)
# Rows with at least one missing column value (axis=1 => check across columns)
missing = data[data.isnull().any(axis=1)]
# Drop every row that has a missing value.  .copy() makes data2 an
# independent DataFrame, so the .map() assignment below is a plain write
# instead of chained assignment on a view (which raises
# SettingWithCopyWarning and can silently fail) — bug fix.
data2 = data.dropna(axis=0).copy()
# Encode the target: 0 = "<= 50,000", 1 = "> 50,000".
# NOTE: the leading space in the label strings matches the raw data.
data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000': 0, ' greater than 50,000': 1})
print(data2['SalStat'])
# One-hot encode the categorical columns, dropping the first level of each
new_data = pd.get_dummies(data2, drop_first=True)
# Storing the column names
columns_list = list(new_data.columns)
print(columns_list)
# Separating the input feature names from the target.  A list comprehension
# preserves the original column order; the original set() difference made the
# feature order depend on hash randomization, so the column order of x (and
# any order-sensitive downstream result) could change between runs — bug fix
# for reproducibility.
features = [col for col in columns_list if col != 'SalStat']
print(features)
# Storing the output values in y
y = new_data['SalStat'].values
print(y)
# Storing the values from the input features
x = new_data[features].values
print(x)
# Splitting the data into train and test (70/30, fixed seed so the split is
# reproducible)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)
# K nearest neighbours classifier with K = 5
KNN_classifier = KNeighborsClassifier(n_neighbors=5)
# Fitting the model on the training split
KNN_classifier.fit(train_x, train_y)
# Predicting the test labels with the fitted model
prediction = KNN_classifier.predict(test_x)
# Performance metric check (variable renamed from the original misspelled
# "confusionMmatrix")
conf_matrix = confusion_matrix(test_y, prediction)
print(conf_matrix)
# Calculating the accuracy.  Use a distinct variable name: the original
# assigned the result to `accuracy_score`, shadowing the imported function so
# it could never be called again in this script — bug fix.
accuracy = accuracy_score(test_y, prediction)
print(accuracy)
print('Misclassified samples: %d' % (test_y != prediction).sum())
Misclassified_sample = []
# Counting misclassifications for each K from 1 to 19 inclusive
# (range(1, 20) excludes 20 — the original comment claimed "between 1 and
# 20", which did not match the code).
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_x, train_y)
    pred_i = knn.predict(test_x)
    # Number of test samples the K=i model got wrong
    Misclassified_sample.append((test_y != pred_i).sum())
print(Misclassified_sample)