diff --git a/ismail_nurudeen/README.md b/ismail_nurudeen/README.md new file mode 100644 index 0000000..b4f97a9 --- /dev/null +++ b/ismail_nurudeen/README.md @@ -0,0 +1,43 @@ +## KNN Algorithm from scratch. + +**K Nearest Neighbor** is a simple supervised machine learning algorithm. + +The K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new data points. +This means that the new data point will be assigned a value based on how closely it matches the points in the training set. + + +In my implementation of the algorithm, there are three major steps: + +Step 1 − Initialize the classifier with the number of nearest neighbors (K). + +Step 2 − Fit - Initialize the training data and labels. + +Step 3 − Predict - Takes the test data and performs the following operations to return a prediction: + +* Loops over the test data and calculates the distance between each test point and each row of training data using the Euclidean Distance Method. +* Sorts the distances obtained in the previous step in ascending order. +* Next, it gets the positions of the first K rows (neighbors). +* Then, it will assign a class to the test point based on the most frequent class of these rows. +* Returns the label of the most frequent class as the prediction. 
+ +## Usage + +```python +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +from knn_classifier import KNN + +iris = load_iris() +data = iris.data +target = iris.target + +X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=43) + +clf = KNN(K = 4) # By Default K = 3 +clf.fit(X_train, y_train) + +predictions = clf.predict(X_test) + +print('Accuracy:', accuracy_score(y_test, predictions)) +``` diff --git a/ismail_nurudeen/knn_classifier.ipynb b/ismail_nurudeen/knn_classifier.ipynb new file mode 100644 index 0000000..0efcb16 --- /dev/null +++ b/ismail_nurudeen/knn_classifier.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 36, + "id": "e1b89bac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "import operator\n", + "\n", + "# calculate euclidean distance\n", + "def euc_dist(x1, x2):\n", + " return np.sqrt(np.sum((x1-x2)**2))\n", + "\n", + "class KNN():\n", + " \n", + " def __init__(self, K = 3):\n", + " if(K < 1):\n", + " raise Exception('K must be grater than 0')\n", + " self.K = K\n", + "\n", + " def fit(self, x_train, y_train):\n", + " self.X_train = x_train\n", + " self.y_train = y_train\n", + "\n", + " def predict(self, X_test):\n", + " \n", + " # list to store all our predictions\n", + " predictions = []\n", + " \n", + " # loop over all observations\n", + " for i in range(len(X_test)): \n", + " \n", + " # calculate the distance between the test point and all other points in the training set\n", + " dist = np.array([euc_dist(X_test[i], x) for x in self.X_train])\n", + " \n", + " # sort the distances and return the positions of the first K neighbors\n", + " dist_sorted = dist.argsort()[:self.K]\n", + " \n", + " neighbor_votes = {}\n", + 
"\n", + " # for each neighbor find the class and return the most voted.\n", + " for d in dist_sorted:\n", + " if self.y_train[d] in neighbor_votes:\n", + " neighbor_votes[self.y_train[d]] += 1\n", + " else:\n", + " neighbor_votes[self.y_train[d]] = 1\n", + " \n", + " # get the most common class label \n", + " sorted_neighbors = sorted(neighbor_votes.items(), key=operator.itemgetter(1), reverse=True)\n", + " \n", + " predictions.append(sorted_neighbors[0][0])\n", + " return predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02b69260", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ismail_nurudeen/knn_classifier.py b/ismail_nurudeen/knn_classifier.py new file mode 100644 index 0000000..a25070b --- /dev/null +++ b/ismail_nurudeen/knn_classifier.py @@ -0,0 +1,46 @@ +import numpy as np +import operator + +# calculate euclidean distance +def euc_dist(x1, x2): + return np.sqrt(np.sum((x1-x2)**2)) + +class KNN(): + + def __init__(self, K = 3): + if(K < 1): + raise Exception('K must be grater than 0') + self.K = K + + def fit(self, x_train, y_train): + self.X_train = x_train + self.y_train = y_train + + def predict(self, X_test): + + # list to store all our predictions + predictions = [] + + # loop over all observations + for i in range(len(X_test)): + + # calculate the distance between the test point and all other points in the training set + dist = np.array([euc_dist(X_test[i], x) for x in self.X_train]) + + # sort the distances and return the positions of the first K neighbors + dist_sorted = 
dist.argsort()[:self.K] + + neighbor_votes = {} + + # for each neighbor find the class and return the most voted. + for d in dist_sorted: + if self.y_train[d] in neighbor_votes: + neighbor_votes[self.y_train[d]] += 1 + else: + neighbor_votes[self.y_train[d]] = 1 + + # get the most common class label + sorted_neighbors = sorted(neighbor_votes.items(), key=operator.itemgetter(1), reverse=True) + + predictions.append(sorted_neighbors[0][0]) + return predictions \ No newline at end of file