Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions ismail_nurudeen/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
## KNN Algorithm from scratch.

**K Nearest Neighbor** is a simple supervised machine learning algorithm.

K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints.
This means that the new data point will be assigned a value based on how closely it matches the points in the training set.


In my implementation of the algorithm, there are three major steps:

Step 1 − Initialize the classifier with the number of nearest neighbors (K).

Step 2 − Fit - Stores the training data (features and labels).

Step 3 − Predict - Takes the test data and performs the following operations to return a prediction;

* Loops over the test data and calculates the distance between test data and each row of training data using the Euclidean Distance Method.
* Sorts the distances computed in the previous step in ascending order.
* Next, It gets the position of the first K rows (neighbors).
* Then, it will assign a class to the test point based on most frequent class of these rows.
* Returns the label of the most frequent class as the prediction.

## Usage

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from knn_classifier import KNN

iris = load_iris()
data = iris.data
target = iris.target

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=43)

clf = KNN(K = 4) # By Default K = 3
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, predictions))
```
99 changes: 99 additions & 0 deletions ismail_nurudeen/knn_classifier.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 36,
"id": "e1b89bac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import operator\n",
"\n",
"# calculate euclidean distance\n",
"def euc_dist(x1, x2):\n",
" return np.sqrt(np.sum((x1-x2)**2))\n",
"\n",
"class KNN():\n",
" \n",
" def __init__(self, K = 3):\n",
" if(K < 1):\n",
"            raise Exception('K must be greater than 0')\n",
" self.K = K\n",
"\n",
" def fit(self, x_train, y_train):\n",
" self.X_train = x_train\n",
" self.y_train = y_train\n",
"\n",
" def predict(self, X_test):\n",
" \n",
" # list to store all our predictions\n",
" predictions = []\n",
" \n",
" # loop over all observations\n",
" for i in range(len(X_test)): \n",
" \n",
" # calculate the distance between the test point and all other points in the training set\n",
" dist = np.array([euc_dist(X_test[i], x) for x in self.X_train])\n",
" \n",
" # sort the distances and return the positions of the first K neighbors\n",
" dist_sorted = dist.argsort()[:self.K]\n",
" \n",
" neighbor_votes = {}\n",
"\n",
" # for each neighbor find the class and return the most voted.\n",
" for d in dist_sorted:\n",
" if self.y_train[d] in neighbor_votes:\n",
" neighbor_votes[self.y_train[d]] += 1\n",
" else:\n",
" neighbor_votes[self.y_train[d]] = 1\n",
" \n",
" # get the most common class label \n",
" sorted_neighbors = sorted(neighbor_votes.items(), key=operator.itemgetter(1), reverse=True)\n",
" \n",
" predictions.append(sorted_neighbors[0][0])\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02b69260",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
46 changes: 46 additions & 0 deletions ismail_nurudeen/knn_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import operator
from collections import Counter

import numpy as np

# calculate euclidean distance
def euc_dist(x1, x2):
    """Return the Euclidean (L2) distance between points x1 and x2."""
    diff = x1 - x2
    return np.sqrt((diff ** 2).sum())

class KNN():
    """K-Nearest-Neighbors classifier.

    A lazy learner: ``fit`` only stores the training data; ``predict``
    labels each test point with the majority class among its K closest
    training points (Euclidean distance).
    """

    def __init__(self, K=3):
        """Initialize the classifier.

        Args:
            K: number of neighbors that vote for each prediction
               (must be at least 1). Defaults to 3.

        Raises:
            ValueError: if K < 1. (ValueError subclasses Exception, so
                callers catching the previous generic Exception still work;
                also fixes the "grater" typo in the message.)
        """
        if K < 1:
            raise ValueError('K must be greater than 0')
        self.K = K

    def fit(self, x_train, y_train):
        """Store the training features and labels; no actual training occurs."""
        self.X_train = x_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a class label for each row of X_test.

        Args:
            X_test: iterable of test points, same feature dimension as
                the stored training data.

        Returns:
            A list with one predicted label per test point.
        """
        # Convert once so the distance computation below can be vectorized
        # even if fit() was given plain Python lists.
        X_train = np.asarray(self.X_train)
        y_train = np.asarray(self.y_train)

        predictions = []
        for point in X_test:
            # Euclidean distance from this test point to every training
            # point in one vectorized expression (replaces the original
            # per-row Python loop; identical per-row arithmetic).
            dist = np.sqrt(((X_train - point) ** 2).sum(axis=1))

            # Indices of the K nearest training points, nearest first.
            nearest = dist.argsort()[:self.K]

            # Majority vote. Counter.most_common is stable, so ties are
            # broken in favor of the label seen first (i.e. the nearer
            # neighbor), matching the original stable-sort behavior.
            votes = Counter(y_train[idx] for idx in nearest)
            predictions.append(votes.most_common(1)[0][0])
        return predictions