From 789aa263daf46cacfc59490c7fe80ca0b1da54c5 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 8 Jul 2021 17:23:53 +0100
Subject: [PATCH 01/10] Add files via upload

my first push
---
 oyelabi_paul_oluwadara/dsnOAU.md | 660 +++++++++++++++++++++++++++++++
 oyelabi_paul_oluwadara/knn.py    |  99 +++++
 2 files changed, 759 insertions(+)
 create mode 100644 oyelabi_paul_oluwadara/dsnOAU.md
 create mode 100644 oyelabi_paul_oluwadara/knn.py

diff --git a/oyelabi_paul_oluwadara/dsnOAU.md b/oyelabi_paul_oluwadara/dsnOAU.md
new file mode 100644
index 0000000..c7a4095
--- /dev/null
+++ b/oyelabi_paul_oluwadara/dsnOAU.md
@@ -0,0 +1,660 @@
+# A Light Implementation of K Nearest Neighbors.
+
+A light implementation of the K nearest neighbors algorithm.
+This implementation only supports the euclidean and manhattan
+distance metrics. It also provides the probability of each
+prediction.
+
+It provides the same interface as the sklearn kNN algorithm.
+Below are runs of the sklearn implementation and of mine.
+
+    parameters:
+        k: the number of neighbors.
+        method: euclidean/manhattan.
+        mode: classification/regression.
+
+`clf_1 --> sklearn KNN classifier algorithm`
+
+`clf_2 --> my algorithm`
+
+`regr_1 --> sklearn KNN regressor algorithm`
+
+`regr_2 --> my algorithm`
+
+The implementation is found in the knn.py file.
+
+
+
+```python
+from sklearn.datasets import load_iris, load_breast_cancer, load_diabetes
+from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, mean_squared_error
+from timeit import timeit
+from knn import KNN
+```
+
+# CLASSIFICATION - iris flower classification
+
+
+```python
+#load classification data
+data, target = load_iris(True)
+data.shape, target.shape
+```
+
+
+
+
+    ((150, 4), (150,))
+
+
+
+```python
+x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=0, shuffle=True)
+```
+
+### sklearn knn classifier
+
+
+```python
+clf_1 = KNeighborsClassifier(n_neighbors=3)
+```
+
+
+```python
+clf_1.fit(x_train, y_train)
+```
+
+
+
+
+    KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
+                         metric_params=None, n_jobs=None, n_neighbors=3, p=2,
+                         weights='uniform')
+
+
+
+```python
+y1_pred = clf_1.predict(x_test)
+```
+
+
+```python
+print("sklearn KNN classifier accuracy score: {:.2f}%".format(accuracy_score(y_test, y1_pred)*100))
+```
+
+    sklearn KNN classifier accuracy score: 97.37%
+
+
+```python
+clf_1.predict_proba(x_test)
+```
+
+
+
+
+    array([[0.        , 0.        , 1.        ],
+           [0.        , 1.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.        , 1.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.        , 1.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 0.        , 1.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 0.66666667, 0.33333333],
+           [1.        , 0.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.        , 1.        ],
+           [0.        , 1.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.        , 1.        ],
+           [1.        , 0.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [0.        , 1.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.        , 1.        ],
+           [0.        , 1.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.33333333, 0.66666667],
+           [0.        , 0.        , 1.        ],
+           [0.        , 1.        , 0.        ],
+           [1.        , 0.        , 0.        ],
+           [0.        , 0.        , 1.
]]) + + + +### My KNN classifier + + +```python +clf_2 = KNN() #defaults to 3 nearest neighbors +``` + + +```python +clf_2.fit(x_train, y_train) +``` + + +```python +y2_pred = clf_2.predict(x_test) +``` + + +```python +print("My KNN classifier accuracy score: {:.2f}%".format(accuracy_score(y_test, y2_pred)*100)) +``` + + My KNN classifier accuracy score: 97.37% + + + +```python +clf_2.predict_proba(x_test) +``` + + + + + array([[0. , 0. , 1. ], + [0. , 1. , 0. ], + [1. , 0. , 0. ], + [0. , 0. , 1. ], + [1. , 0. , 0. ], + [0. , 0. , 1. ], + [1. , 0. , 0. ], + [0. , 1. , 0. ], + [0. , 1. , 0. ], + [0. , 1. , 0. ], + [0. , 0. , 1. ], + [0. , 1. , 0. ], + [0. , 1. , 0. ], + [0. , 1. , 0. ], + [0. , 0.66666667, 0.33333333], + [1. , 0. , 0. ], + [0. , 1. , 0. ], + [0. , 1. , 0. ], + [1. , 0. , 0. ], + [1. , 0. , 0. ], + [0. , 0. , 1. ], + [0. , 1. , 0. ], + [1. , 0. , 0. ], + [1. , 0. , 0. ], + [0. , 0. , 1. ], + [1. , 0. , 0. ], + [1. , 0. , 0. ], + [0. , 1. , 0. ], + [0. , 1. , 0. ], + [1. , 0. , 0. ], + [0. , 0. , 1. ], + [0. , 1. , 0. ], + [1. , 0. , 0. ], + [0. , 0.33333333, 0.66666667], + [0. , 0. , 1. ], + [0. , 1. , 0. ], + [1. , 0. , 0. ], + [0. , 0. , 1. ]]) + + + +# CLASSIFICATION - breast cancer + + +```python +breast_data, breast_target = load_breast_cancer(True) +breast_data.shape, breast_target.shape +``` + + + + + ((569, 30), (569,)) + + + + +```python +x2_train, x2_test, y2_train, y2_test = train_test_split( + breast_data, breast_target, test_size=0.25, random_state=0, shuffle=True + ) +``` + +### sklearn knn classifier + + +```python +clf_1 = KNeighborsClassifier(n_neighbors=3) +``` + + +```python +clf_1.fit(x2_train, y2_train); +``` + + +```python +y1_pred = clf_1.predict(x2_test) +``` + + +```python +print("sklearn KNN classifier accuracy score: {:.2f}%".format(accuracy_score(y2_test, y1_pred)*100)) +``` + + sklearn KNN classifier accuracy score: 92.31% + + + +```python +clf_1.predict_proba(x2_test) +``` + + + + + array([[0.66666667, 0.33333333], + [0. , 1. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [0.33333333, 0.66666667], + [0. , 1. ], + [0.66666667, 0.33333333], + [0.66666667, 0.33333333], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0.66666667, 0.33333333], + [0.66666667, 0.33333333], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. 
, 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0.66666667, 0.33333333], + [1. , 0. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ]]) + + + +### My KNN classifier + + +```python +clf_2 = KNN() #defaults to 3 nearest neighbors +``` + + +```python +clf_2.fit(x2_train, y2_train) +``` + + +```python +y2_pred = clf_2.predict(x2_test) +``` + + +```python +print("My KNN classifier accuracy score: {:.2f}%".format(accuracy_score(y2_test, y2_pred)*100)) +``` + + My KNN classifier accuracy score: 92.31% + + + +```python +clf_2.predict_proba(x2_test) +``` + + + + + array([[0.66666667, 0.33333333], + [0. , 1. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [0.33333333, 0.66666667], + [0. , 1. ], + [0.66666667, 0.33333333], + [0.66666667, 0.33333333], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0.66666667, 0.33333333], + [0.66666667, 0.33333333], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0.66666667, 0.33333333], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [1. , 0. ], + [0.66666667, 0.33333333], + [1. , 0. ], + [0.66666667, 0.33333333], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0.33333333, 0.66666667], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. ], + [1. , 0. ], + [0. , 1. ], + [0. , 1. ], + [0. , 1. ], + [1. , 0. 
]])
+
+
+
+# REGRESSION - diabetes dataset
+
+
+```python
+rgr_data, rgr_target = load_diabetes(True)
+rgr_data.shape, rgr_target.shape
+```
+
+
+
+
+    ((442, 10), (442,))
+
+
+
+```python
+rx_train, rx_test, ry_train, ry_test = train_test_split(rgr_data, rgr_target, test_size=0.25, random_state=0, shuffle=True)
+```
+
+### sklearn KNN regressor
+
+
+```python
+regr_1 = KNeighborsRegressor(n_neighbors=3)
+regr_1.fit(rx_train, ry_train);
+```
+
+
+```python
+ry_pred = regr_1.predict(rx_test)
+```
+
+
+```python
+print("sklearn KNN regressor percentage RMSE: {:.2f}".format(mean_squared_error(ry_test, ry_pred)))
+```
+
+    sklearn KNN regressor percentage RMSE: 4232.01
+
+
+### My KNN regressor
+
+
+```python
+regr_2 = KNN(k=3, mode="regression")
+regr_2.fit(rx_train, ry_train);
+```
+
+
+```python
+ry2_pred = regr_2.predict(rx_test)
+```
+
+
+```python
+print("My KNN regressor percentage RMSE: {:.2f}".format(mean_squared_error(ry_test, ry2_pred)))
+```
+
+    My KNN regressor percentage RMSE: 4232.01
+
+
+
+```python
+
+```
diff --git a/oyelabi_paul_oluwadara/knn.py b/oyelabi_paul_oluwadara/knn.py
new file mode 100644
index 0000000..b868281
--- /dev/null
+++ b/oyelabi_paul_oluwadara/knn.py
@@ -0,0 +1,99 @@
+import numpy as np
+import statistics as st
+
+class KNN():
+    """
+    A light implementation of the K nearest neighbors algorithm.
+    This implementation only supports the euclidean and manhattan
+    distance metrics. It also provides the probability of each
+    prediction.
+
+    parameters:
+        k: the number of neighbors.
+        method: euclidean/manhattan.
+        mode: classification/regression.
+    """
+
+    def __init__(self, k=3, mode="classification", method="euclidean"):
+        self.k = k
+        self.mode = mode
+        self.x = None
+        self.y = None
+        self.class_ = None
+        self.method = method
+
+        assert self.mode in ("classification", "regression"), "Unsupported mode."
+        assert self.method in ("euclidean", "manhattan"), "Unsupported method."
+
+    def fit(self, x, y):
+        # KNN is a lazy learner: fitting only stores the training data.
+        if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
+            self.x = x
+            self.y = y
+        else:
+            self.x = np.array(x)
+            self.y = np.array(y)
+        self.class_ = np.unique(self.y)
+
+    def predict(self, x):
+        assert isinstance(self.x, np.ndarray) and isinstance(self.y, np.ndarray),\
+            "You need to train before predicting."
+        if x.ndim < 2:
+            raise ValueError("Input is not in the right shape; expected a 2-D array.")
+        return np.apply_along_axis(self.util, 1, x)
+
+    def predict_proba(self, x):
+        assert self.mode=="classification", "Method available only for classification."
+        assert isinstance(self.x, np.ndarray) and isinstance(self.y, np.ndarray),\
+            "You need to train before predicting."
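+        # apply_along_axis feeds each row of x (one test sample at a time) to
+        # prob_util and stacks the returned per-class probabilities into a
+        # 2-D array with one row per sample.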
+        return np.apply_along_axis(self.prob_util, 1, x)
+
+    def euclidean(self, x):
+        # Indices of the k training points closest to x under the L2 norm.
+        return np.argpartition(
+            np.linalg.norm(self.x-x, axis=1), self.k)[:self.k]
+        # equivalent distance computation: np.sqrt(np.sum(np.square(self.x-x), 1))
+
+    def manhattan(self, x):
+        # Indices of the k training points closest to x under the L1 norm.
+        return np.argpartition(
+            np.linalg.norm(self.x-x, ord=1, axis=1), self.k)[:self.k]
+        # equivalent distance computation: np.sum(np.absolute(self.x-x), 1)
+
+    def prob_util(self, x):
+        idx = self.euclidean(x) if self.method=="euclidean" else self.manhattan(x)
+        cls_ = self.y[idx]
+        out = []
+        unique, count = np.unique(cls_, return_counts=True)
+        for i in self.class_:
+            try:
+                # fraction of the k neighbors belonging to class i
+                out.append(count[np.where(unique==i)][0]/len(cls_))
+            except IndexError:
+                # class i does not occur among the k neighbors
+                out.append(0)
+        return out
+
+    def util(self, x):
+        idx = self.euclidean(x) if self.method=="euclidean" else self.manhattan(x)
+        cls_ = self.y[idx]
+        if self.mode=="classification":
+            try:
+                return st.mode(cls_)
+            except st.StatisticsError:
+                raise Exception(
+                    "No unique mode; found 2 equally common values. "
+                    "You should consider the value of k with respect "
+                    "to the number of classes in your target variable."
+                ) from None
+
+        elif self.mode=="regression":
+            return st.mean(cls_)
+
+if __name__ == "__main__":
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score
+    data, target = load_iris(True)
+    x_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=0, shuffle=True)
+    clf_2 = KNN(method="manhattan")
+    clf_2.fit(x_train, y_train)
+    print(clf_2.class_)
+    y = clf_2.predict(X_test)
+    print("My KNN classifier accuracy score: {:.2f}%".format(accuracy_score(y_test, y)*100))
+    print(clf_2.predict_proba(X_test))

From 0a47284d4c687abf8d1d8e86edfb46918e2f12f2 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Fri, 9 Jul 2021 09:05:55 +0100
Subject: [PATCH 02/10] Update dsnOAU.md

---
 oyelabi_paul_oluwadara/dsnOAU.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/oyelabi_paul_oluwadara/dsnOAU.md b/oyelabi_paul_oluwadara/dsnOAU.md
index c7a4095..13668ca 100644
--- a/oyelabi_paul_oluwadara/dsnOAU.md
+++ b/oyelabi_paul_oluwadara/dsnOAU.md
@@ -12,6 +12,30 @@ Below are runs of the sklearn implementation and of mine.
         k: the number of neighbors.
         method: euclidean/manhattan.
         mode: classification/regression.
+
+
+    fit method:
+    The KNN algorithm doesn't really learn; this method only
+    captures the training dataset with which test samples will
+    be compared.
+
+    predict method:
+    The prediction is made by comparing the distance of datapoints
+    in the training set to the test sample. This method calls
+    the util method, which calculates the distance metric and then
+    assigns the class with the majority vote (the highest number of
+    occurrences) to the test sample.
+
+    predict_proba method:
+    This method works like the predict method: it calls the prob_util
+    method, which performs almost the same steps as the util method
+    but returns the probability of the test sample belonging to each
+    of the classes the predict method can output.
+
+    The argpartition method used in the distance metrics works by partitioning
+    the array into two parts, with the k smallest numbers on the left and the
+    others on the right, and then returns the indices of those k smallest
+    numbers, which are then used to index the target variable to get the
+    corresponding labels.
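+
+A minimal standalone sketch of this selection step, with made-up
+distances (the names here are illustrative, not part of knn.py):
+
+```python
+import numpy as np
+
+distances = np.array([4.2, 0.5, 3.1, 0.9, 2.7])  # hypothetical distances to 5 training points
+k = 3
+
+# argpartition places the k smallest values in the first k slots without
+# fully sorting the array, then we keep the corresponding indices.
+idx = np.argpartition(distances, k)[:k]
+
+print(idx)             # e.g. [1 3 4] -- indices of the 3 nearest points (order not guaranteed)
+print(distances[idx])  # e.g. [0.5 0.9 2.7]
+```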
 
 `clf_1 --> sklearn KNN classifier algorithm`

From ac7c8243d3cef51f3bb86d537c529f8f98d9639c Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Sat, 10 Jul 2021 21:36:49 +0100
Subject: [PATCH 03/10] Update dsnOAU.md

---
 oyelabi_paul_oluwadara/dsnOAU.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/oyelabi_paul_oluwadara/dsnOAU.md b/oyelabi_paul_oluwadara/dsnOAU.md
index 13668ca..0ee9378 100644
--- a/oyelabi_paul_oluwadara/dsnOAU.md
+++ b/oyelabi_paul_oluwadara/dsnOAU.md
@@ -3,7 +3,8 @@
 A light implementation of the K nearest neighbors algorithm.
 This implementation only supports the euclidean and manhattan
 distance metrics. It also provides the probability of each
-prediction.
+prediction. I leveraged NumPy and Python's built-in
+statistics package.
 
 It provides the same interface as the sklearn kNN algorithm.
 Below are runs of the sklearn implementation and of mine.

From 789ea4bf75e74197995e2360f0542a5e6cd282d3 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 15 Jul 2021 17:31:26 +0100
Subject: [PATCH 04/10] Create readme.md

---
 oyelabi_paul_oluwadara/readme.md | 81 ++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 oyelabi_paul_oluwadara/readme.md

diff --git a/oyelabi_paul_oluwadara/readme.md b/oyelabi_paul_oluwadara/readme.md
new file mode 100644
index 0000000..7165b8e
--- /dev/null
+++ b/oyelabi_paul_oluwadara/readme.md
@@ -0,0 +1,81 @@
+# A Light Implementation of K Nearest Neighbors.
+
+A light implementation of the K nearest neighbors algorithm.
+This implementation only supports the euclidean and manhattan
+distance metrics. It also provides the probability of each
+prediction. I leveraged NumPy and Python's built-in
+statistics package.
+
+It provides the same interface as the sklearn kNN algorithm.
+Below are runs of the sklearn implementation and of mine.
+
+The KNN algorithm works by calculating how close datapoints are
+to one another, measuring the distance between two datapoints
+with a distance metric such as the euclidean distance, and then
+selecting the K closest datapoints. It then assigns the class with
+the majority vote among the K selected datapoints to the new sample
+the algorithm is trying to classify. In terms of regression, it returns the mean of
+the targets of the K datapoints closest to the new sample.
+
+    parameters:
+        k: the number of neighbors.
+        method: euclidean/manhattan.
+        mode: classification/regression.
+
+
+    fit method:
+    Since the KNN algorithm doesn't really learn, this method only
+    captures the predictors (X) and target (Y) with which test samples
+    will be compared.
+
+    predict method:
+    The prediction is made by comparing the distance of datapoints
+    in the training set to the test sample. This method calls
+    the util method, which calculates the distance metric and then
+    assigns the class with the majority vote (the highest number of
+    occurrences) to the test sample.
+
+    predict_proba method:
+    This method works like the predict method: it calls the prob_util
+    method, which performs almost the same steps as the util method
+    but returns the probability of the test sample belonging to each
+    of the classes the predict method can output.
+
+    euclidean method:
+    This method calculates the euclidean distance between datapoints using
+    the numpy linear algebra method. I used the numpy argpartition method to
+    partially sort the calculated distances and then return the indices of the K
+    closest (smallest value) distances.
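+
+A standalone sketch of this distance computation (the arrays here are
+made up; the `ord` argument is what switches `np.linalg.norm` from the
+euclidean distance to the manhattan distance used by the next method):
+
+```python
+import numpy as np
+
+X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 1.0]])  # hypothetical training points
+q = np.array([2.0, 2.0])                            # hypothetical query point
+
+euclidean = np.linalg.norm(X - q, axis=1)           # L2 norm of each row
+manhattan = np.linalg.norm(X - q, ord=1, axis=1)    # L1 norm of each row
+
+print(euclidean)  # [1.         2.23606798 3.16227766]
+print(manhattan)  # [1. 3. 4.]
+```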
+
+    manhattan method:
+    This method calculates the manhattan distance between datapoints using
+    the numpy linear algebra method. I used the numpy argpartition method to
+    partially sort the calculated distances and then return the indices of the K
+    closest (smallest value) distances.
+
+    util method:
+    This method calls either the euclidean or the manhattan method according to the
+    method parameter passed to the class constructor. It gets the indices of
+    the K closest datapoints which it then uses to index the target variable
+    to get the corresponding class label. It then calculates the majority vote
+    using the python built-in statistics mode function. If the mode parameter is
+    set to `regression`, this method returns the mean of the K nearest datapoints.
+
+    `I could have assigned a class randomly whenever there is a tie in the majority
+    vote, but instead I raise an exception and advise the data scientist to
+    consider selecting the K value with respect to the number of classes in the
+    target variable (i.e. an even value for K whenever the number of classes is
+    odd, and an odd value for K whenever the number of classes is even), as this
+    is a best practice to ensure good results.`
+
+    prob_util method:
+    This method is only available in classification mode. It works much like
+    the util method, but it returns the probability of a test sample belonging
+    to each class. It achieves this by dividing the number of occurrences of
+    each class among the K nearest datapoints by K.
+
+    The argpartition method used in the distance metrics works by partitioning
+    the array into two parts, with the k smallest numbers on the left and the
+    others on the right, and then returns the indices of those k smallest
+    numbers, which are then used to index the target variable to get the
+    corresponding labels.
+

From 10edb5980f0d211f3f5b17ab280f84b708dd1c03 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 15 Jul 2021 17:35:16 +0100
Subject: [PATCH 05/10] Update readme.md

---
 oyelabi_paul_oluwadara/readme.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/oyelabi_paul_oluwadara/readme.md b/oyelabi_paul_oluwadara/readme.md
index 7165b8e..7d0c620 100644
--- a/oyelabi_paul_oluwadara/readme.md
+++ b/oyelabi_paul_oluwadara/readme.md
@@ -43,23 +43,23 @@
     euclidean method:
     This method calculates the euclidean distance between datapoints using
-    the numpy linear algebra method. I used the numpy argpartition method to
-    partially sort the calculated distances and then return the indices of the K
-    closest (smallest value) distances.
+    the numpy linear algebra method. I used the numpy argpartition method
+    to partially sort the calculated distances and then return the indices
+    of the K closest (smallest value) distances.
 
     manhattan method:
     This method calculates the manhattan distance between datapoints using
-    the numpy linear algebra method. I used the numpy argpartition method to
-    partially sort the calculated distances and then return the indices of the K
-    closest (smallest value) distances.
+    the numpy linear algebra method. I used the numpy argpartition method
+    to partially sort the calculated distances and then return the indices
+    of the K closest (smallest value) distances.
 
     util method:
    This method calls either the euclidean or the manhattan method according to the
     method parameter passed to the class constructor. It gets the indices of
     the K closest datapoints which it then uses to index the target variable
-    to get the corresponding class label. It then calculates the majority vote
-    using the python built-in statistics mode function. If the mode parameter is
-    set to `regression`, this method returns the mean of the K nearest datapoints.
+    to get the corresponding class label. It then calculates the majority
+    vote using the python built-in statistics mode function. If the mode parameter
+    is set to `regression`, this method returns the mean of the K nearest datapoints.

From afa1fddf3e9834e3ce8066092e55784c4e07bd32 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 15 Jul 2021 17:49:47 +0100
Subject: [PATCH 06/10] Update readme.md

---
 oyelabi_paul_oluwadara/readme.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/oyelabi_paul_oluwadara/readme.md b/oyelabi_paul_oluwadara/readme.md
index 7d0c620..43b28ef 100644
--- a/oyelabi_paul_oluwadara/readme.md
+++ b/oyelabi_paul_oluwadara/readme.md
@@ -79,3 +79,4 @@ the targets of the K datapoints closest to the new sample.
     numbers, which are then used to index the target variable to get the
     corresponding labels.
 
+### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/edit/main/oyelabi_paul_oluwadara/readme.md)

From 791c611b0ed50f7adbc3e304d5b50256ad7f60b2 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 15 Jul 2021 17:50:37 +0100
Subject: [PATCH 07/10] Update readme.md

---
 oyelabi_paul_oluwadara/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oyelabi_paul_oluwadara/readme.md b/oyelabi_paul_oluwadara/readme.md
index 43b28ef..58954ff 100644
--- a/oyelabi_paul_oluwadara/readme.md
+++ b/oyelabi_paul_oluwadara/readme.md
@@ -79,4 +79,4 @@
-### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/edit/main/oyelabi_paul_oluwadara/readme.md)
+### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/edit/main/oyelabi_paul_oluwadara/dsnOAU.md)

From e538fc66d3537e738cc8c3fae431e891086afa24 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 15 Jul 2021 17:51:25 +0100
Subject: [PATCH 08/10] Update readme.md

---
 oyelabi_paul_oluwadara/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oyelabi_paul_oluwadara/readme.md b/oyelabi_paul_oluwadara/readme.md
index 58954ff..22d1fb0 100644
--- a/oyelabi_paul_oluwadara/readme.md
+++ b/oyelabi_paul_oluwadara/readme.md
@@ -79,4 +79,4 @@
     numbers, which are then used to index the target variable to get the
     corresponding labels.
 
-### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/edit/main/oyelabi_paul_oluwadara/dsnOAU.md)
+### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/main/oyelabi_paul_oluwadara/dsnOAU.md)

From 2da5401d63e7b1c375c47c51e5ea16a54090e42f Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Thu, 15 Jul 2021 17:52:32 +0100
Subject: [PATCH 09/10] Update readme.md

---
 oyelabi_paul_oluwadara/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oyelabi_paul_oluwadara/readme.md b/oyelabi_paul_oluwadara/readme.md
index 22d1fb0..a5278af 100644
--- a/oyelabi_paul_oluwadara/readme.md
+++ b/oyelabi_paul_oluwadara/readme.md
@@ -79,4 +79,4 @@
     numbers, which are then used to index the target variable to get the
     corresponding labels.
 
-### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/main/oyelabi_paul_oluwadara/dsnOAU.md)
+### Some tests are found in [dsnOAU.md](https://github.com/Yodeman/ML-Algorithm-Challenge/blob/main/oyelabi_paul_oluwadara/dsnOAU.md)

From 613b825edf01a595c117ff0715e310572d2276f7 Mon Sep 17 00:00:00 2001
From: Oyelabi Paul <59335237+Yodeman@users.noreply.github.com>
Date: Fri, 16 Jul 2021 09:06:26 +0100
Subject: [PATCH 10/10] Update dsnOAU.md

---
 oyelabi_paul_oluwadara/dsnOAU.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/oyelabi_paul_oluwadara/dsnOAU.md b/oyelabi_paul_oluwadara/dsnOAU.md
index 0ee9378..5198daf 100644
--- a/oyelabi_paul_oluwadara/dsnOAU.md
+++ b/oyelabi_paul_oluwadara/dsnOAU.md
@@ -652,10 +652,10 @@ ry_pred = regr_1.predict(rx_test)
 ```
 
 
 ```python
-print("sklearn KNN regressor percentage RMSE: {:.2f}".format(mean_squared_error(ry_test, ry_pred)))
+print("sklearn KNN regressor MSE: {:.2f}".format(mean_squared_error(ry_test, ry_pred)))
 ```
 
-    sklearn KNN regressor percentage RMSE: 4232.01
+    sklearn KNN regressor MSE: 4232.01
 
 
 ### My KNN regressor
@@ -673,10 +673,10 @@ ry2_pred = regr_2.predict(rx_test)
 ```
 
 
 ```python
-print("My KNN regressor percentage RMSE: {:.2f}".format(mean_squared_error(ry_test, ry2_pred)))
+print("My KNN regressor MSE: {:.2f}".format(mean_squared_error(ry_test, ry2_pred)))
 ```
 
-    My KNN regressor percentage RMSE: 4232.01
+    My KNN regressor MSE: 4232.01