From d615d11243e928840530656692b2b01dd1cc288c Mon Sep 17 00:00:00 2001 From: Foton Date: Tue, 4 Jun 2024 13:44:59 +0300 Subject: [PATCH 1/7] Release Data Science Libraries Lesson4 --- Lesson4/Task2.ipynb | 6815 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 6815 insertions(+) create mode 100644 Lesson4/Task2.ipynb diff --git a/Lesson4/Task2.ipynb b/Lesson4/Task2.ipynb new file mode 100644 index 0000000..ac6e6bf --- /dev/null +++ b/Lesson4/Task2.ipynb @@ -0,0 +1,6815 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b27e41e4", + "metadata": {}, + "source": [ + "## Тема “Визуализация данных в Matplotlib”" + ] + }, + { + "cell_type": "markdown", + "id": "f324024b", + "metadata": {}, + "source": [ + "### Задание 1\n", + "Загрузите модуль pyplot библиотеки matplotlib с псевдонимом plt, а также библиотеку numpy с\n", + "псевдонимом np.\n", + "Примените магическую функцию %matplotlib inline для отображения графиков в Jupyter Notebook и\n", + "настройки конфигурации ноутбука со значением 'svg' для более четкого отображения графиков.\n", + "Создайте список под названием x с числами 1, 2, 3, 4, 5, 6, 7 и список y с числами 3.5, 3.8, 4.2, 4.5, 5,\n", + "5.5, 7.\n", + "С помощью функции plot постройте график, соединяющий линиями точки с горизонтальными\n", + "координатами из списка x и вертикальными - из списка y.\n", + "Затем в следующей ячейке постройте диаграмму рассеяния (другие названия - диаграмма разброса,\n", + "scatter plot)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "99e11a55", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from random import seed\n", + "from matplotlib import pyplot as plt\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "\n", + "%matplotlib inline\n", + "%config InlineBackend.figure_format = 'svg'" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "bd0d7835", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-04T11:17:30.715908\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "x = [1, 2, 3, 4, 5, 6, 7]\n", + "y = [3.5, 3.8, 4.2, 4.5, 5, 5.5, 7]\n", + "\n", + "plt.plot(x, y)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "91fc998c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-03T11:42:36.642755\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(x, y)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fbf3b995", + "metadata": {}, + "source": [ + "### Задание 2\n", + "С помощью функции linspace из библиотеки Numpy создайте массив t из 51 числа от 0 до 10\n", + "включительно.\n", + "© geekbrains.ru\n", + "Создайте массив Numpy под названием f, содержащий косинусы элементов массива t.\n", + "Постройте линейную диаграмму, используя массив t для координат по горизонтали,а массив f - для\n", + "координат по вертикали. Линия графика должна быть зеленого цвета.\n", + "Выведите название диаграммы - 'График f(t)'. Также добавьте названия для горизонтальной оси -\n", + "'Значения t' и для вертикальной - 'Значения f'.\n", + "Ограничьте график по оси x значениями 0.5 и 9.5, а по оси y - значениями -2.5 и 2.5." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f19e1fe4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-03T11:42:36.735077\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "t = np.linspace(0, 10, 51)\n", + "\n", + "f = np.cos(t)\n", + "\n", + "plt.plot(t, f, color=\"green\")\n", + "\n", + "plt.title(\"График f(t)\")\n", + "plt.xlabel(\"Значения t\")\n", + "plt.ylabel(\"Значения f\")\n", + "\n", + "plt.axis([0.5, 9.5, -2.5, 2.5])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "73611d04", + "metadata": {}, + "source": [ + "### Задание 3\n", + "С помощью функции linspace библиотеки Numpy создайте массив x из 51 числа от -3 до 3\n", + "включительно.\n", + "Создайте массивы y1, y2, y3, y4 по следующим формулам:\n", + "y1 = x**2\n", + "y2 = 2 * x + 0.5\n", + "y3 = -3 * x - 1.5\n", + "y4 = sin(x)\n", + "Используя функцию subplots модуля matplotlib.pyplot, создайте объект matplotlib.figure.Figure с\n", + "названием fig и массив объектов Axes под названием ax,причем так, чтобы у вас было 4 отдельных\n", + "графика в сетке, состоящей из двух строк и двух столбцов. 
В каждом графике массив x используется\n", + "для координат по горизонтали.В левом верхнем графике для координат по вертикали используйте\n", + "y1,в правом верхнем - y2, в левом нижнем - y3, в правом нижнем - y4.Дайте название графикам:\n", + "'График y1', 'График y2' и т.д.\n", + "Для графика в левом верхнем углу установите границы по оси x от -5 до 5.\n", + "Установите размеры фигуры 8 дюймов по горизонтали и 6 дюймов по вертикали.\n", + "Вертикальные и горизонтальные зазоры между графиками должны составлять 0.3" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ea585c38", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'График y4')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-03T11:42:37.042700\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "x = np.linspace(-3, 3, 51)\n", + "\n", + "y1 = x**2\n", + "y2 = 2*x + 0.5\n", + "y3 = -3*x - 1.5\n", + "y4 = np.sin(x)\n", + "\n", + "fig, ax = plt.subplots(nrows=2, ncols=2)\n", + "\n", + "fig.set_size_inches(8, 6)\n", + "fig.subplots_adjust(wspace=0.3, hspace=0.3)\n", + "\n", + "ax1, ax2, ax3, ax4 = ax.flatten()\n", + "\n", + "ax1.plot(x, y1)\n", + "ax1.set_title(\"График y1\")\n", + "ax1.set_xlim([-5, 5])\n", + "ax2.plot(x, y2)\n", + "ax2.set_title(\"График y2\")\n", + "ax3.plot(x, y3)\n", + "ax3.set_title(\"График y3\")\n", + "ax4.plot(x, y4)\n", + "ax4.set_title(\"График y4\")" + ] + }, + { + "cell_type": "markdown", + "id": "c77b9177", + "metadata": {}, + "source": [ + "### Задание 4\n", + "В этом задании мы будем работать с датасетом, в котором приведены данные по мошенничеству с\n", + "кредитными данными: Credit Card Fraud Detection (информация об авторах: Andrea Dal Pozzolo, Olivier\n", + "Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced\n", + "Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015).\n", + "Ознакомьтесь с описанием и скачайте датасет creditcard.csv с сайта Kaggle.com по ссылке:\n", + "Credit Card Fraud Detection\n", + "Данный датасет является примером несбалансированных данных, так как мошеннические операции с\n", + "картами встречаются реже обычных.\n", + "Импортируйте библиотеку Pandas, а также используйте для графиков стиль “fivethirtyeight”.\n", + "© geekbrains.ru 1\n", + "Посчитайте с помощью метода value_counts количество наблюдений для каждого значения целевой\n", + "переменной Class и примените к полученным данным метод plot, чтобы построить столбчатую\n", + "диаграмму. 
Затем постройте такую же диаграмму, используя логарифмический масштаб.\n", + "На следующем графике постройте две гистограммы по значениям признака V1 - одну для\n", + "мошеннических транзакций (Class равен 1) и другую - для обычных (Class равен 0). Подберите\n", + "значение аргумента density так, чтобы по вертикали графика было расположено не число\n", + "наблюдений, а плотность распределения. Число бинов должно равняться 20 для обеих гистограмм, а\n", + "коэффициент alpha сделайте равным 0.5, чтобы гистограммы были полупрозрачными и не\n", + "загораживали друг друга. Создайте легенду с двумя значениями: “Class 0” и “Class 1”. Гистограмма\n", + "обычных транзакций должна быть серого цвета, а мошеннических - красного. Горизонтальной оси\n", + "дайте название “V1”." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "65f2dc4a-f788-4f45-b8d7-7bd6de270815", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud\n", + "License(s): DbCL-1.0\n", + "Downloading creditcardfraud.zip to W:\\Projects\\GB\\Python\\MLearning\\DSLibraries\\Lesson4\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " 0%| | 0.00/66.0M [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
\n", + "

5 rows × 31 columns

\n", + "" + ], + "text/plain": [ + " Time V1 V2 V3 V4 V5 V6 V7 \\\n", + "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", + "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", + "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", + "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", + "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", + "\n", + " V8 V9 ... V21 V22 V23 V24 V25 \\\n", + "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n", + "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n", + "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n", + "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n", + "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n", + "\n", + " V26 V27 V28 Amount Class \n", + "0 -0.189115 0.133558 -0.021053 149.62 0 \n", + "1 0.125895 -0.008983 0.014724 2.69 0 \n", + "2 -0.139097 -0.055353 -0.059752 378.66 0 \n", + "3 -0.221929 0.062723 0.061458 123.50 0 \n", + "4 0.502292 0.219422 0.215153 69.99 0 \n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d4fae789", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
count284807.0000002.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+05...2.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+05284807.000000284807.000000
mean94813.8595751.168375e-153.416908e-16-1.379537e-152.074095e-159.604066e-161.487313e-15-5.556467e-161.213481e-16-2.406331e-15...1.654067e-16-3.568593e-162.578648e-164.473266e-155.340915e-161.683437e-15-3.660091e-16-1.227390e-1688.3496190.001727
std47488.1459551.958696e+001.651309e+001.516255e+001.415869e+001.380247e+001.332271e+001.237094e+001.194353e+001.098632e+00...7.345240e-017.257016e-016.244603e-016.056471e-015.212781e-014.822270e-014.036325e-013.300833e-01250.1201090.041527
min0.000000-5.640751e+01-7.271573e+01-4.832559e+01-5.683171e+00-1.137433e+02-2.616051e+01-4.355724e+01-7.321672e+01-1.343407e+01...-3.483038e+01-1.093314e+01-4.480774e+01-2.836627e+00-1.029540e+01-2.604551e+00-2.256568e+01-1.543008e+010.0000000.000000
25%54201.500000-9.203734e-01-5.985499e-01-8.903648e-01-8.486401e-01-6.915971e-01-7.682956e-01-5.540759e-01-2.086297e-01-6.430976e-01...-2.283949e-01-5.423504e-01-1.618463e-01-3.545861e-01-3.171451e-01-3.269839e-01-7.083953e-02-5.295979e-025.6000000.000000
50%84692.0000001.810880e-026.548556e-021.798463e-01-1.984653e-02-5.433583e-02-2.741871e-014.010308e-022.235804e-02-5.142873e-02...-2.945017e-026.781943e-03-1.119293e-024.097606e-021.659350e-02-5.213911e-021.342146e-031.124383e-0222.0000000.000000
75%139320.5000001.315642e+008.037239e-011.027196e+007.433413e-016.119264e-013.985649e-015.704361e-013.273459e-015.971390e-01...1.863772e-015.285536e-011.476421e-014.395266e-013.507156e-012.409522e-019.104512e-027.827995e-0277.1650000.000000
max172792.0000002.454930e+002.205773e+019.382558e+001.687534e+013.480167e+017.330163e+011.205895e+022.000721e+011.559499e+01...2.720284e+011.050309e+012.252841e+014.584549e+007.519589e+003.517346e+003.161220e+013.384781e+0125691.1600001.000000
\n", + "

8 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " Time V1 V2 V3 V4 \\\n", + "count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", + "mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 \n", + "std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 \n", + "min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 \n", + "25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 \n", + "50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 \n", + "75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 \n", + "max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 \n", + "\n", + " V5 V6 V7 V8 V9 \\\n", + "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", + "mean 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 \n", + "std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 \n", + "min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 \n", + "25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 \n", + "50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 \n", + "75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 \n", + "max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 \n", + "\n", + " ... V21 V22 V23 V24 \\\n", + "count ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", + "mean ... 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 \n", + "std ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 \n", + "min ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 \n", + "25% ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 \n", + "50% ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 \n", + "75% ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 \n", + "max ... 
2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 \n", + "\n", + " V25 V26 V27 V28 Amount \\\n", + "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 \n", + "mean 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 88.349619 \n", + "std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 \n", + "min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 \n", + "25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 \n", + "50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 \n", + "75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 \n", + "max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 \n", + "\n", + " Class \n", + "count 284807.000000 \n", + "mean 0.001727 \n", + "std 0.041527 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 0.000000 \n", + "75% 0.000000 \n", + "max 1.000000 \n", + "\n", + "[8 rows x 31 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ab04cf33-e36b-4991-917b-e91a2dfa7fcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 284807 entries, 0 to 284806\n", + "Data columns (total 31 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Time 284807 non-null float64\n", + " 1 V1 284807 non-null float64\n", + " 2 V2 284807 non-null float64\n", + " 3 V3 284807 non-null float64\n", + " 4 V4 284807 non-null float64\n", + " 5 V5 284807 non-null float64\n", + " 6 V6 284807 non-null float64\n", + " 7 V7 284807 non-null float64\n", + " 8 V8 284807 non-null float64\n", + " 9 V9 284807 non-null float64\n", + " 10 V10 284807 non-null float64\n", + " 11 V11 284807 non-null float64\n", + " 12 V12 284807 non-null float64\n", + " 13 V13 284807 non-null float64\n", + " 14 V14 284807 
non-null float64\n", + " 15 V15 284807 non-null float64\n", + " 16 V16 284807 non-null float64\n", + " 17 V17 284807 non-null float64\n", + " 18 V18 284807 non-null float64\n", + " 19 V19 284807 non-null float64\n", + " 20 V20 284807 non-null float64\n", + " 21 V21 284807 non-null float64\n", + " 22 V22 284807 non-null float64\n", + " 23 V23 284807 non-null float64\n", + " 24 V24 284807 non-null float64\n", + " 25 V25 284807 non-null float64\n", + " 26 V26 284807 non-null float64\n", + " 27 V27 284807 non-null float64\n", + " 28 V28 284807 non-null float64\n", + " 29 Amount 284807 non-null float64\n", + " 30 Class 284807 non-null int64 \n", + "dtypes: float64(30), int64(1)\n", + "memory usage: 67.4 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6c31fc17", + "metadata": {}, + "outputs": [], + "source": [ + "class_counts = df.Class.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "29b823c1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-03T12:53:10.650385\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "class_counts.plot(kind=\"bar\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "918cf628", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-03T12:53:11.836204\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "class_counts.plot(kind=\"bar\", logy=True)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b6af24e9", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " 2024-06-03T12:53:14.097839\n", + " image/svg+xml\n", + " \n", + " \n", + " Matplotlib v3.9.0, https://matplotlib.org/\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "\n", + "V1_0 = df[df[\"Class\"]==0].V1\n", + "\n", + "ax.set_xlabel(\"V1\")\n", + "ax.axis([-30, 5, 0, 0.2])\n", + "\n", + "df[df[\"Class\"]==0].V1.hist(ax=ax, label = \"Class 0\", bins=20, alpha=0.5, color='gray', density=True)\n", + "df[df[\"Class\"]==1].V1.hist(ax=ax, label = \"Class 1\", bins=20, alpha=0.5, color='red', density=True)\n", + "\n", + "legend = fig.legend(loc=\"upper right\", frameon=False)" + ] + }, + { + "cell_type": "markdown", + "id": "af2c4346", + "metadata": {}, + "source": [ + "## Задание на повторение материала\n", + "### 1. Создать одномерный массив Numpy под названием a из 12 последовательных целых чисел от 12 до 24 невключительно" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "557a8256-9c01-4f25-a014-0d228c69024c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[12 13 14 15 16 17 18 19 20 21 22 23]\n" + ] + } + ], + "source": [ + "a = np.arange(12, 24, dtype=int)\n", + " \n", + "print(a)" + ] + }, + { + "cell_type": "markdown", + "id": "1c77f2b4-a4a1-4758-ab20-9c95b50a7228", + "metadata": {}, + "source": [ + "### 2. Создать 5 двумерных массивов разной формы из массива a. Не использовать в аргументах метода reshape число -1."
+ ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "c532a55c-a2e4-41b8-88fc-57202937c2df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12],\n", + " [13],\n", + " [14],\n", + " [15],\n", + " [16],\n", + " [17],\n", + " [18],\n", + " [19],\n", + " [20],\n", + " [21],\n", + " [22],\n", + " [23]])" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (12, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "fdd1014d-c6f5-48bc-b047-81038d8fd827", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14, 15, 16, 17],\n", + " [18, 19, 20, 21, 22, 23]])" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (2, 6))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "0ad59875-d3b2-4e5e-acfe-574669e8a068", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13],\n", + " [14, 15],\n", + " [16, 17],\n", + " [18, 19],\n", + " [20, 21],\n", + " [22, 23]])" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (6, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "fc9a8aaa-d246-4876-a818-3bbb268a4a06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14, 15],\n", + " [16, 17, 18, 19],\n", + " [20, 21, 22, 23]])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (3, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "f58c5c92-4e45-4b11-8ee9-03930d40213e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14],\n", + " [15, 16, 17],\n", + " [18, 19, 20],\n", + " [21, 22, 23]])" + ] + }, + "execution_count": 47, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (4, 3))" + ] + }, + { + "cell_type": "markdown", + "id": "dda06888-4535-4b11-a8ac-8366cb665662", + "metadata": {}, + "source": [ + "### 3. Создать 5 двумерных массивов разной формы из массива a. Использовать в аргументах метода reshape число -1 (в трех примерах - для обозначения числа столбцов, в двух - для строк)." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "5d2eceed-d468-43e2-a896-03db80347de7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14, 15, 16, 17],\n", + " [18, 19, 20, 21, 22, 23]])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (2, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "5d4e33c6-9991-48f1-92dc-fe039f1f1403", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14, 15],\n", + " [16, 17, 18, 19],\n", + " [20, 21, 22, 23]])" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (3, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "7c5ee813-ab4a-44eb-af9b-0c6f086ff275", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14],\n", + " [15, 16, 17],\n", + " [18, 19, 20],\n", + " [21, 22, 23]])" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (4, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "19305afe-f66e-439d-86fb-4f1a8e9289fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13],\n", + " [14, 15],\n", + " [16, 17],\n", + " [18, 19],\n", + " [20, 21],\n", + " [22, 23]])" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (-1, 
2))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "021fef61-9517-4ed0-9a49-f0dd68416dff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 13, 14, 15, 16, 17],\n", + " [18, 19, 20, 21, 22, 23]])" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.reshape(a, (-1, 6))" + ] + }, + { + "cell_type": "markdown", + "id": "aae22acf-9657-4492-b4d8-d6707dd79d6e", + "metadata": {}, + "source": [ + "### 4. Можно ли массив Numpy, состоящий из одного столбца и 12 строк, назвать одномерным?" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "a039beed-b1fb-428a-995b-f4788b8ba5c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape=(12, 1)=>размерность=2, следоватьельно одномерным может быть только вектор\n" + ] + } + ], + "source": [ + "b = np.resize(a, (12, 1))\n", + "\n", + "print(f\"shape={b.shape}=>размерность={b.ndim}, следоватьельно одномерным может быть только вектор\")" + ] + }, + { + "cell_type": "markdown", + "id": "65303edd-4937-4b7e-85f0-82412f9e320b", + "metadata": {}, + "source": [ + "### 5. Создать массив из 3 строк и 4 столбцов, состоящий из случайных чисел с плавающей запятой из нормального распределения со средним, равным 0 и среднеквадратичным отклонением, равным 1.0. Получить из этого массива одномерный массив с таким же атрибутом size, как и исходный массив." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "6f9c7bba-d9b2-4e26-8630-8322e6d1f329", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 5.30087681e-01 -1.00730270e+00 3.49023257e-01 1.17610515e-01]\n", + " [-3.73873348e-05 -4.65003983e-01 1.17452549e+00 3.80377532e-02]\n", + " [-4.71755245e-02 3.99082256e-01 -1.15223721e-01 -2.65055731e+00]]\n", + "[[ 5.30087681e-01 -1.00730270e+00 3.49023257e-01 1.17610515e-01\n", + " -3.73873348e-05 -4.65003983e-01 1.17452549e+00 3.80377532e-02\n", + " -4.71755245e-02 3.99082256e-01 -1.15223721e-01 -2.65055731e+00]]\n", + "a.size = 12 <=> 12 = b.size\n" + ] + } + ], + "source": [ + "a = np.random.randn(3, 4)\n", + "b = a.reshape(1, 12)\n", + "print(a)\n", + "print(b)\n", + "print(f\"a.size = {a.size} <=> {b.size} = b.size\")" + ] + }, + { + "cell_type": "markdown", + "id": "6b177aff-4007-4af0-a927-82cb03183362", + "metadata": {}, + "source": [ + "### 6. Создать массив a, состоящий из целых чисел, убывающих от 20 до 0 невключительно с интервалом 2." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "a78dbbed-cbf3-4536-8e0a-f19586aa9218", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[20 18 16 14 12 10 8 6 4 2]\n" + ] + } + ], + "source": [ + "a = np.arange(20, 0, -2, dtype=int)\n", + "\n", + "print(a)" + ] + }, + { + "cell_type": "markdown", + "id": "3edb8cc8-ca5a-414f-9928-0c3be3fae6a1", + "metadata": {}, + "source": [ + "### 7. Создать массив b, состоящий из 1 строки и 10 столбцов: целых чисел, убывающих от 20 до 1 невключительно с интервалом 2. В чем разница между массивами a и b?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "de8d49ea-a843-46ff-ab8f-f2b967cebeb2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[20 18 16 14 12 10 8 6 4 2] = [20 18 16 14 12 10 8 6 4 2] => разницы нет\n" + ] + } + ], + "source": [ + "b = np.arange(20, 1, -2, dtype=int)\n", + "\n", + "print(f\"{a} = {b} => разницы нет\")" + ] + }, + { + "cell_type": "markdown", + "id": "48add411-602f-46b7-9cb5-98388a6d742c", + "metadata": {}, + "source": [ + "### 8. Вертикально соединить массивы a и b. a - двумерный массив из нулей, число строк которого больше 1 и на 1 меньше, чем число строк двумерного массива b, состоящего из единиц. Итоговый массив v должен иметь атрибут size, равный 10." + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "e64df321-366c-4400-a406-b775e19a6870", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0. 0.]\n", + " [0. 0.]\n", + " [1. 1.]\n", + " [1. 1.]\n", + " [1. 1.]] \n", + " size=10\n" + ] + } + ], + "source": [ + "a = np.zeros((2, 2))\n", + "b = np.ones((3,2))\n", + "c = np.vstack((a, b))\n", + "print(f\"{c} \\n size={c.size}\")" + ] + }, + { + "cell_type": "markdown", + "id": "85da760f-5076-4e9a-acb4-4e756ac72407", + "metadata": {}, + "source": [ + "### 9. Создать одномерный массив а, состоящий из последовательности целых чисел от 0 до 12. Поменять форму этого массива, чтобы получилась матрица A (двумерный массив Numpy), состоящая из 4 строк и 3 столбцов. Получить матрицу At путем транспонирования матрицы A. Получить матрицу B, умножив матрицу A на матрицу At с помощью матричного умножения. Какой размер имеет матрица B? Получится ли вычислить обратную матрицу для матрицы B и почему?"
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bc97b82a-eaec-4f6c-b9c8-4159dc45c44f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "матрица B =\n", + "[[ 5 14 23 32]\n", + " [ 14 50 86 122]\n", + " [ 23 86 149 212]\n", + " [ 32 122 212 302]]\n", + "размер = (4, 4)\n", + "определитель = 0 => обратная матрица не существует\n" + ] + } + ], + "source": [ + "a = np.arange(0, 12, dtype=int)\n", + "A = np.reshape(a, (4, -1))\n", + "At = A.T\n", + "B = np.dot(A, At)\n", + "print(f\"матрица B =\\n{B}\\nразмер = {B.shape}\\nопределитель = {np.linalg.det(B):1.0f} => обратная матрица не существует\")" + ] + }, + { + "cell_type": "markdown", + "id": "957e860b-9d20-46cf-91b0-2662c2faeebb", + "metadata": {}, + "source": [ + "### 10. Инициализируйте генератор случайных чисел с помощью объекта seed, равного 42." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "97c0fa11-6530-461b-bb89-65cb6d38b83f", + "metadata": {}, + "outputs": [], + "source": [ + "seed(42)" + ] + }, + { + "cell_type": "markdown", + "id": "7d7b04ea-a205-4a64-bf93-f68c62b69b5f", + "metadata": {}, + "source": [ + "### 11. Создайте одномерный массив c, составленный из последовательности 16-ти случайных равномерно распределенных целых чисел от 0 до 16 невключительно." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "b307b5d5-e3aa-44b8-94f3-2a9b8877fd5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16]\n" + ] + } + ], + "source": [ + "c = np.linspace(start=0, stop=16, num=16, dtype=int)\n", + "print(c)" + ] + }, + { + "cell_type": "markdown", + "id": "3107f89b-c513-4eeb-b21f-d091920b51b3", + "metadata": {}, + "source": [ + "### 12. Поменяйте его форму так, чтобы получилась квадратная матрица C. Получите матрицу D, поэлементно прибавив матрицу B из предыдущего вопроса к матрице C, умноженной на 10.
Вычислите определитель, ранг и обратную матрицу D_inv для D." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "721efedc-bcdd-4aa0-b7bf-58469b31fbf2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C =\n", + "[[ 0 1 2 3]\n", + " [ 4 5 6 7]\n", + " [ 8 9 10 11]\n", + " [12 13 14 16]]\n", + "D =\n", + "[[ 5 24 43 62]\n", + " [ 54 100 146 192]\n", + " [103 176 249 322]\n", + " [152 252 352 462]]\n", + "определитель = 2.2623680706601445e-10\n", + "ранг = 3\n", + "D_inv =\n", + "[[-3.51843721e+13 7.03687442e+13 -3.51843721e+13 -3.12500000e-03]\n", + " [ 7.03687442e+13 -1.40737488e+14 7.03687442e+13 1.06250000e-01]\n", + " [-3.51843721e+13 7.03687442e+13 -3.51843721e+13 -2.03125000e-01]\n", + " [ 2.00000000e-01 -3.00000000e-01 -0.00000000e+00 1.00000000e-01]]\n" + ] + } + ], + "source": [ + "C = np.reshape(c, (4, -1))\n", + "print(f\"C =\\n{C}\")\n", + "D = B + C * 10\n", + "D_inv = np.linalg.inv(D)\n", + "print(f\"D =\\n{D}\\nопределитель = {np.linalg.det(D)}\\nранг = {np.linalg.matrix_rank(D)}\\nD_inv =\\n{D_inv}\")" + ] + }, + { + "cell_type": "markdown", + "id": "05de35ef-439f-4273-8902-9fbf5d1dd878", + "metadata": {}, + "source": [ + "### 13. Приравняйте к нулю отрицательные числа в матрице D_inv, а положительные - к единице. Убедитесь, что в матрице D_inv остались только нули и единицы. С помощью функции numpy.where, используя матрицу D_inv в качестве маски, а матрицы B и C - в качестве источников данных, получите матрицу E размером 4x4. Элементы матрицы E, для которых соответствующий элемент матрицы D_inv равен 1, должны быть равны соответствующему элементу матрицы B, а элементы матрицы E, для которых соответствующий элемент матрицы D_inv равен 0, должны быть равны соответствующему элементу матрицы C." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "ae14bda8-3197-4ce6-8ed7-9227f40ee440", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "D_inv =\n", + "[[ 0. 1. 0. 0.]\n", + " [ 1. 0. 1. 1.]\n", + " [ 0. 1. 0. 0.]\n", + " [ 1. 0. -0. 1.]]\n", + "E =\n", + "[[ 0. 14. 2. 3.]\n", + " [ 14. 5. 86. 122.]\n", + " [ 8. 86. 10. 11.]\n", + " [ 32. 13. 14. 302.]]\n" + ] + } + ], + "source": [ + "D_inv[np.where(D_inv < 0)] = 0\n", + "D_inv[np.where(D_inv > 0)] = 1\n", + "print(f\"D_inv =\\n{D_inv}\")\n", + "\n", + "E = np.zeros((4, 4))\n", + "E[np.where(D_inv == 0)] = C[np.where(D_inv == 0)]\n", + "E[np.where(D_inv == 1)] = B[np.where(D_inv == 1)]\n", + "print(f\"E =\\n{E}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8ecf6a0-078c-4d26-9aa0-8d84514df3e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 6fc5beff16a37111bdea3e6ad063953403ed4c89 Mon Sep 17 00:00:00 2001 From: Foton Date: Mon, 10 Jun 2024 18:33:48 +0300 Subject: [PATCH 2/7] Release Home Work Machine Learning DScience Lesson6 --- Lesson6/Task2.ipynb | 1979 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1979 insertions(+) create mode 100644 Lesson6/Task2.ipynb diff --git a/Lesson6/Task2.ipynb b/Lesson6/Task2.ipynb new file mode 100644 index 0000000..2ebd9ce --- /dev/null +++ b/Lesson6/Task2.ipynb @@ -0,0 +1,1979 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b2f2e5e2", + "metadata": {}, + "source": [ + "## Тема “Обучение с учителем”" + ] + }, + { + 
"cell_type": "markdown", + "id": "16595a84", + "metadata": {}, + "source": [ + "### Задание 1\n", + "Импортируйте библиотеки pandas и numpy.\n", + "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn..\n", + "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью\n", + "функции train_test_split так, чтобы размер тестовой выборки\n", + "составлял 30% от всех данных, при этом аргумент random state должен быть равен 42.\n", + "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля\n", + "sklearn.linear_model.\n", + "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на\n", + "тестовых." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "274303e6", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "693a9c36", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "pd.options.display.max_columns = 100\n", + "\n", + "from sklearn.datasets import load_boston" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a37f783d", + "metadata": {}, + "outputs": [], + "source": [ + "boston = load_boston()\n", + "\n", + "feature_names = boston[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(boston[\"data\"], columns=feature_names)\n", + "y = pd.DataFrame(boston[\"target\"], columns=[\"price\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5a2e0780", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((354, 13), (152, 13))" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size=0.3, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "96164976", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "\n", + "lr = LinearRegression()\n", + "\n", + "lr.fit(X_train, y_train)\n", + "\n", + "lr_pred = lr.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "id": "a07cffa5", + "metadata": {}, + "source": [ + "### Задание 2\n", + "Создайте модель под названием model с помощью класса RandomForestRegressor из модуля\n", + "sklearn.ensemble.\n", + "Сделайте аргумент n_estimators равным 1000,\n", + "max_depth должен быть равен 12 и random_state сделайте равным 42.\n", + "Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n", + "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n", + "чтобы получить из датафрейма одномерный массив Numpy,\n", + "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно\n", + "применение массивов вместо датафрейма.\n", + "Сделайте предсказание на тестовых данных и посчитайте R2.
Сравните с результатом из\n", + "предыдущего задания.\n", + "Напишите в комментариях к коду, какая модель в данном случае работает лучше" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "64042e74", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "оценка R2 модели ансамбля случайного леса выше чем у линейной регрессии\n", + "RandomForestRegressor=0.8747\n", + "LinearRegression=0.7112\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import r2_score\n", + "\n", + "model = RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)\n", + "\n", + "model.fit(X_train, y_train.values[:,0])\n", + "\n", + "rf_pred = model.predict(X_test)\n", + "\n", + "r2_lr = r2_score(y_test, lr_pred)\n", + "r2_rf = r2_score(y_test, rf_pred)\n", + "\n", + "print(f\"оценка R2 модели ансамбля случайного леса {'выше' if r2_rf > r2_lr else 'ниже' } \" \n", + " f\"чем у линейной регрессии\\nRandomForestRegressor={r2_rf:.4f}\\nLinearRegression={r2_lr:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "94b0548e", + "metadata": {}, + "source": [ + "### *Задание 3\n", + "Вызовите документацию для класса RandomForestRegressor,\n", + "найдите информацию об атрибуте feature_importances_.\n", + "С помощью этого атрибута найдите сумму всех показателей важности,\n", + "установите, какие два признака показывают наибольшую важность." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1327004d", + "metadata": {}, + "outputs": [], + "source": [ + "?RandomForestRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c8e1fab0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "сумма показателей важности признаков модели = 1.0\n" + ] + } + ], + "source": [ + "print(f\"сумма показателей важности признаков модели = {np.sum(model.feature_importances_)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "744deb30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "первые два наиболее важные признака\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featureimportance
12LSTAT0.415847
5RM0.402682
\n", + "
" + ], + "text/plain": [ + " feature importance\n", + "12 LSTAT 0.415847\n", + "5 RM 0.402682" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"первые два наиболее важные признака\")\n", + "pd.DataFrame({'feature': feature_names,'importance': model.feature_importances_}) \\\n", + ".sort_values('importance',ascending = False) \\\n", + ".head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "b9004884", + "metadata": {}, + "source": [ + "## *Задание 4\n", + "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию\n", + "по библиотеке Matplotlib, это датасет Credit Card Fraud Detection. Для этого датасета мы будем решать\n", + "задачу классификации - будем определять, какие из транзакций по кредитной карте являются\n", + "мошенническими. Данный датасет сильно несбалансирован (так как случаи мошенничества\n", + "относительно редки), так что применение метрики accuracy не принесет пользы и не поможет выбрать\n", + "лучшую модель. Мы будем вычислять AUC, то есть площадь под кривой ROC.\n", + "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n", + "Загрузите датасет creditcard.csv и создайте датафрейм df." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a65b98a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud\n", + "License(s): DbCL-1.0\n", + "Downloading creditcardfraud.zip to W:\\Projects\\GB\\Python\\MLearning\\DSLibraries\\Lesson6\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " 0%| | 0.00/66.0M [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671-0.3714071.3412620.359894-0.358091-0.1371340.5176170.401726-0.0581330.068653-0.0331940.084968-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960-0.099254-1.416907-0.153826-0.7510630.1673720.050144-0.4435870.002821-0.611987-0.045575-0.219633-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.6153751.249376-0.6194680.2914741.757964-1.3238650.686133-0.076127-1.222127-0.3582220.324505-0.1567421.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048-0.410430-0.705117-0.110452-0.2862540.074355-0.328783-0.210077-0.4997680.1187650.5703280.052736-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727-0.3668461.0176140.8363901.006844-0.4435230.1502190.739453-0.5409800.4766770.4517730.203711-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", + "" + ], + "text/plain": [ + " Time V1 V2 V3 V4 V5 V6 V7 \\\n", + "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", + "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", + "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", + "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", + "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", + "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", + "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", + "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", + "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", + "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", + "\n", + " V8 V9 V10 V11 V12 V13 V14 \\\n", + "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n", + "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n", + "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n", + "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n", + "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n", + "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n", + "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n", + "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n", + "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n", + "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n", + "\n", + " V15 V16 V17 V18 V19 V20 V21 \\\n", + "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n", + "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n", + "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n", + "3 -0.631418 -1.059647 -0.684093 1.965775 
-1.232622 -0.208038 -0.108300 \n", + "4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n", + "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n", + "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n", + "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n", + "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n", + "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n", + "\n", + " V22 V23 V24 V25 V26 V27 V28 \\\n", + "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n", + "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n", + "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n", + "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n", + "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n", + "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n", + "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n", + "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n", + "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n", + "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n", + "\n", + " Amount Class \n", + "0 149.62 0 \n", + "1 2.69 0 \n", + "2 378.66 0 \n", + "3 123.50 0 \n", + "4 69.99 0 \n", + "5 3.67 0 \n", + "6 4.99 0 \n", + "7 40.80 0 \n", + "8 93.20 0 \n", + "9 3.68 0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from zipfile import ZipFile\n", + "\n", + "ZipFile(\"creditcardfraud.zip\").extractall(\".\")\n", + "\n", + "df = pd.read_csv(\"creditcard.csv\")\n", + "\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "9302f8fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + 
"RangeIndex: 284807 entries, 0 to 284806\n", + "Data columns (total 31 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Time 284807 non-null float64\n", + " 1 V1 284807 non-null float64\n", + " 2 V2 284807 non-null float64\n", + " 3 V3 284807 non-null float64\n", + " 4 V4 284807 non-null float64\n", + " 5 V5 284807 non-null float64\n", + " 6 V6 284807 non-null float64\n", + " 7 V7 284807 non-null float64\n", + " 8 V8 284807 non-null float64\n", + " 9 V9 284807 non-null float64\n", + " 10 V10 284807 non-null float64\n", + " 11 V11 284807 non-null float64\n", + " 12 V12 284807 non-null float64\n", + " 13 V13 284807 non-null float64\n", + " 14 V14 284807 non-null float64\n", + " 15 V15 284807 non-null float64\n", + " 16 V16 284807 non-null float64\n", + " 17 V17 284807 non-null float64\n", + " 18 V18 284807 non-null float64\n", + " 19 V19 284807 non-null float64\n", + " 20 V20 284807 non-null float64\n", + " 21 V21 284807 non-null float64\n", + " 22 V22 284807 non-null float64\n", + " 23 V23 284807 non-null float64\n", + " 24 V24 284807 non-null float64\n", + " 25 V25 284807 non-null float64\n", + " 26 V26 284807 non-null float64\n", + " 27 V27 284807 non-null float64\n", + " 28 V28 284807 non-null float64\n", + " 29 Amount 284807 non-null float64\n", + " 30 Class 284807 non-null int64 \n", + "dtypes: float64(30), int64(1)\n", + "memory usage: 67.4 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "4ba00248", + "metadata": {}, + "source": [ + "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n", + "Создайте объект Series под названием y из столбца Class.\n", + "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split,\n", + "используя аргументы: test_size=0.3, random_state=100, stratify=y.\n", + "У вас должны получиться объекты X_train, X_test, y_train и y_test.\n", + "Просмотрите информацию о их форме." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "59e1e34e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "выборка не сбалансированна, данных первого класса значительно меньше\n", + "0 0.998273\n", + "1 0.001727\n", + "Name: Class, dtype: float64\n" + ] + }, + { + "data": { + "text/plain": [ + "((199364, 30), (85443, 30), (199364,), (85443,))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target = \"Class\"\n", + "\n", + "y = df[target]\n", + "X = df.drop(target, axis=1)\n", + "\n", + "print(f\"выборка не сбалансированна, данных первого класса значительно меньше\\n{y.value_counts(normalize=True)}\")\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "id": "fe7fae18", + "metadata": {}, + "source": [ + "Для поиска по сетке параметров задайте такие параметры:\n", + "parameters = [{'n_estimators': [10, 15],\n", + "'max_features': np.arange(3, 5),\n", + "'max_depth': np.arange(4, 7)}]\n", + "Создайте модель GridSearchCV со следующими аргументами:\n", + "estimator=RandomForestClassifier(random_state=100),\n", + "param_grid=parameters,\n", + "scoring='roc_auc',\n", + "cv=3." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "c034c2a4", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "parameters = {\n", + " 'n_estimators': [10, 15],\n", + " 'max_features': np.arange(3, 5),\n", + " 'max_depth': np.arange(4, 7),\n", + "}\n", + "\n", + "clf = GridSearchCV(\n", + " estimator=RandomForestClassifier(random_state=100),\n", + " param_grid=parameters,\n", + " scoring='roc_auc',\n", + " cv=3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a4308ac3", + "metadata": {}, + "source": [ + "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n", + "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n", + "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n", + "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и\n", + "запишите в массив y_pred_proba. Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n", + "Вычислите AUC на тестовых данных и сравните с результатом, полученным на тренировочных данных,\n", + "используя в качестве аргументов массивы y_test и y_pred_proba" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "9b43d0b7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n", + " param_grid={'max_depth': array([4, 5, 6]),\n", + " 'max_features': array([3, 4]),\n", + " 'n_estimators': [10, 15]},\n", + " scoring='roc_auc')" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "966c3d4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}" + ] + }, + "execution_count": 105, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "405d63d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "метрика AUC=0.9436 на тестовых данных меньше метрики AUC=0.9660 на обучающем наборе\n" + ] + } + ], + "source": [ + "from sklearn.metrics import roc_auc_score\n", + "\n", + "model = RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)\n", + "\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred_proba = model.predict_proba(X_test)[:, 1]\n", + "\n", + "auc = roc_auc_score(y_test, y_pred_proba)\n", + "\n", + "print(f\"метрика AUC={auc:.4f} на тестовых данных {'больше' if auc > clf.best_score_ else 'меньше'} \"\n", + " f\"метрики AUC={clf.best_score_:.4f} на обучающем наборе\")" + ] + }, + { + "cell_type": "markdown", + "id": "5cb517be", + "metadata": {}, + "source": [ + "## *Дополнительные задания:\n", + "1). Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в\n", + "переменную data." + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "1dbcea26", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "wine = load_wine()\n", + "\n", + "data = wine[\"data\"]" + ] + }, + { + "cell_type": "markdown", + "id": "18dea80c", + "metadata": {}, + "source": [ + "2). Полученный датасет не является датафреймом. Это структура данных, имеющая ключи\n", + "аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys,\n", + "содержащий ее ключи." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5f5e52a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_keys = wine.keys()\n", + "data_keys" + ] + }, + { + "cell_type": "markdown", + "id": "73bff5cf", + "metadata": {}, + "source": [ + "3). Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде\n", + "привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими\n", + "переносами и т.д." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "39433029", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. _wine_dataset:\n", + "\n", + "Wine recognition dataset\n", + "------------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 178 (50 in each of three classes)\n", + " :Number of Attributes: 13 numeric, predictive attributes and the class\n", + " :Attribute Information:\n", + " \t\t- Alcohol\n", + " \t\t- Malic acid\n", + " \t\t- Ash\n", + "\t\t- Alcalinity of ash \n", + " \t\t- Magnesium\n", + "\t\t- Total phenols\n", + " \t\t- Flavanoids\n", + " \t\t- Nonflavanoid phenols\n", + " \t\t- Proanthocyanins\n", + "\t\t- Color intensity\n", + " \t\t- Hue\n", + " \t\t- OD280/OD315 of diluted wines\n", + " \t\t- Proline\n", + "\n", + " - class:\n", + " - class_0\n", + " - class_1\n", + " - class_2\n", + "\t\t\n", + " :Summary Statistics:\n", + " \n", + " ============================= ==== ===== ======= =====\n", + " Min Max Mean SD\n", + " ============================= ==== ===== ======= =====\n", + " Alcohol: 11.0 14.8 13.0 0.8\n", + " Malic Acid: 0.74 5.80 2.34 1.12\n", + " Ash: 1.36 3.23 2.36 0.27\n", + " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", + " Magnesium: 
70.0 162.0 99.7 14.3\n", + " Total Phenols: 0.98 3.88 2.29 0.63\n", + " Flavanoids: 0.34 5.08 2.03 1.00\n", + " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", + " Proanthocyanins: 0.41 3.58 1.59 0.57\n", + " Colour Intensity: 1.3 13.0 5.1 2.3\n", + " Hue: 0.48 1.71 0.96 0.23\n", + " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", + " Proline: 278 1680 746 315\n", + " ============================= ==== ===== ======= =====\n", + "\n", + " :Missing Attribute Values: None\n", + " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", + " :Creator: R.A. Fisher\n", + " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", + " :Date: July, 1988\n", + "\n", + "This is a copy of UCI ML Wine recognition datasets.\n", + "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", + "\n", + "The data is the results of a chemical analysis of wines grown in the same\n", + "region in Italy by three different cultivators. There are thirteen different\n", + "measurements taken for different constituents found in the three types of\n", + "wine.\n", + "\n", + "Original Owners: \n", + "\n", + "Forina, M. et al, PARVUS - \n", + "An Extendible Package for Data Exploration, Classification and Correlation. \n", + "Institute of Pharmaceutical and Food Analysis and Technologies,\n", + "Via Brigata Salerno, 16147 Genoa, Italy.\n", + "\n", + "Citation:\n", + "\n", + "Lichman, M. (2013). UCI Machine Learning Repository\n", + "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", + "School of Information and Computer Science. \n", + "\n", + ".. topic:: References\n", + "\n", + " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", + " Comparison of Classifiers in High Dimensional Settings, \n", + " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", + " Mathematics and Statistics, James Cook University of North Queensland. \n", + " (Also submitted to Technometrics). 
\n", + "\n", + " The data was used with many others for comparing various \n", + " classifiers. The classes are separable, though only RDA \n", + " has achieved 100% correct classification. \n", + " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", + " (All results using the leave-one-out technique) \n", + "\n", + " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", + " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", + " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", + " Mathematics and Statistics, James Cook University of North Queensland. \n", + " (Also submitted to Journal of Chemometrics).\n", + "\n" + ] + } + ], + "source": [ + "print(wine[\"DESCR\"])" + ] + }, + { + "cell_type": "markdown", + "id": "772359a0", + "metadata": {}, + "source": [ + "4). Сколько классов содержит целевая переменная датасета? Выведите названия классов." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "183b0c76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['class_0' 'class_1' 'class_2']\n", + "кол-во:3\n" + ] + } + ], + "source": [ + "target_class = wine[\"target_names\"]\n", + "\n", + "print(f\"{target_class}\\nкол-во:{len(target_class)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "41d6690b", + "metadata": {}, + "source": [ + "5). На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков\n", + "создайте датафрейм под названием X." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "15557fbe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + "\n", + " od280/od315_of_diluted_wines proline \n", + "0 3.92 1065.0 \n", + "1 3.40 1050.0 \n", + "2 3.17 1185.0 \n", + "3 3.45 1480.0 \n", + "4 2.93 735.0 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_names = wine[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e9ef2d4b", + "metadata": {}, + "source": [ + "6). Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "9f2ee54a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 18.2 KB\n" + ] + } + ], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "90a743fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "размер = (178, 13)\n", + "кол-во пустых значений\n", + "alcohol 0\n", + "malic_acid 0\n", + "ash 0\n", + "alcalinity_of_ash 0\n", + "magnesium 0\n", + "total_phenols 0\n", + "flavanoids 0\n", + "nonflavanoid_phenols 0\n", + "proanthocyanins 0\n", + "color_intensity 0\n", + "hue 0\n", + "od280/od315_of_diluted_wines 0\n", + "proline 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(f\"размер = {X.shape}\\nкол-во пустых значений\\n{X.isnull().sum(axis=0)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cf169b47", + "metadata": {}, + "source": [ + "7). Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64.\n", + "Название поля - 'target'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "3883dbe8", + "metadata": {}, + "outputs": [], + "source": [ + "X[\"target\"] = wine[\"target\"].astype(\"int64\")" + ] + }, + { + "cell_type": "markdown", + "id": "e5918c4e", + "metadata": {}, + "source": [ + "8). Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название\n", + "X_corr" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "d4c9b5e1", + "metadata": {}, + "outputs": [], + "source": [ + "X_corr = X.corr()" + ] + }, + { + "cell_type": "markdown", + "id": "43fd36d4", + "metadata": {}, + "source": [ + "9). Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному\n", + "значению превышает 0.5 (причем само поле target не должно входить в этот список)." + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "a788a848", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['alcalinity_of_ash',\n", + " 'total_phenols',\n", + " 'flavanoids',\n", + " 'hue',\n", + " 'od280/od315_of_diluted_wines',\n", + " 'proline']" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_corr = [item for item in X_corr[abs(X_corr.target) > 0.5].index if item != \"target\"]\n", + "high_corr" + ] + }, + { + "cell_type": "markdown", + "id": "beab1b7d", + "metadata": {}, + "source": [ + "10). Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых\n", + "содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X\n", + "соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака.\n", + "Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с\n", + "признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с\n", + "помощью метода describe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "426afeb5", + "metadata": {}, + "outputs": [], + "source": [ + "X.drop(\"target\", axis = 1, inplace = True) \n", + "\n", + "for item in high_corr:\n", + " X[item+\"_2\"] = X[item] ** 2" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "382274e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.0000001.780000e+02
mean13.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258391.1428655.6570305.1100490.9686617.3221556.564591e+05
std0.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474133.6717752.9362944.2114410.4437983.5843165.558591e+05
min11.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000112.3600000.9604000.1156000.2304001.6129007.728400e+04
25%12.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000295.8400003.0363251.4521000.6123253.7540752.505010e+05
50%13.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000380.2500005.5460504.5582500.9312507.7284004.536045e+05
75%13.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000462.2500007.8400008.2657001.25440010.0489009.702250e+05
max14.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000900.00000015.05440025.8064002.92410016.0000002.822400e+06
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", + "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", + "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", + "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", + "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", + "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", + "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", + "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 2.295112 2.029270 0.361854 1.590899 \n", + "std 0.625851 0.998859 0.124453 0.572359 \n", + "min 0.980000 0.340000 0.130000 0.410000 \n", + "25% 1.742500 1.205000 0.270000 1.250000 \n", + "50% 2.355000 2.135000 0.340000 1.555000 \n", + "75% 2.800000 2.875000 0.437500 1.950000 \n", + "max 3.880000 5.080000 0.660000 3.580000 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 5.058090 0.957449 2.611685 746.893258 \n", + "std 2.318286 0.228572 0.709990 314.907474 \n", + "min 1.280000 0.480000 1.270000 278.000000 \n", + "25% 3.220000 0.782500 1.937500 500.500000 \n", + "50% 4.690000 0.965000 2.780000 673.500000 \n", + "75% 6.200000 1.120000 3.170000 985.000000 \n", + "max 13.000000 1.710000 4.000000 1680.000000 \n", + "\n", + " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 391.142865 5.657030 5.110049 0.968661 \n", + "std 133.671775 2.936294 4.211441 0.443798 \n", + "min 112.360000 0.960400 0.115600 0.230400 \n", + "25% 295.840000 3.036325 1.452100 0.612325 \n", + "50% 380.250000 5.546050 4.558250 0.931250 \n", + "75% 462.250000 7.840000 8.265700 1.254400 \n", + "max 900.000000 
15.054400 25.806400 2.924100 \n", + "\n", + " od280/od315_of_diluted_wines_2 proline_2 \n", + "count 178.000000 1.780000e+02 \n", + "mean 7.322155 6.564591e+05 \n", + "std 3.584316 5.558591e+05 \n", + "min 1.612900 7.728400e+04 \n", + "25% 3.754075 2.505010e+05 \n", + "50% 7.728400 4.536045e+05 \n", + "75% 10.048900 9.702250e+05 \n", + "max 16.000000 2.822400e+06 " + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1245b68", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 623c3e9b2431c8bc7443b5c2ca0912f43f1eda4b Mon Sep 17 00:00:00 2001 From: Foton Date: Mon, 10 Jun 2024 18:42:28 +0300 Subject: [PATCH 3/7] Release Home Work Machine Learning DLibraries Lesson6 --- Lesson4/Task2.ipynb | 6815 ------------------------------------------- 1 file changed, 6815 deletions(-) delete mode 100644 Lesson4/Task2.ipynb diff --git a/Lesson4/Task2.ipynb b/Lesson4/Task2.ipynb deleted file mode 100644 index ac6e6bf..0000000 --- a/Lesson4/Task2.ipynb +++ /dev/null @@ -1,6815 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b27e41e4", - "metadata": {}, - "source": [ - "## Тема “Визуализация данных в Matplotlib”" - ] - }, - { - "cell_type": "markdown", - "id": "f324024b", - "metadata": {}, - "source": [ - "### Задание 1\n", - "Загрузите модуль pyplot библиотеки matplotlib с псевдонимом plt, а также библиотеку numpy с\n", - "псевдонимом np.\n", - "Примените магическую функцию %matplotlib inline для 
отображения графиков в Jupyter Notebook и\n", - "настройки конфигурации ноутбука со значением 'svg' для более четкого отображения графиков.\n", - "Создайте список под названием x с числами 1, 2, 3, 4, 5, 6, 7 и список y с числами 3.5, 3.8, 4.2, 4.5, 5,\n", - "5.5, 7.\n", - "С помощью функции plot постройте график, соединяющий линиями точки с горизонтальными\n", - "координатами из списка x и вертикальными - из списка y.\n", - "Затем в следующей ячейке постройте диаграмму рассеяния (другие названия - диаграмма разброса,\n", - "scatter plot)." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "99e11a55", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from random import seed\n", - "from matplotlib import pyplot as plt\n", - "\n", - "plt.style.use('fivethirtyeight')\n", - "\n", - "%matplotlib inline\n", - "%config InlineBackend.figure_format = 'svg'" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "bd0d7835", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-04T11:17:30.715908\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "x = [1, 2, 3, 4, 5, 6, 7]\n", - "y = [3.5, 3.8, 4.2, 4.5, 5, 5.5, 7]\n", - "\n", - "plt.plot(x, y)\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "91fc998c", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-03T11:42:36.642755\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.scatter(x, y)\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "fbf3b995", - "metadata": {}, - "source": [ - "### Задание 2\n", - "С помощью функции linspace из библиотеки Numpy создайте массив t из 51 числа от 0 до 10\n", - "включительно.\n", - "© geekbrains.ru\n", - "Создайте массив Numpy под названием f, содержащий косинусы элементов массива t.\n", - "Постройте линейную диаграмму, используя массив t для координат по горизонтали,а массив f - для\n", - "координат по вертикали. Линия графика должна быть зеленого цвета.\n", - "Выведите название диаграммы - 'График f(t)'. Также добавьте названия для горизонтальной оси -\n", - "'Значения t' и для вертикальной - 'Значения f'.\n", - "Ограничьте график по оси x значениями 0.5 и 9.5, а по оси y - значениями -2.5 и 2.5." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f19e1fe4", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-03T11:42:36.735077\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "t = np.linspace(0, 10, 51)\n", - "\n", - "f = np.cos(t)\n", - "\n", - "plt.plot(t, f, color=\"green\")\n", - "\n", - "plt.title(\"График f(t)\")\n", - "plt.xlabel(\"Значения t\")\n", - "plt.ylabel(\"Значения f\")\n", - "\n", - "plt.axis([0.5, 9.5, -2.5, 2.5])\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "73611d04", - "metadata": {}, - "source": [ - "### Задание 3\n", - "С помощью функции linspace библиотеки Numpy создайте массив x из 51 числа от -3 до 3\n", - "включительно.\n", - "Создайте массивы y1, y2, y3, y4 по следующим формулам:\n", - "y1 = x**2\n", - "y2 = 2 * x + 0.5\n", - "y3 = -3 * x - 1.5\n", - "y4 = sin(x)\n", - "Используя функцию subplots модуля matplotlib.pyplot, создайте объект matplotlib.figure.Figure с\n", - "названием fig и массив объектов Axes под названием ax,причем так, чтобы у вас было 4 отдельных\n", - "графика в сетке, состоящей из двух строк и двух столбцов. 
В каждом графике массив x используется\n", - "для координат по горизонтали.В левом верхнем графике для координат по вертикали используйте\n", - "y1,в правом верхнем - y2, в левом нижнем - y3, в правом нижнем - y4.Дайте название графикам:\n", - "'График y1', 'График y2' и т.д.\n", - "Для графика в левом верхнем углу установите границы по оси x от -5 до 5.\n", - "Установите размеры фигуры 8 дюймов по горизонтали и 6 дюймов по вертикали.\n", - "Вертикальные и горизонтальные зазоры между графиками должны составлять 0.3" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ea585c38", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Text(0.5, 1.0, 'График y4')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-03T11:42:37.042700\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "x = np.linspace(-3, 3, 51)\n", - "\n", - "y1 = x**2\n", - "y2 = 2*x + 0.5\n", - "y3 = -3*x - 1.5\n", - "y4 = np.sin(x)\n", - "\n", - "fig, ax = plt.subplots(nrows=2, ncols=2)\n", - "\n", - "fig.set_size_inches(8, 6)\n", - "fig.subplots_adjust(wspace=0.3, hspace=0.3)\n", - "\n", - "ax1, ax2, ax3, ax4 = ax.flatten()\n", - "\n", - "ax1.plot(x, y1)\n", - "ax1.set_title(\"График y1\")\n", - "ax1.set_xlim([-5, 5])\n", - "ax2.plot(x, y2)\n", - "ax2.set_title(\"График y2\")\n", - "ax3.plot(x, y3)\n", - "ax3.set_title(\"График y3\")\n", - "ax4.plot(x, y4)\n", - "ax4.set_title(\"График y4\")" - ] - }, - { - "cell_type": "markdown", - "id": "c77b9177", - "metadata": {}, - "source": [ - "### Задание 4\n", - "В этом задании мы будем работать с датасетом, в котором приведены данные по мошенничеству с\n", - "кредитными данными: Credit Card Fraud Detection (информация об авторах: Andrea Dal Pozzolo, Olivier\n", - "Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced\n", - "Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015).\n", - "Ознакомьтесь с описанием и скачайте датасет creditcard.csv с сайта Kaggle.com по ссылке:\n", - "Credit Card Fraud Detection\n", - "Данный датасет является примером несбалансированных данных, так как мошеннические операции с\n", - "картами встречаются реже обычных.\n", - "Импортируйте библиотеку Pandas, а также используйте для графиков стиль “fivethirtyeight”.\n", - "© geekbrains.ru 1\n", - "Посчитайте с помощью метода value_counts количество наблюдений для каждого значения целевой\n", - "переменной Class и примените к полученным данным метод plot, чтобы построить столбчатую\n", - "диаграмму. 
Затем постройте такую же диаграмму, используя логарифмический масштаб.\n", - "На следующем графике постройте две гистограммы по значениям признака V1 - одну для\n", - "мошеннических транзакций (Class равен 1) и другую - для обычных (Class равен 0). Подберите\n", - "значение аргумента density так, чтобы по вертикали графика было расположено не число\n", - "наблюдений, а плотность распределения. Число бинов должно равняться 20 для обеих гистограмм, а\n", - "коэффициент alpha сделайте равным 0.5, чтобы гистограммы были полупрозрачными и не\n", - "загораживали друг друга. Создайте легенду с двумя значениями: “Class 0” и “Class 1”. Гистограмма\n", - "обычных транзакций должна быть серого цвета, а мошеннических - красного. Горизонтальной оси\n", - "дайте название “V1”." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "65f2dc4a-f788-4f45-b8d7-7bd6de270815", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud\n", - "License(s): DbCL-1.0\n", - "Downloading creditcardfraud.zip to W:\\Projects\\GB\\Python\\MLearning\\DSLibraries\\Lesson4\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " 0%| | 0.00/66.0M [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
\n", - "

5 rows × 31 columns

\n", - "" - ], - "text/plain": [ - " Time V1 V2 V3 V4 V5 V6 V7 \\\n", - "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", - "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", - "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", - "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", - "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", - "\n", - " V8 V9 ... V21 V22 V23 V24 V25 \\\n", - "0 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 \n", - "1 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 \n", - "2 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 \n", - "3 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 \n", - "4 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 \n", - "\n", - " V26 V27 V28 Amount Class \n", - "0 -0.189115 0.133558 -0.021053 149.62 0 \n", - "1 0.125895 -0.008983 0.014724 2.69 0 \n", - "2 -0.139097 -0.055353 -0.059752 378.66 0 \n", - "3 -0.221929 0.062723 0.061458 123.50 0 \n", - "4 0.502292 0.219422 0.215153 69.99 0 \n", - "\n", - "[5 rows x 31 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "d4fae789", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - "
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
count284807.0000002.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+05...2.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+052.848070e+05284807.000000284807.000000
mean94813.8595751.168375e-153.416908e-16-1.379537e-152.074095e-159.604066e-161.487313e-15-5.556467e-161.213481e-16-2.406331e-15...1.654067e-16-3.568593e-162.578648e-164.473266e-155.340915e-161.683437e-15-3.660091e-16-1.227390e-1688.3496190.001727
std47488.1459551.958696e+001.651309e+001.516255e+001.415869e+001.380247e+001.332271e+001.237094e+001.194353e+001.098632e+00...7.345240e-017.257016e-016.244603e-016.056471e-015.212781e-014.822270e-014.036325e-013.300833e-01250.1201090.041527
min0.000000-5.640751e+01-7.271573e+01-4.832559e+01-5.683171e+00-1.137433e+02-2.616051e+01-4.355724e+01-7.321672e+01-1.343407e+01...-3.483038e+01-1.093314e+01-4.480774e+01-2.836627e+00-1.029540e+01-2.604551e+00-2.256568e+01-1.543008e+010.0000000.000000
25%54201.500000-9.203734e-01-5.985499e-01-8.903648e-01-8.486401e-01-6.915971e-01-7.682956e-01-5.540759e-01-2.086297e-01-6.430976e-01...-2.283949e-01-5.423504e-01-1.618463e-01-3.545861e-01-3.171451e-01-3.269839e-01-7.083953e-02-5.295979e-025.6000000.000000
50%84692.0000001.810880e-026.548556e-021.798463e-01-1.984653e-02-5.433583e-02-2.741871e-014.010308e-022.235804e-02-5.142873e-02...-2.945017e-026.781943e-03-1.119293e-024.097606e-021.659350e-02-5.213911e-021.342146e-031.124383e-0222.0000000.000000
75%139320.5000001.315642e+008.037239e-011.027196e+007.433413e-016.119264e-013.985649e-015.704361e-013.273459e-015.971390e-01...1.863772e-015.285536e-011.476421e-014.395266e-013.507156e-012.409522e-019.104512e-027.827995e-0277.1650000.000000
max172792.0000002.454930e+002.205773e+019.382558e+001.687534e+013.480167e+017.330163e+011.205895e+022.000721e+011.559499e+01...2.720284e+011.050309e+012.252841e+014.584549e+007.519589e+003.517346e+003.161220e+013.384781e+0125691.1600001.000000
\n", - "

8 rows × 31 columns

\n", - "
" - ], - "text/plain": [ - " Time V1 V2 V3 V4 \\\n", - "count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", - "mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 \n", - "std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 \n", - "min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 \n", - "25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 \n", - "50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 \n", - "75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 \n", - "max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 \n", - "\n", - " V5 V6 V7 V8 V9 \\\n", - "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", - "mean 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 \n", - "std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 \n", - "min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 \n", - "25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 \n", - "50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 \n", - "75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 \n", - "max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 \n", - "\n", - " ... V21 V22 V23 V24 \\\n", - "count ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 \n", - "mean ... 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 \n", - "std ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 \n", - "min ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 \n", - "25% ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 \n", - "50% ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 \n", - "75% ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 \n", - "max ... 
2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 \n", - "\n", - " V25 V26 V27 V28 Amount \\\n", - "count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 \n", - "mean 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 88.349619 \n", - "std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 \n", - "min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 \n", - "25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 \n", - "50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 \n", - "75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 \n", - "max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 \n", - "\n", - " Class \n", - "count 284807.000000 \n", - "mean 0.001727 \n", - "std 0.041527 \n", - "min 0.000000 \n", - "25% 0.000000 \n", - "50% 0.000000 \n", - "75% 0.000000 \n", - "max 1.000000 \n", - "\n", - "[8 rows x 31 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ab04cf33-e36b-4991-917b-e91a2dfa7fcd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 284807 entries, 0 to 284806\n", - "Data columns (total 31 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 Time 284807 non-null float64\n", - " 1 V1 284807 non-null float64\n", - " 2 V2 284807 non-null float64\n", - " 3 V3 284807 non-null float64\n", - " 4 V4 284807 non-null float64\n", - " 5 V5 284807 non-null float64\n", - " 6 V6 284807 non-null float64\n", - " 7 V7 284807 non-null float64\n", - " 8 V8 284807 non-null float64\n", - " 9 V9 284807 non-null float64\n", - " 10 V10 284807 non-null float64\n", - " 11 V11 284807 non-null float64\n", - " 12 V12 284807 non-null float64\n", - " 13 V13 284807 non-null float64\n", - " 14 V14 284807 
non-null float64\n", - " 15 V15 284807 non-null float64\n", - " 16 V16 284807 non-null float64\n", - " 17 V17 284807 non-null float64\n", - " 18 V18 284807 non-null float64\n", - " 19 V19 284807 non-null float64\n", - " 20 V20 284807 non-null float64\n", - " 21 V21 284807 non-null float64\n", - " 22 V22 284807 non-null float64\n", - " 23 V23 284807 non-null float64\n", - " 24 V24 284807 non-null float64\n", - " 25 V25 284807 non-null float64\n", - " 26 V26 284807 non-null float64\n", - " 27 V27 284807 non-null float64\n", - " 28 V28 284807 non-null float64\n", - " 29 Amount 284807 non-null float64\n", - " 30 Class 284807 non-null int64 \n", - "dtypes: float64(30), int64(1)\n", - "memory usage: 67.4 MB\n" - ] - } - ], - "source": [ - "df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "6c31fc17", - "metadata": {}, - "outputs": [], - "source": [ - "class_counts = df.Class.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "29b823c1", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-03T12:53:10.650385\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "class_counts.plot(kind=\"bar\")\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "918cf628", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-03T12:53:11.836204\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "class_counts.plot(kind=\"bar\", logy=True)\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "b6af24e9", - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " 2024-06-03T12:53:14.097839\n", - " image/svg+xml\n", - " \n", - " \n", - " Matplotlib v3.9.0, https://matplotlib.org/\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n" - ], - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "\n", - "V1_0 = df[df[\"Class\"]==0].V1\n", - "\n", - "ax.set_xlabel(\"V1\")\n", - "ax.axis([-30, 5, 0, 0.2])\n", - "\n", - "df[df[\"Class\"]==0].V1.hist(ax=ax, label = \"Class 0\", bins=20, alpha=0.5, color='gray', density=True)\n", - "df[df[\"Class\"]==1].V1.hist(ax=ax, label = \"Class 1\", bins=20, alpha=0.5, color='red', density=True)\n", - "\n", - "legend = fig.legend(loc=\"upper right\", frameon=False)" - ] - }, - { - "cell_type": "markdown", - "id": "af2c4346", - "metadata": {}, - "source": [ - "## Задание на повторение материала\n", - "### 1. Создать одномерный массив Numpy под названием a из 12 последовательных целых чисел чисел от 12 до 24 невключительно" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "557a8256-9c01-4f25-a014-0d228c69024c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[12 13 14 15 16 17 18 19 20 21 22 23]\n" - ] - } - ], - "source": [ - "a = np.arange(12, 24, dtype=int)\n", - " \n", - "print(a)" - ] - }, - { - "cell_type": "markdown", - "id": "1c77f2b4-a4a1-4758-ab20-9c95b50a7228", - "metadata": {}, - "source": [ - "### 2. Создать 5 двумерных массивов разной формы из массива a. Не использовать в аргументах метода reshape число -1." 
- ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "c532a55c-a2e4-41b8-88fc-57202937c2df", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12],\n", - " [13],\n", - " [14],\n", - " [15],\n", - " [16],\n", - " [17],\n", - " [18],\n", - " [19],\n", - " [20],\n", - " [21],\n", - " [22],\n", - " [23]])" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (12, 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "fdd1014d-c6f5-48bc-b047-81038d8fd827", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14, 15, 16, 17],\n", - " [18, 19, 20, 21, 22, 23]])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (2, 6))" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "0ad59875-d3b2-4e5e-acfe-574669e8a068", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13],\n", - " [14, 15],\n", - " [16, 17],\n", - " [18, 19],\n", - " [20, 21],\n", - " [22, 23]])" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (6, 2))" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "fc9a8aaa-d246-4876-a818-3bbb268a4a06", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14, 15],\n", - " [16, 17, 18, 19],\n", - " [20, 21, 22, 23]])" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (3, 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "f58c5c92-4e45-4b11-8ee9-03930d40213e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14],\n", - " [15, 16, 17],\n", - " [18, 19, 20],\n", - " [21, 22, 23]])" - ] - }, - "execution_count": 47, - "metadata": 
{}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (4, 3))" - ] - }, - { - "cell_type": "markdown", - "id": "dda06888-4535-4b11-a8ac-8366cb665662", - "metadata": {}, - "source": [ - "### 3. Создать 5 двумерных массивов разной формы из массива a. Использовать в аргументах метода reshape число -1 (в трех примерах - для обозначения числа столбцов, в двух - для строк)." - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "5d2eceed-d468-43e2-a896-03db80347de7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14, 15, 16, 17],\n", - " [18, 19, 20, 21, 22, 23]])" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (2, -1))" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "5d4e33c6-9991-48f1-92dc-fe039f1f1403", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14, 15],\n", - " [16, 17, 18, 19],\n", - " [20, 21, 22, 23]])" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (3, -1))" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "7c5ee813-ab4a-44eb-af9b-0c6f086ff275", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14],\n", - " [15, 16, 17],\n", - " [18, 19, 20],\n", - " [21, 22, 23]])" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (4, -1))" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "19305afe-f66e-439d-86fb-4f1a8e9289fd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13],\n", - " [14, 15],\n", - " [16, 17],\n", - " [18, 19],\n", - " [20, 21],\n", - " [22, 23]])" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (-1, 
2))" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "021fef61-9517-4ed0-9a49-f0dd68416dff", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[12, 13, 14, 15, 16, 17],\n", - " [18, 19, 20, 21, 22, 23]])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.reshape(a, (-1, 6))" - ] - }, - { - "cell_type": "markdown", - "id": "aae22acf-9657-4492-b4d8-d6707dd79d6e", - "metadata": {}, - "source": [ - "### 4. Можно ли массив Numpy, состоящий из одного столбца и 12 строк, назвать одномерным?" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "a039beed-b1fb-428a-995b-f4788b8ba5c2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape=(12, 1)=>размерность=2, следоватьельно одномерным может быть только вектор\n" - ] - } - ], - "source": [ - "b = np.resize(a, (12, 1))\n", - "\n", - "print(f\"shape={b.shape}=>размерность={b.ndim}, следоватьельно одномерным может быть только вектор\")" - ] - }, - { - "cell_type": "markdown", - "id": "65303edd-4937-4b7e-85f0-82412f9e320b", - "metadata": {}, - "source": [ - "### 5. Создать массив из 3 строк и 4 столбцов, состоящий из случайных чисел с плавающей запятой из нормального распределения со средним, равным 0 и среднеквадратичным отклонением, равным 1.0. Получить из этого массива одномерный массив с таким же атрибутом size, как и исходный массив." 
- ] - }, - { - "cell_type": "code", - "execution_count": 78, - "id": "6f9c7bba-d9b2-4e26-8630-8322e6d1f329", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 5.30087681e-01 -1.00730270e+00 3.49023257e-01 1.17610515e-01]\n", - " [-3.73873348e-05 -4.65003983e-01 1.17452549e+00 3.80377532e-02]\n", - " [-4.71755245e-02 3.99082256e-01 -1.15223721e-01 -2.65055731e+00]]\n", - "[[ 5.30087681e-01 -1.00730270e+00 3.49023257e-01 1.17610515e-01\n", - " -3.73873348e-05 -4.65003983e-01 1.17452549e+00 3.80377532e-02\n", - " -4.71755245e-02 3.99082256e-01 -1.15223721e-01 -2.65055731e+00]]\n", - "a.size = 12 <=> 12 = b.size\n" - ] - } - ], - "source": [ - "a = np.random.randn(3, 4)\n", - "b = a.reshape(1, 12)\n", - "print(a)\n", - "print(b)\n", - "print(f\"a.size = {a.size} <=> {b.size} = b.size\")" - ] - }, - { - "cell_type": "markdown", - "id": "6b177aff-4007-4af0-a927-82cb03183362", - "metadata": {}, - "source": [ - "### 6. Создать массив a, состоящий из целых чисел, убывающих от 20 до 0 невключительно с интервалом 2." - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "a78dbbed-cbf3-4536-8e0a-f19586aa9218", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[20 18 16 14 12 10 8 6 4 2]\n" - ] - } - ], - "source": [ - "a = np.arange(20, 0, -2, dtype=int)\n", - "\n", - "print(a)" - ] - }, - { - "cell_type": "markdown", - "id": "3edb8cc8-ca5a-414f-9928-0c3be3fae6a1", - "metadata": {}, - "source": [ - "### 7. Создать массив b, состоящий из 1 строки и 10 столбцов: целых чисел, убывающих от 20 до 1 невключительно с интервалом 2. В чем разница между массивами a и b?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 93, - "id": "de8d49ea-a843-46ff-ab8f-f2b967cebeb2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[20 18 16 14 12 10 8 6 4 2] = [20 18 16 14 12 10 8 6 4 2] => разницы нет\n" - ] - } - ], - "source": [ - "b = np.arange(20, 1, -2, dtype=int)\n", - "\n", - "print(f\"{a} = {b} => разницы нет\")" - ] - }, - { - "cell_type": "markdown", - "id": "48add411-602f-46b7-9cb5-98388a6d742c", - "metadata": {}, - "source": [ - "### 8. Вертикально соединить массивы a и b. a - двумерный массив из нулей, число строк которого больше 1 и на 1 меньше, чем число строк двумерного массива b, состоящего из единиц. Итоговый массив v должен иметь атрибут size, равный 10." - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "id": "e64df321-366c-4400-a406-b775e19a6870", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0. 0.]\n", - " [0. 0.]\n", - " [1. 1.]\n", - " [1. 1.]\n", - " [1. 1.]] \n", - " size=10\n" - ] - } - ], - "source": [ - "a = np.zeros((2, 2))\n", - "b = np.ones((3,2))\n", - "c = np.vstack((a, b))\n", - "print(f\"{c} \\n size={c.size}\")" - ] - }, - { - "cell_type": "markdown", - "id": "85da760f-5076-4e9a-acb4-4e756ac72407", - "metadata": {}, - "source": [ - "### 9. Создать одномерный массив а, состоящий из последовательности целых чисел от 0 до 12. Поменять форму этого массива, чтобы получилась матрица A (двумерный массив Numpy), состоящая из 4 строк и 3 столбцов. Получить матрицу At путем транспонирования атрицы A. Получить матрицу B, умножив матрицу A на матрицу At с помощью матричного умножения. Какой размер имеет матрица B? Получится ли вычислить обратную матрицу для матрицы B и почему?" 
# %% Gram matrix B = A · Aᵀ
# A is the 4x3 matrix holding 0..11, so B is a symmetric 4x4 matrix.
# Four rows living in a 3-dimensional space are always linearly dependent,
# therefore B is singular and has no inverse (which the message reports).
a = np.arange(0, 12, dtype=int)
A = np.reshape(a, (4, -1))
At = A.T
B = np.dot(A, At)
print(f"матрица B =\n{B}\nразмер = {B.shape}\nопределитель = {np.linalg.det(B):1.0f} => обратная матрица не существует")

# %% Task 10: initialise the random number generator with seed 42
# The array in task 11 is drawn from np.random, so it is NumPy's generator
# that has to be seeded.  Seeding it explicitly (instead of through a bare
# `seed` name whose origin is not visible in this cell) makes the draw
# reproducible on a fresh kernel.
np.random.seed(42)

# %% Task 11: 16 uniformly distributed random integers from [0, 16)
# The previous np.linspace(0, 16, 16, dtype=int) call produced the fixed
# sequence 0..14, 16: not random, and it contained the excluded bound 16.
# randint's upper bound is exclusive, which is exactly what the task asks for.
c = np.random.randint(0, 16, size=16)
print(c)

# %% Task 12: square matrix C, D = B + 10 * C, determinant / rank / inverse
C = np.reshape(c, (4, -1))
print(f"C =\n{C}")
D = B + C * 10
D_rank = np.linalg.matrix_rank(D)
if D_rank == D.shape[0]:
    D_inv = np.linalg.inv(D)
else:
    # np.linalg.inv on a rank-deficient matrix either raises or returns
    # huge meaningless values (~1e13 in the previously saved output), so
    # fall back to the pseudo-inverse and say so explicitly.
    print("D вырождена: вместо обратной матрицы вычислена псевдообратная")
    D_inv = np.linalg.pinv(D)
print(f"D =\n{D}\nопределитель = {np.linalg.det(D)}\nранг = {D_rank}\nD_inv =\n{D_inv}")

# %% Task 13: binarise D_inv and use it as a mask to pick from B or C
# Positive entries become 1, everything else 0.  Building a fresh integer
# array (instead of assigning into the float matrix) also removes the
# "-0." entries the in-place version left behind.
D_inv = np.where(D_inv > 0, 1, 0)
assert np.isin(D_inv, (0, 1)).all(), "D_inv must contain only zeros and ones"
print(f"D_inv =\n{D_inv}")

# Three-argument np.where, as the task requires: take B where the mask is 1
# and C where it is 0.  The result keeps the integer dtype of B and C.
E = np.where(D_inv == 1, B, C)
print(f"E =\n{E}")
# %% k-means model
# Unsupervised learning: there are no target labels, the goal is only to
# split the objects into groups (clusters) of similar items.
from sklearn.cluster import KMeans

# n_clusters   - how many clusters the data should be split into.
# random_state - fixes the random choice of the initial centroids.
# n_init       - number of restarts with different initial centroids.  It is
#                set explicitly because its default is not the same in every
#                scikit-learn release, and the inertia / labels saved in this
#                notebook depend on it.
model = KMeans(n_clusters=3, n_init=10, random_state=100)
def make_data(n, seed):
    """Generate a shuffled toy dataset made of three 2-D point clouds.

    Each cloud is standard-normal noise shifted to its own centre
    ((3, 3), (6, 9) and (9, 3)); afterwards the first coordinate is
    multiplied by 20 and the second by 0.5, so the two features end up on
    very different scales.

    Parameters
    ----------
    n : int
        Number of objects in each of the three groups.
    seed : int
        Seed of the random generator; the same seed always yields the same
        frame (values and row order).

    Returns
    -------
    pandas.DataFrame
        ``3 * n`` rows with columns ``'x'`` and ``'y'``, in shuffled order
        (the original row numbers are kept as the index).

    Notes
    -----
    A private ``RandomState`` is used instead of ``np.random.seed`` so that
    calling this function no longer resets the global NumPy generator as a
    hidden side effect.  The generated numbers are the same as the ones the
    globally seeded version produced.
    """
    rng = np.random.RandomState(seed)

    shift_matrix = np.array([[3, 3],
                             [6, 9],
                             [9, 3]])

    # (group, coordinate, object) noise plus the per-group centre.
    data = rng.randn(3, 2, n) + shift_matrix.reshape((3, 2, 1))
    # -> (group, object, coordinate) -> flat list of (x, y) rows.
    data = np.swapaxes(data, 1, 2)
    data = data.reshape((-1, 2))
    data *= np.array([[20, 0.5]])

    df = pd.DataFrame({'x': data[:, 0], 'y': data[:, 1]},
                      columns=['x', 'y'])
    # Shuffle the rows so the groups are not stored one after another; the
    # same private generator keeps the order reproducible.
    df = df.sample(frac=1.0, random_state=rng)

    return df
# %% Two independent samples: 10 objects per group, seeds 42 and 27
# `train` is used to fit the scaler and the clustering model, `test` is
# only assigned to the clusters found on `train`.
train, test = make_data(10, 42), make_data(10, 27)
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
# %% Raw training data
# Every object has just two features, so it can be drawn as a point on a
# plane ('x' on the horizontal axis, 'y' on the vertical one).
fig, ax = plt.subplots()
ax.scatter(train['x'], train['y'])

ax.set_xlabel('Длина тела')
ax.set_ylabel('Длина чешуи')
ax.set_title('Train data')

plt.show()
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
# %% Min-max scaling of the training data
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on `train` only; the very same fitted object is
# reused later for the test data and for mapping cluster centres back.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train)
train_scaled = pd.DataFrame(scaled_values, columns=['x', 'y'])

fig, ax = plt.subplots()
ax.scatter(train_scaled['x'], train_scaled['y'])

ax.set_xlabel('Длина тела')
ax.set_ylabel('Длина чешуи')
ax.set_title('Train data (scaled)')

plt.show()

# %% Fit k-means and get the cluster label of every training object
# fit_predict trains the model and immediately returns the labels of the
# data it was trained on.
train_labels = model.fit_predict(train_scaled)

print(train_labels)
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "id": "ye5MuAmNONfR", + "outputId": "a52d5c35-3166-48b1-da20-0338d7096b35" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Train data')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:19.105253\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
# %% Cluster centres on top of the (unscaled) training data
# The model was fitted on scaled features, so its centres are mapped back
# to the original units with the same scaler before plotting.
centers = scaler.inverse_transform(model.cluster_centers_)

# Objects are coloured by cluster label; centres are drawn as red diamonds.
plt.scatter(train['x'], train['y'], c=train_labels)
plt.scatter(centers[:, 0], centers[:, 1], marker='D', color='red')

plt.xlabel('Длина тела')
plt.ylabel('Длина чешуи')

plt.title('Train data')

# Without show() the cell's last expression (the Text object returned by
# plt.title) was echoed into the output next to the figure.
plt.show()

# %% Inertia of the fitted model
# Sum of squared distances from every object to the centre of its cluster;
# it keeps decreasing as the number of clusters grows.
model.inertia_
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "id": "sK44YXDaONfT", + "outputId": "6bbf6bf8-d9e3-4571-c975-7b9cdd48fe9c" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:19.958900\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
# %% Elbow method: inertia for different numbers of clusters
# The candidate cluster counts are defined once and shared by the fitting
# step and the plot, so the x axis can never drift from the fitted models.
cluster_counts = range(2, 10)

# One freshly fitted model per candidate k.  n_init is pinned for the same
# reason as in the main model: its default differs between scikit-learn
# releases, which would change the curve.
inertias = [
    KMeans(n_clusters=k, n_init=10, random_state=100).fit(train_scaled).inertia_
    for k in cluster_counts
]

plt.plot(cluster_counts, inertias)

plt.title('Inertia')

plt.show()
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "test_scaled = pd.DataFrame(scaler.transform(test), columns=['x', 'y'])\n", + "\n", + "test_labels = model.predict(test_scaled)\n", + "\n", + "plt.scatter(test['x'], test['y'], c=test_labels)\n", + "plt.scatter(centers[:, 0], centers[:, 1], marker='D', color='red')\n", + "\n", + "plt.xlabel('Длина тела')\n", + "plt.ylabel('Длина чешуи')\n", + "\n", + "plt.title('Test data')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "id": "NJVFjt4ZONfV" + }, + "source": [ + "Как мы видим, каждый объект из тестовых данных вполне оправданно был отнесён к соответствующему кластеру.\n", + "\n", + "### Агломеративная кластеризация\n", + "\n", + "Перейдём к _иерархической кластеризации_, которая в свою очередь состоит из _агломеративной_ и _дивизивной_ кластеризации.\n", + "\n", + "Алгоритмы _агломеративной_ кластеризации основаны на объединении мелких кластеров в более крупные. На начальной итерации каждый объект считается отдельным кластером, затем происходит их последовательное объединение. Напротив, _дивизивная_ кластеризация - это обратный процесс: сначала вся выборка считается одним большим кластером, затем происходит последовательное разбиение кластеров на части." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "id": "TUmgJi7fONfV", + "outputId": "30dd5e5e-bfdb-418f-daec-f4fab42eb264" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Train data')" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:21.291171\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.cluster import AgglomerativeClustering\n", + "\n", + "model = AgglomerativeClustering(n_clusters=3)\n", + "\n", + "train_labels = model.fit_predict(train_scaled)\n", + "\n", + "plt.scatter(train['x'], train['y'], c=train_labels)\n", + "\n", + "plt.xlabel('Длина тела')\n", + "plt.ylabel('Длина чешуи')\n", + "\n", + "plt.title('Train data')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jMAmKN6iONfV" + }, + "source": [ + "Отметим, что у модели `AgglomerativeClustering` нет метода `.predict`. Поэтому чтобы предсказать разбиение на кластеры для тестовых данных, воспользуемся уже известным нам алгоритмом KNN. Будем использовать полученные только что метки классов `train_labels` для того, чтобы превратить нашу задачу в задачу обучения с учителем." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "id": "H0c_RQj9ONfV", + "outputId": "416c09ec-c5cc-428e-8793-ddc59daa7f46" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Test data')" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:21.743197\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=5)\n", + "knn.fit(train_scaled, train_labels)\n", + "\n", + "test_labels = knn.predict(test_scaled)\n", + "\n", + "plt.scatter(test['x'], test['y'], c=test_labels)\n", + "\n", + "plt.xlabel('Длина тела')\n", + "plt.ylabel('Длина чешуи')\n", + "\n", + "plt.title('Test data')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N4LH5gnYONfW" + }, + "source": [ + "## Понижение размерности данных\n", + "\n", + "Предположим, перед нами стоит задача обучения с учителем, и в наших данных очень много признаков (сотни, тысячи или даже больше). На таком количестве признаков модель построить практически невозможно: нам просто не хватит вычислительной мощности.\n", + "\n", + "В таких случаях можно пользоваться методами понижения размерности. Такие методы позволяют \"сжать\" данные, уменьшая число признаков в них, причём так, чтобы потеря информации была минимальной.\n", + "\n", + "Один из таких методов - _Метод главных компонент_ (_Principal component analysis_ или _PCA_). Для нахождения главных компонент нужно вычислить ковариационную матрицу, затем для этой матрицы найти собственные векторы и собственные значения. (Также есть способ нахождения главных компонент с помощью сингулярного разложения.)\n", + "\n", + "Перед применением данного метода важно не забыть привести все данные к схожему масштабу. Это можно сделать, например, с помощью стандартизации или нормализации.\n", + "\n", + "Создадим небольшой набор данных, чтобы на нём познакомиться с методом главных компонент. Это будет набор данных с двумя признаками:\n", + "\n", + "1. количество комнат в доме\n", + "2. 
площадь дома" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "id": "x-so-YwLONfX" + }, + "outputs": [], + "source": [ + "houses = np.array([[5, 1.6],\n", + " [4, 1.4],\n", + " [6, 1.9],\n", + " [3, 1.1],\n", + " [4, 1.25]])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lFRdxR4CONfX" + }, + "source": [ + "Изобразим эти данные на плоскости:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "id": "Xn2yKcAOONfX", + "outputId": "cf2d0c00-b70d-4b6d-c9e5-4ba6e1439756" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:22.162617\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(houses[:, 0], houses[:, 1])\n", + "\n", + "plt.xlabel('Количество комнат')\n", + "plt.ylabel('Площадь дома')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rdAXNce_ONfY" + }, + "source": [ + "Из графика видно, что два признака сильно коррелируют: их можно расположить практически на одной прямой. Это означает, что наши данные можно сжать с двух до одного измерения (представить данные одной _главной компонентой_) с минимальной потерей информации.\n", + "\n", + "Разберём подробнее алгоритм метода главных компонент. Вначале нужно центрировать наши данные, т.е. преобразовать их так, чтобы среднее значение каждого признака было равно 0. Для этого вычтем из матрицы признаков средние значения по каждому признаку:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "id": "_WS2-179ONfY", + "outputId": "47cdb0ec-9425-4f39-b28c-989e80df9445" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.6 0.15]\n", + " [-0.4 -0.05]\n", + " [ 1.6 0.45]\n", + " [-1.4 -0.35]\n", + " [-0.4 -0.2 ]]\n" + ] + } + ], + "source": [ + "houses_centered = houses - houses.mean(axis=0)\n", + "\n", + "print(houses_centered)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bLVkFydKONfZ" + }, + "source": [ + "Теперь посчитаем _матрицу ковариаций_. Это матрица, состоящая из попарных ковариаций признаков. Это своего рода многомерный аналог дисперсии. 
Матрицу ковариаций можно посчитать с помощью функции `np.cov`:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "id": "CW9aCI6pONfZ" + }, + "outputs": [], + "source": [ + "covariance_matrix = np.cov(houses_centered.T)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LMhpa5BXONfZ" + }, + "source": [ + "Отметим, что данная функция (как и многие функции из линейной алгебры) принимает на вход матрицу, у которой векторы расположены в столбцах (а не в строках, как мы привыкли). Поэтому внутрь этой функции мы подаём транспонированную матрицу `houses_centered.T`." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "id": "5amTYC9MONfZ", + "outputId": "2ec21bbe-dd17-4715-ebc7-9cb0f0b8e32d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1.3 0.35 ]\n", + " [0.35 0.0975]]\n" + ] + } + ], + "source": [ + "print(covariance_matrix)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z6FgsGbgONfb" + }, + "source": [ + "У нас получилась матрица размера $2 \\times 2$, поскольку число признаков равно 2. На главной диагонали этой матрицы стоят дисперсии соответствующих признаков. Например, посчитаем дисперсию первого признака вручную:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "id": "MCBrhV3mONfb", + "outputId": "28fe8039-91c0-45f8-e1f5-93e7f6c09b60" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.3" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.var(houses_centered[:, 0], ddof=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e48ZA6k8ONfc" + }, + "source": [ + "Здесь мы подали параметр `ddof=1`, чтобы посчитать несмещённую дисперсию.\n", + "\n", + "Теперь найдём _собственные значения_ (_eigenvalues_) и _собственные векторы_ (_eigenvectors_) данной матрицы. 
Это можно сделать с помощью функции `np.linalg.eig`:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "id": "T4fq3-_FONfc", + "outputId": "7ae7dea9-6479-4b55-9419-35bef3bebd47" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "eigenvalues:\n", + "[1.39445221 0.00304779]\n", + "\n", + "eigenvectors:\n", + "[[ 0.96546225 -0.26054298]\n", + " [ 0.26054298 0.96546225]]\n" + ] + } + ], + "source": [ + "eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)\n", + "\n", + "print('eigenvalues:\\n{}\\n'.format(eigenvalues))\n", + "print('eigenvectors:\\n{}'.format(eigenvectors))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pi5eL-7iONfd" + }, + "source": [ + "Метод главных компонент заключается в том, что если мы хотим сжать число признаков с $n$ признаков до $k$ признаков, то нам выгоднее всего взять $k$ наибольших собственных значений матрицы ковариаций. Этим $k$ наибольшим значениям соответствуют $k$ собственных векторов (обратите внимание, что векторы здесь также записаны в столбцах). Чтобы получить \"сжатые\" данные, остаётся лишь умножить нашу матрицу объект-признак на эти $k$ векторов-столбцов.\n", + "\n", + "В нашем случае мы хотим получить данные с 1 признаком, поэтому возьмём только первое собственное значение (оно сильно больше, чем второе). Ему соответствует вектор `eigenvectors[:, 0]`. Умножим нашу матрицу с центрированными данными на этот вектор." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "9t4tzi2kONfd", + "outputId": "33f8256e-f675-4897-dd45-ad62a6150cab" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.6183588 -0.39921205 1.66198394 -1.44283719 -0.4382935 ]\n" + ] + } + ], + "source": [ + "result = houses_centered.dot(eigenvectors[:, 0])\n", + "\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CGSoh7d1ONfe" + }, + "source": [ + "Наконец, расположим эти значения в столбце, чтобы получить матрицу объект-признак:" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "id": "l_z8xHB_ONfe", + "outputId": "8fa90444-f50a-40aa-b3cd-b2a229cc39f5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.6183588 ]\n", + " [-0.39921205]\n", + " [ 1.66198394]\n", + " [-1.44283719]\n", + " [-0.4382935 ]]\n" + ] + } + ], + "source": [ + "result = result.reshape(-1, 1)\n", + "\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-u4LfNuwONfe" + }, + "source": [ + "Всю проделанную нами работу можно выполнить и автоматически с помощью библиотеки `sklearn`:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "id": "CokcTYCLONff", + "outputId": "fcf8090a-a7ae-43f5-838a-a8c78bc6a39d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0.6183588 ]\n", + " [-0.39921205]\n", + " [ 1.66198394]\n", + " [-1.44283719]\n", + " [-0.4382935 ]]\n" + ] + } + ], + "source": [ + "from sklearn.decomposition import PCA\n", + "\n", + "pca = PCA(n_components=1)\n", + "\n", + "result_ = pca.fit_transform(houses)\n", + "\n", + "print(result_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U-P88G1xONfg" + }, + "source": [ + "Кроме того, использование модели из `sklearn` позволяет нам также посмотреть, какую долю объяснённой дисперсии содержит данная 
главная компонента:" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "id": "NK00XkCuONfg", + "outputId": "0b4311b2-5b9f-4df0-9dce-e76533edd66a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.99781911])" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca.explained_variance_ratio_" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7U_GAERXONfh" + }, + "source": [ + "Мы видим, что это значение близко к 1. Это означает, что мы потеряли менее 1 процента информации. В общем случае, потеря информации до 10 процентов считается не очень большой.\n", + "\n", + "### Применение PCA в моделях машинного обучения\n", + "\n", + "Попробуем применить метод главных компонент в модели классификации. Загрузим данные по пассажирам Титаника, сохранённые ранее:" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "id": "UAFj0SGhONfh" + }, + "outputs": [], + "source": [ + "X_train = pd.read_pickle('X_train.pkl')\n", + "y_train = pd.read_pickle('y_train.pkl')\n", + "\n", + "X_valid = pd.read_pickle('X_valid.pkl')\n", + "y_valid = pd.read_pickle('y_valid.pkl')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_IBQNbByONfi" + }, + "source": [ + "Будем масштабировать данные с помощью стандартизации. Подадим стандартизатору параметр `with_mean=False`, чтобы не проводить центрирование." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "id": "yPxQL0KZONfi" + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler(with_mean=False)\n", + "\n", + "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)\n", + "X_valid_scaled = pd.DataFrame(scaler.transform(X_valid), columns=X_valid.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nDv3Dp5sONfi" + }, + "source": [ + "Запустим теперь метод главных компонент. Здесь мы не будем указывать число главных компонент, по умолчанию модель `PCA` посчитает все. Посчитаем главные компоненты и посмотрим на долю объяснённой дисперсии каждой компоненты." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "id": "kLiY0IS0ONfj", + "outputId": "c46dac8d-dbab-4ff2-a12c-78e8faca5ba7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2.22480052e-01 1.80579178e-01 1.61196472e-01 1.18874068e-01\n", + " 9.43088132e-02 7.61203186e-02 6.30288483e-02 4.72728880e-02\n", + " 3.61393615e-02 1.18513288e-31 4.46814462e-32]\n" + ] + } + ], + "source": [ + "pca = PCA(random_state=100)\n", + "\n", + "pca.fit(X_train_scaled)\n", + "\n", + "print(pca.explained_variance_ratio_)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QFGS_s5dONfj" + }, + "source": [ + "Посчитаем сумму первых 7 значений:" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "id": "qc4gtCGKONfk", + "outputId": "e819cc92-b51c-487b-e60a-39855b61a320" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9165877504406762" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca.explained_variance_ratio_[:7].sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DzYuNz66ONfk" + }, + "source": [ + "Выходит, если мы оставим 7 главных 
компонент с самой большой объяснённой дисперсией, то мы сохраним почти 92 процента информации.\n", + "\n", + "Ещё раз создадим модель `PCA`, на этот раз указав число компонент, и получим тренировочные и валидационные данные пониженной размерности:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "id": "2XZovIpzONfk" + }, + "outputs": [], + "source": [ + "pca = PCA(n_components=7, random_state=100)\n", + "\n", + "X_train_mc = pca.fit_transform(X_train_scaled)\n", + "X_valid_mc = pca.transform(X_valid_scaled)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_aYhDZ8PONfk" + }, + "source": [ + "Теперь обучим на этих данных модель логистической регрессии." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "id": "4Tjl-rbqONfl", + "outputId": "a1041428-4cdd-4305-ef1a-0c9c99a41272" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8026905829596412\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "lr = LogisticRegression()\n", + "\n", + "lr.fit(X_train_mc, y_train)\n", + "\n", + "y_pred = lr.predict(X_valid_mc)\n", + "\n", + "print(accuracy_score(y_valid, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7uBf7K66ONfl" + }, + "source": [ + "Мы получили вполне приемлемую точность модели несмотря на то, что сильно снизили размерность данных.\n", + "\n", + "## Алгоритм t-SNE\n", + "\n", + "Алгоритм _t-SNE_ (_t-distributed Stochastic Neighbor Embedding_ или _Стохастическое вложение соседей с t-распределением_) позволяет понижать размерность данных до двух или трёх измерений, что позволяет визуализировать данные на двумерных и трёхмерных графиках. 
Изучая графики, можно, например, понять, на сколько кластеров адекватно разбивать данные, а также оценить уже выполненное разбиение на кластеры.\n", + "\n", + "Разберёмся в работе этого алгоритма с помощью датасета Boston из библиотеки `sklearn`." + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "id": "Lc8MbY5FONfm", + "outputId": "52f3427a-5eda-4f60-92ae-a3932be304fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 506 entries, 0 to 505\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 CRIM 506 non-null float64\n", + " 1 ZN 506 non-null float64\n", + " 2 INDUS 506 non-null float64\n", + " 3 CHAS 506 non-null float64\n", + " 4 NOX 506 non-null float64\n", + " 5 RM 506 non-null float64\n", + " 6 AGE 506 non-null float64\n", + " 7 DIS 506 non-null float64\n", + " 8 RAD 506 non-null float64\n", + " 9 TAX 506 non-null float64\n", + " 10 PTRATIO 506 non-null float64\n", + " 11 B 506 non-null float64\n", + " 12 LSTAT 506 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 51.5 KB\n" + ] + } + ], + "source": [ + "from sklearn.manifold import TSNE\n", + "from sklearn.datasets import load_boston\n", + "\n", + "boston = load_boston()\n", + "\n", + "X = pd.DataFrame(boston.data, columns=boston.feature_names)\n", + "y = boston.target\n", + "\n", + "X.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4MSfnfZEONfm" + }, + "source": [ + "Видно, что данные представляют из себя 13 столбцов с дробными значениями, пропусков нет.\n", + "\n", + "Разобьём данные на тренировочный и тестовый датасеты." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "id": "fvOnLpaXONfn" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ekFpA-C4ONfo" + }, + "source": [ + "Нам важно, чтобы все признаки имели схожий масштаб, поэтому отмасштабируем их с помощью стандартизации." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "id": "45CjnVG_ONfo" + }, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "\n", + "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)\n", + "X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J1d-5qwqONfo" + }, + "source": [ + "Итак, перейдём к использованию t-SNE. Зададим параметр `n_components=2`, чтобы получить данные с двумя признаками. Параметр `learning_rate` влияет на то, как плотно будут располагаться точки. Рекомендуется задавать его в диапазоне от 10 до 1000." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "id": "HWt-_P61ONfo", + "outputId": "faac3c81-85b0-44df-9427-9494e76e33a6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "До:\t(354, 13)\n", + "После:\t(354, 2)\n" + ] + } + ], + "source": [ + "tsne = TSNE(n_components=2, learning_rate=150, random_state=100)\n", + "\n", + "X_train_tsne = tsne.fit_transform(X_train_scaled)\n", + "\n", + "print('До:\\t{}'.format(X_train_scaled.shape))\n", + "print('После:\\t{}'.format(X_train_tsne.shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gEthQ47_ONfo" + }, + "source": [ + "Мы видим, что число признаков уменьшилось с 13 до 2. Теперь можно визуализировать наши данные на плоскости." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "id": "RIGICrkHONfo", + "outputId": "671f4770-6c3f-4f12-b61c-50e7025bfe20" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:27.752609\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "441G3ut8ONfq" + }, + "source": [ + "По графику видно, что данные можно разбить как минимум на 2 кластера. Попробуем сделать это с помощью уже известного нам метода K-means и ещё раз построим график, но уже с полученными метками кластеров." + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "id": "6-qmw9S9ONfq", + "outputId": "8d7762ec-a613-4bd2-963c-83009118c099" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:28.176633\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "kmeans = KMeans(n_clusters=2)\n", + "\n", + "labels_train = kmeans.fit_predict(X_train_scaled)\n", + "\n", + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1], c=labels_train)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xkT2rdg0ONfr" + }, + "source": [ + "Как мы видим, при кластеризации мы получили практически такое же разбиение, как то, которое можно было наблюдать в результате t-SNE.\n", + "\n", + "Для тестовой выборки получить аналогичный график нам не удастся, поскольку у алгоритма t-SNE нет метода `.transform`. Однако, с помощью алгоритма K-means мы можем получить метки кластеров для тестовой выборки с помощью метода `.predict` и использовать их:" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "id": "m8VhvtYyONfr" + }, + "outputs": [], + "source": [ + "labels_test = kmeans.predict(X_test_scaled)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OghL70IVONfs" + }, + "source": [ + "Давайте посмотрим на объекты из разных кластеров, чтобы попытаться понять, почему алгоритм t-SNE выделил две группы. Например, посмотрим на среднюю цену недвижимости во всей выборке и в отдельных кластерах." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "id": "x9EpamzgONfs", + "outputId": "dac65932-87f6-4432-d139-b1c786c9d654" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "22.595762711864406" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "id": "xZ5dj4BWONft", + "outputId": "cea24006-86c9-47ae-caa7-1280353339de" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "25.94439655172414" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train[labels_train == 0].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "id": "etaysaejONfu", + "outputId": "46f3d586-e105-40fd-ade2-d812ddf132fc" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "16.227868852459018" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train[labels_train == 1].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-tmr-7bkONfv" + }, + "source": [ + "Видно, что в кластер с меткой 0 попала более дорогая недвижимость, а в кластер с меткой 1 — менее дорогая (номера меток K-means назначает произвольно, поэтому при повторном запуске они могут поменяться местами). Построим также гистограммы распределения цены по каждому кластеру." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "id": "pCYGhXyhONfv", + "outputId": "80ef9aba-5b50-4b32-d6bc-9e25743ddf68" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:28.875272\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(y_train[labels_train == 0], bins=20, density=True, alpha=0.5)\n", + "plt.hist(y_train[labels_train == 1], bins=20, density=True, alpha=0.5)\n", + "\n", + "plt.legend(['Кластер 0', 'Кластер 1'])\n", + "plt.xlabel('Цена')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "03W5vyzuONfw" + }, + "source": [ + "Гистограмма также отражает замеченную закономерность. Заметим, что мы не использовали цену на недвижимость ни при использовании t-SNE, ни при использовании K-means.\n", + "\n", + "Посмотрим теперь на распределение отдельных признаков внутри кластеров. Например, `'RM'` - количество комнат." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "id": "JhCy_DGZONfw", + "outputId": "4f8096da-67a2-4b23-8f06-b9110a8364bf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Кластер 0: 6.44851724137931\n", + "Кластер 1: 5.9092868852459\n" + ] + } + ], + "source": [ + "print('Кластер 0: {}'.format(X_train.loc[labels_train == 0, 'RM'].mean()))\n", + "print('Кластер 1: {}'.format(X_train.loc[labels_train == 1, 'RM'].mean()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AYDgLchIONfx" + }, + "source": [ + "Значения не сильно отличаются. Не похоже, что этот признак внёс какой-то вклад в разбиение. Рассмотрим другой признак - `'CRIM'`. Он отражает криминальность места, в котором расположена недвижимость (количество преступлений на душу населения)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "id": "dNQFF-IPONfx", + "outputId": "02456040-2ad5-40d1-e219-244a521c3184" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Кластер 0: 0.2679815948275862\n", + "Кластер 1: 9.056794180327868\n" + ] + } + ], + "source": [ + "print('Кластер 0: {}'.format(X_train.loc[labels_train == 0, 'CRIM'].mean()))\n", + "print('Кластер 1: {}'.format(X_train.loc[labels_train == 1, 'CRIM'].mean()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3SJQzpbdONfz" + }, + "source": [ + "Видно, что данное значение сильно выше для второго кластера. Рассмотрим гистограммы распределений значений этого признака по кластерам." + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "id": "wmWbZwM8ONfz", + "outputId": "f1a82310-f208-4e10-fc96-09b62b4f31cd" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:29.523701\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(X_train.loc[labels_train == 0, 'CRIM'], bins=20, density=True, alpha=0.5)\n", + "plt.hist(X_train.loc[labels_train == 1, 'CRIM'], bins=20, density=True, alpha=0.5)\n", + "\n", + "plt.xlim(0, 12)\n", + "plt.legend(['Кластер 0', 'Кластер 1'])\n", + "plt.xlabel('CRIM')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hraxR5EPONf0" + }, + "source": [ + "В кластере 0 максимальная плотность этого признака расположена возле 0. Для кластера 1 значения этого признака более разнообразны.\n", + "\n", + "Ещё один интересный признак - концентрация оксида азота в воздухе (`'NOX'`). Построим гистограммы распределения этого признака." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "id": "AwqHj23mONf0", + "outputId": "85dc51b8-f4be-46e6-f130-9ecc51c5cde7" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-16T23:08:30.143534\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(X_train.loc[labels_train == 0, 'NOX'], bins=20, density=True, alpha=0.5)\n", + "plt.hist(X_train.loc[labels_train == 1, 'NOX'], bins=20, density=True, alpha=0.5)\n", + "\n", + "plt.xlim(0, 1.2)\n", + "plt.legend(['Кластер 0', 'Кластер 1'])\n", + "plt.xlabel('NOX')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "43x9OkfZONf1" + }, + "source": [ + "Вновь кластер 1 в этом плане отличается в худшую сторону. \n", + "\n", + "Итак, как нам теперь использовать полученную информацию? Мы можем, например, попробовать построить отдельную модель для каждого кластера. Сначала построим модель на всех данных. Для оценки качества модели будем использовать `r2_score`." + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "id": "Uk1ZpwAoONf2", + "outputId": "cf50c541-f953-451f-8f43-d11ae2cdf699" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6508417720329545" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import r2_score\n", + "\n", + "lr = LinearRegression()\n", + "\n", + "lr.fit(X_train_scaled, y_train)\n", + "\n", + "y_test_pred = lr.predict(X_test_scaled)\n", + "\n", + "r2_score(y_test, y_test_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Un0-x2AsONf2" + }, + "source": [ + "Итак, мы получили базовое значение. 
Построим теперь разные модели для объектов разных кластеров и посмотрим, какое качество мы сможем получить на них.\n", + "\n", + "Модель для кластера 0:" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "id": "DpdalsvnONf3", + "outputId": "3e8ba6e5-e49d-4402-dbc0-581928f9e343" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8316034335298439" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.fit(X_train_scaled.loc[labels_train == 0], y_train[labels_train == 0])\n", + "\n", + "y_test_pred_0 = lr.predict(X_test_scaled.loc[labels_test == 0])\n", + "\n", + "r2_score(y_test[labels_test == 0], y_test_pred_0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EoHivRXONf4" + }, + "source": [ + "Модель для кластера 1:" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "id": "ciSWsRHRONf4", + "outputId": "3bfb16b4-d4d7-4f23-fba3-2da03b2db0cb" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6458127429551195" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr.fit(X_train_scaled.loc[labels_train == 1], y_train[labels_train == 1])\n", + "\n", + "y_test_pred_1 = lr.predict(X_test_scaled.loc[labels_test == 1])\n", + "\n", + "r2_score(y_test[labels_test == 1], y_test_pred_1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tdE4IHVGONf5" + }, + "source": [ + "Для кластера 0 мы получили большее значение `r2_score`, однако сравнивать эти модели пока рано, потому что данное значение получено не на всех данных. Соберём теперь данные по кусочкам и проверим значение `r2_score`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "id": "CuIGQqf_ONf5", + "outputId": "b433067c-4efc-4c48-c6b7-7cc5491d437a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8055877528812476" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test_all = np.hstack([y_test[labels_test == 0], y_test[labels_test == 1]])\n", + "y_test_pred_all = np.hstack([y_test_pred_0, y_test_pred_1])\n", + "\n", + "r2_score(y_test_all, y_test_pred_all)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gdXJ4kAHONf6" + }, + "source": [ + "Итак, теперь можно утверждать, что мы получили значительно более высокий показатель `r2_score`, применив информацию, полученную с помощью t-SNE и K-means." + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "NJVFjt4ZONfV", + "N4LH5gnYONfW", + "7U_GAERXONfh", + "7uBf7K66ONfl" + ], + "name": "5. Обучение без учителя в Scikit-learn.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/Lesson8/X_train.pkl b/Lesson8/X_train.pkl new file mode 100644 index 0000000000000000000000000000000000000000..491452ab58709ae189dc64f20d9515f0d8bf59ba GIT binary patch literal 37231 zcmd5_3A|NRz21z1h(Ll$4EHi4!)2a_ySWSs7mz7SK`)ofg#*F`@1UY$#}pJhrO44V zeMMPjrB>pArNz&(vdru?I5dd$G>0UoQcJD1*Z=$0KKl>r?Q=n2@ANp|`Tqa!KdiOa zT6>@K>UXoPuU)+}u&-Q~&DC|Qs~gH{Hr3abt*NixSewtapH%B z^J^Q*R#i8caz|Iy)t*;1x6as1K&C^k4L{q$t4%ZcT(`QWjhnYfn;V-pG_I?zudm)> z>Q&X()@-V4XsmClX*A)vHddI=9c6}PCA;#K!M6O&7OV1?=a=LMSGK6^uq~hImTR-R 
zam(h~D5T}7WAi(%5O1qoi;MCL7cQLlFY~f*mF123RXd_Fv|4I9%(a@a)->H_>AF=* zHyd13U0;*Q=UUIHu8+ntd*iC=`m<|SSIsOWiwnupe6HQ1nhn(r4OOG!kE7#{WAe48 zWUh5xbTt+>Z8vRaI_7#DY6i_UjX~Vu>TGC4d8cypM>sq@n%u7Lf^z+S-4h<^}#4frF7I~I5`@_h=rJAg+4e-7OpzT5CxL$wdA|?)Kwx+HSHgb>a1U@S{GFg5 zh5D1h+e3FZa1HGD0IOj?4tZ{b{bb-W*rx-#K;Ij<7rK{#9ih7y_yBY#!M`8&mf%gm z`M|q@#{h2tehYE)Fut$A{w3Jo22O#!D|D5x*8?wt|L^E)9qbLjb6`Ic{59}1z!!sW zM*h*jX~0tC9{{XE+%dp!B7PY7P~be+Z>1jjK8!s3fp0*!2f9;W?*jc2@H^q(2L2;p z2AD;jmcXZBUj#lFyfgB&2HpgJ3*ZlcPXWJ;xGTY*M%+KaC&K?_@Y%rQU_ToAe*jMa z)O{4MA z-1)%cp_>i=WW+xJ-WmSSK>uUl%kZBL{5$ODf?o{$9qfIP=dZAX2KA`-D;TDN=fJxn z{^N){0rtCKZ-V_<;3n9w27VOS26!Ce%Yjv}<6$qj5PSr12>N*uydH5Kftz6;3w|qb z0s{P)BD67ZYwmm<$4 z;9rCOF5oEWPXkVdeL3Q;1K$eVhdg7zS0Qd9^jE?DIP6ygL&VJhH;-2K)g5s^fd5y( z>!B+H9|Zk3fqw@+3;%Lp3*>tj{6^?Xk?*gte+c$0a3=f#uod#04*VW;jle%5&)wi< zh`SZ`j{~m+t^YfB<;jn)Jd?ffe;2#73D)^fj`9=dOXzY07DbzcA; zh4_`w?*^U={W|zxf&E3q4TAm4uuq462l$QPCD5$}Zv||GZVRvn>|21B0G~m>)=;` zABVhefS(M$61uUlzY1Osz6ba-jB^NhN5uae`W^6p8u&Npo`Sy(@KV@&Bfb{=Pr!b_ z-vDm|)*$`~@ae$Mpx&FnkF1 z2R;G&70_J*`)br}5Bm(*I|2U-coOhsq0z$MVV4F6-mUjqlj|6TBR z5cfFvqwxP2cs}|)7W^mBe-StgaZdrSf$n?YzXxxPJnIoR9{%m{g9bC88x6x^=pP04 z1Gaya0T?sz{er)8Q|@a=QZe#1x|(kbm%Vw{}Avi&^03da_Gi@KMUP@ z@VCK3@HY@YfpNfZ0KW@=C+P2o{ddUoHSm9dPl5io;G^LG3-Dd|j|WbM?h)V@VDARL z5BwA01JTFl!2bgL7_cS$bufUiZqZr~4q zp8|e1uoL3WMBTf<*TSDSzjS6sb?^5gtJqg7^d0nFl@{aTQ8W-CXFVB;&KR zSHRBs(Q&g}e4^iQ5$vOYQ-PgTAFp6uE5x70d;~M0F9)*z=wy5rI%2@$B> zLB-?T$>KU?XE;BvBQdZq9glU$^NRBkOgku##0R0^Fd)|l*C+RByq}WwF+bdw0rPR( zxGwv+u1Zmd>krq}?LU1kcZQzp9M{#IKWhHXxRd?q^~H0G=ZRi#?34SM_a&~s!RTio zkn_WH3*5~+*DKHU@vgsQpFNQeu=zYxlF6WomVV0XW87KFpzVGuoaQvJ%>h=2K zzNDY?!0~WD@Or+2xaGabc zu5Zo<_W`dH&+T}+<5?#9eO#CN z{>^&Bk(cWa^T>T4uwH-Bx#y_>aa<2vKYE{`AD55&fEZu*z`l50r@MJi%xjjU)K$o76F{?vLj! 
z*C+3XJojnmbzuM0@%o>L@pl#E_0OXJApU-l>?>YhLG&`uPZr&{n13Yf`W%?4=7Y}( zdOybVldK)9ciG66kAF&Qm@^hV4$au2+KE!ii zG`_Dti+`*eBQd5UCTeKc1W1cjV~H&3}Nn_%slY^I7jV=I46jx?w!?lk59B*FUbOxh~0d5kIFT>^xuG?C{qx+UALrvi&ojKPyiU9x`aaJ4 zDAyPJ;ylOuRr=BQ8Ll(duz6ge$jt+ zdN{_udWJHt;o+NYtFE8lt#|m@8_es~Rj^MM-JTL~#IM;5UJ|x{d-vKsFZLAO(moR3 zo%&^A@8j1mc>Ruk;m;m1ugx2fZ!qEmye5ZhqSteA9_!fG9lCzVTPl5C%ly5>&+)v@ zyxfm*f42TzD^aJ1%eU#3Z4WQox)gN=04KY7u+wzo8L$s=;|%b!d2^U2Gk)^+SC-Tq zICq)rw_C4=pDmv?Q1a}=>zQ){n%Y-*9ZQ zznXI4lS;$QRYyJAeQ}8!zwLj|0>|xocw}jK+wD(Oom)OX>{If_hST1ik9jExJ9W7G zmM!P;{)KTiob3Ae&fZV8ePs^5e+1YUrPEwKyI*hVhkiLetS@t3TzKh-`M12ta|iQt z-9pT7nH!hnSrkrvV99lTezGjI>mgf#yqs^G4`w|D_SMU+54r9m-8!`Am^mLfUYq~9 zDWYG$Uh13u%($y}zcuQ)GS)b4%rAuTeJ#&$LE0Tvy#x>kM12R z@#dV7{y85(zWwE>dpSMeaz*$!#MYimi*>9Na9Dkaaf+k zeKO$l*=#pn&ZE_Z@j8?|yq^z|`@&8)~0 zm4r|4@38BuEqT8lDSeoA#5kf{ufDDio1ec|nfbSVcb^^M?t_fyzP38^oWOO%bwr)b z8%zjaePiC~+xPN&3D*OjcZc!*h<$p`QmJp|+vYdVVUmyM%yc&%>3e4K{5Q|le9n^k zI3LXSH~XBuu#eO8dWfz}=3`EIC>-bSEsM|(_W7P^Qs13_?0l{ZpK@`Ij?M=rZ3CQ_YS#!IMW6iuY!|u`-l?V!P$${>|rF@$>saiOln(rLz9rbNg8I-N*HB*A4Hx z=KApbwE4`r#QEVkY~SX-Df#;<~c^zDc232-^@8L{qnv+z1>zRFVoSZ+4<~}YM3I;u}p78xQ7_P1x zzc-e;^MdQl<}v^MBA#=m5rW5b4keDy8^j>sxgq{u?i{k` zX&v^7$!GiKePiE@u>atTbN4MDk9o)U(v=@|_X9hh<~hseKhPWF>??hkb;|w4>mc>d za`SG#H*-Bbwi4HCo{F32>e+nedBf)WOMj_jez)ay=ln>W>26=zadAInF<<;W!`!DB zpSlXA(~_U-c7)r1ydL7`{hH4|cn)#jhX%OyB>T9#oO=)P`K+(h;rBfATw(KOd%8Hw zc`llBBRPItfA@RKyyW>|=E26BbII1fjq9D`8iMr@KVK&E^Sbf(k2uceHP2r%Z?sFd zLBR85i1_d0c^p62v5wr&dWFf~3HbdmK7S-L&Pnq;DLk&r>qitn)-iuCO5(B`gI&R(Wj%mSk(D++<)5d>nE+R_bZNh(&`oKr;b=`ey#JxYdhnJ#ro59%$J5~ zd5+Yt>ocE5>U?(kHBztbK2E(x>a|^S>e5ih70Z2jbv$*nYfd}S=ck=WJu%HrKT*fg z?nA97*T{I<$(xIZ8((Z(vASaWZf-v6HR^iWPfi^<(P!6w>NHYEq+fH|iD`bV^XYwd z#uK$pbLxCZJ&`)iwcY2Zj##XYdLR1o&`xALIgxf^nqB*w%ULH4bzbUycKUtji%YW~ zslU1X`1G{vel$S!k?(yp;sKXn?JPuq*-#l~x$kJC={^+CT6eR0~Z36pU8Zg)9%CO>V0{9_O$%;6McEKoqCNvowob9FCXnK zeSwE*XWqjMFFMTEx2~VYi!P{y%&UIfdr+|5gv^N)ZyuOY)pVn(mJ?(lPXxBP&U0>^|OPd$!h&qpt 
zYrBq5^HYDMsK>{;#A5UN^v$*FdhK9f9*tV>Lb(_16SeN;j z$G1*s*Kv$57FmbLIL&D%`uy5nY=4|*#%pBV!$s!P@xK13*Qn#uxG%qsW1STI`~14D=8XTK zQ1`|9#3QY%^D|#_^ZWGL?yJYT8g+acZ>tid>6QG5DeDn+LZ6*FJwMd*{z6WDvFOXo zcOOQH_C@shwOw;x z9olvNG|v25N4pQTp7nitZEvn#$LspqPp<3e=Y5?gjWcf=>blhH{AqUj)6kcfak`(w z)$8ZXv^=cuL&g(*I-gyS%co<0ef_DYpPWcLk#?fbPJbGv^=S&uvoeR*{r&2?Vw*Y|7cbv+$NozG5x8tQn~C8ovu{J#CA^{juS z^>p51xvs0ZFAw`B`gA&<<~ok~d^(?3*nxzxrlAUFUG~`|`bC`E?%h zV(U`x!{&~=*t*Tl>+6s9_d6f+YUKTec3qd8NL{g*7MJGt)hYJ6_~H+@t}ma@{z2DC zn+N*Y$D#i>>}}m&GHMzX|0(!P-$VT+qgy-0V{IXSAv8*kkCBhHh}5?b1I?C3Z;=5; z`WRvy$`jQ|s)RXRLB{rCKd<TTKXe2Cb7~- zggKHp4FY4AIQj0}i5w(EQ6-oVXG+X%WHtXV+W?!xdR!2T18oRxi4oWtwPB1EaHOAXS;FNq$Uq^FtmYpI2#C^p*TOpz8;HrG z9YuOc#$Dd6pL+5TCk2NS=5D16-g|!Xg^A{d5>PMyXceD zL9h;scobk8v?PkK84zlW_LF&waz)IdyhZPpqmtz=UK%m)_Jgg%3_=htxuQBzGnPep z$&DfUY)eHD$-tuk;r5eQBe%#Rh-{WeA8nj1VZ$Uy64*=>ND5mFxgwGMk4?iAlG`{U zAF>It2Ac*N3qcTl+qZwaxMQZ%R`Vx}yBy+=7;npG`s|4QzVX1!NwxM5ATP9k^El&= zY2{)DWCmph7a7&oMGeUe%?!&7&y;0G6vozW&F9*VEE_epY*gNKn`>Qter-cOb9Aol zstucJ&Thz?E+c!}bE`KrMV`62b{p5$R~ctRKG$wl)4C0f>*~zMHVqrr)zs#95CiNB(VOWY(NTI#kH z6lT?`*G_e`^)X{dt+nFR!Lgl2XRy`LwrcCxX{**U_^4G{b=1+;p2zo{-95>jT`JS7 zV>$CmfH5_lSC- zc%*A*Y+x`x)HNu=lLboXCC7v>%}?fi!)t46Yx!JU>PKtCpx1dojv>E8bePOvv0XG>(9zS`(J!z)9Pf&H z$+1o0xE#x>zRqxbN2EK{m`*mNlMXLAzP+nAJTMTdOx-R?-Bx)KQ8GC;CRd|vXuoJX zIx%_HXfr4V8;SRHiS*5}!J6u{iHn{dof17$4m~=R1AkSJ(&U()+VtU`YYo?t!wr61 z=r~V}(VOIOp2p^UjnBhguBkH%{)NCc;Pdbw03HNBhWKO9eF*yn;G0n=2Ho|*-LUV3 zelGOGz$ajT0Q$3l)rbqge>dzd_;rYnL3cgu)4<;We-7A(_+s#10b8Lz3jZG1UqSvR z_|F9HfPD*eD}f={Uj+A%?+MtKBEAQF7Wh=~7U)+aegW*i2VV%h74}~O9{}D5{}EsT z@;8J32)Z*+ryaN)_FCW-h+7E^0xyQ%0iO=O4RN2NP7!z`xC>r{xQAeW7xtIIYrvO5 z_bKf60!vXZ4%`R*I{2r7KMj8e_!?jkSO))k;MuUh09=InkAwdZy5E8as7Ku&z<(5c zBjWR5KMmLidljoSLh}K{{-C>_}7C^MxL$Em4GMVZ-o6l#6^KY_+JN~fVi(g z-vC|(|C`Xa!M+6iI_S5+z6JUY*muEx7wj(~t_(N>cn9>&zb_*a0>0Iz}nIPh=4D}XWJ9l$>TZv&1;zpIe%cJRy4-~G^SgZwcFA@4v_4q``xoN>up-&E- z=jx&jz-nMIZ~?H+&D1Z3omi!LA_#kvrLTvbe#Wh`bmwSYVm0)&z!gB$ajLDjv*BL} z 
zJM8S!1#i}vu;+>W&4(^Ep3FYi!cRT-MJ4R)rw*L;!4r+Jb6i2q6I}Ni_@PU1T$~ry zTWRLeQ5X6e-hggc|_k1=NtPVvEIGm$q&mfy5a2xod_c!C0`{+~q|v;faYRe{{DK+wBak~xZC9QXa{ug zJ6zX*`_B(rF4(`O&VBA#aV1;a+in%t)EVxiqMzNnXZLa)e+={C%txI@_wmEm6&`PD zar;B#elv9g^YVOdaX%^yeShsO0pzW7Px<7|?T=g<(D4Ih=w~7FpQHOY%=2o#u``bO z9CZcUgV!JZSKv^+&R11x`ab+n{u{|R@p(VY^=5w!rZ1J|l8x?n@BFy(SjBV`&-@vA z7O!w$f9sY*r*4>G`d9N^g}4nD>3)LvT)bq9>0jr&*kxWdzi)9pX1lc?KiPf%@fiF| z-Ft4T%UgGK0R3=1`FV?VdWYvU$2D8Wi}9#BiGZH(x2L-w4ZU{z={K?-p1W=Rs8{cv z`-_d=oB7Npvp#zL7r~GG!#r=XE;TsUU*>a#`oFtG&(Hg%W_>nooO|a15wG%!c~bjk z7eA*}x8g^Mp0EAY8k-v2RjcElOq%Cn9HnO6xF2+6YQJhe-{>^w=Q`%$$Y$)9)Ozdj za-K0Bm*bk{>O2iVet#%&Z=7^R%>zxF^|+er+|IjRs($bA4!utvp5|ykMDbW(_XF<&zBPO{Or2ytckDvW4Y$fcDPr5XVKbw-(Fz$ zE%#04x!^czpT1vi=AHZco>`B5cFo}n%|6rnxfJ;;U6toAGtei`Yd+uLGtj5YbC`bY z>-J95pE_T9XS!Saj{N7)rzLpq0(!mY)EXV1s~TKKuo*~cQYzxBCA`+WD5 zgMqGh4{pNz%y4H0KIq;2Z=Ub7b-s6-%|2nh$x|Ah`tyT4?{Us`)#-WQIh&ZSXTd^OfNDGM<-s9>w{m&SSB^`P{RAbq-u}Kepr_T=2O&H{l8zVv16vzxj0`I%rf(PcKJ7xk6l>fu4(_Qc;KTl zfn-=BmpUXXmlGUwk&5Uz~0C{!@c0csf(Qc!i$M%y`N1iKYi#yr=Z29cIsVCA;PNdyN z+nTP77?1O$kPP>g`ErXrMay!}l)Z2M!w=vgzjPujePNd#O+i!DQ zZ*%I2+1A01w>k5isNFA)d2IiQa=$#Z+i2&v{p8dUX}9r<`g6@^*Ri=@9>4vaYdrJ% z>9g7W>N9UP^viF@**c!*MB8u2+njo$pWn9oar^nB?qua>K0oxU?`QYRbF%FmKiA1G zKl`Mf$o)k><84ko{WfOv+j??4j{HRH+j?^5^Q%XDuE@N^T;oob&K}Rn%F8^=&;3MB zKk-!CuopDZjuqW@Pt_I;xDa@A93&p+*~PvrQ?{dDvbnLk%OTP7)=>HU2$lsqCs!aV}YP4*$-=C=yD<_jNVd_Sf)K>9R|4&OE)BvT- zbMll~*%V3hrgAZp(MqY#&#Y1`WT|lFS2T4uiblt!OH_<9XCzEF!;5NZ9(9}gVY28d z3`^_Mm`2{Dr4>%YjKJC#X#q)WTi@P$Dz-IchL0tx&HgW}374Jo&Z zP~nJ^Ba^qOKgOX8tHi>lWlE_Ox&l%gf@&kN{`BDw3)Fwu<45GNO zs2b8kue3>Z`b7g#_;f?Ykh+l$nUt5jo;-fl zuP{!GP7<$HjP{1W81aU{O!=}ybWU`hdo5qn7Az%=`IEDV_zOh*wXFrMMXh-%Yh-9ZIK{g%;>{BA8jXB&g|F0v`XYm2^(xC&(NTDC NP`oU&*Bk1r_%GD(LB#+7 literal 0 HcmV?d00001 diff --git a/Lesson8/y_train.pkl b/Lesson8/y_train.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c9c10281ba6125ee04c830beb1d3315c0c9481e8 GIT binary patch literal 16662 
zcmeI4d303e8OA3efw1q3vJ{XdEDi(#*%2rR1dvTCO9(f~m>EbWVU|FYAd74=tss|0 zp$Zk0T0oRd5zqs;6^it@AP5EZpaP<0k*xx~$^E_O=+}OonbTwc>EsW7JkR?)@4dMv zoWr^2%gjhvTE3T~)tI43!>X|7D5UVy0PSCWjKzbF+PB zsKHOe9&f8=$@%nB6dyNWyUo=)b1`S!R}D6X41WXO<#x2>;|KT;h5Gh;#TQ#BlpR&n;D4VJQb z&4%M7emmG55_2p=e;scZ64CkaIrg+--h<)`=pu4{bQ|<5usr@!^dS0K=)b~?&m%@$s-NmQZl)Zz|w_rK)bIHGr-w2Ln-jV(z`0wNY0AGfC;6id6*}D(FD;x|P zvv(}HKIrY}e(b4--w*$B_yBuGk(-ad4%UZN=w z{p=e+{~h=g^K`3CSca_3-0ay#H|axIvj!jDG>;UKsT zR)9~#SLhAo`+E_8E&g@b6~6|#RD2(t#{3HR<-zyE$@ppLi|8@vVd#AJw}ahbL-sd? z>GUeV&GcKNTfu?&>m|>=JK1vzUM9DNTz~wkakA9t89j@~Y{xkT4;9`1HVJ&ifnRlVT8(oF@a`NxN^UO!XEBFQI z>F_vyWA=QHk97FB-bWbS&>?gU`j648jlUH?h<_U9;V*)}fr+p>{hlx#e=+?>(QRN$ z?&mDpN3SBx$8V2b5ATJUunqgiqXY01?8Tna=>2j(T>lL79Q;iDZs;Y7m!7lh?=`BMS!ME6R4?2ThC-P6?AHiP)@1xfXZT;7(*N}bR<5$C<0sAm_z*6iP4c{aefM2j@8`?>4 zJ^o|x3Fv_j!qPB@{Z95Iqn|}TLv91QDs65Zp)aI@(3<8olx8ubBTD7UDloe+vH7=#A(CxEAI! z{}N8ZccWk6x@%!7^Qp``!WGaY*Pr`2jDLaNAy^jwLv&O0+w^at+oD&I>jsP90a%7y zCb}j1Ik=pCdzshb{+grDkTc0Gfqlt!h1oET{9)J+zCmvqyvBY5F2Wzlo_Ek0%CHERU0CzCIie8Pb$^KdR)7ig-d0pnO;@C3P?=nHT%*F6Kv(4RnlH7p?SVg7ggv-Fzdufy*_egS$V`d8$#(WPL3Tp_HB zUkGQy&$-?M>^VyQDD!df2Ash>k9=p?1m-Zm0q4R?@NPJQd~ft9?&A`C1AjHTBDx8> zKKf7SC(+f}dl`K{dIGuj_!rPU(OckO_&!^rE7Jdf`~v2`gFlcv#yk=^cYlk$V$;0-eB~iS#-$pUWKS=tZs_#xU}S;9W3@+)er$;JYvnjw3%3 z-GRMh&}G?kkz6I%jrnNukE5%>7s&4{_EoLsRV{g;%vG)P)tsuOr!vx<^dujSy1eSETK1}b zq*`*(QoXLO|4Zj|`^6XP?oah~d7ZEJMN3{NemF{AD0$JsNPWo(b$ywKquN*Vy1CAm zzB(5jj?!1D_GB);PK&Qn?WuFox}46Jo-kZa=0fQS!}+?o+7FkLyh`cmD0!LdwCbyQ znXA<0#8)YOoiB48rKirtSE=Syt>#s&<|Ea*J>si-#8(+Er_NQa+b6y*ulhPIdxVlx zbvR$0OJ7IH3ndq=mYz`87e89cUSYI-$*X-;OHQcHby|F(t}nh&msfqsM@!kGQr#nS z@l`GU|BB(RFLPm}_oe1lEqheI`1NS*kMuodBzTl+Rx!2 zFXt=4>HM6;4vX5)rwh*|c39MY94vX5)A$C~Qeh#t2 zqV{u$9Tv5pL+r4q{T$ZVaQr_W7IimwC);O3i>!|pXzNUA?@X~jRlu-50l;*Z3zg0& z{=fjgnO0(#E--R}#RF$3$@#%DFJ+q>p+qAPus*7w$P%HX5kX&pr@&>M IUCnU*6QHPf_W%F@ literal 0 HcmV?d00001 diff --git a/Lesson8/y_valid.pkl b/Lesson8/y_valid.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..7b8bc7bcfa8ee72c04555c7c3878b7026f17fbc2 GIT binary patch literal 5978 zcmeI0O=w(I7=~w>pTZd98$q!o(K;PFDH&^Y5r;c7x0$gsH*xMv z(iDOU#u~Y;V2=yMDt4jNE<{&~3(dm6LXp-V1ffL@>O!;|L8wsA%zZwr2d^X5m1Guq z=6%2S`_9c>z^HM4^=;2>$K%$EH{##D%ug(F6L}kd5sDG%OTbOn3`98UVXAK zSyW!vh2fl2vWt$DcOwPMu_o=JZfs5DoXNaBl%GoP+2OmYI;3wxc^!Z5&e(fxH&gV(|K3|45P~QmS z@Fev`_!NAb{_n{Bg8u;eKGt!_9fY&^Q{;QezXzA_kC9&w2k7gjeguCj`T+e7xr6wf z=pWH1V1fP)^oKA;ewq3l{%PjlMZF#F!5=3#0+aZkq80O&@Nc0%hu(l*hu%(p8~xYg ze}>)&pTmC_9)m}zzYiOjA4R`Pu8nne!dvkN;Xe9C;1GO}dXs zf&Hxe8|>$K_M-Kl$m(Ycd>a2aY-Ua#og#MxeLMAC%zYET5#33C7i@v|kULHPBj_b` zg#KCb>+l~U*M&a_Q{?X@cNYH?dJJ7p|M&0&^=s(A9CqVxBiD}ZfXn#j&}-pwxS#r2 z_$}NA9e5ai4xfijJnx;%djb70&-WU+-S{8DqxeeifL^_074&xAveR;1+tL3ML^G=V zgx*rE36*Wjc2suRZ0Ci0Y>&TdGK86bK>Q#|(yWVLi)NqbpgyS!gL2~6$_3R;U$jYC z=f7sqI^xTEL0WvFnKyl@*P_`c+GNli$Y`0bJ&2N*y67O*%9(Xj2hFYQnw7orvg{gtkj=LK?IyW$wDe!&%F)t) z4=YDY|2?c6E&cbfa3d;Ee{FE{xpd`fHx`PWiG?d?(y~={ xtEbqczV+3s$aVN7$%0+7s&~wKtM0YD2CG!k*WL44cuk3NaV9ro>w|78@;i<)x>W!G literal 0 HcmV?d00001 From 05c0b57182706486ed327b793e9b7c26bd4f5b93 Mon Sep 17 00:00:00 2001 From: Foton Date: Mon, 17 Jun 2024 11:11:06 +0300 Subject: [PATCH 5/7] Add Home Work Machine Learning DSLibraries Lesson8 --- Lesson6/Task2.ipynb | 1979 ------------------------------------------- 1 file changed, 1979 deletions(-) delete mode 100644 Lesson6/Task2.ipynb diff --git a/Lesson6/Task2.ipynb b/Lesson6/Task2.ipynb deleted file mode 100644 index 2ebd9ce..0000000 --- a/Lesson6/Task2.ipynb +++ /dev/null @@ -1,1979 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b2f2e5e2", - "metadata": {}, - "source": [ - "## Тема “Обучение с учителем”" - ] - }, - { - "cell_type": "markdown", - "id": "16595a84", - "metadata": {}, - "source": [ - "### Задание 1\n", - "Импортируйте библиотеки pandas и numpy.\n", - "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn..\n", - "Разбейте эти датафреймы на тренировочные (X_train, 
y_train) и тестовые (X_test, y_test) с помощью\n", - "функции train_test_split так, чтобы размер тестовой выборки\n", - "составлял 30% от всех данных, при этом аргумент random state должен быть равен 42.\n", - "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля\n", - "sklearn.linear_model.\n", - "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на\n", - "тестовых." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "274303e6", - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "693a9c36", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "pd.options.display.max_columns = 100\n", - "\n", - "from sklearn.datasets import load_boston" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a37f783d", - "metadata": {}, - "outputs": [], - "source": [ - "boston = load_boston()\n", - "\n", - "feature_names = boston[\"feature_names\"]\n", - "\n", - "X = pd.DataFrame(boston[\"data\"], columns=feature_names)\n", - "y = pd.DataFrame(boston[\"target\"], columns=[\"price\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "5a2e0780", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((354, 13), (152, 13))" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n", - "\n", - "X_train.shape, X_test.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "96164976", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "\n", - "lr = 
LinearRegression()\n", - "\n", - "lr.fit(X_train, y_train)\n", - "\n", - "lr_pred = lr.predict(X_test)" - ] - }, - { - "cell_type": "markdown", - "id": "a07cffa5", - "metadata": {}, - "source": [ - "### Задание 2\n", - "Создайте модель под названием model с помощью класса RandomForestRegressor из модуля\n", - "sklearn.ensemble.\n", - "Сделайте агрумент n_estimators равным 1000,\n", - "max_depth должен быть равен 12 и random_state сделайте равным 42.\n", - "Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n", - "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n", - "чтобы получить из датафрейма одномерный массив Numpy,\n", - "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно\n", - "применение массивов вместо датафрейма.\n", - "Сделайте предсказание на тестовых данных и посчитайте R2. Сравните с результатом из\n", - "предыдущего задания.\n", - "Напишите в комментариях к коду, какая модель в данном случае работает лучше" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "64042e74", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "оценка R2 модели ансамбля случайного леса выше чем у линейной регрессии\n", - "RandomForestRegressor=0.8747\n", - "LinearRegression=0.7112\n" - ] - } - ], - "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "from sklearn.metrics import r2_score\n", - "\n", - "model = RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)\n", - "\n", - "model.fit(X_train, y_train.values[:,0])\n", - "\n", - "rf_pred = model.predict(X_test)\n", - "\n", - "r2_lr = r2_score(y_test, lr_pred)\n", - "r2_rf = r2_score(y_test, rf_pred)\n", - "\n", - "print(f\"оценка R2 модели ансамбля случайного леса {'выше' if r2_rf > r2_lr else 'ниже' } \" \n", - " f\"чем у линейной 
регрессии\\nRandomForestRegressor={r2_rf:.4f}\\nLinearRegression={r2_lr:.4f}\")" - ] - }, - { - "cell_type": "markdown", - "id": "94b0548e", - "metadata": {}, - "source": [ - "### *Задание 3\n", - "Вызовите документацию для класса RandomForestRegressor,\n", - "найдите информацию об атрибуте feature_importances_.\n", - "С помощью этого атрибута найдите сумму всех показателей важности,\n", - "установите, какие два признака показывают наибольшую важность." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1327004d", - "metadata": {}, - "outputs": [], - "source": [ - "?RandomForestRegressor" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c8e1fab0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "сумма показателей важности признаков модели = 1.0\n" - ] - } - ], - "source": [ - "print(f\"сумма показателей важности признаков модели = {np.sum(model.feature_importances_)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "744deb30", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "первые два наиболее важные признака\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
featureimportance
12LSTAT0.415847
5RM0.402682
\n", - "
" - ], - "text/plain": [ - " feature importance\n", - "12 LSTAT 0.415847\n", - "5 RM 0.402682" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(\"первые два наиболее важные признака\")\n", - "pd.DataFrame({'feature': feature_names,'importance': model.feature_importances_}) \\\n", - ".sort_values('importance',ascending = False) \\\n", - ".head(2)" - ] - }, - { - "cell_type": "markdown", - "id": "b9004884", - "metadata": {}, - "source": [ - "## *Задание 4\n", - "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию\n", - "по библиотеке Matplotlib, это датасет Credit Card Fraud Detection.Для этого датасета мы будем решать\n", - "задачу классификации - будем определять,какие из транзакции по кредитной карте являются\n", - "мошенническими.Данный датасет сильно несбалансирован (так как случаи мошенничества\n", - "относительно редки),так что применение метрики accuracy не принесет пользы и не поможет выбрать\n", - "лучшую модель.Мы будем вычислять AUC, то есть площадь под кривой ROC.\n", - "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n", - "Загрузите датасет creditcard.csv и создайте датафрейм df." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "a65b98a0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud\n", - "License(s): DbCL-1.0\n", - "Downloading creditcardfraud.zip to W:\\Projects\\GB\\Python\\MLearning\\DSLibraries\\Lesson6\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " 0%| | 0.00/66.0M [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671-0.3714071.3412620.359894-0.358091-0.1371340.5176170.401726-0.0581330.068653-0.0331940.084968-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960-0.099254-1.416907-0.153826-0.7510630.1673720.050144-0.4435870.002821-0.611987-0.045575-0.219633-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.6153751.249376-0.6194680.2914741.757964-1.3238650.686133-0.076127-1.222127-0.3582220.324505-0.1567421.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048-0.410430-0.705117-0.110452-0.2862540.074355-0.328783-0.210077-0.4997680.1187650.5703280.052736-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727-0.3668461.0176140.8363901.006844-0.4435230.1502190.739453-0.5409800.4766770.4517730.203711-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", - "" - ], - "text/plain": [ - " Time V1 V2 V3 V4 V5 V6 V7 \\\n", - "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", - "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", - "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", - "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", - "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", - "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", - "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", - "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", - "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", - "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", - "\n", - " V8 V9 V10 V11 V12 V13 V14 \\\n", - "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n", - "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n", - "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n", - "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n", - "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n", - "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n", - "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n", - "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n", - "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n", - "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n", - "\n", - " V15 V16 V17 V18 V19 V20 V21 \\\n", - "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n", - "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n", - "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n", - "3 -0.631418 -1.059647 -0.684093 1.965775 
-1.232622 -0.208038 -0.108300 \n", - "4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n", - "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n", - "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n", - "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n", - "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n", - "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n", - "\n", - " V22 V23 V24 V25 V26 V27 V28 \\\n", - "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n", - "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n", - "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n", - "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n", - "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n", - "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n", - "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n", - "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n", - "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n", - "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n", - "\n", - " Amount Class \n", - "0 149.62 0 \n", - "1 2.69 0 \n", - "2 378.66 0 \n", - "3 123.50 0 \n", - "4 69.99 0 \n", - "5 3.67 0 \n", - "6 4.99 0 \n", - "7 40.80 0 \n", - "8 93.20 0 \n", - "9 3.68 0 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from zipfile import ZipFile\n", - "\n", - "ZipFile(\"creditcardfraud.zip\").extractall(\".\")\n", - "\n", - "df = pd.read_csv(\"creditcard.csv\")\n", - "\n", - "df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "9302f8fe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - 
"RangeIndex: 284807 entries, 0 to 284806\n", - "Data columns (total 31 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 Time 284807 non-null float64\n", - " 1 V1 284807 non-null float64\n", - " 2 V2 284807 non-null float64\n", - " 3 V3 284807 non-null float64\n", - " 4 V4 284807 non-null float64\n", - " 5 V5 284807 non-null float64\n", - " 6 V6 284807 non-null float64\n", - " 7 V7 284807 non-null float64\n", - " 8 V8 284807 non-null float64\n", - " 9 V9 284807 non-null float64\n", - " 10 V10 284807 non-null float64\n", - " 11 V11 284807 non-null float64\n", - " 12 V12 284807 non-null float64\n", - " 13 V13 284807 non-null float64\n", - " 14 V14 284807 non-null float64\n", - " 15 V15 284807 non-null float64\n", - " 16 V16 284807 non-null float64\n", - " 17 V17 284807 non-null float64\n", - " 18 V18 284807 non-null float64\n", - " 19 V19 284807 non-null float64\n", - " 20 V20 284807 non-null float64\n", - " 21 V21 284807 non-null float64\n", - " 22 V22 284807 non-null float64\n", - " 23 V23 284807 non-null float64\n", - " 24 V24 284807 non-null float64\n", - " 25 V25 284807 non-null float64\n", - " 26 V26 284807 non-null float64\n", - " 27 V27 284807 non-null float64\n", - " 28 V28 284807 non-null float64\n", - " 29 Amount 284807 non-null float64\n", - " 30 Class 284807 non-null int64 \n", - "dtypes: float64(30), int64(1)\n", - "memory usage: 67.4 MB\n" - ] - } - ], - "source": [ - "df.info()" - ] - }, - { - "cell_type": "markdown", - "id": "4ba00248", - "metadata": {}, - "source": [ - "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n", - "Создайте объект Series под названием y из столбца Class.\n", - "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split,\n", - "используя аргументы: test_size=0.3, random_state=100, stratify=y.\n", - "У вас должны получиться объекты X_train, X_test, y_train и y_test.\n", - "Просмотрите информацию о их форме." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "59e1e34e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "выборка не сбалансированна, данных первого класса значительно меньше\n", - "0 0.998273\n", - "1 0.001727\n", - "Name: Class, dtype: float64\n" - ] - }, - { - "data": { - "text/plain": [ - "((199364, 30), (85443, 30), (199364,), (85443,))" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target = \"Class\"\n", - "\n", - "y = df[target]\n", - "X = df.drop(target, axis=1)\n", - "\n", - "print(f\"выборка не сбалансированна, данных первого класса значительно меньше\\n{y.value_counts(normalize=True)}\")\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" - ] - }, - { - "cell_type": "markdown", - "id": "fe7fae18", - "metadata": {}, - "source": [ - "Для поиска по сетке параметров задайте такие параметры:\n", - "parameters = [{'n_estimators': [10, 15],\n", - "'max_features': np.arange(3, 5),\n", - "'max_depth': np.arange(4, 7)}]\n", - "Создайте модель GridSearchCV со следующими аргументами:\n", - "estimator=RandomForestClassifier(random_state=100),\n", - "param_grid=parameters,\n", - "scoring='roc_auc',\n", - "cv=3." 
- ] - }, - { - "cell_type": "code", - "execution_count": 103, - "id": "c034c2a4", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "parameters = {\n", - " 'n_estimators': [10, 15],\n", - " 'max_features': np.arange(3, 5),\n", - " 'max_depth': np.arange(4, 7),\n", - "}\n", - "\n", - "clf = GridSearchCV(\n", - " estimator=RandomForestClassifier(random_state=100),\n", - " param_grid=parameters,\n", - " scoring='roc_auc',\n", - " cv=3,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "a4308ac3", - "metadata": {}, - "source": [ - "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n", - "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n", - "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n", - "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и\n", - "запишите в массив y_pred_proba. Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n", - "Вычислите AUC на тестовых данных и сравните с результатом,полученным на тренировочных данных,\n", - "используя в качестве аргументов массивы y_test и y_pred_proba" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "id": "9b43d0b7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n", - " param_grid={'max_depth': array([4, 5, 6]),\n", - " 'max_features': array([3, 4]),\n", - " 'n_estimators': [10, 15]},\n", - " scoring='roc_auc')" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clf.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "id": "966c3d4d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}" - ] - }, - "execution_count": 105, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clf.best_params_" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "id": "405d63d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "метрика AUC=0.9436 на тестовых данных меньше метрики AUC=0.9660 на обучающем наборе\n" - ] - } - ], - "source": [ - "from sklearn.metrics import roc_auc_score\n", - "\n", - "model = RandomForestClassifier(max_depth=6, max_features=3, n_estimators=15)\n", - "\n", - "model.fit(X_train, y_train)\n", - "\n", - "y_pred_proba = model.predict_proba(X_test)[:, 1]\n", - "\n", - "auc = roc_auc_score(y_test, y_pred_proba)\n", - "\n", - "print(f\"метрика AUC={auc:.4f} на тестовых данных {'больше' if auc > clf.best_score_ else 'меньше'} \"\n", - " f\"метрики AUC={clf.best_score_:.4f} на обучающем наборе\")" - ] - }, - { - "cell_type": "markdown", - "id": "5cb517be", - "metadata": {}, - "source": [ - "## *Дополнительные задания:\n", - "1). Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в\n", - "переменную data." - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "id": "1dbcea26", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_wine\n", - "\n", - "wine = load_wine()\n", - "\n", - "data = wine[\"data\"]" - ] - }, - { - "cell_type": "markdown", - "id": "18dea80c", - "metadata": {}, - "source": [ - "2). Полученный датасет не является датафреймом. Это структура данных, имеющая ключи\n", - "аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys,\n", - "содержащий ее ключи." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d5f5e52a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_keys = wine.keys()\n", - "data_keys" - ] - }, - { - "cell_type": "markdown", - "id": "73bff5cf", - "metadata": {}, - "source": [ - "3). Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде\n", - "привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими\n", - "переносами и т.д." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "39433029", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".. _wine_dataset:\n", - "\n", - "Wine recognition dataset\n", - "------------------------\n", - "\n", - "**Data Set Characteristics:**\n", - "\n", - " :Number of Instances: 178 (50 in each of three classes)\n", - " :Number of Attributes: 13 numeric, predictive attributes and the class\n", - " :Attribute Information:\n", - " \t\t- Alcohol\n", - " \t\t- Malic acid\n", - " \t\t- Ash\n", - "\t\t- Alcalinity of ash \n", - " \t\t- Magnesium\n", - "\t\t- Total phenols\n", - " \t\t- Flavanoids\n", - " \t\t- Nonflavanoid phenols\n", - " \t\t- Proanthocyanins\n", - "\t\t- Color intensity\n", - " \t\t- Hue\n", - " \t\t- OD280/OD315 of diluted wines\n", - " \t\t- Proline\n", - "\n", - " - class:\n", - " - class_0\n", - " - class_1\n", - " - class_2\n", - "\t\t\n", - " :Summary Statistics:\n", - " \n", - " ============================= ==== ===== ======= =====\n", - " Min Max Mean SD\n", - " ============================= ==== ===== ======= =====\n", - " Alcohol: 11.0 14.8 13.0 0.8\n", - " Malic Acid: 0.74 5.80 2.34 1.12\n", - " Ash: 1.36 3.23 2.36 0.27\n", - " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", - " Magnesium: 
70.0 162.0 99.7 14.3\n", - " Total Phenols: 0.98 3.88 2.29 0.63\n", - " Flavanoids: 0.34 5.08 2.03 1.00\n", - " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", - " Proanthocyanins: 0.41 3.58 1.59 0.57\n", - " Colour Intensity: 1.3 13.0 5.1 2.3\n", - " Hue: 0.48 1.71 0.96 0.23\n", - " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", - " Proline: 278 1680 746 315\n", - " ============================= ==== ===== ======= =====\n", - "\n", - " :Missing Attribute Values: None\n", - " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", - " :Creator: R.A. Fisher\n", - " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", - " :Date: July, 1988\n", - "\n", - "This is a copy of UCI ML Wine recognition datasets.\n", - "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", - "\n", - "The data is the results of a chemical analysis of wines grown in the same\n", - "region in Italy by three different cultivators. There are thirteen different\n", - "measurements taken for different constituents found in the three types of\n", - "wine.\n", - "\n", - "Original Owners: \n", - "\n", - "Forina, M. et al, PARVUS - \n", - "An Extendible Package for Data Exploration, Classification and Correlation. \n", - "Institute of Pharmaceutical and Food Analysis and Technologies,\n", - "Via Brigata Salerno, 16147 Genoa, Italy.\n", - "\n", - "Citation:\n", - "\n", - "Lichman, M. (2013). UCI Machine Learning Repository\n", - "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", - "School of Information and Computer Science. \n", - "\n", - ".. topic:: References\n", - "\n", - " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", - " Comparison of Classifiers in High Dimensional Settings, \n", - " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. \n", - " (Also submitted to Technometrics). 
\n", - "\n", - " The data was used with many others for comparing various \n", - " classifiers. The classes are separable, though only RDA \n", - " has achieved 100% correct classification. \n", - " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", - " (All results using the leave-one-out technique) \n", - "\n", - " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", - " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", - " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", - " Mathematics and Statistics, James Cook University of North Queensland. \n", - " (Also submitted to Journal of Chemometrics).\n", - "\n" - ] - } - ], - "source": [ - "print(wine[\"DESCR\"])" - ] - }, - { - "cell_type": "markdown", - "id": "772359a0", - "metadata": {}, - "source": [ - "4). Сколько классов содержит целевая переменная датасета? Выведите названия классов." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "183b0c76", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['class_0' 'class_1' 'class_2']\n", - "кол-во:3\n" - ] - } - ], - "source": [ - "target_class = wine[\"target_names\"]\n", - "\n", - "print(f\"{target_class}\\nкол-во:{len(target_class)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "41d6690b", - "metadata": {}, - "source": [ - "5). На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков\n", - "создайте датафрейм под названием X." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "15557fbe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", - "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", - "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", - "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", - "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", - "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", - "\n", - " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", - "0 3.06 0.28 2.29 5.64 1.04 \n", - "1 2.76 0.26 1.28 4.38 1.05 \n", - "2 3.24 0.30 2.81 5.68 1.03 \n", - "3 3.49 0.24 2.18 7.80 0.86 \n", - "4 2.69 0.39 1.82 4.32 1.04 \n", - "\n", - " od280/od315_of_diluted_wines proline \n", - "0 3.92 1065.0 \n", - "1 3.40 1050.0 \n", - "2 3.17 1185.0 \n", - "3 3.45 1480.0 \n", - "4 2.93 735.0 " - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "feature_names = wine[\"feature_names\"]\n", - "\n", - "X = pd.DataFrame(data, columns=feature_names)\n", - "\n", - "X.head()" - ] - }, - { - "cell_type": "markdown", - "id": "e9ef2d4b", - "metadata": {}, - "source": [ - "6). Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "9f2ee54a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 178 entries, 0 to 177\n", - "Data columns (total 13 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 alcohol 178 non-null float64\n", - " 1 malic_acid 178 non-null float64\n", - " 2 ash 178 non-null float64\n", - " 3 alcalinity_of_ash 178 non-null float64\n", - " 4 magnesium 178 non-null float64\n", - " 5 total_phenols 178 non-null float64\n", - " 6 flavanoids 178 non-null float64\n", - " 7 nonflavanoid_phenols 178 non-null float64\n", - " 8 proanthocyanins 178 non-null float64\n", - " 9 color_intensity 178 non-null float64\n", - " 10 hue 178 non-null float64\n", - " 11 od280/od315_of_diluted_wines 178 non-null float64\n", - " 12 proline 178 non-null float64\n", - "dtypes: float64(13)\n", - "memory usage: 18.2 KB\n" - ] - } - ], - "source": [ - "X.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "90a743fa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "размер = (178, 13)\n", - "кол-во пустых значений\n", - "alcohol 0\n", - "malic_acid 0\n", - "ash 0\n", - "alcalinity_of_ash 0\n", - "magnesium 0\n", - "total_phenols 0\n", - "flavanoids 0\n", - "nonflavanoid_phenols 0\n", - "proanthocyanins 0\n", - "color_intensity 0\n", - "hue 0\n", - "od280/od315_of_diluted_wines 0\n", - "proline 0\n", - "dtype: int64\n" - ] - } - ], - "source": [ - "print(f\"размер = {X.shape}\\nкол-во пустых значений\\n{X.isnull().sum(axis=0)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "cf169b47", - "metadata": {}, - "source": [ - "7). Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64.\n", - "Название поля - 'target'." 
- ] - }, - { - "cell_type": "code", - "execution_count": 96, - "id": "3883dbe8", - "metadata": {}, - "outputs": [], - "source": [ - "X[\"target\"] = wine[\"target\"].astype(\"int64\")" - ] - }, - { - "cell_type": "markdown", - "id": "e5918c4e", - "metadata": {}, - "source": [ - "8). Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название\n", - "X_corr" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "id": "d4c9b5e1", - "metadata": {}, - "outputs": [], - "source": [ - "X_corr = X.corr()" - ] - }, - { - "cell_type": "markdown", - "id": "43fd36d4", - "metadata": {}, - "source": [ - "9). Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному\n", - "значению превышает 0.5 (причем, само поле target не должно входить в этот список)." - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "a788a848", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['alcalinity_of_ash',\n", - " 'total_phenols',\n", - " 'flavanoids',\n", - " 'hue',\n", - " 'od280/od315_of_diluted_wines',\n", - " 'proline']" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "high_corr = [item for item in X_corr[abs(X_corr.target) > 0.5].index if item != \"target\"]\n", - "high_corr" - ] - }, - { - "cell_type": "markdown", - "id": "beab1b7d", - "metadata": {}, - "source": [ - "10). Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых\n", - "содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X\n", - "соответствующие поля с суффиксом '_2', добавленного к первоначальному названию признака.\n", - "Итоговый датафрейм должен содержать все поля, которые, были в нем изначально, а также поля с\n", - "признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с\n", - "помощью метода describe." 
- ] - }, - { - "cell_type": "code", - "execution_count": 99, - "id": "426afeb5", - "metadata": {}, - "outputs": [], - "source": [ - "X.drop(\"target\", axis = 1, inplace = True) \n", - "\n", - "for item in high_corr:\n", - " X[item+\"_2\"] = X[item] ** 2" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "id": "382274e0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.0000001.780000e+02
mean13.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258391.1428655.6570305.1100490.9686617.3221556.564591e+05
std0.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474133.6717752.9362944.2114410.4437983.5843165.558591e+05
min11.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000112.3600000.9604000.1156000.2304001.6129007.728400e+04
25%12.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000295.8400003.0363251.4521000.6123253.7540752.505010e+05
50%13.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000380.2500005.5460504.5582500.9312507.7284004.536045e+05
75%13.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000462.2500007.8400008.2657001.25440010.0489009.702250e+05
max14.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000900.00000015.05440025.8064002.92410016.0000002.822400e+06
\n", - "
" - ], - "text/plain": [ - " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", - "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", - "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", - "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", - "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", - "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", - "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", - "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", - "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", - "\n", - " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", - "count 178.000000 178.000000 178.000000 178.000000 \n", - "mean 2.295112 2.029270 0.361854 1.590899 \n", - "std 0.625851 0.998859 0.124453 0.572359 \n", - "min 0.980000 0.340000 0.130000 0.410000 \n", - "25% 1.742500 1.205000 0.270000 1.250000 \n", - "50% 2.355000 2.135000 0.340000 1.555000 \n", - "75% 2.800000 2.875000 0.437500 1.950000 \n", - "max 3.880000 5.080000 0.660000 3.580000 \n", - "\n", - " color_intensity hue od280/od315_of_diluted_wines proline \\\n", - "count 178.000000 178.000000 178.000000 178.000000 \n", - "mean 5.058090 0.957449 2.611685 746.893258 \n", - "std 2.318286 0.228572 0.709990 314.907474 \n", - "min 1.280000 0.480000 1.270000 278.000000 \n", - "25% 3.220000 0.782500 1.937500 500.500000 \n", - "50% 4.690000 0.965000 2.780000 673.500000 \n", - "75% 6.200000 1.120000 3.170000 985.000000 \n", - "max 13.000000 1.710000 4.000000 1680.000000 \n", - "\n", - " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n", - "count 178.000000 178.000000 178.000000 178.000000 \n", - "mean 391.142865 5.657030 5.110049 0.968661 \n", - "std 133.671775 2.936294 4.211441 0.443798 \n", - "min 112.360000 0.960400 0.115600 0.230400 \n", - "25% 295.840000 3.036325 1.452100 0.612325 \n", - "50% 380.250000 5.546050 4.558250 0.931250 \n", - "75% 462.250000 7.840000 8.265700 1.254400 \n", - "max 900.000000 
15.054400 25.806400 2.924100 \n", - "\n", - " od280/od315_of_diluted_wines_2 proline_2 \n", - "count 178.000000 1.780000e+02 \n", - "mean 7.322155 6.564591e+05 \n", - "std 3.584316 5.558591e+05 \n", - "min 1.612900 7.728400e+04 \n", - "25% 3.754075 2.505010e+05 \n", - "50% 7.728400 4.536045e+05 \n", - "75% 10.048900 9.702250e+05 \n", - "max 16.000000 2.822400e+06 " - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1245b68", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 7dab7fa79071581a5c07bbceb340536bf5999ee8 Mon Sep 17 00:00:00 2001 From: Tolik Date: Sun, 23 Jun 2024 23:48:31 +0300 Subject: [PATCH 6/7] Release Home Work Machine Learning DSLibraries Lesson8 --- Lesson8/Task1.ipynb | 4329 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4329 insertions(+) create mode 100644 Lesson8/Task1.ipynb diff --git a/Lesson8/Task1.ipynb b/Lesson8/Task1.ipynb new file mode 100644 index 0000000..82adc6c --- /dev/null +++ b/Lesson8/Task1.ipynb @@ -0,0 +1,4329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "p62jUuP4ONfJ" + }, + "source": [ + "# Обучение без учителя в Scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задание 1\n", + "Импортируйте библиотеки pandas, numpy и matplotlib.\n", + "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn.\n", + "Создайте датафреймы X и y из этих 
данных.\n", + "\n", + "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test)\n", + "с помощью функции train_test_split так, чтобы размер тестовой выборки\n", + "составлял 20% от всех данных, при этом аргумент random_state должен быть равен 42.\n", + "\n", + "Масштабируйте данные с помощью StandardScaler.\n", + "\n", + "Постройте модель TSNE на тренировочных данных с параметрами:\n", + "n_components=2, learning_rate=250, random_state=42.\n", + "\n", + "Постройте диаграмму рассеяния на этих данных." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "QuNQtlOkONfL" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "\n", + "%config InlineBackend.figure_format = 'svg'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(None, None)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_boston\n", + "\n", + "boston = load_boston()\n", + "\n", + "data = boston[\"data\"]\n", + "\n", + "feature_names = boston[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "y = pd.DataFrame(boston[\"target\"], columns=[\"price\"])\n", + "\n", + "display(X.head(), y.head())  # single call: renders both previews, cell has no result value" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "id": "fvOnLpaXONfn" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((404, 13), (102, 13))" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "id": "yPxQL0KZONfi" + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler(with_mean=False)\n", + "\n", + "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)\n", + "X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "id": "HWt-_P61ONfo", + "outputId": "faac3c81-85b0-44df-9427-9494e76e33a6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "До:\t(404, 13)\n", + "После:\t(404, 2)\n" + ] + } + ], + "source": [ + "from sklearn.manifold import TSNE\n", + "\n", + "tsne = 
TSNE(n_components=2, learning_rate=250, random_state=42)\n", + "\n", + "X_train_tsne = tsne.fit_transform(X_train_scaled)\n", + "\n", + "print('До:\\t{}'.format(X_train_scaled.shape))\n", + "print('После:\\t{}'.format(X_train_tsne.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "id": "RIGICrkHONfo", + "outputId": "671f4770-6c3f-4f12-b61c-50e7025bfe20", + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-23T23:29:45.942609\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from pylab import rcParams\n", + "\n", + "plt.rcParams['figure.figsize'] = 10, 6\n", + "\n", + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задание 2\n", + "С помощью KMeans разбейте данные из тренировочного набора на 3 кластера,\n", + "используйте все признаки из датафрейма X_train.\n", + "Параметр max_iter должен быть равен 100, random_state сделайте равным 42.\n", + "\n", + "Постройте еще раз диаграмму рассеяния на данных, полученных с помощью TSNE,\n", + "и раскрасьте точки из разных кластеров разными цветами.\n", + "\n", + "Вычислите средние значения price и CRIM в разных кластерах." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "id": "F9QU6VLLONfK" + }, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "id": "sK44YXDaONfT", + "outputId": "6bbf6bf8-d9e3-4571-c975-7b9cdd48fe9c" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-23T23:30:36.734098\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "inertias = []\n", + "\n", + "max_iter = 100\n", + "for i in range(2, max_iter):\n", + " temp_model = KMeans(n_clusters=i, random_state=42)\n", + " temp_model.fit(X_train_scaled)\n", + " \n", + " temp_inertia = temp_model.inertia_\n", + " \n", + " inertias.append(temp_inertia)\n", + "\n", + "plt.plot(range(2, max_iter), inertias)\n", + "\n", + "plt.title('Inertia')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "inertia на графике начинает выравниваться на 17 кластерах, используем данное значение в гиперпараметрах модели" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "id": "fqLX3t5VONfO", + "outputId": "01cace59-8dbd-44bb-f918-8538bb02abe3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Train clustered')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-23T23:30:37.298315\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "model = KMeans(n_clusters=17, random_state=42)\n", + "\n", + "train_labels = model.fit_predict(X_train_tsne)\n", + "\n", + "centers = model.cluster_centers_\n", + "\n", + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1], c=train_labels)\n", + "plt.scatter(centers[:, 0], centers[:, 1], marker='D', color='red')\n", + "\n", + "plt.title('Train clustered')" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ClusterMeanPriceMeanCRIM
00.023.6625002.156072
11.022.0954551.568839
22.022.9450007.046766
33.023.6541671.453769
44.026.2600000.169306
55.025.7400000.923443
66.023.7800001.377391
77.028.1750002.585901
88.025.7214291.479126
99.024.1714292.669954
1010.025.4125001.494771
1111.023.6515151.944150
1212.025.3500000.895165
1313.022.9555560.162179
1414.024.5222220.984675
1515.022.2000002.488093
1616.025.7416670.640832
\n", + "
" + ], + "text/plain": [ + " Cluster MeanPrice MeanCRIM\n", + "0 0.0 23.662500 2.156072\n", + "1 1.0 22.095455 1.568839\n", + "2 2.0 22.945000 7.046766\n", + "3 3.0 23.654167 1.453769\n", + "4 4.0 26.260000 0.169306\n", + "5 5.0 25.740000 0.923443\n", + "6 6.0 23.780000 1.377391\n", + "7 7.0 28.175000 2.585901\n", + "8 8.0 25.721429 1.479126\n", + "9 9.0 24.171429 2.669954\n", + "10 10.0 25.412500 1.494771\n", + "11 11.0 23.651515 1.944150\n", + "12 12.0 25.350000 0.895165\n", + "13 13.0 22.955556 0.162179\n", + "14 14.0 24.522222 0.984675\n", + "15 15.0 22.200000 2.488093\n", + "16 16.0 25.741667 0.640832" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def summary(data):  # per-cluster mean of the price and CRIM columns\n", + " return data.groupby(['Cluster'], as_index=False).agg({'price':'mean', 'CRIM':'mean'})\\\n", + " .rename(columns={'price':'MeanPrice', 'CRIM': 'MeanCRIM'})\n", + "\n", + "summary(pd.concat([X_train, pd.DataFrame(train_labels, columns=['Cluster'], index=X_train.index), y_train], axis=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задание 3 \n", + "Примените модель KMeans, построенную в предыдущем задании, к данным из тестового набора.\n", + "Вычислите средние значения price и CRIM в разных кластерах на тестовых данных.\n", + "Выполните, пожалуйста, если возникнут проблемы при отправке решения - пишите, пожалуйста." + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "test_labels = model.fit_predict(X_test)  # NOTE(review): fit_predict re-fits KMeans on the test set; the task asks to apply the already trained model" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ClusterMeanPriceMeanCRIM
00.019.4666670.341773
11.025.1600000.268740
22.0NaNNaN
33.024.2000000.088260
44.021.8000000.128630
55.030.8000000.027630
66.019.1166670.454337
77.020.9000000.128160
88.022.6500000.071760
99.0NaNNaN
1010.020.3000000.083870
1111.025.0000000.126500
1212.0NaNNaN
1313.024.1000000.115990
1414.025.0000000.028750
1515.022.6000000.046840
1616.0NaNNaN
\n", + "
" + ], + "text/plain": [ + " Cluster MeanPrice MeanCRIM\n", + "0 0.0 19.466667 0.341773\n", + "1 1.0 25.160000 0.268740\n", + "2 2.0 NaN NaN\n", + "3 3.0 24.200000 0.088260\n", + "4 4.0 21.800000 0.128630\n", + "5 5.0 30.800000 0.027630\n", + "6 6.0 19.116667 0.454337\n", + "7 7.0 20.900000 0.128160\n", + "8 8.0 22.650000 0.071760\n", + "9 9.0 NaN NaN\n", + "10 10.0 20.300000 0.083870\n", + "11 11.0 25.000000 0.126500\n", + "12 12.0 NaN NaN\n", + "13 13.0 24.100000 0.115990\n", + "14 14.0 25.000000 0.028750\n", + "15 15.0 22.600000 0.046840\n", + "16 16.0 NaN NaN" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary(pd.concat([X_test, pd.DataFrame(test_labels, columns=['Cluster'], index=X_test.index), y_test], axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "NJVFjt4ZONfV", + "N4LH5gnYONfW", + "7U_GAERXONfh", + "7uBf7K66ONfl" + ], + "name": "5. 
Обучение без учителя в Scikit-learn.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From da7d3c78b4d4e4f4ee08e4df19d1bc60be9902cb Mon Sep 17 00:00:00 2001 From: Tolik Date: Sun, 23 Jun 2024 23:57:00 +0300 Subject: [PATCH 7/7] Modify structure --- Lesson8/Task1.ipynb | 4329 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4329 insertions(+) create mode 100644 Lesson8/Task1.ipynb diff --git a/Lesson8/Task1.ipynb b/Lesson8/Task1.ipynb new file mode 100644 index 0000000..82adc6c --- /dev/null +++ b/Lesson8/Task1.ipynb @@ -0,0 +1,4329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "p62jUuP4ONfJ" + }, + "source": [ + "# Обучение без учителя в Scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задание 1\n", + "Импортируйте библиотеки pandas, numpy и matplotlib.\n", + "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn.\n", + "Создайте датафреймы X и y из этих данных.\n", + "\n", + "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test)\n", + "с помощью функции train_test_split так, чтобы размер тестовой выборки\n", + "составлял 20% от всех данных, при этом аргумент random_state должен быть равен 42.\n", + "\n", + "Масштабируйте данные с помощью StandardScaler.\n", + "\n", + "Постройте модель TSNE на тренировочный данных с параметрами:\n", + "n_components=2, learning_rate=250, random_state=42.\n", + "\n", + "Постройте диаграмму рассеяния на этих данных." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "QuNQtlOkONfL" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "\n", + "%config InlineBackend.figure_format = 'svg'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "(None, None)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_boston\n", + "\n", + "boston = load_boston()\n", + "\n", + "data = boston[\"data\"]\n", + "\n", + "feature_names = boston[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "y = pd.DataFrame(boston[\"target\"], columns=[\"price\"])\n", + "\n", + "display(X.head()), display(y.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "id": "fvOnLpaXONfn" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((404, 13), (102, 13))" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "id": "yPxQL0KZONfi" + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler(with_mean=False)\n", + "\n", + "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)\n", + "X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "id": "HWt-_P61ONfo", + "outputId": "faac3c81-85b0-44df-9427-9494e76e33a6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "До:\t(404, 13)\n", + "После:\t(404, 2)\n" + ] + } + ], + "source": [ + "from sklearn.manifold import TSNE\n", + "\n", + "tsne = 
TSNE(n_components=2, learning_rate=250, random_state=42)\n", + "\n", + "X_train_tsne = tsne.fit_transform(X_train_scaled)\n", + "\n", + "print('До:\\t{}'.format(X_train_scaled.shape))\n", + "print('После:\\t{}'.format(X_train_tsne.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "id": "RIGICrkHONfo", + "outputId": "671f4770-6c3f-4f12-b61c-50e7025bfe20", + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-23T23:29:45.942609\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from pylab import rcParams\n", + "\n", + "plt.rcParams['figure.figsize'] = 10, 6\n", + "\n", + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задание 2\n", + "С помощью KMeans разбейте данные из тренировочного набора на 3 кластера,\n", + "используйте все признаки из датафрейма X_train.\n", + "Параметр max_iter должен быть равен 100, random_state сделайте равным 42.\n", + "\n", + "Постройте еще раз диаграмму рассеяния на данных, полученных с помощью TSNE,\n", + "и раскрасьте точки из разных кластеров разными цветами.\n", + "\n", + "Вычислите средние значения price и CRIM в разных кластерах." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "id": "F9QU6VLLONfK" + }, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "id": "sK44YXDaONfT", + "outputId": "6bbf6bf8-d9e3-4571-c975-7b9cdd48fe9c" + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-23T23:30:36.734098\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "inertias = []\n", + "\n", + "max_iter = 100\n", + "for i in range(2, max_iter):\n", + " temp_model = KMeans(n_clusters=i, random_state=42)\n", + " temp_model.fit(X_train_scaled)\n", + " \n", + " temp_inertia = temp_model.inertia_\n", + " \n", + " inertias.append(temp_inertia)\n", + "\n", + "plt.plot(range(2, max_iter), inertias)\n", + "\n", + "plt.title('Inertia')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "inertia на графике начинает выравниваться на 17 кластерах, используем данное значение в гиперпараметрах модели" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "id": "fqLX3t5VONfO", + "outputId": "01cace59-8dbd-44bb-f918-8538bb02abe3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Train clustered')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2024-06-23T23:30:37.298315\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.3.4, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Fit on the scaled feature matrix (the data the inertia curve above was computed on),\n", + "# not on the 2-D t-SNE embedding: t-SNE cannot transform unseen samples, so a model\n", + "# fitted on the embedding could never be applied to the test set.\n", + "# max_iter=100 and random_state=42 are the values required by the task statement.\n", + "model = KMeans(n_clusters=17, max_iter=100, random_state=42)\n", + "\n", + "train_labels = model.fit_predict(X_train_scaled)\n", + "\n", + "# model.cluster_centers_ now lives in feature space, so for the picture we mark\n", + "# the mean t-SNE position of the points of every cluster instead.\n", + "tsne_centers = pd.DataFrame(X_train_tsne).groupby(train_labels).mean()\n", + "\n", + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1], c=train_labels)\n", + "plt.scatter(tsne_centers[0], tsne_centers[1], marker='D', color='red')\n", + "\n", + "plt.title('Train clustered')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def summary(data):\n", + "    \"\"\"Return mean 'price' and mean 'CRIM' for every value of the 'Cluster' column.\n", + "\n", + "    data must contain the columns 'Cluster', 'price' and 'CRIM'; the result has one\n", + "    row per cluster with the columns 'Cluster', 'MeanPrice' and 'MeanCRIM'.\n", + "    \"\"\"\n", + "    return data.groupby(['Cluster'], as_index=False).agg({'price':'mean', 'CRIM':'mean'})\\\n", + "        .rename(columns={'price':'MeanPrice', 'CRIM': 'MeanCRIM'})\n", + "\n", + "# assign() attaches the labels positionally. Wrapping them in a new DataFrame would give\n", + "# them a fresh 0..n-1 index, and pd.concat(axis=1) aligns on the index, which pairs\n", + "# labels with the wrong rows and introduces NaN when X_train keeps its original index.\n", + "summary(pd.concat([X_train.assign(Cluster=train_labels), y_train], axis=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Задание 3 \n", + "Примените модель KMeans, построенную в предыдущем задании, к данным из тестового набора.\n", + "Вычислите средние значения price и CRIM в разных кластерах на тестовых данных.\n", + "Выполните, пожалуйста, если возникнут проблемы при отправке решения - пишите, пожалуйста." + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply the model trained in Task 2; fit_predict() would discard it and re-cluster the test set.\n", + "# NOTE(review): X_test_scaled is assumed to be X_test transformed with the scaler that\n", + "# produced X_train_scaled - confirm it is defined in the scaling cell above.\n", + "test_labels = model.predict(X_test_scaled)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ClusterMeanPriceMeanCRIM
00.019.4666670.341773
11.025.1600000.268740
22.0NaNNaN
33.024.2000000.088260
44.021.8000000.128630
55.030.8000000.027630
66.019.1166670.454337
77.020.9000000.128160
88.022.6500000.071760
99.0NaNNaN
1010.020.3000000.083870
1111.025.0000000.126500
1212.0NaNNaN
1313.024.1000000.115990
1414.025.0000000.028750
1515.022.6000000.046840
1616.0NaNNaN
\n", + "
" + ], + "text/plain": [ + " Cluster MeanPrice MeanCRIM\n", + "0 0.0 19.466667 0.341773\n", + "1 1.0 25.160000 0.268740\n", + "2 2.0 NaN NaN\n", + "3 3.0 24.200000 0.088260\n", + "4 4.0 21.800000 0.128630\n", + "5 5.0 30.800000 0.027630\n", + "6 6.0 19.116667 0.454337\n", + "7 7.0 20.900000 0.128160\n", + "8 8.0 22.650000 0.071760\n", + "9 9.0 NaN NaN\n", + "10 10.0 20.300000 0.083870\n", + "11 11.0 25.000000 0.126500\n", + "12 12.0 NaN NaN\n", + "13 13.0 24.100000 0.115990\n", + "14 14.0 25.000000 0.028750\n", + "15 15.0 22.600000 0.046840\n", + "16 16.0 NaN NaN" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# assign() attaches the labels positionally. A separate DataFrame of labels would get a\n", + "# fresh 0..n-1 index, and pd.concat(axis=1) aligns on the index, so labels would be paired\n", + "# with the wrong rows and clusters could end up with NaN means.\n", + "summary(pd.concat([X_test.assign(Cluster=test_labels), y_test], axis=1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "NJVFjt4ZONfV", + "N4LH5gnYONfW", + "7U_GAERXONfh", + "7uBf7K66ONfl" + ], + "name": "5. Обучение без учителя в Scikit-learn.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}