diff --git a/Lesson6/Task2.ipynb b/Lesson6/Task2.ipynb new file mode 100644 index 0000000..2ebd9ce --- /dev/null +++ b/Lesson6/Task2.ipynb @@ -0,0 +1,1979 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b2f2e5e2", + "metadata": {}, + "source": [ + "## Тема “Обучение с учителем”" + ] + }, + { + "cell_type": "markdown", + "id": "16595a84", + "metadata": {}, + "source": [ + "### Задание 1\n", + "Импортируйте библиотеки pandas и numpy.\n", + "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn.\n", + "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью\n", + "функции train_test_split так, чтобы размер тестовой выборки\n", + "составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n", + "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля\n", + "sklearn.linear_model.\n", + "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на\n", + "тестовых." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "274303e6", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "693a9c36", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "pd.options.display.max_columns = 100\n", + "\n", + "from sklearn.datasets import load_boston" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a37f783d", + "metadata": {}, + "outputs": [], + "source": [ + "boston = load_boston()\n", + "\n", + "feature_names = boston[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(boston[\"data\"], columns=feature_names)\n", + "y = pd.DataFrame(boston[\"target\"], columns=[\"price\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5a2e0780", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((354, 13), (152, 13))" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "96164976", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "\n", + "lr = LinearRegression()\n", + "\n", + "lr.fit(X_train, y_train)\n", + "\n", + "lr_pred = lr.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "id": "a07cffa5", + "metadata": {}, + "source": [ + "### Задание 2\n", + "Создайте модель под названием model с помощью класса RandomForestRegressor из модуля\n", + "sklearn.ensemble.\n", + "Сделайте аргумент n_estimators равным 1000,\n", + "max_depth должен быть равен 12 и random_state сделайте равным 42.\n", + "Обучите 
модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n", + "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n", + "чтобы получить из датафрейма одномерный массив Numpy,\n", + "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно\n", + "применение массивов вместо датафрейма.\n", + "Сделайте предсказание на тестовых данных и посчитайте R2. Сравните с результатом из\n", + "предыдущего задания.\n", + "Напишите в комментариях к коду, какая модель в данном случае работает лучше" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "64042e74", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "оценка R2 модели ансамбля случайного леса выше чем у линейной регрессии\n", + "RandomForestRegressor=0.8747\n", + "LinearRegression=0.7112\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import r2_score\n", + "\n", + "model = RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)\n", + "\n", + "model.fit(X_train, y_train.values[:,0])\n", + "\n", + "rf_pred = model.predict(X_test)\n", + "\n", + "r2_lr = r2_score(y_test, lr_pred)\n", + "r2_rf = r2_score(y_test, rf_pred)\n", + "\n", + "print(f\"оценка R2 модели ансамбля случайного леса {'выше' if r2_rf > r2_lr else 'ниже' } \" \n", + " f\"чем у линейной регрессии\\nRandomForestRegressor={r2_rf:.4f}\\nLinearRegression={r2_lr:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "94b0548e", + "metadata": {}, + "source": [ + "### *Задание 3\n", + "Вызовите документацию для класса RandomForestRegressor,\n", + "найдите информацию об атрибуте feature_importances_.\n", + "С помощью этого атрибута найдите сумму всех показателей важности,\n", + "установите, какие два признака показывают наибольшую важность." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1327004d", + "metadata": {}, + "outputs": [], + "source": [ + "?RandomForestRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c8e1fab0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "сумма показателей важности признаков модели = 1.0\n" + ] + } + ], + "source": [ + "print(f\"сумма показателей важности признаков модели = {np.sum(model.feature_importances_)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "744deb30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "первые два наиболее важные признака\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featureimportance
12LSTAT0.415847
5RM0.402682
\n", + "
" + ], + "text/plain": [ + " feature importance\n", + "12 LSTAT 0.415847\n", + "5 RM 0.402682" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"первые два наиболее важные признака\")\n", + "pd.DataFrame({'feature': feature_names,'importance': model.feature_importances_}) \\\n", + ".sort_values('importance',ascending = False) \\\n", + ".head(2)" + ] + }, + { + "cell_type": "markdown", + "id": "b9004884", + "metadata": {}, + "source": [ + "## *Задание 4\n", + "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию\n", + "по библиотеке Matplotlib, это датасет Credit Card Fraud Detection.Для этого датасета мы будем решать\n", + "задачу классификации - будем определять,какие из транзакции по кредитной карте являются\n", + "мошенническими.Данный датасет сильно несбалансирован (так как случаи мошенничества\n", + "относительно редки),так что применение метрики accuracy не принесет пользы и не поможет выбрать\n", + "лучшую модель.Мы будем вычислять AUC, то есть площадь под кривой ROC.\n", + "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n", + "Загрузите датасет creditcard.csv и создайте датафрейм df." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a65b98a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud\n", + "License(s): DbCL-1.0\n", + "Downloading creditcardfraud.zip to W:\\Projects\\GB\\Python\\MLearning\\DSLibraries\\Lesson6\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " 0%| | 0.00/66.0M [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeV1V2V3V4V5V6V7V8V9V10V11V12V13V14V15V16V17V18V19V20V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794-0.551600-0.617801-0.991390-0.3111691.468177-0.4704010.2079710.0257910.4039930.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.1669741.6127271.0652350.489095-0.1437720.6355580.463917-0.114805-0.183361-0.145783-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.2076430.6245010.0660840.717293-0.1659462.345865-2.8900831.109969-0.121359-2.2618570.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952-0.2264870.1782280.507757-0.287924-0.631418-1.059647-0.6840931.965775-1.232622-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074-0.8228430.5381961.345852-1.1196700.175121-0.451449-0.237033-0.0381950.8034870.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990
52.0-0.4259660.9605231.141109-0.1682520.420987-0.0297280.4762010.260314-0.568671-0.3714071.3412620.359894-0.358091-0.1371340.5176170.401726-0.0581330.068653-0.0331940.084968-0.208254-0.559825-0.026398-0.371427-0.2327940.1059150.2538440.0810803.670
64.01.2296580.1410040.0453711.2026130.1918810.272708-0.0051590.0812130.464960-0.099254-1.416907-0.153826-0.7510630.1673720.050144-0.4435870.002821-0.611987-0.045575-0.219633-0.167716-0.270710-0.154104-0.7800550.750137-0.2572370.0345070.0051684.990
77.0-0.6442691.4179641.074380-0.4921990.9489340.4281181.120631-3.8078640.6153751.249376-0.6194680.2914741.757964-1.3238650.686133-0.076127-1.222127-0.3582220.324505-0.1567421.943465-1.0154550.057504-0.649709-0.415267-0.051634-1.206921-1.08533940.800
87.0-0.8942860.286157-0.113192-0.2715262.6695993.7218180.3701450.851084-0.392048-0.410430-0.705117-0.110452-0.2862540.074355-0.328783-0.210077-0.4997680.1187650.5703280.052736-0.073425-0.268092-0.2042331.0115920.373205-0.3841570.0117470.14240493.200
99.0-0.3382621.1195931.044367-0.2221870.499361-0.2467610.6515830.069539-0.736727-0.3668461.0176140.8363901.006844-0.4435230.1502190.739453-0.5409800.4766770.4517730.203711-0.246914-0.633753-0.120794-0.385050-0.0697330.0941990.2462190.0830763.680
\n", + "" + ], + "text/plain": [ + " Time V1 V2 V3 V4 V5 V6 V7 \\\n", + "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n", + "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n", + "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n", + "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n", + "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n", + "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n", + "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n", + "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n", + "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n", + "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n", + "\n", + " V8 V9 V10 V11 V12 V13 V14 \\\n", + "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n", + "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n", + "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n", + "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n", + "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n", + "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n", + "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n", + "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n", + "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n", + "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n", + "\n", + " V15 V16 V17 V18 V19 V20 V21 \\\n", + "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n", + "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n", + "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n", + "3 -0.631418 -1.059647 -0.684093 1.965775 
-1.232622 -0.208038 -0.108300 \n", + "4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n", + "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n", + "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n", + "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n", + "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n", + "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n", + "\n", + " V22 V23 V24 V25 V26 V27 V28 \\\n", + "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n", + "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n", + "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n", + "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n", + "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n", + "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n", + "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n", + "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n", + "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n", + "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n", + "\n", + " Amount Class \n", + "0 149.62 0 \n", + "1 2.69 0 \n", + "2 378.66 0 \n", + "3 123.50 0 \n", + "4 69.99 0 \n", + "5 3.67 0 \n", + "6 4.99 0 \n", + "7 40.80 0 \n", + "8 93.20 0 \n", + "9 3.68 0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from zipfile import ZipFile\n", + "\n", + "# Unpack the downloaded Kaggle archive into the working directory.\n", + "# The context manager closes the archive handle once extraction is done.\n", + "with ZipFile(\"creditcardfraud.zip\") as archive:\n", + "    archive.extractall(\".\")\n", + "\n", + "df = pd.read_csv(\"creditcard.csv\")\n", + "\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "9302f8fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + 
"RangeIndex: 284807 entries, 0 to 284806\n", + "Data columns (total 31 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Time 284807 non-null float64\n", + " 1 V1 284807 non-null float64\n", + " 2 V2 284807 non-null float64\n", + " 3 V3 284807 non-null float64\n", + " 4 V4 284807 non-null float64\n", + " 5 V5 284807 non-null float64\n", + " 6 V6 284807 non-null float64\n", + " 7 V7 284807 non-null float64\n", + " 8 V8 284807 non-null float64\n", + " 9 V9 284807 non-null float64\n", + " 10 V10 284807 non-null float64\n", + " 11 V11 284807 non-null float64\n", + " 12 V12 284807 non-null float64\n", + " 13 V13 284807 non-null float64\n", + " 14 V14 284807 non-null float64\n", + " 15 V15 284807 non-null float64\n", + " 16 V16 284807 non-null float64\n", + " 17 V17 284807 non-null float64\n", + " 18 V18 284807 non-null float64\n", + " 19 V19 284807 non-null float64\n", + " 20 V20 284807 non-null float64\n", + " 21 V21 284807 non-null float64\n", + " 22 V22 284807 non-null float64\n", + " 23 V23 284807 non-null float64\n", + " 24 V24 284807 non-null float64\n", + " 25 V25 284807 non-null float64\n", + " 26 V26 284807 non-null float64\n", + " 27 V27 284807 non-null float64\n", + " 28 V28 284807 non-null float64\n", + " 29 Amount 284807 non-null float64\n", + " 30 Class 284807 non-null int64 \n", + "dtypes: float64(30), int64(1)\n", + "memory usage: 67.4 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "4ba00248", + "metadata": {}, + "source": [ + "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n", + "Создайте объект Series под названием y из столбца Class.\n", + "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split,\n", + "используя аргументы: test_size=0.3, random_state=100, stratify=y.\n", + "У вас должны получиться объекты X_train, X_test, y_train и y_test.\n", + "Просмотрите информацию о их форме." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "59e1e34e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "выборка не сбалансированна, данных первого класса значительно меньше\n", + "0 0.998273\n", + "1 0.001727\n", + "Name: Class, dtype: float64\n" + ] + }, + { + "data": { + "text/plain": [ + "((199364, 30), (85443, 30), (199364,), (85443,))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target = \"Class\"\n", + "\n", + "# Features are every column except the label; the label is kept as a Series.\n", + "X = df.drop(columns=[target])\n", + "y = df[target]\n", + "\n", + "# Class shares show how rare the positive (fraud) class is.\n", + "print(f\"выборка не сбалансированна, данных первого класса значительно меньше\\n{y.value_counts(normalize=True)}\")\n", + "\n", + "# stratify=y keeps the class ratio the same in both parts of the split.\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + "    X,\n", + "    y,\n", + "    test_size=0.3,\n", + "    stratify=y,\n", + "    random_state=100,\n", + ")\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "id": "fe7fae18", + "metadata": {}, + "source": [ + "Для поиска по сетке параметров задайте такие параметры:\n", + "parameters = [{'n_estimators': [10, 15],\n", + "'max_features': np.arange(3, 5),\n", + "'max_depth': np.arange(4, 7)}]\n", + "Создайте модель GridSearchCV со следующими аргументами:\n", + "estimator=RandomForestClassifier(random_state=100),\n", + "param_grid=parameters,\n", + "scoring='roc_auc',\n", + "cv=3." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "c034c2a4", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "# Imported in this cell so that it does not rely on an import made elsewhere in the session.\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "parameters = {\n", + " 'n_estimators': [10, 15],\n", + " 'max_features': np.arange(3, 5),\n", + " 'max_depth': np.arange(4, 7),\n", + "}\n", + "\n", + "clf = GridSearchCV(\n", + " estimator=RandomForestClassifier(random_state=100),\n", + " param_grid=parameters,\n", + " scoring='roc_auc',\n", + " cv=3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a4308ac3", + "metadata": {}, + "source": [ + "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n", + "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n", + "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n", + "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и\n", + "запишите в массив y_pred_proba. Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n", + "Вычислите AUC на тестовых данных и сравните с результатом,полученным на тренировочных данных,\n", + "используя в качестве аргументов массивы y_test и y_pred_proba" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "9b43d0b7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n", + " param_grid={'max_depth': array([4, 5, 6]),\n", + " 'max_features': array([3, 4]),\n", + " 'n_estimators': [10, 15]},\n", + " scoring='roc_auc')" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "966c3d4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}" + ] + }, + "execution_count": 105, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "405d63d0", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import roc_auc_score\n", + "\n", + "# GridSearchCV refits the winning configuration on the whole training set\n", + "# (refit=True is the default), so reuse that estimator instead of re-typing\n", + "# the hyperparameters by hand: it keeps random_state=100 and always matches\n", + "# clf.best_params_.\n", + "model = clf.best_estimator_\n", + "\n", + "# Column 1 holds the predicted probability of class 1.\n", + "y_pred_proba = model.predict_proba(X_test)[:, 1]\n", + "\n", + "auc = roc_auc_score(y_test, y_pred_proba)\n", + "\n", + "# clf.best_score_ is the mean cross-validated AUC of the best parameter set,\n", + "# measured on folds of the training data.\n", + "print(f\"метрика AUC={auc:.4f} на тестовых данных {'больше' if auc > clf.best_score_ else 'меньше'} \"\n", + " f\"метрики AUC={clf.best_score_:.4f} на обучающем наборе\")" + ] + }, + { + "cell_type": "markdown", + "id": "5cb517be", + "metadata": {}, + "source": [ + "## *Дополнительные задания:\n", + "1). Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в\n", + "переменную data." + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "1dbcea26", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "wine = load_wine()\n", + "\n", + "data = wine[\"data\"]" + ] + }, + { + "cell_type": "markdown", + "id": "18dea80c", + "metadata": {}, + "source": [ + "2). Полученный датасет не является датафреймом. Это структура данных, имеющая ключи\n", + "аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys,\n", + "содержащий ее ключи." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d5f5e52a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# wine.keys() returns a dict_keys view; the task asks for an actual list.\n", + "data_keys = list(wine.keys())\n", + "data_keys" + ] + }, + { + "cell_type": "markdown", + "id": "73bff5cf", + "metadata": {}, + "source": [ + "3). Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде\n", + "привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими\n", + "переносами и т.д." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "39433029", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. _wine_dataset:\n", + "\n", + "Wine recognition dataset\n", + "------------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 178 (50 in each of three classes)\n", + " :Number of Attributes: 13 numeric, predictive attributes and the class\n", + " :Attribute Information:\n", + " \t\t- Alcohol\n", + " \t\t- Malic acid\n", + " \t\t- Ash\n", + "\t\t- Alcalinity of ash \n", + " \t\t- Magnesium\n", + "\t\t- Total phenols\n", + " \t\t- Flavanoids\n", + " \t\t- Nonflavanoid phenols\n", + " \t\t- Proanthocyanins\n", + "\t\t- Color intensity\n", + " \t\t- Hue\n", + " \t\t- OD280/OD315 of diluted wines\n", + " \t\t- Proline\n", + "\n", + " - class:\n", + " - class_0\n", + " - class_1\n", + " - class_2\n", + "\t\t\n", + " :Summary Statistics:\n", + " \n", + " ============================= ==== ===== ======= =====\n", + " Min Max Mean SD\n", + " ============================= ==== ===== ======= =====\n", + " Alcohol: 11.0 14.8 13.0 0.8\n", + " Malic Acid: 0.74 5.80 2.34 1.12\n", + " Ash: 1.36 3.23 2.36 0.27\n", + " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n", + " Magnesium: 
70.0 162.0 99.7 14.3\n", + " Total Phenols: 0.98 3.88 2.29 0.63\n", + " Flavanoids: 0.34 5.08 2.03 1.00\n", + " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n", + " Proanthocyanins: 0.41 3.58 1.59 0.57\n", + " Colour Intensity: 1.3 13.0 5.1 2.3\n", + " Hue: 0.48 1.71 0.96 0.23\n", + " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n", + " Proline: 278 1680 746 315\n", + " ============================= ==== ===== ======= =====\n", + "\n", + " :Missing Attribute Values: None\n", + " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n", + " :Creator: R.A. Fisher\n", + " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", + " :Date: July, 1988\n", + "\n", + "This is a copy of UCI ML Wine recognition datasets.\n", + "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n", + "\n", + "The data is the results of a chemical analysis of wines grown in the same\n", + "region in Italy by three different cultivators. There are thirteen different\n", + "measurements taken for different constituents found in the three types of\n", + "wine.\n", + "\n", + "Original Owners: \n", + "\n", + "Forina, M. et al, PARVUS - \n", + "An Extendible Package for Data Exploration, Classification and Correlation. \n", + "Institute of Pharmaceutical and Food Analysis and Technologies,\n", + "Via Brigata Salerno, 16147 Genoa, Italy.\n", + "\n", + "Citation:\n", + "\n", + "Lichman, M. (2013). UCI Machine Learning Repository\n", + "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n", + "School of Information and Computer Science. \n", + "\n", + ".. topic:: References\n", + "\n", + " (1) S. Aeberhard, D. Coomans and O. de Vel, \n", + " Comparison of Classifiers in High Dimensional Settings, \n", + " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n", + " Mathematics and Statistics, James Cook University of North Queensland. \n", + " (Also submitted to Technometrics). 
\n", + "\n", + " The data was used with many others for comparing various \n", + " classifiers. The classes are separable, though only RDA \n", + " has achieved 100% correct classification. \n", + " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n", + " (All results using the leave-one-out technique) \n", + "\n", + " (2) S. Aeberhard, D. Coomans and O. de Vel, \n", + " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n", + " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n", + " Mathematics and Statistics, James Cook University of North Queensland. \n", + " (Also submitted to Journal of Chemometrics).\n", + "\n" + ] + } + ], + "source": [ + "print(wine[\"DESCR\"])" + ] + }, + { + "cell_type": "markdown", + "id": "772359a0", + "metadata": {}, + "source": [ + "4). Сколько классов содержит целевая переменная датасета? Выведите названия классов." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "183b0c76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['class_0' 'class_1' 'class_2']\n", + "кол-во:3\n" + ] + } + ], + "source": [ + "target_class = wine[\"target_names\"]\n", + "\n", + "print(f\"{target_class}\\nкол-во:{len(target_class)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "41d6690b", + "metadata": {}, + "source": [ + "5). На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков\n", + "создайте датафрейм под названием X." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "15557fbe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesproline
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.0
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.0
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.0
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.0
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.0
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + "\n", + " od280/od315_of_diluted_wines proline \n", + "0 3.92 1065.0 \n", + "1 3.40 1050.0 \n", + "2 3.17 1185.0 \n", + "3 3.45 1480.0 \n", + "4 2.93 735.0 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_names = wine[\"feature_names\"]\n", + "\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e9ef2d4b", + "metadata": {}, + "source": [ + "6). Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "9f2ee54a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 178 entries, 0 to 177\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 alcohol 178 non-null float64\n", + " 1 malic_acid 178 non-null float64\n", + " 2 ash 178 non-null float64\n", + " 3 alcalinity_of_ash 178 non-null float64\n", + " 4 magnesium 178 non-null float64\n", + " 5 total_phenols 178 non-null float64\n", + " 6 flavanoids 178 non-null float64\n", + " 7 nonflavanoid_phenols 178 non-null float64\n", + " 8 proanthocyanins 178 non-null float64\n", + " 9 color_intensity 178 non-null float64\n", + " 10 hue 178 non-null float64\n", + " 11 od280/od315_of_diluted_wines 178 non-null float64\n", + " 12 proline 178 non-null float64\n", + "dtypes: float64(13)\n", + "memory usage: 18.2 KB\n" + ] + } + ], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "90a743fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "размер = (178, 13)\n", + "кол-во пустых значений\n", + "alcohol 0\n", + "malic_acid 0\n", + "ash 0\n", + "alcalinity_of_ash 0\n", + "magnesium 0\n", + "total_phenols 0\n", + "flavanoids 0\n", + "nonflavanoid_phenols 0\n", + "proanthocyanins 0\n", + "color_intensity 0\n", + "hue 0\n", + "od280/od315_of_diluted_wines 0\n", + "proline 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(f\"размер = {X.shape}\\nкол-во пустых значений\\n{X.isnull().sum(axis=0)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cf169b47", + "metadata": {}, + "source": [ + "7). Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64.\n", + "Название поля - 'target'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "3883dbe8", + "metadata": {}, + "outputs": [], + "source": [ + "X[\"target\"] = wine[\"target\"].astype(\"int64\")" + ] + }, + { + "cell_type": "markdown", + "id": "e5918c4e", + "metadata": {}, + "source": [ + "8). Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название\n", + "X_corr" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "d4c9b5e1", + "metadata": {}, + "outputs": [], + "source": [ + "X_corr = X.corr()" + ] + }, + { + "cell_type": "markdown", + "id": "43fd36d4", + "metadata": {}, + "source": [ + "9). Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному\n", + "значению превышает 0.5 (причем само поле target не должно входить в этот список)." + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "a788a848", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['alcalinity_of_ash',\n", + " 'total_phenols',\n", + " 'flavanoids',\n", + " 'hue',\n", + " 'od280/od315_of_diluted_wines',\n", + " 'proline']" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_corr = [item for item in X_corr[abs(X_corr.target) > 0.5].index if item != \"target\"]\n", + "high_corr" + ] + }, + { + "cell_type": "markdown", + "id": "beab1b7d", + "metadata": {}, + "source": [ + "10). Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых\n", + "содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X\n", + "соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака.\n", + "Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с\n", + "признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с\n", + "помощью метода describe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "426afeb5", + "metadata": {}, + "outputs": [], + "source": [ + "X.drop(\"target\", axis = 1, inplace = True) \n", + "\n", + "for item in high_corr:\n", + " X[item+\"_2\"] = X[item] ** 2" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "382274e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolinealcalinity_of_ash_2total_phenols_2flavanoids_2hue_2od280/od315_of_diluted_wines_2proline_2
count178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.000000178.0000001.780000e+02
mean13.0006182.3363482.36651719.49494499.7415732.2951122.0292700.3618541.5908995.0580900.9574492.611685746.893258391.1428655.6570305.1100490.9686617.3221556.564591e+05
std0.8118271.1171460.2743443.33956414.2824840.6258510.9988590.1244530.5723592.3182860.2285720.709990314.907474133.6717752.9362944.2114410.4437983.5843165.558591e+05
min11.0300000.7400001.36000010.60000070.0000000.9800000.3400000.1300000.4100001.2800000.4800001.270000278.000000112.3600000.9604000.1156000.2304001.6129007.728400e+04
25%12.3625001.6025002.21000017.20000088.0000001.7425001.2050000.2700001.2500003.2200000.7825001.937500500.500000295.8400003.0363251.4521000.6123253.7540752.505010e+05
50%13.0500001.8650002.36000019.50000098.0000002.3550002.1350000.3400001.5550004.6900000.9650002.780000673.500000380.2500005.5460504.5582500.9312507.7284004.536045e+05
75%13.6775003.0825002.55750021.500000107.0000002.8000002.8750000.4375001.9500006.2000001.1200003.170000985.000000462.2500007.8400008.2657001.25440010.0489009.702250e+05
max14.8300005.8000003.23000030.000000162.0000003.8800005.0800000.6600003.58000013.0000001.7100004.0000001680.000000900.00000015.05440025.8064002.92410016.0000002.822400e+06
\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n", + "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n", + "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n", + "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n", + "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n", + "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n", + "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n", + "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 2.295112 2.029270 0.361854 1.590899 \n", + "std 0.625851 0.998859 0.124453 0.572359 \n", + "min 0.980000 0.340000 0.130000 0.410000 \n", + "25% 1.742500 1.205000 0.270000 1.250000 \n", + "50% 2.355000 2.135000 0.340000 1.555000 \n", + "75% 2.800000 2.875000 0.437500 1.950000 \n", + "max 3.880000 5.080000 0.660000 3.580000 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 5.058090 0.957449 2.611685 746.893258 \n", + "std 2.318286 0.228572 0.709990 314.907474 \n", + "min 1.280000 0.480000 1.270000 278.000000 \n", + "25% 3.220000 0.782500 1.937500 500.500000 \n", + "50% 4.690000 0.965000 2.780000 673.500000 \n", + "75% 6.200000 1.120000 3.170000 985.000000 \n", + "max 13.000000 1.710000 4.000000 1680.000000 \n", + "\n", + " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n", + "count 178.000000 178.000000 178.000000 178.000000 \n", + "mean 391.142865 5.657030 5.110049 0.968661 \n", + "std 133.671775 2.936294 4.211441 0.443798 \n", + "min 112.360000 0.960400 0.115600 0.230400 \n", + "25% 295.840000 3.036325 1.452100 0.612325 \n", + "50% 380.250000 5.546050 4.558250 0.931250 \n", + "75% 462.250000 7.840000 8.265700 1.254400 \n", + "max 900.000000 
15.054400 25.806400 2.924100 \n", + "\n", + " od280/od315_of_diluted_wines_2 proline_2 \n", + "count 178.000000 1.780000e+02 \n", + "mean 7.322155 6.564591e+05 \n", + "std 3.584316 5.558591e+05 \n", + "min 1.612900 7.728400e+04 \n", + "25% 3.754075 2.505010e+05 \n", + "50% 7.728400 4.536045e+05 \n", + "75% 10.048900 9.702250e+05 \n", + "max 16.000000 2.822400e+06 " + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1245b68", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}