diff --git a/Lesson6/Task2.ipynb b/Lesson6/Task2.ipynb
new file mode 100644
index 0000000..2ebd9ce
--- /dev/null
+++ b/Lesson6/Task2.ipynb
@@ -0,0 +1,1979 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b2f2e5e2",
+ "metadata": {},
+ "source": [
+ "## Тема “Обучение с учителем”"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16595a84",
+ "metadata": {},
+ "source": [
+ "### Задание 1\n",
+ "Импортируйте библиотеки pandas и numpy.\n",
+ "Загрузите \"Boston House Prices dataset\" из встроенных наборов данных библиотеки sklearn.\n",
+ "Разбейте эти датафреймы на тренировочные (X_train, y_train) и тестовые (X_test, y_test) с помощью\n",
+ "функции train_test_split так, чтобы размер тестовой выборки\n",
+ "составлял 30% от всех данных, при этом аргумент random_state должен быть равен 42.\n",
+ "Создайте модель линейной регрессии под названием lr с помощью класса LinearRegression из модуля\n",
+ "sklearn.linear_model.\n",
+ "Обучите модель на тренировочных данных (используйте все признаки) и сделайте предсказание на\n",
+ "тестовых."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "274303e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "693a9c36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "pd.options.display.max_columns = 100\n",
+ "\n",
+ "from sklearn.datasets import load_boston"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "a37f783d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load_boston() returns a dict-like Bunch whose keys are also exposed as attributes.\n",
+ "boston = load_boston()\n",
+ "feature_names = boston.feature_names\n",
+ "\n",
+ "# Features and target are wrapped in DataFrames; the single target column is named \"price\".\n",
+ "X = pd.DataFrame(data=boston.data, columns=feature_names)\n",
+ "y = pd.DataFrame({\"price\": boston.target})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "5a2e0780",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((354, 13), (152, 13))"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# 30% of the rows are held out for testing; the fixed seed makes the split repeatable.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X,\n",
+ "    y,\n",
+ "    test_size=0.3,\n",
+ "    random_state=42,\n",
+ ")\n",
+ "\n",
+ "(X_train.shape, X_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "96164976",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "\n",
+ "# fit() returns the estimator itself, so construction and training are chained.\n",
+ "lr = LinearRegression().fit(X_train, y_train)\n",
+ "\n",
+ "# Predictions for the held-out rows, scored later against the random forest.\n",
+ "lr_pred = lr.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a07cffa5",
+ "metadata": {},
+ "source": [
+ "### Задание 2\n",
+ "Создайте модель под названием model с помощью класса RandomForestRegressor из модуля\n",
+ "sklearn.ensemble.\n",
+ "Сделайте аргумент n_estimators равным 1000,\n",
+ "max_depth должен быть равен 12 и random_state сделайте равным 42.\n",
+ "Обучите модель на тренировочных данных аналогично тому, как вы обучали модель LinearRegression,\n",
+ "но при этом в метод fit вместо датафрейма y_train поставьте y_train.values[:, 0],\n",
+ "чтобы получить из датафрейма одномерный массив Numpy,\n",
+ "так как для класса RandomForestRegressor в данном методе для аргумента y предпочтительно\n",
+ "применение массивов вместо датафрейма.\n",
+ "Сделайте предсказание на тестовых данных и посчитайте R2. Сравните с результатом из\n",
+ "предыдущего задания.\n",
+ "Напишите в комментариях к коду, какая модель в данном случае работает лучше"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "64042e74",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "оценка R2 модели ансамбля случайного леса выше чем у линейной регрессии\n",
+ "RandomForestRegressor=0.8747\n",
+ "LinearRegression=0.7112\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "# The forest is trained on a 1-D target array (first column of y_train), not on the DataFrame.\n",
+ "model = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)\n",
+ "model.fit(X_train, y_train.values[:, 0])\n",
+ "rf_pred = model.predict(X_test)\n",
+ "\n",
+ "# R2 of both models on the same test split.\n",
+ "r2_lr = r2_score(y_test, lr_pred)\n",
+ "r2_rf = r2_score(y_test, rf_pred)\n",
+ "\n",
+ "# In the saved run the random forest works better here (R2 0.8747 vs 0.7112).\n",
+ "if r2_rf > r2_lr:\n",
+ "    verdict = 'выше'\n",
+ "else:\n",
+ "    verdict = 'ниже'\n",
+ "\n",
+ "print(\n",
+ "    f\"оценка R2 модели ансамбля случайного леса {verdict} \"\n",
+ "    f\"чем у линейной регрессии\\nRandomForestRegressor={r2_rf:.4f}\\nLinearRegression={r2_lr:.4f}\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94b0548e",
+ "metadata": {},
+ "source": [
+ "### *Задание 3\n",
+ "Вызовите документацию для класса RandomForestRegressor,\n",
+ "найдите информацию об атрибуте feature_importances_.\n",
+ "С помощью этого атрибута найдите сумму всех показателей важности,\n",
+ "установите, какие два признака показывают наибольшую важность."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "1327004d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "?RandomForestRegressor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c8e1fab0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "сумма показателей важности признаков модели = 1.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Total of the fitted forest's per-feature importances.\n",
+ "importance_total = model.feature_importances_.sum()\n",
+ "print(f\"сумма показателей важности признаков модели = {importance_total}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "744deb30",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "первые два наиболее важные признака\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " feature | \n",
+ " importance | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 12 | \n",
+ " LSTAT | \n",
+ " 0.415847 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " RM | \n",
+ " 0.402682 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature importance\n",
+ "12 LSTAT 0.415847\n",
+ "5 RM 0.402682"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"первые два наиболее важные признака\")\n",
+ "\n",
+ "# Pair each feature name with its importance, then show the two largest.\n",
+ "importance_table = pd.DataFrame(\n",
+ "    {'feature': feature_names, 'importance': model.feature_importances_}\n",
+ ")\n",
+ "importance_table.sort_values(by='importance', ascending=False).head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9004884",
+ "metadata": {},
+ "source": [
+ "## *Задание 4\n",
+ "В этом задании мы будем работать с датасетом, с которым мы уже знакомы по домашнему заданию\n",
+ "по библиотеке Matplotlib, это датасет Credit Card Fraud Detection. Для этого датасета мы будем решать\n",
+ "задачу классификации - будем определять, какие из транзакций по кредитной карте являются\n",
+ "мошенническими. Данный датасет сильно несбалансирован (так как случаи мошенничества\n",
+ "относительно редки), так что применение метрики accuracy не принесет пользы и не поможет выбрать\n",
+ "лучшую модель. Мы будем вычислять AUC, то есть площадь под кривой ROC.\n",
+ "Импортируйте из соответствующих модулей RandomForestClassifier, GridSearchCV и train_test_split.\n",
+ "Загрузите датасет creditcard.csv и создайте датафрейм df."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "a65b98a0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud\n",
+ "License(s): DbCL-1.0\n",
+ "Downloading creditcardfraud.zip to W:\\Projects\\GB\\Python\\MLearning\\DSLibraries\\Lesson6\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " 0%| | 0.00/66.0M [00:00, ?B/s]\n",
+ " 2%|1 | 1.00M/66.0M [00:01<01:18, 864kB/s]\n",
+ " 3%|3 | 2.00M/66.0M [00:01<00:57, 1.17MB/s]\n",
+ " 6%|6 | 4.00M/66.0M [00:02<00:24, 2.65MB/s]\n",
+ " 9%|9 | 6.00M/66.0M [00:02<00:15, 4.18MB/s]\n",
+ " 11%|# | 7.00M/66.0M [00:02<00:12, 4.88MB/s]\n",
+ " 14%|#3 | 9.00M/66.0M [00:02<00:09, 6.39MB/s]\n",
+ " 17%|#6 | 11.0M/66.0M [00:02<00:07, 7.61MB/s]\n",
+ " 20%|#9 | 13.0M/66.0M [00:02<00:06, 8.38MB/s]\n",
+ " 23%|##2 | 15.0M/66.0M [00:03<00:08, 6.29MB/s]\n",
+ " 24%|##4 | 16.0M/66.0M [00:03<00:07, 6.61MB/s]\n",
+ " 27%|##7 | 18.0M/66.0M [00:03<00:06, 7.73MB/s]\n",
+ " 30%|### | 20.0M/66.0M [00:03<00:05, 8.62MB/s]\n",
+ " 32%|###1 | 21.0M/66.0M [00:04<00:05, 8.79MB/s]\n",
+ " 35%|###4 | 23.0M/66.0M [00:04<00:04, 9.52MB/s]\n",
+ " 38%|###7 | 25.0M/66.0M [00:04<00:04, 10.0MB/s]\n",
+ " 39%|###9 | 26.0M/66.0M [00:04<00:04, 9.94MB/s]\n",
+ " 42%|####2 | 28.0M/66.0M [00:04<00:03, 10.3MB/s]\n",
+ " 45%|####5 | 30.0M/66.0M [00:04<00:03, 10.6MB/s]\n",
+ " 49%|####8 | 32.0M/66.0M [00:05<00:03, 10.3MB/s]\n",
+ " 50%|##### | 33.0M/66.0M [00:05<00:03, 10.1MB/s]\n",
+ " 53%|#####3 | 35.0M/66.0M [00:05<00:03, 10.5MB/s]\n",
+ " 56%|#####6 | 37.0M/66.0M [00:05<00:02, 10.4MB/s]\n",
+ " 58%|#####7 | 38.0M/66.0M [00:05<00:03, 9.68MB/s]\n",
+ " 59%|#####9 | 39.0M/66.0M [00:05<00:03, 8.84MB/s]\n",
+ " 61%|###### | 40.0M/66.0M [00:06<00:03, 7.78MB/s]\n",
+ " 62%|######2 | 41.0M/66.0M [00:06<00:03, 7.08MB/s]\n",
+ " 64%|######3 | 42.0M/66.0M [00:06<00:03, 6.98MB/s]\n",
+ " 65%|######5 | 43.0M/66.0M [00:06<00:03, 7.10MB/s]\n",
+ " 67%|######6 | 44.0M/66.0M [00:06<00:03, 7.20MB/s]\n",
+ " 68%|######8 | 45.0M/66.0M [00:06<00:03, 7.05MB/s]\n",
+ " 70%|######9 | 46.0M/66.0M [00:07<00:03, 6.95MB/s]\n",
+ " 71%|#######1 | 47.0M/66.0M [00:07<00:02, 7.32MB/s]\n",
+ " 73%|#######2 | 48.0M/66.0M [00:07<00:02, 7.36MB/s]\n",
+ " 74%|#######4 | 49.0M/66.0M [00:07<00:02, 7.15MB/s]\n",
+ " 76%|#######5 | 50.0M/66.0M [00:07<00:02, 7.24MB/s]\n",
+ " 77%|#######7 | 51.0M/66.0M [00:07<00:02, 7.30MB/s]\n",
+ " 79%|#######8 | 52.0M/66.0M [00:07<00:01, 7.60MB/s]\n",
+ " 82%|########1 | 54.0M/66.0M [00:08<00:01, 8.92MB/s]\n",
+ " 83%|########3 | 55.0M/66.0M [00:08<00:01, 9.07MB/s]\n",
+ " 86%|########6 | 57.0M/66.0M [00:08<00:00, 9.55MB/s]\n",
+ " 88%|########7 | 58.0M/66.0M [00:08<00:00, 9.56MB/s]\n",
+ " 89%|########9 | 59.0M/66.0M [00:08<00:00, 9.57MB/s]\n",
+ " 92%|#########2| 61.0M/66.0M [00:08<00:00, 9.58MB/s]\n",
+ " 94%|#########4| 62.0M/66.0M [00:08<00:00, 8.99MB/s]\n",
+ " 96%|#########5| 63.0M/66.0M [00:09<00:00, 8.56MB/s]\n",
+ " 97%|#########7| 64.0M/66.0M [00:09<00:00, 8.24MB/s]\n",
+ " 99%|#########8| 65.0M/66.0M [00:09<00:00, 8.28MB/s]\n",
+ "100%|##########| 66.0M/66.0M [00:09<00:00, 7.31MB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "!kaggle datasets download -d mlg-ulb/creditcardfraud"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5f7955e2",
+ "metadata": {},
+ "source": [
+ "С помощью метода value_counts с аргументом normalize=True убедитесь в том, что выборка\n",
+ "несбалансирована. Используя метод info, проверьте, все ли столбцы содержат числовые данные и нет\n",
+ "ли в них пропусков. Примените следующую настройку, чтобы можно было просматривать все столбцы\n",
+ "датафрейма: pd.options.display.max_columns = 100.\n",
+ "Просмотрите первые 10 строк датафрейма df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "3e959e4e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " V1 | \n",
+ " V2 | \n",
+ " V3 | \n",
+ " V4 | \n",
+ " V5 | \n",
+ " V6 | \n",
+ " V7 | \n",
+ " V8 | \n",
+ " V9 | \n",
+ " V10 | \n",
+ " V11 | \n",
+ " V12 | \n",
+ " V13 | \n",
+ " V14 | \n",
+ " V15 | \n",
+ " V16 | \n",
+ " V17 | \n",
+ " V18 | \n",
+ " V19 | \n",
+ " V20 | \n",
+ " V21 | \n",
+ " V22 | \n",
+ " V23 | \n",
+ " V24 | \n",
+ " V25 | \n",
+ " V26 | \n",
+ " V27 | \n",
+ " V28 | \n",
+ " Amount | \n",
+ " Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.0 | \n",
+ " -1.359807 | \n",
+ " -0.072781 | \n",
+ " 2.536347 | \n",
+ " 1.378155 | \n",
+ " -0.338321 | \n",
+ " 0.462388 | \n",
+ " 0.239599 | \n",
+ " 0.098698 | \n",
+ " 0.363787 | \n",
+ " 0.090794 | \n",
+ " -0.551600 | \n",
+ " -0.617801 | \n",
+ " -0.991390 | \n",
+ " -0.311169 | \n",
+ " 1.468177 | \n",
+ " -0.470401 | \n",
+ " 0.207971 | \n",
+ " 0.025791 | \n",
+ " 0.403993 | \n",
+ " 0.251412 | \n",
+ " -0.018307 | \n",
+ " 0.277838 | \n",
+ " -0.110474 | \n",
+ " 0.066928 | \n",
+ " 0.128539 | \n",
+ " -0.189115 | \n",
+ " 0.133558 | \n",
+ " -0.021053 | \n",
+ " 149.62 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.0 | \n",
+ " 1.191857 | \n",
+ " 0.266151 | \n",
+ " 0.166480 | \n",
+ " 0.448154 | \n",
+ " 0.060018 | \n",
+ " -0.082361 | \n",
+ " -0.078803 | \n",
+ " 0.085102 | \n",
+ " -0.255425 | \n",
+ " -0.166974 | \n",
+ " 1.612727 | \n",
+ " 1.065235 | \n",
+ " 0.489095 | \n",
+ " -0.143772 | \n",
+ " 0.635558 | \n",
+ " 0.463917 | \n",
+ " -0.114805 | \n",
+ " -0.183361 | \n",
+ " -0.145783 | \n",
+ " -0.069083 | \n",
+ " -0.225775 | \n",
+ " -0.638672 | \n",
+ " 0.101288 | \n",
+ " -0.339846 | \n",
+ " 0.167170 | \n",
+ " 0.125895 | \n",
+ " -0.008983 | \n",
+ " 0.014724 | \n",
+ " 2.69 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " -1.358354 | \n",
+ " -1.340163 | \n",
+ " 1.773209 | \n",
+ " 0.379780 | \n",
+ " -0.503198 | \n",
+ " 1.800499 | \n",
+ " 0.791461 | \n",
+ " 0.247676 | \n",
+ " -1.514654 | \n",
+ " 0.207643 | \n",
+ " 0.624501 | \n",
+ " 0.066084 | \n",
+ " 0.717293 | \n",
+ " -0.165946 | \n",
+ " 2.345865 | \n",
+ " -2.890083 | \n",
+ " 1.109969 | \n",
+ " -0.121359 | \n",
+ " -2.261857 | \n",
+ " 0.524980 | \n",
+ " 0.247998 | \n",
+ " 0.771679 | \n",
+ " 0.909412 | \n",
+ " -0.689281 | \n",
+ " -0.327642 | \n",
+ " -0.139097 | \n",
+ " -0.055353 | \n",
+ " -0.059752 | \n",
+ " 378.66 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1.0 | \n",
+ " -0.966272 | \n",
+ " -0.185226 | \n",
+ " 1.792993 | \n",
+ " -0.863291 | \n",
+ " -0.010309 | \n",
+ " 1.247203 | \n",
+ " 0.237609 | \n",
+ " 0.377436 | \n",
+ " -1.387024 | \n",
+ " -0.054952 | \n",
+ " -0.226487 | \n",
+ " 0.178228 | \n",
+ " 0.507757 | \n",
+ " -0.287924 | \n",
+ " -0.631418 | \n",
+ " -1.059647 | \n",
+ " -0.684093 | \n",
+ " 1.965775 | \n",
+ " -1.232622 | \n",
+ " -0.208038 | \n",
+ " -0.108300 | \n",
+ " 0.005274 | \n",
+ " -0.190321 | \n",
+ " -1.175575 | \n",
+ " 0.647376 | \n",
+ " -0.221929 | \n",
+ " 0.062723 | \n",
+ " 0.061458 | \n",
+ " 123.50 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2.0 | \n",
+ " -1.158233 | \n",
+ " 0.877737 | \n",
+ " 1.548718 | \n",
+ " 0.403034 | \n",
+ " -0.407193 | \n",
+ " 0.095921 | \n",
+ " 0.592941 | \n",
+ " -0.270533 | \n",
+ " 0.817739 | \n",
+ " 0.753074 | \n",
+ " -0.822843 | \n",
+ " 0.538196 | \n",
+ " 1.345852 | \n",
+ " -1.119670 | \n",
+ " 0.175121 | \n",
+ " -0.451449 | \n",
+ " -0.237033 | \n",
+ " -0.038195 | \n",
+ " 0.803487 | \n",
+ " 0.408542 | \n",
+ " -0.009431 | \n",
+ " 0.798278 | \n",
+ " -0.137458 | \n",
+ " 0.141267 | \n",
+ " -0.206010 | \n",
+ " 0.502292 | \n",
+ " 0.219422 | \n",
+ " 0.215153 | \n",
+ " 69.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2.0 | \n",
+ " -0.425966 | \n",
+ " 0.960523 | \n",
+ " 1.141109 | \n",
+ " -0.168252 | \n",
+ " 0.420987 | \n",
+ " -0.029728 | \n",
+ " 0.476201 | \n",
+ " 0.260314 | \n",
+ " -0.568671 | \n",
+ " -0.371407 | \n",
+ " 1.341262 | \n",
+ " 0.359894 | \n",
+ " -0.358091 | \n",
+ " -0.137134 | \n",
+ " 0.517617 | \n",
+ " 0.401726 | \n",
+ " -0.058133 | \n",
+ " 0.068653 | \n",
+ " -0.033194 | \n",
+ " 0.084968 | \n",
+ " -0.208254 | \n",
+ " -0.559825 | \n",
+ " -0.026398 | \n",
+ " -0.371427 | \n",
+ " -0.232794 | \n",
+ " 0.105915 | \n",
+ " 0.253844 | \n",
+ " 0.081080 | \n",
+ " 3.67 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 4.0 | \n",
+ " 1.229658 | \n",
+ " 0.141004 | \n",
+ " 0.045371 | \n",
+ " 1.202613 | \n",
+ " 0.191881 | \n",
+ " 0.272708 | \n",
+ " -0.005159 | \n",
+ " 0.081213 | \n",
+ " 0.464960 | \n",
+ " -0.099254 | \n",
+ " -1.416907 | \n",
+ " -0.153826 | \n",
+ " -0.751063 | \n",
+ " 0.167372 | \n",
+ " 0.050144 | \n",
+ " -0.443587 | \n",
+ " 0.002821 | \n",
+ " -0.611987 | \n",
+ " -0.045575 | \n",
+ " -0.219633 | \n",
+ " -0.167716 | \n",
+ " -0.270710 | \n",
+ " -0.154104 | \n",
+ " -0.780055 | \n",
+ " 0.750137 | \n",
+ " -0.257237 | \n",
+ " 0.034507 | \n",
+ " 0.005168 | \n",
+ " 4.99 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7.0 | \n",
+ " -0.644269 | \n",
+ " 1.417964 | \n",
+ " 1.074380 | \n",
+ " -0.492199 | \n",
+ " 0.948934 | \n",
+ " 0.428118 | \n",
+ " 1.120631 | \n",
+ " -3.807864 | \n",
+ " 0.615375 | \n",
+ " 1.249376 | \n",
+ " -0.619468 | \n",
+ " 0.291474 | \n",
+ " 1.757964 | \n",
+ " -1.323865 | \n",
+ " 0.686133 | \n",
+ " -0.076127 | \n",
+ " -1.222127 | \n",
+ " -0.358222 | \n",
+ " 0.324505 | \n",
+ " -0.156742 | \n",
+ " 1.943465 | \n",
+ " -1.015455 | \n",
+ " 0.057504 | \n",
+ " -0.649709 | \n",
+ " -0.415267 | \n",
+ " -0.051634 | \n",
+ " -1.206921 | \n",
+ " -1.085339 | \n",
+ " 40.80 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 7.0 | \n",
+ " -0.894286 | \n",
+ " 0.286157 | \n",
+ " -0.113192 | \n",
+ " -0.271526 | \n",
+ " 2.669599 | \n",
+ " 3.721818 | \n",
+ " 0.370145 | \n",
+ " 0.851084 | \n",
+ " -0.392048 | \n",
+ " -0.410430 | \n",
+ " -0.705117 | \n",
+ " -0.110452 | \n",
+ " -0.286254 | \n",
+ " 0.074355 | \n",
+ " -0.328783 | \n",
+ " -0.210077 | \n",
+ " -0.499768 | \n",
+ " 0.118765 | \n",
+ " 0.570328 | \n",
+ " 0.052736 | \n",
+ " -0.073425 | \n",
+ " -0.268092 | \n",
+ " -0.204233 | \n",
+ " 1.011592 | \n",
+ " 0.373205 | \n",
+ " -0.384157 | \n",
+ " 0.011747 | \n",
+ " 0.142404 | \n",
+ " 93.20 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9.0 | \n",
+ " -0.338262 | \n",
+ " 1.119593 | \n",
+ " 1.044367 | \n",
+ " -0.222187 | \n",
+ " 0.499361 | \n",
+ " -0.246761 | \n",
+ " 0.651583 | \n",
+ " 0.069539 | \n",
+ " -0.736727 | \n",
+ " -0.366846 | \n",
+ " 1.017614 | \n",
+ " 0.836390 | \n",
+ " 1.006844 | \n",
+ " -0.443523 | \n",
+ " 0.150219 | \n",
+ " 0.739453 | \n",
+ " -0.540980 | \n",
+ " 0.476677 | \n",
+ " 0.451773 | \n",
+ " 0.203711 | \n",
+ " -0.246914 | \n",
+ " -0.633753 | \n",
+ " -0.120794 | \n",
+ " -0.385050 | \n",
+ " -0.069733 | \n",
+ " 0.094199 | \n",
+ " 0.246219 | \n",
+ " 0.083076 | \n",
+ " 3.68 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time V1 V2 V3 V4 V5 V6 V7 \\\n",
+ "0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 \n",
+ "1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 \n",
+ "2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 \n",
+ "3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 \n",
+ "4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 \n",
+ "5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 \n",
+ "6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 \n",
+ "7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 \n",
+ "8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 \n",
+ "9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 \n",
+ "\n",
+ " V8 V9 V10 V11 V12 V13 V14 \\\n",
+ "0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 \n",
+ "1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 \n",
+ "2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 \n",
+ "3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 \n",
+ "4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 \n",
+ "5 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 \n",
+ "6 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 \n",
+ "7 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 \n",
+ "8 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 \n",
+ "9 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 \n",
+ "\n",
+ " V15 V16 V17 V18 V19 V20 V21 \\\n",
+ "0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 \n",
+ "1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 \n",
+ "2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 \n",
+ "3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 \n",
+ "4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 \n",
+ "5 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 \n",
+ "6 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 \n",
+ "7 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 \n",
+ "8 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 \n",
+ "9 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 \n",
+ "\n",
+ " V22 V23 V24 V25 V26 V27 V28 \\\n",
+ "0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 \n",
+ "1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 \n",
+ "2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 \n",
+ "3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 \n",
+ "4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 \n",
+ "5 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 \n",
+ "6 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 \n",
+ "7 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 \n",
+ "8 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 \n",
+ "9 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 \n",
+ "\n",
+ " Amount Class \n",
+ "0 149.62 0 \n",
+ "1 2.69 0 \n",
+ "2 378.66 0 \n",
+ "3 123.50 0 \n",
+ "4 69.99 0 \n",
+ "5 3.67 0 \n",
+ "6 4.99 0 \n",
+ "7 40.80 0 \n",
+ "8 93.20 0 \n",
+ "9 3.68 0 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from zipfile import ZipFile\n",
+ "\n",
+ "# Unpack the downloaded archive into the working directory; the context manager\n",
+ "# closes the zip file handle as soon as extraction finishes.\n",
+ "with ZipFile(\"creditcardfraud.zip\") as archive:\n",
+ "    archive.extractall(\".\")\n",
+ "\n",
+ "df = pd.read_csv(\"creditcard.csv\")\n",
+ "\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "9302f8fe",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 284807 entries, 0 to 284806\n",
+ "Data columns (total 31 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Time 284807 non-null float64\n",
+ " 1 V1 284807 non-null float64\n",
+ " 2 V2 284807 non-null float64\n",
+ " 3 V3 284807 non-null float64\n",
+ " 4 V4 284807 non-null float64\n",
+ " 5 V5 284807 non-null float64\n",
+ " 6 V6 284807 non-null float64\n",
+ " 7 V7 284807 non-null float64\n",
+ " 8 V8 284807 non-null float64\n",
+ " 9 V9 284807 non-null float64\n",
+ " 10 V10 284807 non-null float64\n",
+ " 11 V11 284807 non-null float64\n",
+ " 12 V12 284807 non-null float64\n",
+ " 13 V13 284807 non-null float64\n",
+ " 14 V14 284807 non-null float64\n",
+ " 15 V15 284807 non-null float64\n",
+ " 16 V16 284807 non-null float64\n",
+ " 17 V17 284807 non-null float64\n",
+ " 18 V18 284807 non-null float64\n",
+ " 19 V19 284807 non-null float64\n",
+ " 20 V20 284807 non-null float64\n",
+ " 21 V21 284807 non-null float64\n",
+ " 22 V22 284807 non-null float64\n",
+ " 23 V23 284807 non-null float64\n",
+ " 24 V24 284807 non-null float64\n",
+ " 25 V25 284807 non-null float64\n",
+ " 26 V26 284807 non-null float64\n",
+ " 27 V27 284807 non-null float64\n",
+ " 28 V28 284807 non-null float64\n",
+ " 29 Amount 284807 non-null float64\n",
+ " 30 Class 284807 non-null int64 \n",
+ "dtypes: float64(30), int64(1)\n",
+ "memory usage: 67.4 MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ba00248",
+ "metadata": {},
+ "source": [
+ "Создайте датафрейм X из датафрейма df, исключив столбец Class.\n",
+ "Создайте объект Series под названием y из столбца Class.\n",
+ "Разбейте X и y на тренировочный и тестовый наборы данных при помощи функции train_test_split,\n",
+ "используя аргументы: test_size=0.3, random_state=100, stratify=y.\n",
+ "У вас должны получиться объекты X_train, X_test, y_train и y_test.\n",
+ "Просмотрите информацию о их форме."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "59e1e34e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "выборка не сбалансированна, данных первого класса значительно меньше\n",
+ "0 0.998273\n",
+ "1 0.001727\n",
+ "Name: Class, dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "((199364, 30), (85443, 30), (199364,), (85443,))"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "target = \"Class\"\n",
+ "\n",
+ "# Features are every column except the label; the label column becomes the Series y.\n",
+ "X = df.drop(columns=[target])\n",
+ "y = df[target]\n",
+ "\n",
+ "# Relative class frequencies show how imbalanced the label is.\n",
+ "class_share = y.value_counts(normalize=True)\n",
+ "print(f\"выборка не сбалансированна, данных первого класса значительно меньше\\n{class_share}\")\n",
+ "\n",
+ "# 70/30 split, stratified on y.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ "    X, y, test_size=0.3, random_state=100, stratify=y\n",
+ ")\n",
+ "(X_train.shape, X_test.shape, y_train.shape, y_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fe7fae18",
+ "metadata": {},
+ "source": [
+ "Для поиска по сетке параметров задайте такие параметры:\n",
+ "parameters = [{'n_estimators': [10, 15],\n",
+ "'max_features': np.arange(3, 5),\n",
+ "'max_depth': np.arange(4, 7)}]\n",
+ "Создайте модель GridSearchCV со следующими аргументами:\n",
+ "estimator=RandomForestClassifier(random_state=100),\n",
+ "param_grid=parameters,\n",
+ "scoring='roc_auc',\n",
+ "cv=3."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "id": "c034c2a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "\n",
+ "# Search space: 2 x 2 x 3 = 12 hyper-parameter combinations.\n",
+ "parameters = dict(\n",
+ "    n_estimators=[10, 15],\n",
+ "    max_features=np.arange(3, 5),\n",
+ "    max_depth=np.arange(4, 7),\n",
+ ")\n",
+ "\n",
+ "# Every combination is scored by ROC AUC with 3-fold cross-validation.\n",
+ "clf = GridSearchCV(\n",
+ "    RandomForestClassifier(random_state=100),\n",
+ "    parameters,\n",
+ "    scoring='roc_auc',\n",
+ "    cv=3,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a4308ac3",
+ "metadata": {},
+ "source": [
+ "Обучите модель на тренировочном наборе данных (может занять несколько минут).\n",
+ "Просмотрите параметры лучшей модели с помощью атрибута best_params_.\n",
+ "Предскажите вероятности классов с помощью полученной модели и метода predict_proba.\n",
+ "Из полученного результата (массив Numpy) выберите столбец с индексом 1 (вероятность класса 1) и\n",
+ "запишите в массив y_pred_proba. Из модуля sklearn.metrics импортируйте метрику roc_auc_score.\n",
+ "Вычислите AUC на тестовых данных и сравните с результатом, полученным на тренировочных данных,\n",
+ "используя в качестве аргументов массивы y_test и y_pred_proba"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "id": "9b43d0b7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=100),\n",
+ " param_grid={'max_depth': array([4, 5, 6]),\n",
+ " 'max_features': array([3, 4]),\n",
+ " 'n_estimators': [10, 15]},\n",
+ " scoring='roc_auc')"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clf.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "id": "966c3d4d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clf.best_params_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "id": "405d63d0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "метрика AUC=0.9436 на тестовых данных меньше метрики AUC=0.9660 на обучающем наборе\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import roc_auc_score\n",
+ "\n",
+ "# Evaluate the estimator that GridSearchCV selected and refit on the whole training\n",
+ "# set (refit=True is the default). It carries random_state=100 from the search, so\n",
+ "# the test AUC is reproducible and stays in sync with clf.best_params_.\n",
+ "model = clf.best_estimator_\n",
+ "\n",
+ "# Column 1 of predict_proba holds the probability of class 1.\n",
+ "y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
+ "\n",
+ "auc = roc_auc_score(y_test, y_pred_proba)\n",
+ "\n",
+ "# clf.best_score_ is the mean cross-validated ROC AUC measured on the training data.\n",
+ "print(f\"метрика AUC={auc:.4f} на тестовых данных {'больше' if auc > clf.best_score_ else 'меньше'} \"\n",
+ "      f\"метрики AUC={clf.best_score_:.4f} на обучающем наборе\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5cb517be",
+ "metadata": {},
+ "source": [
+ "## *Дополнительные задания:\n",
+ "1). Загрузите датасет Wine из встроенных датасетов sklearn.datasets с помощью функции load_wine в\n",
+ "переменную data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "id": "1dbcea26",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import load_wine\n",
+ "\n",
+ "# The loader returns a dict-like Bunch; `data` is bound to its feature matrix.\n",
+ "wine = load_wine()\n",
+ "data = wine.data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "18dea80c",
+ "metadata": {},
+ "source": [
+ "2). Полученный датасет не является датафреймом. Это структура данных, имеющая ключи\n",
+ "аналогично словарю. Просмотрите тип данных этой структуры данных и создайте список data_keys,\n",
+ "содержащий ее ключи."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "d5f5e52a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# keys() on its own yields a dict_keys view; the task asks for a real list.\n",
+ "data_keys = list(wine.keys())\n",
+ "data_keys"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73bff5cf",
+ "metadata": {},
+ "source": [
+ "3). Просмотрите данные, описание и названия признаков в датасете. Описание нужно вывести в виде\n",
+ "привычного, аккуратно оформленного текста, без обозначений переноса строки, но с самими\n",
+ "переносами и т.д."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "39433029",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ".. _wine_dataset:\n",
+ "\n",
+ "Wine recognition dataset\n",
+ "------------------------\n",
+ "\n",
+ "**Data Set Characteristics:**\n",
+ "\n",
+ " :Number of Instances: 178 (50 in each of three classes)\n",
+ " :Number of Attributes: 13 numeric, predictive attributes and the class\n",
+ " :Attribute Information:\n",
+ " \t\t- Alcohol\n",
+ " \t\t- Malic acid\n",
+ " \t\t- Ash\n",
+ "\t\t- Alcalinity of ash \n",
+ " \t\t- Magnesium\n",
+ "\t\t- Total phenols\n",
+ " \t\t- Flavanoids\n",
+ " \t\t- Nonflavanoid phenols\n",
+ " \t\t- Proanthocyanins\n",
+ "\t\t- Color intensity\n",
+ " \t\t- Hue\n",
+ " \t\t- OD280/OD315 of diluted wines\n",
+ " \t\t- Proline\n",
+ "\n",
+ " - class:\n",
+ " - class_0\n",
+ " - class_1\n",
+ " - class_2\n",
+ "\t\t\n",
+ " :Summary Statistics:\n",
+ " \n",
+ " ============================= ==== ===== ======= =====\n",
+ " Min Max Mean SD\n",
+ " ============================= ==== ===== ======= =====\n",
+ " Alcohol: 11.0 14.8 13.0 0.8\n",
+ " Malic Acid: 0.74 5.80 2.34 1.12\n",
+ " Ash: 1.36 3.23 2.36 0.27\n",
+ " Alcalinity of Ash: 10.6 30.0 19.5 3.3\n",
+ " Magnesium: 70.0 162.0 99.7 14.3\n",
+ " Total Phenols: 0.98 3.88 2.29 0.63\n",
+ " Flavanoids: 0.34 5.08 2.03 1.00\n",
+ " Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n",
+ " Proanthocyanins: 0.41 3.58 1.59 0.57\n",
+ " Colour Intensity: 1.3 13.0 5.1 2.3\n",
+ " Hue: 0.48 1.71 0.96 0.23\n",
+ " OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n",
+ " Proline: 278 1680 746 315\n",
+ " ============================= ==== ===== ======= =====\n",
+ "\n",
+ " :Missing Attribute Values: None\n",
+ " :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n",
+ " :Creator: R.A. Fisher\n",
+ " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
+ " :Date: July, 1988\n",
+ "\n",
+ "This is a copy of UCI ML Wine recognition datasets.\n",
+ "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n",
+ "\n",
+ "The data is the results of a chemical analysis of wines grown in the same\n",
+ "region in Italy by three different cultivators. There are thirteen different\n",
+ "measurements taken for different constituents found in the three types of\n",
+ "wine.\n",
+ "\n",
+ "Original Owners: \n",
+ "\n",
+ "Forina, M. et al, PARVUS - \n",
+ "An Extendible Package for Data Exploration, Classification and Correlation. \n",
+ "Institute of Pharmaceutical and Food Analysis and Technologies,\n",
+ "Via Brigata Salerno, 16147 Genoa, Italy.\n",
+ "\n",
+ "Citation:\n",
+ "\n",
+ "Lichman, M. (2013). UCI Machine Learning Repository\n",
+ "[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\n",
+ "School of Information and Computer Science. \n",
+ "\n",
+ ".. topic:: References\n",
+ "\n",
+ " (1) S. Aeberhard, D. Coomans and O. de Vel, \n",
+ " Comparison of Classifiers in High Dimensional Settings, \n",
+ " Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n",
+ " Mathematics and Statistics, James Cook University of North Queensland. \n",
+ " (Also submitted to Technometrics). \n",
+ "\n",
+ " The data was used with many others for comparing various \n",
+ " classifiers. The classes are separable, though only RDA \n",
+ " has achieved 100% correct classification. \n",
+ " (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n",
+ " (All results using the leave-one-out technique) \n",
+ "\n",
+ " (2) S. Aeberhard, D. Coomans and O. de Vel, \n",
+ " \"THE CLASSIFICATION PERFORMANCE OF RDA\" \n",
+ " Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n",
+ " Mathematics and Statistics, James Cook University of North Queensland. \n",
+ " (Also submitted to Journal of Chemometrics).\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(wine[\"DESCR\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "772359a0",
+ "metadata": {},
+ "source": [
+ "4). Сколько классов содержит целевая переменная датасета? Выведите названия классов."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "183b0c76",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['class_0' 'class_1' 'class_2']\n",
+ "кол-во:3\n"
+ ]
+ }
+ ],
+ "source": [
+ "target_class = wine[\"target_names\"]\n",
+ "\n",
+ "print(f\"{target_class}\\nкол-во:{len(target_class)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41d6690b",
+ "metadata": {},
+ "source": [
+ "5). На основе данных датасета (они содержатся в двумерном массиве Numpy) и названий признаков\n",
+ "создайте датафрейм под названием X."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "15557fbe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 14.23 | \n",
+ " 1.71 | \n",
+ " 2.43 | \n",
+ " 15.6 | \n",
+ " 127.0 | \n",
+ " 2.80 | \n",
+ " 3.06 | \n",
+ " 0.28 | \n",
+ " 2.29 | \n",
+ " 5.64 | \n",
+ " 1.04 | \n",
+ " 3.92 | \n",
+ " 1065.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 13.20 | \n",
+ " 1.78 | \n",
+ " 2.14 | \n",
+ " 11.2 | \n",
+ " 100.0 | \n",
+ " 2.65 | \n",
+ " 2.76 | \n",
+ " 0.26 | \n",
+ " 1.28 | \n",
+ " 4.38 | \n",
+ " 1.05 | \n",
+ " 3.40 | \n",
+ " 1050.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 13.16 | \n",
+ " 2.36 | \n",
+ " 2.67 | \n",
+ " 18.6 | \n",
+ " 101.0 | \n",
+ " 2.80 | \n",
+ " 3.24 | \n",
+ " 0.30 | \n",
+ " 2.81 | \n",
+ " 5.68 | \n",
+ " 1.03 | \n",
+ " 3.17 | \n",
+ " 1185.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 14.37 | \n",
+ " 1.95 | \n",
+ " 2.50 | \n",
+ " 16.8 | \n",
+ " 113.0 | \n",
+ " 3.85 | \n",
+ " 3.49 | \n",
+ " 0.24 | \n",
+ " 2.18 | \n",
+ " 7.80 | \n",
+ " 0.86 | \n",
+ " 3.45 | \n",
+ " 1480.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 13.24 | \n",
+ " 2.59 | \n",
+ " 2.87 | \n",
+ " 21.0 | \n",
+ " 118.0 | \n",
+ " 2.80 | \n",
+ " 2.69 | \n",
+ " 0.39 | \n",
+ " 1.82 | \n",
+ " 4.32 | \n",
+ " 1.04 | \n",
+ " 2.93 | \n",
+ " 735.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline \n",
+ "0 3.92 1065.0 \n",
+ "1 3.40 1050.0 \n",
+ "2 3.17 1185.0 \n",
+ "3 3.45 1480.0 \n",
+ "4 2.93 735.0 "
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_names = wine[\"feature_names\"]\n",
+ "\n",
+ "X = pd.DataFrame(data, columns=feature_names)\n",
+ "\n",
+ "X.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9ef2d4b",
+ "metadata": {},
+ "source": [
+ "6). Выясните размер датафрейма X и установите, имеются ли в нем пропущенные значения."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "9f2ee54a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 178 entries, 0 to 177\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 alcohol 178 non-null float64\n",
+ " 1 malic_acid 178 non-null float64\n",
+ " 2 ash 178 non-null float64\n",
+ " 3 alcalinity_of_ash 178 non-null float64\n",
+ " 4 magnesium 178 non-null float64\n",
+ " 5 total_phenols 178 non-null float64\n",
+ " 6 flavanoids 178 non-null float64\n",
+ " 7 nonflavanoid_phenols 178 non-null float64\n",
+ " 8 proanthocyanins 178 non-null float64\n",
+ " 9 color_intensity 178 non-null float64\n",
+ " 10 hue 178 non-null float64\n",
+ " 11 od280/od315_of_diluted_wines 178 non-null float64\n",
+ " 12 proline 178 non-null float64\n",
+ "dtypes: float64(13)\n",
+ "memory usage: 18.2 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "X.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "90a743fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "размер = (178, 13)\n",
+ "кол-во пустых значений\n",
+ "alcohol 0\n",
+ "malic_acid 0\n",
+ "ash 0\n",
+ "alcalinity_of_ash 0\n",
+ "magnesium 0\n",
+ "total_phenols 0\n",
+ "flavanoids 0\n",
+ "nonflavanoid_phenols 0\n",
+ "proanthocyanins 0\n",
+ "color_intensity 0\n",
+ "hue 0\n",
+ "od280/od315_of_diluted_wines 0\n",
+ "proline 0\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"размер = {X.shape}\\nкол-во пустых значений\\n{X.isnull().sum(axis=0)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cf169b47",
+ "metadata": {},
+ "source": [
+ "7). Добавьте в датафрейм поле с классами вин в виде чисел, имеющих тип данных numpy.int64.\n",
+ "Название поля - 'target'."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "id": "3883dbe8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X[\"target\"] = wine[\"target\"].astype(\"int64\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5918c4e",
+ "metadata": {},
+ "source": [
+ "8). Постройте матрицу корреляций для всех полей X. Дайте полученному датафрейму название\n",
+ "X_corr."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "id": "d4c9b5e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_corr = X.corr()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43fd36d4",
+ "metadata": {},
+ "source": [
+ "9). Создайте список high_corr из признаков, корреляция которых с полем target по абсолютному\n",
+ "значению превышает 0.5 (причем, само поле target не должно входить в этот список)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "id": "a788a848",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['alcalinity_of_ash',\n",
+ " 'total_phenols',\n",
+ " 'flavanoids',\n",
+ " 'hue',\n",
+ " 'od280/od315_of_diluted_wines',\n",
+ " 'proline']"
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Features whose absolute correlation with the target exceeds 0.5.\n",
+ "# The target's own entry is dropped first so it can never appear in the list.\n",
+ "high_corr = (\n",
+ "    X_corr[\"target\"]\n",
+ "    .drop(\"target\")\n",
+ "    .abs()\n",
+ "    .loc[lambda corr: corr > 0.5]\n",
+ "    .index\n",
+ "    .tolist()\n",
+ ")\n",
+ "high_corr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "beab1b7d",
+ "metadata": {},
+ "source": [
+ "10). Удалите из датафрейма X поле с целевой переменной. Для всех признаков, названия которых\n",
+ "содержатся в списке high_corr, вычислите квадрат их значений и добавьте в датафрейм X\n",
+ "соответствующие поля с суффиксом '_2', добавленным к первоначальному названию признака.\n",
+ "Итоговый датафрейм должен содержать все поля, которые были в нем изначально, а также поля с\n",
+ "признаками из списка high_corr, возведенными в квадрат. Выведите описание полей датафрейма X с\n",
+ "помощью метода describe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "id": "426afeb5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the target column without mutating X in place. errors=\"ignore\" keeps the\n",
+ "# cell re-runnable: a second run no longer fails once the column is already gone.\n",
+ "X = X.drop(columns=\"target\", errors=\"ignore\")\n",
+ "\n",
+ "# Add a squared copy of every feature listed in high_corr, named with a \"_2\" suffix.\n",
+ "for item in high_corr:\n",
+ "    X[f\"{item}_2\"] = X[item] ** 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "id": "382274e0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alcohol | \n",
+ " malic_acid | \n",
+ " ash | \n",
+ " alcalinity_of_ash | \n",
+ " magnesium | \n",
+ " total_phenols | \n",
+ " flavanoids | \n",
+ " nonflavanoid_phenols | \n",
+ " proanthocyanins | \n",
+ " color_intensity | \n",
+ " hue | \n",
+ " od280/od315_of_diluted_wines | \n",
+ " proline | \n",
+ " alcalinity_of_ash_2 | \n",
+ " total_phenols_2 | \n",
+ " flavanoids_2 | \n",
+ " hue_2 | \n",
+ " od280/od315_of_diluted_wines_2 | \n",
+ " proline_2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 178.000000 | \n",
+ " 1.780000e+02 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 13.000618 | \n",
+ " 2.336348 | \n",
+ " 2.366517 | \n",
+ " 19.494944 | \n",
+ " 99.741573 | \n",
+ " 2.295112 | \n",
+ " 2.029270 | \n",
+ " 0.361854 | \n",
+ " 1.590899 | \n",
+ " 5.058090 | \n",
+ " 0.957449 | \n",
+ " 2.611685 | \n",
+ " 746.893258 | \n",
+ " 391.142865 | \n",
+ " 5.657030 | \n",
+ " 5.110049 | \n",
+ " 0.968661 | \n",
+ " 7.322155 | \n",
+ " 6.564591e+05 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 0.811827 | \n",
+ " 1.117146 | \n",
+ " 0.274344 | \n",
+ " 3.339564 | \n",
+ " 14.282484 | \n",
+ " 0.625851 | \n",
+ " 0.998859 | \n",
+ " 0.124453 | \n",
+ " 0.572359 | \n",
+ " 2.318286 | \n",
+ " 0.228572 | \n",
+ " 0.709990 | \n",
+ " 314.907474 | \n",
+ " 133.671775 | \n",
+ " 2.936294 | \n",
+ " 4.211441 | \n",
+ " 0.443798 | \n",
+ " 3.584316 | \n",
+ " 5.558591e+05 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 11.030000 | \n",
+ " 0.740000 | \n",
+ " 1.360000 | \n",
+ " 10.600000 | \n",
+ " 70.000000 | \n",
+ " 0.980000 | \n",
+ " 0.340000 | \n",
+ " 0.130000 | \n",
+ " 0.410000 | \n",
+ " 1.280000 | \n",
+ " 0.480000 | \n",
+ " 1.270000 | \n",
+ " 278.000000 | \n",
+ " 112.360000 | \n",
+ " 0.960400 | \n",
+ " 0.115600 | \n",
+ " 0.230400 | \n",
+ " 1.612900 | \n",
+ " 7.728400e+04 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 12.362500 | \n",
+ " 1.602500 | \n",
+ " 2.210000 | \n",
+ " 17.200000 | \n",
+ " 88.000000 | \n",
+ " 1.742500 | \n",
+ " 1.205000 | \n",
+ " 0.270000 | \n",
+ " 1.250000 | \n",
+ " 3.220000 | \n",
+ " 0.782500 | \n",
+ " 1.937500 | \n",
+ " 500.500000 | \n",
+ " 295.840000 | \n",
+ " 3.036325 | \n",
+ " 1.452100 | \n",
+ " 0.612325 | \n",
+ " 3.754075 | \n",
+ " 2.505010e+05 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 13.050000 | \n",
+ " 1.865000 | \n",
+ " 2.360000 | \n",
+ " 19.500000 | \n",
+ " 98.000000 | \n",
+ " 2.355000 | \n",
+ " 2.135000 | \n",
+ " 0.340000 | \n",
+ " 1.555000 | \n",
+ " 4.690000 | \n",
+ " 0.965000 | \n",
+ " 2.780000 | \n",
+ " 673.500000 | \n",
+ " 380.250000 | \n",
+ " 5.546050 | \n",
+ " 4.558250 | \n",
+ " 0.931250 | \n",
+ " 7.728400 | \n",
+ " 4.536045e+05 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 13.677500 | \n",
+ " 3.082500 | \n",
+ " 2.557500 | \n",
+ " 21.500000 | \n",
+ " 107.000000 | \n",
+ " 2.800000 | \n",
+ " 2.875000 | \n",
+ " 0.437500 | \n",
+ " 1.950000 | \n",
+ " 6.200000 | \n",
+ " 1.120000 | \n",
+ " 3.170000 | \n",
+ " 985.000000 | \n",
+ " 462.250000 | \n",
+ " 7.840000 | \n",
+ " 8.265700 | \n",
+ " 1.254400 | \n",
+ " 10.048900 | \n",
+ " 9.702250e+05 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 14.830000 | \n",
+ " 5.800000 | \n",
+ " 3.230000 | \n",
+ " 30.000000 | \n",
+ " 162.000000 | \n",
+ " 3.880000 | \n",
+ " 5.080000 | \n",
+ " 0.660000 | \n",
+ " 3.580000 | \n",
+ " 13.000000 | \n",
+ " 1.710000 | \n",
+ " 4.000000 | \n",
+ " 1680.000000 | \n",
+ " 900.000000 | \n",
+ " 15.054400 | \n",
+ " 25.806400 | \n",
+ " 2.924100 | \n",
+ " 16.000000 | \n",
+ " 2.822400e+06 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 13.000618 2.336348 2.366517 19.494944 99.741573 \n",
+ "std 0.811827 1.117146 0.274344 3.339564 14.282484 \n",
+ "min 11.030000 0.740000 1.360000 10.600000 70.000000 \n",
+ "25% 12.362500 1.602500 2.210000 17.200000 88.000000 \n",
+ "50% 13.050000 1.865000 2.360000 19.500000 98.000000 \n",
+ "75% 13.677500 3.082500 2.557500 21.500000 107.000000 \n",
+ "max 14.830000 5.800000 3.230000 30.000000 162.000000 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 2.295112 2.029270 0.361854 1.590899 \n",
+ "std 0.625851 0.998859 0.124453 0.572359 \n",
+ "min 0.980000 0.340000 0.130000 0.410000 \n",
+ "25% 1.742500 1.205000 0.270000 1.250000 \n",
+ "50% 2.355000 2.135000 0.340000 1.555000 \n",
+ "75% 2.800000 2.875000 0.437500 1.950000 \n",
+ "max 3.880000 5.080000 0.660000 3.580000 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 5.058090 0.957449 2.611685 746.893258 \n",
+ "std 2.318286 0.228572 0.709990 314.907474 \n",
+ "min 1.280000 0.480000 1.270000 278.000000 \n",
+ "25% 3.220000 0.782500 1.937500 500.500000 \n",
+ "50% 4.690000 0.965000 2.780000 673.500000 \n",
+ "75% 6.200000 1.120000 3.170000 985.000000 \n",
+ "max 13.000000 1.710000 4.000000 1680.000000 \n",
+ "\n",
+ " alcalinity_of_ash_2 total_phenols_2 flavanoids_2 hue_2 \\\n",
+ "count 178.000000 178.000000 178.000000 178.000000 \n",
+ "mean 391.142865 5.657030 5.110049 0.968661 \n",
+ "std 133.671775 2.936294 4.211441 0.443798 \n",
+ "min 112.360000 0.960400 0.115600 0.230400 \n",
+ "25% 295.840000 3.036325 1.452100 0.612325 \n",
+ "50% 380.250000 5.546050 4.558250 0.931250 \n",
+ "75% 462.250000 7.840000 8.265700 1.254400 \n",
+ "max 900.000000 15.054400 25.806400 2.924100 \n",
+ "\n",
+ " od280/od315_of_diluted_wines_2 proline_2 \n",
+ "count 178.000000 1.780000e+02 \n",
+ "mean 7.322155 6.564591e+05 \n",
+ "std 3.584316 5.558591e+05 \n",
+ "min 1.612900 7.728400e+04 \n",
+ "25% 3.754075 2.505010e+05 \n",
+ "50% 7.728400 4.536045e+05 \n",
+ "75% 10.048900 9.702250e+05 \n",
+ "max 16.000000 2.822400e+06 "
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1245b68",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}