{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 加载 sklearn 公共数据集"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(150, 5)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal length (cm) | \n",
" sepal width (cm) | \n",
" petal length (cm) | \n",
" petal width (cm) | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
"0 5.1 3.5 1.4 0.2 \n",
"1 4.9 3.0 1.4 0.2 \n",
"2 4.7 3.2 1.3 0.2 \n",
"3 4.6 3.1 1.5 0.2 \n",
"4 5.0 3.6 1.4 0.2 \n",
"\n",
" label \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.datasets import load_boston,load_breast_cancer,load_iris,load_digits,load_diabetes,load_wine\n",
"# load_data = load_boston() # (506, 14) regression\n",
"# load_data = load_breast_cancer() # (569, 31) classification\n",
"load_data = load_iris() # (150, 5) regression\n",
"# load_data = load_diabetes() # (442, 11) regression\n",
"# load_data = load_wine() # (178, 14) classification\n",
"frame = pd.DataFrame(data=load_data.data, columns=load_data.feature_names)\n",
"frame['label'] = load_data.target\n",
"print(frame.shape)\n",
"frame.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 返回 df2 按照 df1 的 id 排序"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import time\n",
"def order_same(df1, df2, left_id, right_id):\n",
" \"\"\"\n",
" 返回 df2 按照 df1 的 id 排序\n",
" :param data1:\n",
" :param data2:\n",
" :param left_id:\n",
" :param right_id:\n",
" :return:\n",
" \"\"\"\n",
" assert df1.shape[0] == df2.shape[0], 'no same shape'\n",
" result = pd.merge(df1, df2, how='left', suffixes=('_x', ''), left_on=left_id, right_on=right_id)\n",
" return result[df2.columns]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" a | \n",
" 62.118782 | \n",
"
\n",
" \n",
" 1 | \n",
" b | \n",
" 13.003589 | \n",
"
\n",
" \n",
" 2 | \n",
" c | \n",
" 997.357200 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id score\n",
"0 a 62.118782\n",
"1 b 13.003589\n",
"2 c 997.357200"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d1 = {\"id\":[\"a\",\"b\",\"c\"],\"score\":[62.118782,13.003589,997.3572]}\n",
"d2 = {\"id\":[\"b\",\"a\",\"c\"],\"label\":[0,1,2]}\n",
"df1 = pd.DataFrame(data=d1)\n",
"df2 = pd.DataFrame(data=d2)\n",
"\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" b | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" a | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" c | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id label\n",
"0 b 0\n",
"1 a 1\n",
"2 c 2"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" a | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" b | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" c | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id label\n",
"0 a 1\n",
"1 b 0\n",
"2 c 2"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"order_same(df1, df2, \"id\", \"id\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def clean_dataset(df, threshold_del=0.8, fill_value=0):\n",
" '''\n",
" 处理缺失值和无穷大异常值\n",
" 缺失值超过threshold_del(默认0.8)的列,直接删除列。\n",
" 其他的,使用fill_value(默认0)填补\n",
"\n",
" '''\n",
" assert isinstance(df, pd.DataFrame), \"df needs to be a pd.DataFrame\"\n",
"\n",
" # 通过观察原数据,对于缺失值达80%以上的字段,在这里直接删除\n",
" df = df.loc[:, df.isnull().mean() < threshold_del]\n",
"\n",
" # 模型默认为float32,过大会报错\n",
" df = df.astype(np.float32)\n",
" # 模型不能处理NAN值,用值填充\n",
" df.fillna(fill_value, inplace=True)\n",
"\n",
" '''\n",
" 处理无穷大异常值\n",
" '''\n",
" assert isinstance(df, pd.DataFrame), \"df needs to be a pd.DataFrame\"\n",
" # 模型不能处理infinity,删除所在行\n",
" indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)\n",
" df = df[indices_to_keep]\n",
" return df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}