加载 sklearn 公共数据集

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer, load_iris, load_digits, load_diabetes, load_wine

try:
    # load_boston was removed in scikit-learn 1.2; importing it unconditionally
    # makes this whole cell fail with ImportError on current releases.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# Pick exactly one loader by (un)commenting. The trailing tuple is the shape
# of `frame` below (feature columns plus the added 'label' column).
# load_data = load_boston() # (506, 14) regression -- only available on scikit-learn < 1.2
# load_data = load_breast_cancer() # (569, 31) classification
load_data = load_iris() # (150, 5) classification
# load_data = load_diabetes() # (442, 11) regression
# load_data = load_wine() # (178, 14) classification

# Build a single DataFrame: one column per feature, plus the target as 'label'.
frame = pd.DataFrame(data=load_data.data, columns=load_data.feature_names)
frame['label'] = load_data.target
print(frame.shape)
frame.head()
(150, 5)
Out[4]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) label
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0

将 df2 按照 df1 的 id 顺序重新排序并返回

In [32]:
import pandas as pd
import numpy as np
import time
def order_same(df1, df2, left_id, right_id):
    """
    Return the rows of df2 reordered to follow the id order of df1.

    :param df1: DataFrame whose id column(s) define the target row order.
    :param df2: DataFrame to reorder; must have the same number of rows as df1.
    :param left_id: key column name (or list/tuple of names) in df1.
    :param right_id: key column name (or list/tuple of names) in df2.
    :return: DataFrame containing exactly df2's columns, in df1's id order,
        with a fresh 0..n-1 index. Because this is a left join, an id of df1
        that has no match in df2 produces a row of missing values.
    :raises AssertionError: if df1 and df2 have a different number of rows.
    """
    assert df1.shape[0] == df2.shape[0], 'no same shape'

    # Only df1's key column(s) are needed to impose the order. Merging the
    # whole of df1 would drag along unrelated columns and can fail when the
    # '_x' suffix collides with a column df1 already has (e.g. 'label_x').
    left_keys = list(left_id) if isinstance(left_id, (list, tuple)) else [left_id]

    # suffixes=('_x', '') keeps df2's column names untouched on any overlap,
    # so selecting df2.columns afterwards always picks df2's data.
    result = pd.merge(df1[left_keys], df2, how='left', suffixes=('_x', ''),
                      left_on=left_id, right_on=right_id)
    return result[df2.columns]
In [33]:
d1 = {"id":["a","b","c"],"score":[62.118782,13.003589,997.3572]}
d2 = {"id":["b","a","c"],"label":[0,1,2]}
df1 = pd.DataFrame(data=d1)
df2 = pd.DataFrame(data=d2)

df1.head()
Out[33]:
id score
0 a 62.118782
1 b 13.003589
2 c 997.357200
In [34]:
df2.head()
Out[34]:
id label
0 b 0
1 a 1
2 c 2
In [35]:
order_same(df1, df2, "id", "id")
Out[35]:
id label
0 a 1
1 b 0
2 c 2

数据预处理

In [36]:
def clean_dataset(df, threshold_del=0.8, fill_value=0):
    '''
    Clean missing values and infinite values out of a numeric DataFrame.

    Steps, in order:
      1. Drop every column whose fraction of missing values is greater than
         or equal to ``threshold_del`` (only columns with a fraction strictly
         below the threshold are kept).
      2. Cast the remaining columns to float32.
      3. Replace remaining missing values with ``fill_value``.
      4. Drop every row that still contains NaN, +inf or -inf.

    :param df: input pd.DataFrame; every kept column must be castable to float32.
    :param threshold_del: missing-value fraction at or above which a column is dropped.
    :param fill_value: value used to fill missing entries.
    :return: a new float32 DataFrame; the input frame is not modified. Surviving
        rows keep their original index labels.
    :raises AssertionError: if ``df`` is not a pd.DataFrame.
    '''
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"

    # Keep only columns whose missing-value ratio is below the threshold.
    df = df.loc[:, df.isnull().mean() < threshold_del]

    # Cast to float32. Values outside the float32 range overflow to +/-inf
    # here, which is why the infinity filter runs after this cast.
    df = df.astype(np.float32)

    # Fill NaN without inplace=True so no caller-visible frame is ever mutated.
    df = df.fillna(fill_value)

    # Drop rows that still hold NaN / +inf / -inf. `axis` must be passed by
    # keyword: the positional form `.any(1)` is rejected by pandas >= 2.0.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    df = df[indices_to_keep]
    return df