import pandas as pd
import numpy as np
from sklearn.datasets import load_boston,load_breast_cancer,load_iris,load_digits,load_diabetes,load_wine
# Choose one of the bundled scikit-learn datasets by uncommenting a single line.
# The trailing comment on each line gives the (rows, columns) shape of the
# resulting frame (feature columns plus the added 'label' column) and the task type.
# NOTE(review): load_boston appears to have been removed in scikit-learn 1.2, so
# the import at the top of the file may fail on newer versions -- confirm the
# installed version.
# load_data = load_boston() # (506, 14) regression
# load_data = load_breast_cancer() # (569, 31) classification
load_data = load_iris() # (150, 5) classification
# load_data = load_diabetes() # (442, 11) regression
# load_data = load_wine() # (178, 14) classification
# One column per feature, named after the dataset's feature_names.
frame = pd.DataFrame(data=load_data.data, columns=load_data.feature_names)
# Append the target values as an extra 'label' column.
frame['label'] = load_data.target
print(frame.shape)
# Bare expression: only displays something in an interactive session
# (presumably a notebook cell).
frame.head()
import pandas as pd
import numpy as np
import time
def order_same(df1, df2, left_id, right_id):
    """
    Return the rows of ``df2`` reordered to follow the id order of ``df1``.

    The reordering is a left merge of ``df1``'s key column(s) onto ``df2``,
    so the result has one row per row of ``df1`` (in ``df1``'s order) and a
    fresh 0..n-1 index. Ids of ``df1`` that are missing from ``df2`` yield
    NaN rows; duplicated ids in ``df2`` yield repeated rows.

    :param df1: DataFrame whose row order is the reference.
    :param df2: DataFrame to reorder; must have as many rows as ``df1``.
    :param left_id: key column name (or list of names) in ``df1``.
    :param right_id: key column name (or list of names) in ``df2``.
    :return: DataFrame with exactly the columns of ``df2``.
    :raises AssertionError: if the two frames differ in row count.
    """
    # Explicit raise instead of `assert` so the check survives `python -O`
    # while callers still see the same exception type and message.
    if df1.shape[0] != df2.shape[0]:
        raise AssertionError('no same shape')

    # Only the key column(s) of df1 are needed to impose its order. Dropping
    # the rest avoids copying unused data and avoids '_x' suffix collisions
    # when df1 already holds both 'c' and 'c_x' and df2 holds 'c'.
    left_keys = list(left_id) if pd.api.types.is_list_like(left_id) else [left_id]
    if all(key in df1.columns for key in left_keys):
        left = df1[left_keys]
    else:
        # Keys that are not plain columns (e.g. index level names): keep the
        # whole frame so pandas resolves them exactly as before.
        left = df1

    result = pd.merge(left, df2, how='left', suffixes=('_x', ''),
                      left_on=left_id, right_on=right_id)
    return result[df2.columns]
# Small demo of order_same: two frames holding the same ids in different order.
d1 = {"id":["a","b","c"],"score":[62.118782,13.003589,997.3572]}
d2 = {"id":["b","a","c"],"label":[0,1,2]}
df1 = pd.DataFrame(data=d1)
df2 = pd.DataFrame(data=d2)
# Bare expressions: they only display something in an interactive session
# (presumably notebook cells).
df1.head()
df2.head()
# Reorder df2 so its rows follow df1's id order (a, b, c).
order_same(df1, df2, "id", "id")
def clean_dataset(df, threshold_del=0.8, fill_value=0):
    """
    Clean missing values and infinite values out of a numeric DataFrame.

    Steps, in order:

    1. Drop every column whose fraction of missing values is greater than
       or equal to ``threshold_del`` (only columns strictly below it stay).
    2. Cast the remaining columns to ``float32``.
    3. Replace the remaining missing values with ``fill_value``.
    4. Drop every row that still contains NaN, ``inf`` or ``-inf``.

    The input frame is not modified; a new DataFrame is returned. Surviving
    rows keep their original index labels.

    :param df: input DataFrame; its columns must be castable to float32.
    :param threshold_del: missing-value ratio at or above which a column is
        removed (default 0.8).
    :param fill_value: replacement for the remaining missing values
        (default 0).
    :return: the cleaned float32 DataFrame.
    :raises AssertionError: if ``df`` is not a ``pd.DataFrame``.
    """
    # Explicit raise instead of `assert` so the check survives `python -O`
    # while callers still see the same exception type and message.
    if not isinstance(df, pd.DataFrame):
        raise AssertionError("df needs to be a pd.DataFrame")

    # Keep only the columns whose missing ratio is below the threshold.
    cleaned = df.loc[:, df.isnull().mean() < threshold_del]
    # The original note says the downstream model works in float32 and
    # errors out on larger values, hence the cast.
    cleaned = cleaned.astype(np.float32)
    # The model cannot handle NaN, so fill it (assignment, not inplace).
    cleaned = cleaned.fillna(fill_value)

    # The model cannot handle infinity either: drop the affected rows.
    # NaN is kept in the list so a NaN fill_value still gets filtered out.
    # `axis` must be passed by keyword: pandas 2.x rejects `.any(1)`.
    has_invalid = cleaned.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return cleaned[~has_invalid]