热线电话:13121318867

登录
2021-03-04 阅读量: 8694
使用xgboost模型填补缺失值时报错

fit的时候报错了,报错原因为

ValueError: The label must consist of integer labels of form 0, 1, 2, ..., [num_class - 1].请问怎么解决?代码.docx

data.xlsx


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

import xgboost as xgb


df1 = pd.read_excel('d:/python/data/data.xlsx')


x_ = df1.drop(['Target'], axis=1)

y_ = df1['Target'].values


def get_kind(x: pd.Series, diff_limit: int = 10):

x = x.astype('str')

x = x.str.extract(r'(^(\-|)(?=.*\d)\d*(?:\.\d*)?$)')[0]

x.dropna(inplace=True)

if x.nunique() > diff_limit:

kind = 'numeric'

else:

kind = 'categorical'

return kind


class xgb_fill(BaseEstimator, TransformerMixin):


def __init__(self,

num_list: list = None,

cate_list: list = None,

diff_num: int = 8,

random_state: int = 0):

self.num_list = num_list

self.cate_list = cate_list

self.diff_num = diff_num

self.random_state = random_state

self.xgb_cla_dict = {}

self.xgb_reg_dict = {}


def fit(self, X, y=None):

from tqdm import tqdm

X = X.copy()

if self.num_list is None:

self.num_list = []

for col in X.columns:

kind = get_kind(x=X[col], diff_limit=self.diff_num)

if kind == 'numeric':

self.num_list.append(col)

if self.cate_list is None:

self.cate_list = []

for col in X.columns:

kind = get_kind(x=X[col], diff_limit=self.diff_num)

if kind == 'categorical':

self.cate_list.append(col)

for col in tqdm(self.cate_list):

file = X.copy()

if file[col].isnull().any():

df = pd.get_dummies(file, columns=[i for i in self.cate_list if i != col],

prefix=[i for i in self.cate_list if i != col],

dummy_na=True)

not_null = df.dropna(subset=[col])

x_ = not_null.drop([col], axis=1)

y_ = not_null[col]

xgb_cla = xgb.XGBClassifier(random_state=self.random_state,use_label_encoder=False)

xgb_cla.fit(x_, y_)

self.xgb_cla_dict[col] = xgb_cla


for col in tqdm(self.num_list):

file = X.copy()

if file[col].isnull().any():

df = pd.get_dummies(file, columns=self.cate_list, dummy_na=True, prefix=self.cate_list)

not_null = df.dropna(subset=[col])

x_ = not_null.drop([col], axis=1)

y_ = not_null[col]

xgb_reg = xgb.XGBRegressor(random_state=self.random_state, objective='reg:squarederror')

xgb_reg.fit(x_, y_)

self.xgb_reg_dict[col] = xgb_reg

print('fit xgb fill the Na success!')

return self


def transform(self, X):

X = X.copy()

from tqdm import tqdm

for col in tqdm(self.cate_list):

file = X.copy()

if file[col].isnull().any():

df = pd.get_dummies(file, columns=[i for i in self.cate_list if i != col],

prefix=[i for i in self.cate_list if i != col],

dummy_na=True)

not_null = df.dropna(subset=[col])

null = df.drop(not_null.index)

null[col] = self.xgb_cla_dict[col].predict(null.drop([col], axis=1))

X[col] = pd.concat([null, not_null], axis=0)[col]

else:

X[col] = file[col]


for col in tqdm(self.num_list):

file = X.copy()

if file[col].isnull().any():

df = pd.get_dummies(file, columns=self.cate_list, dummy_na=True, prefix=self.cate_list)

not_null = df.dropna(subset=[col])

null = df.drop(not_null.index)

null[col] = self.xgb_reg_dict[col].predict(null.drop([col], axis=1))

X[col] = pd.concat([null, not_null], axis=0)[col]

else:

X[col] = file[col]

print('transform xgb fill the NA success!')

return X


xgbf = xgb_fill()


x_ = xgbf.fit_transform(x_) 运行到这步就报错了

80.0000
9
关注作者
收藏
评论(9)

发表评论