# Identify which columns are categorical: a column that contains text
# gets dtype ``object``, so select the columns whose dtype equals 'object'.
dtype_is_object = (X.dtypes == 'object')
object_cols = list(dtype_is_object[dtype_is_object].index)
如果一个列中有文本，那么这一列的dtype就会是object。
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so the original training/validation frames stay untouched.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Replace every categorical column with integer codes. The encoder is
# fitted on the training split only and then reused on validation, so
# both splits share the same category -> integer mapping.
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns.
# FIX: the ``sparse`` keyword was renamed to ``sparse_output`` in
# scikit-learn 1.2 and removed in 1.4, so ``sparse=False`` raises a
# TypeError on current versions; ``sparse_output=False`` keeps the same
# behavior (a dense array is returned).
# ``handle_unknown='ignore'`` makes categories that appear only in the
# validation split encode to all-zero rows instead of raising.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# transform() returns a plain array, which loses the row index; restore it
# so these frames stay aligned with the original data.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill in missing values with a constant (defaults to 0).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: impute with the most frequent value, then one-hot
# encode; unseen categories at transform time encode to all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its matching transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])
2.定义模型
from sklearn.ensemble import RandomForestRegressor

# A 100-tree random forest; fixing ``random_state`` makes runs reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=0)
3.整合上述工作到管道中
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so fit/predict apply both in one call.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# fit() preprocesses the training data and then trains the model.
my_pipeline.fit(X_train, y_train)

# predict() runs the same preprocessing on the validation data first.
preds = my_pipeline.predict(X_valid)

# Score the model with mean absolute error on the validation split.
score = mean_absolute_error(y_valid, preds)