objectif
Encoder des colonnes catégorielles et numériques.
code minimal
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np
# Toy dataset: column 0 is categorical, column 1 is numeric.
# dtype=object keeps each cell's Python type. Without it, np.array() on mixed
# str/float rows produces a string array, so 1.0 would be stored as "1.0".
X = np.array([["red", 1.0], ["blue", 2.0]], dtype=object)
y = [0, 1]

# One-hot encode column 0 and standardize column 1.
ct = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), [0]),
        ("num", StandardScaler(), [1]),
    ]
)

# Chain preprocessing and the classifier so both are fitted together.
clf = Pipeline(
    [
        ("prep", ct),
        ("lr", LogisticRegression(max_iter=1000)),
    ]
).fit(X, y)
print(hasattr(clf, "predict_proba"))
utilisation
from sklearn.compose import make_column_selector as selector
# Build a selector targeting object-dtype columns and confirm it is callable.
_object_selector = selector(dtype_include=object)
print(callable(_object_selector))
variante(s) utile(s)
from sklearn.preprocessing import OrdinalEncoder
# Configure the encoder so categories unseen during fit map to -1.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
enc.fit([["a"], ["b"]])

# "c" was not in the training data, so its code should be the sentinel.
_unseen_codes = enc.transform([["c"]])
print(_unseen_codes[0, 0] == -1)
notes
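- handle_unknown="ignore" évite les erreurs en production : avec OneHotEncoder, une catégorie inconnue est encodée par un vecteur de zéros au lieu de lever une exception.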