Antecipando o início do curso "Machine Learning. Professional", publicamos a tradução de um artigo útil.
Também o convidamos a assistir à gravação do webinar aberto sobre o tema "Clustering".
Recursive Feature Elimination
, (recursive feature elimination), , , .
. . . , .
Sklearn
Scikit-learn sklearn.feature_selection.RFE
. :
estimator
— ,coef_
feature_importances_ attributes.
n_features_to_select
— . .
step
— , , , 0 1, , .
:
ranking_
— .
n_features_
— .
support_
— , , .
, , feature_importances_
coef_
. . 13 . .
import pandas as pd
df = pd.read_csv('heart.csv')
df.head()

x y.
X = df.drop(['target'],axis=1)
y = df['target']
:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0)
:
Pipeline
— -, .
RepeatedStratifiedKFold
— k- -.
cross_val_score
— -.
GradientBoostingClassifier
— , .
Numpy
— .
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
RFE , . 6:
rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=6)
, :
model = GradientBoostingClassifier()
Pipeline
. Pipeline
rfe
, .
RepeatedStratifiedKFold
10 5 . k- - , . RepeatedStratifiedKFold
k- - .
pipe = Pipeline([('Feature Selection', rfe), ('Model', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=36851234)
n_scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
np.mean(n_scores)
â .
pipe.fit(X_train, y_train)
support_
. Support
.
rfe.support_
array([ True, False, True, False, True, False, False, True, False,True, False, True, True])
.
pd.DataFrame(rfe.support_,index=X.columns,columns=['Rank'])

.
rf_df = pd.DataFrame(rfe.ranking_,index=X.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rf_df.head()

, , , . -. sklearn.feature_selection.RFECV
. :
estimator
— RFE.
min_features_to_select
— .
cv
— -.
:
n_features_
— , -.
support_
— , .
ranking_
— .
grid_scores_
— , -.
.
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=GradientBoostingClassifier())
cv. rfecv
.
pipeline = Pipeline([('Feature Selection', rfecv), ('Model', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=36851234)
n_scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
np.mean(n_scores)
.
pipeline.fit(X_train,y_train)
n_features_
.
print('Optimal number of features : %d' % rfecv.n_features_)
Optimal number of features : 7
support_
, .
rfecv.support_
rfecv_df = pd.DataFrame(rfecv.ranking_,index=X.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rfecv_df.head()
grid_scores_
, -.
import matplotlib.pyplot as plt
plt.figure(figsize=(12,6))
plt.xlabel('Number of features selected')
plt.ylabel('Cross validation score (nb of correct classifications)')
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

. . , , .