我在 scikit-learn 的 20 Newsgroups(20 个新闻组)数据集上训练了一个用于二分类的随机森林分类器。我想通过网格搜索,并在训练数据上使用 3 折交叉验证来调整超参数。
这种方法有什么问题吗?
对于 max_depth 参数,网格搜索给出的最佳值是 500,这似乎太高了。有什么建议吗?
代码是:
from __future__ import print_function

import sklearn
import sklearn.ensemble
import sklearn.feature_extraction.text
import sklearn.metrics
from sklearn.datasets import fetch_20newsgroups

try:
    # scikit-learn >= 0.18 provides GridSearchCV in model_selection.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older releases only ship the legacy grid_search module.
    from sklearn.grid_search import GridSearchCV
# Tune a random forest on a two-class subset of 20 Newsgroups using a grid
# search with 3-fold cross-validation on the training split.

# Restrict the corpus to two newsgroups so the task is binary.
categories = ['sci.med', 'soc.religion.christian']

# Ask the loader to drop headers, footers and quoted replies from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'))

# NOTE(review): presumably display names aligned with the target indices --
# confirm the order against newsgroups_train.target_names.
class_names = ['medicine', 'christian']

# The vectorizer is fitted on the training split only; the test split is
# merely transformed with the already-fitted vocabulary.
# (The original assignment was broken across two lines, a syntax error.)
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)

rf = sklearn.ensemble.RandomForestClassifier(max_features='sqrt')

# 3 x 3 x 3 = 27 parameter combinations are evaluated.
param_grid = {
    "n_estimators": [10, 100, 1000],
    "max_depth": [5, 100, 500],
    "min_samples_leaf": [1, 20, 40],
}

# cv=3 is spelled out so the intended 3-fold cross-validation does not
# depend on the installed scikit-learn version's default.
CV_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
CV_rf.fit(train_vectors, newsgroups_train.target)
print(CV_rf.best_params_)