-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathfeatures_selection.py
More file actions
91 lines (70 loc) · 2.93 KB
/
features_selection.py
File metadata and controls
91 lines (70 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
'''
Select features from the already classified dataset
to be used to train our model, as well as showing
which features affect our model training the most
'''
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
def select_features_extra_trees(x, y):
    """Rank features by importance with an ExtraTreesClassifier.

    Fits a 1000-tree ensemble on (x, y), prints every feature in
    descending order of importance, then shows a bar chart of the
    importances with the per-tree standard deviation as error bars.
    """
    model = ExtraTreesClassifier(n_estimators=1000)
    model.fit(x, y)
    importances = model.feature_importances_
    # Feature indices sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    # How much each feature's importance varies across individual trees.
    std = np.std([tree.feature_importances_
                  for tree in model.estimators_], axis=0)
    print("\nFeature ranking (ordered DESC) using extra trees classifier:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))
    positions = range(x.shape[1])
    plt.figure()
    plt.title("Features importance due to extra trees classifier")
    plt.bar(positions, importances[indices],
            color="g", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, x.shape[1]])
    plt.show()
def select_features_random_forest(x, y):
    """Rank features by importance with a RandomForestClassifier.

    Fits a 1000-tree forest (sqrt(n_features) candidates per split) on
    (x, y), prints every feature in descending order of importance, then
    shows a bar chart with the per-tree standard deviation as error bars.
    """
    forest = RandomForestClassifier(
        max_features='sqrt', n_estimators=1000)
    forest.fit(x, y)
    importances = forest.feature_importances_
    # Feature indices sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    # How much each feature's importance varies across individual trees.
    std = np.std([tree.feature_importances_
                  for tree in forest.estimators_], axis=0)
    print("\nFeature ranking (ordered DESC) using random forest classifier:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))
    positions = range(x.shape[1])
    plt.figure()
    plt.title("Features importance due to random forest classifier")
    plt.bar(positions, importances[indices],
            color="b", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, x.shape[1]])
    plt.show()
def select_features_recursive_feature_elimination(x, y):
    """Select the top 4 features via recursive feature elimination (RFE).

    Wraps a logistic-regression estimator in sklearn's RFE, fits it on
    (x, y), and prints how many features were kept, the boolean support
    mask, and the elimination ranking (1 = selected).
    """
    model = LogisticRegression(solver='lbfgs')
    # Pass n_features_to_select by keyword: the positional form
    # RFE(model, 4) was deprecated and later removed in scikit-learn,
    # where it now raises a TypeError.
    rfe = RFE(model, n_features_to_select=4)
    fit = rfe.fit(x, y)
    # Fixed duplicated word ("the the") in the original message.
    print('\nRFE chose the top 4 features: ')
    print('Numbers Features: ' + str(fit.n_features_))
    print('Selected Features: ' + str(fit.support_))
    print('Feature Ranking: ' + str(fit.ranking_))
def select_features_k_best(x, y):
    """Score features with the chi-squared test and keep the best 4.

    Fits sklearn's SelectKBest(chi2) on (x, y), prints the per-feature
    scores, then prints the first five rows of the reduced matrix that
    contains only the 4 highest-scoring features.
    """
    selector = SelectKBest(score_func=chi2, k=4)
    fitted = selector.fit(x, y)
    # Keep the printed scores readable.
    np.set_printoptions(precision=3)
    print('\nscores for each attribute and the 4 attributes chosen: ')
    print(fitted.scores_)
    reduced = fitted.transform(x)
    # Preview of the selected columns.
    print(reduced[0:5, :])