-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathfeatures_selection.py
More file actions
91 lines (70 loc) · 2.93 KB
/
features_selection.py
File metadata and controls
91 lines (70 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
'''
Select features from the already classified dataset
to be used to train our model, as well as showing
which features affect our model training the most
'''
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
def select_features_extra_trees(x, y):
    """Rank features by importance with an ExtraTreesClassifier.

    Fits a 1000-tree ensemble on (x, y), prints every feature in
    descending order of importance, then shows a bar chart of the
    importances with the per-tree standard deviation as error bars.
    """
    model = ExtraTreesClassifier(n_estimators=1000)
    model.fit(x, y)
    importances = model.feature_importances_
    # Feature indices sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    # How much each feature's importance varies across individual trees.
    std = np.std([tree.feature_importances_
                  for tree in model.estimators_], axis=0)
    print("\nFeature ranking (ordered DESC) using extra trees classifier:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))
    positions = range(x.shape[1])
    plt.figure()
    plt.title("Features importance due to extra trees classifier")
    plt.bar(positions, importances[indices],
            color="g", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, x.shape[1]])
    plt.show()
def select_features_random_forest(x, y):
    """Rank features by importance with a RandomForestClassifier.

    Fits a 1000-tree forest (sqrt(n_features) candidates per split) on
    (x, y), prints every feature in descending order of importance, then
    shows a bar chart with the per-tree standard deviation as error bars.
    """
    forest = RandomForestClassifier(
        max_features='sqrt', n_estimators=1000)
    forest.fit(x, y)
    importances = forest.feature_importances_
    # Feature indices sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    # How much each feature's importance varies across individual trees.
    std = np.std([tree.feature_importances_
                  for tree in forest.estimators_], axis=0)
    print("\nFeature ranking (ordered DESC) using random forest classifier:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))
    positions = range(x.shape[1])
    plt.figure()
    plt.title("Features importance due to random forest classifier")
    plt.bar(positions, importances[indices],
            color="b", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, x.shape[1]])
    plt.show()
def select_features_recursive_feature_elimination(x, y):
    """Select the top 4 features via recursive feature elimination (RFE).

    Wraps a logistic-regression estimator in sklearn's RFE, fits it on
    (x, y), and prints how many features were kept, the boolean support
    mask, and the elimination ranking (1 = selected).
    """
    model = LogisticRegression(solver='lbfgs')
    # Pass n_features_to_select by keyword: the positional form
    # RFE(model, 4) was deprecated and later removed in scikit-learn,
    # where it now raises a TypeError.
    rfe = RFE(model, n_features_to_select=4)
    fit = rfe.fit(x, y)
    # Fixed duplicated word ("the the") in the original message.
    print('\nRFE chose the top 4 features: ')
    print('Numbers Features: ' + str(fit.n_features_))
    print('Selected Features: ' + str(fit.support_))
    print('Feature Ranking: ' + str(fit.ranking_))
def select_features_k_best(x, y):
    """Score features with the chi-squared test and keep the best 4.

    Fits sklearn's SelectKBest(chi2) on (x, y), prints the per-feature
    scores, then prints the first five rows of the reduced matrix that
    contains only the 4 highest-scoring features.
    """
    selector = SelectKBest(score_func=chi2, k=4)
    fitted = selector.fit(x, y)
    # Keep the printed scores readable.
    np.set_printoptions(precision=3)
    print('\nscores for each attribute and the 4 attributes chosen: ')
    print(fitted.scores_)
    reduced = fitted.transform(x)
    # Preview of the selected columns.
    print(reduced[0:5, :])