-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsolution4.py
More file actions
109 lines (92 loc) · 3.57 KB
/
solution4.py
File metadata and controls
109 lines (92 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import numpy as np
from csv import reader, writer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import six
# Decide read/write mode based on python version: Python 3 accepts the
# explicit text-mode suffix 't'; plain 'r'/'w' is used on Python 2.
read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')
# Set path to your consolidated files
# NOTE(review): hard-coded absolute user path — must be edited per machine.
path = '/Users/chrysovalantis/Documents/UCY/EPL451/Project'
os.chdir(path)  # all file names below resolve relative to this directory
# File names
ftrain = 'train_consolidation.txt'  # consolidated train-set features
ftest = 'test_consolidation.txt'  # consolidated test-set features
flabel = 'trainLabels.csv'  # sample id -> class label for the train set
fsubmission = 'submission.csv'  # output: averaged class probabilities
print('loading started')
# Labels live in a separate CSV keyed by sample id, because rows in the
# consolidated feature files are not sorted in the same order.
labels = {}
# Fix: use read_mode here for consistency with the other file reads in
# this script (the original opened flabel with the default mode).
with open(flabel, read_mode) as f:
    next(f)  # Ignoring header
    for row in reader(f):
        labels[row[0]] = int(row[1])  # id -> integer class label
print('labels loaded')
# Dimensions for train set: 256 two-byte-code counts, the question-mark
# count, plus one trailing column holding the class label.
ntrain = 10868
nfeature = 16 ** 2 + 1 + 1  # For two_byte_codes, no_que_marks, label
train = np.zeros((ntrain, nfeature), dtype=int)
# Fix: use read_mode for consistency with the other file reads.
with open(ftrain, read_mode) as f:
    next(f)  # Ignoring header
    for t, row in enumerate(reader(f)):
        # list(map(...)) behaves identically on Python 2 and 3, so the
        # six.PY2 branch of the original was redundant and is removed.
        # NOTE(review): values are parsed as float but stored into an int
        # array, so any fractional part truncates — presumably the
        # features are integer counts; confirm against the feature files.
        train[t, :-1] = list(map(float, row[1:]))
        train[t, -1] = labels[row[0]]  # last column holds the label
        if (t + 1) % 1000 == 0:
            print((t + 1) * 100.0 / ntrain, '% of records loaded')
print('training set loaded')
# Labels are now folded into the train matrix; free the dict.
del labels
# Parameters for Randomforest — shared by all four ensembles so the two
# pipelines differ only in estimator type.
random_state = 5342
n_jobs = 8
verbose = 2
# Two stacked pipelines: clf1/clf3 select features, clf2/clf4 are
# retrained on the reduced feature set and do the actual prediction.
clf1 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf2 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf3 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf4 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
# Start training
print('training started')
# Fit the selector forests on all features; the last train column is the label.
clf1.fit(train[:, :-1], train[:, -1])
# NOTE(review): estimator.transform for feature selection was deprecated in
# scikit-learn 0.17 and removed in 0.19 — on modern sklearn these calls
# require sklearn.feature_selection.SelectFromModel instead. Confirm the
# pinned sklearn version before running.
X_new1 = clf1.transform(train[:, :-1])
X_new2 = clf3.fit_transform(train[:, :-1], train[:, -1])
# print('importances', clf1.feature_importances_)
# Retrain the second-stage forests on the importance-selected features only.
clf2.fit(X_new1, train[:, -1])
clf4.fit(X_new2, train[:, -1])
print('training completed')
print('n_components = ', len(X_new1[0]), len(X_new2[0]))
# We don't need training set now
del train
# Dimensions for test set (the original comment said "train set" — a
# copy-paste slip): same features as train but no label column.
ntest = 10873
nfeature = 16 ** 2 + 1  # For two_byte_codes, no_que_marks
test = np.zeros((ntest, nfeature), dtype=int)
Ids = []  # Required test set ids, kept in file order for the submission
with open(ftest, read_mode) as f:
    next(f)  # Ignoring header
    for t, row in enumerate(reader(f)):
        # list(map(...)) behaves identically on Python 2 and 3, so the
        # six.PY2 branch of the original was redundant and is removed.
        test[t, :] = list(map(float, row[1:]))
        Ids.append(row[0])
        if (t + 1) % 1000 == 0:
            print(t + 1, 'records loaded')
print('test set loaded')
# Project the test set onto the same features selected during training.
Y_new1 = clf1.transform(test)
Y_new2 = clf3.transform(test)
# Predict class probabilities for the whole test set with both pipelines.
y_pred1 = clf2.predict_proba(Y_new1)
y_pred2 = clf4.predict_proba(Y_new2)
# predict_proba returns ndarrays of identical shape, so the ensemble
# average is a single vectorized expression — this replaces the original
# element-by-element Python double loop with one C-level NumPy operation.
y_pred = (np.asarray(y_pred1) + np.asarray(y_pred2)) / 2.0
# Dump the averaged class probabilities as the submission CSV:
# one row per test id, nine probability columns.
with open(fsubmission, write_mode) as out:
    csv_writer = writer(out)
    # Header row: Id followed by one probability column per class (1..9)
    csv_writer.writerow(['Id'] + ['Prediction' + str(i) for i in range(1, 10)])
    for count, (sample_id, probs) in enumerate(zip(Ids, y_pred.tolist()), 1):
        csv_writer.writerow([sample_id] + probs)
        if count % 1000 == 0:
            print(count, 'prediction written')