-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsolution4.py
More file actions
109 lines (92 loc) · 3.57 KB
/
solution4.py
File metadata and controls
109 lines (92 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import numpy as np
from csv import reader, writer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import six
# Decide read/write mode based on python version: Python 3 accepts the
# explicit text-mode suffix 't'; plain 'r'/'w' is used on Python 2.
read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')
# Set path to your consolidated files
# NOTE(review): hard-coded absolute user path — must be edited per machine.
path = '/Users/chrysovalantis/Documents/UCY/EPL451/Project'
os.chdir(path)  # all file names below resolve relative to this directory
# File names
ftrain = 'train_consolidation.txt'  # consolidated train-set features
ftest = 'test_consolidation.txt'  # consolidated test-set features
flabel = 'trainLabels.csv'  # sample id -> class label for the train set
fsubmission = 'submission.csv'  # output: averaged class probabilities
print('loading started')
# Labels live in a separate CSV keyed by sample id, because rows in the
# consolidated feature files are not sorted in the same order.
labels = {}
# Fix: use read_mode here for consistency with the other file reads in
# this script (the original opened flabel with the default mode).
with open(flabel, read_mode) as f:
    next(f)  # Ignoring header
    for row in reader(f):
        labels[row[0]] = int(row[1])  # id -> integer class label
print('labels loaded')
# Dimensions for train set: 256 two-byte-code counts, the question-mark
# count, plus one trailing column holding the class label.
ntrain = 10868
nfeature = 16 ** 2 + 1 + 1  # For two_byte_codes, no_que_marks, label
train = np.zeros((ntrain, nfeature), dtype=int)
# Fix: use read_mode for consistency with the other file reads.
with open(ftrain, read_mode) as f:
    next(f)  # Ignoring header
    for t, row in enumerate(reader(f)):
        # list(map(...)) behaves identically on Python 2 and 3, so the
        # six.PY2 branch of the original was redundant and is removed.
        # NOTE(review): values are parsed as float but stored into an int
        # array, so any fractional part truncates — presumably the
        # features are integer counts; confirm against the feature files.
        train[t, :-1] = list(map(float, row[1:]))
        train[t, -1] = labels[row[0]]  # last column holds the label
        if (t + 1) % 1000 == 0:
            print((t + 1) * 100.0 / ntrain, '% of records loaded')
print('training set loaded')
# Labels are now folded into the train matrix; free the dict.
del labels
# Parameters for Randomforest — shared by all four ensembles so the two
# pipelines differ only in estimator type.
random_state = 5342
n_jobs = 8
verbose = 2
# Two stacked pipelines: clf1/clf3 select features, clf2/clf4 are
# retrained on the reduced feature set and do the actual prediction.
clf1 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf2 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf3 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf4 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
# Start training
print('training started')
# Fit the selector forests on all features; the last train column is the label.
clf1.fit(train[:, :-1], train[:, -1])
# NOTE(review): estimator.transform for feature selection was deprecated in
# scikit-learn 0.17 and removed in 0.19 — on modern sklearn these calls
# require sklearn.feature_selection.SelectFromModel instead. Confirm the
# pinned sklearn version before running.
X_new1 = clf1.transform(train[:, :-1])
X_new2 = clf3.fit_transform(train[:, :-1], train[:, -1])
# print('importances', clf1.feature_importances_)
# Retrain the second-stage forests on the importance-selected features only.
clf2.fit(X_new1, train[:, -1])
clf4.fit(X_new2, train[:, -1])
print('training completed')
print('n_components = ', len(X_new1[0]), len(X_new2[0]))
# We don't need training set now
del train
# Dimensions for test set (the original comment said "train set" — a
# copy-paste slip): same features as train but no label column.
ntest = 10873
nfeature = 16 ** 2 + 1  # For two_byte_codes, no_que_marks
test = np.zeros((ntest, nfeature), dtype=int)
Ids = []  # Required test set ids, kept in file order for the submission
with open(ftest, read_mode) as f:
    next(f)  # Ignoring header
    for t, row in enumerate(reader(f)):
        # list(map(...)) behaves identically on Python 2 and 3, so the
        # six.PY2 branch of the original was redundant and is removed.
        test[t, :] = list(map(float, row[1:]))
        Ids.append(row[0])
        if (t + 1) % 1000 == 0:
            print(t + 1, 'records loaded')
print('test set loaded')
# Project the test set onto the same features selected during training.
Y_new1 = clf1.transform(test)
Y_new2 = clf3.transform(test)
# Predict class probabilities for the whole test set with both pipelines.
y_pred1 = clf2.predict_proba(Y_new1)
y_pred2 = clf4.predict_proba(Y_new2)
# predict_proba returns ndarrays of identical shape, so the ensemble
# average is a single vectorized expression — this replaces the original
# element-by-element Python double loop with one C-level NumPy operation.
y_pred = (np.asarray(y_pred1) + np.asarray(y_pred2)) / 2.0
# Dump the averaged class probabilities as the submission CSV:
# one row per test id, nine probability columns.
with open(fsubmission, write_mode) as out:
    csv_writer = writer(out)
    # Header row: Id followed by one probability column per class (1..9)
    csv_writer.writerow(['Id'] + ['Prediction' + str(i) for i in range(1, 10)])
    for count, (sample_id, probs) in enumerate(zip(Ids, y_pred.tolist()), 1):
        csv_writer.writerow([sample_id] + probs)
        if count % 1000 == 0:
            print(count, 'prediction written')