-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate.py
More file actions
66 lines (57 loc) · 1.95 KB
/
generate.py
File metadata and controls
66 lines (57 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import sys
import time
from collections import Counter
import pickle
# Example invocations (the argument is the dataset prefix; e.g. `rt` reads rt_train.txt):
#   python generate.py rt
#   python generate.py fisher
def _parse_review(line):
    """Parse one training line into ``(is_positive, counts)``.

    A line has the form ``LABEL word:count word:count ...``. A label that
    starts with ``1`` marks a positive review; any other label (e.g. ``-1``)
    marks a negative one.

    Returns:
        ``None`` for a blank line, otherwise a tuple of a bool and a dict
        mapping each word to its integer count on that line.

    Raises:
        ValueError: if a feature token has no ``:`` separator or its count
            is not an integer.
    """
    # split() with no argument tolerates repeated spaces, tabs and the
    # trailing newline, none of which should produce a feature token.
    tokens = line.split()
    if not tokens:
        return None

    label = tokens[0]
    counts = {}
    for token in tokens[1:]:
        # Split on the LAST colon so that words which themselves contain a
        # colon still parse as word + count.
        word, sep, count = token.rpartition(':')
        if not sep:
            raise ValueError('malformed feature %r (expected word:count)' % token)
        counts[word] = int(count)
    return label.startswith('1'), counts


def _dump_pickle(obj, path):
    """Pickle *obj* into the file at *path*, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def main():
    """Aggregate per-class word counts from a training file and pickle them.

    Reads ``<prefix>_train.txt`` where ``<prefix>`` is the single command-line
    argument, sums the word counts of positive and negative reviews
    separately, and writes four pickle files into the current directory:

    * ``pos_<prefix>_train_dict.pkl`` / ``neg_<prefix>_train_dict.pkl`` --
      dict of word -> total count for each class.
    * ``pos_<prefix>_train_count.pkl`` / ``neg_<prefix>_train_count.pkl`` --
      number of reviews seen for each class.

    Exits with a usage message (non-zero status) when the argument count is
    wrong. Blank lines in the training file are skipped.
    """
    if len(sys.argv) != 2:
        # sys.exit with a message writes it to stderr and exits non-zero,
        # and does not depend on the `site`-provided exit() helper.
        sys.exit('Usage: python %s data_folder' % (sys.argv[0]))
    data_folder = sys.argv[1]

    positive_count = 0
    negative_count = 0
    pos_counter = Counter()
    neg_counter = Counter()

    # read training data
    with open(data_folder + '_train.txt', 'r') as f:
        for line in f:
            parsed = _parse_review(line)
            if parsed is None:
                continue
            is_positive, counts = parsed
            # Counter.update adds counts in time proportional to this review,
            # whereas `+=` also strips non-positive entries from the entire
            # accumulated counter on every line.
            if is_positive:
                positive_count += 1
                pos_counter.update(counts)
            else:
                negative_count += 1
                neg_counter.update(counts)

    # Keep only strictly positive totals, matching what Counter addition
    # would have retained.
    positive = {word: total for word, total in pos_counter.items() if total > 0}
    negative = {word: total for word, total in neg_counter.items() if total > 0}

    _dump_pickle(positive, 'pos_' + data_folder + '_train_dict.pkl')
    _dump_pickle(negative, 'neg_' + data_folder + '_train_dict.pkl')
    _dump_pickle(positive_count, 'pos_' + data_folder + '_train_count.pkl')
    _dump_pickle(negative_count, 'neg_' + data_folder + '_train_count.pkl')
if __name__ == '__main__':
    # Run the script and report how long the whole run took (wall clock).
    began_at = time.time()
    main()
    elapsed = time.time() - began_at
    print("--- %s seconds ---" % elapsed)