-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.py
More file actions
31 lines (22 loc) · 907 Bytes
/
process.py
File metadata and controls
31 lines (22 loc) · 907 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import sys
import scipy.io
# Build a bag-of-words dataset from a single debate transcript.
#
# Each line of the transcript is treated as one document. The script writes
# two files: the vocabulary (one token per line, in feature-index order) and
# the sparse document-term count matrix in Matrix Market format.

# Python 2 workaround: make UTF-8 the implicit str<->unicode codec so that
# printing non-ASCII transcript text does not raise UnicodeEncodeError.
# `reload` is not a builtin on Python 3, where the workaround is unnecessary,
# so the NameError is deliberately ignored there.
try:
    reload(sys)
    sys.setdefaultencoding('utf8')
except NameError:
    pass

TRANSCRIPT_PATH = '/Users/Vishal/Desktop/CS Classes/CS467/GameOfDebates/DebateAnalysis/transcripts/Republicans/rep_1-14-2016-wl.txt'
VOCAB_PATH = '/Users/Vishal/Desktop/CS Classes/CS467/GameOfDebates/DebateAnalysis/Dataset/vocab.debate.txt'
DOCWORD_PATH = '/Users/Vishal/Desktop/CS Classes/CS467/GameOfDebates/DebateAnalysis/Dataset/docword.debate.txt'

# errors='ignore' silently drops any bytes that are not valid UTF-8.
with codecs.open(TRANSCRIPT_PATH, 'r', encoding='utf-8', errors='ignore') as debate:
    # Trim surrounding whitespace first so leading/trailing blank lines do not
    # become empty documents, then split into one document per line.
    debate_text = debate.read().strip(' \t\n\r').splitlines()

# Sanity check: show the first document.
print(debate_text[0])

# min_df=1 keeps every token that occurs in at least one document.
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(debate_text)  # sparse document-term count matrix
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
Y = vectorizer.get_feature_names()  # vocabulary, aligned with X's columns
print(X)
print(Y)

# Write the vocabulary with an explicit UTF-8 encoding instead of relying on
# the interpreter's default codec; the context manager guarantees the file is
# flushed and closed even if a write fails.
with codecs.open(VOCAB_PATH, 'w', encoding='utf-8') as vocab_y:
    for word in Y:
        vocab_y.write("%s\n" % word)

# mmwrite is handed an open handle rather than the path so the output keeps
# its exact file name. Binary mode is used because mmwrite emits bytes
# (assumed from SciPy's Matrix Market writer -- TODO confirm for the installed
# version); on Python 2 under POSIX this is identical to text mode.
with open(DOCWORD_PATH, 'wb') as vocab_x:
    scipy.io.mmwrite(vocab_x, X)