-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhoverpy_scikitlearn.py
More file actions
122 lines (114 loc) · 4.4 KB
/
hoverpy_scikitlearn.py
File metadata and controls
122 lines (114 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def getHNData(verbose=False, limit=100, sub="showstories"):
    """Return lower-cased story titles from one Hacker News feed.

    Requests are proxied through hoverpy so the first run records HTTP
    traffic into data/hn.<sub>.db and later runs replay it offline.

    :param verbose: echo each title as it is fetched
    :param limit: maximum number of stories to pull from the feed
    :param sub: feed name — one of showstories/askstories/jobstories/topstories
    :return: list of lower-cased title strings
    """
    from hackernews import HackerNews
    from hackernews import settings
    import hoverpy, time, os
    dbpath = "data/hn.%s.db" % sub
    with hoverpy.HoverPy(recordMode="once", dbpath=dbpath) as hp:
        if hp.mode() != "capture":
            # Replay mode: point the client at plain http so hoverpy can
            # intercept the calls.
            settings.supported_api_versions[
                "v0"] = "http://hacker-news.firebaseio.com/v0/"
        hn = HackerNews()
        print("GETTING HACKERNEWS %s DATA" % sub)
        # Map the feed name onto the matching client method.
        feeds = {"showstories": hn.show_stories,
                 "askstories": hn.ask_stories,
                 "jobstories": hn.job_stories,
                 "topstories": hn.top_stories}
        started = time.time()
        collected = []
        for item_id in feeds[sub](limit=limit):
            item = hn.get_item(item_id)
            lowered = item.title.lower()
            if verbose:
                print(lowered)
            collected.append(lowered)
        print(
            "got %i hackernews titles in %f seconds" %
            (len(collected), time.time() - started))
        return collected
def getRedditData(verbose=False, comments=True, limit=100, sub="all"):
    """Return lower-cased hot-submission texts from one subreddit.

    Requests are proxied through hoverpy so the first run records HTTP
    traffic into data/reddit.<sub>.db and later runs replay it offline.

    :param verbose: echo each assembled text as it is fetched
    :param comments: append every comment body to the submission title
    :param limit: maximum number of hot submissions to pull
    :param sub: subreddit name (without the r/ prefix)
    :return: list of strings (title, optionally followed by comment bodies)
    """
    import hoverpy, praw, time
    dbpath = ("data/reddit.%s.db" % sub)
    with hoverpy.HoverPy(recordMode='once', dbpath=dbpath, httpsToHttp=True) as hp:
        titles = []
        # print() call form: works on Python 2 and 3, and matches the
        # style used by getHNData (the original Python 2 print statement
        # was a syntax error under Python 3).
        print("GETTING REDDIT r/%s DATA" % sub)
        r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_",
                        http_proxy=hp.httpProxy(),
                        https_proxy=hp.httpProxy(),
                        validate_certs="off")
        if not hp.mode() == "capture":
            # Replaying from the local db — no need to rate-limit.
            r.config.api_request_delay = 0
        subreddit = r.get_subreddit(sub)
        for submission in subreddit.get_hot(limit=limit):
            text = submission.title.lower()
            if comments:
                flat_comments = praw.helpers.flatten_tree(submission.comments)
                for comment in flat_comments:
                    # MoreComments placeholders have no .body — skip them.
                    text += comment.body + " " if hasattr(comment, 'body') else ''
            if verbose:
                print(text)
            titles.append(text)
        return titles
# (source, feed/subreddit) pairs mined by doMining(). The position of each
# pair in this list doubles as the classification target label, and is what
# predict() prints for a classified sentence.
subs = [('hn', 'showstories'),
        ('hn', 'askstories'),
        ('hn', 'jobstories'),
        ('reddit', 'republican'),
        ('reddit', 'democrat'),
        ('reddit', 'linux'),
        ('reddit', 'python'),
        ('reddit', 'music'),
        ('reddit', 'movies'),
        ('reddit', 'literature'),
        ('reddit', 'books')]
def doMining():
    """Fetch text for every entry in `subs` and build a labelled corpus.

    Each entry's position in `subs` becomes its class label, so titles
    and target stay aligned element-for-element.

    :return: (titles, target) — list of document strings and the parallel
             list of integer labels indexing into `subs`
    """
    titles = []
    target = []
    # Dispatch on the source name in each (source, sub) pair.
    getter = {'hn': getHNData, 'reddit': getRedditData}
    # enumerate() replaces the original index-loop over range(len(subs)).
    for label, (source, sub) in enumerate(subs):
        subTitles = getter[source](sub=sub)
        titles += subTitles
        target += [label] * len(subTitles)
    return (titles, target)
# Hand-written demo sentences classified by main() after training; each is
# phrased to resemble one of the feeds/subreddits listed in `subs`.
# NOTE(review): the misspellings ("donimate", "existentialis") are preserved
# as-is — they are runtime input data, not comments.
sentences = ["powershell and openssl compatability testing",
             "compiling source code on ubuntu",
             "wifi drivers keep crashing",
             "cron jobs",
             "training day was a great movie with a legendary director",
             "michael bay should remake lord of the rings, set in the future",
             "hilary clinton may win voters' hearts",
             "donald trump may donimate the presidency",
             "reading dead wood gives me far more pleasure than using kindles",
             "hiring a back end engineer",
             "guitar is louder than the piano although electronic is best",
             "drum solo and singer from the rolling stones",
             "hiring a back end engineer",
             "javascript loader",
             "dostoevsky's existentialis"]
def main():
    """Mine the corpora, train a naive-Bayes topic classifier, and run an
    interactive prediction loop.

    Pipeline: bag-of-words counts -> tf-idf re-weighting -> MultinomialNB,
    with labels taken from the position of each source in `subs`.
    """
    titles, target = doMining()
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB

    # Build our count vectoriser over the mined titles.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(titles)

    # Re-weight raw counts with a tf-idf transformer.
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Train the classifier.
    clf = MultinomialNB().fit(X_train_tfidf, target)
    # print() call form: the original Python 2 print statement here was a
    # syntax error under Python 3 and inconsistent with the print() calls
    # used elsewhere in this file.
    print("*"*30+"\nTEST CLASSIFIER\n"+"*"*30)

    def predict(sentences):
        # Apply the same count -> tf-idf transforms, then classify and
        # report the (source, sub) pair each sentence maps onto.
        X_new_counts = count_vect.transform(sentences)
        X_new_tfidf = tfidf_transformer.transform(X_new_counts)
        predicted = clf.predict(X_new_tfidf)
        for doc, category in zip(sentences, predicted):
            print('%r => %s' % (doc, subs[category]))

    # Classify the canned demo sentences first.
    predict(sentences)

    # Interactive loop: classify user-supplied titles until interrupted.
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    while True:
        predict([read_line("Enter title: ").strip()])
if __name__ == "__main__":
    main()