-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredict.py
More file actions
57 lines (48 loc) · 1.67 KB
/
predict.py
File metadata and controls
57 lines (48 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))
stop_words.update(['geek','given','zero','may','also','across','among','beside','however','yet','within','return'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
from sklearn.feature_extraction.text import TfidfVectorizer
def cleanHtml(sentence):
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, ' ', str(sentence))
return cleantext
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
cleaned = cleaned.strip()
cleaned = cleaned.replace("\n"," ")
return cleaned
def keepAlpha(sentence):
alpha_sent = ""
for word in sentence.split():
alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
alpha_sent += alpha_word
alpha_sent += " "
alpha_sent = alpha_sent.strip()
return alpha_sent
def removeStopWords(sentence):
global re_stop_words
return re_stop_words.sub(" ", sentence)
def stemming(sentence):
stemSentence = ""
for word in sentence.split():
stem = stemmer.stem(word)
stemSentence += stem
stemSentence += " "
stemSentence = stemSentence.strip()
return stemSentence
def preprocess(new_prob):
new_prob = cleanHtml(new_prob)
new_prob = keepAlpha(new_prob)
new_prob = removeStopWords(new_prob)
new_prob = stemming(new_prob)
new_prob = [new_prob]
return new_prob