-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathctoken.py
More file actions
executable file
·65 lines (57 loc) · 2.57 KB
/
ctoken.py
File metadata and controls
executable file
·65 lines (57 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
from collections import defaultdict
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Characters that tokenize() replaces with a space before splitting on
# whitespace.  '@' is deliberately absent from this set so that it survives
# inside tokens such as email addresses ('.' and '-' are absent as well and are
# handled separately by tokenize()).
# NOTE(review): an earlier version of this comment said '$' was also kept
# because it adds context to numbers, and an earlier version of the string
# ended before "&$".  As written now, both '&' and '$' ARE replaced by spaces --
# confirm that this is the intended behavior.
replace_string = "\n\t\r^_`{|}~?<=>*+#%!;:/[]\'\",()_”“‘’\\&$"
# Translation table mapping every character of replace_string to a single space.
TABLE2 = str.maketrans(replace_string, " "*len(replace_string))
# Translation table that deletes hyphens outright (no space is inserted).
TABLE_NOSPACE = str.maketrans("", "", "-")
# Shared English Snowball stemmer instance, created once at import time.
stemmer = SnowballStemmer("english")
# English stopwords held in a set so membership tests are constant time.
STOPWORDS = set(stopwords.words('english'))
# When True, tokenize() skips every word that contains at least one digit.
NO_NUMERIC = False
def tokenize(line: str, trim_stopwords=False) -> list:
    """Split *line* into lowercase, stemmed, ASCII-only tokens.

    Every period that is followed by a space is replaced by a space, the text
    is lowercased, each character listed in ``TABLE2`` becomes a space, and
    the result is split on whitespace.  Each word then has leading/trailing
    ``.`` and ``@`` stripped and hyphens removed before it is stemmed.

    Args:
        line: Raw text to tokenize.
        trim_stopwords: When true, words found in ``STOPWORDS`` are left out
            of the result; all other words on the line are still processed.

    Returns:
        The stemmed tokens in their original order.  Words that end up empty,
        words whose stem cannot be encoded as ASCII and, when the module flag
        ``NO_NUMERIC`` is set, words containing a digit are skipped.
    """
    tokenlist = []
    line = re.sub(r"\. ", ' ', line)  # strips periods at the end of sentences
    # lower, translate, split and the loop are each O(N), so O(N) overall.
    for raw_word in line.lower().translate(TABLE2).split():
        word = raw_word.strip(".@").translate(TABLE_NOSPACE)
        if not word:
            continue
        if trim_stopwords and word in STOPWORDS:
            # Skip just this word and keep scanning; breaking out here would
            # silently drop every remaining word on the line.
            continue
        if NO_NUMERIC and any(map(str.isdigit, word)):
            continue
        stemmed = stemmer.stem(word)
        try:
            # Encoding is used purely as an "is this ASCII?" check.
            stemmed.encode("ascii")
        except UnicodeEncodeError:  # bad (non-ASCII) characters in word
            continue
        tokenlist.append(stemmed)
    return tokenlist
def computeWordFrequencies(tokenlist: list) -> dict:
    """Count how many times each token appears in *tokenlist*.

    Args:
        tokenlist: Tokens to count.  ``None`` or an empty sequence yields an
            empty mapping.

    Returns:
        A ``defaultdict(int)`` mapping token -> number of occurrences.
    """
    # Single pass over the tokens: O(N).
    counts = defaultdict(int)  # str: int
    for token in tokenlist or ():
        counts[token] += 1
    return counts
def computeWordFrequencies_postion_old(tokenlist: list) -> dict:
    """Map each token to the set of indices at which it occurs.

    Args:
        tokenlist: Tokens to index.  ``None`` yields an empty mapping.

    Returns:
        A ``defaultdict(set)`` mapping token -> set of 0-based positions; the
        size of each set doubles as that token's occurrence count.
    """
    tokencount = defaultdict(set)  # {word: {positions}}
    if tokenlist is not None:
        # One enumerate pass records every position in O(N); rescanning the
        # whole list for each token would be O(N^2) for the same result.
        for i, word in enumerate(tokenlist):
            tokencount[word].add(i)
    return tokencount
def computeWordFrequencies_postion_linear(tokenlist: list) -> dict:
    """Map each token to the ordered list of indices at which it occurs.

    Args:
        tokenlist: Tokens to index.  ``None`` yields an empty mapping.

    Returns:
        A ``defaultdict(list)`` mapping token -> ascending 0-based positions;
        the length of each list doubles as that token's occurrence count.
    """
    # Each token is visited exactly once: O(N).
    positions = defaultdict(list)  # {word: [positions]}
    if tokenlist is None:
        return positions
    for offset, token in enumerate(tokenlist):
        positions[token].append(offset)
    return positions
if __name__ == '__main__':
    # Manual smoke test: show the tokens produced for a short sample phrase.
    sample_text = "master of software engineering"
    sample_tokens = tokenize(sample_text)
    print(sample_tokens)