-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtopicmap.py
More file actions
66 lines (54 loc) · 1.85 KB
/
topicmap.py
File metadata and controls
66 lines (54 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
import spacy
from pathlib import Path
import os
import argparse

# Load the German spaCy model with everything but the tokenizer/NER disabled,
# since tokenize() below only needs token-level attributes (orth_, lower_, like_url).
#nlp = de_core_news_sm.load()
nlp = spacy.load("de_core_news_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

# Command-line interface: a single --file argument naming the text file
# (relative to data/txts/) to tokenize.
parser = argparse.ArgumentParser(description='Zusammenfassung erstellen')
params = [
    {
        "command_arg": "file",
        "default": "2015-09-09 VI-DS-01825 Bau- und Finanzierun SAO.txt",
        "help": "file to summarize"
    }
]
for entry in params:
    parser.add_argument('--' + entry['command_arg'],
                        dest=entry['command_arg'],
                        action='store',
                        default=entry['default'],
                        help=entry['help'])
args = parser.parse_args()
def tokenize(text):
    """Tokenize *text* with the German spaCy pipeline for LDA preprocessing.

    Each token is lowercased; pure-whitespace tokens are dropped, URLs are
    replaced by the placeholder 'URL', and @-mentions by 'SCREEN_NAME'.

    Fixes the original bug where the *text* argument was ignored (a hardcoded
    string "tests2" was tokenized instead) and an always-empty list returned.

    :param text: raw document text to tokenize
    :return: list of normalized token strings
    """
    lda_tokens = []
    doc = nlp(text)
    for token in doc:
        if token.orth_.isspace():
            # skip whitespace-only tokens
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens
# Read the input document, tokenize it, and write the tokens (one per line)
# to data/tokenized/<file_name>.
file_name = args.file
try:
    with open("data/txts/{}".format(file_name), 'r') as file:
        content = file.read()
    summary = tokenize(content)
    Path("data/tokenized").mkdir(parents=True, exist_ok=True)
    path_to_txt = "data/tokenized/" + file_name
    with open(path_to_txt, "w") as txt_file:
        # tokenize() returns a list of strings; file.write() needs a single
        # string (the original passed the list directly, a TypeError that was
        # silently swallowed by a bare except).
        txt_file.write("\n".join(summary))
except OSError:
    # Narrowed from a bare `except:` so programming errors surface; only
    # file-system problems (missing input file, unwritable output) end here.
    content = '{} not found'.format(file_name)
    print(content)