-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtopicmap.py
More file actions
66 lines (54 loc) · 1.85 KB
/
topicmap.py
File metadata and controls
66 lines (54 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
import spacy
from pathlib import Path
import os
import argparse

# Load the German spaCy model with everything but the tokenizer/NER disabled,
# since tokenize() below only needs token-level attributes (orth_, lower_, like_url).
#nlp = de_core_news_sm.load()
nlp = spacy.load("de_core_news_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

# Command-line interface: a single --file argument naming the text file
# (relative to data/txts/) to tokenize.
parser = argparse.ArgumentParser(description='Zusammenfassung erstellen')
params = [
    {
        "command_arg": "file",
        "default": "2015-09-09 VI-DS-01825 Bau- und Finanzierun SAO.txt",
        "help": "file to summarize"
    }
]
for entry in params:
    parser.add_argument('--' + entry['command_arg'],
                        dest=entry['command_arg'],
                        action='store',
                        default=entry['default'],
                        help=entry['help'])
args = parser.parse_args()
def tokenize(text):
    """Tokenize *text* with the German spaCy pipeline for LDA preprocessing.

    Each token is lowercased; pure-whitespace tokens are dropped, URLs are
    replaced by the placeholder 'URL', and @-mentions by 'SCREEN_NAME'.

    Fixes the original bug where the *text* argument was ignored (a hardcoded
    string "tests2" was tokenized instead) and an always-empty list returned.

    :param text: raw document text to tokenize
    :return: list of normalized token strings
    """
    lda_tokens = []
    doc = nlp(text)
    for token in doc:
        if token.orth_.isspace():
            # skip whitespace-only tokens
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens
# Read the input document, tokenize it, and write the tokens (one per line)
# to data/tokenized/<file_name>.
file_name = args.file
try:
    with open("data/txts/{}".format(file_name), 'r') as file:
        content = file.read()
    summary = tokenize(content)
    Path("data/tokenized").mkdir(parents=True, exist_ok=True)
    path_to_txt = "data/tokenized/" + file_name
    with open(path_to_txt, "w") as txt_file:
        # tokenize() returns a list of strings; file.write() needs a single
        # string (the original passed the list directly, a TypeError that was
        # silently swallowed by a bare except).
        txt_file.write("\n".join(summary))
except OSError:
    # Narrowed from a bare `except:` so programming errors surface; only
    # file-system problems (missing input file, unwritable output) end here.
    content = '{} not found'.format(file_name)
    print(content)