Text-Summarizer-WebTool/QuickSumm.py at main · QuickSumm/Text-Summarizer-WebTool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import nltk
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Download the NLTK resources this module relies on: the Punkt sentence
# tokenizer models (used by sent_tokenize/word_tokenize) and the stop-word
# lists. Recent NLTK releases load Punkt from the 'punkt_tab' resource instead
# of the older pickled 'punkt' package, so both are fetched to keep the module
# working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the small English spaCy pipeline used for named entity recognition.
# spacy.load raises OSError when the model package is missing; in that case
# download it once and retry the load.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def summarize_text(text, ratio=0.3, include_named_entities=False):
    """Build an extractive summary from the highest-scoring sentences of *text*.

    Each sentence is scored by summing the corpus-wide frequency of its
    alphanumeric, non-stop-word tokens (compared case-insensitively). The
    best-scoring sentences are returned joined by single spaces, ordered from
    most to least important.

    Args:
        text: The text to summarize.
        ratio: Fraction of the sentences to keep. Any positive ratio yields
            at least one sentence; a ratio of zero or less yields an empty
            summary.
        include_named_entities: When true, words belonging to named entities
            detected by spaCy are left out of the frequency table, so they
            do not contribute to sentence scores.

    Returns:
        The selected sentences joined by spaces, or an empty string when the
        text contains no sentences or ``ratio`` is not positive.
    """
    sentences = sent_tokenize(text)
    if not sentences or ratio <= 0:
        return ''

    # Build the stop-word set once; calling stopwords.words() per token
    # re-reads the corpus list and does a linear scan every time.
    stop_words = set(stopwords.words('english'))

    # Words to keep out of the frequency table. Entity texts are tokenized
    # and lowercased so that they are comparable with the lowercased words
    # counted below (entities are frequently multi-word and capitalised).
    excluded_words = set()
    if include_named_entities:
        for ent in nlp(text).ents:
            excluded_words.update(
                token.lower() for token in word_tokenize(ent.text)
            )

    # Tokenize every sentence exactly once and reuse the tokens for both the
    # frequency table and the scoring pass.
    tokens_per_sentence = [
        [token.lower() for token in word_tokenize(sentence) if token.isalnum()]
        for sentence in sentences
    ]

    freq_dist = FreqDist(
        token
        for tokens in tokens_per_sentence
        for token in tokens
        if token not in stop_words and token not in excluded_words
    )

    # Score with the same lowercased tokens that key the frequency table, so
    # capitalised words are counted too. Words absent from the table count 0.
    # Keying by sentence text collapses verbatim duplicates into one entry.
    sentence_importance = {
        sentence: sum(freq_dist[token] for token in tokens)
        for sentence, tokens in zip(sentences, tokens_per_sentence)
    }

    # Truncation alone would select nothing for short texts (e.g. 3 sentences
    # at ratio 0.3), so always keep at least one sentence.
    limit = max(1, int(len(sentences) * ratio))

    # sorted() is stable, so equally scored sentences keep document order.
    top_sentences = sorted(
        sentence_importance, key=sentence_importance.get, reverse=True
    )[:limit]

    return ' '.join(top_sentences)

# Example usage:
# text = input("Please enter the text you'd like to summarize: ")
# summary = summarize_text(text, include_named_entities=True)
# print("Summarized text:")
# print(summary)