-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
138 lines (112 loc) · 5.13 KB
/
app.py
File metadata and controls
138 lines (112 loc) · 5.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from flask import Flask, request, render_template, jsonify
from werkzeug.utils import secure_filename
import os
import docx
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
import logging
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
# Explicitly set NLTK data path
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))
nltk.download('punkt_tab', download_dir=os.path.join(os.getcwd(), "nltk_data"))
nltk.download('punkt', download_dir=os.path.join(os.getcwd(), "nltk_data"))
nltk.download('stopwords', download_dir=os.path.join(os.getcwd(), "nltk_data"))
nltk.download('punkt', download_dir=os.path.join(os.getcwd(), "nltk_data"))
print(nltk.data.path)
app = Flask(__name__)
# Configuration
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'docx'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Ensure the upload folder exists
if not os.path.exists(UPLOAD_FOLDER):
os.makedirs(UPLOAD_FOLDER)
def allowed_file(filename):
"""Checks if the file is allowed based on its extension"""
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
def extract_text_from_docx(file_path):
"""Extracts text from a .docx file"""
try:
doc = docx.Document(file_path)
full_text = "\n".join([para.text for para in doc.paragraphs])
logger.debug(f"Extracted text from {file_path}")
return full_text
except Exception as e:
logger.error(f"Error extracting text from {file_path}: {e}")
raise
def summarize_text(text, num_sentences=5, algorithm="textrank"):
"""Summarizes the extracted text by focusing on key insights."""
try:
# 1. Tokenize the text into sentences
sentences = sent_tokenize(text)
# 2. Remove stop words and punctuation
stop_words = set(stopwords.words('english'))
words = [word.lower() for sentence in sentences for word in nltk.word_tokenize(sentence) if word.isalnum() and word.lower() not in stop_words]
# 3. Calculate word frequency
word_frequency = {}
for word in words:
if word in word_frequency:
word_frequency[word] += 1
else:
word_frequency[word] = 1
# 4. Score sentences based on word frequency
sentence_scores = {}
for i, sentence in enumerate(sentences):
for word in nltk.word_tokenize(sentence.lower()):
if word in word_frequency:
if len(sentence.split(" ")) < 30: # Consider only sentences with a reasonable length
if i in sentence_scores:
sentence_scores[i] += word_frequency[word]
else:
sentence_scores[i] = word_frequency[word]
# 5. Get the top N sentences as key insights
top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
key_insights = [sentences[i] for i in top_sentences]
logger.debug("Summarization complete")
return key_insights
except Exception as e:
logger.error(f"Error summarizing text: {e}")
raise
@app.route("/", methods=["GET", "POST"])
def upload_file():
if request.method == "POST":
if "files" not in request.files:
logger.warning("No file part in request")
return "No file part"
files = request.files.getlist("files")
summaries = {}
for file in files:
if file and allowed_file(file.filename):
filename = secure_filename(file.filename)
file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
# Log the file before attempting to save
logger.debug(f"Processing file: {filename}")
try:
# Save the file to the upload folder
file.save(file_path)
logger.info(f"File saved to {file_path}")
# Process file
text = extract_text_from_docx(file_path)
key_insights = summarize_text(text, algorithm="textrank")
# Format the output as a bulleted list
formatted_insights = "<ul>\n"
for insight in key_insights:
formatted_insights += f" <li>{insight}</li>\n"
formatted_insights += "</ul>"
summaries[filename] = formatted_insights
logger.info(f"Summary for {filename} created")
except Exception as e:
logger.error(f"Error processing file {filename}: {e}")
summaries[filename] = f"Error processing file: {e}"
return jsonify(summaries)
return render_template("upload.html")
if __name__ == "__main__":
app.run(debug=True)