From 7ea614ed1ab2baabfcd0ed8fe88a71145b0c4289 Mon Sep 17 00:00:00 2001 From: TenzDelek Date: Tue, 24 Feb 2026 16:21:38 +0530 Subject: [PATCH 1/2] base --- api/Assistant/assistant_model.py | 1 - api/Assistant/assistant_response_model.py | 2 - api/Assistant/assistant_service.py | 65 +++---- api/Assistant/assistant_view.py | 31 +++- api/langgraph/context_processor.py | 41 +++-- api/ui.py | 169 ++++++++++-------- .../versions/18cf46778420_remove_file.py | 32 ++++ 7 files changed, 209 insertions(+), 132 deletions(-) create mode 100644 migrations/versions/18cf46778420_remove_file.py diff --git a/api/Assistant/assistant_model.py b/api/Assistant/assistant_model.py index 45512b8..2d6d306 100644 --- a/api/Assistant/assistant_model.py +++ b/api/Assistant/assistant_model.py @@ -28,7 +28,6 @@ class Context(Base): id = Column(UUID(as_uuid=True), primary_key=True, default=uuid4) content=Column(Text,nullable=True) - file_url=Column(Text,nullable=True) pecha_title = Column(String(255), nullable=True) pecha_text_id = Column(String(255), nullable=True) assistant_id = Column(UUID(as_uuid=True),ForeignKey("assistant.id", ondelete="CASCADE"),nullable=False) diff --git a/api/Assistant/assistant_response_model.py b/api/Assistant/assistant_response_model.py index 80f0724..191dbd2 100644 --- a/api/Assistant/assistant_response_model.py +++ b/api/Assistant/assistant_response_model.py @@ -4,14 +4,12 @@ class ContextRequest(BaseModel): content: Optional[str] = None - file_url: Optional[str] = None pecha_title: Optional[str] = None pecha_text_id: Optional[str] = None class ContextResponse(BaseModel): id: UUID content: Optional[str] = None - file_url: Optional[str] = None pecha_title: Optional[str] = None pecha_text_id: Optional[str] = None diff --git a/api/Assistant/assistant_service.py b/api/Assistant/assistant_service.py index 4997095..85cd8d1 100644 --- a/api/Assistant/assistant_service.py +++ b/api/Assistant/assistant_service.py @@ -1,4 +1,3 @@ -import logging from api.Users.user_service import validate_and_extract_user_email from api.db.pg_database import SessionLocal from api.Assistant.assistant_repository import get_all_assistants, get_assistant_by_id_repository, delete_assistant_repository, update_assistant_repository @@ -8,16 +7,15 @@ from api.Assistant.assistant_model import Assistant, Context from uuid import UUID from datetime import datetime, timezone -from fastapi import HTTPException, status +from fastapi import HTTPException, status, UploadFile from api.error_constant import ErrorConstants -from api.upload.S3_utils import generate_presigned_access_url, delete_file -from api.config import get from api.cache.cache_enums import CacheType from api.Assistant.assistant_cache_service import ( get_assistant_detail_cache, set_assistant_detail_cache, delete_assistant_detail_cache, ) +from api.langgraph.context_processor import validate_file, extract_content_from_file def _build_context_responses(contexts) -> List[ContextResponse]: @@ -25,12 +23,6 @@ def _build_context_responses(contexts) -> List[ContextResponse]: ContextResponse( id=context.id, content=context.content, - file_url=( - generate_presigned_access_url( - bucket_name=get("AWS_BUCKET_NAME"), - s3_key=context.file_url - ) if context.file_url else None - ), pecha_title=context.pecha_title, pecha_text_id=context.pecha_text_id ) for context in contexts @@ -79,21 +71,37 @@ def get_assistants(skip: 0, limit: 20) -> AssistantResponse: return assistant_response -def create_assistant_service(token: str, assistant_request: AssistantRequest): - current_user_email=validate_and_extract_user_email(token=token) +async def create_assistant_service(token: str, assistant_request: AssistantRequest, files: List[UploadFile] = None): + current_user_email = validate_and_extract_user_email(token=token) + + contexts_list = [] + + for ctx in assistant_request.contexts: + contexts_list.append( + Context(content=ctx.content, pecha_title=ctx.pecha_title, pecha_text_id=ctx.pecha_text_id) + ) + + if files: + for file in files: + if file.filename: + file_bytes = await file.read() + try: + validate_file(file.filename, len(file_bytes)) + extracted_content = extract_content_from_file(file_bytes, file.filename) + contexts_list.append(Context(content=extracted_content)) + except ValueError as e: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) + with SessionLocal() as db_session: assistant = Assistant( - name=assistant_request.name, - source_type=assistant_request.source_type, - description=assistant_request.description, - system_prompt=assistant_request.system_prompt, - system_assistance=assistant_request.system_assistance, - created_by=current_user_email, - contexts=[ - Context(content=ctx.content, file_url=ctx.file_url, pecha_title=ctx.pecha_title, pecha_text_id=ctx.pecha_text_id) - for ctx in assistant_request.contexts - ] - ) + name=assistant_request.name, + source_type=assistant_request.source_type, + description=assistant_request.description, + system_prompt=assistant_request.system_prompt, + system_assistance=assistant_request.system_assistance, + created_by=current_user_email, + contexts=contexts_list + ) create_assistant_repository(db=db_session, assistant=assistant) async def get_assistant_by_id_service(assistant_id: UUID) -> AssistantInfoResponse: @@ -118,7 +126,7 @@ async def get_assistant_by_id_service(assistant_id: UUID) -> AssistantInfoRespon return assistant_info async def delete_assistant_service(assistant_id: UUID, token: str): - current_user_email=validate_and_extract_user_email(token=token) + current_user_email = validate_and_extract_user_email(token=token) with SessionLocal() as db_session: assistant = get_assistant_by_id_repository(db=db_session, assistant_id=assistant_id) if current_user_email != assistant.created_by: @@ -126,13 +134,6 @@ async def delete_assistant_service(assistant_id: UUID, token: str): if assistant.system_assistance: raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=ErrorConstants.FORBIDDEN_ERROR_MESSAGE) - for context in assistant.contexts: - if context.file_url: - try: - delete_file(context.file_url) - except Exception as e: - logging.error(f"Failed to delete S3 file {context.file_url}: {str(e)}") - delete_assistant_repository(db=db_session, assistant_id=assistant_id) await delete_assistant_detail_cache( @@ -161,7 +162,7 @@ async def update_assistant_service(assistant_id: UUID, update_request: UpdateAss for context in assistant.contexts: db_session.delete(context) assistant.contexts = [ - Context(content=ctx.content, file_url=ctx.file_url, pecha_title=ctx.pecha_title, pecha_text_id=ctx.pecha_text_id) + Context(content=ctx.content, pecha_title=ctx.pecha_title, pecha_text_id=ctx.pecha_text_id) for ctx in update_request.contexts ] diff --git a/api/Assistant/assistant_view.py b/api/Assistant/assistant_view.py index e435f98..69c1b50 100644 --- a/api/Assistant/assistant_view.py +++ b/api/Assistant/assistant_view.py @@ -1,12 +1,13 @@ -from fastapi import APIRouter +from fastapi import APIRouter, UploadFile, File, Form from starlette import status from api.Assistant.assistant_response_model import AssistantResponse, AssistantRequest, AssistantInfoResponse, UpdateAssistantRequest from fastapi import Query, Depends from api.Assistant.assistant_service import create_assistant_service, get_assistant_by_id_service, get_assistants, delete_assistant_service, update_assistant_service from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials -from typing import Annotated +from typing import Annotated, Optional, List from uuid import UUID from api.constant import Constant +import json oauth2_scheme = HTTPBearer() assistant_router=APIRouter( @@ -21,8 +22,30 @@ async def get_all_assistants( return get_assistants(skip=skip, limit=limit) @assistant_router.post("", status_code=status.HTTP_201_CREATED) -async def create_assistant(assistant_request: AssistantRequest, authentication_credential: Annotated[HTTPAuthorizationCredentials, Depends(oauth2_scheme)]): - create_assistant_service(token=authentication_credential.credentials, assistant_request=assistant_request) +async def create_assistant( + authentication_credential: Annotated[HTTPAuthorizationCredentials, Depends(oauth2_scheme)], + name: str = Form(...), + system_prompt: str = Form(...), + source_type: Optional[str] = Form(None), + description: Optional[str] = Form(None), + system_assistance: bool = Form(False), + contexts: Optional[str] = Form(None), + files: List[UploadFile] = File(default=[]) +): + contexts_data = json.loads(contexts) if contexts else [] + assistant_request = AssistantRequest( + name=name, + source_type=source_type, + description=description, + system_prompt=system_prompt, + contexts=contexts_data, + system_assistance=system_assistance + ) + await create_assistant_service( + token=authentication_credential.credentials, + assistant_request=assistant_request, + files=files + ) return {"message": Constant.CREATED_ASSISTANT_MESSAGE} @assistant_router.get("/{assistant_id}", status_code=status.HTTP_200_OK) diff --git a/api/langgraph/context_processor.py b/api/langgraph/context_processor.py index d46bb03..d15f888 100644 --- a/api/langgraph/context_processor.py +++ b/api/langgraph/context_processor.py @@ -4,8 +4,10 @@ from pypdf import PdfReader from docx import Document from api.Assistant.assistant_response_model import ContextRequest -from api.upload.S3_utils import download_file_from_s3 -from api.config import get + +ALLOWED_FILE_EXTENSIONS = {'.pdf', '.txt', '.text', '.docx'} +MAX_FILE_SIZE_MB = 10 +MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 def extract_text_from_pdf(pdf_bytes: BytesIO) -> str: @@ -49,19 +51,28 @@ def extract_text_from_docx(file_bytes: BytesIO) -> str: raise -def process_file_context(file_url: str) -> str: - bucket_name = get("AWS_BUCKET_NAME") - file_bytes = download_file_from_s3(bucket_name, file_url) +def validate_file(filename: str, file_size: int) -> None: + import os + ext = os.path.splitext(filename.lower())[1] + if ext not in ALLOWED_FILE_EXTENSIONS: + raise ValueError(f"Unsupported file type: {ext}. Allowed types: {', '.join(ALLOWED_FILE_EXTENSIONS)}") - if file_url.lower().endswith('.pdf'): - text = extract_text_from_pdf(file_bytes) - elif file_url.lower().endswith(('.txt', '.text')): - text = extract_text_from_txt(file_bytes) - elif file_url.lower().endswith(('.docx')): - text = extract_text_from_docx(file_bytes) + if file_size > MAX_FILE_SIZE_BYTES: + raise ValueError(f"File size exceeds {MAX_FILE_SIZE_MB}MB limit") + + +def extract_content_from_file(file_bytes: bytes, filename: str) -> str: + file_stream = BytesIO(file_bytes) + filename_lower = filename.lower() + + if filename_lower.endswith('.pdf'): + return extract_text_from_pdf(file_stream) + elif filename_lower.endswith(('.txt', '.text')): + return extract_text_from_txt(file_stream) + elif filename_lower.endswith('.docx'): + return extract_text_from_docx(file_stream) else: - raise ValueError(f"Unsupported file type: {file_url}") - return text + raise ValueError(f"Unsupported file type: {filename}") def process_contexts(contexts: List[ContextRequest]) -> Optional[List[str]]: @@ -75,10 +86,6 @@ def process_contexts(contexts: List[ContextRequest]) -> Optional[List[str]]: if ctx.content: processed_contexts.append(ctx.content) - elif ctx.file_url: - file_text = process_file_context(ctx.file_url) - processed_contexts.append(file_text) - elif ctx.pecha_title and ctx.pecha_text_id: pecha_context = f"[Pecha: {ctx.pecha_title}, ID: {ctx.pecha_text_id}]" processed_contexts.append(pecha_context) diff --git a/api/ui.py b/api/ui.py index 0761fd3..1a8131d 100644 --- a/api/ui.py +++ b/api/ui.py @@ -608,7 +608,7 @@ async def serve_ui():
@@ -819,8 +819,7 @@ async def serve_ui(): if (activeAssistant.contexts && activeAssistant.contexts.length) { ctxDiv.innerHTML = activeAssistant.contexts.map((c,i) => { if (c.pecha_title) return `Pecha: ${esc(c.pecha_title)}`; - if (c.content) return `Text #${i+1}`; - if (c.file_url) return `File: ${esc(c.file_url)}`; + if (c.content) return `Context #${i+1}`; return `Context #${i+1}`; }).join(''); } else { @@ -859,10 +858,8 @@ async def serve_ui(): activeAssistant.contexts.forEach(c => { let type = 'content'; if (c.pecha_text_id) type = 'search'; - else if (c.file_url) type = 'file'; addContextEntry(type, { content: c.content, - file_url: c.file_url, pecha_title: c.pecha_title, pecha_text_id: c.pecha_text_id }); @@ -906,7 +903,6 @@ async def serve_ui(): let selectedType = type; if (data.pecha_text_id) selectedType = 'search'; - else if (data.file_url) selectedType = 'file'; else if (data.content) selectedType = 'content'; // Store type as data attribute for retrieval @@ -914,7 +910,7 @@ async def serve_ui(): // Add type label and remove button only let typeLabel = 'Content'; - if (selectedType === 'file') typeLabel = 'File URL'; + if (selectedType === 'file') typeLabel = 'File Upload'; else if (selectedType === 'search') typeLabel = 'Search Pecha'; div.innerHTML = ` @@ -928,7 +924,7 @@ async def serve_ui(): if (selectedType === 'content') { renderContentField(fieldArea, data.content || ''); } else if (selectedType === 'file') { - renderFileField(fieldArea, data.file_url || ''); + renderFileField(fieldArea, ''); } else if (selectedType === 'search') { renderSearchField(fieldArea, data.pecha_title || '', data.pecha_text_id || ''); // If editing and has content, create a mock search result with the content @@ -954,82 +950,71 @@ async def serve_ui(): `; } -function renderFileField(container, value) { - const hasFile = value && value.trim(); +function renderFileField(container, fileName) { + const hasFile = fileName && fileName.trim(); container.innerHTML = `
+ ${hasFile ? `
- ✓ File uploaded + ✓ ${esc(fileName)}
` : `
- -
-
+
Max 10MB
`} - +
`; } -async function handleFileSelect(inputEl) { +function handleFileSelect(inputEl) { const file = inputEl.files?.[0]; if (!file) return; - const token = getToken(); - if (!token) { - toast('Please enter a bearer token first', 'error'); + const entry = inputEl.closest('.context-entry'); + const statusDiv = entry.querySelector('.ctx-file-status'); + const hiddenNameInput = entry.querySelector('.ctx-file-name'); + + // Validate file type + const allowedTypes = ['.pdf', '.docx', '.txt']; + const fileName = file.name.toLowerCase(); + const isAllowed = allowedTypes.some(ext => fileName.endsWith(ext)); + if (!isAllowed) { + if (statusDiv) statusDiv.innerHTML = 'Invalid file type. Allowed: .pdf, .docx, .txt'; inputEl.value = ''; return; } - const entry = inputEl.closest('.context-entry'); - const statusDiv = entry.querySelector('.ctx-file-status'); - const fileBtn = entry.querySelector('.ctx-file-btn'); - const hiddenUrlInput = entry.querySelector('.ctx-file-url'); - - // Show uploading status - if (statusDiv) statusDiv.innerHTML = ' Uploading...'; - if (fileBtn) fileBtn.disabled = true; - - try { - const formData = new FormData(); - formData.append('file', file); - - const r = await fetch(API_BASE + '/media/upload', { - method: 'POST', - headers: { - 'Authorization': 'Bearer ' + token - }, - body: formData - }); - - if (!r.ok) { - const err = await r.json().catch(() => ({})); - throw new Error(err.detail?.message || err.detail || 'Upload failed'); - } - - const data = await r.json(); - - // Store the S3 key (not the temporary presigned URL) - hiddenUrlInput.value = data.key; - - // Re-render the field to show success state - const fieldArea = entry.querySelector('.ctx-field-area'); - renderFileField(fieldArea, data.key); - - toast('File uploaded successfully!', 'success'); - } catch (e) { - if (statusDiv) statusDiv.innerHTML = 'Upload failed: ' + esc(e.message) + ''; - if (fileBtn) fileBtn.disabled = false; - toast('Upload error: ' + e.message, 'error'); + // Validate file size (10MB max) + const maxSize = 10 * 1024 * 1024; + if (file.size > maxSize) { + if (statusDiv) statusDiv.innerHTML = 'File too large. Max 10MB allowed.'; inputEl.value = ''; + return; + } + + // Store the file name and keep the file input for later submission + hiddenNameInput.value = file.name; + + // Re-render the field to show success state + const fieldArea = entry.querySelector('.ctx-field-area'); + renderFileField(fieldArea, file.name); + + // Re-attach the file to the new input element + const newFileInput = fieldArea.querySelector('.ctx-file-input'); + if (newFileInput) { + const dataTransfer = new DataTransfer(); + dataTransfer.items.add(file); + newFileInput.files = dataTransfer.files; } + + toast('File selected: ' + file.name, 'success'); } function clearUploadedFile(btn) { @@ -1202,10 +1187,7 @@ async def serve_ui(): const type = e.getAttribute('data-context-type'); if (type === 'content') { const content = e.querySelector('.ctx-content')?.value.trim(); - if (content) contexts.push({content, file_url:null, pecha_title:null, pecha_text_id:null}); - } else if (type === 'file') { - const file_url = e.querySelector('.ctx-file-url')?.value.trim(); - if (file_url) contexts.push({content:null, file_url, pecha_title:null, pecha_text_id:null}); + if (content) contexts.push({content, pecha_title:null, pecha_text_id:null}); } else if (type === 'search') { const pecha_title = e.querySelector('.ctx-pecha-title')?.value.trim(); const pecha_text_id = e.querySelector('.ctx-pecha-text-id')?.value.trim(); @@ -1227,7 +1209,6 @@ async def serve_ui(): searchData.forEach(item => { contexts.push({ content: item.content || null, - file_url: null, pecha_title: pecha_title, pecha_text_id: pecha_text_id }); @@ -1236,17 +1217,32 @@ async def serve_ui(): // No content loaded yet, just store the metadata contexts.push({ content: null, - file_url: null, pecha_title: pecha_title, pecha_text_id: pecha_text_id }); } } } + // Note: 'file' type contexts are handled separately via getFilesFromForm() }); return contexts; } +function getFilesFromForm() { + const entries = document.querySelectorAll('#contextList .context-entry'); + const files = []; + entries.forEach(e => { + const type = e.getAttribute('data-context-type'); + if (type === 'file') { + const fileInput = e.querySelector('.ctx-file-input'); + if (fileInput && fileInput.files && fileInput.files.length > 0) { + files.push(fileInput.files[0]); + } + } + }); + return files; +} + async function submitModal() { const token = getToken(); if (!token) { toast('Please enter a bearer token','error'); return; } @@ -1254,23 +1250,44 @@ async def serve_ui(): const system_prompt = document.getElementById('formSystemPrompt').value.trim(); if (!name || !system_prompt) { toast('Name and System Prompt are required','error'); return; } - const body = { - name, - description: document.getElementById('formDescription').value.trim() || null, - source_type: document.getElementById('formSourceType').value.trim() || null, - system_prompt, - system_assistance: document.getElementById('formSystemAssistance').checked, - contexts: getContextsFromForm() - }; + const contextsData = getContextsFromForm(); + const files = getFilesFromForm(); try { let r; if (editingId) { + // For editing, use JSON (files already extracted, just updating text contexts) + const body = { + name, + description: document.getElementById('formDescription').value.trim() || null, + source_type: document.getElementById('formSourceType').value.trim() || null, + system_prompt, + system_assistance: document.getElementById('formSystemAssistance').checked, + contexts: contextsData + }; r = await fetch(API_BASE + '/assistant/' + editingId, {method:'PUT', headers:authHeaders(), body:JSON.stringify(body)}); } else { - r = await fetch(API_BASE + '/assistant', {method:'POST', headers:authHeaders(), body:JSON.stringify(body)}); + // For creating, use FormData to include files + const formData = new FormData(); + formData.append('name', name); + formData.append('system_prompt', system_prompt); + formData.append('description', document.getElementById('formDescription').value.trim() || ''); + formData.append('source_type', document.getElementById('formSourceType').value.trim() || ''); + formData.append('system_assistance', document.getElementById('formSystemAssistance').checked); + formData.append('contexts', JSON.stringify(contextsData)); + + // Add files + files.forEach(file => { + formData.append('files', file); + }); + + r = await fetch(API_BASE + '/assistant', { + method: 'POST', + headers: { 'Authorization': 'Bearer ' + token }, + body: formData + }); } - if (!r.ok) { const err = await r.json().catch(()=>({})); throw new Error(err.detail?.message || 'Request failed'); } + if (!r.ok) { const err = await r.json().catch(()=>({})); throw new Error(err.detail?.message || err.detail || 'Request failed'); } toast(editingId ? 'Assistant updated!' : 'Assistant created!', 'success'); closeModal(); await loadAssistants(); diff --git a/migrations/versions/18cf46778420_remove_file.py b/migrations/versions/18cf46778420_remove_file.py new file mode 100644 index 0000000..087b96f --- /dev/null +++ b/migrations/versions/18cf46778420_remove_file.py @@ -0,0 +1,32 @@ +"""remove file + +Revision ID: 18cf46778420 +Revises: ff67e0cb91ac +Create Date: 2026-02-24 16:14:26.704258 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '18cf46778420' +down_revision: Union[str, Sequence[str], None] = 'ff67e0cb91ac' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('context', 'file_url') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('context', sa.Column('file_url', sa.TEXT(), autoincrement=False, nullable=True)) + # ### end Alembic commands ### From 543ca30ee216099727f5486d46774cccabb35cc6 Mon Sep 17 00:00:00 2001 From: TenzDelek Date: Wed, 25 Feb 2026 09:53:43 +0530 Subject: [PATCH 2/2] changes --- api/Assistant/assistant_service.py | 8 +- api/langgraph/context_processor.py | 102 ----------------------- api/langgraph/nodes/node_initialize.py | 4 +- api/utils.py | 107 ++++++++++++++++++++++++- 4 files changed, 110 insertions(+), 111 deletions(-) delete mode 100644 api/langgraph/context_processor.py diff --git a/api/Assistant/assistant_service.py b/api/Assistant/assistant_service.py index 85cd8d1..914bd99 100644 --- a/api/Assistant/assistant_service.py +++ b/api/Assistant/assistant_service.py @@ -15,7 +15,7 @@ set_assistant_detail_cache, delete_assistant_detail_cache, ) -from api.langgraph.context_processor import validate_file, extract_content_from_file +from api.utils import Utils def _build_context_responses(contexts) -> List[ContextResponse]: @@ -73,9 +73,7 @@ def get_assistants(skip: 0, limit: 20) -> AssistantResponse: async def create_assistant_service(token: str, assistant_request: AssistantRequest, files: List[UploadFile] = None): current_user_email = validate_and_extract_user_email(token=token) - contexts_list = [] - for ctx in assistant_request.contexts: contexts_list.append( Context(content=ctx.content, pecha_title=ctx.pecha_title, pecha_text_id=ctx.pecha_text_id) @@ -86,8 +84,8 @@ async def create_assistant_service(token: str, assistant_request: AssistantReque if file.filename: file_bytes = await file.read() try: - validate_file(file.filename, len(file_bytes)) - extracted_content = extract_content_from_file(file_bytes, file.filename) + Utils.validate_file(file.filename, len(file_bytes)) + extracted_content = Utils.extract_content_from_file(file_bytes, file.filename) contexts_list.append(Context(content=extracted_content)) except ValueError as e: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e)) diff --git a/api/langgraph/context_processor.py b/api/langgraph/context_processor.py deleted file mode 100644 index d15f888..0000000 --- a/api/langgraph/context_processor.py +++ /dev/null @@ -1,102 +0,0 @@ -import logging -from typing import List, Optional -from io import BytesIO -from pypdf import PdfReader -from docx import Document -from api.Assistant.assistant_response_model import ContextRequest - -ALLOWED_FILE_EXTENSIONS = {'.pdf', '.txt', '.text', '.docx'} -MAX_FILE_SIZE_MB = 10 -MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 - - -def extract_text_from_pdf(pdf_bytes: BytesIO) -> str: - try: - reader = PdfReader(pdf_bytes) - pages = [] - - for page in reader.pages: - page_text = page.extract_text() - if page_text: - pages.append(page_text.strip()) - - return "\n\n".join(pages).strip() - except Exception as e: - logging.error(f"Failed to extract text from PDF: {e}") - raise - - - -def extract_text_from_txt(file_bytes: BytesIO) -> str: - try: - return file_bytes.read().decode('utf-8') - except Exception as e: - logging.error(f"Failed to read text file: {e}") - raise - - -def extract_text_from_docx(file_bytes: BytesIO) -> str: - try: - doc = Document(file_bytes) - paragraphs = [] - - for para in doc.paragraphs: - para_text = para.text.strip() - if para_text: - paragraphs.append(para_text) - - return "\n\n".join(paragraphs).strip() - except Exception as e: - logging.error(f"Failed to extract text from DOCX: {e}") - raise - - -def validate_file(filename: str, file_size: int) -> None: - import os - ext = os.path.splitext(filename.lower())[1] - if ext not in ALLOWED_FILE_EXTENSIONS: - raise ValueError(f"Unsupported file type: {ext}. Allowed types: {', '.join(ALLOWED_FILE_EXTENSIONS)}") - - if file_size > MAX_FILE_SIZE_BYTES: - raise ValueError(f"File size exceeds {MAX_FILE_SIZE_MB}MB limit") - - -def extract_content_from_file(file_bytes: bytes, filename: str) -> str: - file_stream = BytesIO(file_bytes) - filename_lower = filename.lower() - - if filename_lower.endswith('.pdf'): - return extract_text_from_pdf(file_stream) - elif filename_lower.endswith(('.txt', '.text')): - return extract_text_from_txt(file_stream) - elif filename_lower.endswith('.docx'): - return extract_text_from_docx(file_stream) - else: - raise ValueError(f"Unsupported file type: {filename}") - - -def process_contexts(contexts: List[ContextRequest]) -> Optional[List[str]]: - if not contexts: - return None - - processed_contexts = [] - - for idx, ctx in enumerate(contexts, 1): - try: - if ctx.content: - processed_contexts.append(ctx.content) - - elif ctx.pecha_title and ctx.pecha_text_id: - pecha_context = f"[Pecha: {ctx.pecha_title}, ID: {ctx.pecha_text_id}]" - processed_contexts.append(pecha_context) - - else: - logging.warning(f"Empty context #{idx}, skipping") - - except Exception as e: - error_msg = f"Failed to process context #{idx}: {str(e)}" - logging.error(error_msg) - - if not processed_contexts: - return None - return processed_contexts diff --git a/api/langgraph/nodes/node_initialize.py b/api/langgraph/nodes/node_initialize.py index c4faa72..f90635e 100644 --- a/api/langgraph/nodes/node_initialize.py +++ b/api/langgraph/nodes/node_initialize.py @@ -3,7 +3,7 @@ from datetime import datetime from api.langgraph.workflow_type import WorkflowState,Batch -from api.langgraph.context_processor import process_contexts +from api.utils import Utils from api import config DEFAULT_MAX_BATCH_SIZE = 2 @@ -17,7 +17,7 @@ def initialize_workflow(state: WorkflowState) -> WorkflowState: batch_size = init_size - processed_contexts = process_contexts(request.contexts) if request.contexts else None + processed_contexts = Utils.process_contexts(request.contexts) if request.contexts else None for i in range(0, len(texts), batch_size): batch_texts = texts[i : i + batch_size] diff --git a/api/utils.py b/api/utils.py index 05deffd..c16b502 100644 --- a/api/utils.py +++ b/api/utils.py @@ -1,8 +1,111 @@ -from typing import List, Union +import logging +import os +from typing import List, Optional, Union +from io import BytesIO import hashlib +from pypdf import PdfReader +from docx import Document +from api.Assistant.assistant_response_model import ContextRequest +from api.config import get, get_int + class Utils: + ALLOWED_FILE_EXTENSIONS = get("ALLOWED_EXTENSIONS") + MAX_FILE_SIZE_MB = get_int("MAX_FILE_SIZE_MB") + MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 + + @staticmethod def generate_hash_key(payload: List[Union[str, int]]) -> str: params_str = "".join(str(param) for param in payload) hash_value = hashlib.sha256(params_str.encode()).hexdigest() - return hash_value \ No newline at end of file + return hash_value + + @staticmethod + def extract_text_from_pdf(pdf_bytes: BytesIO) -> str: + try: + reader = PdfReader(pdf_bytes) + pages = [] + + for page in reader.pages: + page_text = page.extract_text() + if page_text: + pages.append(page_text.strip()) + + return "\n\n".join(pages).strip() + except Exception as e: + logging.error(f"Failed to extract text from PDF: {e}") + raise + + @staticmethod + def extract_text_from_txt(file_bytes: BytesIO) -> str: + try: + return file_bytes.read().decode('utf-8') + except Exception as e: + logging.error(f"Failed to read text file: {e}") + raise + + @staticmethod + def extract_text_from_docx(file_bytes: BytesIO) -> str: + try: + doc = Document(file_bytes) + paragraphs = [] + + for para in doc.paragraphs: + para_text = para.text.strip() + if para_text: + paragraphs.append(para_text) + + return "\n\n".join(paragraphs).strip() + except Exception as e: + logging.error(f"Failed to extract text from DOCX: {e}") + raise + + @staticmethod + def validate_file(filename: str, file_size: int) -> None: + ext = os.path.splitext(filename.lower())[1] + if ext not in Utils.ALLOWED_FILE_EXTENSIONS: + raise ValueError(f"Unsupported file type: {ext}. Allowed types: {', '.join(Utils.ALLOWED_FILE_EXTENSIONS)}") + + if file_size > Utils.MAX_FILE_SIZE_BYTES: + raise ValueError(f"File size exceeds {Utils.MAX_FILE_SIZE_MB}MB limit") + + @staticmethod + def extract_content_from_file(file_bytes: bytes, filename: str) -> str: + file_stream = BytesIO(file_bytes) + filename_lower = filename.lower() + + if filename_lower.endswith('.pdf'): + return Utils.extract_text_from_pdf(file_stream) + elif filename_lower.endswith(('.txt', '.text')): + return Utils.extract_text_from_txt(file_stream) + elif filename_lower.endswith('.docx'): + return Utils.extract_text_from_docx(file_stream) + else: + raise ValueError(f"Unsupported file type: {filename}") + + @staticmethod + def process_contexts(contexts: List[ContextRequest]) -> Optional[List[str]]: + if not contexts: + return None + + processed_contexts = [] + + for idx, ctx in enumerate(contexts, 1): + try: + if ctx.content: + processed_contexts.append(ctx.content) + + elif ctx.pecha_title and ctx.pecha_text_id: + pecha_context = f"[Pecha: {ctx.pecha_title}, ID: {ctx.pecha_text_id}]" + processed_contexts.append(pecha_context) + + else: + logging.warning(f"Empty context #{idx}, skipping") + + except Exception as e: + error_msg = f"Failed to process context #{idx}: {str(e)}" + logging.error(error_msg) + + if not processed_contexts: + return None + return processed_contexts \ No newline at end of file