Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 68 additions & 16 deletions genon/preprocessor/facade/attachment_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
import uuid
import warnings
from datetime import datetime
import logging
from fastapi import Request

_log = logging.getLogger(__name__)

from glob import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
Expand Down Expand Up @@ -72,6 +76,11 @@
# from utils import assert_cancelled
# from genos_utils import upload_files, merge_overlapping_bboxes

try:
from genos_utils import upload_files
except ImportError:
upload_files = None

# import platform
from pathlib import Path
import os
Expand All @@ -86,7 +95,7 @@
lg = logging.getLogger(n)
lg.setLevel(logging.CRITICAL)
lg.propagate = False
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger().setLevel(logging.WARNING)
# pdf 변환 대상 확장자
CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx']

Expand Down Expand Up @@ -1047,10 +1056,11 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
vectors.append(vector)

chunk_index_on_page += 1
# file_list = self.get_media_files(chunk.meta.doc_items)
# upload_tasks.append(asyncio.create_task(
# upload_files(file_list, request=request)
# ))
if upload_files:
file_list = self.get_media_files(chunk.meta.doc_items)
upload_tasks.append(asyncio.create_task(
upload_files(file_list, request=request)
))

if upload_tasks:
await asyncio.gather(*upload_tasks)
Expand Down Expand Up @@ -1156,10 +1166,11 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
vectors.append(vector)

chunk_index_on_page += 1
# file_list = self.get_media_files(chunk.meta.doc_items)
# upload_tasks.append(asyncio.create_task(
# upload_files(file_list, request=request)
# ))
if upload_files:
file_list = self.get_media_files(chunk.meta.doc_items)
upload_tasks.append(asyncio.create_task(
upload_files(file_list, request=request)
))

if upload_tasks:
await asyncio.gather(*upload_tasks)
Expand Down Expand Up @@ -1228,7 +1239,7 @@ def get_loader(self, file_path: str):
convert_to_pdf(file_path)
# 한국어 OCR 지원을 위한 언어 설정
return UnstructuredImageLoader(
file_path,
file_path,
languages=["kor", "eng"], # 한국어 + 영어 OCR
)
elif ext in ['.txt', '.json', '.md']:
Expand All @@ -1243,7 +1254,7 @@ def get_loader(self, file_path: str):
def get_real_file_type(self, file_path: str) -> str:
"""파일 확장자가 아닌 실제 내용으로 파일 타입 판단"""
with open(file_path, 'rb') as f:
header = f.read(8)
header = f.read(8)
if header.startswith(b'%PDF-'):
return 'pdf'
elif header.startswith(b'\x89PNG'):
Expand Down Expand Up @@ -1290,18 +1301,21 @@ def convert_md_to_pdf(self, md_path):
def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
    """Load *file_path* into Document objects, with an OCR fallback for images.

    For image files where OCR produced no text at all, a minimal placeholder
    document is substituted so downstream chunking has something to work with.
    """
    docs = self.get_loader(file_path).load()

    suffix = os.path.splitext(file_path)[-1].lower()
    if suffix in ('.jpg', '.jpeg', '.png'):
        # True when at least one loaded document carries non-whitespace text.
        has_text = any(doc.page_content.strip() for doc in docs)
        if not docs or not has_text:
            docs = [Document(page_content=".", metadata={'source': file_path, 'page': 0})]

    return docs

def split_documents(self, documents, **kwargs: dict) -> list[Document]:
text_splitter = RecursiveCharacterTextSplitter(**kwargs)
chunk_size = kwargs.get('chunk_size', 1000)
chunk_overlap = kwargs.get('chunk_overlap', 100)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
chunk_overlap=chunk_overlap,)
chunks = text_splitter.split_documents(documents)
chunks = [chunk for chunk in chunks if chunk.page_content]
if not chunks:
Expand Down Expand Up @@ -1385,7 +1399,45 @@ def compose_vectors(self, file_path: str, chunks: list[Document], **kwargs: dict

return vectors

def setup_logging(self, level_num: int):
    """Configure the root logger from a numeric verbosity level.

    Args:
        level_num: 5=DEBUG, 4=INFO, 3=WARNING, 2=ERROR, 1=CRITICAL,
            0=NOLOG (suppress all logging). Unknown values fall back to INFO.
    """
    def get_level_name(level_num: int) -> str:
        # Map the numeric verbosity to a logging level name.
        level_map = {
            5: "DEBUG",
            4: "INFO",
            3: "WARNING",
            2: "ERROR",
            1: "CRITICAL",
            0: "NOLOG"
        }
        return level_map.get(level_num, "INFO")

    level_name = get_level_name(level_num)
    print(f"Setting log level to: {level_name}")

    if level_name == "NOLOG" or not hasattr(logging, level_name):
        logging.disable(logging.CRITICAL)  # suppress every log record
        return

    # BUGFIX: undo a previous NOLOG call. Without this, logging.disable()
    # from an earlier invocation would keep suppressing all records even
    # after a real level is requested.
    logging.disable(logging.NOTSET)

    level = getattr(logging, level_name)

    # Root logger setup; note basicConfig is a no-op when the root logger
    # already has handlers (e.g. when main configured them first).
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        handlers=[logging.StreamHandler()]  # console output
    )

    # Apply the level explicitly even when basicConfig was a no-op.
    logging.getLogger().setLevel(level)

Comment thread
mestanam-mnc marked this conversation as resolved.
async def __call__(self, request: Request, file_path: str, **kwargs: dict):
self.setup_logging(kwargs.get('log_level', 4))

_log.info(f"file_path: {file_path}")
_log.info(f"kwargs: {kwargs}")

ext = os.path.splitext(file_path)[-1].lower()
if ext in ('.wav', '.mp3', '.m4a'):
# Generate a temporal path saving audio chunks: the audio file is supposed to be splited to several chunks due to limitted length by the model
Expand Down Expand Up @@ -1440,7 +1492,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):

elif ext == '.docx':
return await self.docx_processor(request, file_path, **kwargs)

else:
documents: list[Document] = self.load_documents(file_path, **kwargs)
# await assert_cancelled(request)
Expand All @@ -1449,4 +1501,4 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
# await assert_cancelled(request)

vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs)
return vectors
return vectors
19 changes: 11 additions & 8 deletions genon/preprocessor/facade/attachment_processor_origin.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
lg = logging.getLogger(n)
lg.setLevel(logging.CRITICAL)
lg.propagate = False
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger().setLevel(logging.WARNING)
# pdf 변환 대상 확장자
CONVERTIBLE_EXTENSIONS = ['.hwp', '.txt', '.json', '.md', '.ppt', '.pptx', '.docx']

Expand Down Expand Up @@ -1228,7 +1228,7 @@ def get_loader(self, file_path: str):
convert_to_pdf(file_path)
# 한국어 OCR 지원을 위한 언어 설정
return UnstructuredImageLoader(
file_path,
file_path,
languages=["kor", "eng"], # 한국어 + 영어 OCR
)
elif ext in ['.txt', '.json', '.md']:
Expand All @@ -1243,7 +1243,7 @@ def get_loader(self, file_path: str):
def get_real_file_type(self, file_path: str) -> str:
"""파일 확장자가 아닌 실제 내용으로 파일 타입 판단"""
with open(file_path, 'rb') as f:
header = f.read(8)
header = f.read(8)
if header.startswith(b'%PDF-'):
return 'pdf'
elif header.startswith(b'\x89PNG'):
Expand Down Expand Up @@ -1290,18 +1290,21 @@ def convert_md_to_pdf(self, md_path):
def load_documents(self, file_path: str, **kwargs: dict) -> list[Document]:
    """Load *file_path* into Document objects, with an OCR fallback for images.

    If an image file yields no documents, or only documents whose text is
    blank, a single placeholder Document is returned instead so that the
    downstream pipeline is never handed an empty result.
    """
    loader = self.get_loader(file_path)
    documents = loader.load()

    extension = os.path.splitext(file_path)[-1].lower()
    if extension in ('.jpg', '.jpeg', '.png'):
        all_blank = not any(doc.page_content.strip() for doc in documents)
        if not documents or all_blank:
            documents = [Document(page_content=".", metadata={'source': file_path, 'page': 0})]

    return documents

def split_documents(self, documents, **kwargs: dict) -> list[Document]:
text_splitter = RecursiveCharacterTextSplitter(**kwargs)
chunk_size = kwargs.get('chunk_size', 1000)
chunk_overlap = kwargs.get('chunk_overlap', 100)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
chunk_overlap=chunk_overlap,)
chunks = text_splitter.split_documents(documents)
chunks = [chunk for chunk in chunks if chunk.page_content]
if not chunks:
Expand Down Expand Up @@ -1438,7 +1441,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):

elif ext == '.docx':
return await self.docx_processor(request, file_path, **kwargs)

else:
documents: list[Document] = self.load_documents(file_path, **kwargs)
await assert_cancelled(request)
Expand All @@ -1447,4 +1450,4 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
await assert_cancelled(request)

vectors: list[dict] = self.compose_vectors(file_path, chunks, **kwargs)
return vectors
return vectors
58 changes: 51 additions & 7 deletions genon/preprocessor/facade/basic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@

import json
import os
import logging
from pathlib import Path

from collections import defaultdict
from datetime import datetime
from typing import Optional, Iterable, Any, List, Dict, Tuple

from fastapi import Request

_log = logging.getLogger(__name__)

import shutil
import subprocess
import tempfile
Expand Down Expand Up @@ -94,7 +98,10 @@
"`pip install 'docling-core[chunking]'`"
)

# from genos_utils import upload_files
try:
from genos_utils import upload_files
except ImportError:
upload_files = None

# ============================================
#
Expand Down Expand Up @@ -1150,10 +1157,11 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
vectors.append(vector)

chunk_index_on_page += 1
# file_list = self.get_media_files(chunk.meta.doc_items)
# upload_tasks.append(asyncio.create_task(
# upload_files(file_list, request=request)
# ))
if upload_files:
file_list = self.get_media_files(chunk.meta.doc_items)
upload_tasks.append(asyncio.create_task(
upload_files(file_list, request=request)
))

if upload_tasks:
await asyncio.gather(*upload_tasks)
Expand All @@ -1169,9 +1177,45 @@ def get_media_files(self, doc_items: list):
temp_list.append({'path': path, 'name': name})
return temp_list

def setup_logging(self, level_num: int):
    """Configure the root logger from a numeric verbosity level.

    Args:
        level_num: 5=DEBUG, 4=INFO, 3=WARNING, 2=ERROR, 1=CRITICAL,
            0=NOLOG (suppress all logging). Unknown values fall back to INFO.
    """
    def get_level_name(level_num: int) -> str:
        # Map the numeric verbosity to a logging level name.
        level_map = {
            5: "DEBUG",
            4: "INFO",
            3: "WARNING",
            2: "ERROR",
            1: "CRITICAL",
            0: "NOLOG"
        }
        return level_map.get(level_num, "INFO")

    level_name = get_level_name(level_num)
    print(f"Setting log level to: {level_name}")

    if level_name == "NOLOG" or not hasattr(logging, level_name):
        logging.disable(logging.CRITICAL)  # suppress every log record
        return

    # BUGFIX: undo a previous NOLOG call. Without this, logging.disable()
    # from an earlier invocation would keep suppressing all records even
    # after a real level is requested.
    logging.disable(logging.NOTSET)

    level = getattr(logging, level_name)

    # Root logger setup; note basicConfig is a no-op when the root logger
    # already has handlers (e.g. when main configured them first).
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        handlers=[logging.StreamHandler()]  # console output
    )

    # Apply the level explicitly even when basicConfig was a no-op.
    logging.getLogger().setLevel(level)

Comment thread
mestanam-mnc marked this conversation as resolved.
async def __call__(self, request: Request, file_path: str, **kwargs: dict):
# kwargs['save_images'] = True # 이미지 처리
# kwargs['include_wmf'] = True # wmf 처리
self.setup_logging(kwargs.get('log_level', 4))

_log.info(f"file_path: {file_path}")
_log.info(f"kwargs: {kwargs}")

document: DoclingDocument = self.load_documents(file_path, **kwargs)
ext = Path(file_path).suffix.lower()
if ext in ['.pptx', '.docx', '.md']: # pdf 저장 원하는 확장자 추가(pptx, docx, md, xlsx, csv 제공가능)
Expand Down
Loading