Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build-script/doc-parser-build.config
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ DOCKER_REGISTRY=mncregistry:30500
IMAGE_NAME=doc-parser-preprocessor

# 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능)
IMAGE_VERSION=1.3.0
IMAGE_VERSION=1.3.5

# 실제 Dockerfile 위치 (루트 기준)
DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile
Expand Down
76 changes: 63 additions & 13 deletions docling/models/readingorder_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pathlib import Path
from typing import Dict, List

from docling_core.types.doc import (
DocItemLabel,
Expand All @@ -9,6 +8,7 @@
NodeItem,
ProvenanceItem,
RefItem,
RichTableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
Expand Down Expand Up @@ -47,8 +47,8 @@ def __init__(self, options: ReadingOrderOptions):

def _assembled_to_readingorder_elements(
self, conv_res: ConversionResult
) -> List[ReadingOrderPageElement]:
elements: List[ReadingOrderPageElement] = []
) -> list[ReadingOrderPageElement]:
elements: list[ReadingOrderPageElement] = []
page_no_to_pages = {p.page_no: p for p in conv_res.pages}

for element in conv_res.assembled.elements:
Expand Down Expand Up @@ -103,13 +103,29 @@ def _add_child_elements(
else:
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)

def _readingorder_elements_to_docling_doc( # noqa: C901
def _create_rich_cell_group(
self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
) -> RefItem:
"""Create a group containing all child elements for a rich table cell."""
group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

group_name을 생성할 때 _0_0 부분이 하드코딩되어 있어 의미를 파악하기 어렵습니다. 이 값들이 무엇을 나타내는지 주석을 추가하거나, 의미 있는 변수명을 사용하면 코드 가독성을 높일 수 있습니다. 예를 들어, 행/열 인덱스를 나타낸다면 row_index, col_index와 같은 변수를 사용하는 것을 고려해 보세요.

group_element = doc.add_group(
label=GroupLabel.UNSPECIFIED,
name=group_name,
parent=table_item,
)

# Add all child elements to the group
self._add_child_elements(element, group_element, doc)

return group_element.get_ref()

def _readingorder_elements_to_docling_doc(
self,
conv_res: ConversionResult,
ro_elements: List[ReadingOrderPageElement],
el_to_captions_mapping: Dict[int, List[int]],
el_to_footnotes_mapping: Dict[int, List[int]],
el_merges_mapping: Dict[int, List[int]],
ro_elements: list[ReadingOrderPageElement],
el_to_captions_mapping: dict[int, list[int]],
el_to_footnotes_mapping: dict[int, list[int]],
el_merges_mapping: dict[int, list[int]],
) -> DoclingDocument:
id_to_elem = {
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
Expand Down Expand Up @@ -197,11 +213,21 @@ def _readingorder_elements_to_docling_doc( # noqa: C901
)

elif isinstance(element, Table):
tbl_data = TableData(
num_rows=element.num_rows,
num_cols=element.num_cols,
table_cells=element.table_cells,
)
# Check if table has no structure prediction
if element.num_rows == 0 and element.num_cols == 0:
# Only create 1x1 table if there are children to put in it
if element.cluster.children:
# Create minimal 1x1 table with rich cell containing all children
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
else:
# Create empty table with no structure
tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
else:
tbl_data = TableData(
num_rows=element.num_rows,
num_cols=element.num_cols,
table_cells=element.table_cells,
)

prov = ProvenanceItem(
page_no=element.page_no + 1,
Expand Down Expand Up @@ -231,6 +257,30 @@ def _readingorder_elements_to_docling_doc( # noqa: C901

tbl.footnotes.append(new_footnote_item.get_ref())

# Handle case where table has no structure prediction but has children
if (
element.num_rows == 0
and element.num_cols == 0
and element.cluster.children
):
# Create rich cell containing all child elements
rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)

# Create rich table cell spanning the entire 1x1 table
rich_cell = RichTableCell(
text="", # Empty text since content is in the group
row_span=1,
col_span=1,
start_row_offset_idx=0,
end_row_offset_idx=1,
start_col_offset_idx=0,
end_col_offset_idx=1,
column_header=False,
row_header=False,
ref=rich_cell_ref,
)
out_doc.add_table_cell(table_item=tbl, cell=rich_cell)

# TODO: Consider adding children of Table when a structure prediction exists (the no-structure case is handled above).

elif isinstance(element, FigureElement):
Expand Down
22 changes: 11 additions & 11 deletions genon/preprocessor/facade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## 📋 개요

GenOS DI(Document Intelligence)는 다양한 형식의 문서를 벡터 데이터베이스에 저장하기 위해 전처리하는 시스템입니다.
GenOS DI(Document Intelligence)는 다양한 형식의 문서를 벡터 데이터베이스에 저장하기 위해 전처리하는 시스템입니다.
각 문서 타입과 요구사항에 따라 **지능형(Intelligent)** 또는 **기본형(Basic)** 처리 방식을 동적으로 선택할 수 있습니다.

## 🏗️ 시스템 구조
Expand Down Expand Up @@ -97,7 +97,7 @@ processor = create_hybrid_processor()
### Enrichment 옵션 (PDF/HWPX) - 지능형 모드
```python
# 기본값이 이미 설정되어 있음
processor.set_enrichment_options('pdf',
processor.set_enrichment_options('pdf',
enabled=True, # 기본값: True
do_toc_enrichment=True, # 기본값: True
extract_metadata=True, # 기본값: True
Expand All @@ -110,7 +110,7 @@ processor.set_enrichment_options('pdf',

# API 설정 (필요시 변경)
processor.set_enrichment_options('pdf',
toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
toc_api_key="your_api_key"
)

Expand Down Expand Up @@ -200,8 +200,8 @@ processor.set_processor_option('pdf', 'chunking.max_tokens', 1536)
"toc_temperature": 0.0,
"toc_top_p": 0,
"toc_api_provider": "custom",
"toc_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
"metadata_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
"toc_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
"metadata_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
"toc_api_key": "your_api_key",
"metadata_api_key": "your_api_key",
"toc_model": "/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5"
Expand Down Expand Up @@ -328,26 +328,26 @@ processor.save_config('project_config.json')
graph TD
A[문서 입력] --> B{확장자 확인}
B --> C{처리 모드}

C -->|지능형 + PDF/HWPX| D[Docling Processor]
C -->|기본형 + 문서| E[LangChain Processor]
C -->|오디오| F[Audio Processor]
C -->|테이블| G[Tabular Processor]

D --> H[Enrichment<br/>기본값 적용]
H --> I[고급 청킹<br/>max_tokens: 2000]

E --> J[텍스트 추출]
J --> K[기본 청킹<br/>chunk_size: 1000]

F --> L[Whisper 전사<br/>ko, 30초 단위]
G --> M[데이터프레임 변환]

I --> N[벡터 메타데이터]
K --> N
L --> N
M --> N

N --> O[Weaviate 저장]
```

Expand Down
4 changes: 2 additions & 2 deletions genon/preprocessor/facade/attachment_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
async def __call__(self, request: Request, file_path: str, **kwargs: dict):
document: DoclingDocument = self.load_documents(file_path, **kwargs)
artifacts_dir, reference_path = self.get_paths(file_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

chunks: list[DocChunk] = self.split_documents(document, **kwargs)

Expand Down Expand Up @@ -1179,7 +1179,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
async def __call__(self, request: Request, file_path: str, **kwargs: dict):
document: DoclingDocument = self.load_documents(file_path, **kwargs)
artifacts_dir, reference_path = self.get_paths(file_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

chunks: list[DocChunk] = self.split_documents(document, **kwargs)

Expand Down
4 changes: 2 additions & 2 deletions genon/preprocessor/facade/attachment_processor_origin.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,7 +1059,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
async def __call__(self, request: Request, file_path: str, **kwargs: dict):
document: DoclingDocument = self.load_documents(file_path, **kwargs)
artifacts_dir, reference_path = self.get_paths(file_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

chunks: list[DocChunk] = self.split_documents(document, **kwargs)

Expand Down Expand Up @@ -1168,7 +1168,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
async def __call__(self, request: Request, file_path: str, **kwargs: dict):
document: DoclingDocument = self.load_documents(file_path, **kwargs)
artifacts_dir, reference_path = self.get_paths(file_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

chunks: list[DocChunk] = self.split_documents(document, **kwargs)

Expand Down
12 changes: 7 additions & 5 deletions genon/preprocessor/facade/basic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,10 +1085,12 @@ def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocume
# metadata_api_key="9e32423947fd4a5da07a28962fe88487",

# Gemma-3 27B docling, 운영망
toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
toc_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
metadata_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
# toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
# metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
toc_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/364/v1/chat/completions",
metadata_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/364/v1/chat/completions",
toc_api_key="f8f89bb0251b4af8945a240c0783dc9d",
metadata_api_key="f8f89bb0251b4af8945a240c0783dc9d",

toc_model="/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5",
metadata_model="/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5",
Expand Down Expand Up @@ -1230,7 +1232,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
else:
reference_path = artifacts_dir.parent

document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

document = self.enrichment(document, **kwargs)

Expand Down
22 changes: 11 additions & 11 deletions genon/preprocessor/facade/basic_processor_origin.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def _generate_text_from_items_with_headers(self, items: list[DocItem],
text_parts.append(item.text)
elif isinstance(item, PictureItem):
text_parts.append("") # 이미지는 빈 텍스트

result_text = self.delim.join(text_parts)
return result_text

Expand Down Expand Up @@ -466,7 +466,7 @@ def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[s
# 모든 헤더 정보를 종합하여 사용되는 헤더들 추출
all_headers = []
seen_headers = set()

for header_info in header_info_list:
if header_info: # dict가 비어있지 않은 경우
for level in sorted(header_info.keys()):
Expand All @@ -481,11 +481,11 @@ def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]:
"""테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)"""
if not table_text:
return [table_text]

# 전체 테이블이 토큰 제한 내인지 확인
if self._count_tokens(table_text) <= max_tokens:
return [table_text]

# 단순히 토큰 수 기준으로 텍스트 분할
# semchunk 사용하여 토큰 제한에 맞게 분할
chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens)
Expand Down Expand Up @@ -564,7 +564,7 @@ def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument
# 테이블 텍스트만 추출하여 분할
table_only_text = self._extract_table_text(item, dl_doc)
split_tables = self._split_table_text(table_only_text, 4096)

# 분할된 각 테이블에 대해 청크 생성
for split_table in split_tables:
# 기존 _generate_text_from_items_with_headers 함수 활용
Expand All @@ -573,7 +573,7 @@ def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument
)
# 원본 테이블 텍스트를 분할된 테이블로 교체
full_text = full_text.replace(table_only_text, split_table)

# 원래 tableitem에 들어갔어야 할 heading 값 유지
used_headers = self._extract_used_headers([header_info])
result_chunks.append(DocChunk(
Expand Down Expand Up @@ -1062,8 +1062,8 @@ def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocume
do_toc_enrichment=True,
extract_metadata=True,
toc_api_provider="custom",
toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
toc_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
metadata_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
toc_model="/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5",
Expand Down Expand Up @@ -1160,7 +1160,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
if ext in ['.pptx', '.docx', '.md']: # pdf 저장 원하는 확장자 추가(pptx, docx, md, xlsx, csv 제공가능)
convert_to_pdf(file_path)
pdf_path = _get_pdf_path(file_path)

output_path, output_file = os.path.split(file_path)
filename, _ = os.path.splitext(output_file)
artifacts_dir = Path(f"{output_path}/{filename}")
Expand All @@ -1169,7 +1169,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
else:
reference_path = artifacts_dir.parent

document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

document = self.enrichment(document, **kwargs)

Expand Down Expand Up @@ -1248,4 +1248,4 @@ def __repr__(self) -> str:
# GenOS 와의 의존성 제거를 위해 추가
async def assert_cancelled(request: Request):
if await request.is_disconnected():
raise GenosServiceException(1, f"Cancelled")
raise GenosServiceException(1, f"Cancelled")
2 changes: 1 addition & 1 deletion genon/preprocessor/facade/intelligent_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1374,7 +1374,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
else:
reference_path = artifacts_dir.parent

document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)

document = self.enrichment(document, **kwargs)

Expand Down
Loading