genonai · JaeseungYang · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026
diff --git a/build-script/doc-parser-build.config b/build-script/doc-parser-build.config
@@ -6,7 +6,7 @@ DOCKER_REGISTRY=mncregistry:30500
 IMAGE_NAME=doc-parser-preprocessor
 
 # 버전 (git tag, 브랜치 이름, 날짜 등으로 교체 가능)
-IMAGE_VERSION=1.3.0
+IMAGE_VERSION=1.3.5
 
 # 실제 Dockerfile 위치 (루트 기준)
 DOCKERFILE_PATH=genon/preprocessor/docker/Dockerfile

diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List
 
 from docling_core.types.doc import (
     DocItemLabel,
@@ -9,6 +8,7 @@
     NodeItem,
     ProvenanceItem,
     RefItem,
+    RichTableCell,
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
@@ -47,8 +47,8 @@ def __init__(self, options: ReadingOrderOptions):
 
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
-    ) -> List[ReadingOrderPageElement]:
-        elements: List[ReadingOrderPageElement] = []
+    ) -> list[ReadingOrderPageElement]:
+        elements: list[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}
 
         for element in conv_res.assembled.elements:
@@ -103,13 +103,29 @@ def _add_child_elements(
             else:
                 doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
 
-    def _readingorder_elements_to_docling_doc(  # noqa: C901
+    def _create_rich_cell_group(
+        self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
+    ) -> RefItem:
+        """Create a group containing all child elements for a rich table cell."""
+        group_name = f"rich_cell_group_{len(doc.tables)}_0_0"  # uses doc's current table count; "_0_0" is fixed (presumably row/col of the lone cell - confirm)
+        group_element = doc.add_group(
+            label=GroupLabel.UNSPECIFIED,
+            name=group_name,
+            parent=table_item,  # the new group is attached under the table node
+        )
+
+        # Populate the group via _add_child_elements (given the element, the new group, and doc)
+        self._add_child_elements(element, group_element, doc)
+
+        return group_element.get_ref()  # hand back a reference to the new group rather than the group itself
+
+    def _readingorder_elements_to_docling_doc(
         self,
         conv_res: ConversionResult,
-        ro_elements: List[ReadingOrderPageElement],
-        el_to_captions_mapping: Dict[int, List[int]],
-        el_to_footnotes_mapping: Dict[int, List[int]],
-        el_merges_mapping: Dict[int, List[int]],
+        ro_elements: list[ReadingOrderPageElement],
+        el_to_captions_mapping: dict[int, list[int]],
+        el_to_footnotes_mapping: dict[int, list[int]],
+        el_merges_mapping: dict[int, list[int]],
     ) -> DoclingDocument:
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
@@ -197,11 +213,21 @@ def _readingorder_elements_to_docling_doc(  # noqa: C901
                             )
 
             elif isinstance(element, Table):
-                tbl_data = TableData(
-                    num_rows=element.num_rows,
-                    num_cols=element.num_cols,
-                    table_cells=element.table_cells,
-                )
+                # Check if table has no structure prediction
+                if element.num_rows == 0 and element.num_cols == 0:
+                    # Only create 1x1 table if there are children to put in it
+                    if element.cluster.children:
+                        # Create minimal 1x1 table with rich cell containing all children
+                        tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
+                    else:
+                        # Create empty table with no structure
+                        tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
+                else:
+                    tbl_data = TableData(
+                        num_rows=element.num_rows,
+                        num_cols=element.num_cols,
+                        table_cells=element.table_cells,
+                    )
 
                 prov = ProvenanceItem(
                     page_no=element.page_no + 1,
@@ -231,6 +257,30 @@ def _readingorder_elements_to_docling_doc(  # noqa: C901
 
                         tbl.footnotes.append(new_footnote_item.get_ref())
 
+                # Handle case where table has no structure prediction but has children
+                if (
+                    element.num_rows == 0
+                    and element.num_cols == 0
+                    and element.cluster.children
+                ):
+                    # Create rich cell containing all child elements
+                    rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
+
+                    # Create rich table cell spanning the entire 1x1 table
+                    rich_cell = RichTableCell(
+                        text="",  # Empty text since content is in the group
+                        row_span=1,
+                        col_span=1,
+                        start_row_offset_idx=0,
+                        end_row_offset_idx=1,
+                        start_col_offset_idx=0,
+                        end_col_offset_idx=1,
+                        column_header=False,
+                        row_header=False,
+                        ref=rich_cell_ref,
+                    )
+                    out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
+
                 # TODO: Consider adding children of Table.
 
             elif isinstance(element, FigureElement):

diff --git a/genon/preprocessor/facade/README.md b/genon/preprocessor/facade/README.md
@@ -2,7 +2,7 @@
 
 ## 📋 개요
 
-GenOS DI(Document Intelligence)는 다양한 형식의 문서를 벡터 데이터베이스에 저장하기 위해 전처리하는 시스템입니다. 
+GenOS DI(Document Intelligence)는 다양한 형식의 문서를 벡터 데이터베이스에 저장하기 위해 전처리하는 시스템입니다.
 각 문서 타입과 요구사항에 따라 **지능형(Intelligent)** 또는 **기본형(Basic)** 처리 방식을 동적으로 선택할 수 있습니다.
 
 ## 🏗️ 시스템 구조
@@ -97,7 +97,7 @@ processor = create_hybrid_processor()
 ### Enrichment 옵션 (PDF/HWPX) - 지능형 모드
 ```python
 # 기본값이 이미 설정되어 있음
-processor.set_enrichment_options('pdf', 
+processor.set_enrichment_options('pdf',
     enabled=True,                      # 기본값: True
     do_toc_enrichment=True,           # 기본값: True
     extract_metadata=True,             # 기본값: True
@@ -110,7 +110,7 @@ processor.set_enrichment_options('pdf',
 
 # API 설정 (필요시 변경)
 processor.set_enrichment_options('pdf',
-    toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
+    toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
     toc_api_key="your_api_key"
 )
 
@@ -200,8 +200,8 @@ processor.set_processor_option('pdf', 'chunking.max_tokens', 1536)
         "toc_temperature": 0.0,
         "toc_top_p": 0,
         "toc_api_provider": "custom",
-        "toc_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
-        "metadata_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
+        "toc_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
+        "metadata_api_base_url": "http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
         "toc_api_key": "a2ffe48f40ab4cf9a0699deac1c0cb76",
         "metadata_api_key": "a2ffe48f40ab4cf9a0699deac1c0cb76",
         "toc_model": "/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5"
@@ -328,26 +328,26 @@ processor.save_config('project_config.json')
 graph TD
     A[문서 입력] --> B{확장자 확인}
     B --> C{처리 모드}
-    
+
     C -->|지능형 + PDF/HWPX| D[Docling Processor]
     C -->|기본형 + 문서| E[LangChain Processor]
     C -->|오디오| F[Audio Processor]
     C -->|테이블| G[Tabular Processor]
-    
+
     D --> H[Enrichment<br/>기본값 적용]
     H --> I[고급 청킹<br/>max_tokens: 2000]
-    
+
     E --> J[텍스트 추출]
     J --> K[기본 청킹<br/>chunk_size: 1000]
-    
+
     F --> L[Whisper 전사<br/>ko, 30초 단위]
     G --> M[데이터프레임 변환]
-    
+
     I --> N[벡터 메타데이터]
     K --> N
     L --> N
     M --> N
-    
+
     N --> O[Weaviate 저장]
 ```
 

diff --git a/genon/preprocessor/facade/attachment_processor.py b/genon/preprocessor/facade/attachment_processor.py
@@ -1069,7 +1069,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
     async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         document: DoclingDocument = self.load_documents(file_path, **kwargs)
         artifacts_dir, reference_path = self.get_paths(file_path)
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         chunks: list[DocChunk] = self.split_documents(document, **kwargs)
 
@@ -1179,7 +1179,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
     async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         document: DoclingDocument = self.load_documents(file_path, **kwargs)
         artifacts_dir, reference_path = self.get_paths(file_path)
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         chunks: list[DocChunk] = self.split_documents(document, **kwargs)
 

diff --git a/genon/preprocessor/facade/attachment_processor_origin.py b/genon/preprocessor/facade/attachment_processor_origin.py
@@ -1059,7 +1059,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
     async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         document: DoclingDocument = self.load_documents(file_path, **kwargs)
         artifacts_dir, reference_path = self.get_paths(file_path)
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         chunks: list[DocChunk] = self.split_documents(document, **kwargs)
 
@@ -1168,7 +1168,7 @@ async def compose_vectors(self, document: DoclingDocument, chunks: List[DocChunk
     async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         document: DoclingDocument = self.load_documents(file_path, **kwargs)
         artifacts_dir, reference_path = self.get_paths(file_path)
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         chunks: list[DocChunk] = self.split_documents(document, **kwargs)
 

diff --git a/genon/preprocessor/facade/basic_processor.py b/genon/preprocessor/facade/basic_processor.py
@@ -1085,10 +1085,12 @@ def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocume
             # metadata_api_key="9e32423947fd4a5da07a28962fe88487",
 
             # Gemma-3 27B docling, 운영망
-            toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
-            metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
-            toc_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
-            metadata_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
+            # toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
+            # metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
+            toc_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/364/v1/chat/completions",
+            metadata_api_base_url="https://genos.genon.ai:3443/api/gateway/rep/serving/364/v1/chat/completions",
+            toc_api_key="f8f89bb0251b4af8945a240c0783dc9d",
+            metadata_api_key="f8f89bb0251b4af8945a240c0783dc9d",
 
             toc_model="/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5",
             metadata_model="/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5",
@@ -1230,7 +1232,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         else:
             reference_path = artifacts_dir.parent
 
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         document = self.enrichment(document, **kwargs)
 

diff --git a/genon/preprocessor/facade/basic_processor_origin.py b/genon/preprocessor/facade/basic_processor_origin.py
@@ -413,7 +413,7 @@ def _generate_text_from_items_with_headers(self, items: list[DocItem],
                     text_parts.append(item.text)
             elif isinstance(item, PictureItem):
                 text_parts.append("")  # 이미지는 빈 텍스트
-                
+
         result_text = self.delim.join(text_parts)
         return result_text
 
@@ -466,7 +466,7 @@ def _extract_used_headers(self, header_info_list: list[dict]) -> Optional[list[s
         # 모든 헤더 정보를 종합하여 사용되는 헤더들 추출
         all_headers = []
         seen_headers = set()
-        
+
         for header_info in header_info_list:
             if header_info:  # dict가 비어있지 않은 경우
                 for level in sorted(header_info.keys()):
@@ -481,11 +481,11 @@ def _split_table_text(self, table_text: str, max_tokens: int) -> list[str]:
         """테이블 텍스트를 토큰 제한에 맞게 분할 (단순 토큰 수 기준)"""
         if not table_text:
             return [table_text]
-        
+
         # 전체 테이블이 토큰 제한 내인지 확인
         if self._count_tokens(table_text) <= max_tokens:
             return [table_text]
-        
+
         # 단순히 토큰 수 기준으로 텍스트 분할
         # semchunk 사용하여 토큰 제한에 맞게 분할
         chunker = semchunk.chunkerify(self._tokenizer, chunk_size=max_tokens)
@@ -564,7 +564,7 @@ def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument
                     # 테이블 텍스트만 추출하여 분할
                     table_only_text = self._extract_table_text(item, dl_doc)
                     split_tables = self._split_table_text(table_only_text, 4096)
-                    
+
                     # 분할된 각 테이블에 대해 청크 생성
                     for split_table in split_tables:
                         # 기존 _generate_text_from_items_with_headers 함수 활용
@@ -573,7 +573,7 @@ def _split_document_by_tokens(self, doc_chunk: DocChunk, dl_doc: DoclingDocument
                         )
                         # 원본 테이블 텍스트를 분할된 테이블로 교체
                         full_text = full_text.replace(table_only_text, split_table)
-                        
+
                         # 원래 tableitem에 들어갔어야 할 heading 값 유지
                         used_headers = self._extract_used_headers([header_info])
                         result_chunks.append(DocChunk(
@@ -1062,8 +1062,8 @@ def enrichment(self, document: DoclingDocument, **kwargs: dict) -> DoclingDocume
             do_toc_enrichment=True,
             extract_metadata=True,
             toc_api_provider="custom",
-            toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
-            metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/799/v1/chat/completions",
+            toc_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
+            metadata_api_base_url="http://llmops-gateway-api-service:8080/serving/364/1073/v1/chat/completions",
             toc_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
             metadata_api_key="a2ffe48f40ab4cf9a0699deac1c0cb76",
             toc_model="/model/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5",
@@ -1160,7 +1160,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         if ext in ['.pptx', '.docx', '.md']: # pdf 저장 원하는 확장자 추가(pptx, docx, md, xlsx, csv 제공가능)
             convert_to_pdf(file_path)
             pdf_path = _get_pdf_path(file_path)
-            
+
         output_path, output_file = os.path.split(file_path)
         filename, _ = os.path.splitext(output_file)
         artifacts_dir = Path(f"{output_path}/{filename}")
@@ -1169,7 +1169,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         else:
             reference_path = artifacts_dir.parent
 
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         document = self.enrichment(document, **kwargs)
 
@@ -1248,4 +1248,4 @@ def __repr__(self) -> str:
 # GenOS 와의 의존성 제거를 위해 추가
 async def assert_cancelled(request: Request):
     if await request.is_disconnected():
-        raise GenosServiceException(1, f"Cancelled")
+        raise GenosServiceException(1, f"Cancelled")
diff --git a/genon/preprocessor/facade/intelligent_processor.py b/genon/preprocessor/facade/intelligent_processor.py
@@ -1374,7 +1374,7 @@ async def __call__(self, request: Request, file_path: str, **kwargs: dict):
         else:
             reference_path = artifacts_dir.parent
 
-        document = document._with_pictures_refs(image_dir=artifacts_dir, reference_path=reference_path)
+        document = document._with_pictures_refs(image_dir=artifacts_dir, page_no=None, reference_path=reference_path)
 
         document = self.enrichment(document, **kwargs)