From 1814ca655f2e2cc483b0f5bd98aff8107a1d4fef Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 8 Mar 2026 11:52:30 +0800 Subject: [PATCH 01/14] Integrate litellm for multi-provider LLM support --- pageindex/config.yaml | 2 +- pageindex/page_index.py | 36 +++++++-------- pageindex/utils.py | 99 +++++++++++++++++------------------------ requirements.txt | 2 +- 4 files changed, 60 insertions(+), 79 deletions(-) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index fd73e3a2..177affd1 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,4 +1,4 @@ -model: "gpt-4o-2024-11-20" +model: "anthropic/claude-haiku-4-5-20251001" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 20000 diff --git a/pageindex/page_index.py b/pageindex/page_index.py index d646bb9d..a153ef90 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = await ChatGPT_API_async(model=model, prompt=prompt) + response = await allm_complete(model=model, prompt=prompt) response = extract_json(response) if 'answer' in response: answer = response['answer'] @@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N }} Directly return the final JSON structure. Do not output anything else.""" - response = await ChatGPT_API_async(model=model, prompt=prompt) + response = await allm_complete(model=model, prompt=prompt) response = extract_json(response) if logger: logger.info(f"Response: {response}") @@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None): Directly return the final JSON structure. Do not output anything else. Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_complete(model=model, prompt=prompt) # print('response', response) json_content = extract_json(response) return json_content['toc_detected'] @@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -165,7 +165,7 @@ def extract_toc_content(content, model=None): Directly return the full table of contents content. 
Do not output anything else.""" - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) if_complete = check_if_toc_transformation_is_complete(content, response, model) if if_complete == "yes" and finish_reason == "finished": @@ -176,7 +176,7 @@ def extract_toc_content(content, model=None): {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) + new_response, finish_reason = llm_complete(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) @@ -193,7 +193,7 @@ def extract_toc_content(content, model=None): {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) + new_response, finish_reason = llm_complete(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) @@ -215,7 +215,7 @@ def detect_page_index(toc_content, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = ChatGPT_API(model=model, prompt=prompt) + response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return json_content['page_index_given_in_toc'] @@ -263,8 +263,8 @@ def toc_index_extractor(toc, content, model=None): If the section is not in the provided pages, do not add the physical_index to it. Directly return the final JSON structure. Do not output anything else.""" - prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content - response = ChatGPT_API(model=model, prompt=prompt) + prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content + response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return json_content @@ -292,7 +292,7 @@ def toc_transformer(toc_content, model=None): Directly return the final JSON structure, do not output anything else. 
""" prompt = init_prompt + '\n Given table of contents\n:' + toc_content - last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + last_complete, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) if if_complete == "yes" and finish_reason == "finished": last_complete = extract_json(last_complete) @@ -316,7 +316,7 @@ def toc_transformer(toc_content, model=None): Please continue the json structure, directly output the remaining part of the json structure.""" - new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + new_complete, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) if new_complete.startswith('```json'): new_complete = get_json_content(new_complete) @@ -477,7 +477,7 @@ def add_page_number_to_toc(part, structure, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n" - current_json_raw = ChatGPT_API(model=model, prompt=prompt) + current_json_raw = llm_complete(model=model, prompt=prompt) json_result = extract_json(current_json_raw) for item in json_result: @@ -527,7 +527,7 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"): Directly return the additional part of the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2) - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) if finish_reason == 'finished': return extract_json(response) else: @@ -561,7 +561,7 @@ def generate_toc_init(part, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) if finish_reason == 'finished': return extract_json(response) @@ -745,8 +745,8 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20 } Directly return the final JSON structure. 
Do not output anything else.""" - prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content - response = ChatGPT_API(model=model, prompt=prompt) + prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content + response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return convert_physical_index_to_int(json_content['physical_index']) diff --git a/pageindex/utils.py b/pageindex/utils.py index 3517ab80..90efb2e7 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -1,5 +1,4 @@ -import tiktoken -import openai +import litellm import logging import os from datetime import datetime @@ -17,96 +16,79 @@ from pathlib import Path from types import SimpleNamespace as config -CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") +# Backward compatibility: support CHATGPT_API_KEY as alias for OPENAI_API_KEY +if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): + os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY") + +litellm.drop_params = True + def count_tokens(text, model=None): if not text: return 0 - enc = tiktoken.encoding_for_model(model) - tokens = enc.encode(text) - return len(tokens) + return litellm.token_counter(model=model, text=text) + -def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): +def llm_complete(model, prompt, chat_history=None, return_finish_reason=False): max_retries = 10 - client = openai.OpenAI(api_key=api_key) + messages = list(chat_history) + [{"role": "user", "content": prompt}] if chat_history else [{"role": "user", "content": prompt}] for i in range(max_retries): try: - if chat_history: - messages = chat_history - messages.append({"role": "user", "content": prompt}) - else: - messages = [{"role": "user", "content": prompt}] - - response = client.chat.completions.create( + response = litellm.completion( model=model, messages=messages, temperature=0, ) - if response.choices[0].finish_reason == "length": - return response.choices[0].message.content, "max_output_reached" - else: - return response.choices[0].message.content, "finished" - + content = response.choices[0].message.content + if return_finish_reason: + finish_reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished" + return content, finish_reason + return content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) # Wait for 1秒 before retrying + time.sleep(1) else: logging.error('Max retries reached for prompt: ' + prompt) return "", "error" +def llm_complete_stream(model, prompt): + """Return a generator that yields token chunks (str) one at a time.""" + response = litellm.completion( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0, + stream=True, + ) + for chunk in response: + delta = chunk.choices[0].delta.content + if delta: + yield delta + -def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): +async def allm_complete(model, prompt): max_retries = 10 - client = openai.OpenAI(api_key=api_key) + messages = [{"role": "user", "content": prompt}] for i in range(max_retries): try: - if chat_history: - messages = chat_history - messages.append({"role": "user", "content": prompt}) - else: - messages = [{"role": "user", "content": prompt}] - - response = client.chat.completions.create( + response = await asyncio.to_thread( + litellm.completion, 
model=model, messages=messages, temperature=0, ) - return response.choices[0].message.content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") if i < max_retries - 1: - time.sleep(1) # Wait for 1秒 before retrying + await asyncio.sleep(1) else: logging.error('Max retries reached for prompt: ' + prompt) return "Error" - -async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY): - max_retries = 10 - messages = [{"role": "user", "content": prompt}] - for i in range(max_retries): - try: - async with openai.AsyncOpenAI(api_key=api_key) as client: - response = await client.chat.completions.create( - model=model, - messages=messages, - temperature=0, - ) - return response.choices[0].message.content - except Exception as e: - print('************* Retrying *************') - logging.error(f"Error: {e}") - if i < max_retries - 1: - await asyncio.sleep(1) # Wait for 1s before retrying - else: - logging.error('Max retries reached for prompt: ' + prompt) - return "Error" - def get_json_content(response): start_idx = response.find("```json") @@ -411,14 +393,13 @@ def add_preface_if_needed(data): def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): - enc = tiktoken.encoding_for_model(model) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] page_text = page.extract_text() - token_length = len(enc.encode(page_text)) + token_length = litellm.token_counter(model=model, text=page_text) page_list.append((page_text, token_length)) return page_list elif pdf_parser == "PyMuPDF": @@ -430,7 +411,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): page_list = [] for page in doc: page_text = page.get_text() - token_length = len(enc.encode(page_text)) + token_length = litellm.token_counter(model=model, text=page_text) page_list.append((page_text, token_length)) return page_list else: @@ -609,7 +590,7 @@ async def generate_node_summary(node, model=None): Directly return the description, do not include any other text. """ - response = await ChatGPT_API_async(model, prompt) + response = await allm_complete(model, prompt) return response @@ -654,7 +635,7 @@ def generate_doc_description(structure, model=None): Directly return the description, do not include any other text. 
""" - response = ChatGPT_API(model, prompt) + response = llm_complete(model, prompt) return response diff --git a/requirements.txt b/requirements.txt index 463db58f..31e4f164 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ openai==1.101.0 +litellm pymupdf==1.26.4 PyPDF2==3.0.1 python-dotenv==1.1.0 -tiktoken==0.11.0 pyyaml==6.0.2 From 769fbd59b5f4a0be59d90ef97ef45977bb338c86 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 8 Mar 2026 11:55:09 +0800 Subject: [PATCH 02/14] recover the default config yaml --- pageindex/config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index 177affd1..1e2798cb 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,4 +1,5 @@ -model: "anthropic/claude-haiku-4-5-20251001" +model: "gpt-4o-2024-11-20" +# model: "anthropic/claude-haiku-4-5-20251001" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 20000 From 58c289d61c1d9f434c2369587f37e26e163581c2 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 16 Mar 2026 18:58:05 +0800 Subject: [PATCH 03/14] Use litellm.acompletion for native async support --- pageindex/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pageindex/utils.py b/pageindex/utils.py index 90efb2e7..622fb1ca 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -73,8 +73,7 @@ async def allm_complete(model, prompt): messages = [{"role": "user", "content": prompt}] for i in range(max_retries): try: - response = await asyncio.to_thread( - litellm.completion, + response = await litellm.acompletion( model=model, messages=messages, temperature=0, From 36d87808d225a631d23c6784c6d5120aa5b066a3 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 16 Mar 2026 19:09:19 +0800 Subject: [PATCH 04/14] fix tob --- pageindex/page_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index a153ef90..2f0f91aa 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -263,7 +263,7 @@ def toc_index_extractor(toc, content, model=None): If the section is not in the provided pages, do not add the physical_index to it. Directly return the final JSON structure. Do not output anything else.""" - prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content + prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return json_content @@ -745,7 +745,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20 } Directly return the final JSON structure. 
Do not output anything else.""" - prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content + prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content response = llm_complete(model=model, prompt=prompt) json_content = extract_json(response) return convert_physical_index_to_int(json_content['physical_index']) From 57803765d61e059b167b015fdc57a4f6a1625710 Mon Sep 17 00:00:00 2001 From: mountain Date: Wed, 18 Mar 2026 18:11:48 +0800 Subject: [PATCH 05/14] Rename llm_complete/allm_complete to llm_completion/llm_acompletion, remove unused llm_complete_stream --- pageindex/page_index.py | 32 ++++++++++++++++---------------- pageindex/utils.py | 21 ++++----------------- 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 2f0f91aa..30e3069f 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = await allm_complete(model=model, prompt=prompt) + response = await llm_acompletion(model=model, prompt=prompt) response = extract_json(response) if 'answer' in response: answer = response['answer'] @@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N }} Directly return the final JSON structure. Do not output anything else.""" - response = await allm_complete(model=model, prompt=prompt) + response = await llm_acompletion(model=model, prompt=prompt) response = extract_json(response) if logger: logger.info(f"Response: {response}") @@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None): Directly return the final JSON structure. Do not output anything else. Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" - response = llm_complete(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) # print('response', response) json_content = extract_json(response) return json_content['toc_detected'] @@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc - response = llm_complete(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc - response = llm_complete(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content['completed'] @@ -165,7 +165,7 @@ def extract_toc_content(content, model=None): Directly return the full table of contents content. 
Do not output anything else.""" - response, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) + response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if_complete = check_if_toc_transformation_is_complete(content, response, model) if if_complete == "yes" and finish_reason == "finished": @@ -176,7 +176,7 @@ def extract_toc_content(content, model=None): {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = llm_complete(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) + new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) @@ -193,7 +193,7 @@ def extract_toc_content(content, model=None): {"role": "assistant", "content": response}, ] prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" - new_response, finish_reason = llm_complete(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) + new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True) response = response + new_response if_complete = check_if_toc_transformation_is_complete(content, response, model) @@ -215,7 +215,7 @@ def detect_page_index(toc_content, model=None): }} Directly return the final JSON structure. Do not output anything else.""" - response = llm_complete(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content['page_index_given_in_toc'] @@ -264,7 +264,7 @@ def toc_index_extractor(toc, content, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content - response = llm_complete(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return json_content @@ -292,7 +292,7 @@ def toc_transformer(toc_content, model=None): Directly return the final JSON structure, do not output anything else. """ prompt = init_prompt + '\n Given table of contents\n:' + toc_content - last_complete, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) + last_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) if if_complete == "yes" and finish_reason == "finished": last_complete = extract_json(last_complete) @@ -316,7 +316,7 @@ def toc_transformer(toc_content, model=None): Please continue the json structure, directly output the remaining part of the json structure.""" - new_complete, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) + new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if new_complete.startswith('```json'): new_complete = get_json_content(new_complete) @@ -477,7 +477,7 @@ def add_page_number_to_toc(part, structure, model=None): Directly return the final JSON structure. 
Do not output anything else.""" prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n" - current_json_raw = llm_complete(model=model, prompt=prompt) + current_json_raw = llm_completion(model=model, prompt=prompt) json_result = extract_json(current_json_raw) for item in json_result: @@ -527,7 +527,7 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"): Directly return the additional part of the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2) - response, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) + response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if finish_reason == 'finished': return extract_json(response) else: @@ -561,7 +561,7 @@ def generate_toc_init(part, model=None): Directly return the final JSON structure. Do not output anything else.""" prompt = prompt + '\nGiven text\n:' + part - response, finish_reason = llm_complete(model=model, prompt=prompt, return_finish_reason=True) + response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True) if finish_reason == 'finished': return extract_json(response) @@ -746,7 +746,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20 Directly return the final JSON structure. Do not output anything else.""" prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content - response = llm_complete(model=model, prompt=prompt) + response = llm_completion(model=model, prompt=prompt) json_content = extract_json(response) return convert_physical_index_to_int(json_content['physical_index']) diff --git a/pageindex/utils.py b/pageindex/utils.py index 622fb1ca..7ca44731 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -29,7 +29,7 @@ def count_tokens(text, model=None): return litellm.token_counter(model=model, text=text) -def llm_complete(model, prompt, chat_history=None, return_finish_reason=False): +def llm_completion(model, prompt, chat_history=None, return_finish_reason=False): max_retries = 10 messages = list(chat_history) + [{"role": "user", "content": prompt}] if chat_history else [{"role": "user", "content": prompt}] for i in range(max_retries): @@ -54,21 +54,8 @@ def llm_complete(model, prompt, chat_history=None, return_finish_reason=False): return "", "error" -def llm_complete_stream(model, prompt): - """Return a generator that yields token chunks (str) one at a time.""" - response = litellm.completion( - model=model, - messages=[{"role": "user", "content": prompt}], - temperature=0, - stream=True, - ) - for chunk in response: - delta = chunk.choices[0].delta.content - if delta: - yield delta - -async def allm_complete(model, prompt): +async def llm_acompletion(model, prompt): max_retries = 10 messages = [{"role": "user", "content": prompt}] for i in range(max_retries): @@ -589,7 +576,7 @@ async def generate_node_summary(node, model=None): Directly return the description, do not include any other text. """ - response = await allm_complete(model, prompt) + response = await llm_acompletion(model, prompt) return response @@ -634,7 +621,7 @@ def generate_doc_description(structure, model=None): Directly return the description, do not include any other text. 
""" - response = llm_complete(model, prompt) + response = llm_completion(model, prompt) return response From 847782ccb4ae80f5295a872c7198814c5e7b11ea Mon Sep 17 00:00:00 2001 From: mountain Date: Wed, 18 Mar 2026 18:12:09 +0800 Subject: [PATCH 06/14] Pin litellm to version 1.82.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 31e4f164..f323fce0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ openai==1.101.0 -litellm +litellm==1.82.0 pymupdf==1.26.4 PyPDF2==3.0.1 python-dotenv==1.1.0 From 92845aa2e87a79bf58d8402523fd9c351e3d599d Mon Sep 17 00:00:00 2001 From: mountain Date: Wed, 18 Mar 2026 19:16:44 +0800 Subject: [PATCH 07/14] resolve comments --- pageindex/config.yaml | 2 +- pageindex/page_index.py | 4 ++-- pageindex/utils.py | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index 1e2798cb..aa60a1f9 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,5 +1,5 @@ model: "gpt-4o-2024-11-20" -# model: "anthropic/claude-haiku-4-5-20251001" +# model: "anthropic/claude-sonnet-4-6" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 20000 diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 30e3069f..b9b4a157 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -499,7 +499,7 @@ def remove_first_physical_index_section(text): return text ### add verify completeness -def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"): +def generate_toc_continue(toc_content, part, model=None): print('start generate_toc_continue') prompt = """ You are an expert in extracting hierarchical tree structure. @@ -732,7 +732,7 @@ def check_toc(page_list, opt=None): ################### fix incorrect toc ######################################################### -def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"): +def single_toc_item_index_fixer(section_title, content, model=None): toc_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. 

diff --git a/pageindex/utils.py b/pageindex/utils.py
index 7ca44731..f3d6d51e 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -22,7 +22,6 @@
 
 litellm.drop_params = True
 
-
 def count_tokens(text, model=None):
     if not text:
         return 0
@@ -51,7 +50,9 @@ def llm_completion(model, prompt, chat_history=None, return_finish_reason=False)
             time.sleep(1)
         else:
             logging.error('Max retries reached for prompt: ' + prompt)
-            return "", "error"
+            if return_finish_reason:
+                return "", "error"
+            return ""
 
 
From 6bda6e700fa0b35990ef3d44c29ab1cd194bef2c Mon Sep 17 00:00:00 2001
From: mountain
Date: Wed, 18 Mar 2026 19:25:14 +0800
Subject: [PATCH 08/14] CLI args override config.yaml defaults

---
 run_pageindex.py | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/run_pageindex.py b/run_pageindex.py
index 10702450..6f1a1acb 100644
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -10,22 +10,22 @@
     parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
     parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
 
-    parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
+    parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)')
 
-    parser.add_argument('--toc-check-pages', type=int, default=20,
+    parser.add_argument('--toc-check-pages', type=int, default=None,
                         help='Number of pages to check for table of contents (PDF only)')
-    parser.add_argument('--max-pages-per-node', type=int, default=10,
+    parser.add_argument('--max-pages-per-node', type=int, default=None,
                         help='Maximum number of pages per node (PDF only)')
-    parser.add_argument('--max-tokens-per-node', type=int, default=20000,
+    parser.add_argument('--max-tokens-per-node', type=int, default=None,
                         help='Maximum number of tokens per node (PDF only)')
 
-    parser.add_argument('--if-add-node-id', type=str, default='yes',
+    parser.add_argument('--if-add-node-id', type=str, default=None,
                         help='Whether to add node id to the node')
-    parser.add_argument('--if-add-node-summary', type=str, default='yes',
+    parser.add_argument('--if-add-node-summary', type=str, default=None,
                         help='Whether to add summary to the node')
-    parser.add_argument('--if-add-doc-description', type=str, default='no',
+    parser.add_argument('--if-add-doc-description', type=str, default=None,
                         help='Whether to add doc description to the doc')
-    parser.add_argument('--if-add-node-text', type=str, default='no',
+    parser.add_argument('--if-add-node-text', type=str, default=None,
                         help='Whether to add text to the node')
 
     # Markdown specific arguments
@@ -51,17 +51,18 @@
         raise ValueError(f"PDF file not found: {args.pdf_path}")
 
     # Process PDF file
-    # Configure options
-    opt = config(
-        model=args.model,
-        toc_check_page_num=args.toc_check_pages,
-        max_page_num_each_node=args.max_pages_per_node,
-        max_token_num_each_node=args.max_tokens_per_node,
-        if_add_node_id=args.if_add_node_id,
-        if_add_node_summary=args.if_add_node_summary,
-        if_add_doc_description=args.if_add_doc_description,
-        if_add_node_text=args.if_add_node_text
-    )
+    from pageindex.utils import ConfigLoader
+    user_opt = {
+        'model': args.model,
+        'toc_check_page_num': args.toc_check_pages,
+        'max_page_num_each_node': args.max_pages_per_node,
+        'max_token_num_each_node': args.max_tokens_per_node,
+        'if_add_node_id': args.if_add_node_id,
+        'if_add_node_summary': args.if_add_node_summary,
+        'if_add_doc_description': args.if_add_doc_description,
+        'if_add_node_text': 
args.if_add_node_text, + } + opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) # Process the PDF toc_with_page_number = page_index_main(args.pdf_path, opt) From 37f38eeebc70edf0a6b19e98de302d9cdc168b83 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 19 Mar 2026 19:21:55 +0800 Subject: [PATCH 09/14] Fix get_page_tokens hardcoded model default Pass opt.model to get_page_tokens so tokenization respects the configured model instead of always using gpt-4o-2024-11-20. Co-Authored-By: Claude Sonnet 4.6 --- pageindex/page_index.py | 2 +- pageindex/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index b9b4a157..2fa2b90b 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1069,7 +1069,7 @@ def page_index_main(doc, opt=None): raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc) + page_list = get_page_tokens(doc, model=opt.model) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) diff --git a/pageindex/utils.py b/pageindex/utils.py index f3d6d51e..f594cd83 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -379,7 +379,7 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] From 88415f10b511d122fdfa96f69cf075890a01c4ce Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 20 Mar 2026 14:03:09 +0800 Subject: [PATCH 10/14] Remove explicit openai dependency from requirements.txt openai is no longer directly imported; it comes in as a transitive dependency of litellm. Pinning it explicitly risks version conflicts. Co-Authored-By: Claude Sonnet 4.6 --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f323fce0..3b82eda0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -openai==1.101.0 litellm==1.82.0 pymupdf==1.26.4 PyPDF2==3.0.1 From 4e1db6c31b6ca7e5fa441bbe908d90046a67d0e1 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 20 Mar 2026 14:07:19 +0800 Subject: [PATCH 11/14] Restore openai==1.101.0 pin in requirements.txt litellm==1.82.0 and openai-agents have conflicting openai version requirements, but openai==1.101.0 works at runtime for both. The pin is necessary to prevent litellm from pulling in openai>=2.x which would break openai-agents. Co-Authored-By: Claude Sonnet 4.6 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 3b82eda0..f323fce0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +openai==1.101.0 litellm==1.82.0 pymupdf==1.26.4 PyPDF2==3.0.1 From 3b88cec89436799c251d9f885003e33b094be9fe Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 20 Mar 2026 14:44:37 +0800 Subject: [PATCH 12/14] Remove explicit openai dependency from requirements.txt openai is not directly used; it comes in as a transitive dependency of litellm. No openai-agents in this branch so no pin needed. 
Co-Authored-By: Claude Sonnet 4.6
---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f323fce0..3b82eda0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-openai==1.101.0
 litellm==1.82.0
 pymupdf==1.26.4
 PyPDF2==3.0.1

From 025b0e2529bf5889bc2611f30d3f7586b24a7132 Mon Sep 17 00:00:00 2001
From: mountain
Date: Fri, 20 Mar 2026 15:45:11 +0800
Subject: [PATCH 13/14] Fix a litellm error log: make single_toc_item_index_fixer async

---
 pageindex/page_index.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 2fa2b90b..1687a4e5 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -732,7 +732,7 @@ def check_toc(page_list, opt=None):
 
 
 ################### fix incorrect toc #########################################################
-def single_toc_item_index_fixer(section_title, content, model=None):
+async def single_toc_item_index_fixer(section_title, content, model=None):
     toc_extractor_prompt = """
     You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
 
@@ -746,7 +746,7 @@ def single_toc_item_index_fixer(section_title, content, model=None):
     Directly return the final JSON structure. Do not output anything else."""
     prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
-    response = llm_completion(model=model, prompt=prompt)
+    response = await llm_acompletion(model=model, prompt=prompt)
     json_content = extract_json(response)
     return convert_physical_index_to_int(json_content['physical_index'])
 
@@ -815,7 +815,7 @@ async def process_and_check_item(incorrect_item):
             continue
 
         content_range = ''.join(page_contents)
-        physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
+        physical_index_int = await single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
 
         # Check if the result is correct
         check_item = incorrect_item.copy()

From c72a7b7861f31a5d960eae49bc67d1f06833a17a Mon Sep 17 00:00:00 2001
From: mountain
Date: Fri, 20 Mar 2026 16:09:04 +0800
Subject: [PATCH 14/14] Resolve review comments

---
 pageindex/page_index.py | 5 +++++
 pageindex/utils.py      | 4 ++--
 run_pageindex.py        | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 1687a4e5..71925546 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -300,7 +300,12 @@ def toc_transformer(toc_content, model=None):
         return cleaned_response
 
     last_complete = get_json_content(last_complete)
+    attempt = 0
+    max_attempts = 5
     while not (if_complete == "yes" and finish_reason == "finished"):
+        attempt += 1
+        if attempt > max_attempts:
+            raise Exception('Failed to complete toc transformation after maximum retries')
         position = last_complete.rfind('}')
         if position != -1:
             last_complete = last_complete[:position+2]

diff --git a/pageindex/utils.py b/pageindex/utils.py
index f594cd83..57b69c5b 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -74,7 +74,7 @@ async def llm_acompletion(model, prompt):
             await asyncio.sleep(1)
         else:
             logging.error('Max retries reached for prompt: ' + prompt)
-            return "Error"
+            return ""
 
 
 def get_json_content(response):
@@ -501,7 +501,7 @@ def remove_structure_text(data):
 def check_token_limit(structure, limit=110000):
     list = structure_to_list(structure)
     for node in list:
-        num_tokens = count_tokens(node['text'], 
model='gpt-4o') + num_tokens = count_tokens(node['text'], model=None) if num_tokens > limit: print(f"Node ID: {node['node_id']} has {num_tokens} tokens") print("Start Index:", node['start_index']) diff --git a/run_pageindex.py b/run_pageindex.py index 6f1a1acb..673439d8 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -3,6 +3,7 @@ import json from pageindex import * from pageindex.page_index_md import md_to_tree +from pageindex.utils import ConfigLoader if __name__ == "__main__": # Set up argument parser @@ -51,7 +52,6 @@ raise ValueError(f"PDF file not found: {args.pdf_path}") # Process PDF file - from pageindex.utils import ConfigLoader user_opt = { 'model': args.model, 'toc_check_page_num': args.toc_check_pages,