From 71c57623a286d566266554464f4ad0d9eab238bd Mon Sep 17 00:00:00 2001 From: Dhanush Varma Date: Sun, 5 Apr 2026 03:12:38 +0530 Subject: [PATCH] fix: resolve TESSDATA_PREFIX path correctly for all Tesseract versions Two bugs in init_ocr() in ocr.c: 1. The Tesseract 4/5 branch always blindly appended '/tessdata' to the path returned by probe_tessdata_location(). If TESSDATA_PREFIX was already set to a path ending in 'tessdata/', this caused a double- append e.g. '/usr/share/tessdata/tessdata'. 2. The legacy Tesseract <4 branch passed tessdata_path raw to TessBaseAPIInit4 without appending 'tessdata' at all, causing Tesseract to look for eng.traineddata directly in e.g. '/usr/share/' instead of '/usr/share/tessdata/'. Fix: normalize the path once before both branches. Detect whether the returned path already ends with 'tessdata' or 'tessdata/', handle Windows backslash separators, and use the resolved path in both Tesseract version branches. Add mprint diagnostic for the resolved path. Fixes #1492 --- src/lib_ccx/ocr.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c index 70fb9227d..9e51601e9 100644 --- a/src/lib_ccx/ocr.c +++ b/src/lib_ccx/ocr.c @@ -261,20 +261,46 @@ void *init_ocr(int lang_index) } ctx->api = TessBaseAPICreate(); + + /* Build the correct tessdata path for TessBaseAPIInit4. + * probe_tessdata_location() returns a base dir e.g. "/opt/homebrew/share/". + * TessBaseAPIInit4 expects "<base>/tessdata" as the data path. + * If TESSDATA_PREFIX already points at the tessdata dir itself, + * avoid appending "tessdata" a second time. 
*/ + char tess_path[1024]; + size_t tp_len = strlen(tessdata_path); + int already_has_tessdata = ((tp_len >= 8 && strcmp(tessdata_path + tp_len - 8, "tessdata") == 0) || /* bare "tessdata" suffix: 8 chars */ + (tp_len >= 9 && (strcmp(tessdata_path + tp_len - 9, "tessdata/") == 0 || /* suffix with separator is 9 chars, so the offset must be 9 */ + strcmp(tessdata_path + tp_len - 9, "tessdata\\") == 0))); + if (already_has_tessdata) + { + snprintf(tess_path, sizeof(tess_path), "%s", tessdata_path); + } + else + { + snprintf(tess_path, sizeof(tess_path), "%s%stessdata", + tessdata_path, + (tp_len > 0 && (tessdata_path[tp_len - 1] == '/' || tessdata_path[tp_len - 1] == '\\')) ? "" : "/"); /* tp_len > 0 guards the [tp_len - 1] read on an empty path */ + } + + mprint("CCExtractor: using tessdata path: %s\n", tess_path); + if (!strncmp("4.", TessVersion(), 2) || !strncmp("5.", TessVersion(), 2)) { - char tess_path[1024]; - snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); if (ccx_options.ocr_oem < 0) + { ccx_options.ocr_oem = 1; + } ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { if (ccx_options.ocr_oem < 0) + { ccx_options.ocr_oem = 0; - ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, + } + ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); }