Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 20 additions & 18 deletions lib/pdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import PDFFont from "./pdffont.js";
import PDFUnit from "./pdfunit.js";
import PTIXmlParser from "./ptixmlinject.js";
import { createScratchCanvas } from "./pdfcanvas.js";
import { BASELINE_TOLERANCE_RATIO, sortBidiTexts } from "./pdftextsorter.js";

//start of helper classes
class PDFPageParser {
Expand Down Expand Up @@ -333,41 +334,42 @@ export default class PDFJSClass extends EventEmitter {

this.rawTextContents.forEach((textContent, index) => {
let prevText = null;

textContent.bidiTexts.forEach((textObj, idx) => {
// Check if on same line
// Use a tolerance relative to font size for better accuracy
// Typical line spacing is 120% of font size, so 10-15% tolerance is reasonable
const tolerance = prevText ? (prevText.fontSize || 12) * 0.15 : 2;

// Spatially sort bidiTexts into visual reading order (top-to-bottom, left-to-right)
const bidiTexts = sortBidiTexts(textContent.bidiTexts);

bidiTexts.forEach((textObj) => {
// Check if on same line using the same tolerance ratio as the sorter
const tolerance = prevText ? (prevText.fontSize || 12) * BASELINE_TOLERANCE_RATIO : 2;
const sameLine = prevText && Math.abs(textObj.y - prevText.y) <= tolerance;

if (sameLine) {
// spaceWidth is in unscaled coordinates (no textHScale, matching JSON w property)
const { spaceWidth, startX, width, textHScale } = prevText;

// Use actual calculated text width (from glyph widths)
// width is in unscaled coordinates, but startX is in scaled coordinates
// So we must apply textHScale to width before adding to startX
// This matches canvas.js: current.x += x * textHScale (line 1267)
const prevTextEndX = startX + (width * textHScale);

// Calculate gap between end of previous text and start of current text
// gap is in SCALED coordinates (both textObj.x and prevTextEndX are scaled)
const gap = textObj.x - prevTextEndX;

// Scale spaceWidth to match gap's coordinate system
const scaledSpaceWidth = spaceWidth * textHScale;

// Add spaces if gap is positive and significant (> 30% of scaled space width)
// Also check that scaledSpaceWidth is valid to avoid division by zero
if (scaledSpaceWidth > 0 && gap > scaledSpaceWidth * 0.3) {
const numSpaces = Math.round(gap / scaledSpaceWidth);
prevText.str += ' '.repeat(Math.max(1, numSpaces));
}

// Append current text
prevText.str += textObj.str;

// Update prevText to track current text for next iteration
prevText.startX = textObj.x;
prevText.width = textObj.width;
Expand All @@ -378,11 +380,11 @@ export default class PDFJSClass extends EventEmitter {
if (prevText) {
retVal += `${prevText.str}\r\n`;
}

// Initialize new text object with font metrics
prevText = {
str: textObj.str,
y: textObj.y,
prevText = {
str: textObj.str,
y: textObj.y,
startX: textObj.x,
width: textObj.width,
spaceWidth: textObj.spaceWidth,
Expand All @@ -391,7 +393,7 @@ export default class PDFJSClass extends EventEmitter {
};
}
});

if (prevText) {
retVal += prevText.str;
}
Expand Down
60 changes: 60 additions & 0 deletions lib/pdftextsorter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
 * Ratio used to compute the Y-axis grouping tolerance for a text element.
 * A text element with fontSize 12pt yields tolerance 12 * 0.15 = 1.8pt.
 * This absorbs normal subscript/superscript baseline shifts while keeping
 * genuinely different lines separate.
 */
const BASELINE_TOLERANCE_RATIO = 0.15;

/** Font size (pt) assumed for an element whose `fontSize` is missing or zero. */
const FALLBACK_FONT_SIZE = 12;

/**
 * Y-axis grouping tolerance contributed by a single text element.
 *
 * @param {{fontSize?: number}} textObj
 * @returns {number} fontSize (or the fallback) scaled by BASELINE_TOLERANCE_RATIO
 */
function baselineTolerance(textObj) {
  return (textObj.fontSize || FALLBACK_FONT_SIZE) * BASELINE_TOLERANCE_RATIO;
}

/**
 * Spatial sort: arrange an array of bidiText objects into reading order.
 *
 *  1. Group elements into horizontal lines by their Y coordinate. An element
 *     joins the first existing line whose anchor (the first element placed on
 *     that line) lies within tolerance of it; otherwise it starts a new line.
 *  2. Sort the lines top-to-bottom (ascending anchor Y).
 *  3. Within each line, sort elements left-to-right (ascending X).
 *
 * The tolerance for a comparison is the LARGER of the element's own tolerance
 * and the anchor's tolerance. Using only the incoming element's font size
 * made grouping depend on input order: a small superscript arriving after its
 * larger base character was split onto its own line, while the reverse order
 * merged them.
 *
 * The input array is never mutated and never returned: the result is always a
 * new array (empty for an empty, null or undefined input). The element objects
 * themselves are not cloned; the result holds the same references.
 *
 * @param {Array<{str:string, x:number, y:number, width:number, spaceWidth:number, textHScale:number, fontSize?:number}>} bidiTexts
 * @returns {Array<{str:string, x:number, y:number, width:number, spaceWidth:number, textHScale:number, fontSize?:number}>}
 */
function sortBidiTexts(bidiTexts) {
  // Always hand back an array so callers can iterate the result directly.
  if (!bidiTexts || bidiTexts.length === 0) return [];

  // Phase 1: bucket elements into line groups by Y.
  // The anchor is stored separately so the in-line X sort in phase 3 cannot
  // change which element represents the line.
  const lines = [];

  for (const textObj of bidiTexts) {
    const ownTolerance = baselineTolerance(textObj);

    const matchingLine = lines.find(({ anchor }) => {
      const tolerance = Math.max(ownTolerance, baselineTolerance(anchor));
      return Math.abs(textObj.y - anchor.y) <= tolerance;
    });

    if (matchingLine) {
      matchingLine.items.push(textObj);
    } else {
      lines.push({ anchor: textObj, items: [textObj] });
    }
  }

  // Phase 2: sort lines top-to-bottom.
  lines.sort((a, b) => a.anchor.y - b.anchor.y);

  // Phase 3: sort each line left-to-right and flatten. `items` arrays are
  // owned by this function, so sorting them in place leaves the input intact.
  return lines.flatMap(({ items }) => items.sort((a, b) => a.x - b.x));
}

export { BASELINE_TOLERANCE_RATIO, sortBidiTexts };
Loading
Loading