// loading/hocr.js

/**
 * Checks whether an element's class list contains a class name.
 *
 * @param {Element|null|undefined} element Element to inspect; may be nullish.
 * @param {string} className Class name to look for.
 * @returns {boolean|undefined} Whatever `classList.contains` returns, or
 *   `undefined` when the element or its `classList` is nullish.
 */
function hasClass(element, className) {
  const classes = element?.classList;
  if (classes == null) return undefined;
  return classes.contains(className);
}

/**
 * Extracts the `bbox` property from an hOCR `title` attribute value.
 *
 * @param {string} [title=""] Raw `title` attribute text.
 * @returns {{left: number, top: number, right: number, bottom: number,
 *   width: number, height: number}|null} The parsed box, or `null` when no
 *   `bbox` with four numbers is present, a coordinate is not finite, or the
 *   box has no positive width and height.
 */
function parseBBox(title = "") {
  // The leading \b keeps properties such as `x_bboxes` from matching.
  const found = /\bbbox\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)/.exec(title);
  if (found === null) {
    return null;
  }

  // Drop the full-match entry and convert the four captures.
  const coords = found.slice(1).map(Number);
  if (coords.some(value => !Number.isFinite(value))) {
    return null;
  }

  const [left, top, right, bottom] = coords;
  const isEmpty = right <= left || bottom <= top;
  return isEmpty
    ? null
    : { left, top, right, bottom, width: right - left, height: bottom - top };
}

/**
 * Reads an element's text with whitespace runs collapsed to single spaces
 * and the ends trimmed.
 *
 * @param {Node|null|undefined} element Node to read; may be nullish.
 * @returns {string} Normalized text, or "" when there is no text content.
 */
function getText(element) {
  const raw = element?.textContent;
  if (!raw) return "";
  const collapsed = raw.replace(/\s+/g, " ");
  return collapsed.trim();
}

/**
 * Collects the elements of a page that carry recognized text.
 *
 * `.ocrx_word` elements are preferred. When a page has none, `.ocr_word` and
 * `.ocr_line` elements with non-empty text are used instead, keeping only
 * the innermost ones: a line that contains text-bearing words is dropped in
 * favour of those words, so the same text is not returned twice.
 *
 * @param {Element} pageElement Page (or document root) element to search.
 * @returns {Element[]} Matching elements in document order.
 */
function getWordElements(pageElement) {
  const words = Array.from(pageElement.querySelectorAll(".ocrx_word"));
  if (words.length) return words;

  const fallbackSelector = ".ocr_word, .ocr_line";
  const candidates = Array.from(pageElement.querySelectorAll(fallbackSelector))
    .filter(element => getText(element));
  const candidateSet = new Set(candidates);

  // Skip any candidate that wraps another text-bearing candidate; the nested
  // element already covers that text with a tighter box.
  return candidates.filter(element =>
    !Array.from(element.querySelectorAll(fallbackSelector))
      .some(inner => candidateSet.has(inner)));
}

/**
 * Builds one text-layer page object from an hOCR page element.
 *
 * The page size comes from the element's own `bbox`; when that is missing or
 * unusable, the union of all word boxes is used instead. Elements without a
 * parsable box are ignored, and elements without text produce no item (but
 * their box still counts toward the union).
 *
 * @param {Element} pageElement Page (or document root) element.
 * @returns {{width: number, height: number, transform: number[],
 *   styles: Object, items: Object[]}|null} The page, or `null` when no
 *   bounding box can be determined at all.
 */
function pageFromElement(pageElement) {
  const readBox = node => parseBBox(node.getAttribute("title") || "");

  const pageBox = readBox(pageElement);

  // Pair every text-bearing element with its box, skipping unboxed ones.
  const located = [];
  for (const element of getWordElements(pageElement)) {
    const box = readBox(element);
    if (box) located.push({ element, box });
  }

  // Without a page box, grow a box that encloses every word box.
  let bounds = pageBox;
  if (!bounds) {
    for (const { box } of located) {
      if (!bounds) {
        bounds = { ...box };
        continue;
      }
      bounds.left = Math.min(bounds.left, box.left);
      bounds.top = Math.min(bounds.top, box.top);
      bounds.right = Math.max(bounds.right, box.right);
      bounds.bottom = Math.max(bounds.bottom, box.bottom);
      bounds.width = bounds.right - bounds.left;
      bounds.height = bounds.bottom - bounds.top;
    }
  }
  if (!bounds) return null;

  const items = located.flatMap(({ element, box }) => {
    const str = getText(element);
    if (!str) return [];
    // Box height doubles as the font size, clamped to at least 1.
    const fontHeight = Math.max(1, box.height);
    return [{
      str,
      dir: element.dir || "ltr",
      fontName: "hocr",
      width: box.width,
      height: fontHeight,
      transform: [fontHeight, 0, 0, fontHeight, box.left, box.top + fontHeight],
    }];
  });

  return {
    width: bounds.width,
    height: bounds.height,
    // Shifts coordinates so the page box's top-left corner becomes the origin.
    transform: [1, 0, 0, 1, -bounds.left, -bounds.top],
    styles: { hocr: { fontFamily: "", ascent: 1, descent: 0, vertical: false } },
    items,
  };
}

/**
 * Turns hOCR markup into Riffle text-layer pages.
 *
 * Each `.ocr_page` element becomes one page. When the document has no
 * `.ocr_page` element, its body (or root element) is treated as a single
 * page, provided it contains word or line elements. Pages that end up with
 * no text items are left out of the result.
 *
 * @param {string|Document} hocr hOCR markup or a parsed document.
 * @returns {Object[]} Text content pages compatible with Riffle's text layer.
 */
export function parseHocr(hocr) {
  let documentRef = hocr;
  if (typeof hocr === "string") {
    documentRef = new DOMParser().parseFromString(hocr, "text/html");
  }
  if (!documentRef) return [];

  let roots = Array.from(documentRef.querySelectorAll(".ocr_page"));
  if (roots.length === 0) {
    roots = [documentRef.body || documentRef.documentElement];
  }

  const pages = [];
  for (const root of roots) {
    if (!root) continue;
    // A fallback root only counts as a page when it actually holds OCR text.
    const isPage = hasClass(root, "ocr_page")
      || root.querySelector?.(".ocrx_word, .ocr_word, .ocr_line");
    if (!isPage) continue;

    const page = pageFromElement(root);
    if (page && page.items.length) {
      pages.push(page);
    }
  }
  return pages;
}

/**
 * Resolves supported hOCR inputs to something `parseHocr` accepts.
 *
 * Strings and `Document` instances are passed through unchanged; `Blob`
 * instances are read as text. The `Document` and `Blob` checks are
 * skipped when those globals do not exist in the current environment.
 *
 * @param {string|Document|Blob} input hOCR markup, parsed document, File, or Blob.
 * @returns {Promise<string|Document>} Markup text or the given document.
 * @throws {TypeError} When the input is none of the supported types.
 */
async function readHocr(input) {
  const isPassThrough = typeof input === "string"
    || (typeof Document !== "undefined" && input instanceof Document);
  if (isPassThrough) {
    return input;
  }

  const isBlob = typeof Blob !== "undefined" && input instanceof Blob;
  if (isBlob) {
    return input.text();
  }

  throw new TypeError("hOCR input must be a string, Document, File, or Blob");
}

/**
 * Loads hOCR from any supported input and parses it into pages.
 *
 * @param {string|Document|Blob} input hOCR markup, parsed document, File, or Blob.
 * @returns {Promise<Object[]>} Parsed text content pages.
 */
export async function loadHocr(input) {
  const source = await readHocr(input);
  return parseHocr(source);
}