tesseract-002

Tests that cx:tesseract can produce hOCR output.

Test tesseract-002.xml is expected to pass.

The pipeline

<p:declare-step name="main"
                version="3.0"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:cx="http://xmlcalabash.com/ns/extensions">
   <!-- Pipeline under test: feeds example.pdf through cx:pdf-to-images and
        then through cx:tesseract with output-format="hocr". -->

   <!-- Step libraries; presumably these declare the cx:pdf-to-images and
        cx:tesseract steps used below (the declarations are not visible here). -->
   <p:import href="https://xmlcalabash.com/ext/library/pdf-steps.xpl"/>
   <p:import href="https://xmlcalabash.com/ext/library/tesseract.xpl"/>

   <!-- No explicit connection is given, so this port is bound to the primary
        output of the last step in the body. -->
   <p:output port="result"/>

   <cx:pdf-to-images dpi="300">
      <p:with-input port="source">
         <!-- Relative URI: resolved against the base URI of this pipeline. -->
         <p:document href="../documents/example.pdf"/>
      </p:with-input>
   </cx:pdf-to-images>

   <!-- No explicit input connection: the step reads from the default readable
        port, i.e. the primary output of the step before it.
        NOTE(review): debug-output="/dev/null" presumably discards the step's
        debug output; confirm against the cx:tesseract documentation. -->
   <cx:tesseract language="eng"
                 output-format="hocr"
                 debug-output="/dev/null"/>
</p:declare-step>

Schematron validation

<s:schema queryBinding="xslt2"
          xmlns:s="http://purl.oclc.org/dsdl/schematron">
   <!-- Validates the pipeline result: an XHTML document containing an
        ocr_page div and at least one ocrx_word span reading 'PDF'. -->

   <!-- Binds the h: prefix used in the XPath expressions below to XHTML. -->
   <s:ns uri="http://www.w3.org/1999/xhtml" prefix="h"/>

   <!-- Check 1: the document element is html in the XHTML namespace. -->
   <s:pattern>
      <s:rule context="/">
         <s:assert test="h:html">Wrong document element</s:assert>
      </s:rule>
   </s:pattern>

   <!-- Check 2: structure and content beneath the html element. -->
   <s:pattern>
      <s:rule context="/h:html">
         <!-- A div with class 'ocr_page' must be a direct child of body. -->
         <s:assert test="h:body/h:div[@class='ocr_page']">Unexpected page output</s:assert>
         <!-- General comparison: holds when any ocrx_word span under body has
              the string value 'PDF'. -->
         <s:assert test="h:body//h:span[@class='ocrx_word'] = 'PDF'">Unexpected output</s:assert>
      </s:rule>
   </s:pattern>
</s:schema>

Revision history

12 Jun 2026, Norm Tovey-Walsh

Created test.