feat: 集成Tesseract源码到项目中

Description: 由于仓库中的Tesseract不是最新版本导致产生了一个bug，因此将Tesseract源码集成到项目中 Log: no Change-Id: I088de95d6c6ab670406daa8d47ed2ed46929c2c0
2021-06-22 20:13:39 +08:00 · 2021-06-22 20:13:39 +08:00 · 0cfed22ed4
commit 0cfed22ed4
parent 40c90fc3c7
439 changed files with 185083 additions and 13 deletions
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/baseapi.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/baseapi.h
@ -0,0 +1,848 @@
+///////////////////////////////////////////////////////////////////////
+// File:        baseapi.h
+// Description: Simple API for calling tesseract.
+// Author:      Ray Smith
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_API_BASEAPI_H_
+#define TESSERACT_API_BASEAPI_H_
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+
+#include "export.h"
+#include "pageiterator.h"
+#include "publictypes.h"
+#include "resultiterator.h"
+#include "unichar.h"
+
+#include "3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h"
+
+#include <cstdio>
+#include <tuple>  // for std::tuple
+#include <vector> // for std::vector
+
+struct Pix;
+struct Pixa;
+struct Boxa;
+
+namespace tesseract {
+
+class PAGE_RES;
+class ParagraphModel;
+class BLOCK_LIST;
+class ETEXT_DESC;
+struct OSResults;
+class UNICHARSET;
+
+class Dawg;
+class Dict;
+class EquationDetect;
+class PageIterator;
+class ImageThresholder;
+class LTRResultIterator;
+class ResultIterator;
+class MutableIterator;
+class TessResultRenderer;
+class Tesseract;
+
+// Function type that reads the whole file named filename into *data.
+// Returns false on failure.
+using FileReader = bool (*)(const char *filename, std::vector<char> *data);
+
+// Pointer to a const member function of Dict; this is the type accepted by
+// TessBaseAPI::SetDictFunc().
+using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
+                               bool) const;
+// Pointer to a member function of Dict returning double; this is the type
+// accepted by TessBaseAPI::SetProbabilityInContextFunc().
+using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
+                                                  int, const char *, int);
+
+/**
+ * Base class for all tesseract APIs.
+ * Specific classes can add ability to work on different inputs or produce
+ * different outputs.
+ * This class is mostly an interface layer on top of the Tesseract instance
+ * class to hide the data types so that users of this class don't have to
+ * include any other Tesseract headers.
+ */
+class TESS_API TessBaseAPI {
+public:
+  TessBaseAPI();
+  virtual ~TessBaseAPI();
+  // Copy constructor and assignment operator are currently unsupported.
+  TessBaseAPI(TessBaseAPI const &) = delete;
+  TessBaseAPI &operator=(TessBaseAPI const &) = delete;
+
+  /**
+   * Returns the version identifier as a static string. Do not delete.
+   */
+  static const char *Version();
+
+  /**
+   * If compiled with OpenCL AND an available OpenCL
+   * device is deemed faster than serial code, then
+   * "device" is populated with the cl_device_id
+   * and returns sizeof(cl_device_id)
+   * otherwise *device=nullptr and returns 0.
+   */
+  static size_t getOpenCLDevice(void **device);
+
+  /**
+   * Set the name of the input file. Needed for training and
+   * reading a UNLV zone file, and for searchable PDF output.
+   */
+  void SetInputName(const char *name);
+  /**
+   * These functions are required for searchable PDF output.
+   * We need our hands on the input file so that we can include
+   * it in the PDF without transcoding. If that is not possible,
+   * we need the original image. Finally, resolution metadata
+   * is stored in the PDF so we need that as well.
+   */
+  const char *GetInputName();
+  // Takes ownership of the input pix.
+  void SetInputImage(Pix *pix);
+  Pix *GetInputImage();
+  int GetSourceYResolution();
+  const char *GetDatapath();
+
+  /** Set the name of the bonus output files. Needed only for debugging. */
+  void SetOutputName(const char *name);
+
+  /**
+   * Set the value of an internal "parameter."
+   * Supply the name of the parameter and the value as a string, just as
+   * you would in a config file.
+   * Returns false if the name lookup failed.
+   * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
+   * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
+   * SetVariable may be used before Init, but settings will revert to
+   * defaults on End().
+   *
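+   * Note: Only works for non-init variables (init variables should be passed
+   * to Init()). An earlier wording of this note said "Must be called after
+   * Init()", which conflicts with the sentence above and with the Init()
+   * documentation (End(), then SetVariable, then Init) - TODO confirm.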
+   */
+  bool SetVariable(const char *name, const char *value);
+  bool SetDebugVariable(const char *name, const char *value);
+
+  /**
+   * Returns true if the parameter was found among Tesseract parameters.
+   * Fills in value with the value of the parameter.
+   */
+  bool GetIntVariable(const char *name, int *value) const;
+  bool GetBoolVariable(const char *name, bool *value) const;
+  bool GetDoubleVariable(const char *name, double *value) const;
+
+  /**
+   * Returns the pointer to the string that represents the value of the
+   * parameter if it was found among Tesseract parameters.
+   */
+  const char *GetStringVariable(const char *name) const;
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+  /**
+   * Print Tesseract fonts table to the given file.
+   */
+  void PrintFontsTable(FILE* fp) const;
+
+#endif
+
+  /**
+   * Print Tesseract parameters to the given file.
+   */
+  void PrintVariables(FILE *fp) const;
+
+  /**
+   * Get value of named variable as a string, if it exists.
+   */
+  bool GetVariableAsString(const char *name, std::string *val) const;
+
+  /**
+   * Instances are now mostly thread-safe and totally independent,
+   * but some global parameters remain. Basically it is safe to use multiple
+   * TessBaseAPIs in different threads in parallel, UNLESS:
+   * you use SetVariable on some of the Params in classify and textord.
+   * If you do, then the effect will be to change it for all your instances.
+   *
+   * Start tesseract. Returns zero on success and -1 on failure.
+   * NOTE that the only members that may be called before Init are those
+   * listed above here in the class definition.
+   *
+   * The datapath must be the name of the tessdata directory.
+   * The language is (usually) an ISO 639-3 string or nullptr will default to
+   * eng. It is entirely safe (and eventually will be efficient too) to call
+   * Init multiple times on the same instance to change language, or just
+   * to reset the classifier.
+   * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
+   * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
+   * English. Languages may specify internally that they want to be loaded
+   * with one or more other languages, so the ~ sign is available to override
+   * that. Eg if hin were set to load eng by default, then hin+~eng would force
+   * loading only hin. The number of loaded languages is limited only by
+   * memory, with the caveat that loading additional languages will impact
+   * both speed and accuracy, as there is more work to do to decide on the
+   * applicable language, and there is more chance of hallucinating incorrect
+   * words.
+   * WARNING: On changing languages, all Tesseract parameters are reset
+   * back to their default values. (Which may vary between languages.)
+   * If you have a rare need to set a Variable that controls
+   * initialization for a second call to Init you should explicitly
+   * call End() and then use SetVariable before Init. This is only a very
+   * rare use case, since there are very few uses that require any parameters
+   * to be set before Init.
+   *
+   * If set_only_non_debug_params is true, only params that do not contain
+   * "debug" in the name will be set.
+   */
+  int Init(const char *datapath, const char *language, OcrEngineMode mode,
+           char **configs, int configs_size,
+           const std::vector<std::string> *vars_vec,
+           const std::vector<std::string> *vars_values,
+           bool set_only_non_debug_params);
+  // Convenience overload: forwards to the full Init with no config files, no
+  // variable overrides and set_only_non_debug_params disabled.
+  int Init(const char *datapath, const char *language, OcrEngineMode oem) {
+    return Init(datapath, language, oem, /*configs=*/nullptr,
+                /*configs_size=*/0, /*vars_vec=*/nullptr,
+                /*vars_values=*/nullptr, /*set_only_non_debug_params=*/false);
+  }
+  // Convenience overload: delegates to the three-argument overload with the
+  // engine mode fixed to OEM_DEFAULT.
+  int Init(const char *datapath, const char *language) {
+    return Init(datapath, language, OEM_DEFAULT);
+  }
+  // In-memory version reads the traineddata file directly from the given
+  // data[data_size] array, and/or reads data via a FileReader.
+  int Init(const char *data, int data_size, const char *language,
+           OcrEngineMode mode, char **configs, int configs_size,
+           const std::vector<std::string> *vars_vec,
+           const std::vector<std::string> *vars_values,
+           bool set_only_non_debug_params, FileReader reader);
+
+  /**
+   * Returns the languages string used in the last valid initialization.
+   * If the last initialization specified "deu+hin" then that will be
+   * returned. If hin loaded eng automatically as well, then that will
+   * not be included in this list. To find the languages actually
+   * loaded use GetLoadedLanguagesAsVector.
+   * The returned string should NOT be deleted.
+   */
+  const char *GetInitLanguagesAsString() const;
+
+  /**
+   * Returns the loaded languages in the vector of std::string.
+   * Includes all languages loaded by the last Init, including those loaded
+   * as dependencies of other loaded languages.
+   */
+  void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;
+
+  /**
+   * Returns the available languages in the sorted vector of std::string.
+   */
+  void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;
+
+  /**
+   * Init only the lang model component of Tesseract. The only functions
+   * that work after this init are SetVariable and IsValidWord.
+   * WARNING: temporary! This function will be removed from here and placed
+   * in a separate API at some future time.
+   */
+  int InitLangMod(const char *datapath, const char *language);
+
+  /**
+   * Init only for page layout analysis. Use only for calls to SetImage and
+   * AnalysePage. Calls that attempt recognition will generate an error.
+   */
+  void InitForAnalysePage();
+
+  /**
+   * Read a "config" file containing a set of param, value pairs.
+   * Searches the standard places: tessdata/configs, tessdata/tessconfigs
+   * and also accepts a relative or absolute path name.
+   * Note: only non-init params will be set (init params are set by Init()).
+   */
+  void ReadConfigFile(const char *filename);
+  /** Same as above, but only set debug params from the given config file. */
+  void ReadDebugConfigFile(const char *filename);
+
+  /**
+   * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
+   * The mode is stored as an IntParam so it can also be modified by
+   * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
+   */
+  void SetPageSegMode(PageSegMode mode);
+
+  /** Return the current page segmentation mode. */
+  PageSegMode GetPageSegMode() const;
+
+  /**
+   * Recognize a rectangle from an image and return the result as a string.
+   * May be called many times for a single Init.
+   * Currently has no error checking.
+   * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
+   * Palette color images will not work properly and must be converted to
+   * 24 bit.
+   * Binary images of 1 bit per pixel may also be given but they must be
+   * byte packed with the MSB of the first byte being the first pixel, and a
+   * 1 represents WHITE. For binary images set bytes_per_pixel=0.
+   * The recognized text is returned as a char* which is coded
+   * as UTF8 and must be freed with the delete [] operator.
+   *
+   * Note that TesseractRect is the simplified convenience interface.
+   * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
+   * and one or more of the Get*Text functions below.
+   */
+  char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
+                      int bytes_per_line, int left, int top, int width,
+                      int height);
+
+  /**
+   * Call between pages or documents etc to free up memory and forget
+   * adaptive data.
+   */
+  void ClearAdaptiveClassifier();
+
+  /**
+   * @defgroup AdvancedAPI Advanced API
+   * The following methods break TesseractRect into pieces, so you can
+   * get hold of the thresholded image, get the text in different formats,
+   * get bounding boxes, confidences etc.
+   */
+  /* @{ */
+
+  /**
+   * Provide an image for Tesseract to recognize. Format is as
+   * TesseractRect above. Copies the image buffer and converts to Pix.
+   * SetImage clears all recognition results, and sets the rectangle to the
+   * full image, so it may be followed immediately by a GetUTF8Text, and it
+   * will automatically perform recognition.
+   */
+  void SetImage(const unsigned char *imagedata, int width, int height,
+                int bytes_per_pixel, int bytes_per_line);
+
+  /**
+   * Provide an image for Tesseract to recognize. As with SetImage above,
+   * Tesseract takes its own copy of the image, so it need not persist until
+   * after Recognize.
+   * Pix vs raw, which to use?
+   * Use Pix where possible. Tesseract uses Pix as its internal representation
+   * and it is therefore more efficient to provide a Pix directly.
+   */
+  void SetImage(Pix *pix);
+
+  /**
+   * Set the resolution of the source image in pixels per inch so font size
+   * information can be calculated in results.  Call this after SetImage().
+   */
+  void SetSourceResolution(int ppi);
+
+  /**
+   * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
+   * Each SetRectangle clears the recognition results so multiple rectangles
+   * can be recognized with the same image.
+   */
+  void SetRectangle(int left, int top, int width, int height);
+
+  /**
+   * Get a copy of the internal thresholded image from Tesseract.
+   * Caller takes ownership of the Pix and must pixDestroy it.
+   * May be called any time after SetImage, or after TesseractRect.
+   */
+  Pix *GetThresholdedImage();
+
+  /**
+   * Get the result of page layout analysis as a leptonica-style
+   * Boxa, Pixa pair, in reading order.
+   * Can be called before or after Recognize.
+   */
+  Boxa *GetRegions(Pixa **pixa);
+
+  /**
+   * Get the textlines as a leptonica-style
+   * Boxa, Pixa pair, in reading order.
+   * Can be called before or after Recognize.
+   * If raw_image is true, then extract from the original image instead of the
+   * thresholded image and pad by raw_padding pixels.
+   * If blockids is not nullptr, the block-id of each line is also returned as
+   * an array of one element per line. delete [] after use. If paraids is not
+   * nullptr, the paragraph-id of each line within its block is also returned as
+   * an array of one element per line. delete [] after use.
+   */
+  Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
+                     int **blockids, int **paraids);
+  /**
+   * Convenience overload of GetTextlines (most common usage): extracts from
+   * the thresholded image with no padding and does not return paragraph ids.
+   */
+  Boxa *GetTextlines(Pixa **pixa, int **blockids) {
+    return GetTextlines(/*raw_image=*/false, /*raw_padding=*/0, pixa, blockids,
+                        /*paraids=*/nullptr);
+  }
+
+  /**
+   * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
+   * pair, in reading order. Enables downstream handling of non-rectangular
+   * regions.
+   * Can be called before or after Recognize.
+   * If blockids is not nullptr, the block-id of each line is also returned as
+   * an array of one element per line. delete [] after use.
+   */
+  Boxa *GetStrips(Pixa **pixa, int **blockids);
+
+  /**
+   * Get the words as a leptonica-style
+   * Boxa, Pixa pair, in reading order.
+   * Can be called before or after Recognize.
+   */
+  Boxa *GetWords(Pixa **pixa);
+
+  /**
+   * Gets the individual connected (text) components (created
+   * after the page segmentation step, but before recognition)
+   * as a leptonica-style Boxa, Pixa pair, in reading order.
+   * Can be called before or after Recognize.
+   * Note: the caller is responsible for calling boxaDestroy()
+   * on the returned Boxa array and pixaDestroy() on cc array.
+   */
+  Boxa *GetConnectedComponents(Pixa **cc);
+
+  /**
+   * Get the given level kind of components (block, textline, word etc.) as a
+   * leptonica-style Boxa, Pixa pair, in reading order.
+   * Can be called before or after Recognize.
+   * If blockids is not nullptr, the block-id of each component is also returned
+   * as an array of one element per component. delete [] after use.
+   * If paraids is not nullptr, the paragraph-id of each component within its
+   * block is also returned as an array of one element per component. delete []
+   * after use. If raw_image is true, then portions of the original image are
+   * extracted instead of the thresholded image and padded with raw_padding. If
+   * text_only is true, then only text components are returned.
+   */
+  Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
+                           bool raw_image, int raw_padding, Pixa **pixa,
+                           int **blockids, int **paraids);
+  /**
+   * Convenience overload of GetComponentImages (most common usage): gets
+   * binary (thresholded) images with no padding and no paragraph ids.
+   */
+  Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
+                           Pixa **pixa, int **blockids) {
+    return GetComponentImages(level, text_only, /*raw_image=*/false,
+                              /*raw_padding=*/0, pixa, blockids,
+                              /*paraids=*/nullptr);
+  }
+
+  /**
+   * Returns the scale factor of the thresholded image that would be returned by
+   * GetThresholdedImage() and the various GetX() methods that call
+   * GetComponentImages().
+   * Returns 0 if no thresholder has been set.
+   */
+  int GetThresholdedImageScaleFactor() const;
+
+  /**
+   * Runs page layout analysis in the mode set by SetPageSegMode.
+   * May optionally be called prior to Recognize to get access to just
+   * the page layout results. Returns an iterator to the results.
+   * If merge_similar_words is true, words are combined where suitable for use
+   * with a line recognizer. Use if you want to use AnalyseLayout to find the
+   * textlines, and then want to process textline fragments with an external
+   * line recognizer.
+   * Returns nullptr on error or an empty page.
+   * The returned iterator must be deleted after use.
+   * WARNING! This class points to data held within the TessBaseAPI class, and
+   * therefore can only be used while the TessBaseAPI class still exists and
+   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+   * DetectOS, or anything else that changes the internal PAGE_RES.
+   */
+  PageIterator *AnalyseLayout();
+  PageIterator *AnalyseLayout(bool merge_similar_words);
+
+  /**
+   * Recognize the image from SetImage, generating Tesseract
+   * internal structures. Returns 0 on success.
+   * Optional. The Get*Text functions below will call Recognize if needed.
+   * After Recognize, the output is kept internally until the next SetImage.
+   */
+  int Recognize(ETEXT_DESC *monitor);
+
+  /**
+   * Methods to retrieve information after SetImage(),
+   * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
+   */
+
+  /**
+   * Turns images into symbolic text.
+   *
+   * filename can point to a single image, a multi-page TIFF,
+   * or a plain text list of image filenames.
+   *
+   * retry_config is useful for debugging. If not nullptr, you can fall
+   * back to an alternate configuration if a page fails for some
+   * reason.
+   *
+   * timeout_millisec terminates processing if any single page
+   * takes too long. Set to 0 for unlimited time.
+   *
+   * renderer is responsible for creating the output. For example,
+   * use the TessTextRenderer if you want plaintext output, or
+   * the TessPDFRenderer to produce searchable PDF.
+   *
+   * If tessedit_page_number is non-negative, will only process that
+   * single page. Works for multi-page tiff file, or filelist.
+   *
+   * Returns true if successful, false on error.
+   */
+  bool ProcessPages(const char *filename, const char *retry_config,
+                    int timeout_millisec, TessResultRenderer *renderer);
+  // Does the real work of ProcessPages.
+  bool ProcessPagesInternal(const char *filename, const char *retry_config,
+                            int timeout_millisec, TessResultRenderer *renderer);
+
+  /**
+   * Turn a single image into symbolic text.
+   *
+   * The pix is the image processed. filename and page_index are
+   * metadata used by side-effect processes, such as reading a box
+   * file or formatting as hOCR.
+   *
+   * See ProcessPages for descriptions of other parameters.
+   */
+  bool ProcessPage(Pix *pix, int page_index, const char *filename,
+                   const char *retry_config, int timeout_millisec,
+                   TessResultRenderer *renderer);
+
+  /**
+   * Get a reading-order iterator to the results of LayoutAnalysis and/or
+   * Recognize. The returned iterator must be deleted after use.
+   * WARNING! This class points to data held within the TessBaseAPI class, and
+   * therefore can only be used while the TessBaseAPI class still exists and
+   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+   * DetectOS, or anything else that changes the internal PAGE_RES.
+   */
+  ResultIterator *GetIterator();
+
+  /**
+   * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
+   * The returned iterator must be deleted after use.
+   * WARNING! This class points to data held within the TessBaseAPI class, and
+   * therefore can only be used while the TessBaseAPI class still exists and
+   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+   * DetectOS, or anything else that changes the internal PAGE_RES.
+   */
+  MutableIterator *GetMutableIterator();
+
+  /**
+   * The recognized text is returned as a char* which is coded
+   * as UTF8 and must be freed with the delete [] operator.
+   */
+  char *GetUTF8Text();
+
+  size_t GetNumberOfTables() const;
+
+  /// Return the i-th table bounding box coordinates
+  ///
+  /// Gives the (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
+  /// coordinates of the i-th table.
+  std::tuple<int, int, int, int> GetTableBoundingBox(
+      unsigned
+          i ///< Index of the table, for upper limit \see GetNumberOfTables()
+  );
+
+  /// Get bounding boxes of the rows of a table
+  /// return values are (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
+  std::vector<std::tuple<int, int, int, int> > GetTableRows(
+      unsigned
+          i ///< Index of the table, for upper limit \see GetNumberOfTables()
+  );
+
+  /// Get bounding boxes of the cols of a table
+  /// return values are (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
+  std::vector<std::tuple<int, int, int, int> > GetTableCols(
+      unsigned
+          i ///< Index of the table, for upper limit \see GetNumberOfTables()
+  );
+
+  /**
+   * Make a HTML-formatted string with hOCR markup from the internal
+   * data structures.
+   * page_number is 0-based but will appear in the output as 1-based.
+   * monitor can be used to
+   *  cancel the recognition
+   *  receive progress callbacks
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
+
+  /**
+   * Make a HTML-formatted string with hOCR markup from the internal
+   * data structures.
+   * page_number is 0-based but will appear in the output as 1-based.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetHOCRText(int page_number);
+
+  /**
+   * Make an XML-formatted string with Alto markup from the internal
+   * data structures.
+   */
+  char *GetAltoText(ETEXT_DESC *monitor, int page_number);
+
+  /**
+   * Make an XML-formatted string with Alto markup from the internal
+   * data structures.
+   */
+  char *GetAltoText(int page_number);
+
+  /**
+   * Make a TSV-formatted string from the internal data structures.
+   * page_number is 0-based but will appear in the output as 1-based.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetTSVText(int page_number);
+
+  /**
+   * Make a box file for LSTM training from the internal data structures.
+   * Constructs coordinates in the original image - not just the rectangle.
+   * page_number is a 0-based page index that will appear in the box file.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetLSTMBoxText(int page_number);
+
+  /**
+   * The recognized text is returned as a char* which is coded in the same
+   * format as a box file used in training.
+   * Constructs coordinates in the original image - not just the rectangle.
+   * page_number is a 0-based page index that will appear in the box file.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetBoxText(int page_number);
+
+  /**
+   * The recognized text is returned as a char* which is coded in the same
+   * format as a WordStr box file used in training.
+   * page_number is a 0-based page index that will appear in the box file.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetWordStrBoxText(int page_number);
+
+  /**
+   * The recognized text is returned as a char* which is coded
+   * as UNLV format Latin-1 with specific reject and suspect codes.
+   * Returned string must be freed with the delete [] operator.
+   */
+  char *GetUNLVText();
+
+  /**
+   * Detect the orientation of the input image and apparent script (alphabet).
+   * orient_deg is the detected clockwise rotation of the input image in degrees
+   * (0, 90, 180, 270)
+   * orient_conf is the confidence (15.0 is reasonably confident)
+   * script_name is an ASCII string, the name of the script, e.g. "Latin"
+   * script_conf is confidence level in the script
+   * Returns true on success and writes values to each parameter as an output
+   */
+  bool DetectOrientationScript(int *orient_deg, float *orient_conf,
+                               const char **script_name, float *script_conf);
+
+  /**
+   * The recognized text is returned as a char* which is coded
+   * as UTF8 and must be freed with the delete [] operator.
+   * page_number is a 0-based page index that will appear in the osd file.
+   */
+  char *GetOsdText(int page_number);
+
+  /** Returns the (average) confidence value between 0 and 100. */
+  int MeanTextConf();
+  /**
+   * Returns all word confidences (between 0 and 100) in an array, terminated
+   * by -1.  The calling function must delete [] after use.
+   * The number of confidences should correspond to the number of space-
+   * delimited words in GetUTF8Text.
+   */
+  int *AllWordConfidences();
+
+#ifndef DISABLED_LEGACY_ENGINE
+  /**
+   * Applies the given word to the adaptive classifier if possible.
+   * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
+   * tell the boundaries of the graphemes.
+   * Assumes that SetImage/SetRectangle have been used to set the image
+   * to the given word. The mode arg should be PSM_SINGLE_WORD or
+   * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
+   * The currently set PageSegMode is preserved.
+   * Returns false if adaption was not possible for some reason.
+   */
+  bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
+#endif //  ndef DISABLED_LEGACY_ENGINE
+
+  /**
+   * Free up recognition results and any stored image data, without actually
+   * freeing any recognition data that would be time-consuming to reload.
+   * Afterwards, you must call SetImage or TesseractRect before doing
+   * any Recognize or Get* operation.
+   */
+  void Clear();
+
+  /**
+   * Close down tesseract and free up all memory. End() is equivalent to
+   * destructing and reconstructing your TessBaseAPI.
+   * Once End() has been used, none of the other API functions may be used
+   * other than Init and anything declared above it in the class definition.
+   */
+  void End();
+
+  /**
+   * Clear any library-level memory caches.
+   * There are a variety of expensive-to-load constant data structures (mostly
+   * language dictionaries) that are cached globally -- surviving the Init()
+   * and End() of individual TessBaseAPI's.  This function allows the clearing
+   * of these caches.
+   **/
+  static void ClearPersistentCache();
+
+  /**
+   * Check whether a word is valid according to Tesseract's language model
+   * @return 0 if the word is invalid, non-zero if valid.
+   * @warning temporary! This function will be removed from here and placed
+   * in a separate API at some future time.
+   */
+  int IsValidWord(const char *word) const;
+  // Returns true if utf8_character is defined in the UniCharset.
+  bool IsValidCharacter(const char *utf8_character) const;
+
+  bool GetTextDirection(int *out_offset, float *out_slope);
+
+  /** Sets Dict::letter_is_okay_ function to point to the given function. */
+  void SetDictFunc(DictFunc f);
+
+  /** Sets Dict::probability_in_context_ function to point to the given
+   * function.
+   */
+  void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
+
+  /**
+   * Estimates the Orientation And Script of the image.
+   * @return true if the image was processed successfully.
+   */
+  bool DetectOS(OSResults *);
+
+  /**
+   * Return text orientation of each block as determined by an earlier run
+   * of layout analysis.
+   */
+  void GetBlockTextOrientations(int **block_orientation,
+                                bool **vertical_writing);
+
+  /** This method returns the string form of the specified unichar. */
+  const char *GetUnichar(int unichar_id) const;
+
+  /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
+  const Dawg *GetDawg(int i) const;
+
+  /** Return the number of dawgs loaded into tesseract_ object. */
+  int NumDawgs() const;
+
+  /** Returns the underlying Tesseract data object (tesseract_). */
+  Tesseract *tesseract() const {
+    return tesseract_;
+  }
+
+  /** Returns the OCR engine mode last requested (last_oem_requested_). */
+  OcrEngineMode oem() const {
+    return last_oem_requested_;
+  }
+
+  void set_min_orientation_margin(double margin);
+  /* @} */
+
+protected:
+  /** Common code for setting the image. Returns true if Init has been called.
+   */
+  bool InternalSetImage();
+
+  /**
+   * Run the thresholder to make the thresholded image. If pix is not nullptr,
+   * the source is thresholded to pix instead of the internal IMAGE.
+   */
+  virtual bool Threshold(Pix **pix);
+
+  /**
+   * Find lines from the image making the BLOCK_LIST.
+   * @return 0 on success.
+   */
+  int FindLines();
+
+  /** Delete the pageres and block list ready for a new page. */
+  void ClearResults();
+
+  /**
+   * Return an LTR Result Iterator -- used only for training, as we really want
+   * to ignore all BiDi smarts at that point.
+   * delete once you're done with it.
+   */
+  LTRResultIterator *GetLTRIterator();
+
+  /**
+   * Return the length of the output text string, as UTF8, assuming
+   * one newline per line and one per block, with a terminator,
+   * and assuming a single character reject marker for each rejected character.
+   * Also return the number of recognized blobs in blob_count.
+   */
+  int TextLength(int *blob_count) const;
+
+  //// paragraphs.cpp ////////////////////////////////////////////////////
+  void DetectParagraphs(bool after_text_recognition);
+
+  /** Returns read-only access to the page-level data (page_res_). */
+  const PAGE_RES *GetPageRes() const {
+    return page_res_;
+  }
+
+protected:
+  Tesseract *tesseract_;          ///< The underlying data object.
+  Tesseract *osd_tesseract_;      ///< For orientation & script detection.
+  EquationDetect *equ_detect_;    ///< The equation detector.
+  FileReader reader_;             ///< Reads files from any filesystem.
+  ImageThresholder *thresholder_; ///< Image thresholding module.
+  std::vector<ParagraphModel *> *paragraph_models_;
+  BLOCK_LIST *block_list_;           ///< The page layout.
+  PAGE_RES *page_res_;               ///< The page-level data.
+  std::string input_file_;           ///< Name used by training code.
+  std::string output_file_;          ///< Name used by debug code.
+  std::string datapath_;             ///< Current location of tessdata.
+  std::string language_;             ///< Last initialized language.
+  OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
+  bool recognition_done_;            ///< page_res_ contains recognition data.
+
+  /**
+   * @defgroup ThresholderParams Thresholder Parameters
+   * Parameters saved from the Thresholder. Needed to rebuild coordinates.
+   */
+  /* @{ */
+  int rect_left_;
+  int rect_top_;
+  int rect_width_;
+  int rect_height_;
+  int image_width_;
+  int image_height_;
+  /* @} */
+
+private:
+  // A list of image filenames gets special consideration
+  bool ProcessPagesFileList(FILE *fp, std::string *buf,
+                            const char *retry_config, int timeout_millisec,
+                            TessResultRenderer *renderer,
+                            int tessedit_page_number);
+  // TIFF supports multipage so gets special consideration.
+  bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
+                                 const char *filename, const char *retry_config,
+                                 int timeout_millisec,
+                                 TessResultRenderer *renderer,
+                                 int tessedit_page_number);
+}; // class TessBaseAPI.
+
+/** Escape a char string - replace &<>"' with HTML codes. */
+std::string HOcrEscape(const char *text);
+
+} // namespace tesseract
+
+#endif // TESSERACT_API_BASEAPI_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/capi.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/capi.h
@ -0,0 +1,482 @@
+///////////////////////////////////////////////////////////////////////
+// File:        capi.h
+// Description: C-API TessBaseAPI
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef API_CAPI_H_
+#define API_CAPI_H_
+
+#include "export.h"
+
+#ifdef __cplusplus
+#  include <tesseract/baseapi.h>
+#  include <tesseract/ocrclass.h>
+#  include <tesseract/pageiterator.h>
+#  include <tesseract/renderer.h>
+#  include <tesseract/resultiterator.h>
+#endif
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef BOOL /* keep a BOOL that the including code already defined */
+#  define BOOL int /* boolean type used by the declarations in this header */
+#  define TRUE 1 /* TRUE/FALSE are only defined here together with BOOL */
+#  define FALSE 0
+#endif /* BOOL */
+
+#ifdef __cplusplus /* C++ build: alias the real tesseract types */
+typedef tesseract::TessResultRenderer TessResultRenderer;
+typedef tesseract::TessBaseAPI TessBaseAPI;
+typedef tesseract::PageIterator TessPageIterator;
+typedef tesseract::ResultIterator TessResultIterator;
+typedef tesseract::MutableIterator TessMutableIterator;
+typedef tesseract::ChoiceIterator TessChoiceIterator;
+typedef tesseract::OcrEngineMode TessOcrEngineMode;
+typedef tesseract::PageSegMode TessPageSegMode;
+typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
+typedef tesseract::Orientation TessOrientation;
+typedef tesseract::ParagraphJustification TessParagraphJustification;
+typedef tesseract::WritingDirection TessWritingDirection;
+typedef tesseract::TextlineOrder TessTextlineOrder;
+typedef tesseract::PolyBlockType TessPolyBlockType;
+typedef tesseract::ETEXT_DESC ETEXT_DESC;
+#else /* C build: incomplete struct handles and C enum definitions */
+typedef struct TessResultRenderer TessResultRenderer;
+typedef struct TessBaseAPI TessBaseAPI;
+typedef struct TessPageIterator TessPageIterator;
+typedef struct TessResultIterator TessResultIterator;
+typedef struct TessMutableIterator TessMutableIterator;
+typedef struct TessChoiceIterator TessChoiceIterator;
+typedef enum TessOcrEngineMode {
+  OEM_TESSERACT_ONLY,
+  OEM_LSTM_ONLY,
+  OEM_TESSERACT_LSTM_COMBINED,
+  OEM_DEFAULT
+} TessOcrEngineMode; /* C counterpart of tesseract::OcrEngineMode */
+typedef enum TessPageSegMode {
+  PSM_OSD_ONLY,
+  PSM_AUTO_OSD,
+  PSM_AUTO_ONLY,
+  PSM_AUTO,
+  PSM_SINGLE_COLUMN,
+  PSM_SINGLE_BLOCK_VERT_TEXT,
+  PSM_SINGLE_BLOCK,
+  PSM_SINGLE_LINE,
+  PSM_SINGLE_WORD,
+  PSM_CIRCLE_WORD,
+  PSM_SINGLE_CHAR,
+  PSM_SPARSE_TEXT,
+  PSM_SPARSE_TEXT_OSD,
+  PSM_RAW_LINE,
+  PSM_COUNT
+} TessPageSegMode; /* C counterpart of tesseract::PageSegMode */
+typedef enum TessPageIteratorLevel {
+  RIL_BLOCK,
+  RIL_PARA,
+  RIL_TEXTLINE,
+  RIL_WORD,
+  RIL_SYMBOL
+} TessPageIteratorLevel; /* C counterpart of tesseract::PageIteratorLevel */
+typedef enum TessPolyBlockType {
+  PT_UNKNOWN,
+  PT_FLOWING_TEXT,
+  PT_HEADING_TEXT,
+  PT_PULLOUT_TEXT,
+  PT_EQUATION,
+  PT_INLINE_EQUATION,
+  PT_TABLE,
+  PT_VERTICAL_TEXT,
+  PT_CAPTION_TEXT,
+  PT_FLOWING_IMAGE,
+  PT_HEADING_IMAGE,
+  PT_PULLOUT_IMAGE,
+  PT_HORZ_LINE,
+  PT_VERT_LINE,
+  PT_NOISE,
+  PT_COUNT
+} TessPolyBlockType; /* C counterpart of tesseract::PolyBlockType */
+typedef enum TessOrientation {
+  ORIENTATION_PAGE_UP,
+  ORIENTATION_PAGE_RIGHT,
+  ORIENTATION_PAGE_DOWN,
+  ORIENTATION_PAGE_LEFT
+} TessOrientation; /* C counterpart of tesseract::Orientation */
+typedef enum TessParagraphJustification {
+  JUSTIFICATION_UNKNOWN,
+  JUSTIFICATION_LEFT,
+  JUSTIFICATION_CENTER,
+  JUSTIFICATION_RIGHT
+} TessParagraphJustification; /* cf. tesseract::ParagraphJustification */
+typedef enum TessWritingDirection {
+  WRITING_DIRECTION_LEFT_TO_RIGHT,
+  WRITING_DIRECTION_RIGHT_TO_LEFT,
+  WRITING_DIRECTION_TOP_TO_BOTTOM
+} TessWritingDirection; /* C counterpart of tesseract::WritingDirection */
+typedef enum TessTextlineOrder {
+  TEXTLINE_ORDER_LEFT_TO_RIGHT,
+  TEXTLINE_ORDER_RIGHT_TO_LEFT,
+  TEXTLINE_ORDER_TOP_TO_BOTTOM
+} TessTextlineOrder; /* C counterpart of tesseract::TextlineOrder */
+typedef struct ETEXT_DESC ETEXT_DESC;
+#endif /* __cplusplus */
+
+typedef bool (*TessCancelFunc)(void *cancel_this, int words); /* see TessMonitorSetCancelFunc */
+typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
+                                 int bottom); /* see TessMonitorSetProgressFunc */
+
+struct Pix;
+struct Boxa;
+struct Pixa;
+
+/* General free functions (version string and the TessDelete* helpers) */
+
+TESS_API const char *TessVersion();
+TESS_API void TessDeleteText(const char *text);
+TESS_API void TessDeleteTextArray(char **arr);
+TESS_API void TessDeleteIntArray(const int *arr);
+
+/* Renderer API (functions that create or take TessResultRenderer handles) */
+TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
+                                                     BOOL font_info);
+TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
+                                                   const char *datadir,
+                                                   BOOL textonly);
+TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
+TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
+    const char *outputbase);
+
+TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
+TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
+                                       TessResultRenderer *next);
+TESS_API TessResultRenderer *TessResultRendererNext(
+    TessResultRenderer *renderer);
+TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
+                                              const char *title);
+TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
+                                         TessBaseAPI *api);
+TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
+
+TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer); /* sic: declared spelling */
+TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
+TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
+
+/* Base API (functions that create or operate on a TessBaseAPI handle) */
+
+TESS_API TessBaseAPI *TessBaseAPICreate();
+TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
+
+TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
+
+TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
+TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
+
+TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
+TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
+
+TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
+TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
+
+TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
+
+TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
+                                     const char *value); /* variable setters */
+TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
+                                          const char *value);
+
+TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
+                                        const char *name, int *value); /* NOTE(review): *value presumably receives the result - confirm */
+TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
+                                         const char *name, BOOL *value);
+TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
+                                           const char *name, double *value);
+TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
+                                                  const char *name);
+
+TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp); /* takes an open FILE */
+TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
+                                              const char *filename); /* takes a file name */
+
+TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
+                              const char *language, TessOcrEngineMode oem,
+                              char **configs, int configs_size);
+TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
+                              const char *language, TessOcrEngineMode oem); /* Init1 parameters minus configs */
+TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
+                              const char *language); /* Init2 parameters minus oem */
+
+TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
+                              const char *language, TessOcrEngineMode mode,
+                              char **configs, int configs_size, char **vars_vec,
+                              char **vars_values, size_t vars_vec_size,
+                              BOOL set_only_non_debug_params); /* Init1 parameters plus variable vectors */
+
+TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
+    const TessBaseAPI *handle);
+TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
+    const TessBaseAPI *handle);
+TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
+    const TessBaseAPI *handle);
+
+TESS_API int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath,
+                                    const char *language);
+TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
+
+TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
+                                        const char *filename);
+TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
+                                             const char *filename);
+
+TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
+                                        TessPageSegMode mode);
+TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
+
+TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
+                               const unsigned char *imagedata,
+                               int bytes_per_pixel, int bytes_per_line,
+                               int left, int top, int width, int height);
+
+TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
+
+TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
+                                  const unsigned char *imagedata, int width,
+                                  int height, int bytes_per_pixel,
+                                  int bytes_per_line); /* raw-buffer variant */
+TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix); /* Pix-based variant */
+
+TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
+
+TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
+                                      int width, int height);
+
+TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
+TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
+                                            struct Pixa **pixa);
+TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
+                                              struct Pixa **pixa,
+                                              int **blockids);
+TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
+                                               BOOL raw_image, int raw_padding,
+                                               struct Pixa **pixa,
+                                               int **blockids, int **paraids); /* adds raw_image, raw_padding, paraids */
+TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
+                                           struct Pixa **pixa, int **blockids);
+TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
+                                          struct Pixa **pixa);
+TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
+                                                        struct Pixa **cc);
+TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
+                                                    TessPageIteratorLevel level,
+                                                    BOOL text_only,
+                                                    struct Pixa **pixa,
+                                                    int **blockids);
+TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
+    TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
+    BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
+    int **paraids); /* adds raw_image, raw_padding, paraids */
+
+TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
+    const TessBaseAPI *handle);
+
+TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
+
+TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
+
+TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
+                                      const char *retry_config,
+                                      int timeout_millisec,
+                                      TessResultRenderer *renderer);
+TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
+                                     int page_index, const char *filename,
+                                     const char *retry_config,
+                                     int timeout_millisec,
+                                     TessResultRenderer *renderer);
+
+TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
+TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
+    TessBaseAPI *handle);
+
+TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle); /* NOTE(review): presumably freed with TessDeleteText - confirm */
+TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
+
+TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
+TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
+
+TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
+TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
+TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
+                                            int page_number);
+
+TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
+TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
+
+TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle); /* NOTE(review): presumably freed with TessDeleteIntArray - confirm */
+
+#ifndef DISABLED_LEGACY_ENGINE
+TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
+                                        TessPageSegMode mode,
+                                        const char *wordstr);
+#endif // #ifndef DISABLED_LEGACY_ENGINE
+
+TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
+TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
+
+TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
+TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
+                                          float *out_slope);
+
+TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
+
+TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Call TessDeleteText(*script_name) to free memory allocated by this
+// function.
+TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
+                                                 int *orient_deg,
+                                                 float *orient_conf,
+                                                 const char **script_name,
+                                                 float *script_conf);
+#endif // #ifndef DISABLED_LEGACY_ENGINE
+
+TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
+                                                 double margin);
+
+TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
+
+TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
+
+TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
+                                               int **block_orientation,
+                                               bool **vertical_writing); /* note: bool, not BOOL */
+
+/* Page iterator (TessPageIterator is tesseract::PageIterator in C++ builds) */
+
+TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
+
+TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
+
+TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
+
+TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
+                                   TessPageIteratorLevel level);
+
+TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
+                                              TessPageIteratorLevel level);
+
+TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
+                                               TessPageIteratorLevel level,
+                                               TessPageIteratorLevel element);
+
+TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
+                                          TessPageIteratorLevel level,
+                                          int *left, int *top, int *right,
+                                          int *bottom);
+
+TESS_API TessPolyBlockType
+TessPageIteratorBlockType(const TessPageIterator *handle);
+
+TESS_API struct Pix *TessPageIteratorGetBinaryImage(
+    const TessPageIterator *handle, TessPageIteratorLevel level);
+
+TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
+                                              TessPageIteratorLevel level,
+                                              int padding,
+                                              struct Pix *original_image,
+                                              int *left, int *top);
+
+TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
+                                       TessPageIteratorLevel level, int *x1,
+                                       int *y1, int *x2, int *y2);
+
+TESS_API void TessPageIteratorOrientation(
+    TessPageIterator *handle, TessOrientation *orientation,
+    TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
+    float *deskew_angle);
+
+TESS_API void TessPageIteratorParagraphInfo(
+    TessPageIterator *handle, TessParagraphJustification *justification,
+    BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
+
+/* Result iterator, followed by the choice iterator functions */
+
+TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
+TESS_API TessResultIterator *TessResultIteratorCopy(
+    const TessResultIterator *handle);
+TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
+    TessResultIterator *handle);
+TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
+    const TessResultIterator *handle);
+TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
+    const TessResultIterator *handle);
+
+TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
+                                     TessPageIteratorLevel level);
+TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
+                                             TessPageIteratorLevel level); /* NOTE(review): presumably freed with TessDeleteText - confirm */
+TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
+                                            TessPageIteratorLevel level);
+TESS_API const char *TessResultIteratorWordRecognitionLanguage(
+    const TessResultIterator *handle);
+TESS_API const char *TessResultIteratorWordFontAttributes(
+    const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
+    BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
+    int *pointsize, int *font_id);
+
+TESS_API BOOL
+TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
+TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
+TESS_API BOOL
+TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
+TESS_API BOOL
+TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
+TESS_API BOOL
+TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
+
+TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle); /* choice iterator functions start here */
+TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
+TESS_API const char *TessChoiceIteratorGetUTF8Text(
+    const TessChoiceIterator *handle);
+TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
+
+/* Progress monitor (functions that create or take an ETEXT_DESC handle) */
+
+TESS_API ETEXT_DESC *TessMonitorCreate();
+TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
+TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
+                                       TessCancelFunc cancelFunc);
+TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
+TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
+TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
+                                         TessProgressFunc progressFunc);
+TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
+TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // API_CAPI_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/export.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/export.h
@ -0,0 +1,39 @@
+///////////////////////////////////////////////////////////////////////
+// File:        export.h
+// Description: Defines the TESS_API export/visibility macro.
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_PLATFORM_H_
+#define TESSERACT_PLATFORM_H_
+
+#ifndef TESS_API /* a TESS_API defined before this header takes precedence */
+#  if defined(_WIN32) || defined(__CYGWIN__) /* Windows or Cygwin target */
+#    if defined(TESS_EXPORTS)
+#      define TESS_API __declspec(dllexport)
+#    elif defined(TESS_IMPORTS)
+#      define TESS_API __declspec(dllimport)
+#    else /* neither TESS_EXPORTS nor TESS_IMPORTS: no decoration */
+#      define TESS_API
+#    endif
+#  else /* all other targets */
+#    if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
+#      define TESS_API __attribute__((visibility("default")))
+#    else /* neither TESS_EXPORTS nor TESS_IMPORTS: no decoration */
+#      define TESS_API
+#    endif
+#  endif
+#endif /* TESS_API */
+
+#endif // TESSERACT_PLATFORM_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/ltrresultiterator.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/ltrresultiterator.h
@ -0,0 +1,241 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ltrresultiterator.h
+// Description: Iterator for tesseract results in strict left-to-right
+//              order that avoids using tesseract internal data structures.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
+#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
+
+#include "export.h"       // for TESS_API
+#include "pageiterator.h" // for PageIterator
+#include "publictypes.h"  // for PageIteratorLevel
+#include "unichar.h"      // for StrongScriptDirection
+
+namespace tesseract {
+
+class BLOB_CHOICE_IT;
+class PAGE_RES;
+class WERD_RES;
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See tesseract/publictypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// LTRResultIterator adds text-specific methods for access to OCR output.
+
+class TESS_API LTRResultIterator : public PageIterator {
+  friend class ChoiceIterator;
+
+public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+  // The scale and scaled_yres are in case the Thresholder scaled the image
+  // rectangle prior to thresholding. Any coordinates in tesseract's image
+  // must be divided by scale before adding (rect_left, rect_top).
+  // The scaled_yres indicates the effective resolution of the binary image
+  // that tesseract has been given by the Thresholder.
+  // After the constructor, Begin has already been called.
+  LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
+                    int scaled_yres, int rect_left, int rect_top,
+                    int rect_width, int rect_height);
+
+  ~LTRResultIterator() override;
+
+  // LTRResultIterators may be copied! This makes it possible to iterate over
+  // all the objects at a lower level, while maintaining an iterator to
+  // objects at a higher level. These constructors DO NOT CALL Begin, so
+  // iterations will continue from the location of src.
+  // TODO: For now the copy constructor and operator= only need the base class
+  // versions, but if new data members are added, don't forget to add them!
+
+  // ============= Moving around within the page ============.
+
+  // See PageIterator.
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // object at the given level. Use delete [] to free after use.
+  char *GetUTF8Text(PageIteratorLevel level) const;
+
+  // Set the string inserted at the end of each text line. "\n" by default.
+  void SetLineSeparator(const char *new_line);
+
+  // Set the string inserted at the end of each paragraph. "\n" by default.
+  void SetParagraphSeparator(const char *new_para);
+
+  // Returns the mean confidence of the current object at the given level.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  float Confidence(PageIteratorLevel level) const;
+
+  // Returns the attributes of the current row.
+  void RowAttributes(float *row_height, float *descenders,
+                     float *ascenders) const;
+
+  // ============= Functions that refer to words only ============.
+
+  // Returns the font attributes of the current word. If iterating at a higher
+  // level object than words, eg textlines, then this will return the
+  // attributes of the first word in that textline.
+  // The actual return value is a string representing a font name. It points
+  // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+  // the iterator itself, ie rendered invalid by various members of
+  // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+  // Pointsize is returned in printers points (1/72 inch.)
+  const char *WordFontAttributes(bool *is_bold, bool *is_italic,
+                                 bool *is_underlined, bool *is_monospace,
+                                 bool *is_serif, bool *is_smallcaps,
+                                 int *pointsize, int *font_id) const;
+
+  // Return the name of the language used to recognize this word.
+  // On error, nullptr.  Do not delete this pointer.
+  const char *WordRecognitionLanguage() const;
+
+  // Return the overall directionality of this word.
+  StrongScriptDirection WordDirection() const;
+
+  // Returns true if the current word was found in a dictionary.
+  bool WordIsFromDictionary() const;
+
+  // Returns the number of blanks before the current word.
+  int BlanksBeforeWord() const;
+
+  // Returns true if the current word is numeric.
+  bool WordIsNumeric() const;
+
+  // Returns true if the word contains blamer information.
+  bool HasBlamerInfo() const;
+
+  // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
+  // of the current word.
+  const void *GetParamsTrainingBundle() const;
+
+  // Returns a pointer to the string with blamer information for this word.
+  // Assumes that the word's blamer_bundle is not nullptr.
+  const char *GetBlamerDebug() const;
+
+  // Returns a pointer to the string with misadaption information for this word.
+  // Assumes that the word's blamer_bundle is not nullptr.
+  const char *GetBlamerMisadaptionDebug() const;
+
+  // Returns true if a truth string was recorded for the current word.
+  bool HasTruthString() const;
+
+  // Returns true if the given string is equivalent to the truth string for
+  // the current word.
+  bool EquivalentToTruth(const char *str) const;
+
+  // Returns a null terminated UTF-8 encoded truth string for the current word.
+  // Use delete [] to free after use.
+  char *WordTruthUTF8Text() const;
+
+  // Returns a null terminated UTF-8 encoded normalized OCR string for the
+  // current word. Use delete [] to free after use.
+  char *WordNormedUTF8Text() const;
+
+  // Returns a pointer to serialized choice lattice.
+  // Fills lattice_size with the number of bytes in lattice data.
+  const char *WordLattice(int *lattice_size) const;
+
+  // ============= Functions that refer to symbols only ============.
+
+  // Returns true if the current symbol is a superscript.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsSuperscript() const;
+  // Returns true if the current symbol is a subscript.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsSubscript() const;
+  // Returns true if the current symbol is a dropcap.
+  // If iterating at a higher level object than symbols, eg words, then
+  // this will return the attributes of the first symbol in that word.
+  bool SymbolIsDropcap() const;
+
+protected:
+  const char *line_separator_;      // Presumably set by SetLineSeparator.
+  const char *paragraph_separator_; // Presumably set by SetParagraphSeparator.
+};
+
+// Class to iterate over the classifier choices for a single RIL_SYMBOL.
+class TESS_API ChoiceIterator {
+public:
+  // Construction is from a LTRResultIterator that points to the symbol of
+  // interest. The ChoiceIterator allows a one-shot iteration over the
+  // choices for this symbol and after that it is useless.
+  explicit ChoiceIterator(const LTRResultIterator &result_it);
+  ~ChoiceIterator(); // NOTE(review): no copy operations declared - confirm copying is safe.
+
+  // Moves to the next choice for the symbol and returns false if there
+  // are none left.
+  bool Next();
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // choice.
+  // NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
+  // internal structure and should NOT be delete[]ed to free after use.
+  const char *GetUTF8Text() const;
+
+  // Returns the confidence of the current choice depending on the used language
+  // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
+  // choices for one symbol should roughly add up to 1.0f.
+  // If only traineddata of the legacy engine is used, the number should be
+  // interpreted as a percent probability. (0.0f-100.0f) In this case
+  // probabilities won't add up to 100. Each one stands on its own.
+  float Confidence() const;
+
+  // Returns a vector containing all timesteps, which belong to the currently
+  // selected symbol. A timestep is a vector containing pairs of symbols and
+  // floating point numbers. The number states the probability for the
+  // corresponding symbol.
+  std::vector<std::vector<std::pair<const char *, float>>> *Timesteps() const;
+
+private:
+  // Clears the remaining spaces out of the results and adapts the probabilities.
+  void filterSpaces();
+  // Pointer to the WERD_RES object owned by the API.
+  WERD_RES *word_res_;
+  // Iterator over the blob choices.
+  BLOB_CHOICE_IT *choice_it_;
+  std::vector<std::pair<const char *, float>> *LSTM_choices_ = nullptr;
+  std::vector<std::pair<const char *, float>>::iterator LSTM_choice_it_;
+
+  const int *tstep_index_; // NOTE(review): meaning not visible in this header.
+  // Regulates the rating granularity.
+  double rating_coefficient_;
+  // Number of leading blanks.
+  int blanks_before_word_;
+  // True when there is LSTM engine related trained data.
+  bool oemLSTM_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/ocrclass.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/ocrclass.h
@ -0,0 +1,157 @@
+/**********************************************************************
+ * File:        ocrclass.h
+ * Description: Class definitions and constants for the OCR API.
+ * Author:      Hewlett-Packard Co
+ *
+ * (C) Copyright 1996, Hewlett-Packard Co.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+/**********************************************************************
+ * This file contains typedefs for all the structures used by
+ * the HP OCR interface.
+ * The structures are designed to allow them to be used with any
+ * structure alignment up to 8.
+ **********************************************************************/
+
+#ifndef CCUTIL_OCRCLASS_H_
+#define CCUTIL_OCRCLASS_H_
+
+#include <chrono>
+#include <cstdint>
+#include <ctime>
+
+namespace tesseract {
+
+/**********************************************************************
+ * EANYCODE_CHAR
+ * Description of a single character. The character code is defined by
+ * the character set of the current font.
+ * Output text is sent as an array of these structures.
+ * Spaces and line endings in the output are represented in the
+ * structures of the surrounding characters. They are not directly
+ * represented as characters.
+ * The first character in a word has a positive value of blanks.
+ * Missing information should be set to the defaults in the comments.
+ * If word bounds are known, but not character bounds, then the top and
+ * bottom of each character should be those of the word. The left of the
+ * first and right of the last char in each word should be set. All other
+ * lefts and rights should be set to -1.
+ * If set, the values of right and bottom are left+width and top+height.
+ * Most of the members come directly from the parameters to ocr_append_char.
+ * The formatting member uses the enhancement parameter and combines the
+ * line direction stuff into the top 3 bits.
+ * The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
+ * 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
+ * the coding is, only that it is backwards compatible with the previous
+ * version.
+ **********************************************************************/
+
+struct EANYCODE_CHAR { /* description of a single output character */
+  // NOTE: from version 2.0 onwards the format of char_code is UTF-8. An ASCII
+  // character comes out as one structure, but any other character is returned
+  // as two or more instances of this structure, each holding a single byte of
+  // the UTF-8 code and all sharing the same bounding box. Programs that want
+  // to handle languages with different character sets need to handle extended
+  // characters appropriately, but *all* code needs to be prepared to receive
+  // UTF-8 coded characters for characters such as bullet and fancy quotes.
+  // The value in parentheses at the end of each member comment is the default
+  // to use when that piece of information is missing.
+  uint16_t char_code; /* character itself (one UTF-8 byte, see note above) */
+  int16_t left;       /* left edge of char (-1) */
+  int16_t right;      /* right edge of char, left+width when set (-1) */
+  int16_t top;        /* top edge of char (-1) */
+  int16_t bottom;     /* bottom edge of char, top+height when set (-1) */
+  int16_t font_index; /* what font (0) */
+  uint8_t confidence; /* 0=perfect, 100=reject (0/100) */
+  uint8_t point_size; /* of char, 72=1 inch, (10) */
+  int8_t blanks;      /* number of spaces before this char (1) */
+  uint8_t formatting; /* char formatting; top 3 bits hold line direction (0) */
+};
+
+/**********************************************************************
+ * ETEXT_DESC
+ * Description of the output of the OCR engine.
+ * This structure is used as both a progress monitor and the final
+ * output header, since it needs to be a valid progress monitor while
+ * the OCR engine is storing its output to shared memory.
+ * During progress, all the buffer info is -1.
+ * Progress starts at 0 and increases to 100 during OCR. No other constraint.
+ * Additionally the progress callback contains the bounding box of the word that
+ * is currently being processed.
+ * Every progress callback, the OCR engine must set ocr_alive to 1.
+ * The HP side will set ocr_alive to 0. Repeated failure to reset
+ * to 1 indicates that the OCR engine is dead.
+ * If the cancel function is not null then it is called with the number of
+ * user words found. If it returns true then operation is cancelled.
+ **********************************************************************/
+// Forward declaration: PROGRESS_FUNC2 below takes a pointer to the monitor
+// before the class itself is defined.
+class ETEXT_DESC;
+
+// Cancel callback. The int argument is the number of user words found; a true
+// return value cancels the operation. The void* is presumably
+// ETEXT_DESC::cancel_this -- TODO confirm at the call site.
+using CANCEL_FUNC = bool (*)(void *, int);
+// Progress callback invoked as (progress, left, right, top, bottom), where the
+// last four values are the bounding box of the word being processed.
+using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
+// Monitor-aware progress callback invoked as (monitor, left, right, top,
+// bottom); the progress value is read from the monitor itself.
+using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
+
+class ETEXT_DESC { // output header, doubles as the progress monitor
+public:
+  /// Chars in this buffer (0).
+  int16_t count{0};
+  /** Percent complete, increasing (0-100).
+   * The progress monitor covers word recognition; it does not cover layout
+   * analysis.
+   * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
+  int16_t progress{0};
+  /// True if not last.
+  int8_t more_to_come{0};
+  /// OCR sets this to 1, the HP side sets it back to 0.
+  volatile int8_t ocr_alive{0};
+  /// For errcode use.
+  int8_t err_code{0};
+  /// Returns true to cancel.
+  CANCEL_FUNC cancel{nullptr};
+  /// Called whenever progress increases.
+  PROGRESS_FUNC progress_callback{nullptr};
+  /// Monitor-aware progress callback.
+  PROGRESS_FUNC2 progress_callback2;
+  /// This or other data for cancel.
+  void *cancel_this{nullptr};
+  /// Time to stop. Expected to be set only by a call to set_deadline_msecs().
+  /// A value equal to the clock epoch means that no deadline has been set.
+  std::chrono::steady_clock::time_point end_time;
+  /// Character data.
+  EANYCODE_CHAR text[1]{};
+
+  /// Installs default_progress_func as progress_callback2 and starts with
+  /// end_time at the clock epoch, i.e. without a deadline.
+  ETEXT_DESC() : progress_callback2(&default_progress_func), end_time() {}
+
+  /// Sets the end time to be deadline_msecs milliseconds from now.
+  /// A value of zero or less leaves end_time unchanged.
+  void set_deadline_msecs(int32_t deadline_msecs) {
+    if (deadline_msecs <= 0) {
+      return;
+    }
+    const auto budget = std::chrono::milliseconds(deadline_msecs);
+    end_time = std::chrono::steady_clock::now() + budget;
+  }
+
+  /// Returns true only if a deadline has been set and the steady clock is
+  /// already past it; returns false otherwise.
+  bool deadline_exceeded() const {
+    const bool deadline_set = end_time.time_since_epoch() !=
+                              std::chrono::steady_clock::duration::zero();
+    return deadline_set && std::chrono::steady_clock::now() > end_time;
+  }
+
+private:
+  /// Default value of progress_callback2: when progress_callback is installed
+  /// it is called with the monitor's current progress and the given box and
+  /// its result is returned; otherwise true is returned.
+  static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
+                                    int top, int bottom) {
+    if (ths->progress_callback == nullptr) {
+      return true;
+    }
+    return ths->progress_callback(ths->progress, left, right, top, bottom);
+  }
+};
+
+} // namespace tesseract
+
+#endif // CCUTIL_OCRCLASS_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/osdetect.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/osdetect.h
@ -0,0 +1,141 @@
+///////////////////////////////////////////////////////////////////////
+// File:        osdetect.h
+// Description: Orientation and script detection.
+// Author:      Samuel Charron
+//              Ranjith Unnikrishnan
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_OSDETECT_H_
+#define TESSERACT_CCMAIN_OSDETECT_H_
+
+#include "export.h" // for TESS_API
+
+#include <vector> // for std::vector
+
+namespace tesseract {
+
+class BLOBNBOX;
+class BLOBNBOX_CLIST;
+class BLOB_CHOICE_LIST;
+class TO_BLOCK_LIST;
+class UNICHARSET;
+
+class Tesseract;
+
+// Upper bound on the number of scripts, built up from the summands below.
+constexpr int kMaxNumberOfScripts = 116 /* scripts in ICU */
+                                    + 1 /* "NULL" */
+                                    + 2 /* Japanese and Korean */
+                                    + 1 /* Fraktur */;
+
+// Best orientation/script pair together with the confidence of each choice.
+// Every field starts at zero.
+struct OSBestResult {
+  // Kept user-provided so that the type does not turn into an aggregate.
+  OSBestResult() {}
+  int orientation_id = 0;
+  int script_id = 0;
+  float sconfidence = 0.0f;
+  float oconfidence = 0.0f;
+};
+
+struct OSResults {
+  // Starts with all orientation scores and all per-orientation script scores
+  // at zero, and with no unicharset attached.
+  OSResults() : orientations{}, scripts_na{}, unicharset(nullptr) {}
+
+  void update_best_orientation();
+  // Sets the orientation estimate to the given id.
+  void set_best_orientation(int orientation_id);
+  // Updates/computes the best script estimate under the assumption that the
+  // page has the given orientation id.
+  void update_best_script(int orientation_id);
+  // Returns the index of the highest-scoring script for this orientation.
+  TESS_API int get_best_script(int orientation_id) const;
+  // Accumulates the scores of another OSResults instance into this one and
+  // updates the best script.
+  void accumulate(const OSResults &osr);
+
+  // Print statistics, either for all orientations or for a single one.
+  void print_scores() const;
+  void print_scores(int orientation_id) const;
+
+  // Scores per orientation id [0,3]. Ids [0..3] correspond to page
+  // orientations of [0, 270, 180, 90] degrees respectively, each value being
+  // the amount of clockwise rotation to apply to the page so that the text is
+  // upright and readable.
+  float orientations[4];
+  // Script confidence scores for each of the 4 possible orientations.
+  float scripts_na[4][kMaxNumberOfScripts];
+
+  UNICHARSET *unicharset;
+  OSBestResult best_result;
+};
+
+// Orientation half of the detector pair handed to os_detect_blob().
+// NOTE(review): only declarations are visible in this header; the behaviour
+// of the member functions must be confirmed in the implementation file.
+class OrientationDetector {
+public:
+  // Takes a vector of allowed scripts and the OSResults instance to work with.
+  // Both arrive as raw pointers; presumably neither is owned by this object --
+  // TODO confirm.
+  OrientationDetector(const std::vector<int> *allowed_scripts,
+                      OSResults *results);
+  // Processes the choices of one blob. NOTE(review): the meaning of the bool
+  // result is not visible here -- confirm before relying on it.
+  bool detect_blob(BLOB_CHOICE_LIST *scores);
+  int get_orientation();
+
+private:
+  // Results object supplied to the constructor (presumably) -- TODO confirm.
+  OSResults *osr_;
+  // Allowed-scripts vector supplied to the constructor (presumably).
+  const std::vector<int> *allowed_scripts_;
+};
+
+// Script half of the detector pair handed to os_detect_blob().
+// NOTE(review): only declarations are visible in this header; the behaviour
+// of the member functions must be confirmed in the implementation file.
+class ScriptDetector {
+public:
+  // Takes a vector of allowed scripts, the OSResults instance to work with and
+  // a Tesseract instance. All three arrive as raw pointers; presumably none is
+  // owned by this object -- TODO confirm.
+  ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
+                 tesseract::Tesseract *tess);
+  // Processes the choices of one blob.
+  void detect_blob(BLOB_CHOICE_LIST *scores);
+  // NOTE(review): looks like an early-termination test for the given
+  // orientation -- confirm the exact criterion in the implementation.
+  bool must_stop(int orientation) const;
+
+private:
+  OSResults *osr_;
+  // Script name strings shared by all instances; defined out of line.
+  static const char *korean_script_;
+  static const char *japanese_script_;
+  static const char *fraktur_script_;
+  // Per-script integer ids. NOTE(review): presumably ids within the
+  // unicharset's script table -- TODO confirm where they are assigned.
+  int korean_id_;
+  int japanese_id_;
+  int katakana_id_;
+  int hiragana_id_;
+  int han_id_;
+  int hangul_id_;
+  int latin_id_;
+  int fraktur_id_;
+  tesseract::Tesseract *tess_;
+  const std::vector<int> *allowed_scripts_;
+};
+
+// Orientation and script detection (OSD) entry points. Only the declarations
+// are visible in this header.
+// NOTE(review): the meaning of the int/bool return values below is defined by
+// the implementation -- confirm there before relying on a convention.
+
+// OSD starting from a file name; presumably the outcome is written to the
+// given OSResults -- TODO confirm.
+int orientation_and_script_detection(const char *filename, OSResults *,
+                                     tesseract::Tesseract *);
+
+// OSD over a list of blocks.
+int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
+              tesseract::Tesseract *tess);
+
+// OSD over a list of blobs, with a vector of allowed scripts.
+int os_detect_blobs(const std::vector<int> *allowed_scripts,
+                    BLOBNBOX_CLIST *blob_list, OSResults *osr,
+                    tesseract::Tesseract *tess);
+
+// OSD for a single blob, using the supplied orientation and script detectors.
+bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
+                    OSResults *, tesseract::Tesseract *tess);
+
+// Helper method to convert an orientation index to its value in degrees.
+// The value represents the amount of clockwise rotation in degrees that must be
+// applied for the text to be upright (readable).
+TESS_API int OrientationIdToValue(const int &id);
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_OSDETECT_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/pageiterator.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/pageiterator.h
@ -0,0 +1,362 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pageiterator.h
+// Description: Iterator for tesseract page structure that avoids using
+//              tesseract internal data structures.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
+#define TESSERACT_CCMAIN_PAGEITERATOR_H_
+
+#include "export.h"
+#include "publictypes.h"
+
+struct Pix;
+struct Pta;
+
+namespace tesseract {
+
+struct BlamerBundle;
+class C_BLOB_IT;
+class PAGE_RES;
+class PAGE_RES_IT;
+class WERD;
+
+class Tesseract;
+
+/**
+ * Class to iterate over tesseract page structure, providing access to all
+ * levels of the page hierarchy, without including any tesseract headers or
+ * having to handle any tesseract structures.
+ * WARNING! This class points to data held within the TessBaseAPI class, and
+ * therefore can only be used while the TessBaseAPI class still exists and
+ * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+ * DetectOS, or anything else that changes the internal PAGE_RES.
+ * See tesseract/publictypes.h for the definition of PageIteratorLevel.
+ * See also ResultIterator, derived from PageIterator, which adds in the
+ * ability to access OCR output with text-specific methods.
+ */
+
+class TESS_API PageIterator {
+public:
+  /**
+   * page_res and tesseract come directly from the BaseAPI.
+   * The rectangle parameters are copied indirectly from the Thresholder,
+   * via the BaseAPI. They represent the coordinates of some rectangle in an
+   * original image (in top-left-origin coordinates) and therefore the top-left
+   * needs to be added to any output boxes in order to specify coordinates
+   * in the original image. See TessBaseAPI::SetRectangle.
+   * The scale and scaled_yres are in case the Thresholder scaled the image
+   * rectangle prior to thresholding. Any coordinates in tesseract's image
+   * must be divided by scale before adding (rect_left, rect_top).
+   * The scaled_yres indicates the effective resolution of the binary image
+   * that tesseract has been given by the Thresholder.
+   * After the constructor, Begin has already been called.
+   */
+  PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
+               int scaled_yres, int rect_left, int rect_top, int rect_width,
+               int rect_height);
+  virtual ~PageIterator();
+
+  /**
+   * Page/ResultIterators may be copied! This makes it possible to iterate over
+   * all the objects at a lower level, while maintaining an iterator to
+   * objects at a higher level. The copy constructor and the assignment
+   * operator DO NOT CALL Begin, so iterations will continue from the location
+   * of src.
+   */
+  PageIterator(const PageIterator &src);
+  const PageIterator &operator=(const PageIterator &src);
+
+  /** Are we positioned at the same location as other? */
+  bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
+
+  // ============= Moving around within the page ============.
+
+  /**
+   * Moves the iterator to point to the start of the page to begin an
+   * iteration.
+   */
+  virtual void Begin();
+
+  /**
+   * Moves the iterator to the beginning of the paragraph.
+   * This class implements this functionality by moving it to the zero indexed
+   * blob of the first (leftmost) word on the first row of the paragraph.
+   */
+  virtual void RestartParagraph();
+
+  /**
+   * Return whether this iterator points anywhere in the first textline of a
+   * paragraph.
+   */
+  bool IsWithinFirstTextlineOfParagraph() const;
+
+  /**
+   * Moves the iterator to the beginning of the text line.
+   * This class implements this functionality by moving it to the zero indexed
+   * blob of the first (leftmost) word of the row.
+   */
+  virtual void RestartRow();
+
+  /**
+   * Moves to the start of the next object at the given level in the
+   * page hierarchy, and returns false if the end of the page was reached.
+   * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
+   * PageIteratorLevel level values will visit each non-text block once.
+   * Think of non text blocks as containing a single para, with a single line,
+   * with a single imaginary word.
+   * Calls to Next with different levels may be freely intermixed.
+   * This function iterates words in right-to-left scripts correctly, if
+   * the appropriate language has been loaded into Tesseract.
+   */
+  virtual bool Next(PageIteratorLevel level);
+
+  /**
+   * Returns true if the iterator is at the start of an object at the given
+   * level.
+   *
+   * For instance, suppose an iterator it is pointed to the first symbol of the
+   * first word of the third line of the second paragraph of the first block in
+   * a page, then:
+   *   it.IsAtBeginningOf(RIL_BLOCK) = false
+   *   it.IsAtBeginningOf(RIL_PARA) = false
+   *   it.IsAtBeginningOf(RIL_TEXTLINE) = true
+   *   it.IsAtBeginningOf(RIL_WORD) = true
+   *   it.IsAtBeginningOf(RIL_SYMBOL) = true
+   */
+  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
+
+  /**
+   * Returns whether the iterator is positioned at the last element in a
+   * given level. (e.g. the last word in a line, the last line in a block)
+   *
+   *     Here's some two-paragraph example
+   *   text.  It starts off innocuously
+   *   enough but quickly turns bizarre.
+   *     The author inserts a cornucopia
+   *   of words to guard against confused
+   *   references.
+   *
+   * Now take an iterator it pointed to the start of "bizarre."
+   *  it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
+   *  it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
+   *  it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
+   */
+  virtual bool IsAtFinalElement(PageIteratorLevel level,
+                                PageIteratorLevel element) const;
+
+  /**
+   * Returns whether this iterator is positioned
+   *   before other:   -1
+   *   equal to other:  0
+   *   after other:     1
+   */
+  int Cmp(const PageIterator &other) const;
+
+  // ============= Accessing data ==============.
+  // Coordinate system:
+  // Integer coordinates are at the cracks between the pixels.
+  // The top-left corner of the top-left pixel in the image is at (0,0).
+  // The bottom-right corner of the bottom-right pixel in the image is at
+  // (width, height).
+  // Every bounding box goes from the top-left of the top-left contained
+  // pixel to the bottom-right of the bottom-right contained pixel, so
+  // the bounding box of the single top-left pixel in the image is:
+  // (0,0)->(1,1).
+  // If an image rectangle has been set in the API, then returned coordinates
+  // relate to the original (full) image, rather than the rectangle.
+
+  /**
+   * Controls what to include in a bounding box. Bounding boxes of all levels
+   * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
+   * Between layout analysis and recognition, it isn't known where all
+   * diacritics belong, so this control is used to include or exclude some
+   * diacritics that are above or below the main body of the word. In most cases
+   * where the placement is obvious, and after recognition, it doesn't make as
+   * much difference, as the diacritics will already be included in the word.
+   */
+  void SetBoundingBoxComponents(bool include_upper_dots,
+                                bool include_lower_dots) {
+    include_upper_dots_ = include_upper_dots;
+    include_lower_dots_ = include_lower_dots;
+  }
+
+  /**
+   * Returns the bounding rectangle of the current object at the given level.
+   * See comment on coordinate system above.
+   * Returns false if there is no such object at the current position.
+   * The returned bounding box is guaranteed to match the size and position
+   * of the image returned by GetBinaryImage, but may clip foreground pixels
+   * from a grey image. The padding argument to GetImage can be used to expand
+   * the image to include more foreground pixels. See GetImage below.
+   */
+  bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
+                   int *bottom) const;
+  /**
+   * Overload of BoundingBox that additionally takes a padding value.
+   * NOTE(review): presumably the box is grown by padding on every side --
+   * confirm the exact behaviour (including clipping) in the implementation.
+   */
+  bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
+                   int *right, int *bottom) const;
+  /**
+   * Returns the bounding rectangle of the object in a coordinate system of the
+   * working image rectangle having its origin at (rect_left_, rect_top_) with
+   * respect to the original image and is scaled by a factor scale_.
+   */
+  bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
+                           int *right, int *bottom) const;
+
+  /** Returns whether there is no object of a given level. */
+  bool Empty(PageIteratorLevel level) const;
+
+  /**
+   * Returns the type of the current block.
+   * See tesseract/publictypes.h for PolyBlockType.
+   */
+  PolyBlockType BlockType() const;
+
+  /**
+   * Returns the polygon outline of the current block. The returned Pta must
+   * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
+   * of the polygon, and the last edge is the line segment between the last
+   * point and the first point. nullptr will be returned if the iterator is
+   * at the end of the document or layout analysis was not used.
+   */
+  Pta *BlockPolygon() const;
+
+  /**
+   * Returns a binary image of the current object at the given level.
+   * The position and size match the return from BoundingBoxInternal, and so
+   * this could be upscaled with respect to the original input image.
+   * Use pixDestroy to delete the image after use.
+   */
+  Pix *GetBinaryImage(PageIteratorLevel level) const;
+
+  /**
+   * Returns an image of the current object at the given level in greyscale
+   * if available in the input. To guarantee a binary image use
+   * GetBinaryImage.
+   * NOTE that in order to give the best possible image, the bounds are
+   * expanded slightly over the binary connected component, by the supplied
+   * padding, so the top-left position of the returned image is returned
+   * in (left,top). These will most likely not match the coordinates
+   * returned by BoundingBox.
+   * If you do not supply an original image, you will get a binary one.
+   * Use pixDestroy to delete the image after use.
+   */
+  Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
+                int *left, int *top) const;
+
+  /**
+   * Returns the baseline of the current object at the given level.
+   * The baseline is the line that passes through (x1, y1) and (x2, y2).
+   * WARNING: with vertical text, baselines may be vertical!
+   * Returns false if there is no baseline at the current position.
+   */
+  bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
+                int *y2) const;
+
+  /**
+   * Returns orientation for the block the iterator points to.
+   *   orientation, writing_direction, textline_order: see publictypes.h
+   *   deskew_angle: after rotating the block so the text orientation is
+   *                 upright, how many radians does one have to rotate the
+   *                 block anti-clockwise for it to be level?
+   *                   -Pi/4 <= deskew_angle <= Pi/4
+   */
+  void Orientation(tesseract::Orientation *orientation,
+                   tesseract::WritingDirection *writing_direction,
+                   tesseract::TextlineOrder *textline_order,
+                   float *deskew_angle) const;
+
+  /**
+   * Returns information about the current paragraph, if available.
+   *
+   *   justification -
+   *     LEFT if ragged right, or fully justified and script is left-to-right.
+   *     RIGHT if ragged left, or fully justified and script is right-to-left.
+   *     unknown if it looks like source code or we have very few lines.
+   *   is_list_item -
+   *     true if we believe this is a member of an ordered or unordered list.
+   *   is_crown -
+   *     true if the first line of the paragraph is aligned with the other
+   *     lines of the paragraph even though subsequent paragraphs have first
+   *     line indents.  This typically indicates that this is the continuation
+   *     of a previous paragraph or that it is the very first paragraph in
+   *     the chapter.
+   *   first_line_indent -
+   *     For LEFT aligned paragraphs, the first text line of paragraphs of
+   *     this kind are indented this many pixels from the left edge of the
+   *     rest of the paragraph.
+   *     for RIGHT aligned paragraphs, the first text line of paragraphs of
+   *     this kind are indented this many pixels from the right edge of the
+   *     rest of the paragraph.
+   *     NOTE 1: This value may be negative.
+   *     NOTE 2: if *is_crown == true, the first line of this paragraph is
+   *             actually flush, and first_line_indent is set to the "common"
+   *             first_line_indent for subsequent paragraphs in this block
+   *             of text.
+   */
+  void ParagraphInfo(tesseract::ParagraphJustification *justification,
+                     bool *is_list_item, bool *is_crown,
+                     int *first_line_indent) const;
+
+  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
+  // of the current word to the given pointer (takes ownership of the pointer)
+  // and returns true.
+  // Can only be used when iterating on the word level.
+  bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
+
+protected:
+  /**
+   * Sets up the internal data for iterating the blobs of a new word, then
+   * moves the iterator to the given offset.
+   */
+  void BeginWord(int offset);
+
+  /** Pointer to the page_res owned by the API. */
+  PAGE_RES *page_res_;
+  /** Pointer to the Tesseract object owned by the API. */
+  Tesseract *tesseract_;
+  /**
+   * The iterator to the page_res_. Owned by this iterator object.
+   * A pointer just to avoid dragging in Tesseract includes.
+   */
+  PAGE_RES_IT *it_;
+  /**
+   * The current input WERD being iterated. If there is an output from OCR,
+   * then word_ is nullptr. Owned by the API.
+   */
+  WERD *word_;
+  /** The length of the current word_. */
+  int word_length_;
+  /** The current blob index within the word. */
+  int blob_index_;
+  /**
+   * Iterator to the blobs within the word. If nullptr, then we are iterating
+   * OCR results in the box_word.
+   * Owned by this iterator object.
+   */
+  C_BLOB_IT *cblob_it_;
+  /**
+   * Control over what to include in bounding boxes.
+   * Written by SetBoundingBoxComponents().
+   */
+  bool include_upper_dots_;
+  bool include_lower_dots_;
+  /** Parameters saved from the Thresholder. Needed to rebuild coordinates. */
+  int scale_;
+  int scaled_yres_;
+  int rect_left_;
+  int rect_top_;
+  int rect_width_;
+  int rect_height_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/publictypes.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/publictypes.h
@ -0,0 +1,283 @@
+///////////////////////////////////////////////////////////////////////
+// File:        publictypes.h
+// Description: Types used in both the API and internally
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
+#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
+
+namespace tesseract {
+
+// This file contains types that are used both by the API and internally
+// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
+// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
+// Restated: It is OK for low-level Tesseract files to include publictypes.h,
+// but not for the low-level tesseract code to include top-level API code.
+// This file should not use other Tesseract types, as that would drag
+// their includes into the API-level.
+
+/** Number of printers' points in an inch. The unit of returned pointsizes. */
+constexpr int kPointsPerInch = 72;
+/**
+ * Minimum believable resolution. Used as a default if there is no other
+ * information, as it is safer to under-estimate than over-estimate.
+ */
+constexpr int kMinCredibleResolution = 70;
+/** Maximum believable resolution. */
+constexpr int kMaxCredibleResolution = 2400;
+/**
+ * Ratio between median blob size and likely resolution. Used to estimate
+ * resolution when none is provided. This is basically 1/usual text size in
+ * inches. */
+constexpr int kResolutionEstimationFactor = 10;
+
+/**
+ * Possible types for a POLY_BLOCK or ColPartition.
+ * Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
+ * below, as well as kPolyBlockNames in layout_test.cc.
+ * Used extensively by ColPartition, and POLY_BLOCK.
+ */
+enum PolyBlockType {
+  PT_UNKNOWN,         ///< Type is not yet known. Keep as the first element.
+  PT_FLOWING_TEXT,    ///< Text that lives inside a column.
+  PT_HEADING_TEXT,    ///< Text that spans more than one column.
+  PT_PULLOUT_TEXT,    ///< Text that is in a cross-column pull-out region.
+  PT_EQUATION,        ///< Partition belonging to an equation region.
+  PT_INLINE_EQUATION, ///< Partition has inline equation.
+  PT_TABLE,           ///< Partition belonging to a table region.
+  PT_VERTICAL_TEXT,   ///< Text-line runs vertically.
+  PT_CAPTION_TEXT,    ///< Text that belongs to an image.
+  PT_FLOWING_IMAGE,   ///< Image that lives inside a column.
+  PT_HEADING_IMAGE,   ///< Image that spans more than one column.
+  PT_PULLOUT_IMAGE,   ///< Image that is in a cross-column pull-out region.
+  PT_HORZ_LINE,       ///< Horizontal Line.
+  PT_VERT_LINE,       ///< Vertical Line.
+  PT_NOISE,           ///< Lies outside of any column.
+  PT_COUNT            ///< Number of types above. Keep as the last element.
+};
+
+/** Returns true if PolyBlockType is a horizontal or vertical line type. */
+inline bool PTIsLineType(PolyBlockType type) {
+  switch (type) {
+    case PT_HORZ_LINE:
+    case PT_VERT_LINE:
+      return true;
+    default:
+      return false;
+  }
+}
+/** Returns true if PolyBlockType is a flowing, heading or pull-out image. */
+inline bool PTIsImageType(PolyBlockType type) {
+  switch (type) {
+    case PT_FLOWING_IMAGE:
+    case PT_HEADING_IMAGE:
+    case PT_PULLOUT_IMAGE:
+      return true;
+    default:
+      return false;
+  }
+}
+/**
+ * Returns true if PolyBlockType is of text type: flowing, heading, pull-out,
+ * table, vertical, caption or inline-equation.
+ */
+inline bool PTIsTextType(PolyBlockType type) {
+  switch (type) {
+    case PT_FLOWING_TEXT:
+    case PT_HEADING_TEXT:
+    case PT_PULLOUT_TEXT:
+    case PT_TABLE:
+    case PT_VERTICAL_TEXT:
+    case PT_CAPTION_TEXT:
+    case PT_INLINE_EQUATION:
+      return true;
+    default:
+      return false;
+  }
+}
+/** Returns true if PolyBlockType is of pullout (inter-column) type. */
+inline bool PTIsPulloutType(PolyBlockType type) {
+  switch (type) {
+    case PT_PULLOUT_TEXT:
+    case PT_PULLOUT_IMAGE:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ *  +------------------+  Orientation Example:
+ *  | 1 Aaaa Aaaa Aaaa |  ====================
+ *  | Aaa aa aaa aa    |  To left is a diagram of some (1) English and
+ *  | aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.
+ *  |                2 |
+ *  |   #######  c c C |  Upright Latin characters are represented as A and a.
+ *  |   #######  c c c |  '<' represents a latin character rotated
+ *  | < #######  c c c |      anti-clockwise 90 degrees.
+ *  | < #######  c   c |
+ *  | < #######  .   c |  Upright Chinese characters are represented C and c.
+ *  | 3 #######      c |
+ *  +------------------+  NOTA BENE: enum values here should match goodoc.proto
+ *
+ * If you orient your head so that "up" aligns with Orientation,
+ * then the characters will appear "right side up" and readable.
+ *
+ * In the example above, both the English and Chinese paragraphs are oriented
+ * so their "up" is the top of the page (page up).  The photo credit is read
+ * with one's head turned leftward ("up" is to page left).
+ *
+ * The values of this enum match the convention of Tesseract's osdetect.h
+ */
+enum Orientation {
+  ORIENTATION_PAGE_UP = 0,    ///< Text "up" is the top of the page.
+  ORIENTATION_PAGE_RIGHT = 1, ///< Text "up" is to page right.
+  ORIENTATION_PAGE_DOWN = 2,  ///< Text "up" is the bottom of the page.
+  ORIENTATION_PAGE_LEFT = 3,  ///< Text "up" is to page left.
+};
+
+/**
+ * The grapheme clusters within a line of text are laid out logically
+ * in this direction, judged when looking at the text line rotated so that
+ * its Orientation is "page up".
+ *
+ * For English text, the writing direction is left-to-right.  For the
+ * Chinese text in the above example, the writing direction is top-to-bottom.
+ */
+enum WritingDirection {
+  WRITING_DIRECTION_LEFT_TO_RIGHT = 0, ///< E.g. English text.
+  WRITING_DIRECTION_RIGHT_TO_LEFT = 1, ///< Clusters run right to left.
+  WRITING_DIRECTION_TOP_TO_BOTTOM = 2, ///< E.g. the Chinese text above.
+};
+
+/**
+ * The text lines are read in the given sequence.
+ *
+ * In English, the order is top-to-bottom.
+ * In Chinese, vertical text lines are read right-to-left.  Mongolian is
+ * written in vertical columns top to bottom like Chinese, but the lines
+ * are ordered left-to-right.
+ *
+ * Note that only some combinations make sense.  For example,
+ * WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
+ */
+enum TextlineOrder {
+  TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, ///< E.g. vertical Mongolian text lines.
+  TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, ///< E.g. vertical Chinese text lines.
+  TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, ///< E.g. English text lines.
+};
+
+/**
+ * Possible modes for page layout analysis. These *must* be kept in order
+ * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
+ * so that the inequality tests in the PSM_*() functions below work.
+ */
+enum PageSegMode {
+  PSM_OSD_ONLY = 0,      ///< Orientation and script detection only.
+  PSM_AUTO_OSD = 1,      ///< Automatic page segmentation with orientation and
+                         ///< script detection. (OSD)
+  PSM_AUTO_ONLY = 2,     ///< Automatic page segmentation, but no OSD, or OCR.
+  PSM_AUTO = 3,          ///< Fully automatic page segmentation, but no OSD.
+  PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
+  PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
+                                  ///< vertically aligned text.
+  PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
+  PSM_SINGLE_LINE = 7,  ///< Treat the image as a single text line.
+  PSM_SINGLE_WORD = 8,  ///< Treat the image as a single word.
+  PSM_CIRCLE_WORD = 9,  ///< Treat the image as a single word in a circle.
+  PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
+  PSM_SPARSE_TEXT =
+      11, ///< Find as much text as possible in no particular order.
+  PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
+  PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
+                     ///< hacks that are Tesseract-specific.
+
+  PSM_COUNT ///< Number of enum entries.
+};
+
+/**
+ * Inline functions that act on a PageSegMode to determine whether components of
+ * layout analysis are enabled.
+ * *Depend critically on the order of elements of PageSegMode.*
+ * NOTE that arg is an int for compatibility with INT_PARAM.
+ */
+/** True if pageseg_mode is at most PSM_AUTO_OSD or is PSM_SPARSE_TEXT_OSD. */
+inline bool PSM_OSD_ENABLED(int pageseg_mode) {
+  if (pageseg_mode == PSM_SPARSE_TEXT_OSD) {
+    return true;
+  }
+  return pageseg_mode <= PSM_AUTO_OSD;
+}
+/** True if pageseg_mode is at most PSM_AUTO or is PSM_SPARSE_TEXT_OSD. */
+inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
+  if (pageseg_mode == PSM_SPARSE_TEXT_OSD) {
+    return true;
+  }
+  return pageseg_mode <= PSM_AUTO;
+}
+/** True if pageseg_mode lies in the range [PSM_AUTO_OSD, PSM_AUTO]. */
+inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
+  const bool below_range = pageseg_mode < PSM_AUTO_OSD;
+  const bool above_range = pageseg_mode > PSM_AUTO;
+  return !(below_range || above_range);
+}
+/** True if pageseg_mode is PSM_SPARSE_TEXT or PSM_SPARSE_TEXT_OSD. */
+inline bool PSM_SPARSE(int pageseg_mode) {
+  switch (pageseg_mode) {
+    case PSM_SPARSE_TEXT:
+    case PSM_SPARSE_TEXT_OSD:
+      return true;
+    default:
+      return false;
+  }
+}
+/** True if pageseg_mode lies in the range [PSM_AUTO_OSD, PSM_SINGLE_COLUMN]. */
+inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
+  if (pageseg_mode < PSM_AUTO_OSD) {
+    return false;
+  }
+  return pageseg_mode <= PSM_SINGLE_COLUMN;
+}
+/** True if pageseg_mode lies in the range [PSM_AUTO_OSD, PSM_SINGLE_BLOCK]. */
+inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
+  if (pageseg_mode > PSM_SINGLE_BLOCK) {
+    return false;
+  }
+  return pageseg_mode >= PSM_AUTO_OSD;
+}
+/**
+ * True if pageseg_mode lies in the range [PSM_AUTO_OSD, PSM_SINGLE_LINE] or
+ * is one of the sparse modes (PSM_SPARSE_TEXT, PSM_SPARSE_TEXT_OSD).
+ */
+inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
+  if (pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD) {
+    return true;
+  }
+  return PSM_AUTO_OSD <= pageseg_mode && pageseg_mode <= PSM_SINGLE_LINE;
+}
+
+/**
+ * enum of the elements of the page hierarchy, used in ResultIterator
+ * to provide functions that operate on each level without having to
+ * have 5x as many functions.
+ */
+enum PageIteratorLevel {
+  RIL_BLOCK,    ///< Block of text/image/separator line.
+  RIL_PARA,     ///< Paragraph within a block.
+  RIL_TEXTLINE, ///< Line within a paragraph.
+  RIL_WORD,     ///< Word within a textline.
+  RIL_SYMBOL    ///< Symbol/character within a word.
+};
+
+/**
+ * JUSTIFICATION_UNKNOWN
+ *   The alignment is not clearly one of the other options.  This could happen
+ *   for example if there are only one or two lines of text or the text looks
+ *   like source code or poetry.
+ *
+ * NOTA BENE: Fully justified paragraphs (text aligned to both left and right
+ *    margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
+ *    is written with a left-to-right script and with JUSTIFICATION_RIGHT if
+ *    their text is written in a right-to-left script.
+ *
+ * Interpretation for text read in vertical lines:
+ *   "Left" is wherever the starting reading position is.
+ *
+ * JUSTIFICATION_LEFT
+ *   Each line, except possibly the first, is flush to the same left tab stop.
+ *
+ * JUSTIFICATION_CENTER
+ *   The text lines of the paragraph are centered about a line going
+ *   down through the middle of the text lines.
+ *
+ * JUSTIFICATION_RIGHT
+ *   Each line, except possibly the first, is flush to the same right tab stop.
+ */
+enum ParagraphJustification {
+  JUSTIFICATION_UNKNOWN,
+  JUSTIFICATION_LEFT,
+  JUSTIFICATION_CENTER,
+  JUSTIFICATION_RIGHT,
+};
+
+/**
+ * When Tesseract/Cube is initialized we can choose to instantiate/load/run
+ * only the Tesseract part, only the Cube part or both along with the combiner.
+ * The preference of which engine to use is stored in tessedit_ocr_engine_mode.
+ *
+ * ATTENTION: When modifying this enum, please make sure to make the
+ * appropriate changes to all the enums mirroring it (e.g. OCREngine in
+ * cityblock/workflow/detection/detection_storage.proto). Such enums will
+ * mention the connection to OcrEngineMode in the comments.
+ */
+enum OcrEngineMode {
+  OEM_TESSERACT_ONLY,          ///< Run Tesseract only - fastest; deprecated
+  OEM_LSTM_ONLY,               ///< Run just the LSTM line recognizer.
+  OEM_TESSERACT_LSTM_COMBINED, ///< Run the LSTM recognizer, but allow fallback
+                               ///< to Tesseract when things get difficult.
+                               ///< deprecated
+  OEM_DEFAULT,                 ///< Specify this mode when calling init_*(),
+                               ///< to indicate that any of the above modes
+                               ///< should be automatically inferred from the
+                               ///< variables in the language-specific config,
+                               ///< command-line configs, or if not specified
+                               ///< in any of the above should be set to the
+                               ///< default OEM_TESSERACT_ONLY.
+  OEM_COUNT                    ///< Number of OEMs
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/renderer.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/renderer.h
@ -0,0 +1,310 @@
+///////////////////////////////////////////////////////////////////////
+// File:        renderer.h
+// Description: Rendering interface to inject into TessBaseAPI
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_API_RENDERER_H_
+#define TESSERACT_API_RENDERER_H_
+
+#include "export.h"
+
+// To avoid collision with other typenames include the ABSOLUTE MINIMUM
+// complexity of includes here. Use forward declarations wherever possible
+// and hide includes of complex types in baseapi.cpp.
+#include <string> // for std::string
+#include <vector> // for std::vector
+
+struct Pix;
+
+namespace tesseract {
+
+class TessBaseAPI;
+
+/**
+ * Interface for rendering tesseract results into a document, such as text,
+ * HOCR or pdf. This class is abstract. Specific classes handle individual
+ * formats. This interface is then used to inject the renderer class into
+ * tesseract when processing images.
+ *
+ * For simplicity implementing this with tesseract version 3.01,
+ * the renderer contains document state that is cleared from document
+ * to document just as the TessBaseAPI is. This way the base API can just
+ * delegate its rendering functionality to injected renderers, and the
+ * renderers can manage the associated state needed for the specific formats
+ * in addition to the heuristics for producing it.
+ */
+class TESS_API TessResultRenderer {
+public:
+  virtual ~TessResultRenderer();
+
+  // Takes ownership of pointer so must be new'd instance.
+  // Renderers aren't ordered, but appends the sequences of next parameter
+  // and existing next(). The renderers should be unique across both lists.
+  void insert(TessResultRenderer *next);
+
+  // Returns the next renderer or nullptr.
+  TessResultRenderer *next() {
+    return next_;
+  }
+
+  /**
+   * Starts a new document with the given title.
+   * This clears the contents of the output data.
+   * Title should use UTF-8 encoding.
+   */
+  bool BeginDocument(const char *title);
+
+  /**
+   * Adds the recognized text from the source image to the current document.
+   * Invalid if BeginDocument not yet called.
+   *
+   * Note that this API is a bit weird but is designed to fit into the
+   * current TessBaseAPI implementation where the api has lots of state
+   * information that we might want to add in.
+   */
+  bool AddImage(TessBaseAPI *api);
+
+  /**
+   * Finishes the document and finalizes the output data
+   * Invalid if BeginDocument not yet called.
+   */
+  bool EndDocument();
+
+  const char *file_extension() const {
+    return file_extension_;
+  }
+  const char *title() const {
+    return title_.c_str();
+  }
+
+  // Is everything fine? Otherwise something went wrong.
+  bool happy() const {
+    return happy_;
+  }
+
+  /**
+   * Returns the index of the last image given to AddImage
+   * (i.e. images are incremented whether the image succeeded or not)
+   *
+   * This is always defined. It means either the number of the
+   * current image, the last image ended, or in the completed document
+   * depending on when in the document lifecycle you are looking at it.
+   * Will return -1 if a document was never started.
+   */
+  int imagenum() const {
+    return imagenum_;
+  }
+
+protected:
+  /**
+   * Called by concrete classes.
+   *
+   * outputbase is the name of the output file excluding
+   * extension. For example, "/path/to/chocolate-chip-cookie-recipe"
+   *
+   * extension indicates the file extension to be used for output
+   * files. For example "pdf" will produce a .pdf file, and "hocr"
+   * will produce .hocr files.
+   */
+  TessResultRenderer(const char *outputbase, const char *extension);
+
+  // Hook for specialized handling in BeginDocument()
+  virtual bool BeginDocumentHandler();
+
+  // This must be overridden to render the OCR'd results
+  virtual bool AddImageHandler(TessBaseAPI *api) = 0;
+
+  // Hook for specialized handling in EndDocument()
+  virtual bool EndDocumentHandler();
+
+  // Renderers can call this to append a '\0' terminated string to the
+  // output. NOTE(review): this class declares no GetOutput or output buffer,
+  // only fout_, so the text presumably goes to that file — confirm.
+  void AppendString(const char *s);
+
+  // Renderers can call this to append a binary byte sequence of length len
+  // to the output. Note that s is not necessarily '\0' terminated (and can
+  // contain '\0' within it). NOTE(review): as with AppendString, the sink
+  // is presumably fout_ (no GetOutput or buffer is declared) — confirm.
+  void AppendData(const char *s, int len);
+
+private:
+  const char *file_extension_; // standard extension for generated output
+  std::string title_;          // title of document being rendered
+  int imagenum_;               // index of last image added
+
+  FILE *fout_; // output file pointer. NOTE(review): <cstdio> not included directly here
+  TessResultRenderer *next_; // Can link multiple renderers together
+  bool happy_;               // I get grumpy when the disk fills up, etc.
+};
+
+/**
+ * Renders tesseract output as plain UTF-8 text.
+ */
+class TESS_API TessTextRenderer : public TessResultRenderer {
+public:
+  explicit TessTextRenderer(const char *outputbase);
+
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
+};
+
+/**
+ * Renders tesseract output as HOCR text, optionally with font information.
+ */
+class TESS_API TessHOcrRenderer : public TessResultRenderer {
+public:
+  explicit TessHOcrRenderer(const char *outputbase, bool font_info);
+  explicit TessHOcrRenderer(const char *outputbase);
+
+protected:
+  bool BeginDocumentHandler() override;
+  bool AddImageHandler(TessBaseAPI *api) override;
+  bool EndDocumentHandler() override;
+
+private:
+  bool font_info_; // whether to print font information
+};
+
+/**
+ * Renders tesseract output as alto text.
+ */
+class TESS_API TessAltoRenderer : public TessResultRenderer {
+public:
+  explicit TessAltoRenderer(const char *outputbase);
+
+protected:
+  bool BeginDocumentHandler() override;
+  bool AddImageHandler(TessBaseAPI *api) override;
+  bool EndDocumentHandler() override;
+};
+
+/**
+ * Renders Tesseract output as TSV text, optionally with font information.
+ */
+class TESS_API TessTsvRenderer : public TessResultRenderer {
+public:
+  explicit TessTsvRenderer(const char *outputbase, bool font_info);
+  explicit TessTsvRenderer(const char *outputbase);
+
+protected:
+  bool BeginDocumentHandler() override;
+  bool AddImageHandler(TessBaseAPI *api) override;
+  bool EndDocumentHandler() override;
+
+private:
+  bool font_info_; // whether to print font information
+};
+
+/**
+ * Renders tesseract output into searchable PDF
+ */
+class TESS_API TessPDFRenderer : public TessResultRenderer {
+public:
+  // datadir is the location of the TESSDATA, from which a custom PDF font
+  // is loaded. textonly presumably initializes textonly_ (skip images).
+  TessPDFRenderer(const char *outputbase, const char *datadir,
+                  bool textonly = false);
+
+protected:
+  bool BeginDocumentHandler() override;
+  bool AddImageHandler(TessBaseAPI *api) override;
+  bool EndDocumentHandler() override;
+
+private:
+  // We don't want to have every image in memory at once,
+  // so we store some metadata as we go along producing
+  // PDFs one page at a time. At the end, that metadata is
+  // used to make everything that isn't easily handled in a
+  // streaming fashion.
+  long int obj_;                  // counter for PDF objects
+  std::vector<long int> offsets_; // offset of every PDF object in bytes
+  std::vector<long int> pages_;   // object number for every /Page object
+  std::string datadir_;           // where to find the custom font
+  bool textonly_;                 // skip images if set
+  // Bookkeeping only. DIY = Do It Yourself.
+  void AppendPDFObjectDIY(size_t objectsize);
+  // Bookkeeping + emit data.
+  void AppendPDFObject(const char *data);
+  // Create the /Contents object for an entire page.
+  char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
+  // Turn an image into a PDF object. Only transcode if we have to.
+  static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
+                            char **pdf_object, long int *pdf_object_size,
+                            int jpg_quality);
+};
+
+/**
+ * Renders tesseract output as text; presumably UNLV format — TODO confirm.
+ */
+class TESS_API TessUnlvRenderer : public TessResultRenderer {
+public:
+  explicit TessUnlvRenderer(const char *outputbase);
+
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
+};
+
+/**
+ * Renders tesseract output as plain UTF-8 text for LSTMBox.
+ */
+class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
+public:
+  explicit TessLSTMBoxRenderer(const char *outputbase);
+
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
+};
+
+/**
+ * Renders tesseract output as text; presumably box format — TODO confirm.
+ */
+class TESS_API TessBoxTextRenderer : public TessResultRenderer {
+public:
+  explicit TessBoxTextRenderer(const char *outputbase);
+
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
+};
+
+/**
+ * Renders tesseract output as plain UTF-8 text in WordStr format.
+ */
+class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
+public:
+  explicit TessWordStrBoxRenderer(const char *outputbase);
+
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
+};
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Renders tesseract output as osd text (not in DISABLED_LEGACY_ENGINE builds).
+ */
+class TESS_API TessOsdRenderer : public TessResultRenderer {
+public:
+  explicit TessOsdRenderer(const char *outputbase);
+
+protected:
+  bool AddImageHandler(TessBaseAPI *api) override;
+};
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+} // namespace tesseract.
+
+#endif // TESSERACT_API_RENDERER_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/resultiterator.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/resultiterator.h
@ -0,0 +1,252 @@
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.h
+// Description: Iterator for tesseract results that is capable of
+//              iterating in proper reading order over Bi Directional
+//              (e.g. mixed Hebrew and English) text.
+// Author:      David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
+#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
+
+#include "export.h"            // for TESS_API, TESS_LOCAL
+#include "ltrresultiterator.h" // for LTRResultIterator
+#include "publictypes.h"       // for PageIteratorLevel
+#include "unichar.h"           // for StrongScriptDirection
+
+#include <set>    // for std::pair
+#include <vector> // for std::vector
+
+namespace tesseract {
+
+class TESS_API ResultIterator : public LTRResultIterator {
+public:
+  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
+
+  /**
+   * ResultIterator is copy constructible!
+   * The default copy constructor works just fine for us.
+   */
+  ~ResultIterator() override = default;
+
+  // ============= Moving around within the page ============.
+  /**
+   * Moves the iterator to point to the start of the page to begin
+   * an iteration.
+   */
+  void Begin() override;
+
+  /**
+   * Moves to the start of the next object at the given level in the
+   * page hierarchy in the appropriate reading order and returns false if
+   * the end of the page was reached.
+   * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
+   * PageIteratorLevel level values will visit each non-text block once.
+   * Think of non text blocks as containing a single para, with a single line,
+   * with a single imaginary word.
+   * Calls to Next with different levels may be freely intermixed.
+   * This function iterates words in right-to-left scripts correctly, if
+   * the appropriate language has been loaded into Tesseract.
+   */
+  bool Next(PageIteratorLevel level) override;
+
+  /**
+   * IsAtBeginningOf() returns whether we're at the logical beginning of the
+   * given level.  (as opposed to ResultIterator's left-to-right top-to-bottom
+   * order).  Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
+   * For a full description, see pageiterator.h
+   */
+  bool IsAtBeginningOf(PageIteratorLevel level) const override;
+
+  /**
+   * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
+   * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
+   * point at the last word in a paragraph.  See PageIterator for full comment.
+   */
+  bool IsAtFinalElement(PageIteratorLevel level,
+                        PageIteratorLevel element) const override;
+
+  // ============= Functions that refer to words only ============.
+  // Returns the number of blanks before the current word.
+  int BlanksBeforeWord() const;
+
+  // ============= Accessing data ==============.
+
+  /**
+   * Returns the null terminated UTF-8 encoded text string for the current
+   * object at the given level. Use delete [] to free after use.
+   */
+  virtual char *GetUTF8Text(PageIteratorLevel level) const;
+
+  /**
+   * Returns the LSTM choices for every LSTM timestep for the current word.
+   */
+  virtual std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
+      *GetRawLSTMTimesteps() const;
+  virtual std::vector<std::vector<std::pair<const char *, float>>>
+      *GetBestLSTMSymbolChoices() const;
+
+  /**
+   * Return whether the current paragraph's dominant reading direction
+   * is left-to-right (as opposed to right-to-left).
+   */
+  bool ParagraphIsLtr() const;
+
+  // ============= Exposed only for testing =============.
+
+  /**
+   * Yields the reading order as a sequence of indices and (optional)
+   * meta-marks for a set of words (given left-to-right).
+   * The meta marks are passed as negative values:
+   *   kMinorRunStart  Start of minor direction text.
+   *   kMinorRunEnd    End of minor direction text.
+   *   kComplexWord    The next indexed word contains both left-to-right and
+   *                    right-to-left characters and was treated as neutral.
+   *
+   * For example, suppose we have five words in a text line,
+   * indexed [0,1,2,3,4] from the leftmost side of the text line.
+   * The following are all believable reading_orders:
+   *
+   * Left-to-Right (in ltr paragraph):
+   *     { 0, 1, 2, 3, 4 }
+   * Left-to-Right (in rtl paragraph):
+   *     { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
+   * Right-to-Left (in rtl paragraph):
+   *     { 4, 3, 2, 1, 0 }
+   * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
+   *     { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
+   */
+  static void CalculateTextlineOrder(
+      bool paragraph_is_ltr,
+      const std::vector<StrongScriptDirection> &word_dirs,
+      std::vector<int> *reading_order);
+
+  static const int kMinorRunStart;
+  static const int kMinorRunEnd;
+  static const int kComplexWord;
+
+protected:
+  /**
+   * We presume the data associated with the given iterator will outlive us.
+   * NB: This is protected because it does something that is non-obvious:
+   *   it resets to the beginning of the paragraph instead of staying wherever
+   *   resit might have pointed.
+   */
+  explicit ResultIterator(const LTRResultIterator &resit);
+
+private:
+  /**
+   * Calculates the current paragraph's dominant writing direction.
+   * Typically, members should use current_paragraph_is_ltr_ instead.
+   */
+  bool CurrentParagraphIsLtr() const;
+
+  /**
+   * Returns word indices as measured from resit->RestartRow() = index 0
+   * for the reading order of words within a textline given an iterator
+   * into the middle of the text line.
+   * In addition to non-negative word indices, the following negative values
+   * may be inserted:
+   *   kMinorRunStart  Start of minor direction text.
+   *   kMinorRunEnd    End of minor direction text.
+   *   kComplexWord    The previous word contains both left-to-right and
+   *                   right-to-left characters and was treated as neutral.
+   */
+  void CalculateTextlineOrder(bool paragraph_is_ltr,
+                              const LTRResultIterator &resit,
+                              std::vector<int> *indices) const;
+  /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
+  void CalculateTextlineOrder(bool paragraph_is_ltr,
+                              const LTRResultIterator &resit,
+                              std::vector<StrongScriptDirection> *ssd,
+                              std::vector<int> *indices) const;
+
+  /**
+   * What is the index of the current word in a strict left-to-right reading
+   * of the row?
+   */
+  int LTRWordIndex() const;
+
+  /**
+   * Given an iterator pointing at a word, returns the logical reading order
+   * of blob indices for the word.
+   */
+  void CalculateBlobOrder(std::vector<int> *blob_indices) const;
+
+  /** Precondition: current_paragraph_is_ltr_ is set. */
+  void MoveToLogicalStartOfTextline();
+
+  /**
+   * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
+   * are set.
+   */
+  void MoveToLogicalStartOfWord();
+
+  /** Are we pointing at the final (reading order) symbol of the word? */
+  bool IsAtFinalSymbolOfWord() const;
+
+  /** Are we pointing at the first (reading order) symbol of the word? */
+  bool IsAtFirstSymbolOfWord() const;
+
+  /**
+   * Append any extra marks that should be appended to this word when printed.
+   * Mostly, these are Unicode BiDi control characters.
+   */
+  void AppendSuffixMarks(std::string *text) const;
+
+  /** Appends the current word in reading order to the given buffer.*/
+  void AppendUTF8WordText(std::string *text) const;
+
+  /**
+   * Appends the text of the current text line, *assuming this iterator is
+   * positioned at the beginning of the text line*  This function
+   * updates the iterator to point to the first position past the text line.
+   * Each textline is terminated in a single newline character.
+   * If the textline ends a paragraph, it gets a second terminal newline.
+   */
+  void IterateAndAppendUTF8TextlineText(std::string *text);
+
+  /**
+   * Appends the text of the current paragraph in reading order
+   * to the given buffer.
+   * Each textline is terminated in a single newline character, and the
+   * paragraph gets an extra newline at the end.
+   */
+  void AppendUTF8ParagraphText(std::string *text) const;
+
+  /** Returns whether the bidi_debug flag is set to at least min_level. */
+  bool BidiDebug(int min_level) const;
+
+  bool current_paragraph_is_ltr_;
+
+  /**
+   * Is the currently pointed-at character at the beginning of
+   * a minor-direction run?
+   */
+  bool at_beginning_of_minor_run_;
+
+  /** Is the currently pointed-at character in a minor-direction sequence? */
+  bool in_minor_direction_;
+
+  /**
+   * Should detected inter-word spaces be preserved, or "compressed" to a single
+   * space character (default behavior).
+   */
+  bool preserve_interword_spaces_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/unichar.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/unichar.h
@ -0,0 +1,177 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unichar.h
+// Description: Unicode character/ligature class.
+// Author:      Ray Smith
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_UNICHAR_H_
+#define TESSERACT_CCUTIL_UNICHAR_H_
+
+#include "export.h"
+
+#include <memory.h>
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace tesseract {
+
+// Maximum number of bytes (chars) that can be stored in a UNICHAR. Must be
+// at least 4. Must not exceed 31 without changing the coding of length.
+#define UNICHAR_LEN 30
+
+// NOTE(review): these declarations are already inside namespace tesseract.
+// A UNICHAR_ID is the unique id of a unichar.
+using UNICHAR_ID = int;
+
+// A variable to indicate an invalid or uninitialized unichar id.
+static const int INVALID_UNICHAR_ID = -1;
+// A special unichar that corresponds to INVALID_UNICHAR_ID.
+static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
+
+enum StrongScriptDirection {
+  DIR_NEUTRAL = 0,       // Text contains only neutral characters.
+  DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
+  DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
+  DIR_MIX = 3,           // Text contains a mixture of left-to-right
+                         // and right-to-left characters.
+};
+
+using char32 = signed int; // Element type of the vectors used by UTF8ToUTF32/UTF32ToUTF8.
+
+// UNICHAR stores one classification result as UTF-8 bytes: either a single
+// Unicode character (1 to 4 bytes) or several characters forming the NFKC
+// expansion of a ligature such as fi or ffl.
+class TESS_API UNICHAR {
+public:
+  // Creates an empty UNICHAR with every byte set to zero.
+  UNICHAR() {
+    memset(chars, 0, sizeof(chars));
+  }
+
+  // Builds from a utf8 string. A negative len means the string is null
+  // terminated. Input that is too long for the UNICHAR is cut down to what
+  // will fit.
+  UNICHAR(const char *utf8_str, int len);
+
+  // Builds from a single UCS4 character.
+  explicit UNICHAR(int unicode);
+
+  // The compiler-generated copy constructor and operator= are sufficient.
+
+  // Returns the first character as UCS-4.
+  int first_uni() const;
+
+  // Returns the length of the UTF8 string. The last byte of the buffer holds
+  // the length when it is in [0, UNICHAR_LEN); any other value means the
+  // buffer is completely full.
+  int utf8_len() const {
+    const int stored = chars[UNICHAR_LEN - 1];
+    if (stored < 0 || stored >= UNICHAR_LEN) {
+      return UNICHAR_LEN;
+    }
+    return stored;
+  }
+
+  // Returns the UTF8 bytes; the data is NOT null terminated.
+  const char *utf8() const {
+    return chars;
+  }
+
+  // Returns a terminated UTF8 string that the caller must delete[].
+  char *utf8_str() const;
+
+  // Returns the number of bytes in the first character of the given utf8
+  // string.
+  static int utf8_step(const char *utf8_str);
+
+  // Helper for walking over and reading the elements of a UTF8 string.
+  // Unlike UNICHAR itself, const_iterator neither copies nor owns the
+  // underlying byte array, and it offers no way to modify it.
+  //
+  // Example:
+  //   for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
+  //        it != UNICHAR::end(str, str_len);
+  //        ++it) {
+  //     tprintf("UCS-4 symbol code = %d\n", *it);
+  //     char buf[5];
+  //     int char_len = it.get_utf8(buf); buf[char_len] = '\0';
+  //     tprintf("Char = %s\n", buf);
+  //   }
+  class TESS_API const_iterator {
+    using CI = const_iterator;
+
+  public:
+    // Advances to the next UTF8 character.
+    // At an illegal UTF8 character an error message is printed and the
+    // iterator moves by one byte. At a nullptr value it does not move past it.
+    const_iterator &operator++();
+
+    // Returns the UCS-4 value at the current position, or a single space
+    // character when the position holds an illegal UTF8 value.
+    int operator*() const;
+
+    // Writes the UTF-8 encoding of the current codepoint into buf (at least
+    // 4 bytes long) and returns the number of bytes written.
+    // At an illegal UTF8 value a single space character is written and 1 is
+    // returned.
+    // The buffer is not null-terminated by this method.
+    int get_utf8(char *buf) const;
+    // Returns the byte count of the current codepoint, or 1 when the current
+    // position is at an illegal UTF8 value.
+    int utf8_len() const;
+    // Returns true if the UTF-8 encoding at the current position is legal.
+    bool is_legal() const;
+
+    // Returns the pointer into the string at the current position.
+    const char *utf8_data() const {
+      return it_;
+    }
+
+    // Iterators compare equal when they point at the same byte.
+    friend bool operator==(const CI &lhs, const CI &rhs) {
+      return lhs.it_ == rhs.it_;
+    }
+    friend bool operator!=(const CI &lhs, const CI &rhs) {
+      return lhs.it_ != rhs.it_;
+    }
+
+  private:
+    friend class UNICHAR;
+    explicit const_iterator(const char *it) : it_(it) {}
+
+    const char *it_; // Current position inside the string.
+  };
+
+  // Create a start/end iterator for a string. Both are static and neither
+  // copies nor takes ownership of the underlying array.
+  static const_iterator begin(const char *utf8_str, int byte_length);
+  static const_iterator end(const char *utf8_str, int byte_length);
+
+  // Converts a utf-8 string to a vector of unicodes.
+  // Returns an empty vector if the input contains invalid UTF-8.
+  static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
+  // Converts a vector of unicodes to a utf8 string.
+  // Returns an empty string if the input contains an invalid unicode.
+  static std::string UTF32ToUTF8(const std::vector<char32> &str32);
+
+private:
+  // UTF-8 bytes of one or more Unicode characters.
+  // chars[UNICHAR_LEN - 1] is a length when its value is < UNICHAR_LEN,
+  // otherwise it is a genuine character.
+  char chars[UNICHAR_LEN]{};
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCUTIL_UNICHAR_H_
--- a/3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h
+++ b/3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h
@ -0,0 +1,36 @@
+///////////////////////////////////////////////////////////////////////
+// File:        version.h
+// Description: Version information
+//
+// (C) Copyright 2018, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_API_VERSION_H_
+#define TESSERACT_API_VERSION_H_
+
+// clang-format off
+
+#define TESSERACT_MAJOR_VERSION 5
+#define TESSERACT_MINOR_VERSION 0
+#define TESSERACT_MICRO_VERSION 0
+// TESSERACT_VERSION packs the three numbers as major << 16 | minor << 8 | micro.
+#define TESSERACT_VERSION          \
+  (TESSERACT_MAJOR_VERSION << 16 | \
+   TESSERACT_MINOR_VERSION <<  8 | \
+   TESSERACT_MICRO_VERSION)
+
+#define TESSERACT_VERSION_STR "5.0.0-alpha-20210401-98-g176d"
+
+// clang-format on
+
+#endif // TESSERACT_API_VERSION_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/api/altorenderer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/altorenderer.cpp
@ -0,0 +1,245 @@
+// File:        altorenderer.cpp
+// Description: ALTO rendering interface
+// Author:      Jake Sebright
+
+// (C) Copyright 2018
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _WIN32
+#  include "host.h" // windows.h for MultiByteToWideChar, ...
+#endif
+
+#include <tesseract/baseapi.h>
+#include <tesseract/renderer.h>
+
+#include <memory>
+#include <sstream> // for std::stringstream
+
+namespace tesseract {
+
+/// Add coordinates to specified TextBlock, TextLine or String bounding box.
+/// Add word confidence if adding to a String bounding box.
+///
+/// Writes HPOS, VPOS, WIDTH and HEIGHT from the iterator's bounding box at
+/// `level`. For RIL_WORD the confidence is appended as WC and the tag is left
+/// open so the caller can append CONTENT; for every other level the opening
+/// tag is closed with '>'.
+///
+/// @param it        Iterator positioned on the element to describe.
+/// @param level     Level whose bounding box is written.
+/// @param alto_str  Stream that receives the attribute text.
+static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
+                         std::stringstream &alto_str) {
+  int left, top, right, bottom;
+  it->BoundingBox(level, &left, &top, &right, &bottom);
+
+  alto_str << " HPOS=\"" << left << "\"";
+  alto_str << " VPOS=\"" << top << "\"";
+  alto_str << " WIDTH=\"" << right - left << "\"";
+  alto_str << " HEIGHT=\"" << bottom - top << "\"";
+
+  if (level == RIL_WORD) {
+    // Confidence() reports a percentage (0-100 per the iterator API docs),
+    // while ALTO's WC is a fraction between 0 and 1. Writing "0." followed by
+    // the raw integer turned 5% into 0.5 and 100% into 0.100, so the value is
+    // emitted as <units>.<two digits> instead.
+    int wc = it->Confidence(RIL_WORD);
+    wc = wc < 0 ? 0 : (wc > 100 ? 100 : wc);
+    alto_str << " WC=\"" << wc / 100 << '.' << (wc % 100) / 10 << wc % 10 << "\"";
+  } else {
+    alto_str << ">";
+  }
+}
+
+///
+/// Append the ALTO XML for the beginning of the document
+///
+/// Writes the XML declaration, the <alto> root element and the <Description>
+/// section (source file name and tesseract version), then opens <Layout>.
+/// Always returns true.
+///
+bool TessAltoRenderer::BeginDocumentHandler() {
+  AppendString(
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
+      "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
+      "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+      "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
+      "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
+      "\t<Description>\n"
+      "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
+      "\t\t<sourceImageInformation>\n"
+      "\t\t\t<fileName>");
+
+  // The title is caller-supplied text placed inside an XML element; escape it
+  // so characters such as '&' or '<' cannot make the document ill-formed.
+  AppendString(HOcrEscape(title()).c_str());
+
+  AppendString(
+      "</fileName>\n"
+      "\t\t</sourceImageInformation>\n"
+      "\t\t<OCRProcessing ID=\"OCR_0\">\n"
+      "\t\t\t<ocrProcessingStep>\n"
+      "\t\t\t\t<processingSoftware>\n"
+      "\t\t\t\t\t<softwareName>tesseract ");
+  AppendString(TessBaseAPI::Version());
+  AppendString(
+      "</softwareName>\n"
+      "\t\t\t\t</processingSoftware>\n"
+      "\t\t\t</ocrProcessingStep>\n"
+      "\t\t</OCRProcessing>\n"
+      "\t</Description>\n"
+      "\t<Layout>\n");
+
+  return true;
+}
+
+///
+/// Append the ALTO XML for the layout of the image
+///
+/// Requests the ALTO text for the current image number from `api` and writes
+/// it out. Returns false when no text was produced, true otherwise.
+///
+bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
+  // The buffer returned by GetAltoText is released with delete[] on scope exit.
+  const std::unique_ptr<const char[]> alto(api->GetAltoText(imagenum()));
+  if (alto) {
+    AppendString(alto.get());
+    return true;
+  }
+  return false;
+}
+
+///
+/// Append the ALTO XML for the end of the document
+/// (closes the Layout and alto elements); always returns true.
+bool TessAltoRenderer::EndDocumentHandler() {
+  AppendString("\t</Layout>\n</alto>\n");
+
+  return true;
+}
+
+TessAltoRenderer::TessAltoRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "xml") {} // "xml" is presumably the output file extension - confirm in renderer.h.
+
+///
+/// Make an XML-formatted string with ALTO markup from the internal
+/// data structures.
+/// Overload without a monitor: forwards to GetAltoText(nullptr, page_number).
+char *TessBaseAPI::GetAltoText(int page_number) {
+  return GetAltoText(nullptr, page_number);
+}
+
+///
+/// Make an XML-formatted string with ALTO markup from the internal
+/// data structures.
+///
+/// Runs Recognize(monitor) first when no page results are cached. The output
+/// is one <Page>/<PrintSpace> element holding ComposedBlock, TextBlock,
+/// TextLine, String and SP elements for the recognized words.
+///
+/// @param monitor      Passed to Recognize() when recognition has to run.
+/// @param page_number  Written to the PHYSICAL_IMG_NR and ID attributes of <Page>.
+/// @return A NUL-terminated buffer allocated with new[] (release it with
+///         delete[]), or nullptr when the engine is not initialised,
+///         recognition fails or no result iterator is available.
+///
+char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
+    return nullptr;
+  }
+
+  // Running counters used to build unique element IDs.
+  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
+
+  if (input_file_.empty()) {
+    SetInputName(nullptr);
+  }
+
+#ifdef _WIN32
+  // convert input name from ANSI encoding to utf-8
+  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
+  wchar_t *uni16_str = new WCHAR[str16_len];
+  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
+  int utf8_len =
+      WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
+  char *utf8_str = new char[utf8_len];
+  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
+  input_file_ = utf8_str;
+  delete[] uni16_str;
+  delete[] utf8_str;
+#endif
+
+  std::stringstream alto_str;
+  // Use "C" locale (needed for int values larger than 999).
+  alto_str.imbue(std::locale::classic());
+  alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
+           << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
+           << " ID=\"page_" << page_number << "\">\n"
+           << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
+           << " WIDTH=\"" << rect_width_ << "\""
+           << " HEIGHT=\"" << rect_height_ << "\">\n";
+
+  // Own the iterator so it is released on every exit path, including when an
+  // allocation below throws.
+  const std::unique_ptr<ResultIterator> res_it(GetIterator());
+  if (res_it == nullptr) {
+    return nullptr;
+  }
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
+      AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
+      alto_str << "\n";
+    }
+
+    if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
+      AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
+      alto_str << "\n";
+    }
+
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
+      AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
+      alto_str << "\n";
+    }
+
+    alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
+    AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
+    alto_str << " CONTENT=\"";
+
+    // Record the word's position flags before the symbol loop moves the
+    // iterator off the word.
+    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
+    bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
+    bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
+
+    int left, top, right, bottom;
+    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+
+    // Emit the word symbol by symbol, escaping each one for use in XML.
+    do {
+      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
+      if (grapheme && grapheme[0] != 0) {
+        alto_str << HOcrEscape(grapheme.get()).c_str();
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+
+    alto_str << "\"/>";
+
+    wcnt++;
+
+    if (last_word_in_line) {
+      alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
+      lcnt++;
+    } else {
+      // The iterator now sits on the next word of the line; the gap between
+      // the previous word's right edge and that word's left edge is the space.
+      int hpos = right;
+      int vpos = top;
+      res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+      int width = left - hpos;
+      alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
+               << "\"/>\n";
+    }
+
+    if (last_word_in_tblock) {
+      alto_str << "\t\t\t\t\t</TextBlock>\n";
+      tcnt++;
+    }
+
+    if (last_word_in_cblock) {
+      alto_str << "\t\t\t\t</ComposedBlock>\n";
+      bcnt++;
+    }
+  }
+
+  alto_str << "\t\t\t</PrintSpace>\n"
+           << "\t\t</Page>\n";
+  const std::string text = alto_str.str();
+
+  // Hand the caller a new[]-allocated copy.
+  char *result = new char[text.length() + 1];
+  strcpy(result, text.c_str());
+  return result;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/api/baseapi.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/baseapi.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/api/capi.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/capi.cpp
@ -0,0 +1,689 @@
+///////////////////////////////////////////////////////////////////////
+// File:        capi.cpp
+// Description: C-API TessBaseAPI
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/capi.h>
+
+#include <cstring> // for strdup
+
+const char *TessVersion() { // C API: returns the result of TessBaseAPI::Version().
+  return TessBaseAPI::Version();
+}
+
+void TessDeleteText(const char *text) { // C API: releases a text buffer with delete[].
+  delete[] text;
+}
+
+/// C API: releases a nullptr-terminated array of strings.
+///
+/// Every element before the terminating nullptr is released with delete[],
+/// then the array itself. Passing nullptr is a no-op.
+void TessDeleteTextArray(char **arr) {
+  if (arr == nullptr) {
+    return; // Nothing to free; without this guard the loop dereferences null.
+  }
+  for (char **pos = arr; *pos != nullptr; ++pos) {
+    delete[] *pos;
+  }
+  delete[] arr;
+}
+
+void TessDeleteIntArray(const int *arr) { // C API: releases an int array with delete[].
+  delete[] arr;
+}
+
+TessResultRenderer *TessTextRendererCreate(const char *outputbase) { // C API: new TessTextRenderer.
+  return new tesseract::TessTextRenderer(outputbase);
+}
+
+TessResultRenderer *TessHOcrRendererCreate(const char *outputbase) { // C API: new TessHOcrRenderer.
+  return new tesseract::TessHOcrRenderer(outputbase);
+}
+
+TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, BOOL font_info) {
+  return new tesseract::TessHOcrRenderer(outputbase, font_info != 0); // Non-zero font_info is passed as true.
+}
+
+TessResultRenderer *TessAltoRendererCreate(const char *outputbase) { // C API: new TessAltoRenderer.
+  return new tesseract::TessAltoRenderer(outputbase);
+}
+
+TessResultRenderer *TessTsvRendererCreate(const char *outputbase) { // C API: new TessTsvRenderer.
+  return new tesseract::TessTsvRenderer(outputbase);
+}
+
+TessResultRenderer *TessPDFRendererCreate(const char *outputbase, const char *datadir,
+                                          BOOL textonly) { // C API: new TessPDFRenderer.
+  return new tesseract::TessPDFRenderer(outputbase, datadir, textonly != 0); // Non-zero textonly is passed as true.
+}
+
+TessResultRenderer *TessUnlvRendererCreate(const char *outputbase) { // C API: new TessUnlvRenderer.
+  return new tesseract::TessUnlvRenderer(outputbase);
+}
+
+TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase) { // C API: new TessBoxTextRenderer.
+  return new tesseract::TessBoxTextRenderer(outputbase);
+}
+
+TessResultRenderer *TessWordStrBoxRendererCreate(const char *outputbase) { // C API: new TessWordStrBoxRenderer.
+  return new tesseract::TessWordStrBoxRenderer(outputbase);
+}
+
+TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase) { // C API: new TessLSTMBoxRenderer.
+  return new tesseract::TessLSTMBoxRenderer(outputbase);
+}
+
+void TessDeleteResultRenderer(TessResultRenderer *renderer) { // C API: destroys the renderer with delete.
+  delete renderer;
+}
+
+void TessResultRendererInsert(TessResultRenderer *renderer, TessResultRenderer *next) {
+  renderer->insert(next); // C API: forwards to renderer->insert(next).
+}
+
+TessResultRenderer *TessResultRendererNext(TessResultRenderer *renderer) { // C API: returns renderer->next().
+  return renderer->next();
+}
+
+BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, const char *title) {
+  return static_cast<int>(renderer->BeginDocument(title)); // Result of BeginDocument converted to int.
+}
+
+BOOL TessResultRendererAddImage(TessResultRenderer *renderer, TessBaseAPI *api) {
+  return static_cast<int>(renderer->AddImage(api)); // Result of AddImage converted to int.
+}
+
+BOOL TessResultRendererEndDocument(TessResultRenderer *renderer) {
+  return static_cast<int>(renderer->EndDocument()); // Result of EndDocument converted to int.
+}
+
+const char *TessResultRendererExtention(TessResultRenderer *renderer) { // C API: returns renderer->file_extension().
+  return renderer->file_extension();
+}
+
+const char *TessResultRendererTitle(TessResultRenderer *renderer) { // C API: returns renderer->title().
+  return renderer->title();
+}
+
+int TessResultRendererImageNum(TessResultRenderer *renderer) { // C API: returns renderer->imagenum().
+  return renderer->imagenum();
+}
+
+TessBaseAPI *TessBaseAPICreate() { // C API: allocates a TessBaseAPI with new.
+  return new TessBaseAPI;
+}
+
+void TessBaseAPIDelete(TessBaseAPI *handle) { // C API: destroys the handle with delete.
+  delete handle;
+}
+
+size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI * /*handle*/, void **device) {
+  return TessBaseAPI::getOpenCLDevice(device); // The handle argument is not used.
+}
+
+void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name) { // C API: forwards to SetInputName.
+  handle->SetInputName(name);
+}
+
+const char *TessBaseAPIGetInputName(TessBaseAPI *handle) { // C API: returns handle->GetInputName().
+  return handle->GetInputName();
+}
+
+void TessBaseAPISetInputImage(TessBaseAPI *handle, Pix *pix) { // C API: forwards to SetInputImage.
+  handle->SetInputImage(pix);
+}
+
+Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle) { // C API: returns handle->GetInputImage().
+  return handle->GetInputImage();
+}
+
+int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle) { // C API: returns handle->GetSourceYResolution().
+  return handle->GetSourceYResolution();
+}
+
+const char *TessBaseAPIGetDatapath(TessBaseAPI *handle) { // C API: returns handle->GetDatapath().
+  return handle->GetDatapath();
+}
+
+void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name) { // C API: forwards to SetOutputName.
+  handle->SetOutputName(name);
+}
+
+BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, const char *value) {
+  return static_cast<int>(handle->SetVariable(name, value)); // Result of SetVariable converted to int.
+}
+
+BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, const char *value) {
+  return static_cast<int>(handle->SetDebugVariable(name, value)); // Result of SetDebugVariable converted to int.
+}
+
+BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, const char *name, int *value) {
+  return static_cast<int>(handle->GetIntVariable(name, value)); // Result of GetIntVariable converted to int.
+}
+
+/// C API: reads a boolean parameter through handle->GetBoolVariable.
+///
+/// *value is written (as 0 or 1) only when the lookup succeeds.
+/// Returns non-zero on success and zero otherwise.
+BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, const char *name, BOOL *value) {
+  bool flag;
+  if (!handle->GetBoolVariable(name, &flag)) {
+    return FALSE;
+  }
+  *value = flag ? TRUE : FALSE;
+  return TRUE;
+}
+
+BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, const char *name, double *value) {
+  return static_cast<int>(handle->GetDoubleVariable(name, value)); // Result of GetDoubleVariable converted to int.
+}
+
+const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, const char *name) {
+  return handle->GetStringVariable(name); // C API: returns handle->GetStringVariable(name).
+}
+
+void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp) { // C API: forwards to PrintVariables(fp).
+  handle->PrintVariables(fp);
+}
+
+/// C API: writes the handle's variables to the file `filename`.
+///
+/// The file is opened with mode "w". Returns FALSE when it cannot be opened;
+/// otherwise PrintVariables is called on it, the file is closed and TRUE is
+/// returned.
+BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, const char *filename) {
+  FILE *out = fopen(filename, "w");
+  if (out == nullptr) {
+    return FALSE;
+  }
+  handle->PrintVariables(out);
+  fclose(out);
+  return TRUE;
+}
+
+/// C API: initialises the handle with config files and variable overrides.
+///
+/// The first vars_vec_size entries of vars_vec / vars_values are copied into
+/// string vectors when both arrays are non-null; otherwise empty vectors are
+/// passed. A non-zero set_only_non_debug_params is forwarded as true.
+/// Returns the value of handle->Init(...).
+int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, const char *language,
+                     TessOcrEngineMode mode, char **configs, int configs_size, char **vars_vec,
+                     char **vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params) {
+  std::vector<std::string> names;
+  std::vector<std::string> values;
+  const bool have_vars = vars_vec != nullptr && vars_values != nullptr;
+  for (size_t idx = 0; have_vars && idx < vars_vec_size; ++idx) {
+    names.emplace_back(vars_vec[idx]);
+    values.emplace_back(vars_values[idx]);
+  }
+
+  const bool non_debug_only = set_only_non_debug_params != 0;
+  return handle->Init(datapath, language, mode, configs, configs_size, &names, &values,
+                      non_debug_only);
+}
+
+int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, const char *language,
+                     TessOcrEngineMode oem, char **configs, int configs_size) { // C API: Init with config files only.
+  return handle->Init(datapath, language, oem, configs, configs_size, nullptr, nullptr, false); // No variable overrides.
+}
+
+int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, const char *language,
+                     TessOcrEngineMode oem) { // C API: returns handle->Init(datapath, language, oem).
+  return handle->Init(datapath, language, oem);
+}
+
+int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, const char *language) {
+  return handle->Init(datapath, language); // C API: returns handle->Init(datapath, language).
+}
+
+const char *TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI *handle) {
+  return handle->GetInitLanguagesAsString(); // C API: returns handle->GetInitLanguagesAsString().
+}
+
+/// C API: returns the loaded languages as a nullptr-terminated string array.
+///
+/// The array and every string in it are allocated with new[], so the result
+/// can be released with TessDeleteTextArray (which uses delete[]). The
+/// strings were previously created with strdup, which pairs with free()
+/// rather than delete[].
+char **TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI *handle) {
+  std::vector<std::string> languages;
+  handle->GetLoadedLanguagesAsVector(&languages);
+  char **arr = new char *[languages.size() + 1];
+  for (size_t i = 0; i < languages.size(); ++i) {
+    const std::string &language = languages[i];
+    arr[i] = new char[language.size() + 1];
+    strcpy(arr[i], language.c_str());
+  }
+  arr[languages.size()] = nullptr;
+  return arr;
+}
+
+/// C API: returns the available languages as a nullptr-terminated string array.
+///
+/// The array and every string in it are allocated with new[], so the result
+/// can be released with TessDeleteTextArray (which uses delete[]). The
+/// strings were previously created with strdup, which pairs with free()
+/// rather than delete[].
+char **TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI *handle) {
+  std::vector<std::string> languages;
+  handle->GetAvailableLanguagesAsVector(&languages);
+  char **arr = new char *[languages.size() + 1];
+  for (size_t i = 0; i < languages.size(); ++i) {
+    const std::string &language = languages[i];
+    arr[i] = new char[language.size() + 1];
+    strcpy(arr[i], language.c_str());
+  }
+  arr[languages.size()] = nullptr;
+  return arr;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath, const char *language) {
+  return handle->InitLangMod(datapath, language); // Compiled only when DISABLED_LEGACY_ENGINE is not defined.
+}
+#endif
+
+void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle) { // C API: forwards to InitForAnalysePage().
+  handle->InitForAnalysePage();
+}
+
+void TessBaseAPIReadConfigFile(TessBaseAPI *handle, const char *filename) { // C API: forwards to ReadConfigFile.
+  handle->ReadConfigFile(filename);
+}
+
+void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, const char *filename) { // C API: forwards to ReadDebugConfigFile.
+  handle->ReadDebugConfigFile(filename);
+}
+
+void TessBaseAPISetPageSegMode(TessBaseAPI *handle, TessPageSegMode mode) { // C API: forwards to SetPageSegMode.
+  handle->SetPageSegMode(mode);
+}
+
+TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle) { // C API: returns handle->GetPageSegMode().
+  return handle->GetPageSegMode();
+}
+
+char *TessBaseAPIRect(TessBaseAPI *handle, const unsigned char *imagedata, int bytes_per_pixel,
+                      int bytes_per_line, int left, int top, int width, int height) { // C API: wraps TesseractRect.
+  return handle->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width,
+                               height);
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle) { // Compiled only when DISABLED_LEGACY_ENGINE is not defined.
+  handle->ClearAdaptiveClassifier();
+}
+#endif
+
+void TessBaseAPISetImage(TessBaseAPI *handle, const unsigned char *imagedata, int width, int height,
+                         int bytes_per_pixel, int bytes_per_line) { // C API: raw-buffer overload of SetImage.
+  handle->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
+}
+
+/// C API: passes a Pix image to the handle via the Pix overload of SetImage.
+void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix) {
+  handle->SetImage(pix);
+}
+
+void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi) { // C API: forwards to SetSourceResolution.
+  handle->SetSourceResolution(ppi);
+}
+
+void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, int width, int height) {
+  handle->SetRectangle(left, top, width, height); // C API: forwards to SetRectangle.
+}
+
+struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle) { // C API: returns handle->GetThresholdedImage().
+  return handle->GetThresholdedImage();
+}
+
+void TessBaseAPIClearPersistentCache(TessBaseAPI * /*handle*/) {
+  TessBaseAPI::ClearPersistentCache(); // The handle argument is not used.
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+/// C API: forwards to handle->DetectOrientationScript with the four output
+/// pointers and returns its result converted to BOOL.
+BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, int *orient_deg, float *orient_conf,
+                                        const char **script_name, float *script_conf) {
+  return static_cast<BOOL>(
+      handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf));
+}
+
+#endif
+
+struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, struct Pixa **pixa) { // C API: returns handle->GetRegions(pixa).
+  return handle->GetRegions(pixa);
+}
+
+struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
+  return handle->GetTextlines(pixa, blockids); // C API: two-argument GetTextlines overload.
+}
+
+struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, const BOOL raw_image,
+                                      const int raw_padding, struct Pixa **pixa, int **blockids,
+                                      int **paraids) { // C API: five-argument GetTextlines overload.
+  return handle->GetTextlines(raw_image != 0, raw_padding, pixa, blockids, paraids); // Non-zero raw_image is passed as true.
+}
+
+struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
+  return handle->GetStrips(pixa, blockids); // C API: returns handle->GetStrips(pixa, blockids).
+}
+
+struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, struct Pixa **pixa) { // C API: returns handle->GetWords(pixa).
+  return handle->GetWords(pixa);
+}
+
+struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, struct Pixa **cc) {
+  return handle->GetConnectedComponents(cc); // C API: returns handle->GetConnectedComponents(cc).
+}
+
+struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, TessPageIteratorLevel level,
+                                           BOOL text_only, struct Pixa **pixa, int **blockids) {
+  return handle->GetComponentImages(level, static_cast<bool>(text_only), pixa, blockids); // text_only is converted to bool.
+}
+
+struct Boxa *TessBaseAPIGetComponentImages1(TessBaseAPI *handle, const TessPageIteratorLevel level,
+                                            const BOOL text_only, const BOOL raw_image,
+                                            const int raw_padding, struct Pixa **pixa,
+                                            int **blockids, int **paraids) { // C API: seven-argument GetComponentImages overload.
+  return handle->GetComponentImages(level, static_cast<bool>(text_only), raw_image != 0,
+                                    raw_padding, pixa, blockids, paraids); // Both BOOL flags are converted to bool.
+}
+
+int TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI *handle) {
+  return handle->GetThresholdedImageScaleFactor(); // C API: returns handle->GetThresholdedImageScaleFactor().
+}
+
+TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle) { // C API: returns handle->AnalyseLayout().
+  return handle->AnalyseLayout();
+}
+
+int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor) { // C API: returns handle->Recognize(monitor).
+  return handle->Recognize(monitor);
+}
+
+BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, const char *retry_config,
+                             int timeout_millisec, TessResultRenderer *renderer) { // C API: wraps ProcessPages.
+  return static_cast<int>(handle->ProcessPages(filename, retry_config, timeout_millisec, renderer)); // Result converted to int.
+}
+
+BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, int page_index,
+                            const char *filename, const char *retry_config, int timeout_millisec,
+                            TessResultRenderer *renderer) { // C API: wraps ProcessPage.
+  return static_cast<int>(
+      handle->ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, renderer)); // Result converted to int.
+}
+
+TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle) { // C API: returns handle->GetIterator().
+  return handle->GetIterator();
+}
+
+TessMutableIterator *TessBaseAPIGetMutableIterator(TessBaseAPI *handle) { // C API: returns handle->GetMutableIterator().
+  return handle->GetMutableIterator();
+}
+
+char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle) { // C API: returns handle->GetUTF8Text().
+  return handle->GetUTF8Text();
+}
+
+char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number) {
+  return handle->GetHOCRText(nullptr, page_number); // nullptr is passed as GetHOCRText's first argument.
+}
+
+char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) { // C API: returns handle->GetAltoText(page_number).
+  return handle->GetAltoText(page_number);
+}
+
+char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) { // C API: returns handle->GetTSVText(page_number).
+  return handle->GetTSVText(page_number);
+}
+
+char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number) { // C API: returns handle->GetBoxText(page_number).
+  return handle->GetBoxText(page_number);
+}
+
+char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, int page_number) { // C API: wraps GetWordStrBoxText.
+  return handle->GetWordStrBoxText(page_number);
+}
+
+char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number) { // C API: wraps GetLSTMBoxText.
+  return handle->GetLSTMBoxText(page_number);
+}
+
+char *TessBaseAPIGetUNLVText(TessBaseAPI *handle) { // C API: returns handle->GetUNLVText().
+  return handle->GetUNLVText();
+}
+
+int TessBaseAPIMeanTextConf(TessBaseAPI *handle) { // C API: returns handle->MeanTextConf().
+  return handle->MeanTextConf();
+}
+
+int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle) { // C API: returns handle->AllWordConfidences().
+  return handle->AllWordConfidences();
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, TessPageSegMode mode, const char *wordstr) {
+  return static_cast<int>(handle->AdaptToWordStr(mode, wordstr)); // Compiled only when DISABLED_LEGACY_ENGINE is not defined.
+}
+#endif
+
+void TessBaseAPIClear(TessBaseAPI *handle) { // C API: forwards to handle->Clear().
+  handle->Clear();
+}
+
+void TessBaseAPIEnd(TessBaseAPI *handle) { // C API: forwards to handle->End().
+  handle->End();
+}
+
+int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word) { // C API: returns handle->IsValidWord(word).
+  return handle->IsValidWord(word);
+}
+
+BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, float *out_slope) {
+  return static_cast<int>(handle->GetTextDirection(out_offset, out_slope)); // Result of GetTextDirection converted to int.
+}
+
+const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id) { // C API: returns handle->GetUnichar(unichar_id).
+  return handle->GetUnichar(unichar_id);
+}
+
+void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, double margin) {
+  handle->set_min_orientation_margin(margin); // C API: forwards to set_min_orientation_margin.
+}
+
+// C binding: forwards to handle->NumDawgs().
+int TessBaseAPINumDawgs(const TessBaseAPI *handle) {
+  const int dawg_count = handle->NumDawgs();
+  return dawg_count;
+}
+
+// C binding: forwards to handle->oem() and returns the engine mode.
+TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle) {
+  const TessOcrEngineMode engine_mode = handle->oem();
+  return engine_mode;
+}
+
+// C binding: forwards both out-parameters to
+// handle->GetBlockTextOrientations().
+void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, int **block_orientation,
+                                      bool **vertical_writing) {
+  TessBaseAPI &api = *handle;
+  api.GetBlockTextOrientations(block_orientation, vertical_writing);
+}
+
+// C binding: destroys the page iterator with `delete`.
+void TessPageIteratorDelete(TessPageIterator *handle) {
+  delete handle;
+}
+
+// C binding: returns a new heap-allocated TessPageIterator copy-constructed
+// from *handle.
+TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle) {
+  const TessPageIterator &source = *handle;
+  return new TessPageIterator(source);
+}
+
+// C binding: invokes Begin() on the page iterator.
+void TessPageIteratorBegin(TessPageIterator *handle) {
+  TessPageIterator &iterator = *handle;
+  iterator.Begin();
+}
+
+// C binding: forwards to handle->Next(level) and converts the result to BOOL.
+BOOL TessPageIteratorNext(TessPageIterator *handle, TessPageIteratorLevel level) {
+  const auto advanced = handle->Next(level);
+  return static_cast<int>(advanced);
+}
+
+// C binding: forwards to handle->IsAtBeginningOf(level) and converts the
+// result to BOOL.
+BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, TessPageIteratorLevel level) {
+  const auto at_beginning = handle->IsAtBeginningOf(level);
+  return static_cast<int>(at_beginning);
+}
+
+// C binding: forwards to handle->IsAtFinalElement(level, element) and converts
+// the result to BOOL.
+BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, TessPageIteratorLevel level,
+                                      TessPageIteratorLevel element) {
+  const auto at_final = handle->IsAtFinalElement(level, element);
+  return static_cast<int>(at_final);
+}
+
+// C binding: forwards the four out-parameters to handle->BoundingBox() and
+// converts the result to BOOL.
+BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, TessPageIteratorLevel level,
+                                 int *left, int *top, int *right, int *bottom) {
+  const auto has_box = handle->BoundingBox(level, left, top, right, bottom);
+  return static_cast<int>(has_box);
+}
+
+// C binding: forwards to handle->BlockType().
+TessPolyBlockType TessPageIteratorBlockType(const TessPageIterator *handle) {
+  const TessPolyBlockType block_type = handle->BlockType();
+  return block_type;
+}
+
+// C binding: forwards to handle->GetBinaryImage(level) and returns the image
+// pointer it produces.
+struct Pix *TessPageIteratorGetBinaryImage(const TessPageIterator *handle,
+                                           TessPageIteratorLevel level) {
+  struct Pix *const image = handle->GetBinaryImage(level);
+  return image;
+}
+
+// C binding: forwards all arguments to handle->GetImage() and returns the
+// image pointer it produces.
+struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, TessPageIteratorLevel level,
+                                     int padding, struct Pix *original_image, int *left, int *top) {
+  struct Pix *const image = handle->GetImage(level, padding, original_image, left, top);
+  return image;
+}
+
+// C binding: forwards the endpoint out-parameters to handle->Baseline() and
+// converts the result to BOOL.
+BOOL TessPageIteratorBaseline(const TessPageIterator *handle, TessPageIteratorLevel level, int *x1,
+                              int *y1, int *x2, int *y2) {
+  const auto has_baseline = handle->Baseline(level, x1, y1, x2, y2);
+  return static_cast<int>(has_baseline);
+}
+
+// C binding: forwards the four out-parameters to handle->Orientation().
+void TessPageIteratorOrientation(TessPageIterator *handle, TessOrientation *orientation,
+                                 TessWritingDirection *writing_direction,
+                                 TessTextlineOrder *textline_order, float *deskew_angle) {
+  TessPageIterator &iterator = *handle;
+  iterator.Orientation(orientation, writing_direction, textline_order, deskew_angle);
+}
+
+// C binding for handle->ParagraphInfo(). The C++ method reports its flags
+// through bool out-parameters, so they are collected in bool temporaries and
+// then copied into the caller's BOOL slots; a null is_list_item or is_crown
+// pointer is skipped. justification and first_line_indent are passed straight
+// through.
+void TessPageIteratorParagraphInfo(TessPageIterator *handle,
+                                   TessParagraphJustification *justification, BOOL *is_list_item,
+                                   BOOL *is_crown, int *first_line_indent) {
+  // Start from a defined value so that a well-defined BOOL is copied out even
+  // if ParagraphInfo() returns without writing to these temporaries.
+  bool bool_is_list_item = false;
+  bool bool_is_crown = false;
+  handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);
+  if (is_list_item != nullptr) {
+    *is_list_item = static_cast<int>(bool_is_list_item);
+  }
+  if (is_crown != nullptr) {
+    *is_crown = static_cast<int>(bool_is_crown);
+  }
+}
+
+// C binding: destroys the result iterator with `delete`.
+void TessResultIteratorDelete(TessResultIterator *handle) {
+  delete handle;
+}
+
+// C binding: returns a new heap-allocated TessResultIterator copy-constructed
+// from *handle.
+TessResultIterator *TessResultIteratorCopy(const TessResultIterator *handle) {
+  const TessResultIterator &source = *handle;
+  return new TessResultIterator(source);
+}
+
+// C binding: returns the same object viewed through a TessPageIterator
+// pointer (implicit pointer conversion; no new object is created).
+TessPageIterator *TessResultIteratorGetPageIterator(TessResultIterator *handle) {
+  TessPageIterator *const as_page_iterator = handle;
+  return as_page_iterator;
+}
+
+// C binding: const variant; returns the same object viewed through a
+// const TessPageIterator pointer (implicit pointer conversion).
+const TessPageIterator *TessResultIteratorGetPageIteratorConst(const TessResultIterator *handle) {
+  const TessPageIterator *const as_page_iterator = handle;
+  return as_page_iterator;
+}
+
+// C binding: returns a new heap-allocated TessChoiceIterator constructed from
+// *handle.
+TessChoiceIterator *TessResultIteratorGetChoiceIterator(const TessResultIterator *handle) {
+  const TessResultIterator &position = *handle;
+  return new TessChoiceIterator(position);
+}
+
+// C binding: forwards to handle->Next(level) and converts the result to BOOL.
+BOOL TessResultIteratorNext(TessResultIterator *handle, TessPageIteratorLevel level) {
+  const auto advanced = handle->Next(level);
+  return static_cast<int>(advanced);
+}
+
+// C binding: forwards to handle->GetUTF8Text(level).
+char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, TessPageIteratorLevel level) {
+  char *const text = handle->GetUTF8Text(level);
+  return text;
+}
+
+// C binding: forwards to handle->Confidence(level).
+float TessResultIteratorConfidence(const TessResultIterator *handle, TessPageIteratorLevel level) {
+  const float confidence = handle->Confidence(level);
+  return confidence;
+}
+
+// C binding: forwards to handle->WordRecognitionLanguage().
+const char *TessResultIteratorWordRecognitionLanguage(const TessResultIterator *handle) {
+  const char *const language = handle->WordRecognitionLanguage();
+  return language;
+}
+
+// C binding for handle->WordFontAttributes(). The C++ method reports the six
+// style flags through bool out-parameters, so they are collected in bool
+// temporaries and then copied into the caller's BOOL slots; any null BOOL
+// pointer is skipped. pointsize and font_id are passed straight through.
+// Returns whatever WordFontAttributes() returned.
+const char *TessResultIteratorWordFontAttributes(const TessResultIterator *handle, BOOL *is_bold,
+                                                 BOOL *is_italic, BOOL *is_underlined,
+                                                 BOOL *is_monospace, BOOL *is_serif,
+                                                 BOOL *is_smallcaps, int *pointsize, int *font_id) {
+  // Start from defined values so that well-defined BOOLs are copied out even
+  // if WordFontAttributes() leaves some of these temporaries untouched.
+  bool bool_is_bold = false;
+  bool bool_is_italic = false;
+  bool bool_is_underlined = false;
+  bool bool_is_monospace = false;
+  bool bool_is_serif = false;
+  bool bool_is_smallcaps = false;
+  const char *ret = handle->WordFontAttributes(&bool_is_bold, &bool_is_italic, &bool_is_underlined,
+                                               &bool_is_monospace, &bool_is_serif,
+                                               &bool_is_smallcaps, pointsize, font_id);
+  // Writes one flag to an optional (possibly null) BOOL out-parameter.
+  const auto store = [](BOOL *out, bool value) {
+    if (out != nullptr) {
+      *out = static_cast<int>(value);
+    }
+  };
+  store(is_bold, bool_is_bold);
+  store(is_italic, bool_is_italic);
+  store(is_underlined, bool_is_underlined);
+  store(is_monospace, bool_is_monospace);
+  store(is_serif, bool_is_serif);
+  store(is_smallcaps, bool_is_smallcaps);
+  return ret;
+}
+
+// C binding: forwards to handle->WordIsFromDictionary() and converts the
+// result to BOOL.
+BOOL TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle) {
+  const auto from_dictionary = handle->WordIsFromDictionary();
+  return static_cast<int>(from_dictionary);
+}
+
+// C binding: forwards to handle->WordIsNumeric() and converts the result to
+// BOOL.
+BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle) {
+  const auto numeric = handle->WordIsNumeric();
+  return static_cast<int>(numeric);
+}
+
+// C binding: forwards to handle->SymbolIsSuperscript() and converts the
+// result to BOOL.
+BOOL TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle) {
+  const auto superscript = handle->SymbolIsSuperscript();
+  return static_cast<int>(superscript);
+}
+
+// C binding: forwards to handle->SymbolIsSubscript() and converts the result
+// to BOOL.
+BOOL TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle) {
+  const auto subscript = handle->SymbolIsSubscript();
+  return static_cast<int>(subscript);
+}
+
+// C binding: forwards to handle->SymbolIsDropcap() and converts the result to
+// BOOL.
+BOOL TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle) {
+  const auto dropcap = handle->SymbolIsDropcap();
+  return static_cast<int>(dropcap);
+}
+
+// C binding: destroys the choice iterator with `delete`.
+void TessChoiceIteratorDelete(TessChoiceIterator *handle) {
+  delete handle;
+}
+
+// C binding: forwards to handle->Next() and converts the result to BOOL.
+BOOL TessChoiceIteratorNext(TessChoiceIterator *handle) {
+  const auto advanced = handle->Next();
+  return static_cast<int>(advanced);
+}
+
+// C binding: forwards to handle->GetUTF8Text().
+const char *TessChoiceIteratorGetUTF8Text(const TessChoiceIterator *handle) {
+  const char *const text = handle->GetUTF8Text();
+  return text;
+}
+
+// C binding: forwards to handle->Confidence().
+float TessChoiceIteratorConfidence(const TessChoiceIterator *handle) {
+  const float confidence = handle->Confidence();
+  return confidence;
+}
+
+// C binding: allocates a value-initialized ETEXT_DESC with `new` and returns
+// it.
+ETEXT_DESC *TessMonitorCreate() {
+  ETEXT_DESC *const monitor = new ETEXT_DESC();
+  return monitor;
+}
+
+// C binding: destroys the monitor with `delete`.
+void TessMonitorDelete(ETEXT_DESC *monitor) {
+  delete monitor;
+}
+
+// C binding: stores cancelFunc in the monitor's `cancel` member.
+void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, TessCancelFunc cancelFunc) {
+  ETEXT_DESC &target = *monitor;
+  target.cancel = cancelFunc;
+}
+
+// C binding: stores cancelThis in the monitor's `cancel_this` member.
+void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis) {
+  ETEXT_DESC &target = *monitor;
+  target.cancel_this = cancelThis;
+}
+
+// C binding: returns the monitor's `cancel_this` member.
+void *TessMonitorGetCancelThis(ETEXT_DESC *monitor) {
+  void *const context = monitor->cancel_this;
+  return context;
+}
+
+// C binding: stores progressFunc in the monitor's `progress_callback2` member.
+void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, TessProgressFunc progressFunc) {
+  ETEXT_DESC &target = *monitor;
+  target.progress_callback2 = progressFunc;
+}
+
+// C binding: returns the monitor's `progress` member as an int.
+int TessMonitorGetProgress(ETEXT_DESC *monitor) {
+  const int progress = monitor->progress;
+  return progress;
+}
+
+// C binding: passes deadline to monitor->set_deadline_msecs().
+void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline) {
+  ETEXT_DESC &target = *monitor;
+  target.set_deadline_msecs(deadline);
+}
--- a/3rdparty/tesseract_ocr/tesseract/src/api/hocrrenderer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/hocrrenderer.cpp
@ -0,0 +1,489 @@
+/**********************************************************************
+ * File:        hocrrenderer.cpp
+ * Description: Simple API for calling tesseract.
+ * Author:      Ray Smith (original code from baseapi.cpp)
+ * Author:      Stefan Weil (moved to separate file and cleaned code)
+ *
+ * (C) Copyright 2006, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <tesseract/baseapi.h> // for TessBaseAPI
+#include <locale>              // for std::locale::classic
+#include <memory>              // for std::unique_ptr
+#include <sstream>             // for std::stringstream
+#ifdef _WIN32
+#  include "host.h" // windows.h for MultiByteToWideChar, ...
+#endif
+#include <tesseract/renderer.h>
+#include "tesseractclass.h" // for Tesseract
+
+namespace tesseract {
+
+/**
+ * Queries Orientation() at the iterator's current position and returns only
+ * the orientation component; the writing direction, textline order and deskew
+ * angle reported by the same call are discarded.
+ */
+static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
+  tesseract::Orientation block_orientation;
+  // Out-parameters required by Orientation(); their values are not used here.
+  tesseract::WritingDirection unused_direction;
+  tesseract::TextlineOrder unused_order;
+  float unused_deskew;
+  it->Orientation(&block_orientation, &unused_direction, &unused_order, &unused_deskew);
+  return block_orientation;
+}
+
+/**
+ * Appends baseline information for the element at the given level to the
+ * hOCR stream.
+ *
+ * For upright text, a straight line y = slope * x + intercept is put through
+ * the two baseline endpoints (expressed relative to the bottom-left corner of
+ * the bounding box) and both coefficients, rounded to three decimals, are
+ * written as "; baseline <slope> <intercept>".
+ *
+ * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
+ * rotated textlines. For that reason, when the text is not upright only a
+ * 'textangle' property giving the rotation is written and no baseline is
+ * emitted. Nothing is written if no baseline is available or if the two
+ * endpoints share the same x coordinate.
+ */
+static void AddBaselineCoordsTohOCR(const PageIterator *it, PageIteratorLevel level,
+                                    std::stringstream &hocr_str) {
+  const tesseract::Orientation orientation = GetBlockTextOrientation(it);
+  const bool upright = (orientation == ORIENTATION_PAGE_UP);
+  if (!upright) {
+    hocr_str << "; textangle " << 360 - orientation * 90;
+    return;
+  }
+
+  int box_left, box_top, box_right, box_bottom;
+  it->BoundingBox(level, &box_left, &box_top, &box_right, &box_bottom);
+
+  // Fetch the baseline endpoints; give up silently when there are none.
+  int start_x, start_y, end_x, end_y;
+  const bool has_baseline = it->Baseline(level, &start_x, &start_y, &end_x, &end_y);
+  if (!has_baseline) {
+    return;
+  }
+
+  // The hOCR spec wants the baseline described with "the bottom left of the
+  // bounding box" as the origin, so shift both endpoints accordingly.
+  start_x -= box_left;
+  end_x -= box_left;
+  start_y -= box_bottom;
+  end_y -= box_bottom;
+
+  // A vertical baseline has no finite slope, so no coefficients can be given.
+  if (start_x == end_x) {
+    return;
+  }
+  const double slope = (end_y - start_y) / static_cast<double>(end_x - start_x);
+  const double intercept = start_y - slope * start_x;
+
+  hocr_str << "; baseline " << round(slope * 1000.0) / 1000.0 << " "
+           << round(intercept * 1000.0) / 1000.0;
+}
+
+/**
+ * Writes the title attribute (bounding box, plus baseline and row metrics for
+ * textlines) of the element at the given level and closes the opening tag.
+ */
+static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
+                         std::stringstream &hocr_str) {
+  int box_left, box_top, box_right, box_bottom;
+  it->BoundingBox(level, &box_left, &box_top, &box_right, &box_bottom);
+  // This is the only place where the attribute is delimited with double
+  // quotes instead of single quotes, but it may be too late to change that
+  // for consistency.
+  hocr_str << " title=\"bbox " << box_left << " " << box_top << " " << box_right << " "
+           << box_bottom;
+  const bool is_textline = (level == RIL_TEXTLINE);
+  if (is_textline) {
+    // Only textlines carry baseline coordinates and custom height measures.
+    AddBaselineCoordsTohOCR(it, level, hocr_str);
+    float row_height, descenders, ascenders; // row attributes
+    it->RowAttributes(&row_height, &descenders, &ascenders);
+    // TODO(rays): Do we want to limit these to a single decimal place?
+    hocr_str << "; x_size " << row_height;
+    hocr_str << "; x_descenders " << -descenders;
+    hocr_str << "; x_ascenders " << ascenders;
+  }
+  hocr_str << "\">";
+}
+
+/**
+ * Convenience overload: produces the hOCR markup for a page without a
+ * progress monitor by delegating to GetHOCRText(ETEXT_DESC *, int) with a
+ * null monitor.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Image name/input_file_ can be set by SetInputName before calling
+ * GetHOCRText
+ * Returned string must be freed with the delete [] operator.
+ */
+char *TessBaseAPI::GetHOCRText(int page_number) {
+  ETEXT_DESC *const no_monitor = nullptr;
+  return GetHOCRText(no_monitor, page_number);
+}
+
+/**
+ * Make a HTML-formatted string with hOCR markup from the internal
+ * data structures.
+ * If no recognition results exist yet, Recognize(monitor) is run first;
+ * nullptr is returned when there is no engine or recognition fails.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Image name/input_file_ can be set by SetInputName before calling
+ * GetHOCRText
+ * The variables hocr_font_info and hocr_char_boxes select the optional
+ * font attributes and per-character boxes; lstm_choice_mode selects the
+ * optional LSTM alternative-choice output.
+ * On Windows the image name is converted from the ANSI code page to UTF-8
+ * for the output only; the stored input_file_ is left untouched so that
+ * repeated calls do not re-encode an already converted name.
+ * Returned string must be freed with the delete [] operator.
+ */
+char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
+    return nullptr;
+  }
+
+  // Running counters used to build unique element ids.
+  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
+  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
+  bool para_is_ltr = true;       // Default direction is LTR
+  const char *paragraph_lang = nullptr;
+  bool font_info = false;
+  bool hocr_boxes = false;
+  GetBoolVariable("hocr_font_info", &font_info);
+  GetBoolVariable("hocr_char_boxes", &hocr_boxes);
+
+  if (input_file_.empty()) {
+    SetInputName(nullptr);
+  }
+
+  // Image name as it will be written into the page title.
+  std::string image_name = input_file_.c_str();
+#ifdef _WIN32
+  // Convert the input name from ANSI encoding to UTF-8. The result goes into
+  // a local so the member keeps its original encoding. If either conversion
+  // step fails (length 0), the unconverted name is used.
+  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
+  if (str16_len > 0) {
+    std::unique_ptr<wchar_t[]> uni16_str(new wchar_t[str16_len]);
+    str16_len =
+        MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str.get(), str16_len);
+    int utf8_len =
+        WideCharToMultiByte(CP_UTF8, 0, uni16_str.get(), str16_len, nullptr, 0, nullptr, nullptr);
+    if (utf8_len > 0) {
+      std::unique_ptr<char[]> utf8_str(new char[utf8_len]);
+      // The source length includes the terminating NUL (input length -1
+      // above), so the converted buffer is NUL-terminated as well.
+      if (WideCharToMultiByte(CP_UTF8, 0, uni16_str.get(), str16_len, utf8_str.get(), utf8_len,
+                              nullptr, nullptr) > 0) {
+        image_name = utf8_str.get();
+      }
+    }
+  }
+#endif
+
+  std::stringstream hocr_str;
+  // Use "C" locale (needed for double values x_size and x_descenders).
+  hocr_str.imbue(std::locale::classic());
+  // Use 8 digits for double values.
+  hocr_str.precision(8);
+  hocr_str << "  <div class='ocr_page'"
+           << " id='"
+           << "page_" << page_id << "'"
+           << " title='image \"";
+  if (!image_name.empty()) {
+    hocr_str << HOcrEscape(image_name.c_str());
+  } else {
+    hocr_str << "unknown";
+  }
+  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " << rect_width_ << " "
+           << rect_height_ << "; ppageno " << page_number << "'>\n";
+
+  std::unique_ptr<ResultIterator> res_it(GetIterator());
+  while (!res_it->Empty(RIL_BLOCK)) {
+    // Skip positions that hold no word.
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    // Open any new block/paragraph/textline.
+    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      para_is_ltr = true; // reset to default direction
+      hocr_str << "   <div class='ocr_carea'"
+               << " id='"
+               << "block_" << page_id << "_" << bcnt << "'";
+      AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
+    }
+    if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      hocr_str << "\n    <p class='ocr_par'";
+      para_is_ltr = res_it->ParagraphIsLtr();
+      if (!para_is_ltr) {
+        hocr_str << " dir='rtl'";
+      }
+      hocr_str << " id='"
+               << "par_" << page_id << "_" << pcnt << "'";
+      paragraph_lang = res_it->WordRecognitionLanguage();
+      if (paragraph_lang) {
+        hocr_str << " lang='" << paragraph_lang << "'";
+      }
+      AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
+    }
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      hocr_str << "\n     <span class='";
+      switch (res_it->BlockType()) {
+        case PT_HEADING_TEXT:
+          hocr_str << "ocr_header";
+          break;
+        case PT_PULLOUT_TEXT:
+          hocr_str << "ocr_textfloat";
+          break;
+        case PT_CAPTION_TEXT:
+          hocr_str << "ocr_caption";
+          break;
+        default:
+          hocr_str << "ocr_line";
+      }
+      hocr_str << "' id='"
+               << "line_" << page_id << "_" << lcnt << "'";
+      AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
+    }
+
+    // Now, process the word...
+    int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
+    std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *rawTimestepMap = nullptr;
+    std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
+    if (lstm_choice_mode) {
+      // Must be fetched before the symbol loop below moves the iterator off
+      // this word.
+      CTCMap = res_it->GetBestLSTMSymbolChoices();
+      rawTimestepMap = res_it->GetRawLSTMTimesteps();
+    }
+    hocr_str << "\n      <span class='ocrx_word'"
+             << " id='"
+             << "word_" << page_id << "_" << wcnt << "'";
+    int left, top, right, bottom;
+    bool bold, italic, underlined, monospace, serif, smallcaps;
+    int pointsize, font_id;
+    const char *font_name;
+    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+    font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
+                                           &smallcaps, &pointsize, &font_id);
+    hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom
+             << "; x_wconf " << static_cast<int>(res_it->Confidence(RIL_WORD));
+    if (font_info) {
+      if (font_name) {
+        hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
+      }
+      hocr_str << "; x_fsize " << pointsize;
+    }
+    hocr_str << "'";
+    const char *lang = res_it->WordRecognitionLanguage();
+    // Only emit the word language if it differs from the paragraph language.
+    if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
+      hocr_str << " lang='" << lang << "'";
+    }
+    switch (res_it->WordDirection()) {
+      // Only emit direction if different from current paragraph direction
+      case DIR_LEFT_TO_RIGHT:
+        if (!para_is_ltr) {
+          hocr_str << " dir='ltr'";
+        }
+        break;
+      case DIR_RIGHT_TO_LEFT:
+        if (para_is_ltr) {
+          hocr_str << " dir='rtl'";
+        }
+        break;
+      case DIR_MIX:
+      case DIR_NEUTRAL:
+      default: // Do nothing.
+        break;
+    }
+    hocr_str << ">";
+    // Sample the "last word" state now: the symbol loop below advances the
+    // iterator past this word.
+    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
+    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
+    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
+    if (bold) {
+      hocr_str << "<strong>";
+    }
+    if (italic) {
+      hocr_str << "<em>";
+    }
+    do {
+      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
+      if (grapheme && grapheme[0] != 0) {
+        if (hocr_boxes) {
+          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
+          hocr_str << "\n       <span class='ocrx_cinfo' title='x_bboxes " << left << " " << top
+                   << " " << right << " " << bottom << "; x_conf " << res_it->Confidence(RIL_SYMBOL)
+                   << "'>";
+        }
+        hocr_str << HOcrEscape(grapheme.get()).c_str();
+        if (hocr_boxes) {
+          hocr_str << "</span>";
+          tesseract::ChoiceIterator ci(*res_it);
+          if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
+            std::vector<std::vector<std::pair<const char *, float>>> *symbol = ci.Timesteps();
+            hocr_str << "\n        <span class='ocr_symbol'"
+                     << " id='"
+                     << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
+            for (const auto &timestep : *symbol) {
+              hocr_str << "\n         <span class='ocrx_cinfo'"
+                       << " id='"
+                       << "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
+              for (const auto &conf : timestep) {
+                hocr_str << "\n          <span class='ocrx_cinfo'"
+                         << " id='"
+                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                         << " title='x_confs " << int(conf.second * 100) << "'>"
+                         << HOcrEscape(conf.first).c_str() << "</span>";
+                ++ccnt;
+              }
+              hocr_str << "</span>";
+              ++tcnt;
+            }
+            hocr_str << "\n        </span>";
+            ++scnt;
+          } else if (lstm_choice_mode == 2) {
+            // ci has not been advanced at this point, so it still addresses
+            // the first choice of the current symbol.
+            hocr_str << "\n        <span class='ocrx_cinfo'"
+                     << " id='"
+                     << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
+            do {
+              const char *choice = ci.GetUTF8Text();
+              float choiceconf = ci.Confidence();
+              if (choice != nullptr) {
+                hocr_str << "\n         <span class='ocrx_cinfo'"
+                         << " id='"
+                         << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                         << " title='x_confs " << choiceconf << "'>" << HOcrEscape(choice).c_str()
+                         << "</span>";
+                ccnt++;
+              }
+            } while (ci.Next());
+            hocr_str << "\n        </span>";
+            tcnt++;
+          }
+        }
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+    if (italic) {
+      hocr_str << "</em>";
+    }
+    if (bold) {
+      hocr_str << "</strong>";
+    }
+    // If the lstm choice mode is required it is added here
+    if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
+      for (const auto &symbol : *rawTimestepMap) {
+        hocr_str << "\n       <span class='ocr_symbol'"
+                 << " id='"
+                 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
+        for (const auto &timestep : symbol) {
+          hocr_str << "\n        <span class='ocrx_cinfo'"
+                   << " id='"
+                   << "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
+          for (const auto &conf : timestep) {
+            hocr_str << "\n         <span class='ocrx_cinfo'"
+                     << " id='"
+                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                     << " title='x_confs " << int(conf.second * 100) << "'>"
+                     << HOcrEscape(conf.first).c_str() << "</span>";
+            ++ccnt;
+          }
+          hocr_str << "</span>";
+          ++tcnt;
+        }
+        hocr_str << "</span>";
+        ++scnt;
+      }
+    } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
+      for (const auto &timestep : *CTCMap) {
+        if (timestep.size() > 0) {
+          hocr_str << "\n       <span class='ocrx_cinfo'"
+                   << " id='"
+                   << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
+          for (const auto &j : timestep) {
+            // Turn the rating into a confidence and clamp it to [0, 100].
+            float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
+            if (conf < 0.0f) {
+              conf = 0.0f;
+            }
+            if (conf > 100.0f) {
+              conf = 100.0f;
+            }
+            hocr_str << "\n        <span class='ocrx_cinfo'"
+                     << " id='"
+                     << "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
+                     << " title='x_confs " << conf << "'>" << HOcrEscape(j.first).c_str()
+                     << "</span>";
+            ccnt++;
+          }
+          hocr_str << "</span>";
+          tcnt++;
+        }
+      }
+    }
+    // Close ocrx_word.
+    if (hocr_boxes || lstm_choice_mode > 0) {
+      hocr_str << "\n      ";
+    }
+    hocr_str << "</span>";
+    // Timestep and choice ids restart for every word.
+    tcnt = 1;
+    ccnt = 1;
+    wcnt++;
+    // Close any ending block/paragraph/textline.
+    if (last_word_in_line) {
+      hocr_str << "\n     </span>";
+      lcnt++;
+    }
+    if (last_word_in_para) {
+      hocr_str << "\n    </p>\n";
+      pcnt++;
+      para_is_ltr = true; // back to default direction
+    }
+    if (last_word_in_block) {
+      hocr_str << "   </div>\n";
+      bcnt++;
+    }
+  }
+  hocr_str << "  </div>\n";
+
+  // Hand the markup back in a new[]-allocated buffer (see the contract above).
+  const std::string &text = hocr_str.str();
+  char *result = new char[text.length() + 1];
+  strcpy(result, text.c_str());
+  return result;
+}
+
+/**********************************************************************
+ * HOcr Text Renderer interface implementation
+ **********************************************************************/
+// Creates an hOCR renderer with font information disabled; equivalent to
+// calling the two-argument constructor with font_info == false.
+TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
+    : TessHOcrRenderer(outputbase, false) {}
+
+// Creates an hOCR renderer on top of TessResultRenderer(outputbase, "hocr")
+// and records whether font information was requested.
+TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
+    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}
+
+// Emits the document preamble: XML declaration, XHTML doctype, <head> with
+// title and the ocr-system / ocr-capabilities meta tags, and the opening
+// <body>. The extra capabilities are listed only when font_info_ is set.
+// Always returns true.
+bool TessHOcrRenderer::BeginDocumentHandler() {
+  static const char kDocumentStart[] =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
+      "    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
+      "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
+      "lang=\"en\">\n <head>\n  <title>";
+  static const char kMetaStart[] =
+      "</title>\n"
+      "  <meta http-equiv=\"Content-Type\" content=\"text/html;"
+      "charset=utf-8\"/>\n"
+      "  <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
+      "' />\n"
+      "  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
+      " ocr_line ocrx_word ocrp_wconf";
+  static const char kFontCapabilities[] = " ocrp_lang ocrp_dir ocrp_font ocrp_fsize";
+  static const char kHeadEnd[] =
+      "'/>\n"
+      " </head>\n"
+      " <body>\n";
+
+  AppendString(kDocumentStart);
+  AppendString(title());
+  AppendString(kMetaStart);
+  if (font_info_) {
+    AppendString(kFontCapabilities);
+  }
+  AppendString(kHeadEnd);
+
+  return true;
+}
+
+// Emits the closing </body> and </html> tags. Always returns true.
+bool TessHOcrRenderer::EndDocumentHandler() {
+  static const char kDocumentEnd[] = " </body>\n</html>\n";
+  AppendString(kDocumentEnd);
+
+  return true;
+}
+
+// Appends the hOCR markup of the current image (api->GetHOCRText(imagenum()))
+// to the output. Returns false if no markup could be produced, true otherwise.
+// The markup buffer is released with delete[] when it goes out of scope.
+bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_markup(api->GetHOCRText(imagenum()));
+  if (!page_markup) {
+    return false;
+  }
+
+  AppendString(page_markup.get());
+  return true;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/api/lstmboxrenderer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/lstmboxrenderer.cpp
@ -0,0 +1,107 @@
+/**********************************************************************
+ * File:        lstmboxrenderer.cpp
+ * Description: Renderer for creating box file for LSTM training.
+ *              based on the tsv renderer.
+ *
+ * (C) Copyright 2019, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <tesseract/baseapi.h> // for TessBaseAPI
+#include <tesseract/renderer.h>
+#include "tesseractclass.h" // for Tesseract
+
+namespace tesseract {
+
+/**
+ * Appends the trailing fields of one box-file row to text: the flipped
+ * bottom (image_height - bottom), right + 5, the flipped top
+ * (image_height - top) and the page number, each preceded by one space.
+ * The caller writes the symbol text and the left coordinate beforehand.
+ */
+static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
+                         std::string &text) {
+  text += " " + std::to_string(image_height - bottom);
+  text += " " + std::to_string(right + 5);
+  text += " " + std::to_string(image_height - top);
+  text += " " + std::to_string(page_num);
+}
+
+/**
+ * Create a UTF8 box file for LSTM training from the internal data structures.
+ * page_number is a 0-based page index that will appear in the box file.
+ * If no recognition results exist yet, Recognize(nullptr) is run first;
+ * nullptr is returned when there is no engine or recognition fails.
+ * Every row uses the bounding box of the whole textline. Besides one row per
+ * symbol, a row holding two spaces is written between words and a row holding
+ * a tab is written at the end of each line and at the end of the page.
+ * Returned string must be freed with the delete [] operator.
+ */
+char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+
+  std::string lstm_box_str;
+  bool first_word = true;
+  int left = 0, top = 0, right = 0, bottom = 0;
+
+  // Held by a smart pointer so the iterator is released on every exit path,
+  // including an exception thrown by the string operations below.
+  const std::unique_ptr<LTRResultIterator> res_it(GetLTRIterator());
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_SYMBOL)) {
+      res_it->Next(RIL_SYMBOL);
+      continue;
+    }
+    if (!first_word) {
+      // left/top/right/bottom still describe the line of the previous symbol.
+      if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+        lstm_box_str += "\t " + std::to_string(left);
+        AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+        lstm_box_str += "\n"; // end of row for line
+      } else if (res_it->IsAtBeginningOf(RIL_WORD)) {
+        lstm_box_str += "  " + std::to_string(left);
+        AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+        lstm_box_str += "\n"; // end of row for word
+      }
+    } // not first word
+    first_word = false;
+    // Use bounding box for whole line for everything
+    res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+    do {
+      const std::unique_ptr<const char[]> symbol_text(res_it->GetUTF8Text(RIL_SYMBOL));
+      // Defensive: appending a null pointer to std::string is undefined.
+      if (symbol_text) {
+        lstm_box_str += symbol_text.get();
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
+    lstm_box_str += " " + std::to_string(left);
+    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+    lstm_box_str += "\n"; // end of row for symbol
+  }
+  if (!first_word) { // if first_word is true  => empty page
+    lstm_box_str += "\t " + std::to_string(left);
+    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+    lstm_box_str += "\n"; // end of PAGE
+  }
+  // Hand the text back in a new[]-allocated buffer (see the contract above).
+  char *ret = new char[lstm_box_str.length() + 1];
+  strcpy(ret, lstm_box_str.c_str());
+  return ret;
+}
+
+/**********************************************************************
+ * LSTMBox Renderer interface implementation
+ **********************************************************************/
+// Creates the renderer on top of TessResultRenderer(outputbase, "box").
+TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "box") {
+}
+
+// Appends the LSTM box text of the current image
+// (api->GetLSTMBoxText(imagenum())) to the output. Returns false if no text
+// could be produced, true otherwise. The text buffer is released with
+// delete[] when it goes out of scope.
+bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> box_text(api->GetLSTMBoxText(imagenum()));
+  if (!box_text) {
+    return false;
+  }
+
+  AppendString(box_text.get());
+  return true;
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/api/pdf_ttf.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/pdf_ttf.h
@ -0,0 +1,63 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pdf_ttf.h
+// Description: pdf.ttf (GlyphLessFont) replacement.
+//              Generated with: "bin2cpp pdf.ttf pdf_ttf cpp17"
+// Author:      Zdenko Podobny
+//
+// (C) Copyright 2020, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef pdf_ttf__H
+#define pdf_ttf__H
+
+#include <cstdint> // uint8_t
+
+// Raw bytes of the glyphless font (pdf.ttf), generated with bin2cpp as
+// noted in the file header. sizeof(pdf_ttf) gives the font size in bytes.
+// The array is `static`, so every translation unit that includes this
+// header gets its own copy.
+static const uint8_t pdf_ttf[] = {
+    0x0,  0x1,  0x0,  0x0,  0x0,  0xa,  0x0,  0x80, 0x0,  0x3,  0x0,  0x20, 0x4f, 0x53, 0x2f, 0x32,
+    0x56, 0xde, 0xc8, 0x94, 0x0,  0x0,  0x1,  0x28, 0x0,  0x0,  0x0,  0x60, 0x63, 0x6d, 0x61, 0x70,
+    0x0,  0xa,  0x0,  0x34, 0x0,  0x0,  0x1,  0x90, 0x0,  0x0,  0x0,  0x1e, 0x67, 0x6c, 0x79, 0x66,
+    0x15, 0x22, 0x41, 0x24, 0x0,  0x0,  0x1,  0xb8, 0x0,  0x0,  0x0,  0x18, 0x68, 0x65, 0x61, 0x64,
+    0xb,  0x78, 0xf1, 0x65, 0x0,  0x0,  0x0,  0xac, 0x0,  0x0,  0x0,  0x36, 0x68, 0x68, 0x65, 0x61,
+    0xc,  0x2,  0x4,  0x2,  0x0,  0x0,  0x0,  0xe4, 0x0,  0x0,  0x0,  0x24, 0x68, 0x6d, 0x74, 0x78,
+    0x4,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x88, 0x0,  0x0,  0x0,  0x8,  0x6c, 0x6f, 0x63, 0x61,
+    0x0,  0xc,  0x0,  0x0,  0x0,  0x0,  0x1,  0xb0, 0x0,  0x0,  0x0,  0x6,  0x6d, 0x61, 0x78, 0x70,
+    0x0,  0x4,  0x0,  0x5,  0x0,  0x0,  0x1,  0x8,  0x0,  0x0,  0x0,  0x20, 0x6e, 0x61, 0x6d, 0x65,
+    0xf2, 0xeb, 0x16, 0xda, 0x0,  0x0,  0x1,  0xd0, 0x0,  0x0,  0x0,  0x4b, 0x70, 0x6f, 0x73, 0x74,
+    0x0,  0x1,  0x0,  0x1,  0x0,  0x0,  0x2,  0x1c, 0x0,  0x0,  0x0,  0x20, 0x0,  0x1,  0x0,  0x0,
+    0x0,  0x1,  0x0,  0x0,  0xb0, 0x94, 0x71, 0x10, 0x5f, 0xf,  0x3c, 0xf5, 0x4,  0x7,  0x8,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0xcf, 0x9a, 0xfc, 0x6e, 0x0,  0x0,  0x0,  0x0,  0xd4, 0xc3, 0xa7, 0xf2,
+    0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x8,  0x0,  0x0,  0x0,  0x0,  0x10, 0x0,  0x2,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x8,  0x0,  0xff, 0xff, 0x0,  0x0,  0x4,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x2,  0x0,  0x1,  0x0,  0x0,  0x0,  0x2,  0x0,  0x4,
+    0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x3,  0x0,  0x0,  0x1,  0x90, 0x0,  0x5,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x5,  0x0,  0x1,  0x0,  0x1,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x47, 0x4f, 0x4f, 0x47, 0x0,  0x40, 0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0xff, 0xff,
+    0x0,  0x0,  0x0,  0x1,  0x0,  0x1,  0x80, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x4,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x2,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x14, 0x0,  0x3,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x14, 0x0,  0x6,  0x0,  0xa,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0xc,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x4,  0x0,
+    0x8,  0x0,  0x0,  0x3,  0x0,  0x0,  0x31, 0x21, 0x11, 0x21, 0x4,  0x0,  0xfc, 0x0,  0x8,  0x0,
+    0x0,  0x0,  0x0,  0x3,  0x0,  0x2a, 0x0,  0x0,  0x0,  0x3,  0x0,  0x0,  0x0,  0x5,  0x0,  0x16,
+    0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,  0x0,  0x5,  0x0,  0xb,  0x0,  0x16, 0x0,  0x3,
+    0x0,  0x1,  0x4,  0x9,  0x0,  0x5,  0x0,  0x16, 0x0,  0x0,  0x0,  0x56, 0x0,  0x65, 0x0,  0x72,
+    0x0,  0x73, 0x0,  0x69, 0x0,  0x6f, 0x0,  0x6e, 0x0,  0x20, 0x0,  0x31, 0x0,  0x2e, 0x0,  0x30,
+    0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x30, 0x0,  0x0,  0x1,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x1,  0x0,  0x0,  0x0,  0x0,
+    0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0};
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/api/pdfrenderer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/pdfrenderer.cpp
@ -0,0 +1,969 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pdfrenderer.cpp
+// Description: PDF rendering interface to inject into TessBaseAPI
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "pdf_ttf.h"
+#include "tprintf.h"
+
+#include <allheaders.h>
+#include <tesseract/baseapi.h>
+#include <tesseract/renderer.h>
+#include <cmath>
+#include <cstring>
+#include <fstream>   // for std::ifstream
+#include <locale>    // for std::locale::classic
+#include <memory>    // std::unique_ptr
+#include <sstream>   // for std::stringstream
+#include "helpers.h" // for Swap
+
+/*
+
+Design notes from Ken Sharp, with light editing.
+
+We think one solution is a font with a single glyph (.notdef) and a
+CIDToGIDMap which maps all the CIDs to 0. That map would then be
+stored as a stream in the PDF file, and when flat compressed should
+be pretty small. The font, of course, will be approximately the same
+size as the one you currently use.
+
+I'm working on such a font now, the CIDToGIDMap is trivial, you just
+create a stream object which contains 128k bytes (2 bytes per possible
+CID and your CIDs range from 0 to 65535) and where you currently have
+"/CIDToGIDMap /Identity" you would have "/CIDToGIDMap <object> 0 R".
+
+Note that if, in future, you were to use a different (ie not 2 byte)
+CMap for character codes you could trivially extend the CIDToGIDMap.
+
+The following is an explanation of how some of the font stuff works,
+this may be too simple for you in which case please accept my
+apologies, it's hard to know how much knowledge someone has. You can
+skip all this anyway, it's just for information.
+
+The font embedded in a PDF file is usually intended just to be
+rendered, but extensions allow for at least some ability to locate (or
+copy) text from a document. This isn't something which was an original
+goal of the PDF format, but it's been retro-fitted, presumably due to
+popular demand.
+
+To do this reliably the PDF file must contain a ToUnicode CMap, a
+device for mapping character codes to Unicode code points. If one of
+these is present, then this will be used to convert the character
+codes into Unicode values. If it's not present then the reader will
+fall back through a series of heuristics to try and guess the
+result. This is, as you would expect, prone to failure.
+
+This doesn't concern you of course, since you always write a ToUnicode
+CMap, so because you are writing the text in text rendering mode 3 it
+would seem that you don't really need to worry about this, but in the
+PDF spec you cannot have an isolated ToUnicode CMap, it has to be
+attached to a font, so in order to get even copy/paste to work you
+need to define a font.
+
+This is what leads to problems, tools like pdfwrite assume that they
+are going to be able to (or even have to) modify the font entries, so
+they require that the font being embedded be valid, and to be honest
+the font Tesseract embeds isn't valid (for this purpose).
+
+
+To see why, let's look at how text is specified in a PDF file:
+
+(Test) Tj
+
+Now that looks like text but actually it isn't. Each of those bytes is
+a 'character code'. When it comes to rendering the text a complex
+sequence of events takes place, which converts the character code into
+'something' which the font understands. It's entirely possible via
+character mappings to have that text render as 'Sftu'.
+
+For simple fonts (PostScript type 1), we use the character code as the
+index into an Encoding array (256 elements), each element of which is
+a glyph name, so this gives us a glyph name. We then consult the
+CharStrings dictionary in the font, that's a complex object which
+contains pairs of keys and values, you can use the key to retrieve a
+given value. So we have a glyph name, we then use that as the key to
+the dictionary and retrieve the associated value. For a type 1 font,
+the value is a glyph program that describes how to draw the glyph.
+
+For CIDFonts, it's a little more complicated. Because CIDFonts can be
+large, using a glyph name as the key is unreasonable (it would also
+lead to unfeasibly large Encoding arrays), so instead we use a 'CID'
+as the key. CIDs are just numbers.
+
+But.... We don't use the character code as the CID. What we do is use
+a CMap to convert the character code into a CID. We then use the CID
+to key the CharStrings dictionary and proceed as before. So the 'CMap'
+is the equivalent of the Encoding array, but it's a more compact and
+flexible representation.
+
+Note that you have to use the CMap just to find out how many bytes
+constitute a character code, and it can be variable. For example you
+can say if the first byte is 0x00->0x7f then it's just one byte, if it's
+0x80->0xf0 then it's 2 bytes and if it's 0xf0->0xff then it's 3 bytes. I
+have seen CMaps defining character codes up to 5 bytes wide.
+
+Now that's fine for 'PostScript' CIDFonts, but it's not sufficient for
+TrueType CIDFonts. The thing is that TrueType fonts are accessed using
+a Glyph ID (GID) (and the LOCA table) which may well not be anything
+like the CID. So for this case PDF includes a CIDToGIDMap. That maps
+the CIDs to GIDs, and we can then use the GID to get the glyph
+description from the GLYF table of the font.
+
+So for a TrueType CIDFont, character-code->CID->GID->glyf-program.
+
+Looking at the PDF file I was supplied with we see that it contains
+text like :
+
+<0x0075> Tj
+
+So we start by taking the character code (117) and look it up in the
+CMap. Well you don't supply a CMap, you just use the Identity-H one
+which is predefined. So character code 117 maps to CID 117. Then we
+use the CIDToGIDMap, again you don't supply one, you just use the
+predefined 'Identity' map. So CID 117 maps to GID 117. But the font we
+were supplied with only contains 116 glyphs.
+
+Now for Latin that's not a huge problem, you can just supply a bigger
+font. But for more complex languages that *is* going to be more of a
+problem. Either you need to supply a font which contains glyphs for
+all the possible CID->GID mappings, or we need to think laterally.
+
+Our solution using a TrueType CIDFont is to intervene at the
+CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a
+font with just one glyph, the .notdef glyph at GID 0. This is what I'm
+looking into now.
+
+It would also be possible to have a 'PostScript' (ie type 1 outlines)
+CIDFont which contained 1 glyph, and a CMap which mapped all character
+codes to CID 0. The effect would be the same.
+
+It's possible (I haven't checked) that the PostScript CIDFont and
+associated CMap would be smaller than the TrueType font and associated
+CIDToGIDMap.
+
+--- in a followup ---
+
+OK there is a small problem there, if I use GID 0 then Acrobat gets
+upset about it and complains it cannot extract the font. If I set the
+CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
+mad......
+
+*/
+
+namespace tesseract {
+
+// Ratio of font size to nominal character width: if the font is 10 pts,
+// the nominal character width is 5 pts. Used in this file to derive the
+// default glyph width (/DW) and FontBBox width as 1000 / kCharWidth, and
+// to scale the horizontal stretch (Tz) of each word.
+static const int kCharWidth = 2;
+
+// Used for memory allocation. A codepoint must take no more than this
+// many bytes, when written in the PDF way. e.g. "<0063>" for the
+// letter 'c'. This is the size of the buffer that CodepointToUtf16be()
+// fills via snprintf.
+static const int kMaxBytesPerCodepoint = 20;
+
+/**********************************************************************
+ * PDF Renderer interface implementation
+ **********************************************************************/
+// Sets up a renderer that writes "<outputbase>.pdf".
+//   datadir:  directory that is searched for pdf.ttf when the document
+//             preamble is written.
+//   textonly: when true, pages reference no image, only the text layer.
+TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
+    : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
+  textonly_ = textonly;
+  // No objects written yet; the first one starts at byte offset 0.
+  obj_ = 0;
+  offsets_.push_back(0);
+}
+
+// Bookkeeping for an object whose bytes the caller has already written
+// itself: records the running offset after adding objectsize and bumps
+// the object counter.
+void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) {
+  const auto next_offset = objectsize + offsets_.back();
+  offsets_.push_back(next_offset);
+  ++obj_;
+}
+
+// Writes a complete, NUL-terminated object to the output and records its
+// size in the offset table.
+void TessPDFRenderer::AppendPDFObject(const char *data) {
+  const size_t data_len = strlen(data);
+  AppendPDFObjectDIY(data_len);
+  AppendString(data);
+}
+
+// Rounds x to three decimal places. This keeps us from accidentally
+// writing scientific notation to an HOCR or PDF file, and three decimal
+// places are all that is really needed. A result of negative zero is
+// normalised to positive zero.
+static double prec(double x) {
+  constexpr double kScale = 1000.0;
+  const double rounded = std::round(x * kScale) / kScale;
+  // Comparing against zero is true for both +0.0 and -0.0, so either one
+  // comes back as +0.0.
+  return rounded == 0.0 ? 0.0 : rounded;
+}
+
+// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
+// The differences are widened to long before they are squared, so the
+// multiplication is not carried out in int (which overflows once a
+// difference exceeds about 46340). On platforms where long is only 32
+// bits wide this has no effect.
+static long dist2(int x1, int y1, int x2, int y2) {
+  const long dx = static_cast<long>(x2) - x1;
+  const long dy = static_cast<long>(y2) - y1;
+  return dx * dx + dy * dy;
+}
+
+// Viewers like evince can get really confused during copy-paste when
+// the baseline wanders around, so every word is projected onto the
+// (straight) baseline of its line. Outputs are in the native PDF
+// coordinate system: origin at the bottom left, unit is the point
+// (1/72 inch). Tesseract reports baselines left-to-right no matter what
+// the reading order is; we need the word baseline in reading order, so
+// that conversion happens here.
+//
+// Inputs are the word and line baseline end points, converted with ppi
+// and flipped vertically against height. On return *x0 / *y0 hold the
+// word's baseline origin and *length holds the word's baseline length.
+static void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1,
+                            int word_x2, int word_y2, int line_x1, int line_y1, int line_x2,
+                            int line_y2, double *x0, double *y0, double *length) {
+  // For right-to-left text the word starts at its right-hand end.
+  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
+    std::swap(word_x1, word_x2);
+    std::swap(word_y1, word_y2);
+  }
+
+  const int line_dx = line_x2 - line_x1;
+  const int line_dy = line_y2 - line_y1;
+  const double line_len2 = dist2(line_x1, line_y1, line_x2, line_y2);
+
+  // Degenerate (zero-length) line: fall back to its start point.
+  double proj_x = line_x1;
+  double proj_y = line_y1;
+  if (line_len2 != 0) {
+    // Project the word origin onto the line, parameterised from its end point.
+    const double t = ((word_x1 - line_x2) * line_dx + (word_y1 - line_y2) * line_dy) / line_len2;
+    proj_x = line_x2 + t * line_dx;
+    proj_y = line_y2 + t * line_dy;
+  }
+
+  const long word_len2 = dist2(word_x1, word_y1, word_x2, word_y2);
+  const double pixel_length = sqrt(static_cast<double>(word_len2));
+
+  // Convert to points and flip the y axis.
+  *x0 = proj_x * 72 / ppi;
+  *y0 = height - (proj_y * 72.0 / ppi);
+  *length = pixel_length * 72.0 / ppi;
+}
+
+// Compute coefficients for an affine matrix describing the rotation
+// of the text. If the text is right-to-left such as Arabic or Hebrew,
+// we reflect over the Y-axis. This matrix will set the coordinate
+// system for placing text in the PDF file.
+//
+//                           RTL
+// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
+// [ y' ]   [ c d ][ y ]   [ 0 1 ] [-sin cos ][ y ]
+//
+// The angle is taken from the line baseline (line_x1, line_y1) ->
+// (line_x2, line_y2).
+static void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2,
+                         double *a, double *b, double *c, double *d) {
+  const double rise = static_cast<double>(line_y1 - line_y2);
+  const double run = static_cast<double>(line_x2 - line_x1);
+  const double angle = atan2(rise, run);
+  const double cos_t = cos(angle);
+  const double sin_t = sin(angle);
+
+  // Only right-to-left text alters the plain rotation: its first row is
+  // negated. Top-to-bottom text is left as is for now.
+  // TODO(jbreiden) Consider using the vertical PDF writing mode.
+  const bool right_to_left = writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT;
+  *a = right_to_left ? -cos_t : cos_t;
+  *b = right_to_left ? -sin_t : sin_t;
+  *c = -sin_t;
+  *d = cos_t;
+}
+
+// There are some really awkward PDF viewers in the wild, such as
+// 'Preview' which ships with the Mac. They do a better job with text
+// selection and highlighting when given a perfectly flat baseline
+// instead of a very slightly tilted one. We clip small tilts to appease
+// these viewers. The threshold was chosen large enough to absorb noise,
+// but small enough that lines probably won't cross each other if the
+// whole page is tilted at almost exactly the clipping threshold.
+//
+// Copies (x1, y1)-(x2, y2) to the line_* outputs; when the baseline is
+// nearly flat both output y values are replaced by the midpoint of y1
+// and y2.
+static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,
+                         int *line_x2, int *line_y2) {
+  const int rise = abs(y2 - y1) * 72;
+  const int run = abs(x2 - x1) * 72;
+  const int threshold = 2 * ppi;
+  const bool nearly_flat = rise < threshold && threshold < run;
+
+  *line_x1 = x1;
+  *line_x2 = x2;
+  if (nearly_flat) {
+    const int mid_y = (y1 + y2) / 2;
+    *line_y1 = mid_y;
+    *line_y2 = mid_y;
+  } else {
+    *line_y1 = y1;
+    *line_y2 = y2;
+  }
+}
+
+// Writes the UTF-16BE encoding of the Unicode code point `code` into
+// utf16 as upper-case hexadecimal text: four hex digits for a code point
+// below 0x10000, eight (high surrogate then low surrogate) otherwise.
+//
+// Returns false, after logging, for values that are not valid code
+// points: negative values, the surrogate range 0xD800-0xDFFF, and
+// anything above 0x10FFFF. In that case utf16 is left untouched.
+static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
+  const bool is_surrogate = code > 0xD7FF && code < 0xE000;
+  // A negative value would otherwise be printed by "%04X" as eight hex
+  // digits and corrupt the hex string being assembled by the caller.
+  if (code < 0 || is_surrogate || code > 0x10FFFF) {
+    tprintf("Dropping invalid codepoint %d\n", code);
+    return false;
+  }
+  if (code < 0x10000) {
+    snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);
+  } else {
+    // Split the 20 bits above the BMP into a surrogate pair.
+    const int offset = code - 0x010000;
+    const int high_surrogate = (0x03FF & (offset >> 10)) + 0xD800;
+    const int low_surrogate = (0x03FF & offset) + 0xDC00;
+    snprintf(utf16, kMaxBytesPerCodepoint, "%04X%04X", high_surrogate, low_surrogate);
+  }
+  return true;
+}
+
+// Builds the content stream of one page as PDF operator text.
+//
+// The stream starts with a "q <width> 0 0 <height> 0 0 cm [/Im1 Do] Q"
+// line; the image XObject /Im1 is only painted when textonly_ is false.
+// The OCR results are then walked word by word. Each block becomes one
+// BT ... ET text object in text rendering mode 3 ("invisible ink"), and
+// each word is placed on the clipped baseline of its line, either
+// absolutely (Tm) or relative to the previous word (Td), and stretched
+// horizontally (Tz) to the measured word length.
+//
+// width and height come from the caller; height is also handed to
+// GetWordBaseline(), which uses it to flip the y coordinate.
+//
+// Returns a NUL-terminated buffer allocated with new char[]; the caller
+// is expected to release it.
+//
+// NOTE(review): res_it is dereferenced without a null check - confirm
+// that api->GetIterator() cannot return nullptr on this path.
+// NOTE(review): ppi and height are doubles here, but ClipBaseline() and
+// GetWordBaseline() declare them as int, so they are implicitly
+// truncated at the call sites - confirm this is intended.
+char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
+  double ppi = api->GetSourceYResolution();
+
+  // These initial conditions are all arbitrary and will be overwritten
+  double old_x = 0.0, old_y = 0.0;
+  int old_fontsize = 0;
+  tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
+  bool new_block = true;
+  int fontsize = 0;
+  double a = 1;
+  double b = 0;
+  double c = 0;
+  double d = 1;
+
+  std::stringstream pdf_str;
+  // Use "C" locale (needed for double values prec()).
+  pdf_str.imbue(std::locale::classic());
+  // Use 8 digits for double values.
+  pdf_str.precision(8);
+
+  // TODO(jbreiden) This marries the text and image together.
+  // Slightly cleaner from an abstraction standpoint if this were to
+  // live inside a separate text object.
+  pdf_str << "q " << prec(width) << " 0 0 " << prec(height) << " 0 0 cm";
+  if (!textonly_) {
+    pdf_str << " /Im1 Do";
+  }
+  pdf_str << " Q\n";
+
+  // Clipped baseline of the current text line; refreshed at every line start.
+  int line_x1 = 0;
+  int line_y1 = 0;
+  int line_x2 = 0;
+  int line_y2 = 0;
+
+  const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator());
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
+      old_fontsize = 0;      // Every block will declare its fontsize
+      new_block = true;      // Every block will declare its affine matrix
+    }
+
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      int x1, y1, x2, y2;
+      res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
+      ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
+    }
+
+    // Nothing to emit for an empty word; move on to the next one.
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    // Writing direction changes at a per-word granularity
+    tesseract::WritingDirection writing_direction;
+    {
+      tesseract::Orientation orientation;
+      tesseract::TextlineOrder textline_order;
+      float deskew_angle;
+      res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
+      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
+        switch (res_it->WordDirection()) {
+          case DIR_LEFT_TO_RIGHT:
+            writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
+            break;
+          case DIR_RIGHT_TO_LEFT:
+            writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
+            break;
+          default:
+            // Any other word direction: keep the previous word's direction.
+            writing_direction = old_writing_direction;
+        }
+      }
+    }
+
+    // Where is word origin and how long is it?
+    double x, y, word_length;
+    {
+      int word_x1, word_y1, word_x2, word_y2;
+      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
+      GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
+                      line_y1, line_x2, line_y2, &x, &y, &word_length);
+    }
+
+    if (writing_direction != old_writing_direction || new_block) {
+      AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
+      pdf_str << " " << prec(a) // . This affine matrix
+              << " " << prec(b) // . sets the coordinate
+              << " " << prec(c) // . system for all
+              << " " << prec(d) // . text that follows.
+              << " " << prec(x) // .
+              << " " << prec(y) // .
+              << (" Tm ");      // Place cursor absolutely
+      new_block = false;
+    } else {
+      // Same matrix as the previous word: express the move from the
+      // previous word origin using the current matrix coefficients.
+      double dx = x - old_x;
+      double dy = y - old_y;
+      pdf_str << " " << prec(dx * a + dy * b) << " " << prec(dx * c + dy * d)
+              << (" Td "); // Relative moveto
+    }
+    old_x = x;
+    old_y = y;
+    old_writing_direction = writing_direction;
+
+    // Adjust font size on a per word granularity. Pay attention to
+    // fontsize, old_fontsize, and pdf_str. We've found that for
+    // Arabic, Tesseract will happily return a fontsize of zero,
+    // so we make up a default number to protect ourselves.
+    {
+      bool bold, italic, underlined, monospace, serif, smallcaps;
+      int font_id;
+      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
+                                 &fontsize, &font_id);
+      const int kDefaultFontsize = 8;
+      if (fontsize <= 0) {
+        fontsize = kDefaultFontsize;
+      }
+      if (fontsize != old_fontsize) {
+        pdf_str << "/f-0-0 " << fontsize << " Tf ";
+        old_fontsize = fontsize;
+      }
+    }
+
+    // Captured before the symbol loop below advances the iterator past
+    // this word.
+    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
+    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
+    std::string pdf_word;
+    int pdf_word_len = 0; // number of code points appended to pdf_word
+    do {
+      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
+      if (grapheme && grapheme[0] != '\0') {
+        std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
+        char utf16[kMaxBytesPerCodepoint];
+        for (char32 code : unicodes) {
+          if (CodepointToUtf16be(code, utf16)) {
+            pdf_word += utf16;
+            pdf_word_len++;
+          }
+        }
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+    // Another word follows: append a space (U+0020) to this one.
+    if (res_it->IsAtBeginningOf(RIL_WORD)) {
+      pdf_word += "0020";
+      pdf_word_len++;
+    }
+    if (word_length > 0 && pdf_word_len > 0) {
+      double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
+      pdf_str << h_stretch << " Tz" // horizontal stretch
+              << " [ <" << pdf_word // UTF-16BE representation
+              << "> ] TJ";          // show the text
+    }
+    if (last_word_in_line) {
+      pdf_str << " \n";
+    }
+    if (last_word_in_block) {
+      pdf_str << "ET\n"; // end the text object
+    }
+  }
+  // Hand the result back as a heap-allocated C string.
+  const std::string &text = pdf_str.str();
+  char *result = new char[text.length() + 1];
+  strcpy(result, text.c_str());
+  return result;
+}
+
+// Writes the fixed preamble of the PDF file: the "%PDF-1.5" header
+// followed by objects 1-8 (catalog, the reserved slot for /Pages, the
+// Type0 font, the CIDFontType2 font, the CIDToGIDMap stream, the
+// ToUnicode CMap, the font descriptor and the embedded font file).
+// The size of every piece is recorded through AppendPDFObject() or
+// AppendPDFObjectDIY().
+//
+// The font is read from "<datadir_>/pdf.ttf"; when that file is missing
+// or empty the built-in pdf_ttf array is embedded instead.
+//
+// Returns false if the CIDToGIDMap cannot be compressed, true otherwise.
+bool TessPDFRenderer::BeginDocumentHandler() {
+  AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
+
+  // CATALOG
+  AppendPDFObject(
+      "1 0 obj\n"
+      "<<\n"
+      "  /Type /Catalog\n"
+      "  /Pages 2 0 R\n"
+      ">>\nendobj\n");
+
+  // We are reserving object #2 for the /Pages
+  // object, which I am going to create and write
+  // at the end of the PDF file.
+  AppendPDFObject("");
+
+  // TYPE0 FONT
+  AppendPDFObject(
+      "3 0 obj\n"
+      "<<\n"
+      "  /BaseFont /GlyphLessFont\n"
+      "  /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
+      "  /Encoding /Identity-H\n"
+      "  /Subtype /Type0\n"
+      "  /ToUnicode 6 0 R\n" // ToUnicode
+      "  /Type /Font\n"
+      ">>\n"
+      "endobj\n");
+
+  // CIDFONTTYPE2
+  std::stringstream stream;
+  // Use "C" locale (needed for int values larger than 999).
+  stream.imbue(std::locale::classic());
+  stream << "4 0 obj\n"
+            "<<\n"
+            "  /BaseFont /GlyphLessFont\n"
+            "  /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
+            "  /CIDSystemInfo\n"
+            "  <<\n"
+            "     /Ordering (Identity)\n"
+            "     /Registry (Adobe)\n"
+            "     /Supplement 0\n"
+            "  >>\n"
+            "  /FontDescriptor 7 0 R\n" // Font descriptor
+            "  /Subtype /CIDFontType2\n"
+            "  /Type /Font\n"
+            "  /DW "
+         << (1000 / kCharWidth)
+         << "\n"
+            ">>\n"
+            "endobj\n";
+  AppendPDFObject(stream.str().c_str());
+
+  // CIDTOGIDMAP
+  // Two bytes per CID for all 65536 CIDs; every entry is the big-endian
+  // value 1, i.e. every CID maps to GID 1 (see the design notes above).
+  const int kCIDToGIDMapSize = 2 * (1 << 16);
+  const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
+  for (int i = 0; i < kCIDToGIDMapSize; i++) {
+    cidtogidmap[i] = (i % 2) ? 1 : 0;
+  }
+  size_t len = 0;
+  unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
+  if (comp == nullptr) {
+    // Without the compressed map neither the stream data nor its
+    // /Length can be written, so give up before emitting object 5.
+    tprintf("Error: unable to compress the CIDToGIDMap for PDF output.\n");
+    return false;
+  }
+  stream.str("");
+  stream << "5 0 obj\n"
+            "<<\n"
+            "  /Length "
+         << len
+         << " /Filter /FlateDecode\n"
+            ">>\n"
+            "stream\n";
+  // The object is written in three pieces (header, binary data, trailer),
+  // so its total size is accumulated by hand.
+  AppendString(stream.str().c_str());
+  long objsize = stream.str().size();
+  AppendData(reinterpret_cast<char *>(comp), len);
+  objsize += len;
+  lept_free(comp);
+  const char *endstream_endobj =
+      "endstream\n"
+      "endobj\n";
+  AppendString(endstream_endobj);
+  objsize += strlen(endstream_endobj);
+  AppendPDFObjectDIY(objsize);
+
+  const char stream2[] =
+      "/CIDInit /ProcSet findresource begin\n"
+      "12 dict begin\n"
+      "begincmap\n"
+      "/CIDSystemInfo\n"
+      "<<\n"
+      "  /Registry (Adobe)\n"
+      "  /Ordering (UCS)\n"
+      "  /Supplement 0\n"
+      ">> def\n"
+      "/CMapName /Adobe-Identify-UCS def\n"
+      "/CMapType 2 def\n"
+      "1 begincodespacerange\n"
+      "<0000> <FFFF>\n"
+      "endcodespacerange\n"
+      "1 beginbfrange\n"
+      "<0000> <FFFF> <0000>\n"
+      "endbfrange\n"
+      "endcmap\n"
+      "CMapName currentdict /CMap defineresource pop\n"
+      "end\n"
+      "end\n";
+
+  // TOUNICODE
+  stream.str("");
+  stream << "6 0 obj\n"
+            "<< /Length "
+         << (sizeof(stream2) - 1) // exclude the terminating NUL
+         << " >>\n"
+            "stream\n"
+         << stream2
+         << "endstream\n"
+            "endobj\n";
+  AppendPDFObject(stream.str().c_str());
+
+  // FONT DESCRIPTOR
+  stream.str("");
+  stream << "7 0 obj\n"
+            "<<\n"
+            "  /Ascent 1000\n"
+            "  /CapHeight 1000\n"
+            "  /Descent -1\n" // Spec says must be negative
+            "  /Flags 5\n"    // FixedPitch + Symbolic
+            "  /FontBBox  [ 0 0 "
+         << (1000 / kCharWidth)
+         << " 1000 ]\n"
+            "  /FontFile2 8 0 R\n"
+            "  /FontName /GlyphLessFont\n"
+            "  /ItalicAngle 0\n"
+            "  /StemV 80\n"
+            "  /Type /FontDescriptor\n"
+            ">>\n"
+            "endobj\n";
+  AppendPDFObject(stream.str().c_str());
+
+  // Prefer pdf.ttf from the data directory; fall back to the built-in
+  // copy when the file cannot be read or is empty.
+  stream.str("");
+  stream << datadir_.c_str() << "/pdf.ttf";
+  const uint8_t *font;
+  std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
+  std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
+  auto size = buffer.size();
+  if (size) {
+    font = buffer.data();
+  } else {
+#if !defined(NDEBUG)
+    tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
+#endif
+    font = pdf_ttf;
+    size = sizeof(pdf_ttf);
+  }
+
+  // FONTFILE2
+  stream.str("");
+  stream << "8 0 obj\n"
+            "<<\n"
+            "  /Length "
+         << size
+         << "\n"
+            "  /Length1 "
+         << size
+         << "\n"
+            ">>\n"
+            "stream\n";
+  AppendString(stream.str().c_str());
+  objsize = stream.str().size();
+  AppendData(reinterpret_cast<const char *>(font), size);
+  objsize += size;
+  AppendString(endstream_endobj);
+  objsize += strlen(endstream_endobj);
+  AppendPDFObjectDIY(objsize);
+  return true;
+}
+
+// Serializes an image as a PDF image XObject.
+//
+//   pix, filename:   the image to embed; at least one must be non-null.
+//   objnum:          object number written in the "<objnum> 0 obj" header.
+//   pdf_object,
+//   pdf_object_size: outputs. Both are reset (nullptr / 0) before any
+//                    work is done. On success *pdf_object points to a
+//                    new char[] buffer of exactly *pdf_object_size bytes
+//                    (binary data, no terminating NUL is written).
+//   jpg_quality:     forwarded to l_generateCIDataForPdf().
+//
+// Returns false if an output pointer is null, if no image source is
+// given, if the compressed image data cannot be generated, or if the
+// encoding type or samples-per-pixel value is not one handled below.
+bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum,
+                                    char **pdf_object, long int *pdf_object_size,
+                                    const int jpg_quality) {
+  if (!pdf_object_size || !pdf_object) {
+    return false;
+  }
+  *pdf_object = nullptr;
+  *pdf_object_size = 0;
+  if (!filename && !pix) {
+    return false;
+  }
+
+  L_Compressed_Data *cid = nullptr;
+
+  // Non-zero means the generator reported a failure.
+  int sad = 0;
+  // PNG input is flate-encoded directly from the pix; everything else
+  // (or a failed attempt) goes through l_generateCIDataForPdf().
+  if (pixGetInputFormat(pix) == IFF_PNG) {
+    sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
+  }
+  if (!cid) {
+    sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
+  }
+
+  if (sad || !cid) {
+    l_CIDataDestroy(&cid);
+    return false;
+  }
+
+  // Map the encoding chosen for the data to the PDF /Filter name; group4
+  // carries the extra /DecodeParms entry used only for CCITT G4 data.
+  const char *group4 = "";
+  const char *filter;
+  switch (cid->type) {
+    case L_FLATE_ENCODE:
+      filter = "/FlateDecode";
+      break;
+    case L_JPEG_ENCODE:
+      filter = "/DCTDecode";
+      break;
+    case L_G4_ENCODE:
+      filter = "/CCITTFaxDecode";
+      group4 = "    /K -1\n";
+      break;
+    case L_JP2K_ENCODE:
+      filter = "/JPXDecode";
+      break;
+    default:
+      l_CIDataDestroy(&cid);
+      return false;
+  }
+
+  // Maybe someday we will accept RGBA but today is not that day.
+  // It requires creating an /SMask for the alpha channel.
+  // http://stackoverflow.com/questions/14220221
+  std::stringstream colorspace;
+  // Use "C" locale (needed for int values larger than 999).
+  colorspace.imbue(std::locale::classic());
+  if (cid->ncolors > 0) {
+    // Colormapped image: indexed color space with the hex-encoded palette.
+    colorspace << "  /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) << " "
+               << cid->cmapdatahex << " ]\n";
+  } else {
+    switch (cid->spp) {
+      case 1:
+        // 1 bit per sample PNG input additionally gets an inverted /Decode array.
+        if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
+          colorspace.str(
+              "  /ColorSpace /DeviceGray\n"
+              "  /Decode [1 0]\n");
+        } else {
+          colorspace.str("  /ColorSpace /DeviceGray\n");
+        }
+        break;
+      case 3:
+        colorspace.str("  /ColorSpace /DeviceRGB\n");
+        break;
+      default:
+        l_CIDataDestroy(&cid);
+        return false;
+    }
+  }
+
+  // /Predictor value written to /DecodeParms: 14 when cid->predictor is
+  // set, otherwise 1.
+  int predictor = (cid->predictor) ? 14 : 1;
+
+  // IMAGE
+  // The object is assembled from four text/binary pieces:
+  //   b1 (header) + colorspace + b2 (rest of the dictionary) +
+  //   compressed data + b3 (trailer).
+  std::stringstream b1;
+  // Use "C" locale (needed for int values larger than 999).
+  b1.imbue(std::locale::classic());
+  b1 << objnum
+     << " 0 obj\n"
+        "<<\n"
+        "  /Length "
+     << cid->nbytescomp
+     << "\n"
+        "  /Subtype /Image\n";
+
+  std::stringstream b2;
+  // Use "C" locale (needed for int values larger than 999).
+  b2.imbue(std::locale::classic());
+  b2 << "  /Width " << cid->w
+     << "\n"
+        "  /Height "
+     << cid->h
+     << "\n"
+        "  /BitsPerComponent "
+     << cid->bps
+     << "\n"
+        "  /Filter "
+     << filter
+     << "\n"
+        "  /DecodeParms\n"
+        "  <<\n"
+        "    /Predictor "
+     << predictor
+     << "\n"
+        "    /Colors "
+     << cid->spp << "\n"
+     << group4 << "    /Columns " << cid->w
+     << "\n"
+        "    /BitsPerComponent "
+     << cid->bps
+     << "\n"
+        "  >>\n"
+        ">>\n"
+        "stream\n";
+
+  const char *b3 =
+      "endstream\n"
+      "endobj\n";
+
+  size_t b1_len = b1.str().size();
+  size_t b2_len = b2.str().size();
+  size_t b3_len = strlen(b3);
+  size_t colorspace_len = colorspace.str().size();
+
+  // Exact size of the object; the buffer is raw bytes, not a C string.
+  *pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
+  *pdf_object = new char[*pdf_object_size];
+
+  // Copy the pieces back to back into the output buffer.
+  char *p = *pdf_object;
+  memcpy(p, b1.str().c_str(), b1_len);
+  p += b1_len;
+  memcpy(p, colorspace.str().c_str(), colorspace_len);
+  p += colorspace_len;
+  memcpy(p, b2.str().c_str(), b2_len);
+  p += b2_len;
+  memcpy(p, cid->datacomp, cid->nbytescomp);
+  p += cid->nbytescomp;
+  memcpy(p, b3, b3_len);
+  l_CIDataDestroy(&cid);
+  return true;
+}
+
+// Writes one page of the PDF for the image currently held by api: a /Page
+// object, a Flate-compressed /Contents stream with the text layer and,
+// unless textonly_ is set, the image itself as an XObject.
+// Returns false when there is no input image or no positive resolution,
+// when the text stream cannot be compressed, or when the image cannot be
+// converted to a PDF object.
+bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
+  Pix *pix = api->GetInputImage();
+  const char *filename = api->GetInputName();
+  int ppi = api->GetSourceYResolution();
+  if (!pix || ppi <= 0) {
+    return false;
+  }
+  // Page size expressed in units of 1/72 inch.
+  double width = pixGetWidth(pix) * 72.0 / ppi;
+  double height = pixGetHeight(pix) * 72.0 / ppi;
+
+  std::stringstream xobject;
+  // Use "C" locale (needed for int values larger than 999).
+  xobject.imbue(std::locale::classic());
+  if (!textonly_) {
+    xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
+  }
+
+  // PAGE
+  std::stringstream stream;
+  // Use "C" locale (needed for double values width and height).
+  stream.imbue(std::locale::classic());
+  stream.precision(2);
+  stream << std::fixed << obj_
+         << " 0 obj\n"
+            "<<\n"
+            "  /Type /Page\n"
+            "  /Parent 2 0 R\n" // Pages object
+            "  /MediaBox [0 0 "
+         << width << " " << height
+         << "]\n"
+            "  /Contents "
+         << (obj_ + 1)
+         << " 0 R\n" // Contents object
+            "  /Resources\n"
+            "  <<\n"
+            "    "
+         << xobject.str() << // Image object
+      "    /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
+      "    /Font << /f-0-0 3 0 R >>\n" // Type0 Font
+      "  >>\n"
+      ">>\n"
+      "endobj\n";
+  pages_.push_back(obj_);
+  AppendPDFObject(stream.str().c_str());
+
+  // CONTENTS
+  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
+  const size_t pdftext_len = strlen(pdftext.get());
+  size_t len = 0;
+  unsigned char *comp_pdftext =
+      zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
+  if (comp_pdftext == nullptr) {
+    // Compression failed: there is no buffer to write and len is not
+    // meaningful, so report the failure instead of writing garbage.
+    return false;
+  }
+  long comp_pdftext_len = len;
+  stream.str("");
+  stream << obj_
+         << " 0 obj\n"
+            "<<\n"
+            "  /Length "
+         << comp_pdftext_len
+         << " /Filter /FlateDecode\n"
+            ">>\n"
+            "stream\n";
+  AppendString(stream.str().c_str());
+  // objsize accumulates the byte count of the whole object (header, data
+  // and trailer) because the pieces are written separately.
+  long objsize = stream.str().size();
+  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
+  objsize += comp_pdftext_len;
+  lept_free(comp_pdftext);
+  const char *b2 =
+      "endstream\n"
+      "endobj\n";
+  AppendString(b2);
+  objsize += strlen(b2);
+  AppendPDFObjectDIY(objsize);
+
+  if (!textonly_) {
+    char *pdf_object = nullptr;
+    // Fallback used only if the variable lookup below does not write a
+    // value. NOTE(review): 85 is believed to be Tesseract's default for
+    // jpg_quality - confirm against the parameter definition.
+    int jpg_quality = 85;
+    api->GetIntVariable("jpg_quality", &jpg_quality);
+    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
+      return false;
+    }
+    AppendData(pdf_object, objsize);
+    AppendPDFObjectDIY(objsize);
+    delete[] pdf_object;
+  }
+  return true;
+}
+
+// Finishes the PDF: writes the /Pages object listing every page recorded in
+// pages_, the info dictionary (producer, creation date, title), the xref
+// table and the trailer. Always returns true.
+bool TessPDFRenderer::EndDocumentHandler() {
+  // We reserved the /Pages object number early, so that the /Page
+  // objects could refer to their parent. We finally have enough
+  // information to go fill it in. Using lower level calls to manipulate
+  // the offset record in two spots, because we are placing objects
+  // out of order in the file.
+
+  // PAGES
+  const long int kPagesObjectNumber = 2;
+  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
+  std::stringstream stream;
+  // Use "C" locale (needed for int values larger than 999).
+  stream.imbue(std::locale::classic());
+  stream << kPagesObjectNumber << " 0 obj\n<<\n  /Type /Pages\n  /Kids [ ";
+  AppendString(stream.str().c_str());
+  // Bytes written for the /Pages object are counted by hand because the
+  // object is emitted in several pieces.
+  size_t pages_objsize = stream.str().size();
+  for (const auto &page : pages_) {
+    stream.str("");
+    stream << page << " 0 R ";
+    AppendString(stream.str().c_str());
+    pages_objsize += stream.str().size();
+  }
+  stream.str("");
+  stream << "]\n  /Count " << pages_.size() << "\n>>\nendobj\n";
+  AppendString(stream.str().c_str());
+  pages_objsize += stream.str().size();
+  offsets_.back() += pages_objsize; // manipulation #2
+
+  // INFO
+  // The title is written as a hex string: "FEFF" followed by whatever
+  // CodepointToUtf16be produces for each code point of title().
+  std::string utf16_title = "FEFF"; // byte_order_marker
+  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
+  char utf16[kMaxBytesPerCodepoint];
+  for (char32 code : unicodes) {
+    // Code points the converter rejects are left out of the title.
+    if (CodepointToUtf16be(code, utf16)) {
+      utf16_title += utf16;
+    }
+  }
+
+  char *datestr = l_getFormattedDate();
+  stream.str("");
+  stream << obj_
+         << " 0 obj\n"
+            "<<\n"
+            "  /Producer (Tesseract "
+         << tesseract::TessBaseAPI::Version()
+         << ")\n"
+            "  /CreationDate (D:"
+         << datestr
+         << ")\n"
+            "  /Title <"
+         << utf16_title.c_str()
+         << ">\n"
+            ">>\n"
+            "endobj\n";
+  lept_free(datestr);
+  AppendPDFObject(stream.str().c_str());
+  // XREF: one entry for object 0 followed by one line per object, each
+  // holding that object's offset zero-padded to 10 digits.
+  stream.str("");
+  stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
+  AppendString(stream.str().c_str());
+  for (int i = 1; i < obj_; i++) {
+    stream.str("");
+    // width() only affects the next insertion, so it is set again on every
+    // iteration; the fill character stays set on the stream.
+    stream.width(10);
+    stream.fill('0');
+    stream << offsets_[i] << " 00000 n \n";
+    AppendString(stream.str().c_str());
+  }
+  // TRAILER: startxref is taken from offsets_.back().
+  stream.str("");
+  stream << "trailer\n<<\n  /Size " << obj_
+         << "\n"
+            "  /Root 1 0 R\n" // catalog
+            "  /Info "
+         << (obj_ - 1)
+         << " 0 R\n" // info
+            ">>\nstartxref\n"
+         << offsets_.back() << "\n%%EOF\n";
+  AppendString(stream.str().c_str());
+  return true;
+}
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/api/renderer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/renderer.cpp
@ -0,0 +1,241 @@
+///////////////////////////////////////////////////////////////////////
+// File:        renderer.cpp
+// Description: Rendering interface to inject into TessBaseAPI
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+#include <tesseract/baseapi.h>
+#include <tesseract/renderer.h>
+#include <cstring>
+#include <memory>     // std::unique_ptr
+#include <string>     // std::string
+#include "serialis.h" // Serialize
+
+namespace tesseract {
+
+/**********************************************************************
+ * Base Renderer interface implementation
+ **********************************************************************/
+// Prepares the output stream for a renderer. The names "-" and "stdout"
+// keep the default of writing to stdout; any other outputbase is opened as
+// "<outputbase>.<extension>" in binary write mode. If that open fails the
+// renderer is marked unhappy.
+TessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)
+    : file_extension_(extension)
+    , title_("")
+    , imagenum_(-1)
+    , fout_(stdout)
+    , next_(nullptr)
+    , happy_(true) {
+  const bool use_stdout = strcmp(outputbase, "-") == 0 || strcmp(outputbase, "stdout") == 0;
+  if (use_stdout) {
+    return;
+  }
+  const std::string path = std::string(outputbase) + "." + extension;
+  fout_ = fopen(path.c_str(), "wb");
+  happy_ = (fout_ != nullptr);
+}
+
+// Releases the output stream and the chained renderer. A file opened by
+// this renderer is closed; stdout is left open and only has its error
+// state cleared.
+TessResultRenderer::~TessResultRenderer() {
+  if (fout_ == stdout) {
+    clearerr(fout_);
+  } else if (fout_ != nullptr) {
+    fclose(fout_);
+  }
+  delete next_;
+}
+
+// Splices the chain that starts at next in directly after this renderer.
+// Whatever previously followed this renderer is re-attached to the end of
+// the inserted chain. A null next is ignored.
+void TessResultRenderer::insert(TessResultRenderer *next) {
+  if (next == nullptr) {
+    return;
+  }
+
+  TessResultRenderer *former_successor = next_;
+  next_ = next;
+  if (former_successor == nullptr) {
+    return;
+  }
+  // Walk to the last renderer of the inserted chain.
+  TessResultRenderer *tail = next;
+  while (tail->next_ != nullptr) {
+    tail = tail->next_;
+  }
+  tail->next_ = former_successor;
+}
+
+// Starts a new document: stores the title, resets the image counter, runs
+// this renderer's BeginDocumentHandler and then forwards the call to the
+// next renderer in the chain, if any. Returns false straight away when the
+// renderer is unhappy; otherwise true only if every handler succeeded.
+bool TessResultRenderer::BeginDocument(const char *title) {
+  if (!happy_) {
+    return false;
+  }
+  title_ = title;
+  imagenum_ = -1;
+  const bool own_ok = BeginDocumentHandler();
+  if (next_ == nullptr) {
+    return own_ok;
+  }
+  // The chained renderer is always invoked, even if our handler failed.
+  const bool chained_ok = next_->BeginDocument(title);
+  return chained_ok && own_ok;
+}
+
+// Renders the image currently held by api: advances the image counter,
+// runs this renderer's AddImageHandler and then forwards the call to the
+// next renderer in the chain, if any. Returns false straight away when the
+// renderer is unhappy; otherwise true only if every handler succeeded.
+bool TessResultRenderer::AddImage(TessBaseAPI *api) {
+  if (!happy_) {
+    return false;
+  }
+  ++imagenum_;
+  const bool own_ok = AddImageHandler(api);
+  if (next_ == nullptr) {
+    return own_ok;
+  }
+  // The chained renderer is always invoked, even if our handler failed.
+  const bool chained_ok = next_->AddImage(api);
+  return chained_ok && own_ok;
+}
+
+// Finishes the document: runs this renderer's EndDocumentHandler and then
+// forwards the call to the next renderer in the chain, if any. Returns
+// false straight away when the renderer is unhappy; otherwise true only if
+// every handler succeeded.
+bool TessResultRenderer::EndDocument() {
+  if (!happy_) {
+    return false;
+  }
+  const bool own_ok = EndDocumentHandler();
+  if (next_ == nullptr) {
+    return own_ok;
+  }
+  // The chained renderer is always invoked, even if our handler failed.
+  const bool chained_ok = next_->EndDocument();
+  return chained_ok && own_ok;
+}
+
+// Writes the NUL-terminated string s (without its terminator) to the
+// output by delegating to AppendData.
+void TessResultRenderer::AppendString(const char *s) {
+  const size_t length = strlen(s);
+  AppendData(s, length);
+}
+
+// Writes len bytes starting at s to the output stream and flushes it.
+// A failed write marks the renderer unhappy.
+void TessResultRenderer::AppendData(const char *s, int len) {
+  const bool written = tesseract::Serialize(fout_, s, len);
+  if (!written) {
+    happy_ = false;
+  }
+  fflush(fout_);
+}
+
+// Default begin-of-document hook: writes nothing and simply reports the
+// renderer's current health.
+bool TessResultRenderer::BeginDocumentHandler() {
+  return happy_;
+}
+
+// Default end-of-document hook: writes nothing and simply reports the
+// renderer's current health.
+bool TessResultRenderer::EndDocumentHandler() {
+  return happy_;
+}
+
+/**********************************************************************
+ * UTF8 Text Renderer interface implementation
+ **********************************************************************/
+// Creates a plain-text renderer whose output uses the "txt" extension.
+TessTextRenderer::TessTextRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "txt") {
+}
+
+// Appends the UTF-8 text of the current page, followed by the value of the
+// "page_separator" variable when that is set and non-empty. Returns false
+// if the API yields no text.
+bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_text(api->GetUTF8Text());
+  if (!page_text) {
+    return false;
+  }
+  AppendString(page_text.get());
+
+  const char *separator = api->GetStringVariable("page_separator");
+  const bool has_separator = separator != nullptr && separator[0] != '\0';
+  if (has_separator) {
+    AppendString(separator);
+  }
+  return true;
+}
+
+/**********************************************************************
+ * TSV Text Renderer interface implementation
+ **********************************************************************/
+// Creates a TSV renderer (extension "tsv") with font_info_ set to false.
+TessTsvRenderer::TessTsvRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "tsv"), font_info_(false) {
+}
+
+// Creates a TSV renderer (extension "tsv") and records the caller's
+// font_info setting in font_info_.
+TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
+    : TessResultRenderer(outputbase, "tsv"), font_info_(font_info) {
+}
+
+// Writes the tab-separated column heading row at the start of the document.
+// Always returns true.
+bool TessTsvRenderer::BeginDocumentHandler() {
+  const char *column_headings =
+      "level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
+      "num\tleft\ttop\twidth\theight\tconf\ttext\n";
+  AppendString(column_headings);
+  return true;
+}
+
+// Nothing is written at the end of a TSV document. Always returns true.
+bool TessTsvRenderer::EndDocumentHandler() {
+  return true;
+}
+
+// Appends the TSV text of the current page, passing the current image
+// number to the API. Returns false if the API yields no text.
+bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_tsv(api->GetTSVText(imagenum()));
+  if (!page_tsv) {
+    return false;
+  }
+  AppendString(page_tsv.get());
+  return true;
+}
+
+/**********************************************************************
+ * UNLV Text Renderer interface implementation
+ **********************************************************************/
+// Creates a UNLV renderer whose output uses the "unlv" extension.
+TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "unlv") {
+}
+
+// Appends the UNLV-format text of the current page. Returns false if the
+// API yields no text.
+bool TessUnlvRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_unlv(api->GetUNLVText());
+  if (!page_unlv) {
+    return false;
+  }
+  AppendString(page_unlv.get());
+  return true;
+}
+
+/**********************************************************************
+ * BoxText Renderer interface implementation
+ **********************************************************************/
+// Creates a box-text renderer whose output uses the "box" extension.
+TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "box") {
+}
+
+// Appends the box-file text of the current page, passing the current image
+// number to the API. Returns false if the API yields no text.
+bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_boxes(api->GetBoxText(imagenum()));
+  if (!page_boxes) {
+    return false;
+  }
+  AppendString(page_boxes.get());
+  return true;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+/**********************************************************************
+ * Osd Text Renderer interface implementation
+ **********************************************************************/
+// Creates an OSD renderer whose output uses the "osd" extension.
+TessOsdRenderer::TessOsdRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "osd") {
+}
+
+// Appends the OSD text of the current page, passing the current image
+// number to the API. Returns false if the API yields no text.
+bool TessOsdRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_osd(api->GetOsdText(imagenum()));
+  if (!page_osd) {
+    return false;
+  }
+  AppendString(page_osd.get());
+  return true;
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/api/wordstrboxrenderer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/api/wordstrboxrenderer.cpp
@ -0,0 +1,106 @@
+/**********************************************************************
+ * File:        wordstrboxrenderer.cpp
+ * Description: Renderer for creating box file with WordStr strings.
+ *              based on the tsv renderer.
+ *
+ * (C) Copyright 2019, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <tesseract/baseapi.h> // for TessBaseAPI
+#include <tesseract/renderer.h>
+#include "tesseractclass.h" // for Tesseract
+
+namespace tesseract {
+
+/**
+ * Create a UTF8 box file with WordStr strings from the internal data
+ * structures. Every text line produces a "WordStr" row holding the line's
+ * bounding box, " #" and the words of the line, followed by a tab row that
+ * marks the end of the line. page_number is a 0-based page index that will
+ * appear in the box file. Returns nullptr when there is no engine, when
+ * recognition fails or when no result iterator is available. Returned
+ * string must be freed with the delete [] operator.
+ */
+
+char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+
+  std::string wordstr_box_str;
+  int left = 0, top = 0, right = 0, bottom = 0;
+
+  // Appends the tab row that closes a line, positioned just to the right of
+  // the bounding box of the most recently started line.
+  const auto append_eol_row = [&]() {
+    wordstr_box_str += "\n\t " + std::to_string(right + 1);
+    wordstr_box_str += " " + std::to_string(image_height_ - bottom);
+    wordstr_box_str += " " + std::to_string(right + 5);
+    wordstr_box_str += " " + std::to_string(image_height_ - top);
+    wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
+    wordstr_box_str += "\n";
+  };
+
+  // Stays true until the first WordStr row has been written.
+  bool first_line = true;
+
+  // Owned here so the iterator is released on every exit path.
+  const std::unique_ptr<LTRResultIterator> res_it(GetLTRIterator());
+  if (res_it == nullptr) {
+    return nullptr;
+  }
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      if (!first_line) {
+        // Close the previous line before starting the next one.
+        append_eol_row();
+      } else {
+        first_line = false;
+      }
+      // Use bounding box for whole line for WordStr
+      res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+      wordstr_box_str += "WordStr " + std::to_string(left);
+      wordstr_box_str += " " + std::to_string(image_height_ - bottom);
+      wordstr_box_str += " " + std::to_string(right);
+      wordstr_box_str += " " + std::to_string(image_height_ - top);
+      wordstr_box_str += " " + std::to_string(page_number); // word
+      wordstr_box_str += " #";
+    }
+    do {
+      const std::unique_ptr<const char[]> word(res_it->GetUTF8Text(RIL_WORD));
+      // Appending a null pointer to std::string is undefined behaviour.
+      if (word != nullptr) {
+        wordstr_box_str += word.get();
+      }
+      wordstr_box_str += " ";
+      res_it->Next(RIL_WORD);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+  }
+
+  // Close the last line. This is keyed on whether a line was written rather
+  // than on the box coordinates being non-zero, so that a final line which
+  // touches the left or top edge of the image (coordinate 0) still gets its
+  // end-of-line row.
+  if (!first_line) {
+    append_eol_row();
+  }
+  char *ret = new char[wordstr_box_str.length() + 1];
+  strcpy(ret, wordstr_box_str.c_str());
+  return ret;
+}
+
+/**********************************************************************
+ * WordStrBox Renderer interface implementation
+ **********************************************************************/
+// Creates a WordStr box renderer whose output uses the "box" extension.
+TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "box") {
+}
+
+// Appends the WordStr box text of the current page, passing the current
+// image number to the API. Returns false if the API yields no text.
+bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {
+  const std::unique_ptr<const char[]> page_boxes(api->GetWordStrBoxText(imagenum()));
+  if (!page_boxes) {
+    return false;
+  }
+  AppendString(page_boxes.get());
+  return true;
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.cpp
@ -0,0 +1,30 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproduct.cpp
+// Description: Native dot product function.
+//
+// (C) Copyright 2018, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "dotproduct.h"
+
+namespace tesseract {
+
+// Portable scalar dot product: returns the sum of u[k] * v[k] for the first
+// n elements, accumulated in index order. Returns 0.0 when n <= 0.
+double DotProductNative(const double *u, const double *v, int n) {
+  double sum = 0.0;
+  for (int remaining = n; remaining > 0; --remaining) {
+    sum += *u++ * *v++;
+  }
+  return sum;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.h
@ -0,0 +1,36 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproduct.h
+// Description: Native dot product function.
+//
+// (C) Copyright 2018, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
+#define TESSERACT_ARCH_DOTPRODUCT_H_
+
+namespace tesseract {
+
+// All functions below compute and return the dot product of the n-vectors
+// u and v; they differ only in the instruction set they use.
+
+// Plain scalar implementation.
+double DotProductNative(const double *u, const double *v, int n);
+
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+double DotProductAVX(const double *u, const double *v, int n);
+
+// Uses Intel FMA (fused multiply-add) intrinsics to access the SIMD
+// instruction set.
+double DotProductFMA(const double *u, const double *v, int n);
+
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+double DotProductSSE(const double *u, const double *v, int n);
+
+} // namespace tesseract.
+
+#endif // TESSERACT_ARCH_DOTPRODUCT_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/dotproductavx.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/dotproductavx.cpp
@ -0,0 +1,63 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproductavx.cpp
+// Description: Architecture-specific dot-product function.
+// Author:      Ray Smith
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__AVX__)
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for AVX capable architectures
+#  endif
+#else
+
+#  include <immintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel AVX intrinsics to access the SIMD instruction set.
+// Each loop iteration consumes 8 doubles from each vector as two groups of
+// 4, using unaligned loads; the n % 8 leftover elements are handled with
+// scalar arithmetic.
+double DotProductAVX(const double *u, const double *v, int n) {
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  // Two independent accumulators, one per group of 4 doubles.
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    f0 = _mm256_mul_pd(f0, f1);
+    t0 = _mm256_add_pd(t0, f0);
+    u += 4;
+    v += 4;
+    __m256d f2 = _mm256_loadu_pd(u);
+    __m256d f3 = _mm256_loadu_pd(v);
+    f2 = _mm256_mul_pd(f2, f3);
+    t1 = _mm256_add_pd(t1, f2);
+    u += 4;
+    v += 4;
+  }
+  // Combine both accumulators into one register, spill it to an aligned
+  // buffer and add up its four lanes.
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  // Add on the left-over products.
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+
+} // namespace tesseract.
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/dotproductfma.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/dotproductfma.cpp
@ -0,0 +1,61 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproductfma.cpp
+// Description: Architecture-specific dot-product function.
+// Author:      Stefan Weil
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__FMA__)
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for FMA capable architectures
+#  endif
+#else
+
+#  include <immintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel FMA intrinsics to access the SIMD instruction set.
+// Each loop iteration consumes 8 doubles from each vector as two groups of
+// 4, using unaligned loads and a fused multiply-add per group; the n % 8
+// leftover elements are handled with scalar arithmetic.
+double DotProductFMA(const double *u, const double *v, int n) {
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  // Two independent accumulators, one per group of 4 doubles.
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    t0 = _mm256_fmadd_pd(f0, f1, t0);
+    u += 4;
+    v += 4;
+    __m256d f2 = _mm256_loadu_pd(u);
+    __m256d f3 = _mm256_loadu_pd(v);
+    t1 = _mm256_fmadd_pd(f2, f3, t1);
+    u += 4;
+    v += 4;
+  }
+  // Combine both accumulators into one register, spill it to an aligned
+  // buffer and add up its four lanes.
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  // Add on the left-over products.
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+
+} // namespace tesseract.
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/dotproductsse.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/dotproductsse.cpp
@ -0,0 +1,84 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dotproductsse.cpp
+// Description: Architecture-specific dot-product function.
+// Author:      Ray Smith
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__SSE4_1__)
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for SSE 4.1 capable architectures
+#  endif
+#else
+
+#  include <emmintrin.h>
+#  include <smmintrin.h>
+#  include <cstdint>
+#  include "dotproduct.h"
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+// Elements are processed two at a time; aligned loads are used when both
+// u and v are 16-byte aligned, unaligned loads otherwise. A trailing odd
+// element (and the whole input when n < 2) is handled with scalar
+// arithmetic.
+double DotProductSSE(const double *u, const double *v, int n) {
+  // Largest offset at which a full pair can still be loaded.
+  int max_offset = n - 2;
+  int offset = 0;
+  // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
+  // v, and multiplying them together in parallel.
+  __m128d sum = _mm_setzero_pd();
+  if (offset <= max_offset) {
+    // The first pair is loaded and multiplied before the loop, so the loop
+    // starts at offset 2.
+    offset = 2;
+    // Aligned load is reputedly faster but requires 16 byte aligned input.
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+      // Use aligned load.
+      sum = _mm_load_pd(u);
+      __m128d floats2 = _mm_load_pd(v);
+      // Multiply.
+      sum = _mm_mul_pd(sum, floats2);
+      while (offset <= max_offset) {
+        __m128d floats1 = _mm_load_pd(u + offset);
+        floats2 = _mm_load_pd(v + offset);
+        offset += 2;
+        floats1 = _mm_mul_pd(floats1, floats2);
+        sum = _mm_add_pd(sum, floats1);
+      }
+    } else {
+      // Use unaligned load.
+      sum = _mm_loadu_pd(u);
+      __m128d floats2 = _mm_loadu_pd(v);
+      // Multiply.
+      sum = _mm_mul_pd(sum, floats2);
+      while (offset <= max_offset) {
+        __m128d floats1 = _mm_loadu_pd(u + offset);
+        floats2 = _mm_loadu_pd(v + offset);
+        offset += 2;
+        floats1 = _mm_mul_pd(floats1, floats2);
+        sum = _mm_add_pd(sum, floats1);
+      }
+    }
+  }
+  // Add the 2 sums in sum horizontally.
+  sum = _mm_hadd_pd(sum, sum);
+  // Extract the low result.
+  double result = _mm_cvtsd_f64(sum);
+  // Add on any left-over products.
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
+}
+
+} // namespace tesseract.
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.cpp
@ -0,0 +1,94 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrix.cpp
+// Description: Base class for 8-bit int SIMD matrix multipliers.
+// Author:      Ray Smith
+//
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "intsimdmatrix.h"
+#include "matrix.h"     // for GENERIC_2D_ARRAY
+#include "simddetect.h" // for SIMDDetect
+
+namespace tesseract {
+
+const IntSimdMatrix *IntSimdMatrix::intSimdMatrix = nullptr;
+
+// Computes a reshaped copy of the weight matrix w.
+// w has one row per output; its last column is the bias, so the number of
+// inputs is w.dim2() - 1. shaped_w is resized and filled with the weights
+// regrouped by register set; positions that fall outside w (padding up to
+// the rounded sizes) are written as 0. rounded_num_out receives the result
+// of RoundOutputs for the number of outputs.
+void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
+                         int32_t &rounded_num_out) const {
+  const int num_out = w.dim1();
+  const int num_in = w.dim2() - 1;
+  // The rounded-up sizes of the reshaped weight matrix, excluding biases.
+  int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
+  rounded_num_out = RoundOutputs(num_out);
+  // Add the bias and compute the required size.
+  shaped_w.resize((rounded_num_in + 1) * rounded_num_out, 0);
+  // Next position to write in shaped_w; only ever advances.
+  int shaped_index = 0;
+  // Index of the first output of the register set being written.
+  int output = 0;
+  // Each number of registers needs a different format! Iterates over the
+  // different numbers of registers (each a power of 2).
+  for (int num_registers = max_output_registers_; num_registers >= 1; num_registers /= 2) {
+    // The number of outputs that we will generate with this many registers.
+    int num_outputs_per_register_set = num_registers * num_outputs_per_register_;
+    // Use the max number of registers until we have to go fewer.
+    while (output + num_outputs_per_register_set <= rounded_num_out) {
+      // Accumulating outputs in registers saves iterating over the inputs, so
+      // we only have to do it once per output register set.
+      for (int input = 0; input < num_in; input += num_inputs_per_group_) {
+        // Iterate over the number of outputs in a register set.
+        for (int j = 0; j < num_outputs_per_register_set; ++j) {
+          // Inner-most loop corresponds to the number of inputs in an input
+          // group.
+          for (int i = 0; i < num_inputs_per_group_; ++i) {
+            // Padding rows and columns keep a weight of 0.
+            int8_t weight = 0;
+            if (output + j < num_out && input + i < num_in) {
+              weight = w(output + j, input + i);
+            }
+            shaped_w[shaped_index++] = weight;
+          }
+        }
+      }
+      // Append the bias weights for the register set.
+      for (int j = 0; j < num_outputs_per_register_set; ++j) {
+        int8_t weight = 0;
+        if (output + j < num_out) {
+          weight = w(output + j, num_in);
+        }
+        shaped_w[shaped_index++] = weight;
+      }
+      output += num_outputs_per_register_set;
+    }
+  }
+}
+
+// Reference (non-SIMD) matrix-vector product v = Wu.
+// w has one row per output and its last column holds the bias, so u must
+// provide w.dim2() - 1 values and v receives w.dim1() results. u is treated
+// as if it ended with an extra element that multiplies the bias; that
+// element is not actually stored and is applied here as INT8_MAX. Each
+// integer total is multiplied by the matching entry of scales.
+void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
+                                    const std::vector<double> &scales, const int8_t *u, double *v) {
+  const int output_count = w.dim1();
+  const int input_count = w.dim2() - 1;
+  for (int row = 0; row < output_count; ++row) {
+    const int8_t *weights = w[row];
+    int accumulated = 0;
+    for (int col = 0; col < input_count; ++col) {
+      accumulated += weights[col] * u[col];
+    }
+    // The bias weight sits right after the input weights of the row.
+    const int bias_term = weights[input_count] * INT8_MAX;
+    v[row] = (accumulated + bias_term) * scales[row];
+  }
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.h
@ -0,0 +1,123 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrix.h
+// Description: Base class for 8-bit int SIMD matrix multipliers.
+// Author:      Ray Smith
+//
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
+#define TESSERACT_ARCH_INTSIMDMATRIX_H_
+
+#include <tesseract/export.h>
+
+#include <cstdint>
+#include <vector>
+
+namespace tesseract {
+
+// Forward declaration: this header only refers to GENERIC_2D_ARRAY by
+// reference, so the full template definition is not needed here.
+template <class T>
+class GENERIC_2D_ARRAY;
+
+// Base class for a SIMD function to multiply a matrix by a vector, with sources
+// of 8-bit signed integer, and result in a double, after appropriate scaling.
+// Assumes a specific method of multiplication that can be applied to any size
+// and number of SIMD registers as follows:
+// int32_t results are computed with num_outputs_per_register_ in each of
+// max_output_registers_ result registers, repeatedly until it would make too
+// many results, then the number of registers is halved, and so-on down to a
+// single result register. The last calculation only outputs the required number
+// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
+//  num_outputs_per_register_ = 4, and max_output_registers_ = 8,
+// Step 1: 8x4=32 results are computed,
+// Step 2: 8x4=32 again, total 64,
+// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
+// Step 4: 1x3, total 75.
+// Each step above is computed using a PartialFunc, which runs over the input
+// vector once. The input is read one registerful of num_inputs_per_register_
+// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
+// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
+// Since it is slow (on Intel at least) to horizontally add in a register,
+// provision is made to process num_inputs_per_group_ inputs at a time, with
+// the group being replicated num_input_groups_ times and multiplied by a
+// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
+// This is most convenient if num_inputs_per_group_ is 4, and the product
+// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
+// results in the process, but it doesn't have to be implemented that way.
+// The weights are re-ordered by Init() to be used sequentially by the above
+// algorithm, followed by the biases, so they can be added at the end.
+// The base class computes the base C++ implementation.
+// NOTE that, although the subclasses execute on different SIMD hardware, no
+// virtual methods are needed, as the constructor sets up everything that
+// is required to allow the base class implementation to do all the work.
+struct TESS_API IntSimdMatrix {
+  // Computes a reshaped copy of the weight matrix w in shaped_w, with the
+  // weights re-ordered for sequential use by matrixDotVectorFunction and the
+  // bias weights appended after each register set.
+  // NOTE(review): rounded_num_out is presumably set to the padded number of
+  // outputs; confirm against the definition of Init().
+  void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
+            int32_t &rounded_num_out) const;
+
+  // Rounds the size up to a multiple of the input register size (in int8_t).
+  int RoundInputs(int size) const {
+    return Roundup(size, num_inputs_per_register_);
+  }
+  // Rounds the size up to a multiple of the output register size (in int32_t).
+  int RoundOutputs(int size) const {
+    return Roundup(size, num_outputs_per_register_);
+  }
+
+  // Computes matrix.vector v = Wu.
+  // u is of size W.dim2() - 1 and the output v is of size W.dim1().
+  // u is imagined to have an extra element at the end with value 1, to
+  // implement the bias, but it doesn't actually have it.
+  // Computes the base C++ implementation, reading w in its original
+  // (not reshaped) layout.
+  static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
+                              const int8_t *u, double *v);
+
+  // Rounds the input up to a multiple of the given factor.
+  // factor is used as a divisor, so it must be non-zero.
+  static int Roundup(int input, int factor) {
+    return (input + factor - 1) / factor * factor;
+  }
+
+  // Computes matrix.vector v = Wu.
+  // u is of size W.dim2() - 1 and the output v is of size W.dim1().
+  // u is imagined to have an extra element at the end with value 1, to
+  // implement the bias, but it doesn't actually have it.
+  // Uses an optimized implementation with partial funcs.
+  // NOTE: The size of the input vector (u) must be padded using
+  // RoundInputs above.
+  // The input will be over-read to the extent of the padding. There are no
+  // alignment requirements.
+  // Arguments, in order: dim1 and dim2 of the weight matrix, the re-ordered
+  // weights, the per-output scales, the input vector u and the output
+  // vector v.
+  using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
+                                           double *);
+  MatrixDotVectorFunction matrixDotVectorFunction;
+
+  // The data members are filled in positionally by the per-architecture
+  // aggregate initializers, so their order must not change.
+  // Number of 32 bit outputs held in each register.
+  int num_outputs_per_register_;
+  // Maximum number of registers that we will use to hold outputs.
+  int max_output_registers_;
+  // Number of 8 bit inputs in the inputs register.
+  int num_inputs_per_register_;
+  // Number of inputs in each weight group.
+  int num_inputs_per_group_;
+  // The number of groups of inputs to be broadcast is not stored as a
+  // member; it is num_inputs_per_register_ / num_inputs_per_group_.
+
+  // NOTE(review): presumably points at the implementation chosen for the
+  // running CPU; confirm where it is assigned.
+  static const IntSimdMatrix *intSimdMatrix;
+  // Only available with NEON.
+  static const IntSimdMatrix intSimdMatrixNEON;
+  // Only available with AVX2 / SSE.
+  static const IntSimdMatrix intSimdMatrixAVX2;
+  static const IntSimdMatrix intSimdMatrixSSE;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixavx2.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixavx2.cpp
@ -0,0 +1,348 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrixavx2.cpp
+// Description: matrix-vector product for 8-bit data on avx2.
+// Author:      Ray Smith
+//
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__AVX2__)
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for AVX2 capable architectures
+#  endif
+#else
+
+#  include "intsimdmatrix.h"
+
+#  include <immintrin.h>
+#  include <algorithm>
+#  include <cstdint>
+#  include <vector>
+
+namespace tesseract {
+
+// Number of 32 bit results that fit in one 256 bit output register.
+constexpr int kNumOutputsPerRegister{8};
+// Upper limit on the number of output registers used at once.
+constexpr int kMaxOutputRegisters{8};
+// Number of 8 bit inputs that fit in one 256 bit input register.
+constexpr int kNumInputsPerRegister{32};
+// Number of consecutive inputs that make up one weight group.
+constexpr int kNumInputsPerGroup{4};
+// Number of input groups held in a single input register.
+constexpr int kNumInputGroups{kNumInputsPerRegister / kNumInputsPerGroup};
+
+// Functions to compute part of a matrix.vector multiplication. The weights
+// are in a very specific order (see above) in w, which is multiplied by
+// u of length num_in, to produce output v after scaling the integer results
+// by the corresponding member of scales.
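+// The amount of w and scales consumed is fixed and not available to the
+// caller. Results are always stored in whole groups of
+// kNumOutputsPerRegister, so v may be written up to num_out rounded up to a
+// multiple of kNumOutputsPerRegister.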
+
+// Computes one set of 4x8 products of inputs and weights, adding to result.
+// Horizontally adds 4 adjacent results, making 8x32-bit results.
+// rep_input is assumed to be an 8x replicated set of 4x8-bit signed integers.
+// Note that wi must previously have been re-organized with blocks of 4x8
+// weights in contiguous memory.
+// ones is a register of 16x16-bit values all equal to 1.
+// Note: wi is incremented by the amount of data read
+// (kNumInputsPerRegister bytes).
+// weights and reps are scratch registers; their contents are overwritten.
+// This function must be inlined with references in order for the compiler to
+// correctly use the registers declared in the caller.
+static inline void MultiplyGroup(const __m256i &rep_input, const __m256i &ones, const int8_t *&wi,
+                                 __m256i &weights, __m256i &reps, __m256i &result) {
+  // Load a 4x8 block of weights. The load is unaligned, so wi needs no
+  // particular alignment.
+  weights = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(wi));
+  wi += kNumInputsPerRegister;
+  // Normalize the signs on rep_input, weights, so weights is always +ve.
+  // This is needed because _mm256_maddubs_epi16 below treats its first
+  // operand as unsigned bytes and only its second operand as signed bytes.
+  reps = _mm256_sign_epi8(rep_input, weights);
+  weights = _mm256_sign_epi8(weights, weights);
+  // Multiply 32x8-bit reps by 32x8-bit weights to make 16x16-bit results,
+  // with adjacent pairs added.
+  weights = _mm256_maddubs_epi16(weights, reps);
+  // Multiply 16x16-bit result by 16x16-bit ones to make 8x32-bit results,
+  // with  adjacent pairs added. What we really want is a horizontal add of
+  // 16+16=32 bit result, but there is no such instruction, so multiply by
+  // 16-bit ones instead. It is probably faster than all the sign-extending,
+  // permuting and adding that would otherwise be required.
+  weights = _mm256_madd_epi16(weights, ones);
+  // Accumulate the 8 partial sums into the caller's running totals.
+  result = _mm256_add_epi32(result, weights);
+}
+
+// Loads the 8 bytes at wi_ into the bottom 64 bits of a 128bit register and
+// zeroes the top 64 bits.
+// _mm_loadl_epi64 is used instead of dereferencing wi_ through an int64_t
+// pointer, so the read has no alignment requirement and does not access the
+// int8_t data through an incompatible pointer type.
+static inline __m128i load64_to_128(const int8_t *wi_) {
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(wi_));
+}
+
+// Finishes one register of 8 results: adds the bias, scales and stores.
+// Reads 8 int8_t bias weights from wi, adds bias * 127 to the 8 32-bit sums
+// in result, converts the sums to double, multiplies each by the matching
+// entry of scales and writes 8 doubles to v.
+// All 8 lanes are always loaded and stored, so scales and v must each have
+// room for 8 doubles. Unaligned loads and stores are used throughout.
+static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
+                                   double *v) {
+  __m128i w128 = load64_to_128(wi);          // 8x8bit vals in bottom of 128bit reg
+  __m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg
+  __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  __m256d scale0123 = _mm256_loadu_pd(scales);
+  __m256d scale4567 = _mm256_loadu_pd(scales + 4);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
+  result = _mm256_add_epi32(result, w256);     // result += bias * 127
+  // Convert the low 4 sums to double.
+  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
+  // Move the upper 128 bits of result down so the high 4 sums can be
+  // converted in the same way.
+  result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));
+  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_pd(v, res0123);
+  _mm256_storeu_pd(v + 4, res4567);
+}
+
+// Finishes two registers (16 results): adds the bias, scales and stores.
+// Reads 16 int8_t bias weights from wi; the first 8 belong to result0 and
+// the next 8 to result1. For each register, bias * 127 is added to the
+// 32-bit sums, the sums are converted to double, multiplied by the matching
+// entries of scales and written to v.
+// wi, scales and v are passed by reference and are each advanced by 16 on
+// return, so consecutive calls process consecutive results.
+static inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,
+                                    const double *&scales, double *&v) {
+  __m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));
+  // 16x8bit bias vals fill the 128bit reg; the bottom 8 are used first.
+  const __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
+  __m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
+  __m256d scale0123 = _mm256_loadu_pd(scales);
+  __m256d scale4567 = _mm256_loadu_pd(scales + 4);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
+  result0 = _mm256_add_epi32(result0, w256);   // result += bias * 127
+  __m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  // Move the upper 128 bits of result0 down to convert sums 4..7.
+  result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));
+  __m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_pd(v, res0123);
+  _mm256_storeu_pd(v + 4, res4567);
+  // Move the top 8 bias vals to the bottom of the register for result1.
+  w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));
+  w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
+  scale0123 = _mm256_loadu_pd(scales + 8);
+  scale4567 = _mm256_loadu_pd(scales + 12);
+  w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
+  result1 = _mm256_add_epi32(result1, w256);   // result += bias * 127
+  res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  // Move the upper 128 bits of result1 down to convert sums 12..15.
+  result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));
+  res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
+  res0123 = _mm256_mul_pd(res0123, scale0123);
+  res4567 = _mm256_mul_pd(res4567, scale4567);
+  _mm256_storeu_pd(v + 8, res0123);
+  _mm256_storeu_pd(v + 12, res4567);
+  // Step the caller's pointers past the 16 results just produced.
+  wi += 16;
+  scales += 16;
+  v += 16;
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=64 results.
+// The weights *must* be arranged so that consecutive reads from wi
+// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
+// (kNumInputsPerGroup inputs))). After that there must be N consecutive
+// bias weights, before continuing with any more weights.
+// u must be padded out with zeros to
+// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
+// Note that u is loaded a full register (32 bytes) at a time, so the load
+// can read beyond num_in elements even though only num_in of them are
+// multiplied.
+// scales and v must each provide N entries: every result is multiplied by
+// its scale and stored.
+static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
+                                     int num_in, double *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  // Permutation indices that rotate the eight 32-bit lanes down by one, so
+  // the next group of 4 inputs lands in the lowest lane.
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  __m256i result1 = _mm256_setzero_si256();
+  __m256i result2 = _mm256_setzero_si256();
+  __m256i result3 = _mm256_setzero_si256();
+  __m256i result4 = _mm256_setzero_si256();
+  __m256i result5 = _mm256_setzero_si256();
+  __m256i result6 = _mm256_setzero_si256();
+  __m256i result7 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  // j is advanced by the inner loop only.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      // Each call consumes the next 32 weights from wi.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result4);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result5);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result6);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
+    }
+  }
+  // wi now points at the bias weights. Each call adds the bias, scales and
+  // stores 16 results, advancing wi, scales and v.
+  ExtractResults16(result0, result1, wi, scales, v);
+  ExtractResults16(result2, result3, wi, scales, v);
+  ExtractResults16(result4, result5, wi, scales, v);
+  ExtractResults16(result6, result7, wi, scales, v);
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=32 results.
+// For details see PartialMatrixDotVector64 with N=32.
+// Uses 4 accumulator registers, so each group of 4 inputs consumes 4x32
+// weights from wi, and 32 scaled results are stored to v.
+static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
+                                     int num_in, double *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  // Permutation indices that rotate the eight 32-bit lanes down by one.
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  __m256i result1 = _mm256_setzero_si256();
+  __m256i result2 = _mm256_setzero_si256();
+  __m256i result3 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  // j is advanced by the inner loop only.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
+    }
+  }
+  // wi now points at the bias weights. Each call adds the bias, scales and
+  // stores 16 results, advancing wi, scales and v.
+  ExtractResults16(result0, result1, wi, scales, v);
+  ExtractResults16(result2, result3, wi, scales, v);
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=16 results.
+// For details see PartialMatrixDotVector64 with N=16.
+// Uses 2 accumulator registers, so each group of 4 inputs consumes 2x32
+// weights from wi, and 16 scaled results are stored to v.
+static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
+                                     int num_in, double *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  // Permutation indices that rotate the eight 32-bit lanes down by one.
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  __m256i result1 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  // j is advanced by the inner loop only.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
+    }
+  }
+  // wi now points at the bias weights; add them, scale and store 16 results.
+  ExtractResults16(result0, result1, wi, scales, v);
+}
+
+// Computes part of matrix.vector v = Wu. Computes N=8 results.
+// For details see PartialMatrixDotVector64 with N=8.
+// Uses a single accumulator register, so each group of 4 inputs consumes 32
+// weights from wi. All 8 scaled results are stored to v, so v and scales
+// must have room for 8 doubles even if fewer outputs are wanted.
+static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
+                                           int num_in, double *v) {
+  // Register containing 16-bit ones for horizontal add with 16->32 bit
+  // conversion.
+  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  // Permutation indices that rotate the eight 32-bit lanes down by one.
+  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
+  // Initialize all the results to 0.
+  __m256i result0 = _mm256_setzero_si256();
+  // Iterate over the input (u), one registerful at a time.
+  // j is advanced by the inner loop only.
+  for (int j = 0; j < num_in;) {
+    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
+    // Inputs are processed in groups of kNumInputsPerGroup, replicated
+    // kNumInputGroups times.
+    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
+      // Replicate the low 32 bits (4 inputs) 8 times.
+      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
+      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
+      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
+      __m256i weights, reps;
+      // Mul-add, with horizontal add of the 4 inputs to each of the results.
+      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
+    }
+  }
+  // wi now points at the bias weights; add them, scale and store 8 results.
+  ExtractResults8(result0, wi, scales, v);
+}
+
+// Computes matrix.vector v = Wu using the AVX2 partial functions.
+// dim1 is the number of outputs and dim2 is the number of inputs plus one
+// (for the bias weights). wi holds the re-ordered weights and biases.
+// Outputs are produced in batches of
+// kNumOutputsPerRegister * kMaxOutputRegisters for as long as a whole batch
+// fits, then at most one batch each of half, a quarter and an eighth of that
+// size, until the output count rounded up to a multiple of
+// kNumOutputsPerRegister has been covered. Every batch stores all of its
+// results, so scales and v must be sized for the rounded output count.
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
+                            const int8_t *u, double *v) {
+  // Inputs are consumed in whole groups and outputs in whole registers, so
+  // both counts are padded. dim2 - 1 excludes the bias column.
+  const int padded_inputs = IntSimdMatrix::Roundup(dim2 - 1, kNumInputsPerGroup);
+  const int padded_outputs = IntSimdMatrix::Roundup(dim1, kNumOutputsPerRegister);
+  int outputs_left = padded_outputs;
+  int batch = kNumOutputsPerRegister * kMaxOutputRegisters;
+  // Each output owns padded_inputs weights plus one bias weight.
+  int batch_weights = (padded_inputs + 1) * batch;
+
+  // Largest batch first, repeated while a whole batch still fits.
+  while (outputs_left >= batch) {
+    PartialMatrixDotVector64(wi, scales, u, padded_inputs, v);
+    wi += batch_weights;
+    scales += batch;
+    v += batch;
+    outputs_left -= batch;
+  }
+
+  // From here on each batch size can be needed at most once.
+  batch /= 2;
+  batch_weights /= 2;
+  if (outputs_left >= batch) {
+    PartialMatrixDotVector32(wi, scales, u, padded_inputs, v);
+    wi += batch_weights;
+    scales += batch;
+    v += batch;
+    outputs_left -= batch;
+  }
+
+  batch /= 2;
+  batch_weights /= 2;
+  if (outputs_left >= batch) {
+    PartialMatrixDotVector16(wi, scales, u, padded_inputs, v);
+    wi += batch_weights;
+    scales += batch;
+    v += batch;
+    outputs_left -= batch;
+  }
+
+  batch /= 2;
+  batch_weights /= 2;
+  if (outputs_left >= batch) {
+    PartialMatrixDotVector8(wi, scales, u, padded_inputs, v);
+  }
+}
+
+// Descriptor for the AVX2 implementation. The values are assigned to the
+// members of IntSimdMatrix by position, so they must stay in member order.
+const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
+    // Function that computes the matrix.vector product with AVX2.
+    matrixDotVector,
+    // Number of 32 bit outputs held in each register.
+    kNumOutputsPerRegister,
+    // Maximum number of registers that we will use to hold outputs.
+    kMaxOutputRegisters,
+    // Number of 8 bit inputs in the inputs register.
+    kNumInputsPerRegister,
+    // Number of inputs in each weight group.
+    kNumInputsPerGroup};
+
+} // namespace tesseract.
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixneon.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixneon.cpp
@ -0,0 +1,203 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrixneon.cpp
+// Description: matrix-vector product for 8-bit data on neon.
+// Author:      Robin Watts (from the AVX2 original by Ray Smith)
+//
+// (C) Copyright 2017, Google Inc.
+// (C) Copyright 2020, Artifex Software Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if defined(__ARM_NEON)
+
+#  include "intsimdmatrix.h"
+
+#  include <algorithm>
+#  include <cstdint>
+#  include <vector>
+#  include "arm_neon.h"
+
+namespace tesseract {
+
+// Number of 32 bit results produced per step. (A pair of 4x32 registers is
+// used, hence 8 results.)
+constexpr int kNumOutputsPerRegister{8};
+// Upper limit on the number of output register sets used at once.
+constexpr int kMaxOutputRegisters{1};
+// Number of 8 bit inputs held in the inputs register.
+constexpr int kNumInputsPerRegister{8};
+// Number of consecutive inputs that make up one weight group.
+constexpr int kNumInputsPerGroup{8};
+
+// Function to compute part of a matrix.vector multiplication. The weights
+// are in a very specific order (see above) in w, which is multiplied by
+// u of length num_in, to produce output v after scaling the integer results
+// by the corresponding member of scales.
+// The amount of w and scales consumed is fixed and not available to the
+// caller.
+
+// Computes part of matrix.vector v = Wu. Computes N=8 results.
+// The weights *must* be arranged so that consecutive reads from wi
+// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
+// (kNumInputsPerGroup inputs))). After that there must be N consecutive
+// bias weights, before continuing with any more weights.
+// u must be padded out with zeros to
+// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
+// num_out is the number of results to store in v: the first result is
+// always written and results 2..8 are written only if num_out allows, so
+// the final partial group does not write beyond the end of v.
+static inline void PartialMatrixDotVector8(const int8_t *__restrict wi,
+                                           const double *__restrict scales,
+                                           const int8_t *__restrict u, int num_in,
+                                           double *__restrict v, int num_out) {
+  // Initialize all the results to 0.
+  int32x4_t result0123 = {0, 0, 0, 0};
+  int32x4_t result4567 = {0, 0, 0, 0};
+  int8x8_t bias_scale = {127, 127, 127, 127, 127, 127, 127, 127};
+  // Iterate over the input (u), one registerful at a time.
+  // Each pass consumes 8 inputs and an 8x8 block of weights (row r of the
+  // block holds the 8 weights wr0..wr7 of output r).
+  for (int j = 0; j < num_in; j += 8) {
+    int8x8_t vu = vld1_s8(u);              // vu     = u0  u1  u2  u3  u4  u5  u6  u7
+    int8x16_t vw01 = vld1q_s8(wi);         // vw01   = w00 w01 w02 w03 w04 w05 w06 w07
+                                           // w10 w11 w12 w13 w14 w15 w16 w17
+    int8x16_t vw23 = vld1q_s8(wi + 8 * 2); // vw23   = w20 w21 w22 w23 w24 w25 w26 w27 w30
+                                           // w31 w32 w33 w34 w35 w36 w37
+    int8x16_t vw45 = vld1q_s8(wi + 8 * 4); // vw45   = w40 w41 w42 w43 w44 w45 w46 w47 w50
+                                           // w51 w52 w53 w54 w55 w56 w57
+    int8x16_t vw67 = vld1q_s8(wi + 8 * 6); // vw67   = w60 w61 w62 w63 w64 w65 w66 w67 w70
+                                           // w71 w72 w73 w74 w75 w76 w77
+
+    // Widening multiply: 8 x (8 bit * 8 bit) -> 8 x 16 bit products per row.
+    int16x8_t vrow0q = vmull_s8(vget_low_s8(vw01), vu); // vrow0q = w00.u0 w01.u1 w02.u2
+                                                        // w03.u3 w04.u4 w05.u5 w06.u6 w07.u7
+    int16x8_t vrow1q = vmull_s8(vget_high_s8(vw01),
+                                vu);                    // vrow1q = w10.u0 w11.u1 w12.u2 w13.u3
+                                                        // w14.u4 w15.u5 w16.u6 w17.u7
+    int16x8_t vrow2q = vmull_s8(vget_low_s8(vw23), vu); // vrow2q = w20.u0 w21.u1 w22.u2
+                                                        // w23.u3 w24.u4 w25.u5 w26.u6 w27.u7
+    int16x8_t vrow3q = vmull_s8(vget_high_s8(vw23),
+                                vu);                    // vrow3q = w30.u0 w31.u1 w32.u2 w33.u3
+                                                        // w34.u4 w35.u5 w36.u6 w37.u7
+    int16x8_t vrow4q = vmull_s8(vget_low_s8(vw45), vu); // vrow4q = w40.u0 w41.u1 w42.u2
+                                                        // w43.u3 w44.u4 w45.u5 w46.u6 w47.u7
+    int16x8_t vrow5q = vmull_s8(vget_high_s8(vw45),
+                                vu);                    // vrow5q = w50.u0 w51.u1 w52.u2 w53.u3
+                                                        // w54.u4 w55.u5 w56.u6 w57.u7
+    int16x8_t vrow6q = vmull_s8(vget_low_s8(vw67), vu); // vrow6q = w60.u0 w61.u1 w62.u2
+                                                        // w63.u3 w64.u4 w65.u5 w66.u6 w67.u7
+    int16x8_t vrow7q = vmull_s8(vget_high_s8(vw67),
+                                vu); // vrow7q = w70.u0 w71.u1 w72.u2 w73.u3
+                                     // w74.u4 w75.u5 w76.u6 w77.u7
+
+    // Pairwise add with widening: 8 x 16 bit -> 4 x 32 bit sums per row.
+    int32x4_t vrow0q2 = vpaddlq_s16(vrow0q); // vrow0q2 = w00.u0+w01.u1 w02.u2+w03.u3
+                                             // w04.u4+w05.u5 w06.u6+w07.u7
+    int32x4_t vrow1q2 = vpaddlq_s16(vrow1q); // vrow1q2 = w10.u0+w11.u1 w12.u2+w13.u3
+                                             // w14.u4+w15.u5 w16.u6+w17.u7
+    int32x4_t vrow2q2 = vpaddlq_s16(vrow2q); // vrow2q2 = w20.u0+w21.u1 w22.u2+w23.u3
+                                             // w24.u4+w25.u5 w26.u6+w27.u7
+    int32x4_t vrow3q2 = vpaddlq_s16(vrow3q); // vrow3q2 = w30.u0+w31.u1 w32.u2+w33.u3
+                                             // w34.u4+w35.u5 w36.u6+w37.u7
+    int32x4_t vrow4q2 = vpaddlq_s16(vrow4q); // vrow4q2 = w40.u0+w41.u1 w42.u2+w43.u3
+                                             // w44.u4+w45.u5 w46.u6+w47.u7
+    int32x4_t vrow5q2 = vpaddlq_s16(vrow5q); // vrow5q2 = w50.u0+w51.u1 w52.u2+w53.u3
+                                             // w54.u4+w55.u5 w56.u6+w57.u7
+    int32x4_t vrow6q2 = vpaddlq_s16(vrow6q); // vrow6q2 = w60.u0+w61.u1 w62.u2+w63.u3
+                                             // w64.u4+w65.u5 w66.u6+w67.u7
+    int32x4_t vrow7q2 = vpaddlq_s16(vrow7q); // vrow7q2 = w70.u0+w71.u1 w72.u2+w73.u3
+                                             // w74.u4+w75.u5 w76.u6+w77.u7
+
+    // First reduction: each register now holds the two half-sums of two rows.
+    vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
+                           vpadd_s32(vget_low_s32(vrow1q2), vget_high_s32(vrow1q2)));
+    // vrow0q2 = w00.u0+...+w03.u3 w04.u4+...+w07.u7 w10.u0+...+w13.u3
+    // w14.u4+...+w17.u7
+    vrow2q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)),
+                           vpadd_s32(vget_low_s32(vrow3q2), vget_high_s32(vrow3q2)));
+    // vrow2q2 = w20.u0+...+w23.u3 w24.u4+...+w27.u7 w30.u0+...+w33.u3
+    // w34.u4+...+w37.u7
+    vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
+                           vpadd_s32(vget_low_s32(vrow5q2), vget_high_s32(vrow5q2)));
+    // vrow4q2 = w40.u0+...+w43.u3 w44.u4+...+w47.u7 w50.u0+...+w53.u3
+    // w54.u4+...+w57.u7
+    vrow6q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)),
+                           vpadd_s32(vget_low_s32(vrow7q2), vget_high_s32(vrow7q2)));
+    // vrow6q2 = w60.u0+...+w63.u3 w64.u4+...+w67.u7 w70.u0+...+w73.u3
+    // w74.u4+...+w77.u7
+
+    // Second reduction: one complete 8-term dot product per 32 bit lane.
+    vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
+                           vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)));
+    // vrow0q2 = w00.u0+...+w07.u7 w10.u0+...+w17.u7 w20.u0+...+w27.u7
+    // w30.u0+...+w37.u7
+    vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
+                           vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)));
+    // vrow4q2 = w40.u0+...+w47.u7 w50.u0+...+w57.u7 w60.u0+...+w67.u7
+    // w70.u0+...+w77.u7
+
+    result0123 = vaddq_s32(result0123, vrow0q2);
+    result4567 = vaddq_s32(result4567, vrow4q2);
+    // Advance to the next 8 inputs and the next 8x8 block of weights.
+    u += 8;
+    wi += 64;
+  }
+  {
+    // wi now points at the 8 bias weights for this group of outputs.
+    int8x8_t bias = vld1_s8(wi); // bias   = b0  b1  b2  b3  b4  b5  b6  b7
+    // bias * 127, widened to 16 bit.
+    int16x8_t scaled_bias = vmull_s8(bias, bias_scale);
+    result0123 = vaddw_s16(result0123, vget_low_s16(scaled_bias));
+    result4567 = vaddw_s16(result4567, vget_high_s16(scaled_bias));
+    // Scale and store one lane at a time, stopping after num_out results.
+    *v++ = vget_lane_s32(vget_low_s32(result0123), 0) * *scales++;
+    if (num_out > 1)
+      *v++ = vget_lane_s32(vget_low_s32(result0123), 1) * *scales++;
+    if (num_out > 2)
+      *v++ = vget_lane_s32(vget_high_s32(result0123), 0) * *scales++;
+    if (num_out > 3)
+      *v++ = vget_lane_s32(vget_high_s32(result0123), 1) * *scales++;
+    if (num_out > 4)
+      *v++ = vget_lane_s32(vget_low_s32(result4567), 0) * *scales++;
+    if (num_out > 5)
+      *v++ = vget_lane_s32(vget_low_s32(result4567), 1) * *scales++;
+    if (num_out > 6)
+      *v++ = vget_lane_s32(vget_high_s32(result4567), 0) * *scales++;
+    if (num_out > 7)
+      *v = vget_lane_s32(vget_high_s32(result4567), 1) * *scales;
+  }
+}
+
+// Computes matrix.vector v = Wu using the NEON partial function.
+// dim1 is the number of outputs and dim2 is the number of inputs plus one
+// (for the bias weights). wi holds the re-ordered weights and biases.
+// Outputs are produced in full batches while a whole batch fits; a final
+// partial batch then stores only the outputs that remain, so v is never
+// written beyond dim1 entries.
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
+                            const int8_t *u, double *v) {
+  // Inputs are consumed in whole groups, so the input count is padded.
+  // dim2 - 1 excludes the bias column.
+  const int padded_inputs = IntSimdMatrix::Roundup(dim2 - 1, kNumInputsPerGroup);
+  const int batch = kNumOutputsPerRegister * kMaxOutputRegisters;
+  // Each output owns padded_inputs weights plus one bias weight.
+  const int batch_weights = (padded_inputs + 1) * batch;
+  int outputs_left = dim1;
+
+  while (outputs_left >= batch) {
+    PartialMatrixDotVector8(wi, scales, u, padded_inputs, v, kNumOutputsPerRegister);
+    wi += batch_weights;
+    scales += batch;
+    v += batch;
+    outputs_left -= batch;
+  }
+  if (outputs_left > 0) {
+    // Fewer than a full batch remains; pass the leftover count so that only
+    // those results are stored.
+    PartialMatrixDotVector8(wi, scales, u, padded_inputs, v,
+                            dim1 & (kNumOutputsPerRegister - 1));
+  }
+}
+
+// Descriptor for the NEON implementation. The values are assigned to the
+// members of IntSimdMatrix by position, so they must stay in member order.
+const IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {
+    // Function that computes the matrix.vector product with NEON.
+    matrixDotVector,
+    // Number of 32 bit outputs held in each register.
+    kNumOutputsPerRegister,
+    // Maximum number of registers that we will use to hold outputs.
+    kMaxOutputRegisters,
+    // Number of 8 bit inputs in the inputs register.
+    kNumInputsPerRegister,
+    // Number of inputs in each weight group.
+    kNumInputsPerGroup};
+
+} // namespace tesseract.
+
+#endif /* __ARM_NEON */
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixsse.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixsse.cpp
@ -0,0 +1,106 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrixsse.cpp
+// Description: SSE implementation of 8-bit int SIMD matrix multiply.
+// Author:      Ray Smith
+//
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#if !defined(__SSE4_1__)
+#  if defined(__i686__) || defined(__x86_64__)
+#    error Implementation only for SSE 4.1 capable architectures
+#  endif
+#else
+
+#  include "intsimdmatrix.h"
+
+#  include <emmintrin.h>
+#  include <smmintrin.h>
+#  include <cstdint>
+
+namespace tesseract {
+
+// Computes and returns the dot product of the n-vectors u and v.
+// Uses Intel SSE intrinsics to access the SIMD instruction set.
+// Elements are consumed 8 at a time with SIMD; a remaining tail of fewer
+// than 8 elements is accumulated with scalar code.
+static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
+  // Largest offset at which a full group of 8 elements can still be loaded.
+  int max_offset = n - 8;
+  int offset = 0;
+  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
+  // values, extending to 16 bit, multiplying to make 32 bit results.
+  int32_t result = 0;
+  if (offset <= max_offset) {
+    // At least 8 elements: the first group of 8 initializes sum directly.
+    offset = 8;
+    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u));
+    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v));
+    __m128i sum = _mm_cvtepi8_epi16(packed1);
+    packed2 = _mm_cvtepi8_epi16(packed2);
+    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
+    // ints to make 32 bit results, which are then horizontally added in pairs
+    // to make 4 32 bit results that still fit in a 128 bit register.
+    sum = _mm_madd_epi16(sum, packed2);
+    // Remaining full groups of 8 are multiplied the same way and added to sum.
+    while (offset <= max_offset) {
+      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u + offset));
+      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v + offset));
+      offset += 8;
+      packed1 = _mm_cvtepi8_epi16(packed1);
+      packed2 = _mm_cvtepi8_epi16(packed2);
+      packed1 = _mm_madd_epi16(packed1, packed2);
+      sum = _mm_add_epi32(sum, packed1);
+    }
+    // Sum the 4 packed 32 bit sums and extract the low result.
+    sum = _mm_hadd_epi32(sum, sum);
+    sum = _mm_hadd_epi32(sum, sum);
+    result = _mm_cvtsi128_si32(sum);
+  }
+  // Scalar tail: the elements that did not fill a whole group of 8 (or all
+  // of them when n < 8).
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
+}
+
+// Computes one element of the matrix.vector product v = Wu: the int8 dot
+// product of the weight row wi with u, plus the entry that follows the
+// inputs (wi[num_in]) multiplied by INT8_MAX, all scaled by *scales.
+static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
+                                    int num_in, double *v) {
+  const double dot = IntDotProductSSE(u, wi, num_in);
+  // Bias contribution, corrected for the integer representation.
+  const int bias_term = wi[num_in] * INT8_MAX;
+  *v = (dot + bias_term) * *scales;
+}
+
+// Computes v = W.u one output at a time with the SSE dot product kernel.
+// dim1 is the number of outputs; every weight row holds dim2 entries, of
+// which the first dim2 - 1 are inputs.
+static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
+                            const int8_t *u, double *v) {
+  const int inputs_per_row = dim2 - 1;
+  for (int remaining = dim1; remaining > 0; --remaining) {
+    PartialMatrixDotVector1(wi, scales, u, inputs_per_row, v);
+    // Step to the next weight row, scale and output slot.
+    wi += dim2;
+    ++scales;
+    ++v;
+  }
+}
+
+// Descriptor for the SSE int8 matrix kernel. The kernel in this file works
+// on one output at a time, so all the geometry values are 1.
+const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
+    // Function.
+    matrixDotVector,
+    // Number of 32 bit outputs held in each register.
+    1,
+    // Maximum number of registers that we will use to hold outputs.
+    1,
+    // Number of 8 bit inputs in the inputs register.
+    1,
+    // Number of inputs in each weight group.
+    1};
+
+} // namespace tesseract.
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.cpp
@ -0,0 +1,283 @@
+///////////////////////////////////////////////////////////////////////
+// File:        simddetect.cpp
+// Description: Architecture detector.
+// Author:      Stefan Weil (based on code from Ray Smith)
+//
+// (C) Copyright 2014, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // for HAVE_AVX, ...
+#endif
+#include <numeric> // for std::inner_product
+#include "dotproduct.h"
+#include "intsimdmatrix.h" // for IntSimdMatrix
+#include "params.h"        // for STRING_VAR
+#include "simddetect.h"
+#include "tprintf.h" // for tprintf
+
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
+#  define HAS_CPUID
+#endif
+
+#if defined(HAS_CPUID)
+#  if defined(__GNUC__)
+#    include <cpuid.h>
+#  elif defined(_WIN32)
+#    include <intrin.h>
+#  endif
+#endif
+
+#if defined(HAVE_NEON) && !defined(__aarch64__)
+#  ifdef ANDROID
+#    include <cpu-features.h>
+#  else
+/* Assume linux */
+#    include <asm/hwcap.h>
+#    include <sys/auxv.h>
+#  endif
+#endif
+
+namespace tesseract {
+
+// Computes and returns the dot product of the two n-vectors u and v.
+// Note: because the order of addition is different among the different dot
+// product functions, the results can (and do) vary slightly (although they
+// agree to within about 4e-15). This produces different results when running
+// training, despite all random inputs being precisely equal.
+// To get consistent results, use just one of these dot product functions.
+// On a test multi-layer network, serial is 57% slower than SSE, and AVX
+// is about 8% faster than SSE. This suggests that the time is memory
+// bandwidth constrained and could benefit from holding the reused vector
+// in AVX registers.
+DotProductFunction DotProduct;
+
+// Config variable naming the dot product implementation to use. It is read
+// and then rewritten by SIMDDetect::Update().
+static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
+
+// The detector object; its constructor performs the CPU feature detection
+// and picks the initial dot product implementation.
+SIMDDetect SIMDDetect::detector;
+
+// Only the availability flags that apply to the target architecture are
+// defined here.
+#if defined(__aarch64__)
+// ARMv8 always has NEON.
+bool SIMDDetect::neon_available_ = true;
+#elif defined(HAVE_NEON)
+// If true, then Neon has been detected.
+bool SIMDDetect::neon_available_;
+#else
+// If true, then AVX has been detected.
+bool SIMDDetect::avx_available_;
+// If true, then AVX2 has been detected.
+bool SIMDDetect::avx2_available_;
+// If true, then AVX512 Foundation / AVX512 BW has been detected.
+bool SIMDDetect::avx512F_available_;
+bool SIMDDetect::avx512BW_available_;
+// If true, then FMA has been detected.
+bool SIMDDetect::fma_available_;
+// If true, then SSE4.1 has been detected.
+bool SIMDDetect::sse_available_;
+#endif
+
+// Portable scalar dot product of the two n-vectors u and v. The products are
+// accumulated in index order; the result is 0.0 when n <= 0.
+static double DotProductGeneric(const double *u, const double *v, int n) {
+  double sum = 0.0;
+  for (int remaining = n; remaining > 0; --remaining) {
+    sum += *u++ * *v++;
+  }
+  return sum;
+}
+
+// Dot product of the two n-vectors u and v, delegated to std::inner_product
+// with a double accumulator starting at zero.
+static double DotProductStdInnerProduct(const double *u, const double *v, int n) {
+  const double *const u_end = u + n;
+  const double initial = 0.0;
+  return std::inner_product(u, u_end, v, initial);
+}
+
+// Installs f as the global DotProduct function and m as the active integer
+// SIMD matrix implementation. When m is omitted the integer SIMD matrix
+// pointer is reset to nullptr.
+static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
+  IntSimdMatrix::intSimdMatrix = m;
+  DotProduct = f;
+}
+
+// Constructor.
+// Tests the architecture in a system-dependent way to detect AVX, SSE and
+// any other available SIMD equipment.
+// __GNUC__ is also defined by compilers that include GNU extensions such as
+// clang.
+// After detection, the dot product function and integer SIMD matrix are
+// chosen from the compiled-in candidates in this order of preference:
+// AVX2, AVX, SSE4.1, NEON. If none applies, the generic code stays active.
+SIMDDetect::SIMDDetect() {
+  // The fallback is a generic dot product calculation.
+  SetDotProduct(DotProductGeneric);
+
+#if defined(HAS_CPUID)
+#  if defined(__GNUC__)
+  unsigned int eax, ebx, ecx, edx;
+  // CPUID leaf 1 reports the basic feature flags in ecx.
+  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
+    // Note that these tests all use hex because the older compilers don't have
+    // the newer flags.
+#    if defined(HAVE_SSE4_1)
+    sse_available_ = (ecx & 0x00080000) != 0;
+#    endif
+#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
+    // Reads the low 32 bits of extended control register 0.
+    auto xgetbv = []() {
+      uint32_t xcr0;
+      __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
+      return xcr0;
+    };
+    if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
+      // OSXSAVE bit is set, XMM state and YMM state are fine.
+#      if defined(HAVE_FMA)
+      fma_available_ = (ecx & 0x00001000) != 0;
+#      endif
+#      if defined(HAVE_AVX)
+      avx_available_ = (ecx & 0x10000000) != 0;
+      if (avx_available_) {
+        // There is supposed to be a __get_cpuid_count function, but this is all
+        // there is in my cpuid.h. It is a macro for an asm statement and cannot
+        // be used inside an if.
+        // Leaf 7, subleaf 0: the AVX2 and AVX512 flags are read from ebx.
+        __cpuid_count(7, 0, eax, ebx, ecx, edx);
+        avx2_available_ = (ebx & 0x00000020) != 0;
+        avx512F_available_ = (ebx & 0x00010000) != 0;
+        avx512BW_available_ = (ebx & 0x40000000) != 0;
+      }
+#      endif
+    }
+#    endif
+  }
+#  elif defined(_WIN32)
+  // cpuInfo holds the eax, ebx, ecx, edx results of __cpuid in that order.
+  int cpuInfo[4];
+  int max_function_id;
+  __cpuid(cpuInfo, 0);
+  max_function_id = cpuInfo[0];
+  if (max_function_id >= 1) {
+    __cpuid(cpuInfo, 1);
+#    if defined(HAVE_SSE4_1)
+    sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
+#    endif
+#    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
+    if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
+      // OSXSAVE bit is set, XMM state and YMM state are fine.
+#      if defined(HAVE_FMA)
+      fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
+#      endif
+#      if defined(HAVE_AVX)
+      avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
+#      endif
+#      if defined(HAVE_AVX2)
+      // Leaf 7 is only queried when the CPU reports that it exists.
+      if (max_function_id >= 7) {
+        __cpuid(cpuInfo, 7);
+        avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
+        avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
+        avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
+      }
+#      endif
+    }
+#    endif
+  }
+#  else
+#    error "I don't know how to test for SIMD with this compiler"
+#  endif
+#endif
+
+#if defined(HAVE_NEON) && !defined(__aarch64__)
+#  ifdef ANDROID
+  {
+    AndroidCpuFamily family = android_getCpuFamily();
+    if (family == ANDROID_CPU_FAMILY_ARM)
+      neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
+  }
+#  else
+  /* Assume linux */
+  neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
+#  endif
+#endif
+
+  // Select code for calculation of dot product based on autodetection.
+  if (false) {
+    // This is a dummy to support conditional compilation.
+#if defined(HAVE_AVX2)
+  } else if (avx2_available_) {
+    // AVX2 detected.
+    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
+#endif
+#if defined(HAVE_AVX)
+  } else if (avx_available_) {
+    // AVX detected.
+    // The floating point code is AVX, but the integer matrix is the SSE one.
+    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
+#endif
+#if defined(HAVE_SSE4_1)
+  } else if (sse_available_) {
+    // SSE detected.
+    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
+#endif
+#if defined(HAVE_NEON) || defined(__aarch64__)
+  } else if (neon_available_) {
+    // NEON detected.
+    // DotProduct is passed back unchanged (still the generic function set
+    // above), so only the integer matrix implementation is switched.
+    SetDotProduct(DotProduct, &IntSimdMatrix::intSimdMatrixNEON);
+#endif
+  }
+}
+
+// Applies the "dotproduct" config variable: selects the dot product function
+// and integer SIMD matrix implementation that it names, then writes the name
+// of the selected method back into the variable.
+//  - "auto" keeps the currently active implementation and leaves the
+//    variable as "auto". It used to be rewritten to "generic", which made
+//    any later call of Update() switch to the generic code.
+//  - An unsupported value is reported, the active implementation is left
+//    untouched and the variable is reset to "generic".
+void SIMDDetect::Update() {
+  // Select code for calculation of dot product based on the
+  // value of the config variable if that value is not empty.
+  const char *dotproduct_method = "generic";
+  if (!strcmp(dotproduct.c_str(), "auto")) {
+    // Automatic detection. Nothing to be done, and the variable must keep
+    // its value so that repeated calls stay idempotent.
+    dotproduct_method = "auto";
+  } else if (!strcmp(dotproduct.c_str(), "generic")) {
+    // Generic code selected by config variable.
+    SetDotProduct(DotProductGeneric);
+    dotproduct_method = "generic";
+  } else if (!strcmp(dotproduct.c_str(), "native")) {
+    // Native optimized code selected by config variable.
+    SetDotProduct(DotProductNative);
+    dotproduct_method = "native";
+#if defined(HAVE_AVX2)
+  } else if (!strcmp(dotproduct.c_str(), "avx2")) {
+    // AVX2 selected by config variable.
+    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
+    dotproduct_method = "avx2";
+#endif
+#if defined(HAVE_AVX)
+  } else if (!strcmp(dotproduct.c_str(), "avx")) {
+    // AVX selected by config variable.
+    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
+    dotproduct_method = "avx";
+#endif
+#if defined(HAVE_FMA)
+  } else if (!strcmp(dotproduct.c_str(), "fma")) {
+    // FMA selected by config variable. The integer matrix implementation
+    // that is currently active is kept.
+    SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
+    dotproduct_method = "fma";
+#endif
+#if defined(HAVE_SSE4_1)
+  } else if (!strcmp(dotproduct.c_str(), "sse")) {
+    // SSE selected by config variable.
+    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
+    dotproduct_method = "sse";
+#endif
+  } else if (!strcmp(dotproduct.c_str(), "std::inner_product")) {
+    // std::inner_product selected by config variable.
+    SetDotProduct(DotProductStdInnerProduct);
+    dotproduct_method = "std::inner_product";
+  } else {
+    // Unsupported value of config variable.
+    tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
+            dotproduct.c_str());
+    // List every value accepted by the branches above for this build.
+    tprintf(
+        "Support values for dotproduct: auto generic native"
+#if defined(HAVE_AVX2)
+        " avx2"
+#endif
+#if defined(HAVE_AVX)
+        " avx"
+#endif
+#if defined(HAVE_FMA)
+        " fma"
+#endif
+#if defined(HAVE_SSE4_1)
+        " sse"
+#endif
+        " std::inner_product.\n");
+  }
+
+  dotproduct.set_value(dotproduct_method);
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.h
@ -0,0 +1,87 @@
+///////////////////////////////////////////////////////////////////////
+// File:        simddetect.h
+// Description: Architecture detector.
+// Author:      Stefan Weil (based on code from Ray Smith)
+//
+// (C) Copyright 2014, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+#ifndef TESSERACT_ARCH_SIMDDETECT_H_
+#define TESSERACT_ARCH_SIMDDETECT_H_
+
+#include <tesseract/export.h>
+
+namespace tesseract {
+
+// Function pointer for best calculation of dot product.
+using DotProductFunction = double (*)(const double *, const double *, int);
+extern DotProductFunction DotProduct;
+
+// Architecture detector. Add code here to detect any other architectures for
+// SIMD-based faster dot product functions. Intended to be a single static
+// object, but it does no real harm to have more than one.
+// All state is held in static members, so the Is...Available() queries are
+// static functions.
+class SIMDDetect {
+public:
+  // Returns true if AVX is available on this system.
+  static inline bool IsAVXAvailable() {
+    return detector.avx_available_;
+  }
+  // Returns true if AVX2 (integer support) is available on this system.
+  static inline bool IsAVX2Available() {
+    return detector.avx2_available_;
+  }
+  // Returns true if AVX512 Foundation (float) is available on this system.
+  static inline bool IsAVX512FAvailable() {
+    return detector.avx512F_available_;
+  }
+  // Returns true if AVX512 integer is available on this system.
+  static inline bool IsAVX512BWAvailable() {
+    return detector.avx512BW_available_;
+  }
+  // Returns true if FMA is available on this system.
+  static inline bool IsFMAAvailable() {
+    return detector.fma_available_;
+  }
+  // Returns true if SSE4.1 is available on this system.
+  static inline bool IsSSEAvailable() {
+    return detector.sse_available_;
+  }
+  // Returns true if NEON is available on this system.
+  static inline bool IsNEONAvailable() {
+    return detector.neon_available_;
+  }
+
+  // Update settings after config variable was set.
+  static TESS_API void Update();
+
+private:
+  // Constructor, must set all static member variables.
+  SIMDDetect();
+
+private:
+  // Singleton.
+  static SIMDDetect detector;
+  // NOTE(review): every flag is declared here for all architectures;
+  // presumably only the ones matching the target are defined in the .cpp -
+  // confirm before using a query for a foreign architecture.
+  // If true, then AVX has been detected.
+  static TESS_API bool avx_available_;
+  // If true, then AVX2 has been detected.
+  static TESS_API bool avx2_available_;
+  // If true, then AVX512 Foundation / AVX512 BW has been detected.
+  static TESS_API bool avx512F_available_;
+  static TESS_API bool avx512BW_available_;
+  // If true, then FMA has been detected.
+  static TESS_API bool fma_available_;
+  // If true, then SSE4.1 has been detected.
+  static TESS_API bool sse_available_;
+  // If true, then NEON has been detected.
+  static TESS_API bool neon_available_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_ARCH_SIMDDETECT_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/adaptions.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/adaptions.cpp
@ -0,0 +1,120 @@
+/**********************************************************************
+ * File:        adaptions.cpp  (Formerly adaptions.c)
+ * Description: Functions used to adapt to blobs already confidently
+ *              identified
+ * Author:      Chris Newton
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cctype>
+#include <cstring>
+#include "control.h"
+#include "reject.h"
+#include "stopper.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+namespace tesseract {
+// Decides whether the classifier may adapt to the given word.
+// mode is a set of bits whose positions are given by MODES below:
+//  - ADAPTABLE_WERD / ACCEPTABLE_WERD select which flags of the word can
+//    qualify it for adaption,
+//  - the CHECK_* bits enable additional tests that can veto the adaption.
+// A mode of 0 disables adaption. Returns true if the word qualifies and no
+// enabled test rejects it.
+bool Tesseract::word_adaptable( // should we adapt?
+    WERD_RES *word, uint16_t mode) {
+  // Bit positions within mode.
+  enum MODES {
+    ADAPTABLE_WERD,
+    ACCEPTABLE_WERD,
+    CHECK_DAWGS,
+    CHECK_SPACES,
+    CHECK_ONE_ELL_CONFLICT,
+    CHECK_AMBIG_WERD
+  };
+
+  if (tessedit_adaption_debug) {
+    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
+            word->best_choice->unichar_string().c_str(), word->best_choice->rating(),
+            word->best_choice->certainty());
+  }
+
+  // 0: NO adaption
+  if (mode == 0) {
+    if (tessedit_adaption_debug) {
+      tprintf("adaption disabled\n");
+    }
+    return false;
+  }
+
+  const std::bitset<16> flags(mode);
+  bool status = false;
+
+  // Qualification: either enabled flag of the word is enough.
+  if (flags[ADAPTABLE_WERD]) {
+    status = status || word->tess_would_adapt; // result of Classify::AdaptableWord()
+    if (tessedit_adaption_debug && !status) {
+      tprintf("tess_would_adapt bit is false\n");
+    }
+  }
+  if (flags[ACCEPTABLE_WERD]) {
+    status = status || word->tess_accepted;
+    if (tessedit_adaption_debug && !status) {
+      tprintf("tess_accepted bit is false\n");
+    }
+  }
+  if (!status) {
+    return false; // Not qualified, so the remaining checks are irrelevant.
+  }
+
+  // Vetoes: each enabled check can still reject the word.
+  if (flags[CHECK_DAWGS]) {
+    const bool from_dawg = (word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
+                           (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
+                           (word->best_choice->permuter() == USER_DAWG_PERM) ||
+                           (word->best_choice->permuter() == NUMBER_PERM);
+    if (!from_dawg) {
+      if (tessedit_adaption_debug) {
+        tprintf("word not in dawgs\n");
+      }
+      return false;
+    }
+  }
+
+  if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict(word, false)) {
+    if (tessedit_adaption_debug) {
+      tprintf("word has ell conflict\n");
+    }
+    return false;
+  }
+
+  if (flags[CHECK_SPACES]) {
+    const char *space = strchr(word->best_choice->unichar_string().c_str(), ' ');
+    if (space != nullptr) {
+      if (tessedit_adaption_debug) {
+        tprintf("word contains spaces\n");
+      }
+      return false;
+    }
+  }
+
+  if (flags[CHECK_AMBIG_WERD] && word->best_choice->dangerous_ambig_found()) {
+    if (tessedit_adaption_debug) {
+      tprintf("word is ambiguous\n");
+    }
+    return false;
+  }
+
+  if (tessedit_adaption_debug) {
+    tprintf("returning status %d\n", status);
+  }
+  return status;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/applybox.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/applybox.cpp
@ -0,0 +1,781 @@
+/**********************************************************************
+ * File:        applybox.cpp  (Formerly applybox.c)
+ * Description: Re segment rows according to box file data
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef DISABLED_LEGACY_ENGINE
+#  include <allheaders.h>
+#  include <cctype>
+#  include <cerrno>
+#  include <cstring>
+#  include "boxread.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+#include <tesseract/unichar.h>
+#include "pageres.h"
+#include "tesseractclass.h"
+#include "unicharset.h"
+
+#ifndef DISABLED_LEGACY_ENGINE
+/** Max number of blobs to classify together in FindSegmentation. */
+const int kMaxGroupSize = 4;
+/// Max fraction of median allowed as deviation in xheight before switching
+/// to median. PreenXHeights multiplies this by the median xheight to get the
+/// largest deviation a row may have and still keep its own xheight.
+const double kMaxXHeightDeviationFraction = 0.125;
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * The box file is assumed to contain box definitions, one per line, of the
+ * following format for blob-level boxes:
+ * @verbatim
+ *   <UTF8 str> <left> <bottom> <right> <top> <page id>
+ * @endverbatim
+ * and for word/line-level boxes:
+ * @verbatim
+ *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+ * @endverbatim
+ * NOTES:
+ * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
+ *
+ * <page id> is 0-based, and the page number is used for multipage input (tiff).
+ *
+ * In the blob-level form, each line represents a recognizable unit, which may
+ * be several UTF-8 bytes, but there is a bounding box around each recognizable
+ * unit, and no classifier is needed to train in this mode (bootstrapping.)
+ *
+ * In the word/line-level form, the line begins with the literal "WordStr", and
+ * the bounding box bounds either a whole line or a whole word. The recognizable
+ * units in the word/line are listed after the # at the end of the line and
+ * are space delimited, ignoring any original spaces on the line.
+ * Eg.
+ * @verbatim
+ * word -> #w o r d
+ * multi word line -> #m u l t i w o r d l i n e
+ * @endverbatim
+ * The recognizable units must be space-delimited in order to allow multiple
+ * unicodes to be used for a single recognizable unit, eg Hindi.
+ *
+ * In this mode, the classifier must have been pre-trained with the desired
+ * character set, or it will not be able to find the character segmentations.
+ */
+
+namespace tesseract {
+
+#ifndef DISABLED_LEGACY_ENGINE
+// Sets the text of every word in every row of every block of block_list to
+// the empty string.
+static void clear_any_old_text(BLOCK_LIST *block_list) {
+  BLOCK_IT blocks(block_list);
+  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
+    ROW_IT rows(blocks.data()->row_list());
+    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
+      WERD_IT words(rows.data()->word_list());
+      words.mark_cycle_pt();
+      while (!words.cycled_list()) {
+        words.data()->set_text("");
+        words.forward();
+      }
+    }
+  }
+}
+
+// Applies the box file that belongs to the image name filename, and
+// resegments the words in the block_list (page) to match it:
+// blob-mode: one blob per line in the box file, words as input.
+// word/line-mode: one blob per space-delimited unit after the #, and one word
+// per line in the box file. (See comment above for box file format.)
+// With find_segmentation true (word/line mode) the classifier is used to
+// re-segment words/lines to match the space-delimited truth string of each
+// box. The input box may then cover a word or even a whole text line, and
+// the output words will contain multiple blobs corresponding to the
+// space-delimited input string.
+// With find_segmentation false no classifier is needed, but the chopper can
+// still be used to correctly segment touching characters with the help of
+// the input boxes.
+// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
+// from normal classification, ie. with a word, chopped_word, rebuild_word,
+// seam_array, denorm, box_word, and best_state, but NO best_choice or
+// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
+// Instead, the correct_text member of WERD_RES is set, and this may be later
+// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
+// is not required before calling ApplyBoxTraining.
+// Returns nullptr if the boxes cannot be read.
+PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
+                                BLOCK_LIST *block_list) {
+  std::vector<TBOX> boxes;
+  std::vector<std::string> texts, full_texts;
+  const bool boxes_read =
+      ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr);
+  if (!boxes_read) {
+    return nullptr; // Can't do it.
+  }
+
+  const int box_count = boxes.size();
+  int box_failures = 0;
+
+  // In word mode, we use the boxes to make a word for each box, but
+  // in blob mode we use the existing words and maximally chop them first.
+  PAGE_RES *page_res = nullptr;
+  if (!find_segmentation) {
+    page_res = SetupApplyBoxes(boxes, block_list);
+  }
+  clear_any_old_text(block_list);
+
+  for (int i = 0; i < box_count; i++) {
+    // Neighbouring boxes, or nullptr at either end of the list.
+    TBOX *next_box = (i + 1 == box_count) ? nullptr : &boxes[i + 1];
+    bool foundit;
+    if (page_res == nullptr) {
+      foundit = ResegmentWordBox(block_list, boxes[i], next_box, texts[i].c_str());
+    } else {
+      TBOX *prev_box = (i == 0) ? nullptr : &boxes[i - 1];
+      foundit = ResegmentCharBox(page_res, prev_box, boxes[i], next_box, full_texts[i].c_str());
+    }
+    if (foundit) {
+      continue;
+    }
+    box_failures++;
+    ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
+  }
+
+  if (page_res == nullptr) {
+    // In word/line mode, we now maximally chop all the words and resegment
+    // them with the classifier.
+    page_res = SetupApplyBoxes(boxes, block_list);
+    ReSegmentByClassification(page_res);
+  }
+  if (applybox_debug > 0) {
+    tprintf("APPLY_BOXES:\n");
+    tprintf("   Boxes read from boxfile:  %6d\n", box_count);
+    if (box_failures > 0) {
+      tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
+    }
+  }
+  TidyUp(page_res);
+  return page_res;
+}
+
+// Helper computes median xheight in the image, i.e. the median of the
+// rounded xheights of all rows of all blocks in block_list.
+// The range of the statistics is taken from the bounding box height of the
+// first block. Returns 0.0 for an empty block list, which has no first
+// block to read that range from.
+static double MedianXHeight(BLOCK_LIST *block_list) {
+  if (block_list->empty()) {
+    // Nothing to measure (e.g. a blank page); data() must not be called on
+    // an iterator over an empty list.
+    return 0.0;
+  }
+  BLOCK_IT block_it(block_list);
+  STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
+    ROW_IT row_it(block_it.data()->row_list());
+    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
+    }
+  }
+  return xheights.median();
+}
+
+/// Replaces the xheight of every row that differs from the median xheight
+/// of the page by more than kMaxXHeightDeviationFraction of that median
+/// with the median itself.
+void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
+  const double median_xheight = MedianXHeight(block_list);
+  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
+  // Visit every row of every block and fix up the outliers.
+  BLOCK_IT b_it(block_list);
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+    ROW_IT r_it(b_it.data()->row_list());
+    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+      ROW *row = r_it.data();
+      const double diff = fabs(row->x_height() - median_xheight);
+      if (!(diff > max_deviation)) {
+        continue; // Close enough to the median; keep the row's own value.
+      }
+      if (applybox_debug) {
+        tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
+      }
+      row->set_x_height(static_cast<float>(median_xheight));
+    }
+  }
+}
+
+/// Builds the PAGE_RES that ApplyBoxes works on from the block_list:
+/// row xheights are preened, words without blobs are deleted, the fuzzy
+/// space flags of the remaining words are cleared, and every word of the
+/// new PAGE_RES is maximally chopped. The caller receives the new PAGE_RES.
+PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
+  PreenXHeights(block_list);
+  // Strip all fuzzy space markers to simplify the PAGE_RES.
+  BLOCK_IT b_it(block_list);
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+    ROW_IT r_it(b_it.data()->row_list());
+    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+      WERD_IT w_it(r_it.data()->word_list());
+      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+        WERD *word = w_it.data();
+        if (!word->cblob_list()->empty()) {
+          word->set_flag(W_FUZZY_SP, false);
+          word->set_flag(W_FUZZY_NON, false);
+          continue;
+        }
+        // A word without blobs is removed from the row and destroyed.
+        delete w_it.extract();
+      }
+    }
+  }
+  PAGE_RES *page_res = new PAGE_RES(false, block_list, nullptr);
+  PAGE_RES_IT pr_it(page_res);
+  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.word()) {
+    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
+    pr_it.forward();
+  }
+  return page_res;
+}
+
+/// Tests the chopper by exhaustively running chop_one_blob.
+/// The word_res will contain filled chopped_word, seam_array, denorm,
+/// box_word and best_state for the maximally chopped word.
+/// boxes is passed through to chop_one_blob; block and row are passed to
+/// SetupForRecognition. If that setup fails, the word is only cloned to its
+/// rebuild form and no chopping is attempted.
+void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
+                                  WERD_RES *word_res) {
+  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
+                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,
+                                     poly_allow_detailed_fx, row, block)) {
+    word_res->CloneChoppedToRebuild();
+    return;
+  }
+  if (chop_debug) {
+    tprintf("Maximally chopping word at:");
+    word_res->word->bounding_box().print();
+  }
+  // One fake choice per blob, kept in blob order.
+  std::vector<BLOB_CHOICE *> blob_choices;
+  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
+  auto rating = static_cast<float>(INT8_MAX);
+  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
+    // The rating and certainty are not quite arbitrary. Since
+    // select_blob_to_chop uses the worst certainty to choose, they all have
+    // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
+    // in here, and then divide by e each time they are chopped, which
+    // should guarantee a set of unequal values for the whole tree of blobs
+    // produced, however much chopping is required. The chops are thus only
+    // limited by the ability of the chopper to find suitable chop points,
+    // and not by the value of the certainties.
+    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
+    blob_choices.push_back(choice);
+    rating -= 0.125f;
+  }
+  const double e = exp(1.0); // The base of natural logs.
+  int blob_number;
+  int right_chop_index = 0;
+  if (!assume_fixed_pitch_char_segment) {
+    // We only chop if the language is not fixed pitch like CJK.
+    SEAM *seam = nullptr;
+    // Keep chopping until chop_one_blob finds nothing more to chop.
+    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
+      word_res->InsertSeam(blob_number, seam);
+      // The chopped blob keeps its choice, with the rating divided by e.
+      BLOB_CHOICE *left_choice = blob_choices[blob_number];
+      rating = left_choice->rating() / e;
+      left_choice->set_rating(rating);
+      left_choice->set_certainty(-rating);
+      // combine confidence w/ serial #
+      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
+                                           0.0f, 0.0f, BCC_FAKE);
+      // The new right-hand piece goes directly after the blob it came from.
+      blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
+    }
+  }
+  word_res->CloneChoppedToRebuild();
+  // NOTE(review): the BLOB_CHOICE objects are presumably handed over to
+  // word_res by FakeClassifyWord - confirm the ownership.
+  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
+}
+
+/// Computes the metric used to resolve a disputed blob.
+/// The goal is to hand each blob to the most appropriate boxfile box.
+/// Usually the owner is obvious, but not when two boxfile boxes overlap
+/// significantly. If both a small and a large boxfile box take most of the
+/// blob, the small one should win, unless the small box is much smaller
+/// than the blob, in which case it should not.
+/// Definition: for a box of area A, a blob of area B and an overlap of
+/// area C, the miss metric is (A-C)(B-C)/(AB); the box with the minimum
+/// miss metric gets the blob.
+/// Both boxes must have non-zero area (enforced by ASSERT_HOST).
+static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
+  const int area1 = box1.area();
+  const int area2 = box2.area();
+  const int common_area = box1.intersection(box2).area();
+  ASSERT_HOST(area1 != 0 && area2 != 0);
+  // Work in double from the first factor on so the product cannot overflow.
+  const double missed1 = 1.0 * (area1 - common_area);
+  return missed1 * (area2 - common_area) / area1 / area2;
+}
+
+/// Gather consecutive blobs that match the given box into the best_state
+/// and corresponding correct_text.
+///
+/// Fights over which box owns which blobs are settled by pre-chopping and
+/// applying the blobs to box or next_box with the least non-overlap.
+///
+/// @param page_res the page whose words are searched for a match.
+/// @param prev_box the preceding boxfile box, or nullptr if there is none;
+/// only used in the final overlap sanity check.
+/// @param box the boxfile box to find blobs for.
+/// @param next_box the following boxfile box, or nullptr if there is none;
+/// competes with box for each candidate blob.
+/// @param correct_text the label stored for the merged blobs.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an appropriate blob for a box.
+///
+/// This means that occasionally, blobs may be incorrectly segmented if the
+/// chopper fails to find a suitable chop point.
+bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
+                                 const TBOX *next_box, const char *correct_text) {
+  if (applybox_debug > 1) {
+    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
+  }
+  PAGE_RES_IT page_res_it(page_res);
+  WERD_RES *word_res;
+  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
+    // Only words whose bounding box mostly overlaps the target box qualify.
+    if (!word_res->box_word->bounding_box().major_overlap(box)) {
+      continue;
+    }
+    if (applybox_debug > 1) {
+      tprintf("Checking word box:");
+      word_res->box_word->bounding_box().print();
+    }
+    int word_len = word_res->box_word->length();
+    // Try every start index i; from each, greedily extend a run of blobs
+    // that belong to this box.
+    for (int i = 0; i < word_len; ++i) {
+      TBOX char_box = TBOX();
+      int blob_count = 0;
+      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
+        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
+        if (!blob_box.major_overlap(box)) {
+          break;
+        }
+        if (word_res->correct_text[i + blob_count].length() > 0) {
+          break; // Blob is claimed already.
+        }
+        if (next_box != nullptr) {
+          // Settle the dispute between this box and the next one.
+          const double current_box_miss_metric = BoxMissMetric(blob_box, box);
+          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
+          if (applybox_debug > 2) {
+            tprintf("Checking blob:");
+            blob_box.print();
+            tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
+                    next_box_miss_metric);
+          }
+          if (current_box_miss_metric > next_box_miss_metric) {
+            break; // Blob is a better match for next box.
+          }
+        }
+        // Accumulate the union of the accepted blob boxes.
+        char_box += blob_box;
+      }
+      if (blob_count > 0) {
+        if (applybox_debug > 1) {
+          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
+        }
+        // Reject the box when the gathered blobs do not reproduce it and it
+        // also collides with a neighbouring box.
+        // NOTE(review): presumably a negative x_gap means horizontal overlap
+        // and 3 is a pixel tolerance - confirm against TBOX.
+        if (!char_box.almost_equal(box, 3) &&
+            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
+             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
+          return false;
+        }
+        // We refine just the box_word, best_state and correct_text here.
+        // The rebuild_word is made in TidyUp.
+        // blob_count blobs are put together to match the box. Merge the
+        // box_word boxes, save the blob_count in the state and the text.
+        word_res->box_word->MergeBoxes(i, i + blob_count);
+        word_res->best_state[i] = blob_count;
+        word_res->correct_text[i] = correct_text;
+        if (applybox_debug > 2) {
+          tprintf("%d Blobs match: blob box:", blob_count);
+          word_res->box_word->BlobBox(i).print();
+          tprintf("Matches box:");
+          box.print();
+          if (next_box != nullptr) {
+            tprintf("With next box:");
+            next_box->print();
+          }
+        }
+        // Eliminated best_state and correct_text entries for the consumed
+        // blobs.
+        // Always erasing index i + 1 removes the blob_count - 1 entries that
+        // followed i, since each erase shifts the rest down by one.
+        for (int j = 1; j < blob_count; ++j) {
+          word_res->best_state.erase(word_res->best_state.begin() + i + 1);
+          word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
+        }
+        // Assume that no box spans multiple source words, so we are done with
+        // this box.
+        if (applybox_debug > 1) {
+          tprintf("Best state = ");
+          for (auto best_state : word_res->best_state) {
+            tprintf("%d ", best_state);
+          }
+          tprintf("\n");
+          tprintf("Correct text = [[ ");
+          for (auto &it : word_res->correct_text) {
+            tprintf("%s ", it.c_str());
+          }
+          tprintf("]]\n");
+        }
+        return true;
+      }
+    }
+  }
+  if (applybox_debug > 0) {
+    tprintf("FAIL!\n");
+  }
+  return false; // Failure.
+}
+
+/// Consume all source blobs that strongly overlap the given box,
+/// putting them into a new word, with the correct_text label.
+/// Fights over which box owns which blobs are settled by
+/// applying the blobs to box or next_box with the least non-overlap.
+///
+/// @param block_list the blocks whose rows and words are searched.
+/// @param box the boxfile box to collect blobs for.
+/// @param next_box the following boxfile box, or nullptr if there is none;
+/// competes with box for each candidate blob.
+/// @param correct_text the text set on the newly created word.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an overlapping blob for a box.
+bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
+                                 const char *correct_text) {
+  if (applybox_debug > 1) {
+    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
+  }
+  // Created lazily on the first matching blob; stays nullptr on failure.
+  WERD *new_word = nullptr;
+  BLOCK_IT b_it(block_list);
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+    BLOCK *block = b_it.data();
+    if (!box.major_overlap(block->pdblk.bounding_box())) {
+      continue;
+    }
+    ROW_IT r_it(block->row_list());
+    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+      ROW *row = r_it.data();
+      if (!box.major_overlap(row->bounding_box())) {
+        continue;
+      }
+      WERD_IT w_it(row->word_list());
+      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+        WERD *word = w_it.data();
+        if (applybox_debug > 2) {
+          tprintf("Checking word:");
+          word->bounding_box().print();
+        }
+        // Words that already carry text were produced by an earlier box
+        // (including new_word itself once it has been appended below).
+        if (word->text() != nullptr && word->text()[0] != '\0') {
+          continue; // Ignore words that are already done.
+        }
+        if (!box.major_overlap(word->bounding_box())) {
+          continue;
+        }
+        C_BLOB_IT blob_it(word->cblob_list());
+        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+          C_BLOB *blob = blob_it.data();
+          TBOX blob_box = blob->bounding_box();
+          if (!blob_box.major_overlap(box)) {
+            continue;
+          }
+          if (next_box != nullptr) {
+            // Settle the dispute between this box and the next one.
+            const double current_box_miss_metric = BoxMissMetric(blob_box, box);
+            const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
+            if (applybox_debug > 2) {
+              tprintf("Checking blob:");
+              blob_box.print();
+              tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
+                      next_box_miss_metric);
+            }
+            if (current_box_miss_metric > next_box_miss_metric) {
+              continue; // Blob is a better match for next box.
+            }
+          }
+          if (applybox_debug > 2) {
+            tprintf("Blob match: blob:");
+            blob_box.print();
+            tprintf("Matches box:");
+            box.print();
+            if (next_box != nullptr) {
+              tprintf("With next box:");
+              next_box->print();
+            }
+          }
+          if (new_word == nullptr) {
+            // Make a new word with a single blob.
+            new_word = word->shallow_copy();
+            new_word->set_text(correct_text);
+            // Appended to the word list of the row currently being walked.
+            w_it.add_to_end(new_word);
+          }
+          // Move the blob out of its source word and into the new word.
+          C_BLOB_IT new_blob_it(new_word->cblob_list());
+          new_blob_it.add_to_end(blob_it.extract());
+        }
+      }
+    }
+  }
+  if (new_word == nullptr && applybox_debug > 0) {
+    tprintf("FAIL!\n");
+  }
+  return new_word != nullptr;
+}
+
+/// Runs the classifier over every labelled word of the page to find the
+/// segmentation that yields the word's required string.
+/// Words without text are left untouched; words whose text cannot be
+/// converted to unichar ids, or for which no segmentation is found, are
+/// reported and deleted from page_res.
+void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES *word_res;
+  while ((word_res = pr_it.word()) != nullptr) {
+    const WERD *word = word_res->word;
+    const bool has_text = word->text() != nullptr && word->text()[0] != '\0';
+    if (has_text) {
+      // The truth text as a sequence of UNICHAR_IDs.
+      std::vector<UNICHAR_ID> target_text;
+      if (!ConvertStringToUnichars(word->text(), &target_text)) {
+        tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
+        pr_it.DeleteCurrentWord();
+      } else if (!FindSegmentation(target_text, word_res)) {
+        tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
+        pr_it.DeleteCurrentWord();
+      }
+    }
+    pr_it.forward();
+  }
+}
+
+/// Translates a space-delimited utf8 string into UNICHAR_IDs, appending one
+/// id per token to class_ids.
+/// @param utf8 nul-terminated text whose tokens are separated by spaces.
+/// @param class_ids receives the ids; ids appended before a failure remain.
+/// @return false if an invalid UNICHAR_ID is encountered.
+bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
+  while (*utf8 != '\0') {
+    // The token runs up to the next space, or to the end of the string.
+    const char *token_end = strchr(utf8, ' ');
+    if (token_end == nullptr) {
+      token_end = utf8 + strlen(utf8);
+    }
+    const int token_length = token_end - utf8;
+    const UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, token_length);
+    if (class_id == INVALID_UNICHAR_ID) {
+      return false;
+    }
+    class_ids->push_back(class_id);
+    // Step over the token and any run of spaces that follows it.
+    utf8 = token_end;
+    while (*utf8 == ' ') {
+      ++utf8;
+    }
+  }
+  return true;
+}
+
+/// Resegments the word to achieve the target_text from the classifier.
+/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
+/// applies a full search on the classifier results to find the best classified
+/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+/// substitutions ARE used.
+/// If the search finds nothing, the original segmentation implied by the
+/// seam_array is used, provided it has as many pieces as target_text.
+/// @param target_text the UNICHAR_IDs the word must be segmented into.
+/// @param word_res the word to resegment; on success its best_state and
+/// correct_text are replaced, on failure best_state is left empty.
+/// @return false if the re-segmentation fails.
+bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
+  // Classify all required combinations of blobs and save results in choices.
+  // choices[i][j - 1] holds the result for j blobs starting at blob i. The
+  // outer vector owns the per-position vectors, so there is no array to free
+  // by hand; the BLOB_CHOICE_LISTs themselves are still deleted below.
+  const int word_length = word_res->box_word->length();
+  std::vector<std::vector<BLOB_CHOICE_LIST *>> choices(word_length > 0 ? word_length : 0);
+  for (int i = 0; i < word_length; ++i) {
+    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
+      BLOB_CHOICE_LIST *match_result =
+          classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
+                         word_res->blamer_bundle);
+      if (applybox_debug > 2) {
+        tprintf("%d+%d:", i, j);
+        print_ratings_list("Segment:", match_result, unicharset);
+      }
+      choices[i].push_back(match_result);
+    }
+  }
+  // Search the segmentation graph for the target text. Must be an exact
+  // match. Using wildcards makes it difficult to find the correct
+  // segmentation even when it is there.
+  word_res->best_state.clear();
+  // SearchForText unconditionally reads choices[0] and target_text[0], so it
+  // must not be entered when either is empty.
+  if (word_length > 0 && !target_text.empty()) {
+    std::vector<int> search_segmentation;
+    float best_rating = 0.0f;
+    SearchForText(choices.data(), 0, word_length, target_text, 0, 0.0f, &search_segmentation,
+                  &best_rating, &word_res->best_state);
+  }
+  for (auto &position_choices : choices) {
+    for (auto *choice : position_choices) {
+      delete choice;
+    }
+  }
+  if (word_res->best_state.empty()) {
+    // Build the original segmentation and if it is the same length as the
+    // truth, assume it will do.
+    int blob_count = 1;
+    for (auto s : word_res->seam_array) {
+      SEAM *seam = s;
+      if (!seam->HasAnySplits()) {
+        word_res->best_state.push_back(blob_count);
+        blob_count = 1;
+      } else {
+        ++blob_count;
+      }
+    }
+    word_res->best_state.push_back(blob_count);
+    if (word_res->best_state.size() != target_text.size()) {
+      word_res->best_state.clear(); // No good. Original segmentation bad size.
+      return false;
+    }
+  }
+  word_res->correct_text.clear();
+  for (auto &text : target_text) {
+    word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
+  }
+  return true;
+}
+
+/// Recursive helper to find a match to the target_text (from text_index
+/// position) in the choices (from choices_pos position).
+/// @param choices is an array of vectors of length choices_length,
+/// with each element representing a starting position in the word, and the
+/// #vector holding classification results for a sequence of consecutive
+/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+/// @param choices_pos index into choices at which this call starts matching.
+/// @param choices_length number of positions in choices; a match is only
+/// complete when the matched groups end exactly at this position.
+/// @param target_text the UNICHAR_IDs that must be matched.
+/// @param text_index index of the target_text element matched by this call.
+/// @param rating sum of the ratings of the choices matched before this call.
+/// @param segmentation working stack of group lengths chosen so far; each
+/// call pushes its candidate length and pops it again before returning.
+/// @param best_rating in/out: total rating of the best complete match so far.
+/// @param best_segmentation in/out: group lengths of the best complete match
+/// so far; empty while no complete match has been found.
+void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
+                              int choices_length, const std::vector<UNICHAR_ID> &target_text,
+                              int text_index, float rating, std::vector<int> *segmentation,
+                              float *best_rating, std::vector<int> *best_segmentation) {
+  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
+  // Try every available group size starting at choices_pos.
+  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
+    // Rating of matching choice or worst choice if no match.
+    float choice_rating = 0.0f;
+    // Find the corresponding best BLOB_CHOICE.
+    // Every break below leaves choice_it on the matching choice, which is
+    // how a match is told apart from running off the end of the list.
+    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      const BLOB_CHOICE *choice = choice_it.data();
+      choice_rating = choice->rating();
+      UNICHAR_ID class_id = choice->unichar_id();
+      if (class_id == target_text[text_index]) {
+        break;
+      }
+      // Search ambigs table.
+      if (class_id < table.size() && table[class_id] != nullptr) {
+        AmbigSpec_IT spec_it(table[class_id]);
+        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
+          const AmbigSpec *ambig_spec = spec_it.data();
+          // We'll only do 1-1.
+          // NOTE(review): presumably wrong_ngram is terminated by
+          // INVALID_UNICHAR_ID, so this selects single-character ambigs.
+          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
+              ambig_spec->correct_ngram_id == target_text[text_index]) {
+            break;
+          }
+        }
+        if (!spec_it.cycled_list()) {
+          break; // Found an ambig.
+        }
+      }
+    }
+    if (choice_it.cycled_list()) {
+      continue; // No match.
+    }
+    segmentation->push_back(length);
+    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
+      // This is a complete match. If the rating is good record a new best.
+      if (applybox_debug > 2) {
+        tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
+                rating + choice_rating, *best_rating, segmentation->size(),
+                best_segmentation->size());
+      }
+      // Lower total rating wins; the first complete match is always kept.
+      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
+        *best_segmentation = *segmentation;
+        *best_rating = rating + choice_rating;
+      }
+    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
+      // Both blobs and text remain: match the rest recursively.
+      if (applybox_debug > 3) {
+        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
+                unicharset.id_to_unichar(target_text[text_index]),
+                choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
+                choices_pos, length);
+      }
+      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
+                    rating + choice_rating, segmentation, best_rating, best_segmentation);
+      if (applybox_debug > 3) {
+        tprintf("End recursion for %d=%s\n", target_text[text_index],
+                unicharset.id_to_unichar(target_text[text_index]));
+      }
+    }
+    // Undo the push above before trying the next group size.
+    segmentation->resize(segmentation->size() - 1);
+  }
+}
+
+/// - Counts up the labelled words and the blobs within.
+/// - Deletes all unused or emptied words, counting the unused ones.
+/// - Resets W_BOL and W_EOL flags correctly.
+/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
+/// @param page_res the page to tidy; words without any labelled blob are
+/// deleted from it.
+void Tesseract::TidyUp(PAGE_RES *page_res) {
+  int ok_blob_count = 0;
+  int bad_blob_count = 0;
+  int ok_word_count = 0;
+  int unlabelled_words = 0;
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES *word_res;
+  // First pass: give every labelled word a fake best_choice and delete the
+  // words that have no labelled blob at all.
+  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+    int ok_in_word = 0;
+    int blob_count = word_res->correct_text.size();
+    auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
+    word_choice->set_permuter(TOP_CHOICE_PERM);
+    for (int c = 0; c < blob_count; ++c) {
+      if (word_res->correct_text[c].length() > 0) {
+        ++ok_in_word;
+      }
+      // Since we only need a fake word_res->best_choice, the actual
+      // unichar_ids do not matter. Which is fortunate, since TidyUp()
+      // can be called while training Tesseract, at the stage where
+      // unicharset is not meaningful yet.
+      word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
+                                                     1.0f, -1.0f);
+    }
+    if (ok_in_word > 0) {
+      // Count the kept word so the debug summary below reports a real
+      // number of words instead of always printing 0.
+      ++ok_word_count;
+      ok_blob_count += ok_in_word;
+      bad_blob_count += word_res->correct_text.size() - ok_in_word;
+      word_res->LogNewRawChoice(word_choice);
+      word_res->LogNewCookedChoice(1, false, word_choice);
+    } else {
+      ++unlabelled_words;
+      if (applybox_debug > 0) {
+        tprintf("APPLY_BOXES: Unlabelled word at :");
+        word_res->word->bounding_box().print();
+      }
+      pr_it.DeleteCurrentWord();
+      // The choice was never handed over to the word, so free it here.
+      delete word_choice;
+    }
+  }
+  // Second pass over the surviving words.
+  pr_it.restart_page();
+  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+    // Denormalize back to a BoxWord.
+    word_res->RebuildBestState();
+    word_res->SetupBoxWord();
+    // A word begins/ends a line when its neighbour lies in a different row.
+    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
+    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
+  }
+  if (applybox_debug > 0) {
+    tprintf("   Found %d good blobs.\n", ok_blob_count);
+    if (bad_blob_count > 0) {
+      tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
+    }
+    if (unlabelled_words > 0) {
+      tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
+    }
+  }
+}
+
+/**
+ * Logs a bad box, identified by its line in the box file and its coords.
+ * @param boxfile_lineno zero-based line index; printed one-based.
+ * @param box the offending box.
+ * @param box_ch the text of the box.
+ * @param err_msg description of what went wrong.
+ */
+void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
+                                const char *err_msg) {
+  const int reported_line = boxfile_lineno + 1;
+  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", reported_line, box_ch,
+          box.left(), box.bottom(), box.right(), box.top(), err_msg);
+}
+
+/// Feeds every word of the page to #LearnWord, which extracts features for
+/// the labelled blobs within the word.
+/// Features are stored in an internal buffer.
+/// @param fontname font name passed on to #LearnWord.
+/// @param page_res the page whose words are learned.
+void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  int word_count = 0;
+  WERD_RES *word_res = pr_it.word();
+  while (word_res != nullptr) {
+    LearnWord(fontname.c_str(), word_res);
+    ++word_count;
+    word_res = pr_it.forward();
+  }
+  tprintf("Generated training data for %d words\n", word_count);
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * Gives each WERD_RES of the page a fake best_choice built from its
+ * correct_text, replacing any word choices it already had.
+ */
+void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES *word_res = pr_it.word();
+  while (word_res != nullptr) {
+    auto &truth = word_res->correct_text;
+    auto *choice = new WERD_CHOICE(word_res->uch_set, truth.size());
+    for (size_t i = 0; i < truth.size(); ++i) {
+      // Only the part before the first space is the real ground truth; the
+      // remainder is the bounding box location and page number.
+      std::vector<std::string> tokens = split(truth[i], ' ');
+      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
+      choice->append_unichar_id_space_allocated(char_id, word_res->best_state[i], 0.0f, 0.0f);
+    }
+    word_res->ClearWordChoices();
+    word_res->LogNewRawChoice(choice);
+    word_res->LogNewCookedChoice(1, false, choice);
+    word_res = pr_it.forward();
+  }
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/control.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/control.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/control.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/control.h
@ -0,0 +1,37 @@
+/**********************************************************************
+ * File:        control.h  (Formerly control.h)
+ * Description: Module-independent matcher controller.
+ * Author:      Ray Smith
+ * Created:     Thu Apr 23 11:09:58 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+/**
+ * @file control.h
+ * Module-independent matcher controller.
+ */
+
+#ifndef CONTROL_H
+#define CONTROL_H
+
+/// Categories used to classify a word string by its capitalisation and
+/// abbreviation pattern; AC_UNACCEPTABLE marks a string in no category.
+enum ACCEPTABLE_WERD_TYPE {
+  AC_UNACCEPTABLE, ///< Unacceptable word
+  AC_LOWER_CASE,   ///< All characters lower case
+  AC_UPPER_CASE,   ///< All characters upper case
+  AC_INITIAL_CAP,  ///< All lower case except the initial character
+  AC_LC_ABBREV,    ///< Lower case abbreviation, e.g. a.b.c.
+  AC_UC_ABBREV     ///< Upper case abbreviation, e.g. A.B.C.
+};
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.cpp
@ -0,0 +1,932 @@
+/******************************************************************
+ * File:        docqual.cpp  (Formerly docqual.c)
+ * Description: Document Quality Metrics
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "docqual.h"
+#include <cctype>
+#include "reject.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+
+namespace tesseract {
+
+// Blob-match callback: bumps match_count once per call. The index of the
+// matched blob is not needed.
+static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
+  match_count++;
+}
+
+// Blob-match callback: counts the blob at index in match_count, and also in
+// accepted_match_count when the word's reject map has accepted it.
+static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
+                               int index) {
+  const bool accepted = word->reject_map[index].accepted();
+  if (accepted) {
+    accepted_match_count++;
+  }
+  match_count++;
+}
+
+// Blob-match callback: marks the reject map entry at index as accepted on
+// quality grounds, if the entry allows that.
+static void acceptIfGoodQuality(WERD_RES *word, int index) {
+  auto &&entry = word->reject_map[index];
+  if (!entry.accept_if_good_quality()) {
+    return;
+  }
+  entry.setrej_quality_accept();
+}
+
+/*************************************************************************
+ * word_blob_quality()
+ * How many blobs in the box_word are identical to those of the inword?
+ * ASSUME blobs in both initial word and box_word are in ascending order of
+ * left hand blob edge.
+ * Returns 0 when the word has no bln_boxes, no rebuild_word or no blobs.
+ *************************************************************************/
+int16_t Tesseract::word_blob_quality(WERD_RES *word) {
+  int16_t match_count = 0;
+  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+      !word->rebuild_word->blobs.empty()) {
+    // Capture match_count by reference: std::bind would store a copy of it,
+    // so the callback would increment the copy and this function would
+    // always return 0.
+    word->bln_boxes->ProcessMatchedBlobs(
+        *word->rebuild_word,
+        [&match_count](int index) { countMatchingBlobs(match_count, index); });
+  }
+  return match_count;
+}
+
+// Sums count_outline_errs() over the blobs of the rebuild_word, pairing
+// blob number n with byte n of the best choice's unichar string.
+// Returns 0 when the word has no rebuild_word.
+int16_t Tesseract::word_outline_errs(WERD_RES *word) {
+  int16_t total_errs = 0;
+  if (word->rebuild_word == nullptr) {
+    return total_errs;
+  }
+  int16_t char_index = 0;
+  for (int blob_index = 0; blob_index < word->rebuild_word->NumBlobs(); ++blob_index) {
+    TBLOB *blob = word->rebuild_word->blobs[blob_index];
+    const char ch = word->best_choice->unichar_string()[char_index];
+    total_errs += count_outline_errs(ch, blob->NumOutlines());
+    char_index++;
+  }
+  return total_errs;
+}
+
+/*************************************************************************
+ * word_char_quality()
+ * Combination of blob quality and outline quality - how many good chars are
+ * there? - I.e chars which pass the blob AND outline tests.
+ * *match_count receives the number of matched blobs and
+ * *accepted_match_count the number of those that the reject map accepts.
+ * Both are 0 when the word has no bln_boxes, no rebuild_word or no blobs.
+ *************************************************************************/
+void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
+                                  int16_t *accepted_match_count) {
+  *match_count = 0;
+  *accepted_match_count = 0;
+  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+      !word->rebuild_word->blobs.empty()) {
+    // The callback must update the caller's counters through the pointers.
+    // std::bind(..., *match_count, *accepted_match_count, _1) would store
+    // copies of the two values and leave both outputs at 0.
+    word->bln_boxes->ProcessMatchedBlobs(
+        *word->rebuild_word, [word, match_count, accepted_match_count](int index) {
+          countAcceptedBlobs(word, *match_count, *accepted_match_count, index);
+        });
+  }
+}
+
+/*************************************************************************
+ * unrej_good_chs()
+ * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
+ * Does nothing when the word has no bln_boxes, no rebuild_word or no blobs.
+ *************************************************************************/
+void Tesseract::unrej_good_chs(WERD_RES *word) {
+  // Require a NON-empty blob list, like word_blob_quality() and
+  // word_char_quality(). Without the negation the callback only ran for
+  // words with no blobs, so nothing was ever unrejected.
+  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+      !word->rebuild_word->blobs.empty()) {
+    using namespace std::placeholders; // for _1
+    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
+                                         std::bind(acceptIfGoodQuality, word, _1));
+  }
+}
+
+// Returns how far outline_count is from the number of outlines expected for
+// character c: 2 for characters in outlines_2, otherwise 1. Characters in
+// outlines_odd are not judged and always score 0.
+int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
+  if (outlines_odd.contains(c)) {
+    return 0; // Don't use this char
+  }
+  const int expected_outline_count = outlines_2.contains(c) ? 2 : 1;
+  return abs(outline_count - expected_outline_count);
+}
+
+// Runs the quality based rejection steps over the page: optional unrejection
+// of good quality words, then document/block rejection, then optional tilde
+// crunching and deletion.
+void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
+  const bool unreject_good_words = tessedit_good_quality_unrej && good_quality_doc;
+  if (unreject_good_words) {
+    unrej_good_quality_words(page_res_it);
+  }
+  doc_and_block_rejection(page_res_it, good_quality_doc);
+  if (!unlv_tilde_crunching) {
+    return;
+  }
+  tilde_crunch(page_res_it);
+  tilde_delete(page_res_it);
+}
+
+/*************************************************************************
+ * unrej_good_quality_words()
+ * Accept potential rejects in words which pass the following checks:
+ *    - Contains a potential reject
+ *    - Word looks like a sensible alpha word.
+ *    - Word segmentation is the same as the original image
+ *    - All characters have the expected number of outlines
+ * NOTE - the rejection counts are recalculated after unrejection
+ *      - CAN'T do it in a single pass without a bit of fiddling
+ *    - keep it simple but inefficient
+ *************************************************************************/
+void Tesseract::unrej_good_quality_words( // unreject potential
+    PAGE_RES_IT &page_res_it) {
+  // Pass 1: unreject where allowed.
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    check_debug_pt(page_res_it.word(), 100);
+    if (bland_unrej) {
+      // Unconditional mode: accept every entry that permits it.
+      WERD_RES *const word = page_res_it.word();
+      for (int pos = 0; pos < word->reject_map.length(); pos++) {
+        if (word->reject_map[pos].accept_if_good_quality()) {
+          word->reject_map[pos].setrej_quality_accept();
+        }
+      }
+      page_res_it.forward();
+    } else {
+      ROW_RES *const row = page_res_it.row();
+      const bool row_good_enough =
+          (row->char_count > 0) &&
+          ((row->rej_count / static_cast<float>(row->char_count)) <= quality_rowrej_pc);
+      if (row_good_enough) {
+        WERD_RES *const word = page_res_it.word();
+        if (word->reject_map.quality_recoverable_rejects() &&
+            (tessedit_unrej_any_wd ||
+             acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
+                                    word->best_choice->unichar_lengths().c_str()) !=
+                 AC_UNACCEPTABLE)) {
+          unrej_good_chs(word);
+        }
+        page_res_it.forward();
+      } else {
+        // Skip to end of dodgy row.
+        while ((page_res_it.word() != nullptr) && (page_res_it.row() == row)) {
+          page_res_it.forward();
+        }
+      }
+    }
+    check_debug_pt(page_res_it.word(), 110);
+  }
+  // Pass 2: recompute the page, block and row rejection statistics.
+  page_res_it.restart_page();
+  page_res_it.page_res->char_count = 0;
+  page_res_it.page_res->rej_count = 0;
+  BLOCK_RES *counted_block = nullptr;
+  ROW_RES *counted_row = nullptr;
+  for (; page_res_it.word() != nullptr; page_res_it.forward()) {
+    // Reset the counters whenever a new block or row is entered.
+    if (page_res_it.block() != counted_block) {
+      counted_block = page_res_it.block();
+      counted_block->char_count = 0;
+      counted_block->rej_count = 0;
+    }
+    if (page_res_it.row() != counted_row) {
+      counted_row = page_res_it.row();
+      counted_row->char_count = 0;
+      counted_row->rej_count = 0;
+      counted_row->whole_word_rej_count = 0;
+    }
+    page_res_it.rej_stat_word();
+  }
+}
+
+/*************************************************************************
+ * doc_and_block_rejection()
+ *
+ * If the page has too many rejects - reject all of it.
+ * If any block has too many rejects - reject all words in the block
+ *************************************************************************/
+
+// Rejects large regions whose reject ratio is too high.
+//
+// page_res_it:      iterator over the page results; it is restarted and
+//                   advanced by this function.
+// good_quality_doc: when true (and tessedit_row_rej_good_docs is off), words
+//                   on a rejected row are only rejected if their own reject
+//                   fraction exceeds tessedit_good_doc_still_rowrej_wd.
+//
+// Order of tests: whole page first; otherwise each block; otherwise each row
+// inside a block that was not rejected. All ratios are percentages of
+// rej_count over char_count.
+void Tesseract::doc_and_block_rejection( // reject big chunks
+    PAGE_RES_IT &page_res_it, bool good_quality_doc) {
+  int16_t block_no = 0;
+  int16_t row_no = 0;
+  BLOCK_RES *current_block;
+  ROW_RES *current_row;
+
+  bool rej_word;
+  bool prev_word_rejected;
+  int16_t char_quality = 0;
+  int16_t accepted_char_quality;
+
+  // Guard against an empty page (char_count == 0) so the ratio is never
+  // computed with a zero divisor, matching the block and row checks below.
+  if (page_res_it.page_res->char_count > 0 &&
+      page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
+          tessedit_reject_doc_percent) {
+    reject_whole_page(page_res_it);
+    if (tessedit_debug_doc_rejection) {
+      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
+              page_res_it.page_res->rej_count);
+    }
+  } else {
+    if (tessedit_debug_doc_rejection) {
+      tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n", page_res_it.page_res->char_count,
+              page_res_it.page_res->rej_count);
+    }
+
+    /* Walk blocks testing for block rejection */
+
+    page_res_it.restart_page();
+    WERD_RES *word;
+    while ((word = page_res_it.word()) != nullptr) {
+      current_block = page_res_it.block();
+      block_no = current_block->block->pdblk.index();
+      if (current_block->char_count > 0 &&
+          (current_block->rej_count * 100.0 / current_block->char_count) >
+              tessedit_reject_block_percent) {
+        if (tessedit_debug_block_rejection) {
+          tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n", block_no,
+                  current_block->char_count, current_block->rej_count);
+        }
+        prev_word_rejected = false;
+        // Visit every word of the rejected block.
+        while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
+          if (tessedit_preserve_blk_rej_perfect_wds) {
+            // Keep words that have no rejects and are long enough.
+            rej_word = word->reject_map.reject_count() > 0 ||
+                       word->reject_map.length() < tessedit_preserve_min_wd_len;
+            // Optionally also keep long, acceptable-looking words whose
+            // char_quality equals the word length.
+            if (rej_word && tessedit_dont_blkrej_good_wds &&
+                word->reject_map.length() >= tessedit_preserve_min_wd_len &&
+                acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
+                                       word->best_choice->unichar_lengths().c_str()) !=
+                    AC_UNACCEPTABLE) {
+              word_char_quality(word, &char_quality, &accepted_char_quality);
+              rej_word = char_quality != word->reject_map.length();
+            }
+          } else {
+            rej_word = true;
+          }
+          if (rej_word) {
+            /*
+             * Reject spacing if both current and prev words are rejected.
+             * NOTE - this is NOT restricted to FUZZY spaces. - When tried this
+             * generated more space errors.
+             */
+            if (tessedit_use_reject_spaces && prev_word_rejected &&
+                page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
+              word->reject_spaces = true;
+            }
+            word->reject_map.rej_word_block_rej();
+          }
+          prev_word_rejected = rej_word;
+          page_res_it.forward();
+        }
+      } else {
+        if (tessedit_debug_block_rejection) {
+          tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n", block_no,
+                  page_res_it.block()->char_count, page_res_it.block()->rej_count);
+        }
+
+        /* Walk rows in block testing for row rejection */
+        row_no = 0;
+        while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
+          current_row = page_res_it.row();
+          row_no++;
+          /*
+           * Reject whole row if:
+           * fraction of chars on row which are rejected exceed a limit AND
+           * fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
+           * limit
+           */
+          // The second ratio divides by rej_count; it is only evaluated when
+          // the first ratio exceeded tessedit_reject_row_percent.
+          if (current_row->char_count > 0 &&
+              (current_row->rej_count * 100.0 / current_row->char_count) >
+                  tessedit_reject_row_percent &&
+              (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
+                  tessedit_whole_wd_rej_row_percent) {
+            if (tessedit_debug_block_rejection) {
+              tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n", row_no,
+                      current_row->char_count, current_row->rej_count);
+            }
+            prev_word_rejected = false;
+            while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
+              /* Preserve words on good docs unless they are mostly rejected*/
+              if (!tessedit_row_rej_good_docs && good_quality_doc) {
+                rej_word = word->reject_map.reject_count() /
+                               static_cast<float>(word->reject_map.length()) >
+                           tessedit_good_doc_still_rowrej_wd;
+              } else if (tessedit_preserve_row_rej_perfect_wds) {
+                /* Preserve perfect words anyway */
+                rej_word = word->reject_map.reject_count() > 0 ||
+                           word->reject_map.length() < tessedit_preserve_min_wd_len;
+                if (rej_word && tessedit_dont_rowrej_good_wds &&
+                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
+                    acceptable_word_string(
+                        *word->uch_set, word->best_choice->unichar_string().c_str(),
+                        word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
+                  word_char_quality(word, &char_quality, &accepted_char_quality);
+                  rej_word = char_quality != word->reject_map.length();
+                }
+              } else {
+                rej_word = true;
+              }
+              if (rej_word) {
+                /*
+                 * Reject spacing if both current and prev words are rejected.
+                 * NOTE - this is NOT restricted to FUZZY spaces. - When tried
+                 * this generated more space errors.
+                 */
+                if (tessedit_use_reject_spaces && prev_word_rejected &&
+                    page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
+                  word->reject_spaces = true;
+                }
+                word->reject_map.rej_word_row_rej();
+              }
+              prev_word_rejected = rej_word;
+              page_res_it.forward();
+            }
+          } else {
+            if (tessedit_debug_block_rejection) {
+              tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n", row_no,
+                      current_row->char_count, current_row->rej_count);
+            }
+            // Row kept: step past its words without touching them.
+            while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
+              page_res_it.forward();
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/*************************************************************************
+ * reject_whole_page()
+ * Don't believe any of it - set the reject map to 00..00 in all words
+ *
+ *************************************************************************/
+
+// Marks every word on the page as document-rejected and flags the page
+// result itself as rejected.
+void reject_whole_page(PAGE_RES_IT &page_res_it) {
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    page_res_it.word()->reject_map.rej_word_doc_rej();
+  }
+  // whole page is rejected
+  page_res_it.page_res->rejected = true;
+}
+
+// Walks the page and sets unlv_crunch_mode = CR_KEEP_SPACE on words judged
+// terrible, plus "potential" crunch words that sit directly before or after a
+// terrible word. Words in non-text poly blocks are skipped untouched.
+void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
+  PAGE_RES_IT run_start;          // parked on the first word of a pending potential run
+  bool run_pending = false;       // true while run_start marks an undecided run
+  bool after_terrible = false;    // true while the preceding word(s) were terrible
+
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    POLY_BLOCK *poly = page_res_it.block()->block->pdblk.poly_block();
+    if (poly != nullptr && !poly->IsText()) {
+      continue;
+    }
+    WERD_RES *word = page_res_it.word();
+
+    if (crunch_early_convert_bad_unlv_chs) {
+      convert_bad_unlv_chs(word);
+    }
+    if (crunch_early_merge_tess_fails) {
+      word->merge_tess_fails();
+    }
+
+    if (word->reject_map.accept_count() != 0) {
+      // The word has accepted characters: it ends any run in progress.
+      after_terrible = false;
+      // Forget earlier potential crunches
+      run_pending = false;
+      continue;
+    }
+
+    const bool dict_ok = safe_dict_word(word);
+    const GARBAGE_LEVEL level = garbage_word(word, dict_ok);
+    const bool may_crunch = (level != G_NEVER_CRUNCH);
+
+    if (may_crunch && terrible_word_crunch(word, level)) {
+      if (crunch_debug > 0) {
+        tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
+      }
+      word->unlv_crunch_mode = CR_KEEP_SPACE;
+      if (run_pending) {
+        // Crunch the pending potential words that led up to this one.
+        for (; run_start.word() != word; run_start.forward()) {
+          if (crunch_debug > 0) {
+            tprintf("P1 CRUNCHING: \"%s\"\n",
+                    run_start.word()->best_choice->unichar_string().c_str());
+          }
+          run_start.word()->unlv_crunch_mode = CR_KEEP_SPACE;
+        }
+        run_pending = false;
+      }
+      after_terrible = true;
+    } else if (may_crunch && potential_word_crunch(word, level, dict_ok)) {
+      if (after_terrible) {
+        // Potential word right after a terrible one: crunch it immediately.
+        if (crunch_debug > 0) {
+          tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
+        }
+        word->unlv_crunch_mode = CR_KEEP_SPACE;
+      } else if (!run_pending) {
+        // Remember where the run starts; it is crunched only if a terrible
+        // word follows.
+        run_start = page_res_it;
+        run_pending = true;
+        if (crunch_debug > 1) {
+          tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
+        }
+      }
+    } else {
+      after_terrible = false;
+      // Forget earlier potential crunches
+      run_pending = false;
+      if (crunch_debug > 2) {
+        tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
+      }
+    }
+  }
+}
+
+// Returns true when the word should be crunched outright. The rule that fired
+// (1..5) is only reported in the debug trace:
+//   1 - text is empty or consists solely of spaces
+//   2 - rating per character exceeds crunch_terrible_rating
+//   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
+//   4 - certainty below crunch_poor_garbage_cert and garbage_level is not G_OK
+//   5 - rating per character exceeds crunch_poor_garbage_rate and
+//       garbage_level is not G_OK
+bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
+  int rule = 0;
+  const auto &text = word->best_choice->unichar_string();
+
+  if (text.empty() || (strspn(text.c_str(), " ") == text.size())) {
+    rule = 1;
+  } else {
+    // Length used for the per-character rating is capped at crunch_rating_max.
+    int capped_len = word->reject_map.length();
+    if (capped_len > crunch_rating_max) {
+      capped_len = crunch_rating_max;
+    }
+    const float per_char_rating = word->best_choice->rating() / capped_len;
+
+    if (per_char_rating > crunch_terrible_rating) {
+      rule = 2;
+    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
+      rule = 3;
+    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
+               (garbage_level != G_OK)) {
+      rule = 4;
+    } else if ((per_char_rating > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
+      rule = 5;
+    }
+  }
+
+  if (rule <= 0) {
+    return false;
+  }
+  if (crunch_debug > 2) {
+    tprintf("Terrible_word_crunch (%d) on \"%s\"\n", rule,
+            word->best_choice->unichar_string().c_str());
+  }
+  return true;
+}
+
+// Counts up to three "poor" indicators for the word (high rating per
+// character, low certainty on a crunchable word, garbage level other than
+// G_OK) and returns true when the count reaches crunch_pot_indicators.
+bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
+                                      bool ok_dict_word) {
+  auto *best = word->best_choice;
+
+  // The certainty indicator only applies to words considered crunchable:
+  // either acceptable strings are not being protected, or the word is shorter
+  // than 3, or it is neither an acceptable string nor a safe dictionary word.
+  const bool crunchable =
+      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
+      (acceptable_word_string(*word->uch_set, best->unichar_string().c_str(),
+                              best->unichar_lengths().c_str()) == AC_UNACCEPTABLE &&
+       !ok_dict_word);
+
+  // Rating is averaged over at most 10 characters.
+  int capped_len = word->reject_map.length();
+  if (capped_len > 10) {
+    capped_len = 10;
+  }
+  const float per_char_rating = best->rating() / capped_len;
+
+  int indicators = 0;
+
+  if (per_char_rating > crunch_pot_poor_rate) {
+    if (crunch_debug > 2) {
+      tprintf("Potential poor rating on \"%s\"\n", best->unichar_string().c_str());
+    }
+    ++indicators;
+  }
+
+  if (crunchable && best->certainty() < crunch_pot_poor_cert) {
+    if (crunch_debug > 2) {
+      tprintf("Potential poor cert on \"%s\"\n", best->unichar_string().c_str());
+    }
+    ++indicators;
+  }
+
+  if (garbage_level != G_OK) {
+    if (crunch_debug > 2) {
+      tprintf("Potential garbage on \"%s\"\n", best->unichar_string().c_str());
+    }
+    ++indicators;
+  }
+
+  return indicators >= crunch_pot_indicators;
+}
+
+// Walks the page and assigns the word_deletable() verdict to deletable words
+// that form a run starting at a beginning-of-line word or ending at an
+// end-of-line word. Deletable words in the middle of a line are only
+// remembered; they are committed if the run reaches an EOL word and forgotten
+// as soon as a non-deletable word is met.
+void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
+  PAGE_RES_IT run_start;       // first word of a pending mid-line run
+  bool run_pending = false;    // true while run_start is valid
+  bool bol_run = false;        // true while deleting a run that began at BOL
+
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    WERD_RES *word = page_res_it.word();
+
+    int16_t reason;
+    const CRUNCH_MODE verdict = word_deletable(word, reason);
+
+    if (verdict == CR_NONE) {
+      bol_run = false;
+      // Forget earlier potential crunches
+      run_pending = false;
+    } else if (word->word->flag(W_BOL) || bol_run) {
+      if (crunch_debug > 0) {
+        tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", reason,
+                word->best_choice->unichar_string().c_str());
+      }
+      word->unlv_crunch_mode = verdict;
+      bol_run = true;
+    } else if (word->word->flag(W_EOL)) {
+      if (run_pending) {
+        // Commit every remembered word between the mark and this EOL word.
+        for (; run_start.word() != word; run_start.forward()) {
+          int16_t pending_reason;
+          const CRUNCH_MODE pending_verdict = word_deletable(run_start.word(), pending_reason);
+          if (crunch_debug > 0) {
+            tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", pending_reason,
+                    run_start.word()->best_choice->unichar_string().c_str());
+          }
+          run_start.word()->unlv_crunch_mode = pending_verdict;
+        }
+      }
+      if (crunch_debug > 0) {
+        tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", reason,
+                word->best_choice->unichar_string().c_str());
+      }
+      word->unlv_crunch_mode = verdict;
+      bol_run = false;
+      run_pending = false;
+    } else if (!run_pending) {
+      // Mid-line deletable word: remember where the run begins.
+      run_start = page_res_it;
+      run_pending = true;
+    }
+
+    /*
+     * The following step has been left till now as the tess fails are used to
+     * determine if the word is deletable.
+     */
+    if (!crunch_early_merge_tess_fails) {
+      word->merge_tess_fails();
+    }
+  }
+}
+
+// Rewrites the best choice in place: "~" becomes "-" and "^" becomes " ".
+// Any replaced character that was still accepted is marked as rejected via
+// setrej_unlv_rej().
+void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
+  const auto &charset = *word_res->uch_set;
+  const UNICHAR_ID dash_id = charset.unichar_to_id("-");
+  const UNICHAR_ID space_id = charset.unichar_to_id(" ");
+  const UNICHAR_ID tilde_id = charset.unichar_to_id("~");
+  const UNICHAR_ID caret_id = charset.unichar_to_id("^");
+
+  // Substitutes the character at `pos` and rejects it if it was accepted.
+  auto substitute = [word_res](int pos, UNICHAR_ID replacement) {
+    word_res->best_choice->set_unichar_id(replacement, pos);
+    if (word_res->reject_map[pos].accepted()) {
+      word_res->reject_map[pos].setrej_unlv_rej();
+    }
+  };
+
+  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
+    // The two tests are deliberately sequential: the second one sees the
+    // result of the first substitution.
+    if (word_res->best_choice->unichar_id(pos) == tilde_id) {
+      substitute(pos, dash_id);
+    }
+    if (word_res->best_choice->unichar_id(pos) == caret_id) {
+      substitute(pos, space_id);
+    }
+  }
+}
+
+// Classifies the word's best-choice text as G_NEVER_CRUNCH, G_OK, G_DODGY or
+// G_TERRIBLE.
+//
+// word:         word result whose best choice is examined.
+// ok_dict_word: caller's verdict that the word is a safe dictionary word;
+//               forces G_OK for multi-character words without spaces.
+//
+// The text is scanned one unichar at a time with a small state machine that
+// tracks runs of upper case, lower case and digits, counting isolated digits
+// and letters, repeated letters, spaces (treated as tess rejects) and other
+// "bad" characters. The counts are then turned into a verdict.
+GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
+  enum STATES {
+    JUNK,
+    FIRST_UPPER,
+    FIRST_LOWER,
+    FIRST_NUM,
+    SUBSEQUENT_UPPER,
+    SUBSEQUENT_LOWER,
+    SUBSEQUENT_NUM
+  };
+  // Start of the word text and of its per-unichar byte lengths. These stay
+  // fixed so the whole-word checks after the scan see the complete word.
+  const char *const str = word->best_choice->unichar_string().c_str();
+  const char *const lengths = word->best_choice->unichar_lengths().c_str();
+  STATES state = JUNK;
+  int len = 0;
+  int isolated_digits = 0;
+  int isolated_alphas = 0;
+  int bad_char_count = 0;
+  int tess_rejs = 0;
+  int dodgy_chars = 0;
+  int ok_chars;
+  UNICHAR_ID last_char = -1;
+  int alpha_repetition_count = 0;
+  int longest_alpha_repetition_count = 0;
+  int longest_lower_run_len = 0;
+  int lower_string_count = 0;
+  int longest_upper_run_len = 0;
+  int upper_string_count = 0;
+  int total_alpha_count = 0;
+  int total_digit_count = 0;
+
+  // Scan with separate cursors. Advancing str/lengths themselves would leave
+  // them at the terminating NUL, so the later acceptable_word_string() and
+  // strpbrk() checks would only ever see an empty string.
+  const char *ch = str;
+  const char *ch_len = lengths;
+  for (; *ch != '\0'; ch += *(ch_len++)) {
+    len++;
+    if (word->uch_set->get_isupper(ch, *ch_len)) {
+      total_alpha_count++;
+      switch (state) {
+        case SUBSEQUENT_UPPER:
+        case FIRST_UPPER:
+          state = SUBSEQUENT_UPPER;
+          upper_string_count++;
+          if (longest_upper_run_len < upper_string_count) {
+            longest_upper_run_len = upper_string_count;
+          }
+          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
+            alpha_repetition_count++;
+            if (longest_alpha_repetition_count < alpha_repetition_count) {
+              longest_alpha_repetition_count = alpha_repetition_count;
+            }
+          } else {
+            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+            alpha_repetition_count = 1;
+          }
+          break;
+        case FIRST_NUM:
+          isolated_digits++;
+          // Fall through.
+        default:
+          state = FIRST_UPPER;
+          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+          alpha_repetition_count = 1;
+          upper_string_count = 1;
+          break;
+      }
+    } else if (word->uch_set->get_islower(ch, *ch_len)) {
+      total_alpha_count++;
+      switch (state) {
+        case SUBSEQUENT_LOWER:
+        case FIRST_LOWER:
+          state = SUBSEQUENT_LOWER;
+          lower_string_count++;
+          if (longest_lower_run_len < lower_string_count) {
+            longest_lower_run_len = lower_string_count;
+          }
+          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
+            alpha_repetition_count++;
+            if (longest_alpha_repetition_count < alpha_repetition_count) {
+              longest_alpha_repetition_count = alpha_repetition_count;
+            }
+          } else {
+            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+            alpha_repetition_count = 1;
+          }
+          break;
+        case FIRST_NUM:
+          isolated_digits++;
+          // Fall through.
+        default:
+          state = FIRST_LOWER;
+          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+          alpha_repetition_count = 1;
+          lower_string_count = 1;
+          break;
+      }
+    } else if (word->uch_set->get_isdigit(ch, *ch_len)) {
+      total_digit_count++;
+      switch (state) {
+        case FIRST_NUM:
+          state = SUBSEQUENT_NUM;
+          // Fall through.
+        case SUBSEQUENT_NUM:
+          break;
+        case FIRST_UPPER:
+        case FIRST_LOWER:
+          isolated_alphas++;
+          // Fall through.
+        default:
+          state = FIRST_NUM;
+          break;
+      }
+    } else {
+      // Neither letter nor digit: a single space counts as a tess reject,
+      // anything else as a bad character.
+      if (*ch_len == 1 && *ch == ' ') {
+        tess_rejs++;
+      } else {
+        bad_char_count++;
+      }
+      switch (state) {
+        case FIRST_NUM:
+          isolated_digits++;
+          break;
+        case FIRST_UPPER:
+        case FIRST_LOWER:
+          isolated_alphas++;
+          // Fall through.
+        default:
+          break;
+      }
+      state = JUNK;
+    }
+  }
+
+  // Account for a run of length one that ends the word.
+  switch (state) {
+    case FIRST_NUM:
+      isolated_digits++;
+      break;
+    case FIRST_UPPER:
+    case FIRST_LOWER:
+      isolated_alphas++;
+      // Fall through.
+    default:
+      break;
+  }
+
+  if (crunch_include_numerals) {
+    total_alpha_count += total_digit_count - isolated_digits;
+  }
+
+  // Mostly-alphabetic words of 4+ characters without long letter repetitions
+  // are protected from crunching if they look acceptable or contain a long
+  // enough run of one case.
+  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
+      longest_alpha_repetition_count < crunch_long_repetitions) {
+    if ((crunch_accept_ok &&
+         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
+        longest_lower_run_len > crunch_leave_lc_strings ||
+        longest_upper_run_len > crunch_leave_uc_strings) {
+      return G_NEVER_CRUNCH;
+    }
+  }
+  // Multi-character words without spaces that came from a dictionary/number
+  // permuter, look acceptable, or were vouched for by the caller are OK.
+  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
+      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+       word->best_choice->permuter() == FREQ_DAWG_PERM ||
+       word->best_choice->permuter() == USER_DAWG_PERM ||
+       word->best_choice->permuter() == NUMBER_PERM ||
+       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
+    return G_OK;
+  }
+
+  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
+
+  if (crunch_debug > 3) {
+    tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
+    tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n", len, bad_char_count,
+            isolated_digits, isolated_alphas, tess_rejs);
+  }
+  if (bad_char_count == 0 && tess_rejs == 0 &&
+      (len > isolated_digits + isolated_alphas || len <= 2)) {
+    return G_OK;
+  }
+
+  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
+    return G_TERRIBLE;
+  }
+
+  // Remaining cases: weigh the dodgy characters against the word length.
+  // Tess rejects count double; isolated digits/letters only count for words
+  // longer than 4.
+  if (len > 4) {
+    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
+    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
+      return G_DODGY;
+    } else {
+      return G_OK;
+    }
+  } else {
+    dodgy_chars = 2 * tess_rejs + bad_char_count;
+    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
+      return G_DODGY;
+    } else {
+      return G_OK;
+    }
+  }
+}
+
+/*************************************************************************
+ * word_deletable()
+ *     DELETE WERDS AT ENDS OF ROWS IF
+ *        Word is crunched &&
+ *        ( string length = 0                                          OR
+ *          > 50% of chars are "|" (before merging)                    OR
+ *          certainty < -10                                            OR
+ *          rating /char > 60                                          OR
+ *          TOP of word is more than 0.5 xht BELOW baseline            OR
+ *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
+ *          length of word < 3xht                                      OR
+ *          height of word < 0.7 xht                                   OR
+ *          height of word > 3.0 xht                                   OR
+ *          >75% of the outline BBs have longest dimension < 0.5xht )
+ *************************************************************************/
+
+// Decides whether a crunched word may be deleted.
+//
+// word:        word result to examine.
+// delete_mode: out-parameter receiving the number of the rule that fired
+//              (0 when the result is CR_NONE); used for debug traces.
+//
+// Returns CR_NONE for words that are not crunched or match no rule,
+// CR_DELETE for empty, very short (height) or all-noise words, and
+// CR_LOOSE_SPACE for the remaining rules.
+CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
+  // Stores the rule number and hands back the verdict in one step.
+  auto decide = [&delete_mode](int16_t rule, CRUNCH_MODE verdict) {
+    delete_mode = rule;
+    return verdict;
+  };
+
+  const int word_len = word->reject_map.length();
+
+  if (word->unlv_crunch_mode == CR_NONE) {
+    return decide(0, CR_NONE);
+  }
+  if (word_len == 0) {
+    return decide(1, CR_DELETE);
+  }
+
+  // BB of word. NOTE(review): when rebuild_word is null the box keeps its
+  // default-constructed value and the geometric tests further down run on
+  // that value - confirm this is intended.
+  TBOX box;
+  if (word->rebuild_word != nullptr) {
+    // Cube leaves rebuild_word nullptr.
+    box = word->rebuild_word->bounding_box();
+    if (box.height() < crunch_del_min_ht * kBlnXHeight) {
+      return decide(4, CR_DELETE);
+    }
+    if (noise_outlines(word->rebuild_word)) {
+      return decide(5, CR_DELETE);
+    }
+  }
+
+  // Too many spaces (tess failures) relative to the word length.
+  if ((failure_count(word) * 1.5) > word_len) {
+    return decide(2, CR_LOOSE_SPACE);
+  }
+
+  if (word->best_choice->certainty() < crunch_del_cert) {
+    return decide(7, CR_LOOSE_SPACE);
+  }
+
+  // word_len is non-zero here because of the early return above.
+  const float per_char_rating = word->best_choice->rating() / word_len;
+  if (per_char_rating > crunch_del_rating) {
+    return decide(8, CR_LOOSE_SPACE);
+  }
+
+  // Word sits too far below the baseline.
+  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
+    return decide(9, CR_LOOSE_SPACE);
+  }
+  // Word sits too far above the baseline.
+  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
+    return decide(10, CR_LOOSE_SPACE);
+  }
+  // Word is too tall.
+  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
+    return decide(11, CR_LOOSE_SPACE);
+  }
+  // Word is too narrow.
+  if (box.width() < crunch_del_min_width * kBlnXHeight) {
+    return decide(3, CR_LOOSE_SPACE);
+  }
+
+  return decide(0, CR_NONE);
+}
+
+// Returns the number of space bytes in the word's best-choice text.
+int16_t Tesseract::failure_count(WERD_RES *word) {
+  int spaces = 0;
+  const char *cursor = word->best_choice->unichar_string().c_str();
+  while (*cursor != '\0') {
+    if (*cursor == ' ') {
+      ++spaces;
+    }
+    ++cursor;
+  }
+  return spaces;
+}
+
+// Returns true when every outline in the word is "small", i.e. the longer
+// side of its bounding box is below kBlnXHeight * crunch_small_outlines_size.
+// A word with no outlines also yields true.
+bool Tesseract::noise_outlines(TWERD *word) {
+  const float size_threshold = kBlnXHeight * crunch_small_outlines_size;
+  int16_t total_outlines = 0;
+  int16_t tiny_outlines = 0;
+
+  for (int idx = 0; idx < word->NumBlobs(); ++idx) {
+    for (TESSLINE *outline = word->blobs[idx]->outlines; outline != nullptr;
+         outline = outline->next) {
+      ++total_outlines;
+      TBOX bounds = outline->bounding_box(); // BB of outline
+      const int16_t longest_side =
+          (bounds.height() > bounds.width()) ? bounds.height() : bounds.width();
+      if (longest_side < size_threshold) {
+        ++tiny_outlines;
+      }
+    }
+  }
+  return tiny_outlines >= total_outlines;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.h
@ -0,0 +1,37 @@
+/******************************************************************
+ * File:        docqual.h  (Formerly docqual.h)
+ * Description: Document Quality Metrics
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef DOCQUAL_H
+#define DOCQUAL_H
+
+#include <cstdint> // for int16_t
+
+namespace tesseract {
+
+class PAGE_RES_IT;
+class ROW;
+class WERD_RES;
+
+// Verdict returned by Tesseract::garbage_word() and consumed by the crunch
+// tests. G_NEVER_CRUNCH makes tilde_crunch() skip both crunch tests for the
+// word; G_OK, G_DODGY and G_TERRIBLE feed the terrible/potential crunch rules.
+enum GARBAGE_LEVEL { G_NEVER_CRUNCH, G_OK, G_DODGY, G_TERRIBLE };
+
+// NOTE(review): implemented in docqual.cpp outside the portion reviewed here;
+// presumably returns a blob-quality count for the word - confirm.
+int16_t word_blob_quality(WERD_RES *word);
+// Marks every word on the page as document-rejected and sets the page
+// result's `rejected` flag.
+void reject_whole_page(PAGE_RES_IT &page_res_it);
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.h
@ -0,0 +1,250 @@
+///////////////////////////////////////////////////////////////////////
+// File:        equationdetect.h
+// Description: The equation detection class that inherits equationdetectbase.
+// Author:      Zongyi (Joe) Liu (joeliu@google.com)
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_
+#define TESSERACT_CCMAIN_EQUATIONDETECT_H_
+
+#include <tesseract/unichar.h>  // for UNICHAR_ID
+#include "blobbox.h"            // for BLOBNBOX (ptr only), BlobSpecialText...
+#include "equationdetectbase.h" // for EquationDetectBase
+#include "tesseractclass.h"     // for Tesseract
+
+class TBOX;
+class UNICHARSET;
+
+namespace tesseract {
+
+class Tesseract;
+class ColPartition;
+class ColPartitionGrid;
+class ColPartitionSet;
+
+// Equation detector built on top of EquationDetectBase. It implements the two
+// base-class entry points (LabelSpecialText and FindEquationParts) using its
+// own Tesseract instance (equ_tesseract_) together with a caller-supplied
+// language engine (lang_tesseract_).
+//
+// All non-owning pointers and counters carry default member initializers so
+// that every instance starts from a defined state regardless of what the
+// constructor body assigns.
+class TESS_API EquationDetect : public EquationDetectBase {
+public:
+  EquationDetect(const char *equ_datapath, const char *equ_language);
+  ~EquationDetect() override;
+
+  enum IndentType { NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT, INDENT_TYPE_COUNT };
+
+  // Reset the lang_tesseract_ pointer. This function should be called before we
+  // do any detector work.
+  void SetLangTesseract(Tesseract *lang_tesseract);
+
+  // Iterate over the blobs inside to_block, and set the blobs that we want to
+  // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
+  // returns 0 upon success.
+  int LabelSpecialText(TO_BLOCK *to_block) override;
+
+  // Find possible equation partitions from part_grid. Should be called
+  // after the special_text_type of blobs are set.
+  // It returns 0 upon success.
+  int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) override;
+
+  // Reset the resolution of the processing image. TEST only function.
+  void SetResolution(const int resolution);
+
+protected:
+  // Identify the special text type for one blob, and update its field. When
+  // height_th is set (> 0), we will label the blob as BSTT_NONE if its height
+  // is less than height_th.
+  void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
+
+  // Estimate the type for one unichar.
+  BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset,
+                                             const UNICHAR_ID id) const;
+
+  // Compute the special text type for each blob in part_grid_.
+  void IdentifySpecialText();
+
+  // Identify blobs that we want to skip during special blob type
+  // classification.
+  void IdentifyBlobsToSkip(ColPartition *part);
+
+  // The ColPartitions in part_grid_ may be over-segmented, particularly in the
+  // block equation regions. So we like to identify these partitions and merge
+  // them before we do the searching.
+  void MergePartsByLocation();
+
+  // Starting from the seed center, we do radius search. And for partitions that
+  // have large overlaps with seed, we remove them from part_grid_ and add into
+  // parts_overlap. Note: this function may update the part_grid_, so if the
+  // caller is also running ColPartitionGridSearch, use the RepositionIterator
+  // to continue.
+  void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);
+
+  // Insert part back into part_grid_, after it absorbs some other parts.
+  void InsertPartAfterAbsorb(ColPartition *part);
+
+  // Identify the ColPartitions in part_grid_, label them as PT_EQUATION, and
+  // save them into cp_seeds_.
+  void IdentifySeedParts();
+
+  // Check the blobs count for a seed region candidate.
+  bool CheckSeedBlobsCount(ColPartition *part);
+
+  // Compute the foreground pixel density for a tbox area.
+  float ComputeForegroundDensity(const TBOX &tbox);
+
+  // Check if part from seed2 label: with low math density and left indented. We
+  // are using two checks:
+  // 1. If its left is aligned with any coordinates in indented_texts_left,
+  // which we assume have been sorted.
+  // 2. If its foreground density is over foreground_density_th.
+  bool CheckForSeed2(const std::vector<int> &indented_texts_left,
+                     const float foreground_density_th, ColPartition *part);
+
+  // Count the number of values in sorted_vec that are close to val, used to
+  // check if a partition is aligned with text partitions.
+  int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;
+
+  // Check for a seed candidate using the foreground pixel density. And we
+  // return true if the density is below a certain threshold, because characters
+  // in equation regions usually are apart with more white spaces.
+  bool CheckSeedFgDensity(const float density_th, ColPartition *part);
+
+  // A light version of SplitCPHor: instead of really doing the part split, we
+  // simply compute the union bounding box of each split part.
+  void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);
+
+  // Split the part (horizontally), and save the split result into
+  // parts_splitted. Note that it is the caller's responsibility to release the
+  // memory owned by parts_splitted. On the other hand, the part is unchanged
+  // during this process and still owns the blobs, so do NOT call DeleteBoxes
+  // when freeing the colpartitions in parts_splitted.
+  void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);
+
+  // Check the density for a seed candidate (part) using its math density and
+  // italic density, returns true if the check passed.
+  bool CheckSeedDensity(const float math_density_high, const float math_density_low,
+                        const ColPartition *part) const;
+
+  // Check if part is indented.
+  IndentType IsIndented(ColPartition *part);
+
+  // Identify inline partitions from cp_seeds_, and re-label them.
+  void IdentifyInlineParts();
+
+  // Compute the super bounding box for all colpartitions inside part_grid_.
+  void ComputeCPsSuperBBox();
+
+  // Identify inline partitions from cp_seeds_ using the horizontal search.
+  void IdentifyInlinePartsHorizontal();
+
+  // Estimate the line spacing between two text partitions. Returns -1 if not
+  // enough data.
+  int EstimateTextPartLineSpacing();
+
+  // Identify inline partitions from cp_seeds_ using vertical search.
+  void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing);
+
+  // Check if part is an inline equation zone. This should be called after we
+  // identified the seed regions.
+  bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part);
+
+  // For a given seed partition, we search the part_grid_ and see if there is
+  // any partition that can be merged with it. It returns true if the seed has
+  // been expanded.
+  bool ExpandSeed(ColPartition *seed);
+
+  // Starting from the seed position, we search the part_grid_
+  // horizontally/vertically, find all partitions that can be
+  // merged with seed, remove them from part_grid_, and put them into
+  // parts_to_merge.
+  void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
+                            std::vector<ColPartition *> *parts_to_merge);
+  void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
+                          std::vector<ColPartition *> *parts_to_merge);
+
+  // Check if a part_box is the small neighbor of seed_box.
+  bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;
+
+  // Perform the density check for part, which we assume is nearing a seed
+  // partition. It returns true if the check passed.
+  bool CheckSeedNeighborDensity(const ColPartition *part) const;
+
+  // After identifying the math blocks, we do one more scan over all text
+  // partitions, and check if any of them is the satellite of:
+  // math blocks: here a p is the satellite of q if:
+  // 1. q is the nearest vertical neighbor of p, and
+  // 2. y_gap(p, q) is less than a threshold, and
+  // 3. x_overlap(p, q) is over a threshold.
+  // Note that p can be the satellite of two blocks: its top neighbor and
+  // bottom neighbor.
+  void ProcessMathBlockSatelliteParts();
+
+  // Check if part is the satellite of one/two math blocks. If it is, we return
+  // true, and save the blocks into math_blocks.
+  bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);
+
+  // Search the nearest neighbor of part in one vertical direction as defined in
+  // search_bottom. It returns the neighbor found that has major x overlap with
+  // it, or nullptr when not found.
+  ColPartition *SearchNNVertical(const bool search_bottom, const ColPartition *part);
+
+  // Check if the neighbor with vertical distance of y_gap is a near and math
+  // block partition.
+  bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
+
+  // Generate the tiff file name for output/debug file.
+  void GetOutputTiffName(const char *name, std::string &image_name) const;
+
+  // Debugger function that renders ColPartitions on the input image, where:
+  // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
+  // will be painted in green, and other parts will be painted in blue.
+  void PaintColParts(const std::string &outfile) const;
+
+  // Debugger function that renders the blobs in part_grid_ over the input
+  // image.
+  void PaintSpecialTexts(const std::string &outfile) const;
+
+  // Debugger function that prints the math blobs density values for a
+  // ColPartition object.
+  void PrintSpecialBlobsDensity(const ColPartition *part) const;
+
+  // The tesseract engine initialized from equation training data.
+  Tesseract equ_tesseract_;
+
+  // The tesseract engine used for OCR. This pointer is passed in by the caller,
+  // so do NOT destroy it in this class.
+  Tesseract *lang_tesseract_ = nullptr;
+
+  // The ColPartitionGrid that we are processing. This pointer is passed in from
+  // the caller, so do NOT destroy it in the class.
+  ColPartitionGrid *part_grid_ = nullptr;
+
+  // A simple array of pointers to the best assigned column division at
+  // each grid y coordinate. This pointer is passed in from the caller, so do
+  // NOT destroy it in the class.
+  ColPartitionSet **best_columns_ = nullptr;
+
+  // The super bounding box of all cps in the part_grid_.
+  TBOX *cps_super_bbox_ = nullptr;
+
+  // The seed ColPartition for equation region.
+  std::vector<ColPartition *> cp_seeds_;
+
+  // The resolution (dpi) of the processing image.
+  int resolution_ = 0;
+
+  // The number of pages we have processed.
+  int page_count_ = 0;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.cpp
@ -0,0 +1,870 @@
+/******************************************************************
+ * File:        fixspace.cpp  (Formerly fixspace.c)
+ * Description: Implements a pass over the page res, exploring the alternative
+ *              spacing possibilities, trying to use context to improve the
+ *              word spacing
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "fixspace.h"
+
+#include "blobs.h"          // for TWERD, TBLOB, TESSLINE
+#include "boxword.h"        // for BoxWord
+#include "errcode.h"        // for ASSERT_HOST
+#include "normalis.h"       // for kBlnXHeight, kBlnBaselineOffset
+#include "pageres.h"        // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
+#include "params.h"         // for IntParam, StringParam, BoolParam, DoubleParam, ...
+#include "ratngs.h"         // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
+#include "rect.h"           // for TBOX
+#include "stepblob.h"       // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
+#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
+#include "tessvars.h"       // for debug_fp
+#include "tprintf.h"        // for tprintf
+#include "unicharset.h"     // for UNICHARSET
+#include "werd.h"           // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
+
+#include <tesseract/ocrclass.h> // for ETEXT_DESC
+#include <tesseract/unichar.h>  // for UNICHAR_ID
+
+#include <cstdint> // for INT16_MAX, int16_t, int32_t
+
+namespace tesseract {
+
+// Forward declarations; this file only passes BLOCK and ROW around by pointer.
+class BLOCK;
+class ROW;
+
+// Sentinel score returned by eval_word_spacing() when every counted word is
+// considered done. The permutation searches in this file stop as soon as the
+// best score equals this value.
+#define PERFECT_WERDS 999
+
+/**********************************************************************
+ *  c_blob_comparator()
+ *
+ *  Ordering callback for sorting a blob list by ascending left edge.
+ *  Each argument is the address of a C_BLOB pointer. The result is negative,
+ *  zero or positive when the first blob's bounding box starts to the left
+ *  of, at the same x as, or to the right of the second blob's.
+ **********************************************************************/
+
+static int c_blob_comparator(const void *blob1p, const void *blob2p) {
+  const auto *lhs = *static_cast<const C_BLOB *const *>(blob1p);
+  const auto *rhs = *static_cast<const C_BLOB *const *>(blob2p);
+
+  const auto lhs_left = lhs->bounding_box().left();
+  const auto rhs_left = rhs->bounding_box().left();
+  return lhs_left - rhs_left;
+}
+
+/**
+ * @name fix_fuzzy_spaces()
+ * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
+ * them as a sublist, process the sublist to find the optimal arrangement of
+ * spaces then replace the sublist in the ROW_RES.
+ *
+ * Words that are not part of a fuzzy-space sequence are passed individually to
+ * fix_sp_fp_word(). The function returns early, leaving the remaining words
+ * untouched, when the monitor reports an exceeded deadline or its cancel
+ * callback returns true.
+ *
+ * @param monitor progress monitor; may be nullptr, in which case no progress
+ *                is reported and no cancellation is checked
+ * @param word_count count of words in doc; used as the divisor when scaling
+ *                   the reported progress
+ * @param[in,out] page_res page results whose word lists are rewritten in place
+ */
+void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
+  BLOCK_RES_IT block_res_it;
+  ROW_RES_IT row_res_it;
+  WERD_RES_IT word_res_it_from;
+  WERD_RES_IT word_res_it_to;
+  WERD_RES *word_res;
+  WERD_RES_LIST fuzzy_space_words;
+  int16_t new_length;
+  bool prevent_null_wd_fixsp; // DON'T process blobless wds
+  int32_t word_index;         // current word
+
+  block_res_it.set_to_list(&page_res->block_res_list);
+  word_index = 0;
+  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
+    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
+    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
+      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
+      while (!word_res_it_from.at_last()) {
+        word_res = word_res_it_from.data();
+        // Phase 1: step over words that are neither combinations nor followed
+        // by a fuzzy gap, giving each one to fix_sp_fp_word() on the way.
+        while (!word_res_it_from.at_last() &&
+               !(word_res->combination ||
+                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
+                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
+          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
+          word_res = word_res_it_from.forward();
+          word_index++;
+          if (monitor != nullptr) {
+            monitor->ocr_alive = true;
+            // Progress for this pass is reported as 90 plus up to 5,
+            // proportional to the words visited so far.
+            monitor->progress = 90 + 5 * word_index / word_count;
+            if (monitor->deadline_exceeded() ||
+                (monitor->cancel != nullptr &&
+                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
+              return;
+            }
+          }
+        }
+
+        // Phase 2: word_res_it_from now sits on the first word of a fuzzy
+        // sequence (unless the end of the row was reached).
+        if (!word_res_it_from.at_last()) {
+          word_res_it_to = word_res_it_from;
+          prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
+          if (check_debug_pt(word_res, 60)) {
+            debug_fix_space_level.set_value(10);
+          }
+          word_res_it_to.forward();
+          word_index++;
+          if (monitor != nullptr) {
+            monitor->ocr_alive = true;
+            monitor->progress = 90 + 5 * word_index / word_count;
+            if (monitor->deadline_exceeded() ||
+                (monitor->cancel != nullptr &&
+                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
+              return;
+            }
+          }
+          // Extend the "to" end for as long as the following word is also
+          // joined by a fuzzy gap, remembering whether any word in the run has
+          // an empty blob list.
+          while (!word_res_it_to.at_last() &&
+                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
+                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
+            if (check_debug_pt(word_res, 60)) {
+              debug_fix_space_level.set_value(10);
+            }
+            if (word_res->word->cblob_list()->empty()) {
+              prevent_null_wd_fixsp = true;
+            }
+            word_res = word_res_it_to.forward();
+          }
+          // Apply the same checks to the final word of the run.
+          if (check_debug_pt(word_res, 60)) {
+            debug_fix_space_level.set_value(10);
+          }
+          if (word_res->word->cblob_list()->empty()) {
+            prevent_null_wd_fixsp = true;
+          }
+          if (prevent_null_wd_fixsp) {
+            // A blobless word is present: skip the whole run unprocessed.
+            word_res_it_from = word_res_it_to;
+          } else {
+            // Detach the run, let fix_fuzzy_space_list() pick the best
+            // arrangement, splice the result back in and step over it.
+            fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
+            fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
+                                 block_res_it.data()->block);
+            new_length = fuzzy_space_words.length();
+            word_res_it_from.add_list_before(&fuzzy_space_words);
+            for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
+              word_res_it_from.forward();
+            }
+          }
+          // Reset the debug level that may have been raised to 10 above.
+          // NOTE(review): test_pt is defined outside this function; presumably
+          // it is set by check_debug_pt() - confirm.
+          if (test_pt) {
+            debug_fix_space_level.set_value(0);
+          }
+        }
+        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
+        // Last word in row
+      }
+    }
+  }
+}
+
+/**
+ * @name fix_fuzzy_space_list()
+ * Searches for a better spacing of the words in best_perm. The list as given
+ * is scored first; unless that score is already PERFECT_WERDS, a working copy
+ * is created and repeatedly re-recognised, scored and transformed into the
+ * next permutation. Whenever a permutation beats the best score so far it
+ * replaces the contents of best_perm. The search ends when the best score is
+ * PERFECT_WERDS or the working list becomes empty.
+ *
+ * @param[in,out] best_perm words to improve; holds the best arrangement found
+ * @param row row handed on to match_current_words()
+ * @param block block handed on to match_current_words()
+ */
+void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
+  WERD_RES_LIST candidate;
+  bool improved = false;
+
+  // Baseline: the score of the list exactly as it was extracted.
+  int16_t best_score = eval_word_spacing(best_perm);
+  dump_words(best_perm, best_score, 1, improved);
+
+  const bool baseline_is_perfect = (best_score == PERFECT_WERDS);
+  if (!baseline_is_perfect) {
+    initialise_search(best_perm, candidate);
+  }
+
+  while (!(best_score == PERFECT_WERDS || candidate.empty())) {
+    match_current_words(candidate, row, block);
+    const int16_t candidate_score = eval_word_spacing(candidate);
+    dump_words(candidate, candidate_score, 2, improved);
+
+    const bool beats_best = candidate_score > best_score;
+    if (beats_best) {
+      // Keep a private copy of the winner; candidate keeps evolving below.
+      best_perm.clear();
+      best_perm.deep_copy(&candidate, &WERD_RES::deep_copy);
+      best_score = candidate_score;
+      improved = true;
+    }
+
+    if (candidate_score < PERFECT_WERDS) {
+      transform_to_next_perm(candidate);
+    }
+  }
+
+  dump_words(best_perm, best_score, 3, improved);
+}
+
+/**
+ * @name initialise_search()
+ * Fills new_list with deep copies of every word in src_list that is not a
+ * combination. Each copy has both its combination and part_of_combo flags
+ * cleared and is appended in source order.
+ *
+ * @param src_list list to copy from; its words are only read
+ * @param[out] new_list list receiving the copies
+ */
+void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
+  WERD_RES_IT source(&src_list);
+  WERD_RES_IT dest(&new_list);
+
+  for (source.mark_cycle_pt(); !source.cycled_list(); source.forward()) {
+    WERD_RES *original = source.data();
+    if (original->combination) {
+      continue; // combinations are not carried over
+    }
+    WERD_RES *clone = WERD_RES::deep_copy(original);
+    clone->part_of_combo = false;
+    clone->combination = false;
+    dest.add_after_then_move(clone);
+  }
+}
+
+/**
+ * @name match_current_words()
+ * Runs pass-2 classification on every word in the list that is not part of a
+ * combination and has no box_word yet, keeping prev_word_best_choice_ in step
+ * with the word just visited.
+ *
+ * @param words list of words to classify
+ * @param row row passed into each WordData
+ * @param block block passed into each WordData
+ */
+void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
+  // The words are not being visited through a PAGE_RES iterator here, so the
+  // previous-word context used by classification has to be maintained by hand.
+  prev_word_best_choice_ = nullptr;
+
+  WERD_RES_IT it(&words);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    WERD_RES *current = it.data();
+    const bool needs_classifying = !current->part_of_combo && current->box_word == nullptr;
+    if (needs_classifying) {
+      WordData word_data(block, row, current);
+      SetupWordPassN(2, &word_data);
+      classify_word_and_language(2, nullptr, &word_data);
+    }
+    prev_word_best_choice_ = current->best_choice;
+  }
+}
+
+/**
+ * @name eval_word_spacing()
+ * The basic measure is the number of characters in contextually confirmed
+ * words. (I.e the word is done)
+ * If all words are contextually confirmed the evaluation is deemed perfect.
+ *
+ * Some fiddles are done to handle "1"s as these are VERY frequent causes of
+ * fuzzy spaces. The problem with the basic measure is that "561 63" would score
+ * the same as "56163", though given our knowledge that the space is fuzzy, and
+ * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
+ * is preferred.
+ *
+ * The solution is to NOT COUNT the score of any word which has a digit at one
+ * end and a "1Il" as the character the other side of the space.
+ *
+ * Conversely, any character next to a "1" within a word is counted as a
+ * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1
+ * side of the "1" joined).  "56163" would score 7 - all chars in a numeric word
+ * + 2 sides of a "1" joined.
+ *
+ * The joined 1 rule is applied to any word REGARDLESS of contextual
+ * confirmation.  Thus "PS7a71 3/7a" scores 1 (neither word is contextually
+ * confirmed). The only score is from the joined 1. "PS7a713/7a" scores 2.
+ *
+ * The list is read through data() before any emptiness check, so it is
+ * expected to contain at least one word.
+ *
+ * @param word_res_list words to score; words flagged part_of_combo are skipped
+ * @return PERFECT_WERDS when every counted word is done, otherwise the
+ *         accumulated score
+ */
+int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
+  WERD_RES_IT word_res_it(&word_res_list);
+  int16_t total_score = 0;
+  int16_t word_count = 0;
+  int16_t done_word_count = 0;
+  int16_t i;
+  int16_t offset;
+  int16_t prev_word_score = 0;
+  bool prev_word_done = false;
+  bool prev_char_1 = false;     // prev ch a "1/I/l"?
+  bool prev_char_digit = false; // prev ch 2..9 or 0
+  const char *punct_chars = "!\"`',.:;";
+  bool prev_char_punct = false;
+
+  do {
+    // current word
+    WERD_RES *word = word_res_it.data();
+    bool word_done = fixspace_thinks_word_done(word);
+    word_count++;
+    if (word->tess_failed) {
+      // A failed word cannot veto the previous word: bank the pending score
+      // and reset all carried-over state.
+      total_score += prev_word_score;
+      if (prev_word_done) {
+        done_word_count++;
+      }
+      prev_word_score = 0;
+      prev_char_1 = false;
+      prev_char_digit = false;
+      prev_word_done = false;
+    } else {
+      /*
+       * Can we add the prev word score and potentially count this word?
+       * Yes IF it didn't end in a 1 when the first char of this word is a digit
+       *   AND it didn't end in a digit when the first char of this word is a 1
+       */
+      auto word_len = word->reject_map.length();
+      bool current_word_ok_so_far = false;
+      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
+            (prev_char_digit &&
+             ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
+               word->best_choice->unichar_string()[0] == '1') ||
+              (!word_done &&
+               conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
+        total_score += prev_word_score;
+        if (prev_word_done) {
+          done_word_count++;
+        }
+        current_word_ok_so_far = word_done;
+      }
+
+      // The current word's own score stays pending until the next word (or
+      // the end of the list) decides whether it may be counted.
+      if (current_word_ok_so_far) {
+        prev_word_done = true;
+        prev_word_score = word_len;
+      } else {
+        prev_word_done = false;
+        prev_word_score = 0;
+      }
+
+      /* Add 1 to total score for every joined 1 regardless of context and
+       * rejtn */
+      // NOTE(review): this loop indexes unichar_string() with the character
+      // index i, whereas the punctuation loop below walks byte offsets via
+      // unichar_lengths(). For multi-byte unichars the two differ - confirm
+      // this is intended.
+      for (i = 0, prev_char_1 = false; i < word_len; i++) {
+        bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
+        if (prev_char_1 || (current_char_1 && (i > 0))) {
+          total_score++;
+        }
+        prev_char_1 = current_char_1;
+      }
+
+      /* Add 1 to total score for every joined punctuation regardless of context
+       * and rejtn */
+      if (tessedit_prefer_joined_punct) {
+        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
+             offset += word->best_choice->unichar_lengths()[i++]) {
+          bool current_char_punct =
+              strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
+          if (prev_char_punct || (current_char_punct && i > 0)) {
+            total_score++;
+          }
+          prev_char_punct = current_char_punct;
+        }
+      }
+      // Record what the last character of this word looks like, for the
+      // veto test applied to the next word.
+      // NOTE(review): assumes word_len >= 1; confirm that an empty best choice
+      // cannot reach this point.
+      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
+      // Empty-bodied loop: advances offset to the byte offset of the last
+      // unichar.
+      for (i = 0, offset = 0; i < word_len - 1;
+           offset += word->best_choice->unichar_lengths()[i++]) {
+        ;
+      }
+      prev_char_1 =
+          ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
+           (!word_done &&
+            conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
+    }
+    /* Find next word */
+    do {
+      word_res_it.forward();
+    } while (word_res_it.data()->part_of_combo);
+  } while (!word_res_it.at_first());
+  // Bank the score still pending for the final word.
+  total_score += prev_word_score;
+  if (prev_word_done) {
+    done_word_count++;
+  }
+  if (done_word_count == word_count) {
+    return PERFECT_WERDS;
+  } else {
+    return total_score;
+  }
+}
+
+/**
+ * @name digit_or_numeric_punct()
+ * Tests the unichar at index char_position of the word's best choice.
+ *
+ * @param word word whose best choice is inspected
+ * @param char_position index of the unichar to test (in unichars, not bytes)
+ * @return true if the word's unicharset reports the unichar as a digit, or if
+ *         the best choice's permuter is NUMBER_PERM and the first byte of the
+ *         unichar is contained in numeric_punctuation
+ */
+bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
+  auto *choice = word->best_choice;
+  const auto &lengths = choice->unichar_lengths();
+  const auto &text = choice->unichar_string();
+
+  // Translate the unichar index into a byte offset by summing the byte
+  // lengths of all preceding unichars.
+  int index = 0;
+  int offset = 0;
+  while (index < char_position) {
+    offset += lengths[index];
+    ++index;
+  }
+
+  if (word->uch_set->get_isdigit(text.c_str() + offset, lengths[index])) {
+    return true;
+  }
+  return choice->permuter() == NUMBER_PERM &&
+         numeric_punctuation.contains(text.c_str()[offset]);
+}
+
+/**
+ * @name transform_to_next_perm()
+ * Examines the current word list to find the smallest word gap size. Then walks
+ * the word list closing any gaps of this size by either inserting new
+ * combination words, or extending existing ones.
+ *
+ * The routine COULD be limited to stop it building words longer than N blobs.
+ *
+ * If there are no more gaps then it DELETES the entire list and returns the
+ * empty list to cause termination.
+ *
+ * @param[in,out] words word list to transform; cleared when no gap is left
+ */
+void transform_to_next_perm(WERD_RES_LIST &words) {
+  WERD_RES_IT word_it(&words);
+  WERD_RES_IT prev_word_it(&words);
+  WERD_RES *word;
+  WERD_RES *prev_word;
+  WERD_RES *combo;
+  WERD *copy_word;
+  int16_t prev_right = -INT16_MAX; // -INT16_MAX means "no previous word yet"
+  TBOX box;
+  int16_t gap;
+  int16_t min_gap = INT16_MAX; // INT16_MAX means "no gap found"
+
+  // Pass 1: find the smallest horizontal gap between consecutive words that
+  // are not hidden inside a combination.
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    word = word_it.data();
+    if (!word->part_of_combo) {
+      box = word->word->bounding_box();
+      if (prev_right > -INT16_MAX) {
+        gap = box.left() - prev_right;
+        if (gap < min_gap) {
+          min_gap = gap;
+        }
+      }
+      prev_right = box.right();
+    }
+  }
+  if (min_gap < INT16_MAX) {
+    // Pass 2: close every gap that is no larger than the minimum.
+    prev_right = -INT16_MAX; // back to start
+    word_it.set_to_list(&words);
+    // Note: we can't use cycle_pt due to inserted combos at start of list.
+    for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
+      word = word_it.data();
+      if (!word->part_of_combo) {
+        box = word->word->bounding_box();
+        if (prev_right > -INT16_MAX) {
+          gap = box.left() - prev_right;
+          if (gap <= min_gap) {
+            prev_word = prev_word_it.data();
+            if (prev_word->combination) {
+              // The word on the left is already a combination: extend it.
+              combo = prev_word;
+            } else {
+              /* Make a new combination and insert before
+               * the first word being joined. */
+              copy_word = new WERD;
+              *copy_word = *(prev_word->word);
+              // deep copy
+              combo = new WERD_RES(copy_word);
+              combo->combination = true;
+              combo->x_height = prev_word->x_height;
+              prev_word->part_of_combo = true;
+              prev_word_it.add_before_then_move(combo);
+            }
+            // The combination takes its end-of-line flag from the word
+            // being joined on its right.
+            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
+            if (word->combination) {
+              combo->word->join_on(word->word);
+              // Move blobs to combo
+              // old combo no longer needed
+              delete word_it.extract();
+            } else {
+              // Copy current wd to combo
+              combo->copy_on(word);
+              word->part_of_combo = true;
+            }
+            // The merged word has to be recognised again.
+            combo->done = false;
+            combo->ClearResults();
+          } else {
+            prev_word_it = word_it; // catch up
+          }
+        }
+        prev_right = box.right();
+      }
+    }
+  } else {
+    words.clear(); // signal termination
+  }
+}
+
+/**
+ * @name dump_words()
+ * Debug output for the spacing search; does nothing unless
+ * debug_fix_space_level is above 0.
+ *
+ * In mode 1 the visible words are first recorded in stats_.dump_words_str.
+ * With a debug level above 1 every call prints the list under a header chosen
+ * by mode (1: EXTRACTED, 2: TESTED, 3: RETURNED). At debug level 1 the list is
+ * printed only when improved is true, together with the recorded original.
+ *
+ * @param perm word list to print; words flagged part_of_combo are skipped
+ * @param score score printed in the header
+ * @param mode 1, 2 or 3, selecting the header
+ * @param improved whether the search has improved on the original list
+ */
+void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
+  WERD_RES_IT word_res_it(&perm);
+
+  if (debug_fix_space_level <= 0) {
+    return;
+  }
+
+  // Prints each visible word as "<text>/<permuter> " and then closes the
+  // quoted list that the caller opened.
+  const auto print_visible_words = [&word_res_it]() {
+    for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
+      WERD_RES *entry = word_res_it.data();
+      if (entry->part_of_combo) {
+        continue;
+      }
+      tprintf("%s/%1d ", entry->best_choice->unichar_string().c_str(),
+              static_cast<int>(entry->best_choice->permuter()));
+    }
+    tprintf("\"\n");
+  };
+
+  if (mode == 1) {
+    // Remember the original wording for a later "FIX SPACING" report.
+    stats_.dump_words_str = "";
+    for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
+      WERD_RES *entry = word_res_it.data();
+      if (entry->part_of_combo) {
+        continue;
+      }
+      stats_.dump_words_str += entry->best_choice->unichar_string();
+      stats_.dump_words_str += ' ';
+    }
+  }
+
+  if (debug_fix_space_level > 1) {
+    if (mode == 1) {
+      tprintf("EXTRACTED (%d): \"", score);
+    } else if (mode == 2) {
+      tprintf("TESTED (%d): \"", score);
+    } else if (mode == 3) {
+      tprintf("RETURNED (%d): \"", score);
+    }
+    print_visible_words();
+  } else if (improved) {
+    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
+    print_visible_words();
+  }
+}
+
+/**
+ * @name fixspace_thinks_word_done()
+ * Decides whether a word counts as finished for spacing purposes.
+ *
+ * A word already marked done always qualifies. Otherwise the standard pass 2
+ * conditions for mode 5 in set_done() in reject.c are applied, EXCEPT that
+ * ambiguity is not a reason to refuse: for spacing it does not matter whether
+ * we have of/at, on/an etc.
+ *
+ * @param word word to test
+ * @return true if the word is considered done
+ */
+bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
+  if (word->done) {
+    return true;
+  }
+  if (fixsp_done_mode <= 0) {
+    return false;
+  }
+
+  // Acceptance gate: accepted by tess, or (mode 2) nothing rejected, or
+  // (mode 3) unconditionally.
+  const bool accepted_for_mode =
+      word->tess_accepted ||
+      (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || fixsp_done_mode == 3;
+  if (!accepted_for_mode) {
+    return false;
+  }
+
+  // A best choice containing a space never counts as done.
+  if (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr) {
+    return false;
+  }
+
+  // Finally the choice must come from a dictionary or number permuter.
+  const auto permuter = word->best_choice->permuter();
+  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+         permuter == USER_DAWG_PERM || permuter == NUMBER_PERM;
+}
+
+/**
+ * @name fix_sp_fp_word()
+ * Tries to split a single word by removing noise blobs.
+ *
+ * Nothing happens for repeated-character words, combinations, parts of
+ * combinations, words without the W_DONT_CHOP flag, or words for which
+ * worst_noise_blob() reports no candidate blob. Otherwise the word is taken
+ * out of its list, handed to fix_noisy_space_list() as a one-element list and
+ * the resulting words are spliced back in.
+ *
+ * On return the iterator points to the same place if the word is unchanged,
+ * or to the last of the replacement words.
+ *
+ * @param word_res_it iterator positioned on the word to examine
+ * @param row row passed through to fix_noisy_space_list()
+ * @param block block passed through to fix_noisy_space_list()
+ */
+void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
+  WERD_RES *target = word_res_it.data();
+  auto *raw_word = target->word;
+
+  const bool not_eligible = raw_word->flag(W_REP_CHAR) || target->combination ||
+                            target->part_of_combo || !raw_word->flag(W_DONT_CHOP);
+  if (not_eligible) {
+    return;
+  }
+
+  // Only the index matters here; the noise score output is not used.
+  float unused_score;
+  if (worst_noise_blob(target, &unused_score) < 0) {
+    return;
+  }
+
+  if (debug_fix_space_level > 1) {
+    tprintf("FP fixspace working on \"%s\"\n", target->best_choice->unichar_string().c_str());
+  }
+
+  // Put the rejected blobs into left-to-right order before splitting.
+  raw_word->rej_cblob_list()->sort(c_blob_comparator);
+
+  // Move the word into a list of its own and let the noisy-space search
+  // rewrite that list.
+  WERD_RES_LIST sub_word_list;
+  WERD_RES_IT sub_word_list_it(&sub_word_list);
+  sub_word_list_it.add_after_stay_put(word_res_it.extract());
+  fix_noisy_space_list(sub_word_list, row, block);
+
+  // Splice the result back and advance to the last replacement word.
+  int16_t remaining = sub_word_list.length();
+  word_res_it.add_list_before(&sub_word_list);
+  while (!word_res_it.at_last() && remaining > 1) {
+    word_res_it.forward();
+    remaining--;
+  }
+}
+
+// Tries to improve the spacing of the first word in best_perm by repeatedly
+// breaking a working copy at its noisiest blob and re-matching the pieces.
+// Each candidate permutation is scored with fp_eval_word_spacing(); whenever
+// a candidate scores higher than the best so far, best_perm is replaced by a
+// deep copy of it. The search stops when the best score equals PERFECT_WERDS
+// or when break_noisiest_blob_word() empties the working list.
+void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
+  int16_t best_score;
+  WERD_RES_IT best_perm_it(&best_perm);
+  WERD_RES_LIST current_perm;
+  WERD_RES_IT current_perm_it(&current_perm);
+  WERD_RES *old_word_res;
+  int16_t current_score;
+  bool improved = false;
+
+  best_score = fp_eval_word_spacing(best_perm); // default score
+
+  dump_words(best_perm, best_score, 1, improved);
+
+  // Seed the working permutation with a copy of the first word of best_perm.
+  old_word_res = best_perm_it.data();
+  // Even deep_copy doesn't copy the underlying WERD unless its combination
+  // flag is true!.
+  old_word_res->combination = true; // Kludge to force deep copy
+  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
+  old_word_res->combination = false; // Undo kludge
+
+  break_noisiest_blob_word(current_perm);
+
+  // An empty current_perm is the termination signal from
+  // break_noisiest_blob_word(): no further noise blob could be found.
+  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
+    match_current_words(current_perm, row, block);
+    current_score = fp_eval_word_spacing(current_perm);
+    dump_words(current_perm, current_score, 2, improved);
+    if (current_score > best_score) {
+      // Keep a private copy; current_perm continues to be modified below.
+      best_perm.clear();
+      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
+      best_score = current_score;
+      improved = true;
+    }
+    if (current_score < PERFECT_WERDS) {
+      break_noisiest_blob_word(current_perm);
+    }
+  }
+  dump_words(best_perm, best_score, 3, improved);
+}
+
+/**
+ * break_noisiest_blob_word()
+ * Find the word with the blob which looks like the worst noise.
+ * Break the word into two, deleting the noise blob.
+ *
+ * The blobs before the noise blob go into a new word that is inserted in
+ * front of the original; the original word keeps the blobs after the noise
+ * blob and has its recognition results cleared.
+ * If no word has a suitable noise blob, the list is cleared to signal
+ * termination to the caller.
+ */
+void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
+  WERD_RES_IT word_it(&words);
+  WERD_RES_IT worst_word_it;
+  float worst_noise_score = 9999;
+  int worst_blob_index = -1; // Noisiest blob of noisiest wd
+  int blob_index;            // of wds noisiest blob
+  float noise_score;         // of wds noisiest blob
+  WERD_RES *word_res;
+  C_BLOB_IT blob_it;
+  C_BLOB_IT rej_cblob_it;
+  C_BLOB_LIST new_blob_list;
+  C_BLOB_IT new_blob_it;
+  C_BLOB_IT new_rej_cblob_it;
+  WERD *new_word;
+  int16_t start_of_noise_blob;
+  int16_t i;
+
+  // Pick the word whose worst noise blob has the lowest score of all words.
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    blob_index = worst_noise_blob(word_it.data(), &noise_score);
+    if (blob_index > -1 && worst_noise_score > noise_score) {
+      worst_noise_score = noise_score;
+      worst_blob_index = blob_index;
+      worst_word_it = word_it;
+    }
+  }
+  if (worst_blob_index < 0) {
+    words.clear(); // signal termination
+    return;
+  }
+
+  /* Now split the worst_word_it */
+
+  word_res = worst_word_it.data();
+
+  /* Move blobs before noise blob to a new bloblist */
+
+  new_blob_it.set_to_list(&new_blob_list);
+  blob_it.set_to_list(word_res->word->cblob_list());
+  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
+    new_blob_it.add_after_then_move(blob_it.extract());
+  }
+  // Remember where the noise blob started so that rejected blobs can be
+  // divided between the two words further down.
+  start_of_noise_blob = blob_it.data()->bounding_box().left();
+  delete blob_it.extract(); // throw out noise blob
+
+  // The new word holds the leading blobs and is built from the original WERD.
+  new_word = new WERD(&new_blob_list, word_res->word);
+  new_word->set_flag(W_EOL, false);
+  word_res->word->set_flag(W_BOL, false);
+  word_res->word->set_blanks(1); // After break
+
+  // Rejected blobs that start to the left of the noise blob belong to the
+  // new (leading) word.
+  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
+  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
+  for (; (!rej_cblob_it.empty() &&
+          (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
+       rej_cblob_it.forward()) {
+    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
+  }
+
+  // Insert the new word in front of the original one.
+  auto *new_word_res = new WERD_RES(new_word);
+  new_word_res->combination = true;
+  worst_word_it.add_before_then_move(new_word_res);
+
+  // The original word lost blobs, so its previous results are discarded.
+  word_res->ClearResults();
+}
+
+// Finds the blob of word_res with the lowest noise score that lies between
+// the first and last fixsp_non_noise_limit non-noise blobs of the word.
+// Returns the index of that blob and stores its score in *worst_noise_score.
+// Returns -1 if the word has no rebuild_word, has fewer than 5 blobs, has too
+// few non-noise blobs at either end, or has no blob scoring below
+// kBlnXHeight * fixsp_small_outlines_size.
+// Note that *worst_noise_score is only written once the final search below is
+// reached; the earlier -1 returns leave it untouched.
+int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
+  float noise_score[512];
+  int i;
+  int min_noise_blob; // 1st contender
+  int max_noise_blob; // last contender
+  int non_noise_count;
+  int worst_noise_blob; // Worst blob
+  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
+  float non_noise_limit = kBlnXHeight * 0.8;
+
+  if (word_res->rebuild_word == nullptr) {
+    return -1; // Can't handle cube words.
+  }
+
+  // Normalised.
+  int blob_count = word_res->box_word->length();
+  // noise_score has a fixed capacity of 512 entries.
+  ASSERT_HOST(blob_count <= 512);
+  if (blob_count < 5) {
+    return -1; // too short to split
+  }
+
+    /* Get the noise scores for all blobs */
+
+#ifndef SECURE_NAMES
+  if (debug_fix_space_level > 5) {
+    tprintf("FP fixspace Noise metrics for \"%s\": ",
+            word_res->best_choice->unichar_string().c_str());
+  }
+#endif
+
+  // Accepted blobs are given exactly non_noise_limit so that they count as
+  // non-noise below; only rejected blobs are actually measured.
+  // NOTE(review): only the first min(blob_count, NumBlobs()) entries of
+  // noise_score are filled here, while the loops below read up to blob_count
+  // entries - confirm that box_word->length() never exceeds NumBlobs().
+  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
+    TBLOB *blob = word_res->rebuild_word->blobs[i];
+    if (word_res->reject_map[i].accepted()) {
+      noise_score[i] = non_noise_limit;
+    } else {
+      noise_score[i] = blob_noise_score(blob);
+    }
+
+    if (debug_fix_space_level > 5) {
+      tprintf("%1.1f ", noise_score[i]);
+    }
+  }
+  if (debug_fix_space_level > 5) {
+    tprintf("\n");
+  }
+
+  /* Now find the worst one which is far enough away from the end of the word */
+
+  // Scan from the start until fixsp_non_noise_limit non-noise blobs are seen.
+  non_noise_count = 0;
+  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
+    if (noise_score[i] >= non_noise_limit) {
+      non_noise_count++;
+    }
+  }
+  if (non_noise_count < fixsp_non_noise_limit) {
+    return -1;
+  }
+
+  // i is now one past the last non-noise blob counted from the start.
+  min_noise_blob = i;
+
+  // Same scan, this time from the end of the word backwards.
+  non_noise_count = 0;
+  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
+    if (noise_score[i] >= non_noise_limit) {
+      non_noise_count++;
+    }
+  }
+  if (non_noise_count < fixsp_non_noise_limit) {
+    return -1;
+  }
+
+  // i is now one before the last non-noise blob counted from the end.
+  max_noise_blob = i;
+
+  if (min_noise_blob > max_noise_blob) {
+    return -1;
+  }
+
+  // Only blobs scoring strictly below small_limit can be selected; if none
+  // does, -1 is returned with *worst_noise_score left at small_limit.
+  *worst_noise_score = small_limit;
+  worst_noise_blob = -1;
+  for (i = min_noise_blob; i <= max_noise_blob; i++) {
+    if (noise_score[i] < *worst_noise_score) {
+      worst_noise_blob = i;
+      *worst_noise_score = noise_score[i];
+    }
+  }
+  return worst_noise_blob;
+}
+
+// Computes the noise score of a blob from the size of its outlines.
+// The score is the larger of width and height of the biggest outline
+// bounding box, doubled when the blob has more than 5 outlines and halved
+// when the blob's bounding box sits high (bottom above 4 * kBlnBaselineOffset)
+// or low (top below kBlnBaselineOffset / 2).
+float Tesseract::blob_noise_score(TBLOB *blob) {
+  int16_t num_outlines = 0;
+  int16_t biggest_extent = 0;
+
+  TESSLINE *outline = blob->outlines;
+  while (outline != nullptr) {
+    ++num_outlines;
+    const TBOX outline_box = outline->bounding_box();
+    const int16_t extent =
+        outline_box.height() > outline_box.width() ? outline_box.height() : outline_box.width();
+    if (biggest_extent < extent) {
+      biggest_extent = extent;
+    }
+    outline = outline->next;
+  }
+
+  // penalise LOTS of blobs
+  if (num_outlines > 5) {
+    biggest_extent *= 2;
+  }
+
+  // Lax blob is if high or low
+  const TBOX blob_box = blob->bounding_box();
+  if (blob_box.bottom() > kBlnBaselineOffset * 4 || blob_box.top() < kBlnBaselineOffset / 2) {
+    biggest_extent /= 2;
+  }
+
+  return biggest_extent;
+}
+
+// Debug helper: prints the bounding box, best choice text, blob counts,
+// reject map and the accepted/done flags of a word.
+void fixspace_dbg(WERD_RES *word) {
+  // Compile-time switch for the per-character reject map dump.
+  const bool show_map_detail = false;
+
+  TBOX word_box = word->word->bounding_box();
+  word_box.print();
+  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
+  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
+          word->rebuild_word->NumBlobs(), word->box_word->length());
+  word->reject_map.print(debug_fp);
+  tprintf("\n");
+
+  if (show_map_detail) {
+    tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
+    int16_t pos = 0;
+    while (word->best_choice->unichar_string()[pos] != '\0') {
+      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[pos]);
+      word->reject_map[pos].full_print(debug_fp);
+      pos++;
+    }
+  }
+
+  const char *accepted_text = word->tess_accepted ? "TRUE" : "FALSE";
+  tprintf("Tess Accepted: %s\n", accepted_text);
+  const char *done_text = word->done ? "TRUE" : "FALSE";
+  tprintf("Done flag: %s\n\n", done_text);
+}
+
+/**
+ * fp_eval_word_spacing()
+ * Scores a list of fixed pitch words.
+ *
+ * Only words that are done, tess accepted, chosen by one of the dawg
+ * permuters or safe dictionary words contribute. Within such a word every
+ * accepted character adds one to the score, while every space character or
+ * potential noise blob subtracts one.
+ * The returned score is never negative.
+ */
+int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
+  const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
+  int16_t total = 0;
+
+  WERD_RES_IT word_it(&word_res_list);
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    WERD_RES *word = word_it.data();
+    if (word->rebuild_word == nullptr) {
+      continue; // Can't handle cube words.
+    }
+
+    const bool is_nice_word =
+        word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+        word->best_choice->permuter() == FREQ_DAWG_PERM ||
+        word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0;
+    if (!is_nice_word) {
+      continue;
+    }
+
+    const int num_blobs = word->rebuild_word->NumBlobs();
+    const UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
+    for (int16_t pos = 0; pos < word->best_choice->length() && pos < num_blobs; ++pos) {
+      TBLOB *blob = word->rebuild_word->blobs[pos];
+      const bool suspicious =
+          word->best_choice->unichar_id(pos) == space || blob_noise_score(blob) < small_limit;
+      if (suspicious) {
+        --total; // penalise possibly erroneous non-space
+      } else if (word->reject_map[pos].accepted()) {
+        ++total;
+      }
+    }
+  }
+
+  // Clamp so that callers never see a negative score.
+  return total < 0 ? 0 : total;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.h
@ -0,0 +1,36 @@
+/******************************************************************
+ * File:        fixspace.h  (Formerly fixspace.h)
+ * Description: Implements a pass over the page res, exploring the alternative
+ *              spacing possibilities, trying to use context to improve the
+ *              word spacing
+ * Author:      Phil Cheatle
+ * Created:     Thu Oct 21 11:38:43 BST 1993
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef FIXSPACE_H
+#define FIXSPACE_H
+
+namespace tesseract {
+
+// Forward declarations: only references and pointers to these types are
+// needed by the prototypes below.
+class WERD_RES;
+class WERD_RES_LIST;
+
+// NOTE(review): the next two functions are defined outside this header;
+// judging by their names they set up and step through the search over
+// alternative word spacing permutations - confirm against fixspace.cpp.
+void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
+void transform_to_next_perm(WERD_RES_LIST &words);
+// Prints debug information (bounding box, best choice, blob counts, reject
+// map and accepted/done flags) for the given word.
+void fixspace_dbg(WERD_RES *word);
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/fixxht.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/fixxht.cpp
@ -0,0 +1,215 @@
+/**********************************************************************
+ * File:        fixxht.cpp  (Formerly fixxht.c)
+ * Description: Improve x_ht and look out for case inconsistencies
+ * Author:      Phil Cheatle
+ * Created:     Thu Aug  5 14:11:08 BST 1993
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "float2int.h"
+#include "params.h"
+#include "tesseractclass.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstring>
+
+namespace tesseract {
+
+// Fixxht overview.
+// Premise: Initial estimate of x-height is adequate most of the time, but
+// occasionally it is incorrect. Most notable causes of failure are:
+// 1. Small caps, where the top of the caps is the same as the body text
+// xheight. For small caps words the xheight needs to be reduced to correctly
+// recognize the caps in the small caps word.
+// 2. All xheight lines, such as summer. Here the initial estimate will have
+// guessed that the blob tops are caps and will have placed the xheight too low.
+// 3. Noise/logos beside words, or changes in font size on a line. Such
+// things can blow the statistics and cause an incorrect estimate.
+// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
+// In this case the x-height is often still correct.
+//
+// Algorithm.
+// Compare the vertical position (top only) of alphanumerics in a word with
+// the range of positions in training data (in the unicharset).
+// See CountMisfitTops. If any characters disagree sufficiently with the
+// initial xheight estimate, then recalculate the xheight, re-run OCR on
+// the word, and if the number of vertical misfits goes down, along with
+// either the word rating or certainty, then keep the new xheight.
+// The new xheight is calculated as follows (see ComputeCompatibleXheight):
+// For each alphanumeric character that has a vertically misplaced top
+// (a misfit), yet its bottom is within the acceptable range (ie it is not
+// likely a sub-or super-script) calculate the range of acceptable xheight
+// positions from its range of tops, and give each value in the range a
+// number of votes equal to the distance of its top from its acceptance range.
+// The x-height position with the median of the votes becomes the new
+// x-height. This assumes that most characters will be correctly recognized
+// even if the x-height is incorrect. This is not a terrible assumption, but
+// it is not great. An improvement would be to use a classifier that does
+// not care about vertical position or scaling at all.
+// Separately collect stats on shifted baselines and apply the same logic to
+// computing a best-fit shift to fix the error. If the baseline needs to be
+// shifted, but the x-height is OK, returns the original x-height along with
+// the baseline shift to indicate that recognition needs to re-run.
+
+// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
+// then the char top cannot be used to judge misfits or suggest a new top.
+// Used by both CountMisfitTops and ComputeCompatibleXheight to skip such
+// characters.
+const int kMaxCharTopRange = 48;
+
+// Counts the blobs in word_res whose top is a misfit.
+// Only alphabetic and digit classes are examined. A blob is a misfit when its
+// top (clipped to INT_FEAT_RANGE - 1) lies outside the top range recorded in
+// the unicharset, widened on both sides by x_ht_acceptance_tolerance.
+// Classes whose top range is wider than kMaxCharTopRange are skipped.
+int Tesseract::CountMisfitTops(WERD_RES *word_res) {
+  int misfit_count = 0;
+  const int blob_total = word_res->rebuild_word->NumBlobs();
+  for (int index = 0; index < blob_total; ++index) {
+    TBLOB *blob = word_res->rebuild_word->blobs[index];
+    const UNICHAR_ID class_id = word_res->best_choice->unichar_id(index);
+    if (!unicharset.get_isalpha(class_id) && !unicharset.get_isdigit(class_id)) {
+      continue;
+    }
+
+    int top = blob->bounding_box().top();
+    if (top >= INT_FEAT_RANGE) {
+      top = INT_FEAT_RANGE - 1;
+    }
+
+    int min_bottom, max_bottom, min_top, max_top;
+    unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
+    if (max_top - min_top > kMaxCharTopRange) {
+      continue;
+    }
+
+    const bool within_limits = top >= min_top - x_ht_acceptance_tolerance &&
+                               top <= max_top + x_ht_acceptance_tolerance;
+    const bool bad = !within_limits;
+    if (bad) {
+      ++misfit_count;
+    }
+    if (debug_x_ht_level >= 1) {
+      tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
+              unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
+              static_cast<int>(x_ht_acceptance_tolerance));
+    }
+  }
+  return misfit_count;
+}
+
+// Returns a new x-height maximally compatible with the result in word_res.
+// See comment above for overall algorithm.
+// *baseline_shift is always set: it is the negated bottom shift divided by
+// word_res->denorm.y_scale().
+// The return value is one of:
+//  - the median x-height vote divided by denorm.y_scale(), when that median
+//    differs from kBlnXHeight by at least x_ht_min_change;
+//  - word_res->x_height, when only a non-zero bottom shift was found;
+//  - 0.0f when neither applies.
+float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
+  STATS top_stats(0, UINT8_MAX);
+  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
+  int bottom_shift = 0;
+  int num_blobs = word_res->rebuild_word->NumBlobs();
+  // The first pass runs with bottom_shift == 0 and collects both x-height
+  // votes and bottom shift votes. If the shift votes outweigh the x-height
+  // votes, the median shift is applied and the blobs are re-examined.
+  do {
+    top_stats.clear();
+    shift_stats.clear();
+    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
+      TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
+      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+      if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
+        int top = blob->bounding_box().top() + bottom_shift;
+        // Clip the top to the limit of normalized feature space.
+        if (top >= INT_FEAT_RANGE) {
+          top = INT_FEAT_RANGE - 1;
+        }
+        int bottom = blob->bounding_box().bottom() + bottom_shift;
+        int min_bottom, max_bottom, min_top, max_top;
+        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
+        // Chars with a wild top range would mess up the result so ignore them.
+        if (max_top - min_top > kMaxCharTopRange) {
+          continue;
+        }
+        // Positive when the top lies outside the tolerated range; the value
+        // is the distance from the nearer limit.
+        int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
+                                   top - (max_top + x_ht_acceptance_tolerance));
+        int height = top - kBlnBaselineOffset;
+        if (debug_x_ht_level >= 2) {
+          tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
+                  unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
+                  max_top, bottom, top);
+        }
+        // Use only chars that fit in the expected bottom range, and where
+        // the range of tops is sensibly near the xheight.
+        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
+            bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
+            max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
+          // Compute the x-height position using proportionality between the
+          // actual height and expected height.
+          int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
+          int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
+          if (debug_x_ht_level >= 2) {
+            tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
+          }
+          // The range of expected heights gets a vote equal to the distance
+          // of the actual top from the expected top.
+          for (int y = min_xht; y <= max_xht; ++y) {
+            top_stats.add(y, misfit_dist);
+          }
+        } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
+                    bottom - x_ht_acceptance_tolerance > max_bottom) &&
+                   bottom_shift == 0) {
+          // Bottom is out of range: vote for a shift, on the 1st pass only.
+          // Get the range of required bottom shift.
+          int min_shift = min_bottom - bottom;
+          int max_shift = max_bottom - bottom;
+          if (debug_x_ht_level >= 2) {
+            tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
+          }
+          // The range of expected shifts gets a vote equal to the min distance
+          // of the actual bottom from the expected bottom, spread over the
+          // range of its acceptance.
+          int misfit_weight = abs(min_shift);
+          if (max_shift > min_shift) {
+            misfit_weight /= max_shift - min_shift;
+          }
+          for (int y = min_shift; y <= max_shift; ++y) {
+            shift_stats.add(y, misfit_weight);
+          }
+        } else {
+          if (bottom_shift == 0) {
+            // Things with bottoms that are already ok need to say so, on the
+            // 1st iteration only.
+            shift_stats.add(0, kBlnBaselineOffset);
+          }
+          if (debug_x_ht_level >= 2) {
+            tprintf(" already OK\n");
+          }
+        }
+      }
+    }
+    if (shift_stats.get_total() > top_stats.get_total()) {
+      bottom_shift = IntCastRounded(shift_stats.median());
+      if (debug_x_ht_level >= 2) {
+        tprintf("Applying bottom shift=%d\n", bottom_shift);
+      }
+    }
+    // shift_stats only receives votes while bottom_shift == 0, so once a
+    // shift has been applied the next pass leaves it empty and the loop ends.
+  } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
+  // Baseline shift is opposite sign to the bottom shift.
+  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
+  if (debug_x_ht_level >= 2) {
+    tprintf("baseline shift=%g\n", *baseline_shift);
+  }
+  if (top_stats.get_total() == 0) {
+    // No x-height votes: report the unchanged x-height if a baseline shift
+    // still requires recognition to be re-run, otherwise report no change.
+    return bottom_shift != 0 ? word_res->x_height : 0.0f;
+  }
+  // The new xheight is just the median vote, which is then scaled out
+  // of BLN space back to pixel space to get the x-height in pixel space.
+  float new_xht = top_stats.median();
+  if (debug_x_ht_level >= 2) {
+    tprintf("Median xht=%f\n", new_xht);
+    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
+            new_xht / word_res->denorm.y_scale());
+  }
+  // The xheight must change by at least x_ht_min_change to be used.
+  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
+    return new_xht / word_res->denorm.y_scale();
+  } else {
+    return bottom_shift != 0 ? word_res->x_height : 0.0f;
+  }
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/linerec.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/linerec.cpp
@ -0,0 +1,314 @@
+///////////////////////////////////////////////////////////////////////
+// File:        linerec.cpp
+// Description: Top-level line-based recognition module for Tesseract.
+// Author:      Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include <allheaders.h>
+#include "boxread.h"
+#include "imagedata.h" // for ImageData
+#include "lstmrecognizer.h"
+#include "pageres.h"
+#include "recodebeam.h"
+#include "tprintf.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+// Scale factor to make certainty more comparable to Tesseract.
+// SearchWords multiplies each word certainty by this factor, and
+// LSTMRecognizeWord divides kWorstDictCertainty by it before passing the
+// threshold to the LSTM recognizer.
+const float kCertaintyScale = 7.0f;
+// Worst acceptable certainty for a dictionary word.
+const float kWorstDictCertainty = -25.0f;
+
+// Produces training data for a line recognizer such as LSTM.
+// The boxes read for input_imagename are grouped into text lines, and the
+// resulting pages are serialized into a DocumentData stored in
+// output_basename + ".lstmf". When applybox_page > 0 the existing document is
+// loaded first so that the pages already in it are kept.
+// Returns true on success and false as soon as any step fails.
+bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
+                                    BLOCK_LIST *block_list) {
+  const std::string lstmf_name = output_basename + ".lstmf";
+  DocumentData images(lstmf_name);
+
+  // Load existing document for the previous pages.
+  const bool has_previous_pages = applybox_page > 0;
+  if (has_previous_pages && !images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
+    tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
+    return false;
+  }
+
+  // Get the boxes for this page, if there are any.
+  std::vector<TBOX> boxes;
+  std::vector<std::string> texts;
+  const bool boxes_read =
+      ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr);
+  if (!boxes_read || boxes.empty()) {
+    tprintf("Failed to read boxes from %s\n", input_imagename);
+    return false;
+  }
+
+  TrainFromBoxes(boxes, texts, block_list, &images);
+  if (images.PagesSize() == 0) {
+    tprintf("Failed to read pages from %s\n", input_imagename);
+    return false;
+  }
+
+  images.Shuffle();
+  const bool saved = images.SaveDocument(lstmf_name.c_str(), nullptr);
+  if (!saved) {
+    tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
+  }
+  return saved;
+}
+
+// Generates training data for training a line recognizer, eg LSTM.
+// Breaks the boxes into lines, normalizes them, converts to ImageData and
+// appends them to the given training_data.
+// A text entry equal to "\t" marks the end of a line in the box data. Lines
+// for which no overlapping text block is found, or for which GetLineData
+// returns nullptr, are skipped.
+void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
+                               BLOCK_LIST *block_list, DocumentData *training_data) {
+  auto box_count = boxes.size();
+  // Process all the text lines in this page, as defined by the boxes.
+  unsigned end_box = 0;
+  // Don't let \t, which marks newlines in the box file, get into the line
+  // content, as that makes the line unusable in training.
+  while (end_box < texts.size() && texts[end_box] == "\t") {
+    ++end_box;
+  }
+  // Each iteration handles the line [start_box, end_box).
+  for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
+    // Find the textline of boxes starting at start and their bounding box.
+    TBOX line_box = boxes[start_box];
+    std::string line_str = texts[start_box];
+    for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
+      line_box += boxes[end_box];
+      line_str += texts[end_box];
+    }
+    // Find the most overlapping block.
+    BLOCK *best_block = nullptr;
+    int best_overlap = 0;
+    BLOCK_IT b_it(block_list);
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      BLOCK *block = b_it.data();
+      if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
+        continue; // Not a text block.
+      }
+      // Compare in the same coordinate space as the line box by applying the
+      // block's re_rotation to its bounding box.
+      TBOX block_box = block->pdblk.bounding_box();
+      block_box.rotate(block->re_rotation());
+      if (block_box.major_overlap(line_box)) {
+        TBOX overlap_box = line_box.intersection(block_box);
+        if (overlap_box.area() > best_overlap) {
+          best_overlap = overlap_box.area();
+          best_block = block;
+        }
+      }
+    }
+    ImageData *imagedata = nullptr;
+    if (best_block == nullptr) {
+      tprintf("No block overlapping textline: %s\n", line_str.c_str());
+    } else {
+      imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
+    }
+    if (imagedata != nullptr) {
+      // NOTE(review): presumably training_data takes ownership of imagedata
+      // here - confirm against DocumentData::AddPageToDocument.
+      training_data->AddPageToDocument(imagedata);
+    }
+    // Don't let \t, which marks newlines in the box file, get into the line
+    // content, as that makes the line unusable in training.
+    while (end_box < texts.size() && texts[end_box] == "\t") {
+      ++end_box;
+    }
+  }
+}
+
+// Builds the ImageData for one text line.
+// The image of line_box is fetched with GetRectImage, and the boxes and texts
+// in the index range [start_box, end_box) are attached to it, with the boxes
+// rotated and shifted so that they are relative to the returned image.
+// The image is not normalized in any way.
+// Returns nullptr if GetRectImage fails.
+ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
+                                  const std::vector<std::string> &texts, int start_box, int end_box,
+                                  const BLOCK &block) {
+  TBOX clipped_box;
+  ImageData *line_image = GetRectImage(line_box, block, kImagePadding, &clipped_box);
+  if (line_image != nullptr) {
+    line_image->set_page_number(applybox_page);
+
+    // Copy the boxes and shift them so they are relative to the image.
+    const FCOORD inverse_rotation(block.re_rotation().x(), -block.re_rotation().y());
+    const ICOORD offset = -clipped_box.botleft();
+    std::vector<TBOX> relative_boxes;
+    std::vector<std::string> relative_texts;
+    int index = start_box;
+    while (index < end_box) {
+      TBOX relative_box = boxes[index];
+      relative_box.rotate(inverse_rotation);
+      relative_box.move(offset);
+      relative_boxes.push_back(relative_box);
+      relative_texts.push_back(texts[index]);
+      ++index;
+    }
+
+    // Every box of the line belongs to the current page.
+    std::vector<int> page_numbers;
+    page_numbers.resize(relative_boxes.size(), applybox_page);
+    line_image->AddBoxes(relative_boxes, relative_texts, page_numbers);
+  }
+  return line_image;
+}
+
+// Helper gets the image of a rectangle, using the block.re_rotation() if
+// needed to get to the image, and rotating the result back to horizontal
+// layout. (CJK characters will be on their left sides) The vertical text flag
+// is set in the returned ImageData if the text was originally vertical, which
+// can be used to invoke a different CJK recognition engine. The revised_box
+// is also returned to enable calculation of output bounding boxes.
+// The box is padded by padding on each side and clipped to the image bounds.
+// Returns nullptr if the clipped box is null or if clipping the image fails.
+// The returned ImageData is allocated with new.
+ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
+                                   TBOX *revised_box) const {
+  TBOX wbox = box;
+  wbox.pad(padding, padding);
+  *revised_box = wbox;
+  // Number of clockwise 90 degree rotations needed to get back to tesseract
+  // coords from the clipped image.
+  int num_rotations = 0;
+  if (block.re_rotation().y() > 0.0f) {
+    num_rotations = 1;
+  } else if (block.re_rotation().x() < 0.0f) {
+    num_rotations = 2;
+  } else if (block.re_rotation().y() < 0.0f) {
+    num_rotations = 3;
+  }
+  // Handle two cases automatically: 1 the box came from the block, 2 the box
+  // came from a box file, and refers to the image, which the block may not.
+  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
+    revised_box->rotate(block.re_rotation());
+  }
+  // Now revised_box always refers to the image.
+  // BestPix is never colormapped, but may be of any depth.
+  Image pix = BestPix();
+  int width = pixGetWidth(pix);
+  int height = pixGetHeight(pix);
+  TBOX image_box(0, 0, width, height);
+  // Clip to image bounds;
+  *revised_box &= image_box;
+  if (revised_box->null_box()) {
+    return nullptr;
+  }
+  // The y coordinate is flipped (height - top) when building the clip box.
+  // NOTE(review): presumably because Leptonica measures y from the top of
+  // the image while TBOX measures it from the bottom - confirm.
+  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
+                            revised_box->height());
+  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
+  boxDestroy(&clip_box);
+  if (box_pix == nullptr) {
+    return nullptr;
+  }
+  if (num_rotations > 0) {
+    // Replace the clipped image by its rotated version.
+    Image rot_pix = pixRotateOrth(box_pix, num_rotations);
+    box_pix.destroy();
+    box_pix = rot_pix;
+  }
+  // Convert sub-8-bit images to 8 bit.
+  int depth = pixGetDepth(box_pix);
+  if (depth < 8) {
+    Image grey;
+    grey = pixConvertTo8(box_pix, false);
+    box_pix.destroy();
+    box_pix = grey;
+  }
+  bool vertical_text = false;
+  if (num_rotations > 0) {
+    // Rotated the clipped revised box back to internal coordinates.
+    FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
+    revised_box->rotate(rotation);
+    // One or three quarter turns mean the text was vertical; two do not.
+    if (num_rotations != 2) {
+      vertical_text = true;
+    }
+  }
+  return new ImageData(vertical_text, box_pix);
+}
+
+// Runs the LSTM recognizer on a word or group of words and stores the
+// resulting WERD_RES objects in *words.
+// Analogous to classify_word_pass1, but can handle a group of words as well.
+// Nothing is added to *words if the word image cannot be obtained.
+void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
+                                  PointerVector<WERD_RES> *words) {
+  // Get the word image - no frills.
+  TBOX word_box = word->word->bounding_box();
+  const bool use_whole_image =
+      tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE;
+  if (use_whole_image) {
+    // In single word mode, use the whole image without any other row/word
+    // interpretation.
+    word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
+  } else {
+    // Grow the box vertically so that it spans the row's descender and
+    // ascender limits at the horizontal centre of the word.
+    const float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
+    const float descender_limit = baseline + row->descenders();
+    if (descender_limit < word_box.bottom()) {
+      word_box.set_bottom(descender_limit);
+    }
+    const float ascender_limit = baseline + row->x_height() + row->ascenders();
+    if (ascender_limit > word_box.top()) {
+      word_box.set_top(ascender_limit);
+    }
+  }
+
+  // word_box is both the input and the revised-box output of GetRectImage.
+  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
+  if (im_data == nullptr) {
+    return;
+  }
+
+  const bool do_invert = tessedit_do_invert;
+  const float worst_dict_certainty = kWorstDictCertainty / kCertaintyScale;
+  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
+                                  worst_dict_certainty, word_box, words, lstm_choice_mode,
+                                  lstm_choice_iterations);
+  delete im_data;
+  SearchWords(words);
+}
+
+// Finalizes the words produced by the LSTM recognizer.
+// A word without a best_choice is a dud and is replaced by a fake result
+// built from the LSTM recognizer's unicharset. For every other word the
+// best_state is filled from the best choice, the reject map is initialised,
+// the status flags are set, the certainty is scaled by kCertaintyScale, and
+// tess_accepted is finally decided by the stopper dictionary.
+// The stopper dictionary is the LSTM recognizer's dictionary when it has one,
+// otherwise this Tesseract's own dictionary.
+void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
+  const Dict *stopper_dict = lstm_recognizer_->GetDict();
+  if (stopper_dict == nullptr) {
+    stopper_dict = &getDict();
+  }
+  for (int w = 0; w < words->size(); ++w) {
+    WERD_RES *word = (*words)[w];
+    if (word->best_choice == nullptr) {
+      // It is a dud.
+      word->SetupFake(lstm_recognizer_->GetUnicharset());
+      continue;
+    }
+    // Set the best state.
+    for (int i = 0; i < word->best_choice->length(); ++i) {
+      int length = word->best_choice->state(i);
+      word->best_state.push_back(length);
+    }
+    word->reject_map.initialise(word->best_choice->length());
+    word->tess_failed = false;
+    // Provisional value; the stopper dictionary has the final say below.
+    word->tess_accepted = true;
+    word->tess_would_adapt = false;
+    word->done = true;
+    word->tesseract = this;
+    // The word is only as certain as the weaker of its space certainty and
+    // its best choice certainty.
+    float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
+    word_certainty *= kCertaintyScale;
+    if (getDict().stopper_debug_level >= 1) {
+      // "scaled" and "final" are the same value at this point.
+      tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
+              word->best_choice->certainty(), word->space_certainty, word_certainty,
+              word_certainty);
+      word->best_choice->print();
+    }
+    word->best_choice->set_certainty(word_certainty);
+
+    word->tess_accepted = stopper_dict->AcceptableResult(word);
+  }
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/ltrresultiterator.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/ltrresultiterator.cpp
@ -0,0 +1,507 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ltrresultiterator.cpp
+// Description: Iterator for tesseract results in strict left-to-right
+//              order that avoids using tesseract internal data structures.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/ltrresultiterator.h>
+
+#include "pageres.h"
+#include "tesseractclass.h"
+
+#include <allheaders.h>
+
+namespace tesseract {
+
+// Constructs the iterator by forwarding every argument unchanged to the
+// PageIterator base class. The line separator and the paragraph separator
+// both start out as "\n".
+LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
+                                     int scaled_yres, int rect_left, int rect_top, int rect_width,
+                                     int rect_height)
+    : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top, rect_width,
+                   rect_height)
+    , line_separator_("\n")
+    , paragraph_separator_("\n") {}
+
+// Destructor.
+// It is defaulted, but defined out of line in this file so the compiler can
+// create a single vtable instead of weak vtables in every compilation unit.
+LTRResultIterator::~LTRResultIterator() = default;
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// object at the given level. Use delete [] to free after use.
+// Returns nullptr if the iterator is already past the last word.
+// For RIL_SYMBOL and RIL_WORD the text of the single symbol/word is returned.
+// For the higher levels the words are joined with a single space, each text
+// line is terminated with line_separator_, and paragraph_separator_ is added
+// whenever the end of a paragraph was reached.
+char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+  if (it_->word() == nullptr) {
+    return nullptr; // Already at the end!
+  }
+  std::string text;
+  // Work on a copy so that the position of this iterator is left untouched.
+  PAGE_RES_IT res_it(*it_);
+  WERD_CHOICE *best_choice = res_it.word()->best_choice;
+  ASSERT_HOST(best_choice != nullptr);
+  if (level == RIL_SYMBOL) {
+    text = res_it.word()->BestUTF8(blob_index_, false);
+  } else if (level == RIL_WORD) {
+    text = best_choice->unichar_string();
+  } else {
+    bool eol = false; // end of line?
+    bool eop = false; // end of paragraph?
+    do {              // for each paragraph in a block
+      do {            // for each text line in a paragraph
+        do {          // for each word in a text line
+          best_choice = res_it.word()->best_choice;
+          ASSERT_HOST(best_choice != nullptr);
+          text += best_choice->unichar_string();
+          text += " ";
+          res_it.forward();
+          // The line ends as soon as the row changes.
+          eol = res_it.row() != res_it.prev_row();
+        } while (!eol);
+        // Drop the space that was appended after the last word of the line.
+        text.resize(text.length() - 1);
+        text += line_separator_;
+        // The block comparison comes first, so the row pointers are only
+        // dereferenced when the iterator is still inside the same block.
+        eop = res_it.block() != res_it.prev_block() ||
+              res_it.row()->row->para() != res_it.prev_row()->row->para();
+      } while (level != RIL_TEXTLINE && !eop); // RIL_TEXTLINE stops after one line.
+      if (eop) {
+        text += paragraph_separator_;
+      }
+    } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
+  }
+  // Copy into a caller-owned buffer, including the terminating null.
+  int length = text.length() + 1;
+  char *result = new char[length];
+  strncpy(result, text.c_str(), length);
+  return result;
+}
+
+// Set the string inserted at the end of each text line. "\n" by default.
+// NOTE(review): the declaration of line_separator_ is not visible here; if it
+// is a raw pointer, new_line is not copied and must outlive this iterator -
+// confirm against the header.
+void LTRResultIterator::SetLineSeparator(const char *new_line) {
+  line_separator_ = new_line;
+}
+
+// Set the string inserted at the end of each paragraph. "\n" by default.
+// NOTE(review): the declaration of paragraph_separator_ is not visible here;
+// if it is a raw pointer, new_para is not copied and must outlive this
+// iterator - confirm against the header.
+void LTRResultIterator::SetParagraphSeparator(const char *new_para) {
+  paragraph_separator_ = new_para;
+}
+
+// Returns the mean confidence of the current object at the given level,
+// to be read as a percent probability (0.0f-100.0f).
+// The certainty of every word that belongs to the object (or of the single
+// symbol for RIL_SYMBOL) is averaged, mapped through 100 + 5 * mean and
+// clipped to the valid range. Returns 0.0f when the iterator is already at
+// the end or when level matches none of the handled cases.
+float LTRResultIterator::Confidence(PageIteratorLevel level) const {
+  if (it_->word() == nullptr) {
+    return 0.0f; // Already at the end!
+  }
+  // Walk a copy so that the position of this iterator is left untouched.
+  PAGE_RES_IT cursor(*it_);
+  WERD_CHOICE *best_choice = cursor.word()->best_choice;
+  ASSERT_HOST(best_choice != nullptr);
+  float certainty_sum = 0.0f;
+  int samples = 0;
+  // Adds the certainty of the word under the cursor, then steps the cursor on.
+  auto consume_word = [&]() {
+    best_choice = cursor.word()->best_choice;
+    ASSERT_HOST(best_choice != nullptr);
+    certainty_sum += best_choice->certainty();
+    ++samples;
+    cursor.forward();
+  };
+  switch (level) {
+    case RIL_BLOCK:
+      do {
+        consume_word();
+      } while (cursor.block() == cursor.prev_block());
+      break;
+    case RIL_PARA:
+      do {
+        consume_word();
+      } while (cursor.block() == cursor.prev_block() &&
+               cursor.row()->row->para() == cursor.prev_row()->row->para());
+      break;
+    case RIL_TEXTLINE:
+      do {
+        consume_word();
+      } while (cursor.row() == cursor.prev_row());
+      break;
+    case RIL_WORD:
+      certainty_sum += best_choice->certainty();
+      ++samples;
+      break;
+    case RIL_SYMBOL:
+      certainty_sum += best_choice->certainty(blob_index_);
+      ++samples;
+      break;
+  }
+  if (samples <= 0) {
+    return 0.0f;
+  }
+  const float mean_certainty = certainty_sum / samples;
+  return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);
+}
+
+// Writes the metrics of the current text row into the three output pointers:
+// *descenders and *ascenders are copied from the row, and *row_height is
+// x_height + ascenders - descenders.
+// The current row is dereferenced without a null check.
+void LTRResultIterator::RowAttributes(float *row_height, float *descenders,
+                                      float *ascenders) const {
+  auto *text_row = it_->row()->row;
+  *row_height = text_row->x_height() + text_row->ascenders() - text_row->descenders();
+  *descenders = text_row->descenders();
+  *ascenders = text_row->ascenders();
+}
+
+// Returns the font attributes of the current word. If iterating at a higher
+// level object than words, eg textlines, then this will return the
+// attributes of the first word in that textline.
+// The actual return value is a string representing a font name. It points
+// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+// the iterator itself, ie rendered invalid by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+// Every output pointer is written unconditionally, so none may be nullptr.
+// When no font name is available (end of iteration, no font info for the
+// word, or a build with DISABLED_LEGACY_ENGINE) nullptr is returned, all
+// boolean outputs are false and *font_id is -1.
+const char *LTRResultIterator::WordFontAttributes(bool *is_bold, bool *is_italic,
+                                                  bool *is_underlined, bool *is_monospace,
+                                                  bool *is_serif, bool *is_smallcaps,
+                                                  int *pointsize, int *font_id) const {
+  const char *result = nullptr;
+
+  if (it_->word() == nullptr) {
+    // Already at the end!
+    *pointsize = 0;
+  } else {
+    float row_height =
+        it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();
+    // Convert from pixels to printers points.
+    // A non-positive resolution cannot be converted, so 0 is reported instead.
+    *pointsize =
+        scaled_yres_ > 0 ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5) : 0;
+
+#ifndef DISABLED_LEGACY_ENGINE
+    const FontInfo *font_info = it_->word()->fontinfo;
+    if (font_info) {
+      // Font information available.
+      *font_id = font_info->universal_id;
+      *is_bold = font_info->is_bold();
+      *is_italic = font_info->is_italic();
+      *is_underlined = false; // TODO(rays) fix this!
+      *is_monospace = font_info->is_fixed_pitch();
+      *is_serif = font_info->is_serif();
+      result = font_info->name;
+    }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+    *is_smallcaps = it_->word()->small_caps;
+  }
+
+  // No font name: reset every attribute, including the small caps flag that
+  // may just have been set above.
+  if (!result) {
+    *is_bold = false;
+    *is_italic = false;
+    *is_underlined = false;
+    *is_monospace = false;
+    *is_serif = false;
+    *is_smallcaps = false;
+    *font_id = -1;
+  }
+
+  return result;
+}
+
+// Returns the name of the language used to recognize the current word, or
+// nullptr when there is no current word or the word has no recognizer
+// attached to it.
+const char *LTRResultIterator::WordRecognitionLanguage() const {
+  const auto *current = it_->word();
+  if (current == nullptr) {
+    return nullptr;
+  }
+  if (current->tesseract == nullptr) {
+    return nullptr;
+  }
+  return current->tesseract->lang.c_str();
+}
+
+// Return the overall directionality of this word: DIR_RIGHT_TO_LEFT or
+// DIR_LEFT_TO_RIGHT when only one kind of character is present, DIR_MIX
+// when both are present, and DIR_NEUTRAL when neither is present or the
+// iterator is already at the end.
+StrongScriptDirection LTRResultIterator::WordDirection() const {
+  auto *current = it_->word();
+  if (current == nullptr) {
+    return DIR_NEUTRAL;
+  }
+  const bool rtl_seen = current->AnyRtlCharsInWord();
+  const bool ltr_seen = current->AnyLtrCharsInWord();
+  if (rtl_seen == ltr_seen) {
+    // Either both kinds of character are present, or none at all.
+    return rtl_seen ? DIR_MIX : DIR_NEUTRAL;
+  }
+  return rtl_seen ? DIR_RIGHT_TO_LEFT : DIR_LEFT_TO_RIGHT;
+}
+
+// Returns true if the current word was found in a dictionary, i.e. the
+// permuter of its best choice is one of the system, frequent-word or user
+// dawg permuters. Returns false when the iterator is already at the end.
+bool LTRResultIterator::WordIsFromDictionary() const {
+  const auto *current = it_->word();
+  if (current == nullptr) {
+    return false; // Already at the end!
+  }
+  switch (current->best_choice->permuter()) {
+    case SYSTEM_DAWG_PERM:
+    case FREQ_DAWG_PERM:
+    case USER_DAWG_PERM:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns the number of blanks before the current word.
+// When the iterator is already at the end, 1 is returned.
+int LTRResultIterator::BlanksBeforeWord() const {
+  const auto *current = it_->word();
+  return current == nullptr ? 1 : current->word->space();
+}
+
+// Returns true if the current word is numeric, i.e. the permuter of its best
+// choice is NUMBER_PERM. Returns false when the iterator is already at the
+// end.
+bool LTRResultIterator::WordIsNumeric() const {
+  const auto *current = it_->word();
+  return current != nullptr && current->best_choice->permuter() == NUMBER_PERM;
+}
+
+// Returns true if the word contains blamer information: there must be a
+// current word, it must own a blamer bundle, and that bundle must report
+// debug info.
+bool LTRResultIterator::HasBlamerInfo() const {
+  const auto *current = it_->word();
+  if (current == nullptr || current->blamer_bundle == nullptr) {
+    return false;
+  }
+  return current->blamer_bundle->HasDebugInfo();
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
+// of the current word, or nullptr if there is no current word or the word
+// has no BlamerBundle.
+const void *LTRResultIterator::GetParamsTrainingBundle() const {
+  if (it_->word() == nullptr || it_->word()->blamer_bundle == nullptr) {
+    return nullptr;
+  }
+  return &(it_->word()->blamer_bundle->params_training_bundle());
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Returns the pointer to the string with blamer information for this word.
+// Assumes that there is a current word and that the word's blamer_bundle is
+// not nullptr; neither is checked here (HasBlamerInfo() tests both).
+// NOTE(review): the pointer comes from c_str() on the value returned by
+// debug(); presumably debug() returns a reference to a string owned by the
+// bundle - confirm, otherwise the result would dangle.
+const char *LTRResultIterator::GetBlamerDebug() const {
+  return it_->word()->blamer_bundle->debug().c_str();
+}
+
+// Returns the pointer to the string with misadaption information for this word.
+// Assumes that there is a current word and that the word's blamer_bundle is
+// not nullptr; neither is checked here.
+// NOTE(review): the pointer comes from c_str() on the value returned by
+// misadaption_debug(); presumably that is a reference to a string owned by
+// the bundle - confirm, otherwise the result would dangle.
+const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
+  return it_->word()->blamer_bundle->misadaption_debug().c_str();
+}
+
+// Returns true if a truth string was recorded for the current word.
+// That requires a current word, a blamer bundle on that word, and a bundle
+// that does not report NoTruth().
+bool LTRResultIterator::HasTruthString() const {
+  const auto *current = it_->word();
+  return current != nullptr &&               // not already at the end
+         current->blamer_bundle != nullptr && // bundle present
+         !current->blamer_bundle->NoTruth();  // and it carries truth information
+}
+
+// Returns true if the given string is equivalent to the truth string for
+// the current word. Returns false when no truth string is available.
+// The string is turned into a WERD_CHOICE using the unicharset of the word
+// and handed to the blamer bundle for comparison.
+bool LTRResultIterator::EquivalentToTruth(const char *str) const {
+  if (HasTruthString()) {
+    ASSERT_HOST(it_->word()->uch_set != nullptr);
+    auto *current = it_->word();
+    WERD_CHOICE candidate(str, *(current->uch_set));
+    return current->blamer_bundle->ChoiceIsCorrect(&candidate);
+  }
+  return false;
+}
+
+// Returns the null terminated UTF-8 encoded truth string for the current word,
+// or nullptr when the word has no truth string.
+// The buffer is allocated with new[]; use delete [] to free after use.
+char *LTRResultIterator::WordTruthUTF8Text() const {
+  if (!HasTruthString()) {
+    return nullptr;
+  }
+  const std::string truth = it_->word()->blamer_bundle->TruthString();
+  // One extra byte for the terminating null.
+  const int buffer_size = truth.length() + 1;
+  char *buffer = new char[buffer_size];
+  strncpy(buffer, truth.c_str(), buffer_size);
+  return buffer;
+}
+
+// Returns the null terminated UTF-8 encoded normalized OCR string for the
+// current word, built by concatenating the normed unichar of every unichar
+// id in the word's best choice. Returns nullptr when already at the end.
+// The buffer is allocated with new[]; use delete [] to free after use.
+char *LTRResultIterator::WordNormedUTF8Text() const {
+  auto *current = it_->word();
+  if (current == nullptr) {
+    return nullptr; // Already at the end!
+  }
+  WERD_CHOICE *best_choice = current->best_choice;
+  const UNICHARSET *charset = current->uch_set;
+  ASSERT_HOST(best_choice != nullptr);
+  std::string normed;
+  for (int pos = 0; pos < best_choice->length(); ++pos) {
+    normed += charset->get_normed_unichar(best_choice->unichar_id(pos));
+  }
+  // One extra byte for the terminating null.
+  const int buffer_size = normed.length() + 1;
+  char *buffer = new char[buffer_size];
+  strncpy(buffer, normed.c_str(), buffer_size);
+  return buffer;
+}
+
+// Returns a pointer to serialized choice lattice.
+// Fills lattice_size with the number of bytes in lattice data.
+// Returns nullptr, leaving *lattice_size untouched, when the iterator is
+// already at the end or the current word has no blamer bundle.
+const char *LTRResultIterator::WordLattice(int *lattice_size) const {
+  const auto *current = it_->word();
+  if (current == nullptr || current->blamer_bundle == nullptr) {
+    return nullptr;
+  }
+  auto *bundle = current->blamer_bundle;
+  *lattice_size = bundle->lattice_size();
+  return bundle->lattice_data();
+}
+
+// Returns true if the current symbol is a superscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+// Always false while cblob_it_ is set or when there is no current word.
+bool LTRResultIterator::SymbolIsSuperscript() const {
+  if (cblob_it_ != nullptr || it_->word() == nullptr) {
+    return false;
+  }
+  const auto position = it_->word()->best_choice->BlobPosition(blob_index_);
+  return position == SP_SUPERSCRIPT;
+}
+
+// Returns true if the current symbol is a subscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+// Always false while cblob_it_ is set or when there is no current word.
+bool LTRResultIterator::SymbolIsSubscript() const {
+  if (cblob_it_ != nullptr || it_->word() == nullptr) {
+    return false;
+  }
+  const auto position = it_->word()->best_choice->BlobPosition(blob_index_);
+  return position == SP_SUBSCRIPT;
+}
+
+// Returns true if the current symbol is a dropcap.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+// Always false while cblob_it_ is set or when there is no current word.
+bool LTRResultIterator::SymbolIsDropcap() const {
+  if (cblob_it_ != nullptr || it_->word() == nullptr) {
+    return false;
+  }
+  const auto position = it_->word()->best_choice->BlobPosition(blob_index_);
+  return position == SP_DROPCAP;
+}
+
+// Builds an iterator over the alternative choices for the symbol that
+// result_it currently points at. result_it must have a current word.
+// Two sources are set up independently:
+//  - LSTM choices, taken from the word's CTC_symbol_choices when an LSTM
+//    language is loaded and that table is not empty;
+//  - legacy blob choices, taken from the word's ratings when legacy trained
+//    data is loaded or lstm_choice_mode is off.
+// The address of result_it.blob_index_ is stored, so result_it has to stay
+// alive for as long as Timesteps() may be called on this object.
+ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
+  ASSERT_HOST(result_it.it_->word() != nullptr);
+  word_res_ = result_it.it_->word();
+  oemLSTM_ = word_res_->tesseract->AnyLSTMLang();
+  // Is there legacy engine related trained data?
+  bool oemLegacy = word_res_->tesseract->AnyTessLang();
+  // Is lstm_choice_mode activated?
+  bool lstm_choice_mode = word_res_->tesseract->lstm_choice_mode;
+  rating_coefficient_ = word_res_->tesseract->lstm_rating_coefficient;
+  blanks_before_word_ = result_it.BlanksBeforeWord();
+  BLOB_CHOICE_LIST *choices = nullptr;
+  tstep_index_ = &result_it.blob_index_;
+  if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {
+    // strcmp() is non-zero when the very first choice is not a space; in that
+    // case no leading blank entries are skipped when indexing below.
+    if (!word_res_->CTC_symbol_choices[0].empty() &&
+        strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
+      blanks_before_word_ = 0;
+    }
+    auto index = *tstep_index_;
+    index += blanks_before_word_;
+    if (index < word_res_->CTC_symbol_choices.size()) {
+      LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
+      // Removes the " " entries from the word's own choice list in place.
+      filterSpaces();
+    }
+  }
+  if ((oemLegacy || !lstm_choice_mode) && word_res_->ratings != nullptr) {
+    choices = word_res_->GetBlobChoices(result_it.blob_index_);
+  }
+  if (choices != nullptr && !choices->empty()) {
+    // Owned by this object; released in the destructor.
+    choice_it_ = new BLOB_CHOICE_IT(choices);
+    choice_it_->mark_cycle_pt();
+  } else {
+    choice_it_ = nullptr;
+  }
+  // NOTE(review): LSTM_choices_ is only assigned inside the branch above, so
+  // this test relies on it having a null default in the class declaration -
+  // confirm against the header.
+  if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+    LSTM_choice_it_ = LSTM_choices_->begin();
+  }
+}
+// Releases the legacy choice iterator allocated by the constructor.
+// choice_it_ may be nullptr, which delete accepts.
+ChoiceIterator::~ChoiceIterator() {
+  delete choice_it_;
+}
+
+// Moves to the next choice for the symbol and returns false if there
+// are none left.
+// On the LSTM path the iterator is never moved onto end(): when it stands on
+// the last choice it is left there and false is returned, so the current
+// choice stays readable. On the legacy path the list iterator is advanced
+// and false is returned once it has cycled back to its starting point.
+bool ChoiceIterator::Next() {
+  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+    // Refuse to advance when standing on the last choice, and also when the
+    // iterator is already at end(): incrementing an end iterator is undefined.
+    if (LSTM_choice_it_ == LSTM_choices_->end() ||
+        next(LSTM_choice_it_) == LSTM_choices_->end()) {
+      return false;
+    }
+    ++LSTM_choice_it_;
+    return true;
+  }
+  if (choice_it_ == nullptr) {
+    return false;
+  }
+  choice_it_->forward();
+  return !choice_it_->cycled_list();
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice. Do NOT use delete [] to free after use.
+// LSTM choices take precedence when available; otherwise the legacy choice
+// is looked up in the unicharset of the word. Returns nullptr when neither
+// source has a choice.
+const char *ChoiceIterator::GetUTF8Text() const {
+  const bool use_lstm = oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty();
+  if (use_lstm) {
+    return LSTM_choice_it_->first;
+  }
+  if (choice_it_ == nullptr) {
+    return nullptr;
+  }
+  const UNICHAR_ID id = choice_it_->data()->unichar_id();
+  return word_res_->uch_set->id_to_unichar_ext(id);
+}
+
+// Returns the confidence of the current choice depending on the used language
+// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
+// choices for one symbol should roughly add up to 1.0f.
+// If only traineddata of the legacy engine is used, the number should be
+// interpreted as a percent probability. (0.0f-100.0f) In this case
+// probabilities won't add up to 100. Each one stands on its own.
+// Returns 0.0f when no choice is available from either source.
+float ChoiceIterator::Confidence() const {
+  const bool use_lstm = oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty();
+  if (use_lstm) {
+    // Stored in a float first so that the clip below works on float values.
+    const float lstm_confidence = 100 - rating_coefficient_ * LSTM_choice_it_->second;
+    return ClipToRange(lstm_confidence, 0.0f, 100.0f);
+  }
+  if (choice_it_ == nullptr) {
+    return 0.0f;
+  }
+  const float legacy_confidence = 100 + 5 * choice_it_->data()->certainty();
+  return ClipToRange(legacy_confidence, 0.0f, 100.0f);
+}
+
+// Returns the set of timesteps which belong to the current symbol, or
+// nullptr when no LSTM language is in use or the symbol position lies
+// beyond the recorded timesteps of the word.
+std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
+  // Position of the symbol, shifted by the blanks in front of the word.
+  const int offset = *tstep_index_ + blanks_before_word_;
+  auto &segments = word_res_->segmented_timesteps;
+  const bool in_range = offset < segments.size();
+  return (in_range && oemLSTM_) ? &segments[offset] : nullptr;
+}
+
+// Removes every choice whose text is exactly " " from the list that
+// LSTM_choices_ points to. The list is modified in place and the order of
+// the remaining choices is kept. LSTM_choices_ must not be nullptr.
+void ChoiceIterator::filterSpaces() {
+  auto &choices = *LSTM_choices_;
+  auto cursor = choices.begin();
+  while (cursor != choices.end()) {
+    const bool is_space = strcmp(cursor->first, " ") == 0;
+    if (is_space) {
+      // erase() hands back the iterator to the element after the removed one.
+      cursor = choices.erase(cursor);
+    } else {
+      ++cursor;
+    }
+  }
+}
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.cpp
@ -0,0 +1,24 @@
+///////////////////////////////////////////////////////////////////////
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "mutableiterator.h"
+
+namespace tesseract {
+
+// Destructor.
+// It is defaulted, but defined out of line in this file so the compiler can
+// create a single vtable instead of weak vtables in every compilation unit.
+MutableIterator::~MutableIterator() = default;
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.h
@ -0,0 +1,62 @@
+///////////////////////////////////////////////////////////////////////
+// File:        mutableiterator.h
+// Description: Iterator for tesseract results providing access to
+//              both high-level API and Tesseract internal data structures.
+// Author:      David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+
+#include <tesseract/resultiterator.h>
+
+class BLOB_CHOICE_IT;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See tesseract/publictypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// ResultIterator adds text-specific methods for access to OCR output.
+// MutableIterator adds access to internal data structures.
+
+class TESS_API MutableIterator : public ResultIterator {
+public:
+  // See argument descriptions in ResultIterator()
+  // The base class is initialized from a temporary LTRResultIterator that is
+  // built from the given arguments.
+  MutableIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
+                  int rect_left, int rect_top, int rect_width, int rect_height)
+      : ResultIterator(LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
+                                         rect_top, rect_width, rect_height)) {}
+  // Defined out of line in mutableiterator.cpp.
+  ~MutableIterator() override;
+
+  // See PageIterator and ResultIterator for most calls.
+
+  // Return access to Tesseract internals.
+  // The returned pointer is the iterator's own it_ member, exposed read-only.
+  const PAGE_RES_IT *PageResIt() const {
+    return it_;
+  }
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/osdetect.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/osdetect.cpp
@ -0,0 +1,581 @@
+///////////////////////////////////////////////////////////////////////
+// File:        osdetect.cpp
+// Description: Orientation and script detection.
+// Author:      Samuel Charron
+//              Ranjith Unnikrishnan
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/osdetect.h>
+
+#include "blobbox.h"
+#include "blread.h"
+#include "colfind.h"
+#include "fontinfo.h"
+#include "imagefind.h"
+#include "linefind.h"
+#include "oldlist.h"
+#include "qrsequence.h"
+#include "ratngs.h"
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "textord.h"
+
+#include <algorithm>
+#include <cmath> // for std::fabs
+#include <memory>
+
+namespace tesseract {
+
+// Blobs whose longer-to-shorter side ratio exceeds this value are treated as
+// ambiguous and left out of the sample by os_detect().
+const float kSizeRatioToReject = 2.0;
+// Blobs lower than this many pixels are left out of the sample by os_detect().
+const int kMinAcceptableBlobHeight = 10;
+
+// Ratio between the best and the second best script score that
+// OSResults::update_best_script() maps to a script confidence of 1.
+const float kScriptAcceptRatio = 1.3;
+
+// NOTE(review): not used in the visible part of this file; presumably the
+// shares of Han characters used to tell Korean and Japanese text apart -
+// confirm against ScriptDetector.
+const float kHanRatioInKorean = 0.7;
+const float kHanRatioInJapanese = 0.3;
+
+// NOTE(review): not used in the visible part of this file; presumably the
+// score margin a choice needs to be counted as unambiguous - confirm.
+const float kNonAmbiguousMargin = 1.0;
+
+// General scripts
+static const char *han_script = "Han";
+static const char *latin_script = "Latin";
+static const char *katakana_script = "Katakana";
+static const char *hiragana_script = "Hiragana";
+static const char *hangul_script = "Hangul";
+
+// Pseudo-scripts Name
+const char *ScriptDetector::korean_script_ = "Korean";
+const char *ScriptDetector::japanese_script_ = "Japanese";
+const char *ScriptDetector::fraktur_script_ = "Fraktur";
+
+// Finds the highest and the second highest of the four orientation scores.
+// Stores the index of the highest one in best_result.orientation_id and the
+// difference between the two in best_result.oconfidence.
+void OSResults::update_best_orientation() {
+  // Seed the search with the first two entries.
+  int winner = (orientations[0] < orientations[1]) ? 1 : 0;
+  float top = orientations[winner];
+  float runner_up = orientations[1 - winner];
+  for (int id = 2; id < 4; ++id) {
+    const float score = orientations[id];
+    if (score > top) {
+      runner_up = top;
+      top = score;
+      winner = id;
+    } else if (score > runner_up) {
+      runner_up = score;
+    }
+  }
+  best_result.orientation_id = winner;
+  // Store difference of top two orientation scores.
+  best_result.oconfidence = top - runner_up;
+}
+
+// Forces the best orientation to orientation_id without looking at the
+// scores, and resets the orientation confidence to 0.
+void OSResults::set_best_orientation(int orientation_id) {
+  best_result.orientation_id = orientation_id;
+  best_result.oconfidence = 0;
+}
+
+// Finds the highest and the second highest script score for the given
+// orientation and stores the index of the highest in best_result.script_id.
+// best_result.sconfidence is set to 2 when the second highest score is 0,
+// and otherwise to (first / second - 1) scaled by (kScriptAcceptRatio - 1).
+void OSResults::update_best_script(int orientation) {
+  const auto &scores = scripts_na[orientation];
+  // We skip index 0 to ignore the "Common" script.
+  int winner = (scores[1] < scores[2]) ? 2 : 1;
+  float first = scores[winner];
+  float second = scores[3 - winner];
+  for (int id = 3; id < kMaxNumberOfScripts; ++id) {
+    const float score = scores[id];
+    if (score > first) {
+      second = first;
+      first = score;
+      winner = id;
+    } else if (score > second) {
+      second = score;
+    }
+  }
+  best_result.script_id = winner;
+  best_result.sconfidence =
+      (second == 0.0f) ? 2.0f : (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
+}
+
+// Returns the id of the script with the highest score for the given
+// orientation, ignoring the scripts named "Common" and "NULL".
+// Returns -1 if every script id was ignored. On equal scores the lowest id
+// wins.
+int OSResults::get_best_script(int orientation_id) const {
+  int winner = -1;
+  for (int id = 0; id < kMaxNumberOfScripts; ++id) {
+    const char *script = unicharset->get_script_from_script_id(id);
+    if (strcmp(script, "Common") == 0 || strcmp(script, "NULL") == 0) {
+      continue; // Not a real script.
+    }
+    if (winner == -1 || scripts_na[orientation_id][id] > scripts_na[orientation_id][winner]) {
+      winner = id;
+    }
+  }
+  return winner;
+}
+
+// Print the script scores for all possible orientations.
+// Each of the four orientations gets a heading followed by its script scores.
+// The heading is printed without a trailing newline, so the first score line
+// continues on the same output line.
+void OSResults::print_scores(void) const {
+  for (int i = 0; i < 4; ++i) {
+    tprintf("Orientation id #%d", i);
+    print_scores(i);
+  }
+}
+
+// Print the script scores for the given candidate orientation.
+// One line per script is printed, showing the script name and its score;
+// scripts with a score of zero are left out.
+void OSResults::print_scores(int orientation_id) const {
+  for (int id = 0; id < kMaxNumberOfScripts; ++id) {
+    if (!scripts_na[orientation_id][id]) {
+      continue; // Nothing accumulated for this script.
+    }
+    tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(id),
+            scripts_na[orientation_id][id]);
+  }
+}
+
+// Accumulate scores with given OSResults instance and update the best script.
+// Every orientation score and every script score of osr is added to the
+// matching score of this object, and the unicharset pointer of osr is taken
+// over. The best orientation is recomputed first, then the best script for
+// that orientation.
+void OSResults::accumulate(const OSResults &osr) {
+  unicharset = osr.unicharset;
+  for (int orientation = 0; orientation < 4; ++orientation) {
+    for (int script = 0; script < kMaxNumberOfScripts; ++script) {
+      scripts_na[orientation][script] += osr.scripts_na[orientation][script];
+    }
+    orientations[orientation] += osr.orientations[orientation];
+  }
+  update_best_orientation();
+  update_best_script(best_result.orientation_id);
+}
+
+// Detect and erase horizontal/vertical lines and picture regions from the
+// image, so that non-text blobs are removed from consideration.
+// Afterwards the connected components of the binary image of tess are
+// collected into blocks / to_blocks.
+// NOTE(review): pix is obtained from tess->pix_binary() and find_components
+// is later run on tess->pix_binary() again, so the edits made through pix are
+// presumably visible there (Image acting as a handle, not a deep copy) -
+// confirm.
+static void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
+                                   TO_BLOCK_LIST *to_blocks) {
+  Image pix = tess->pix_binary();
+  ASSERT_HOST(pix != nullptr);
+  int vertical_x = 0;
+  int vertical_y = 1;
+  tesseract::TabVector_LIST v_lines;
+  tesseract::TabVector_LIST h_lines;
+  // Use the resolution stored in the image unless it is below the smallest
+  // credible value, in which case that minimum is used and a warning printed.
+  int resolution;
+  if (kMinCredibleResolution > pixGetXRes(pix)) {
+    resolution = kMinCredibleResolution;
+    tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", pixGetXRes(pix), resolution);
+  } else {
+    resolution = pixGetXRes(pix);
+  }
+
+  // The detected line vectors are collected in v_lines / h_lines but are not
+  // used any further in this function.
+  tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, &vertical_x, &vertical_y,
+                                            nullptr, &v_lines, &h_lines);
+  Image im_pix = tesseract::ImageFind::FindImages(pix, nullptr);
+  if (im_pix != nullptr) {
+    // Clear the picture regions from the binary image, then free the mask.
+    pixSubtract(pix, pix, im_pix);
+    im_pix.destroy();
+  }
+  tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks);
+}
+
+// Find connected components in the page and process a subset until finished or
+// a stopping criterion is met.
+// filename is the name of the input image; with its extension removed it is
+// passed to read_unlv_file() as the base name of an optional zone file. If
+// no zones can be read, the whole page is used as a single block.
+// osr receives the detection results and tess supplies the binary image,
+// which must already be set.
+// Returns the number of blobs used in making the estimate. 0 implies failure.
+int orientation_and_script_detection(const char *filename, OSResults *osr,
+                                     tesseract::Tesseract *tess) {
+  std::string name = filename; // truncated name
+
+  // Cut the name at its last dot. The string has to be shortened for real:
+  // overwriting the dot with '\0' would keep the old length, leaving the
+  // extension behind an embedded null inside the std::string.
+  const std::string::size_type lastdot = name.rfind('.');
+  if (lastdot != std::string::npos) {
+    name.resize(lastdot);
+  }
+
+  ASSERT_HOST(tess->pix_binary() != nullptr);
+  int width = pixGetWidth(tess->pix_binary());
+  int height = pixGetHeight(tess->pix_binary());
+
+  BLOCK_LIST blocks;
+  if (!read_unlv_file(name, width, height, &blocks)) {
+    FullPageBlock(width, height, &blocks);
+  }
+
+  // Try to remove non-text regions from consideration.
+  TO_BLOCK_LIST land_blocks, port_blocks;
+  remove_nontext_regions(tess, &blocks, &port_blocks);
+
+  if (port_blocks.empty()) {
+    // page segmentation did not succeed, so we need to find_components first.
+    tess->mutable_textord()->find_components(tess->pix_binary(), &blocks, &port_blocks);
+  } else {
+    TBOX page_box(0, 0, width, height);
+    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
+    tess->mutable_textord()->filter_blobs(page_box.topright(), &port_blocks, true);
+  }
+
+  return os_detect(&port_blocks, osr, tess);
+}
+
+// Filter and sample the blobs.
+// Walks every blob of every block in port_blocks, skipping blocks whose
+// poly_block exists and is not text, and collects the blobs with a plausible
+// character shape into a list that is handed to os_detect_blobs() with no
+// restriction on the allowed scripts.
+// Returns a non-zero number of blobs if the page was successfully processed, or
+// zero if the page had too few characters to be reliable
+int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, tesseract::Tesseract *tess) {
+  // Counts every blob that was looked at; the value is not used any further
+  // in this function.
+  int blobs_total = 0;
+  TO_BLOCK_IT block_it;
+  block_it.set_to_list(port_blocks);
+
+  BLOBNBOX_CLIST filtered_list;
+  BLOBNBOX_C_IT filtered_it(&filtered_list);
+
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
+    TO_BLOCK *to_block = block_it.data();
+    // Blocks without a poly_block are kept; blocks with a non-text one are not.
+    if (to_block->block->pdblk.poly_block() && !to_block->block->pdblk.poly_block()->IsText()) {
+      continue;
+    }
+    BLOBNBOX_IT bbox_it;
+    bbox_it.set_to_list(&to_block->blobs);
+    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
+      BLOBNBOX *bbox = bbox_it.data();
+      C_BLOB *blob = bbox->cblob();
+      TBOX box = blob->bounding_box();
+      ++blobs_total;
+
+      // Catch illegal value of box width and avoid division by zero.
+      if (box.width() == 0) {
+        continue;
+      }
+      // TODO: Can height and width be negative? If not, remove fabs.
+      float y_x = std::fabs((box.height() * 1.0f) / box.width());
+      // A height of 0 gives an infinite x_y here, which is then rejected by
+      // the ratio test below.
+      float x_y = 1.0f / y_x;
+      // Select a >= 1.0 ratio
+      float ratio = x_y > y_x ? x_y : y_x;
+      // Blob is ambiguous
+      if (ratio > kSizeRatioToReject) {
+        continue;
+      }
+      // Too small to be judged.
+      if (box.height() < kMinAcceptableBlobHeight) {
+        continue;
+      }
+      filtered_it.add_to_end(bbox);
+    }
+  }
+  return os_detect_blobs(nullptr, &filtered_list, osr, tess);
+}
+
+// Detect orientation and script from a list of blobs.
+// Returns a non-zero number of blobs if the list was successfully processed, or
+// zero if the list had too few characters to be reliable.
+// If allowed_scripts is non-null and non-empty, it is a list of scripts that
+// constrains both orientation and script detection to consider only scripts
+// from the list.
+// If osr is null, a local OSResults is used and the scores are discarded on
+// return. At most 5 * tess->min_characters_to_try blobs are evaluated, and
+// evaluation stops early once os_detect_blob() reports that the stopping
+// criteria are met and more than min_characters_to_try blobs have been tried.
+int os_detect_blobs(const std::vector<int> *allowed_scripts, BLOBNBOX_CLIST *blob_list,
+                    OSResults *osr, tesseract::Tesseract *tess) {
+  OSResults local_results;
+  const int minCharactersToTry = tess->min_characters_to_try;
+  const int maxCharactersToTry = 5 * minCharactersToTry;
+  if (osr == nullptr) {
+    osr = &local_results;
+  }
+
+  osr->unicharset = &tess->unicharset;
+  OrientationDetector o(allowed_scripts, osr);
+  ScriptDetector s(allowed_scripts, osr, tess);
+
+  BLOBNBOX_C_IT filtered_it(blob_list);
+  const int real_max = std::min(filtered_it.length(), maxCharactersToTry);
+
+  // If there are too few characters, skip this page entirely.
+  if (real_max < minCharactersToTry / 2) {
+    tprintf("Too few characters. Skipping this page\n");
+    return 0;
+  }
+
+  // Copy the blob pointers into random-access storage so that they can be
+  // visited in the order produced by the sequence generator. A vector releases
+  // its storage on every exit path, unlike the former new[]/delete[] pair.
+  std::vector<BLOBNBOX *> blobs;
+  blobs.reserve(filtered_it.length());
+  for (filtered_it.mark_cycle_pt(); !filtered_it.cycled_list(); filtered_it.forward()) {
+    blobs.push_back(filtered_it.data());
+  }
+  QRSequenceGenerator sequence(static_cast<int>(blobs.size()));
+  int num_blobs_evaluated = 0;
+  for (int i = 0; i < real_max; ++i) {
+    if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess) && i > minCharactersToTry) {
+      break;
+    }
+    ++num_blobs_evaluated;
+  }
+
+  // Make sure the best_result is up-to-date
+  const int orientation = o.get_orientation();
+  osr->update_best_script(orientation);
+  return num_blobs_evaluated;
+}
+
+// Processes a single blob to estimate script and orientation.
+// The blob is classified in each of the 4 orientations (0, 90, 180 and 270
+// degrees) and the four result lists are handed to both detectors.
+//   bbox - blob to classify; only its cblob() is read.
+//   o, s - orientation and script detectors that accumulate the scores.
+//   osr  - not referenced in this function body.
+//   tess - classifier; tess_cn_matching is switched on and tess_bn_matching
+//          off as a side effect, and neither is restored afterwards.
+// Return true if estimate of orientation and script satisfies stopping
+// criteria.
+bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, OSResults *osr,
+                    tesseract::Tesseract *tess) {
+  tess->tess_cn_matching.set_value(true); // turn it on
+  tess->tess_bn_matching.set_value(false);
+  C_BLOB *blob = bbox->cblob();
+  // Own the polygonal copy through a smart pointer, like rotated_blob below,
+  // so that it is released on every exit path.
+  std::unique_ptr<TBLOB> tblob(TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob));
+  TBOX box = tblob->bounding_box();
+  FCOORD current_rotation(1.0f, 0.0f);
+  FCOORD rotation90(0.0f, 1.0f);
+  BLOB_CHOICE_LIST ratings[4];
+  // Test the 4 orientations
+  for (int i = 0; i < 4; ++i) {
+    // Normalize the blob. Set the origin to the place we want to be the
+    // bottom-middle after rotation.
+    // Scaling is to make the rotated height the x-height.
+    float scaling = static_cast<float>(kBlnXHeight) / box.height();
+    float x_origin = (box.left() + box.right()) / 2.0f;
+    float y_origin = (box.bottom() + box.top()) / 2.0f;
+    if (i == 0 || i == 2) {
+      // Rotation is 0 or 180.
+      y_origin = i == 0 ? box.bottom() : box.top();
+    } else {
+      // Rotation is 90 or 270.
+      scaling = static_cast<float>(kBlnXHeight) / box.width();
+      x_origin = i == 1 ? box.left() : box.right();
+    }
+    std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));
+    rotated_blob->Normalize(nullptr, &current_rotation, nullptr, x_origin, y_origin, scaling,
+                            scaling, 0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);
+    tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);
+    current_rotation.rotate(rotation90);
+  }
+  // The copy is not needed once all four rotations have been classified.
+  tblob.reset();
+
+  // Both detectors must see the ratings; stop only if both agree to.
+  bool stop = o->detect_blob(ratings);
+  s->detect_blob(ratings);
+  int orientation = o->get_orientation();
+  stop = s->must_stop(orientation) && stop;
+  return stop;
+}
+
+// Stores the (non-owned) result accumulator and the optional script filter.
+OrientationDetector::OrientationDetector(const std::vector<int> *allowed_scripts, OSResults *osr)
+    : osr_(osr), allowed_scripts_(allowed_scripts) {}
+
+// Scores one blob: adds the log of its normalized per-orientation scores to
+// osr_->orientations[0..3]. Currently always returns false (see the TODO at the end).
+bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
+  float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  float total_blob_o_score = 0.0f;
+
+  for (int i = 0; i < 4; ++i) { // scores[i] holds the choices for orientation i
+    BLOB_CHOICE_IT choice_it(scores + i);
+    if (!choice_it.empty()) {
+      BLOB_CHOICE *choice = nullptr;
+      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
+        // Find the first choice in the list whose script is in allowed_scripts_.
+        for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;
+             choice_it.forward()) {
+          int choice_script = choice_it.data()->script_id();
+          int s = 0;
+          for (s = 0; s < allowed_scripts_->size(); ++s) {
+            if ((*allowed_scripts_)[s] == choice_script) {
+              choice = choice_it.data();
+              break;
+            }
+          }
+        }
+      } else {
+        choice = choice_it.data(); // no script filter: take the first choice in the list
+      }
+      if (choice != nullptr) {
+        // The certainty score ranges between [-20,0]. This is converted here to
+        // [0,1], with 1 indicating best match.
+        blob_o_score[i] = 1 + 0.05 * choice->certainty();
+        total_blob_o_score += blob_o_score[i];
+      }
+    }
+  }
+  if (total_blob_o_score == 0.0) { // nothing to accumulate for this blob
+    return false;
+  }
+  // Fill in any blanks with the worst score of the others. This is better than
+  // picking an arbitrary probability for it and way better than -inf.
+  float worst_score = 0.0f;
+  int num_good_scores = 0;
+  for (float f : blob_o_score) {
+    if (f > 0.0f) {
+      ++num_good_scores;
+      if (worst_score == 0.0f || f < worst_score) { // track the smallest positive score
+        worst_score = f;
+      }
+    }
+  }
+  if (num_good_scores == 1) {
+    // Lower worst if there is only one.
+    worst_score /= 2.0f;
+  }
+  for (float &f : blob_o_score) {
+    if (f == 0.0f) { // exactly zero marks an orientation with no usable choice
+      f = worst_score;
+      total_blob_o_score += worst_score;
+    }
+  }
+  // Normalize the orientation scores for the blob and add their logs to the aggregated
+  // score. NOTE(review): a certainty below -20 would make a score negative - confirm the range.
+  for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
+    osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
+  }
+
+  // TODO(ranjith) Add an early exit test, based on min_orientation_margin,
+  // as used in pagesegmain.cpp.
+  return false;
+}
+
+// Refreshes the best-orientation estimate held in the shared results and
+// returns its orientation id.
+int OrientationDetector::get_orientation() {
+  OSResults &results = *osr_;
+  results.update_best_orientation();
+  return results.best_result.orientation_id;
+}
+
+// Stores the (non-owned) collaborators and looks up, adding where necessary,
+// the ids of the scripts that receive special treatment in detect_blob().
+ScriptDetector::ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
+                               tesseract::Tesseract *tess) {
+  allowed_scripts_ = allowed_scripts;
+  osr_ = osr;
+  tess_ = tess;
+
+  // The registration order is kept as it was: the resulting ids presumably
+  // depend on the order in which add_script() is called.
+  auto &charset = tess_->unicharset;
+  katakana_id_ = charset.add_script(katakana_script);
+  hiragana_id_ = charset.add_script(hiragana_script);
+  han_id_ = charset.add_script(han_script);
+  hangul_id_ = charset.add_script(hangul_script);
+  japanese_id_ = charset.add_script(japanese_script_);
+  korean_id_ = charset.add_script(korean_script_);
+  latin_id_ = charset.add_script(latin_script);
+  fraktur_id_ = charset.add_script(fraktur_script_);
+}
+
+// Score the given blob: for each of the 4 orientations, credit osr_->scripts_na with the script
+// of the first acceptable choice when no rival script is within kNonAmbiguousMargin. Returns void.
+void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
+  for (int i = 0; i < 4; ++i) {
+    bool done[kMaxNumberOfScripts] = {false}; // scripts already seen for this orientation
+
+    BLOB_CHOICE_IT choice_it;
+    choice_it.set_to_list(scores + i);
+
+    float prev_score = -1; // negated certainty of the first match; < 0 means none yet
+    int script_count = 0; // 1 = unambiguous so far, >= 2 = ambiguous
+    int prev_id = -1; // script id of the first match
+    int prev_fontinfo_id = -1;
+    const char *prev_unichar = "";
+    const char *unichar = "";
+
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      BLOB_CHOICE *choice = choice_it.data();
+      int id = choice->script_id();
+      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
+        // Check that the choice is in an allowed script.
+        int s = 0;
+        for (s = 0; s < allowed_scripts_->size(); ++s) {
+          if ((*allowed_scripts_)[s] == id) {
+            break;
+          }
+        }
+        if (s == allowed_scripts_->size()) {
+          continue; // Not found in list.
+        }
+      }
+      // Script already processed before.
+      if (done[id]) { // NOTE(review): assumes 0 <= id < kMaxNumberOfScripts - confirm
+        continue;
+      }
+      done[id] = true;
+
+      unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
+      // Save data from the first match
+      if (prev_score < 0) {
+        prev_score = -choice->certainty();
+        script_count = 1;
+        prev_id = id;
+        prev_unichar = unichar;
+        prev_fontinfo_id = choice->fontinfo_id();
+      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
+        ++script_count; // a different script scores within the margin of the first match
+      }
+
+      if (strlen(prev_unichar) == 1) {
+        if (unichar[0] >= '0' && unichar[0] <= '9') {
+          break; // first match is one byte long and this choice starts with a digit: stop
+        }
+      }
+
+      // if script_count is >= 2, character is ambiguous, skip other matches
+      // since they are useless.
+      if (script_count >= 2) {
+        break;
+      }
+    }
+    // Character is non ambiguous
+    if (script_count == 1) {
+      // Update the score of the winning script
+      osr_->scripts_na[i][prev_id] += 1.0;
+
+      // Workaround for Fraktur
+      if (prev_id == latin_id_) {
+        if (prev_fontinfo_id >= 0) {
+          const tesseract::FontInfo &fi = tess_->get_fontinfo_table().at(prev_fontinfo_id);
+          // printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
+          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
+          //       fi.is_serif(), fi.is_fraktur(),
+          //       prev_unichar);
+          if (fi.is_fraktur()) {
+            osr_->scripts_na[i][prev_id] -= 1.0; // move the point just given to Latin ...
+            osr_->scripts_na[i][fraktur_id_] += 1.0; // ... over to Fraktur
+          }
+        }
+      }
+
+      // Update Japanese / Korean pseudo-scripts
+      if (prev_id == katakana_id_) {
+        osr_->scripts_na[i][japanese_id_] += 1.0;
+      }
+      if (prev_id == hiragana_id_) {
+        osr_->scripts_na[i][japanese_id_] += 1.0;
+      }
+      if (prev_id == hangul_id_) {
+        osr_->scripts_na[i][korean_id_] += 1.0;
+      }
+      if (prev_id == han_id_) { // Han counts towards both pseudo-scripts, with separate weights
+        osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
+        osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
+      }
+    }
+  } // iterate over each orientation
+}
+
+// Recomputes the best script for the given orientation and reports whether
+// the resulting script confidence is above 1.
+bool ScriptDetector::must_stop(int orientation) const {
+  osr_->update_best_script(orientation);
+  const bool confident_enough = osr_->best_result.sconfidence > 1;
+  return confident_enough;
+}
+
+// Maps an orientation index (0..3) to the clockwise rotation, in degrees, that
+// must be applied for the text to be upright (readable).
+// Any other index yields -1.
+int OrientationIdToValue(const int &id) {
+  static const int kClockwiseDegrees[] = {0, 270, 180, 90};
+  const bool known_id = id >= 0 && id < 4;
+  return known_id ? kClockwiseDegrees[id] : -1;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/output.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/output.cpp
@ -0,0 +1,416 @@
+/******************************************************************
+ * File:        output.cpp  (Formerly output.c)
+ * Description: Output pass
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "output.h"
+
+#include "control.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "docqual.h"
+#  include "reject.h"
+#endif
+
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+#define CTRL_NEWLINE '\012'  // newline
+#define CTRL_HARDLINE '\015' // cr
+
+namespace tesseract {
+// Walks every word of the page and passes it to write_results() together with
+// the kind of newline that follows it. When target_word_box is given, words
+// whose bounding-box centre lies outside that box are skipped.
+void Tesseract::output_pass( // Tess output pass //send to api
+    PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
+  BLOCK_RES *last_block_seen = nullptr;
+
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    check_debug_pt(page_res_it.word(), 120);
+
+    if (target_word_box != nullptr) {
+      TBOX word_box = page_res_it.word()->word->bounding_box();
+      FCOORD centre((word_box.right() + word_box.left()) / 2,
+                    (word_box.bottom() + word_box.top()) / 2);
+      if (!target_word_box->contains(centre)) {
+        continue;
+      }
+    }
+    if (tessedit_write_block_separators && last_block_seen != page_res_it.block()) {
+      last_block_seen = page_res_it.block();
+    }
+
+    auto *following_word = page_res_it.next_word();
+    auto *following_block = page_res_it.next_block();
+
+    // An end of line is forced on the last word of the page and, when block
+    // separators are written, on the last word of each block.
+    const bool force_eol =
+        (tessedit_write_block_separators && (page_res_it.block() != following_block)) ||
+        (following_word == nullptr);
+
+    WERD *next_werd = following_word != nullptr ? following_word->word : nullptr;
+    BLOCK *next_blk = following_block != nullptr ? following_block->block : nullptr;
+    const char newline_type = determine_newline_type(
+        page_res_it.word()->word, page_res_it.block()->block, next_werd, next_blk);
+    // regardless of tilde crunching
+    write_results(page_res_it, newline_type, force_eol);
+  }
+}
+
+/*************************************************************************
+ * write_results()
+ *
+ * All recognition and rejection has now been done. Generate the following:
+ *   .txt file     - giving the final best choices with NO highlighting
+ *   .raw file     - giving the tesseract top choice output for each word
+ *   .map file     - showing how the .txt file has been rejected in the .ep file
+ *   epchoice list - a list of one element per word, containing the text for the
+ *                   epaper. Reject strings are inserted.
+ *   inset list    - a list of bounding boxes of reject insets - indexed by the
+ *                   reject strings in the epchoice text.
+ *
+ * NOTE(review): the body below opens and writes no files. What it visibly does
+ * is update the stats_ flags (tilde, newline and empty-block state) and relax
+ * entries of the word's reject_map. The list above presumably describes an
+ * older version of this function - confirm before relying on it.
+ *
+ * page_res_it  - iterator positioned on the word to process.
+ * newline_type - value from determine_newline_type(); any non-zero value sets
+ *                stats_.last_char_was_newline.
+ * force_eol    - true when an end of line must be recorded after this word.
+ *************************************************************************/
+void Tesseract::write_results(PAGE_RES_IT &page_res_it,
+                              char newline_type, // type of newline
+                              bool force_eol) {  // override tilde crunch?
+  WERD_RES *word = page_res_it.word();
+  const UNICHARSET &uchset = *word->uch_set;
+  int i;
+  bool need_reject = false;
+  UNICHAR_ID space = uchset.unichar_to_id(" "); // compared below to decide last_char_was_tilde
+
+  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
+      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
+    if ((word->unlv_crunch_mode != CR_DELETE) &&
+        (!stats_.tilde_crunch_written ||
+         ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
+          !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
+      if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
+          !word->word->flag(W_FUZZY_SP)) {
+        stats_.last_char_was_tilde = false;
+      }
+      need_reject = true;
+    }
+    if ((need_reject && !stats_.last_char_was_tilde) ||
+        (force_eol && stats_.write_results_empty_block)) {
+      /* Write a reject char - mark as rejected unless zero_rejection mode */
+      stats_.last_char_was_tilde = true;
+      stats_.tilde_crunch_written = true;
+      stats_.last_char_was_newline = false;
+      stats_.write_results_empty_block = false;
+    }
+
+    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
+      stats_.tilde_crunch_written = false;
+      stats_.last_char_was_newline = true;
+      stats_.last_char_was_tilde = false;
+    }
+
+    if (force_eol) {
+      stats_.write_results_empty_block = true;
+    }
+    return; // crunched or empty word: only the stats_ flags were updated
+  }
+
+  /* NORMAL PROCESSING of non tilde crunched words */
+
+  stats_.tilde_crunch_written = false;
+  if (newline_type) {
+    stats_.last_char_was_newline = true;
+  } else {
+    stats_.last_char_was_newline = false;
+  }
+  stats_.write_results_empty_block = force_eol; // about to write a real word
+
+  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
+      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
+      (word->best_choice->unichar_id(0) == space)) {
+    /* Prevent adjacent tilde across words - we know that adjacent tildes within
+   words have been removed */
+    word->MergeAdjacentBlobs(0);
+  }
+  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
+    stats_.last_char_was_tilde = false;
+  } else {
+    if (word->reject_map.length() > 0) {
+      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
+        stats_.last_char_was_tilde = true; // the word ends in the " " unichar
+      } else {
+        stats_.last_char_was_tilde = false;
+      }
+    } else if (word->word->space() > 0) {
+      stats_.last_char_was_tilde = false;
+    }
+    /* else it is unchanged as there are no output chars */
+  }
+
+  ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); // one map entry per unichar
+
+  set_unlv_suspects(word);
+  check_debug_pt(word, 120);
+  if (tessedit_rejection_debug) {
+    tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
+            dict_word(*(word->best_choice)));
+  }
+  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
+    if (tessedit_zero_rejection) {
+      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
+      for (i = 0; i < word->best_choice->length(); ++i) {
+        if (word->reject_map[i].rejected()) {
+          word->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+    if (tessedit_minimal_rejection) {
+      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
+      for (i = 0; i < word->best_choice->length(); ++i) {
+        if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
+          word->reject_map[i].setrej_minimal_rej_accept(); // entries for the " " unichar stay rejected
+        }
+      }
+    }
+  }
+}
+
+/**********************************************************************
+ * determine_newline_type
+ *
+ * Classify the line break that follows word.
+ * Returns 0 (false) when word is not at the end of a line, CTRL_HARDLINE
+ * when the next word starts with space or would have fitted in the gap left
+ * at the end of this line, and CTRL_NEWLINE otherwise (including when there
+ * is no next word in the same block).
+ **********************************************************************/
+
+char determine_newline_type( // test line ends
+    WERD *word,              // word to do
+    BLOCK *block,            // current block
+    WERD *next_word,         // next word
+    BLOCK *next_block        // block of next word
+) {
+  if (!word->flag(W_EOL)) {
+    return '\0'; // not end of line
+  }
+
+  const bool last_word_of_block =
+      next_word == nullptr || next_block == nullptr || block != next_block;
+  if (last_word_of_block) {
+    return CTRL_NEWLINE;
+  }
+  if (next_word->space() > 0) {
+    return CTRL_HARDLINE; // it is tabbed
+  }
+
+  TBOX this_box = word->bounding_box();
+  TBOX following_box = next_word->bounding_box();
+  TBOX enclosing_box = block->pdblk.bounding_box();
+
+  // Room left between this word and the right edge of the block, less one
+  // block space.
+  int16_t room_left = enclosing_box.right() - this_box.right();
+  room_left -= static_cast<int32_t>(block->space());
+  int16_t following_width = following_box.right() - following_box.left();
+
+  if (room_left > following_width) {
+    return CTRL_HARDLINE;
+  }
+  return CTRL_NEWLINE;
+}
+
+/*************************************************************************
+ * get_rep_char()
+ * Return the unichar at the first position of the repetition word that is not
+ * rejected. This is the character which is repeated - as determined earlier
+ * by fix_rep_char(). If every position is rejected (or the reject map is
+ * empty), the id of unrecognised_char is returned instead.
+ *************************************************************************/
+UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
+  const int map_length = word->reject_map.length();
+  for (int pos = 0; pos < map_length; ++pos) {
+    if (!word->reject_map[pos].rejected()) {
+      return word->best_choice->unichar_id(pos);
+    }
+  }
+  return word->uch_set->unichar_to_id(unrecognised_char.c_str());
+}
+
+/*************************************************************************
+ * SUSPECT LEVELS
+ *
+ * 0 - don't reject ANYTHING
+ * 1,2 - partial rejection
+ * 3 - BEST
+ *
+ * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
+ * tessedit_minimal_rejection.
+ *
+ * set_unlv_suspects() relaxes rejections in word_res->reject_map according to
+ * suspect_level. Level 0 accepts every rejected position, levels >= 3 leave
+ * the map untouched, and levels 1 and 2 run the un-reject passes below, in
+ * order; level 2 stops before the 1Il and acceptable-string passes.
+ * Nothing is ever newly rejected here: the only mutation is
+ * setrej_minimal_rej_accept() on positions that are currently rejected.
+ *************************************************************************/
+void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
+  int len = word_res->reject_map.length();
+  const WERD_CHOICE &word = *(word_res->best_choice);
+  const UNICHARSET &uchset = *word.unicharset();
+  int i;
+  float rating_per_ch;
+
+  if (suspect_level == 0) {
+    for (i = 0; i < len; i++) {
+      if (word_res->reject_map[i].rejected()) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+    return;
+  }
+
+  if (suspect_level >= 3) {
+    return; // Use defaults
+  }
+
+  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
+
+  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
+    /* Unreject alphas in dictionary words */
+    for (i = 0; i < len; ++i) {
+      if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+  }
+
+  rating_per_ch = word.rating() / word_res->reject_map.length(); // NOTE(review): length 0 divides by zero - confirm callers
+
+  if (rating_per_ch >= suspect_rating_per_ch) {
+    return; // Don't touch bad ratings
+  }
+
+  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
+    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
+    for (i = 0; i < len; ++i) {
+      if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+  }
+
+  for (i = 0; i < len; i++) { // accept positions flagged with a document, block or row reject
+    if (word_res->reject_map[i].rejected()) {
+      if (word_res->reject_map[i].flag(R_DOC_REJ)) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+      if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+      if (word_res->reject_map[i].flag(R_ROW_REJ)) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+  }
+
+  if (suspect_level == 2) {
+    return; // the remaining passes run for level 1 only
+  }
+
+  if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
+    for (i = 0; i < len; i++) {
+      if (word_res->reject_map[i].rejected()) {
+        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
+             word_res->reject_map[i].flag(R_POSTNN_1IL))) {
+          word_res->reject_map[i].setrej_minimal_rej_accept();
+        }
+
+        if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
+          word_res->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+  }
+
+  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
+                             word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
+      acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
+    if (word_res->reject_map.length() > suspect_short_words) { // only words longer than the short-word limit
+      for (i = 0; i < len; i++) {
+        if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
+                                                   word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
+                                                   word_res->reject_map[i].flag(R_POSTNN_1IL) ||
+                                                   word_res->reject_map[i].flag(R_MM_REJECT))) {
+          word_res->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+  }
+}
+
+// Returns how many unichars of word are alphabetic according to the word's
+// own unicharset.
+int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
+  auto *charset = word.unicharset();
+  int alpha_total = 0;
+  for (int pos = 0; pos < word.length(); ++pos) {
+    alpha_total += charset->get_isalpha(word.unichar_id(pos)) ? 1 : 0;
+  }
+  return alpha_total;
+}
+
+// Returns how many unichars of word are alphabetic or digits according to the
+// word's own unicharset.
+int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
+  auto *charset = word.unicharset();
+  int alnum_total = 0;
+  for (int pos = 0; pos < word.length(); ++pos) {
+    const bool is_alnum = charset->get_isalpha(word.unichar_id(pos)) ||
+                          charset->get_isdigit(word.unichar_id(pos));
+    if (is_alnum) {
+      ++alnum_total;
+    }
+  }
+  return alnum_total;
+}
+
+// Returns true if the string looks like a number.
+//   s       - the characters of the word, stored back to back.
+//   lengths - one entry per character giving its length in bytes in s; read
+//             in step with s until s reaches its terminating '\0'.
+// Accepted form: an optional leading '(', an optional one of '$' '.' '+' '-',
+// then digits in which single '.', ',' or '-' separators may follow a digit,
+// optionally ending in '%', ')' or "%)" directly after a digit.
+// A string with nothing left after the optional prefix is also accepted.
+bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
+  bool prev_digit = false;
+
+  // Skip the optional prefix characters. Both are one byte long, so s and
+  // lengths each advance by one entry. Advancing s alone would leave lengths
+  // one or two entries behind, and every later character would be read with
+  // the length of an earlier one, which breaks multi-byte characters.
+  if (*lengths == 1 && *s == '(') {
+    s++;
+    lengths++;
+  }
+
+  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
+    s++;
+    lengths++;
+  }
+
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isdigit(s, *lengths)) {
+      prev_digit = true;
+    } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
+      // A separator is only valid directly after a digit.
+      prev_digit = false;
+    } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
+               ((*s == '%') || (*s == ')'))) {
+      // Final '%' or ')' directly after a digit.
+      return true;
+    } else if (prev_digit && *lengths == 1 && (*s == '%') &&
+               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
+               (*(s + *lengths + *(lengths + 1)) == '\0')) {
+      // Final "%)" directly after a digit.
+      return true;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/output.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/output.h
@ -0,0 +1,37 @@
+/******************************************************************
+ * File:        output.h  (Formerly output.h)
+ * Description: Output pass
+ * Author:      Phil Cheatle
+ * Created:     Thu Aug  4 10:56:08 BST 1994
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+namespace tesseract {
+
+class BLOCK;
+class WERD;
+
+/**
+ * Test line ends: classify the line break that follows word.
+ *
+ * As implemented in output.cpp, the result is 0 (false) when word is not
+ * flagged W_EOL. Otherwise it is '\012' (CTRL_NEWLINE) when next_word or
+ * next_block is null or next_block differs from block, and '\015'
+ * (CTRL_HARDLINE) when next_word has leading space. Failing those, it is
+ * '\015' if the gap between word and the right edge of block, less one block
+ * space, is wider than next_word, and '\012' if not.
+ */
+char determine_newline_type(WERD *word,       ///< word to do
+                            BLOCK *block,     ///< current block
+                            WERD *next_word,  ///< next word
+                            BLOCK *next_block ///< block of next word
+);
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/pageiterator.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/pageiterator.cpp
@ -0,0 +1,652 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pageiterator.cpp
+// Description: Iterator for tesseract page structure that avoids using
+//              tesseract internal data structures.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <allheaders.h>
+#include <tesseract/pageiterator.h>
+#include "helpers.h"
+#include "pageres.h"
+#include "tesseractclass.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+/**
+ * Builds an iterator over page_res and positions it at the start of the page.
+ * The scale and rect_* arguments are stored unchanged; the accessors use them
+ * to map working-image coordinates back to the original image.
+ */
+PageIterator::PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
+                           int rect_left, int rect_top, int rect_width, int rect_height)
+    : page_res_(page_res),
+      tesseract_(tesseract),
+      word_(nullptr),
+      word_length_(0),
+      blob_index_(0),
+      cblob_it_(nullptr),
+      include_upper_dots_(false),
+      include_lower_dots_(false),
+      scale_(scale),
+      scaled_yres_(scaled_yres),
+      rect_left_(rect_left),
+      rect_top_(rect_top),
+      rect_width_(rect_width),
+      rect_height_(rect_height) {
+  // The underlying word iterator is owned by this object (see destructor).
+  it_ = new PAGE_RES_IT(page_res);
+  // Explicitly qualified so that exactly PageIterator's Begin() runs here.
+  PageIterator::Begin();
+}
+
+/**
+ * Releases the two heap objects this iterator owns: the PAGE_RES_IT created
+ * in the constructors/assignment and the optional C_BLOB_IT created by
+ * BeginWord (may be nullptr, which delete accepts).
+ */
+PageIterator::~PageIterator() {
+  delete it_;
+  delete cblob_it_;
+}
+
+/**
+ * Copy constructor. Copying is supported on purpose: a copy can walk the
+ * objects at a lower level while the source keeps its place at a higher
+ * level.
+ * The copy gets its own PAGE_RES_IT (cloned from src) and rebuilds its own
+ * symbol-level state by calling BeginWord at src's blob index, so no
+ * C_BLOB_IT is shared between the two objects.
+ */
+PageIterator::PageIterator(const PageIterator &src)
+    : page_res_(src.page_res_),
+      tesseract_(src.tesseract_),
+      word_(nullptr),
+      word_length_(src.word_length_),
+      blob_index_(src.blob_index_),
+      cblob_it_(nullptr),
+      include_upper_dots_(src.include_upper_dots_),
+      include_lower_dots_(src.include_lower_dots_),
+      scale_(src.scale_),
+      scaled_yres_(src.scaled_yres_),
+      rect_left_(src.rect_left_),
+      rect_top_(src.rect_top_),
+      rect_width_(src.rect_width_),
+      rect_height_(src.rect_height_) {
+  // Independent clone of the word iterator, positioned where src is.
+  it_ = new PAGE_RES_IT(*src.it_);
+  // Recomputes word_, word_length_, cblob_it_ and blob_index_ for this copy.
+  BeginWord(src.blob_index_);
+}
+
+/**
+ * Copy assignment. Copies the configuration of src, replaces the owned
+ * PAGE_RES_IT with a clone of src's, and rebuilds the symbol-level state by
+ * calling BeginWord at src's blob index.
+ * Self-assignment is a no-op.
+ * @return *this
+ */
+const PageIterator &PageIterator::operator=(const PageIterator &src) {
+  if (this == &src) {
+    // Without this guard the delete below would destroy the very iterator
+    // that is then copied from (use after free).
+    return *this;
+  }
+  page_res_ = src.page_res_;
+  tesseract_ = src.tesseract_;
+  include_upper_dots_ = src.include_upper_dots_;
+  include_lower_dots_ = src.include_lower_dots_;
+  scale_ = src.scale_;
+  scaled_yres_ = src.scaled_yres_;
+  rect_left_ = src.rect_left_;
+  rect_top_ = src.rect_top_;
+  rect_width_ = src.rect_width_;
+  rect_height_ = src.rect_height_;
+  delete it_;
+  it_ = new PAGE_RES_IT(*src.it_);
+  // word_, word_length_, cblob_it_ and blob_index_ are all recomputed here,
+  // which is why they are not copied field by field above.
+  BeginWord(src.blob_index_);
+  return *this;
+}
+
+/**
+ * Returns true if this iterator's underlying PAGE_RES_IT compares equal to
+ * *other. Two null iterators count as equal; a null and a non-null one do
+ * not.
+ */
+bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT *other) const {
+  if (it_ == nullptr || other == nullptr) {
+    // At least one side is missing: equal only if both are.
+    return it_ == nullptr && other == nullptr;
+  }
+  return *it_ == *other;
+}
+
+// ============= Moving around within the page ============.
+
+/**
+ * Resets the iterator to point to the start of the page: rewinds the
+ * underlying PAGE_RES_IT with restart_page_with_empties() and then sets up
+ * symbol iteration at offset 0 of whatever word is found there.
+ */
+void PageIterator::Begin() {
+  it_->restart_page_with_empties();
+  BeginWord(0);
+}
+
+/**
+ * Moves the iterator back to the start of the paragraph it is currently in.
+ * Does nothing when the iterator is already past the end of the document.
+ * The paragraph start is found by scanning paragraph by paragraph from the
+ * beginning of the page and keeping the last start that is not after the
+ * current position.
+ */
+void PageIterator::RestartParagraph() {
+  if (it_->block() == nullptr) {
+    return; // At end of the document.
+  }
+  PAGE_RES_IT para_start(page_res_);
+  PAGE_RES_IT lookahead(para_start);
+  // lookahead always sits one paragraph beyond para_start; stop as soon as it
+  // has moved past the current position.
+  for (lookahead.forward_paragraph(); lookahead.cmp(*it_) <= 0; lookahead.forward_paragraph()) {
+    para_start = lookahead;
+  }
+  *it_ = para_start;
+  BeginWord(0);
+}
+
+/**
+ * Returns true if the current position lies on the same row as the start of
+ * its paragraph. Works on a copy, so this iterator is not moved.
+ */
+bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
+  PageIterator para_begin(*this);
+  para_begin.RestartParagraph();
+  const bool on_first_row = (para_begin.it_->row() == it_->row());
+  return on_first_row;
+}
+
+/**
+ * Moves the iterator back to the start of the current row by calling
+ * restart_row() on the underlying PAGE_RES_IT, then sets up symbol
+ * iteration at offset 0 of the word found there.
+ */
+void PageIterator::RestartRow() {
+  it_->restart_row();
+  BeginWord(0);
+}
+
+/**
+ * Advances to the start of the next object at the given level of the page
+ * hierarchy. Returns false once the end of the page has been reached.
+ * NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
+ * non-text block at least once.
+ * Think of non text blocks as containing a single para, with at least one
+ * line, with a single imaginary word, containing a single symbol.
+ * The bounding boxes mark out any polygonal nature of the block, and
+ * PTIsTextType(BlockType()) is false for non-text blocks.
+ * Calls to Next with different levels may be freely intermixed.
+ * This function iterates words in right-to-left scripts correctly, if
+ * the appropriate language has been loaded into Tesseract.
+ */
+bool PageIterator::Next(PageIteratorLevel level) {
+  if (it_->block() == nullptr) {
+    return false; // Nothing left to visit.
+  }
+  // A position without a word is a non-text block: whatever level was asked
+  // for, the step taken is to the next block.
+  if (it_->word() == nullptr) {
+    level = RIL_BLOCK;
+  }
+
+  switch (level) {
+    case RIL_BLOCK:
+      it_->forward_block();
+      break;
+    case RIL_PARA:
+      it_->forward_paragraph();
+      break;
+    case RIL_TEXTLINE:
+      // Step word by word until the row differs from the previous one.
+      do {
+        it_->forward_with_empties();
+      } while (it_->row() == it_->prev_row());
+      break;
+    case RIL_WORD:
+      it_->forward_with_empties();
+      break;
+    case RIL_SYMBOL:
+      if (cblob_it_ != nullptr) {
+        cblob_it_->forward();
+      }
+      ++blob_index_;
+      if (blob_index_ < word_length_) {
+        return true; // Still inside the current word.
+      }
+      // Ran off the end of the word: continue with the next one.
+      it_->forward_with_empties();
+      break;
+  }
+  // A new word (or block) was reached, so reset the symbol-level state.
+  BeginWord(0);
+  return it_->block() != nullptr;
+}
+
+/**
+ * Tells whether the iterator sits at the start of an object at the given
+ * level. One use is to find out whether a call to Next(RIL_WORD) has just
+ * moved to the start of a RIL_PARA.
+ * Returns false at the end of the page and true for every level inside a
+ * block that has no word (an image block).
+ */
+bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
+  if (it_->block() == nullptr) {
+    return false; // Already at the end!
+  }
+  if (it_->word() == nullptr) {
+    return true; // In an image block.
+  }
+  if (level == RIL_SYMBOL) {
+    return true; // Every position is the start of some symbol.
+  }
+  if (blob_index_ != 0) {
+    return false; // Mid-word, so not at the start of anything larger.
+  }
+  // From here on we are at the first symbol of a word.
+  switch (level) {
+    case RIL_BLOCK:
+      return it_->block() != it_->prev_block();
+    case RIL_PARA:
+      if (it_->block() != it_->prev_block()) {
+        return true;
+      }
+      return it_->row()->row->para() != it_->prev_row()->row->para();
+    case RIL_TEXTLINE:
+      return it_->row() != it_->prev_row();
+    case RIL_WORD:
+      return true;
+    case RIL_SYMBOL:
+      return true; // Already handled above; listed to keep the switch complete.
+  }
+  return false;
+}
+
+/**
+ * Returns whether the iterator is positioned at the last element in a
+ * given level. (e.g. the last word in a line, the last line in a block)
+ * Also returns true when there is no object of type element at the current
+ * position.
+ */
+bool PageIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
+  if (Empty(element)) {
+    return true; // Already at the end!
+  }
+  // Step a copy forward by one element. The answer is true if that lands at
+  // the end of the page, or at the beginning of *all* levels in
+  // [level, element).
+  // When element and level are more than one level apart, moving forward one
+  // symbol could still leave us at the first word on a line, so every
+  // intermediate level has to be checked as well.
+  PageIterator probe(*this);
+  probe.Next(element);
+  if (probe.Empty(element)) {
+    return true; // Reached the end of the page.
+  }
+  for (int lvl = element - 1; lvl >= level; --lvl) {
+    if (!probe.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * Three-way comparison of two iterator positions:
+ *   this before other:  -1 (or whatever negative/positive value the word
+ *                           level comparison yields when the words differ)
+ *   same position:       0
+ *   this after other:    1
+ * Words are compared first; the blob index only breaks ties within a word.
+ */
+int PageIterator::Cmp(const PageIterator &other) const {
+  const int word_order = it_->cmp(*other.it_);
+  if (word_order != 0) {
+    return word_order;
+  }
+  if (blob_index_ == other.blob_index_) {
+    return 0;
+  }
+  return blob_index_ < other.blob_index_ ? -1 : 1;
+}
+
+// ============= Accessing data ==============.
+// Coordinate system:
+// Integer coordinates are at the cracks between the pixels.
+// The top-left corner of the top-left pixel in the image is at (0,0).
+// The bottom-right corner of the bottom-right pixel in the image is at
+// (width, height).
+// Every bounding box goes from the top-left of the top-left contained
+// pixel to the bottom-right of the bottom-right contained pixel, so
+// the bounding box of the single top-left pixel in the image is:
+// (0,0)->(1,1).
+// If an image rectangle has been set in the API, then returned coordinates
+// relate to the original (full) image, rather than the rectangle.
+
+/**
+ * Returns the bounding rectangle of the current object at the given level in
+ * the coordinates of the working image that is pix_binary().
+ * See comment on coordinate system above.
+ * Returns false if there is no such object at the current position
+ * (Empty(level) is true); the output parameters are not written in that case.
+ */
+bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right,
+                                       int *bottom) const {
+  if (Empty(level)) {
+    return false;
+  }
+  TBOX box;
+  PARA *para = nullptr;
+  switch (level) {
+    case RIL_BLOCK:
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
+      break;
+    case RIL_PARA:
+      // Remember the paragraph; the box starts as the current row's box and
+      // is grown over the paragraph's other rows further down.
+      para = it_->row()->row->para();
+      // Fall through.
+    case RIL_TEXTLINE:
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
+      break;
+    case RIL_WORD:
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
+      break;
+    case RIL_SYMBOL:
+      // No cblob iterator means BeginWord chose the box_word path
+      // (recognition done); otherwise the symbol is the current cblob.
+      if (cblob_it_ == nullptr) {
+        box = it_->word()->box_word->BlobBox(blob_index_);
+      } else {
+        box = cblob_it_->data()->bounding_box();
+      }
+  }
+  if (level == RIL_PARA) {
+    // Walk every text line of the page with a copy of this iterator and
+    // union in the (unrestricted) box of each row that belongs to the same
+    // block and the same paragraph.
+    PageIterator other = *this;
+    other.Begin();
+    do {
+      if (other.it_->block() && other.it_->block()->block == it_->block()->block &&
+          other.it_->row() && other.it_->row()->row && other.it_->row()->row->para() == para) {
+        box = box.bounding_union(other.it_->row()->row->bounding_box());
+      }
+    } while (other.Next(RIL_TEXTLINE));
+  }
+  // Everything except a box taken from box_word is rotated by the block's
+  // re_rotation. NOTE(review): presumably box_word boxes are already in image
+  // orientation - confirm against where box_word is built.
+  if (level != RIL_SYMBOL || cblob_it_ != nullptr) {
+    box.rotate(it_->block()->block->re_rotation());
+  }
+  // Now we have a box in tesseract coordinates relative to the image rectangle,
+  // we have to convert the coords to a top-down system.
+  const int pix_height = pixGetHeight(tesseract_->pix_binary());
+  const int pix_width = pixGetWidth(tesseract_->pix_binary());
+  // y is flipped (pix_height - y); right/bottom are clipped so that they can
+  // never end up smaller than left/top.
+  *left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
+  *top = ClipToRange(pix_height - box.top(), 0, pix_height);
+  *right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
+  *bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
+  return true;
+}
+
+/**
+ * Returns the bounding rectangle of the current object at the given level in
+ * coordinates of the original image.
+ * See comment on coordinate system above.
+ * Returns false if there is no such object at the current position.
+ * Simply forwards to the padded overload with a padding of 0.
+ */
+bool PageIterator::BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
+                               int *bottom) const {
+  return BoundingBox(level, 0, left, top, right, bottom);
+}
+
+/**
+ * Like the unpadded BoundingBox, but grows the box by padding on every side.
+ * The working-image box is divided by scale_ (left/top rounded down,
+ * right/bottom rounded up), shifted by the rectangle origin, padded, and
+ * finally clipped to the rectangle [rect_left_, rect_left_ + rect_width_] x
+ * [rect_top_, rect_top_ + rect_height_].
+ * Returns false if there is no such object at the current position.
+ */
+bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding, int *left, int *top,
+                               int *right, int *bottom) const {
+  const bool have_box = BoundingBoxInternal(level, left, top, right, bottom);
+  if (!have_box) {
+    return false;
+  }
+  // Far edges of the image rectangle in original-image coordinates.
+  const auto rect_right = rect_left_ + rect_width_;
+  const auto rect_bottom = rect_top_ + rect_height_;
+  // Convert to the coordinate system of the original image.
+  *left = ClipToRange(*left / scale_ + rect_left_ - padding, rect_left_, rect_right);
+  *top = ClipToRange(*top / scale_ + rect_top_ - padding, rect_top_, rect_bottom);
+  // right/bottom use the already clipped left/top as lower bound so the box
+  // can never be inverted.
+  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding, *left, rect_right);
+  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding, *top, rect_bottom);
+  return true;
+}
+
+/**
+ * Returns true if there is no object at the given level at the current
+ * position: the iterator is past the end of the page, it is in a block
+ * without a word (image block) and something below block level is asked
+ * for, or a symbol is asked for but the blob index is beyond the word.
+ */
+bool PageIterator::Empty(PageIteratorLevel level) const {
+  if (it_->block() == nullptr) {
+    return true; // Already at the end!
+  }
+  const bool in_image_block = (it_->word() == nullptr);
+  if (in_image_block && level != RIL_BLOCK) {
+    return true; // Only the block itself exists here.
+  }
+  // Zero length word, or already at the end of it.
+  return level == RIL_SYMBOL && blob_index_ >= word_length_;
+}
+
+/**
+ * Returns the type of the current block.
+ * See tesseract/publictypes.h for PolyBlockType.
+ * PT_UNKNOWN is returned when there is no current block, and
+ * PT_FLOWING_TEXT when the block carries no polygon.
+ */
+PolyBlockType PageIterator::BlockType() const {
+  auto *block_res = it_->block();
+  if (block_res == nullptr || block_res->block == nullptr) {
+    return PT_UNKNOWN; // Already at the end!
+  }
+  auto *polygon = block_res->block->pdblk.poly_block();
+  // No polygon: no layout analysis was used, so assume text.
+  return polygon == nullptr ? PT_FLOWING_TEXT : polygon->isA();
+}
+
+/**
+ * Returns the polygon outline of the current block. The returned Pta must
+ * be ptaDestroy-ed after use.
+ * Returns nullptr when there is no current block or when the block has no
+ * polygon (no layout analysis used).
+ * The vertices are rotated by the block's re_rotation, divided by scale_,
+ * flipped to top-down y and clipped to the image rectangle.
+ */
+Pta *PageIterator::BlockPolygon() const {
+  if (it_->block() == nullptr || it_->block()->block == nullptr) {
+    return nullptr; // Already at the end!
+  }
+  BLOCK *block = it_->block()->block;
+  POLY_BLOCK *internal_poly = block->pdblk.poly_block();
+  if (internal_poly == nullptr) {
+    return nullptr; // No layout analysis used - no polygon.
+  }
+  // Copy polygon, so we can unrotate it to image coordinates without
+  // touching the block's own data.
+  ICOORDELT_LIST vertices;
+  vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
+  POLY_BLOCK poly(&vertices, internal_poly->isA());
+  poly.rotate(block->re_rotation());
+  ICOORDELT_IT it(poly.points());
+  Pta *pta = ptaCreate(it.length());
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    ICOORD *pt = it.data();
+    // Convert to top-down coords within the input image.
+    int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
+    int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
+    x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);
+    y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);
+    ptaAddPt(pta, x, y);
+  }
+  return pta;
+}
+
+/**
+ * Returns a binary image of the current object at the given level.
+ * The position and size match the return from BoundingBoxInternal, and so this
+ * could be upscaled with respect to the original input image.
+ * Use pixDestroy to delete the image after use.
+ * Returns nullptr if there is no object at the given level.
+ * The following methods are used to generate the images:
+ * RIL_BLOCK: mask the page image with the block polygon.
+ * RIL_PARA: handled like RIL_BLOCK, i.e. the paragraph box is masked with
+ * the block polygon.
+ * RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
+ * TODO(rays) fix this to generate and use a line polygon.
+ * RIL_WORD: Clip the rectangle of the word box from the page image.
+ * RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
+ * to recognition) or the bounding box otherwise.
+ * A reconstruction of the original image (using xor to check for double
+ * representation) should be reasonably accurate,
+ * apart from removed noise, at the block level. Below the block level, the
+ * reconstruction will be missing images and line separators.
+ * At the symbol level, kerned characters will invade the bounding box
+ * if rendered after recognition, making an xor reconstruction inaccurate, but
+ * an or construction better. Before recognition, symbol-level reconstruction
+ * should be good, even with xor, since the images come from the connected
+ * components.
+ */
+Pix *PageIterator::GetBinaryImage(PageIteratorLevel level) const {
+  int left, top, right, bottom;
+  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) {
+    return nullptr;
+  }
+  // Before recognition a symbol is a cblob: render its outline directly,
+  // unless its area is 0, in which case fall back to clipping the box.
+  if (level == RIL_SYMBOL && cblob_it_ != nullptr && cblob_it_->data()->area() != 0) {
+    return cblob_it_->data()->render();
+  }
+  Box *box = boxCreate(left, top, right - left, bottom - top);
+  Image pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Image mask = it_->block()->block->render_mask(&mask_box);
+    // Offset of the clipped image relative to the mask; mask_box.top() is
+    // converted to top-down y using the image height.
+    int mask_x = left - mask_box.left();
+    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
+    // AND the mask and pix, putting the result in pix.
+    // A negative offset shifts the destination, a positive one the source.
+    pixRasterop(pix, std::max(0, -mask_x), std::max(0, -mask_y), pixGetWidth(pix),
+                pixGetHeight(pix), PIX_SRC & PIX_DST, mask, std::max(0, mask_x),
+                std::max(0, mask_y));
+    mask.destroy();
+  }
+  return pix;
+}
+
+/**
+ * Returns an image of the current object at the given level in greyscale
+ * if available in the input. To guarantee a binary image use BinaryImage.
+ * NOTE that in order to give the best possible image, the bounds are
+ * expanded slightly over the binary connected component, by the supplied
+ * padding, so the top-left position of the returned image is returned
+ * in (left,top). These will most likely not match the coordinates
+ * returned by BoundingBox.
+ * If you do not supply an original image, you will get a binary one.
+ * Use pixDestroy to delete the image after use.
+ * Returns nullptr if there is no object at the given level.
+ */
+Pix *PageIterator::GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left,
+                            int *top) const {
+  int right, bottom;
+  // Unpadded box in original-image coordinates; padding is applied below.
+  if (!BoundingBox(level, left, top, &right, &bottom)) {
+    return nullptr;
+  }
+  if (original_img == nullptr) {
+    // No source image supplied: fall back to the binary image. left/top keep
+    // the unpadded values written by BoundingBox above.
+    return GetBinaryImage(level);
+  }
+
+  // Expand the box.
+  // NOTE(review): right/bottom are limited to rect_width_/rect_height_ rather
+  // than rect_left_ + rect_width_ / rect_top_ + rect_height_ - confirm this is
+  // intended when the image rectangle has a non-zero origin.
+  *left = std::max(*left - padding, 0);
+  *top = std::max(*top - padding, 0);
+  right = std::min(right + padding, rect_width_);
+  bottom = std::min(bottom + padding, rect_height_);
+  Box *box = boxCreate(*left, *top, right - *left, bottom - *top);
+  Image grey_pix = pixClipRectangle(original_img, box, nullptr);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Image mask = it_->block()->block->render_mask(&mask_box);
+    // Copy the mask registered correctly into an image the size of grey_pix.
+    int mask_x = *left - mask_box.left();
+    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
+    int width = pixGetWidth(grey_pix);
+    int height = pixGetHeight(grey_pix);
+    Image resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height, PIX_SRC,
+                mask, std::max(0, mask_x), std::max(0, mask_y));
+    mask.destroy();
+    // Grow the mask by the padding, invert it so that it selects everything
+    // outside the (padded) block, and overwrite those pixels with UINT32_MAX.
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, 2 * padding + 1);
+    pixInvert(resized_mask, resized_mask);
+    pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
+    resized_mask.destroy();
+  }
+  return grey_pix;
+}
+
+/**
+ * Computes the baseline of the current object at the given level as the
+ * line through (x1, y1) and (x2, y2).
+ * For RIL_WORD and RIL_SYMBOL the end points span the word box; for every
+ * other level they span the row box.
+ * Returns false, leaving the outputs untouched, when there is no current
+ * word.
+ * WARNING: with vertical text, baselines may be vertical!
+ */
+bool PageIterator::Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const {
+  if (it_->word() == nullptr) {
+    return false; // No word here (end of page or non-text block).
+  }
+  ROW *row = it_->row()->row;
+  WERD *word = it_->word()->word;
+  const bool word_scope = (level == RIL_WORD || level == RIL_SYMBOL);
+  TBOX extent = word_scope ? word->bounding_box() : row->bounding_box();
+  // Evaluate the row's baseline at both horizontal ends of the box, rounding
+  // to the nearest integer.
+  int x_begin = extent.left();
+  ICOORD startpt(x_begin, static_cast<int16_t>(row->base_line(x_begin) + 0.5));
+  int x_end = extent.right();
+  ICOORD endpt(x_end, static_cast<int16_t>(row->base_line(x_end) + 0.5));
+  // Rotate to image coordinates and convert to global image coords.
+  startpt.rotate(it_->block()->block->re_rotation());
+  endpt.rotate(it_->block()->block->re_rotation());
+  *x1 = startpt.x() / scale_ + rect_left_;
+  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
+  *x2 = endpt.x() / scale_ + rect_left_;
+  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
+  return true;
+}
+
+/**
+ * Reports the orientation of the current block.
+ * @param orientation       which way "up" of the page points in the image
+ * @param writing_direction direction of the text within a line
+ * @param textline_order    order in which the lines follow each other
+ * @param deskew_angle      negated angle of the block's skew vector
+ * When there is no current block (iterator at the end of the page) nothing
+ * can be measured, so the outputs are set to the neutral defaults
+ * page-up / left-to-right / top-to-bottom / 0 instead of dereferencing a
+ * null block.
+ */
+void PageIterator::Orientation(tesseract::Orientation *orientation,
+                               tesseract::WritingDirection *writing_direction,
+                               tesseract::TextlineOrder *textline_order,
+                               float *deskew_angle) const {
+  auto *block_res = it_->block();
+  if (block_res == nullptr || block_res->block == nullptr) {
+    // Already at the end (same condition BlockType() checks).
+    *orientation = ORIENTATION_PAGE_UP;
+    *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
+    *textline_order = TEXTLINE_ORDER_TOP_TO_BOTTOM;
+    *deskew_angle = 0.0f;
+    return;
+  }
+  BLOCK *block = block_res->block;
+
+  // Orientation: take the unit "up" vector, undo the classify rotation and
+  // apply the re_rotation, then see which way it points.
+  FCOORD up_in_image(0.0, 1.0);
+  up_in_image.unrotate(block->classify_rotation());
+  up_in_image.rotate(block->re_rotation());
+
+  if (up_in_image.x() == 0.0F) {
+    if (up_in_image.y() > 0.0F) {
+      *orientation = ORIENTATION_PAGE_UP;
+    } else {
+      *orientation = ORIENTATION_PAGE_DOWN;
+    }
+  } else if (up_in_image.x() > 0.0F) {
+    *orientation = ORIENTATION_PAGE_RIGHT;
+  } else {
+    *orientation = ORIENTATION_PAGE_LEFT;
+  }
+
+  // Writing direction: a classify rotation with zero x component is treated
+  // as vertical text.
+  bool is_vertical_text = (block->classify_rotation().x() == 0.0);
+  bool right_to_left = block->right_to_left();
+  *writing_direction = is_vertical_text ? WRITING_DIRECTION_TOP_TO_BOTTOM
+                                        : (right_to_left ? WRITING_DIRECTION_RIGHT_TO_LEFT
+                                                         : WRITING_DIRECTION_LEFT_TO_RIGHT);
+
+  // Textline Order
+  const bool is_mongolian = false; // TODO(eger): fix me
+  *textline_order = is_vertical_text ? (is_mongolian ? TEXTLINE_ORDER_LEFT_TO_RIGHT
+                                                     : TEXTLINE_ORDER_RIGHT_TO_LEFT)
+                                     : TEXTLINE_ORDER_TOP_TO_BOTTOM;
+
+  // Deskew angle
+  FCOORD skew = block->skew(); // true horizontal for textlines
+  *deskew_angle = -skew.angle();
+}
+
+/**
+ * Reports properties of the paragraph the iterator is in.
+ * *just is always written (JUSTIFICATION_UNKNOWN when nothing is known).
+ * The other three outputs are only written when the current row has a
+ * paragraph with a model; otherwise they are left untouched.
+ * first_line_indent is the model's first indent minus its body indent.
+ */
+void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just, bool *is_list_item,
+                                 bool *is_crown, int *first_line_indent) const {
+  *just = tesseract::JUSTIFICATION_UNKNOWN;
+  auto *row_res = it_->row();
+  if (row_res == nullptr || row_res->row == nullptr) {
+    return; // No row at this position.
+  }
+  PARA *para = row_res->row->para();
+  if (para == nullptr || para->model == nullptr) {
+    return; // No paragraph, or paragraph without a model.
+  }
+
+  *is_list_item = para->is_list_item;
+  *is_crown = para->is_very_first_or_continuation;
+  *first_line_indent = para->model->first_indent() - para->model->body_indent();
+  *just = para->model->justification();
+}
+
+/**
+ * Prepares the symbol-level state (word_, word_length_, cblob_it_,
+ * blob_index_) for the word the underlying iterator currently points at,
+ * then advances to the symbol at the given offset.
+ * Three situations are distinguished:
+ *  - no word (non-text block): length and index are zeroed;
+ *  - word without best_choice (not recognized yet): symbols are the word's
+ *    cblobs, iterated through cblob_it_;
+ *  - word with best_choice: symbols are indexed through box_word and any
+ *    existing cblob iterator is discarded.
+ */
+void PageIterator::BeginWord(int offset) {
+  WERD_RES *word_res = it_->word();
+  if (word_res == nullptr) {
+    // This is a non-text block, so there is no word.
+    word_ = nullptr;
+    word_length_ = 0;
+    blob_index_ = 0;
+    return;
+  }
+  if (word_res->best_choice == nullptr) {
+    // No recognition yet, so a "symbol" is a cblob.
+    word_ = word_res->word;
+    ASSERT_HOST(word_->cblob_list() != nullptr);
+    word_length_ = word_->cblob_list()->length();
+    if (cblob_it_ == nullptr) {
+      cblob_it_ = new C_BLOB_IT;
+    }
+    cblob_it_->set_to_list(word_->cblob_list());
+  } else {
+    // Recognition has been done, so we are using the box_word, which
+    // is already baseline denormalized.
+    word_length_ = word_res->best_choice->length();
+    if (word_res->box_word != nullptr) {
+      // Report the mismatch in detail before the assertion below fires.
+      if (word_res->box_word->length() != word_length_) {
+        tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ", word_length_,
+                word_res->best_choice->unichar_string().c_str(), word_res->box_word->length());
+        word_res->box_word->bounding_box().print();
+      }
+      ASSERT_HOST(word_res->box_word->length() == word_length_);
+    }
+    word_ = nullptr;
+    // We will be iterating the box_word.
+    delete cblob_it_;
+    cblob_it_ = nullptr;
+  }
+  // Advance to the requested symbol; the cblob iterator, if any, is moved in
+  // step with the index.
+  blob_index_ = 0;
+  while (blob_index_ < offset) {
+    if (cblob_it_ != nullptr) {
+      cblob_it_->forward();
+    }
+    ++blob_index_;
+  }
+}
+
+/**
+ * Stores blamer_bundle in the current word's blamer_bundle field.
+ * Returns false, storing nothing, when there is no current word.
+ */
+bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
+  WERD_RES *word_res = it_->word();
+  if (word_res == nullptr) {
+    return false;
+  }
+  word_res->blamer_bundle = blamer_bundle;
+  return true;
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/pagesegmain.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/pagesegmain.cpp
@ -0,0 +1,414 @@
+/**********************************************************************
+ * File:        pagesegmain.cpp
+ * Description: Top-level page segmenter for Tesseract.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifdef _WIN32
+#  ifndef unlink
+#    include <io.h>
+#  endif
+#else
+#  include <unistd.h>
+#endif // _WIN32
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include <allheaders.h>
+#include "blobbox.h"
+#include "blread.h"
+#include "colfind.h"
+#include "debugpixa.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "equationdetect.h"
+#endif
+#include <tesseract/osdetect.h>
+#include "imagefind.h"
+#include "linefind.h"
+#include "makerow.h"
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "textord.h"
+#include "tordmain.h"
+#include "wordseg.h"
+
+namespace tesseract {
+
+// Upper bound for the erosion loop in RemoveEnclosingCircle. The loop index
+// starts at 1 and stops before this value, so at most
+// kMaxCircleErosions - 1 erosions are performed.
+const int kMaxCircleErosions = 8;
+
+// Helper to remove an enclosing circle from an image.
+// If there isn't one, then the image will most likely get badly mangled.
+// The returned pix must be pixDestroyed after use. nullptr may be returned
+// if the image doesn't meet the trivial conditions that it uses to determine
+// success, i.e. if the connected component count never drops below the
+// count seen after the first erosion / latest increase.
+static Image RemoveEnclosingCircle(Image pixs) {
+  // Build pixc: seed-fill the inverted image from a 1-pixel border, then
+  // invert the fill result.
+  Image pixsi = pixInvert(nullptr, pixs);
+  Image pixc = pixCreateTemplate(pixs);
+  pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
+  pixSeedfillBinary(pixc, pixc, pixsi, 4);
+  pixInvert(pixc, pixc);
+  pixsi.destroy();
+  Image pixt = pixs & pixc;
+  l_int32 max_count;
+  // Initial count; it is overwritten on the first loop iteration (i == 1).
+  pixCountConnComp(pixt, 8, &max_count);
+  // The count has to go up before we start looking for the minimum.
+  l_int32 min_count = INT32_MAX;
+  Image pixout = nullptr;
+  for (int i = 1; i < kMaxCircleErosions; i++) {
+    pixt.destroy();
+    // Shrink the mask by one 3x3 erosion and re-apply it to the source.
+    pixErodeBrick(pixc, pixc, 3, 3);
+    pixt = pixs & pixc;
+    l_int32 count;
+    pixCountConnComp(pixt, 8, &count);
+    if (i == 1 || count > max_count) {
+      // Still rising (or first pass): restart the search for the minimum.
+      max_count = count;
+      min_count = count;
+    } else if (count < min_count) {
+      min_count = count;
+      pixout.destroy();
+      pixout = pixt.copy(); // Save the best.
+    } else if (count >= min_count) {
+      break; // We have passed by the best.
+    }
+  }
+  pixt.destroy();
+  pixc.destroy();
+  return pixout;
+}
+
+/**
+ * Segment the page according to the current value of tessedit_pageseg_mode.
+ * pix_binary_ is used as the source image and should not be nullptr.
+ * On return the blocks list owns all the constructed page layout.
+ *
+ * @param input_file name of the input image; when column finding is disabled
+ *                   and the name is non-empty, its base name (text before the
+ *                   last '.') is handed to read_unlv_file to look for a UNLV
+ *                   zone file. May be nullptr.
+ * @param blocks     in/out list of blocks.
+ * @param osd_tess   passed through to AutoPageSeg.
+ * @param osr        passed through to AutoPageSeg.
+ * @return -1 if AutoPageSeg reported a negative result, 0 for an empty page,
+ *         otherwise the AutoPageSeg result (0 when AutoPageSeg was not run).
+ *         In PSM_OSD_ONLY mode the AutoPageSeg result is returned directly.
+ */
+int Tesseract::SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess,
+                           OSResults *osr) {
+  ASSERT_HOST(pix_binary_ != nullptr);
+  int width = pixGetWidth(pix_binary_);
+  int height = pixGetHeight(pix_binary_);
+  // Get page segmentation mode.
+  auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
+  // If a UNLV zone file can be found, use that instead of segmentation.
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
+    std::string name = input_file;
+    // Drop the extension by really shortening the string. Writing '\0' into a
+    // std::string does not change its length, so the callee (which receives
+    // the string itself, not a C string) would still see the old characters
+    // behind an embedded NUL.
+    const std::string::size_type lastdot = name.rfind('.');
+    if (lastdot != std::string::npos) {
+      name.resize(lastdot);
+    }
+    read_unlv_file(name, width, height, blocks);
+  }
+  if (blocks->empty()) {
+    // No UNLV file present. Work according to the PageSegMode.
+    // First make a single block covering the whole image.
+    BLOCK_IT block_it(blocks);
+    auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
+    block->set_right_to_left(right_to_left());
+    block_it.add_to_end(block);
+  } else {
+    // UNLV file present. Use PSM_SINGLE_BLOCK.
+    pageseg_mode = PSM_SINGLE_BLOCK;
+  }
+  // The diacritic_blobs holds noise blobs that may be diacritics. They
+  // are separated out on areas of the image that seem noisy and short-circuit
+  // the layout process, going straight from the initial partition creation
+  // right through to after word segmentation, where they are added to the
+  // rej_cblobs list of the most appropriate word. From there classification
+  // will determine whether they are used.
+  BLOBNBOX_LIST diacritic_blobs;
+  int auto_page_seg_ret_val = 0;
+  TO_BLOCK_LIST to_blocks;
+  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
+      PSM_SPARSE(pageseg_mode)) {
+    auto_page_seg_ret_val =
+        AutoPageSeg(pageseg_mode, blocks, &to_blocks,
+                    enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
+    if (pageseg_mode == PSM_OSD_ONLY) {
+      return auto_page_seg_ret_val;
+    }
+    // To create blobs from the image region bounds uncomment this line:
+    //  to_blocks.clear();  // Uncomment to go back to the old mode.
+  } else {
+    // No automatic segmentation: assume an unskewed page.
+    deskew_ = FCOORD(1.0f, 0.0f);
+    reskew_ = FCOORD(1.0f, 0.0f);
+    if (pageseg_mode == PSM_CIRCLE_WORD) {
+      // Replace the binary image only if circle removal produced a result.
+      Image pixcleaned = RemoveEnclosingCircle(pix_binary_);
+      if (pixcleaned != nullptr) {
+        pix_binary_.destroy();
+        pix_binary_ = pixcleaned;
+      }
+    }
+  }
+
+  if (auto_page_seg_ret_val < 0) {
+    return -1;
+  }
+
+  if (blocks->empty()) {
+    if (textord_debug_tabfind) {
+      tprintf("Empty page\n");
+    }
+    return 0; // AutoPageSeg found an empty page.
+  }
+  bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
+  bool cjk_mode = textord_use_cjk_fp_model;
+
+  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
+                       pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
+  return auto_page_seg_ret_val;
+}
+
+/**
+ * Automatic page segmentation: divides the page image into blocks of uniform
+ * text line spacing and image regions.
+ *
+ * Resolution (in ppi) is derived from the input image.
+ *
+ * The heavy lifting is done by a ColumnFinder obtained from
+ * SetupPageSegAndDetectOrientation(). If that call yields no finder, the
+ * result is 0 and no block finding takes place. Otherwise the result is
+ * whatever ColumnFinder::FindBlocks() returns, and on success (>= 0) the
+ * deskew_/reskew_ vectors are refreshed from the finder.
+ *
+ * @param pageseg_mode    Page segmentation mode; forwarded to the setup step
+ *                        and to FindBlocks(). If
+ *                        !PSM_COL_FIND_ENABLED(pageseg_mode), no attempt is
+ *                        made to divide the image into columns, but multiple
+ *                        blocks are still made if the text is of non-uniform
+ *                        linespacing.
+ * @param blocks          In/out block list. It is handed to the setup step,
+ *                        and unless the result is negative it is cleared and
+ *                        refilled with the blocks that were found.
+ * @param to_blocks       Receives the TO_BLOCKs corresponding to blocks.
+ * @param diacritic_blobs If non-null, diacritics/noise blobs that would
+ *                        confuse layout analysis by causing textline overlap
+ *                        are placed there, with the expectation that they
+ *                        will be reassigned to words later and
+ *                        noise/diacriticness determined via classification.
+ * @param osd_tess        Tesseract instance used for orientation and script
+ *                        detection; forwarded to the setup step.
+ * @param osr             Receives the orientation and script result;
+ *                        forwarded to the setup step.
+ * @return A negative value if FindBlocks() failed (blocks is then left
+ *         untouched); otherwise the non-negative result, with 0 also used
+ *         when no ColumnFinder was created.
+ */
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
+                           BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) {
+  Image photomask_pix = nullptr;
+  Image musicmask_pix = nullptr;
+  // Output of the ColumnFinder; spliced into blocks just before returning.
+  BLOCK_LIST found_blocks;
+  TO_BLOCK_LIST temp_blocks;
+
+  // A music mask is only requested when pageseg_apply_music_mask is set.
+  Image *music_mask_out = pageseg_apply_music_mask ? &musicmask_pix : nullptr;
+  ColumnFinder *finder = SetupPageSegAndDetectOrientation(
+      pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix, music_mask_out);
+
+  int result = 0;
+  if (finder != nullptr) {
+    if (musicmask_pix != nullptr) {
+      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
+      // blocks separately. For now combine with photomask_pix.
+      photomask_pix |= musicmask_pix;
+    }
+#ifndef DISABLED_LEGACY_ENGINE
+    if (equ_detect_) {
+      finder->SetEquationDetect(equ_detect_);
+    }
+#endif // ndef DISABLED_LEGACY_ENGINE
+    // The block produced by the setup step is the input to FindBlocks.
+    TO_BLOCK_IT input_it(&temp_blocks);
+    TO_BLOCK *input_block = input_it.data();
+    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, input_block,
+                                photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
+                                &found_blocks, diacritic_blobs, to_blocks);
+    if (result >= 0) {
+      finder->GetDeskewVectors(&deskew_, &reskew_);
+    }
+    delete finder;
+  }
+  // The masks are released whether or not block finding ran or succeeded.
+  photomask_pix.destroy();
+  musicmask_pix.destroy();
+
+  if (result >= 0) {
+    // Replace the caller's blocks with the ones that were found.
+    blocks->clear();
+    BLOCK_IT block_it(blocks);
+    block_it.add_list_after(&found_blocks);
+  }
+  return result;
+}
+
+// Helper: walks every script id of sid_set except its null script, looks the
+// script's name up in osd_set, and appends the resulting osd_set script id to
+// allowed_ids.
+static void AddAllScriptsConverted(const UNICHARSET &sid_set, const UNICHARSET &osd_set,
+                                   std::vector<int> *allowed_ids) {
+  for (int sid = 0; sid < sid_set.get_script_table_size(); ++sid) {
+    if (sid == sid_set.null_sid()) {
+      continue; // The null script has no counterpart worth converting.
+    }
+    const char *script_name = sid_set.get_script_from_script_id(sid);
+    allowed_ids->push_back(osd_set.get_script_id_from_name(script_name));
+  }
+}
+
+/**
+ * Sets up auto page segmentation, determines the orientation, and corrects it.
+ * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
+ * facilitate testing.
+ *
+ * Steps, in order:
+ *  1. Find and remove rule/separator lines from pix_binary_ (LineFinder),
+ *     collecting the vertical and horizontal lines and, when music_mask_pix
+ *     is supplied, a music mask.
+ *  2. Find the photo regions (ImageFind) and store the mask in
+ *     *photo_mask_pix.
+ *  3. Run connected-component analysis (textord_.find_components) to fill
+ *     blocks and to_blocks. Exactly one TO_BLOCK is asserted afterwards.
+ *  4. If source_resolution_ equals kMinCredibleResolution, try to estimate a
+ *     better resolution from that block's line_size.
+ *  5. If line_size >= 2, build a ColumnFinder and filter noise. Unless
+ *     DISABLED_LEGACY_ENGINE is defined, also label special text for the
+ *     equation detector (if any), run orientation and script detection when
+ *     PSM_OSD_ENABLED(pageseg_mode) and both osd_tess and osr are non-null,
+ *     and finally correct the orientation of the block.
+ *
+ * @param pageseg_mode   Page segmentation mode; consulted through
+ *                       PSM_COL_FIND_ENABLED, PSM_ORIENTATION_ENABLED,
+ *                       PSM_OSD_ENABLED and the PSM_OSD_ONLY /
+ *                       PSM_SINGLE_BLOCK_VERT_TEXT values.
+ * @param blocks         Block list handed to textord_.find_components.
+ * @param osd_tess       Tesseract used for orientation and script detection.
+ *                       When it differs from this, the candidate scripts are
+ *                       constrained to those of this and its sub-languages.
+ * @param osr            Receives the orientation and script result.
+ * @param to_blocks      An empty list that will be filled with (usually a
+ *                       single) block that is used during layout analysis.
+ *                       This ugly API is required because of the possibility
+ *                       of a UNLV zone file.
+ * @param photo_mask_pix Pointer to a null Image that is filled on return with
+ *                       the leptonica photo mask, which must be destroyed by
+ *                       the caller.
+ * @param music_mask_pix Forwarded to LineFinder::FindAndRemoveLines;
+ *                       AutoPageSeg passes nullptr when no music mask is
+ *                       wanted.
+ * @return The ColumnFinder, which must be deleted after use, or nullptr when
+ *         line_size is below 2 or when pageseg_mode is PSM_OSD_ONLY and
+ *         orientation and script detection has been run.
+ *
+ * TODO(rays) clean this up.
+ * See AutoPageSeg for other arguments.
+ */
+ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,
+                                                          BLOCK_LIST *blocks, Tesseract *osd_tess,
+                                                          OSResults *osr, TO_BLOCK_LIST *to_blocks,
+                                                          Image *photo_mask_pix,
+                                                          Image *music_mask_pix) {
+  int vertical_x = 0;
+  int vertical_y = 1;
+  TabVector_LIST v_lines;
+  TabVector_LIST h_lines;
+  ICOORD bleft(0, 0);
+
+  ASSERT_HOST(pix_binary_ != nullptr);
+  if (tessedit_dump_pageseg_images) {
+    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
+  }
+  // Leptonica is used to find the rule/separator lines in the input.
+  LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
+                                 &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
+  if (tessedit_dump_pageseg_images) {
+    pixa_debug_.AddPix(pix_binary_, "NoLines");
+  }
+  // Leptonica is used to find a mask of the photo regions in the input.
+  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
+  if (tessedit_dump_pageseg_images) {
+    Image pix_no_image_ = nullptr; // debug-only copy of the binary image without photo regions
+    if (*photo_mask_pix != nullptr) {
+      pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
+    } else {
+      pix_no_image_ = pix_binary_.clone();
+    }
+    pixa_debug_.AddPix(pix_no_image_, "NoImages");
+    pix_no_image_.destroy();
+  }
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
+    v_lines.clear(); // vertical separator lines are only kept when column finding is enabled
+  }
+
+  // The rest of the algorithm uses the usual connected components.
+  textord_.find_components(pix_binary_, blocks, to_blocks);
+
+  TO_BLOCK_IT to_block_it(to_blocks);
+  // There must be exactly one input block.
+  // TODO(rays) handle new textline finding with a UNLV zone file.
+  ASSERT_HOST(to_blocks->singleton());
+  TO_BLOCK *to_block = to_block_it.data();
+  TBOX blkbox = to_block->block->pdblk.bounding_box();
+  ColumnFinder *finder = nullptr;
+  int estimated_resolution = source_resolution_;
+  if (source_resolution_ == kMinCredibleResolution) {
+    // Try to estimate resolution from typical body text size.
+    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
+    if (res > estimated_resolution && res < kMaxCredibleResolution) { // only ever raise it
+      estimated_resolution = res;
+      tprintf("Estimating resolution as %d\n", estimated_resolution);
+    }
+  }
+
+  if (to_block->line_size >= 2) { // otherwise no finder is built and nullptr is returned
+    finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
+                              blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
+                              textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
+                              vertical_y);
+
+    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+    if (equ_detect_) {
+      equ_detect_->LabelSpecialText(to_block);
+    }
+
+    BLOBNBOX_CLIST osd_blobs;
+    // osd_orientation is the number of 90 degree rotations to make the
+    // characters upright. (See tesseract/osdetect.h for precise definition.)
+    // We want the text lines horizontal, (vertical text indicates vertical
+    // textlines) which may conflict (eg vertically written CJK).
+    int osd_orientation = 0;
+    bool vertical_text =
+        textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
+    if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
+      vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
+                                                      &osd_blobs);
+    }
+    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
+      std::vector<int> osd_scripts; // stays empty when osd_tess == this
+      if (osd_tess != this) {
+        // We are running osd as part of layout analysis, so constrain the
+        // scripts to those allowed by *this.
+        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
+        for (auto &lang : sub_langs_) {
+          AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);
+        }
+      }
+      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
+      if (pageseg_mode == PSM_OSD_ONLY) {
+        delete finder;
+        return nullptr; // OSD-only: the result is already in osr, no layout analysis follows
+      }
+      osd_orientation = osr->best_result.orientation_id;
+      double osd_score = osr->orientations[osd_orientation];
+      double osd_margin = min_orientation_margin * 2; // cap; lowered to the closest rival's gap
+      for (int i = 0; i < 4; ++i) {
+        if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
+          osd_margin = osd_score - osr->orientations[i];
+        }
+      }
+      int best_script_id = osr->best_result.script_id;
+      const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
+      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
+                 best_script_id == osd_tess->unicharset.hiragana_sid() ||
+                 best_script_id == osd_tess->unicharset.katakana_sid() ||
+                 strcmp("Japanese", best_script_str) == 0 ||
+                 strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
+      if (cjk) {
+        finder->set_cjk_script(true);
+      }
+      if (osd_margin < min_orientation_margin) {
+        // The margin is weak.
+        if (!cjk && !vertical_text && osd_orientation == 2) {
+          // upside down latin text is improbable with such a weak margin.
+          tprintf(
+              "OSD: Weak margin (%.2f), horiz textlines, not CJK: "
+              "Don't rotate.\n",
+              osd_margin);
+          osd_orientation = 0; // ignore the detected upside-down orientation
+        } else {
+          tprintf(
+              "OSD: Weak margin (%.2f) for %d blob text block, "
+              "but using orientation anyway: %d\n",
+              osd_margin, osd_blobs.length(), osd_orientation);
+        }
+      }
+    }
+    osd_blobs.shallow_clear();
+    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+  }
+
+  return finder;
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/pagewalk.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/pagewalk.cpp
@ -0,0 +1,42 @@
+/**********************************************************************
+ * File:        pagewalk.cpp  (Formerly walkers.c)
+ * Description: Block list processors
+ * Author:      Phil Cheatle
+ * Created:     Thu Oct 10 16:25:24 BST 1991
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "pageres.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+/**
+ * @name process_selected_words()
+ *
+ * Visits the words of page_res in iteration order and calls word_processor
+ * (a Tesseract member function) on each word whose bounding box overlaps
+ * selection_box. The walk stops as soon as word_processor returns false.
+ */
+void Tesseract::process_selected_words(
+    PAGE_RES *page_res, // blocks to check
+    TBOX &selection_box, bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) {
+  PAGE_RES_IT word_it(page_res);
+  while (word_it.word() != nullptr) {
+    const bool selected = word_it.word()->word->bounding_box().overlap(selection_box);
+    // The processor is only invoked for selected words; a false result aborts.
+    if (selected && !(this->*word_processor)(&word_it)) {
+      return;
+    }
+    word_it.forward();
+  }
+}
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/par_control.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/par_control.cpp
@ -0,0 +1,70 @@
+///////////////////////////////////////////////////////////////////////
+// File:        par_control.cpp
+// Description: Control code for parallel implementation.
+// Author:      Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+#ifdef _OPENMP
+#  include <omp.h>
+#endif // _OPENMP
+
+namespace tesseract {
+
+// One blob queued for pre-classification, together with the Tesseract that
+// should classify it and the place where the result is to be stored.
+struct BlobData {
+  // Blob to classify (entry `index` of the word's chopped_word blobs).
+  TBLOB *blob = nullptr;
+  // Tesseract instance whose classify_blob is used for this blob.
+  Tesseract *tesseract = nullptr;
+  // Address of the (index, index) cell of the word's ratings matrix.
+  BLOB_CHOICE_LIST **choices = nullptr;
+
+  BlobData() = default;
+  BlobData(int index, Tesseract *tess, const WERD_RES &word)
+      : blob{word.chopped_word->blobs[index]},
+        tesseract{tess},
+        choices{&(*word.ratings)(index, index)} {}
+};
+
+// Pre-classifies the blobs of all words whose ratings matrix exists but has
+// an empty (0, 0) cell. For each such word, every blob of every per-language
+// WERD_RES is classified with the matching sub-language Tesseract (or with
+// this one when the index is past the end of sub_langs_), and the result is
+// stored in the diagonal cell of that word's ratings matrix.
+// When tessedit_parallelize > 1 and OpenMP is available, the classification
+// loop runs in parallel on a fixed 10 threads.
+void Tesseract::PrerecAllWordsPar(const std::vector<WordData> &words) {
+  // Collect one work item per blob.
+  std::vector<BlobData> pending;
+  for (const auto &word_data : words) {
+    const bool needs_prerec =
+        word_data.word->ratings != nullptr && word_data.word->ratings->get(0, 0) == nullptr;
+    if (!needs_prerec) {
+      continue;
+    }
+    for (int lang_idx = 0; lang_idx < word_data.lang_words.size(); ++lang_idx) {
+      Tesseract *classifier = this;
+      if (lang_idx < sub_langs_.size()) {
+        classifier = sub_langs_[lang_idx];
+      }
+      const WERD_RES &lang_word = *word_data.lang_words[lang_idx];
+      for (int blob_idx = 0; blob_idx < lang_word.chopped_word->NumBlobs(); ++blob_idx) {
+        pending.emplace_back(blob_idx, classifier, lang_word);
+      }
+    }
+  }
+  // Classify every collected blob and store the choices in its ratings cell.
+  if (tessedit_parallelize <= 1) {
+    // TODO(AMD) parallelize this.
+    for (auto &item : pending) {
+      *item.choices = item.tesseract->classify_blob(item.blob, "par", ScrollView::WHITE, nullptr);
+    }
+  } else {
+#ifdef _OPENMP
+#  pragma omp parallel for num_threads(10)
+#endif // _OPENMP
+    // NOLINTNEXTLINE(modernize-loop-convert)
+    for (size_t i = 0; i < pending.size(); ++i) {
+      BlobData &item = pending[i];
+      *item.choices = item.tesseract->classify_blob(item.blob, "par", ScrollView::WHITE, nullptr);
+    }
+  }
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.h
@ -0,0 +1,104 @@
+/**********************************************************************
+ * File:        paragraphs.h
+ * Description: Paragraph Detection data structures.
+ * Author:      David Eger
+ * Created:     25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_H_
+
+#include <list>
+#include <string>
+#include "rect.h"   // for TBOX
+
+namespace tesseract {
+
+class MutableIterator;
+class ParagraphModel;
+class PARA_LIST;
+struct PARA;
+
+// Everything paragraph detection needs to know about one text line. It is kept deliberately
+// light-weight so paragraph detection can be tested independently of the rest of Tesseract.
+// NOTE: no member has a default initializer, so the bool/int/float fields of a
+// default-initialized RowInfo (e.g. `RowInfo r;`) are indeterminate until assigned.
+class RowInfo {
+public:
+  // Constant data derived from Tesseract output.
+  std::string text; // the full UTF-8 text of the line.
+  bool ltr;    // whether the majority of the text is left-to-right
+               // TODO(eger) make this more fine-grained.
+
+  bool has_leaders;            // does the line contain leader dots (.....)?
+  bool has_drop_cap;           // does the line have a drop cap?
+  int pix_ldistance;           // distance to the left pblock boundary in pixels
+  int pix_rdistance;           // distance to the right pblock boundary in pixels
+  float pix_xheight;           // guessed xheight for the line
+  int average_interword_space; // average space between words in pixels.
+
+  int num_words; // number of words on the line (presumably; not documented in this header)
+  TBOX lword_box; // box of the leftmost word, in normalized (horiz text rows) space
+  TBOX rword_box; // box of the rightmost word, in normalized (horiz text rows) space
+
+  std::string lword_text; // the UTF-8 text of the leftmost word
+  std::string rword_text; // the UTF-8 text of the rightmost word
+
+  // The text of a paragraph typically starts with the start of an idea and
+  // ends with the end of an idea.  Here we define a paragraph as something that
+  // may have a first line indent and a body indent which may be different.
+  // Typical words that start an idea are:
+  //   1. Words in western scripts that start with
+  //      a capital letter, for example "The"
+  //   2. Bulleted or numbered list items, for
+  //      example "2."
+  // Typical words which end an idea are words ending in punctuation marks. In
+  // this vocabulary, each list item is represented as a paragraph.
+  bool lword_indicates_list_item; // leftmost word looks like a list item marker
+  bool lword_likely_starts_idea;  // leftmost word likely starts an idea
+  bool lword_likely_ends_idea;    // leftmost word likely ends an idea
+
+  bool rword_indicates_list_item; // rightmost word looks like a list item marker
+  bool rword_likely_starts_idea;  // rightmost word likely starts an idea
+  bool rword_likely_ends_idea;    // rightmost word likely ends an idea
+};
+
+// Main entry point for the paragraph detection algorithm: given a set of equally spaced
+// text lines (described by row_infos), split them into paragraphs.
+// Background reference: http://goto/paragraphstalk (a short link; presumably Google-internal).
+// Input:
+//   debug_level - NOTE(review): presumably controls diagnostic output; confirm in the .cpp.
+// Output:
+//   row_owners - one pointer for each row, to the paragraph it belongs to.
+//   paragraphs - this is the actual list of PARA objects.
+//   models - the list of paragraph models referenced by the PARA objects.
+//            caller is responsible for deleting the models.
+TESS_API
+void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
+                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
+                      std::vector<ParagraphModel *> *models);
+
+// Block-level overload: given a MutableIterator at the start of a block, run DetectParagraphs
+// on that block and commit the results to the underlying ROW and BLOCK structs, saving the
+// ParagraphModels in models.  Caller owns the models.  A unicharset (not a parameter here;
+// presumably reached through block_start) answers questions such as "is the first letter of
+// this word upper case?".  NOTE(review): after_text_recognition is undocumented - confirm.
+TESS_API
+void DetectParagraphs(int debug_level, bool after_text_recognition,
+                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models);
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs_internal.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs_internal.h
@ -0,0 +1,309 @@
+/**********************************************************************
+ * File:        paragraphs_internal.h
+ * Description: Paragraph Detection internal data structures.
+ * Author:      David Eger
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+
+#include <tesseract/publictypes.h> // for ParagraphJustification
+#include "paragraphs.h"
+
+// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
+// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
+
+namespace tesseract {
+
+class UNICHARSET;
+class WERD_CHOICE;
+
+// Return whether the given word is likely to be a list item start word.
+TESS_API
+bool AsciiLikelyListItem(const std::string &word);
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
+
+// Set right word attributes given either a unicharset and werd or a utf8
+// string.
+TESS_API
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
+                         bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Set left word attributes given either a unicharset and werd or a utf8 string.
+TESS_API
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
+                        bool *is_list, bool *starts_idea, bool *ends_idea);
+
+enum LineType { // Role of a text line within a paragraph; each value is an ASCII letter.
+  LT_START = 'S',    // First line of a paragraph.
+  LT_BODY = 'C',     // Continuation line of a paragraph.
+  LT_UNKNOWN = 'U',  // No clues.
+  LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
+};
+
+// The first paragraph in a page of body text is often un-indented.
+// This is a typographic convention which is common to indicate either that:
+// (1) The paragraph is the continuation of a previous paragraph, or
+// (2) The paragraph is the first paragraph in a chapter.
+//
+// I refer to such paragraphs as "crown"s, and the output of the paragraph
+// detection algorithm attempts to give them the same paragraph model as
+// the rest of the body text.
+//
+// Nonetheless, while building hypotheses, it is useful to mark the lines
+// of crown paragraphs temporarily as crowns, either aligned left or right.
+extern const ParagraphModel *kCrownLeft;
+extern const ParagraphModel *kCrownRight;
+
+// A model is "strong" when it is a real paragraph model: non-null and neither
+// of the two crown placeholders (kCrownLeft, kCrownRight).
+inline bool StrongModel(const ParagraphModel *model) {
+  if (model == nullptr) {
+    return false;
+  }
+  const bool is_crown = (model == kCrownLeft) || (model == kCrownRight);
+  return !is_crown;
+}
+
+// One hypothesis about a text line: the role it plays (ty) and the paragraph
+// model under which it plays that role.
+struct LineHypothesis {
+  LineType ty;
+  const ParagraphModel *model;
+
+  // Default: nothing is known about the line and no model is attached.
+  LineHypothesis() : ty{LT_UNKNOWN}, model{nullptr} {}
+  LineHypothesis(LineType line_type, const ParagraphModel *m) : ty{line_type}, model{m} {}
+
+  // Memberwise copy construction and copy assignment.
+  LineHypothesis(const LineHypothesis &other) = default;
+  LineHypothesis &operator=(const LineHypothesis &other) = default;
+
+  // Equal when both the line type and the model pointer match.
+  bool operator==(const LineHypothesis &other) const {
+    if (ty != other.ty) {
+      return false;
+    }
+    return model == other.model;
+  }
+};
+
+class ParagraphTheory; // Forward Declaration
+
+using SetOfModels = std::vector<const ParagraphModel *>;
+
+// Per-row scratch data that the paragraph detection algorithm derives from a
+// RowInfo input: the row's margins and indents plus the current hypotheses
+// about the role the row plays in a paragraph.
+class RowScratchRegisters {
+public:
+  // The given row is presumed to outlive this object.
+  void Init(const RowInfo &row);
+
+  LineType GetLineType() const;
+
+  LineType GetLineType(const ParagraphModel *model) const;
+
+  // Mark this row as a paragraph start line without attaching a model. Useful
+  // for the initial marking of probable body lines or paragraph start lines.
+  void SetStartLine();
+
+  // Mark this row as a paragraph body line without attaching a model. Useful
+  // for the initial marking of probable body lines or paragraph start lines.
+  void SetBodyLine();
+
+  // Record that this row fits as a paragraph start line in the given model.
+  void AddStartLine(const ParagraphModel *model);
+  // Record that this row fits as a paragraph body line in the given model.
+  void AddBodyLine(const ParagraphModel *model);
+
+  // Drop every hypothesis held about this line.
+  void SetUnknown() {
+    hypotheses_.clear();
+  }
+
+  // Append all hypotheses of strong models that match this row as a start.
+  void StartHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses of strong models matching this row.
+  void StrongHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses for this row.
+  void NonNullHypotheses(SetOfModels *models) const;
+
+  // Discard any hypotheses whose model is not in the given list.
+  void DiscardNonMatchingHypotheses(const SetOfModels &models);
+
+  // If the only hypothesis is that this line is a paragraph start line of a
+  // certain model, return that model.  Else return nullptr.
+  const ParagraphModel *UniqueStartHypothesis() const;
+
+  // If the only hypothesis is that this line is a paragraph body line of a
+  // certain model, return that model.  Else return nullptr.
+  const ParagraphModel *UniqueBodyHypothesis() const;
+
+  // Return the indentation for the side opposite of the aligned side. For a
+  // justification that is neither left nor right, the larger indent is used.
+  int OffsideIndent(tesseract::ParagraphJustification just) const {
+    if (just == tesseract::JUSTIFICATION_RIGHT) {
+      return lindent_;
+    }
+    if (just == tesseract::JUSTIFICATION_LEFT) {
+      return rindent_;
+    }
+    return lindent_ > rindent_ ? lindent_ : rindent_;
+  }
+
+  // Return the indentation for the side the text is aligned to. For a
+  // justification that is neither left nor right, the larger indent is used.
+  int AlignsideIndent(tesseract::ParagraphJustification just) const {
+    if (just == tesseract::JUSTIFICATION_RIGHT) {
+      return rindent_;
+    }
+    if (just == tesseract::JUSTIFICATION_LEFT) {
+      return lindent_;
+    }
+    return lindent_ > rindent_ ? lindent_ : rindent_;
+  }
+
+  // Append header fields to a vector of row headings.
+  static void AppendDebugHeaderFields(std::vector<std::string> &header);
+
+  // Append data for this row to a vector of debug strings.
+  void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
+
+  const RowInfo *ri_;
+
+  // These four values form a horizontal box model for the white space on the
+  // edges of each line.  At each point in the algorithm, the following shall
+  // hold:
+  //   ri_->pix_ldistance = lmargin_ + lindent_
+  //   ri_->pix_rdistance = rindent_ + rmargin_
+  int lmargin_;
+  int lindent_;
+  int rindent_;
+  int rmargin_;
+
+private:
+  // Hypotheses of either LT_START or LT_BODY
+  std::vector<LineHypothesis> hypotheses_;
+};
+
+// Convenience wrapper around the set of paragraph models believed to
+// correctly model the paragraphs in the image.
+class ParagraphTheory {
+public:
+  // The models vector is presumed to outlive this object, and it takes
+  // ownership of any ParagraphModel * added through this object.
+  explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_{models} {}
+
+  // Mutable and read-only access to the wrapped model list.
+  std::vector<ParagraphModel *> &models() {
+    return *models_;
+  }
+  const std::vector<ParagraphModel *> &models() const {
+    return *models_;
+  }
+
+  // Return an existing model if one that is Comparable() can be found.
+  // Otherwise allocate a new copy of model, save it, and return a pointer to
+  // the copy.
+  const ParagraphModel *AddModel(const ParagraphModel &model);
+
+  // Discard any models created here that are not in the list of used models.
+  void DiscardUnusedModels(const SetOfModels &used_models);
+
+  // Return the set of all non-centered models.
+  void NonCenteredModels(SetOfModels *models);
+
+  // If any of the known non-centered paragraph models fits
+  // rows[start, end), return it.  Else nullptr.
+  const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
+                             int end) const;
+
+  int IndexOf(const ParagraphModel *model) const;
+
+private:
+  std::vector<ParagraphModel *> *models_;
+  std::vector<ParagraphModel *> models_we_added_;
+};
+
+bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
+                    const ParagraphModel *model);
+bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
+                   const ParagraphModel *model);
+bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
+                     const ParagraphModel *model);
+
+// Smears paragraph model hypotheses onto surrounding rows.
+// StrongEvidenceClassify first marks only exceedingly obvious start and body
+// rows and constructs models of them.  That can leave unmarked lines (mostly
+// end-of-paragraph lines) which were too short to have much confidence about,
+// but which fit the constructed models perfectly and ought to be marked.
+// This class "smears" the models over such text.
+class ParagraphModelSmearer {
+public:
+  ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
+                        ParagraphTheory *theory);
+
+  // Smear forward paragraph models from existing row markings to subsequent
+  // text lines if they fit, and mark any rows that are still unmodeled
+  // afterwards with any model in the theory that fits them.
+  void Smear();
+
+private:
+  // Record in open_models_, for rows [row_start, row_end), the list of models
+  // currently open at each row.
+  // A model is still open in a row if some previous row has said model as a
+  // start hypothesis, and all rows since (including this row) would fit as
+  // either a body or start line in that model.
+  void CalculateOpenModels(int row_start, int row_end);
+
+  // Open models for the given row. Slot 0 of open_models_ belongs to row
+  // row_start_ - 1, hence the offset of one.
+  SetOfModels &OpenModels(int row) {
+    const int slot = row - row_start_ + 1;
+    return open_models_[slot];
+  }
+
+  ParagraphTheory *theory_;
+  std::vector<RowScratchRegisters> *rows_;
+  int row_start_;
+  int row_end_;
+
+  // open_models_ corresponds to rows[row_start_ - 1, row_end_]
+  //
+  // open_models_:  Contains models for which there was an active (open)
+  //                paragraph as of the previous line and for which the left
+  //                and right indents admit the possibility that this text
+  //                line continues to fit the same model.
+  // TODO(eger): Think about whether we can get rid of "Open" models and just
+  //   use the current hypotheses on RowScratchRegisters.
+  std::vector<SetOfModels> open_models_;
+};
+
+// Clear all hypotheses about lines [start, end) and reset the margins to the
+// percentile (0..100) value of the left and right row edges for this run of
+// rows.
+void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
+                                        int end, int percentile);
+
+// Return the median inter-word space in rows[row_start, row_end).
+int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
+                           tesseract::ParagraphJustification justification);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing the text alignment).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
+                  const ParagraphModel *model);
+
+// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.cpp
@ -0,0 +1,358 @@
+///////////////////////////////////////////////////////////////////////
+// File:        paramsd.cpp
+// Description: Tesseract parameter Editor
+// Author:      Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// The parameters editor is used to edit all the parameters used within
+// tesseract from the ui.
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#ifndef GRAPHICS_DISABLED
+
+#  include "params.h" // for ParamsVectors, StringParam, BoolParam
+#  include "paramsd.h"
+#  include "scrollview.h"     // for SVEvent, ScrollView, SVET_POPUP
+#  include "svmnode.h"        // for SVMenuNode
+#  include "tesseractclass.h" // for Tesseract
+
+#  include <cstdio>  // for fclose, fopen, fprintf, sprintf, FILE
+#  include <cstdlib> // for atoi
+#  include <cstring> // for strcmp, strcspn, strlen, strncpy
+#  include <locale>  // for std::locale::classic
+#  include <map>     // for map, _Rb_tree_iterator, map<>::iterator
+#  include <memory>  // for unique_ptr
+#  include <sstream> // for std::stringstream
+#  include <utility> // for pair
+
+namespace tesseract {
+
+// Sub-directory (relative to the Tesseract data directory) that the
+// ParamsEditor constructor uses to build the default config file name.
+#  define VARDIR "configs/" /*parameters files */
+// Threshold used by BuildListOfAllLeaves() when deciding whether the entries
+// of a prefix are put directly in its submenu or split into sub-submenus.
+#  define MAX_ITEMS_IN_SUBMENU 30
+
+// The following variables should remain static globals, since they
+// are used by debug editor, which uses a single Tesseract instance.
+//
+// Contains the mappings from unique VC ids to their actual pointers.
+// Every ParamContent constructor registers the new object here.
+static std::map<int, ParamContent *> vcMap;
+// Number of ParamContent objects created so far; also the id handed to the
+// next one that is constructed.
+static int nrParams = 0;
+// Command ids of the two "Build Config File" menu entries, assigned in the
+// ParamsEditor constructor: [0] = "All Parameters",
+// [1] = "changed_ Parameters Only".
+static int writeCommands[2];
+
+// Constructors for the various ParamTypes.
+//
+// Each constructor takes the next free id from nrParams (post-incrementing
+// the counter), records which kind of parameter is wrapped, stores the
+// parameter pointer in the matching union member and finally registers the
+// new object in vcMap under its id.
+
+// Wraps a string parameter.
+ParamContent::ParamContent(tesseract::StringParam *it)
+    : my_id_(nrParams++), param_type_(VT_STRING), sIt(it) {
+  vcMap[my_id_] = this;
+}
+
+// Wraps an integer parameter.
+ParamContent::ParamContent(tesseract::IntParam *it)
+    : my_id_(nrParams++), param_type_(VT_INTEGER), iIt(it) {
+  vcMap[my_id_] = this;
+}
+
+// Wraps a boolean parameter.
+ParamContent::ParamContent(tesseract::BoolParam *it)
+    : my_id_(nrParams++), param_type_(VT_BOOLEAN), bIt(it) {
+  vcMap[my_id_] = this;
+}
+
+// Wraps a double parameter.
+ParamContent::ParamContent(tesseract::DoubleParam *it)
+    : my_id_(nrParams++), param_type_(VT_DOUBLE), dIt(it) {
+  vcMap[my_id_] = this;
+}
+
+// Gets a VC object identified by its ID.
+// Returns nullptr when no ParamContent is registered under id.
+// The lookup uses find() instead of operator[] so that asking for an unknown
+// id does not insert a null entry into vcMap; WriteParams() iterates over
+// every entry of vcMap and dereferences it.
+ParamContent *ParamContent::GetParamContentById(int id) {
+  auto found = vcMap.find(id);
+  return (found == vcMap.end()) ? nullptr : found->second;
+}
+
+// Copy the first N words from the source string to the target string.
+// Words are delimited by "_". The "_" that ends each copied word is included
+// in the output, e.g. "tesseract_foo_bar" with n == 2 gives "tesseract_foo_".
+// If s has n or fewer words, the whole of s is copied.
+//   s - source string (NUL-terminated).
+//   n - number of words to copy; n <= 0 yields an empty string.
+//   t - target buffer; must have room for strlen(s) + 1 characters.
+void ParamsEditor::GetFirstWords(const char *s, // source string
+                                 int n,         // number of words
+                                 char *t        // target string
+) {
+  const int full_length = strlen(s);
+  int reqd_len = 0; // No. of chars required
+
+  while ((n > 0) && reqd_len < full_length) {
+    // The next word starts right after the characters counted so far. The
+    // loop condition guarantees that s + reqd_len is still inside the string.
+    reqd_len += strcspn(s + reqd_len, "_") + 1;
+    n--;
+  }
+  // The "+ 1" above also counts the terminating NUL when the last word has no
+  // trailing "_"; never copy or terminate beyond the end of the source.
+  if (reqd_len > full_length) {
+    reqd_len = full_length;
+  }
+  strncpy(t, s, reqd_len);
+  t[reqd_len] = '\0'; // ensure null terminal
+}
+
+// Getter for the name.
+// Dispatches on the stored parameter type and forwards to name_str() of the
+// wrapped parameter; an unrecognised type yields a fixed error string.
+const char *ParamContent::GetName() const {
+  switch (param_type_) {
+    case VT_INTEGER:
+      return iIt->name_str();
+    case VT_BOOLEAN:
+      return bIt->name_str();
+    case VT_DOUBLE:
+      return dIt->name_str();
+    case VT_STRING:
+      return sIt->name_str();
+    default:
+      break;
+  }
+  return "ERROR: ParamContent::GetName()";
+}
+
+// Getter for the description.
+// Dispatches on the stored parameter type and forwards to info_str() of the
+// wrapped parameter; an unrecognised type yields nullptr.
+const char *ParamContent::GetDescription() const {
+  switch (param_type_) {
+    case VT_INTEGER:
+      return iIt->info_str();
+    case VT_BOOLEAN:
+      return bIt->info_str();
+    case VT_DOUBLE:
+      return dIt->info_str();
+    case VT_STRING:
+      return sIt->info_str();
+    default:
+      break;
+  }
+  return nullptr;
+}
+
+// Getter for the value.
+// Returns the current value of the wrapped parameter as text: numeric and
+// boolean parameters go through std::to_string, string parameters are copied
+// from c_str(). An unrecognised type yields an empty string.
+std::string ParamContent::GetValue() const {
+  std::string text;
+  switch (param_type_) {
+    case VT_INTEGER:
+      text += std::to_string(*iIt);
+      break;
+    case VT_BOOLEAN:
+      text += std::to_string(*bIt);
+      break;
+    case VT_DOUBLE:
+      text += std::to_string(*dIt);
+      break;
+    case VT_STRING:
+      text = sIt->c_str();
+      break;
+    default:
+      break;
+  }
+  return text;
+}
+
+// Setter for the value.
+// Parses val according to the stored parameter type, stores it in the wrapped
+// parameter and marks this object as changed (the flag is set even when the
+// type is not recognised).
+void ParamContent::SetValue(const char *val) {
+  // TODO (wanke) Test if the values actually are properly converted.
+  // (Quickly visible impacts?)
+  changed_ = true;
+  switch (param_type_) {
+    case VT_INTEGER:
+      iIt->set_value(atoi(val));
+      break;
+    case VT_BOOLEAN:
+      bIt->set_value(atoi(val));
+      break;
+    case VT_DOUBLE: {
+      std::stringstream parser(val);
+      // Use "C" locale for reading double value.
+      parser.imbue(std::locale::classic());
+      double parsed = 0;
+      parser >> parsed;
+      dIt->set_value(parsed);
+      break;
+    }
+    case VT_STRING:
+      sIt->set_value(val);
+      break;
+    default:
+      break;
+  }
+}
+
+// Gets up to the first 3 prefixes of s (words are split by _).
+// level_one, level_two and level_three receive what GetFirstWords() produces
+// for one, two and three words respectively, so each level extends the
+// previous one instead of holding a single word.
+void ParamsEditor::GetPrefixes(const char *s, std::string *level_one, std::string *level_two,
+                               std::string *level_three) {
+  std::unique_ptr<char[]> scratch(new char[1024]);
+  std::string *const levels[] = {level_one, level_two, level_three};
+  for (int depth = 0; depth < 3; ++depth) {
+    // depth + 1 words go into the scratch buffer, then into the output.
+    GetFirstWords(s, depth + 1, scratch.get());
+    *levels[depth] = scratch.get();
+  }
+}
+
+// Compare two VC objects by their name.
+// v1 and v2 each point at a ParamContent pointer; the result is the strcmp()
+// ordering of the two parameter names.
+int ParamContent::Compare(const void *v1, const void *v2) {
+  const ParamContent *const *lhs_slot = static_cast<const ParamContent *const *>(v1);
+  const ParamContent *const *rhs_slot = static_cast<const ParamContent *const *>(v2);
+  const char *lhs_name = (*lhs_slot)->GetName();
+  const char *rhs_name = (*rhs_slot)->GetName();
+  return strcmp(lhs_name, rhs_name);
+}
+
+// Find all editable parameters used within tesseract and create a
+// SVMenuNode tree from it.
+// Collects the int, bool, string and double parameters of GlobalParams() and,
+// when tess->params() is not null, of tess as well, wraps each one in a
+// ParamContent and groups the menu entries by name prefix.
+// Returns the root node, which is allocated with new and handed to the caller.
+// TODO (wanke): This is actually sort of hackish.
+// NOTE(review): vclist is a local list. If ParamContent_LIST deletes its
+// elements on destruction, the pointers registered in vcMap would dangle once
+// this function returns - confirm against the ELIST implementation.
+SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
+  auto *mr = new SVMenuNode();
+  ParamContent_LIST vclist;
+  ParamContent_IT vc_it(&vclist);
+  // Amount counts the number of entries for a specific prefix.
+  // The key is the prefix text itself. Keying by const char * would compare
+  // the addresses of the temporary tag buffers instead of their contents, so
+  // the counts would not reflect how many parameters share a prefix.
+  // TODO(rays) get rid of the use of std::map.
+  std::map<std::string, int> amount;
+
+  // Add all parameters to a list.
+  int num_iterations = (tess->params() == nullptr) ? 1 : 2;
+  for (int v = 0; v < num_iterations; ++v) {
+    tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
+    for (auto &param : vec->int_params) {
+      vc_it.add_after_then_move(new ParamContent(param));
+    }
+    for (auto &param : vec->bool_params) {
+      vc_it.add_after_then_move(new ParamContent(param));
+    }
+    for (auto &param : vec->string_params) {
+      vc_it.add_after_then_move(new ParamContent(param));
+    }
+    for (auto &param : vec->double_params) {
+      vc_it.add_after_then_move(new ParamContent(param));
+    }
+  }
+
+  // Count the # of entries starting with a specific prefix.
+  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
+    ParamContent *vc = vc_it.data();
+    std::string tag;
+    std::string tag2;
+    std::string tag3;
+
+    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
+    amount[tag]++;
+    amount[tag2]++;
+    amount[tag3]++;
+  }
+
+  vclist.sort(ParamContent::Compare); // Sort the list alphabetically.
+
+  SVMenuNode *other = mr->AddChild("OTHER");
+
+  // go through the list again and this time create the menu structure.
+  vc_it.move_to_first();
+  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
+    ParamContent *vc = vc_it.data();
+    std::string tag;
+    std::string tag2;
+    std::string tag3;
+    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
+
+    const int tag_count = amount[tag];
+    if (tag_count == 1) {
+      // The only parameter with this first-level prefix goes under "OTHER".
+      other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
+    } else { // More than one would use this submenu -> create submenu.
+      SVMenuNode *sv = mr->AddChild(tag.c_str());
+      if ((tag_count <= MAX_ITEMS_IN_SUBMENU) || (amount[tag2] <= 1)) {
+        sv->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
+      } else { // Make subsubmenus.
+        SVMenuNode *sv2 = sv->AddChild(tag2.c_str());
+        sv2->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
+      }
+    }
+  }
+  return mr;
+}
+
+// Event listener. Waits for SVET_POPUP events and processes them.
+// The command id selects the action: the two ids stored in writeCommands
+// write a config file (all parameters / changed parameters only) to the file
+// named by the event parameter; any other id is looked up as a ParamContent
+// id and the event parameter is stored as its new value. Events of any other
+// type are ignored.
+void ParamsEditor::Notify(const SVEvent *sve) {
+  if (sve->type != SVET_POPUP) { // only catch SVET_POPUP!
+    return;
+  }
+  char *param = sve->parameter;
+  if (sve->command_id == writeCommands[0]) {
+    WriteParams(param, false);
+  } else if (sve->command_id == writeCommands[1]) {
+    WriteParams(param, true);
+  } else {
+    ParamContent *vc = ParamContent::GetParamContentById(sve->command_id);
+    if (vc == nullptr) {
+      // No parameter is registered under this id; report it instead of
+      // dereferencing a null pointer.
+      sv_window_->AddMessage("Unknown parameter id %d", static_cast<int>(sve->command_id));
+      return;
+    }
+    vc->SetValue(param);
+    sv_window_->AddMessage("Setting %s to %s", vc->GetName(), vc->GetValue().c_str());
+  }
+}
+
+// Integrate the parameters editor as popupmenu into the existing scrollview
+// window (usually the pg editor). If sv == null, create a new empty
+// window and attach the parameters editor to that window (ugly).
+// The menu consists of the parameter tree from BuildListOfAllLeaves() plus a
+// "Build Config File" submenu whose two entries get the command ids stored in
+// writeCommands.
+ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {
+  ScrollView *window = sv;
+  if (window == nullptr) {
+    const char *window_name = "ParamEditorMAIN";
+    window = new ScrollView(window_name, 1, 1, 200, 200, 300, 200);
+  }
+  sv_window_ = window;
+
+  // Only one event handler per window.
+  // sv->AddEventHandler((SVEventHandler*) this);
+
+  // Building the tree creates the ParamContent objects, so nrParams is final
+  // before the write command ids are derived from it below.
+  SVMenuNode *menu_root = BuildListOfAllLeaves(tess);
+
+  // Default config file name: <datadir> + parameters dir + actual name.
+  std::string default_file;
+  default_file = tess->datadir;
+  default_file += VARDIR "edited";
+
+  SVMenuNode *write_menu = menu_root->AddChild("Build Config File");
+
+  writeCommands[0] = nrParams + 1;
+  write_menu->AddChild("All Parameters", writeCommands[0], default_file.c_str(),
+                       "Config file name?");
+
+  writeCommands[1] = nrParams + 2;
+  write_menu->AddChild("changed_ Parameters Only", writeCommands[1], default_file.c_str(),
+                       "Config file name?");
+
+  menu_root->BuildMenu(window, false);
+}
+
+// Write all (changed_) parameters to a config file.
+//   filename     - path of the config file to write.
+//   changes_only - when true, only parameters whose HasChanged() is true are
+//                  written; otherwise every registered parameter is written.
+// If the file already exists the user is asked for confirmation first, and
+// nothing is written when the dialog returns 'n'. A message is shown in the
+// window when the file cannot be opened for writing.
+void ParamsEditor::WriteParams(char *filename, bool changes_only) {
+  FILE *fp; // input file
+  char msg_str[255];
+  // if file exists
+  if ((fp = fopen(filename, "rb")) != nullptr) {
+    fclose(fp);
+    // filename comes from a user dialog and can be arbitrarily long, so the
+    // prompt is truncated to the buffer size instead of overflowing it.
+    snprintf(msg_str, sizeof(msg_str),
+             "Overwrite file "
+             "%s"
+             "? (Y/N)",
+             filename);
+    int a = sv_window_->ShowYesNoDialog(msg_str);
+    if (a == 'n') {
+      return;
+    } // don't write
+  }
+
+  fp = fopen(filename, "wb"); // can we write to it?
+  if (fp == nullptr) {
+    sv_window_->AddMessage(
+        "Can't write to file "
+        "%s"
+        "",
+        filename);
+    return;
+  }
+  for (auto &iter : vcMap) {
+    ParamContent *cur = iter.second;
+    if (cur == nullptr) {
+      continue; // Skip ids that have no parameter attached.
+    }
+    if (!changes_only || cur->HasChanged()) {
+      // GetDescription() may return nullptr, which must not reach "%s".
+      const char *description = cur->GetDescription();
+      fprintf(fp, "%-25s   %-12s   # %s\n", cur->GetName(), cur->GetValue().c_str(),
+              description != nullptr ? description : "");
+    }
+  }
+  fclose(fp);
+}
+
+} // namespace tesseract
+
+#endif // !GRAPHICS_DISABLED
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.h
@ -0,0 +1,130 @@
+///////////////////////////////////////////////////////////////////////
+// File:        paramsd.h
+// Description: Tesseract parameter editor
+// Author:      Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// Tesseract parameter editor is used to edit all the parameters used
+// within tesseract from the ui.
+#ifndef TESSERACT_CCMAIN_PARAMSD_H_
+#define TESSERACT_CCMAIN_PARAMSD_H_
+
+#ifndef GRAPHICS_DISABLED
+
+#  include "elst.h"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
+#  include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
+
+namespace tesseract {
+
+class SVMenuNode;
+
+class BoolParam;
+class DoubleParam;
+class IntParam;
+class StringParam;
+class Tesseract;
+
+// A list of all possible parameter types used.
+// ParamContent stores one of these values to record which member of its
+// pointer union is the valid one.
+enum ParamType { VT_INTEGER, VT_BOOLEAN, VT_STRING, VT_DOUBLE };
+
+// A rather hackish helper structure which can take any kind of parameter input
+// (defined by ParamType) and do a couple of common operations on them, like
+// comparison or getting its value. It is used in the context of the
+// ParamsEditor as a bridge from the internal tesseract parameters to the
+// ones displayed by the ScrollView server.
+// The object only stores a pointer to the wrapped parameter.
+class ParamContent : public ELIST_LINK {
+public:
+  // Compare two VC objects by their name.
+  // v1 and v2 each point at a ParamContent pointer.
+  static int Compare(const void *v1, const void *v2);
+
+  // Gets a VC object identified by its ID.
+  static ParamContent *GetParamContentById(int id);
+
+  // Constructors for the various ParamTypes.
+  // The defaulted constructor leaves my_id_, param_type_ and the parameter
+  // pointer uninitialized; the other constructors assign an id and register
+  // the object so that GetParamContentById() can find it.
+  ParamContent() = default;
+  explicit ParamContent(tesseract::StringParam *it);
+  explicit ParamContent(tesseract::IntParam *it);
+  explicit ParamContent(tesseract::BoolParam *it);
+  explicit ParamContent(tesseract::DoubleParam *it);
+
+  // Getters and Setters.
+  // SetValue parses val according to the parameter type and marks the object
+  // as changed; GetValue returns the current value formatted as text.
+  void SetValue(const char *val);
+  std::string GetValue() const;
+  const char *GetName() const;
+  const char *GetDescription() const;
+
+  // Returns the unique id of this object.
+  int GetId() const {
+    return my_id_;
+  }
+  // Returns whether SetValue() has been called on this object.
+  bool HasChanged() const {
+    return changed_;
+  }
+
+private:
+  // The unique ID of this VC object.
+  int my_id_;
+  // Whether the parameter was changed_ and thus needs to be rewritten.
+  bool changed_ = false;
+  // The actual ParamType of this VC object.
+  ParamType param_type_;
+
+  // Pointer to the wrapped parameter; param_type_ tells which member is valid.
+  union {
+    tesseract::StringParam *sIt;
+    tesseract::IntParam *iIt;
+    tesseract::BoolParam *bIt;
+    tesseract::DoubleParam *dIt;
+  };
+};
+
+ELISTIZEH(ParamContent)
+
+// The parameters editor enables the user to edit all the parameters used within
+// tesseract. It can be invoked on its own, but is supposed to be invoked by
+// the program editor.
+class ParamsEditor : public SVEventHandler {
+public:
+  // Integrate the parameters editor as popupmenu into the existing scrollview
+  // window (usually the pg editor). If sv == null, create a new empty
+  // window and attach the parameter editor to that window (ugly).
+  explicit ParamsEditor(tesseract::Tesseract *, ScrollView *sv = nullptr);
+
+  // Event listener. Waits for SVET_POPUP events and processes them.
+  void Notify(const SVEvent *sve) override;
+
+private:
+  // Gets up to the first 3 prefixes of s (words are split by _).
+  // Each level is the result of GetFirstWords() for 1, 2 and 3 words, so the
+  // levels extend each other; for tesseract_foo_bar the intended results are
+  // tesseract_, tesseract_foo_ and tesseract_foo_bar.
+  void GetPrefixes(const char *s, std::string *level_one, std::string *level_two, std::string *level_three);
+
+  // Gets the first n words (split by _) and puts them in t.
+  // For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
+  // t must be large enough for the copied characters and a terminating NUL.
+  void GetFirstWords(const char *s, // source string
+                     int n,         // number of words
+                     char *t);      // target string
+
+  // Find all editable parameters used within tesseract and create a
+  // SVMenuNode tree from it.
+  SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
+
+  // Write all (changed_) parameters to a config file.
+  // With changes_only set, only parameters modified through the editor are
+  // written.
+  void WriteParams(char *filename, bool changes_only);
+
+  // The window the editor is attached to; also used for messages and dialogs.
+  ScrollView *sv_window_;
+};
+
+} // namespace tesseract
+
+#endif // !GRAPHICS_DISABLED
+#endif // TESSERACT_CCMAIN_PARAMSD_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.cpp
@ -0,0 +1,958 @@
+/**********************************************************************
+ * File:        pgedit.cpp (Formerly pgeditor.c)
+ * Description: Page structure file editor
+ * Author:      Phil Cheatle
+ *
+ *(C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0(the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http:// www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "pgedit.h"
+
+#include "blread.h"
+#include "control.h"
+#include "pageres.h"
+#include "paramsd.h"
+#include "scrollview.h"
+#include "statistc.h"
+#include "svmnode.h"
+#include "tesseractclass.h"
+#include "tordmain.h"
+#include "werdit.h"
+
+#include <cctype>
+#include <cmath>
+
+#ifndef GRAPHICS_DISABLED
+namespace tesseract {
+// Heights of the four horizontal guide lines drawn by display_bln_lines(),
+// built from kBlnBaselineOffset and kBlnXHeight (defined elsewhere).
+// Ascender limit.
+#  define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
+// x-height line.
+#  define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
+// Baseline.
+#  define BL_HEIGHT kBlnBaselineOffset
+// Descender limit.
+#  define DESC_HEIGHT 0
+
+// Command ids of the editor menu. build_menu_new() attaches them to the menu
+// entries and process_cmd_win_event() dispatches on them. The enumerators
+// have implicit consecutive values starting at 0.
+enum CMD_EVENTS {
+  NULL_CMD_EVENT, // Ignored by process_cmd_win_event().
+  CHANGE_DISP_CMD_EVENT,
+  DUMP_WERD_CMD_EVENT,
+  SHOW_POINT_CMD_EVENT,
+  SHOW_BLN_WERD_CMD_EVENT,
+  DEBUG_WERD_CMD_EVENT,
+  // Toggles of the "DISPLAY" menu.
+  BLAMER_CMD_EVENT,
+  BOUNDING_BOX_CMD_EVENT,
+  CORRECT_TEXT_CMD_EVENT,
+  POLYGONAL_CMD_EVENT,
+  BL_NORM_CMD_EVENT,
+  BITMAP_CMD_EVENT,
+  // Entries of the "OTHER" menu.
+  IMAGE_CMD_EVENT,
+  BLOCKS_CMD_EVENT,
+  BASELINES_CMD_EVENT,
+  UNIFORM_DISP_CMD_EVENT,
+  REFRESH_CMD_EVENT,
+  QUIT_CMD_EVENT,
+  // Further entries of the "MODES" menu.
+  RECOG_WERDS,
+  RECOG_PSEUDO,
+  SHOW_BLOB_FEATURES,
+  // "DISPLAY" menu entries that select a ColorationMode.
+  SHOW_SUBSCRIPT_CMD_EVENT,
+  SHOW_SUPERSCRIPT_CMD_EVENT,
+  SHOW_ITALIC_CMD_EVENT,
+  SHOW_BOLD_CMD_EVENT,
+  SHOW_UNDERLINE_CMD_EVENT,
+  SHOW_FIXEDPITCH_CMD_EVENT,
+  SHOW_SERIF_CMD_EVENT,
+  SHOW_SMALLCAPS_CMD_EVENT,
+  SHOW_DROPCAPS_CMD_EVENT,
+};
+
+// Colouring scheme stored in color_mode. process_cmd_win_event() resets it to
+// CM_RAINBOW on every command and switches to one of the other values for
+// the corresponding SHOW_*_CMD_EVENT.
+enum ColorationMode {
+  CM_RAINBOW, // Initial value of color_mode.
+  CM_SUBSCRIPT,
+  CM_SUPERSCRIPT,
+  CM_ITALIC,
+  CM_BOLD,
+  CM_UNDERLINE,
+  CM_FIXEDPITCH,
+  CM_SERIF,
+  CM_SMALLCAPS,
+  CM_DROPCAPS
+};
+
+/*
+ *
+ *  Some global data
+ *
+ */
+
+// Main editor window; (re)created by build_image_window().
+static ScrollView *image_win;
+// Parameters editor attached to image_win; created in pgeditor_main() and
+// used by PGEventHandler::Notify() to handle SVET_POPUP events.
+static ParamsEditor *pe;
+// Set to true by pgeditor_main() and to false when an SVET_EXIT event arrives.
+static bool stillRunning = false;
+
+// Created on demand by bln_word_window_handle(); reset to nullptr when the
+// window reports SVET_DESTROY.
+static ScrollView *bln_word_window = nullptr; // baseline norm words
+
+// Updated by process_cmd_win_event() from the selected menu command.
+static CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op
+
+// Reset by pgeditor_main(); set once process_cmd_win_event() has run
+// recog_all_words() on the current page.
+static bool recog_done = false; // recog_all_words was called
+
+// These variables should remain global, since they are only used for the
+// debug mode (in which only a single Tesseract thread/instance will exist).
+// Bit set of display flags, indexed by the DF_* constants.
+static std::bitset<16> word_display_mode;
+static ColorationMode color_mode = CM_RAINBOW;
+// Toggled by the IMAGE, BLOCKS and BASELINES commands and read by
+// do_re_display().
+static bool display_image = false;
+static bool display_blocks = false;
+static bool display_baselines = false;
+
+// Page being edited; set at the start of pgeditor_main().
+static PAGE_RES *current_page_res = nullptr;
+
+// Name, position and colours of the editor image window.
+STRING_VAR(editor_image_win_name, "EditorImage", "Editor image window name");
+INT_VAR(editor_image_xpos, 590, "Editor image X Pos");
+INT_VAR(editor_image_ypos, 10, "Editor image Y Pos");
+static INT_VAR(editor_image_menuheight, 50, "Add to image height for menu bar");
+INT_VAR(editor_image_word_bb_color, ScrollView::BLUE, "Word bounding box colour");
+INT_VAR(editor_image_blob_bb_color, ScrollView::YELLOW, "Blob bounding box colour");
+INT_VAR(editor_image_text_color, ScrollView::WHITE, "Correct text colour");
+
+// Name, position and size of the editor debug window.
+STRING_VAR(editor_dbwin_name, "EditorDBWin", "Editor debug window name");
+INT_VAR(editor_dbwin_xpos, 50, "Editor debug window X Pos");
+INT_VAR(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
+INT_VAR(editor_dbwin_height, 24, "Editor debug window height");
+INT_VAR(editor_dbwin_width, 80, "Editor debug window width");
+
+// Name, position and size of the baseline-normalized word window opened by
+// bln_word_window_handle().
+STRING_VAR(editor_word_name, "BlnWords", "BL normalized word window");
+INT_VAR(editor_word_xpos, 60, "Word window X Pos");
+INT_VAR(editor_word_ypos, 510, "Word window Y Pos");
+INT_VAR(editor_word_height, 240, "Word window height");
+INT_VAR(editor_word_width, 655, "Word window width");
+
+/**
+ * show_point()
+ *
+ * Show coords of point, blob bounding box, word bounding box and offset from
+ * row baseline
+ *
+ * Walks over every word of page_res and appends, for each row, word and blob
+ * whose bounding box contains (x, y), a short description to one message that
+ * is finally shown in the image window. The message is limited to the size of
+ * the local buffer; text that does not fit is dropped instead of being
+ * written past the end of the buffer.
+ */
+
+static void show_point(PAGE_RES *page_res, float x, float y) {
+  FCOORD pt(x, y);
+  PAGE_RES_IT pr_it(page_res);
+
+  const int kBufsize = 512;
+  char msg[kBufsize];
+  // Number of characters the message needs so far. snprintf() returns the
+  // untruncated length, so a value >= kBufsize means the buffer is full and
+  // all later appends are skipped.
+  int used = snprintf(msg, kBufsize, "Pt:(%0.3f, %0.3f) ", x, y);
+
+  for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
+    if (pr_it.row() != pr_it.prev_row() && pr_it.row()->row->bounding_box().contains(pt)) {
+      if (used < kBufsize) {
+        used += snprintf(msg + used, kBufsize - used, "BL(x)=%0.3f ",
+                         pr_it.row()->row->base_line(x));
+      }
+    }
+    if (word->word->bounding_box().contains(pt)) {
+      TBOX box = word->word->bounding_box();
+      if (used < kBufsize) {
+        used += snprintf(msg + used, kBufsize - used, "Wd(%d, %d)/(%d, %d) ", box.left(),
+                         box.bottom(), box.right(), box.top());
+      }
+      C_BLOB_IT cblob_it(word->word->cblob_list());
+      for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
+        C_BLOB *cblob = cblob_it.data();
+        box = cblob->bounding_box();
+        if (box.contains(pt) && used < kBufsize) {
+          used += snprintf(msg + used, kBufsize - used, "CBlb(%d, %d)/(%d, %d) ", box.left(),
+                           box.bottom(), box.right(), box.top());
+        }
+      }
+    }
+  }
+  // AddMessage() takes a printf-style format, so pass the text as an argument.
+  image_win->AddMessage("%s", msg);
+}
+
+/**
+ * pgeditor_msg()
+ *
+ * Display a message in the image window.
+ *
+ * msg is shown verbatim: it is passed as an argument to a "%s" format, so
+ * any '%' characters in it are not interpreted by AddMessage().
+ */
+
+static void pgeditor_msg( // message display
+    const char *msg) {
+  image_win->AddMessage("%s", msg);
+}
+
+// Event handler of the baseline-normalized word window: forgets the window
+// when it is destroyed and reports the clicked point on a click. All other
+// events are ignored.
+class BlnEventHandler : public SVEventHandler {
+public:
+  void Notify(const SVEvent *sv_event) override {
+    switch (sv_event->type) {
+      case SVET_DESTROY:
+        bln_word_window = nullptr;
+        break;
+      case SVET_CLICK:
+        show_point(current_page_res, sv_event->x, sv_event->y);
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+/**
+ *  bln_word_window_handle()
+ *
+ *  The window is created on the first call, together with a BlnEventHandler
+ *  that is attached to it; later calls return the existing window.
+ *
+ *  @return a WINDOW for the word window, creating it if necessary
+ */
+static ScrollView *bln_word_window_handle() {
+  // Already opened: nothing to do.
+  if (bln_word_window != nullptr) {
+    return bln_word_window;
+  }
+
+  pgeditor_msg("Creating BLN word window...");
+  bln_word_window = new ScrollView(editor_word_name.c_str(), editor_word_xpos, editor_word_ypos,
+                                   editor_word_width, editor_word_height, 4000, 4000, true);
+  bln_word_window->AddEventHandler(new BlnEventHandler());
+  pgeditor_msg("Creating BLN word window...Done");
+  return bln_word_window;
+}
+
+/**
+ *  build_image_window()
+ *
+ *  Destroy the existing image window if there is one.  Work out how big the
+ *  new window needs to be. Create it and re-display.
+ *
+ *  The window is one pixel larger than the image in each direction and has
+ *  editor_image_menuheight added to its height for the menu bar.
+ */
+
+static void build_image_window(int width, int height) {
+  delete image_win;
+
+  const int window_width = width + 1;
+  const int window_height = height + editor_image_menuheight + 1;
+  image_win = new ScrollView(editor_image_win_name.c_str(), editor_image_xpos, editor_image_ypos,
+                             window_width, window_height, width, height, true);
+}
+
+/**
+ *  display_bln_lines()
+ *
+ *  Display normalized baseline, x-height, ascender limit and descender limit
+ *
+ *  Draws four horizontal lines from minx to maxx in the given colour; each
+ *  line sits at y_offset plus the scaled reference height.
+ */
+
+static void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor,
+                              float y_offset, float minx, float maxx) {
+  const auto descender_y = y_offset + scale_factor * DESC_HEIGHT;
+  const auto baseline_y = y_offset + scale_factor * BL_HEIGHT;
+  const auto xheight_y = y_offset + scale_factor * X_HEIGHT;
+  const auto ascender_y = y_offset + scale_factor * ASC_HEIGHT;
+
+  window->Pen(colour);
+  window->Line(minx, descender_y, maxx, descender_y);
+  window->Line(minx, baseline_y, maxx, baseline_y);
+  window->Line(minx, xheight_y, maxx, xheight_y);
+  window->Line(minx, ascender_y, maxx, ascender_y);
+}
+
+/**
+ *  notify()
+ *
+ *  Event handler that processes incoming events:
+ *  popup events go to the ParamsEditor, an exit event clears stillRunning,
+ *  menu events go to process_cmd_win_event and everything else goes to
+ *  process_image_event.
+ *
+ */
+
+void PGEventHandler::Notify(const SVEvent *event) {
+  switch (event->type) {
+    case SVET_POPUP:
+      // These are handled by ParamsEditor
+      pe->Notify(event);
+      break;
+    case SVET_EXIT:
+      stillRunning = false;
+      break;
+    case SVET_MENU: {
+      // Translate the textual toggle state into the one-character code
+      // expected by process_cmd_win_event: 'T', 'F' or '0' for anything else.
+      char toggle_state = '0';
+      if (strcmp(event->parameter, "true") == 0) {
+        toggle_state = 'T';
+      } else if (strcmp(event->parameter, "false") == 0) {
+        toggle_state = 'F';
+      }
+      tess_->process_cmd_win_event(event->command_id, &toggle_state);
+      break;
+    }
+    default:
+      tess_->process_image_event(*event);
+      break;
+  }
+}
+
+/**
+ *  build_menu()
+ *
+ *  Construct the menu tree used by the command window
+ *
+ *  The tree has three submenus - "MODES", "DISPLAY" and "OTHER" - whose
+ *  entries carry CMD_EVENTS values as command ids. The root node is allocated
+ *  with new and returned to the caller.
+ */
+SVMenuNode *Tesseract::build_menu_new() {
+  auto *menu_root = new SVMenuNode();
+
+  // Selection modes.
+  SVMenuNode *modes_menu = menu_root->AddChild("MODES");
+  modes_menu->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
+  modes_menu->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
+  modes_menu->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
+  modes_menu->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
+  modes_menu->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
+  modes_menu->AddChild("Recog Words", RECOG_WERDS);
+  modes_menu->AddChild("Recog Blobs", RECOG_PSEUDO);
+  modes_menu->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
+
+  // Display options; the boolean argument is passed on to AddChild unchanged.
+  SVMenuNode *display_menu = menu_root->AddChild("DISPLAY");
+  display_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
+  display_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
+  display_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
+  display_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
+  display_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
+  display_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
+  display_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
+  display_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
+  display_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
+  display_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
+  display_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
+  display_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
+  display_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
+  display_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
+  display_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
+
+  // Everything else.
+  SVMenuNode *other_menu = menu_root->AddChild("OTHER");
+  other_menu->AddChild("Quit", QUIT_CMD_EVENT);
+  other_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
+  other_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
+  other_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
+  other_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
+  other_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
+
+  return menu_root;
+}
+
+/**
+ *  do_re_display()
+ *
+ *  Redisplay page
+ *
+ *  Clears the image window, optionally draws the binary image, then calls
+ *  word_painter for every word of the current page. Baselines and block
+ *  outlines are plotted for each new row / block when the corresponding
+ *  display flag is set.
+ */
+void Tesseract::do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) {
+  image_win->Clear();
+  if (display_image) {
+    image_win->Draw(pix_binary_, 0, 0);
+  }
+  image_win->Brush(ScrollView::NONE);
+
+  // Number passed to the next block outline that gets plotted.
+  int next_block_number = 1;
+  PAGE_RES_IT page_it(current_page_res);
+  WERD_RES *word = page_it.word();
+  while (word != nullptr) {
+    (this->*word_painter)(&page_it);
+    if (display_baselines && page_it.row() != page_it.prev_row()) {
+      page_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
+    }
+    if (display_blocks && page_it.block() != page_it.prev_block()) {
+      page_it.block()->block->pdblk.plot(image_win, next_block_number, ScrollView::RED);
+      ++next_block_number;
+    }
+    word = page_it.forward();
+  }
+  image_win->Update();
+}
+
+/**
+ *  pgeditor_main()
+ *
+ *  Top level editor operation:
+ *  Setup a new window and an according event handler
+ *
+ *  width and height are the dimensions passed to build_image_window();
+ *  page_res becomes the page shown by the editor. The function returns
+ *  immediately when the page has no blocks; otherwise it returns after
+ *  AwaitEvent(SVET_DESTROY) on the image window has returned.
+ *
+ */
+
+void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
+  current_page_res = page_res;
+  if (current_page_res->block_res_list.empty()) {
+    return;
+  }
+
+  // Fresh session: nothing recognized yet, editor running.
+  recog_done = false;
+  stillRunning = true;
+
+  build_image_window(width, height);
+  word_display_mode.set(DF_EDGE_STEP);
+  do_re_display(&tesseract::Tesseract::word_set_display);
+  // This whole region is already compiled only when GRAPHICS_DISABLED is not
+  // defined, so the inner guard below is redundant.
+  // NOTE(review): a new ParamsEditor is allocated on every call and the
+  // previous value of pe is not released here - confirm whether that is
+  // intended.
+#  ifndef GRAPHICS_DISABLED
+  pe = new ParamsEditor(this, image_win);
+#  endif
+  // The handler lives on the stack of this function; it is detached again
+  // (AddEventHandler(nullptr)) before the function returns.
+  PGEventHandler pgEventHandler(this);
+
+  image_win->AddEventHandler(&pgEventHandler);
+  image_win->AddMessageBox();
+
+  SVMenuNode *svMenuRoot = build_menu_new();
+
+  svMenuRoot->BuildMenu(image_win);
+  image_win->SetVisible(true);
+
+  // Wait here until an SVET_DESTROY event has been awaited on the window.
+  image_win->AwaitEvent(SVET_DESTROY);
+  image_win->AddEventHandler(nullptr);
+}
+
+/**
+ *  process_cmd_win_event()
+ *
+ *  Process a command returned from the command window
+ * (Just call the appropriate command handler)
+ */
+
+bool Tesseract::process_cmd_win_event( // UI command semantics
+    int32_t cmd_event,                 // which menu item?
+    char *new_value                    // any prompt data
+) {
+  // Dispatches one command-window event. Returns true only for
+  // QUIT_CMD_EVENT; every other event returns false.
+
+  // Each command starts from rainbow colouring; the SHOW_* commands further
+  // down replace it with their own colour mode.
+  color_mode = CM_RAINBOW;
+
+  // The blamer view and the font-attribute views first run recognition on the
+  // whole page, at most once (tracked by recog_done).
+  bool wants_full_recognition = false;
+  switch (cmd_event) {
+    case BLAMER_CMD_EVENT:
+    case SHOW_SUBSCRIPT_CMD_EVENT:
+    case SHOW_SUPERSCRIPT_CMD_EVENT:
+    case SHOW_ITALIC_CMD_EVENT:
+    case SHOW_BOLD_CMD_EVENT:
+    case SHOW_UNDERLINE_CMD_EVENT:
+    case SHOW_FIXEDPITCH_CMD_EVENT:
+    case SHOW_SERIF_CMD_EVENT:
+    case SHOW_SMALLCAPS_CMD_EVENT:
+    case SHOW_DROPCAPS_CMD_EVENT:
+      wants_full_recognition = true;
+      break;
+    default:
+      break;
+  }
+  if (wants_full_recognition && !recog_done) {
+    recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
+    recog_done = true;
+  }
+
+  // Sets the given bit of word_display_mode when the prompt data starts with
+  // 'T' and clears it otherwise.
+  const auto toggle_display_flag = [&](decltype(DF_BOX) flag) {
+    if (new_value[0] == 'T') {
+      word_display_mode.set(flag);
+    } else {
+      word_display_mode.reset(flag);
+    }
+  };
+  // Switches the colour mode and redraws all words with word_display.
+  const auto recolor_words = [&](decltype(CM_RAINBOW) coloring) {
+    color_mode = coloring;
+    do_re_display(&tesseract::Tesseract::word_display);
+  };
+
+  bool quit_requested = false;
+  switch (cmd_event) {
+    case NULL_CMD_EVENT:
+      break;
+
+    // These commands only record themselves as the current mode.
+    case CHANGE_DISP_CMD_EVENT:
+    case DUMP_WERD_CMD_EVENT:
+    case SHOW_POINT_CMD_EVENT:
+    case SHOW_BLN_WERD_CMD_EVENT:
+    case RECOG_WERDS:
+    case RECOG_PSEUDO:
+    case SHOW_BLOB_FEATURES:
+      mode = static_cast<CMD_EVENTS>(cmd_event);
+      break;
+    case DEBUG_WERD_CMD_EVENT: {
+      mode = DEBUG_WERD_CMD_EVENT;
+      // The dialog result is copied into word_config_ and then released with
+      // delete[].
+      char *config_name = image_win->ShowInputDialog("Config File Name");
+      word_config_ = config_name;
+      delete[] config_name;
+      break;
+    }
+
+    // Display-flag toggles: update the flag, then go to change-display mode.
+    case BOUNDING_BOX_CMD_EVENT:
+      toggle_display_flag(DF_BOX);
+      mode = CHANGE_DISP_CMD_EVENT;
+      break;
+    case BLAMER_CMD_EVENT:
+      toggle_display_flag(DF_BLAMER);
+      do_re_display(&tesseract::Tesseract::word_display);
+      mode = CHANGE_DISP_CMD_EVENT;
+      break;
+    case CORRECT_TEXT_CMD_EVENT:
+      toggle_display_flag(DF_TEXT);
+      mode = CHANGE_DISP_CMD_EVENT;
+      break;
+    case POLYGONAL_CMD_EVENT:
+      toggle_display_flag(DF_POLYGONAL);
+      mode = CHANGE_DISP_CMD_EVENT;
+      break;
+    case BL_NORM_CMD_EVENT:
+      toggle_display_flag(DF_BN_POLYGONAL);
+      mode = CHANGE_DISP_CMD_EVENT;
+      break;
+    case BITMAP_CMD_EVENT:
+      toggle_display_flag(DF_EDGE_STEP);
+      mode = CHANGE_DISP_CMD_EVENT;
+      break;
+
+    case UNIFORM_DISP_CMD_EVENT:
+      do_re_display(&tesseract::Tesseract::word_set_display);
+      break;
+
+    // Page-level overlays: store the on/off state, then redraw.
+    case IMAGE_CMD_EVENT:
+      display_image = (new_value[0] == 'T');
+      do_re_display(&tesseract::Tesseract::word_display);
+      break;
+    case BLOCKS_CMD_EVENT:
+      display_blocks = (new_value[0] == 'T');
+      do_re_display(&tesseract::Tesseract::word_display);
+      break;
+    case BASELINES_CMD_EVENT:
+      display_baselines = (new_value[0] == 'T');
+      do_re_display(&tesseract::Tesseract::word_display);
+      break;
+
+    // Attribute views: pick the colour mode, then redraw.
+    case SHOW_SUBSCRIPT_CMD_EVENT:
+      recolor_words(CM_SUBSCRIPT);
+      break;
+    case SHOW_SUPERSCRIPT_CMD_EVENT:
+      recolor_words(CM_SUPERSCRIPT);
+      break;
+    case SHOW_ITALIC_CMD_EVENT:
+      recolor_words(CM_ITALIC);
+      break;
+    case SHOW_BOLD_CMD_EVENT:
+      recolor_words(CM_BOLD);
+      break;
+    case SHOW_UNDERLINE_CMD_EVENT:
+      recolor_words(CM_UNDERLINE);
+      break;
+    case SHOW_FIXEDPITCH_CMD_EVENT:
+      recolor_words(CM_FIXEDPITCH);
+      break;
+    case SHOW_SERIF_CMD_EVENT:
+      recolor_words(CM_SERIF);
+      break;
+    case SHOW_SMALLCAPS_CMD_EVENT:
+      recolor_words(CM_SMALLCAPS);
+      break;
+    case SHOW_DROPCAPS_CMD_EVENT:
+      recolor_words(CM_DROPCAPS);
+      break;
+
+    case REFRESH_CMD_EVENT:
+      do_re_display(&tesseract::Tesseract::word_display);
+      break;
+    case QUIT_CMD_EVENT:
+      quit_requested = true;
+      ScrollView::Exit();
+      break;
+
+    default: {
+      // Unknown command: report it in the image window.
+      char msg[160];
+      snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
+      image_win->AddMessage(msg);
+      break;
+    }
+  }
+  return quit_requested;
+}
+
+/**
+ * process_image_event()
+ *
+ * Handles an event from the image window. Only SVET_SELECTION events are
+ * acted on; every other event type is ignored.
+ * The selection box is built from the corners (event.x, event.y) and
+ * (event.x + event.x_size, event.y + event.y_size), and the action chosen
+ * by the current mode is then run on that box.
+ */
+void Tesseract::process_image_event( // action in image win
+    const SVEvent &event) {
+  // The following variable should remain static, since it is used by
+  // debug editor, which uses a single Tesseract instance.
+  static ICOORD down;
+  ICOORD up;
+  TBOX selection_box;
+  char msg[80];
+
+  switch (event.type) {
+    case SVET_SELECTION:
+      down.set_x(event.x + event.x_size);
+      down.set_y(event.y + event.y_size);
+      if (mode == SHOW_POINT_CMD_EVENT) {
+        show_point(current_page_res, event.x, event.y);
+      }
+
+      up.set_x(event.x);
+      up.set_y(event.y);
+
+      selection_box = TBOX(down, up);
+
+      switch (mode) {
+        case CHANGE_DISP_CMD_EVENT:
+          process_selected_words(current_page_res, selection_box,
+                                 &tesseract::Tesseract::word_blank_and_set_display);
+          break;
+        case DUMP_WERD_CMD_EVENT:
+          process_selected_words(current_page_res, selection_box,
+                                 &tesseract::Tesseract::word_dumper);
+          break;
+        case SHOW_BLN_WERD_CMD_EVENT:
+          process_selected_words(current_page_res, selection_box,
+                                 &tesseract::Tesseract::word_bln_display);
+          break;
+        case DEBUG_WERD_CMD_EVENT:
+          debug_word(current_page_res, selection_box);
+          break;
+        case SHOW_POINT_CMD_EVENT:
+          break; // the point was already shown above
+
+        case RECOG_WERDS:
+#  ifndef DISABLED_LEGACY_ENGINE
+          image_win->AddMessage("Recogging selected words");
+          this->process_selected_words(current_page_res, selection_box,
+                                       &Tesseract::recog_interactive);
+#  endif // ndef DISABLED_LEGACY_ENGINE
+          break;
+        case RECOG_PSEUDO:
+          image_win->AddMessage("Recogging selected blobs");
+          recog_pseudo_word(current_page_res, selection_box);
+          break;
+        case SHOW_BLOB_FEATURES:
+          blob_feature_display(current_page_res, selection_box);
+          break;
+
+        default:
+          // Bounded formatting; the mode is cast because %d expects an int.
+          snprintf(msg, sizeof(msg), "Mode %d not yet implemented", static_cast<int>(mode));
+          image_win->AddMessage(msg);
+          break;
+      }
+      break; // do not fall through into the default label
+    default:
+      break;
+  }
+}
+
+/**
+ * debug_word
+ *
+ * Runs recog_all_words() on the page, passing the selection box and the
+ * contents of word_config_ as the word config. Unless the legacy engine is
+ * disabled, the adaptive classifier is reset first.
+ */
+void Tesseract::debug_word(PAGE_RES *page_res, const TBOX &selection_box) {
+#  ifndef DISABLED_LEGACY_ENGINE
+  ResetAdaptiveClassifier();
+#  endif
+  const char *config_for_selection = word_config_.c_str();
+  recog_all_words(page_res, nullptr, &selection_box, config_for_selection, 0);
+}
+
+/**********************************************************************
+ * WERD PROCESSOR FUNCTIONS
+ * ========================
+ *
+ * These routines are invoked by one or more of:
+ *    process_all_words()
+ *    process_selected_words()
+ * or
+ *    process_all_words_it()
+ *    process_selected_words_it()
+ * for each word to be processed
+ **********************************************************************/
+
+/**
+ * word_blank_and_set_display()  Word processor
+ *
+ * Plots the word's bounding box in black on the image window and then hands
+ * the word to word_set_display(), whose result is returned.
+ */
+
+bool Tesseract::word_blank_and_set_display(PAGE_RES_IT *pr_it) {
+  // Blank out the area of the word before it is redrawn.
+  TBOX word_box = pr_it->word()->word->bounding_box();
+  word_box.plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
+
+  const bool drawn = word_set_display(pr_it);
+  return drawn;
+}
+
+/**
+ * word_bln_display()
+ *
+ * Clears the BLN word window, draws the reference lines and plots every blob
+ * of the current word through the word's denorm, each in the next colour.
+ * Always returns true.
+ */
+bool Tesseract::word_bln_display(PAGE_RES_IT *pr_it) {
+  WERD_RES *current = pr_it->word();
+  // A word without a chopped_word has not been set up for recognition yet,
+  // so set it up now.
+  if (current->chopped_word == nullptr) {
+    current->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
+                                 classify_bln_numeric_mode, textord_use_cjk_fp_model,
+                                 poly_allow_detailed_fx, pr_it->row()->row, pr_it->block()->block);
+  }
+
+  bln_word_window_handle()->Clear();
+  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, 1.0, 0.0f, -1000.0f, 1000.0f);
+
+  // Walk the blob list once, advancing the colour after every blob.
+  C_BLOB_IT blob_it(current->word->cblob_list());
+  ScrollView::Color blob_color = WERD::NextColor(ScrollView::BLACK);
+  blob_it.mark_cycle_pt();
+  while (!blob_it.cycled_list()) {
+    blob_it.data()->plot_normed(current->denorm, blob_color, ScrollView::BROWN,
+                                bln_word_window_handle());
+    blob_color = WERD::NextColor(blob_color);
+    blob_it.forward();
+  }
+
+  bln_word_window_handle()->Update();
+  return true;
+}
+
+/**
+ *  word_display()  Word Processor
+ *
+ *  Display a word according to its display modes.
+ *
+ *  When color_mode is not CM_RAINBOW and the word has a box_word, one
+ *  rectangle per blob box is drawn (red when the selected attribute applies,
+ *  green otherwise) and nothing else is drawn. In that path the function
+ *  returns false if the word has no fontinfo, or if the legacy engine is
+ *  disabled.
+ *  Otherwise the word is drawn according to its DF_* display flags, falling
+ *  back to its bounding box when no flag produced any output, and true is
+ *  returned.
+ */
+bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
+  WERD_RES *word_res = pr_it->word();
+  WERD *word = word_res->word;
+  TBOX word_bb;    // word bounding box
+  int word_height; // ht of word BB
+  bool displayed_something = false;
+  float shift; // from bot left
+
+  // Attribute colouring path: used for every colour mode except rainbow.
+  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
+#  ifndef DISABLED_LEGACY_ENGINE
+    BoxWord *box_word = word_res->box_word;
+    WERD_CHOICE *best_choice = word_res->best_choice;
+    int length = box_word->length();
+    if (word_res->fontinfo == nullptr) {
+      return false;
+    }
+    const FontInfo &font_info = *word_res->fontinfo;
+    for (int i = 0; i < length; ++i) {
+      // Green by default; red when the attribute selected by color_mode holds.
+      // The position-based modes are checked per blob; the font-based modes
+      // and small caps use the same word-level value for every blob.
+      ScrollView::Color color = ScrollView::GREEN;
+      switch (color_mode) {
+        case CM_SUBSCRIPT:
+          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_SUPERSCRIPT:
+          if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_ITALIC:
+          if (font_info.is_italic()) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_BOLD:
+          if (font_info.is_bold()) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_FIXEDPITCH:
+          if (font_info.is_fixed_pitch()) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_SERIF:
+          if (font_info.is_serif()) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_SMALLCAPS:
+          if (word_res->small_caps) {
+            color = ScrollView::RED;
+          }
+          break;
+        case CM_DROPCAPS:
+          if (best_choice->BlobPosition(i) == SP_DROPCAP) {
+            color = ScrollView::RED;
+          }
+          break;
+          // TODO(rays) underline is currently completely unsupported.
+        case CM_UNDERLINE:
+        default:
+          break;
+      }
+      image_win->Pen(color);
+      TBOX box = box_word->BlobBox(i);
+      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
+    }
+    return true;
+#  else
+    return false;
+#  endif // ndef DISABLED_LEGACY_ENGINE
+  }
+  /*
+  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
+  etc. are to keep the compiler happy.
+*/
+  // display bounding box
+  if (word->display_flag(DF_BOX)) {
+    word->bounding_box().plot(image_win,
+                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
+                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
+
+    // The blob boxes are drawn with the blob colour on top of the word box.
+    auto c = static_cast<ScrollView::Color>((int32_t)editor_image_blob_bb_color);
+    image_win->Pen(c);
+    // cblob iterator
+    C_BLOB_IT c_it(word->cblob_list());
+    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
+      c_it.data()->bounding_box().plot(image_win);
+    }
+    displayed_something = true;
+  }
+
+  // display edge steps
+  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
+    word->plot(image_win);                // rainbow colors
+    displayed_something = true;
+  }
+
+  // display poly approx
+  if (word->display_flag(DF_POLYGONAL)) {
+    // need to convert
+    // The polygonal copy is only needed for plotting, so it is deleted again
+    // right away.
+    TWERD *tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
+    tword->plot(image_win);
+    delete tword;
+    displayed_something = true;
+  }
+
+  // Display correct text and blamer information.
+  std::string text;
+  std::string blame;
+  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
+    text = word->text();
+  }
+  // Blamer text replaces the correct text, and is only built when the word
+  // has no blamer bundle or its result reason is not IRR_CORRECT.
+  if (word->display_flag(DF_BLAMER) &&
+      !(word_res->blamer_bundle != nullptr &&
+        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
+    text = "";
+    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
+    if (blamer_bundle == nullptr) {
+      text += "NULL";
+    } else {
+      text = blamer_bundle->TruthString();
+    }
+    // Resulting text has the form "<truth> -> <best choice>".
+    text += " -> ";
+    std::string best_choice_str;
+    if (word_res->best_choice == nullptr) {
+      best_choice_str = "NULL";
+    } else {
+      word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
+    }
+    text += best_choice_str;
+    // A missing blamer bundle is reported as IRR_PAGE_LAYOUT.
+    IncorrectResultReason reason =
+        (blamer_bundle == nullptr) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
+    ASSERT_HOST(reason < IRR_NUM_REASONS);
+    blame += " [";
+    blame += BlamerBundle::IncorrectReasonName(reason);
+    blame += "]";
+  }
+  if (text.length() > 0) {
+    word_bb = word->bounding_box();
+    image_win->Pen(ScrollView::RED);
+    word_height = word_bb.height();
+    // Text height is half the word height, capped at 20.
+    int text_height = 0.50 * word_height;
+    if (text_height > 20) {
+      text_height = 20;
+    }
+    image_win->TextAttributes("Arial", text_height, false, false, false);
+    // Shift the text right by a quarter of the word height only when the
+    // word box is wider than it is tall.
+    shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
+    image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str());
+    if (blame.length() > 0) {
+      // The blame reason is drawn one text height below the main text.
+      image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height,
+                      blame.c_str());
+    }
+
+    displayed_something = true;
+  }
+
+  if (!displayed_something) { // display BBox anyway
+    word->bounding_box().plot(image_win,
+                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
+                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
+  }
+  return true;
+}
+} // namespace tesseract
+#endif // !GRAPHICS_DISABLED
+
+namespace tesseract {
+/**
+ * word_dumper()
+ *
+ * Prints headings with tprintf and calls print() on the block (when there is
+ * one), the row and the word of the current position. Blamer debug text is
+ * added when wordrec_debug_blamer is set and the word's blamer bundle does
+ * not report IRR_CORRECT. Always returns true.
+ */
+bool Tesseract::word_dumper(PAGE_RES_IT *pr_it) {
+  auto *block = pr_it->block()->block;
+  if (block != nullptr) {
+    tprintf("\nBlock data...\n");
+    block->print(nullptr, false);
+  }
+
+  tprintf("\nRow data...\n");
+  pr_it->row()->row->print(nullptr);
+
+  tprintf("\nWord data...\n");
+  WERD_RES *current = pr_it->word();
+  current->word->print();
+
+  // Nothing more to print without a blamer bundle or with blamer debugging
+  // switched off.
+  auto *blamer = current->blamer_bundle;
+  if (blamer == nullptr || !wordrec_debug_blamer) {
+    return true;
+  }
+  if (blamer->incorrect_result_reason() != IRR_CORRECT) {
+    tprintf("Current blamer debug: %s\n", blamer->debug().c_str());
+  }
+  return true;
+}
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * word_set_display()  Word processor
+ *
+ * Copies six bits of word_display_mode (box, text, polygonal, edge step,
+ * BN polygonal, blamer) onto the word's own display flags and then draws the
+ * word with word_display(), whose result is returned.
+ */
+bool Tesseract::word_set_display(PAGE_RES_IT *pr_it) {
+  WERD *target = pr_it->word()->word;
+  // The flags are applied in this fixed order, one after the other.
+  for (const auto flag :
+       {DF_BOX, DF_TEXT, DF_POLYGONAL, DF_EDGE_STEP, DF_BN_POLYGONAL, DF_BLAMER}) {
+    target->set_display_flag(flag, word_display_mode[flag]);
+  }
+  return word_display(pr_it);
+}
+
+// Builds a pseudo word from the selection box, extracts the baseline and
+// character-normalised integer features of its first chopped blob and renders
+// each feature set in green in its own feature-space window. The pseudo word
+// and its iterator are deleted afterwards. Does nothing when no pseudo word
+// could be made, or when the legacy engine is disabled.
+// page_res is non-const because the iterator doesn't know if you are going
+// to change the items it points to! Really a const here though.
+void Tesseract::blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box) {
+#  ifndef DISABLED_LEGACY_ENGINE
+  PAGE_RES_IT *pseudo_it = make_pseudo_word(page_res, selection_box);
+  if (pseudo_it == nullptr) {
+    return;
+  }
+
+  // Prepare the pseudo word for recognition so that chopped_word exists.
+  WERD_RES *pseudo_word = pseudo_it->word();
+  pseudo_word->x_height = pseudo_it->row()->row->x_height();
+  pseudo_word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
+                                   classify_bln_numeric_mode, textord_use_cjk_fp_model,
+                                   poly_allow_detailed_fx, pseudo_it->row()->row,
+                                   pseudo_it->block()->block);
+
+  // Only the first blob of the chopped word is examined.
+  TBLOB *first_blob = pseudo_word->chopped_word->blobs[0];
+  INT_FX_RESULT_STRUCT fx_result;
+  std::vector<INT_FEATURE_STRUCT> baseline_features;
+  std::vector<INT_FEATURE_STRUCT> charnorm_features;
+  Classify::ExtractFeatures(*first_blob, classify_nonlinear_norm, &baseline_features,
+                            &charnorm_features, &fx_result, nullptr);
+
+  // Display baseline features.
+  ScrollView *baseline_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
+  ClearFeatureSpaceWindow(baseline, baseline_win);
+  for (auto &feature : baseline_features) {
+    RenderIntFeature(baseline_win, &feature, ScrollView::GREEN);
+  }
+  baseline_win->Update();
+
+  // Display cn features.
+  ScrollView *charnorm_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
+  ClearFeatureSpaceWindow(character, charnorm_win);
+  for (auto &feature : charnorm_features) {
+    RenderIntFeature(charnorm_win, &feature, ScrollView::GREEN);
+  }
+  charnorm_win->Update();
+
+  pseudo_it->DeleteCurrentWord();
+  delete pseudo_it;
+#  endif // ndef DISABLED_LEGACY_ENGINE
+}
+
+#endif // !GRAPHICS_DISABLED
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.h
@ -0,0 +1,68 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pgedit.h
+// Description: Page structure file editor
+// Author:      Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef PGEDIT_H
+#define PGEDIT_H
+
+#include "params.h"     // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam
+#include "scrollview.h" // for SVEvent (ptr only), SVEventHandler, ScrollView
+
+namespace tesseract {
+
+class BLOCK_LIST;
+class PAGE_RES;
+
+class Tesseract;
+
+#ifndef GRAPHICS_DISABLED
+// A small event handler class to process incoming events to
+// this window.
+class PGEventHandler : public SVEventHandler {
+public:
+  // Stores the given Tesseract pointer in tess_; nothing else happens here.
+  PGEventHandler(tesseract::Tesseract *tess) : tess_(tess) {}
+  // Overrides SVEventHandler::Notify; the definition is not in this header.
+  void Notify(const SVEvent *sve) override;
+
+private:
+  // The Tesseract pointer supplied at construction.
+  tesseract::Tesseract *tess_;
+};
+#endif // !GRAPHICS_DISABLED
+
+// Declarations of the page editor's globals and parameters.
+// NOTE(review): the *_VAR_H macros come from params.h; their second and third
+// arguments look like a default value and a description - confirm there.
+extern BLOCK_LIST *current_block_list;
+extern STRING_VAR_H(editor_image_win_name, "EditorImage", "Editor image window name");
+extern INT_VAR_H(editor_image_xpos, 590, "Editor image X Pos");
+extern INT_VAR_H(editor_image_ypos, 10, "Editor image Y Pos");
+extern INT_VAR_H(editor_image_height, 680, "Editor image height");
+extern INT_VAR_H(editor_image_width, 655, "Editor image width");
+extern INT_VAR_H(editor_image_word_bb_color, BLUE, "Word bounding box colour");
+extern INT_VAR_H(editor_image_blob_bb_color, YELLOW, "Blob bounding box colour");
+extern INT_VAR_H(editor_image_text_color, WHITE, "Correct text colour");
+extern STRING_VAR_H(editor_dbwin_name, "EditorDBWin", "Editor debug window name");
+extern INT_VAR_H(editor_dbwin_xpos, 50, "Editor debug window X Pos");
+extern INT_VAR_H(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
+extern INT_VAR_H(editor_dbwin_height, 24, "Editor debug window height");
+extern INT_VAR_H(editor_dbwin_width, 80, "Editor debug window width");
+extern STRING_VAR_H(editor_word_name, "BlnWords", "BL normalised word window");
+extern INT_VAR_H(editor_word_xpos, 60, "Word window X Pos");
+extern INT_VAR_H(editor_word_ypos, 510, "Word window Y Pos");
+extern INT_VAR_H(editor_word_height, 240, "Word window height");
+extern INT_VAR_H(editor_word_width, 655, "Word window width");
+extern double_VAR_H(editor_smd_scale_factor, 1.0, "Scaling for smd image");
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/recogtraining.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/recogtraining.cpp
@ -0,0 +1,228 @@
+///////////////////////////////////////////////////////////////////////
+// File:        recogtraining.cpp
+// Description: Functions for ambiguity and parameter training.
+// Author:      Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "boxread.h"
+#include "control.h"
+#include "host.h" // for NearlyEqual
+#include "ratngs.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "reject.h"
+#endif
+#include "stopper.h"
+
+namespace tesseract {
+
+// Tolerance passed to NearlyEqual() when an edge of a tesseract-identified box
+// is compared with the matching edge of a box read from the box file.
+const int16_t kMaxBoxEdgeDiff = 2;
+
+// Sets flags necessary for recognition in the training mode.
+// Opens and returns the pointer to the output file.
+// The output file name is `filename` with everything from its last '.' on
+// replaced by ".txt" (".txt" is simply appended when there is no '.'). The
+// file is opened in "a+" mode; failure to open it is reported and then
+// asserted on.
+FILE *Tesseract::init_recog_training(const char *filename) {
+  if (tessedit_ambigs_training) {
+    tessedit_tess_adaption_mode.set_value(0); // turn off adaption
+    tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
+    // Explore all segmentations.
+    getDict().stopper_no_acceptable_choices.set_value(true);
+  }
+
+  std::string output_fname = filename;
+  // Really shorten the string. Overwriting the '.' with '\0' would keep the
+  // old extension inside the std::string, so ".txt" would end up behind an
+  // embedded NUL and c_str() would yield the name without any extension.
+  const std::string::size_type lastdot = output_fname.rfind('.');
+  if (lastdot != std::string::npos) {
+    output_fname.erase(lastdot);
+  }
+  output_fname += ".txt";
+  FILE *output_file = fopen(output_fname.c_str(), "a+");
+  if (output_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", output_fname.c_str());
+    ASSERT_HOST(output_file);
+  }
+  return output_file;
+}
+
+// Advances page_res_it past positions that have a block but no word, then
+// copies the bounding box of the word found there into *tbox.
+// Returns true when a word was found and false otherwise (in which case
+// *tbox is left untouched).
+static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
+  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
+    page_res_it->forward();
+  }
+
+  WERD_RES *found = page_res_it->word();
+  if (found == nullptr) {
+    return false;
+  }
+  *tbox = found->word->bounding_box();
+
+  // If tbox->left() is negative, the training image has vertical text and
+  // all the coordinates of bounding boxes of page_res are rotated by 90
+  // degrees in a counterclockwise direction. We need to rotate the TBOX back
+  // in order to compare with the TBOXes of box files.
+  const bool rotated_page = tbox->left() < 0;
+  if (rotated_page) {
+    tbox->rotate(FCOORD(0.0, -1.0));
+  }
+  return true;
+}
+
+// This function takes tif/box pair of files and runs recognition on the image,
+// while making sure that the word bounds that tesseract identified roughly
+// match to those specified by the input box file. For each word (ngram in a
+// single bounding box from the input box file) it outputs the ocred result,
+// the correct label, rating and certainty.
+// The box file name is `filename` with everything from its last '.' on
+// replaced by ".box". Results are written to output_file; `monitor` is not
+// used by this function.
+void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
+                                         volatile ETEXT_DESC *monitor, FILE *output_file) {
+  std::string box_fname = filename;
+  // Really shorten the string. Overwriting the '.' with '\0' would keep the
+  // old extension inside the std::string, so ".box" would end up behind an
+  // embedded NUL and c_str() would yield the name without any extension.
+  const std::string::size_type lastdot = box_fname.rfind('.');
+  if (lastdot != std::string::npos) {
+    box_fname.erase(lastdot);
+  }
+  box_fname += ".box";
+  // ReadNextBox() will close box_file
+  FILE *box_file = fopen(box_fname.c_str(), "r");
+  if (box_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", box_fname.c_str());
+    ASSERT_HOST(box_file);
+  }
+
+  PAGE_RES_IT page_res_it;
+  page_res_it.page_res = page_res;
+  page_res_it.restart_page();
+  std::string label;
+
+  // Process all the words on this page.
+  TBOX tbox; // tesseract-identified box
+  TBOX bbox; // box from the box file
+  bool keep_going;
+  int line_number = 0;
+  int examined_words = 0;
+  do {
+    keep_going = read_t(&page_res_it, &tbox);
+    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
+    // Align bottom left points of the TBOXes.
+    // Whichever side is behind (the page iterator or the box file) is
+    // advanced until the bottoms agree within kMaxBoxEdgeDiff.
+    while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
+      if (bbox.bottom() < tbox.bottom()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
+      }
+    }
+    // Same alignment for the left edges.
+    while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
+      if (bbox.left() > tbox.left()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
+      }
+    }
+    // OCR the word if top right points of the TBOXes are similar.
+    if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
+        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
+      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
+      examined_words++;
+    }
+    page_res_it.forward();
+  } while (keep_going);
+
+  // Set up scripts on all of the words that did not get sent to
+  // ambigs_classify_and_output.  They all should have, but if all the
+  // werd_res's don't get uch_sets, tesseract will crash when you try
+  // to iterate over them. :-(
+  int total_words = 0;
+  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
+    if (page_res_it.word()) {
+      if (page_res_it.word()->uch_set == nullptr) {
+        page_res_it.word()->SetupFake(unicharset);
+      }
+      total_words++;
+    }
+  }
+  // Warn when fewer than 85% of the words on the page were examined.
+  if (examined_words < 0.85 * total_words) {
+    tprintf(
+        "TODO(antonova): clean up recog_training_segmented; "
+        " It examined only a small fraction of the ambigs image.\n");
+  }
+  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
+}
+
+// Helper that writes one path to output_file: the unichar of each of the
+// first `length` blob choices, then a tab-separated label, the sum of the
+// ratings and the lowest certainty (starting from 0).
+static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
+                      const char *label, FILE *output_file) {
+  float total_rating = 0.0f;
+  float lowest_certainty = 0.0f;
+  for (int idx = 0; idx < length; ++idx) {
+    const BLOB_CHOICE &choice = *blob_choices[idx];
+    fprintf(output_file, "%s", unicharset.id_to_unichar(choice.unichar_id()));
+    total_rating += choice.rating();
+    if (choice.certainty() < lowest_certainty) {
+      lowest_certainty = choice.certainty();
+    }
+  }
+  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, total_rating, lowest_certainty);
+}
+
+// Helper recursively prints all paths through the ratings matrix, starting
+// at column col.
+// Every choice of a classified cell (col, row) is stored at
+// blob_choices[length]; the path is then either extended from column row + 1
+// or, once row + 1 reaches dim, printed with PrintPath().
+static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
+                             const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
+                             const char *label, FILE *output_file) {
+  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
+    auto cell = ratings.get(col, row);
+    if (cell == NOT_CLASSIFIED) {
+      continue; // no choices in this cell
+    }
+    const int next_col = row + 1;
+    BLOB_CHOICE_IT choice_it(cell);
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      blob_choices[length] = choice_it.data();
+      if (next_col >= dim) {
+        PrintPath(length + 1, blob_choices, unicharset, label, output_file);
+      } else {
+        PrintMatrixPaths(next_col, dim, ratings, length + 1, blob_choices, unicharset, label,
+                         output_file);
+      }
+    }
+  }
+}
+
+// Classifies the current word (pass 1) and writes every path through its
+// ratings matrix to output_file, each annotated with the given label.
+// Nothing is written when the label cannot be encoded with the unicharset.
+void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
+                                           FILE *output_file) {
+  // Classify word.
+  fflush(stdout);
+  WordData word_data(*pr_it);
+  SetupWordPassN(1, &word_data);
+  classify_word_and_language(1, pr_it, &word_data);
+  WERD_RES *werd_res = word_data.word;
+  WERD_CHOICE *best_choice = werd_res->best_choice;
+  ASSERT_HOST(best_choice != nullptr);
+
+  // Check that the label can be encoded; the encoding itself is not used.
+  std::vector<UNICHAR_ID> encoding;
+  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
+    tprintf("Not outputting illegal unichar %s\n", label);
+    return;
+  }
+
+  // Dump all paths through the ratings matrix (which is normally small).
+  // The scratch buffer holds one slot per matrix column and is owned by a
+  // std::vector, so it is released on every exit path.
+  int dim = werd_res->ratings->dimension();
+  std::vector<const BLOB_CHOICE *> blob_choices(dim);
+  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices.data(), unicharset, label,
+                   output_file);
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.cpp
@ -0,0 +1,785 @@
+/**********************************************************************
+ * File:        reject.cpp  (Formerly reject.c)
+ * Description: Rejection functions used in tessedit
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "reject.h"
+
+#ifdef DISABLED_LEGACY_ENGINE
+
+#  include "tesseractclass.h"
+
+namespace tesseract {
+
+// Returns the dictionary type reported by dict_word() for the word's best
+// choice, except that DOC_DAWG_PERM is reported as 0.
+int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
+  const int perm_type = werd_res->tesseract->dict_word(*werd_res->best_choice);
+  if (perm_type == DOC_DAWG_PERM) {
+    return 0;
+  }
+  return perm_type;
+}
+} // namespace tesseract
+
+#else
+
+#  include "control.h"
+#  include "docqual.h"
+#  include "tesseractclass.h"
+#  include "tessvars.h"
+
+#  include "helpers.h"
+
+#  include <algorithm> // for std::sort
+#  include <cctype>
+#  include <cerrno>
+#  include <cstring>
+#  include <vector> // for std::vector
+
+namespace tesseract {
+
+/*************************************************************************
+ * set_done()
+ *
+ * Decide whether the word can be marked as done.
+ *
+ * A word starts out done when Tess accepted it and its text contains no
+ * blank. It is then demoted again if
+ *  - on pass 1, it is not a (system/freq/user) dictionary word or is
+ *    flagged as dangerously ambiguous, and one_ell_conflict() fires; or
+ *  - it is neither a dictionary word nor a NUMBER_PERM word, or it is
+ *    flagged as dangerously ambiguous.
+ *************************************************************************/
+
+void Tesseract::set_done(WERD_RES *word, int16_t pass) {
+  WERD_CHOICE *choice = word->best_choice;
+
+  // Initial verdict: accepted by Tess and free of blanks.
+  bool accepted = false;
+  if (word->tess_accepted) {
+    accepted = strchr(choice->unichar_string().c_str(), ' ') == nullptr;
+  }
+  word->done = accepted;
+
+  const bool ambiguous = choice->dangerous_ambig_found();
+  const auto permuter = choice->permuter();
+  const bool in_dictionary = permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+                             permuter == USER_DAWG_PERM;
+  const bool suspicious = !in_dictionary || ambiguous;
+
+  // Pass-1 only: look for I/l/1 confusion in suspicious words.
+  if (word->done && pass == 1 && suspicious && one_ell_conflict(word, false)) {
+    if (tessedit_rejection_debug) {
+      tprintf("one_ell_conflict detected\n");
+    }
+    word->done = false;
+  }
+
+  // Non-dictionary (and non-number) or ambiguous words are never done.
+  const bool unknown_word = !in_dictionary && permuter != NUMBER_PERM;
+  if (word->done && (unknown_word || ambiguous)) {
+    if (tessedit_rejection_debug) {
+      tprintf("non-dict or ambig word detected\n");
+    }
+    word->done = false;
+  }
+
+  if (tessedit_rejection_debug) {
+    tprintf("set_done(): done=%d\n", word->done);
+    word->best_choice->print("");
+  }
+}
+
+/*************************************************************************
+ * make_reject_map()
+ *
+ * Sets the done flag to indicate whether the result is acceptable.
+ *
+ * Sets a reject map for the word.
+ *
+ * word - word whose best_choice is examined; its reject_map is initialised
+ *        here and then filled in by the helpers called below.
+ * row  - not referenced anywhere in this function body.
+ * pass - forwarded unchanged to set_done().
+ *
+ * tessedit_reject_mode selects the heuristic: 0 and 5 are handled, any other
+ * value prints "BAD tessedit_reject_mode" and trips ASSERT_HOST.
+ *************************************************************************/
+void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
+  int i;
+  int offset;
+
+  flip_0O(word);
+  check_debug_pt(word, -1); // For trap only
+  set_done(word, pass);     // Set acceptance
+  word->reject_map.initialise(word->best_choice->unichar_lengths().length()); // sized by the number of entries in unichar_lengths()
+  reject_blanks(word); // flags every ' ' character in the best choice
+  /*
+0: Rays original heuristic - the baseline
+*/
+  if (tessedit_reject_mode == 0) {
+    if (!word->done) {
+      reject_poor_matches(word);
+    }
+  } else if (tessedit_reject_mode == 5) {
+    /*
+5: Reject I/1/l from words where there is no strong contextual confirmation;
+  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
+  and the whole of any words which are very small
+*/
+    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
+      word->reject_map.rej_word_small_xht();
+    } else {
+      one_ell_conflict(word, true); // return value ignored; called for its reject-map updates
+      /*
+  Originally the code here just used the done flag. Now I have duplicated
+  and unpacked the conditions for setting the done flag so that each
+  mechanism can be turned on or off independently. This works WITHOUT
+  affecting the done flag setting.
+*/
+      if (rej_use_tess_accepted && !word->tess_accepted) {
+        word->reject_map.rej_word_not_tess_accepted();
+      }
+
+      if (rej_use_tess_blanks &&
+          (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
+        word->reject_map.rej_word_contains_blanks();
+      }
+
+      WERD_CHOICE *best_choice = word->best_choice;
+      if (rej_use_good_perm) {
+        if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
+             best_choice->permuter() == FREQ_DAWG_PERM ||
+             best_choice->permuter() == USER_DAWG_PERM) &&
+            (!rej_use_sensible_wd ||
+             acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
+                                    best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
+          // PASSED TEST
+        } else if (best_choice->permuter() == NUMBER_PERM) {
+          if (rej_alphas_in_number_perm) {
+            for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0'; // i counts unichars, offset counts bytes
+                 offset += best_choice->unichar_lengths()[i++]) {
+              if (word->reject_map[i].accepted() &&
+                  word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
+                                             best_choice->unichar_lengths()[i])) {
+                word->reject_map[i].setrej_bad_permuter();
+              }
+              // rej alpha
+            }
+          }
+        } else {
+          word->reject_map.rej_word_bad_permuter();
+        }
+      }
+      /* Ambig word rejection was here once !!*/
+    }
+  } else {
+    tprintf("BAD tessedit_reject_mode\n");
+    ASSERT_HOST("Fatal error encountered!" == nullptr); // a string literal is never nullptr, so this always fails
+  }
+
+  if (tessedit_image_border > -1) { // edge-blob rejection is skipped for border values of -1 or lower
+    reject_edge_blobs(word);
+  }
+
+  check_debug_pt(word, 10);
+  if (tessedit_rejection_debug) {
+    tprintf("Permuter Type = %d\n", word->best_choice->permuter());
+    tprintf("Certainty: %f     Rating: %f\n", word->best_choice->certainty(),
+            word->best_choice->rating());
+    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
+  }
+
+  flip_hyphens(word);
+  check_debug_pt(word, 20);
+}
+
+// Marks every character of the best choice that is a ' ' as a Tess failure
+// in the word's reject map.
+void reject_blanks(WERD_RES *word) {
+  int16_t index = 0; // position in unichar_lengths() / reject_map
+  int16_t pos = 0;   // byte position in unichar_string()
+
+  while (word->best_choice->unichar_string()[pos] != '\0') {
+    if (word->best_choice->unichar_string()[pos] == ' ') {
+      // rej unrecognised blobs
+      word->reject_map[index].setrej_tess_failure();
+    }
+    pos += word->best_choice->unichar_lengths()[index];
+    index += 1;
+  }
+}
+
+// Flags every character of the best choice whose first byte is in
+// conflict_set_I_l_1 as a 1Il conflict in the word's reject map.
+void Tesseract::reject_I_1_L(WERD_RES *word) {
+  int16_t index = 0; // position in unichar_lengths() / reject_map
+  int16_t pos = 0;   // byte position in unichar_string()
+
+  while (word->best_choice->unichar_string()[pos] != '\0') {
+    if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[pos])) {
+      // rej 1Il conflict
+      word->reject_map[index].setrej_1Il_conflict();
+    }
+    pos += word->best_choice->unichar_lengths()[index];
+    index += 1;
+  }
+}
+
+// Rejects individual characters of the best choice: spaces are flagged as
+// Tess failures, and any other character whose certainty is below the
+// threshold from compute_reject_threshold() is flagged as a poor match.
+void reject_poor_matches(WERD_RES *word) {
+  const float min_certainty = compute_reject_threshold(word->best_choice);
+  for (int pos = 0; pos < word->best_choice->length(); ++pos) {
+    if (word->best_choice->unichar_id(pos) == UNICHAR_SPACE) {
+      word->reject_map[pos].setrej_tess_failure();
+      continue;
+    }
+    if (word->best_choice->certainty(pos) < min_certainty) {
+      word->reject_map[pos].setrej_poor_match();
+    }
+  }
+}
+
+/**********************************************************************
+ * compute_reject_threshold
+ *
+ * Set a rejection threshold for this word.
+ * The per-character certainties are sorted and, for words of three or
+ * more characters, the largest gap between neighbouring values is
+ * located; the threshold is the middle of that gap. For shorter words
+ * (or when no positive gap exists) the threshold lies below the lowest
+ * certainty.
+ *
+ * word - choice whose per-character certainties are examined.
+ * Returns the threshold. A word without characters has no certainties to
+ * inspect, so 0.0f is returned for it.
+ **********************************************************************/
+
+float compute_reject_threshold(WERD_CHOICE *word) {
+  const int blob_count = word->length();
+  if (blob_count <= 0) {
+    // Reading ratings[0] on an empty vector would be undefined behaviour.
+    return 0.0f;
+  }
+
+  std::vector<float> ratings;
+  ratings.reserve(blob_count);
+  for (int i = 0; i < blob_count; ++i) {
+    ratings.push_back(word->certainty(i));
+  }
+  std::sort(ratings.begin(), ratings.end());
+
+  float bestgap = 0.0f;            // biggest gap
+  float gapstart = ratings[0] - 1; // bottom of gap; all reject if none better
+  if (blob_count >= 3) {
+    for (int index = 0; index < blob_count - 1; index++) {
+      const float gap = ratings[index + 1] - ratings[index];
+      if (gap > bestgap) {
+        // find biggest
+        bestgap = gap;
+        gapstart = ratings[index];
+      }
+    }
+  }
+  return gapstart + bestgap / 2;
+}
+
+/*************************************************************************
+ * reject_edge_blobs()
+ *
+ * When the word's bounding box comes within tessedit_image_border of any
+ * image edge, flag each blob that is itself that close to an edge, since
+ * such blobs could have been clipped.
+ *************************************************************************/
+void Tesseract::reject_edge_blobs(WERD_RES *word) {
+  TBOX word_box = word->word->bounding_box();
+  // Use the box_word as it is already denormed back to image coordinates.
+  int blobcount = word->box_word->length();
+
+  const bool word_clear_of_edges = word_box.left() >= tessedit_image_border &&
+                                   word_box.bottom() >= tessedit_image_border &&
+                                   word_box.right() + tessedit_image_border <= ImageWidth() - 1 &&
+                                   word_box.top() + tessedit_image_border <= ImageHeight() - 1;
+  if (word_clear_of_edges) {
+    return;
+  }
+
+  ASSERT_HOST(word->reject_map.length() == blobcount);
+  for (int b = 0; b < blobcount; ++b) {
+    TBOX blob_box = word->box_word->BlobBox(b);
+    const bool near_edge = blob_box.left() < tessedit_image_border ||
+                           blob_box.bottom() < tessedit_image_border ||
+                           blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
+                           blob_box.top() + tessedit_image_border > ImageHeight() - 1;
+    if (near_edge) {
+      // Close to edge
+      word->reject_map[b].setrej_edge_char();
+    }
+  }
+}
+
+/**********************************************************************
+ * one_ell_conflict()
+ *
+ * Identify words where there is a potential I/l/1 error.
+ * - A bundle of contextual heuristics!
+ *
+ * word_res   - word whose best_choice string is inspected.
+ * update_map - when true, the affected characters are also flagged in
+ *              word_res->reject_map (setrej_1Il_conflict, directly or via
+ *              reject_I_1_L()); every reject-map write in this function is
+ *              guarded by this flag.
+ * Returns true if a conflict was found, false otherwise.
+ *
+ * The dictionary probes below overwrite one character of
+ * best_choice->unichar_string() in place ('I' <-> 'l') before calling
+ * safe_dict_word().
+ **********************************************************************/
+bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
+  const char *word;
+  const char *lengths;
+  int16_t word_len; // its length
+  int16_t first_alphanum_index_;
+  int16_t first_alphanum_offset_;
+  int16_t i;
+  int16_t offset;
+  bool non_conflict_set_char; // non conf set a/n?
+  bool conflict = false;
+  bool allow_1s;
+  ACCEPTABLE_WERD_TYPE word_type;
+  bool dict_perm_type;
+  bool dict_word_ok;
+  int dict_word_type;
+
+  word = word_res->best_choice->unichar_string().c_str();
+  lengths = word_res->best_choice->unichar_lengths().c_str();
+  word_len = strlen(lengths); // number of length entries, used as the character count below
+  /*
+  If there are no occurrences of the conflict set characters then the word
+  is OK.
+*/
+  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
+    return false;
+  }
+
+  /*
+  There is a conflict if there are NO other (confirmed) alphanumerics apart
+  from those in the conflict set.
+*/
+
+  for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char; // stops at the first such character
+       offset += lengths[i++]) {
+    non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
+                             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
+                            !conflict_set_I_l_1.contains(word[offset]);
+  }
+  if (!non_conflict_set_char) {
+    if (update_map) {
+      reject_I_1_L(word_res);
+    }
+    return true;
+  }
+
+  /*
+  If the word is accepted by a dawg permuter, and the first alpha character
+  is "I" or "l", check to see if the alternative is also a dawg word. If it
+  is, then there is a potential error otherwise the word is ok.
+*/
+
+  dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
+                   (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
+                   (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
+                   (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
+  dict_word_type = dict_word(*(word_res->best_choice));
+  dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
+
+  if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
+      (dict_perm_type && dict_word_ok)) {
+    first_alphanum_index_ = first_alphanum_index(word, lengths); // NOTE(review): helper returns -1 when nothing matches; not checked before indexing below
+    first_alphanum_offset_ = first_alphanum_offset(word, lengths);
+    if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; // try the 'l' spelling
+      if (safe_dict_word(word_res) > 0) {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; // put the original back
+        if (update_map) {
+          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
+        }
+        return true;
+      } else {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; // put the original back
+        return false;
+      }
+    }
+
+    if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; // try the 'I' spelling
+      if (safe_dict_word(word_res) > 0) {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; // put the original back
+        if (update_map) {
+          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
+        }
+        return true;
+      } else {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; // put the original back
+        return false;
+      }
+    }
+    return false;
+  }
+
+  /*
+  NEW 1Il code. The old code relied on permuter types too much. In fact,
+  tess will use TOP_CHOICE permute for good things like "palette".
+  In this code the string is examined independently to see if it looks like
+  a well formed word.
+*/
+
+  /*
+  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
+  dictionary word.
+*/
+  first_alphanum_index_ = first_alphanum_index(word, lengths); // NOTE(review): helper returns -1 when nothing matches; not checked before indexing below
+  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
+  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
+    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+    if (safe_dict_word(word_res) > 0) {
+      return false; // NOTE(review): returns with the flipped 'I' still written to the string - confirm this is intended
+    } else {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+    }
+  } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
+    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+    if (safe_dict_word(word_res) > 0) {
+      return false; // NOTE(review): returns with the flipped 'l' still written to the string - confirm this is intended
+    } else {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+    }
+  }
+  /*
+  For strings containing digits:
+    If there are no alphas OR the numeric permuter liked the word,
+      reject any non 1 conflict chs
+    Else reject all conflict chs
+*/
+  if (word_contains_non_1_digit(word, lengths)) {
+    allow_1s =
+        (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
+
+    int16_t offset; // shadows the function-level offset declared above
+    conflict = false;
+    for (i = 0, offset = 0; word[offset] != '\0';
+         offset += word_res->best_choice->unichar_lengths()[i++]) {
+      if ((!allow_1s || (word[offset] != '1')) &&
+          conflict_set_I_l_1.contains(word[offset])) {
+        if (update_map) {
+          word_res->reject_map[i].setrej_1Il_conflict();
+        }
+        conflict = true;
+      }
+    }
+    return conflict;
+  }
+  /*
+  For anything else. See if it conforms to an acceptable word type. If so,
+  treat accordingly.
+*/
+  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
+  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
+    first_alphanum_index_ = first_alphanum_index(word, lengths);
+    first_alphanum_offset_ = first_alphanum_offset(word, lengths);
+    if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
+      if (update_map) {
+        word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
+      }
+      return true;
+    } else {
+      return false;
+    }
+  } else if (word_type == AC_UPPER_CASE) {
+    return false;
+  } else {
+    if (update_map) {
+      reject_I_1_L(word_res);
+    }
+    return true;
+  }
+}
+
+// Returns the index (counted in unichars) of the first character of word that
+// unicharset classifies as alphabetic or a digit, or -1 if there is none.
+// word_lengths holds the byte length of each unichar in word.
+int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
+  int16_t index = 0; // unichar position
+  int16_t pos = 0;   // byte position in word
+
+  while (word[pos] != '\0') {
+    const bool is_alnum = unicharset.get_isalpha(word + pos, word_lengths[index]) ||
+                          unicharset.get_isdigit(word + pos, word_lengths[index]);
+    if (is_alnum) {
+      return index;
+    }
+    pos += word_lengths[index];
+    ++index;
+  }
+  return -1;
+}
+
+// Returns the byte offset within word of the first character that unicharset
+// classifies as alphabetic or a digit, or -1 if there is none.
+// word_lengths holds the byte length of each unichar in word.
+int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
+  int16_t index = 0; // unichar position
+  int16_t pos = 0;   // byte position in word
+
+  while (word[pos] != '\0') {
+    const bool is_alnum = unicharset.get_isalpha(word + pos, word_lengths[index]) ||
+                          unicharset.get_isdigit(word + pos, word_lengths[index]);
+    if (is_alnum) {
+      return pos;
+    }
+    pos += word_lengths[index];
+    ++index;
+  }
+  return -1;
+}
+
+// Counts the characters of word that unicharset classifies as alphabetic.
+// word_lengths holds the byte length of each unichar in word.
+int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
+  int16_t alphas = 0;
+  int16_t index = 0; // unichar position
+  int16_t pos = 0;   // byte position in word
+
+  while (word[pos] != '\0') {
+    if (unicharset.get_isalpha(word + pos, word_lengths[index])) {
+      alphas++;
+    }
+    pos += word_lengths[index];
+    ++index;
+  }
+  return alphas;
+}
+
+// Returns true if word contains at least one character that unicharset
+// classifies as a digit and that is not the single-byte character '1'.
+// word_lengths holds the byte length of each unichar in word.
+bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
+  int16_t index = 0; // unichar position
+  int16_t pos = 0;   // byte position in word
+
+  while (word[pos] != '\0') {
+    if (unicharset.get_isdigit(word + pos, word_lengths[index])) {
+      const bool is_plain_one = word_lengths[index] == 1 && word[pos] == '1';
+      if (!is_plain_one) {
+        return true;
+      }
+    }
+    pos += word_lengths[index];
+    ++index;
+  }
+  return false;
+}
+
+/*************************************************************************
+ * dont_allow_1Il()
+ * Don't unreject LONE accepted 1Il conflict set chars.
+ *
+ * If the only accepted alphanumerics in the word are conflict-set
+ * characters, every accepted conflict-set character is flagged with
+ * setrej_postNN_1Il(). As soon as one other accepted alphanumeric is
+ * found the word is left untouched.
+ *************************************************************************/
+void Tesseract::dont_allow_1Il(WERD_RES *word) {
+  const int num_chars = word->reject_map.length();
+  const char *text = word->best_choice->unichar_string().c_str();
+  const char *lengths = word->best_choice->unichar_lengths().c_str();
+
+  // Pass 1: look for accepted conflict-set characters and bail out on the
+  // first accepted alphanumeric that is not in the conflict set.
+  bool saw_accepted_1Il = false;
+  int pos = 0; // byte position in text
+  for (int idx = 0; idx < num_chars; ++idx) {
+    if (word->reject_map[idx].accepted()) {
+      if (conflict_set_I_l_1.contains(text[pos])) {
+        saw_accepted_1Il = true;
+      } else if (word->uch_set->get_isalpha(text + pos, lengths[idx]) ||
+                 word->uch_set->get_isdigit(text + pos, lengths[idx])) {
+        return; // >=1 non 1Il ch accepted
+      }
+    }
+    pos += word->best_choice->unichar_lengths()[idx];
+  }
+  if (!saw_accepted_1Il) {
+    return; // Nothing to worry about
+  }
+
+  // Pass 2: reject every accepted conflict-set character.
+  pos = 0;
+  for (int idx = 0; idx < num_chars; ++idx) {
+    if (conflict_set_I_l_1.contains(text[pos]) && word->reject_map[idx].accepted()) {
+      word->reject_map[idx].setrej_postNN_1Il();
+    }
+    pos += word->best_choice->unichar_lengths()[idx];
+  }
+}
+
+// Counts the characters of the best choice that are accepted in the reject
+// map and are alphabetic or digits according to the word's unicharset.
+int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
+  const WERD_CHOICE *choice = word_res->best_choice;
+  int total = 0;
+  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
+    if (!word_res->reject_map[pos].accepted()) {
+      continue;
+    }
+    const UNICHAR_ID id = choice->unichar_id(pos);
+    if (word_res->uch_set->get_isalpha(id) || word_res->uch_set->get_isdigit(id)) {
+      ++total;
+    }
+  }
+  return total;
+}
+
+// Rejects the whole word when the fraction of already rejected characters
+// reaches rej_whole_of_mostly_reject_word_fract.
+void Tesseract::reject_mostly_rejects(WERD_RES *word) {
+  const auto rejected = static_cast<float>(word->reject_map.reject_count());
+  const auto total = word->reject_map.length();
+
+  if (rejected / total >= rej_whole_of_mostly_reject_word_fract) {
+    word->reject_map.rej_word_mostly_rej();
+  }
+}
+
+// Returns true when the word has more than one character, consists of a
+// single repeated unichar whose first byte is listed in
+// ok_repeated_ch_non_alphanum_wds, and word_char_quality() reports every
+// character as both good and accepted. The row argument is not used.
+bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
+  if (word->best_choice->unichar_lengths().length() <= 1) {
+    return false;
+  }
+  if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
+    return false;
+  }
+
+  // Every character must match the first one.
+  const UNICHAR_ID first_id = word->best_choice->unichar_id(0);
+  for (int pos = 1; pos < word->best_choice->length(); ++pos) {
+    if (word->best_choice->unichar_id(pos) != first_id) {
+      return false;
+    }
+  }
+
+  int16_t char_quality;
+  int16_t accepted_char_quality;
+  word_char_quality(word, &char_quality, &accepted_char_quality);
+
+  return (word->best_choice->unichar_lengths().length() == char_quality) &&
+         (char_quality == accepted_char_quality);
+}
+
+// Returns the dictionary type reported by dict_word() for the word's best
+// choice, except that DOC_DAWG_PERM is reported as 0.
+int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
+  const int perm_type = werd_res->tesseract->dict_word(*werd_res->best_choice);
+  if (perm_type == DOC_DAWG_PERM) {
+    return 0;
+  }
+  return perm_type;
+}
+
+// Uses blob aspect ratios to adjust '.' and '-' results in the best choice:
+// a sufficiently wide '.' is replaced by '-', and the reject map is updated
+// for certain and suspected hyphens. Small blobs and blobs that touch a
+// neighbour are skipped. Does nothing when tessedit_lower_flip_hyphen <= 1.
+// Note: After running this function word_res->ratings
+// might not contain the right BLOB_CHOICE corresponding to each character
+// in word_res->best_choice.
+void Tesseract::flip_hyphens(WERD_RES *word_res) {
+  WERD_CHOICE *best_choice = word_res->best_choice;
+
+  if (tessedit_lower_flip_hyphen <= 1) {
+    return;
+  }
+
+  const int num_blobs = word_res->rebuild_word->NumBlobs();
+  const UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
+  int prev_right = -9999;
+
+  for (int i = 0; i < best_choice->length() && i < num_blobs; ++i) {
+    TBLOB *blob = word_res->rebuild_word->blobs[i];
+    TBOX out_box = blob->bounding_box();
+    const int next_left =
+        (i + 1 == num_blobs) ? 9999 : word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
+
+    // Don't touch small or touching blobs - it is too dangerous.
+    const bool isolated_and_wide = (out_box.width() > 8 * word_res->denorm.x_scale()) &&
+                                   (out_box.left() > prev_right) &&
+                                   (out_box.right() < next_left);
+    if (isolated_and_wide) {
+      const float aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
+      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
+        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
+            word_res->uch_set->contains_unichar_id(unichar_dash) &&
+            word_res->uch_set->get_enabled(unichar_dash)) {
+          /* Certain HYPHEN */
+          best_choice->set_unichar_id(unichar_dash, i);
+          if (word_res->reject_map[i].rejected()) {
+            word_res->reject_map[i].setrej_hyphen_accept();
+          }
+        }
+        if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
+          // Suspected HYPHEN
+          word_res->reject_map[i].setrej_hyphen();
+        }
+      } else if (best_choice->unichar_id(i) == unichar_dash) {
+        // Certain HYPHEN
+        if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
+          word_res->reject_map[i].setrej_hyphen_accept();
+        }
+        // Suspected HYPHEN
+        if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
+          word_res->reject_map[i].setrej_hyphen();
+        }
+      }
+    }
+    prev_right = out_box.right();
+  }
+}
+
+// Rewrites '0'/'O' in word_res->best_choice based on the neighbouring characters.
+// Note: After running this function word_res->ratings might not contain the right
+// BLOB_CHOICE corresponding to each character in word_res->best_choice.
+void Tesseract::flip_0O(WERD_RES *word_res) {
+  WERD_CHOICE *best_choice = word_res->best_choice;
+  int i;
+  TBOX out_box;
+
+  if (!tessedit_flip_0O) {
+    return;
+  }
+
+  int num_blobs = word_res->rebuild_word->NumBlobs();
+  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) { // sanity pass over upper case letters and digits
+    TBLOB *blob = word_res->rebuild_word->blobs[i];
+    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
+        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
+      out_box = blob->bounding_box();
+      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
+          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
+        return; // Beware words with sub/superscripts
+      }
+    }
+  }
+  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
+  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
+  if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
+      unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
+    return; // 0 or O are not present/enabled in unicharset
+  }
+  for (i = 1; i < best_choice->length(); ++i) { // starts at 1, so index i - 1 is always valid
+    if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
+      /* A0A: a 0/O between two upper case letters other than 'O' becomes 'O' */
+      if ((i + 1) < best_choice->length() &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
+        best_choice->set_unichar_id(unichar_O, i);
+      }
+      /* A00A: a pair of 0/O between two non-'O' upper case letters; the first
+         of the pair becomes 'O' and i moves on to the second */
+      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          (i + 1) < best_choice->length() &&
+          (best_choice->unichar_id(i + 1) == unichar_0 ||
+           best_choice->unichar_id(i + 1) == unichar_O) &&
+          (i + 2) < best_choice->length() &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
+        best_choice->set_unichar_id(unichar_O, i);
+        i++;
+      }
+      /* AA0<non digit or end of word>: becomes 'O'; a following "l" or "I"
+         also blocks the change */
+      if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          (((i + 1) < best_choice->length() &&
+            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
+            !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
+            !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
+           (i == best_choice->length() - 1))) {
+        best_choice->set_unichar_id(unichar_O, i);
+      }
+      /* 9O9: a 0/O between two digits other than '0' becomes '0' */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          (i + 1) < best_choice->length() &&
+          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
+        best_choice->set_unichar_id(unichar_0, i);
+      }
+      /* 9OOO: a non-'0' digit followed by three 0/O characters; all three
+         become '0' and i skips past them */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          (i + 2) < best_choice->length() &&
+          (best_choice->unichar_id(i + 1) == unichar_0 ||
+           best_choice->unichar_id(i + 1) == unichar_O) &&
+          (best_choice->unichar_id(i + 2) == unichar_0 ||
+           best_choice->unichar_id(i + 2) == unichar_O)) {
+        best_choice->set_unichar_id(unichar_0, i);
+        best_choice->set_unichar_id(unichar_0, i + 1);
+        best_choice->set_unichar_id(unichar_0, i + 2);
+        i += 2;
+      }
+      /* 9OO<non upper>: both 0/O characters become '0' */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          (i + 2) < best_choice->length() &&
+          (best_choice->unichar_id(i + 1) == unichar_0 ||
+           best_choice->unichar_id(i + 1) == unichar_O) &&
+          !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
+        best_choice->set_unichar_id(unichar_0, i);
+        best_choice->set_unichar_id(unichar_0, i + 1);
+        i++;
+      }
+      /* 9O<non upper>: the 0/O becomes '0' */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
+          (i + 1) < best_choice->length() &&
+          !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
+        best_choice->set_unichar_id(unichar_0, i);
+      }
+      /* 9[.,]OOO..: after a digit or 'O' followed by "." or ",", the whole run
+         of 0/O characters becomes '0'; a leading 'O' is changed to '0' too */
+      if ((i > 1) &&
+          (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
+           word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
+          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
+           best_choice->unichar_id(i - 2) == unichar_O)) {
+        if (best_choice->unichar_id(i - 2) == unichar_O) {
+          best_choice->set_unichar_id(unichar_0, i - 2);
+        }
+        while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
+                                             best_choice->unichar_id(i) == unichar_0)) {
+          best_choice->set_unichar_id(unichar_0, i);
+          i++;
+        }
+        i--; // compensates for the ++i of the enclosing for loop
+      }
+    }
+  }
+}
+
+// True for any upper case unichar other than "O".
+bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
+  if (!ch_set.get_isupper(unichar_id)) {
+    return false;
+  }
+  return !ch_set.eq(unichar_id, "O");
+}
+
+// True for any digit unichar other than "0".
+bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
+  if (!ch_set.get_isdigit(unichar_id)) {
+    return false;
+  }
+  return !ch_set.eq(unichar_id, "0");
+}
+} // namespace tesseract
+
+#endif // def DISABLED_LEGACY_ENGINE
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.h
@ -0,0 +1,39 @@
+/**********************************************************************
+ * File:        reject.h
+ * Description: Rejection functions used in tessedit
+ * Author:      Phil Cheatle
+ * Created:     Wed Sep 23 16:50:21 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef REJECT_H
+#define REJECT_H
+
+namespace tesseract { // free-function declarations for the rejection code in reject.cpp
+
+class WERD_CHOICE;
+class WERD_RES;
+
+void reject_blanks(WERD_RES *word); // flags each ' ' in the best choice as a Tess failure
+void reject_poor_matches(WERD_RES *word); // flags spaces and characters below the reject threshold
+float compute_reject_threshold(WERD_CHOICE *word); // threshold derived from the largest gap in the sorted certainties
+bool word_contains_non_1_digit(const char *word, const char *word_lengths); // NOTE(review): reject.cpp defines this as a Tesseract member; this free declaration looks stale
+void dont_allow_1Il(WERD_RES *word); // NOTE(review): reject.cpp defines this as a Tesseract member; this free declaration looks stale
+void flip_hyphens(WERD_RES *word); // NOTE(review): reject.cpp defines this as a Tesseract member; this free declaration looks stale
+void flip_0O(WERD_RES *word); // NOTE(review): reject.cpp defines this as a Tesseract member; this free declaration looks stale
+bool non_0_digit(const char *str, int length); // NOTE(review): reject.cpp only defines Tesseract::non_0_digit(const UNICHARSET &, UNICHAR_ID); signature differs
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/resultiterator.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/resultiterator.cpp
@ -0,0 +1,789 @@
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.cpp
+// Description: Iterator for tesseract results that is capable of
+//              iterating in proper reading order over Bi Directional
+//              (e.g. mixed Hebrew and English) text.
+// Author:      David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/resultiterator.h>
+
+#include "pageres.h"
+#include "tesseractclass.h"
+#include "unicharset.h"
+
+#include <allheaders.h>
+
+#include <set>
+#include <vector>
+
+// Unicode directional marks that this file appends to generated text (see
+// AppendSuffixMarks, AppendUTF8WordText and GetUTF8Text).
+// NOTE(review): the bytes a "\u200E" narrow literal produces depend on the
+// compiler's execution character set; presumably UTF-8 is expected - confirm
+// for every supported toolchain.
+static const char *const kLRM = "\u200E"; // Left-to-Right Mark
+static const char *const kRLM = "\u200F"; // Right-to-Left Mark
+
+namespace tesseract {
+
+// Builds a bidi-aware iterator from an LTRResultIterator.  The run flags start
+// cleared, preserve_interword_spaces_ is taken from the parameter of the same
+// name when it can be found (false otherwise), the paragraph direction is
+// computed, and the iterator is moved to the logical start of its text line.
+ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) {
+  in_minor_direction_ = false;
+  at_beginning_of_minor_run_ = false;
+
+  auto *spaces_param = ParamUtils::FindParam<BoolParam>(
+      "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
+  if (spaces_param == nullptr) {
+    preserve_interword_spaces_ = false;
+  } else {
+    preserve_interword_spaces_ = static_cast<bool>(*spaces_param);
+  }
+
+  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+  MoveToLogicalStartOfTextline();
+}
+
+// Factory: heap-allocates a ResultIterator constructed from resit and returns
+// it as a raw pointer; the caller is responsible for deleting it.
+ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) {
+  auto *created = new ResultIterator(resit);
+  return created;
+}
+
+// Returns the cached paragraph direction (true = left-to-right).  The cache is
+// refreshed by the constructor, Begin() and Next(); it is not recomputed here.
+bool ResultIterator::ParagraphIsLtr() const {
+  return current_paragraph_is_ltr_;
+}
+
+// Decides whether the paragraph containing the current position reads
+// left-to-right.  The scan is done on a local LTRResultIterator copy that is
+// restarted at the paragraph start.  Returns true for LTR (also when there is
+// no current word, and when LTR and RTL word counts tie), false for RTL.
+bool ResultIterator::CurrentParagraphIsLtr() const {
+  if (!it_->word()) {
+    return true; // doesn't matter.
+  }
+  LTRResultIterator it(*this);
+  it.RestartParagraph();
+  // Try to figure out the ltr-ness of the paragraph.  The rules below
+  // make more sense in the context of a difficult paragraph example.
+  // Here we denote {ltr characters, RTL CHARACTERS}:
+  //
+  //   "don't go in there!" DAIS EH
+  //   EHT OTNI DEPMUJ FELSMIH NEHT DNA
+  //                  .GNIDLIUB GNINRUB
+  //
+  // On the first line, the left-most word is LTR and the rightmost word
+  // is RTL.  Thus, we are better off taking the majority direction for
+  // the whole paragraph contents.  So instead of "the leftmost word is LTR"
+  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
+  // would not do:  Typically an RTL paragraph would *not* start with an LTR
+  // word.  So our heuristics are as follows:
+  //
+  // (1) If the first text line has an RTL word in the left-most position
+  //     it is RTL.
+  // (2) If the first text line has an LTR word in the right-most position
+  //     it is LTR.
+  // (3) If neither of the above is true, take the majority count for the
+  //     paragraph -- if there are more rtl words, it is RTL.  If there
+  //     are more LTR words, it's LTR.
+  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
+  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
+  int num_ltr, num_rtl;
+  num_rtl = leftmost_rtl ? 1 : 0;
+  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+  // Walk the remaining words of the first text line.  rightmost_ltr is
+  // overwritten on every step, so it ends up describing the last word visited.
+  for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
+       it.Next(RIL_WORD)) {
+    StrongScriptDirection dir = it.WordDirection();
+    rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
+    num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+    num_ltr += rightmost_ltr ? 1 : 0;
+  }
+  if (leftmost_rtl) {
+    return false;
+  }
+  if (rightmost_ltr) {
+    return true;
+  }
+  // First line is ambiguous.  Take statistics on the whole paragraph.
+  // `it` now sits on the first word after the first line (if any); keep
+  // counting until the next paragraph starts or the words run out.
+  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {
+    do {
+      StrongScriptDirection dir = it.WordDirection();
+      num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+      num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+    } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
+  }
+  // A tie is resolved in favour of LTR.
+  return num_ltr >= num_rtl;
+}
+
+// Negative marker values that CalculateTextlineOrder() interleaves with the
+// (non-negative) word indices of a reading-order vector.  Consumers in this
+// file recognise markers by testing for a value < 0.
+const int ResultIterator::kMinorRunStart = -1; // pushed before a minor-direction run
+const int ResultIterator::kMinorRunEnd = -2;   // pushed after a minor-direction run
+const int ResultIterator::kComplexWord = -3;   // pushed after a DIR_MIX word
+
+// Fills *blob_indices with the blob indices of the current word in reading
+// order.  The vector is cleared first and stays empty when there is no current
+// word.  In an LTR reading context, or when the word reports
+// UnicharsInReadingOrder(), the result is simply 0..word_length_-1.  Otherwise
+// every blob is classified as L or R from its SymbolDirection() and the word is
+// emitted right-to-left, keeping each L run in left-to-right order.
+void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
+  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+  blob_indices->clear();
+  if (Empty(RIL_WORD)) {
+    return;
+  }
+  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
+    // Easy! just return the blobs in order;
+    for (int i = 0; i < word_length_; i++) {
+      blob_indices->push_back(i);
+    }
+    return;
+  }
+
+  // The blobs are in left-to-right order, but the current reading context
+  // is right-to-left.
+  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
+  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
+  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
+  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
+  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
+  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
+  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
+
+  // Step 1: Scan for and mark European Number sequences
+  //   [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
+  std::vector<int> letter_types;
+  letter_types.reserve(word_length_);
+  for (int i = 0; i < word_length_; i++) {
+    letter_types.push_back(it_->word()->SymbolDirection(i));
+  }
+  // Convert a single separator sandwiched between two EN's into an EN.
+  for (int i = 0; i + 2 < word_length_; i++) {
+    if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
+        (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {
+      letter_types[i + 1] = U_EURO_NUM;
+    }
+  }
+  // Scan for sequences of European Number Terminators around ENs and convert
+  // them to ENs.
+  for (int i = 0; i < word_length_; i++) {
+    if (letter_types[i] == U_EURO_NUM_TERM) {
+      // Look right: a run of terminators immediately followed by an EN.
+      int j = i + 1;
+      while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
+        j++;
+      }
+      if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
+        // The sequence [i..j] should be converted to all European Numbers.
+        for (int k = i; k < j; k++) {
+          letter_types[k] = U_EURO_NUM;
+        }
+      }
+      // Look left: a run of terminators immediately preceded by an EN.
+      j = i - 1;
+      while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
+        j--;
+      }
+      if (j > -1 && letter_types[j] == U_EURO_NUM) {
+        // The sequence [j..i] should be converted to all European Numbers.
+        for (int k = j; k <= i; k++) {
+          letter_types[k] = U_EURO_NUM;
+        }
+      }
+    }
+  }
+  // Step 2: Convert all remaining types to either L or R.
+  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
+  // All other are R.
+  for (int i = 0; i < word_length_;) {
+    int ti = letter_types[i];
+    if (ti == U_LTR || ti == U_EURO_NUM) {
+      // Left to right sequence; scan to the end of it.
+      // last_good only advances on L/EN, so trailing CS/ON symbols are not
+      // absorbed into the L sequence.
+      int last_good = i;
+      for (int j = i + 1; j < word_length_; j++) {
+        int tj = letter_types[j];
+        if (tj == U_LTR || tj == U_EURO_NUM) {
+          last_good = j;
+        } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
+          // do nothing.
+        } else {
+          break;
+        }
+      }
+      // [i..last_good] is the L sequence
+      for (int k = i; k <= last_good; k++) {
+        letter_types[k] = U_LTR;
+      }
+      i = last_good + 1;
+    } else {
+      letter_types[i] = U_RTL;
+      i++;
+    }
+  }
+
+  // At this point, letter_types is entirely U_LTR or U_RTL.
+  // Emit from the right end of the word; R blobs one at a time, L runs as a
+  // block in ascending index order.
+  for (int i = word_length_ - 1; i >= 0;) {
+    if (letter_types[i] == U_RTL) {
+      blob_indices->push_back(i);
+      i--;
+    } else {
+      // left to right sequence.  scan to the beginning.
+      int j = i - 1;
+      for (; j >= 0 && letter_types[j] != U_RTL; j--) {
+      } // pass
+      // Now (j, i] is LTR
+      for (int k = j + 1; k <= i; k++) {
+        blob_indices->push_back(k);
+      }
+      i = j;
+    }
+  }
+  // Every blob must have been emitted exactly once.
+  ASSERT_HOST(blob_indices->size() == word_length_);
+}
+
+// Debug helper: prints one two-character code per direction ("N ", "L ", "R ",
+// "Z " for DIR_MIX, "? " for anything else) via tprintf, followed by a newline.
+static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
+  for (size_t n = 0; n < dirs.size(); ++n) {
+    const char *code = "? ";
+    if (dirs[n] == DIR_NEUTRAL) {
+      code = "N ";
+    } else if (dirs[n] == DIR_LEFT_TO_RIGHT) {
+      code = "L ";
+    } else if (dirs[n] == DIR_RIGHT_TO_LEFT) {
+      code = "R ";
+    } else if (dirs[n] == DIR_MIX) {
+      code = "Z ";
+    }
+    tprintf("%s", code);
+  }
+  tprintf("\n");
+}
+
+// Convenience overload: computes the reading order of the words on resit's
+// text line into *word_indices, discarding the per-word directions.
+void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
+                                            std::vector<int> *word_indices) const {
+  std::vector<StrongScriptDirection> discarded_dirs;
+  CalculateTextlineOrder(paragraph_is_ltr, resit, &discarded_dirs, word_indices);
+}
+
+// Collects the direction of every word on resit's text line (left to right)
+// and derives the reading order from them.
+//   dirs_arg     - optional; when non-null it receives the collected
+//                  directions, otherwise a local scratch vector is used.
+//   word_indices - receives the reading order.  It is left untouched when the
+//                  row has no words.
+void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
+                                            std::vector<StrongScriptDirection> *dirs_arg,
+                                            std::vector<int> *word_indices) const {
+  std::vector<StrongScriptDirection> scratch;
+  std::vector<StrongScriptDirection> &word_dirs = (dirs_arg == nullptr) ? scratch : *dirs_arg;
+  word_dirs.clear();
+
+  // An LTRResultIterator visits words strictly left to right.
+  LTRResultIterator row_it(resit);
+  row_it.RestartRow();
+  if (row_it.Empty(RIL_WORD)) {
+    return;
+  }
+  bool same_line = true;
+  while (same_line) {
+    word_dirs.push_back(row_it.WordDirection());
+    same_line = row_it.Next(RIL_WORD) && !row_it.IsAtBeginningOf(RIL_TEXTLINE);
+  }
+
+  word_indices->clear();
+  CalculateTextlineOrder(paragraph_is_ltr, word_dirs, word_indices);
+}
+
+// Core reading-order computation.  word_dirs holds one direction per word in
+// left-to-right order; *reading_order is cleared and then filled with word
+// indices in reading order, interleaved with the negative markers
+// kMinorRunStart / kMinorRunEnd (around runs read against the paragraph
+// direction) and kComplexWord (after a DIR_MIX word).  An empty word_dirs
+// yields an empty result.
+void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
+                                            const std::vector<StrongScriptDirection> &word_dirs,
+                                            std::vector<int> *reading_order) {
+  reading_order->clear();
+  if (word_dirs.empty()) {
+    return;
+  }
+
+  // Take all of the runs of minor direction words and insert them
+  // in reverse order.
+  // [start, end) is walked in steps of major_step: ascending for an LTR
+  // paragraph, descending for an RTL one.
+  int minor_direction, major_direction, major_step, start, end;
+  if (paragraph_is_ltr) {
+    start = 0;
+    end = word_dirs.size();
+    major_step = 1;
+    major_direction = DIR_LEFT_TO_RIGHT;
+    minor_direction = DIR_RIGHT_TO_LEFT;
+  } else {
+    start = word_dirs.size() - 1;
+    end = -1;
+    major_step = -1;
+    major_direction = DIR_RIGHT_TO_LEFT;
+    minor_direction = DIR_LEFT_TO_RIGHT;
+    // Special rule: if there are neutral words at the right most side
+    //   of a line adjacent to a left-to-right word in the middle of the
+    //   line, we interpret the end of the line as a single LTR sequence.
+    if (word_dirs[start] == DIR_NEUTRAL) {
+      int neutral_end = start;
+      while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
+        neutral_end--;
+      }
+      if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
+        // LTR followed by neutrals.
+        // Scan for the beginning of the minor left-to-right run.
+        int left = neutral_end;
+        for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
+          if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
+            left = i;
+          }
+        }
+        // Emit [left, end of line] as one minor run, in ascending order.
+        reading_order->push_back(kMinorRunStart);
+        for (unsigned i = left; i < word_dirs.size(); i++) {
+          reading_order->push_back(i);
+          if (word_dirs[i] == DIR_MIX) {
+            reading_order->push_back(kComplexWord);
+          }
+        }
+        reading_order->push_back(kMinorRunEnd);
+        // Resume the main walk just left of the run already emitted; when
+        // left == 0 this equals end and the main loop does nothing.
+        start = left - 1;
+      }
+    }
+  }
+  for (int i = start; i != end;) {
+    if (word_dirs[i] == minor_direction) {
+      // Find the far end j of the minor run that starts at i: advance to the
+      // next major-direction word (or the line end), then back up to the last
+      // word that actually has the minor direction.
+      int j = i;
+      while (j != end && word_dirs[j] != major_direction) {
+        j += major_step;
+      }
+      if (j == end) {
+        j -= major_step;
+      }
+      while (j != i && word_dirs[j] != minor_direction) {
+        j -= major_step;
+      }
+      //  [j..i] is a minor direction run.
+      reading_order->push_back(kMinorRunStart);
+      for (int k = j; k != i; k -= major_step) {
+        reading_order->push_back(k);
+      }
+      reading_order->push_back(i);
+      reading_order->push_back(kMinorRunEnd);
+      i = j + major_step;
+    } else {
+      reading_order->push_back(i);
+      if (word_dirs[i] == DIR_MIX) {
+        reading_order->push_back(kComplexWord);
+      }
+      i += major_step;
+    }
+  }
+}
+
+// Returns how many Next(RIL_WORD) steps a fresh LTRResultIterator needs, from
+// the start of the current row, to reach the word this iterator is on.
+int ResultIterator::LTRWordIndex() const {
+  LTRResultIterator row_it(*this);
+  row_it.RestartRow();
+  int steps = 0;
+  for (; !row_it.PositionedAtSameWord(it_); ++steps) {
+    row_it.Next(RIL_WORD);
+  }
+  return steps;
+}
+
+// Positions the iterator on the blob that comes first in reading order within
+// the current word.  An empty word is reset with BeginWord(0); nothing is done
+// when the first blob in reading order is already blob 0.
+void ResultIterator::MoveToLogicalStartOfWord() {
+  if (word_length_ == 0) {
+    BeginWord(0);
+    return;
+  }
+  std::vector<int> order;
+  CalculateBlobOrder(&order);
+  if (!order.empty() && order.front() != 0) {
+    BeginWord(order.front());
+  }
+}
+
+// True when the current blob is the last one of the word in reading order.
+// Also true when there is no current word or the computed order is empty.
+bool ResultIterator::IsAtFinalSymbolOfWord() const {
+  if (it_->word() == nullptr) {
+    return true;
+  }
+  std::vector<int> order;
+  CalculateBlobOrder(&order);
+  if (order.empty()) {
+    return true;
+  }
+  return order.back() == blob_index_;
+}
+
+// True when the current blob is the first one of the word in reading order.
+// Also true when there is no current word or the computed order is empty.
+bool ResultIterator::IsAtFirstSymbolOfWord() const {
+  if (it_->word() == nullptr) {
+    return true;
+  }
+  std::vector<int> order;
+  CalculateBlobOrder(&order);
+  if (order.empty()) {
+    return true;
+  }
+  return order.front() == blob_index_;
+}
+
+// Appends to *text the directional mark, if any, that has to follow the
+// current word.  The reading order of the text line is recomputed and the
+// marker entries (negative values) directly after this word are inspected;
+// the last of them decides:
+//   kComplexWord -> the mark of the current reading direction,
+//   kMinorRunEnd -> the mark of the paragraph direction.
+// Nothing is appended when there is no current word, when the word is not
+// found in the computed order, or when no such marker follows it.
+void ResultIterator::AppendSuffixMarks(std::string *text) const {
+  if (it_->word() == nullptr) {
+    return;
+  }
+  const bool reading_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+
+  std::vector<int> order;
+  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &order);
+  const int target = LTRWordIndex();
+
+  // Locate the current word inside the reading order.
+  size_t pos = 0;
+  while (pos < order.size() && order[pos] != target) {
+    ++pos;
+  }
+  if (pos == order.size()) {
+    return;
+  }
+
+  // Keep the last marker of the marker run that immediately follows the word.
+  int trailing_mark = 0;
+  for (size_t k = pos + 1; k < order.size() && order[k] < 0; ++k) {
+    trailing_mark = order[k];
+  }
+
+  if (trailing_mark == kComplexWord) {
+    *text += reading_ltr ? kLRM : kRLM;
+  } else if (trailing_mark == kMinorRunEnd) {
+    *text += current_paragraph_is_ltr_ ? kLRM : kRLM;
+  }
+}
+
+// Moves to the word that comes first in reading order on the current row, and
+// then to the logical start of that word.  Markers that precede the first word
+// in the computed order update in_minor_direction_; if the line starts inside
+// a minor run, at_beginning_of_minor_run_ is set.  When the order contains no
+// word at all the iterator stays at the start of the row.
+void ResultIterator::MoveToLogicalStartOfTextline() {
+  RestartRow();
+  std::vector<int> order;
+  CalculateTextlineOrder(current_paragraph_is_ltr_, static_cast<const LTRResultIterator &>(*this),
+                         &order);
+
+  // Consume the leading markers.
+  size_t pos = 0;
+  while (pos < order.size() && order[pos] < 0) {
+    const int marker = order[pos];
+    if (marker == kMinorRunStart) {
+      in_minor_direction_ = true;
+    } else if (marker == kMinorRunEnd) {
+      in_minor_direction_ = false;
+    }
+    ++pos;
+  }
+  if (in_minor_direction_) {
+    at_beginning_of_minor_run_ = true;
+  }
+  if (pos >= order.size()) {
+    return;
+  }
+
+  // Step from the left end of the row to the first word in reading order.
+  for (int remaining = order[pos]; remaining > 0; --remaining) {
+    PageIterator::Next(RIL_WORD);
+  }
+  MoveToLogicalStartOfWord();
+}
+
+// Rewinds via LTRResultIterator::Begin(), clears the minor-run flags,
+// recomputes the paragraph direction and moves to the logical start of the
+// text line.
+void ResultIterator::Begin() {
+  LTRResultIterator::Begin();
+  at_beginning_of_minor_run_ = false;
+  in_minor_direction_ = false;
+  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+  MoveToLogicalStartOfTextline();
+}
+
+// Advances to the next element at `level` in reading order.  Returns false
+// when the iterator is already at the end or runs off it.
+//  - BLOCK / PARA / TEXTLINE: delegate to PageIterator::Next(), refresh the
+//    paragraph direction when a new paragraph was entered, then move to the
+//    logical start of the new text line.
+//  - SYMBOL: step to the next blob in reading order inside the word; when the
+//    word is exhausted, continue as for WORD.
+//  - WORD: step to the next word in the line's reading order, tracking the
+//    minor-run flags from the markers in between; past the last word of the
+//    line, continue with Next(RIL_TEXTLINE).
+bool ResultIterator::Next(PageIteratorLevel level) {
+  if (it_->block() == nullptr) {
+    return false; // already at end!
+  }
+  switch (level) {
+    case RIL_BLOCK: // explicit fall-through
+    case RIL_PARA:  // explicit fall-through
+    case RIL_TEXTLINE:
+      if (!PageIterator::Next(level)) {
+        return false;
+      }
+      if (IsWithinFirstTextlineOfParagraph()) {
+        // if we've advanced to a new paragraph,
+        // recalculate current_paragraph_is_ltr_
+        current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+      }
+      in_minor_direction_ = false;
+      MoveToLogicalStartOfTextline();
+      return it_->block() != nullptr;
+    case RIL_SYMBOL: {
+      std::vector<int> blob_order;
+      CalculateBlobOrder(&blob_order);
+      // Find the current blob in reading order, then step one past it.
+      int next_blob = 0;
+      while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
+        next_blob++;
+      }
+      next_blob++;
+      if (next_blob < blob_order.size()) {
+        // we're in the same word; simply advance one blob.
+        BeginWord(blob_order[next_blob]);
+        at_beginning_of_minor_run_ = false;
+        return true;
+      }
+      level = RIL_WORD; // we've fallen through to the next word.
+    }
+      // Fall through.
+    case RIL_WORD: // explicit fall-through.
+    {
+      if (it_->word() == nullptr) {
+        return Next(RIL_BLOCK);
+      }
+      std::vector<int> word_indices;
+      int this_word_index = LTRWordIndex();
+      CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
+      // Index of the last real (non-marker) entry in the reading order.
+      int final_real_index = word_indices.size() - 1;
+      while (final_real_index > 0 && word_indices[final_real_index] < 0) {
+        final_real_index--;
+      }
+      // The loop stops before final_real_index: if the current word is the
+      // last real entry there is no successor on this line.
+      for (int i = 0; i < final_real_index; i++) {
+        if (word_indices[i] == this_word_index) {
+          // Skip the markers between this word and the next, updating the
+          // minor-direction state as they are crossed.
+          int j = i + 1;
+          for (; j < final_real_index && word_indices[j] < 0; j++) {
+            if (word_indices[j] == kMinorRunStart) {
+              in_minor_direction_ = true;
+            }
+            if (word_indices[j] == kMinorRunEnd) {
+              in_minor_direction_ = false;
+            }
+          }
+          at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
+          // awesome, we move to word_indices[j]
+          if (BidiDebug(3)) {
+            tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
+          }
+          // Reach the target word by stepping from the left end of the row.
+          PageIterator::RestartRow();
+          for (int k = 0; k < word_indices[j]; k++) {
+            PageIterator::Next(RIL_WORD);
+          }
+          MoveToLogicalStartOfWord();
+          return true;
+        }
+      }
+      if (BidiDebug(3)) {
+        tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
+      }
+      // we're going off the end of the text line.
+      return Next(RIL_TEXTLINE);
+    }
+  }
+  ASSERT_HOST(false); // shouldn't happen.
+  return false;
+}
+
+// Reports whether the current position is the first position, in reading
+// order, of the enclosing element at `level`.  Returns false when the iterator
+// is at the end, true inside a block without words, and always true for
+// RIL_SYMBOL.  Each coarser level builds on the answer for the finer one.
+bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
+  if (it_->block() == nullptr) {
+    return false; // Already at the end!
+  }
+  if (it_->word() == nullptr) {
+    return true; // In an image block.
+  }
+  if (level == RIL_SYMBOL) {
+    return true; // Always at beginning of a symbol.
+  }
+
+  bool at_word_start = IsAtFirstSymbolOfWord();
+  if (level == RIL_WORD) {
+    return at_word_start;
+  }
+
+  ResultIterator line_start(*this);
+  // move to the first word in the line...
+  line_start.MoveToLogicalStartOfTextline();
+
+  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
+  if (level == RIL_TEXTLINE) {
+    return at_textline_start;
+  }
+
+  // now we move to the left-most word...
+  line_start.RestartRow();
+  bool at_block_start =
+      at_textline_start && line_start.it_->block() != line_start.it_->prev_block();
+  if (level == RIL_BLOCK) {
+    return at_block_start;
+  }
+
+  // The short-circuit order matters: prev_row() is only dereferenced when we
+  // are at a text line start that is not also a block start.
+  bool at_para_start =
+      at_block_start || (at_textline_start && line_start.it_->row()->row->para() !=
+                                                  line_start.it_->prev_row()->row->para());
+  if (level == RIL_PARA) {
+    return at_para_start;
+  }
+
+  ASSERT_HOST(false); // shouldn't happen.
+  return false;
+}
+
+/**
+ * NOTE! This mirrors PageIterator::IsAtFinalElement, except that the
+ *   look-ahead iterator is a ResultIterator instead of a PageIterator.
+ *
+ * Returns true when the current `element` is the last one inside its
+ * enclosing `level`, or when there is no current element at all.
+ */
+bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
+  if (Empty(element)) {
+    return true; // Already at the end!
+  }
+  // Step a copy forward by one element.  We are at the final element if that
+  // lands at the end of the page, or at the beginning of *all* levels in
+  // [level, element).  With more than one level between element and level, a
+  // single step could move one symbol and still sit in the first word of a
+  // line, so every intermediate level has to be checked as well.
+  ResultIterator lookahead(*this);
+  lookahead.Next(element);
+  if (lookahead.Empty(element)) {
+    return true; // Reached the end of the page.
+  }
+  for (int lvl = element - 1; lvl >= level; --lvl) {
+    if (!lookahead.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns the number of blanks before the current word.  For an LTR paragraph
+// the LTRResultIterator value is used; otherwise the answer is 0 at the start
+// of a text line and 1 everywhere else.
+int ResultIterator::BlanksBeforeWord() const {
+  if (!CurrentParagraphIsLtr()) {
+    return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
+  }
+  return LTRResultIterator::BlanksBeforeWord();
+}
+
+/**
+ * Returns the null terminated UTF-8 encoded text string for the current
+ * object at the given level. Use delete [] to free after use.
+ * Returns nullptr when there is no current word.
+ *
+ * For RIL_SYMBOL the text is the symbol itself, preceded by a directional
+ * mark when the iterator sits at the beginning of a minor-direction run and
+ * followed by the word's suffix marks when the symbol is the last one of the
+ * word in reading order.
+ */
+char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+  if (it_->word() == nullptr) {
+    return nullptr; // Already at the end!
+  }
+  std::string text;
+  switch (level) {
+    case RIL_BLOCK: {
+      // Concatenate every paragraph that belongs to the current block.
+      ResultIterator pp(*this);
+      do {
+        pp.AppendUTF8ParagraphText(&text);
+      } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
+    } break;
+    case RIL_PARA:
+      AppendUTF8ParagraphText(&text);
+      break;
+    case RIL_TEXTLINE: {
+      // Work on a copy so that this (const) iterator keeps its position.
+      ResultIterator it(*this);
+      it.MoveToLogicalStartOfTextline();
+      it.IterateAndAppendUTF8TextlineText(&text);
+    } break;
+    case RIL_WORD:
+      AppendUTF8WordText(&text);
+      break;
+    case RIL_SYMBOL: {
+      bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+      if (at_beginning_of_minor_run_) {
+        text += reading_direction_is_ltr ? kLRM : kRLM;
+      }
+      // Append rather than assign: assigning here used to throw away the
+      // directional mark added just above, unlike AppendUTF8WordText().
+      text += it_->word()->BestUTF8(blob_index_, false);
+      if (IsAtFinalSymbolOfWord()) {
+        AppendSuffixMarks(&text);
+      }
+    } break;
+  }
+  // Hand back a heap copy, including the terminating NUL.
+  int length = text.length() + 1;
+  char *result = new char[length];
+  strncpy(result, text.c_str(), length);
+  return result;
+}
+// Returns a pointer to the current word's segmented_timesteps member, or
+// nullptr when there is no current word.  The pointer refers to data owned by
+// the word; nothing is copied.
+std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
+    *ResultIterator::GetRawLSTMTimesteps() const {
+  if (it_->word() == nullptr) {
+    return nullptr;
+  }
+  return &it_->word()->segmented_timesteps;
+}
+
+// Returns a pointer to the current word's CTC_symbol_choices member, or
+// nullptr when there is no current word.  The pointer refers to data owned by
+// the word; nothing is copied.
+std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
+    const {
+  if (it_->word() == nullptr) {
+    return nullptr;
+  }
+  return &it_->word()->CTC_symbol_choices;
+}
+
+// Appends the UTF-8 text of the current word to *text: an optional leading
+// directional mark (when at the beginning of a minor run), the symbols in
+// reading order, then any suffix marks.  Does nothing without a current word;
+// asserts that the word has a best_choice.
+void ResultIterator::AppendUTF8WordText(std::string *text) const {
+  if (it_->word() == nullptr) {
+    return;
+  }
+  ASSERT_HOST(it_->word()->best_choice != nullptr);
+  const bool reading_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+  if (at_beginning_of_minor_run_) {
+    *text += reading_ltr ? kLRM : kRLM;
+  }
+
+  std::vector<int> order;
+  CalculateBlobOrder(&order);
+  for (size_t n = 0; n < order.size(); ++n) {
+    *text += it_->word()->BestUTF8(order[n], false);
+  }
+  AppendSuffixMarks(text);
+}
+
+// Appends the text of the current text line to *text, word by word in reading
+// order, and leaves the iterator on the first word after the line.  Words are
+// separated by the word's own space count when preserve_interword_spaces_ is
+// set, otherwise by a single blank between words.  line_separator_ is appended
+// afterwards, plus paragraph_separator_ when the iterator has landed on the
+// start of a paragraph.  Without a current word the iterator is merely
+// advanced by one word and nothing is appended.
+void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
+  if (Empty(RIL_WORD)) {
+    Next(RIL_WORD);
+    return;
+  }
+  if (BidiDebug(1)) {
+    std::vector<int> order;
+    std::vector<StrongScriptDirection> word_dirs;
+    CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_dirs, &order);
+    tprintf("Strong Script dirs     [%p/P=%s]: ", it_->row(),
+            current_paragraph_is_ltr_ ? "ltr" : "rtl");
+    PrintScriptDirs(word_dirs);
+    tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
+            current_paragraph_is_ltr_ ? "ltr" : "rtl");
+    for (size_t n = 0; n < order.size(); ++n) {
+      tprintf("%d ", order[n]);
+    }
+    tprintf("\n");
+  }
+
+  int words_appended = 0;
+  bool more_on_line = true;
+  while (more_on_line) {
+    int numSpaces;
+    if (preserve_interword_spaces_) {
+      numSpaces = it_->word()->word->space();
+    } else {
+      numSpaces = (words_appended > 0) ? 1 : 0;
+    }
+    for (int pad = numSpaces; pad > 0; --pad) {
+      *text += " ";
+    }
+    AppendUTF8WordText(text);
+    ++words_appended;
+    if (BidiDebug(2)) {
+      tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
+    }
+    more_on_line = Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE);
+  }
+  if (BidiDebug(1)) {
+    tprintf("%d words printed\n", words_appended);
+  }
+  *text += line_separator_;
+  // A line that ends a paragraph gets the paragraph separator as well.
+  if (IsAtBeginningOf(RIL_PARA)) {
+    *text += paragraph_separator_;
+  }
+}
+
+// Appends the text of the whole current paragraph to *text, one text line at a
+// time.  Works on a copy of this iterator that is rewound to the paragraph
+// start; nothing is appended when that position has no word.
+void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {
+  ResultIterator para_it(*this);
+  para_it.RestartParagraph();
+  para_it.MoveToLogicalStartOfTextline();
+
+  bool more_lines = !para_it.Empty(RIL_WORD);
+  while (more_lines) {
+    para_it.IterateAndAppendUTF8TextlineText(text);
+    more_lines = para_it.it_->block() != nullptr && !para_it.IsAtBeginningOf(RIL_PARA);
+  }
+}
+
+// Returns true when the "bidi_debug" parameter is at least min_level.  When
+// the parameter cannot be found a level of 1 is assumed.
+bool ResultIterator::BidiDebug(int min_level) const {
+  auto *level_param = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
+                                                      tesseract_->params()->int_params);
+  int current_level = 1;
+  if (level_param != nullptr) {
+    current_level = static_cast<int32_t>(*level_param);
+  }
+  return min_level <= current_level;
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/superscript.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/superscript.cpp
@ -0,0 +1,592 @@
+/******************************************************************
+ * File:        superscript.cpp
+ * Description: Correction pass to fix superscripts and subscripts.
+ * Author:      David Eger
+ *
+ * (C) Copyright 2012, Google, Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "normalis.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+// Sums the first num_unichars entries of word->best_state, i.e. converts a
+// count of leading unichars into the number of chopped blobs they occupy.
+static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
+  int chopped_total = 0;
+  int idx = 0;
+  while (idx < num_unichars) {
+    chopped_total += word->best_state[idx];
+    ++idx;
+  }
+  return chopped_total;
+}
+
+// Sums the last num_unichars entries of word->best_state, i.e. converts a
+// count of trailing unichars into the number of chopped blobs they occupy.
+static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
+  int chopped_total = 0;
+  // from_end == 1 addresses the final entry, 2 the one before it, and so on.
+  for (int from_end = 1; from_end <= num_unichars; ++from_end) {
+    chopped_total += word->best_state[word->best_state.size() - from_end];
+  }
+  return chopped_total;
+}
+
+/**
+ * Look at the chopped pieces that make up one rebuilt blob and, judging only
+ * by vertical position, report the run of like-positioned superscript or
+ * subscript pieces at its left edge and the run at its right edge.
+ *
+ * @param[in]  word                   word owning best_state and chopped_word.
+ * @param[in]  rebuilt_blob_index     index of the rebuilt blob to inspect.
+ * @param[in]  super_y_bottom         a piece whose bottom is at or above this
+ *                                    is classed as superscript.
+ * @param[in]  sub_y_top              a piece whose top is at or below this
+ *                                    is classed as subscript.
+ * @param[out] leading_pos            position of the leading run (may be null).
+ * @param[out] num_leading_outliers   length of the leading run (may be null).
+ * @param[out] trailing_pos           position of the trailing run (may be null).
+ * @param[out] num_trailing_outliers  length of the trailing run (may be null).
+ */
+static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom,
+                           int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers,
+                           ScriptPos *trailing_pos, int *num_trailing_outliers) {
+  // Point any null output at scratch storage so the code below can write
+  // through all four pointers unconditionally.
+  ScriptPos scratch_leading_pos, scratch_trailing_pos;
+  int scratch_leading_count, scratch_trailing_count;
+  leading_pos = leading_pos ? leading_pos : &scratch_leading_pos;
+  num_leading_outliers = num_leading_outliers ? num_leading_outliers : &scratch_leading_count;
+  trailing_pos = trailing_pos ? trailing_pos : &scratch_trailing_pos;
+  num_trailing_outliers = num_trailing_outliers ? num_trailing_outliers : &scratch_trailing_count;
+
+  *num_trailing_outliers = 0;
+  *num_leading_outliers = 0;
+  *trailing_pos = SP_NORMAL;
+  *leading_pos = SP_NORMAL;
+
+  const int first_piece = LeadingUnicharsToChopped(word, rebuilt_blob_index);
+  const int piece_count = word->best_state[rebuilt_blob_index];
+  ScriptPos prev_pos = SP_NORMAL;
+  int run_length = 0; // length of the current run of same-position outliers
+  for (int piece = 0; piece < piece_count; ++piece) {
+    TBOX box = word->chopped_word->blobs[first_piece + piece]->bounding_box();
+    ScriptPos pos;
+    if (box.bottom() >= super_y_bottom) {
+      pos = SP_SUPERSCRIPT;
+    } else if (box.top() <= sub_y_top) {
+      pos = SP_SUBSCRIPT;
+    } else {
+      pos = SP_NORMAL;
+    }
+    if (pos != SP_NORMAL) {
+      // Extend the run if the position repeats, otherwise start a new one.
+      run_length = (pos == prev_pos) ? run_length + 1 : 1;
+    } else {
+      if (run_length == piece) {
+        // Every piece so far belonged to one unbroken run: that run is the
+        // leading outlier group.
+        *num_leading_outliers = run_length;
+        *leading_pos = prev_pos;
+      }
+      run_length = 0;
+    }
+    prev_pos = pos;
+  }
+  // Whatever run is still open at the end is the trailing outlier group.
+  *num_trailing_outliers = run_length;
+  *trailing_pos = prev_pos;
+}
+
+/**
+ * Attempt to split off any high (or low) bits at the ends of the word with poor
+ * certainty and recognize them separately.  If the certainty gets much better
+ * and other sanity checks pass, accept.
+ *
+ * This superscript fix is meant to be called in the second pass of recognition
+ * when we have tried once and already have a preliminary answer for word.
+ *
+ * @param word  The word to examine.  When a split is accepted its results are
+ *              replaced via ConsumeWordResults().
+ * @return Whether we modified the given word.
+ */
+bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
+  if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
+    return false;
+  }
+  int num_leading, num_trailing;
+  ScriptPos sp_leading, sp_trailing;
+  float leading_certainty, trailing_certainty;
+  float avg_certainty, unlikely_threshold;
+
+  // Calculate the number of whole suspicious characters at the edges.
+  GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
+                                 &sp_trailing, &trailing_certainty, &avg_certainty,
+                                 &unlikely_threshold);
+
+  // Labels used only in the debug output below.
+  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
+  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
+
+  int num_blobs = word->best_choice->length();
+
+  // Calculate the remainder (partial characters) at the edges.
+  // This accounts for us having classified the best version of
+  // a word as [speaker?'] when it was instead [speaker.^{21}]
+  // (that is we accidentally thought the 2 was attached to the period).
+  int num_remainder_leading = 0, num_remainder_trailing = 0;
+  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
+    int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
+    int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
+    int last_word_char = num_blobs - 1 - num_trailing;
+    float last_char_certainty = word->best_choice->certainty(last_word_char);
+    if (word->best_choice->unichar_id(last_word_char) != 0 &&
+        last_char_certainty <= unlikely_threshold) {
+      ScriptPos rpos;
+      YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
+                     &num_remainder_trailing);
+      // A partial-character remainder only counts if it agrees with the
+      // position of the whole trailing characters already found.
+      if (num_trailing > 0 && rpos != sp_trailing) {
+        num_remainder_trailing = 0;
+      }
+      if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
+        trailing_certainty = last_char_certainty;
+      }
+    }
+    bool another_blob_available =
+        (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
+    // Certainties are floats; keep the full value (an int here would truncate
+    // toward zero and skew both the threshold test and leading_certainty).
+    float first_char_certainty = word->best_choice->certainty(num_leading);
+    if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
+        first_char_certainty <= unlikely_threshold) {
+      ScriptPos lpos;
+      YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
+                     nullptr, nullptr);
+      if (num_leading > 0 && lpos != sp_leading) {
+        num_remainder_leading = 0;
+      }
+      if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
+        leading_certainty = first_char_certainty;
+      }
+    }
+  }
+
+  // If nothing to do, bail now.
+  if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
+    return false;
+  }
+
+  if (superscript_debug >= 1) {
+    tprintf("Candidate for superscript detection: %s (",
+            word->best_choice->unichar_string().c_str());
+    if (num_leading || num_remainder_leading) {
+      tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
+    }
+    if (num_trailing || num_remainder_trailing) {
+      tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
+    }
+    tprintf(")\n");
+  }
+  if (superscript_debug >= 3) {
+    word->best_choice->print();
+  }
+  if (superscript_debug >= 2) {
+    tprintf(" Certainties -- Average: %.2f  Unlikely thresh: %.2f  ", avg_certainty,
+            unlikely_threshold);
+    if (num_leading) {
+      tprintf("Orig. leading (min): %.2f  ", leading_certainty);
+    }
+    if (num_trailing) {
+      tprintf("Orig. trailing (min): %.2f  ", trailing_certainty);
+    }
+    tprintf("\n");
+  }
+
+  // We've now calculated the number of rebuilt blobs we want to carve off.
+  // However, split_word() works from TBLOBs in chopped_word, so we need to
+  // convert to those.
+  int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
+  int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
+
+  int retry_leading = 0;
+  int retry_trailing = 0;
+  bool is_good = false;
+  WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
+                                           num_chopped_trailing, trailing_certainty, sp_trailing,
+                                           word, &is_good, &retry_leading, &retry_trailing);
+  if (is_good) {
+    word->ConsumeWordResults(revised);
+  } else if (retry_leading || retry_trailing) {
+    // The first attempt was rejected but suggested smaller splits; try once
+    // more on the revised word with those counts.
+    int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
+    int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
+    WERD_RES *revised2 = TrySuperscriptSplits(
+        retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
+        trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
+    if (is_good) {
+      word->ConsumeWordResults(revised2);
+    }
+    delete revised2;
+  }
+  delete revised;
+  return is_good;
+}
+
+/**
+ * Work out how many characters (rebuilt blobs) at each end of a word could
+ * plausibly be superscripts or subscripts, so that SubAndSuperscriptFix can
+ * try to re-recognize them.  Even when no whole blob qualifies at either end,
+ * *unlikely_threshold is still set to a certainty usable for picking
+ * "bad enough" outlier characters.  If *unlikely_threshold comes back as 0,
+ * though, there's really no hope.
+ *
+ * @param[in]  word    The word to examine.
+ * @param[out] num_rebuilt_leading   the number of rebuilt blobs at the start
+ *                                   of the word which are all up or down and
+ *                                   seem badly classified.
+ * @param[out] leading_pos        "super" or "sub" (for debugging)
+ * @param[out] leading_certainty  the worst certainty in the leading blobs.
+ * @param[out] num_rebuilt_trailing   the number of rebuilt blobs at the end
+ *                                    of the word which are all up or down and
+ *                                    seem badly classified.
+ * @param[out] trailing_pos        "super" or "sub" (for debugging)
+ * @param[out] trailing_certainty  the worst certainty in the trailing blobs.
+ * @param[out] avg_certainty       the average certainty of "normal" blobs in
+ *                                 the word.
+ * @param[out] unlikely_threshold  the threshold (on certainty) we used to
+ *                                 select "bad enough" outlier characters.
+ */
+void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,
+                                               ScriptPos *leading_pos, float *leading_certainty,
+                                               int *num_rebuilt_trailing, ScriptPos *trailing_pos,
+                                               float *trailing_certainty, float *avg_certainty,
+                                               float *unlikely_threshold) {
+  // Give every output a neutral starting value.
+  *unlikely_threshold = 0.0f;
+  *avg_certainty = 0.0f;
+  *num_rebuilt_trailing = 0;
+  *num_rebuilt_leading = 0;
+  *trailing_certainty = 0.0f;
+  *leading_certainty = 0.0f;
+
+  int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
+  int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
+
+  // Step one: Get an average certainty for "normally placed" characters.
+
+  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
+  *trailing_pos = SP_NORMAL;
+  *leading_pos = SP_NORMAL;
+  int leading_outliers = 0;
+  int outlier_run = 0; // current run of same-position outliers; ends up as the trailing count
+  int normal_count = 0;
+  float normal_sum = 0.0f;
+  float normal_worst = 0.0f;
+  ScriptPos prev_pos = SP_NORMAL;
+  const int num_blobs = word->rebuild_word->NumBlobs();
+  for (int b = 0; b < num_blobs; ++b) {
+    TBOX box = word->rebuild_word->blobs[b]->bounding_box();
+    ScriptPos pos;
+    if (box.bottom() >= super_y_bottom) {
+      pos = SP_SUPERSCRIPT;
+    } else if (box.top() <= sub_y_top) {
+      pos = SP_SUBSCRIPT;
+    } else {
+      pos = SP_NORMAL;
+    }
+    if (pos != SP_NORMAL) {
+      // Extend the run if the position repeats, otherwise start a new one.
+      outlier_run = (prev_pos == pos) ? outlier_run + 1 : 1;
+    } else {
+      // Unichar id 0 is excluded from the statistics.
+      if (word->best_choice->unichar_id(b) != 0) {
+        const float certainty = word->best_choice->certainty(b);
+        if (certainty < normal_worst) {
+          normal_worst = certainty;
+        }
+        ++normal_count;
+        normal_sum += certainty;
+      }
+      if (outlier_run == b) {
+        // Everything before this blob was one unbroken outlier run.
+        leading_outliers = outlier_run;
+        *leading_pos = prev_pos;
+      }
+      outlier_run = 0;
+    }
+    prev_pos = pos;
+  }
+  *trailing_pos = prev_pos;
+  const int trailing_outliers = outlier_run;
+
+  if (normal_count >= 3) { // throw out the worst as an outlier.
+    --normal_count;
+    normal_sum -= normal_worst;
+  }
+  if (normal_count == 0) {
+    // No normally placed characters: leave the average and threshold at 0.
+    return;
+  }
+  *avg_certainty = normal_sum / normal_count;
+  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
+  if (leading_outliers == 0 && trailing_outliers == 0) {
+    return;
+  }
+
+  // Step two: Try to split off bits of the word that are both outliers
+  //           and have much lower certainty than average
+  // Calculate num_leading and leading_certainty.
+  *leading_certainty = 0.0f;
+  *num_rebuilt_leading = 0;
+  while (*num_rebuilt_leading < leading_outliers) {
+    const float certainty = word->best_choice->certainty(*num_rebuilt_leading);
+    if (certainty > *unlikely_threshold) {
+      break;
+    }
+    if (certainty < *leading_certainty) {
+      *leading_certainty = certainty;
+    }
+    ++(*num_rebuilt_leading);
+  }
+
+  // Calculate num_trailing and trailing_certainty.
+  *trailing_certainty = 0.0f;
+  *num_rebuilt_trailing = 0;
+  while (*num_rebuilt_trailing < trailing_outliers) {
+    const int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
+    const float certainty = word->best_choice->certainty(blob_idx);
+    if (certainty > *unlikely_threshold) {
+      break;
+    }
+    if (certainty < *trailing_certainty) {
+      *trailing_certainty = certainty;
+    }
+    ++(*num_rebuilt_trailing);
+  }
+}
+
+/**
+ * Try splitting off the given number of (chopped) blobs from the front and
+ * back of the given word and recognizing the pieces.
+ *
+ * @param[in]  num_chopped_leading   how many chopped blobs from the left
+ *                    end of the word to chop off and try recognizing as a
+ *                    superscript (or subscript)
+ * @param[in]  leading_certainty     the (minimum) certainty had by the
+ *                    characters in the original leading section.
+ * @param[in]  leading_pos    "super" or "sub" (for debugging)
+ * @param[in]  num_chopped_trailing  how many chopped blobs from the right
+ *                    end of the word to chop off and try recognizing as a
+ *                    superscript (or subscript)
+ * @param[in]  trailing_certainty    the (minimum) certainty had by the
+ *                    characters in the original trailing section.
+ * @param[in]  trailing_pos      "super" or "sub" (for debugging)
+ * @param[in]  word              the word to try to chop up.
+ * @param[out] is_good           do we believe our result?
+ * @param[out] retry_rebuild_leading, retry_rebuild_trailing
+ *         If non-zero, and !is_good, then the caller may have luck trying
+ *         to split the returned word with this number of (rebuilt) leading
+ *         and trailing blobs / unichars.
+ * @return A word which is the result of re-recognizing as asked, or nullptr
+ *         when the result is not good and no retry is suggested.  The caller
+ *         owns (and must delete) a non-null result.
+ */
+WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,
+                                          ScriptPos leading_pos, int num_chopped_trailing,
+                                          float trailing_certainty, ScriptPos trailing_pos,
+                                          WERD_RES *word, bool *is_good, int *retry_rebuild_leading,
+                                          int *retry_rebuild_trailing) {
+  int num_chopped = word->chopped_word->NumBlobs();
+
+  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
+
+  // Chop apart the word into up to three pieces.
+
+  BlamerBundle *bb0 = nullptr;
+  BlamerBundle *bb1 = nullptr;
+  WERD_RES *prefix = nullptr;
+  WERD_RES *core = nullptr;
+  WERD_RES *suffix = nullptr;
+  if (num_chopped_leading > 0) {
+    prefix = new WERD_RES(*word);
+    split_word(prefix, num_chopped_leading, &core, &bb0);
+  } else {
+    core = new WERD_RES(*word);
+  }
+
+  if (num_chopped_trailing > 0) {
+    // core no longer contains the leading blobs, so the split point is
+    // measured from the start of core.
+    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
+    split_word(core, split_pt, &suffix, &bb1);
+  }
+
+  //  Recognize the pieces in turn.
+  int saved_cp_multiplier = classify_class_pruner_multiplier;
+  int saved_im_multiplier = classify_integer_matcher_multiplier;
+  if (prefix) {
+    // Turn off Tesseract's y-position penalties for the leading superscript.
+    classify_class_pruner_multiplier.set_value(0);
+    classify_integer_matcher_multiplier.set_value(0);
+
+    // Adjust our expectations about the baseline for this prefix.
+    if (superscript_debug >= 3) {
+      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
+    }
+    recog_word_recursive(prefix);
+    if (superscript_debug >= 2) {
+      tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
+              prefix->best_choice->unichar_string().c_str());
+    }
+
+    // Restore the normal y-position penalties.
+    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
+    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
+  }
+
+  if (superscript_debug >= 3) {
+    tprintf(" recognizing middle %d chopped blobs\n",
+            num_chopped - num_chopped_leading - num_chopped_trailing);
+  }
+
+  if (suffix) {
+    // Turn off Tesseract's y-position penalties for the trailing superscript.
+    classify_class_pruner_multiplier.set_value(0);
+    classify_integer_matcher_multiplier.set_value(0);
+
+    if (superscript_debug >= 3) {
+      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
+    }
+    recog_word_recursive(suffix);
+    if (superscript_debug >= 2) {
+      tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
+              suffix->best_choice->unichar_string().c_str());
+    }
+
+    // Restore the normal y-position penalties.
+    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
+    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
+  }
+
+  // Evaluate whether we think the results are believably better
+  // than what we already had.
+  bool good_prefix =
+      !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
+                                       superscript_bettered_certainty * leading_certainty,
+                                       retry_rebuild_leading, nullptr);
+  bool good_suffix =
+      !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
+                                       superscript_bettered_certainty * trailing_certainty, nullptr,
+                                       retry_rebuild_trailing);
+
+  *is_good = good_prefix && good_suffix;
+  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
+    // None of it is any good. Quit now.
+    delete core;
+    delete prefix;
+    delete suffix;
+    // On this early exit the bundles returned by split_word() are never
+    // handed to join_words(), so release both of them here (either may be
+    // null, which delete tolerates).
+    delete bb0;
+    delete bb1;
+    return nullptr;
+  }
+  // The core is only recognized once we know the result will be used.
+  recog_word_recursive(core);
+
+  // Now paste the results together into core.
+  if (suffix) {
+    suffix->SetAllScriptPositions(trailing_pos);
+    join_words(core, suffix, bb1);
+  }
+  if (prefix) {
+    prefix->SetAllScriptPositions(leading_pos);
+    join_words(prefix, core, bb0);
+    core = prefix;
+    prefix = nullptr;
+  }
+
+  if (superscript_debug >= 1) {
+    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
+            core->best_choice->unichar_string().c_str());
+  }
+  return core;
+}
+
+/**
+ * Decide whether a word is believable as superscript or subscript text.
+ *
+ * A character is rejected when any of the following holds:
+ *   + it is a punctuation mark,
+ *   + it is italic,
+ *   + it is a normal-sized character that is smaller than
+ *     superscript_scaledown_ratio of what it ought to be, or
+ *   + its certainty is below certainty_threshold.
+ *
+ *  @param[in]  debug  If true, spew debug output
+ *  @param[in]  word   The word whose best_choice we're evaluating
+ *  @param[in]  certainty_threshold   If any of the characters have less
+ *                    certainty than this, reject.
+ *  @param[out]  left_ok  How many left-side characters were ok?  May be null;
+ *                    only written when the function returns false.
+ *  @param[out]  right_ok  How many right-side characters were ok?  May be
+ *                    null; only written when the function returns false.
+ *  @return  Whether the complete best choice is believable as a superscript.
+ */
+bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
+                                      int *left_ok, int *right_ok) const {
+  const WERD_CHOICE &wc = *word.best_choice;
+  const UnicityTable<FontInfo> &fonts = get_fontinfo_table();
+
+  int leading_ok_run = 0; // acceptable characters before the first rejected one
+  int current_ok_run = 0; // acceptable characters since the last rejected one
+  float lowest_certainty = 0.0f;
+
+  for (int i = 0; i < wc.length(); i++) {
+    TBLOB *blob = word.rebuild_word->blobs[i];
+    UNICHAR_ID unichar_id = wc.unichar_id(i);
+    float char_certainty = wc.certainty(i);
+    bool bad_certainty = char_certainty < certainty_threshold;
+    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
+
+    // Start from the word-level font, then refine per character if possible.
+    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
+    BLOB_CHOICE *choice = word.GetBlobChoice(i);
+    if (choice && fonts.size() > 0) {
+      // Get better information from the specific choice, if available.
+      const int primary_font = choice->fontinfo_id();
+      const bool primary_italic = primary_font >= 0 && fonts.at(primary_font).is_italic();
+      const int secondary_font = choice->fontinfo_id2();
+      is_italic = primary_italic && (secondary_font < 0 || fonts.at(secondary_font).is_italic());
+    }
+
+    float char_height = blob->bounding_box().height();
+    float normal_height = char_height;
+    float height_fraction = 1.0f;
+    if (wc.unicharset()->top_bottom_useful()) {
+      int min_bot, max_bot, min_top, max_top;
+      wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
+      float tallest = max_top - max_bot;
+      float shortest = min_top - min_bot;
+      normal_height = (tallest + shortest) / 2;
+      if (normal_height >= kBlnXHeight) {
+        // Only ding characters that we have decent information for because
+        // they're supposed to be normal sized, not tiny specks or dashes.
+        height_fraction = char_height / normal_height;
+      }
+    }
+    bool bad_height = height_fraction < superscript_scaledown_ratio;
+
+    if (debug) {
+      if (is_italic) {
+        tprintf(" Rejecting: superscript is italic.\n");
+      }
+      if (is_punc) {
+        tprintf(" Rejecting: punctuation present.\n");
+      }
+      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
+      if (bad_certainty) {
+        tprintf(
+            " Rejecting: don't believe character %s with certainty %.2f "
+            "which is less than threshold %.2f\n",
+            char_str, char_certainty, certainty_threshold);
+      }
+      if (bad_height) {
+        tprintf(
+            " Rejecting: character %s seems too small @ %.2f versus "
+            "expected %.2f\n",
+            char_str, char_height, normal_height);
+      }
+    }
+
+    const bool acceptable = !(bad_certainty || bad_height || is_punc || is_italic);
+    if (acceptable) {
+      ++current_ok_run;
+    } else {
+      if (current_ok_run == i) {
+        // First rejection so far: everything before it forms the leading run.
+        leading_ok_run = current_ok_run;
+      }
+      current_ok_run = 0;
+    }
+    if (char_certainty < lowest_certainty) {
+      lowest_certainty = char_certainty;
+    }
+  }
+
+  const bool all_ok = current_ok_run == wc.length();
+  if (all_ok) {
+    if (debug) {
+      tprintf(" Accept: worst revised certainty is %.2f\n", lowest_certainty);
+    }
+    return true;
+  }
+  // Rejected: report how much of each end was acceptable so the caller can
+  // retry with a smaller split.
+  if (left_ok != nullptr) {
+    *left_ok = leading_ok_run;
+  }
+  if (right_ok != nullptr) {
+    *right_ok = current_ok_run;
+  }
+  return false;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessbox.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessbox.cpp
@ -0,0 +1,76 @@
+/**********************************************************************
+ * File:        tessbox.cpp  (Formerly tessbox.c)
+ * Description: Black boxed Tess for developing a resaljet.
+ * Author:      Ray Smith
+ * Created:     Thu Apr 23 11:03:36 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "mfoutline.h"
+#include "tesseractclass.h"
+
+/**
+ * @name tess_segment_pass_n
+ *
+ * Segment a word using the pass_n conditions of the tess segmenter.
+ * @param pass_n pass number
+ * @param word word to do
+ */
+
+namespace tesseract {
+void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
+  // Backups of the two parameters that are forced off for W_DONT_CHOP words.
+  int assoc_backup = 0;
+  int chop_backup = 0;
+
+  if (word->word->flag(W_DONT_CHOP)) {
+    assoc_backup = wordrec_enable_assoc;
+    chop_backup = chop_enable;
+    wordrec_enable_assoc.set_value(false);
+    chop_enable.set_value(false);
+  }
+
+  // Pass 1 gets its own setup; every other pass number uses the pass-2 setup.
+  switch (pass_n) {
+    case 1:
+      set_pass1();
+      break;
+    default:
+      set_pass2();
+      break;
+  }
+
+  recog_word(word);
+  if (!word->best_choice) {
+    // Recognition produced no choice at all: install a fake result.
+    word->SetupFake(*word->uch_set);
+  }
+
+  if (word->word->flag(W_DONT_CHOP)) {
+    // Put the overridden parameters back.
+    wordrec_enable_assoc.set_value(assoc_backup);
+    chop_enable.set_value(chop_backup);
+  }
+}
+
+/**
+ * @name tess_acceptable_word
+ *
+ * Thin wrapper that forwards the word to getDict().AcceptableResult().
+ * @param word the recognized word result to evaluate
+ * @return true if the word is regarded as "good enough".
+ */
+bool Tesseract::tess_acceptable_word(WERD_RES *word) {
+  return getDict().AcceptableResult(word);
+}
+
+/**
+ * @name tess_add_doc_word
+ * Add the given word to the document dictionary (getDict().add_document_word()).
+ * @param word_choice word to add; dereferenced unconditionally, must not be null.
+ */
+void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
+  getDict().add_document_word(*word_choice);
+}
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessedit.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessedit.cpp
@ -0,0 +1,463 @@
+/**********************************************************************
+ * File:        tessedit.cpp  (Formerly tessedit.c)
+ * Description: (Previously) Main program for merge of tess and editor.
+ *              Now just code to load the language model and various
+ *              engine-specific data files.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "control.h"
+#include "matchdefs.h"
+#include "pageres.h"
+#include "params.h"
+#include "stopper.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "tprintf.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "chop.h"
+#  include "intmatcher.h"
+#  include "reject.h"
+#endif
+#include "lstmrecognizer.h"
+
+namespace tesseract {
+
+// Reads a "config" file of variable, value pairs into this instance's params.
+// The name is tried as <datadir>configs/<filename>, then as
+// <datadir>tessconfigs/<filename>; if neither can be opened, filename is used
+// as given, i.e. as a relative or absolute path name.
+void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
+  static const char *const kConfigDirs[] = {"configs/", "tessconfigs/"};
+  // Used unchanged when none of the standard places has the file.
+  std::string chosen = filename;
+  for (const char *dir : kConfigDirs) {
+    std::string candidate = datadir;
+    candidate += dir;
+    candidate += filename;
+    // The open is only an existence probe: the handle is closed at once and
+    // just the path is handed on.
+    FILE *probe = fopen(candidate.c_str(), "rb");
+    if (probe != nullptr) {
+      fclose(probe);
+      chosen = candidate;
+      break;
+    }
+  }
+  ParamUtils::ReadParamsFile(chosen.c_str(), constraint, this->params());
+}
+
+// Loads the data for one language: params, unicharset and, unless built with
+// DISABLED_LEGACY_ENGINE, the ambiguities and params model. Initializes *mgr
+// from [lang].traineddata if it is not loaded yet. Returns false if that file
+// cannot be opened, if the unicharset is missing, invalid or larger than
+// MAX_NUM_CLASSES, or if the params model fails to load.
+// In legacy-enabled builds tessedit_ocr_engine_mode is set to oem, unless oem
+// is OEM_DEFAULT: then it is first derived from the engines available in the
+// traineddata and may be overridden by the language config, the config files
+// and vars_vec, in that order. An empty language means "eng".
+// vars_values is indexed in step with vars_vec (sizes are not checked here).
+bool Tesseract::init_tesseract_lang_data(const std::string &arg0, const std::string &textbase,
+                                         const std::string &language, OcrEngineMode oem,
+                                         char **configs, int configs_size,
+                                         const std::vector<std::string> *vars_vec,
+                                         const std::vector<std::string> *vars_values,
+                                         bool set_only_non_debug_params, TessdataManager *mgr) {
+  // Set the basename, compute the data directory.
+  main_setup(arg0, textbase);
+
+  // Set the language data path prefix: <datadir><lang>.
+  lang = !language.empty() ? language : "eng";
+  language_data_path_prefix = datadir;
+  language_data_path_prefix += lang;
+  language_data_path_prefix += ".";
+
+  // Initialize TessdataManager.
+  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
+  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { // Init only if not loaded.
+    tprintf("Error opening data file %s\n", tessdata_path.c_str());
+    tprintf(
+        "Please make sure the TESSDATA_PREFIX environment variable is set"
+        " to your \"tessdata\" directory.\n");
+    return false;
+  }
+#ifdef DISABLED_LEGACY_ENGINE
+  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
+#else
+  if (oem == OEM_DEFAULT) {
+    // Set the engine mode from availability, which can then be overridden by
+    // the config file when we read it below.
+    if (!mgr->IsLSTMAvailable()) {
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    } else if (!mgr->IsBaseAvailable()) {
+      tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
+    } else {
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
+    }
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  // If a language specific config file (lang.config) exists, load it in.
+  TFile fp;
+  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
+    ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
+  }
+
+  SetParamConstraint set_params_constraint =
+      set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
+  // Load tesseract variables from config files. This is done after loading
+  // language-specific variables from [lang].traineddata file, so that custom
+  // config files can override values in [lang].traineddata file.
+  for (int i = 0; i < configs_size; ++i) {
+    read_config_file(configs[i], set_params_constraint);
+  }
+
+  // Set params specified in vars_vec (done after setting params from config
+  // files, so that params in vars_vec can override those from files).
+  if (vars_vec != nullptr && vars_values != nullptr) {
+    for (unsigned i = 0; i < vars_vec->size(); ++i) { // Reads vars_values[i] for each name.
+      if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
+                                set_params_constraint, this->params())) {
+        tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
+      }
+    }
+  }
+
+  if (!tessedit_write_params_to_file.empty()) { // Optional dump of all params to a file.
+    FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
+    if (params_file != nullptr) {
+      ParamUtils::PrintParams(params_file, this->params());
+      fclose(params_file);
+    } else {
+      tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
+    }
+  }
+
+#ifndef DISABLED_LEGACY_ENGINE
+  // An explicitly requested engine mode wins over anything set from configs.
+  if (oem != OEM_DEFAULT) {
+    tessedit_ocr_engine_mode.set_value(oem);
+  }
+#endif
+
+  // If we are only loading the config file (and so not planning on doing any
+  // recognition) then there's nothing else to do here.
+  if (tessedit_init_config_only) {
+    return true;
+  }
+
+// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
+// which engine-specific data files need to be loaded. If LSTM_ONLY is
+// requested, the base Tesseract files are *Not* required.
+#ifdef DISABLED_LEGACY_ENGINE
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
+#else
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
+      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
+#endif // ndef DISABLED_LEGACY_ENGINE
+    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
+      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
+      ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
+    } else {
+      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    }
+  }
+
+  // Load the unicharset
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
+    // Avoid requiring a unicharset when we aren't running base tesseract.
+    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
+    tprintf(
+        "Error: Tesseract (legacy) engine requested, but components are "
+        "not present in %s!!\n",
+        tessdata_path.c_str());
+    return false;
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+  if (unicharset.size() > MAX_NUM_CLASSES) {
+    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
+    return false;
+  }
+  right_to_left_ = unicharset.major_right_to_left();
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+  // Setup initial unichar ambigs table and read universal ambigs.
+  UNICHARSET encoder_unicharset;
+  encoder_unicharset.CopyFrom(unicharset);
+  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
+  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
+
+  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
+    unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
+                                     use_ambigs_for_adaption, &unicharset);
+  }
+
+  // Init ParamsModel.
+  // Load pass1 and pass2 weights (for now these two sets are the same, but in
+  // the future separate sets of weights can be generated).
+  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
+    language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
+    if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) { // A missing component is not an error.
+      if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
+        return false;
+      }
+    }
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  return true;
+}
+
+// Helper reports whether str equals any element of str_list.
+// Returns false for an empty list.
+static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
+  bool found = false;
+  // Stop scanning as soon as the first match is seen.
+  for (auto it = str_list.begin(); it != str_list.end() && !found; ++it) {
+    found = (*it == str);
+  }
+  return found;
+}
+
+// Parse a string of the form [~]<lang>[+[~]<lang>]*.
+// Langs with no prefix get appended to to_load, provided they
+// are not in there already.
+// Langs with ~ prefix get appended to not_to_load, provided they are not in
+// there already.
+// Empty lang codes, as produced by a trailing '+' or a '~' with nothing
+// after it, are ignored instead of being stored as "": an empty entry in
+// to_load would later be loaded as the default language.
+void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
+                                    std::vector<std::string> *not_to_load) {
+  std::string remains(lang_str);
+  while (!remains.empty()) {
+    // Find the start of the lang code and which vector to add to.
+    const char *start = remains.c_str();
+    while (*start == '+') {
+      ++start;
+    }
+    std::vector<std::string> *target = to_load;
+    if (*start == '~') {
+      target = not_to_load;
+      ++start;
+    }
+    // The lang code runs up to the next '+', or to the end of the string.
+    const char *plus = strchr(start, '+');
+    const std::size_t end =
+        plus != nullptr ? static_cast<std::size_t>(plus - start) : strlen(start);
+    const std::string lang_code(start, end);
+    // start points into remains, so copy the tail before overwriting it.
+    const std::string next(start + end);
+    remains = next;
+    // Skip empty codes; add the rest unless already in the target vector.
+    if (!lang_code.empty() && !IsStrInList(lang_code, *target)) {
+      target->push_back(lang_code);
+    }
+  }
+}
+
+// Initializes this instance, and any sub-languages, from a language string.
+// The string is split with ParseLanguageString. The first language that loads
+// goes into this object (the primary); each later one is loaded into a new
+// Tesseract that is kept in sub_langs_. The tessedit_load_sublangs value of
+// every language that loads is parsed too, so the languages it names are
+// loaded as well.
+// Returns 0 on success, -1 if no language could be loaded.
+// See init_tesseract_internal for args.
+int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
+                              const std::string &language, OcrEngineMode oem, char **configs,
+                              int configs_size, const std::vector<std::string> *vars_vec,
+                              const std::vector<std::string> *vars_values,
+                              bool set_only_non_debug_params, TessdataManager *mgr) {
+  std::vector<std::string> wanted;
+  std::vector<std::string> excluded;
+  ParseLanguageString(language, &wanted, &excluded);
+
+  // Drop the sub-languages currently held before loading new ones.
+  for (auto *old_sub : sub_langs_) {
+    delete old_sub;
+  }
+  sub_langs_.clear();
+
+  bool have_primary = false;
+  // wanted may grow inside the loop, so walk it by index.
+  for (unsigned idx = 0; idx < wanted.size(); ++idx) {
+    if (IsStrInList(wanted[idx], excluded)) {
+      continue;
+    }
+    const char *code = wanted[idx].c_str();
+    // Until a primary has loaded, load into this object; afterwards each
+    // language gets its own instance.
+    Tesseract *target = have_primary ? new Tesseract : this;
+
+    const int status = target->init_tesseract_internal(arg0, textbase, code, oem, configs,
+                                                       configs_size, vars_vec, vars_values,
+                                                       set_only_non_debug_params, mgr);
+    // Forget that language, but keep any reader we were given.
+    mgr->Clear();
+
+    if (status < 0) {
+      tprintf("Failed loading language '%s'\n", code);
+      if (have_primary) {
+        // Only sub-language instances were allocated here.
+        delete target;
+      }
+      continue;
+    }
+    if (have_primary) {
+      sub_langs_.push_back(target);
+    } else {
+      have_primary = true;
+    }
+    // Add any languages that this language requires.
+    ParseLanguageString(target->tessedit_load_sublangs.c_str(), &wanted, &excluded);
+  }
+  if (!have_primary) {
+    tprintf("Tesseract couldn't load any languages!\n");
+    return -1; // Couldn't load any language!
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  if (!sub_langs_.empty()) {
+    // In multilingual mode word ratings have to be directly comparable,
+    // so use the same language model weights for all languages:
+    // use the primary language's params model if
+    // tessedit_use_primary_params_model is set,
+    // otherwise use default language model weights.
+    if (tessedit_use_primary_params_model) {
+      for (auto *sub : sub_langs_) {
+        sub->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
+      }
+      tprintf("Using params model of the primary language\n");
+    } else {
+      this->language_model_->getParamsModel().Clear();
+      for (auto *sub : sub_langs_) {
+        sub->language_model_->getParamsModel().Clear();
+      }
+    }
+  }
+
+  SetupUniversalFontIds();
+#endif // ndef DISABLED_LEGACY_ENGINE
+  return 0;
+}
+
+// Common initialization for a single language.
+// arg0: the datapath for the tessdata directory. This is either the path of
+//   the tessdata directory with no trailing /, or, if tessdata lives in the
+//   same directory as the executable, the path of the executable (hence the
+//   name arg0).
+// textbase: an optional output file basename (used only for training).
+// language: the language code to load.
+// oem: controls which engine(s) will operate on the image.
+// configs (argv): an array of config filenames to load variables from.
+//   May be nullptr.
+// configs_size (argc): the number of elements in configs.
+// vars_vec: an optional vector of variables to set.
+// vars_values: an optional corresponding vector of values for the variables
+//   in vars_vec.
+// set_only_non_debug_params: if true, only params that do not contain
+//   "debug" in the name will be set.
+// Returns 0 on success (also when only the config was to be loaded) and -1
+// if the language data could not be loaded.
+int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
+                                       const std::string &language, OcrEngineMode oem,
+                                       char **configs, int configs_size,
+                                       const std::vector<std::string> *vars_vec,
+                                       const std::vector<std::string> *vars_values,
+                                       bool set_only_non_debug_params, TessdataManager *mgr) {
+  const bool lang_data_ok =
+      init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec,
+                               vars_values, set_only_non_debug_params, mgr);
+  if (!lang_data_ok) {
+    return -1;
+  }
+  if (!tessedit_init_config_only) {
+    // If only LSTM will be used, skip loading Tesseract classifier's
+    // pre-trained templates and dictionary by handing over no manager.
+    TessdataManager *legacy_mgr = (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) ? mgr : nullptr;
+    program_editup(textbase, legacy_mgr, legacy_mgr);
+  }
+  return 0; // Normal exit
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Helper pushes every entry of new_fonts, in order, onto all_fonts.
+static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
+                         UnicityTable<FontInfo> *all_fonts) {
+  int idx = 0;
+  while (idx < new_fonts.size()) {
+    // UnicityTable uniques as we go.
+    all_fonts->push_back(new_fonts.at(idx));
+    ++idx;
+  }
+}
+
+// Helper stores in each font of lang_fonts, as its universal_id, the id that
+// all_fonts reports for that font.
+static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
+  for (int pos = 0; pos < lang_fonts->size(); ++pos) {
+    auto &font = lang_fonts->at(pos);
+    font.universal_id = all_fonts.get_id(font);
+  }
+}
+
+// Gives every font of the primary language and of all sub-languages a
+// universal_id taken from one table shared by all of them, and records the
+// size of that table in font_table_size_.
+void Tesseract::SetupUniversalFontIds() {
+  // Note that we can get away with bitwise copying FontInfo in
+  // this table, as it is a temporary structure and we avoid setting the
+  // delete callback.
+  UnicityTable<FontInfo> universal;
+
+  // First pass: gather the fonts of every language into the shared table.
+  CollectFonts(get_fontinfo_table(), &universal);
+  for (auto *sub : sub_langs_) {
+    CollectFonts(sub->get_fontinfo_table(), &universal);
+  }
+
+  // Second pass: write the ids from the shared table into each font table.
+  AssignIds(universal, &get_fontinfo_table());
+  for (auto *sub : sub_langs_) {
+    AssignIds(universal, &sub->get_fontinfo_table());
+  }
+
+  font_table_size_ = universal.size();
+}
+
+// Init the LM component: loads the language data with OEM_TESSERACT_ONLY and
+// no config files or variables, then loads the dictionary for lang.
+// Returns 0 on success, -1 if the language data could not be loaded.
+int Tesseract::init_tesseract_lm(const std::string &arg0, const std::string &textbase,
+                                 const std::string &language, TessdataManager *mgr) {
+  const bool lang_data_ok = init_tesseract_lang_data(
+      arg0, textbase, language, OEM_TESSERACT_ONLY, nullptr, 0, nullptr, nullptr, false, mgr);
+  if (!lang_data_ok) {
+    return -1;
+  }
+  auto &dict = getDict();
+  dict.SetupForLoad(Dict::GlobalDawgCache());
+  dict.Load(lang, mgr);
+  dict.FinishLoad();
+  return 0;
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+void Tesseract::end_tesseract() { // Thin wrapper: only forwards to end_recog().
+  end_recog(); // Any value end_recog() returns is not used here.
+}
+
+/* Command type identifiers; the values run from 0 in declaration order. */
+
+enum CMD_EVENTS {
+  ACTION_1_CMD_EVENT,
+  RECOG_WERDS,
+  RECOG_PSEUDO,
+  ACTION_2_CMD_EVENT
+};
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.cpp
@ -0,0 +1,574 @@
+///////////////////////////////////////////////////////////////////////
+// File:        tesseractclass.cpp
+// Description: The Tesseract class. It holds/owns everything needed
+//              to run Tesseract on a single language, and also a set of
+//              sub-Tesseracts to run sub-languages. For thread safety, *every*
+//              variable that was previously global or static (except for
+//              constant data, and some visual debugging flags) has been moved
+//              in here, directly, or indirectly.
+//              This makes it safe to run multiple Tesseracts in different
+//              threads in parallel, and keeps the different language
+//              instances separate.
+//              Some global functions remain, but they are isolated re-entrant
+//              functions that operate on their arguments. Functions that work
+//              on variable data have been moved to an appropriate class based
+//              mostly on the directory hierarchy. For more information see
+//              slide 6 of "2ArchitectureAndDataStructures" in
+// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
+//              Some global data and related functions still exist in the
+//              training-related code, but they don't interfere with normal
+//              recognition operation.
+// Author:      Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h"
+#endif
+
+#include "tesseractclass.h"
+
+#include <allheaders.h>
+#include "edgblob.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "equationdetect.h"
+#endif
+#include "lstmrecognizer.h"
+
+namespace tesseract {
+
+Tesseract::Tesseract()
+    : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
+                  "Take segmentation and labeling from box file", this->params())
+    , BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
+                  "Conversion of word/line box file to char box file", this->params())
+    , BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars",
+                  this->params())
+    , BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars",
+                  this->params())
+    , BOOL_MEMBER(tessedit_train_line_recognizer, false,
+                  "Break input into lines and remap boxes if present", this->params())
+    , BOOL_MEMBER(tessedit_dump_pageseg_images, false,
+                  "Dump intermediate images made during page segmentation", this->params())
+    , BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
+                  this->params())
+    ,
+    // The default for pageseg_mode is the old behaviour, so as not to
+    // upset anything that relies on that.
+    INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+               "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, "
+               "4=column,"
+               " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
+               "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
+               " (Values from PageSegMode enum in tesseract/publictypes.h)",
+               this->params())
+    , INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
+                      "Which OCR engine(s) to run (Tesseract, LSTM, both)."
+                      " Defaults to loading and running the most accurate"
+                      " available.",
+                      this->params())
+    , STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize",
+                    this->params())
+    , STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params())
+    , STRING_MEMBER(tessedit_char_unblacklist, "",
+                    "List of chars to override tessedit_char_blacklist", this->params())
+    , BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities",
+                  this->params())
+    , INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
+                 "Whether to use the top-line splitting process for Devanagari "
+                 "documents while performing page-segmentation.",
+                 this->params())
+    , INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
+                 "Whether to use the top-line splitting process for Devanagari "
+                 "documents while performing ocr.",
+                 this->params())
+    , STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.",
+                    this->params())
+    , BOOL_MEMBER(tessedit_adaption_debug, false,
+                  "Generate and print debug"
+                  " information for adaption",
+                  this->params())
+    , INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params())
+    , INT_MEMBER(applybox_debug, 1, "Debug level", this->params())
+    , INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params())
+    , STRING_MEMBER(applybox_exposure_pattern, ".exp",
+                    "Exposure value follows"
+                    " this pattern in the image filename. The name of the image"
+                    " files are expected to be in the form"
+                    " [lang].[fontname].exp[num].tif",
+                    this->params())
+    , BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
+                  "Learn both character fragments (as is done in the"
+                  " special low exposure mode) as well as unfragmented"
+                  " characters.",
+                  this->params())
+    , BOOL_MEMBER(applybox_learn_ngrams_mode, false,
+                  "Each bounding box"
+                  " is assumed to contain ngrams. Only learn the ngrams"
+                  " whose outlines overlap horizontally.",
+                  this->params())
+    , BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params())
+    , BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params())
+    , BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params())
+    , BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params())
+    , BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility",
+                  this->params())
+    , BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params())
+    , BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",
+                  this->params())
+    , BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())
+    , INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())
+    , BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())
+    , BOOL_MEMBER(tessedit_enable_bigram_correction, true,
+                  "Enable correction based on the word bigram dictionary.", this->params())
+    , BOOL_MEMBER(tessedit_enable_dict_correction, false,
+                  "Enable single word correction based on the dictionary.", this->params())
+    , INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.",
+                 this->params())
+    , BOOL_MEMBER(enable_noise_removal, true,
+                  "Remove and conditionally reassign small outlines when they"
+                  " confuse layout analysis, determining diacritics vs noise",
+                  this->params())
+    , INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params())
+    ,
+    // Worst (min) certainty, for which a diacritic is allowed to make the
+    // base
+    // character worse and still be included.
+    double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params())
+    ,
+    // Worst (min) certainty, for which a non-overlapping diacritic is allowed
+    // to make the base character worse and still be included.
+    double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params())
+    ,
+    // Worst (min) certainty, for which a diacritic is allowed to make a new
+    // stand-alone blob.
+    double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params())
+    ,
+    // Factor of certainty margin for adding diacritics to not count as worse.
+    double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint",
+                  this->params())
+    , INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params())
+    , INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params())
+    , INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params())
+    , STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params())
+    , STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params())
+    , STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params())
+    , double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params())
+    , double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params())
+    , double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit",
+                    this->params())
+    , double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params())
+    , INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params())
+    , INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess",
+                 this->params())
+    , BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output",
+                  this->params())
+    , BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params())
+    , BOOL_MEMBER(test_pt, false, "Test for point", this->params())
+    , double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params())
+    , double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params())
+    , INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params())
+    , INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params())
+    , BOOL_MEMBER(paragraph_text_based, true,
+                  "Run paragraph detection on the post-text-recognition "
+                  "(more accurate)",
+                  this->params())
+    , BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params())
+    , STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params())
+    , STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params())
+    , BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs",
+                  this->params())
+    , BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params())
+    , double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc",
+                    this->params())
+    , double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block",
+                    this->params())
+    , double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row",
+                    this->params())
+    , double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
+                    "Number of row rejects in whole word rejects"
+                    " which prevents whole row rejection",
+                    this->params())
+    , BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
+                  "Only rej partially rejected words in block rejection", this->params())
+    , BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
+                  "Only rej partially rejected words in row rejection", this->params())
+    , BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric",
+                  this->params())
+    , BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric",
+                  this->params())
+    , INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this",
+                 this->params())
+    , BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs",
+                  this->params())
+    , double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
+                    "rej good doc wd if more than this fraction rejected", this->params())
+    , BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params())
+    , BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params())
+    , BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file",
+                  this->params())
+    , BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params())
+    , double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params())
+    , BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params())
+    , BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
+    , BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
+                  this->params())
+    , BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
+    , BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
+    , double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
+    , BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params())
+    , double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params())
+    , double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params())
+    , double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params())
+    , double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params())
+    , double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params())
+    , double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params())
+    , double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params())
+    , double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params())
+    , double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params())
+    , double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params())
+    , double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params())
+    , double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params())
+    , INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params())
+    , INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params())
+    , BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params())
+    , BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params())
+    , BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings",
+                  this->params())
+    , BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params())
+    , INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings",
+                 this->params())
+    , INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings",
+                 this->params())
+    , INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params())
+    , INT_MEMBER(crunch_debug, 0, "As it says", this->params())
+    , INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params())
+    , double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params())
+    , BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params())
+    , INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params())
+    , INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params())
+    , STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params())
+    , INT_MEMBER(x_ht_acceptance_tolerance, 8,
+                 "Max allowed deviation of blob top outside of font data", this->params())
+    , INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params())
+    , INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params())
+    , double_MEMBER(superscript_worse_certainty, 2.0,
+                    "How many times worse "
+                    "certainty does a superscript position glyph need to be for "
+                    "us to try classifying it as a char with a different "
+                    "baseline?",
+                    this->params())
+    , double_MEMBER(superscript_bettered_certainty, 0.97,
+                    "What reduction in "
+                    "badness do we think sufficient to choose a superscript "
+                    "over what we'd thought.  For example, a value of 0.6 means "
+                    "we want to reduce badness of certainty by at least 40%",
+                    this->params())
+    , double_MEMBER(superscript_scaledown_ratio, 0.4,
+                    "A superscript scaled down more than this is unbelievably "
+                    "small.  For example, 0.3 means we expect the font size to "
+                    "be no smaller than 30% of the text line font size.",
+                    this->params())
+    , double_MEMBER(subscript_max_y_top, 0.5,
+                    "Maximum top of a character measured as a multiple of "
+                    "x-height above the baseline for us to reconsider whether "
+                    "it's a subscript.",
+                    this->params())
+    , double_MEMBER(superscript_min_y_bottom, 0.3,
+                    "Minimum bottom of a character measured as a multiple of "
+                    "x-height above the baseline for us to reconsider whether "
+                    "it's a superscript.",
+                    this->params())
+    , BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output",
+                  this->params())
+    , BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params())
+    , BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params())
+    , BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
+    , BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
+    , BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
+    , BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
+                  this->params())
+    , BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
+    , BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
+                  this->params())
+    , BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params())
+    , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
+                  this->params())
+    , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
+    , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
+    , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
+                 this->params())
+    , STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params())
+    , INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params())
+    , INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params())
+    , BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params())
+    , double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params())
+    , double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params())
+    , BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params())
+    , BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params())
+    , BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD",
+                  this->params())
+    , BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL",
+                  this->params())
+    , INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params())
+    , BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params())
+    , BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params())
+    , double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params())
+    , double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params())
+    , BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params())
+    , BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params())
+    , BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params())
+    , BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params())
+    , BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params())
+    , BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params())
+    , BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params())
+    , BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params())
+    , double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params())
+    , INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params())
+    , STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params())
+    , STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params())
+    , INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params())
+    , BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params())
+    , INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process",
+                 this->params())
+    , BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params())
+    , BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params())
+    , STRING_MEMBER(file_type, ".tif", "Filename extension", this->params())
+    , BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params())
+    , STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one",
+                    this->params())
+    , BOOL_MEMBER(tessedit_use_primary_params_model, false,
+                  "In multilingual mode use params model of the"
+                  " primary language",
+                  this->params())
+    , double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin",
+                    this->params())
+    , BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
+    , BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
+    , BOOL_MEMBER(poly_allow_detailed_fx, false,
+                  "Allow feature extractors to see the original outline", this->params())
+    , BOOL_INIT_MEMBER(tessedit_init_config_only, false,
+                       "Only initialize with the config file. Useful if the "
+                       "instance is not going to be used for OCR but say only "
+                       "for layout analysis.",
+                       this->params())
+    , BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params())
+    , BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params())
+    , BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode",
+                  this->params())
+    , double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
+                    "Fraction of textlines deemed vertical to use vertical page "
+                    "mode",
+                    this->params())
+    , double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
+                    "Fraction of height used as a minimum gap for aligned blobs.", this->params())
+    , INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params())
+    , BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces",
+                  this->params())
+    , STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)",
+                    this->params())
+    , INT_MEMBER(lstm_choice_mode, 0,
+                 "Allows to include alternative symbols choices in the hOCR output. "
+                 "Valid input values are 0, 1 and 2. 0 is the default value. "
+                 "With 1 the alternative symbol choices per timestep are included. "
+                 "With 2 alternative symbol choices are extracted from the CTC "
+                 "process instead of the lattice. The choices are mapped per "
+                 "character.",
+                 this->params())
+    , INT_MEMBER(lstm_choice_iterations, 5,
+                 "Sets the number of cascading iterations for the Beamsearch in "
+                 "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
+                 "value greater than 0 to produce results.",
+                 this->params())
+    , double_MEMBER(lstm_rating_coefficient, 5,
+                    "Sets the rating coefficient for the lstm choices. The smaller the "
+                    "coefficient, the better are the ratings for each choice and less "
+                    "information is lost due to the cut off at 0. The standard value is "
+                    "5",
+                    this->params())
+    , BOOL_MEMBER(pageseg_apply_music_mask, true,
+                  "Detect music staff and remove intersecting components", this->params())
+    ,
+
+    backup_config_file_(nullptr)
+    , pix_binary_(nullptr)
+    , pix_grey_(nullptr)
+    , pix_original_(nullptr)
+    , pix_thresholds_(nullptr)
+    , source_resolution_(0)
+    , textord_(this)
+    , right_to_left_(false)
+    , scaled_color_(nullptr)
+    , scaled_factor_(-1)
+    , deskew_(1.0f, 0.0f)
+    , reskew_(1.0f, 0.0f)
+    , most_recently_used_(this)
+    , font_table_size_(0)
+    , equ_detect_(nullptr)
+    , lstm_recognizer_(nullptr)
+    , train_line_page_num_(0) {}
+
+// Tears down the instance: clears per-image state via Clear(), releases the
+// original image, calls end_tesseract(), then deletes every sub-language
+// instance and the LSTM recognizer.
+Tesseract::~Tesseract() {
+  Clear();
+  // Clear() does not touch pix_original_, so it is released explicitly here.
+  pix_original_.destroy();
+  end_tesseract();
+  // The sub-language instances are deleted here, after Clear() has already
+  // called Clear() on each of them.
+  for (auto *lang : sub_langs_) {
+    delete lang;
+  }
+  delete lstm_recognizer_;
+  lstm_recognizer_ = nullptr;
+}
+
+// Returns the dictionary to use for this instance. The LSTM recognizer's
+// dictionary is returned only when the Classify dictionary reports zero
+// dawgs, AnyLSTMLang() is true, and an LSTM recognizer with a non-null
+// dictionary exists; in every other case the Classify dictionary is used.
+Dict &Tesseract::getDict() {
+  const bool classify_has_dawgs = Classify::getDict().NumDawgs() != 0;
+  if (classify_has_dawgs || !AnyLSTMLang() || !lstm_recognizer_) {
+    return Classify::getDict();
+  }
+  auto *lstm_dict = lstm_recognizer_->GetDict();
+  if (lstm_dict) {
+    return *lstm_dict;
+  }
+  return Classify::getDict();
+}
+
+// Resets the per-image state of this instance and of every sub-language.
+// pixa_debug_ is asked to write a PDF named "<imagebasename>_debug.pdf",
+// the binary, grey, threshold and scaled-colour images are destroyed, the
+// skew vectors return to (1, 0), the splitter is cleared and the scale
+// factor is set back to -1. pix_original_ is not touched here.
+void Tesseract::Clear() {
+  const std::string debug_pdf_name = imagebasename + "_debug.pdf";
+  pixa_debug_.WritePDF(debug_pdf_name.c_str());
+
+  // Release the derived images.
+  pix_binary_.destroy();
+  pix_grey_.destroy();
+  pix_thresholds_.destroy();
+  scaled_color_.destroy();
+
+  // Restore the default geometry and splitter state.
+  deskew_ = FCOORD(1.0f, 0.0f);
+  reskew_ = FCOORD(1.0f, 0.0f);
+  splitter_.Clear();
+  scaled_factor_ = -1;
+
+  // Apply the same reset to each sub-language instance.
+  for (auto *lang : sub_langs_) {
+    lang->Clear();
+  }
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Installs `detector` as this instance's equation detector and, when it is
+// non-null, points it back at this instance via SetLangTesseract().
+// A null `detector` is stored as-is and is never dereferenced, so passing
+// nullptr simply leaves the instance without an equation detector.
+void Tesseract::SetEquationDetect(EquationDetect *detector) {
+  equ_detect_ = detector;
+  if (equ_detect_ != nullptr) {
+    equ_detect_->SetLangTesseract(this);
+  }
+}
+
+// Resets the adaptive classifier of this instance first, then that of each
+// sub-language instance in turn.
+void Tesseract::ResetAdaptiveClassifier() {
+  ResetAdaptiveClassifierInternal();
+  for (auto *lang : sub_langs_) {
+    lang->ResetAdaptiveClassifierInternal();
+  }
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Resets the document dictionary of the dictionary returned by getDict()
+// for this instance, then does the same for each sub-language instance.
+void Tesseract::ResetDocumentDictionary() {
+  getDict().ResetDocumentDictionary();
+  for (auto *lang : sub_langs_) {
+    lang->getDict().ResetDocumentDictionary();
+  }
+}
+
+// Applies this instance's character blacklist, whitelist and unblacklist to
+// its own unicharset and, when an LSTM recognizer is present, to that
+// recognizer's unicharset. The same lists are then applied to every
+// sub-language (and its LSTM recognizer, if any), so all loaded classifiers
+// share the lists configured on this instance.
+void Tesseract::SetBlackAndWhitelist() {
+  const char *const blacklist = tessedit_char_blacklist.c_str();
+  const char *const whitelist = tessedit_char_whitelist.c_str();
+  const char *const unblacklist = tessedit_char_unblacklist.c_str();
+  // Pushes the three lists into a single character set.
+  auto apply_lists = [&](UNICHARSET &charset) {
+    charset.set_black_and_whitelist(blacklist, whitelist, unblacklist);
+  };
+
+  apply_lists(unicharset);
+  if (lstm_recognizer_) {
+    apply_lists(lstm_recognizer_->GetUnicharset());
+  }
+  // Sub-languages receive this instance's lists, not their own settings.
+  for (auto *lang : sub_langs_) {
+    apply_lists(lang->unicharset);
+    if (lang->lstm_recognizer_) {
+      apply_lists(lang->lstm_recognizer_->GetUnicharset());
+    }
+  }
+}
+
+// Perform steps to prepare underlying binary image/other data structures for
+// page segmentation.
+// Side effects visible here: every sub-language's pix_binary_ is replaced by
+// a clone of this instance's binary image, and this instance's pix_binary_ is
+// replaced by a clone of the splitter's output when Split() returns true.
+void Tesseract::PrepareForPageseg() {
+  textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
+  // Find the max splitter strategy over all langs.
+  auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
+      static_cast<int32_t>(pageseg_devanagari_split_strategy));
+  for (auto &sub_lang : sub_langs_) {
+    auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
+        static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));
+    if (pageseg_strategy > max_pageseg_strategy) {
+      max_pageseg_strategy = pageseg_strategy;
+    }
+    // Give the sub-language its own clone of the binary image; this happens
+    // before any splitting is applied below.
+    sub_lang->pix_binary_.destroy();
+    sub_lang->pix_binary_ = pix_binary().clone();
+  }
+  // Perform shiro-rekha (top-line) splitting and replace the current image by
+  // the newly split image.
+  splitter_.set_orig_pix(pix_binary());
+  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
+  // The first argument selects the page-segmentation pass (PrepareForTessOCR
+  // passes false for the OCR pass).
+  if (splitter_.Split(true, &pixa_debug_)) {
+    ASSERT_HOST(splitter_.splitted_image());
+    pix_binary_.destroy();
+    pix_binary_ = splitter_.splitted_image().clone();
+  }
+}
+
+// Perform steps to prepare underlying binary image/other data structures for
+// OCR. The current segmentation is required by this method.
+// Note that this method resets pix_binary_ to the original binarized image,
+// which may be different from the image actually used for OCR depending on the
+// value of devanagari_ocr_split_strategy.
+// block_list is handed to the splitter as the current segmentation.
+// NOTE(review): osd_tess and osr are not referenced anywhere in this body.
+void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) {
+  // Find the max splitter strategy over all langs.
+  auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
+      static_cast<int32_t>(ocr_devanagari_split_strategy));
+  for (auto &sub_lang : sub_langs_) {
+    auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
+        static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy));
+    if (ocr_strategy > max_ocr_strategy) {
+      max_ocr_strategy = ocr_strategy;
+    }
+  }
+  // Utilize the segmentation information available.
+  splitter_.set_segmentation_block_list(block_list);
+  splitter_.set_ocr_split_strategy(max_ocr_strategy);
+  // Run the splitter for OCR
+  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
+  // Restore pix_binary to the binarized original pix for future reference.
+  ASSERT_HOST(splitter_.orig_pix());
+  pix_binary_.destroy();
+  pix_binary_ = splitter_.orig_pix().clone();
+  // If the pageseg and ocr strategies are different, refresh the block list
+  // (from the last SegmentImage call) with blobs from the real image to be used
+  // for OCR.
+  if (splitter_.HasDifferentSplitStrategies()) {
+    // Temporary block covering the whole binary image; it only collects the
+    // blobs extracted from pix_for_ocr.
+    BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_));
+    // Use the split image only if the OCR split actually happened.
+    Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix();
+    extract_edges(pix_for_ocr, &block);
+    splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
+  }
+  // The splitter isn't needed any more after this, so save memory by clearing.
+  splitter_.Clear();
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.h
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.cpp
@ -0,0 +1,24 @@
+/**********************************************************************
+ * File:        tessvars.cpp  (Formerly tessvars.c)
+ * Description: Variables and other globals for tessedit.
+ * Author:      Ray Smith
+ * Created:     Mon Apr 13 13:13:23 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cstdio>
+
+#include "tessvars.h"
+
+// Global stream for debug output; it is initialised to stderr.
+FILE *debug_fp = stderr; // write debug stuff here
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.h
@ -0,0 +1,27 @@
+/**********************************************************************
+ * File:        tessvars.h  (Formerly tessvars.h)
+ * Description: Variables and other globals for tessedit.
+ * Author:      Ray Smith
+ * Created:     Mon Apr 13 13:13:23 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSVARS_H
+#define TESSVARS_H
+
+#include <cstdio>
+
+// Declaration of the global debug output stream defined in tessvars.cpp.
+extern FILE *debug_fp; // write debug stuff here
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/tfacepp.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/tfacepp.cpp
@ -0,0 +1,306 @@
+/**********************************************************************
+ * File:        tfacepp.cpp  (Formerly tface++.c)
+ * Description: C++ side of the C/C++ Tess/Editor interface.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cmath>
+
+#include "blamer.h"
+#include "errcode.h"
+#include "ratngs.h"
+#include "reject.h"
+#include "tesseractclass.h"
+#include "werd.h"
+
+// Words whose chopped_word has more blobs than this are split in two before
+// recognition (see recog_word_recursive).
+#define MAX_UNDIVIDED_LENGTH 24
+
+/**********************************************************************
+ * recog_word
+ *
+ * Convert the word to tess form and pass it to the tess segmenter.
+ * Convert the output back to editor form.
+ **********************************************************************/
+namespace tesseract {
+// Recognizes a single word in place. Runs recog_word_recursive(), builds the
+// box word, asserts that best_choice and box_word have the same length and
+// that all segmentation states are valid, optionally replaces the permuter
+// type with the one reported by dict_word(), and finally sets
+// word->tess_failed (initialising the reject map on failure).
+void Tesseract::recog_word(WERD_RES *word) {
+  // With wordrec_skip_no_truth_words set, a word that has no blamer bundle or
+  // whose bundle reports IRR_NO_TRUTH is marked as failed and not recognized.
+  if (wordrec_skip_no_truth_words &&
+      (word->blamer_bundle == nullptr ||
+       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
+    if (classify_debug_level) {
+      tprintf("No truth for word - skipping\n");
+    }
+    word->tess_failed = true;
+    return;
+  }
+  ASSERT_HOST(!word->chopped_word->blobs.empty());
+  recog_word_recursive(word);
+  word->SetupBoxWord();
+  // Print the details before the assertion below aborts on a mismatch.
+  if (word->best_choice->length() != word->box_word->length()) {
+    tprintf(
+        "recog_word ASSERT FAIL String:\"%s\"; "
+        "Strlen=%d; #Blobs=%d\n",
+        word->best_choice->debug_string().c_str(), word->best_choice->length(),
+        word->box_word->length());
+  }
+  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
+  // Check that the ratings matrix size matches the sum of all the
+  // segmentation states.
+  if (!word->StatesAllValid()) {
+    // NOTE(review): this message has no trailing newline, unlike the other
+    // tprintf calls in this function.
+    tprintf("Not all words have valid states relative to ratings matrix!!");
+    word->DebugWordChoices(true, nullptr);
+    ASSERT_HOST(word->StatesAllValid());
+  }
+  if (tessedit_override_permuter) {
+    /* Override the permuter type if a straight dictionary check disagrees. */
+    uint8_t perm_type = word->best_choice->permuter();
+    if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&
+        (perm_type != USER_DAWG_PERM)) {
+      uint8_t real_dict_perm_type = dict_word(*word->best_choice);
+      // Only adopt a dawg permuter type when alpha_count() is positive for
+      // the recognized string.
+      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||
+           (real_dict_perm_type == USER_DAWG_PERM)) &&
+          (alpha_count(word->best_choice->unichar_string().c_str(),
+                       word->best_choice->unichar_lengths().c_str()) > 0)) {
+        word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
+      }
+    }
+    if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
+      tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
+    }
+  }
+  // Factored out from control.cpp
+  // NOTE(review): best_choice has already been dereferenced above, so the
+  // nullptr checks below cannot be the first place a null best_choice is seen.
+  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
+  // The word counts as a failure when the choice is empty or when the leading
+  // run of spaces in its string is as long as the choice itself.
+  if (word->best_choice == nullptr || word->best_choice->empty() ||
+      static_cast<int>(strspn(word->best_choice->unichar_string().c_str(), " ")) ==
+          word->best_choice->length()) {
+    word->tess_failed = true;
+    word->reject_map.initialise(word->box_word->length());
+    word->reject_map.rej_word_tess_failure();
+  } else {
+    word->tess_failed = false;
+  }
+}
+
+/**********************************************************************
+ * recog_word_recursive
+ *
+ * Recognize a word with cc_recog(), unless its chopped_word has more than
+ * MAX_UNDIVIDED_LENGTH blobs, in which case the work is handed to
+ * split_and_recog_word(). After recognition, a best_choice longer than the
+ * rebuilt word is marked bad and reported, and a best_choice shorter than
+ * the rebuilt word is padded with space unichars until the lengths match.
+ **********************************************************************/
+void Tesseract::recog_word_recursive(WERD_RES *word) {
+  const int input_blobs = word->chopped_word->NumBlobs();
+  if (input_blobs > MAX_UNDIVIDED_LENGTH) {
+    // Too long to recognize in one piece: divide it and recognize the parts.
+    split_and_recog_word(word);
+    return;
+  }
+  cc_recog(word);
+  const int output_blobs = word->rebuild_word->NumBlobs();
+
+  // A choice with more characters than output blobs should never happen;
+  // mark it bad and report where the word is.
+  if (word->best_choice->length() > output_blobs) {
+    word->best_choice->make_bad();
+    tprintf(
+        "recog_word: Discarded long string \"%s\""
+        " (%d characters vs %d blobs)\n",
+        word->best_choice->unichar_string().c_str(), word->best_choice->length(), output_blobs);
+    tprintf("Word is at:");
+    word->word->bounding_box().print();
+  }
+
+  // Pad a short choice with spaces, one per missing blob.
+  if (word->best_choice->length() < output_blobs) {
+    const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
+    do {
+      word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());
+    } while (word->best_choice->length() < output_blobs);
+  }
+}
+
+/**********************************************************************
+ * split_and_recog_word
+ *
+ * Divide the word in two at the widest horizontal gap between adjacent
+ * chopped blobs (the first such gap wins on ties), recognize each half
+ * with recog_word_recursive(), then join the halves back together.
+ **********************************************************************/
+void Tesseract::split_and_recog_word(WERD_RES *word) {
+  // Scan adjacent blob pairs for the largest left-to-right gap.
+  int widest_gap = -INT32_MAX;
+  int split_index = 0;
+  for (int right = 1; right < word->chopped_word->NumBlobs(); ++right) {
+    const TBOX left_box = word->chopped_word->blobs[right - 1]->bounding_box();
+    const TBOX right_box = word->chopped_word->blobs[right]->bounding_box();
+    const int gap = right_box.left() - left_box.right();
+    if (gap <= widest_gap) {
+      continue;
+    }
+    widest_gap = gap;
+    split_index = right;
+  }
+  ASSERT_HOST(split_index > 0);
+
+  // split_word() shrinks `word` to the left part and returns the right part
+  // together with the saved original blamer bundle.
+  WERD_RES *right_part = nullptr;
+  BlamerBundle *saved_bundle = nullptr;
+  split_word(word, split_index, &right_part, &saved_bundle);
+
+  // Left part first, then the right part.
+  recog_word_recursive(word);
+  recog_word_recursive(right_part);
+
+  join_words(word, right_part, saved_bundle);
+}
+
+/**********************************************************************
+ * split_word
+ *
+ * Split a given WERD_RES in place into two smaller words for recognition.
+ * split_pt is the index of the first blob to go in the second word.
+ * The underlying word is left alone, only the TWERD (and subsequent data)
+ * are split up.  orig_blamer_bundle is set to the original blamer bundle,
+ * and will now be owned by the caller.  New blamer bundles are forged for the
+ * two pieces.
+ *
+ * On return *right_piece points at a newly allocated WERD_RES holding the
+ * blobs from split_pt onwards, and *orig_blamer_bundle is a heap copy of the
+ * input word's blamer bundle, or nullptr if the word had none.
+ **********************************************************************/
+void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
+                           BlamerBundle **orig_blamer_bundle) const {
+  // Both pieces must end up with at least one blob.
+  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
+
+  // Save a copy of the blamer bundle so we can try to reconstruct it below.
+  BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
+
+  auto *word2 = new WERD_RES(*word);
+
+  // blow away the copied chopped_word, as we want to work with
+  // the blobs from the input chopped_word so seam_arrays can be merged.
+  TWERD *chopped = word->chopped_word;
+  auto *chopped2 = new TWERD;
+  // Move the blob pointers from split_pt onwards into the new TWERD, then
+  // truncate the original blob list so each blob is referenced only once.
+  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
+  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
+    chopped2->blobs.push_back(chopped->blobs[i]);
+  }
+  chopped->blobs.resize(split_pt);
+  // Detach the original chopped word from `word` before ClearResults() runs;
+  // presumably so that ClearResults() does not free it (TODO confirm). It is
+  // re-attached below.
+  word->chopped_word = nullptr;
+  delete word2->chopped_word;
+  word2->chopped_word = nullptr;
+
+  // Take the unicharset reference before ClearResults() is called on `word`.
+  const UNICHARSET &unicharset = *word->uch_set;
+  word->ClearResults();
+  word2->ClearResults();
+  word->chopped_word = chopped;
+  word2->chopped_word = chopped2;
+  word->SetupBasicsFromChoppedWord(unicharset);
+  word2->SetupBasicsFromChoppedWord(unicharset);
+
+  // Try to adjust the blamer bundle.
+  if (orig_bb != nullptr) {
+    // TODO(rays) Looks like a leak to me.
+    // orig_bb should take, rather than copy.
+    word->blamer_bundle = new BlamerBundle();
+    word2->blamer_bundle = new BlamerBundle();
+    // The split is described by the right edge of the last left-hand blob
+    // and the left edge of the first right-hand blob.
+    orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
+                         word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
+                         word->blamer_bundle, word2->blamer_bundle);
+  }
+
+  *right_piece = word2;
+  *orig_blamer_bundle = orig_bb;
+}
+
+/**********************************************************************
+ * join_words
+ *
+ * The opposite of split_word(): appends word2 (blobs, seams, widths/gaps,
+ * ratings, best_state and choices) onto the right of word, then deletes
+ * word2. If orig_bb is non-null, the blame info of both pieces is joined into
+ * it and it replaces word's blamer bundle, which is deleted.
+ **********************************************************************/
+void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {
+  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box(); // Last blob of the left piece.
+  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box(); // First blob of the right piece.
+  // Tack the word2 outputs onto the end of the word outputs.
+  word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
+  word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
+  word2->chopped_word->blobs.clear(); // Presumably so that deleting word2 cannot free the moved blobs - confirm.
+  word2->rebuild_word->blobs.clear();
+  TPOINT split_pt;
+  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
+  split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
+  // Move the word2 seams onto the end of the word1 seam_array.
+  // Since the seam list is one element short, an empty seam marking the
+  // end of the last blob in the first word is needed first.
+  word->seam_array.push_back(new SEAM(0.0f, split_pt));
+  word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
+  word2->seam_array.clear();
+  // Fix widths and gaps.
+  word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
+  word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
+  // Fix the ratings matrix.
+  int rat1 = word->ratings->dimension();
+  int rat2 = word2->ratings->dimension();
+  word->ratings->AttachOnCorner(word2->ratings);
+  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
+  word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
+  // Append the word choices.
+  *word->raw_choice += *word2->raw_choice;
+
+  // How many alt choices from each should we try to get?
+  const int kAltsPerPiece = 2;
+  // When do we start throwing away extra alt choices?
+  const int kTooManyAltChoices = 100;
+
+  // Construct the cartesian product of the best_choices of word(1) and word2.
+  WERD_CHOICE_LIST joined_choices;
+  WERD_CHOICE_IT jc_it(&joined_choices);
+  WERD_CHOICE_IT bc1_it(&word->best_choices);
+  WERD_CHOICE_IT bc2_it(&word2->best_choices);
+  int num_word1_choices = word->best_choices.length();
+  int total_joined_choices = num_word1_choices;
+  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
+  // word2 choices, and put them in the joined_choices list. The 1st word2
+  // choice gets added to the original word1 choices in-place after we have
+  // finished with them.
+  int bc2_index = 1;
+  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
+    if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
+      break;
+    }
+    int bc1_index = 0;
+    for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
+      if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
+        break;
+      }
+      auto *wc = new WERD_CHOICE(*bc1_it.data());
+      *wc += *bc2_it.data();
+      jc_it.add_after_then_move(wc);
+      ++total_joined_choices;
+    }
+  }
+  // Now that we've filled in as many alternates as we want, paste the best
+  // choice for word2 onto each of the original word best_choices.
+  bc1_it.move_to_first();
+  bc2_it.move_to_first();
+  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
+    *bc1_it.data() += *bc2_it.data();
+  }
+  bc1_it.move_to_last();
+  bc1_it.add_list_after(&joined_choices);
+
+  // Restore the pointer to original blamer bundle and combine blamer
+  // information recorded in the splits.
+  if (orig_bb != nullptr) {
+    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
+    delete word->blamer_bundle;
+    word->blamer_bundle = orig_bb; // word now holds orig_bb in place of the bundle deleted above.
+  }
+  word->SetupBoxWord();
+  word->reject_map.initialise(word->box_word->length());
+  delete word2;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.cpp
@ -0,0 +1,331 @@
+///////////////////////////////////////////////////////////////////////
+// File:        thresholder.cpp
+// Description: Base API for thresholding images in tesseract.
+// Author:      Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <allheaders.h>
+
+#include <cstdint> // for uint32_t
+#include <cstring>
+
+#include "otsuthr.h"
+#include "thresholder.h"
+#include "tprintf.h" // for tprintf
+
+#if defined(USE_OPENCL)
+#  include "openclwrapper.h" // for OpenclDevice
+#endif
+
+namespace tesseract {
+
+// Starts with no image, unit scale, an assumed 300 ppi for both the source
+// and the estimated resolution, and an empty processing rectangle.
+ImageThresholder::ImageThresholder()
+    : pix_(nullptr),
+      image_width_(0), image_height_(0),
+      pix_channels_(0), pix_wpl_(0),
+      scale_(1),
+      yres_(300), estimated_res_(300) {
+  // rect_* members are not in the initializer list; SetRectangle fills them.
+  SetRectangle(0, 0, 0, 0);
+}
+
+ImageThresholder::~ImageThresholder() {
+  ImageThresholder::Clear(); // A destructor never dispatches to an override, so name the base version.
+}
+
+// Destroy the source Pix held in pix_ (if there is one), freeing its memory.
+void ImageThresholder::Clear() {
+  pix_.destroy();
+}
+
+// Reports whether the thresholder currently holds no source image.
+bool ImageThresholder::IsEmpty() const {
+  return !(pix_ != nullptr); // Empty means pix_ is null.
+}
+
+// Copies the raw image into a freshly created Pix, so imagedata may be freed
+// immediately after this call. bytes_per_line is the stride of imagedata.
+// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
+// Palette color images will not work properly; convert them to 24 bit.
+// Binary images of 1 bit per pixel may also be given but they must be
+// byte packed with the MSB of the first byte being the first pixel, and a
+// one pixel is WHITE. For binary images set bytes_per_pixel=0.
+// Any other depth is reported through tprintf and no pixels are copied.
+void ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height,
+                                int bytes_per_pixel, int bytes_per_line) {
+  const int bpp = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
+  Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
+  l_uint32 *data = pixGetData(pix);
+  int wpl = pixGetWpl(pix);
+  // If pixCreate failed there is no buffer to write to, so copy zero rows.
+  const int rows = data != nullptr ? height : 0;
+  switch (bpp) {
+    case 1:
+      for (int y = 0; y < rows; ++y, data += wpl, imagedata += bytes_per_line) {
+        for (int x = 0; x < width; ++x) {
+          if (imagedata[x / 8] & (0x80 >> (x % 8))) {
+            CLEAR_DATA_BIT(data, x);
+          } else {
+            SET_DATA_BIT(data, x);
+          }
+        }
+      }
+      break;
+
+    case 8:
+      // Greyscale just copies the bytes in the right order.
+      for (int y = 0; y < rows; ++y, data += wpl, imagedata += bytes_per_line) {
+        for (int x = 0; x < width; ++x) {
+          SET_DATA_BYTE(data, x, imagedata[x]);
+        }
+      }
+      break;
+
+    case 24:
+      // Put the colors in the correct places in the line buffer.
+      for (int y = 0; y < rows; ++y, imagedata += bytes_per_line) {
+        for (int x = 0; x < width; ++x, ++data) {
+          SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
+          SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
+          SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
+        }
+      }
+      break;
+
+    case 32:
+      // Pack the 4 bytes MSB-first (endian-independent); shift as unsigned to avoid int overflow.
+      for (int y = 0; y < rows; ++y, imagedata += bytes_per_line, data += wpl) {
+        for (int x = 0; x < width; ++x) {
+          const unsigned char *px = imagedata + x * 4;
+          data[x] = (static_cast<l_uint32>(px[0]) << 24) | (static_cast<l_uint32>(px[1]) << 16) |
+                    (static_cast<l_uint32>(px[2]) << 8) | static_cast<l_uint32>(px[3]);
+        }
+      }
+      break;
+
+    default:
+      tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
+  }
+  SetImage(pix);
+  pix.destroy();
+}
+
+// Records the rectangle of the image that later calls should process.
+// No thresholding happens here.
+void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
+  rect_width_ = width;
+  rect_height_ = height;
+  rect_left_ = left;
+  rect_top_ = top;
+}
+
+// Returns, through the six out-pointers (all must be non-null), the current
+// rectangle and the full image size: enough to rebuild bounding boxes in the
+// original image. Left and top suffice with top-down coordinates, but the
+// heights of the rectangle and the image are needed for bottom-up.
+void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
+                                     int *imageheight) {
+  *left = rect_left_;
+  *top = rect_top_;
+  *width = rect_width_;
+  *height = rect_height_;
+  *imagewidth = image_width_;
+  *imageheight = image_height_;
+}
+
+// Pix vs raw, which to use? Pix is the preferred input for efficiency,
+// since raw buffers are copied.
+// The thresholder keeps its own converted image or copy of the input (never
+// just a clone), so the source pix may be pixDestroyed right after the call.
+// Any previously held image is destroyed first.
+void ImageThresholder::SetImage(const Image pix) {
+  if (pix_ != nullptr) {
+    pix_.destroy();
+  }
+  Image input = pix;
+  int bits;
+  pixGetDimensions(input, &image_width_, &image_height_, &bits);
+  // End up with one of: binary, plain RGB, or 8 bit without a colormap.
+  if (pixGetColormap(input) == nullptr) {
+    if (bits > 1 && bits < 8) {
+      pix_ = pixConvertTo8(input, false);
+    } else {
+      pix_ = input.copy();
+    }
+  } else {
+    Image no_cmap = pixRemoveColormap(input, REMOVE_CMAP_BASED_ON_SRC);
+    bits = pixGetDepth(no_cmap);
+    if (bits > 1 && bits < 8) {
+      pix_ = pixConvertTo8(no_cmap, false);
+      no_cmap.destroy();
+    } else {
+      pix_ = no_cmap; // Keep the colormap-free image itself.
+    }
+  }
+  pix_channels_ = pixGetDepth(pix_) / 8;
+  pix_wpl_ = pixGetWpl(pix_);
+  scale_ = 1;
+  yres_ = pixGetYRes(pix_);
+  estimated_res_ = yres_;
+  Init();
+}
+
+// Thresholds the source image into a newly created Pix stored in *pix.
+// A binary source is copied (the caller may modify the output); anything
+// else is Otsu thresholded. Caller must use pixDestroy to free the result.
+/// Returns false on error (an image dimension above INT16_MAX).
+bool ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Image *pix) {
+  const bool too_large = image_width_ > INT16_MAX || image_height_ > INT16_MAX;
+  if (too_large) {
+    tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
+    return false;
+  }
+  if (pix_channels_ != 0) {
+    OtsuThresholdRectToPix(pix_, pix);
+    return true;
+  }
+  // Binary input: hand out a copy of the rectangle, never the clone itself.
+  Image rect_pix = GetPixRect();
+  *pix = rect_pix.copy();
+  rect_pix.destroy();
+  return true;
+}
+
+// Returns a Pix holding an 8 bit threshold value at each pixel, or nullptr
+// if the input is binary. This implementation fills an image the size of
+// the rectangle with a single value: the Otsu threshold of the greyscale
+// rectangle, or 128 when that threshold is not positive.
+// Overriding implementations may return an integer reduction of the binary
+// image; the scale factor is then inferred from the ratio of the sizes.
+// PixDestroy after use.
+Image ImageThresholder::GetPixRectThresholds() {
+  if (IsBinary()) {
+    return nullptr;
+  }
+  Image grey = GetPixRectGrey();
+  const int grey_width = pixGetWidth(grey);
+  const int grey_height = pixGetHeight(grey);
+  std::vector<int> thresholds;
+  std::vector<int> hi_values;
+  OtsuThreshold(grey, 0, 0, grey_width, grey_height, thresholds, hi_values);
+  grey.destroy();
+  const int fill_value = thresholds[0] > 0 ? thresholds[0] : 128;
+  Image filled = pixCreate(grey_width, grey_height, 8);
+  pixSetAllArbitrary(filled, fill_value);
+  return filled;
+}
+
+// Common initialization shared between SetImage methods: selects the whole image as the rectangle.
+void ImageThresholder::Init() {
+  SetRectangle(0, 0, image_width_, image_height_);
+}
+
+// Returns the source image rectangle as a Pix that the caller must
+// pixDestroy: a clone of the whole image when the rectangle covers it,
+// otherwise a newly clipped copy of just the rectangle.
+// This function will be used in the future by the page layout analysis, and
+// the layout analysis that uses it will only be available with Leptonica,
+// so there is no raw equivalent.
+Image ImageThresholder::GetPixRect() {
+  if (!IsFullImage()) {
+    // Only part of the image is wanted: clip it out.
+    Box *clip = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
+    Image clipped = pixClipRectangle(pix_, clip, nullptr);
+    boxDestroy(&clip);
+    return clipped;
+  }
+  // The rectangle is the whole image: a clone is enough.
+  return pix_.clone();
+}
+
+// Returns the source image rectangle reduced to 8 bit greyscale, at the same
+// resolution as the output binary. An 8 bit rectangle is returned as is; a
+// 24 bit one is first widened to 32 bit. The returned Pix must be
+// pixDestroyed. Provided to the classifier to extract greyscale features.
+Image ImageThresholder::GetPixRectGrey() {
+  auto rect_pix = GetPixRect();
+  const int depth = pixGetDepth(rect_pix);
+  if (depth == 8) {
+    return rect_pix;
+  }
+  if (depth == 24) {
+    auto widened = pixConvert24To32(rect_pix);
+    rect_pix.destroy();
+    rect_pix = widened;
+  }
+  auto grey = pixConvertTo8(rect_pix, false);
+  rect_pix.destroy();
+  return grey;
+}
+
+// Otsu thresholds the rectangle, taking the rectangle from *this.
+// With USE_OPENCL, 4 channel images whose rectangle starts at (0, 0) go to
+// the OpenCL implementation when the selected device is OpenCL.
+void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const {
+  std::vector<int> otsu_thresholds;
+  std::vector<int> otsu_hi_values;
+  const int channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, rect_height_,
+                                     otsu_thresholds, otsu_hi_values);
+#ifdef USE_OPENCL
+  OpenclDevice od;
+  const bool at_origin = rect_top_ == 0 && rect_left_ == 0;
+  if (channels == 4 && od.selectedDeviceIsOpenCL() && at_origin) {
+    od.ThresholdRectToPixOCL((unsigned char *)pixGetData(src_pix), channels,
+                             pixGetWpl(src_pix) * 4, &otsu_thresholds[0], &otsu_hi_values[0],
+                             out_pix /*pix_OCL*/, rect_height_, rect_width_, rect_top_, rect_left_);
+    return;
+  }
+#endif
+  // Default path: threshold with the non-OpenCL implementation.
+  ThresholdRectToPix(src_pix, channels, otsu_thresholds, otsu_hi_values, out_pix);
+}
+
+/// Thresholds the rectangle stored in the class out of src_pix into a newly
+/// created 1 bit *pix that inherits the resolution of src_pix.
+/// NOTE that num_channels is the size of the thresholds and hi_values
+/// arrays and also the bytes per pixel in src_pix.
+void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
+                                          const std::vector<int> &hi_values, Image *pix) const {
+  *pix = pixCreate(rect_width_, rect_height_, 1);
+  pixSetXRes(*pix, pixGetXRes(src_pix));
+  pixSetYRes(*pix, pixGetYRes(src_pix));
+  const int out_wpl = pixGetWpl(*pix);
+  const int in_wpl = pixGetWpl(src_pix);
+  uint32_t *out_data = pixGetData(*pix);
+  const uint32_t *in_data = pixGetData(src_pix);
+  for (int row = 0; row < rect_height_; ++row) {
+    const uint32_t *in_line = in_data + (row + rect_top_) * in_wpl;
+    uint32_t *out_line = out_data + row * out_wpl;
+    for (int col = 0; col < rect_width_; ++col) {
+      const int first_byte = (col + rect_left_) * num_channels;
+      bool is_black = false;
+      // hi_values[ch] == 0: a value above the threshold is black; > 0: a value
+      // at or below it is black; < 0: the channel is ignored. First hit wins.
+      for (int ch = 0; ch < num_channels && !is_black; ++ch) {
+        const int value = GET_DATA_BYTE(in_line, first_byte + ch);
+        is_black = hi_values[ch] >= 0 && (value > thresholds[ch]) == (hi_values[ch] == 0);
+      }
+      if (is_black) {
+        SET_DATA_BIT(out_line, col);
+      } else {
+        CLEAR_DATA_BIT(out_line, col);
+      }
+    }
+  }
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.h
@ -0,0 +1,190 @@
+///////////////////////////////////////////////////////////////////////
+// File:        thresholder.h
+// Description: Base API for thresholding images in tesseract.
+// Author:      Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_THRESHOLDER_H_
+#define TESSERACT_CCMAIN_THRESHOLDER_H_
+
+#include <tesseract/export.h>
+#include <tesseract/publictypes.h>
+
+#include <vector> // for std::vector
+
+struct Pix;
+
+namespace tesseract {
+
+/// Base class for all tesseract image thresholding classes.
+/// Specific classes can add new thresholding methods by
+/// overriding ThresholdToPix.
+/// Each instance deals with a single image, but the design is intended to
+/// be useful for multiple calls to SetRectangle and ThresholdTo* if
+/// desired.
+class TESS_API ImageThresholder {
+public:
+  ImageThresholder();
+  virtual ~ImageThresholder();
+
+  /// Destroy the Pix if there is one, freeing memory.
+  virtual void Clear();
+
+  /// Return true if no image has been set.
+  bool IsEmpty() const;
+
+  /// SetImage makes a copy of all the image data, so it may be deleted
+  /// immediately after this call.
+  /// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
+  /// Palette color images will not work properly and must be converted to
+  /// 24 bit.
+  /// Binary images of 1 bit per pixel may also be given but they must be
+  /// byte packed with the MSB of the first byte being the first pixel, and a
+  /// one pixel is WHITE. For binary images set bytes_per_pixel=0.
+  void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel,
+                int bytes_per_line);
+
+  /// Store the coordinates of the rectangle to process for later use.
+  /// Doesn't actually do any thresholding.
+  void SetRectangle(int left, int top, int width, int height);
+
+  /// Get enough parameters to be able to rebuild bounding boxes in the
+  /// original image (not just within the rectangle).
+  /// Left and top are enough with top-down coordinates, but
+  /// the height of the rectangle and the image are needed for bottom-up.
+  virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
+                             int *imageheight);
+
+  /// Return true if the source image is color.
+  bool IsColor() const {
+    return pix_channels_ >= 3;
+  }
+
+  /// Returns true if the source image is binary.
+  bool IsBinary() const {
+    return pix_channels_ == 0;
+  }
+
+  int GetScaleFactor() const {
+    return scale_;
+  }
+
+  // Set the resolution of the source image in pixels per inch.
+  // This should be called right after SetImage(), and will let us return
+  // appropriate font sizes for the text.
+  void SetSourceYResolution(int ppi) {
+    yres_ = ppi;
+    estimated_res_ = ppi; // The estimate is reset to the source resolution too.
+  }
+  int GetSourceYResolution() const {
+    return yres_;
+  }
+  int GetScaledYResolution() const {
+    return scale_ * yres_;
+  }
+  // Set the resolution of the source image in pixels per inch, as estimated
+  // by the thresholder from the text size found during thresholding.
+  // This value will be used to set internal size thresholds during recognition
+  // and will not influence the output "point size." The default value is
+  // the same as the source resolution. (yres_)
+  void SetEstimatedResolution(int ppi) {
+    estimated_res_ = ppi;
+  }
+  // Returns the estimated resolution, including any active scaling.
+  // This value will be used to set internal size thresholds during recognition.
+  int GetScaledEstimatedResolution() const {
+    return scale_ * estimated_res_;
+  }
+
+  /// Pix vs raw, which to use? Pix is the preferred input for efficiency,
+  /// since raw buffers are copied.
+  /// SetImage for Pix stores its own copy (or a converted image) rather than a
+  /// clone of the input, so the source pix may be pixDestroyed immediately
+  /// after this call.
+  void SetImage(const Image pix);
+
+  /// Threshold the source image as efficiently as possible to the output Pix.
+  /// Creates a Pix and sets pix to point to the resulting pointer.
+  /// Caller must use pixDestroy to free the created Pix.
+  /// Returns false on error.
+  virtual bool ThresholdToPix(PageSegMode pageseg_mode, Image *pix);
+
+  // Gets a pix that contains an 8 bit threshold value at each pixel. The
+  // returned pix may be an integer reduction of the binary image such that
+  // the scale factor may be inferred from the ratio of the sizes, even down
+  // to the extreme of a 1x1 pixel thresholds image.
+  // Ideally the 8 bit threshold should be the exact threshold used to generate
+  // the binary image in ThresholdToPix, but this is not a hard constraint.
+  // Returns nullptr if the input is binary. PixDestroy after use.
+  virtual Image GetPixRectThresholds();
+
+  /// Get a clone/copy of the source image rectangle.
+  /// The returned Pix must be pixDestroyed.
+  /// This function will be used in the future by the page layout analysis, and
+  /// the layout analysis that uses it will only be available with Leptonica,
+  /// so there is no raw equivalent.
+  Image GetPixRect();
+
+  // Get a clone/copy of the source image rectangle, reduced to greyscale,
+  // and at the same resolution as the output binary.
+  // The returned Pix must be pixDestroyed.
+  // Provided to the classifier to extract features from the greyscale image.
+  virtual Image GetPixRectGrey();
+
+protected:
+  // ----------------------------------------------------------------------
+  // Utility functions that may be useful components for other thresholders.
+
+  /// Common initialization shared between SetImage methods.
+  virtual void Init();
+
+  /// Return true if we are processing the full image.
+  bool IsFullImage() const {
+    return rect_left_ == 0 && rect_top_ == 0 && rect_width_ == image_width_ &&
+           rect_height_ == image_height_;
+  }
+
+  // Otsu thresholds the rectangle, taking the rectangle from *this.
+  void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const;
+
+  /// Threshold the rectangle, taking everything except the src_pix
+  /// from the class, using thresholds/hi_values to the output pix.
+  /// NOTE that num_channels is the size of the thresholds and hi_values
+  /// arrays and also the bytes per pixel in src_pix.
+  void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
+                          const std::vector <int> &hi_values, Image *pix) const;
+
+protected:
+  /// Clone or other copy of the source Pix.
+  /// The pix will always be PixDestroy()ed on destruction of the class.
+  Image pix_;
+
+  int image_width_;  ///< Width of source pix_.
+  int image_height_; ///< Height of source pix_.
+  int pix_channels_; ///< Number of 8-bit channels in pix_.
+  int pix_wpl_;      ///< Words per line of pix_.
+  // Scale and resolution of the source image.
+  int scale_;         ///< Scale factor from original image.
+  int yres_;          ///< y pixels/inch in source image.
+  int estimated_res_; ///< Resolution estimate from text size.
+  int rect_left_;     ///< Left edge of the image rectangle to be processed.
+  int rect_top_;      ///< Top edge of the image rectangle to be processed.
+  int rect_width_;    ///< Width of the image rectangle to be processed.
+  int rect_height_;   ///< Height of the image rectangle to be processed.
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_THRESHOLDER_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.cpp
@ -0,0 +1,68 @@
+/**********************************************************************
+ * File:        werdit.cpp  (Formerly wordit.c)
+ * Description: An iterator for passing over all the words in a document.
+ * Author:      Ray Smith
+ * Created:     Mon Apr 27 08:51:22 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "werdit.h"
+
+#include "errcode.h"  // for ASSERT_HOST
+#include "pageres.h"  // for PAGE_RES_IT, PAGE_RES (ptr only), WERD_RES
+#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
+#include "werd.h"     // for WERD
+
+namespace tesseract {
+
+/**********************************************************************
+ * make_pseudo_word
+ *
+ * Gathers deep copies of the blobs that overlap selection_box, taken from
+ * the first word that contributes any, into one pseudo word added through
+ * InsertSimpleCloneWord. Returns a new PAGE_RES_IT at that word, or nullptr
+ * if none. After use call it->DeleteCurrentWord(), then delete the iterator.
+ **********************************************************************/
+PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box) {
+  PAGE_RES_IT word_it(page_res);
+  C_BLOB_LIST gathered;              // copies of the overlapping blobs
+  C_BLOB_IT gathered_it = &gathered; // appends to gathered
+  for (WERD_RES *word_res = word_it.word(); word_res != nullptr; word_res = word_it.forward()) {
+    WERD *word = word_res->word;
+    if (!word->bounding_box().overlap(selection_box)) {
+      continue;
+    }
+    C_BLOB_IT blob_it(word->cblob_list());
+    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+      C_BLOB *blob = blob_it.data();
+      if (blob->bounding_box().overlap(selection_box)) {
+        gathered_it.add_after_then_move(C_BLOB::deep_copy(blob));
+      }
+    }
+    if (gathered.empty()) {
+      continue;
+    }
+    WERD *pseudo_word = new WERD(&gathered, 1, nullptr);
+    word_res = word_it.InsertSimpleCloneWord(*word_res, pseudo_word);
+    auto *it = new PAGE_RES_IT(page_res);
+    while (it->word() != word_res && it->word() != nullptr) {
+      it->forward();
+    }
+    ASSERT_HOST(it->word() == word_res);
+    return it;
+  }
+  return nullptr;
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.h
@ -0,0 +1,34 @@
+/**********************************************************************
+ * File:        werdit.h  (Formerly wordit.h)
+ * Description: An iterator for passing over all the words in a document.
+ * Author:      Ray Smith
+ * Created:     Mon Apr 27 08:51:22 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef WERDIT_H
+#define WERDIT_H
+
+#include "rect.h" // for TBOX
+
+namespace tesseract {
+
+class PAGE_RES;
+class PAGE_RES_IT;
+
+PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box); // Defined in werdit.cpp; nullptr if no blob overlaps.
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.cpp
@ -0,0 +1,578 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blamer.cpp
+// Description: Module allowing precise error causes to be allocated.
+// Author:      Rike Antonova
+// Refactored:  Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "blamer.h"
+
+#include "blobs.h"   // for TPOINT, TWERD, TBLOB
+#include "errcode.h" // for ASSERT_HOST
+#if !defined(DISABLED_LEGACY_ENGINE)
+#  include "lm_pain_points.h" // for LMPainPoints
+#endif
+#include "matrix.h"     // for MATRIX
+#include "normalis.h"   // for DENORM
+#include "pageres.h"    // for WERD_RES
+#include "unicharset.h" // for UNICHARSET
+
+#include <cmath>   // for abs
+#include <cstdlib> // for abs
+
+namespace tesseract {
+
+// Short printable names, one per value of the IncorrectResultReason enum.
+// The table below is indexed directly by the enum value, so the order and
+// the number of entries must be kept in sync with the enum declaration.
+const char kBlameCorrect[] = "corr";
+const char kBlameClassifier[] = "cl";
+const char kBlameChopper[] = "chop";
+const char kBlameClassLMTradeoff[] = "cl/LM";
+const char kBlamePageLayout[] = "pglt";
+const char kBlameSegsearchHeur[] = "ss_heur";
+const char kBlameSegsearchPP[] = "ss_pp";
+const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
+const char kBlameAdaption[] = "adapt";
+const char kBlameNoTruthSplit[] = "no_tr_spl";
+const char kBlameNoTruth[] = "no_tr";
+const char kBlameUnknown[] = "unkn";
+
+// Lookup table from IncorrectResultReason value to its short name.
+const char *const kIncorrectResultReasonNames[] = {
+    kBlameCorrect,    kBlameClassifier,    kBlameChopper,     kBlameClassLMTradeoff,
+    kBlamePageLayout, kBlameSegsearchHeur, kBlameSegsearchPP, kBlameClassOldLMTradeoff,
+    kBlameAdaption,   kBlameNoTruthSplit,  kBlameNoTruth,     kBlameUnknown};
+
+// Returns the short name for the given reason by indexing the name table
+// with the enum value. No range check is performed on irr.
+const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
+  const char *const reason_name = kIncorrectResultReasonNames[irr];
+  return reason_name;
+}
+
+// Returns the short name of the reason currently recorded in this bundle.
+const char *BlamerBundle::IncorrectReason() const {
+  const IncorrectResultReason current_reason = incorrect_result_reason_;
+  return kIncorrectResultReasonNames[current_reason];
+}
+
+// Functions to setup the blamer.
+// Whole word string, whole word bounding box.
+// Records word_box as the single truth box and stores one truth_text_ entry
+// per item that unicharset.encode_string() produces for truth_str. Items with
+// a valid id are stored in normalized form; items with INVALID_UNICHAR_ID
+// keep their raw bytes from truth_str.
+void BlamerBundle::SetWordTruth(const UNICHARSET &unicharset, const char *truth_str,
+                                const TBOX &word_box) {
+  truth_word_.InsertBox(0, word_box);
+  truth_has_char_boxes_ = false;
+  // Encode the string as UNICHAR_IDs.
+  std::vector<UNICHAR_ID> encoding;
+  std::vector<char> lengths;
+  unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
+  // total_length is the byte offset in truth_str of the item being handled;
+  // lengths[i] is the byte length of item i alone.
+  int total_length = 0;
+  for (size_t i = 0; i < encoding.size(); total_length += lengths[i++]) {
+    // Take exactly the bytes of this item. The previous code resized to
+    // lengths[i] - total_length, which mixes a per-item length with a running
+    // offset and goes negative (i.e. a huge size_t) after the first item.
+    const int item_length = lengths[i];
+    std::string uch(truth_str + total_length, item_length);
+    UNICHAR_ID id = encoding[i];
+    if (id != INVALID_UNICHAR_ID) {
+      uch = unicharset.get_normed_unichar(id);
+    }
+    truth_text_.push_back(uch);
+  }
+}
+
+// Appends the truth for one "character": its text and its bounding box.
+// May be called repeatedly, once per character of a word.
+// The text stored is the normalized form from the unicharset when char_str
+// maps to a valid id with a non-empty normalized string, else char_str itself.
+void BlamerBundle::SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str,
+                                  const TBOX &char_box) {
+  const UNICHAR_ID symbol_id = unicharset.unichar_to_id(char_str);
+  std::string truth_symbol(char_str);
+  if (symbol_id != INVALID_UNICHAR_ID) {
+    const std::string normalized(unicharset.get_normed_unichar(symbol_id));
+    if (!normalized.empty()) {
+      truth_symbol = normalized;
+    }
+  }
+  // Number of boxes present before this symbol is added.
+  const int prior_count = truth_word_.length();
+  truth_text_.push_back(truth_symbol);
+  truth_word_.InsertBox(prior_count, char_box);
+  if (prior_count == 0) {
+    // First symbol: assume per-character boxes until shown otherwise.
+    truth_has_char_boxes_ = true;
+  } else if (truth_word_.BlobBox(prior_count - 1) == char_box) {
+    // Same box as the previous symbol, so the boxes are not per-character.
+    truth_has_char_boxes_ = false;
+  }
+}
+
+// Flags the truth text as unusable (e.g. it contains reject characters):
+// character boxes are disregarded and the reason becomes IRR_NO_TRUTH.
+void BlamerBundle::SetRejectedTruth() {
+  truth_has_char_boxes_ = false;
+  incorrect_result_reason_ = IRR_NO_TRUTH;
+}
+
+// Returns true if word_choice, after normalizing each of its unichars,
+// spells exactly the truth string. A null word_choice is never correct.
+bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE *word_choice) const {
+  if (word_choice == nullptr) {
+    return false;
+  }
+  const UNICHARSET *charset = word_choice->unicharset();
+  std::string normalized_choice;
+  int pos = 0;
+  while (pos < word_choice->length()) {
+    normalized_choice += charset->get_normed_unichar(word_choice->unichar_id(pos));
+    ++pos;
+  }
+  return TruthString() == normalized_choice;
+}
+
+// Appends a description of the truth to debug: the concatenated truth text,
+// a note if there are no character boxes, the choice string when choice is
+// non-null, and msg on its own line when it is non-empty. Always ends with
+// a newline.
+void BlamerBundle::FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug) {
+  debug += "Truth ";
+  for (const auto &truth_piece : truth_text_) {
+    debug += truth_piece;
+  }
+  if (!truth_has_char_boxes_) {
+    debug += " (no char boxes)";
+  }
+  if (choice != nullptr) {
+    std::string choice_text;
+    choice->string_and_lengths(&choice_text, nullptr);
+    debug += " Choice ";
+    debug += choice_text;
+  }
+  if (!msg.empty()) {
+    debug += "\n";
+    debug += msg;
+  }
+  debug += "\n";
+}
+
+// Fills norm_truth_word_ with the boxes of truth_word_ transformed by denorm,
+// and scales the box tolerance by the denorm's x scale.
+void BlamerBundle::SetupNormTruthWord(const DENORM &denorm) {
+  // TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
+  norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
+  for (int box_idx = 0; box_idx < truth_word_.length(); ++box_idx) {
+    const TBOX &truth_box = truth_word_.BlobBox(box_idx);
+    // Transform the top-left and bottom-right corners separately.
+    TPOINT top_left;
+    top_left.x = truth_box.left();
+    top_left.y = truth_box.top();
+    TPOINT bottom_right;
+    bottom_right.x = truth_box.right();
+    bottom_right.y = truth_box.bottom();
+    TPOINT normed_top_left;
+    TPOINT normed_bottom_right;
+    denorm.NormTransform(nullptr, top_left, &normed_top_left);
+    denorm.NormTransform(nullptr, bottom_right, &normed_bottom_right);
+    TBOX normed_box(normed_top_left.x, normed_bottom_right.y, normed_bottom_right.x,
+                    normed_top_left.y);
+    norm_truth_word_.InsertBox(box_idx, normed_box);
+  }
+}
+
+// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
+// bundles) where the right edge of the left-hand word is word1_right,
+// and the left edge of the right-hand word is word2_left.
+// If no pair of adjacent normalized truth boxes matches those edges within
+// norm_box_tolerance_, both bundles are blamed with IRR_NO_TRUTH_SPLIT
+// (or simply marked IRR_NO_TRUTH if *this has no truth).
+void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
+                               BlamerBundle *bundle2) const {
+  std::string debug_str;
+  // Index of the first truth box that belongs to the right-hand word, or -1
+  // while no split has been found.
+  int split_index = -1;
+  const bool truth_missing = incorrect_result_reason_ == IRR_NO_TRUTH;
+  if (!truth_missing && truth_has_char_boxes_) {
+    debug_str = "Looking for truth split at";
+    debug_str += " end1_x " + std::to_string(word1_right);
+    debug_str += " begin2_x " + std::to_string(word2_left);
+    debug_str += "\nnorm_truth_word boxes:\n";
+    const int num_boxes = norm_truth_word_.length();
+    if (num_boxes > 1) {
+      norm_truth_word_.BlobBox(0).print_to_str(debug_str);
+      for (int right_idx = 1; right_idx < num_boxes; ++right_idx) {
+        norm_truth_word_.BlobBox(right_idx).print_to_str(debug_str);
+        // A split lies between boxes right_idx - 1 and right_idx when both
+        // requested edges agree with the box edges within tolerance.
+        if ((abs(word1_right - norm_truth_word_.BlobBox(right_idx - 1).right()) <
+             norm_box_tolerance_) &&
+            (abs(word2_left - norm_truth_word_.BlobBox(right_idx).left()) <
+             norm_box_tolerance_)) {
+          split_index = right_idx;
+          debug_str += "Split found";
+          break;
+        }
+      }
+      debug_str += '\n';
+    }
+  }
+  if (split_index > 0) {
+    // Hand the first split_index truth entries to bundle1, the rest to bundle2.
+    bundle1->norm_box_tolerance_ = norm_box_tolerance_;
+    bundle1->truth_has_char_boxes_ = true;
+    bundle2->norm_box_tolerance_ = norm_box_tolerance_;
+    bundle2->truth_has_char_boxes_ = true;
+    for (int idx = 0; idx < norm_truth_word_.length(); ++idx) {
+      BlamerBundle *target = (idx < split_index) ? bundle1 : bundle2;
+      target->norm_truth_word_.InsertBox(idx, norm_truth_word_.BlobBox(idx));
+      target->truth_word_.InsertBox(idx, truth_word_.BlobBox(idx));
+      target->truth_text_.push_back(truth_text_[idx]);
+    }
+    return;
+  }
+  if (truth_missing) {
+    bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
+    bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
+    return;
+  }
+  debug_str += "Truth split not found";
+  if (truth_has_char_boxes_) {
+    debug_str += "\n";
+  } else {
+    debug_str += " (no truth char boxes)\n";
+  }
+  bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);
+  bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);
+}
+
+// "Joins" the blames from bundle1 and bundle2 into *this.
+// A part contributes only if its reason is something other than IRR_CORRECT,
+// IRR_NO_TRUTH or IRR_NO_TRUTH_SPLIT. Part 1's reason replaces the current
+// one; part 2's reason is adopted if the result so far is IRR_CORRECT, and a
+// disagreement between the two yields IRR_UNKNOWN.
+void BlamerBundle::JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2,
+                              bool debug) {
+  // True for reasons that represent an actual blame worth propagating.
+  const auto carries_blame = [](IncorrectResultReason reason) {
+    return reason != IRR_CORRECT && reason != IRR_NO_TRUTH && reason != IRR_NO_TRUTH_SPLIT;
+  };
+  std::string joined_debug;
+  IncorrectResultReason joined_reason = incorrect_result_reason_;
+  if (carries_blame(bundle1.incorrect_result_reason_)) {
+    joined_debug += "Blame from part 1: ";
+    joined_debug += bundle1.debug_;
+    joined_reason = bundle1.incorrect_result_reason_;
+  }
+  if (carries_blame(bundle2.incorrect_result_reason_)) {
+    joined_debug += "Blame from part 2: ";
+    joined_debug += bundle2.debug_;
+    if (joined_reason == IRR_CORRECT) {
+      joined_reason = bundle2.incorrect_result_reason_;
+    } else if (joined_reason != bundle2.incorrect_result_reason_) {
+      joined_reason = IRR_UNKNOWN;
+    }
+  }
+  incorrect_result_reason_ = joined_reason;
+  const bool needs_blame = joined_reason != IRR_CORRECT && joined_reason != IRR_NO_TRUTH;
+  if (needs_blame) {
+    SetBlame(joined_reason, joined_debug, nullptr, debug);
+  }
+}
+
+// If a blob with the same bounding box as one of the truth character
+// bounding boxes is not classified as the corresponding truth character
+// blames character classifier for incorrect answer.
+// Only the first truth box that matches blob_box horizontally is examined.
+// If the truth character is found but an adapted choice was listed before
+// it, the blame goes to adaption instead.
+void BlamerBundle::BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
+                                   const BLOB_CHOICE_LIST &choices, bool debug) {
+  if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) {
+    return; // Nothing to do here.
+  }
+
+  for (int box_idx = 0; box_idx < norm_truth_word_.length(); ++box_idx) {
+    const TBOX &truth_box = norm_truth_word_.BlobBox(box_idx);
+    // Note that we are more strict on the bounding box boundaries here
+    // than in other places (chopper, segmentation search), since we do
+    // not have the ability to check the previous and next bounding box.
+    if (!blob_box.x_almost_equal(truth_box, norm_box_tolerance_ / 2)) {
+      continue;
+    }
+    const char *truth_str = truth_text_[box_idx].c_str();
+    bool truth_listed = false;
+    bool adapted_ranked_first = false;
+    UNICHAR_ID adapted_id = INVALID_UNICHAR_ID;
+    // We promise not to modify the list or its contents, using a
+    // const BLOB_CHOICE* below.
+    BLOB_CHOICE_IT choice_it(const_cast<BLOB_CHOICE_LIST *>(&choices));
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
+      const BLOB_CHOICE *candidate = choice_it.data();
+      if (strcmp(truth_str, unicharset.get_normed_unichar(candidate->unichar_id())) == 0) {
+        truth_listed = true;
+        break;
+      }
+      if (candidate->IsAdapted()) {
+        // Remember the most recent adapted choice seen before the truth.
+        adapted_ranked_first = true;
+        adapted_id = candidate->unichar_id();
+      }
+    }
+    if (!truth_listed) {
+      std::string debug_str = "unichar ";
+      debug_str += truth_str;
+      debug_str += " not found in classification list";
+      SetBlame(IRR_CLASSIFIER, debug_str, nullptr, debug);
+    } else if (adapted_ranked_first) {
+      std::string debug_str = "better rating for adapted ";
+      debug_str += unicharset.id_to_unichar(adapted_id);
+      debug_str += " than for correct ";
+      debug_str += truth_str;
+      SetBlame(IRR_ADAPTION, debug_str, nullptr, debug);
+    }
+    return; // Only the first matching truth box is considered.
+  }
+}
+
+// Checks whether chops were made at all the character bounding box
+// boundaries in word->truth_word. If not - blames the chopper for an
+// incorrect answer.
+// Walks the chopped blobs and the normalized truth boxes left to right,
+// comparing right edges within norm_box_tolerance_. A blob ending before the
+// current truth edge is an extra chop and is skipped; a blob ending beyond it
+// means the chop for that truth box is missing. Does nothing if there is no
+// truth, no character boxes, or no chopped blobs.
+void BlamerBundle::SetChopperBlame(const WERD_RES *word, bool debug) {
+  if (NoTruth() || !truth_has_char_boxes_ || word->chopped_word->blobs.empty()) {
+    return;
+  }
+  bool missing_chop = false;
+  int num_blobs = word->chopped_word->blobs.size();
+  // Bound the walk by the word that is actually indexed below.
+  int num_truth_boxes = norm_truth_word_.length();
+  int box_index = 0;
+  int blob_index = 0;
+  int16_t truth_x = -1;
+  while (box_index < num_truth_boxes && blob_index < num_blobs) {
+    truth_x = norm_truth_word_.BlobBox(box_index).right();
+    TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
+    if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {
+      ++blob_index;
+      continue; // encountered an extra chop, keep looking
+    } else if (curr_blob->bounding_box().right() > truth_x + norm_box_tolerance_) {
+      missing_chop = true;
+      break;
+    } else {
+      // This blob ends at the truth boundary: the truth box is accounted for,
+      // so advance to the next one. Without this step box_index stayed at 0
+      // and the chopper was blamed for every word with truth boxes.
+      ++blob_index;
+      ++box_index;
+    }
+  }
+  if (missing_chop || box_index < num_truth_boxes) {
+    std::string debug_str;
+    if (missing_chop) {
+      debug_str += "Detected missing chop (tolerance=" + std::to_string(norm_box_tolerance_);
+      debug_str += ") at Bounding Box=";
+      // blob_index is still in range here: the loop broke out on this blob.
+      TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
+      curr_blob->bounding_box().print_to_str(debug_str);
+      debug_str += "\nNo chop for truth at x=" + std::to_string(truth_x);
+    } else {
+      debug_str += "Missing chops for last " + std::to_string(num_truth_boxes - box_index);
+      debug_str += " truth box(es)";
+    }
+    debug_str += "\nMaximally chopped word boxes:\n";
+    for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
+      TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
+      curr_blob->bounding_box().print_to_str(debug_str);
+      debug_str += '\n';
+    }
+    debug_str += "Truth  bounding  boxes:\n";
+    for (box_index = 0; box_index < num_truth_boxes; ++box_index) {
+      norm_truth_word_.BlobBox(box_index).print_to_str(debug_str);
+      debug_str += '\n';
+    }
+    SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
+  }
+}
+
+// Blames the classifier or the language model if, after running only the
+// chopper, best_choice is incorrect and no blame has been yet set.
+// Blames the classifier if best_choice is classifier's top choice and is a
+// dictionary word (i.e. language model could not have helped).
+// Otherwise, blames the language model (formerly permuter word adjustment).
+// When valid_permuter is false the top-choice flag is not recomputed and
+// keeps whatever value it already had.
+void BlamerBundle::BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
+                                              bool valid_permuter, bool debug) {
+  if (valid_permuter) {
+    // Assume every position uses the top choice until one is found that
+    // does not.
+    best_choice_is_dict_and_top_choice_ = true;
+    for (int pos = 0; pos < word->best_choice->length(); ++pos) {
+      BLOB_CHOICE_IT choice_it(word->GetBlobChoices(pos));
+      ASSERT_HOST(!choice_it.empty());
+      // Locate the first choice in the list that is not a fragment.
+      BLOB_CHOICE *top_choice = nullptr;
+      choice_it.mark_cycle_pt();
+      while (!choice_it.cycled_list()) {
+        if (!(unicharset.get_fragment(choice_it.data()->unichar_id()))) {
+          top_choice = choice_it.data();
+          break;
+        }
+        choice_it.forward();
+      }
+      ASSERT_HOST(top_choice != nullptr);
+      if (top_choice->unichar_id() != word->best_choice->unichar_id(pos)) {
+        best_choice_is_dict_and_top_choice_ = false;
+        break;
+      }
+    }
+  }
+  std::string debug_str;
+  if (best_choice_is_dict_and_top_choice_) {
+    debug_str = "Best choice is: incorrect, top choice, dictionary word";
+    debug_str += " with permuter ";
+    debug_str += word->best_choice->permuter_name();
+    SetBlame(IRR_CLASSIFIER, debug_str, word->best_choice, debug);
+  } else {
+    debug_str = "Classifier/Old LM tradeoff is to blame";
+    SetBlame(IRR_CLASS_OLD_LM_TRADEOFF, debug_str, word->best_choice, debug);
+  }
+}
+
+// Sets up the correct_segmentation_* to mark the correct bounding boxes.
+// Walks the blobs of word left to right and groups consecutive blobs so that
+// the right edge of each group matches the right edge of the next normalized
+// truth box within norm_box_tolerance_. For each group one entry is pushed to
+// correct_segmentation_cols_ (index of the first blob in the group) and
+// correct_segmentation_rows_ (index of the last blob in the group).
+// If the blobs cannot be matched to all truth boxes, IRR_UNKNOWN is blamed
+// and both vectors are cleared.
+void BlamerBundle::SetupCorrectSegmentation(const TWERD *word, bool debug) {
+#ifndef DISABLED_LEGACY_ENGINE
+  params_training_bundle_.StartHypothesisList();
+#endif //  ndef DISABLED_LEGACY_ENGINE
+  if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) {
+    return; // Nothing to do here.
+  }
+
+  std::string debug_str = "Blamer computing correct_segmentation_cols\n";
+  // curr_box_col: first blob of the group being built.
+  // next_box_col: one past the blob currently being examined.
+  int curr_box_col = 0;
+  int next_box_col = 0;
+  int num_blobs = word->NumBlobs();
+  if (num_blobs == 0) {
+    return; // No blobs to play with.
+  }
+  int blob_index = 0;
+  int16_t next_box_x = word->blobs[blob_index]->bounding_box().right();
+  for (int truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length();
+       ++blob_index) {
+    ++next_box_col;
+    int16_t curr_box_x = next_box_x;
+    // Look ahead to the right edge of the following blob, if there is one.
+    if (blob_index + 1 < num_blobs) {
+      next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
+    }
+    int16_t truth_x = norm_truth_word_.BlobBox(truth_idx).right();
+    debug_str += "Box x coord vs. truth: " + std::to_string(curr_box_x);
+    debug_str += " " + std::to_string(truth_x);
+    debug_str += "\n";
+    if (curr_box_x > (truth_x + norm_box_tolerance_)) {
+      break;                                                  // failed to find a matching box
+    } else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
+               (blob_index + 1 >= num_blobs ||                // next box can't be included
+                next_box_x > truth_x + norm_box_tolerance_)) {
+      // Close the current group at this blob and start a new one after it.
+      correct_segmentation_cols_.push_back(curr_box_col);
+      correct_segmentation_rows_.push_back(next_box_col - 1);
+      ++truth_idx;
+      debug_str += "col=" + std::to_string(curr_box_col);
+      debug_str += " row=" + std::to_string(next_box_col - 1);
+      debug_str += "\n";
+      curr_box_col = next_box_col;
+    }
+  }
+  // Failure if blobs are left over or not every truth box got a group.
+  if (blob_index < num_blobs || // trailing blobs
+      correct_segmentation_cols_.size() != norm_truth_word_.length()) {
+    debug_str += 
+        "Blamer failed to find correct segmentation"
+        " (tolerance=" +
+        std::to_string(norm_box_tolerance_);
+    if (blob_index >= num_blobs) {
+      debug_str += " blob == nullptr";
+    }
+    debug_str += ")\n";
+    debug_str += " path length " + std::to_string(correct_segmentation_cols_.size());
+    debug_str += " vs. truth " + std::to_string(norm_truth_word_.length());
+    debug_str += "\n";
+    SetBlame(IRR_UNKNOWN, debug_str, nullptr, debug);
+    correct_segmentation_cols_.clear();
+    correct_segmentation_rows_.clear();
+  }
+}
+
+// Returns true if a guided segmentation search is needed: no blame has been
+// set yet, no guided search is already running, the truth has character
+// boxes, and best_choice does not match the truth.
+bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
+  if (incorrect_result_reason_ != IRR_CORRECT) {
+    return false;
+  }
+  if (segsearch_is_looking_for_blame_ || !truth_has_char_boxes_) {
+    return false;
+  }
+  return !ChoiceIsCorrect(best_choice);
+}
+
+#if !defined(DISABLED_LEGACY_ENGINE)
+// Setup ready to guide the segmentation search to the correct segmentation.
+// For every cell of the correct segmentation that is not yet classified in
+// ratings, a blamer pain point is requested. If one cannot be generated the
+// guided search is abandoned and IRR_SEGSEARCH_HEUR is blamed.
+void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings,
+                                    UNICHAR_ID wildcard_id, bool debug, std::string &debug_str,
+                                    tesseract::LMPainPoints *pain_points, double max_char_wh_ratio,
+                                    WERD_RES *word_res) {
+  segsearch_is_looking_for_blame_ = true;
+  if (debug) {
+    tprintf("segsearch starting to look for blame\n");
+  }
+  // Fill pain points for any unclassified blob corresponding to the
+  // correct segmentation state.
+  debug_str += "Correct segmentation:\n";
+  const int num_steps = correct_segmentation_cols_.size();
+  for (int step = 0; step < num_steps; ++step) {
+    const auto col = correct_segmentation_cols_[step];
+    const auto row = correct_segmentation_rows_[step];
+    debug_str += "col=" + std::to_string(col);
+    debug_str += " row=" + std::to_string(row);
+    debug_str += "\n";
+    if (ratings->Classified(col, row, wildcard_id)) {
+      continue; // Already classified, no pain point needed.
+    }
+    if (pain_points->GeneratePainPoint(col, row, tesseract::LM_PPTYPE_BLAMER, 0.0, false,
+                                       max_char_wh_ratio, word_res)) {
+      continue;
+    }
+    segsearch_is_looking_for_blame_ = false;
+    debug_str += "\nFailed to insert pain point\n";
+    SetBlame(IRR_SEGSEARCH_HEUR, debug_str, best_choice, debug);
+    break;
+  }
+}
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+
+// Reports whether a guided segmentation search is currently in progress.
+bool BlamerBundle::GuidedSegsearchStillGoing() const {
+  const bool still_looking = segsearch_is_looking_for_blame_;
+  return still_looking;
+}
+
+// The segmentation search has ended. Sets the blame appropriately.
+// Does nothing unless a guided search was still looking for blame.
+void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str) {
+  // If we are still looking for blame (i.e. best_choice is incorrect, but a
+  // path representing the correct segmentation could be constructed), we can
+  // blame segmentation search pain point prioritization if the rating of the
+  // path corresponding to the correct segmentation is better than that of
+  // best_choice (i.e. language model would have done the correct thing, but
+  // because of poor pain point prioritization the correct segmentation was
+  // never explored). Otherwise we blame the tradeoff between the language model
+  // and the classifier, since even after exploring the path corresponding to
+  // the correct segmentation incorrect best_choice would have been chosen.
+  // One special case when we blame the classifier instead is when best choice
+  // is incorrect, but it is a dictionary word and it is the classifier's top
+  // choice.
+  if (!segsearch_is_looking_for_blame_) {
+    return;
+  }
+  segsearch_is_looking_for_blame_ = false;
+  if (best_choice_is_dict_and_top_choice_) {
+    // Note: this case replaces debug_str rather than appending to it.
+    debug_str = "Best choice is: incorrect, top choice, dictionary word";
+    debug_str += " with permuter ";
+    debug_str += best_choice->permuter_name();
+    SetBlame(IRR_CLASSIFIER, debug_str, best_choice, debug);
+    return;
+  }
+  if (best_correctly_segmented_rating_ < best_choice->rating()) {
+    debug_str += "Correct segmentation state was not explored";
+    SetBlame(IRR_SEGSEARCH_PP, debug_str, best_choice, debug);
+    return;
+  }
+  if (best_correctly_segmented_rating_ >= WERD_CHOICE::kBadRating) {
+    debug_str += "Correct segmentation paths were pruned by LM\n";
+  } else {
+    debug_str += "Best correct segmentation rating " +
+                 std::to_string(best_correctly_segmented_rating_);
+    debug_str += " vs. best choice rating " + std::to_string(best_choice->rating());
+  }
+  SetBlame(IRR_CLASS_LM_TRADEOFF, debug_str, best_choice, debug);
+}
+
+// If the bundle is null or still does not indicate the correct result,
+// fix it and use some backup reason for the blame.
+// A missing bundle is created and blamed on page layout; a bundle with
+// rejected truth is blamed with IRR_NO_TRUTH; otherwise the recorded reason
+// is reconciled with whether word->best_choice actually matches the truth.
+void BlamerBundle::LastChanceBlame(bool debug, WERD_RES *word) {
+  if (word->blamer_bundle == nullptr) {
+    word->blamer_bundle = new BlamerBundle();
+    word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame", word->best_choice, debug);
+    return;
+  }
+  auto *bundle = word->blamer_bundle;
+  if (bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
+    bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", word->best_choice, debug);
+    return;
+  }
+  const bool choice_matches_truth = bundle->ChoiceIsCorrect(word->best_choice);
+  const bool marked_correct = bundle->incorrect_result_reason_ == IRR_CORRECT;
+  if (marked_correct && !choice_matches_truth) {
+    // Marked correct, but the final choice is wrong and nothing was blamed.
+    std::string debug_str = "Choice is incorrect after recognition";
+    bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice, debug);
+  } else if (!marked_correct && choice_matches_truth) {
+    // A blame was recorded, but the final choice is right: withdraw it.
+    if (debug) {
+      tprintf("Corrected %s\n", bundle->debug_.c_str());
+    }
+    bundle->incorrect_result_reason_ = IRR_CORRECT;
+    bundle->debug_ = "";
+  }
+}
+
+// Sets the misadaption debug if this word is incorrect, as this word is
+// being adapted to. Nothing is recorded when there is no truth or when
+// best_choice matches the truth.
+void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug) {
+  if (incorrect_result_reason_ == IRR_NO_TRUTH || ChoiceIsCorrect(best_choice)) {
+    return;
+  }
+  misadaption_debug_ = "misadapt to word (";
+  misadaption_debug_ += best_choice->permuter_name();
+  misadaption_debug_ += "): ";
+  FillDebugString("", best_choice, misadaption_debug_);
+  if (debug) {
+    tprintf("%s\n", misadaption_debug_.c_str());
+  }
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.h
@ -0,0 +1,350 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blamer.h
+// Description: Module allowing precise error causes to be allocated.
+// Author:      Rike Antonova
+// Refactored:  Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
+#define TESSERACT_CCSTRUCT_BLAMER_H_
+
+#ifdef HAVE_CONFIG_H
+#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+#include "boxword.h" // for BoxWord
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
+#endif                                 //  ndef DISABLED_LEGACY_ENGINE
+#include "ratngs.h"                    // for BLOB_CHOICE_LIST (ptr only)
+#include "rect.h"                      // for TBOX
+#include "tprintf.h"                   // for tprintf
+
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+#include <cstdint> // for int16_t
+#include <cstring> // for memcpy
+#include <vector>  // for std::vector
+
+namespace tesseract {
+
+class DENORM;
+class MATRIX;
+class UNICHARSET;
+class WERD_RES;
+
+struct MATRIX_COORD;
+struct TWERD;
+
+class LMPainPoints;
+
+// Base tolerance used when comparing box edges against truth boxes.
+// NOTE(review): presumably expressed in un-normalized image coordinates,
+// since it is multiplied by the denorm x scale before use - confirm.
+static const int16_t kBlamerBoxTolerance = 5;
+
+// Enum for expressing the source of error.
+// Note: Please update kIncorrectResultReasonNames when modifying this enum.
+enum IncorrectResultReason {
+  // The text recorded in best choice == truth text
+  IRR_CORRECT,
+  // Either: Top choice is incorrect and is a dictionary word (language model
+  // is unlikely to help correct such errors, so blame the classifier).
+  // Or: the correct unichar was not included in shortlist produced by the
+  // classifier at all.
+  IRR_CLASSIFIER,
+  // Chopper has not found one or more splits that correspond to the correct
+  // character bounding boxes recorded in BlamerBundle::truth_word.
+  IRR_CHOPPER,
+  // Classifier did include correct unichars for each blob in the correct
+  // segmentation, however its rating could have been too bad to allow the
+  // language model to pull out the correct choice. On the other hand the
+  // strength of the language model might have been too weak to favor the
+  // correct answer, thus we call this case a classifier-language model
+  // tradeoff error.
+  IRR_CLASS_LM_TRADEOFF,
+  // Page layout failed to produce the correct bounding box. Blame page layout
+  // if the truth was not found for the word, which implies that the bounding
+  // box of the word was incorrect (no truth word had a similar bounding box).
+  IRR_PAGE_LAYOUT,
+  // SegSearch heuristic prevented one or more blobs from the correct
+  // segmentation state to be classified (e.g. the blob was too wide).
+  IRR_SEGSEARCH_HEUR,
+  // The correct segmentation state was not explored because of poor SegSearch
+  // pain point prioritization. We blame SegSearch pain point prioritization
+  // if the best rating of a choice constructed from correct segmentation is
+  // better than that of the best choice (i.e. if we got to explore the correct
+  // segmentation state, language model would have picked the correct choice).
+  IRR_SEGSEARCH_PP,
+  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
+  // and thus use the old language model (permuters).
+  // TODO(antonova): integrate the new language model with chopper
+  IRR_CLASS_OLD_LM_TRADEOFF,
+  // If there is an incorrect adaptive template match with a better score than
+  // a correct one (either pre-trained or adapted), mark this as adaption error.
+  IRR_ADAPTION,
+  // split_and_recog_word() failed to find a suitable split in truth.
+  IRR_NO_TRUTH_SPLIT,
+  // Truth is not available for this word (e.g. when words in corrected content
+  // file are turned into ~~~~ because an appropriate alignment was not found).
+  IRR_NO_TRUTH,
+  // The text recorded in best choice != truth text, but none of the above
+  // reasons are set.
+  IRR_UNKNOWN,
+
+  // Number of reasons above; not itself a valid reason.
+  IRR_NUM_REASONS
+};
+
+// Blamer-related information to determine the source of errors.
+struct BlamerBundle {
+  static const char *IncorrectReasonName(IncorrectResultReason irr);
+  BlamerBundle()
+      : truth_has_char_boxes_(false)
+      , incorrect_result_reason_(IRR_CORRECT)
+      , lattice_data_(nullptr) {
+    ClearResults();
+  }
+  BlamerBundle(const BlamerBundle &other) {
+    this->CopyTruth(other);
+    this->CopyResults(other);
+  }
+  ~BlamerBundle() {
+    delete[] lattice_data_;
+  }
+
+  // Accessors.
+  std::string TruthString() const {
+    std::string truth_str;
+    for (auto &text : truth_text_) {
+      truth_str += text;
+    }
+    return truth_str;
+  }
+  IncorrectResultReason incorrect_result_reason() const {
+    return incorrect_result_reason_;
+  }
+  bool NoTruth() const {
+    return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;
+  }
+  bool HasDebugInfo() const {
+    return debug_.length() > 0 || misadaption_debug_.length() > 0;
+  }
+  const std::string &debug() const {
+    return debug_;
+  }
+  const std::string &misadaption_debug() const {
+    return misadaption_debug_;
+  }
+  void UpdateBestRating(float rating) {
+    if (rating < best_correctly_segmented_rating_) {
+      best_correctly_segmented_rating_ = rating;
+    }
+  }
+  int correct_segmentation_length() const {
+    return correct_segmentation_cols_.size();
+  }
+  // Returns true if the given ratings matrix col,row position is included
+  // in the correct segmentation path at the given index.
+  bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) {
+    return correct_segmentation_cols_[index] == coord.col &&
+           correct_segmentation_rows_[index] == coord.row;
+  }
+  void set_best_choice_is_dict_and_top_choice(bool value) {
+    best_choice_is_dict_and_top_choice_ = value;
+  }
+  const char *lattice_data() const {
+    return lattice_data_;
+  }
+  int lattice_size() const {
+    return lattice_size_; // size of lattice_data in bytes
+  }
+  // Replaces the stored lattice with a private copy of the first size bytes
+  // of data.
+  //   data: source bytes; a null pointer clears the stored lattice.
+  //   size: number of bytes to copy; a value <= 0 clears the stored lattice.
+  // The copy is made before the old buffer is released, so data may point
+  // into the currently stored lattice, and a failed allocation leaves the
+  // bundle unchanged.
+  void set_lattice_data(const char *data, int size) {
+    char *replacement = nullptr;
+    int replacement_size = 0;
+    if (data != nullptr && size > 0) {
+      replacement = new char[size];
+      memcpy(replacement, data, size);
+      replacement_size = size;
+    }
+    delete[] lattice_data_;
+    lattice_data_ = replacement;
+    lattice_size_ = replacement_size;
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  // Read-only access to params_training_bundle_. Only declared when
+  // DISABLED_LEGACY_ENGINE is not defined.
+  const tesseract::ParamsTrainingBundle &params_training_bundle() const {
+    return params_training_bundle_;
+  }
+  // Adds a new ParamsTrainingHypothesis to the current hypothesis list by
+  // forwarding hypo to params_training_bundle_.
+  void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) {
+    params_training_bundle_.AddHypothesis(hypo);
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+  // Functions to setup the blamer.
+  // Whole word string, whole word bounding box.
+  void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);
+  // Single "character" string, "character" bounding box.
+  // May be called multiple times to indicate the characters in a word.
+  void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box);
+  // Marks that there is something wrong with the truth text, like it contains
+  // reject characters.
+  void SetRejectedTruth();
+
+  // Returns true if the provided word_choice is correct.
+  bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;
+
+  // Puts every result field back to its initial state while keeping the truth
+  // data. A reason for which NoTruth() holds is preserved; any other reason
+  // is reset to IRR_CORRECT. The stored lattice buffer is released.
+  // misadaption_debug_ is not touched.
+  void ClearResults() {
+    // Normalized truth.
+    norm_truth_word_.DeleteAllBoxes();
+    norm_box_tolerance_ = 0;
+    // Blame state.
+    const bool keep_reason = NoTruth();
+    if (!keep_reason) {
+      incorrect_result_reason_ = IRR_CORRECT;
+    }
+    debug_.clear();
+    // Segmentation-search state.
+    segsearch_is_looking_for_blame_ = false;
+    best_choice_is_dict_and_top_choice_ = false;
+    best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
+    correct_segmentation_rows_.clear();
+    correct_segmentation_cols_.clear();
+    // Lattice storage.
+    delete[] lattice_data_;
+    lattice_size_ = 0;
+    lattice_data_ = nullptr;
+  }
+  // Copies the truth fields (char-box flag, truth word, truth text) from
+  // other. The reason is taken from other only when other.NoTruth() holds;
+  // otherwise it becomes IRR_CORRECT.
+  void CopyTruth(const BlamerBundle &other) {
+    truth_text_ = other.truth_text_;
+    truth_word_ = other.truth_word_;
+    truth_has_char_boxes_ = other.truth_has_char_boxes_;
+    if (other.NoTruth()) {
+      incorrect_result_reason_ = other.incorrect_result_reason_;
+    } else {
+      incorrect_result_reason_ = IRR_CORRECT;
+    }
+  }
+  // Copies every result field from other (the truth data and the debug
+  // strings are not touched) and deep-copies the lattice buffer.
+  // When other holds no lattice data, lattice_size_ is reset to 0 together
+  // with the pointer, so lattice_size() never reports a stale or
+  // indeterminate value.
+  // NOTE(review): a lattice buffer already held by *this is not released
+  // here; callers that reuse a populated bundle should ClearResults() first --
+  // confirm against the call sites.
+  void CopyResults(const BlamerBundle &other) {
+    norm_truth_word_ = other.norm_truth_word_;
+    norm_box_tolerance_ = other.norm_box_tolerance_;
+    incorrect_result_reason_ = other.incorrect_result_reason_;
+    segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
+    best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
+    correct_segmentation_cols_ = other.correct_segmentation_cols_;
+    correct_segmentation_rows_ = other.correct_segmentation_rows_;
+    best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;
+    if (other.lattice_data_ != nullptr) {
+      lattice_data_ = new char[other.lattice_size_];
+      memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
+      lattice_size_ = other.lattice_size_;
+    } else {
+      lattice_data_ = nullptr;
+      lattice_size_ = 0;
+    }
+  }
+  // Returns a C string describing this bundle's blame; SetBlame() uses it as
+  // the start of debug_.
+  // NOTE(review): defined out of line; presumably
+  // IncorrectReasonName(incorrect_result_reason_) -- confirm.
+  const char *IncorrectReason() const;
+
+  // Appends choice and truth details to the given debug string.
+  void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);
+
+  // Sets up the norm_truth_word from truth_word using the given DENORM.
+  void SetupNormTruthWord(const DENORM &denorm);
+
+  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
+  // bundles) where the right edge of the left-hand word is word1_right,
+  // and the left edge of the right-hand word is word2_left.
+  void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
+                   BlamerBundle *bundle2) const;
+  // "Joins" the blames from bundle1 and bundle2 into *this.
+  void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);
+
+  // If a blob with the same bounding box as one of the truth character
+  // bounding boxes is not classified as the corresponding truth character
+  // blames character classifier for incorrect answer.
+  void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
+                       const BLOB_CHOICE_LIST &choices, bool debug);
+
+  // Checks whether chops were made at all the character bounding box
+  // boundaries in word->truth_word. If not - blames the chopper for an
+  // incorrect answer.
+  void SetChopperBlame(const WERD_RES *word, bool debug);
+  // Blames the classifier or the language model if, after running only the
+  // chopper, best_choice is incorrect and no blame has been yet set.
+  // Blames the classifier if best_choice is classifier's top choice and is a
+  // dictionary word (i.e. language model could not have helped).
+  // Otherwise, blames the language model (formerly permuter word adjustment).
+  void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
+                                  bool valid_permuter, bool debug);
+  // Sets up the correct_segmentation_* to mark the correct bounding boxes.
+  void SetupCorrectSegmentation(const TWERD *word, bool debug);
+
+  // Returns true if a guided segmentation search is needed.
+  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
+  // Setup ready to guide the segmentation search to the correct segmentation.
+  void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,
+                        bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,
+                        double max_char_wh_ratio, WERD_RES *word_res);
+  // Returns true if the guided segsearch is in progress.
+  bool GuidedSegsearchStillGoing() const;
+  // The segmentation search has ended. Sets the blame appropriately.
+  void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);
+
+  // If the bundle is null or still does not indicate the correct result,
+  // fix it and use some backup reason for the blame.
+  static void LastChanceBlame(bool debug, WERD_RES *word);
+
+  // Sets the misadaption debug if this word is incorrect, as this word is
+  // being adapted to.
+  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
+
+private:
+  // Copy assignment is currently unused and is therefore deleted.
+  BlamerBundle &operator=(const BlamerBundle &other) = delete;
+  // Records irr as the blame and rebuilds debug_ as the IncorrectReason()
+  // text followed by " to blame: " and whatever FillDebugString() appends for
+  // msg and choice. Prints the resulting text when debug is set.
+  void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,
+                bool debug) {
+    incorrect_result_reason_ = irr;
+    // The reason must be stored before IncorrectReason() is queried.
+    debug_.assign(IncorrectReason());
+    debug_.append(" to blame: ");
+    FillDebugString(msg, choice, debug_);
+    if (!debug) {
+      return;
+    }
+    tprintf("SetBlame(): %s", debug_.c_str());
+  }
+
+private:
+  // Set to true when bounding boxes for individual unichars are recorded.
+  bool truth_has_char_boxes_;
+  // Variables used by the segmentation search when looking for the blame.
+  // Set to true while segmentation search is continued after the usual
+  // termination condition in order to look for the blame.
+  bool segsearch_is_looking_for_blame_;
+  // Set to true if best choice is a dictionary word and
+  // classifier's top choice.
+  bool best_choice_is_dict_and_top_choice_;
+  // Tolerance for bounding box comparisons in normalized space.
+  int norm_box_tolerance_;
+  // The truth_word (in the original image coordinate space) contains ground
+  // truth bounding boxes for this WERD_RES.
+  tesseract::BoxWord truth_word_;
+  // Same as above, but in normalized coordinates
+  // (filled in by WERD_RES::SetupForRecognition()).
+  tesseract::BoxWord norm_truth_word_;
+  // Contains ground truth unichar for each of the bounding boxes in truth_word.
+  std::vector<std::string> truth_text_;
+  // The reason for incorrect OCR result.
+  IncorrectResultReason incorrect_result_reason_;
+  // Debug text associated with the blame.
+  std::string debug_;
+  // Misadaption debug information (filled in if this word was misadapted to).
+  std::string misadaption_debug_;
+  // Vectors populated by SegSearch to indicate column and row indices that
+  // correspond to blobs with correct bounding boxes.
+  std::vector<int> correct_segmentation_cols_;
+  std::vector<int> correct_segmentation_rows_;
+  // Best rating for correctly segmented path
+  // (set and used by SegSearch when looking for blame).
+  float best_correctly_segmented_rating_;
+  int lattice_size_; // size of lattice_data in bytes
+  // Serialized segmentation search lattice.
+  char *lattice_data_;
+  // Information about hypotheses (paths) explored by the segmentation search.
+#ifndef DISABLED_LEGACY_ENGINE
+  tesseract::ParamsTrainingBundle params_training_bundle_;
+#endif // ndef DISABLED_LEGACY_ENGINE
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCSTRUCT_BLAMER_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.h
@ -0,0 +1,853 @@
+/**********************************************************************
+ * File:        blobbox.h  (Formerly blobnbox.h)
+ * Description: Code for the textord blob class.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef BLOBBOX_H
+#define BLOBBOX_H
+
+#include "elst.h"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
+#include "elst2.h"      // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
+#include "errcode.h"    // for ASSERT_HOST
+#include "ocrblock.h"   // for BLOCK
+#include "params.h"     // for DoubleParam, double_VAR_H
+#include "pdblock.h"    // for PDBLK
+#include "points.h"     // for FCOORD, ICOORD, ICOORDELT_LIST
+#include "quspline.h"   // for QSPLINE
+#include "rect.h"       // for TBOX
+#include "scrollview.h" // for ScrollView, ScrollView::Color
+#include "statistc.h"   // for STATS
+#include "stepblob.h"   // for C_BLOB
+#include "tprintf.h"    // for tprintf
+#include "werd.h"       // for WERD_LIST
+
+#include <cinttypes> // for PRId32
+#include <cmath>     // for std::sqrt
+#include <cstdint>   // for int16_t, int32_t
+
+struct Pix;
+
+namespace tesseract {
+
+class C_OUTLINE;
+
+// Pitch classification values, as stored in TO_ROW::pitch_decision.
+enum PITCH_TYPE {
+  PITCH_DUNNO,       // insufficient data
+  PITCH_DEF_FIXED,   // definitely fixed
+  PITCH_MAYBE_FIXED, // could be
+  PITCH_DEF_PROP,    // presumably: definitely proportional -- TODO confirm
+  PITCH_MAYBE_PROP,  // presumably: could be proportional -- TODO confirm
+  PITCH_CORR_FIXED,  // NOTE(review): "CORR" likely means corrected -- confirm
+  PITCH_CORR_PROP    // NOTE(review): as above, for proportional pitch
+};
+
+// The possible tab-stop types of each side of a BLOBNBOX.
+// The ordering is important, as it is used for deleting dead-ends in the
+// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
+// non-aligned, unset, or deleted members.
+// The enumerators carry no explicit values, so they number 0 (TT_NONE)
+// through 5 (TT_VLINE) in declaration order.
+enum TabType {
+  TT_NONE,          // Not a tab.
+  TT_DELETED,       // Not a tab after detailed analysis.
+  TT_MAYBE_RAGGED,  // Initial designation of a tab-stop candidate.
+  TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
+  TT_CONFIRMED,     // Aligned with neighbours.
+  TT_VLINE          // Detected as a vertical line.
+};
+
+// The possible region types of a BLOBNBOX.
+// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
+// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
+// *Type static functions of BLOBNBOX (IsTextType, IsImageType, IsLineType,
+// UnMergeableType).
+enum BlobRegionType {
+  BRT_NOISE,     // Neither text nor image.
+  BRT_HLINE,     // Horizontal separator line.
+  BRT_VLINE,     // Vertical separator line.
+  BRT_RECTIMAGE, // Rectangular image.
+  BRT_POLYIMAGE, // Non-rectangular image.
+  BRT_UNKNOWN,   // Not determined yet.
+  BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
+  BRT_TEXT,      // Convincing text.
+
+  BRT_COUNT // Number of possibilities.
+};
+
+// enum for elements of arrays that refer to neighbours.
+// NOTE: keep in this order, so ^2 can be used to flip direction:
+// BND_LEFT (0) ^ 2 == BND_RIGHT (2) and BND_BELOW (1) ^ 2 == BND_ABOVE (3).
+// BND_COUNT is the number of directions and is used as an array size.
+enum BlobNeighbourDir { BND_LEFT, BND_BELOW, BND_RIGHT, BND_ABOVE, BND_COUNT };
+
+// enum for special type of text characters, such as math symbol or italic.
+enum BlobSpecialTextType {
+  BSTT_NONE,    // No special.
+  BSTT_ITALIC,  // Italic style.
+  BSTT_DIGIT,   // Digit symbols.
+  BSTT_MATH,    // Mathematical symbols (not including digit).
+  BSTT_UNCLEAR, // Characters with low recognition rate.
+  BSTT_SKIP,    // Characters that we skip labeling (usually too small).
+  BSTT_COUNT    // Number of real values above; not itself a type.
+};
+
+// Returns the direction opposite to dir. XOR with 2 swaps BND_LEFT with
+// BND_RIGHT and BND_BELOW with BND_ABOVE, which relies on the declaration
+// order of BlobNeighbourDir.
+inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
+  const int flipped = dir ^ 2;
+  return static_cast<BlobNeighbourDir>(flipped);
+}
+
+// BlobTextFlowType indicates the quality of neighbouring information
+// related to a chain of connected components, either horizontally or
+// vertically. Also used by ColPartition for the collection of blobs
+// within, which should all have the same value in most cases.
+// DominatesInMerge() depends on the declaration order of these values.
+enum BlobTextFlowType {
+  BTFT_NONE,          // No text flow set yet.
+  BTFT_NONTEXT,       // Flow too poor to be likely text.
+  BTFT_NEIGHBOURS,    // Neighbours support flow in this direction.
+  BTFT_CHAIN,         // There is a weak chain of text in this direction.
+  BTFT_STRONG_CHAIN,  // There is a strong chain of text in this direction.
+  BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
+  BTFT_LEADER,        // Leader dots/dashes etc.
+  BTFT_COUNT          // Number of real values above; not itself a flow type.
+};
+
+// Decides whether type1 wins over type2 when two flows are merged.
+// BTFT_LEADER is weak: it never dominates, and every other type dominates it.
+// For the remaining types the enum ordering decides, and equal types yield
+// true in both directions, so callers should not rely on the result when
+// type1 == type2.
+inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
+  const bool first_is_leader = type1 == BTFT_LEADER;
+  const bool second_is_leader = type2 == BTFT_LEADER;
+  // Leader as type1 loses outright; leader as type2 loses to anything else;
+  // otherwise fall back to the enum ordering.
+  return !first_is_leader && (second_is_leader || type1 >= type2);
+}
+
+// Forward declaration only; BLOBNBOX refers to ColPartition through a
+// pointer (owner_).
+class ColPartition;
+
+// BLOBNBOX is forward-declared so that the list macro can name it.
+// NOTE(review): ELISTIZEH presumably declares BLOBNBOX_LIST and BLOBNBOX_IT,
+// both of which the class below uses -- confirm in elst.h.
+class BLOBNBOX;
+ELISTIZEH(BLOBNBOX)
+// A C_BLOB as seen by the layout code: its bounding box plus tab-stop,
+// region, text-flow, rule-line, stroke-width and neighbour bookkeeping.
+// The C_BLOB is deleted by the destructor only while owns_cblob_ is set.
+// NOTE(review): no copy operations are declared here, so a copy of an owning
+// BLOBNBOX would share cblob_ptr with the original -- confirm that owning
+// instances are never copied.
+class BLOBNBOX : public ELIST_LINK {
+public:
+  // Builds a box with no blob attached; ReInit() sets the remaining fields.
+  BLOBNBOX() {
+    ReInit();
+  }
+  // Wraps srcblob. ReInit() runs after box is taken from the blob but before
+  // cblob_ptr and area are stored, so it sees the real box, no blob and a
+  // zero area.
+  explicit BLOBNBOX(C_BLOB *srcblob) {
+    box = srcblob->bounding_box();
+    ReInit();
+    area = static_cast<int>(srcblob->area());
+    cblob_ptr = srcblob;
+  }
+  // Deletes the blob only when this box has been told that it owns it.
+  ~BLOBNBOX() {
+    if (!owns_cblob_) {
+      return;
+    }
+    delete cblob_ptr;
+  }
+
+  // Detaches and deletes the C_BLOB of every box on the list. The boxes
+  // themselves stay on the list.
+  // A BLOBNBOX generally doesn't own its blobs, so if they do, you
+  // have to delete them explicitly.
+  static void clear_blobnboxes(BLOBNBOX_LIST *boxes) {
+    BLOBNBOX_IT box_it(boxes);
+    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
+      // TODO: remove next line, currently still needed for resultiterator_test.
+      C_BLOB *detached = box_it.data()->remove_cblob();
+      delete detached;
+    }
+  }
+
+  // Allocates a C_BLOB from outline and a BLOBNBOX wrapping it; the caller
+  // receives the new BLOBNBOX.
+  static BLOBNBOX *RealBlob(C_OUTLINE *outline) {
+    return new BLOBNBOX(new C_BLOB(outline));
+  }
+
+  // Rotates the box and the underlying blob.
+  void rotate(FCOORD rotation);
+
+  // Methods that act on the box without touching the underlying blob.
+  // Reflect the box in the y-axis, leaving the underlying blob untouched.
+  void reflect_box_in_y_axis();
+  // Rotates the box by the angle given by rotation.
+  // If the blob is a diacritic, then only small rotations for skew
+  // correction can be applied.
+  void rotate_box(FCOORD rotation);
+  // Moves just the box by the given vector. For a diacritic the base-char
+  // limits are shifted by the same y offset; otherwise they are reset to the
+  // moved box.
+  void translate_box(ICOORD v) {
+    // Must be sampled before the move, as it compares against box.
+    const bool was_diacritic = IsDiacritic();
+    box.move(v);
+    if (was_diacritic) {
+      base_char_top_ += v.y();
+      base_char_bottom_ += v.y();
+      return;
+    }
+    set_diacritic_box(box);
+  }
+  void merge(BLOBNBOX *nextblob);
+  void really_merge(BLOBNBOX *other);
+  // Fake chop of the blob.
+  //   start_it: location of this
+  //   blob_it:  iterator
+  //   rotation: for landscape
+  //   xheight:  line height
+  void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight);
+
+  void NeighbourGaps(int gaps[BND_COUNT]) const;
+  void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const;
+  void CleanNeighbours();
+  // Returns positive if there is at least one side neighbour that has a
+  // similar stroke width and is not on the other side of a rule line.
+  int GoodTextBlob() const;
+  // Returns the number of side neighbours that are of type BRT_NOISE.
+  int NoisyNeighbours() const;
+
+  // Returns true if the blob is noise and has no owner.
+  bool DeletableNoise() const {
+    if (owner() != nullptr) {
+      return false;
+    }
+    return region_type() == BRT_NOISE;
+  }
+
+  // Returns true, and sets vert_possible/horz_possible if the blob has some
+  // feature that makes it individually appear to flow one way.
+  // eg if it has a high aspect ratio, yet has a complex shape, such as a
+  // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
+  bool DefiniteIndividualFlow();
+
+  // Returns true if there is no tabstop violation in merging this and other.
+  bool ConfirmNoTabViolation(const BLOBNBOX &other) const;
+
+  // Returns true if other has a similar stroke width to this.
+  bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,
+                           double constant_tolerance) const;
+
+  // Returns a bounding box of the outline contained within the
+  // given horizontal range.
+  TBOX BoundsWithinLimits(int left, int right);
+
+  // Estimates and stores the baseline position based on the shape of the
+  // outline.
+  void EstimateBaselinePosition();
+
+  // Simple accessors.
+  const TBOX &bounding_box() const { return box; }
+  // Set the bounding box. Use with caution.
+  // Normally use compute_bounding_box instead.
+  // The base-char limits are reset to the new box.
+  void set_bounding_box(const TBOX &new_box) {
+    box = new_box;
+    set_diacritic_box(box);
+  }
+  // Reloads the box from the blob and resets the base-char limits and the
+  // baseline estimate to it. Requires a blob to be attached.
+  void compute_bounding_box() {
+    box = cblob_ptr->bounding_box();
+    set_diacritic_box(box);
+    baseline_y_ = box.bottom();
+  }
+  const TBOX &reduced_box() const { return red_box; }
+  // Stores the reduced box and marks it as set.
+  void set_reduced_box(TBOX new_box) {
+    reduced = true;
+    red_box = new_box;
+  }
+  int32_t enclosed_area() const { return area; }
+  bool joined_to_prev() const { return joined; }
+  bool red_box_set() const { return reduced; }
+  int repeated_set() const { return repeated_set_; }
+  void set_repeated_set(int set_id) { repeated_set_ = set_id; }
+  C_BLOB *cblob() const { return cblob_ptr; }
+  // Hands the blob to the caller: this box forgets it and stops owning it.
+  C_BLOB *remove_cblob() {
+    C_BLOB *released = cblob_ptr;
+    owns_cblob_ = false;
+    cblob_ptr = nullptr;
+    return released;
+  }
+  TabType left_tab_type() const { return left_tab_type_; }
+  void set_left_tab_type(TabType new_type) { left_tab_type_ = new_type; }
+  TabType right_tab_type() const { return right_tab_type_; }
+  void set_right_tab_type(TabType new_type) { right_tab_type_ = new_type; }
+  BlobRegionType region_type() const { return region_type_; }
+  void set_region_type(BlobRegionType new_type) { region_type_ = new_type; }
+  BlobSpecialTextType special_text_type() const { return spt_type_; }
+  void set_special_text_type(BlobSpecialTextType new_type) { spt_type_ = new_type; }
+  BlobTextFlowType flow() const { return flow_; }
+  void set_flow(BlobTextFlowType value) { flow_ = value; }
+  bool vert_possible() const { return vert_possible_; }
+  void set_vert_possible(bool value) { vert_possible_ = value; }
+  bool horz_possible() const { return horz_possible_; }
+  void set_horz_possible(bool value) { horz_possible_ = value; }
+  int left_rule() const { return left_rule_; }
+  void set_left_rule(int new_left) { left_rule_ = new_left; }
+  int right_rule() const { return right_rule_; }
+  void set_right_rule(int new_right) { right_rule_ = new_right; }
+  int left_crossing_rule() const { return left_crossing_rule_; }
+  void set_left_crossing_rule(int new_left) { left_crossing_rule_ = new_left; }
+  int right_crossing_rule() const { return right_crossing_rule_; }
+  void set_right_crossing_rule(int new_right) { right_crossing_rule_ = new_right; }
+  float horz_stroke_width() const { return horz_stroke_width_; }
+  void set_horz_stroke_width(float width) { horz_stroke_width_ = width; }
+  float vert_stroke_width() const { return vert_stroke_width_; }
+  void set_vert_stroke_width(float width) { vert_stroke_width_ = width; }
+  float area_stroke_width() const { return area_stroke_width_; }
+  tesseract::ColPartition *owner() const { return owner_; }
+  void set_owner(tesseract::ColPartition *new_owner) { owner_ = new_owner; }
+  bool leader_on_left() const { return leader_on_left_; }
+  void set_leader_on_left(bool flag) { leader_on_left_ = flag; }
+  bool leader_on_right() const { return leader_on_right_; }
+  void set_leader_on_right(bool flag) { leader_on_right_ = flag; }
+  BLOBNBOX *neighbour(BlobNeighbourDir n) const { return neighbours_[n]; }
+  bool good_stroke_neighbour(BlobNeighbourDir n) const { return good_stroke_neighbours_[n]; }
+  // Stores the neighbour in direction n together with its stroke-match flag.
+  void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good) {
+    good_stroke_neighbours_[n] = good;
+    neighbours_[n] = neighbour;
+  }
+  // A blob counts as a diacritic when its base-char limits differ from the
+  // top or bottom of its own box.
+  bool IsDiacritic() const {
+    return !(base_char_top_ == box.top() && base_char_bottom_ == box.bottom());
+  }
+  int base_char_top() const { return base_char_top_; }
+  int base_char_bottom() const { return base_char_bottom_; }
+  int baseline_position() const { return baseline_y_; }
+  int line_crossings() const { return line_crossings_; }
+  void set_line_crossings(int value) { line_crossings_ = value; }
+  // Takes the base-char top/bottom limits from diacritic_box.
+  void set_diacritic_box(const TBOX &diacritic_box) {
+    base_char_bottom_ = diacritic_box.bottom();
+    base_char_top_ = diacritic_box.top();
+  }
+  BLOBNBOX *base_char_blob() const { return base_char_blob_; }
+  void set_base_char_blob(BLOBNBOX *blob) { base_char_blob_ = blob; }
+  void set_owns_cblob(bool value) { owns_cblob_ = value; }
+
+  // True when only vertical flow is possible.
+  bool UniquelyVertical() const { return !horz_possible_ && vert_possible_; }
+  // True when only horizontal flow is possible.
+  bool UniquelyHorizontal() const { return !vert_possible_ && horz_possible_; }
+
+  // Returns true if the region type is text.
+  static bool IsTextType(BlobRegionType type) {
+    switch (type) {
+      case BRT_TEXT:
+      case BRT_VERT_TEXT:
+        return true;
+      default:
+        return false;
+    }
+  }
+  // Returns true if the region type is image.
+  static bool IsImageType(BlobRegionType type) {
+    switch (type) {
+      case BRT_RECTIMAGE:
+      case BRT_POLYIMAGE:
+        return true;
+      default:
+        return false;
+    }
+  }
+  // Returns true if the region type is line.
+  static bool IsLineType(BlobRegionType type) {
+    switch (type) {
+      case BRT_HLINE:
+      case BRT_VLINE:
+        return true;
+      default:
+        return false;
+    }
+  }
+  // Returns true if the region type cannot be merged (line or image).
+  static bool UnMergeableType(BlobRegionType type) {
+    if (IsLineType(type)) {
+      return true;
+    }
+    return IsImageType(type);
+  }
+  // Helper to call CleanNeighbours on all blobs on the list.
+  static void CleanNeighbours(BLOBNBOX_LIST *blobs);
+  // Helper to delete all the deletable blobs on the list.
+  static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs);
+  // Helper to compute edge offsets for all the blobs on the list.
+  // See coutln.h for an explanation of edge offsets.
+  static void ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs);
+
+#ifndef GRAPHICS_DISABLED
+  // Helper to draw all the blobs on the list in the given body_colour,
+  // with child outlines in the child_colour.
+  static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
+                        ScrollView::Color child_colour, ScrollView *win);
+  // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
+  // given list in the given body_colour, with child outlines in the
+  // child_colour.
+  static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
+                             ScrollView::Color child_colour, ScrollView *win);
+
+  static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type);
+
+  // Keep in sync with BlobRegionType.
+  ScrollView::Color BoxColor() const;
+
+  // Draws the blob.
+  //   window:       window to draw in
+  //   blob_colour:  for outer bits
+  //   child_colour: for holes
+  void plot(ScrollView *window, ScrollView::Color blob_colour, ScrollView::Color child_colour);
+#endif
+
+  // Initializes members set by StrokeWidth and beyond, without discarding
+  // stored area and strokewidth values, which are expensive to calculate.
+  // box must already hold its final value, as the base-char limits and the
+  // baseline estimate are taken from it.
+  void ReInit() {
+    // Flags.
+    joined = false;
+    reduced = false;
+    horz_possible_ = false;
+    vert_possible_ = false;
+    leader_on_left_ = false;
+    leader_on_right_ = false;
+    // Classification.
+    repeated_set_ = 0;
+    left_tab_type_ = TT_NONE;
+    right_tab_type_ = TT_NONE;
+    region_type_ = BRT_UNKNOWN;
+    flow_ = BTFT_NONE;
+    spt_type_ = BSTT_SKIP;
+    // Rule lines.
+    left_rule_ = 0;
+    right_rule_ = 0;
+    left_crossing_rule_ = 0;
+    right_crossing_rule_ = 0;
+    line_crossings_ = 0;
+    // Derive the area-based stroke width only if it is still unset and both
+    // the area and the blob perimeter are usable.
+    const bool stroke_width_unset = area_stroke_width_ == 0.0f;
+    if (stroke_width_unset && area > 0 && cblob() != nullptr && cblob()->perimeter() != 0) {
+      area_stroke_width_ = 2.0f * area / cblob()->perimeter();
+    }
+    // Links and geometry derived from box.
+    owner_ = nullptr;
+    base_char_blob_ = nullptr;
+    base_char_top_ = box.top();
+    base_char_bottom_ = box.bottom();
+    baseline_y_ = box.bottom();
+    ClearNeighbours();
+  }
+
+  // Forgets all neighbours and clears their stroke-match flags.
+  void ClearNeighbours() {
+    for (BLOBNBOX *&slot : neighbours_) {
+      slot = nullptr;
+    }
+    for (bool &good : good_stroke_neighbours_) {
+      good = false;
+    }
+  }
+
+private:
+  // Edgestep blob; deleted by the destructor only if owns_cblob_ is set.
+  C_BLOB *cblob_ptr = nullptr;
+  // Bounding box.
+  TBOX box;
+  // Reduced bounding box; meaningful once `reduced` is true.
+  TBOX red_box;
+  // Enclosed area.
+  int32_t area = 0;
+  // Id of the set of repeated blobs.
+  int32_t repeated_set_ = 0;
+  // Tab-stop assessment of the left side.
+  TabType left_tab_type_ = TT_NONE;
+  // Tab-stop assessment of the right side.
+  TabType right_tab_type_ = TT_NONE;
+  // Type of region this blob belongs to.
+  BlobRegionType region_type_ = BRT_UNKNOWN;
+  // Quality of text flow.
+  BlobTextFlowType flow_ = BTFT_NONE;
+  // Special text type.
+  BlobSpecialTextType spt_type_;
+  // Joined to prev.
+  bool joined = false;
+  // Reduced box set.
+  bool reduced = false;
+  // x-coord of nearest but not crossing rule line, left and right.
+  int16_t left_rule_ = 0;
+  int16_t right_rule_ = 0;
+  // x-coord of nearest or crossing rule line, left and right.
+  int16_t left_crossing_rule_;
+  int16_t right_crossing_rule_;
+  // y-coord of top/bottom of diacritic base, if it exists, else top/bottom of
+  // this blob.
+  int16_t base_char_top_;
+  int16_t base_char_bottom_;
+  // Estimate of baseline position.
+  int16_t baseline_y_;
+  // Number of line intersections touched.
+  int32_t line_crossings_;
+  // The blob that was the base char.
+  BLOBNBOX *base_char_blob_;
+  // Who will delete me when I am not needed.
+  tesseract::ColPartition *owner_;
+  // Neighbour in each BlobNeighbourDir direction.
+  BLOBNBOX *neighbours_[BND_COUNT];
+  // Median horizontal stroke width.
+  float horz_stroke_width_ = 0.0f;
+  // Median vertical stroke width.
+  float vert_stroke_width_ = 0.0f;
+  // Stroke width from area/perimeter ratio.
+  float area_stroke_width_ = 0.0f;
+  // Per-direction flag stored alongside each neighbour by set_neighbour().
+  bool good_stroke_neighbours_[BND_COUNT];
+  // Could be part of horizontal flow.
+  bool horz_possible_;
+  // Could be part of vertical flow.
+  bool vert_possible_;
+  // There is a leader to the left.
+  bool leader_on_left_;
+  // There is a leader to the right.
+  bool leader_on_right_;
+  // Iff true, then the destructor should delete the cblob_ptr.
+  // TODO(rays) migrate all uses to correctly setting this flag instead of
+  // deleting the C_BLOB before deleting the BLOBNBOX.
+  bool owns_cblob_ = false;
+};
+
+// A text row under construction: the blobs assigned to it, its vertical
+// limits, fitted baseline lines and the pitch/spacing values filled in by
+// later stages.
+class TO_ROW : public ELIST2_LINK {
+public:
+  // Weight of the fit error when computing credibility in
+  // set_parallel_line().
+  static const int kErrorWeight = 3;
+
+  // Empty row; clear() sets the defaults.
+  TO_ROW() {
+    clear();
+  }
+  // Row built from its first blob.
+  // NOTE(review): the original parameter notes read "of row" and "target
+  // height"; presumably top/bottom are the row limits and row_size is the
+  // target height -- confirm against the definition.
+  TO_ROW(BLOBNBOX *blob, float top, float bottom, float row_size);
+
+  void print() const;
+  // Vertical limits of the row.
+  float max_y() const { return y_max; }
+  float min_y() const { return y_min; }
+  // Midpoint between the two limits.
+  float mean_y() const {
+    return (y_min + y_max) / 2.0f;
+  }
+  float initial_min_y() const { return initial_y_min; }
+  // Access to the line fit set by set_line().
+  float line_m() const { return m; }
+  float line_c() const { return c; }
+  float line_error() const { return error; }
+  // Access to the constrained fit set by set_parallel_line().
+  float parallel_c() const { return para_c; }
+  float parallel_error() const { return para_error; }
+  // Baseline goodness.
+  float believability() const { return credibility; }
+  // Real parallel_c.
+  float intercept() const { return y_origin; }
+  // Puts blob in the row; parameters as for the constructor.
+  void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size);
+  // Puts blob in the row in order.
+  void insert_blob(BLOBNBOX *blob);
+
+  // The list of blobs in this row.
+  BLOBNBOX_LIST *blob_list() { return &blobs; }
+
+  // Sets the line spec: gradient, intercept and fit error.
+  void set_line(float new_m, float new_c, float new_error) {
+    error = new_error;
+    c = new_c;
+    m = new_m;
+  }
+  // Sets the fixed-gradient line for the given page gradient, and derives
+  // credibility (blob count minus weighted error) and the real intercept.
+  void set_parallel_line(float gradient, float new_c, float new_error) {
+    para_error = new_error;
+    para_c = new_c;
+    // real intercept
+    y_origin = new_c / std::sqrt(1 + gradient * gradient);
+    credibility = blobs.length() - kErrorWeight * new_error;
+  }
+  // Sets the bottom (new_min) and top (new_max) of the row.
+  void set_limits(float new_min, float new_max) {
+    y_max = new_max;
+    y_min = new_min;
+  }
+  // Get projection.
+  void compute_vertical_projection();
+
+  // Repeated-character bookkeeping; -1 means "not searched yet".
+  bool rep_chars_marked() const { return num_repeated_sets_ != -1; }
+  void clear_rep_chars_marked() { num_repeated_sets_ = -1; }
+  int num_repeated_sets() const { return num_repeated_sets_; }
+  void set_num_repeated_sets(int num_sets) { num_repeated_sets_ = num_sets; }
+
+  // True when dead.
+  bool merged = false;
+  // Had no ascenders.
+  bool all_caps;
+  // In guessing pitch.
+  bool used_dm_model;
+  // Start of projection.
+  int16_t projection_left;
+  // NOTE(review): originally also commented "start of projection";
+  // presumably the end of the projection -- confirm.
+  int16_t projection_right;
+  // How strong is decision.
+  PITCH_TYPE pitch_decision;
+  // Pitch or 0.
+  float fixed_pitch;
+  // Space if fixed pitch.
+  float fp_space;
+  // Non-space if fixed pitch.
+  float fp_nonsp;
+  // Space if proportional.
+  float pr_space;
+  // Non-space if proportional.
+  float pr_nonsp;
+  // To "next" row.
+  float spacing;
+  // Of line.
+  float xheight;
+  // Number of blobs of height xheight.
+  int xheight_evidence;
+  // Ascenders.
+  float ascrise;
+  // Descenders.
+  float descdrop;
+  // Of CJK characters. Assumed to be xheight+ascrise for non-CJK text.
+  float body_size;
+  // Min size for real space.
+  int32_t min_space;
+  // Max size of non-space.
+  int32_t max_nonspace;
+  // Space vs nonspace.
+  int32_t space_threshold;
+  // Average non-space.
+  float kern_size;
+  // Average space.
+  float space_size;
+  // Repeated chars.
+  WERD_LIST rep_words;
+  // Fixed pitch cells.
+  ICOORDELT_LIST char_cells;
+  // Curved baseline.
+  QSPLINE baseline;
+  // Vertical projection.
+  STATS projection;
+
+private:
+  // Clear all values to reasonable defaults.
+  void clear();
+
+  // Blobs in row.
+  BLOBNBOX_LIST blobs;
+  // Coords.
+  float y_min;
+  float y_max;
+  float initial_y_min;
+  // Line spec.
+  float m;
+  float c;
+  // Line error.
+  float error;
+  // Constrained fit.
+  float para_c;
+  float para_error;
+  // Rotated para_c.
+  float y_origin;
+  // Baseline believability.
+  float credibility;
+  // Number of sets of repeated blobs; set to -1 if we have not searched for
+  // repeated blobs in this row yet.
+  int num_repeated_sets_;
+};
+
+ELIST2IZEH(TO_ROW)
+// Block-level working data: the blobs of a BLOCK sorted into size classes,
+// a list of temporary rows, and the pitch/spacing estimates for the block.
+class TESS_API TO_BLOCK : public ELIST_LINK {
+public:
+  // Makes an empty block with an undecided pitch and cleared scalars.
+  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
+    clear();
+  }
+  // Makes a working block around the given real block.
+  TO_BLOCK(BLOCK *src_block);
+  ~TO_BLOCK();
+
+  // Clears all scalar members.
+  void clear();
+
+  // Gives access to the list of temporary rows.
+  TO_ROW_LIST *get_rows() {
+    return &row_list;
+  }
+
+  // Rotates every blob in every blob list, then the underlying block, and
+  // finally recomputes the block's median blob size from the blobs list.
+  void rotate(const FCOORD &rotation) {
+    // All blob lists held by this block, in the order they are rotated.
+    BLOBNBOX_LIST *const all_lists[] = {&blobs, &underlines, &noise_blobs, &small_blobs,
+                                        &large_blobs};
+    for (BLOBNBOX_LIST *list : all_lists) {
+      BLOBNBOX_IT it(list);
+      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+        it.data()->rotate(rotation);
+      }
+    }
+    // The underlying block is required to have a polygon before rotating.
+    ASSERT_HOST(block->pdblk.poly_block() != nullptr);
+    block->rotate(rotation);
+    // Collect width/height statistics of the medium blobs, with ranges taken
+    // from the rotated block's bounding box.
+    STATS widths(0, block->pdblk.bounding_box().width());
+    STATS heights(0, block->pdblk.bounding_box().height());
+    BLOBNBOX_IT blob_it(&blobs);
+    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+      const auto &blob_box = blob_it.data()->bounding_box();
+      widths.add(blob_box.width(), 1);
+      heights.add(blob_box.height(), 1);
+    }
+    // Medians are rounded to the nearest integer.
+    const int median_width = static_cast<int>(widths.median() + 0.5);
+    const int median_height = static_cast<int>(heights.median() + 0.5);
+    block->set_median_size(median_width, median_height);
+  }
+
+  // Debug helper: prints the y range, parallel intercept and blob count of
+  // each temporary row.
+  void print_rows() {
+    TO_ROW_IT row_it(&row_list);
+    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+      TO_ROW *row = row_it.data();
+      tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n", row->min_y(), row->max_y(),
+              row->parallel_c(), row->blob_list()->length());
+    }
+  }
+
+  // Redistributes the blobs between the lists using a different definition
+  // of small, medium and large from the original one. Height remains the
+  // primary key, but small-height blobs of medium width become medium, while
+  // small-height blobs that are very wide stay small.
+  void ReSetAndReFilterBlobs();
+
+  // Deletes noise blobs from all lists where not owned by a ColPartition.
+  void DeleteUnownedNoise();
+
+  // Computes and stores the edge offsets on each blob for use in feature
+  // extraction. Greyscale is used if the supplied grey and thresholds pixes
+  // are 8-bit; otherwise (nullptr or not 8 bit) the original binary edge step
+  // outlines are used.
+  // Thresholds must either be the same size as grey or an integer down-scale
+  // of grey.
+  // See coutln.h for an explanation of edge offsets.
+  void ComputeEdgeOffsets(Image thresholds, Image grey);
+
+#ifndef GRAPHICS_DISABLED
+  // Draws the noise blobs from all lists in red.
+  void plot_noise_blobs(ScrollView *to_win);
+  // Draws the blobs on the various lists in the block in different colors.
+  void plot_graded_blobs(ScrollView *to_win);
+#endif
+
+  BLOBNBOX_LIST blobs;       // medium-sized blobs
+  BLOBNBOX_LIST underlines;  // underline blobs
+  BLOBNBOX_LIST noise_blobs; // very small blobs
+  BLOBNBOX_LIST small_blobs; // fairly small blobs
+  BLOBNBOX_LIST large_blobs; // big blobs
+  BLOCK *block;              // the real block
+  PITCH_TYPE pitch_decision; // how strong the pitch decision is
+  float line_spacing;        // estimate
+  // line_size is a lower-bound estimate of the font size in pixels of the
+  // text in the block (including ascenders and descenders). It is a small
+  // (1.25) multiple of the median height of filtered blobs. The font size is
+  // usually bigger, but is closer for allcaps text or a no-x-height script.
+  float line_size;       // estimate
+  float max_blob_size;   // line assignment limit
+  float baseline_offset; // phase shift
+  float xheight;         // median blob size
+  float fixed_pitch;     // pitch, or 0
+  float kern_size;       // average non-space
+  float space_size;      // average space
+  int32_t min_space;     // min definite space
+  int32_t max_nonspace;  // max definite non-space
+  float fp_space;        // space if fixed pitch
+  float fp_nonsp;        // non-space if fixed pitch
+  float pr_space;        // space if proportional
+  float pr_nonsp;        // non-space if proportional
+  TO_ROW *key_row;       // starting row
+
+private:
+  TO_ROW_LIST row_list; // temporary rows
+};
+
+ELISTIZEH(TO_BLOCK)
+extern double_VAR_H(textord_error_weight, 3, "Weighting for error in believability");
+void find_cblob_limits( // get y limits of a blob between two x limits
+    C_BLOB *blob,       // blob to search
+    float leftx,        // x limits: left
+    float rightx, // x limits: right
+    FCOORD rotation, // for landscape
+    float &ymin,     // output y limits: min
+    float &ymax); // output y limits: max
+void find_cblob_vlimits( // get y limits of a blob between two x limits
+    C_BLOB *blob,        // blob to search
+    float leftx,         // x limits: left
+    float rightx, // x limits: right
+    float &ymin, // output y limits: min
+    float &ymax); // output y limits: max
+void find_cblob_hlimits( // get x limits of a blob between two y limits
+    C_BLOB *blob,        // blob to search
+    float bottomy,       // y limits: bottom
+    float topy, // y limits: top
+    float &xmin, // output x limits: min
+    float &xymax); // output x limits: max (name is presumably a typo for xmax)
+C_BLOB *crotate_cblob( // rotate it
+    C_BLOB *blob,      // blob to rotate
+    FCOORD rotation    // for landscape
+);
+TBOX box_next(      // get bounding box
+    BLOBNBOX_IT *it // iterator to blobs
+);
+TBOX box_next_pre_chopped( // get bounding box
+    BLOBNBOX_IT *it        // iterator to blobs
+);
+void vertical_cblob_projection( // project outlines
+    C_BLOB *blob,               // blob to project
+    STATS *stats                // output
+);
+void vertical_coutline_projection( // project outlines
+    C_OUTLINE *outline,            // outline to project
+    STATS *stats                   // output
+);
+#ifndef GRAPHICS_DISABLED
+void plot_blob_list(ScrollView *win,                 // window to draw in
+                    BLOBNBOX_LIST *list,             // blob list
+                    ScrollView::Color body_colour,   // colour to draw
+                    ScrollView::Color child_colour); // colour of child
+#endif                                               // !GRAPHICS_DISABLED
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.h
@ -0,0 +1,476 @@
+/******************************************************************************
+ *
+ * File:        blobs.h
+ * Description: Blob definition
+ * Author:      Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1989, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#ifndef BLOBS_H
+#define BLOBS_H
+
+#include "clst.h"       // for CLIST_ITERATOR, CLISTIZEH
+#include "normalis.h"   // for DENORM
+#include "points.h"     // for FCOORD, ICOORD
+#include "rect.h"       // for TBOX
+#include "scrollview.h" // for ScrollView, ScrollView::Color
+
+#include <tesseract/publictypes.h> // for OcrEngineMode
+
+#include <cstdint> // for int16_t
+
+struct Pix;
+
+namespace tesseract {
+
+class BLOCK;
+class C_BLOB;
+class C_OUTLINE;
+class LLSQ;
+class ROW;
+class WERD;
+
+/*----------------------------------------------------------------------
+              T y p e s
+----------------------------------------------------------------------*/
+
+// A 2-D point with 16-bit integer coordinates. Also used as a vector.
+struct TPOINT {
+  TPOINT() : x(0), y(0) {}
+  TPOINT(int16_t vx, int16_t vy) : x(vx), y(vy) {}
+  // Implicit conversion from an ICOORD.
+  TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}
+
+  // Component-wise addition in place.
+  void operator+=(const TPOINT &other) {
+    y += other.y;
+    x += other.x;
+  }
+  // Component-wise integer division in place.
+  void operator/=(int divisor) {
+    y /= divisor;
+    x /= divisor;
+  }
+  // Two points are equal when both coordinates match.
+  bool operator==(const TPOINT &other) const {
+    return !(x != other.x || y != other.y);
+  }
+  // Returns true when the two line segments cross each other.
+  // (Moved from outlines.cpp).
+  static bool IsCrossed(const TPOINT &a0, const TPOINT &a1, const TPOINT &b0, const TPOINT &b1);
+
+  // Sets *this to the difference p1 - p2.
+  void diff(const TPOINT &p1, const TPOINT &p2) {
+    y = p1.y - p2.y;
+    x = p1.x - p2.x;
+  }
+
+  // Returns the (z component of the) cross product *this x other.
+  int cross(const TPOINT &other) const {
+    const int x_by_other_y = x * other.y;
+    const int y_by_other_x = y * other.x;
+    return x_by_other_y - y_by_other_x;
+  }
+
+  // Returns the scalar (dot) product of *this and other.
+  int dot(const TPOINT &other) const {
+    const int x_part = x * other.x;
+    const int y_part = y * other.y;
+    return x_part + y_part;
+  }
+
+  // Returns the SQUARED length of the vector (x*x + y*y); no square root is
+  // taken.
+  int length() const {
+    return dot(*this);
+  }
+
+  int16_t x; // absolute x coord.
+  int16_t y; // absolute y coord.
+};
+
+using VECTOR = TPOINT; // structure for coordinates.
+
+// One point of an outline, linked to its neighbours through next/prev.
+struct EDGEPT {
+  EDGEPT() = default;
+  // Copies the data of src; the new point is not linked to any neighbours.
+  EDGEPT(const EDGEPT &src) : next(nullptr), prev(nullptr) {
+    CopyFrom(src);
+  }
+  // Copies the data of src; the links of *this are kept as they are.
+  EDGEPT &operator=(const EDGEPT &src) {
+    CopyFrom(src);
+    return *this;
+  }
+  // Copies every data element of src, leaving next and prev untouched.
+  void CopyFrom(const EDGEPT &src) {
+    // Geometry.
+    pos = src.pos;
+    vec = src.vec;
+    // Flags.
+    is_hidden = src.is_hidden;
+    runlength = src.runlength;
+    dir = src.dir;
+    fixed = src.fixed;
+    // Link back to the source outline.
+    src_outline = src.src_outline;
+    start_step = src.start_step;
+    step_count = src.step_count;
+  }
+  // Returns the squared distance to other, with the squared x-component
+  // multiplied by x_factor.
+  int WeightedDistance(const EDGEPT &other, int x_factor) const {
+    const int dx = pos.x - other.pos.x;
+    const int dy = pos.y - other.pos.y;
+    return dx * dx * x_factor + dy * dy;
+  }
+  // Returns true if both points are at the same position.
+  bool EqualPos(const EDGEPT &other) const {
+    return pos == other.pos;
+  }
+  // Returns the bounding box of the outline segment from *this to *end
+  // (inclusive), stopping early if the walk gets back to *this.
+  // Ignores hidden edge flags.
+  TBOX SegmentBox(const EDGEPT *end) const {
+    TBOX segment_box(pos.x, pos.y, pos.x, pos.y);
+    const EDGEPT *cursor = this;
+    do {
+      cursor = cursor->next;
+      const TPOINT &p = cursor->pos;
+      // Grow the box horizontally.
+      if (p.x < segment_box.left()) {
+        segment_box.set_left(p.x);
+      }
+      if (p.x > segment_box.right()) {
+        segment_box.set_right(p.x);
+      }
+      // Grow the box vertically.
+      if (p.y < segment_box.bottom()) {
+        segment_box.set_bottom(p.y);
+      }
+      if (p.y > segment_box.top()) {
+        segment_box.set_top(p.y);
+      }
+    } while (cursor != end && cursor != this);
+    return segment_box;
+  }
+  // Returns the area measure of the outline segment from *this to *end: the
+  // sum, over the points after *this and before *end, of the cross product of
+  // the point's offset from *this with the point's vec.
+  // Ignores hidden edge flags.
+  int SegmentArea(const EDGEPT *end) const {
+    int total = 0;
+    const EDGEPT *cursor = this->next;
+    do {
+      const TPOINT offset(cursor->pos.x - pos.x, cursor->pos.y - pos.y);
+      total += offset.cross(cursor->vec);
+      cursor = cursor->next;
+    } while (cursor != end && cursor != this);
+    return total;
+  }
+  // Returns true if *end is reached within min_points steps of *this, and
+  // false if the walk gets back to *this, or exceeds min_points steps, first.
+  // Ignores hidden edge flags.
+  bool ShortNonCircularSegment(int min_points, const EDGEPT *end) const {
+    int steps = 0;
+    const EDGEPT *cursor = this;
+    do {
+      if (cursor == end) {
+        return true;
+      }
+      cursor = cursor->next;
+      ++steps;
+    } while (cursor != this && steps <= min_points);
+    return false;
+  }
+
+  // Accessors to hide or reveal a cut edge from feature extractors.
+  void Hide() {
+    is_hidden = true;
+  }
+  void Reveal() {
+    is_hidden = false;
+  }
+  bool IsHidden() const {
+    return is_hidden;
+  }
+  // Marks this point as a chop point (dir becomes non-zero).
+  void MarkChop() {
+    dir = 1;
+  }
+  // Returns true if this point has been marked as a chop point.
+  bool IsChopPt() const {
+    return dir != 0;
+  }
+
+  TPOINT pos; // position
+  VECTOR vec; // vector to next point
+  bool is_hidden = false;
+  uint8_t runlength = 0;
+  int8_t dir = 0;
+  int8_t fixed = 0;
+  EDGEPT *next = nullptr;           // anticlockwise element
+  EDGEPT *prev = nullptr;           // clockwise element
+  C_OUTLINE *src_outline = nullptr; // Outline it came from.
+  // The following fields are not used if src_outline is nullptr.
+  int start_step = 0; // Location of pos in src_outline.
+  int step_count = 0; // Number of steps used (may wrap around).
+};
+
+// For use in chop and findseam to keep a list of which EDGEPTs were inserted.
+CLISTIZEH(EDGEPT)
+
+// One outline of a blob: a circular list of EDGEPTs (loop) plus the corners
+// of its bounding box. Outlines of the same blob are chained through next.
+// The destructor releases owned data through Clear().
+struct TESSLINE {
+  TESSLINE() : is_hole(false), loop(nullptr), next(nullptr) {}
+  // Copy-constructs through CopyFrom. is_hole is seeded from src so it is
+  // never left indeterminate, whatever CopyFrom chooses to copy.
+  TESSLINE(const TESSLINE &src) : is_hole(src.is_hole), loop(nullptr), next(nullptr) {
+    CopyFrom(src);
+  }
+  ~TESSLINE() {
+    Clear();
+  }
+  // Copy-assigns through CopyFrom.
+  TESSLINE &operator=(const TESSLINE &src) {
+    // Guard against self-assignment. NOTE(review): CopyFrom is defined
+    // elsewhere; if it releases the existing loop before copying (as the
+    // owning destructor suggests), assigning an outline to itself would
+    // otherwise destroy the data being copied.
+    if (this != &src) {
+      CopyFrom(src);
+    }
+    return *this;
+  }
+  // Consume the circular list of EDGEPTs to make a TESSLINE.
+  static TESSLINE *BuildFromOutlineList(EDGEPT *outline);
+  // Copies the data and the outline, but leaves next untouched.
+  void CopyFrom(const TESSLINE &src);
+  // Deletes owned data.
+  void Clear();
+  // Normalize in-place using the DENORM.
+  void Normalize(const DENORM &denorm);
+  // Rotates by the given rotation in place.
+  void Rotate(const FCOORD rotation);
+  // Moves by the given vec in place.
+  void Move(const ICOORD vec);
+  // Scales by the given factor in place.
+  void Scale(float factor);
+  // Sets up the start and vec members of the loop from the pos members.
+  void SetupFromPos();
+  // Recomputes the bounding box from the points in the loop.
+  void ComputeBoundingBox();
+  // Computes the min and max cross product of the outline points with the
+  // given vec and returns the results in min_xp and max_xp. Geometrically
+  // this is the left and right edge of the outline perpendicular to the
+  // given direction, but to get the distance units correct, you would
+  // have to divide by the modulus of vec.
+  void MinMaxCrossProduct(const TPOINT vec, int *min_xp, int *max_xp) const;
+
+  TBOX bounding_box() const;
+  // Returns true if *this and other have equal bounding boxes.
+  bool SameBox(const TESSLINE &other) const {
+    return topleft == other.topleft && botright == other.botright;
+  }
+  // Returns true if the given line segment crosses this outline. The edges
+  // are only tested when both end points of the segment lie inside the
+  // outline's bounding box; otherwise the result is false.
+  bool SegmentCrosses(const TPOINT &pt1, const TPOINT &pt2) const {
+    if (Contains(pt1) && Contains(pt2)) {
+      // Walk once around the circular list, testing every edge pt -> next.
+      EDGEPT *pt = loop;
+      do {
+        if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) {
+          return true;
+        }
+        pt = pt->next;
+      } while (pt != loop);
+    }
+    return false;
+  }
+  // Returns true if the point is contained within the outline box
+  // (boundaries included).
+  bool Contains(const TPOINT &pt) const {
+    return topleft.x <= pt.x && pt.x <= botright.x && botright.y <= pt.y && pt.y <= topleft.y;
+  }
+
+#ifndef GRAPHICS_DISABLED
+  void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);
+#endif // !GRAPHICS_DISABLED
+
+  // Returns the first outline point that has a different src_outline to its
+  // predecessor, or, if all the same, the lowest indexed point.
+  EDGEPT *FindBestStartPt() const;
+
+  // Returns the area of the bounding box.
+  int BBArea() const {
+    return (botright.x - topleft.x) * (topleft.y - botright.y);
+  }
+
+  TPOINT topleft;  // Top left of loop.
+  TPOINT botright; // Bottom right of loop.
+  TPOINT start;    // Start of loop.
+  bool is_hole;    // True if this is a hole/child outline.
+  EDGEPT *loop;    // Edgeloop.
+  TESSLINE *next;  // Next outline in blob.
+};                 // Outline structure.
+
+// A blob: a singly-linked list of TESSLINE outlines plus the DENORM recording
+// the transformations applied to it. The destructor releases owned data
+// through Clear().
+struct TBLOB {
+  TBLOB() : outlines(nullptr) {}
+  TBLOB(const TBLOB &src) : outlines(nullptr) {
+    CopyFrom(src);
+  }
+  ~TBLOB() {
+    Clear();
+  }
+  // Copy-assigns through CopyFrom.
+  TBLOB &operator=(const TBLOB &src) {
+    // Guard against self-assignment. NOTE(review): CopyFrom is defined
+    // elsewhere; if it releases the existing outlines before copying (as the
+    // owning destructor suggests), assigning a blob to itself would otherwise
+    // destroy the data being copied.
+    if (this != &src) {
+      CopyFrom(src);
+    }
+    return *this;
+  }
+  // Factory to build a TBLOB from a C_BLOB with polygonal approximation along
+  // the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB
+  // contain pointers to the input C_OUTLINEs that enable higher-resolution
+  // feature extraction that does not use the polygonal approximation.
+  static TBLOB *PolygonalCopy(bool allow_detailed_fx, C_BLOB *src);
+  // Factory builds a blob with no outlines, but copies the other member data.
+  static TBLOB *ShallowCopy(const TBLOB &src);
+  // Normalizes the blob for classification only if needed.
+  // (Normally this means a non-zero classify rotation.)
+  // If no Normalization is needed, then nullptr is returned, and the input blob
+  // can be used directly. Otherwise a new TBLOB is returned which must be
+  // deleted after use.
+  TBLOB *ClassifyNormalizeIfNeeded() const;
+
+  // Copies the data and the outlines.
+  void CopyFrom(const TBLOB &src);
+  // Deletes owned data.
+  void Clear();
+  // Sets up the built-in DENORM and normalizes the blob in-place.
+  // For parameters see DENORM::SetupNormalization, plus the inverse flag for
+  // this blob and the Pix for the full image.
+  void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor,
+                 float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift,
+                 float final_yshift, bool inverse, Image pix);
+  // Rotates by the given rotation in place.
+  void Rotate(const FCOORD rotation);
+  // Moves by the given vec in place.
+  void Move(const ICOORD vec);
+  // Scales by the given factor in place.
+  void Scale(float factor);
+  // Recomputes the bounding boxes of the outlines.
+  void ComputeBoundingBoxes();
+
+  // Returns the number of outlines.
+  int NumOutlines() const;
+
+  TBOX bounding_box() const;
+
+  // Returns true if the given line segment crosses any outline of this blob.
+  bool SegmentCrossesOutline(const TPOINT &pt1, const TPOINT &pt2) const {
+    for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
+      if (outline->SegmentCrosses(pt1, pt2)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Returns true if the point is contained within any of the outline boxes.
+  bool Contains(const TPOINT &pt) const {
+    for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
+      if (outline->Contains(pt)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Finds and deletes any duplicate outlines in this blob, without deleting
+  // their EDGEPTs.
+  void EliminateDuplicateOutlines();
+
+  // Swaps the outlines of *this and next if needed to keep the centers in
+  // increasing x.
+  void CorrectBlobOrder(TBLOB *next);
+
+  // Returns the DENORM describing the transformations applied so far.
+  const DENORM &denorm() const {
+    return denorm_;
+  }
+
+#ifndef GRAPHICS_DISABLED
+  void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);
+#endif // !GRAPHICS_DISABLED
+
+  // Returns the sum of the bounding-box areas of all the outlines. Boxes
+  // that overlap are counted once per outline.
+  int BBArea() const {
+    int total_area = 0;
+    for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
+      total_area += outline->BBArea();
+    }
+    return total_area;
+  }
+
+  // Computes the center of mass and second moments for the old baseline and
+  // 2nd moment normalizations. Returns the outline length.
+  // The input denorm should be the normalizations that have been applied from
+  // the image to the current state of this TBLOB.
+  int ComputeMoments(FCOORD *center, FCOORD *second_moments) const;
+  // Computes the precise bounding box of the coords that are generated by
+  // GetEdgeCoords. This may be different from the bounding box of the polygon.
+  void GetPreciseBoundingBox(TBOX *precise_box) const;
+  // Adds edges to the given vectors.
+  // For all the edge steps in all the outlines, or polygonal approximation
+  // where there are no edge steps, collects the steps into x_coords/y_coords.
+  // x_coords is a collection of the x-coords of vertical edges for each
+  // y-coord starting at box.bottom().
+  // y_coords is a collection of the y-coords of horizontal edges for each
+  // x-coord starting at box.left().
+  // Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.
+  // Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.
+  void GetEdgeCoords(const TBOX &box, std::vector<std::vector<int>> &x_coords,
+                     std::vector<std::vector<int>> &y_coords) const;
+
+  TESSLINE *outlines; // List of outlines in blob.
+
+private: // TODO(rays) Someday the data members will be private too.
+  // For all the edge steps in all the outlines, or polygonal approximation
+  // where there are no edge steps, collects the steps into the bounding_box,
+  // llsq and/or the x_coords/y_coords. Both are used in different kinds of
+  // normalization.
+  // For a description of x_coords, y_coords, see GetEdgeCoords above.
+  void CollectEdges(const TBOX &box, TBOX *bounding_box, LLSQ *llsq,
+                    std::vector<std::vector<int>> *x_coords,
+                    std::vector<std::vector<int>> *y_coords) const;
+
+private:
+  // DENORM indicating the transformations that this blob has undergone so far.
+  DENORM denorm_;
+}; // Blob structure.
+
+// A word held as a vector of TBLOB pointers. The destructor releases owned
+// data through Clear().
+struct TWERD {
+  TWERD() : latin_script(false) {}
+  // Copy-constructs through CopyFrom. latin_script is seeded from src so it
+  // is never left indeterminate, whatever CopyFrom chooses to copy.
+  TWERD(const TWERD &src) : latin_script(src.latin_script) {
+    CopyFrom(src);
+  }
+  ~TWERD() {
+    Clear();
+  }
+  // Copy-assigns through CopyFrom.
+  TWERD &operator=(const TWERD &src) {
+    // Guard against self-assignment. NOTE(review): CopyFrom is defined
+    // elsewhere; if it releases the existing blobs before copying (as the
+    // owning destructor suggests), assigning a word to itself would otherwise
+    // destroy the data being copied.
+    if (this != &src) {
+      CopyFrom(src);
+    }
+    return *this;
+  }
+  // Factory to build a TWERD from a (C_BLOB) WERD, with polygonal
+  // approximation along the way.
+  static TWERD *PolygonalCopy(bool allow_detailed_fx, WERD *src);
+  // Baseline normalizes the blobs in-place, recording the normalization in the
+  // DENORMs in the blobs.
+  void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height,
+                   float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint,
+                   const TBOX *norm_box, DENORM *word_denorm);
+  // Copies the data and the blobs.
+  void CopyFrom(const TWERD &src);
+  // Deletes owned data.
+  void Clear();
+  // Recomputes the bounding boxes of the blobs.
+  void ComputeBoundingBoxes();
+
+  // Returns the number of blobs in the word.
+  int NumBlobs() const {
+    // The container size is unsigned; make the conversion to int explicit.
+    return static_cast<int>(blobs.size());
+  }
+  TBOX bounding_box() const;
+
+  // Merges the blobs from start to end, not including end, and deletes
+  // the blobs between start and end.
+  void MergeBlobs(int start, int end);
+
+#ifndef GRAPHICS_DISABLED
+  void plot(ScrollView *window);
+#endif // !GRAPHICS_DISABLED
+
+  std::vector<TBLOB *> blobs; // Blobs in word.
+  bool latin_script;          // This word is in a latin-based script.
+};
+
+/*----------------------------------------------------------------------
+              F u n c t i o n s
+----------------------------------------------------------------------*/
+// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.
+bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location);
+
+void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, const TPOINT &location);
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.cpp
@ -0,0 +1,74 @@
+/**********************************************************************
+ * File:        blread.cpp  (Formerly pdread.c)
+ * Description: Friend function of BLOCK to read the uscan pd file.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "blread.h"
+
+#include "ocrblock.h"  // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
+#include "scanutils.h" // for tfscanf
+
+#include <cstdio> // for fclose, fopen, FILE
+
+namespace tesseract {
+
+#define UNLV_EXT ".uzn" // unlv zone file
+
+/**********************************************************************
+ * read_unlv_file
+ *
+ * Read a whole unlv zone file to make a list of blocks.
+ **********************************************************************/
+
+// Reads the UNLV zone file for the given basename and appends one
+// rectangular BLOCK per zone to the output list. The zone coordinates in the
+// file are top-down; ysize is used to flip them. Note that name is modified
+// in place: the ".uzn" extension is appended to it. xsize is not used.
+// Returns false if the file cannot be opened, true otherwise.
+bool read_unlv_file(   // read unlv zone file
+    std::string &name, // basename of file
+    int32_t xsize,     // image size
+    int32_t ysize,     // image size
+    BLOCK_LIST *blocks // output list
+) {
+  BLOCK_IT block_it = blocks; // block iterator
+
+  name += UNLV_EXT; // add extension
+  FILE *zone_file = fopen(name.c_str(), "rb");
+  if (zone_file == nullptr) {
+    return false; // didn't read one
+  }
+
+  // Current zone in top-down coords.
+  int x;
+  int y;
+  int width;
+  int height;
+  // Each record is four integers followed by a token that is skipped.
+  while (tfscanf(zone_file, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
+    // Flip the y coordinates while converting to the block rectangle.
+    const auto left = static_cast<int16_t>(x);
+    const auto bottom = static_cast<int16_t>(ysize - y - height);
+    const auto right = static_cast<int16_t>(x + width);
+    const auto top = static_cast<int16_t>(ysize - y);
+    // make rect block and put it on the end of the list
+    block_it.add_to_end(new BLOCK(name.c_str(), true, 0, 0, left, bottom, right, top));
+  }
+  fclose(zone_file);
+
+  tprintf("UZN file %s loaded.\n", name.c_str());
+  return true;
+}
+
+// Appends to blocks a single unnamed BLOCK whose rectangle runs from (0, 0)
+// to (width, height).
+void FullPageBlock(int width, int height, BLOCK_LIST *blocks) {
+  BLOCK_IT page_it(blocks);
+  page_it.add_to_end(new BLOCK("", true, 0, 0, 0, 0, width, height));
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.h
@ -0,0 +1,40 @@
+/**********************************************************************
+ * File:        blread.h  (Formerly pdread.h)
+ * Description: Friend function of BLOCK to read the uscan pd file.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef BLREAD_H
+#define BLREAD_H
+
+#include <cstdint> // for int32_t
+#include <string>  // for std::string
+
+namespace tesseract {
+
+class BLOCK_LIST;
+
+bool read_unlv_file(   // read unlv zone file
+    std::string &name, // basename of file
+    int32_t xsize,     // image size
+    int32_t ysize,     // image size
+    BLOCK_LIST *blocks // output list
+);
+
+void FullPageBlock(int width, int height, BLOCK_LIST *blocks);
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.cpp
@ -0,0 +1,282 @@
+/**********************************************************************
+ * File:        boxread.cpp
+ * Description: Read data from a box file.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 2007, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "boxread.h"
+
+#include "errcode.h" // for ERRCODE, TESSEXIT
+#include "fileerr.h" // for CANTOPENFILE
+#include "rect.h"    // for TBOX
+#include "tprintf.h" // for tprintf
+
+#include <tesseract/unichar.h> // for UNICHAR
+#include "helpers.h"           // for chomp_string
+
+#include <climits> // for INT_MAX
+#include <cstring> // for strchr, strcmp
+#include <fstream> // for std::ifstream
+#include <locale>  // for std::locale::classic
+#include <sstream> // for std::stringstream
+#include <string>  // for std::string
+
+namespace tesseract {
+
+// Label that marks a box line as describing a whole word/line (several
+// blobs) rather than a single character.
+static const char *const kMultiBlobLabelCode = "WordStr";
+
+// Derives the box file name for image_filename: a trailing ".bin.png" or
+// ".nrm.png" is removed as a whole (only when something precedes it),
+// otherwise everything from the last '.' onwards is removed, and ".box" is
+// appended to what remains.
+static std::string BoxFileName(const char *image_filename) {
+  const size_t kSuffixLen = 8; // strlen(".bin.png") == strlen(".nrm.png")
+  std::string stem(image_filename);
+  const size_t len = stem.size();
+  const bool has_known_suffix =
+      len > kSuffixLen && (stem.compare(len - kSuffixLen, kSuffixLen, ".bin.png") == 0 ||
+                           stem.compare(len - kSuffixLen, kSuffixLen, ".nrm.png") == 0);
+  if (has_known_suffix) {
+    stem.erase(len - kSuffixLen);
+  } else {
+    const size_t dot = stem.rfind('.');
+    if (dot != std::string::npos) {
+      stem.erase(dot);
+    }
+  }
+  return stem + ".box";
+}
+
+// Opens, for binary reading, the box file that corresponds to the image
+// file name fname. A failed open is reported through CANTOPENFILE with the
+// TESSEXIT action; whatever fopen returned is handed back to the caller.
+FILE *OpenBoxFile(const char *fname) {
+  const std::string box_path = BoxFileName(fname);
+  FILE *handle = fopen(box_path.c_str(), "rb");
+  if (handle == nullptr) {
+    CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s", box_path.c_str());
+  }
+  return handle;
+}
+
+// Loads the whole box file that belongs to the image `filename` into memory
+// and hands it to ReadMemBoxes (with continue_on_failure enabled).
+// target_page >= 0 selects a single page, any other value reads all pages;
+// skip_blanks drops blank boxes. texts receives the UTF-8 labels, box_texts
+// the full box definitions and pages the page numbers; each output vector
+// may be nullptr.
+// Returns false if the file yields no data or no boxes.
+bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
+                  std::vector<std::string> *texts, std::vector<std::string> *box_texts,
+                  std::vector<int> *pages) {
+  const std::string box_path = BoxFileName(filename);
+  std::ifstream input(box_path.c_str(), std::ios::in | std::ios::binary);
+  std::vector<char> box_data;
+  box_data.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
+  if (box_data.empty()) {
+    return false;
+  }
+  // The parser expects a C string, so terminate the raw bytes.
+  box_data.push_back('\0');
+  const bool continue_on_failure = true;
+  return ReadMemBoxes(target_page, skip_blanks, box_data.data(), continue_on_failure, boxes, texts,
+                      box_texts, pages);
+}
+
+// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
+// box_data must be a NUL-terminated string with one box definition per
+// line. A line that fails to parse is skipped when continue_on_failure is
+// true; otherwise it makes the function return false immediately (entries
+// already appended to the output vectors are left in place).
+// Each string appended to box_texts is rebuilt with MakeBoxFileStr from the
+// page number parsed on that line, so it is also correct when target_page
+// is negative (all pages).
+// Returns true if at least one box was accepted.
+bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
+                  std::vector<TBOX> *boxes, std::vector<std::string> *texts,
+                  std::vector<std::string> *box_texts, std::vector<int> *pages) {
+  std::string box_str(box_data);
+  std::vector<std::string> lines = split(box_str, '\n');
+  if (lines.empty()) {
+    return false;
+  }
+  int num_boxes = 0;
+  for (auto &line : lines) {
+    int page = 0;
+    std::string utf8_str;
+    TBOX box;
+    if (!ParseBoxFileStr(line.c_str(), &page, utf8_str, &box)) {
+      if (continue_on_failure) {
+        continue;
+      }
+      return false;
+    }
+    if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) {
+      continue;
+    }
+    if (target_page >= 0 && page != target_page) {
+      continue;
+    }
+    if (boxes != nullptr) {
+      boxes->push_back(box);
+    }
+    if (texts != nullptr) {
+      texts->push_back(utf8_str);
+    }
+    if (box_texts != nullptr) {
+      std::string full_text;
+      // Use the page parsed from this line: target_page is -1 when all
+      // pages are requested and would otherwise be written into the text.
+      MakeBoxFileStr(utf8_str.c_str(), box, page, full_text);
+      box_texts->push_back(full_text);
+    }
+    if (pages != nullptr) {
+      pages->push_back(page);
+    }
+    ++num_boxes;
+  }
+  return num_boxes > 0;
+}
+
+// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
+// Box files are used ONLY DURING TRAINING, but by both processes of
+// creating tr files with tesseract, and unicharset_extractor.
+// ReadNextBox is the shared line interpreter for box files, so applybox and
+// unicharset_extractor read them the same way.
+// Fetches the next valid box: utf8_str receives the unichar string and
+// bounding_box the box, and true is returned. At end of file the function
+// closes box_file and returns false.
+// The UTF-8 byte order mark (EF BB BF) is ignored, the label is checked for
+// valid UTF-8, and fields may be separated by space or tab.
+// This overload accepts boxes from every page of the file.
+bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) {
+  const int any_page = -1;
+  return ReadNextBox(any_page, line_number, box_file, utf8_str, bounding_box);
+}
+
+// As ReadNextBox above, but restricted to one (0-based) page.
+// target_page == -1 (any negative value) accepts every page. Files without
+// page numbers are treated as if every box were on page 0.
+// *line_number is incremented for every line consumed. At end of file
+// box_file is closed and false is returned.
+bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
+                 TBOX *bounding_box) {
+  char line[kBoxReadBufSize]; // boxfile read buffer
+  int page = 0;
+
+  while (fgets(line, sizeof(line) - 1, box_file) != nullptr) {
+    ++(*line_number);
+
+    const char *cursor = line;
+    const auto *bytes = reinterpret_cast<const unsigned char *>(line);
+    const bool has_bom = bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf;
+    if (has_bom) {
+      cursor += 3; // Skip unicode file designation.
+    }
+    switch (*cursor) {
+      case '\n': // Blank line.
+      case '\0':
+      case ' ': // Blank box.
+      case '\t':
+        continue;
+      default:
+        break;
+    }
+    if (!ParseBoxFileStr(cursor, &page, utf8_str, bounding_box)) {
+      tprintf("Box file format error on line %i; ignored\n", *line_number);
+      continue;
+    }
+    if (target_page < 0 || target_page == page) {
+      return true; // Successfully read a box on an acceptable page.
+    }
+  }
+  fclose(box_file);
+  return false; // EOF
+}
+
+// Parses the given box file string into a page_number, utf8_str, and
+// bounding_box. Returns true on a successful parse.
+// The box file is assumed to contain box definitions, one per line, of the
+// following format for blob-level boxes:
+//   <UTF8 str> <left> <bottom> <right> <top> <page id>
+// and for word/line-level boxes:
+//   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+// See applybox.cpp for more information.
+// On entry bounding_box is reset to an empty TBOX and utf8_str to "".
+// Returns false (after printing a diagnostic where applicable) when the
+// string is empty, when the parsed coordinates have right < left or
+// top < bottom, or when the label is not valid UTF-8. *page_number is set
+// to 0 before the numeric fields are read.
+bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
+                     TBOX *bounding_box) {
+  *bounding_box = TBOX(); // Initialize it to empty.
+  utf8_str = "";
+  char uch[kBoxReadBufSize];
+  const char *buffptr = boxfile_str;
+  // Read the unichar without messing up on Tibetan.
+  // According to issue 253 the utf-8 surrogates 85 and A0 are treated
+  // as whitespace by sscanf, so it is more reliable to just find
+  // ascii space and tab.
+  int uch_len = 0;
+  // Skip unicode file designation, if present.
+  const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);
+  if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
+    buffptr += 3;
+  }
+  // Allow a single blank as the UTF-8 string. Check for empty string and
+  // then blindly eat the first character.
+  if (*buffptr == '\0') {
+    return false;
+  }
+  do {
+    uch[uch_len++] = *buffptr++;
+  } while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
+           uch_len < kBoxReadBufSize - 1);
+  uch[uch_len] = '\0';
+  if (*buffptr != '\0') {
+    ++buffptr; // Step over the separator that ended the label.
+  }
+  int x_min = INT_MAX;
+  int y_min = INT_MAX;
+  int x_max = INT_MIN;
+  int y_max = INT_MIN;
+  *page_number = 0;
+  // The classic locale keeps number parsing independent of the user locale.
+  std::stringstream stream(buffptr);
+  stream.imbue(std::locale::classic());
+  stream >> x_min;
+  stream >> y_min;
+  stream >> x_max;
+  stream >> y_max;
+  stream >> *page_number;
+  if (x_max < x_min || y_max < y_min) {
+    // boxfile_str is the same address as ubuf, passed with the type %s expects.
+    tprintf("Bad box coordinates in boxfile string! %s\n", boxfile_str);
+    return false;
+  }
+  // Test for long space-delimited string label.
+  if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != nullptr) {
+    strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
+    uch[kBoxReadBufSize - 1] = '\0'; // Prevent buffer overrun.
+    chomp_string(uch);
+    uch_len = strlen(uch);
+  }
+  // Validate UTF8 by making unichars with it.
+  int used = 0;
+  while (used < uch_len) {
+    tesseract::UNICHAR ch(uch + used, uch_len - used);
+    int new_used = ch.utf8_len();
+    if (new_used == 0) {
+      // Go through unsigned char so bytes >= 0x80 print as two hex digits
+      // instead of a sign-extended value.
+      tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used,
+              static_cast<unsigned char>(uch[used]), used + 1);
+      return false;
+    }
+    used += new_used;
+  }
+  utf8_str = uch;
+  // x_min <= x_max and y_min <= y_max are guaranteed by the check above.
+  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
+  return true; // Successfully read a box.
+}
+
+// Builds one box file line in box_str: the unichar string followed by the
+// box's left, bottom, right and top and the page number, each preceded by a
+// single space. Any previous content of box_str is replaced.
+void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str) {
+  auto append_field = [&box_str](const std::string &digits) {
+    box_str += " ";
+    box_str += digits;
+  };
+  box_str = unichar_str;
+  append_field(std::to_string(box.left()));
+  append_field(std::to_string(box.bottom()));
+  append_field(std::to_string(box.right()));
+  append_field(std::to_string(box.top()));
+  append_field(std::to_string(page_num));
+}
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.h
@ -0,0 +1,89 @@
+/**********************************************************************
+ * File:        boxread.h
+ * Description: Read data from a box file.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 2007, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCUTIL_BOXREAD_H_
+#define TESSERACT_CCUTIL_BOXREAD_H_
+
+#include <cstdio> // for FILE
+#include <string> // for std::string
+#include <vector> // for std::vector
+
+#include <tesseract/export.h> // for TESS_API
+
+namespace tesseract {
+
+class TBOX;
+
+// Size in bytes of the buffer used to read a line from a box file. The box
+// label buffer in ParseBoxFileStr has the same size.
+const int kBoxReadBufSize = 1024;
+
+// Open the boxfile based on the given image filename.
+// If the box file cannot be opened, the failure is reported through
+// CANTOPENFILE with the TESSEXIT action; nullptr only reaches the caller if
+// that error call returns.
+// NOTE(review): TESSEXIT presumably terminates the process -- confirm in
+// errcode.h before relying on a nullptr check.
+TESS_API
+FILE *OpenBoxFile(const char *filename);
+
+// Reads all boxes from the box file that corresponds to the given image
+// filename.
+// Reads a specific target_page number if >= 0, or all pages otherwise.
+// Skips blanks if skip_blanks is true.
+// The UTF-8 label of the box is put in texts, and the full box definition as
+// a string is put in box_texts, with the corresponding page number in pages.
+// Each of the output vectors is optional (may be nullptr).
+// Lines that fail to parse are skipped.
+// Returns false if the file yields no data or no boxes are found.
+bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
+                  std::vector<std::string> *texts, std::vector<std::string> *box_texts,
+                  std::vector<int> *pages);
+
+// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
+// box_data is a NUL-terminated string with one box definition per line.
+// continue_on_failure allows reading to continue even if an invalid box is
+// encountered and will return true if it succeeds in reading some boxes.
+// It otherwise gives up and returns false on encountering an invalid box;
+// entries appended to the output vectors before that point are not removed.
+TESS_API
+bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
+                  std::vector<TBOX> *boxes, std::vector<std::string> *texts,
+                  std::vector<std::string> *box_texts, std::vector<int> *pages);
+
+// ReadNextBox factors out the code to interpret a line of a box
+// file so that applybox and unicharset_extractor interpret the same way.
+// This function returns the next valid box file utf8 string and coords
+// and returns true, or false on eof (and closes the file).
+// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
+// for valid utf-8 and allows space or tab between fields.
+// utf8_str is set with the unichar string, and bounding box with the box.
+// If there are page numbers in the file, it reads them all.
+// *line_number is incremented once per line consumed from box_file.
+TESS_API
+bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);
+// As ReadNextBox above, but get a specific page number. (0-based)
+// Use -1 to read any page number. Files without page number all
+// read as if they are page 0.
+// Boxes on other pages are read and discarded until a match or eof.
+TESS_API
+bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
+                 TBOX *bounding_box);
+
+// Parses the given box file string into a page_number, utf8_str, and
+// bounding_box. Returns true on a successful parse.
+// Expected layout of boxfile_str:
+//   <UTF8 str> <left> <bottom> <right> <top> <page id>
+// Returns false for an empty string, for coordinates with right < left or
+// top < bottom, and for a label that is not valid UTF-8.
+TESS_API
+bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
+                     TBOX *bounding_box);
+
+// Creates a box file string from a unichar string, TBOX and page number.
+// box_str is overwritten with
+//   <unichar_str> <left> <bottom> <right> <top> <page_num>
+TESS_API
+void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str);
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCUTIL_BOXREAD_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.cpp
@ -0,0 +1,205 @@
+///////////////////////////////////////////////////////////////////////
+// File:        boxword.cpp
+// Description: Class to represent the bounding boxes of the output.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "boxword.h"
+#include "blobs.h"
+#include "host.h" // for NearlyEqual
+#include "normalis.h"
+#include "ocrblock.h"
+#include "pageres.h"
+
+namespace tesseract {
+
+// Maximum distance between an output box edge and the matching input blob
+// edge for the output edge to be snapped onto the blob edge. Beyond this
+// the blob may have been chopped, so only the word bounding box is used.
+constexpr int kBoxClipTolerance = 2;
+
+// Constructs an empty word: no boxes and a default-constructed bounding box.
+BoxWord::BoxWord() : length_(0) {}
+
+// Copy constructor: all members are filled in by CopyFrom.
+BoxWord::BoxWord(const BoxWord &src) {
+  CopyFrom(src);
+}
+
+// Copy assignment: replaces the contents of this word with those of src.
+// NOTE(review): there is no self-assignment check here, so `w = w` is only
+// as safe as CopyFrom makes it.
+BoxWord &BoxWord::operator=(const BoxWord &src) {
+  CopyFrom(src);
+  return *this;
+}
+
+// Makes this word a copy of src: the overall bounding box, the box count
+// and the first src.length_ per-blob boxes.
+// Copying an object onto itself is a no-op. Without the guard, clearing
+// boxes_ would also empty src.boxes_ before it is read.
+void BoxWord::CopyFrom(const BoxWord &src) {
+  if (this == &src) {
+    return;
+  }
+  bbox_ = src.bbox_;
+  length_ = src.length_;
+  boxes_.assign(src.boxes_.begin(), src.boxes_.begin() + length_);
+}
+
+// Factory that creates a BoxWord with one box per blob of tessword. Each
+// box is the union of the edge points of the blob's outlines after they
+// have been passed through the blob's DENORM (DenormTransform), i.e. mapped
+// back out of normalized space. An edge point is used unless both it and
+// its predecessor are hidden.
+// The result is allocated with new and returned as a raw pointer.
+BoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) {
+  auto *result = new BoxWord();
+  // One box per blob.
+  result->length_ = tessword->NumBlobs();
+  result->boxes_.reserve(result->length_);
+
+  for (int blob_idx = 0; blob_idx < result->length_; ++blob_idx) {
+    TBLOB *blob = tessword->blobs[blob_idx];
+    TBOX accumulated;
+    TESSLINE *outline = blob->outlines;
+    while (outline != nullptr) {
+      // The edge points form a closed loop; walk it exactly once.
+      EDGEPT *pt = outline->loop;
+      do {
+        const bool contributes = !pt->IsHidden() || !pt->prev->IsHidden();
+        if (contributes) {
+          ICOORD pos(pt->pos.x, pt->pos.y);
+          TPOINT denormed;
+          blob->denorm().DenormTransform(nullptr, pt->pos, &denormed);
+          pos.set_x(denormed.x);
+          pos.set_y(denormed.y);
+          TBOX point_box(pos, pos);
+          accumulated += point_box;
+        }
+        pt = pt->next;
+      } while (pt != outline->loop);
+      outline = outline->next;
+    }
+    result->boxes_.push_back(accumulated);
+  }
+  result->ComputeBoundingBox();
+  return result;
+}
+
+// Tidies the boxes that came from the polygonal approximation. Each box is
+// grown by one pixel on every side; every edge that lies within
+// kBoxClipTolerance of the union of the majorly overlapping blobs of
+// original_word is snapped to that union; the result is finally intersected
+// with the bounding box of original_word. If block is not null, its
+// re_rotation() is applied to the original boxes before they are compared.
+// The overall bounding box is recomputed at the end.
+void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {
+  for (int idx = 0; idx < length_; ++idx) {
+    const TBOX &approx = boxes_[idx];
+    // The poly approximation error is 1 pixel, so pad by one on every side.
+    TBOX grown(approx.left() - 1, approx.bottom() - 1, approx.right() + 1, approx.top() + 1);
+    // Collect the union of the original blobs that match this box.
+    TBOX matched;
+    C_BLOB_IT blob_it(original_word->cblob_list());
+    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+      TBOX candidate = blob_it.data()->bounding_box();
+      if (block != nullptr) {
+        candidate.rotate(block->re_rotation());
+      }
+      if (candidate.major_overlap(grown)) {
+        matched += candidate;
+      }
+    }
+    if (!matched.null_box()) {
+      // Snap each edge independently when it is close enough.
+      if (NearlyEqual<int>(matched.left(), grown.left(), kBoxClipTolerance)) {
+        grown.set_left(matched.left());
+      }
+      if (NearlyEqual<int>(matched.right(), grown.right(), kBoxClipTolerance)) {
+        grown.set_right(matched.right());
+      }
+      if (NearlyEqual<int>(matched.top(), grown.top(), kBoxClipTolerance)) {
+        grown.set_top(matched.top());
+      }
+      if (NearlyEqual<int>(matched.bottom(), grown.bottom(), kBoxClipTolerance)) {
+        grown.set_bottom(matched.bottom());
+      }
+    }
+    // Never let a blob box extend beyond the word it belongs to.
+    TBOX word_box = original_word->bounding_box();
+    if (block != nullptr) {
+      word_box.rotate(block->re_rotation());
+    }
+    boxes_[idx] = grown.intersection(word_box);
+  }
+  ComputeBoundingBox();
+}
+
+// Folds the boxes in [start, end) into the box at start and removes the
+// boxes in (start, end). Both indices are first clipped to [0, length_];
+// nothing happens unless the clipped range holds at least two boxes.
+// The overall bounding box is left as it was.
+void BoxWord::MergeBoxes(int start, int end) {
+  start = ClipToRange(start, 0, length_);
+  end = ClipToRange(end, 0, length_);
+  if (end <= start + 1) {
+    return;
+  }
+  TBOX &target = boxes_[start];
+  for (int src = start + 1; src < end; ++src) {
+    target += boxes_[src];
+  }
+  // Drop the absorbed boxes; everything from end onwards slides down.
+  boxes_.erase(boxes_.begin() + start + 1, boxes_.begin() + end);
+  length_ -= end - 1 - start;
+  boxes_.resize(length_);
+}
+
+// Inserts box in front of the box at index, or appends it when index is not
+// below the current length. Afterwards the length is refreshed from the
+// vector and the overall bounding box is recomputed.
+void BoxWord::InsertBox(int index, const TBOX &box) {
+  const auto position = (index < length_) ? boxes_.begin() + index : boxes_.end();
+  boxes_.insert(position, box);
+  length_ = boxes_.size();
+  ComputeBoundingBox();
+}
+
+// Changes the box at the given index to the new box.
+// Recomputes the bounding box.
+// index must satisfy 0 <= index < length(); this is enforced with
+// ASSERT_HOST, as in DeleteBox, instead of writing outside the vector.
+void BoxWord::ChangeBox(int index, const TBOX &box) {
+  ASSERT_HOST(0 <= index && index < length_);
+  boxes_[index] = box;
+  ComputeBoundingBox();
+}
+
+// Removes the box at index (which must be in [0, length_), checked with
+// ASSERT_HOST), closing the gap, and recomputes the overall bounding box.
+void BoxWord::DeleteBox(int index) {
+  ASSERT_HOST(0 <= index && index < length_);
+  const auto victim = boxes_.begin() + index;
+  boxes_.erase(victim);
+  length_ -= 1;
+  ComputeBoundingBox();
+}
+
+// Empties the word: drops every box, resets the count to zero and resets
+// the overall bounding box to a default-constructed TBOX.
+void BoxWord::DeleteAllBoxes() {
+  boxes_.clear();
+  bbox_ = TBOX();
+  length_ = 0;
+}
+
+// Rebuilds bbox_ as the union of the first length_ boxes. With no boxes it
+// becomes a default-constructed TBOX.
+void BoxWord::ComputeBoundingBox() {
+  TBOX merged = TBOX();
+  int idx = 0;
+  while (idx < length_) {
+    merged += boxes_[idx];
+    ++idx;
+  }
+  bbox_ = merged;
+}
+
+// This word and other are expected to describe the same blobs. For every
+// index that exists in both, cb(index) is invoked when the bounding box of
+// other's blob equals the box stored here for that index.
+// cb is received by value and only called, never stored.
+void BoxWord::ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const {
+  int idx = 0;
+  while (idx < length_ && idx < other.NumBlobs()) {
+    TBOX other_box = other.blobs[idx]->bounding_box();
+    const bool same_box = other_box == boxes_[idx];
+    if (same_box) {
+      cb(idx);
+    }
+    ++idx;
+  }
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.h
@ -0,0 +1,97 @@
+///////////////////////////////////////////////////////////////////////
+// File:        boxword.h
+// Description: Class to represent the bounding boxes of the output.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CSTRUCT_BOXWORD_H_
+#define TESSERACT_CSTRUCT_BOXWORD_H_
+
+#include "rect.h" // for TBOX
+
+#include <functional> // for std::function
+
+namespace tesseract {
+
+class BLOCK;
+class WERD;
+struct TWERD;
+
+// Class to hold an array of bounding boxes for an output word and
+// the bounding box of the whole word.
+class BoxWord {
+public:
+  // Creates an empty word (no boxes).
+  BoxWord();
+  // Copies src via CopyFrom. Declared explicit, so copy-initialization
+  // (`BoxWord b = a;`) is not available.
+  explicit BoxWord(const BoxWord &src);
+  ~BoxWord() = default;
+
+  // Replaces the contents of this word with a copy of src.
+  BoxWord &operator=(const BoxWord &src);
+
+  // Copies the bounding box, the length and the boxes of src into this.
+  void CopyFrom(const BoxWord &src);
+
+  // Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
+  // switch back to original image coordinates.
+  // The returned object is allocated with new.
+  static BoxWord *CopyFromNormalized(TWERD *tessword);
+
+  // Clean up the bounding boxes from the polygonal approximation by
+  // expanding slightly, then clipping to the blobs from the original_word
+  // that overlap. If not null, the block provides the inverse rotation.
+  void ClipToOriginalWord(const BLOCK *block, WERD *original_word);
+
+  // Merges the boxes from start to end, not including end, and deletes
+  // the boxes between start and end.
+  void MergeBoxes(int start, int end);
+
+  // Inserts a new box before the given index.
+  // Recomputes the bounding box.
+  void InsertBox(int index, const TBOX &box);
+
+  // Changes the box at the given index to the new box.
+  // Recomputes the bounding box.
+  void ChangeBox(int index, const TBOX &box);
+
+  // Deletes the box with the given index, and shuffles up the rest.
+  // Recomputes the bounding box.
+  void DeleteBox(int index);
+
+  // Deletes all the boxes stored in BoxWord.
+  void DeleteAllBoxes();
+
+  // This and other putatively are the same, so call the callback
+  // for each blob index where the bounding boxes match.
+  // The callback is passed by value as a std::function.
+  void ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const;
+
+  // Bounding box of the whole word.
+  const TBOX &bounding_box() const {
+    return bbox_;
+  }
+  // Number of per-blob boxes.
+  int length() const {
+    return length_;
+  }
+  // Box of the blob at index; the index is not range-checked.
+  const TBOX &BlobBox(int index) const {
+    return boxes_[index];
+  }
+
+private:
+  // Recomputes bbox_ as the union of the boxes.
+  void ComputeBoundingBox();
+
+  TBOX bbox_;               // Bounding box of the whole word.
+  int length_;              // Number of boxes in use.
+  std::vector<TBOX> boxes_; // One box per blob.
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CSTRUCT_BOXWORD_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.cpp
@ -0,0 +1,36 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ccstruct.cpp
+// Description: ccstruct class.
+// Author:      Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "ccstruct.h"
+
+namespace tesseract {
+
+// APPROXIMATIONS of the fractions of the character cell taken by
+// the descenders, ascenders, and x-height.
+// The three fractions add up to 1.0.
+const double CCStruct::kDescenderFraction = 0.25;
+const double CCStruct::kXHeightFraction = 0.5;
+const double CCStruct::kAscenderFraction = 0.25;
+// x-height as a fraction of cap-height: 0.5 / (0.5 + 0.25) = 2/3.
+const double CCStruct::kXHeightCapRatio =
+    CCStruct::kXHeightFraction / (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction);
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+// The body itself is the compiler-generated default.
+CCStruct::~CCStruct() = default;
+
+} // namespace tesseract
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.h
@ -0,0 +1,41 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ccstruct.h
+// Description: ccstruct class.
+// Author:      Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H_
+#define TESSERACT_CCSTRUCT_CCSTRUCT_H_
+
+#include "ccutil.h" // for CCUtil
+
+namespace tesseract {
+// Extends CCUtil with globally accessible constants that approximate how a
+// character cell is divided between descender, x-height and ascender.
+class TESS_API CCStruct : public CCUtil {
+public:
+  CCStruct() = default;
+  // Defined out of line in ccstruct.cpp.
+  ~CCStruct() override;
+
+  // Globally accessible constants.
+  // APPROXIMATIONS of the fractions of the character cell taken by
+  // the descenders, ascenders, and x-height.
+  static const double kDescenderFraction; // = 0.25;
+  static const double kXHeightFraction;   // = 0.5;
+  static const double kAscenderFraction;  // = 0.25;
+  // Derived value giving the x-height as a fraction of cap-height.
+  static const double kXHeightCapRatio; // = XHeight/(XHeight + Ascender).
+};
+} // namespace tesseract
+
+#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.cpp
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.h
@ -0,0 +1,297 @@
+/**********************************************************************
+ * File:        coutln.h
+ * Description: Code for the C_OUTLINE class.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef COUTLN_H
+#define COUTLN_H
+
+#include "elst.h"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
+#include "mod128.h"     // for DIR128, DIRBITS
+#include "points.h"     // for ICOORD, FCOORD
+#include "rect.h"       // for TBOX
+#include "scrollview.h" // for ScrollView, ScrollView::Color
+
+#include <tesseract/export.h> // for DLLSYM
+
+#include <cstdint> // for int16_t, int32_t
+#include <bitset>  // for std::bitset<16>
+
+struct Pix;
+
+namespace tesseract {
+
+class CRACKEDGE;
+class DENORM;
+
+// Sentinel value (INT16_MAX) meaning "no winding number".
+#define INTERSECTING INT16_MAX // no winding number
+
+// Mask selecting the two low bits that hold one packed step (chain code 0-3).
+#define STEP_MASK 3
+
+// Bit positions within the flag set of a C_OUTLINE, for use with
+// C_OUTLINE::flag() and C_OUTLINE::set_flag().
+enum C_OUTLINE_FLAGS {
+  COUT_INVERSE // White on black blob
+};
+
+// Simple struct to hold the 3 values needed to compute a more precise edge
+// position and direction. The offset_numerator is the difference between the
+// grey threshold and the mean pixel value. pixel_diff is the difference between
+// the pixels in the edge. Consider the following row of pixels: p1 p2 p3 p4 p5
+// Say the image was thresholded at threshold t, making p1, p2, p3 black
+// and p4, p5 white (p1, p2, p3 < t, and p4, p5 >= t), but suppose that
+// max(p[i+1] - p[i]) is p3 - p2. Then the extrapolated position of the edge,
+// based on the maximum gradient, is at the crack between p2 and p3 plus the
+// offset (t - (p2+p3)/2)/(p3 - p2). We store the pixel difference p3-p2
+// denominator in pixel_diff and the offset numerator, relative to the original
+// binary edge (t - (p2+p3)/2) - (p3 -p2) in offset_numerator.
+// The sign of offset_numerator and pixel_diff are manipulated to ensure
+// that the pixel_diff, which will be used as a weight, is always positive.
+// The direction stores the quantized feature direction for the given step
+// computed from the edge gradient. (Using binary_angle_plus_pi.)
+// If the pixel_diff is zero, it means that the direction of the gradient
+// is in conflict with the step direction, so this step is to be ignored.
+struct EdgeOffset {
+  // Signed numerator of the sub-pixel offset; divided by pixel_diff on use.
+  int8_t offset_numerator;
+  // Denominator of the offset, also used as the edge strength (weight).
+  // Zero marks a step that is to be ignored.
+  uint8_t pixel_diff;
+  // Quantized feature direction of the step.
+  uint8_t direction;
+};
+
+class C_OUTLINE; // forward declaration
+
+ELISTIZEH(C_OUTLINE)
+// An outline stored as a start position plus a sequence of steps around a
+// loop. Each step is a 2-bit chain code and four steps are packed into every
+// byte of the steps array. An outline can own a list of child outlines and,
+// optionally, an array of EdgeOffset giving sub-pixel edge information for
+// each step.
+class C_OUTLINE : public ELIST_LINK {
+public:
+  // Creates an empty outline with no steps and no edge offsets.
+  C_OUTLINE() {
+    stepcount = 0;
+    offsets = nullptr;
+  }
+  C_OUTLINE(              // constructor
+      CRACKEDGE *startpt, // from edge detector
+      ICOORD bot_left,    // bounding box //length of loop
+      ICOORD top_right, int16_t length);
+  C_OUTLINE(ICOORD startpt,                       // start of loop
+            DIR128 *new_steps,                    // steps in loop
+            int16_t length);                      // length of loop
+                                                  // outline to copy
+  C_OUTLINE(C_OUTLINE *srcline, FCOORD rotation); // and rotate
+
+  // Build a fake outline, given just a bounding box and append to the list.
+  static void FakeOutline(const TBOX &box, C_OUTLINE_LIST *outlines);
+
+  // Frees the edge offsets array (a no-op when it is null).
+  ~C_OUTLINE() { // destructor
+    delete[] offsets;
+  }
+
+  // Returns the state of the flag bit selected by mask.
+  bool flag(                        // test flag
+      C_OUTLINE_FLAGS mask) const { // flag to test
+    return flags[mask];
+  }
+  // Sets the flag bit selected by mask to value.
+  void set_flag(            // set flag value
+      C_OUTLINE_FLAGS mask, // flag to set
+      bool value) {         // value to set
+    flags.set(mask, value);
+  }
+
+  // Returns a pointer to the list of child outlines owned by this outline.
+  C_OUTLINE_LIST *child() { // get child list
+    return &children;
+  }
+
+  // Returns the stored bounding box.
+  const TBOX &bounding_box() const {
+    return box;
+  }
+  // Stores the low two bits of stepdir as the chain code of step stepindex.
+  // Four steps share one byte: step k lives in bits (k % 4) * 2 and
+  // (k % 4) * 2 + 1 of steps[k / 4]; the other bits of the byte are kept.
+  void set_step(         // set a step
+      int16_t stepindex, // index of step
+      int8_t stepdir) {  // chain code
+    int shift = stepindex % 4 * 2;
+    uint8_t mask = 3 << shift;
+    steps[stepindex / 4] = ((stepdir << shift) & mask) | (steps[stepindex / 4] & ~mask);
+    // squeeze 4 into byte
+  }
+  // Stores a step given as a DIR128. The direction is reduced to a chain code
+  // by shifting it right by DIRBITS - 2 before it is packed.
+  void set_step(         // set a step
+      int16_t stepindex, // index of step
+      DIR128 stepdir) {  // direction
+    // clean it
+    int8_t chaindir = stepdir.get_dir() >> (DIRBITS - 2);
+    // difference
+    set_step(stepindex, chaindir);
+    // squeeze 4 into byte
+  }
+
+  // Returns the number of steps in the outline.
+  int32_t pathlength() const { // get path length
+    return stepcount;
+  }
+  // Return step at a given index as a DIR128.
+  // The packed 2-bit chain code is shifted left by DIRBITS - 2, the inverse
+  // of the reduction done by set_step(int16_t, DIR128).
+  DIR128 step_dir(int index) const {
+    return DIR128(
+        static_cast<int16_t>(((steps[index / 4] >> (index % 4 * 2)) & STEP_MASK) << (DIRBITS - 2)));
+  }
+  // Return the step vector for the given outline position.
+  ICOORD step(int index) const { // index of step
+    return step_coords[chain_code(index)];
+  }
+  // get start position
+  const ICOORD &start_pos() const {
+    return start;
+  }
+  // Returns the position at the given index on the outline.
+  // NOT to be used lightly, as it has to iterate the outline to find out.
+  ICOORD position_at_index(int index) const {
+    ICOORD pos = start;
+    for (int i = 0; i < index; ++i) {
+      pos += step(i);
+    }
+    return pos;
+  }
+  // Returns the sub-pixel accurate position given the integer position pos
+  // at the given index on the outline. pos may be a return value of
+  // position_at_index, or computed by repeatedly adding step to the
+  // start_pos() in the usual way.
+  // The result starts at the mid-point of the step. If edge offsets are
+  // present and the step has a non-zero pixel_diff, the offset
+  // (offset_numerator / pixel_diff) is added to y when the step moves in x,
+  // and to x otherwise.
+  FCOORD sub_pixel_pos_at_index(const ICOORD &pos, int index) const {
+    const ICOORD &step_to_next(step(index));
+    FCOORD f_pos(pos.x() + step_to_next.x() / 2.0f, pos.y() + step_to_next.y() / 2.0f);
+    if (offsets != nullptr && offsets[index].pixel_diff > 0) {
+      float offset = offsets[index].offset_numerator;
+      offset /= offsets[index].pixel_diff;
+      if (step_to_next.x() != 0) {
+        f_pos.set_y(f_pos.y() + offset);
+      } else {
+        f_pos.set_x(f_pos.x() + offset);
+      }
+    }
+    return f_pos;
+  }
+  // Returns the step direction for the given index or -1 if there is none.
+  // There is none when no edge offsets are stored or the step's pixel_diff
+  // is zero.
+  int direction_at_index(int index) const {
+    if (offsets != nullptr && offsets[index].pixel_diff > 0) {
+      return offsets[index].direction;
+    }
+    return -1;
+  }
+  // Returns the edge strength for the given index.
+  // If there are no recorded edge strengths, returns 1 (assuming the image
+  // is binary). Returns 0 if the gradient direction conflicts with the
+  // step direction, indicating that this position could be skipped.
+  int edge_strength_at_index(int index) const {
+    if (offsets != nullptr) {
+      return offsets[index].pixel_diff;
+    }
+    return 1;
+  }
+  // Return the step as a chain code (0-3) related to the standard feature
+  // direction of binary_angle_plus_pi by:
+  // chain_code * 64 = feature direction.
+  int chain_code(int index) const { // index of step
+    return (steps[index / 4] >> (index % 4 * 2)) & STEP_MASK;
+  }
+
+  int32_t area() const;       // Returns area of self and 1st level children.
+  int32_t perimeter() const;  // Total perimeter of self and 1st level children.
+  int32_t outer_area() const; // Returns area of self only.
+  int32_t count_transitions(  // count maxima
+      int32_t threshold);     // size threshold
+
+  bool operator<( // containment test
+      const C_OUTLINE &other) const;
+  // Containment test with the operands swapped: a > b is evaluated as b < a.
+  // Takes a const reference so that it also accepts const outlines; it only
+  // forwards to the const operator<.
+  bool operator>( // containment test
+      const C_OUTLINE &other) const {
+    return other < *this; // use the < to do it
+  }
+  int16_t winding_number(   // get winding number
+      ICOORD testpt) const; // around this point
+                            // get direction
+  int16_t turn_direction() const;
+  void reverse(); // reverse direction
+
+  void move(             // reposition outline
+      const ICOORD vec); // by vector
+
+  // Returns true if *this and its children are legally nested.
+  // The outer area of a child should have the opposite sign to the
+  // parent. If not, it means we have discarded an outline in between
+  // (probably due to excessive length).
+  bool IsLegallyNested() const;
+
+  // If this outline is smaller than the given min_size, delete this and
+  // remove from its list, via *it, after checking that *it points to this.
+  // Otherwise, if any children of this are too small, delete them.
+  // On entry, *it must be an iterator pointing to this. If this gets deleted
+  // then this is extracted from *it, so an iteration can continue.
+  void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it);
+
+  // Adds sub-pixel resolution EdgeOffsets for the outline if the supplied
+  // pix is 8-bit. Does nothing otherwise.
+  void ComputeEdgeOffsets(int threshold, Image pix);
+  // Adds sub-pixel resolution EdgeOffsets for the outline using only
+  // a binary image source.
+  void ComputeBinaryOffsets();
+
+  // Renders the outline to the given pix, with left and top being
+  // the coords of the upper-left corner of the pix.
+  void render(int left, int top, Image pix) const;
+
+  // Renders just the outline to the given pix (no fill), with left and top
+  // being the coords of the upper-left corner of the pix.
+  void render_outline(int left, int top, Image pix) const;
+
+#ifndef GRAPHICS_DISABLED
+  void plot(                           // draw one
+      ScrollView *window,              // window to draw in
+      ScrollView::Color colour) const; // colour to draw it
+  // Draws the outline in the given colour, normalized using the given denorm,
+  // making use of sub-pixel accurate information if available.
+  void plot_normed(const DENORM &denorm, ScrollView::Color colour, ScrollView *window) const;
+#endif // !GRAPHICS_DISABLED
+
+  C_OUTLINE &operator=(const C_OUTLINE &source);
+
+  // Returns a newly allocated outline that is default-constructed and then
+  // assigned from *src with operator=. The caller receives the raw pointer.
+  static C_OUTLINE *deep_copy(const C_OUTLINE *src) {
+    auto *outline = new C_OUTLINE;
+    *outline = *src;
+    return outline;
+  }
+
+  static ICOORD chain_step(int chaindir);
+
+  // The maximum length of any outline. The stepcount is stored as 16 bits,
+  // but it is probably not a good idea to increase this constant by much
+  // and switch to 32 bits, as it plays an important role in keeping huge
+  // outlines invisible, which prevents bad speed behavior.
+  static const int kMaxOutlineLength = 16000;
+
+private:
+  // Helper for ComputeBinaryOffsets. Increments pos, dir_counts, pos_totals
+  // by the step, increment, and vertical step ? x : y position * increment
+  // at step s Mod stepcount respectively. Used to add or subtract the
+  // direction and position to/from accumulators of a small neighbourhood.
+  void increment_step(int s, int increment, ICOORD *pos, int *dir_counts, int *pos_totals) const;
+  // Number of bytes needed to hold stepcount packed steps (4 per byte,
+  // rounded up).
+  int step_mem() const {
+    return (stepcount + 3) / 4;
+  }
+
+  TBOX box;                // bounding box
+  ICOORD start;            // start coord
+  int16_t stepcount;       // no of steps
+  std::bitset<16> flags;   // flags about outline
+  std::vector<uint8_t> steps; // step array
+  EdgeOffset *offsets;     // Higher precision edge.
+  C_OUTLINE_LIST children; // child elements
+  static ICOORD step_coords[4];
+};
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/crakedge.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/crakedge.h
@ -0,0 +1,42 @@
+/**********************************************************************
+ * File:        crakedge.h      (Formerly: crkedge.h)
+ * Description: Structures for the Crack following edge detector.
+ * Author:      Ray Smith
+ * Created:     Fri Mar 22 16:06:38 GMT 1991
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef CRAKEDGE_H
+#define CRAKEDGE_H
+
+#include "mod128.h"
+#include "points.h"
+
+namespace tesseract {
+
+// One node of the doubly linked chain used by the crack-following edge
+// detector: a crack position, the step taken there and links to the
+// neighbouring nodes.
+// The constructor is defaulted, so default-initializing a CRACKEDGE does not
+// set the integer and pointer members; they must be assigned before use.
+class CRACKEDGE {
+public:
+  CRACKEDGE() = default;
+
+  ICOORD pos;   /*position of crack */
+  int8_t stepx; // edge step
+  int8_t stepy;
+  int8_t stepdir;  // chaincode
+  CRACKEDGE *prev; /*previous point */
+  CRACKEDGE *next; /*next point */
+};
+
+} // namespace tesseract
+
+#endif
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/debugpixa.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/debugpixa.h
@ -0,0 +1,58 @@
+#ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_
+#define TESSERACT_CCSTRUCT_DEBUGPIXA_H_
+
+#include "image.h"
+
+#include <allheaders.h>
+
+namespace tesseract {
+
+// Class to hold a Pixa collection of debug images with captions and save them
+// to a PDF file.
+// The object owns its Pixa and caption font handles and destroys them in the
+// destructor, so it is deliberately not copyable.
+class DebugPixa {
+public:
+  // TODO(rays) add another constructor with size control.
+  // Creates an empty image collection and, unless
+  // TESSERACT_DISABLE_DEBUG_FONTS is defined, the font used for captions.
+  DebugPixa() {
+    pixa_ = pixaCreate(0);
+#ifdef TESSERACT_DISABLE_DEBUG_FONTS
+    fonts_ = nullptr;
+#else
+    fonts_ = bmfCreate(nullptr, 14);
+#endif
+  }
+  // Copying would leave two objects destroying the same pixa_ and fonts_.
+  DebugPixa(const DebugPixa &) = delete;
+  DebugPixa &operator=(const DebugPixa &) = delete;
+  // Destroys the image collection and the caption font. Nothing is written
+  // to disk here; call WritePDF beforehand to save the images.
+  ~DebugPixa() {
+    pixaDestroy(&pixa_);
+    bmfDestroy(&fonts_);
+  }
+
+  // Adds a captioned copy of the given pix to the collection. The caption is
+  // attached with L_ADD_BELOW, and the text colour value is chosen from the
+  // image depth: 1 below 8 bits, 0x80 at exactly 8 bits, 0x00ff0000 above.
+  // The captioned image is handed to the collection with L_INSERT.
+  void AddPix(const Image pix, const char *caption) {
+    int depth = pixGetDepth(pix);
+    int color = depth < 8 ? 1 : (depth > 8 ? 0x00ff0000 : 0x80);
+    Image pix_debug =
+        pixAddSingleTextblock(pix, fonts_, caption, color, L_ADD_BELOW, nullptr);
+    pixaAddPix(pixa_, pix_debug, L_INSERT);
+  }
+
+  // If any images have been collected, converts them to a PDF written to
+  // filename and then empties the collection. Does nothing when the
+  // collection is empty.
+  void WritePDF(const char *filename) {
+    if (pixaGetCount(pixa_) > 0) {
+      pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, "AllDebugImages", filename);
+      pixaClear(pixa_);
+    }
+  }
+
+private:
+  // The collection of images to put in the PDF.
+  Pixa *pixa_;
+  // The fonts used to draw text captions.
+  L_Bmf *fonts_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.cpp
@ -0,0 +1,302 @@
+///////////////////////////////////////////////////////////////////////
+// File:        detlinefit.cpp
+// Description: Deterministic least median squares line fitting.
+// Author:      Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "detlinefit.h"
+#include "helpers.h"    // for IntCastRounded
+#include "statistc.h"
+#include "tprintf.h"
+
+#include <algorithm>
+#include <cfloat> // for FLT_MAX
+
+namespace tesseract {
+
+// The number of points to consider at each end.
+const int kNumEndPoints = 3;
+// The minimum number of points at which to switch to number of points
+// for badly fitted lines.
+// To ensure a sensible error metric, kMinPointsForErrorCount should be at
+// least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
+// ComputeUpperQuartileError.
+const int kMinPointsForErrorCount = 16;
+// The maximum real distance to use before switching to number of
+// mis-fitted points, which will get square-rooted for true distance.
+// Written as an integer literal because the constant itself is an int.
+const int kMaxRealDistance = 2;
+
+// Constructs a fitter with a zero squared line length. The point and
+// distance containers are default-constructed.
+DetLineFit::DetLineFit() : square_length_(0.0) {}
+
+// Forgets every point previously Added, together with any distances that
+// were computed from them.
+void DetLineFit::Clear() {
+  distances_.clear();
+  pts_.clear();
+}
+
+// Appends pt to the point sequence with a halfwidth of zero. The point is
+// copied, so the caller's object does not need to stay in scope.
+void DetLineFit::Add(const ICOORD &pt) {
+  pts_.push_back(PointWidth(pt, 0));
+}
+// Appends pt together with its halfwidth. When a point overlaps the previous
+// point by more than half the width and lies further from the line than the
+// previous point, the more distant point is left out of the distance
+// calculation. Useful for ignoring i dots and other diacritics.
+void DetLineFit::Add(const ICOORD &pt, int halfwidth) {
+  pts_.push_back(PointWidth(pt, halfwidth));
+}
+
+// Fits a line to the points, ignoring the skip_first initial points and the
+// skip_last final points, returning the fitted line as a pair of points,
+// and the upper quartile error.
+// Candidate lines join one of up to kNumEndPoints points taken from the
+// start with one of up to kNumEndPoints points taken from the end; the pair
+// with the smallest EvaluateLineFit() value wins.
+// Return value: 0.0 when there are fewer than 3 points, otherwise the square
+// root of the best metric when it is positive, or the metric itself when it
+// is not. If every candidate start equals every candidate end, no line is
+// evaluated: *pt1 and *pt2 are left untouched and -1.0 is returned.
+double DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) {
+  // Do something sensible with no points.
+  if (pts_.empty()) {
+    pt1->set_x(0);
+    pt1->set_y(0);
+    *pt2 = *pt1;
+    return 0.0;
+  }
+  // Count the points and find the first and last kNumEndPoints.
+  int pt_count = pts_.size();
+  ICOORD *starts[kNumEndPoints];
+  // Clamp so that at least one start candidate remains.
+  if (skip_first >= pt_count) {
+    skip_first = pt_count - 1;
+  }
+  int start_count = 0;
+  int end_i = std::min(skip_first + kNumEndPoints, pt_count);
+  for (int i = skip_first; i < end_i; ++i) {
+    starts[start_count++] = &pts_[i].pt;
+  }
+  ICOORD *ends[kNumEndPoints];
+  // Clamp so that at least one end candidate remains.
+  if (skip_last >= pt_count) {
+    skip_last = pt_count - 1;
+  }
+  int end_count = 0;
+  // End candidates are collected walking backwards from the last unskipped
+  // point.
+  end_i = std::max(0, pt_count - kNumEndPoints - skip_last);
+  for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
+    ends[end_count++] = &pts_[i].pt;
+  }
+  // 1 or 2 points need special treatment.
+  if (pt_count <= 2) {
+    *pt1 = *starts[0];
+    if (pt_count > 1) {
+      *pt2 = *ends[0];
+    } else {
+      *pt2 = *pt1;
+    }
+    return 0.0;
+  }
+  // Although with between 2 and 2*kNumEndPoints-1 points, there will be
+  // overlap in the starts, ends sets, this is OK and taken care of by the
+  // if (*start != *end) test below, which also tests for equal input points.
+  // A negative best_uq means that no candidate line has been evaluated yet.
+  double best_uq = -1.0;
+  // Iterate each pair of points and find the best fitting line.
+  for (int i = 0; i < start_count; ++i) {
+    ICOORD *start = starts[i];
+    for (int j = 0; j < end_count; ++j) {
+      ICOORD *end = ends[j];
+      if (*start != *end) {
+        ComputeDistances(*start, *end);
+        // Compute the upper quartile error from the line.
+        double dist = EvaluateLineFit();
+        if (dist < best_uq || best_uq < 0.0) {
+          best_uq = dist;
+          *pt1 = *start;
+          *pt2 = *end;
+        }
+      }
+    }
+  }
+  // Finally compute the square root to return the true distance.
+  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
+}
+
+// Constrained fit with a supplied direction vector. Finds the best line_pt,
+// that is one of the supplied points having the median cross product with
+// direction, ignoring points that have a cross product outside of the range
+// [min_dist, max_dist]. Returns the resulting error metric using the same
+// reduced set of points.
+// When there are no points, or none fall inside the range, *line_pt is set
+// to (0, 0) and 0.0 is returned. With debug set, the fit and every retained
+// distance are printed with tprintf.
+// *Makes use of floating point arithmetic*
+double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist,
+                                  bool debug, ICOORD *line_pt) {
+  ComputeConstrainedDistances(direction, min_dist, max_dist);
+  // Do something sensible with no points or computed distances.
+  if (pts_.empty() || distances_.empty()) {
+    line_pt->set_x(0);
+    line_pt->set_y(0);
+    return 0.0;
+  }
+  // Partially sort so that the median distance sits at median_index.
+  auto median_index = distances_.size() / 2;
+  std::nth_element(distances_.begin(), distances_.begin() + median_index, distances_.end());
+  *line_pt = distances_[median_index].data();
+  if (debug) {
+    tprintf("Constrained fit to dir %g, %g = %d, %d :%zu distances:\n", direction.x(), direction.y(),
+            line_pt->x(), line_pt->y(), distances_.size());
+    // The index is unsigned to match distances_.size().
+    for (size_t i = 0; i < distances_.size(); ++i) {
+      tprintf("%zu: %d, %d -> %g\n", i, distances_[i].data().x(), distances_[i].data().y(),
+              distances_[i].key());
+    }
+    tprintf("Result = %zu\n", median_index);
+  }
+  // Center distances on the fitted point.
+  double dist_origin = direction * *line_pt;
+  for (auto &distance : distances_) {
+    distance.key() -= dist_origin;
+  }
+  return sqrt(EvaluateLineFit());
+}
+
+// Returns true if there were enough points at the last call to Fit or
+// ConstrainedFit for the fitted points to be used on a badly fitted line.
+// "Enough" means that at least kMinPointsForErrorCount distances are
+// currently stored in distances_.
+bool DetLineFit::SufficientPointsForIndependentFit() const {
+  return distances_.size() >= kMinPointsForErrorCount;
+}
+
+// Legacy interface: fits a line and reports it as gradient *m and intercept
+// *c, returning the fit error from Fit(ICOORD*, ICOORD*).
+// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
+// function in preference to the LMS class.
+// A line whose two end points share the same x cannot be written as
+// y = m*x + c; in that case both outputs are set to zero.
+double DetLineFit::Fit(float *m, float *c) {
+  ICOORD first_pt, last_pt;
+  const double fit_error = Fit(&first_pt, &last_pt);
+  const auto delta_x = last_pt.x() - first_pt.x();
+  if (delta_x == 0) {
+    *m = 0.0f;
+    *c = 0.0f;
+    return fit_error;
+  }
+  const auto delta_y = last_pt.y() - first_pt.y();
+  *m = static_cast<float>(delta_y) / delta_x;
+  *c = first_pt.y() - *m * first_pt.x();
+  return fit_error;
+}
+
+// Legacy interface: constrained fit for a line of known gradient m. Stores
+// the fitted intercept in *c and returns the fit error.
+// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
+// to avoid potential difficulties with infinite gradients.
+double DetLineFit::ConstrainedFit(double m, float *c) {
+  if (pts_.empty()) {
+    // Nothing to fit: report a zero intercept and a zero error.
+    *c = 0.0f;
+    return 0.0;
+  }
+  // Build the direction vector (1, m) scaled by 1 / sqrt(1 + m * m).
+  const double x_component = 1.0 / sqrt(1.0 + m * m);
+  const double y_component = m * x_component;
+  FCOORD direction(x_component, y_component);
+  ICOORD anchor;
+  // The widest possible range keeps every point in the fit.
+  const double fit_error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &anchor);
+  *c = anchor.y() - anchor.x() * m;
+  return fit_error;
+}
+
+// Computes and returns the squared evaluation metric for a line fit.
+// Normally this is the squared upper-quartile error. When there are at least
+// kMinPointsForErrorCount distances and that error exceeds
+// kMaxRealDistance squared, the number of mis-fitted points is returned
+// instead, as this gives a better measure of fit for badly fitted lines
+// where more than a quarter of the points are badly fitted.
+double DetLineFit::EvaluateLineFit() {
+  const double uq_error = ComputeUpperQuartileError();
+  const bool enough_points = distances_.size() >= kMinPointsForErrorCount;
+  const bool badly_fitted = uq_error > kMaxRealDistance * kMaxRealDistance;
+  if (!(enough_points && badly_fitted)) {
+    return uq_error;
+  }
+  // Stored distances are scaled by the line length, so scale the limit too.
+  const double misfit_limit = kMaxRealDistance * sqrt(square_length_);
+  return NumberOfMisfittedPoints(misfit_limit);
+}
+
+// Replaces every stored distance with its absolute value, then returns the
+// squared upper-quartile error distance. Returns 0.0 when there are no
+// distances or when square_length_ is not positive.
+double DetLineFit::ComputeUpperQuartileError() {
+  const int error_count = distances_.size();
+  if (error_count == 0) {
+    return 0.0;
+  }
+  // Flip the sign of every negative distance in place.
+  for (auto &entry : distances_) {
+    if (entry.key() < 0) {
+      entry.key() = -entry.key();
+    }
+  }
+  // Partially sort so that the element three quarters of the way up is in
+  // its final position.
+  const auto quartile_index = 3 * error_count / 4;
+  std::nth_element(distances_.begin(), distances_.begin() + quartile_index, distances_.end());
+  const double quartile_dist = distances_[quartile_index].key();
+  // The true distance is the square root of the dist squared / square_length.
+  // The square root is skipped; the squared distance is returned.
+  if (square_length_ > 0.0) {
+    return quartile_dist * quartile_dist / square_length_;
+  }
+  return 0.0;
+}
+
+// Counts the stored distances that are strictly greater than threshold and
+// returns that count.
+int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
+  int misfit_count = 0;
+  for (const auto &entry : distances_) {
+    if (entry.key() > threshold) {
+      ++misfit_count;
+    }
+  }
+  return misfit_count;
+}
+
+// Computes all the cross product distances of the points from the line,
+// storing the actual (signed) cross products in distances_.
+// Ignores distances of points that are further away than the previous point,
+// and overlaps the previous point by at least half.
+// Also records the squared length of the line (end - start) in
+// square_length_.
+void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {
+  distances_.clear();
+  ICOORD line_vector = end;
+  line_vector -= start;
+  square_length_ = line_vector.sqlength();
+  int line_length = IntCastRounded(sqrt(square_length_));
+  // Compute the distance of each point from the line.
+  // prev_abs_dist and prev_dot describe the last point that was kept.
+  int prev_abs_dist = 0;
+  int prev_dot = 0;
+  // The index is unsigned to match pts_.size().
+  for (size_t i = 0; i < pts_.size(); ++i) {
+    ICOORD pt_vector = pts_[i].pt;
+    pt_vector -= start;
+    // Projection of the point onto the line, scaled by the line length.
+    int dot = line_vector % pt_vector;
+    // Compute |line_vector||pt_vector|sin(angle between)
+    int dist = line_vector * pt_vector;
+    int abs_dist = dist < 0 ? -dist : dist;
+    if (abs_dist > prev_abs_dist && i > 0) {
+      // Ignore this point if it overlaps the previous one.
+      // Both sides of each comparison are scaled by the line length.
+      int separation = abs(dot - prev_dot);
+      if (separation < line_length * pts_[i].halfwidth ||
+          separation < line_length * pts_[i - 1].halfwidth) {
+        continue;
+      }
+    }
+    distances_.emplace_back(dist, pts_[i].pt);
+    prev_abs_dist = abs_dist;
+    prev_dot = dot;
+  }
+}
+
+// Rebuilds distances_ with the (signed) cross product of direction with each
+// stored point, keeping only the values that lie inside the given distance
+// range [min_dist, max_dist]. square_length_ is set to the squared length of
+// direction.
+void DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist,
+                                             double max_dist) {
+  distances_.clear();
+  square_length_ = direction.sqlength();
+  for (auto &entry : pts_) {
+    FCOORD as_float = entry.pt;
+    // |direction||point|sin(angle between)
+    const double cross = direction * as_float;
+    const bool in_range = min_dist <= cross && cross <= max_dist;
+    if (in_range) {
+      distances_.emplace_back(cross, entry.pt);
+    }
+  }
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.h
@ -0,0 +1,157 @@
+///////////////////////////////////////////////////////////////////////
+// File:        detlinefit.h
+// Description: Deterministic least upper-quartile squares line fitting.
+// Author:      Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_DETLINEFIT_H_
+#define TESSERACT_CCSTRUCT_DETLINEFIT_H_
+
+#include "kdpair.h"
+#include "points.h"
+
+namespace tesseract {
+
+// This class fits a line to a set of ICOORD points.
+// There is no restriction on the direction of the line, as it
+// uses a vector method, ie no concern over infinite gradients.
+// The fitted line has the least upper quartile of squares of perpendicular
+// distances of all source points from the line, subject to the constraint
+// that the line is made from one of the pairs of [{p1,p2,p3},{pn-2, pn-1, pn}]
+// i.e. the 9 combinations of one of the first 3 and last 3 points.
+// A fundamental assumption of this algorithm is that one of the first 3 and
+// one of the last 3 points are near the best line fit.
+// The points must be Added in line order for the algorithm to work properly.
+// No floating point calculations are needed* to make an accurate fit,
+// and no random numbers are needed** so the algorithm is deterministic,
+// architecture-stable, and compiler-stable as well as stable to minor
+// changes in the input.
+// *A single floating point division is used to compute each line's distance.
+// This is unlikely to result in choice of a different line, but if it does,
+// it would be easy to replace with a 64 bit integer calculation.
+// **The selection step (std::nth_element in detlinefit.cpp) may pick a
+// different result among equals, but that wouldn't make any difference to
+// the end-result distance, so it does not affect the determinism of the
+// algorithm.
+// NOTE(review): this footnote originally described random numbers used by an
+// nth_item helper to guarantee average linear time; confirm it can go now
+// that the implementation calls std::nth_element.
+// Fitting time is linear, but with a high constant, as it tries 9 different
+// lines and computes the distance of all points each time.
+// This class is aimed at replacing the LLSQ (linear least squares) and
+// LMS (least median of squares) classes that are currently used for most
+// of the line fitting in Tesseract.
+class DetLineFit {
+public:
+  DetLineFit();
+  ~DetLineFit() = default;
+
+  // Delete all Added points.
+  void Clear();
+
+  // Adds a new point. Takes a copy - the pt doesn't need to stay in scope.
+  // Add must be called on points in sequence along the line.
+  void Add(const ICOORD &pt);
+  // Associates a half-width with the given point if a point overlaps the
+  // previous point by more than half the width, and its distance is further
+  // than the previous point, then the more distant point is ignored in the
+  // distance calculation. Useful for ignoring i dots and other diacritics.
+  void Add(const ICOORD &pt, int halfwidth);
+
+  // Fits a line to the points, returning the fitted line as a pair of
+  // points, and the upper quartile error.
+  // Equivalent to the four-argument Fit with nothing skipped at either end.
+  double Fit(ICOORD *pt1, ICOORD *pt2) {
+    return Fit(0, 0, pt1, pt2);
+  }
+  // Fits a line to the points, ignoring the skip_first initial points and the
+  // skip_last final points, returning the fitted line as a pair of points,
+  // and the upper quartile error.
+  double Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2);
+
+  // Constrained fit with a supplied direction vector. Finds the best line_pt,
+  // that is one of the supplied points having the median cross product with
+  // direction, ignoring points that have a cross product outside of the range
+  // [min_dist, max_dist]. Returns the resulting error metric using the same
+  // reduced set of points.
+  // *Makes use of floating point arithmetic*
+  double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug,
+                        ICOORD *line_pt);
+
+  // Returns true if there were enough points at the last call to Fit or
+  // ConstrainedFit for the fitted points to be used on a badly fitted line.
+  bool SufficientPointsForIndependentFit() const;
+
+  // Backwards compatible fit returning a gradient and constant.
+  // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
+  // function in preference to the LMS class.
+  double Fit(float *m, float *c);
+
+  // Backwards compatible constrained fit with a supplied gradient.
+  // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
+  // to avoid potential difficulties with infinite gradients.
+  double ConstrainedFit(double m, float *c);
+
+private:
+  // Simple struct to hold an ICOORD point and a halfwidth representing half
+  // the "width" (supposedly approximately parallel to the direction of the
+  // line) of each point, such that distant points can be discarded when they
+  // overlap nearer points. (Think i dot and other diacritics or noise.)
+  struct PointWidth {
+    // Point at the origin with a zero halfwidth.
+    PointWidth() : pt(ICOORD(0, 0)), halfwidth(0) {}
+    PointWidth(const ICOORD &pt0, int halfwidth0) : pt(pt0), halfwidth(halfwidth0) {}
+
+    ICOORD pt;
+    int halfwidth;
+  };
+  // Type holds the distance of each point from the fitted line and the point
+  // itself. Use of double allows integer distances from ICOORDs to be stored
+  // exactly, and also the floating point results from ConstrainedFit.
+  using DistPointPair = KDPairInc<double, ICOORD>;
+
+  // Computes and returns the squared evaluation metric for a line fit.
+  double EvaluateLineFit();
+
+  // Computes the absolute values of the precomputed distances_,
+  // and returns the squared upper-quartile error distance.
+  double ComputeUpperQuartileError();
+
+  // Returns the number of sample points that have an error more than threshold.
+  int NumberOfMisfittedPoints(double threshold) const;
+
+  // Computes all the cross product distances of the points from the line,
+  // storing the actual (signed) cross products in distances_.
+  // Ignores distances of points that are further away than the previous point,
+  // and overlaps the previous point by at least half.
+  void ComputeDistances(const ICOORD &start, const ICOORD &end);
+
+  // Computes all the cross product distances of the points perpendicular to
+  // the given direction, ignoring distances outside of the given distance
+  // range, storing the actual (signed) cross products in distances_.
+  void ComputeConstrainedDistances(const FCOORD &direction, double min_dist, double max_dist);
+
+  // Stores all the source points in the order they were given and their
+  // halfwidths, if any.
+  std::vector<PointWidth> pts_;
+  // Stores the computed perpendicular distances of (some of) the pts_ from a
+  // given vector (assuming it goes through the origin, making it a line).
+  // Since the distances may be a subset of the input points, and get
+  // re-ordered by std::nth_element, the original point is stored
+  // along side the distance.
+  std::vector<DistPointPair> distances_; // Distances of points.
+  // The squared length of the vector used to compute distances_.
+  double square_length_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCSTRUCT_DETLINEFIT_H_
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.cpp
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.cpp
@ -0,0 +1,99 @@
+/**********************************************************************
+ * File:        dppoint.cpp
+ * Description: Simple generic dynamic programming class.
+ * Author:      Ray Smith
+ * Created:     Wed Mar 25 19:08:01 PDT 2009
+ *
+ * (C) Copyright 2009, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "dppoint.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Runs the dynamic program over the array points of length size, scoring
+// each candidate predecessor with cost_func.
+// A predecessor lies between min_step and max_step (inclusive) positions
+// before the point being evaluated.
+// Returns the tail of the cheapest path, or nullptr for degenerate input.
+// If debug is true, the step limits and per-point costs are printed.
+DPPoint *DPPoint::Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,
+                        DPPoint *points) {
+  const bool degenerate = size <= 0 || max_step < min_step || min_step >= size;
+  if (degenerate) {
+    return nullptr; // Nothing to solve, but not necessarily an error.
+  }
+  ASSERT_HOST(min_step > 0); // Infinite loop possible if this is not true.
+  if (debug) {
+    tprintf("min = %d, max=%d\n", min_step, max_step);
+  }
+  // Forward pass: work out the total cost of the best path into each point.
+  for (int index = 0; index < size; ++index) {
+    DPPoint &point = points[index];
+    int step = min_step;
+    while (step <= max_step) {
+      // A step that would land before the start of the array is offered to
+      // the cost function as a null predecessor.
+      DPPoint *candidate = nullptr;
+      if (step <= index) {
+        candidate = &points[index - step];
+      }
+      const int64_t candidate_cost = (point.*cost_func)(candidate);
+      // Once past twice the minimum step, stop at the first minimum found.
+      if (point.best_prev_ != nullptr && step > min_step * 2 &&
+          candidate_cost > point.total_cost_) {
+        break;
+      }
+      ++step;
+    }
+    // The cost function excludes the local cost, so add it in now.
+    point.total_cost_ += point.local_cost_;
+    if (debug) {
+      tprintf("At point %d, local cost=%d, total_cost=%d, steps=%d\n", index, point.local_cost_,
+              point.total_cost_, point.total_steps_);
+    }
+  }
+  // Pick the cheapest end among the last min_step points, scanning backwards
+  // from the final point; on a tie the later point is kept.
+  const int earliest_end = size - min_step;
+  int best_end = size - 1;
+  for (int end = size - 2; end >= earliest_end; --end) {
+    if (points[end].total_cost_ < points[best_end].total_cost_) {
+      best_end = end;
+    }
+  }
+  return &points[best_end];
+}
+
+// A CostFunc that takes the variance of step into account in the cost.
+// prev is the candidate predecessor of this point; nullptr (or this itself)
+// means "start a new path here", which costs 0.
+// Returns the total cost of reaching this point via prev, excluding this
+// point's local cost, and records the path in this if it beats the best so far.
+int64_t DPPoint::CostWithVariance(const DPPoint *prev) {
+  if (prev == nullptr || prev == this) {
+    UpdateIfBetter(0, 1, nullptr, 0, 0, 0);
+    return 0;
+  }
+
+  // All products are formed in 64 bits: squaring a step or a step sum in
+  // 32-bit int overflows (undefined behaviour) once the value exceeds 46340.
+  const int64_t delta = this - prev;
+  const int32_t n = prev->n_ + 1;
+  const int32_t sig_x = static_cast<int32_t>(prev->sig_x_ + delta);
+  const int64_t sig_xsq = prev->sig_xsq_ + delta * delta;
+  const int64_t wide_sig_x = sig_x;
+  // Integer variance of the step sizes: (sum(x^2) - sum(x)^2 / n) / n.
+  int64_t cost = (sig_xsq - wide_sig_x * wide_sig_x / n) / n;
+  cost += prev->total_cost_;
+  UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq);
+  return cost;
+}
+
+// Records a new best path into this point, but only when cost is strictly
+// lower than the total cost currently stored; otherwise nothing changes.
+void DPPoint::UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n,
+                             int32_t sig_x, int64_t sig_xsq) {
+  if (cost >= total_cost_) {
+    return; // Not an improvement.
+  }
+  best_prev_ = prev;
+  total_steps_ = steps;
+  total_cost_ = static_cast<int32_t>(cost);
+  // Running sums that the variance-based cost function builds on.
+  n_ = n;
+  sig_x_ = sig_x;
+  sig_xsq_ = sig_xsq;
+}
+
+} // namespace tesseract.
--- a/3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.h
+++ b/3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.h
@ -0,0 +1,105 @@
+/**********************************************************************
+ * File:        dppoint.h
+ * Description: Simple generic dynamic programming class.
+ * Author:      Ray Smith
+ * Created:     Wed Mar 25 18:57:01 PDT 2009
+ *
+ * (C) Copyright 2009, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCSTRUCT_DPPOINT_H_
+#define TESSERACT_CCSTRUCT_DPPOINT_H_
+
+#include <cstdint>
+
+namespace tesseract {
+
+// A small dynamic programming helper for 1st-order problems: the cost of a
+// step depends only on the current point and the best cost up to it, with an
+// optional special case that uses the variance of the step sizes. Only the
+// single best path is kept.
+// Useful for problems such as finding the optimal cut points in a fixed-pitch
+// (vertical or horizontal) situation.
+// Skeletal Example:
+// DPPoint* array = new DPPoint[width];
+// for (int i = 0; i < width; i++) {
+//   array[i].AddLocalCost(cost_at_i);
+// }
+// DPPoint* best_end = DPPoint::Solve(..., array);
+// while (best_end != nullptr) {
+//   int cut_index = best_end - array;
+//   best_end = best_end->best_prev();
+// }
+// delete [] array;
+class DPPoint {
+public:
+  // Signature of a cost function. It evaluates the total cost at this
+  // (excluding this's local_cost) for the given predecessor, and if that
+  // beats this's total_cost it replaces the appropriate values in this.
+  using CostFunc = int64_t (DPPoint::*)(const DPPoint *);
+
+  // A fresh point has no local cost, no predecessor, a path length of 1 and
+  // the largest possible total cost (see the member initializers below).
+  DPPoint() = default;
+
+  // Solve the dynamic programming problem for the given array of points, with
+  // the given size and cost function.
+  // Steps backwards are limited to being between min_step and max_step
+  // inclusive.
+  // The return value is the tail of the best path.
+  static DPPoint *Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,
+                        DPPoint *points);
+
+  // A CostFunc that takes the variance of step into account in the cost.
+  int64_t CostWithVariance(const DPPoint *prev);
+
+  // Accessors.
+  // Total cost of the best path found to this point.
+  int total_cost() const {
+    return total_cost_;
+  }
+  // Number of steps in the best path found to this point.
+  int Pathlength() const {
+    return total_steps_;
+  }
+  // Previous point on the best path, or nullptr if there is none.
+  const DPPoint *best_prev() const {
+    return best_prev_;
+  }
+  // Accumulates new_cost into this point's own (local) cost.
+  void AddLocalCost(int new_cost) {
+    local_cost_ += new_cost;
+  }
+
+private:
+  // Code common to different cost functions.
+
+  // Update the other members if the cost is lower.
+  void UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n, int32_t sig_x,
+                      int64_t sig_xsq);
+
+  int32_t local_cost_ = 0;           // Cost of this point on its own.
+  int32_t total_cost_ = INT32_MAX;   // Sum of all costs in best path to here.
+                                     // During cost calculations local_cost is excluded.
+  int32_t total_steps_ = 1;          // Number of steps in best path to here.
+  const DPPoint *best_prev_ = nullptr; // Pointer to prev point in best path from here.
+  // Information for computing the variance part of the cost.
+  int32_t n_ = 0;       // Number of steps in best path to here for variance.
+  int32_t sig_x_ = 0;   // Sum of step sizes for computing variance.
+  int64_t sig_xsq_ = 0; // Sum of squares of steps for computing variance.
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCSTRUCT_DPPOINT_H_
--- a/Show More
+++ b/Show More