feat: 集成Tesseract源码到项目中

Description:   由于仓库中的Tesseract不是最新版本导致产生了一个bug,因此将Tesseract源码集成到项目中

Log: no
Change-Id: I088de95d6c6ab670406daa8d47ed2ed46929c2c0
This commit is contained in:
wangcong 2021-06-22 20:13:39 +08:00
parent 40c90fc3c7
commit 0cfed22ed4
439 changed files with 185083 additions and 13 deletions

View File

@ -0,0 +1,848 @@
///////////////////////////////////////////////////////////////////////
// File: baseapi.h
// Description: Simple API for calling tesseract.
// Author: Ray Smith
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_BASEAPI_H_
#define TESSERACT_API_BASEAPI_H_
#ifdef HAVE_CONFIG_H
# include "config_auto.h" // DISABLED_LEGACY_ENGINE
#endif
#include "export.h"
#include "pageiterator.h"
#include "publictypes.h"
#include "resultiterator.h"
#include "unichar.h"
#include "3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h"
#include <cstdio>
#include <tuple> // for std::tuple
#include <vector> // for std::vector
struct Pix;
struct Pixa;
struct Boxa;
namespace tesseract {
class PAGE_RES;
class ParagraphModel;
class BLOCK_LIST;
class ETEXT_DESC;
struct OSResults;
class UNICHARSET;
class Dawg;
class Dict;
class EquationDetect;
class PageIterator;
class ImageThresholder;
class LTRResultIterator;
class ResultIterator;
class MutableIterator;
class TessResultRenderer;
class Tesseract;
// Function to read a std::vector<char> from a whole file.
// Returns false on failure.
using FileReader = bool (*)(const char *filename, std::vector<char> *data);
// Pointer-to-member type for a const Dict method that takes
// (void *, const UNICHARSET &, UNICHAR_ID, bool) and returns int.
// This is the function type accepted by TessBaseAPI::SetDictFunc.
using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
bool) const;
// Pointer-to-member type for a non-const Dict method that takes
// (const char *, const char *, int, const char *, int) and returns double.
// This is the function type accepted by
// TessBaseAPI::SetProbabilityInContextFunc.
using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
int, const char *, int);
/**
* Base class for all tesseract APIs.
* Specific classes can add ability to work on different inputs or produce
* different outputs.
* This class is mostly an interface layer on top of the Tesseract instance
* class to hide the data types so that users of this class don't have to
* include any other Tesseract headers.
*/
class TESS_API TessBaseAPI {
public:
TessBaseAPI();
virtual ~TessBaseAPI();
// Copy constructor and assignment operator are currently unsupported.
TessBaseAPI(TessBaseAPI const &) = delete;
TessBaseAPI &operator=(TessBaseAPI const &) = delete;
/**
* Returns the version identifier as a static string. Do not delete.
*/
static const char *Version();
/**
* If compiled with OpenCL AND an available OpenCL
* device is deemed faster than serial code, then
* "device" is populated with the cl_device_id
* and returns sizeof(cl_device_id)
* otherwise *device=nullptr and returns 0.
*/
static size_t getOpenCLDevice(void **device);
/**
* Set the name of the input file. Needed for training and
* reading a UNLV zone file, and for searchable PDF output.
*/
void SetInputName(const char *name);
/**
* These functions are required for searchable PDF output.
* We need our hands on the input file so that we can include
* it in the PDF without transcoding. If that is not possible,
* we need the original image. Finally, resolution metadata
* is stored in the PDF so we need that as well.
*/
const char *GetInputName();
// Takes ownership of the input pix.
void SetInputImage(Pix *pix);
Pix *GetInputImage();
int GetSourceYResolution();
const char *GetDatapath();
/** Set the name of the bonus output files. Needed only for debugging. */
void SetOutputName(const char *name);
/**
* Set the value of an internal "parameter."
* Supply the name of the parameter and the value as a string, just as
* you would in a config file.
* Returns false if the name lookup failed.
* Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
* Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
* SetVariable may be used before Init, but settings will revert to
* defaults on End().
*
* Note: Must be called after Init(). Only works for non-init variables
* (init variables should be passed to Init()).
*/
bool SetVariable(const char *name, const char *value);
bool SetDebugVariable(const char *name, const char *value);
/**
* Returns true if the parameter was found among Tesseract parameters.
* Fills in value with the value of the parameter.
*/
bool GetIntVariable(const char *name, int *value) const;
bool GetBoolVariable(const char *name, bool *value) const;
bool GetDoubleVariable(const char *name, double *value) const;
/**
* Returns the pointer to the string that represents the value of the
* parameter if it was found among Tesseract parameters.
*/
const char *GetStringVariable(const char *name) const;
#ifndef DISABLED_LEGACY_ENGINE
/**
* Print Tesseract fonts table to the given file.
*/
void PrintFontsTable(FILE* fp) const;
#endif
/**
* Print Tesseract parameters to the given file.
*/
void PrintVariables(FILE *fp) const;
/**
* Get value of named variable as a string, if it exists.
*/
bool GetVariableAsString(const char *name, std::string *val) const;
/**
* Instances are now mostly thread-safe and totally independent,
* but some global parameters remain. Basically it is safe to use multiple
* TessBaseAPIs in different threads in parallel, UNLESS:
* you use SetVariable on some of the Params in classify and textord.
* If you do, then the effect will be to change it for all your instances.
*
* Start tesseract. Returns zero on success and -1 on failure.
* NOTE that the only members that may be called before Init are those
* listed above here in the class definition.
*
* The datapath must be the name of the tessdata directory.
* The language is (usually) an ISO 639-3 string or nullptr will default to
* eng. It is entirely safe (and eventually will be efficient too) to call
* Init multiple times on the same instance to change language, or just
* to reset the classifier.
* The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
* that multiple languages are to be loaded. Eg hin+eng will load Hindi and
* English. Languages may specify internally that they want to be loaded
* with one or more other languages, so the ~ sign is available to override
* that. Eg if hin were set to load eng by default, then hin+~eng would force
* loading only hin. The number of loaded languages is limited only by
* memory, with the caveat that loading additional languages will impact
* both speed and accuracy, as there is more work to do to decide on the
* applicable language, and there is more chance of hallucinating incorrect
* words.
* WARNING: On changing languages, all Tesseract parameters are reset
* back to their default values. (Which may vary between languages.)
* If you have a rare need to set a Variable that controls
* initialization for a second call to Init you should explicitly
* call End() and then use SetVariable before Init. This is only a very
* rare use case, since there are very few uses that require any parameters
* to be set before Init.
*
* If set_only_non_debug_params is true, only params that do not contain
* "debug" in the name will be set.
*/
int Init(const char *datapath, const char *language, OcrEngineMode mode,
char **configs, int configs_size,
const std::vector<std::string> *vars_vec,
const std::vector<std::string> *vars_values,
bool set_only_non_debug_params);
/**
 * Convenience overload: start Tesseract with an explicit engine mode, but
 * with no config files, no variable overrides, and without restricting the
 * settings to non-debug params.
 */
int Init(const char *datapath, const char *language, OcrEngineMode oem) {
  char **const no_configs = nullptr;
  const std::vector<std::string> *const no_vars = nullptr;
  const bool only_non_debug_params = false;
  return Init(datapath, language, oem, no_configs, /*configs_size=*/0,
              /*vars_vec=*/no_vars, /*vars_values=*/no_vars,
              only_non_debug_params);
}
/**
 * Convenience overload: start Tesseract with the OEM_DEFAULT engine mode,
 * no config files and no variable overrides.
 */
int Init(const char *datapath, const char *language) {
  // The three-argument overload supplies the same remaining defaults
  // (nullptr configs, zero config count, nullptr var vectors, false).
  return Init(datapath, language, OEM_DEFAULT);
}
// In-memory version reads the traineddata file directly from the given
// data[data_size] array, and/or reads data via a FileReader.
int Init(const char *data, int data_size, const char *language,
OcrEngineMode mode, char **configs, int configs_size,
const std::vector<std::string> *vars_vec,
const std::vector<std::string> *vars_values,
bool set_only_non_debug_params, FileReader reader);
/**
* Returns the languages string used in the last valid initialization.
* If the last initialization specified "deu+hin" then that will be
* returned. If hin loaded eng automatically as well, then that will
* not be included in this list. To find the languages actually
* loaded use GetLoadedLanguagesAsVector.
* The returned string should NOT be deleted.
*/
const char *GetInitLanguagesAsString() const;
/**
* Returns the loaded languages in the vector of std::string.
* Includes all languages loaded by the last Init, including those loaded
* as dependencies of other loaded languages.
*/
void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;
/**
* Returns the available languages in the sorted vector of std::string.
*/
void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;
/**
* Init only the lang model component of Tesseract. The only functions
* that work after this init are SetVariable and IsValidWord.
* WARNING: temporary! This function will be removed from here and placed
* in a separate API at some future time.
*/
int InitLangMod(const char *datapath, const char *language);
/**
* Init only for page layout analysis. Use only for calls to SetImage and
* AnalysePage. Calls that attempt recognition will generate an error.
*/
void InitForAnalysePage();
/**
* Read a "config" file containing a set of param, value pairs.
* Searches the standard places: tessdata/configs, tessdata/tessconfigs
* and also accepts a relative or absolute path name.
* Note: only non-init params will be set (init params are set by Init()).
*/
void ReadConfigFile(const char *filename);
/** Same as above, but only set debug params from the given config file. */
void ReadDebugConfigFile(const char *filename);
/**
* Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
* The mode is stored as an IntParam so it can also be modified by
* ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
*/
void SetPageSegMode(PageSegMode mode);
/** Return the current page segmentation mode. */
PageSegMode GetPageSegMode() const;
/**
* Recognize a rectangle from an image and return the result as a string.
* May be called many times for a single Init.
* Currently has no error checking.
* Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
* Palette color images will not work properly and must be converted to
* 24 bit.
* Binary images of 1 bit per pixel may also be given but they must be
* byte packed with the MSB of the first byte being the first pixel, and a
* 1 represents WHITE. For binary images set bytes_per_pixel=0.
* The recognized text is returned as a char* which is coded
* as UTF8 and must be freed with the delete [] operator.
*
* Note that TesseractRect is the simplified convenience interface.
* For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
* and one or more of the Get*Text functions below.
*/
char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
int bytes_per_line, int left, int top, int width,
int height);
/**
* Call between pages or documents etc to free up memory and forget
* adaptive data.
*/
void ClearAdaptiveClassifier();
/**
* @defgroup AdvancedAPI Advanced API
* The following methods break TesseractRect into pieces, so you can
* get hold of the thresholded image, get the text in different formats,
* get bounding boxes, confidences etc.
*/
/* @{ */
/**
* Provide an image for Tesseract to recognize. Format is as
* TesseractRect above. Copies the image buffer and converts to Pix.
* SetImage clears all recognition results, and sets the rectangle to the
* full image, so it may be followed immediately by a GetUTF8Text, and it
* will automatically perform recognition.
*/
void SetImage(const unsigned char *imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line);
/**
* Provide an image for Tesseract to recognize. As with SetImage above,
* Tesseract takes its own copy of the image, so it need not persist until
* after Recognize.
* Pix vs raw, which to use?
* Use Pix where possible. Tesseract uses Pix as its internal representation
* and it is therefore more efficient to provide a Pix directly.
*/
void SetImage(Pix *pix);
/**
* Set the resolution of the source image in pixels per inch so font size
* information can be calculated in results. Call this after SetImage().
*/
void SetSourceResolution(int ppi);
/**
* Restrict recognition to a sub-rectangle of the image. Call after SetImage.
* Each SetRectangle clears the recognition results so multiple rectangles
* can be recognized with the same image.
*/
void SetRectangle(int left, int top, int width, int height);
/**
* Get a copy of the internal thresholded image from Tesseract.
* Caller takes ownership of the Pix and must pixDestroy it.
* May be called any time after SetImage, or after TesseractRect.
*/
Pix *GetThresholdedImage();
/**
* Get the result of page layout analysis as a leptonica-style
* Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
*/
Boxa *GetRegions(Pixa **pixa);
/**
* Get the textlines as a leptonica-style
* Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
* If raw_image is true, then extract from the original image instead of the
* thresholded image and pad by raw_padding pixels.
* If blockids is not nullptr, the block-id of each line is also returned as
* an array of one element per line. delete [] after use. If paraids is not
* nullptr, the paragraph-id of each line within its block is also returned as
* an array of one element per line. delete [] after use.
*/
Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
int **blockids, int **paraids);
/*
 Shorthand for the most common usage: textlines extracted from the
 thresholded image, with no padding and no paragraph ids requested.
 */
Boxa *GetTextlines(Pixa **pixa, int **blockids) {
  const bool use_raw_image = false;
  const int padding = 0;
  int **const no_paraids = nullptr;
  return GetTextlines(use_raw_image, padding, pixa, blockids, no_paraids);
}
/**
* Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
* pair, in reading order. Enables downstream handling of non-rectangular
* regions.
* Can be called before or after Recognize.
* If blockids is not nullptr, the block-id of each line is also returned as
* an array of one element per line. delete [] after use.
*/
Boxa *GetStrips(Pixa **pixa, int **blockids);
/**
* Get the words as a leptonica-style
* Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
*/
Boxa *GetWords(Pixa **pixa);
/**
* Gets the individual connected (text) components (created
* after the page segmentation step, but before recognition)
* as a leptonica-style Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
* Note: the caller is responsible for calling boxaDestroy()
* on the returned Boxa array and pixaDestroy() on cc array.
*/
Boxa *GetConnectedComponents(Pixa **cc);
/**
* Get the given level kind of components (block, textline, word etc.) as a
* leptonica-style Boxa, Pixa pair, in reading order.
* Can be called before or after Recognize.
* If blockids is not nullptr, the block-id of each component is also returned
* as an array of one element per component. delete [] after use.
* If paraids is not nullptr, the paragraph-id of each component within its
* block is also returned as an array of one element per component. delete []
* after use. If raw_image is true, then portions of the original image are
* extracted instead of the thresholded image and padded with raw_padding. If
* text_only is true, then only text components are returned.
*/
Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
bool raw_image, int raw_padding, Pixa **pixa,
int **blockids, int **paraids);
// Shorthand for the most common usage: component images taken from the
// thresholded (binary) image, with no padding and no paragraph ids requested.
Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
                         Pixa **pixa, int **blockids) {
  const bool use_raw_image = false;
  const int padding = 0;
  int **const no_paraids = nullptr;
  return GetComponentImages(level, text_only, use_raw_image, padding, pixa,
                            blockids, no_paraids);
}
/**
* Returns the scale factor of the thresholded image that would be returned by
* GetThresholdedImage() and the various GetX() methods that call
* GetComponentImages().
* Returns 0 if no thresholder has been set.
*/
int GetThresholdedImageScaleFactor() const;
/**
* Runs page layout analysis in the mode set by SetPageSegMode.
* May optionally be called prior to Recognize to get access to just
* the page layout results. Returns an iterator to the results.
* If merge_similar_words is true, words are combined where suitable for use
* with a line recognizer. Use if you want to use AnalyseLayout to find the
* textlines, and then want to process textline fragments with an external
* line recognizer.
* Returns nullptr on error or an empty page.
* The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
* DetectOS, or anything else that changes the internal PAGE_RES.
*/
PageIterator *AnalyseLayout();
PageIterator *AnalyseLayout(bool merge_similar_words);
/**
* Recognize the image supplied via SetImage, generating Tesseract
* internal structures. Returns 0 on success.
* Optional. The Get*Text functions below will call Recognize if needed.
* After Recognize, the output is kept internally until the next SetImage.
*/
int Recognize(ETEXT_DESC *monitor);
/**
* Methods to retrieve information after SetImage(),
* Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
*/
/**
* Turns images into symbolic text.
*
* filename can point to a single image, a multi-page TIFF,
* or a plain text list of image filenames.
*
* retry_config is useful for debugging. If not nullptr, you can fall
* back to an alternate configuration if a page fails for some
* reason.
*
* timeout_millisec terminates processing if any single page
* takes too long. Set to 0 for unlimited time.
*
* renderer is responsible for creating the output. For example,
* use the TessTextRenderer if you want plaintext output, or
* the TessPDFRenderer to produce searchable PDF.
*
* If tessedit_page_number is non-negative, will only process that
* single page. Works for multi-page tiff file, or filelist.
*
* Returns true if successful, false on error.
*/
bool ProcessPages(const char *filename, const char *retry_config,
int timeout_millisec, TessResultRenderer *renderer);
// Does the real work of ProcessPages.
bool ProcessPagesInternal(const char *filename, const char *retry_config,
int timeout_millisec, TessResultRenderer *renderer);
/**
* Turn a single image into symbolic text.
*
* The pix is the image processed. filename and page_index are
* metadata used by side-effect processes, such as reading a box
* file or formatting as hOCR.
*
* See ProcessPages for descriptions of other parameters.
*/
bool ProcessPage(Pix *pix, int page_index, const char *filename,
const char *retry_config, int timeout_millisec,
TessResultRenderer *renderer);
/**
* Get a reading-order iterator to the results of LayoutAnalysis and/or
* Recognize. The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
* DetectOS, or anything else that changes the internal PAGE_RES.
*/
ResultIterator *GetIterator();
/**
* Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
* The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
* DetectOS, or anything else that changes the internal PAGE_RES.
*/
MutableIterator *GetMutableIterator();
/**
* The recognized text is returned as a char* which is coded
* as UTF8 and must be freed with the delete [] operator.
*/
char *GetUTF8Text();
size_t GetNumberOfTables() const;
/// Return the i-th table bounding box coordinates
///
/// Gives the (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
/// coordinates of the i-th table.
std::tuple<int, int, int, int> GetTableBoundingBox(
unsigned
i ///< Index of the table, for upper limit \see GetNumberOfTables()
);
/// Get bounding boxes of the rows of a table
/// return values are (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
std::vector<std::tuple<int, int, int, int> > GetTableRows(
unsigned
i ///< Index of the table, for upper limit \see GetNumberOfTables()
);
/// Get bounding boxes of the cols of a table
/// return values are (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
std::vector<std::tuple<int, int, int, int> > GetTableCols(
unsigned
i ///< Index of the table, for upper limit \see GetNumberOfTables()
);
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
* monitor can be used to
* cancel the recognition
* receive progress callbacks
* Returned string must be freed with the delete [] operator.
*/
char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *GetHOCRText(int page_number);
/**
* Make an XML-formatted string with Alto markup from the internal
* data structures.
*/
char *GetAltoText(ETEXT_DESC *monitor, int page_number);
/**
* Make an XML-formatted string with Alto markup from the internal
* data structures.
*/
char *GetAltoText(int page_number);
/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *GetTSVText(int page_number);
/**
* Make a box file for LSTM training from the internal data structures.
* Constructs coordinates in the original image - not just the rectangle.
* page_number is a 0-based page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/
char *GetLSTMBoxText(int page_number);
/**
* The recognized text is returned as a char* which is coded in the same
* format as a box file used in training.
* Constructs coordinates in the original image - not just the rectangle.
* page_number is a 0-based page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/
char *GetBoxText(int page_number);
/**
* The recognized text is returned as a char* which is coded in the same
* format as a WordStr box file used in training.
* page_number is a 0-based page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/
char *GetWordStrBoxText(int page_number);
/**
* The recognized text is returned as a char* which is coded
* as UNLV format Latin-1 with specific reject and suspect codes.
* Returned string must be freed with the delete [] operator.
*/
char *GetUNLVText();
/**
* Detect the orientation of the input image and apparent script (alphabet).
* orient_deg is the detected clockwise rotation of the input image in degrees
* (0, 90, 180, 270)
* orient_conf is the confidence (15.0 is reasonably confident)
* script_name is an ASCII string, the name of the script, e.g. "Latin"
* script_conf is confidence level in the script
* Returns true on success and writes values to each parameter as an output
*/
bool DetectOrientationScript(int *orient_deg, float *orient_conf,
const char **script_name, float *script_conf);
/**
* The recognized text is returned as a char* which is coded
* as UTF8 and must be freed with the delete [] operator.
* page_number is a 0-based page index that will appear in the osd file.
*/
char *GetOsdText(int page_number);
/** Returns the (average) confidence value between 0 and 100. */
int MeanTextConf();
/**
* Returns all word confidences (between 0 and 100) in an array, terminated
* by -1. The calling function must delete [] after use.
* The number of confidences should correspond to the number of space-
* delimited words in GetUTF8Text.
*/
int *AllWordConfidences();
#ifndef DISABLED_LEGACY_ENGINE
/**
* Applies the given word to the adaptive classifier if possible.
* The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
* tell the boundaries of the graphemes.
* Assumes that SetImage/SetRectangle have been used to set the image
* to the given word. The mode arg should be PSM_SINGLE_WORD or
* PSM_CIRCLE_WORD, as that will be used to control layout analysis.
* The currently set PageSegMode is preserved.
* Returns false if adaption was not possible for some reason.
*/
bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
#endif // ndef DISABLED_LEGACY_ENGINE
/**
* Free up recognition results and any stored image data, without actually
* freeing any recognition data that would be time-consuming to reload.
* Afterwards, you must call SetImage or TesseractRect before doing
* any Recognize or Get* operation.
*/
void Clear();
/**
* Close down tesseract and free up all memory. End() is equivalent to
* destructing and reconstructing your TessBaseAPI.
* Once End() has been used, none of the other API functions may be used
* other than Init and anything declared above it in the class definition.
*/
void End();
/**
* Clear any library-level memory caches.
* There are a variety of expensive-to-load constant data structures (mostly
* language dictionaries) that are cached globally -- surviving the Init()
* and End() of individual TessBaseAPI's. This function allows the clearing
* of these caches.
**/
static void ClearPersistentCache();
/**
* Check whether a word is valid according to Tesseract's language model
* @return 0 if the word is invalid, non-zero if valid.
* @warning temporary! This function will be removed from here and placed
* in a separate API at some future time.
*/
int IsValidWord(const char *word) const;
// Returns true if utf8_character is defined in the UniCharset.
bool IsValidCharacter(const char *utf8_character) const;
bool GetTextDirection(int *out_offset, float *out_slope);
/** Sets Dict::letter_is_okay_ function to point to the given function. */
void SetDictFunc(DictFunc f);
/** Sets Dict::probability_in_context_ function to point to the given
* function.
*/
void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
/**
* Estimates the Orientation And Script of the image.
* @return true if the image was processed successfully.
*/
bool DetectOS(OSResults *);
/**
* Return text orientation of each block as determined by an earlier run
* of layout analysis.
*/
void GetBlockTextOrientations(int **block_orientation,
bool **vertical_writing);
/** This method returns the string form of the specified unichar. */
const char *GetUnichar(int unichar_id) const;
/** Return the pointer to the i-th dawg loaded into tesseract_ object. */
const Dawg *GetDawg(int i) const;
/** Return the number of dawgs loaded into tesseract_ object. */
int NumDawgs() const;
/** Returns the tesseract_ member (the underlying data object) as stored. */
Tesseract *tesseract() const { return tesseract_; }
/** Returns last_oem_requested_, the engine mode last requested. */
OcrEngineMode oem() const { return last_oem_requested_; }
void set_min_orientation_margin(double margin);
/* @} */
protected:
/** Common code for setting the image. Returns true if Init has been called.
*/
bool InternalSetImage();
/**
* Run the thresholder to make the thresholded image. If pix is not nullptr,
* the source is thresholded to pix instead of the internal IMAGE.
*/
virtual bool Threshold(Pix **pix);
/**
* Find lines from the image making the BLOCK_LIST.
* @return 0 on success.
*/
int FindLines();
/** Delete the pageres and block list ready for a new page. */
void ClearResults();
/**
* Return an LTR Result Iterator -- used only for training, as we really want
* to ignore all BiDi smarts at that point.
* delete once you're done with it.
*/
LTRResultIterator *GetLTRIterator();
/**
* Return the length of the output text string, as UTF8, assuming
* one newline per line and one per block, with a terminator,
* and assuming a single character reject marker for each rejected character.
* Also return the number of recognized blobs in blob_count.
*/
int TextLength(int *blob_count) const;
//// paragraphs.cpp ////////////////////////////////////////////////////
void DetectParagraphs(bool after_text_recognition);
/** Read-only access to page_res_, the page-level data held by this object. */
const PAGE_RES *GetPageRes() const { return page_res_; }
protected:
Tesseract *tesseract_; ///< The underlying data object.
Tesseract *osd_tesseract_; ///< For orientation & script detection.
EquationDetect *equ_detect_; ///< The equation detector.
FileReader reader_; ///< Reads files from any filesystem.
ImageThresholder *thresholder_; ///< Image thresholding module.
std::vector<ParagraphModel *> *paragraph_models_;
BLOCK_LIST *block_list_; ///< The page layout.
PAGE_RES *page_res_; ///< The page-level data.
std::string input_file_; ///< Name used by training code.
std::string output_file_; ///< Name used by debug code.
std::string datapath_; ///< Current location of tessdata.
std::string language_; ///< Last initialized language.
OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
bool recognition_done_; ///< page_res_ contains recognition data.
/**
* @defgroup ThresholderParams Thresholder Parameters
* Parameters saved from the Thresholder. Needed to rebuild coordinates.
*/
/* @{ */
int rect_left_;
int rect_top_;
int rect_width_;
int rect_height_;
int image_width_;
int image_height_;
/* @} */
private:
// A list of image filenames gets special consideration
bool ProcessPagesFileList(FILE *fp, std::string *buf,
const char *retry_config, int timeout_millisec,
TessResultRenderer *renderer,
int tessedit_page_number);
// TIFF supports multipage so gets special consideration.
bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
const char *filename, const char *retry_config,
int timeout_millisec,
TessResultRenderer *renderer,
int tessedit_page_number);
}; // class TessBaseAPI.
/** Escape a char string - replace &<>"' with HTML codes. */
std::string HOcrEscape(const char *text);
} // namespace tesseract
#endif // TESSERACT_API_BASEAPI_H_

View File

@ -0,0 +1,482 @@
///////////////////////////////////////////////////////////////////////
// File: capi.h
// Description: C-API TessBaseAPI
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef API_CAPI_H_
#define API_CAPI_H_
#include "export.h"
#ifdef __cplusplus
# include <tesseract/baseapi.h>
# include <tesseract/ocrclass.h>
# include <tesseract/pageiterator.h>
# include <tesseract/renderer.h>
# include <tesseract/resultiterator.h>
#endif
#include <stdbool.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef BOOL
# define BOOL int
# define TRUE 1
# define FALSE 0
#endif
#ifdef __cplusplus
/* C++ translation units: the C API type names are plain aliases of the
 * corresponding types in namespace tesseract. */
typedef tesseract::TessResultRenderer TessResultRenderer;
typedef tesseract::TessBaseAPI TessBaseAPI;
typedef tesseract::PageIterator TessPageIterator;
typedef tesseract::ResultIterator TessResultIterator;
typedef tesseract::MutableIterator TessMutableIterator;
typedef tesseract::ChoiceIterator TessChoiceIterator;
typedef tesseract::OcrEngineMode TessOcrEngineMode;
typedef tesseract::PageSegMode TessPageSegMode;
typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
typedef tesseract::Orientation TessOrientation;
typedef tesseract::ParagraphJustification TessParagraphJustification;
typedef tesseract::WritingDirection TessWritingDirection;
typedef tesseract::TextlineOrder TessTextlineOrder;
typedef tesseract::PolyBlockType TessPolyBlockType;
typedef tesseract::ETEXT_DESC ETEXT_DESC;
#else
/* C translation units: the classes are only declared as incomplete struct
 * types, so C code can hold pointers to them but cannot look inside. */
typedef struct TessResultRenderer TessResultRenderer;
typedef struct TessBaseAPI TessBaseAPI;
typedef struct TessPageIterator TessPageIterator;
typedef struct TessResultIterator TessResultIterator;
typedef struct TessMutableIterator TessMutableIterator;
typedef struct TessChoiceIterator TessChoiceIterator;
/* C declarations of the enums that the C++ branch above takes from
 * namespace tesseract.
 * NOTE(review): enumerator names and order presumably have to stay identical
 * to the C++ definitions (see publictypes.h) so that both languages agree on
 * the numeric values - confirm whenever either side changes. */
typedef enum TessOcrEngineMode {
OEM_TESSERACT_ONLY,
OEM_LSTM_ONLY,
OEM_TESSERACT_LSTM_COMBINED,
OEM_DEFAULT
} TessOcrEngineMode;
typedef enum TessPageSegMode {
PSM_OSD_ONLY,
PSM_AUTO_OSD,
PSM_AUTO_ONLY,
PSM_AUTO,
PSM_SINGLE_COLUMN,
PSM_SINGLE_BLOCK_VERT_TEXT,
PSM_SINGLE_BLOCK,
PSM_SINGLE_LINE,
PSM_SINGLE_WORD,
PSM_CIRCLE_WORD,
PSM_SINGLE_CHAR,
PSM_SPARSE_TEXT,
PSM_SPARSE_TEXT_OSD,
PSM_RAW_LINE,
PSM_COUNT
} TessPageSegMode;
/* Levels of the page hierarchy, listed from the largest unit (block) down
 * to the smallest (symbol). */
typedef enum TessPageIteratorLevel {
RIL_BLOCK,
RIL_PARA,
RIL_TEXTLINE,
RIL_WORD,
RIL_SYMBOL
} TessPageIteratorLevel;
typedef enum TessPolyBlockType {
PT_UNKNOWN,
PT_FLOWING_TEXT,
PT_HEADING_TEXT,
PT_PULLOUT_TEXT,
PT_EQUATION,
PT_INLINE_EQUATION,
PT_TABLE,
PT_VERTICAL_TEXT,
PT_CAPTION_TEXT,
PT_FLOWING_IMAGE,
PT_HEADING_IMAGE,
PT_PULLOUT_IMAGE,
PT_HORZ_LINE,
PT_VERT_LINE,
PT_NOISE,
PT_COUNT
} TessPolyBlockType;
typedef enum TessOrientation {
ORIENTATION_PAGE_UP,
ORIENTATION_PAGE_RIGHT,
ORIENTATION_PAGE_DOWN,
ORIENTATION_PAGE_LEFT
} TessOrientation;
typedef enum TessParagraphJustification {
JUSTIFICATION_UNKNOWN,
JUSTIFICATION_LEFT,
JUSTIFICATION_CENTER,
JUSTIFICATION_RIGHT
} TessParagraphJustification;
typedef enum TessWritingDirection {
WRITING_DIRECTION_LEFT_TO_RIGHT,
WRITING_DIRECTION_RIGHT_TO_LEFT,
WRITING_DIRECTION_TOP_TO_BOTTOM
} TessWritingDirection;
typedef enum TessTextlineOrder {
TEXTLINE_ORDER_LEFT_TO_RIGHT,
TEXTLINE_ORDER_RIGHT_TO_LEFT,
TEXTLINE_ORDER_TOP_TO_BOTTOM
} TessTextlineOrder;
/* The progress monitor is an incomplete type for C as well. */
typedef struct ETEXT_DESC ETEXT_DESC;
#endif
/* Callback types that can be installed on a progress monitor with
 * TessMonitorSetCancelFunc() and TessMonitorSetProgressFunc().
 * TessCancelFunc receives a user pointer (`cancel_this`) and a word count.
 * TessProgressFunc receives the monitor itself plus four box coordinates.
 * NOTE(review): the C++ ETEXT_DESC documentation describes a true return
 * from the cancel function as a request to cancel; the meaning of the
 * progress function's return value is not visible here - confirm. */
typedef bool (*TessCancelFunc)(void *cancel_this, int words);
typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
int bottom);
struct Pix;
struct Boxa;
struct Pixa;
/* General free functions */
/* Returns the version as a C string.
 * The parameter list is spelled (void): in C before C23 an empty list "()"
 * declares a function without a prototype, so calls would not be checked and
 * -Wstrict-prototypes would warn. For C++ callers the two spellings are
 * equivalent. */
TESS_API const char *TessVersion(void);
/* Release helpers, one per kind of buffer: a single string, an array of
 * strings, and an array of ints.
 * NOTE(review): which API results have to be released with which helper is
 * not visible in this header - confirm against the implementation. */
TESS_API void TessDeleteText(const char *text);
TESS_API void TessDeleteTextArray(char **arr);
TESS_API void TessDeleteIntArray(const int *arr);
/* Renderer API */
/* Factory functions, one per output format; every one takes the output base
 * name and returns a TessResultRenderer pointer.
 * NOTE(review): the returned renderer is presumably owned by the caller and
 * released with TessDeleteResultRenderer() - confirm in the implementation. */
TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
BOOL font_info);
TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
const char *datadir,
BOOL textonly);
TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
const char *outputbase);
TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
/* Renderers can be linked together: Insert attaches `next` to `renderer`,
 * Next returns the renderer that follows `renderer`. */
TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
TessResultRenderer *next);
TESS_API TessResultRenderer *TessResultRendererNext(
TessResultRenderer *renderer);
/* Document life cycle: begin a document, add images from an API handle,
 * end the document. */
TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
const char *title);
TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
TessBaseAPI *api);
TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
/* "Extention" (sic): the misspelling is part of the exported symbol name and
 * must not be corrected here. */
TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
/* Base API */
/* Creates a TessBaseAPI handle; the matching release function is
 * TessBaseAPIDelete().
 * The parameter list is spelled (void) so that C compilers (before C23) see a
 * real prototype instead of an unprototyped "()" declaration. */
TESS_API TessBaseAPI *TessBaseAPICreate(void);
/* Handle destruction, input/output naming and parameter ("variable") access.
 * NOTE(review): each TessBaseAPIXxx function presumably forwards to the
 * TessBaseAPI member function of the same name - confirm in the
 * implementation. */
TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
/* Setters take the variable name and its value as strings. */
TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
const char *value);
TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
const char *value);
/* Typed getters write the result through the `value` out-parameter.
 * NOTE(review): the BOOL result presumably tells whether the variable was
 * found - confirm. */
TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
const char *name, int *value);
TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
const char *name, BOOL *value);
TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
const char *name, double *value);
TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
const char *name);
TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
const char *filename);
/* Initialisation. The numbered variants differ only in how many optional
 * arguments they accept: Init3 takes datapath and language, Init2 adds the
 * engine mode, Init1 adds a list of config names, and Init4 further adds
 * parallel arrays of variable names/values plus a flag.
 * NOTE(review): the meaning of the int result is not visible in this header
 * - confirm against the implementation. */
TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
const char *language, TessOcrEngineMode oem,
char **configs, int configs_size);
TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
const char *language, TessOcrEngineMode oem);
TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
const char *language);
TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
const char *language, TessOcrEngineMode mode,
char **configs, int configs_size, char **vars_vec,
char **vars_values, size_t vars_vec_size,
BOOL set_only_non_debug_params);
/* Language queries.
 * NOTE(review): the char ** results are presumably released with
 * TessDeleteTextArray() - confirm. */
TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
const TessBaseAPI *handle);
TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
const TessBaseAPI *handle);
TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
const TessBaseAPI *handle);
TESS_API int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath,
const char *language);
TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
/* Read parameter settings from a config file. */
TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
const char *filename);
TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
const char *filename);
/* Page segmentation mode accessors. */
TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
TessPageSegMode mode);
TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
/* Takes a raw image buffer and a rectangle inside it and returns a char *.
 * NOTE(review): the result is presumably newly allocated text that the
 * caller frees with TessDeleteText() - confirm. */
TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
const unsigned char *imagedata,
int bytes_per_pixel, int bytes_per_line,
int left, int top, int width, int height);
TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
/* Image input: SetImage takes a raw pixel buffer with its geometry,
 * SetImage2 takes a Pix instead. */
TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
const unsigned char *imagedata, int width,
int height, int bytes_per_pixel,
int bytes_per_line);
TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
int width, int height);
TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
/* Layout component queries. Each returns a Boxa and has optional-looking
 * out-parameters (Pixa **, int **) for the matching images and ids.
 * NOTE(review): ownership of the returned Boxa/Pixa/int arrays, and whether
 * the out-parameters may be null, are not visible here - confirm. */
TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
struct Pixa **pixa);
TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
struct Pixa **pixa,
int **blockids);
TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
BOOL raw_image, int raw_padding,
struct Pixa **pixa,
int **blockids, int **paraids);
TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
struct Pixa **pixa, int **blockids);
TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
struct Pixa **pixa);
TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
struct Pixa **cc);
TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
TessPageIteratorLevel level,
BOOL text_only,
struct Pixa **pixa,
int **blockids);
TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
int **paraids);
TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
const TessBaseAPI *handle);
/* Layout analysis and recognition. Recognize takes a progress monitor
 * (see the "Progress monitor" functions at the end of this header). */
TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
/* Whole-document / single-page processing through a renderer. */
TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
const char *retry_config,
int timeout_millisec,
TessResultRenderer *renderer);
TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
int page_index, const char *filename,
const char *retry_config,
int timeout_millisec,
TessResultRenderer *renderer);
TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
TessBaseAPI *handle);
/* Text getters, one per output format.
 * NOTE(review): the char * results are presumably allocated for the caller
 * and released with TessDeleteText(); the int * from AllWordConfidences
 * presumably with TessDeleteIntArray() - confirm. */
TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
int page_number);
TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
/* Only declared when the legacy engine is compiled in. */
#ifndef DISABLED_LEGACY_ENGINE
TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
TessPageSegMode mode,
const char *wordstr);
#endif // #ifndef DISABLED_LEGACY_ENGINE
TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
float *out_slope);
TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
#ifndef DISABLED_LEGACY_ENGINE
// Only declared when the legacy engine is compiled in.
// Results are written through the four out-parameters: orientation in
// degrees with its confidence, and script name with its confidence.
// Call TessDeleteText(*script_name) to free memory allocated by this
// function
TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
int *orient_deg,
float *orient_conf,
const char **script_name,
float *script_conf);
#endif // #ifndef DISABLED_LEGACY_ENGINE
TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
double margin);
TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
/* Note the name: "TessBase", not "TessBaseAPI" like its neighbours; it is
 * part of the exported symbol and must stay as it is. Results are written
 * through the two out-parameters; this function uses bool, not BOOL.
 * NOTE(review): how the two returned arrays are sized and released is not
 * visible in this header - confirm. */
TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
int **block_orientation,
bool **vertical_writing);
/* Page iterator */
/* C wrappers around TessPageIterator. `level` selects the unit of the page
 * hierarchy (block, paragraph, text line, word or symbol) the call refers to.
 * NOTE(review): a copy made with TessPageIteratorCopy() is presumably
 * released with TessPageIteratorDelete() - confirm. */
TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
TessPageIteratorLevel level);
TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
TessPageIteratorLevel level);
TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
TessPageIteratorLevel level,
TessPageIteratorLevel element);
/* The box is written through left/top/right/bottom. */
TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
TessPageIteratorLevel level,
int *left, int *top, int *right,
int *bottom);
TESS_API TessPolyBlockType
TessPageIteratorBlockType(const TessPageIterator *handle);
TESS_API struct Pix *TessPageIteratorGetBinaryImage(
const TessPageIterator *handle, TessPageIteratorLevel level);
TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
TessPageIteratorLevel level,
int padding,
struct Pix *original_image,
int *left, int *top);
/* The baseline is written as two points (x1, y1) and (x2, y2). */
TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
TessPageIteratorLevel level, int *x1,
int *y1, int *x2, int *y2);
/* Both functions below report everything through their out-parameters. */
TESS_API void TessPageIteratorOrientation(
TessPageIterator *handle, TessOrientation *orientation,
TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
float *deskew_angle);
TESS_API void TessPageIteratorParagraphInfo(
TessPageIterator *handle, TessParagraphJustification *justification,
BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
/* Result iterator */
/* C wrappers around TessResultIterator.
 * NOTE(review): iterators obtained from Copy / GetChoiceIterator are
 * presumably released with the matching ...Delete() function - confirm. */
TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
TESS_API TessResultIterator *TessResultIteratorCopy(
const TessResultIterator *handle);
/* Access to the same iterator through the page-iterator interface, in a
 * mutable and a const flavour. */
TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
TessResultIterator *handle);
TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
const TessResultIterator *handle);
TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
const TessResultIterator *handle);
TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
TessPageIteratorLevel level);
/* NOTE(review): the char * result is presumably allocated for the caller and
 * released with TessDeleteText() - confirm. */
TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
TessPageIteratorLevel level);
TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
TessPageIteratorLevel level);
TESS_API const char *TessResultIteratorWordRecognitionLanguage(
const TessResultIterator *handle);
/* The font attributes are written through the BOOL / int out-parameters;
 * the return value is a const string. */
TESS_API const char *TessResultIteratorWordFontAttributes(
const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
int *pointsize, int *font_id);
/* Word-level and symbol-level predicates. */
TESS_API BOOL
TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
TESS_API BOOL
TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
TESS_API BOOL
TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
TESS_API BOOL
TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
/* Choice iterator */
/* C wrappers around TessChoiceIterator, which is obtained from
 * TessResultIteratorGetChoiceIterator(). */
TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
TESS_API const char *TessChoiceIteratorGetUTF8Text(
const TessChoiceIterator *handle);
TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
/* Progress monitor */
/* Creates an ETEXT_DESC progress monitor; the matching release function is
 * TessMonitorDelete(). The monitor is what TessBaseAPIRecognize() accepts.
 * The parameter list is spelled (void) so that C compilers (before C23) see a
 * real prototype instead of an unprototyped "()" declaration. */
TESS_API ETEXT_DESC *TessMonitorCreate(void);
TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
/* Cancel callback and the user pointer associated with it. */
TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
                                       TessCancelFunc cancelFunc);
TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
/* Progress callback and progress query. */
TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
                                         TessProgressFunc progressFunc);
TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
/* Sets a deadline, given in milliseconds. */
TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
#ifdef __cplusplus
}
#endif
#endif // API_CAPI_H_

View File

@ -0,0 +1,39 @@
///////////////////////////////////////////////////////////////////////
// File: export.h
// Description: Place holder
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_PLATFORM_H_
#define TESSERACT_PLATFORM_H_
/* TESS_API is the decoration placed on every public class and function.
 * It is only derived here when nothing has defined it beforehand:
 *   - Windows / Cygwin: __declspec(dllexport) when TESS_EXPORTS is defined,
 *     __declspec(dllimport) when TESS_IMPORTS is defined, otherwise empty.
 *   - everywhere else: default symbol visibility when either TESS_EXPORTS or
 *     TESS_IMPORTS is defined, otherwise empty. */
#ifndef TESS_API
# if defined(_WIN32) || defined(__CYGWIN__)
# if defined(TESS_EXPORTS)
# define TESS_API __declspec(dllexport)
# elif defined(TESS_IMPORTS)
# define TESS_API __declspec(dllimport)
# else
# define TESS_API
# endif
# else
# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
# define TESS_API __attribute__((visibility("default")))
# else
# define TESS_API
# endif
# endif
#endif
#endif // TESSERACT_PLATFORM_H_

View File

@ -0,0 +1,241 @@
///////////////////////////////////////////////////////////////////////
// File: ltrresultiterator.h
// Description: Iterator for tesseract results in strict left-to-right
// order that avoids using tesseract internal data structures.
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
#include "export.h" // for TESS_API
#include "pageiterator.h" // for PageIterator
#include "publictypes.h" // for PageIteratorLevel
#include "unichar.h" // for StrongScriptDirection
namespace tesseract {
class BLOB_CHOICE_IT;
class PAGE_RES;
class WERD_RES;
class Tesseract;
// Class to iterate over tesseract results, providing access to all levels
// of the page hierarchy, without including any tesseract headers or having
// to handle any tesseract structures.
// WARNING! This class points to data held within the TessBaseAPI class, and
// therefore can only be used while the TessBaseAPI class still exists and
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
// DetectOS, or anything else that changes the internal PAGE_RES.
// See tesseract/publictypes.h for the definition of PageIteratorLevel.
// See also base class PageIterator, which contains the bulk of the interface.
// LTRResultIterator adds text-specific methods for access to OCR output.
class TESS_API LTRResultIterator : public PageIterator {
// ChoiceIterator is constructed from an LTRResultIterator and is granted
// access to its non-public state.
friend class ChoiceIterator;
public:
// page_res and tesseract come directly from the BaseAPI.
// The rectangle parameters are copied indirectly from the Thresholder,
// via the BaseAPI. They represent the coordinates of some rectangle in an
// original image (in top-left-origin coordinates) and therefore the top-left
// needs to be added to any output boxes in order to specify coordinates
// in the original image. See TessBaseAPI::SetRectangle.
// The scale and scaled_yres are in case the Thresholder scaled the image
// rectangle prior to thresholding. Any coordinates in tesseract's image
// must be divided by scale before adding (rect_left, rect_top).
// The scaled_yres indicates the effective resolution of the binary image
// that tesseract has been given by the Thresholder.
// After the constructor, Begin has already been called.
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
int scaled_yres, int rect_left, int rect_top,
int rect_width, int rect_height);
~LTRResultIterator() override;
// LTRResultIterators may be copied! This makes it possible to iterate over
// all the objects at a lower level, while maintaining an iterator to
// objects at a higher level. These constructors DO NOT CALL Begin, so
// iterations will continue from the location of src.
// TODO: For now the copy constructor and operator= only need the base class
// versions, but if new data members are added, don't forget to add them!
// ============= Moving around within the page ============.
// See PageIterator.
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level. Use delete [] to free after use.
char *GetUTF8Text(PageIteratorLevel level) const;
// Set the string inserted at the end of each text line. "\n" by default.
void SetLineSeparator(const char *new_line);
// Set the string inserted at the end of each paragraph. "\n" by default.
void SetParagraphSeparator(const char *new_para);
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
float Confidence(PageIteratorLevel level) const;
// Returns the attributes of the current row.
// The three values are written through the out-parameters.
void RowAttributes(float *row_height, float *descenders,
float *ascenders) const;
// ============= Functions that refer to words only ============.
// Returns the font attributes of the current word. If iterating at a higher
// level object than words, eg textlines, then this will return the
// attributes of the first word in that textline.
// The actual return value is a string representing a font name. It points
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
// the iterator itself, ie rendered invalid by various members of
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
// Pointsize is returned in printers points (1/72 inch.)
const char *WordFontAttributes(bool *is_bold, bool *is_italic,
bool *is_underlined, bool *is_monospace,
bool *is_serif, bool *is_smallcaps,
int *pointsize, int *font_id) const;
// Return the name of the language used to recognize this word.
// On error, nullptr. Do not delete this pointer.
const char *WordRecognitionLanguage() const;
// Return the overall directionality of this word.
StrongScriptDirection WordDirection() const;
// Returns true if the current word was found in a dictionary.
bool WordIsFromDictionary() const;
// Returns the number of blanks before the current word.
int BlanksBeforeWord() const;
// Returns true if the current word is numeric.
bool WordIsNumeric() const;
// Returns true if the word contains blamer information.
bool HasBlamerInfo() const;
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
// of the current word.
const void *GetParamsTrainingBundle() const;
// Returns a pointer to the string with blamer information for this word.
// Assumes that the word's blamer_bundle is not nullptr.
const char *GetBlamerDebug() const;
// Returns a pointer to the string with misadaption information for this word.
// Assumes that the word's blamer_bundle is not nullptr.
const char *GetBlamerMisadaptionDebug() const;
// Returns true if a truth string was recorded for the current word.
bool HasTruthString() const;
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool EquivalentToTruth(const char *str) const;
// Returns a null terminated UTF-8 encoded truth string for the current word.
// Use delete [] to free after use.
char *WordTruthUTF8Text() const;
// Returns a null terminated UTF-8 encoded normalized OCR string for the
// current word. Use delete [] to free after use.
char *WordNormedUTF8Text() const;
// Returns a pointer to serialized choice lattice.
// Fills lattice_size with the number of bytes in lattice data.
const char *WordLattice(int *lattice_size) const;
// ============= Functions that refer to symbols only ============.
// Returns true if the current symbol is a superscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsSuperscript() const;
// Returns true if the current symbol is a subscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsSubscript() const;
// Returns true if the current symbol is a dropcap.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsDropcap() const;
protected:
// Separator strings, see SetLineSeparator and SetParagraphSeparator.
// NOTE(review): these are raw, non-owning-looking pointers; the strings
// passed to the setters presumably have to outlive the iterator - confirm
// against the implementation.
const char *line_separator_;
const char *paragraph_separator_;
};
// Class to iterate over the classifier choices for a single RIL_SYMBOL.
class TESS_API ChoiceIterator {
public:
// Construction is from a LTRResultIterator that points to the symbol of
// interest. The ChoiceIterator allows a one-shot iteration over the
// choices for this symbol and after that it is useless.
explicit ChoiceIterator(const LTRResultIterator &result_it);
~ChoiceIterator();
// Moves to the next choice for the symbol and returns false if there
// are none left.
bool Next();
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// choice.
// NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
// internal structure and should NOT be delete[]ed to free after use.
const char *GetUTF8Text() const;
// Returns the confidence of the current choice depending on the used language
// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
// choices for one symbol should roughly add up to 1.0f.
// If only traineddata of the legacy engine is used, the number should be
// interpreted as a percent probability. (0.0f-100.0f) In this case
// probabilities won't add up to 100. Each one stands on its own.
float Confidence() const;
// Returns a vector containing all timesteps, which belong to the currently
// selected symbol. A timestep is a vector containing pairs of symbols and
// floating point numbers. The number states the probability for the
// corresponding symbol.
std::vector<std::vector<std::pair<const char *, float>>> *Timesteps() const;
private:
// Clears the remaining spaces out of the results and adapts the
// probabilities.
void filterSpaces();
// Pointer to the WERD_RES object owned by the API.
WERD_RES *word_res_;
// Iterator over the blob choices.
// NOTE(review): the class declares a destructor but no copy operations;
// if this pointer is owned, copying a ChoiceIterator would be unsafe -
// confirm against the implementation.
BLOB_CHOICE_IT *choice_it_;
// Symbol/probability pairs and the current position within them; the only
// member given a default (nullptr) in this header.
// NOTE(review): presumably only used when LSTM data is present (see
// oemLSTM_) - confirm.
std::vector<std::pair<const char *, float>> *LSTM_choices_ = nullptr;
std::vector<std::pair<const char *, float>>::iterator LSTM_choice_it_;
// NOTE(review): presumably an index into the timesteps returned by
// Timesteps() - confirm.
const int *tstep_index_;
// regulates the rating granularity
double rating_coefficient_;
// leading blanks
int blanks_before_word_;
// true when there is lstm engine related trained data
bool oemLSTM_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_

View File

@ -0,0 +1,157 @@
/**********************************************************************
* File: ocrclass.h
* Description: Class definitions and constants for the OCR API.
* Author: Hewlett-Packard Co
*
* (C) Copyright 1996, Hewlett-Packard Co.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
/**********************************************************************
* This file contains typedefs for all the structures used by
* the HP OCR interface.
* The structures are designed to allow them to be used with any
* structure alignment up to 8.
**********************************************************************/
#ifndef CCUTIL_OCRCLASS_H_
#define CCUTIL_OCRCLASS_H_
#include <chrono>
#include <cstdint> // for int8_t, int16_t, int32_t, uint8_t, uint16_t
#include <ctime>
namespace tesseract {
/**********************************************************************
* EANYCODE_CHAR
* Description of a single character. The character code is defined by
* the character set of the current font.
* Output text is sent as an array of these structures.
* Spaces and line endings in the output are represented in the
* structures of the surrounding characters. They are not directly
* represented as characters.
* The first character in a word has a positive value of blanks.
* Missing information should be set to the defaults in the comments.
* If word bounds are known, but not character bounds, then the top and
* bottom of each character should be those of the word. The left of the
* first and right of the last char in each word should be set. All other
* lefts and rights should be set to -1.
* If set, the values of right and bottom are left+width and top+height.
* Most of the members come directly from the parameters to ocr_append_char.
* The formatting member uses the enhancement parameter and combines the
* line direction stuff into the top 3 bits.
* The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
* 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
* the coding is, only that it is backwards compatible with the previous
* version.
**********************************************************************/
struct EANYCODE_CHAR { /* one recognized character */
  // From version 2.0 onwards char_code carries UTF-8. An ASCII character
  // fits in a single structure; any other character is delivered as two or
  // more consecutive structures, each holding one byte of its UTF-8 code and
  // all of them sharing the same bounding box. Programs that want to handle
  // languages with different character sets must deal with such extended
  // characters appropriately, and *all* code has to be prepared to receive
  // UTF-8 coded characters for things like bullets and fancy quotes.
  // The value in parentheses after each member is its default.
  uint16_t char_code; /* the character itself */
  int16_t left;       /* left edge of the char (-1) */
  int16_t right;      /* right edge of the char (-1) */
  int16_t top;        /* top edge of the char (-1) */
  int16_t bottom;     /* bottom edge of the char (-1) */
  int16_t font_index; /* which font (0) */
  uint8_t confidence; /* 0=perfect, 100=reject (0/100) */
  uint8_t point_size; /* size of the char, 72 = 1 inch (10) */
  int8_t blanks;      /* number of spaces before this char (1) */
  uint8_t formatting; /* char formatting (0) */
};

/**********************************************************************
 * ETEXT_DESC
 * Header describing the output of the OCR engine.
 * The same structure serves as progress monitor and as final output
 * header, because it has to remain a valid progress monitor while the
 * engine is still writing its output to shared memory.
 * While recognition is in progress all buffer information is -1.
 * Progress runs from 0 up to 100 during OCR; nothing else is guaranteed.
 * The progress callback additionally receives the bounding box of the
 * word that is being processed at that moment.
 * On every progress callback the OCR engine must set ocr_alive to 1.
 * The HP side sets ocr_alive back to 0; if it repeatedly fails to return
 * to 1 the OCR engine is considered dead.
 * A non-null cancel function is called with the number of user words
 * found; when it returns true the operation is cancelled.
 **********************************************************************/
class ETEXT_DESC;
using CANCEL_FUNC = bool (*)(void *, int);
using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);

class ETEXT_DESC { // output header
public:
  int16_t count{0}; ///< chars in this buffer(0)
  /// Percent complete, increasing (0-100).
  /// The progress monitor covers word recognition and it does not cover
  /// layout analysis.
  /// See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27
  int16_t progress{0};
  int8_t more_to_come{0};               ///< true if not last
  volatile int8_t ocr_alive{0};         ///< ocr sets to 1, HP 0
  int8_t err_code{0};                   ///< for errcode use
  CANCEL_FUNC cancel{nullptr};          ///< returns true to cancel
  PROGRESS_FUNC progress_callback{nullptr}; ///< called whenever progress
                                            ///< increases
  PROGRESS_FUNC2 progress_callback2;    ///< monitor-aware progress callback
  void *cancel_this{nullptr};           ///< this or other data for cancel
  /// Time to stop. Expected to be set only by a call to
  /// set_deadline_msecs(). A value equal to the clock's epoch means that
  /// no deadline has been set.
  std::chrono::steady_clock::time_point end_time;
  EANYCODE_CHAR text[1]{}; ///< character data

  // Starts without a deadline and routes progress_callback2 to the adapter
  // that forwards to the older progress_callback.
  ETEXT_DESC() : progress_callback2(&default_progress_func), end_time{} {}

  // Sets the end time to be deadline_msecs milliseconds from now.
  // Zero and negative values are ignored: end_time keeps its current value.
  void set_deadline_msecs(int32_t deadline_msecs) {
    if (deadline_msecs <= 0) {
      return;
    }
    const auto timeout = std::chrono::milliseconds(deadline_msecs);
    end_time = std::chrono::steady_clock::now() + timeout;
  }

  // Returns false if we've not passed the end_time, or have not set a
  // deadline.
  bool deadline_exceeded() const {
    const auto unset = std::chrono::steady_clock::duration::zero();
    const bool has_deadline = end_time.time_since_epoch() != unset;
    // The clock is only read when a deadline exists.
    return has_deadline && std::chrono::steady_clock::now() > end_time;
  }

private:
  // Default for progress_callback2: forwards to progress_callback, passing
  // the monitor's current progress value in front of the box coordinates.
  // Returns true when no progress_callback is installed.
  static bool default_progress_func(ETEXT_DESC *monitor, int left, int right,
                                    int top, int bottom) {
    const PROGRESS_FUNC legacy = monitor->progress_callback;
    if (legacy == nullptr) {
      return true;
    }
    return legacy(monitor->progress, left, right, top, bottom);
  }
};
} // namespace tesseract
#endif // CCUTIL_OCRCLASS_H_

View File

@ -0,0 +1,141 @@
///////////////////////////////////////////////////////////////////////
// File: osdetect.h
// Description: Orientation and script detection.
// Author: Samuel Charron
// Ranjith Unnikrishnan
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_OSDETECT_H_
#define TESSERACT_CCMAIN_OSDETECT_H_
#include "export.h" // for TESS_API
#include <vector> // for std::vector
namespace tesseract {
class BLOBNBOX;
class BLOBNBOX_CLIST;
class BLOB_CHOICE_LIST;
class TO_BLOCK_LIST;
class UNICHARSET;
class Tesseract;
// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur,
// i.e. 116 + 1 + 2 + 1. Used as the second dimension of
// OSResults::scripts_na.
const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
// Best orientation/script guess together with a confidence for each.
// Every field starts out as zero.
struct OSBestResult {
  OSBestResult() {}
  int orientation_id = 0;
  int script_id = 0;
  float sconfidence = 0.0f; // confidence attached to script_id
  float oconfidence = 0.0f; // confidence attached to orientation_id
};
// Accumulated orientation and script scores plus the current best guess.
struct OSResults {
  // Starts with no unicharset and every orientation/script score at zero.
  OSResults() : unicharset(nullptr) {
    for (float &orientation_score : orientations) {
      orientation_score = 0;
    }
    for (auto &scores_for_orientation : scripts_na) {
      for (float &script_score : scores_for_orientation) {
        script_score = 0;
      }
    }
  }
  // NOTE(review): undocumented in this header; presumably refreshes the
  // orientation part of best_result from orientations[] - confirm in the
  // implementation.
  void update_best_orientation();
  // Set the estimate of the orientation to the given id.
  void set_best_orientation(int orientation_id);
  // Update/Compute the best estimate of the script assuming the given
  // orientation id.
  void update_best_script(int orientation_id);
  // Return the index of the script with the highest score for this orientation.
  TESS_API int get_best_script(int orientation_id) const;
  // Accumulate scores with given OSResults instance and update the best script.
  void accumulate(const OSResults &osr);
  // Print statistics.
  void print_scores(void) const;
  void print_scores(int orientation_id) const;

  // Array holding scores for each orientation id [0,3].
  // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
  // page respectively, where the values refer to the amount of clockwise
  // rotation to be applied to the page for the text to be upright and readable.
  float orientations[4];
  // Script confidence scores for each of 4 possible orientations.
  float scripts_na[4][kMaxNumberOfScripts];

  // Not initialised to anything but nullptr here; ownership is not visible
  // from this header.
  UNICHARSET *unicharset;
  OSBestResult best_result;
};
// Orientation half of orientation-and-script detection. Only the interface is
// declared here; the behaviour lives in the implementation file.
class OrientationDetector {
public:
// Both pointers are stored as raw pointers (see osr_ / allowed_scripts_).
// NOTE(review): presumably neither is owned by the detector - confirm.
OrientationDetector(const std::vector<int> *allowed_scripts,
OSResults *results);
// Feeds the classifier choices for one blob to the detector.
// NOTE(review): the meaning of the bool result is not documented here.
bool detect_blob(BLOB_CHOICE_LIST *scores);
// NOTE(review): presumably returns an orientation id in [0,3] as described on
// OSResults::orientations - confirm.
int get_orientation();
private:
OSResults *osr_;
const std::vector<int> *allowed_scripts_;
};
// Script half of orientation-and-script detection. Only the interface is
// declared here; the behaviour lives in the implementation file.
class ScriptDetector {
public:
// All three pointers are kept as raw pointers in the members below.
// NOTE(review): presumably none of them is owned by the detector - confirm.
ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
tesseract::Tesseract *tess);
// Feeds the classifier choices for one blob to the detector.
void detect_blob(BLOB_CHOICE_LIST *scores);
// NOTE(review): presumably reports whether enough evidence has been gathered
// for the given orientation to stop early - confirm in the implementation.
bool must_stop(int orientation) const;
private:
OSResults *osr_;
// Script-name strings shared by all instances (defined out of line).
static const char *korean_script_;
static const char *japanese_script_;
static const char *fraktur_script_;
// Per-instance script ids. NOTE(review): presumably resolved from the script
// names at construction time - confirm.
int korean_id_;
int japanese_id_;
int katakana_id_;
int hiragana_id_;
int han_id_;
int hangul_id_;
int latin_id_;
int fraktur_id_;
tesseract::Tesseract *tess_;
const std::vector<int> *allowed_scripts_;
};
// Free-function entry points for orientation and script detection. Only the
// declarations are visible in this header.
// NOTE(review): the meaning of the int/bool return values is not documented
// here - confirm in the implementation before relying on it.

// Variant that takes an input file name.
int orientation_and_script_detection(const char *filename, OSResults *,
tesseract::Tesseract *);
// Variant that takes a list of blocks.
int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
tesseract::Tesseract *tess);
// Variant that takes a list of blobs and an optional-looking allowed-script
// filter. NOTE(review): whether allowed_scripts may be null is not shown here.
int os_detect_blobs(const std::vector<int> *allowed_scripts,
BLOBNBOX_CLIST *blob_list, OSResults *osr,
tesseract::Tesseract *tess);
// Single-blob variant that is handed both detectors explicitly.
bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
OSResults *, tesseract::Tesseract *tess);
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must be
// applied for the text to be upright (readable).
TESS_API int OrientationIdToValue(const int &id);
} // namespace tesseract
#endif // TESSERACT_CCMAIN_OSDETECT_H_

View File

@ -0,0 +1,362 @@
///////////////////////////////////////////////////////////////////////
// File: pageiterator.h
// Description: Iterator for tesseract page structure that avoids using
// tesseract internal data structures.
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
#define TESSERACT_CCMAIN_PAGEITERATOR_H_
#include "export.h"
#include "publictypes.h"
struct Pix;
struct Pta;
namespace tesseract {
struct BlamerBundle;
class C_BLOB_IT;
class PAGE_RES;
class PAGE_RES_IT;
class WERD;
class Tesseract;
/**
* Class to iterate over tesseract page structure, providing access to all
* levels of the page hierarchy, without including any tesseract headers or
* having to handle any tesseract structures.
* WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
* DetectOS, or anything else that changes the internal PAGE_RES.
* See tesseract/publictypes.h for the definition of PageIteratorLevel.
* See also ResultIterator, derived from PageIterator, which adds in the
* ability to access OCR output with text-specific methods.
*/
class TESS_API PageIterator {
public:
/**
* page_res and tesseract come directly from the BaseAPI.
* The rectangle parameters are copied indirectly from the Thresholder,
* via the BaseAPI. They represent the coordinates of some rectangle in an
* original image (in top-left-origin coordinates) and therefore the top-left
* needs to be added to any output boxes in order to specify coordinates
* in the original image. See TessBaseAPI::SetRectangle.
* The scale and scaled_yres are in case the Thresholder scaled the image
* rectangle prior to thresholding. Any coordinates in tesseract's image
* must be divided by scale before adding (rect_left, rect_top).
* The scaled_yres indicates the effective resolution of the binary image
* that tesseract has been given by the Thresholder.
* After the constructor, Begin has already been called.
*/
PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
int scaled_yres, int rect_left, int rect_top, int rect_width,
int rect_height);
/**
* Virtual because this class is a base class (ResultIterator derives from it)
* and declares virtual methods.
*/
virtual ~PageIterator();
/**
* Page/ResultIterators may be copied! This makes it possible to iterate over
* all the objects at a lower level, while maintaining an iterator to
* objects at a higher level. These constructors DO NOT CALL Begin, so
* iterations will continue from the location of src.
*/
PageIterator(const PageIterator &src);
/** Copy assignment; note that it returns a const reference. */
const PageIterator &operator=(const PageIterator &src);
/** Are we positioned at the same location as other? */
bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
// ============= Moving around within the page ============.
/**
* Moves the iterator to point to the start of the page to begin an
* iteration.
*/
virtual void Begin();
/**
* Moves the iterator to the beginning of the paragraph.
* This class implements this functionality by moving it to the zero indexed
* blob of the first (leftmost) word on the first row of the paragraph.
*/
virtual void RestartParagraph();
/**
* Return whether this iterator points anywhere in the first textline of a
* paragraph.
*/
bool IsWithinFirstTextlineOfParagraph() const;
/**
* Moves the iterator to the beginning of the text line.
* This class implements this functionality by moving it to the zero indexed
* blob of the first (leftmost) word of the row.
*/
virtual void RestartRow();
/**
* Moves to the start of the next object at the given level in the
* page hierarchy, and returns false if the end of the page was reached.
* NOTE that RIL_SYMBOL will skip non-text blocks, but all other
* PageIteratorLevel level values will visit each non-text block once.
* Think of non text blocks as containing a single para, with a single line,
* with a single imaginary word.
* Calls to Next with different levels may be freely intermixed.
* This function iterates words in right-to-left scripts correctly, if
* the appropriate language has been loaded into Tesseract.
*/
virtual bool Next(PageIteratorLevel level);
/**
* Returns true if the iterator is at the start of an object at the given
* level.
*
* For instance, suppose an iterator it is pointed to the first symbol of the
* first word of the third line of the second paragraph of the first block in
* a page, then:
* it.IsAtBeginningOf(RIL_BLOCK) = false
* it.IsAtBeginningOf(RIL_PARA) = false
* it.IsAtBeginningOf(RIL_TEXTLINE) = true
* it.IsAtBeginningOf(RIL_WORD) = true
* it.IsAtBeginningOf(RIL_SYMBOL) = true
*/
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
/**
* Returns whether the iterator is positioned at the last element in a
* given level. (e.g. the last word in a line, the last line in a block)
*
* Here's some two-paragraph example
* text. It starts off innocuously
* enough but quickly turns bizarre.
* The author inserts a cornucopia
* of words to guard against confused
* references.
*
* Now take an iterator it pointed to the start of "bizarre."
* it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
* it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
* it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
*/
virtual bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const;
/**
* Returns whether this iterator is positioned
* before other: -1
* equal to other: 0
* after other: 1
*/
int Cmp(const PageIterator &other) const;
// ============= Accessing data ==============.
// Coordinate system:
// Integer coordinates are at the cracks between the pixels.
// The top-left corner of the top-left pixel in the image is at (0,0).
// The bottom-right corner of the bottom-right pixel in the image is at
// (width, height).
// Every bounding box goes from the top-left of the top-left contained
// pixel to the bottom-right of the bottom-right contained pixel, so
// the bounding box of the single top-left pixel in the image is:
// (0,0)->(1,1).
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.
/**
* Controls what to include in a bounding box. Bounding boxes of all levels
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
* Between layout analysis and recognition, it isn't known where all
* diacritics belong, so this control is used to include or exclude some
* diacritics that are above or below the main body of the word. In most cases
* where the placement is obvious, and after recognition, it doesn't make as
* much difference, as the diacritics will already be included in the word.
*/
void SetBoundingBoxComponents(bool include_upper_dots,
bool include_lower_dots) {
// Just records the two flags; nothing is recomputed here.
include_upper_dots_ = include_upper_dots;
include_lower_dots_ = include_lower_dots;
}
/**
* Returns the bounding rectangle of the current object at the given level.
* See comment on coordinate system above.
* Returns false if there is no such object at the current position.
* The returned bounding box is guaranteed to match the size and position
* of the image returned by GetBinaryImage, but may clip foreground pixels
* from a grey image. The padding argument to GetImage can be used to expand
* the image to include more foreground pixels. See GetImage below.
*/
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
int *bottom) const;
/**
* Overload of BoundingBox that additionally takes a padding value.
* NOTE(review): presumably the box is grown by padding on every side -
* confirm in the implementation.
*/
bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
int *right, int *bottom) const;
/**
* Returns the bounding rectangle of the object in a coordinate system of the
* working image rectangle having its origin at (rect_left_, rect_top_) with
* respect to the original image and is scaled by a factor scale_.
*/
bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
int *right, int *bottom) const;
/** Returns whether there is no object of a given level. */
bool Empty(PageIteratorLevel level) const;
/**
* Returns the type of the current block.
* See tesseract/publictypes.h for PolyBlockType.
*/
PolyBlockType BlockType() const;
/**
* Returns the polygon outline of the current block. The returned Pta must
* be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
* of the polygon, and the last edge is the line segment between the last
* point and the first point. nullptr will be returned if the iterator is
* at the end of the document or layout analysis was not used.
*/
Pta *BlockPolygon() const;
/**
* Returns a binary image of the current object at the given level.
* The position and size match the return from BoundingBoxInternal, and so
* this could be upscaled with respect to the original input image.
* Use pixDestroy to delete the image after use.
*/
Pix *GetBinaryImage(PageIteratorLevel level) const;
/**
* Returns an image of the current object at the given level in greyscale
* if available in the input. To guarantee a binary image use GetBinaryImage.
* NOTE that in order to give the best possible image, the bounds are
* expanded slightly over the binary connected component, by the supplied
* padding, so the top-left position of the returned image is returned
* in (left,top). These will most likely not match the coordinates
* returned by BoundingBox.
* If you do not supply an original image, you will get a binary one.
* Use pixDestroy to delete the image after use.
*/
Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
int *left, int *top) const;
/**
* Returns the baseline of the current object at the given level.
* The baseline is the line that passes through (x1, y1) and (x2, y2).
* WARNING: with vertical text, baselines may be vertical!
* Returns false if there is no baseline at the current position.
*/
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
int *y2) const;
/**
* Returns orientation for the block the iterator points to.
* orientation, writing_direction, textline_order: see publictypes.h
* deskew_angle: after rotating the block so the text orientation is
* upright, how many radians does one have to rotate the
* block anti-clockwise for it to be level?
* -Pi/4 <= deskew_angle <= Pi/4
*/
void Orientation(tesseract::Orientation *orientation,
tesseract::WritingDirection *writing_direction,
tesseract::TextlineOrder *textline_order,
float *deskew_angle) const;
/**
* Returns information about the current paragraph, if available.
*
* justification -
* LEFT if ragged right, or fully justified and script is left-to-right.
* RIGHT if ragged left, or fully justified and script is right-to-left.
* unknown if it looks like source code or we have very few lines.
* is_list_item -
* true if we believe this is a member of an ordered or unordered list.
* is_crown -
* true if the first line of the paragraph is aligned with the other
* lines of the paragraph even though subsequent paragraphs have first
* line indents. This typically indicates that this is the continuation
* of a previous paragraph or that it is the very first paragraph in
* the chapter.
* first_line_indent -
* For LEFT aligned paragraphs, the first text line of paragraphs of
* this kind are indented this many pixels from the left edge of the
* rest of the paragraph.
* for RIGHT aligned paragraphs, the first text line of paragraphs of
* this kind are indented this many pixels from the right edge of the
* rest of the paragraph.
* NOTE 1: This value may be negative.
* NOTE 2: if *is_crown == true, the first line of this paragraph is
* actually flush, and first_line_indent is set to the "common"
* first_line_indent for subsequent paragraphs in this block
* of text.
*/
void ParagraphInfo(tesseract::ParagraphJustification *justification,
bool *is_list_item, bool *is_crown,
int *first_line_indent) const;
// If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
// of the current word to the given pointer (takes ownership of the pointer)
// and returns true.
// Can only be used when iterating on the word level.
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
protected:
/**
* Sets up the internal data for iterating the blobs of a new word, then
* moves the iterator to the given offset.
*/
void BeginWord(int offset);
/** Pointer to the page_res owned by the API. */
PAGE_RES *page_res_;
/** Pointer to the Tesseract object owned by the API. */
Tesseract *tesseract_;
/**
* The iterator to the page_res_. Owned by this ResultIterator.
* A pointer just to avoid dragging in Tesseract includes.
*/
PAGE_RES_IT *it_;
/**
* The current input WERD being iterated. If there is an output from OCR,
* then word_ is nullptr. Owned by the API
*/
WERD *word_;
/** The length of the current word_. */
int word_length_;
/** The current blob index within the word. */
int blob_index_;
/**
* Iterator to the blobs within the word. If nullptr, then we are iterating
* OCR results in the box_word.
* Owned by this ResultIterator.
*/
C_BLOB_IT *cblob_it_;
/**
* Control over what to include in bounding boxes.
* Written by SetBoundingBoxComponents.
*/
bool include_upper_dots_;
bool include_lower_dots_;
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
int scale_;
int scaled_yres_;
int rect_left_;
int rect_top_;
int rect_width_;
int rect_height_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_

View File

@ -0,0 +1,283 @@
///////////////////////////////////////////////////////////////////////
// File: publictypes.h
// Description: Types used in both the API and internally
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
namespace tesseract {
// This file contains types that are used both by the API and internally
// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
// Restated: It is OK for low-level Tesseract files to include publictypes.h,
// but not for the low-level tesseract code to include top-level API code.
// This file should not use other Tesseract types, as that would drag
// their includes into the API-level.
/** Number of printers' points in an inch. The unit of the pointsize return. */
constexpr int kPointsPerInch = 72;
/**
* Minimum believable resolution. Used as a default if there is no other
* information, as it is safer to under-estimate than over-estimate.
* NOTE(review): the unit is presumably pixels per inch - confirm.
*/
constexpr int kMinCredibleResolution = 70;
/** Maximum believable resolution (same unit as kMinCredibleResolution). */
constexpr int kMaxCredibleResolution = 2400;
/**
* Ratio between median blob size and likely resolution. Used to estimate
* resolution when none is provided. This is basically 1/usual text size in
* inches, so the value 10 corresponds to text about 1/10 inch in size.
*/
constexpr int kResolutionEstimationFactor = 10;
/**
 * Region types that a POLY_BLOCK or ColPartition can take.
 * Keep this list in sync with kPBColors in polyblk.cpp, with the PTIs*Type
 * predicates below, and with kPolyBlockNames in layout_test.cc.
 * Used extensively by ColPartition, and POLY_BLOCK.
 */
enum PolyBlockType {
  PT_UNKNOWN,         // Type is not yet known. Keep as the first element.
  PT_FLOWING_TEXT,    // Text that lives inside a column.
  PT_HEADING_TEXT,    // Text that spans more than one column.
  PT_PULLOUT_TEXT,    // Text that is in a cross-column pull-out region.
  PT_EQUATION,        // Partition belonging to an equation region.
  PT_INLINE_EQUATION, // Partition has inline equation.
  PT_TABLE,           // Partition belonging to a table region.
  PT_VERTICAL_TEXT,   // Text-line runs vertically.
  PT_CAPTION_TEXT,    // Text that belongs to an image.
  PT_FLOWING_IMAGE,   // Image that lives inside a column.
  PT_HEADING_IMAGE,   // Image that spans more than one column.
  PT_PULLOUT_IMAGE,   // Image that is in a cross-column pull-out region.
  PT_HORZ_LINE,       // Horizontal Line.
  PT_VERT_LINE,       // Vertical Line.
  PT_NOISE,           // Lies outside of any column.
  PT_COUNT            // Number of types listed above; keep last.
};

/** Returns true if type is a horizontal or a vertical line. */
inline bool PTIsLineType(PolyBlockType type) {
  switch (type) {
    case PT_HORZ_LINE:
    case PT_VERT_LINE:
      return true;
    default:
      return false;
  }
}

/** Returns true if type is one of the three image types. */
inline bool PTIsImageType(PolyBlockType type) {
  switch (type) {
    case PT_FLOWING_IMAGE:
    case PT_HEADING_IMAGE:
    case PT_PULLOUT_IMAGE:
      return true;
    default:
      return false;
  }
}

/**
 * Returns true if type counts as text. Tables and inline equations are
 * treated as text here; PT_EQUATION is not.
 */
inline bool PTIsTextType(PolyBlockType type) {
  switch (type) {
    case PT_FLOWING_TEXT:
    case PT_HEADING_TEXT:
    case PT_PULLOUT_TEXT:
    case PT_TABLE:
    case PT_VERTICAL_TEXT:
    case PT_CAPTION_TEXT:
    case PT_INLINE_EQUATION:
      return true;
    default:
      return false;
  }
}

/** Returns true if type is a pullout (inter-column) image or text. */
inline bool PTIsPulloutType(PolyBlockType type) {
  switch (type) {
    case PT_PULLOUT_IMAGE:
    case PT_PULLOUT_TEXT:
      return true;
    default:
      return false;
  }
}
/**
* +------------------+ Orientation Example:
* | 1 Aaaa Aaaa Aaaa | ====================
* | Aaa aa aaa aa | To left is a diagram of some (1) English and
* | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
* | 2 |
* | ####### c c C | Upright Latin characters are represented as A and a.
* | ####### c c c | '<' represents a latin character rotated
* | < ####### c c c | anti-clockwise 90 degrees.
* | < ####### c c |
* | < ####### . c | Upright Chinese characters are represented C and c.
* | 3 ####### c |
* +------------------+ NOTA BENE: enum values here should match goodoc.proto
* If you orient your head so that "up" aligns with Orientation,
* then the characters will appear "right side up" and readable.
*
* In the example above, both the English and Chinese paragraphs are oriented
* so their "up" is the top of the page (page up). The photo credit is read
* with one's head turned leftward ("up" is to page left).
*
* The values of this enum match the convention of Tesseract's osdetect.h
*/
// Which way is "up" for the text. The numeric values are written out because
// they are meant to match other definitions (goodoc.proto, osdetect.h).
enum Orientation {
ORIENTATION_PAGE_UP = 0, // "up" is the top of the page.
ORIENTATION_PAGE_RIGHT = 1, // "up" is to page right.
ORIENTATION_PAGE_DOWN = 2, // "up" is the bottom of the page.
ORIENTATION_PAGE_LEFT = 3, // "up" is to page left.
};
/**
* The grapheme clusters within a line of text are laid out logically
* in this direction, judged when looking at the text line rotated so that
* its Orientation is "page up".
*
* For English text, the writing direction is left-to-right. For the
* Chinese text in the above example, the writing direction is top-to-bottom.
*/
// Direction in which grapheme clusters follow each other within one line,
// judged with the line rotated so that its Orientation is "page up".
enum WritingDirection {
WRITING_DIRECTION_LEFT_TO_RIGHT = 0, // e.g. English text.
WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
WRITING_DIRECTION_TOP_TO_BOTTOM = 2, // e.g. vertically set Chinese text.
};
/**
* The text lines are read in the given sequence.
*
* In English, the order is top-to-bottom.
* In Chinese, vertical text lines are read right-to-left. Mongolian is
* written in vertical columns top to bottom like Chinese, but the lines
* are ordered left-to-right.
*
* Note that only some combinations make sense. For example,
* WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
*/
enum TextlineOrder {
TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, // e.g. Mongolian vertical columns.
TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, // e.g. Chinese vertical text lines.
TEXTLINE_ORDER_TOP_TO_BOTTOM = 2, // e.g. English.
};
/**
 * Possible modes for page layout analysis. These *must* be kept in order
 * of decreasing amount of layout analysis to be done, except for OSD_ONLY,
 * so that the inequality tests in the inline functions below work.
 */
enum PageSegMode {
  /// Orientation and script detection only.
  PSM_OSD_ONLY = 0,
  /// Automatic page segmentation with orientation and script detection. (OSD)
  PSM_AUTO_OSD = 1,
  /// Automatic page segmentation, but no OSD, or OCR.
  PSM_AUTO_ONLY = 2,
  /// Fully automatic page segmentation, but no OSD.
  PSM_AUTO = 3,
  /// Assume a single column of text of variable sizes.
  PSM_SINGLE_COLUMN = 4,
  /// Assume a single uniform block of vertically aligned text.
  PSM_SINGLE_BLOCK_VERT_TEXT = 5,
  /// Assume a single uniform block of text. (Default.)
  PSM_SINGLE_BLOCK = 6,
  /// Treat the image as a single text line.
  PSM_SINGLE_LINE = 7,
  /// Treat the image as a single word.
  PSM_SINGLE_WORD = 8,
  /// Treat the image as a single word in a circle.
  PSM_CIRCLE_WORD = 9,
  /// Treat the image as a single character.
  PSM_SINGLE_CHAR = 10,
  /// Find as much text as possible in no particular order.
  PSM_SPARSE_TEXT = 11,
  /// Sparse text with orientation and script det.
  PSM_SPARSE_TEXT_OSD = 12,
  /// Treat the image as a single text line, bypassing hacks that are
  /// Tesseract-specific.
  PSM_RAW_LINE = 13,
  /// Number of enum entries.
  PSM_COUNT
};

/**
 * Inline functions that act on a PageSegMode to determine whether components of
 * layout analysis are enabled.
 * *Depend critically on the order of elements of PageSegMode.*
 * NOTE that arg is an int for compatibility with INT_PARAM.
 */

/// True for every mode up to and including PSM_AUTO_OSD, and for
/// PSM_SPARSE_TEXT_OSD.
inline bool PSM_OSD_ENABLED(int pageseg_mode) {
  if (pageseg_mode == PSM_SPARSE_TEXT_OSD) {
    return true;
  }
  return pageseg_mode <= PSM_AUTO_OSD;
}

/// True for every mode up to and including PSM_AUTO, and for
/// PSM_SPARSE_TEXT_OSD.
inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
  if (pageseg_mode == PSM_SPARSE_TEXT_OSD) {
    return true;
  }
  return pageseg_mode <= PSM_AUTO;
}

/// True for modes in the closed range [PSM_AUTO_OSD, PSM_AUTO].
inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
  if (pageseg_mode < PSM_AUTO_OSD) {
    return false;
  }
  return pageseg_mode <= PSM_AUTO;
}

/// True for the two sparse-text modes.
inline bool PSM_SPARSE(int pageseg_mode) {
  switch (pageseg_mode) {
    case PSM_SPARSE_TEXT:
    case PSM_SPARSE_TEXT_OSD:
      return true;
    default:
      return false;
  }
}

/// True for modes in the closed range [PSM_AUTO_OSD, PSM_SINGLE_COLUMN].
inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
  if (pageseg_mode < PSM_AUTO_OSD) {
    return false;
  }
  return pageseg_mode <= PSM_SINGLE_COLUMN;
}

/// True for modes in the closed range [PSM_AUTO_OSD, PSM_SINGLE_BLOCK].
inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
  if (pageseg_mode < PSM_AUTO_OSD) {
    return false;
  }
  return pageseg_mode <= PSM_SINGLE_BLOCK;
}

/// True for modes in the closed range [PSM_AUTO_OSD, PSM_SINGLE_LINE] and for
/// the two sparse-text modes.
inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
  const bool in_range =
      pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE;
  return in_range || PSM_SPARSE(pageseg_mode);
}
/**
* enum of the elements of the page hierarchy, used in ResultIterator
* to provide functions that operate on each level without having to
* have 5x as many functions.
* The enumerators run from the coarsest level (block) to the finest (symbol),
* each one nested inside the level listed before it.
*/
enum PageIteratorLevel {
RIL_BLOCK, // Block of text/image/separator line.
RIL_PARA, // Paragraph within a block.
RIL_TEXTLINE, // Line within a paragraph.
RIL_WORD, // Word within a textline.
RIL_SYMBOL // Symbol/character within a word.
};
/**
* JUSTIFICATION_UNKNOWN
* The alignment is not clearly one of the other options. This could happen
* for example if there are only one or two lines of text or the text looks
* like source code or poetry.
*
* NOTA BENE: Fully justified paragraphs (text aligned to both left and right
* margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
* is written with a left-to-right script and with JUSTIFICATION_RIGHT if
* their text is written in a right-to-left script.
*
* Interpretation for text read in vertical lines:
* "Left" is wherever the starting reading position is.
*
* JUSTIFICATION_LEFT
* Each line, except possibly the first, is flush to the same left tab stop.
*
* JUSTIFICATION_CENTER
* The text lines of the paragraph are centered about a line going
* down through their middle of the text lines.
*
* JUSTIFICATION_RIGHT
* Each line, except possibly the first, is flush to the same right tab stop.
*/
// Paragraph alignment as reported by PageIterator::ParagraphInfo.
enum ParagraphJustification {
JUSTIFICATION_UNKNOWN, // Alignment is not clearly one of the other options.
JUSTIFICATION_LEFT, // Lines (except possibly the first) share a left tab stop.
JUSTIFICATION_CENTER, // Lines are centered about a common vertical line.
JUSTIFICATION_RIGHT, // Lines (except possibly the first) share a right tab stop.
};
/**
* When Tesseract/Cube is initialized we can choose to instantiate/load/run
* only the Tesseract part, only the Cube part or both along with the combiner.
* The preference of which engine to use is stored in tessedit_ocr_engine_mode.
*
* ATTENTION: When modifying this enum, please make sure to make the
* appropriate changes to all the enums mirroring it (e.g. OCREngine in
* cityblock/workflow/detection/detection_storage.proto). Such enums will
* mention the connection to OcrEngineMode in the comments.
*/
// Selects which recognizer(s) to run. The enumerators take their implicit
// values (0, 1, 2, ...), so their order is part of the interface.
enum OcrEngineMode {
OEM_TESSERACT_ONLY, // Run Tesseract only - fastest; deprecated
OEM_LSTM_ONLY, // Run just the LSTM line recognizer.
OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
// to Tesseract when things get difficult.
// deprecated
OEM_DEFAULT, // Specify this mode when calling init_*(),
// to indicate that any of the above modes
// should be automatically inferred from the
// variables in the language-specific config,
// command-line configs, or if not specified
// in any of the above should be set to the
// default OEM_TESSERACT_ONLY.
// NOTE(review): confirm the stated fallback
// against the implementation.
OEM_COUNT // Number of OEMs
};
} // namespace tesseract.
#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_

View File

@ -0,0 +1,310 @@
///////////////////////////////////////////////////////////////////////
// File: renderer.h
// Description: Rendering interface to inject into TessBaseAPI
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_RENDERER_H_
#define TESSERACT_API_RENDERER_H_
#include "export.h"
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
// complexity of includes here. Use forward declarations wherever possible
// and hide includes of complex types in baseapi.cpp.
#include <string> // for std::string
#include <vector> // for std::vector
struct Pix;
namespace tesseract {
class TessBaseAPI;
/**
* Interface for rendering tesseract results into a document, such as text,
* HOCR or pdf. This class is abstract. Specific classes handle individual
* formats. This interface is then used to inject the renderer class into
* tesseract when processing images.
*
* For simplicity implementing this with tesseract version 3.01,
* the renderer contains document state that is cleared from document
* to document just as the TessBaseAPI is. This way the base API can just
* delegate its rendering functionality to injected renderers, and the
* renderers can manage the associated state needed for the specific formats
* in addition to the heuristics for producing it.
*/
class TESS_API TessResultRenderer {
public:
// Virtual so that concrete renderers can be destroyed through a
// TessResultRenderer pointer.
virtual ~TessResultRenderer();
// Takes ownership of pointer so must be new'd instance.
// Renderers aren't ordered, but appends the sequences of next parameter
// and existing next(). The renderers should be unique across both lists.
void insert(TessResultRenderer *next);
// Returns the next renderer or nullptr.
TessResultRenderer *next() {
return next_;
}
/**
* Starts a new document with the given title.
* This clears the contents of the output data.
* Title should use UTF-8 encoding.
*/
bool BeginDocument(const char *title);
/**
* Adds the recognized text from the source image to the current document.
* Invalid if BeginDocument not yet called.
*
* Note that this API is a bit weird but is designed to fit into the
* current TessBaseAPI implementation where the api has lots of state
* information that we might want to add in.
*/
bool AddImage(TessBaseAPI *api);
/**
* Finishes the document and finalizes the output data
* Invalid if BeginDocument not yet called.
*/
bool EndDocument();
// Returns the standard extension for generated output (presumably the
// extension handed to the constructor; the constructor body is not in this
// header).
const char *file_extension() const {
return file_extension_;
}
// Returns the title of the document being rendered. The pointer refers to
// the internal std::string, so it is only valid until the title changes or
// the renderer is destroyed.
const char *title() const {
return title_.c_str();
}
// Is everything fine? Otherwise something went wrong.
bool happy() const {
return happy_;
}
/**
* Returns the index of the last image given to AddImage
* (i.e. images are incremented whether the image succeeded or not)
*
* This is always defined. It means either the number of the
* current image, the last image ended, or in the completed document
* depending on when in the document lifecycle you are looking at it.
* Will return -1 if a document was never started.
*/
int imagenum() const {
return imagenum_;
}
protected:
/**
* Called by concrete classes.
*
* outputbase is the name of the output file excluding
* extension. For example, "/path/to/chocolate-chip-cookie-recipe"
*
* extension indicates the file extension to be used for output
* files. For example "pdf" will produce a .pdf file, and "hocr"
* will produce .hocr files.
*/
TessResultRenderer(const char *outputbase, const char *extension);
// Hook for specialized handling in BeginDocument()
virtual bool BeginDocumentHandler();
// This must be overridden to render the OCR'd results
virtual bool AddImageHandler(TessBaseAPI *api) = 0;
// Hook for specialized handling in EndDocument()
virtual bool EndDocumentHandler();
// Renderers can call this to append '\0' terminated strings into
// the output string returned by GetOutput.
// This method will grow the output buffer if needed.
void AppendString(const char *s);
// Renderers can call this to append binary byte sequences into
// the output string returned by GetOutput. Note that s is not necessarily
// '\0' terminated (and can contain '\0' within it).
// This method will grow the output buffer if needed.
void AppendData(const char *s, int len);
private:
// NOTE(review): next_ is an owning raw pointer (insert() takes ownership) and
// fout_ is a raw FILE handle, yet no copy operations are declared, so the
// implicit copy constructor/assignment of a concrete renderer would duplicate
// both. Presumably renderers are never copied - confirm.
const char *file_extension_; // standard extension for generated output
std::string title_; // title of document being rendered
int imagenum_; // index of last image added
// NOTE(review): FILE is declared in <cstdio>, which this header does not
// include directly - confirm it is reached through export.h or <string>.
FILE *fout_; // output file pointer
TessResultRenderer *next_; // Can link multiple renderers together
bool happy_; // I get grumpy when the disk fills up, etc.
};
/**
* Renders tesseract output into a plain UTF-8 text string.
*
* Only the mandatory AddImageHandler() hook is overridden; the document
* begin/end hooks are inherited from TessResultRenderer.
*/
class TESS_API TessTextRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension, as described on
// the TessResultRenderer constructor (presumably forwarded to it; the
// definition is not in this header).
explicit TessTextRenderer(const char *outputbase);
protected:
// Renders the OCR'd results for one image (pure virtual in the base class).
bool AddImageHandler(TessBaseAPI *api) override;
};
/**
* Renders tesseract output into an hocr text string.
*
* Overrides all three TessResultRenderer hooks (document begin, per-image,
* document end).
*/
class TESS_API TessHOcrRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
// font_info: whether to print font information.
explicit TessHOcrRenderer(const char *outputbase, bool font_info);
// Same as above without the font_info argument; the value used for
// font_info_ is decided in the implementation (not visible in this header).
explicit TessHOcrRenderer(const char *outputbase);
protected:
// Hook called from BeginDocument().
bool BeginDocumentHandler() override;
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
// Hook called from EndDocument().
bool EndDocumentHandler() override;
private:
bool font_info_; // whether to print font information
};
/**
* Renders tesseract output into an alto text string.
*
* Overrides all three TessResultRenderer hooks (document begin, per-image,
* document end).
*/
class TESS_API TessAltoRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
explicit TessAltoRenderer(const char *outputbase);
protected:
// Hook called from BeginDocument().
bool BeginDocumentHandler() override;
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
// Hook called from EndDocument().
bool EndDocumentHandler() override;
};
/**
* Renders Tesseract output into a TSV string.
*
* Overrides all three TessResultRenderer hooks (document begin, per-image,
* document end).
*/
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
// font_info: whether to print font information.
explicit TessTsvRenderer(const char *outputbase, bool font_info);
// Same as above without the font_info argument; the value used for
// font_info_ is decided in the implementation (not visible in this header).
explicit TessTsvRenderer(const char *outputbase);
protected:
// Hook called from BeginDocument().
bool BeginDocumentHandler() override;
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
// Hook called from EndDocument().
bool EndDocumentHandler() override;
private:
bool font_info_; // whether to print font information
};
/**
* Renders tesseract output into searchable PDF.
*
* Overrides all three TessResultRenderer hooks and keeps per-document
* bookkeeping (object counter, object offsets, page object numbers) so that
* pages can be produced one at a time.
*/
class TESS_API TessPDFRenderer : public TessResultRenderer {
public:
// outputbase is the name of the output file excluding extension (see the
// TessResultRenderer constructor).
// datadir is the location of the TESSDATA. We need it because
// we load a custom PDF font from this location.
// textonly: skip images if set (defaults to false).
TessPDFRenderer(const char *outputbase, const char *datadir,
bool textonly = false);
protected:
// Hook called from BeginDocument().
bool BeginDocumentHandler() override;
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
// Hook called from EndDocument().
bool EndDocumentHandler() override;
private:
// We don't want to have every image in memory at once,
// so we store some metadata as we go along producing
// PDFs one page at a time. At the end, that metadata is
// used to make everything that isn't easily handled in a
// streaming fashion.
long int obj_; // counter for PDF objects
std::vector<long int> offsets_; // offset of every PDF object in bytes
std::vector<long int> pages_; // object number for every /Page object
std::string datadir_; // where to find the custom font
bool textonly_; // skip images if set
// Bookkeeping only. DIY = Do It Yourself.
// NOTE(review): presumably for objects whose bytes the caller appends
// itself; objectsize would then be that object's size in bytes - confirm.
void AppendPDFObjectDIY(size_t objectsize);
// Bookkeeping + emit data.
void AppendPDFObject(const char *data);
// Create the /Contents object for an entire page.
// NOTE(review): ownership of the returned buffer and the units of
// width/height are not stated in this header - confirm in the implementation.
char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);
// Turn an image into a PDF object. Only transcode if we have to.
// The object and its size are handed back through pdf_object and
// pdf_object_size; who frees *pdf_object is not stated here - TODO confirm.
static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
char **pdf_object, long int *pdf_object_size,
int jpg_quality);
};
/**
* Renders tesseract output into a text string in UNLV format.
*
* NOTE(review): this comment previously read "plain UTF-8 text string", the
* same wording as TessTextRenderer. Presumably the output comes from
* TessBaseAPI::GetUNLVText, which is documented as Latin-1 with reject and
* suspect codes rather than UTF-8 - confirm against the implementation.
*/
class TESS_API TessUnlvRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
explicit TessUnlvRenderer(const char *outputbase);
protected:
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
};
/**
* Renders tesseract output into a plain UTF-8 text string for LSTMBox.
*
* Only the mandatory AddImageHandler() hook is overridden.
*/
class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
explicit TessLSTMBoxRenderer(const char *outputbase);
protected:
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
};
/**
* Renders tesseract output into a plain UTF-8 text string in box file format.
*
* NOTE(review): the original comment did not mention the box format;
* presumably the output is that of TessBaseAPI::GetBoxText - confirm against
* the implementation.
*/
class TESS_API TessBoxTextRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
explicit TessBoxTextRenderer(const char *outputbase);
protected:
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
};
/**
* Renders tesseract output into a plain UTF-8 text string in WordStr format.
*
* Only the mandatory AddImageHandler() hook is overridden.
*/
class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
explicit TessWordStrBoxRenderer(const char *outputbase);
protected:
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
};
#ifndef DISABLED_LEGACY_ENGINE
/**
* Renders tesseract output into an osd text string.
*
* Only declared when DISABLED_LEGACY_ENGINE is not defined.
*/
class TESS_API TessOsdRenderer : public TessResultRenderer {
public:
// outputbase: name of the output file excluding extension (see the
// TessResultRenderer constructor).
explicit TessOsdRenderer(const char *outputbase);
protected:
// Renders the OCR'd results for one image.
bool AddImageHandler(TessBaseAPI *api) override;
};
#endif // ndef DISABLED_LEGACY_ENGINE
} // namespace tesseract.
#endif // TESSERACT_API_RENDERER_H_

View File

@ -0,0 +1,252 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.h
// Description: Iterator for tesseract results that is capable of
// iterating in proper reading order over Bi Directional
// (e.g. mixed Hebrew and English) text.
// Author: David Eger
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#include "export.h" // for TESS_API, TESS_LOCAL
#include "ltrresultiterator.h" // for LTRResultIterator
#include "publictypes.h" // for PageIteratorLevel
#include "unichar.h" // for StrongScriptDirection
#include <set> // for std::pair
#include <vector> // for std::vector
namespace tesseract {
/**
* Iterator over tesseract results that visits them in reading order,
* including bidirectional (mixed left-to-right / right-to-left) text.
* Extends LTRResultIterator, whose order is strictly left-to-right.
*/
class TESS_API ResultIterator : public LTRResultIterator {
public:
/**
* Factory: the constructor is protected, so instances are obtained here.
* Per the constructor's note, the new iterator is reset to the beginning of
* the paragraph rather than staying where resit points.
* NOTE(review): the result is a raw pointer; presumably the caller owns and
* deletes it - confirm in the implementation.
*/
static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
/**
* ResultIterator is copy constructible!
* The default copy constructor works just fine for us.
*/
~ResultIterator() override = default;
// ============= Moving around within the page ============.
/**
* Moves the iterator to point to the start of the page to begin
* an iteration.
*/
void Begin() override;
/**
* Moves to the start of the next object at the given level in the
* page hierarchy in the appropriate reading order and returns false if
* the end of the page was reached.
* NOTE that RIL_SYMBOL will skip non-text blocks, but all other
* PageIteratorLevel level values will visit each non-text block once.
* Think of non text blocks as containing a single para, with a single line,
* with a single imaginary word.
* Calls to Next with different levels may be freely intermixed.
* This function iterates words in right-to-left scripts correctly, if
* the appropriate language has been loaded into Tesseract.
*/
bool Next(PageIteratorLevel level) override;
/**
* IsAtBeginningOf() returns whether we're at the logical beginning of the
* given level. (as opposed to ResultIterator's left-to-right top-to-bottom
* order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
* For a full description, see pageiterator.h
*/
bool IsAtBeginningOf(PageIteratorLevel level) const override;
/**
* Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
* For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
* point at the last word in a paragraph. See PageIterator for full comment.
*/
bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const override;
// ============= Functions that refer to words only ============.
// Returns the number of blanks before the current word.
int BlanksBeforeWord() const;
// ============= Accessing data ==============.
/**
* Returns the null terminated UTF-8 encoded text string for the current
* object at the given level. Use delete [] to free after use.
*/
virtual char *GetUTF8Text(PageIteratorLevel level) const;
/**
* Returns the LSTM choices for every LSTM timestep for the current word.
* NOTE(review): the result is a pointer to nested vectors of
* (const char *, float) pairs; ownership and lifetime are not stated in this
* header - confirm in the implementation before storing or deleting it.
*/
virtual std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
*GetRawLSTMTimesteps() const;
/**
* Returns a pointer to nested vectors of (const char *, float) pairs.
* NOTE(review): presumably the best LSTM symbol choices for the current
* word, as the name suggests; meaning, ownership and lifetime are not
* documented here - confirm in the implementation.
*/
virtual std::vector<std::vector<std::pair<const char *, float>>>
*GetBestLSTMSymbolChoices() const;
/**
* Return whether the current paragraph's dominant reading direction
* is left-to-right (as opposed to right-to-left).
*/
bool ParagraphIsLtr() const;
// ============= Exposed only for testing =============.
/**
* Yields the reading order as a sequence of indices and (optional)
* meta-marks for a set of words (given left-to-right).
* The meta marks are passed as negative values:
* kMinorRunStart Start of minor direction text.
* kMinorRunEnd End of minor direction text.
* kComplexWord The next indexed word contains both left-to-right and
* right-to-left characters and was treated as neutral.
*
* For example, suppose we have five words in a text line,
* indexed [0,1,2,3,4] from the leftmost side of the text line.
* The following are all believable reading_orders:
*
* Left-to-Right (in ltr paragraph):
* { 0, 1, 2, 3, 4 }
* Left-to-Right (in rtl paragraph):
* { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
* Right-to-Left (in rtl paragraph):
* { 4, 3, 2, 1, 0 }
* Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
* { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
*/
static void CalculateTextlineOrder(
bool paragraph_is_ltr,
const std::vector<StrongScriptDirection> &word_dirs,
std::vector<int> *reading_order);
// Meta-mark values used in reading-order vectors (see
// CalculateTextlineOrder above); documented there as negative values.
static const int kMinorRunStart;
static const int kMinorRunEnd;
static const int kComplexWord;
protected:
/**
* We presume the data associated with the given iterator will outlive us.
* NB: This is protected (not public) because it does something that is
* non-obvious: it resets to the beginning of the paragraph instead of
* staying wherever resit might have pointed.
*/
explicit ResultIterator(const LTRResultIterator &resit);
private:
/**
* Calculates the current paragraph's dominant writing direction.
* Typically, members should use current_paragraph_is_ltr_ instead.
*/
bool CurrentParagraphIsLtr() const;
/**
* Returns word indices as measured from resit->RestartRow() = index 0
* for the reading order of words within a textline given an iterator
* into the middle of the text line.
* In addition to non-negative word indices, the following negative values
* may be inserted:
* kMinorRunStart Start of minor direction text.
* kMinorRunEnd End of minor direction text.
* kComplexWord The previous word contains both left-to-right and
* right-to-left characters and was treated as neutral.
*/
void CalculateTextlineOrder(bool paragraph_is_ltr,
const LTRResultIterator &resit,
std::vector<int> *indices) const;
/** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
void CalculateTextlineOrder(bool paragraph_is_ltr,
const LTRResultIterator &resit,
std::vector<StrongScriptDirection> *ssd,
std::vector<int> *indices) const;
/**
* What is the index of the current word in a strict left-to-right reading
* of the row?
*/
int LTRWordIndex() const;
/**
* Given an iterator pointing at a word, returns the logical reading order
* of blob indices for the word.
*/
void CalculateBlobOrder(std::vector<int> *blob_indices) const;
/** Precondition: current_paragraph_is_ltr_ is set. */
void MoveToLogicalStartOfTextline();
/**
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_
* are set.
*/
void MoveToLogicalStartOfWord();
/** Are we pointing at the final (reading order) symbol of the word? */
bool IsAtFinalSymbolOfWord() const;
/** Are we pointing at the first (reading order) symbol of the word? */
bool IsAtFirstSymbolOfWord() const;
/**
* Append any extra marks that should be appended to this word when printed.
* Mostly, these are Unicode BiDi control characters.
*/
void AppendSuffixMarks(std::string *text) const;
/** Appends the current word in reading order to the given buffer.*/
void AppendUTF8WordText(std::string *text) const;
/**
* Appends the text of the current text line, *assuming this iterator is
* positioned at the beginning of the text line* This function
* updates the iterator to point to the first position past the text line.
* Each textline is terminated in a single newline character.
* If the textline ends a paragraph, it gets a second terminal newline.
*/
void IterateAndAppendUTF8TextlineText(std::string *text);
/**
* Appends the text of the current paragraph in reading order
* to the given buffer.
* Each textline is terminated in a single newline character, and the
* paragraph gets an extra newline at the end.
*/
void AppendUTF8ParagraphText(std::string *text) const;
/** Returns whether the bidi_debug flag is set to at least min_level. */
bool BidiDebug(int min_level) const;
/**
* Cached dominant direction of the current paragraph: true for
* left-to-right. CurrentParagraphIsLtr() computes the value afresh.
*/
bool current_paragraph_is_ltr_;
/**
* Is the currently pointed-at character at the beginning of
* a minor-direction run?
*/
bool at_beginning_of_minor_run_;
/** Is the currently pointed-at character in a minor-direction sequence? */
bool in_minor_direction_;
/**
* Should detected inter-word spaces be preserved, or "compressed" to a single
* space character (default behavior).
*/
bool preserve_interword_spaces_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_

View File

@ -0,0 +1,177 @@
///////////////////////////////////////////////////////////////////////
// File: unichar.h
// Description: Unicode character/ligature class.
// Author: Ray Smith
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_UNICHAR_H_
#define TESSERACT_CCUTIL_UNICHAR_H_
#include "export.h"
#include <memory.h>
#include <cstring>
#include <string>
#include <vector>
namespace tesseract {
// Maximum number of characters (bytes of UTF-8) that can be stored in a
// UNICHAR. Must be at least 4. Must not exceed 31 without changing the coding
// of length: the last byte of the buffer doubles as a length field whenever
// its value is below UNICHAR_LEN (see UNICHAR::utf8_len).
// Being a macro, this name is not scoped by the tesseract namespace.
#define UNICHAR_LEN 30
// TODO(rays) Move these to the tesseract namespace.
// NOTE(review): the declarations below already sit inside
// `namespace tesseract {`, so the TODO above looks stale - confirm and drop.
// A UNICHAR_ID is the unique id of a unichar.
using UNICHAR_ID = int;
// A variable to indicate an invalid or uninitialized unichar id.
static const int INVALID_UNICHAR_ID = -1;
// A special unichar that corresponds to INVALID_UNICHAR_ID.
static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
// Classification of a piece of text by the directions of the characters it
// contains. ResultIterator::CalculateTextlineOrder takes one value per word.
enum StrongScriptDirection {
DIR_NEUTRAL = 0, // Text contains only neutral characters.
DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
DIR_MIX = 3, // Text contains a mixture of left-to-right
// and right-to-left characters.
};
// Signed integer type used for Unicode code points, e.g. the element type of
// the vectors handled by UNICHAR::UTF8ToUTF32 and UNICHAR::UTF32ToUTF8.
using char32 = signed int;
// The UNICHAR class holds a single classification result. This may be
// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
// multiple Unicode characters representing the NFKC expansion of a ligature
// such as fi, ffl etc. These are also stored as utf8.
class TESS_API UNICHAR {
public:
// Constructs an empty UNICHAR: every byte of the buffer is zero, so the
// length byte is zero too. (The chars member also carries a {} initializer;
// the memset states the same thing explicitly.)
UNICHAR() {
memset(chars, 0, UNICHAR_LEN);
}
// Construct from a utf8 string. If len<0 then the string is null terminated.
// If the string is too long to fit in the UNICHAR then it takes only what
// will fit.
UNICHAR(const char *utf8_str, int len);
// Construct from a single UCS4 character.
explicit UNICHAR(int unicode);
// Default copy constructor and operator= are OK.
// Get the first character as UCS-4.
int first_uni() const;
// Get the length of the UTF8 string.
// The last byte of the buffer is read as the length when it lies in
// [0, UNICHAR_LEN); any other value means that byte is a genuine character
// and the string fills the whole buffer.
int utf8_len() const {
int len = chars[UNICHAR_LEN - 1];
return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
}
// Get a UTF8 string, but NOT null terminated; use utf8_len() for its length.
const char *utf8() const {
return chars;
}
// Get a terminated UTF8 string: Must delete[] it after use.
char *utf8_str() const;
// Get the number of bytes in the first character of the given utf8 string.
static int utf8_step(const char *utf8_str);
// A class to simplify iterating over and accessing elements of a UTF8
// string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
// take ownership of the underlying byte array. It also does not permit
// modification of the array (as the name suggests).
//
// Example:
// for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
// it != UNICHAR::end(str, str_len);
// ++it) {
// tprintf("UCS-4 symbol code = %d\n", *it);
// char buf[5];
// int char_len = it.get_utf8(buf); buf[char_len] = '\0';
// tprintf("Char = %s\n", buf);
// }
class TESS_API const_iterator {
using CI = const_iterator;
public:
// Step to the next UTF8 character.
// If the current position is at an illegal UTF8 character, then print an
// error message and step by one byte. If the current position is at a
// null ('\0') byte, don't step past it.
const_iterator &operator++();
// Return the UCS-4 value at the current position.
// If the current position is at an illegal UTF8 value, return a single
// space character.
int operator*() const;
// Store the UTF-8 encoding of the current codepoint into buf, which must be
// at least 4 bytes long. Return the number of bytes written.
// If the current position is at an illegal UTF8 value, writes a single
// space character and returns 1.
// Note that this method does not null-terminate the buffer.
int get_utf8(char *buf) const;
// Returns the number of bytes of the current codepoint. Returns 1 if the
// current position is at an illegal UTF8 value.
int utf8_len() const;
// Returns true if the UTF-8 encoding at the current position is legal.
bool is_legal() const;
// Return the pointer into the string at the current position.
const char *utf8_data() const {
return it_;
}
// Iterator equality operators.
// Two iterators are equal when they point at the same byte address.
friend bool operator==(const CI &lhs, const CI &rhs) {
return lhs.it_ == rhs.it_;
}
friend bool operator!=(const CI &lhs, const CI &rhs) {
return !(lhs == rhs);
}
private:
// Only UNICHAR (via begin()/end()) can construct iterators.
friend class UNICHAR;
explicit const_iterator(const char *it) : it_(it) {}
const char *it_; // Pointer into the string.
};
// Create a start/end iterator pointing to a string. Note that these methods
// are static and do NOT create a copy or take ownership of the underlying
// array.
static const_iterator begin(const char *utf8_str, int byte_length);
static const_iterator end(const char *utf8_str, int byte_length);
// Converts a utf-8 string to a vector of unicodes.
// Returns an empty vector if the input contains invalid UTF-8.
static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
// Converts a vector of unicodes to a utf8 string.
// Returns an empty string if the input contains an invalid unicode.
static std::string UTF32ToUTF8(const std::vector<char32> &str32);
private:
// A UTF-8 representation of 1 or more Unicode characters.
// The last element (chars[UNICHAR_LEN - 1]) is a length if
// its value < UNICHAR_LEN, otherwise it is a genuine character.
char chars[UNICHAR_LEN]{};
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_UNICHAR_H_

View File

@ -0,0 +1,36 @@
///////////////////////////////////////////////////////////////////////
// File: version.h
// Description: Version information
//
// (C) Copyright 2018, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_VERSION_H_
#define TESSERACT_API_VERSION_H_
// clang-format off
// Numeric components of the library version (major.minor.micro).
#define TESSERACT_MAJOR_VERSION 5
#define TESSERACT_MINOR_VERSION 0
#define TESSERACT_MICRO_VERSION 0
// The three components packed into one integer: major shifted left by 16
// bits, minor shifted left by 8 bits, micro in the lowest bits. Values
// compare in version order as long as minor and micro stay below 256.
#define TESSERACT_VERSION \
(TESSERACT_MAJOR_VERSION << 16 | \
TESSERACT_MINOR_VERSION << 8 | \
TESSERACT_MICRO_VERSION)
// Human-readable version string. NOTE(review): the suffix looks like
// `git describe` output from the imported snapshot - confirm how it is
// generated before editing it by hand.
#define TESSERACT_VERSION_STR "5.0.0-alpha-20210401-98-g176d"
// clang-format on
#endif // TESSERACT_API_VERSION_H_

View File

@ -0,0 +1,245 @@
// File: altorenderer.cpp
// Description: ALTO rendering interface
// Author: Jake Sebright
// (C) Copyright 2018
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include <tesseract/baseapi.h>
#include <tesseract/renderer.h>
#include <memory>
#include <sstream> // for std::stringstream
namespace tesseract {
///
/// Add coordinates to specified TextBlock, TextLine or String bounding box.
/// Add word confidence if adding to a String bounding box.
///
/// Writes the HPOS, VPOS, WIDTH and HEIGHT attributes of the box that `it`
/// reports for `level`. For RIL_WORD a WC attribute follows and the element
/// is left open so the caller can add CONTENT; for every other level the
/// start tag is closed with '>'.
///
/// @param it        Iterator positioned on the element to describe.
/// @param level     Page hierarchy level whose bounding box is written.
/// @param alto_str  Stream that receives the attribute text.
///
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
                         std::stringstream &alto_str) {
  // Zero-initialised so the attributes stay well defined even if
  // BoundingBox() leaves the outputs untouched.
  int left = 0, top = 0, right = 0, bottom = 0;
  it->BoundingBox(level, &left, &top, &right, &bottom);

  alto_str << " HPOS=\"" << left << "\"";
  alto_str << " VPOS=\"" << top << "\"";
  alto_str << " WIDTH=\"" << (right - left) << "\"";
  alto_str << " HEIGHT=\"" << (bottom - top) << "\"";

  if (level == RIL_WORD) {
    // The confidence is treated as a whole percentage and WC as a fraction
    // with two decimals. Writing "0." followed by the raw number is only
    // right for 10..99: 7 would become 0.7 and 100 would become 0.100.
    int wc = it->Confidence(RIL_WORD);
    if (wc < 0) {
      wc = 0;
    } else if (wc > 100) {
      wc = 100;
    }
    alto_str << " WC=\"" << (wc / 100) << '.' << ((wc / 10) % 10) << (wc % 10) << "\"";
  } else {
    alto_str << ">";
  }
}
///
/// Append the ALTO XML for the beginning of the document
///
bool TessAltoRenderer::BeginDocumentHandler() {
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
"xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
"xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
"http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
"\t<Description>\n"
"\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
"\t\t<sourceImageInformation>\n"
"\t\t\t<fileName>");
AppendString(title());
AppendString(
"</fileName>\n"
"\t\t</sourceImageInformation>\n"
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
"\t\t\t<ocrProcessingStep>\n"
"\t\t\t\t<processingSoftware>\n"
"\t\t\t\t\t<softwareName>tesseract ");
AppendString(TessBaseAPI::Version());
AppendString(
"</softwareName>\n"
"\t\t\t\t</processingSoftware>\n"
"\t\t\t</ocrProcessingStep>\n"
"\t\t</OCRProcessing>\n"
"\t</Description>\n"
"\t<Layout>\n");
return true;
}
///
/// Append the ALTO XML for the layout of the image
///
/// Asks the API for the ALTO text of the current image number and appends
/// it to the output. Returns false when the API yields no text.
///
bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
  // The smart pointer releases the new[]-style buffer on every path.
  const std::unique_ptr<const char[]> alto_page(api->GetAltoText(imagenum()));
  const bool have_page = alto_page != nullptr;
  if (have_page) {
    AppendString(alto_page.get());
  }
  return have_page;
}
///
/// Append the ALTO XML for the end of the document
///
/// Closes the <Layout> element and the <alto> root. Always returns true.
///
bool TessAltoRenderer::EndDocumentHandler() {
  static const char kAltoDocumentEnd[] = "\t</Layout>\n</alto>\n";
  AppendString(kAltoDocumentEnd);
  return true;
}
/// Constructs an ALTO renderer: hands outputbase to the TessResultRenderer
/// base together with the fixed extension "xml".
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "xml") {}
///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
/// Convenience overload: identical to GetAltoText(nullptr, page_number),
/// i.e. the two-argument overload called without a monitor.
///
char *TessBaseAPI::GetAltoText(int page_number) {
return GetAltoText(nullptr, page_number);
}
///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
/// Runs Recognize(monitor) first when no page results are available yet.
/// The returned markup is a single <Page> element; the document prologue and
/// epilogue are written separately by TessAltoRenderer.
///
/// @param monitor      Forwarded to Recognize(); the one-argument overload
///                     passes nullptr.
/// @param page_number  Written into the PHYSICAL_IMG_NR and ID attributes of
///                     the <Page> element.
/// @return A new[]-allocated, NUL-terminated string that the caller releases
///         with delete[], or nullptr when the API is not initialised,
///         recognition fails, or no result iterator is available.
///
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
    return nullptr;
  }

  // Running counters used to build unique element IDs:
  // text lines, text blocks, composed blocks and words.
  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;

  if (input_file_.empty()) {
    SetInputName(nullptr);
  }

#ifdef _WIN32
  // convert input name from ANSI encoding to utf-8
  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
  wchar_t *uni16_str = new WCHAR[str16_len];
  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
  int utf8_len =
      WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
  char *utf8_str = new char[utf8_len];
  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
  input_file_ = utf8_str;
  delete[] uni16_str;
  delete[] utf8_str;
#endif

  std::stringstream alto_str;
  // Use "C" locale (needed for int values larger than 999).
  alto_str.imbue(std::locale::classic());
  alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
           << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
           << " ID=\"page_" << page_number << "\">\n"
           << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
           << " WIDTH=\"" << rect_width_ << "\""
           << " HEIGHT=\"" << rect_height_ << "\">\n";

  // Held by a smart pointer so the iterator is released on every exit path,
  // including an exception from the stream or from the final allocation.
  const std::unique_ptr<ResultIterator> res_it(GetIterator());
  if (res_it == nullptr) {
    return nullptr;
  }

  while (!res_it->Empty(RIL_BLOCK)) {
    // Skip positions that carry no word.
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    // Open every container that starts at this word, outermost first.
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
      AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
      alto_str << "\n";
    }

    if (res_it->IsAtBeginningOf(RIL_PARA)) {
      alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
      AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
      alto_str << "\n";
    }

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
      AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
      alto_str << "\n";
    }

    alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
    AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
    alto_str << " CONTENT=\"";

    // These must be sampled before the symbol loop below moves the iterator
    // on to the next word.
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);

    int left = 0, top = 0, right = 0, bottom = 0;
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);

    // Emit the word symbol by symbol; the loop ends at the next word or at
    // the end of the page.
    do {
      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != 0) {
        alto_str << HOcrEscape(grapheme.get()).c_str();
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));

    alto_str << "\"/>";

    wcnt++;

    if (last_word_in_line) {
      alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
      lcnt++;
    } else {
      // Space element between this word and the next one on the line: it
      // starts at this word's right edge and ends at the next word's left.
      int hpos = right;
      int vpos = top;
      res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
      int width = left - hpos;
      alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
               << "\"/>\n";
    }

    if (last_word_in_tblock) {
      alto_str << "\t\t\t\t\t</TextBlock>\n";
      tcnt++;
    }

    if (last_word_in_cblock) {
      alto_str << "\t\t\t\t</ComposedBlock>\n";
      bcnt++;
    }
  }

  alto_str << "\t\t\t</PrintSpace>\n"
           << "\t\t</Page>\n";

  // Copy into a new[] buffer; callers release the result with delete[].
  const std::string text = alto_str.str();
  char *result = new char[text.length() + 1];
  strcpy(result, text.c_str());
  return result;
}
} // namespace tesseract

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,689 @@
///////////////////////////////////////////////////////////////////////
// File: capi.cpp
// Description: C-API TessBaseAPI
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <tesseract/capi.h>
#include <cstring> // for strdup
// Returns the version string obtained from TessBaseAPI::Version().
const char *TessVersion() {
return TessBaseAPI::Version();
}
// Releases a string with delete[]; the buffer must therefore have been
// allocated with new[].
void TessDeleteText(const char *text) {
delete[] text;
}
// Releases a nullptr-terminated array of strings.
// Each element and the array itself are freed with delete[], so both must
// have been allocated with new[].
// Passing nullptr is a no-op (previously it dereferenced the null pointer).
void TessDeleteTextArray(char **arr) {
  if (arr == nullptr) {
    return;
  }
  for (char **pos = arr; *pos != nullptr; ++pos) {
    delete[] *pos;
  }
  delete[] arr;
}
// Releases an int array with delete[]; the array must have been allocated
// with new[].
void TessDeleteIntArray(const int *arr) {
delete[] arr;
}
// Allocates a tesseract::TessTextRenderer for outputbase with new.
// TessDeleteResultRenderer is the matching release function in this file.
TessResultRenderer *TessTextRendererCreate(const char *outputbase) {
return new tesseract::TessTextRenderer(outputbase);
}
// Allocates a tesseract::TessHOcrRenderer for outputbase with new, using the
// renderer's single-argument constructor.
TessResultRenderer *TessHOcrRendererCreate(const char *outputbase) {
return new tesseract::TessHOcrRenderer(outputbase);
}
// Allocates a tesseract::TessHOcrRenderer with new; font_info is passed to the
// constructor as a bool (any non-zero value counts as true).
TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, BOOL font_info) {
return new tesseract::TessHOcrRenderer(outputbase, font_info != 0);
}
// Allocates a tesseract::TessAltoRenderer for outputbase with new.
TessResultRenderer *TessAltoRendererCreate(const char *outputbase) {
return new tesseract::TessAltoRenderer(outputbase);
}
// Allocates a tesseract::TessTsvRenderer for outputbase with new.
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
return new tesseract::TessTsvRenderer(outputbase);
}
// Allocates a tesseract::TessPDFRenderer with new; textonly is passed to the
// constructor as a bool (any non-zero value counts as true).
TessResultRenderer *TessPDFRendererCreate(const char *outputbase, const char *datadir,
BOOL textonly) {
return new tesseract::TessPDFRenderer(outputbase, datadir, textonly != 0);
}
// Allocates a tesseract::TessUnlvRenderer for outputbase with new.
TessResultRenderer *TessUnlvRendererCreate(const char *outputbase) {
return new tesseract::TessUnlvRenderer(outputbase);
}
// Allocates a tesseract::TessBoxTextRenderer for outputbase with new.
TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase) {
return new tesseract::TessBoxTextRenderer(outputbase);
}
// Allocates a tesseract::TessWordStrBoxRenderer for outputbase with new.
TessResultRenderer *TessWordStrBoxRendererCreate(const char *outputbase) {
return new tesseract::TessWordStrBoxRenderer(outputbase);
}
// Allocates a tesseract::TessLSTMBoxRenderer for outputbase with new.
TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase) {
return new tesseract::TessLSTMBoxRenderer(outputbase);
}
// Destroys a renderer with delete; pairs with the Tess*RendererCreate
// functions in this file, which allocate with new.
void TessDeleteResultRenderer(TessResultRenderer *renderer) {
delete renderer;
}
// Forwards to renderer->insert(next).
void TessResultRendererInsert(TessResultRenderer *renderer, TessResultRenderer *next) {
renderer->insert(next);
}
// Returns the result of renderer->next().
TessResultRenderer *TessResultRendererNext(TessResultRenderer *renderer) {
return renderer->next();
}
// Calls renderer->BeginDocument(title) and returns its result as a BOOL.
BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, const char *title) {
  const auto started = renderer->BeginDocument(title);
  return static_cast<BOOL>(started);
}
// Calls renderer->AddImage(api) and returns its result as a BOOL.
BOOL TessResultRendererAddImage(TessResultRenderer *renderer, TessBaseAPI *api) {
  const auto added = renderer->AddImage(api);
  return static_cast<BOOL>(added);
}
// Calls renderer->EndDocument() and returns its result as a BOOL.
BOOL TessResultRendererEndDocument(TessResultRenderer *renderer) {
  const auto finished = renderer->EndDocument();
  return static_cast<BOOL>(finished);
}
// Returns renderer->file_extension().
// NOTE: "Extention" is misspelled, but it is part of the exported C name and
// cannot be corrected without breaking callers.
const char *TessResultRendererExtention(TessResultRenderer *renderer) {
return renderer->file_extension();
}
// Returns renderer->title().
const char *TessResultRendererTitle(TessResultRenderer *renderer) {
return renderer->title();
}
// Returns renderer->imagenum().
int TessResultRendererImageNum(TessResultRenderer *renderer) {
return renderer->imagenum();
}
// Allocates a default-constructed TessBaseAPI with new.
// TessBaseAPIDelete is the matching release function in this file.
TessBaseAPI *TessBaseAPICreate() {
return new TessBaseAPI;
}
// Destroys an API instance with delete; pairs with TessBaseAPICreate.
void TessBaseAPIDelete(TessBaseAPI *handle) {
delete handle;
}
// Forwards to the static TessBaseAPI::getOpenCLDevice(device).
// The handle argument is accepted but unused.
size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI * /*handle*/, void **device) {
return TessBaseAPI::getOpenCLDevice(device);
}
// Forwards to handle->SetInputName(name).
void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name) {
handle->SetInputName(name);
}
// Returns handle->GetInputName().
const char *TessBaseAPIGetInputName(TessBaseAPI *handle) {
return handle->GetInputName();
}
// Forwards to handle->SetInputImage(pix).
void TessBaseAPISetInputImage(TessBaseAPI *handle, Pix *pix) {
handle->SetInputImage(pix);
}
// Returns handle->GetInputImage().
Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle) {
return handle->GetInputImage();
}
// Returns handle->GetSourceYResolution().
int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle) {
return handle->GetSourceYResolution();
}
// Returns handle->GetDatapath().
const char *TessBaseAPIGetDatapath(TessBaseAPI *handle) {
return handle->GetDatapath();
}
// Forwards to handle->SetOutputName(name).
void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name) {
handle->SetOutputName(name);
}
// Calls handle->SetVariable(name, value) and returns its result as a BOOL.
BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, const char *value) {
  const auto accepted = handle->SetVariable(name, value);
  return static_cast<BOOL>(accepted);
}
// Calls handle->SetDebugVariable(name, value) and returns its result as a BOOL.
BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, const char *value) {
  const auto accepted = handle->SetDebugVariable(name, value);
  return static_cast<BOOL>(accepted);
}
// Calls handle->GetIntVariable(name, value) and returns its result as a BOOL.
// The int is written through value by the wrapped call.
BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, const char *name, int *value) {
  const auto found = handle->GetIntVariable(name, value);
  return static_cast<BOOL>(found);
}
// Looks up a boolean parameter via handle->GetBoolVariable(name, ...).
// Returns the lookup result as a BOOL. On success the parameter's value is
// stored in *value as 0 or 1.
// value may be nullptr, in which case only the lookup result is reported;
// this matches how the other BOOL out-parameters in this file are treated.
BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, const char *name, BOOL *value) {
  // Defined starting value so the local is never read while indeterminate.
  bool boolValue = false;
  const bool found = handle->GetBoolVariable(name, &boolValue);
  if (found && value != nullptr) {
    *value = static_cast<BOOL>(boolValue);
  }
  return static_cast<BOOL>(found);
}
// Calls handle->GetDoubleVariable(name, value) and returns its result as a
// BOOL. The double is written through value by the wrapped call.
BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, const char *name, double *value) {
  const auto found = handle->GetDoubleVariable(name, value);
  return static_cast<BOOL>(found);
}
// Returns handle->GetStringVariable(name).
const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, const char *name) {
return handle->GetStringVariable(name);
}
// Forwards to handle->PrintVariables(fp); the caller supplies and owns fp.
void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp) {
handle->PrintVariables(fp);
}
// Opens filename for writing ("w" mode), passes the stream to
// handle->PrintVariables() and closes it again.
// Returns FALSE when the file cannot be opened, TRUE otherwise.
BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, const char *filename) {
  FILE *out = fopen(filename, "w");
  if (out == nullptr) {
    return FALSE;
  }
  handle->PrintVariables(out);
  fclose(out);
  return TRUE;
}
// Initializes the API through the most general handle->Init() overload.
// vars_vec / vars_values are parallel C arrays of vars_vec_size entries; they
// are copied into std::string vectors only when both pointers are non-null,
// otherwise Init() receives two empty vectors.
// set_only_non_debug_params is passed on as a bool (non-zero means true).
// Returns whatever handle->Init() returns.
int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode mode, char **configs, int configs_size, char **vars_vec,
                     char **vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params) {
  std::vector<std::string> names;
  std::vector<std::string> values;
  const bool have_vars = vars_vec != nullptr && vars_values != nullptr;
  for (size_t idx = 0; have_vars && idx < vars_vec_size; ++idx) {
    names.emplace_back(vars_vec[idx]);
    values.emplace_back(vars_values[idx]);
  }
  const bool non_debug_only = set_only_non_debug_params != 0;
  return handle->Init(datapath, language, mode, configs, configs_size, &names, &values,
                      non_debug_only);
}
// Calls handle->Init() with the given config list, no variable overrides
// (both vector pointers are nullptr) and set_only_non_debug_params = false.
int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, const char *language,
TessOcrEngineMode oem, char **configs, int configs_size) {
return handle->Init(datapath, language, oem, configs, configs_size, nullptr, nullptr, false);
}
// Returns handle->Init(datapath, language, oem).
int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, const char *language,
TessOcrEngineMode oem) {
return handle->Init(datapath, language, oem);
}
// Returns handle->Init(datapath, language).
int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, const char *language) {
return handle->Init(datapath, language);
}
// Returns handle->GetInitLanguagesAsString().
const char *TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI *handle) {
return handle->GetInitLanguagesAsString();
}
// Returns the languages reported by handle->GetLoadedLanguagesAsVector() as a
// nullptr-terminated array of C strings.
// The array and every string are allocated with new[] so that they can be
// released with TessDeleteTextArray, which uses delete[]. (The strings used to
// come from strdup, i.e. malloc, which must not be passed to delete[].)
char **TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI *handle) {
  std::vector<std::string> languages;
  handle->GetLoadedLanguagesAsVector(&languages);
  const size_t count = languages.size();
  char **arr = new char *[count + 1];
  for (size_t i = 0; i < count; ++i) {
    const std::string &language = languages[i];
    // Copy size() + 1 bytes so the terminating NUL is included.
    arr[i] = new char[language.size() + 1];
    std::memcpy(arr[i], language.c_str(), language.size() + 1);
  }
  arr[count] = nullptr;
  return arr;
}
// Returns the languages reported by handle->GetAvailableLanguagesAsVector() as
// a nullptr-terminated array of C strings.
// The array and every string are allocated with new[] so that they can be
// released with TessDeleteTextArray, which uses delete[]. (The strings used to
// come from strdup, i.e. malloc, which must not be passed to delete[].)
char **TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI *handle) {
  std::vector<std::string> languages;
  handle->GetAvailableLanguagesAsVector(&languages);
  const size_t count = languages.size();
  char **arr = new char *[count + 1];
  for (size_t i = 0; i < count; ++i) {
    const std::string &language = languages[i];
    // Copy size() + 1 bytes so the terminating NUL is included.
    arr[i] = new char[language.size() + 1];
    std::memcpy(arr[i], language.c_str(), language.size() + 1);
  }
  arr[count] = nullptr;
  return arr;
}
#ifndef DISABLED_LEGACY_ENGINE
// Returns handle->InitLangMod(datapath, language).
// Only compiled when DISABLED_LEGACY_ENGINE is not defined.
int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath, const char *language) {
return handle->InitLangMod(datapath, language);
}
#endif
// Forwards to handle->InitForAnalysePage().
void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle) {
handle->InitForAnalysePage();
}
// Forwards to handle->ReadConfigFile(filename).
void TessBaseAPIReadConfigFile(TessBaseAPI *handle, const char *filename) {
handle->ReadConfigFile(filename);
}
// Forwards to handle->ReadDebugConfigFile(filename).
void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, const char *filename) {
handle->ReadDebugConfigFile(filename);
}
// Forwards to handle->SetPageSegMode(mode).
void TessBaseAPISetPageSegMode(TessBaseAPI *handle, TessPageSegMode mode) {
handle->SetPageSegMode(mode);
}
// Returns handle->GetPageSegMode().
TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle) {
return handle->GetPageSegMode();
}
// Returns handle->TesseractRect(...) for the given raw image buffer and
// rectangle; all arguments are passed through unchanged.
char *TessBaseAPIRect(TessBaseAPI *handle, const unsigned char *imagedata, int bytes_per_pixel,
int bytes_per_line, int left, int top, int width, int height) {
return handle->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width,
height);
}
#ifndef DISABLED_LEGACY_ENGINE
// Forwards to handle->ClearAdaptiveClassifier().
// Only compiled when DISABLED_LEGACY_ENGINE is not defined.
void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle) {
handle->ClearAdaptiveClassifier();
}
#endif
// Forwards a raw image buffer and its geometry to handle->SetImage(); all
// arguments are passed through unchanged.
void TessBaseAPISetImage(TessBaseAPI *handle, const unsigned char *imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line) {
handle->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
}
// Forwards to the Pix overload of handle->SetImage().
void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix) {
  handle->SetImage(pix);
}
// Forwards to handle->SetSourceResolution(ppi).
void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi) {
handle->SetSourceResolution(ppi);
}
// Forwards to handle->SetRectangle(left, top, width, height).
void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, int width, int height) {
handle->SetRectangle(left, top, width, height);
}
// Returns handle->GetThresholdedImage().
struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle) {
return handle->GetThresholdedImage();
}
// Forwards to the static TessBaseAPI::ClearPersistentCache().
// The handle argument is accepted but unused.
void TessBaseAPIClearPersistentCache(TessBaseAPI * /*handle*/) {
TessBaseAPI::ClearPersistentCache();
}
#ifndef DISABLED_LEGACY_ENGINE
// Calls handle->DetectOrientationScript() with the four output pointers and
// returns its result as a BOOL.
// Only compiled when DISABLED_LEGACY_ENGINE is not defined.
BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, int *orient_deg, float *orient_conf,
                                        const char **script_name, float *script_conf) {
  return static_cast<BOOL>(
      handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf));
}
#endif
// Returns handle->GetRegions(pixa).
struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, struct Pixa **pixa) {
return handle->GetRegions(pixa);
}
// Returns handle->GetTextlines(pixa, blockids).
struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
return handle->GetTextlines(pixa, blockids);
}
// Returns the five-argument handle->GetTextlines(); raw_image is passed as a
// bool (any non-zero value counts as true).
struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, const BOOL raw_image,
const int raw_padding, struct Pixa **pixa, int **blockids,
int **paraids) {
return handle->GetTextlines(raw_image != 0, raw_padding, pixa, blockids, paraids);
}
// Returns handle->GetStrips(pixa, blockids).
struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
return handle->GetStrips(pixa, blockids);
}
// Returns handle->GetWords(pixa).
struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, struct Pixa **pixa) {
return handle->GetWords(pixa);
}
// Returns handle->GetConnectedComponents(cc).
struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, struct Pixa **cc) {
return handle->GetConnectedComponents(cc);
}
// Returns the four-argument handle->GetComponentImages(); text_only is passed
// as a bool (any non-zero value counts as true).
struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, TessPageIteratorLevel level,
                                           BOOL text_only, struct Pixa **pixa, int **blockids) {
  const bool only_text = text_only != 0;
  return handle->GetComponentImages(level, only_text, pixa, blockids);
}
// Returns the seven-argument handle->GetComponentImages(); text_only and
// raw_image are passed as bools (any non-zero value counts as true).
struct Boxa *TessBaseAPIGetComponentImages1(TessBaseAPI *handle, const TessPageIteratorLevel level,
                                            const BOOL text_only, const BOOL raw_image,
                                            const int raw_padding, struct Pixa **pixa,
                                            int **blockids, int **paraids) {
  const bool only_text = text_only != 0;
  const bool use_raw_image = raw_image != 0;
  return handle->GetComponentImages(level, only_text, use_raw_image, raw_padding, pixa, blockids,
                                    paraids);
}
// Returns handle->GetThresholdedImageScaleFactor().
int TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI *handle) {
return handle->GetThresholdedImageScaleFactor();
}
// Returns handle->AnalyseLayout().
TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle) {
return handle->AnalyseLayout();
}
// Returns handle->Recognize(monitor).
int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor) {
return handle->Recognize(monitor);
}
// Calls handle->ProcessPages() with the arguments unchanged and returns its
// result as a BOOL.
BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, const char *retry_config,
                             int timeout_millisec, TessResultRenderer *renderer) {
  const auto processed = handle->ProcessPages(filename, retry_config, timeout_millisec, renderer);
  return static_cast<BOOL>(processed);
}
// Calls handle->ProcessPage() with the arguments unchanged and returns its
// result as a BOOL.
BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, int page_index,
                            const char *filename, const char *retry_config, int timeout_millisec,
                            TessResultRenderer *renderer) {
  const auto processed =
      handle->ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, renderer);
  return static_cast<BOOL>(processed);
}
// Returns handle->GetIterator().
TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle) {
return handle->GetIterator();
}
// Returns handle->GetMutableIterator().
TessMutableIterator *TessBaseAPIGetMutableIterator(TessBaseAPI *handle) {
return handle->GetMutableIterator();
}
// Returns handle->GetUTF8Text().
char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle) {
return handle->GetUTF8Text();
}
// Returns handle->GetHOCRText(nullptr, page_number), i.e. the hOCR text for
// the page with no progress monitor supplied.
char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number) {
return handle->GetHOCRText(nullptr, page_number);
}
// Returns handle->GetAltoText(page_number).
char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {
return handle->GetAltoText(page_number);
}
// Returns handle->GetTSVText(page_number).
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
return handle->GetTSVText(page_number);
}
// Returns handle->GetBoxText(page_number).
char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number) {
return handle->GetBoxText(page_number);
}
// Returns handle->GetWordStrBoxText(page_number).
char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, int page_number) {
return handle->GetWordStrBoxText(page_number);
}
// Returns handle->GetLSTMBoxText(page_number).
char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number) {
return handle->GetLSTMBoxText(page_number);
}
// Returns handle->GetUNLVText().
char *TessBaseAPIGetUNLVText(TessBaseAPI *handle) {
return handle->GetUNLVText();
}
// Returns handle->MeanTextConf().
int TessBaseAPIMeanTextConf(TessBaseAPI *handle) {
return handle->MeanTextConf();
}
// Returns handle->AllWordConfidences().
int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle) {
return handle->AllWordConfidences();
}
#ifndef DISABLED_LEGACY_ENGINE
// Calls handle->AdaptToWordStr(mode, wordstr) and returns its result as a BOOL.
// Only compiled when DISABLED_LEGACY_ENGINE is not defined.
BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, TessPageSegMode mode, const char *wordstr) {
  const auto adapted = handle->AdaptToWordStr(mode, wordstr);
  return static_cast<BOOL>(adapted);
}
#endif
// Forwards to handle->Clear().
void TessBaseAPIClear(TessBaseAPI *handle) {
handle->Clear();
}
// Forwards to handle->End().
void TessBaseAPIEnd(TessBaseAPI *handle) {
handle->End();
}
// Returns handle->IsValidWord(word).
int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word) {
return handle->IsValidWord(word);
}
// Calls handle->GetTextDirection(out_offset, out_slope) and returns its result
// as a BOOL.
BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, float *out_slope) {
  const auto found = handle->GetTextDirection(out_offset, out_slope);
  return static_cast<BOOL>(found);
}
// Returns handle->GetUnichar(unichar_id).
const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id) {
return handle->GetUnichar(unichar_id);
}
// Forwards to handle->set_min_orientation_margin(margin).
void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, double margin) {
handle->set_min_orientation_margin(margin);
}
// Returns handle->NumDawgs().
int TessBaseAPINumDawgs(const TessBaseAPI *handle) {
return handle->NumDawgs();
}
// Returns handle->oem().
TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle) {
return handle->oem();
}
// Forwards to handle->GetBlockTextOrientations(block_orientation,
// vertical_writing). Note that vertical_writing is a C++ bool**, not BOOL**.
void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, int **block_orientation,
bool **vertical_writing) {
handle->GetBlockTextOrientations(block_orientation, vertical_writing);
}
// Destroys a page iterator with delete.
void TessPageIteratorDelete(TessPageIterator *handle) {
delete handle;
}
// Returns a copy-constructed TessPageIterator allocated with new.
// TessPageIteratorDelete is the matching release function in this file.
TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle) {
return new TessPageIterator(*handle);
}
// Forwards to handle->Begin().
void TessPageIteratorBegin(TessPageIterator *handle) {
handle->Begin();
}
// Calls handle->Next(level) and returns its result as a BOOL.
BOOL TessPageIteratorNext(TessPageIterator *handle, TessPageIteratorLevel level) {
  const auto advanced = handle->Next(level);
  return static_cast<BOOL>(advanced);
}
// Calls handle->IsAtBeginningOf(level) and returns its result as a BOOL.
BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, TessPageIteratorLevel level) {
  const auto at_beginning = handle->IsAtBeginningOf(level);
  return static_cast<BOOL>(at_beginning);
}
// Calls handle->IsAtFinalElement(level, element) and returns its result as a
// BOOL.
BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, TessPageIteratorLevel level,
                                      TessPageIteratorLevel element) {
  const auto at_final = handle->IsAtFinalElement(level, element);
  return static_cast<BOOL>(at_final);
}
// Calls handle->BoundingBox() with the four output pointers and returns its
// result as a BOOL.
BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, TessPageIteratorLevel level,
                                 int *left, int *top, int *right, int *bottom) {
  const auto has_box = handle->BoundingBox(level, left, top, right, bottom);
  return static_cast<BOOL>(has_box);
}
// Returns handle->BlockType().
TessPolyBlockType TessPageIteratorBlockType(const TessPageIterator *handle) {
return handle->BlockType();
}
// Returns handle->GetBinaryImage(level).
struct Pix *TessPageIteratorGetBinaryImage(const TessPageIterator *handle,
TessPageIteratorLevel level) {
return handle->GetBinaryImage(level);
}
// Returns handle->GetImage(level, padding, original_image, left, top); all
// arguments are passed through unchanged.
struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, TessPageIteratorLevel level,
int padding, struct Pix *original_image, int *left, int *top) {
return handle->GetImage(level, padding, original_image, left, top);
}
// Calls handle->Baseline() with the four output pointers and returns its
// result as a BOOL.
BOOL TessPageIteratorBaseline(const TessPageIterator *handle, TessPageIteratorLevel level, int *x1,
                              int *y1, int *x2, int *y2) {
  const auto has_baseline = handle->Baseline(level, x1, y1, x2, y2);
  return static_cast<BOOL>(has_baseline);
}
// Forwards the four output pointers to handle->Orientation().
void TessPageIteratorOrientation(TessPageIterator *handle, TessOrientation *orientation,
TessWritingDirection *writing_direction,
TessTextlineOrder *textline_order, float *deskew_angle) {
handle->Orientation(orientation, writing_direction, textline_order, deskew_angle);
}
// Calls handle->ParagraphInfo() and converts its two bool outputs to BOOL.
// justification and first_line_indent are passed straight through.
// is_list_item and is_crown may each be nullptr, in which case that value is
// simply not reported.
void TessPageIteratorParagraphInfo(TessPageIterator *handle,
                                   TessParagraphJustification *justification, BOOL *is_list_item,
                                   BOOL *is_crown, int *first_line_indent) {
  // Start from defined values: should ParagraphInfo() return without writing
  // an output, the copies below would otherwise read an indeterminate bool.
  bool bool_is_list_item = false;
  bool bool_is_crown = false;
  handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);
  if (is_list_item != nullptr) {
    *is_list_item = static_cast<BOOL>(bool_is_list_item);
  }
  if (is_crown != nullptr) {
    *is_crown = static_cast<BOOL>(bool_is_crown);
  }
}
// Destroys a result iterator with delete.
void TessResultIteratorDelete(TessResultIterator *handle) {
delete handle;
}
// Returns a copy-constructed TessResultIterator allocated with new.
// TessResultIteratorDelete is the matching release function in this file.
TessResultIterator *TessResultIteratorCopy(const TessResultIterator *handle) {
return new TessResultIterator(*handle);
}
// Returns the same object viewed as a TessPageIterator (implicit pointer
// conversion). No new object is created, so the result must not be deleted
// separately from handle.
TessPageIterator *TessResultIteratorGetPageIterator(TessResultIterator *handle) {
return handle;
}
// Const variant: returns the same object viewed as a const TessPageIterator
// (implicit pointer conversion). No new object is created.
const TessPageIterator *TessResultIteratorGetPageIteratorConst(const TessResultIterator *handle) {
return handle;
}
// Allocates a TessChoiceIterator constructed from *handle with new.
// TessChoiceIteratorDelete is the matching release function in this file.
TessChoiceIterator *TessResultIteratorGetChoiceIterator(const TessResultIterator *handle) {
return new TessChoiceIterator(*handle);
}
// Calls handle->Next(level) and returns its result as a BOOL.
BOOL TessResultIteratorNext(TessResultIterator *handle, TessPageIteratorLevel level) {
  const auto advanced = handle->Next(level);
  return static_cast<BOOL>(advanced);
}
// Returns handle->GetUTF8Text(level).
char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, TessPageIteratorLevel level) {
return handle->GetUTF8Text(level);
}
// Returns handle->Confidence(level).
float TessResultIteratorConfidence(const TessResultIterator *handle, TessPageIteratorLevel level) {
return handle->Confidence(level);
}
// Returns handle->WordRecognitionLanguage().
const char *TessResultIteratorWordRecognitionLanguage(const TessResultIterator *handle) {
return handle->WordRecognitionLanguage();
}
const char *TessResultIteratorWordFontAttributes(const TessResultIterator *handle, BOOL *is_bold,
BOOL *is_italic, BOOL *is_underlined,
BOOL *is_monospace, BOOL *is_serif,
BOOL *is_smallcaps, int *pointsize, int *font_id) {
bool bool_is_bold;
bool bool_is_italic;
bool bool_is_underlined;
bool bool_is_monospace;
bool bool_is_serif;
bool bool_is_smallcaps;
const char *ret = handle->WordFontAttributes(&bool_is_bold, &bool_is_italic, &bool_is_underlined,
&bool_is_monospace, &bool_is_serif,
&bool_is_smallcaps, pointsize, font_id);
if (is_bold != nullptr) {
*is_bold = static_cast<int>(bool_is_bold);
}
if (is_italic != nullptr) {
*is_italic = static_cast<int>(bool_is_italic);
}
if (is_underlined != nullptr) {
*is_underlined = static_cast<int>(bool_is_underlined);
}
if (is_monospace != nullptr) {
*is_monospace = static_cast<int>(bool_is_monospace);
}
if (is_serif != nullptr) {
*is_serif = static_cast<int>(bool_is_serif);
}
if (is_smallcaps != nullptr) {
*is_smallcaps = static_cast<int>(bool_is_smallcaps);
}
return ret;
}
// Calls handle->WordIsFromDictionary() and returns its result as a BOOL.
BOOL TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle) {
  const auto in_dictionary = handle->WordIsFromDictionary();
  return static_cast<BOOL>(in_dictionary);
}
// Calls handle->WordIsNumeric() and returns its result as a BOOL.
BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle) {
  const auto numeric = handle->WordIsNumeric();
  return static_cast<BOOL>(numeric);
}
// Calls handle->SymbolIsSuperscript() and returns its result as a BOOL.
BOOL TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle) {
  const auto superscript = handle->SymbolIsSuperscript();
  return static_cast<BOOL>(superscript);
}
// Calls handle->SymbolIsSubscript() and returns its result as a BOOL.
BOOL TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle) {
  const auto subscript = handle->SymbolIsSubscript();
  return static_cast<BOOL>(subscript);
}
// Calls handle->SymbolIsDropcap() and returns its result as a BOOL.
BOOL TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle) {
  const auto dropcap = handle->SymbolIsDropcap();
  return static_cast<BOOL>(dropcap);
}
// Destroys a choice iterator with delete; pairs with
// TessResultIteratorGetChoiceIterator, which allocates with new.
void TessChoiceIteratorDelete(TessChoiceIterator *handle) {
delete handle;
}
// Calls handle->Next() and returns its result as a BOOL.
BOOL TessChoiceIteratorNext(TessChoiceIterator *handle) {
  const auto advanced = handle->Next();
  return static_cast<BOOL>(advanced);
}
// Returns handle->GetUTF8Text().
const char *TessChoiceIteratorGetUTF8Text(const TessChoiceIterator *handle) {
return handle->GetUTF8Text();
}
// Returns handle->Confidence().
float TessChoiceIteratorConfidence(const TessChoiceIterator *handle) {
return handle->Confidence();
}
// Allocates a value-initialized ETEXT_DESC with new.
// TessMonitorDelete is the matching release function in this file.
ETEXT_DESC *TessMonitorCreate() {
return new ETEXT_DESC();
}
// Destroys a monitor with delete; pairs with TessMonitorCreate.
void TessMonitorDelete(ETEXT_DESC *monitor) {
delete monitor;
}
// Stores cancelFunc in monitor->cancel.
void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, TessCancelFunc cancelFunc) {
monitor->cancel = cancelFunc;
}
// Stores cancelThis in monitor->cancel_this.
void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis) {
monitor->cancel_this = cancelThis;
}
// Returns monitor->cancel_this.
void *TessMonitorGetCancelThis(ETEXT_DESC *monitor) {
return monitor->cancel_this;
}
// Stores progressFunc in monitor->progress_callback2.
void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, TessProgressFunc progressFunc) {
monitor->progress_callback2 = progressFunc;
}
// Returns monitor->progress.
int TessMonitorGetProgress(ETEXT_DESC *monitor) {
return monitor->progress;
}
// Forwards to monitor->set_deadline_msecs(deadline).
void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline) {
monitor->set_deadline_msecs(deadline);
}

View File

@ -0,0 +1,489 @@
/**********************************************************************
* File: hocrrenderer.cpp
* Description: Simple API for calling tesseract.
* Author: Ray Smith (original code from baseapi.cpp)
* Author: Stefan Weil (moved to separate file and cleaned code)
*
* (C) Copyright 2006, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <tesseract/baseapi.h> // for TessBaseAPI
#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include <tesseract/renderer.h>
#include "tesseractclass.h" // for Tesseract
namespace tesseract {
/**
 * Returns the orientation that the iterator reports for its current position.
 * The other values produced by PageIterator::Orientation() are discarded.
 */
static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
  // Orientation() fills four outputs; only the first one is of interest here.
  tesseract::Orientation block_orientation;
  tesseract::WritingDirection unused_direction;
  tesseract::TextlineOrder unused_order;
  float unused_deskew;
  it->Orientation(&block_orientation, &unused_direction, &unused_order, &unused_deskew);
  return block_orientation;
}
/**
* Fits a line to the baseline at the given level, and appends its coefficients
* to the hOCR string.
* NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
* rotated textlines. For this reason, on textlines that are not upright, this
* method currently only inserts a 'textangle' property to indicate the rotation
* direction and does not add any baseline information to the hocr string.
*/
static void AddBaselineCoordsTohOCR(const PageIterator *it, PageIteratorLevel level,
std::stringstream &hocr_str) {
tesseract::Orientation orientation = GetBlockTextOrientation(it);
if (orientation != ORIENTATION_PAGE_UP) {
hocr_str << "; textangle " << 360 - orientation * 90;
return;
}
int left, top, right, bottom;
it->BoundingBox(level, &left, &top, &right, &bottom);
// Try to get the baseline coordinates at this level.
int x1, y1, x2, y2;
if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
return;
}
// Following the description of this field of the hOCR spec, we convert the
// baseline coordinates so that "the bottom left of the bounding box is the
// origin".
x1 -= left;
x2 -= left;
y1 -= bottom;
y2 -= bottom;
// Now fit a line through the points so we can extract coefficients for the
// equation: y = p1 x + p0
if (x1 == x2) {
// Problem computing the polynomial coefficients.
return;
}
double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
double p0 = y1 - p1 * x1;
hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " << round(p0 * 1000.0) / 1000.0;
}
/**
 * Appends a title attribute with the bounding box of the element at the given
 * level to the hOCR string and closes the opening tag.
 * For textlines the attribute additionally carries the baseline (or text
 * angle) and the x_size, x_descenders and x_ascenders row measures.
 */
static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
                         std::stringstream &hocr_str) {
  int box_left, box_top, box_right, box_bottom;
  it->BoundingBox(level, &box_left, &box_top, &box_right, &box_bottom);
  // This is the only place we use double quotes instead of single quotes,
  // but it may be too late to change that for consistency.
  hocr_str << " title=\"bbox " << box_left << " " << box_top << " " << box_right << " "
           << box_bottom;
  if (level == RIL_TEXTLINE) {
    // Baseline coordinates and heights are reported for textlines only.
    AddBaselineCoordsTohOCR(it, level, hocr_str);
    // Custom height measures taken from the row attributes.
    float line_height, line_descenders, line_ascenders;
    it->RowAttributes(&line_height, &line_descenders, &line_ascenders);
    // TODO(rays): Do we want to limit these to a single decimal place?
    hocr_str << "; x_size " << line_height << "; x_descenders " << -line_descenders
             << "; x_ascenders " << line_ascenders;
  }
  hocr_str << "\">";
}
/**
 * Make a HTML-formatted string with hOCR markup from the internal
 * data structures.
 * Convenience overload without a progress monitor: it forwards to
 * GetHOCRText(nullptr, page_number).
 * page_number is 0-based but will appear in the output as 1-based.
 * Image name/input_file_ can be set by SetInputName before calling
 * GetHOCRText.
 * Returned string must be freed with the delete [] operator.
 */
char *TessBaseAPI::GetHOCRText(int page_number) {
return GetHOCRText(nullptr, page_number);
}
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
* Image name/input_file_ can be set by SetInputName before calling
* GetHOCRText
* STL removed from original patch submission and refactored by rays.
* Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
return nullptr;
}
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool para_is_ltr = true; // Default direction is LTR
const char *paragraph_lang = nullptr;
bool font_info = false;
bool hocr_boxes = false;
GetBoolVariable("hocr_font_info", &font_info);
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
if (input_file_.empty()) {
SetInputName(nullptr);
}
#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
int utf8_len =
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif
std::stringstream hocr_str;
// Use "C" locale (needed for double values x_size and x_descenders).
hocr_str.imbue(std::locale::classic());
// Use 8 digits for double values.
hocr_str.precision(8);
hocr_str << " <div class='ocr_page'"
<< " id='"
<< "page_" << page_id << "'"
<< " title='image \"";
if (!input_file_.empty()) {
hocr_str << HOcrEscape(input_file_.c_str());
} else {
hocr_str << "unknown";
}
hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " << rect_width_ << " "
<< rect_height_ << "; ppageno " << page_number << "'>\n";
std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}
// Open any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
para_is_ltr = true; // reset to default direction
hocr_str << " <div class='ocr_carea'"
<< " id='"
<< "block_" << page_id << "_" << bcnt << "'";
AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
hocr_str << "\n <p class='ocr_par'";
para_is_ltr = res_it->ParagraphIsLtr();
if (!para_is_ltr) {
hocr_str << " dir='rtl'";
}
hocr_str << " id='"
<< "par_" << page_id << "_" << pcnt << "'";
paragraph_lang = res_it->WordRecognitionLanguage();
if (paragraph_lang) {
hocr_str << " lang='" << paragraph_lang << "'";
}
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
hocr_str << "\n <span class='";
switch (res_it->BlockType()) {
case PT_HEADING_TEXT:
hocr_str << "ocr_header";
break;
case PT_PULLOUT_TEXT:
hocr_str << "ocr_textfloat";
break;
case PT_CAPTION_TEXT:
hocr_str << "ocr_caption";
break;
default:
hocr_str << "ocr_line";
}
hocr_str << "' id='"
<< "line_" << page_id << "_" << lcnt << "'";
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
}
// Now, process the word...
int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *rawTimestepMap = nullptr;
std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
if (lstm_choice_mode) {
CTCMap = res_it->GetBestLSTMSymbolChoices();
rawTimestepMap = res_it->GetRawLSTMTimesteps();
}
hocr_str << "\n <span class='ocrx_word'"
<< " id='"
<< "word_" << page_id << "_" << wcnt << "'";
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
const char *font_name;
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
&smallcaps, &pointsize, &font_id);
hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom
<< "; x_wconf " << static_cast<int>(res_it->Confidence(RIL_WORD));
if (font_info) {
if (font_name) {
hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
}
hocr_str << "; x_fsize " << pointsize;
}
hocr_str << "'";
const char *lang = res_it->WordRecognitionLanguage();
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
hocr_str << " lang='" << lang << "'";
}
switch (res_it->WordDirection()) {
// Only emit direction if different from current paragraph direction
case DIR_LEFT_TO_RIGHT:
if (!para_is_ltr) {
hocr_str << " dir='ltr'";
}
break;
case DIR_RIGHT_TO_LEFT:
if (para_is_ltr) {
hocr_str << " dir='rtl'";
}
break;
case DIR_MIX:
case DIR_NEUTRAL:
default: // Do nothing.
break;
}
hocr_str << ">";
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
if (bold) {
hocr_str << "<strong>";
}
if (italic) {
hocr_str << "<em>";
}
do {
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
if (grapheme && grapheme[0] != 0) {
if (hocr_boxes) {
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " << left << " " << top
<< " " << right << " " << bottom << "; x_conf " << res_it->Confidence(RIL_SYMBOL)
<< "'>";
}
hocr_str << HOcrEscape(grapheme.get()).c_str();
if (hocr_boxes) {
hocr_str << "</span>";
tesseract::ChoiceIterator ci(*res_it);
if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
std::vector<std::vector<std::pair<const char *, float>>> *symbol = ci.Timesteps();
hocr_str << "\n <span class='ocr_symbol'"
<< " id='"
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
for (auto timestep : *symbol) {
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
for (auto conf : timestep) {
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
<< " title='x_confs " << int(conf.second * 100) << "'>"
<< HOcrEscape(conf.first).c_str() << "</span>";
++ccnt;
}
hocr_str << "</span>";
++tcnt;
}
hocr_str << "\n </span>";
++scnt;
} else if (lstm_choice_mode == 2) {
tesseract::ChoiceIterator ci(*res_it);
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
do {
const char *choice = ci.GetUTF8Text();
float choiceconf = ci.Confidence();
if (choice != nullptr) {
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
<< " title='x_confs " << choiceconf << "'>" << HOcrEscape(choice).c_str()
<< "</span>";
ccnt++;
}
} while (ci.Next());
hocr_str << "\n </span>";
tcnt++;
}
}
}
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) {
hocr_str << "</em>";
}
if (bold) {
hocr_str << "</strong>";
}
// If the lstm choice mode is required it is added here
if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
for (auto symbol : *rawTimestepMap) {
hocr_str << "\n <span class='ocr_symbol'"
<< " id='"
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
for (auto timestep : symbol) {
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
for (auto conf : timestep) {
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
<< " title='x_confs " << int(conf.second * 100) << "'>"
<< HOcrEscape(conf.first).c_str() << "</span>";
++ccnt;
}
hocr_str << "</span>";
++tcnt;
}
hocr_str << "</span>";
++scnt;
}
} else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
for (auto timestep : *CTCMap) {
if (timestep.size() > 0) {
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
for (auto &j : timestep) {
float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
if (conf < 0.0f) {
conf = 0.0f;
}
if (conf > 100.0f) {
conf = 100.0f;
}
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
<< " title='x_confs " << conf << "'>" << HOcrEscape(j.first).c_str()
<< "</span>";
ccnt++;
}
hocr_str << "</span>";
tcnt++;
}
}
}
// Close ocrx_word.
if (hocr_boxes || lstm_choice_mode > 0) {
hocr_str << "\n ";
}
hocr_str << "</span>";
tcnt = 1;
ccnt = 1;
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
hocr_str << "\n </span>";
lcnt++;
}
if (last_word_in_para) {
hocr_str << "\n </p>\n";
pcnt++;
para_is_ltr = true; // back to default direction
}
if (last_word_in_block) {
hocr_str << " </div>\n";
bcnt++;
}
}
hocr_str << " </div>\n";
const std::string &text = hocr_str.str();
char *result = new char[text.length() + 1];
strcpy(result, text.c_str());
return result;
}
/**********************************************************************
* HOcr Text Renderer interface implementation
**********************************************************************/
// Convenience constructor: an hOCR renderer without font information.
// Forwards to the two-argument constructor with font_info disabled.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessHOcrRenderer(outputbase, false) {}
// Constructs an hOCR renderer writing to outputbase. font_info is stored and
// later decides whether the extra font/language capabilities are advertised
// in the document header.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}
bool TessHOcrRenderer::BeginDocumentHandler() {
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
"lang=\"en\">\n <head>\n <title>");
AppendString(title());
AppendString(
"</title>\n"
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\"/>\n"
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
"' />\n"
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
" ocr_line ocrx_word ocrp_wconf");
if (font_info_) {
AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
}
AppendString(
"'/>\n"
" </head>\n"
" <body>\n");
return true;
}
// Writes the closing </body> and </html> tags of the hOCR document.
// Always returns true.
bool TessHOcrRenderer::EndDocumentHandler() {
  static const char kDocumentFooter[] = " </body>\n</html>\n";
  AppendString(kDocumentFooter);
  return true;
}
// Appends the hOCR markup of the current page to the output.
// Returns false when the API yields no hOCR text, true otherwise.
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
  // The text buffer is released with delete[] when page_hocr goes out of scope.
  const std::unique_ptr<const char[]> page_hocr(api->GetHOCRText(imagenum()));
  if (page_hocr) {
    AppendString(page_hocr.get());
    return true;
  }
  return false;
}
} // namespace tesseract

View File

@ -0,0 +1,107 @@
/**********************************************************************
* File: lstmboxrenderer.cpp
* Description: Renderer for creating box file for LSTM training.
* based on the tsv renderer.
*
* (C) Copyright 2019, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <tesseract/baseapi.h> // for TessBaseAPI
#include <tesseract/renderer.h>
#include "tesseractclass.h" // for Tesseract
namespace tesseract {
/**
 * Create a UTF8 box file for LSTM training from the internal data structures.
 * page_number is a 0-based page index that will appear in the box file.
 * Returned string must be freed with the delete [] operator.
 * (This text describes TessBaseAPI::GetLSTMBoxText, defined further below.)
 */
// Appends the trailing fields of one box-file row to text, each preceded by a
// single space: (image_height - bottom), (right + 5), (image_height - top)
// and page_num. The y values are flipped against image_height; the caller is
// expected to have written the symbol and the left coordinate already.
static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
                         std::string &text) {
  const int fields[] = {image_height - bottom, right + 5, image_height - top, page_num};
  for (const int field : fields) {
    text.push_back(' ');
    text.append(std::to_string(field));
  }
}
char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
return nullptr;
}
std::string lstm_box_str;
bool first_word = true;
int left = 0, top = 0, right = 0, bottom = 0;
LTRResultIterator *res_it = GetLTRIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_SYMBOL)) {
res_it->Next(RIL_SYMBOL);
continue;
}
if (!first_word) {
if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
if (res_it->IsAtBeginningOf(RIL_WORD)) {
lstm_box_str += " " + std::to_string(left);
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
lstm_box_str += "\n"; // end of row for word
} // word
} else {
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
lstm_box_str += "\t " + std::to_string(left);
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
lstm_box_str += "\n"; // end of row for line
} // line
}
} // not first word
first_word = false;
// Use bounding box for whole line for everything
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
do {
lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
lstm_box_str += " " + std::to_string(left);
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
lstm_box_str += "\n"; // end of row for symbol
}
if (!first_word) { // if first_word is true => empty page
lstm_box_str += "\t " + std::to_string(left);
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
lstm_box_str += "\n"; // end of PAGE
}
char *ret = new char[lstm_box_str.length() + 1];
strcpy(ret, lstm_box_str.c_str());
delete res_it;
return ret;
}
/**********************************************************************
* LSTMBox Renderer interface implementation
**********************************************************************/
// Constructs the LSTM box renderer. outputbase and the literal "box"
// (presumably the output file extension - confirm against TessResultRenderer)
// are passed on to the TessResultRenderer base class; no other state is set.
TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "box") {}
// Appends the LSTM box text of the current page to the output.
// Returns false when the API yields no box text, true otherwise.
bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  // The text buffer is released with delete[] when page_boxes goes out of scope.
  const std::unique_ptr<const char[]> page_boxes(api->GetLSTMBoxText(imagenum()));
  if (page_boxes) {
    AppendString(page_boxes.get());
    return true;
  }
  return false;
}
} // namespace tesseract.

View File

@ -0,0 +1,63 @@
///////////////////////////////////////////////////////////////////////
// File: pdf_ttf.h
// Description: pdf.ttf (GlyphLessFont) replacement.
// Generated with: "bin2cpp pdf.ttf pdf_ttf cpp17"
// Author: Zdenko Podobny
//
// (C) Copyright 2020, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef pdf_ttf__H
#define pdf_ttf__H
#include <cstdint> // uint8_t
// Raw bytes of pdf.ttf (the GlyphLessFont), generated with bin2cpp as noted in
// the file header. Being `static` in a header, every translation unit that
// includes this file gets its own copy. Do not edit by hand; regenerate from
// pdf.ttf instead.
static const uint8_t pdf_ttf[] = {
0x0, 0x1, 0x0, 0x0, 0x0, 0xa, 0x0, 0x80, 0x0, 0x3, 0x0, 0x20, 0x4f, 0x53, 0x2f, 0x32,
0x56, 0xde, 0xc8, 0x94, 0x0, 0x0, 0x1, 0x28, 0x0, 0x0, 0x0, 0x60, 0x63, 0x6d, 0x61, 0x70,
0x0, 0xa, 0x0, 0x34, 0x0, 0x0, 0x1, 0x90, 0x0, 0x0, 0x0, 0x1e, 0x67, 0x6c, 0x79, 0x66,
0x15, 0x22, 0x41, 0x24, 0x0, 0x0, 0x1, 0xb8, 0x0, 0x0, 0x0, 0x18, 0x68, 0x65, 0x61, 0x64,
0xb, 0x78, 0xf1, 0x65, 0x0, 0x0, 0x0, 0xac, 0x0, 0x0, 0x0, 0x36, 0x68, 0x68, 0x65, 0x61,
0xc, 0x2, 0x4, 0x2, 0x0, 0x0, 0x0, 0xe4, 0x0, 0x0, 0x0, 0x24, 0x68, 0x6d, 0x74, 0x78,
0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x88, 0x0, 0x0, 0x0, 0x8, 0x6c, 0x6f, 0x63, 0x61,
0x0, 0xc, 0x0, 0x0, 0x0, 0x0, 0x1, 0xb0, 0x0, 0x0, 0x0, 0x6, 0x6d, 0x61, 0x78, 0x70,
0x0, 0x4, 0x0, 0x5, 0x0, 0x0, 0x1, 0x8, 0x0, 0x0, 0x0, 0x20, 0x6e, 0x61, 0x6d, 0x65,
0xf2, 0xeb, 0x16, 0xda, 0x0, 0x0, 0x1, 0xd0, 0x0, 0x0, 0x0, 0x4b, 0x70, 0x6f, 0x73, 0x74,
0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x2, 0x1c, 0x0, 0x0, 0x0, 0x20, 0x0, 0x1, 0x0, 0x0,
0x0, 0x1, 0x0, 0x0, 0xb0, 0x94, 0x71, 0x10, 0x5f, 0xf, 0x3c, 0xf5, 0x4, 0x7, 0x8, 0x0,
0x0, 0x0, 0x0, 0x0, 0xcf, 0x9a, 0xfc, 0x6e, 0x0, 0x0, 0x0, 0x0, 0xd4, 0xc3, 0xa7, 0xf2,
0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, 0x2, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x8, 0x0, 0xff, 0xff, 0x0, 0x0, 0x4, 0x0,
0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, 0x2, 0x0, 0x4,
0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x1, 0x90, 0x0, 0x5,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x47, 0x4f, 0x4f, 0x47, 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0xff, 0xff,
0x0, 0x0, 0x0, 0x1, 0x0, 0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x14, 0x0, 0x3, 0x0, 0x0,
0x0, 0x0, 0x0, 0x14, 0x0, 0x6, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0xc, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0,
0x8, 0x0, 0x0, 0x3, 0x0, 0x0, 0x31, 0x21, 0x11, 0x21, 0x4, 0x0, 0xfc, 0x0, 0x8, 0x0,
0x0, 0x0, 0x0, 0x3, 0x0, 0x2a, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x5, 0x0, 0x16,
0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0xb, 0x0, 0x16, 0x0, 0x3,
0x0, 0x1, 0x4, 0x9, 0x0, 0x5, 0x0, 0x16, 0x0, 0x0, 0x0, 0x56, 0x0, 0x65, 0x0, 0x72,
0x0, 0x73, 0x0, 0x69, 0x0, 0x6f, 0x0, 0x6e, 0x0, 0x20, 0x0, 0x31, 0x0, 0x2e, 0x0, 0x30,
0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x30, 0x0, 0x0, 0x1, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
#endif

View File

@ -0,0 +1,969 @@
///////////////////////////////////////////////////////////////////////
// File: pdfrenderer.cpp
// Description: PDF rendering interface to inject into TessBaseAPI
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include "pdf_ttf.h"
#include "tprintf.h"
#include <allheaders.h>
#include <tesseract/baseapi.h>
#include <tesseract/renderer.h>
#include <cmath>
#include <cstring>
#include <fstream> // for std::ifstream
#include <locale> // for std::locale::classic
#include <memory> // std::unique_ptr
#include <sstream> // for std::stringstream
#include "helpers.h" // for Swap
/*
Design notes from Ken Sharp, with light editing.
We think one solution is a font with a single glyph (.notdef) and a
CIDToGIDMap which maps all the CIDs to 0. That map would then be
stored as a stream in the PDF file, and when flat compressed should
be pretty small. The font, of course, will be approximately the same
size as the one you currently use.
I'm working on such a font now, the CIDToGIDMap is trivial, you just
create a stream object which contains 128k bytes (2 bytes per possible
CID and your CIDs range from 0 to 65535) and where you currently have
"/CIDToGIDMap /Identity" you would have "/CIDToGIDMap <object> 0 R".
Note that if, in future, you were to use a different (ie not 2 byte)
CMap for character codes you could trivially extend the CIDToGIDMap.
The following is an explanation of how some of the font stuff works,
this may be too simple for you in which case please accept my
apologies, it's hard to know how much knowledge someone has. You can
skip all this anyway, it's just for information.
The font embedded in a PDF file is usually intended just to be
rendered, but extensions allow for at least some ability to locate (or
copy) text from a document. This isn't something which was an original
goal of the PDF format, but it's been retrofitted, presumably due to
popular demand.
To do this reliably the PDF file must contain a ToUnicode CMap, a
device for mapping character codes to Unicode code points. If one of
these is present, then this will be used to convert the character
codes into Unicode values. If its not present then the reader will
fall back through a series of heuristics to try and guess the
result. This is, as you would expect, prone to failure.
This doesn't concern you of course, since you always write a ToUnicode
CMap, so because you are writing the text in text rendering mode 3 it
would seem that you don't really need to worry about this, but in the
PDF spec you cannot have an isolated ToUnicode CMap, it has to be
attached to a font, so in order to get even copy/paste to work you
need to define a font.
This is what leads to problems, tools like pdfwrite assume that they
are going to be able to (or even have to) modify the font entries, so
they require that the font being embedded be valid, and to be honest
the font Tesseract embeds isn't valid (for this purpose).
To see why lets look at how text is specified in a PDF file:
(Test) Tj
Now that looks like text but actually it isn't. Each of those bytes is
a 'character code'. When it comes to rendering the text a complex
sequence of events takes place, which converts the character code into
'something' which the font understands. Its entirely possible via
character mappings to have that text render as 'Sftu'
For simple fonts (PostScript type 1), we use the character code as the
index into an Encoding array (256 elements), each element of which is
a glyph name, so this gives us a glyph name. We then consult the
CharStrings dictionary in the font, that's a complex object which
contains pairs of keys and values, you can use the key to retrieve a
given value. So we have a glyph name, we then use that as the key to
the dictionary and retrieve the associated value. For a type 1 font,
the value is a glyph program that describes how to draw the glyph.
For CIDFonts, its a little more complicated. Because CIDFonts can be
large, using a glyph name as the key is unreasonable (it would also
lead to unfeasibly large Encoding arrays), so instead we use a 'CID'
as the key. CIDs are just numbers.
But.... We don't use the character code as the CID. What we do is use
a CMap to convert the character code into a CID. We then use the CID
to key the CharStrings dictionary and proceed as before. So the 'CMap'
is the equivalent of the Encoding array, but its a more compact and
flexible representation.
Note that you have to use the CMap just to find out how many bytes
constitute a character code, and it can be variable. For example you
can say if the first byte is 0x00->0x7f then its just one byte, if its
0x80->0xf0 then its 2 bytes and if its 0xf0->0xff then its 3 bytes. I
have seen CMaps defining character codes up to 5 bytes wide.
Now that's fine for 'PostScript' CIDFonts, but its not sufficient for
TrueType CIDFonts. The thing is that TrueType fonts are accessed using
a Glyph ID (GID) (and the LOCA table) which may well not be anything
like the CID. So for this case PDF includes a CIDToGIDMap. That maps
the CIDs to GIDs, and we can then use the GID to get the glyph
description from the GLYF table of the font.
So for a TrueType CIDFont, character-code->CID->GID->glyf-program.
Looking at the PDF file I was supplied with we see that it contains
text like :
<0x0075> Tj
So we start by taking the character code (117) and look it up in the
CMap. Well you don't supply a CMap, you just use the Identity-H one
which is predefined. So character code 117 maps to CID 117. Then we
use the CIDToGIDMap, again you don't supply one, you just use the
predefined 'Identity' map. So CID 117 maps to GID 117. But the font we
were supplied with only contains 116 glyphs.
Now for Latin that's not a huge problem, you can just supply a bigger
font. But for more complex languages that *is* going to be more of a
problem. Either you need to supply a font which contains glyphs for
all the possible CID->GID mappings, or we need to think laterally.
Our solution using a TrueType CIDFont is to intervene at the
CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a
font with just one glyph, the .notdef glyph at GID 0. This is what I'm
looking into now.
It would also be possible to have a 'PostScript' (ie type 1 outlines)
CIDFont which contained 1 glyph, and a CMap which mapped all character
codes to CID 0. The effect would be the same.
Its possible (I haven't checked) that the PostScript CIDFont and
associated CMap would be smaller than the TrueType font and associated
CIDToGIDMap.
--- in a followup ---
OK there is a small problem there, if I use GID 0 then Acrobat gets
upset about it and complains it cannot extract the font. If I set the
CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
mad......
*/
namespace tesseract {
// Divisor that relates the font size to the nominal glyph width: the font
// dictionaries written by this file advertise a width of 1000 / kCharWidth
// glyph units, and the horizontal stretch of every word is scaled by
// kCharWidth. With the value 2, a 10 pt font has a nominal character width
// of 5 pt.
static const int kCharWidth = 2;
// Size of the buffer that receives one code point in the PDF hex notation
// (e.g. "<0063>" for the letter 'c', written here without the angle
// brackets). A code point must take no more than this many bytes, including
// the terminating NUL written by snprintf.
static const int kMaxBytesPerCodepoint = 20;
/**********************************************************************
* PDF Renderer interface implementation
**********************************************************************/
// Constructs a PDF renderer writing to outputbase.
// datadir is the directory later searched for "pdf.ttf"; textonly suppresses
// the page image so that only the text layer is emitted.
TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
    : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
  textonly_ = textonly;
  // Nothing has been written yet: the running offset table starts at byte 0
  // and no PDF object has been counted.
  offsets_.push_back(0);
  obj_ = 0;
}
// Records that an object of objectsize bytes has been written by the caller:
// pushes the new running end offset and bumps the object counter. The bytes
// themselves must be emitted separately.
void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) {
  const auto end_offset = objectsize + offsets_.back();
  offsets_.push_back(end_offset);
  ++obj_;
}
// Writes a complete, NUL-terminated PDF object and registers its size in the
// offset table. data must not contain embedded NUL bytes because its length
// is taken with strlen.
void TessPDFRenderer::AppendPDFObject(const char *data) {
  const size_t data_length = strlen(data);
  AppendPDFObjectDIY(data_length);
  AppendString(data);
}
// Helper function to prevent us from accidentally writing
// scientific notation to an HOCR or PDF file. Besides, three
// decimal places are all you really need.
// Rounds x to three decimal places. A result that compares equal to zero
// (including negative zero) is returned as positive zero so that "-0" is
// never printed.
static double prec(double x) {
  constexpr double kScale = 1000.0;
  const double rounded = round(x * kScale) / kScale;
  // The comparison is also true for -0.0, which is what gets normalized here.
  return (rounded == 0) ? 0.0 : rounded;
}
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
//
// The deltas are widened to long *before* they are subtracted and squared.
// Doing the arithmetic in int overflows (undefined behavior) as soon as a
// delta exceeds 46340, which is within reach of large scanned images. On
// platforms where long is only 32 bits wide the usable range is unchanged.
static long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
// Viewers like evince can get really confused during copy-paste when
// the baseline wanders around. So I've decided to project every word
// onto the (straight) line baseline. All numbers are in the native
// PDF coordinate system, which has the origin in the bottom left and
// the unit is points, which is 1/72 inch. Tesseract reports baselines
// left-to-right no matter what the reading order is. We need the
// word baseline in reading order, so we do that conversion here. Returns
// the word's baseline origin and length.
// Projects the start of a word baseline onto the line baseline and converts
// the result to the scaled, y-flipped coordinates used by the caller.
//
// writing_direction: for WRITING_DIRECTION_RIGHT_TO_LEFT the two word
//                    baseline end points are swapped first, so that the
//                    projected point is the word's start in reading order.
// ppi:               resolution used for the pixel -> 1/72 inch conversion.
// height:            value the converted y coordinate is subtracted from
//                    (presumably the page height in points - confirm at the
//                    call site).
// word_*/line_*:     baseline end points of the word and of its line.
// x0, y0:            receive the projected, converted origin.
// length:            receives the converted length of the word baseline.
static void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1,
                            int word_x2, int word_y2, int line_x1, int line_y1, int line_x2,
                            int line_y2, double *x0, double *y0, double *length) {
  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
    std::swap(word_x1, word_x2);
    std::swap(word_y1, word_y2);
  }

  double x, y;
  const double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
  if (l2 == 0) {
    // Degenerate line baseline: nothing to project onto.
    x = line_x1;
    y = line_y1;
  } else {
    // Direction of the line baseline. The dot product below is evaluated in
    // double: evaluating it in int overflows (undefined behavior) for
    // coordinate deltas beyond 46340, while the double result is identical
    // for every input the int version could represent.
    const double dir_x = line_x2 - line_x1;
    const double dir_y = line_y2 - line_y1;
    const double t = ((word_x1 - line_x2) * dir_x + (word_y1 - line_y2) * dir_y) / l2;
    x = line_x2 + t * dir_x;
    y = line_y2 + t * dir_y;
  }

  double word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));
  word_length = word_length * 72.0 / ppi;
  x = x * 72 / ppi;
  y = height - (y * 72.0 / ppi);

  *x0 = x;
  *y0 = y;
  *length = word_length;
}
// Compute coefficients for an affine matrix describing the rotation
// of the text. If the text is right-to-left such as Arabic or Hebrew,
// we reflect over the Y-axis. This matrix will set the coordinate
// system for placing text in the PDF file.
//
// RTL
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ]
// Fills a, b, c, d with the rotation matrix for the baseline running from
// (line_x1, line_y1) to (line_x2, line_y2). For right-to-left text the first
// row (a, b) is negated; every other writing direction gets the plain
// rotation.
static void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2,
                         double *a, double *b, double *c, double *d) {
  const double rise = static_cast<double>(line_y1 - line_y2);
  const double run = static_cast<double>(line_x2 - line_x1);
  const double theta = atan2(rise, run);
  const double cos_theta = cos(theta);
  const double sin_theta = sin(theta);

  // TODO(jbreiden) Consider using the vertical PDF writing mode for
  // WRITING_DIRECTION_TOP_TO_BOTTOM; it is currently treated like the default.
  const bool mirrored = (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT);

  *a = mirrored ? -cos_theta : cos_theta;
  *b = mirrored ? -sin_theta : sin_theta;
  *c = -sin_theta;
  *d = cos_theta;
}
// There are some really awkward PDF viewers in the wild, such as
// 'Preview' which ships with the Mac. They do a better job with text
// selection and highlighting when given perfectly flat baseline
// instead of very slightly tilted. We clip small tilts to appease
// these viewers. I chose this threshold large enough to absorb noise,
// but small enough that lines probably won't cross each other if the
// whole page is tilted at almost exactly the clipping threshold.
// Copies the baseline (x1, y1)-(x2, y2) to the output parameters, flattening
// it when it is almost horizontal: if |y2 - y1| * 72 is below 2 * ppi while
// |x2 - x1| * 72 is above 2 * ppi, both output y values are set to the
// (integer) midpoint of y1 and y2.
static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,
                         int *line_x2, int *line_y2) {
  const int threshold = 2 * ppi;
  const int rise = abs(y2 - y1) * 72;
  const int run = abs(x2 - x1) * 72;
  const bool nearly_flat = (rise < threshold) && (threshold < run);

  *line_x1 = x1;
  *line_x2 = x2;
  if (nearly_flat) {
    const int mid_y = (y1 + y2) / 2;
    *line_y1 = mid_y;
    *line_y2 = mid_y;
  } else {
    *line_y1 = y1;
    *line_y2 = y2;
  }
}
// Writes the UTF-16BE encoding of code into utf16 as uppercase hex digits:
// four digits for code points below 0x10000, otherwise eight digits forming a
// surrogate pair. utf16 must provide kMaxBytesPerCodepoint bytes.
//
// Returns false (after logging) for values that are not Unicode scalar
// values: negative numbers, the surrogate range 0xD800-0xDFFF and anything
// above 0x10FFFF. Negative values have to be rejected explicitly because
// they would otherwise take the "< 0x10000" branch and be printed as eight
// hex digits of garbage.
static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
  if (code < 0 || (code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
    tprintf("Dropping invalid codepoint %d\n", code);
    return false;
  }
  if (code < 0x10000) {
    snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);
  } else {
    // Split the 20 bits above 0x10000 into a high and a low surrogate.
    int a = code - 0x010000;
    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
    int low_surrogate = (0x03FF & a) + 0xDC00;
    snprintf(utf16, kMaxBytesPerCodepoint, "%04X%04X", high_surrogate, low_surrogate);
  }
  return true;
}
// Builds the PDF content-stream text for the current page of api: a scaled
// placement of the page image (skipped when textonly_ is set) followed by
// every recognized word written in text rendering mode 3 (invisible).
// width and height scale the image and height is also used to flip the y
// axis of the word positions (presumably both are in points - confirm at the
// call site).
// Returns a NUL-terminated buffer allocated with new[].
char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
// Resolution handed to the baseline helpers for the pixel -> point conversion.
double ppi = api->GetSourceYResolution();
// These initial conditions are all arbitrary and will be overwritten
double old_x = 0.0, old_y = 0.0;
int old_fontsize = 0;
tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
bool new_block = true;
int fontsize = 0;
double a = 1;
double b = 0;
double c = 0;
double d = 1;
std::stringstream pdf_str;
// Use "C" locale (needed for double values prec()).
pdf_str.imbue(std::locale::classic());
// Use 8 digits for double values.
pdf_str.precision(8);
// TODO(jbreiden) This marries the text and image together.
// Slightly cleaner from an abstraction standpoint if this were to
// live inside a separate text object.
pdf_str << "q " << prec(width) << " 0 0 " << prec(height) << " 0 0 cm";
if (!textonly_) {
pdf_str << " /Im1 Do";
}
pdf_str << " Q\n";
// Baseline of the current text line after clipping; refreshed at every
// beginning of a line and reused for all words of that line.
int line_x1 = 0;
int line_y1 = 0;
int line_x2 = 0;
int line_y2 = 0;
// NOTE(review): res_it is dereferenced without a null check.
const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
old_fontsize = 0; // Every block will declare its fontsize
new_block = true; // Every block will declare its affine matrix
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
int x1, y1, x2, y2;
res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
}
// Nothing to write for an empty word; move on to the next one.
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}
// Writing direction changes at a per-word granularity
tesseract::WritingDirection writing_direction;
{
tesseract::Orientation orientation;
tesseract::TextlineOrder textline_order;
float deskew_angle;
res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
// Unless the text is top-to-bottom, the direction of the word itself wins;
// words without a clear direction inherit the previous word's direction.
if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT:
writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
break;
case DIR_RIGHT_TO_LEFT:
writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
break;
default:
writing_direction = old_writing_direction;
}
}
}
// Where is word origin and how long is it?
double x, y, word_length;
{
int word_x1, word_y1, word_x2, word_y2;
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
line_y1, line_x2, line_y2, &x, &y, &word_length);
}
// A new block or a change of direction needs a fresh text matrix (Tm);
// otherwise the cursor is moved relative to the previous word (Td), with the
// offset expressed through the current matrix coefficients.
if (writing_direction != old_writing_direction || new_block) {
AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
pdf_str << " " << prec(a) // . This affine matrix
<< " " << prec(b) // . sets the coordinate
<< " " << prec(c) // . system for all
<< " " << prec(d) // . text that follows.
<< " " << prec(x) // .
<< " " << prec(y) // .
<< (" Tm "); // Place cursor absolutely
new_block = false;
} else {
double dx = x - old_x;
double dy = y - old_y;
pdf_str << " " << prec(dx * a + dy * b) << " " << prec(dx * c + dy * d)
<< (" Td "); // Relative moveto
}
old_x = x;
old_y = y;
old_writing_direction = writing_direction;
// Adjust font size on a per word granularity. Pay attention to
// fontsize, old_fontsize, and pdf_str. We've found that for
// Arabic, Tesseract will happily return a fontsize of zero,
// so we make up a default number to protect ourselves.
{
bool bold, italic, underlined, monospace, serif, smallcaps;
int font_id;
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
&fontsize, &font_id);
const int kDefaultFontsize = 8;
if (fontsize <= 0) {
fontsize = kDefaultFontsize;
}
if (fontsize != old_fontsize) {
pdf_str << "/f-0-0 " << fontsize << " Tf ";
old_fontsize = fontsize;
}
}
// These flags must be read before the symbol loop below advances the
// iterator past the current word.
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
// Hex UTF-16BE text of the word and the number of code points it contains.
std::string pdf_word;
int pdf_word_len = 0;
do {
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
if (grapheme && grapheme[0] != '\0') {
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
char utf16[kMaxBytesPerCodepoint];
for (char32 code : unicodes) {
if (CodepointToUtf16be(code, utf16)) {
pdf_word += utf16;
pdf_word_len++;
}
}
}
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
// The iterator now sits on the next word: append a space (U+0020) to this
// word and count it as one more glyph.
if (res_it->IsAtBeginningOf(RIL_WORD)) {
pdf_word += "0020";
pdf_word_len++;
}
// Horizontal scaling in percent, chosen so that pdf_word_len glyphs of the
// nominal width fontsize / kCharWidth cover exactly word_length.
if (word_length > 0 && pdf_word_len > 0) {
double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
pdf_str << h_stretch << " Tz" // horizontal stretch
<< " [ <" << pdf_word // UTF-16BE representation
<< "> ] TJ"; // show the text
}
if (last_word_in_line) {
pdf_str << " \n";
}
if (last_word_in_block) {
pdf_str << "ET\n"; // end the text object
}
}
// Hand the accumulated stream back as a new[]-allocated C string.
const std::string &text = pdf_str.str();
char *result = new char[text.length() + 1];
strcpy(result, text.c_str());
return result;
}
// Writes the fixed beginning of the PDF file: the header, the catalog
// (object 1), a reserved slot for the /Pages object (object 2) and the
// objects describing the glyphless font (objects 3 to 8). The size of every
// piece is registered through AppendPDFObject / AppendPDFObjectDIY so that
// the running offset table stays in step with the bytes written.
// Always returns true.
bool TessPDFRenderer::BeginDocumentHandler() {
// The header goes through AppendPDFObject as well, so its length is part of
// the running offsets.
AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
// CATALOG
AppendPDFObject(
"1 0 obj\n"
"<<\n"
" /Type /Catalog\n"
" /Pages 2 0 R\n"
">>\nendobj\n");
// We are reserving object #2 for the /Pages
// object, which I am going to create and write
// at the end of the PDF file.
AppendPDFObject("");
// TYPE0 FONT
AppendPDFObject(
"3 0 obj\n"
"<<\n"
" /BaseFont /GlyphLessFont\n"
" /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
" /Encoding /Identity-H\n"
" /Subtype /Type0\n"
" /ToUnicode 6 0 R\n" // ToUnicode
" /Type /Font\n"
">>\n"
"endobj\n");
// CIDFONTTYPE2
std::stringstream stream;
// Use "C" locale (needed for int values larger than 999).
stream.imbue(std::locale::classic());
stream << "4 0 obj\n"
"<<\n"
" /BaseFont /GlyphLessFont\n"
" /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
" /CIDSystemInfo\n"
" <<\n"
" /Ordering (Identity)\n"
" /Registry (Adobe)\n"
" /Supplement 0\n"
" >>\n"
" /FontDescriptor 7 0 R\n" // Font descriptor
" /Subtype /CIDFontType2\n"
" /Type /Font\n"
" /DW "
<< (1000 / kCharWidth)
<< "\n"
">>\n"
"endobj\n";
AppendPDFObject(stream.str().c_str());
// CIDTOGIDMAP
// Two bytes per CID for all 65536 CIDs. Every pair is written as 0x00 0x01,
// i.e. every CID maps to GID 1 (see the design notes at the top of the file).
const int kCIDToGIDMapSize = 2 * (1 << 16);
const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
for (int i = 0; i < kCIDToGIDMapSize; i++) {
cidtogidmap[i] = (i % 2) ? 1 : 0;
}
size_t len;
// NOTE(review): comp is not checked for nullptr before it is written out.
unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
stream.str("");
stream << "5 0 obj\n"
"<<\n"
" /Length "
<< len
<< " /Filter /FlateDecode\n"
">>\n"
"stream\n";
// Objects carrying binary data are written in pieces; objsize accumulates
// the byte count that is registered with AppendPDFObjectDIY at the end.
AppendString(stream.str().c_str());
long objsize = stream.str().size();
AppendData(reinterpret_cast<char *>(comp), len);
objsize += len;
lept_free(comp);
const char *endstream_endobj =
"endstream\n"
"endobj\n";
AppendString(endstream_endobj);
objsize += strlen(endstream_endobj);
AppendPDFObjectDIY(objsize);
// Body of the ToUnicode CMap stream (object 6).
const char stream2[] =
"/CIDInit /ProcSet findresource begin\n"
"12 dict begin\n"
"begincmap\n"
"/CIDSystemInfo\n"
"<<\n"
" /Registry (Adobe)\n"
" /Ordering (UCS)\n"
" /Supplement 0\n"
">> def\n"
"/CMapName /Adobe-Identify-UCS def\n"
"/CMapType 2 def\n"
"1 begincodespacerange\n"
"<0000> <FFFF>\n"
"endcodespacerange\n"
"1 beginbfrange\n"
"<0000> <FFFF> <0000>\n"
"endbfrange\n"
"endcmap\n"
"CMapName currentdict /CMap defineresource pop\n"
"end\n"
"end\n";
// TOUNICODE
stream.str("");
// sizeof(stream2) - 1 excludes the terminating NUL of the array.
stream << "6 0 obj\n"
"<< /Length "
<< (sizeof(stream2) - 1)
<< " >>\n"
"stream\n"
<< stream2
<< "endstream\n"
"endobj\n";
AppendPDFObject(stream.str().c_str());
// FONT DESCRIPTOR
stream.str("");
stream << "7 0 obj\n"
"<<\n"
" /Ascent 1000\n"
" /CapHeight 1000\n"
" /Descent -1\n" // Spec says must be negative
" /Flags 5\n" // FixedPitch + Symbolic
" /FontBBox [ 0 0 "
<< (1000 / kCharWidth)
<< " 1000 ]\n"
" /FontFile2 8 0 R\n"
" /FontName /GlyphLessFont\n"
" /ItalicAngle 0\n"
" /StemV 80\n"
" /Type /FontDescriptor\n"
">>\n"
"endobj\n";
AppendPDFObject(stream.str().c_str());
// Prefer the font file "<datadir_>/pdf.ttf"; if it cannot be read (or is
// empty), fall back to the pdf_ttf byte array compiled into the binary.
stream.str("");
stream << datadir_.c_str() << "/pdf.ttf";
const uint8_t *font;
std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
auto size = buffer.size();
if (size) {
font = buffer.data();
} else {
// The diagnostic is only compiled into builds without NDEBUG.
#if !defined(NDEBUG)
tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
#endif
font = pdf_ttf;
size = sizeof(pdf_ttf);
}
// FONTFILE2
stream.str("");
stream << "8 0 obj\n"
"<<\n"
" /Length "
<< size
<< "\n"
" /Length1 "
<< size
<< "\n"
">>\n"
"stream\n";
// Written piecewise like object 5: dictionary, raw font bytes, trailer.
AppendString(stream.str().c_str());
objsize = stream.str().size();
AppendData(reinterpret_cast<const char *>(font), size);
objsize += size;
AppendString(endstream_endobj);
objsize += strlen(endstream_endobj);
AppendPDFObjectDIY(objsize);
return true;
}
// Builds a complete PDF image XObject ("<objnum> 0 obj ... endobj") for the
// page image and returns it as a freshly allocated byte buffer.
//
// pix / filename: the image to embed; at least one of them must be non-null.
// objnum: PDF object number written at the start of the object.
// pdf_object: receives a new char[] buffer holding the object bytes. The
// buffer is exactly *pdf_object_size bytes long and is NOT NUL-terminated;
// the caller releases it with delete[]. Set to nullptr on failure.
// pdf_object_size: receives the number of bytes stored in *pdf_object.
// jpg_quality: forwarded to l_generateCIDataForPdf.
//
// Returns false when an output pointer is missing, when both image sources
// are null, when the compressed image data cannot be generated, or when the
// encoding type / samples-per-pixel is not one of the cases handled below.
bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum,
char **pdf_object, long int *pdf_object_size,
const int jpg_quality) {
if (!pdf_object_size || !pdf_object) {
return false;
}
// Put the outputs into a defined "nothing produced" state before any work.
*pdf_object = nullptr;
*pdf_object_size = 0;
if (!filename && !pix) {
return false;
}
L_Compressed_Data *cid = nullptr;
// Status returned by the data generators; non-zero is treated as failure.
int sad = 0;
// Images that were read from PNG are first tried with Flate encoding.
if (pixGetInputFormat(pix) == IFF_PNG) {
sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
}
// Anything else (or a failed attempt above) goes through the generic path.
if (!cid) {
sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
}
if (sad || !cid) {
l_CIDataDestroy(&cid);
return false;
}
// Map the encoding of the compressed data to the PDF stream filter name.
// group4 stays empty except for CCITT G4, which needs an extra /K entry.
const char *group4 = "";
const char *filter;
switch (cid->type) {
case L_FLATE_ENCODE:
filter = "/FlateDecode";
break;
case L_JPEG_ENCODE:
filter = "/DCTDecode";
break;
case L_G4_ENCODE:
filter = "/CCITTFaxDecode";
group4 = " /K -1\n";
break;
case L_JP2K_ENCODE:
filter = "/JPXDecode";
break;
default:
l_CIDataDestroy(&cid);
return false;
}
// Maybe someday we will accept RGBA but today is not that day.
// It requires creating an /SMask for the alpha channel.
// http://stackoverflow.com/questions/14220221
std::stringstream colorspace;
// Use "C" locale (needed for int values larger than 999).
colorspace.imbue(std::locale::classic());
if (cid->ncolors > 0) {
// Palette image: indexed colorspace using the hex colormap from cid.
colorspace << " /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) << " "
<< cid->cmapdatahex << " ]\n";
} else {
switch (cid->spp) {
case 1:
// 1 bit-per-sample data that came from PNG also gets an explicit
// /Decode [1 0] array.
if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
colorspace.str(
" /ColorSpace /DeviceGray\n"
" /Decode [1 0]\n");
} else {
colorspace.str(" /ColorSpace /DeviceGray\n");
}
break;
case 3:
colorspace.str(" /ColorSpace /DeviceRGB\n");
break;
default:
// Any other samples-per-pixel count is rejected.
l_CIDataDestroy(&cid);
return false;
}
}
// /Predictor value written to /DecodeParms: 14 when cid reports a
// predictor, otherwise 1.
int predictor = (cid->predictor) ? 14 : 1;
// IMAGE
// b1: object header up to /Subtype; the colorspace text is spliced in
// between b1 and b2 when the final buffer is assembled.
std::stringstream b1;
// Use "C" locale (needed for int values larger than 999).
b1.imbue(std::locale::classic());
b1 << objnum
<< " 0 obj\n"
"<<\n"
" /Length "
<< cid->nbytescomp
<< "\n"
" /Subtype /Image\n";
// b2: remaining dictionary entries and the "stream" keyword.
std::stringstream b2;
// Use "C" locale (needed for int values larger than 999).
b2.imbue(std::locale::classic());
b2 << " /Width " << cid->w
<< "\n"
" /Height "
<< cid->h
<< "\n"
" /BitsPerComponent "
<< cid->bps
<< "\n"
" /Filter "
<< filter
<< "\n"
" /DecodeParms\n"
" <<\n"
" /Predictor "
<< predictor
<< "\n"
" /Colors "
<< cid->spp << "\n"
<< group4 << " /Columns " << cid->w
<< "\n"
" /BitsPerComponent "
<< cid->bps
<< "\n"
" >>\n"
">>\n"
"stream\n";
// b3: trailer that closes the stream and the object.
const char *b3 =
"endstream\n"
"endobj\n";
size_t b1_len = b1.str().size();
size_t b2_len = b2.str().size();
size_t b3_len = strlen(b3);
size_t colorspace_len = colorspace.str().size();
// Output layout: b1 | colorspace | b2 | compressed image bytes | b3.
*pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
*pdf_object = new char[*pdf_object_size];
char *p = *pdf_object;
memcpy(p, b1.str().c_str(), b1_len);
p += b1_len;
memcpy(p, colorspace.str().c_str(), colorspace_len);
p += colorspace_len;
memcpy(p, b2.str().c_str(), b2_len);
p += b2_len;
memcpy(p, cid->datacomp, cid->nbytescomp);
p += cid->nbytescomp;
memcpy(p, b3, b3_len);
l_CIDataDestroy(&cid);
return true;
}
// Emits the PDF objects for one page of the document:
//   1. the /Page object (media box derived from the image size and its
//      vertical resolution, 72 units per inch),
//   2. the Flate-compressed /Contents stream holding the text objects,
//   3. unless textonly_ is set, the image XObject built by imageToPDFObj.
//
// api: source of the input image, its name/resolution and the recognized text.
// Returns false when there is no input image, the resolution is not positive,
// or the image object cannot be generated; true otherwise.
bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
  Pix *pix = api->GetInputImage();
  const char *filename = api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0) {
    return false;
  }
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  std::stringstream xobject;
  // Use "C" locale (needed for int values larger than 999).
  xobject.imbue(std::locale::classic());
  if (!textonly_) {
    // The image object is written two objects after the page object.
    xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
  }

  // PAGE
  std::stringstream stream;
  // Use "C" locale (needed for double values width and height).
  stream.imbue(std::locale::classic());
  stream.precision(2);
  stream << std::fixed << obj_
         << " 0 obj\n"
            "<<\n"
            " /Type /Page\n"
            " /Parent 2 0 R\n" // Pages object
            " /MediaBox [0 0 "
         << width << " " << height
         << "]\n"
            " /Contents "
         << (obj_ + 1)
         << " 0 R\n" // Contents object
            " /Resources\n"
            " <<\n"
            " "
         << xobject.str() << // Image object
      " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
      " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
      " >>\n"
      ">>\n"
      "endobj\n";
  pages_.push_back(obj_);
  AppendPDFObject(stream.str().c_str());

  // CONTENTS
  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
  const size_t pdftext_len = strlen(pdftext.get());
  size_t len;
  unsigned char *comp_pdftext =
      zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
  long comp_pdftext_len = len;
  stream.str("");
  stream << obj_
         << " 0 obj\n"
            "<<\n"
            " /Length "
         << comp_pdftext_len
         << " /Filter /FlateDecode\n"
            ">>\n"
            "stream\n";
  AppendString(stream.str().c_str());
  // The object is written in pieces, so its total size is tracked by hand and
  // reported once through AppendPDFObjectDIY.
  long objsize = stream.str().size();
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);
  const char *b2 =
      "endstream\n"
      "endobj\n";
  AppendString(b2);
  objsize += strlen(b2);
  AppendPDFObjectDIY(objsize);

  if (!textonly_) {
    char *pdf_object = nullptr;
    // Initialized so that a failed lookup cannot pass an indeterminate value
    // on to the image encoder.
    // NOTE(review): 85 is meant to mirror Tesseract's default for the
    // jpg_quality parameter - confirm against the parameter definition.
    int jpg_quality = 85;
    api->GetIntVariable("jpg_quality", &jpg_quality);
    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
      return false;
    }
    AppendData(pdf_object, objsize);
    AppendPDFObjectDIY(objsize);
    delete[] pdf_object;
  }
  return true;
}
// Finishes the PDF: writes the deferred /Pages object, the document info
// object, the cross-reference table and the trailer. Always returns true.
bool TessPDFRenderer::EndDocumentHandler() {
// We reserved the /Pages object number early, so that the /Page
// objects could refer to their parent. We finally have enough
// information to go fill it in. Using lower level calls to manipulate
// the offset record in two spots, because we are placing objects
// out of order in the file.
// PAGES
const long int kPagesObjectNumber = 2;
// The /Pages object starts at the current end of the written data.
offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
std::stringstream stream;
// Use "C" locale (needed for int values larger than 999).
stream.imbue(std::locale::classic());
stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
AppendString(stream.str().c_str());
// Byte count of the /Pages object, accumulated piece by piece.
size_t pages_objsize = stream.str().size();
// One indirect reference per page recorded in pages_.
for (const auto &page : pages_) {
stream.str("");
stream << page << " 0 R ";
AppendString(stream.str().c_str());
pages_objsize += stream.str().size();
}
stream.str("");
stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
AppendString(stream.str().c_str());
pages_objsize += stream.str().size();
// Advance the end-of-data offset past the /Pages object just written.
offsets_.back() += pages_objsize; // manipulation #2
// INFO
// The title is emitted inside <...>, starting with the "FEFF" marker and
// followed by whatever text CodepointToUtf16be produces per code point.
std::string utf16_title = "FEFF"; // byte_order_marker
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
char utf16[kMaxBytesPerCodepoint];
for (char32 code : unicodes) {
// Code points for which the conversion reports failure are skipped.
// NOTE(review): appending utf16 as a C string presumes the helper
// NUL-terminates its output - confirm.
if (CodepointToUtf16be(code, utf16)) {
utf16_title += utf16;
}
}
// datestr is released with lept_free once it has been streamed below.
char *datestr = l_getFormattedDate();
stream.str("");
stream << obj_
<< " 0 obj\n"
"<<\n"
" /Producer (Tesseract "
<< tesseract::TessBaseAPI::Version()
<< ")\n"
" /CreationDate (D:"
<< datestr
<< ")\n"
" /Title <"
<< utf16_title.c_str()
<< ">\n"
">>\n"
"endobj\n";
lept_free(datestr);
AppendPDFObject(stream.str().c_str());
// XREF: one free entry for object 0, then one entry per object 1..obj_-1.
stream.str("");
stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
AppendString(stream.str().c_str());
for (int i = 1; i < obj_; i++) {
stream.str("");
// Width and fill are set on every pass so each offset is printed as a
// zero-padded 10-digit field.
stream.width(10);
stream.fill('0');
stream << offsets_[i] << " 00000 n \n";
AppendString(stream.str().c_str());
}
// TRAILER: /Info refers to the info object written above (obj_ - 1) and
// startxref is the offset recorded at the end of offsets_.
stream.str("");
stream << "trailer\n<<\n /Size " << obj_
<< "\n"
" /Root 1 0 R\n" // catalog
" /Info "
<< (obj_ - 1)
<< " 0 R\n" // info
">>\nstartxref\n"
<< offsets_.back() << "\n%%EOF\n";
AppendString(stream.str().c_str());
return true;
}
} // namespace tesseract

View File

@ -0,0 +1,241 @@
///////////////////////////////////////////////////////////////////////
// File: renderer.cpp
// Description: Rendering interface to inject into TessBaseAPI
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include <tesseract/baseapi.h>
#include <tesseract/renderer.h>
#include <cstring>
#include <memory> // std::unique_ptr
#include <string> // std::string
#include "serialis.h" // Serialize
namespace tesseract {
/**********************************************************************
* Base Renderer interface implementation
**********************************************************************/
// Sets up the output sink of a renderer. When outputbase is "-" or "stdout"
// the renderer writes to stdout; otherwise "<outputbase>.<extension>" is
// opened in binary write mode. A failed open leaves the renderer unhappy.
TessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)
    : file_extension_(extension)
    , title_("")
    , imagenum_(-1)
    , fout_(stdout)
    , next_(nullptr)
    , happy_(true) {
  const bool wants_stdout = strcmp(outputbase, "-") == 0 || strcmp(outputbase, "stdout") == 0;
  if (wants_stdout) {
    return;
  }
  std::string path(outputbase);
  path += ".";
  path += extension;
  fout_ = fopen(path.c_str(), "wb");
  happy_ = (fout_ != nullptr);
}
// Releases the output sink and the rest of the renderer chain. A file opened
// by this renderer is closed; stdout is left open and only has its error
// state cleared.
TessResultRenderer::~TessResultRenderer() {
  if (fout_ == stdout) {
    clearerr(fout_);
  } else if (fout_ != nullptr) {
    fclose(fout_);
  }
  delete next_;
}
// Splices the chain starting at `next` in directly after this renderer. Any
// renderers that previously followed this one are re-attached to the end of
// the inserted chain. A null argument is ignored.
void TessResultRenderer::insert(TessResultRenderer *next) {
  if (!next) {
    return;
  }
  TessResultRenderer *const displaced = next_;
  next_ = next;
  if (!displaced) {
    return;
  }
  // Walk to the last renderer of the inserted chain.
  TessResultRenderer *last = next;
  for (; last->next_; last = last->next_) {
  }
  last->next_ = displaced;
}
// Starts a new document: stores the title, resets the image counter, runs
// this renderer's handler and then forwards the call down the chain.
// Returns false immediately (without forwarding) if this renderer is already
// unhappy; otherwise true only if the handler and the chain both succeeded.
bool TessResultRenderer::BeginDocument(const char *title) {
  if (happy_) {
    title_ = title;
    imagenum_ = -1;
    const bool self_ok = BeginDocumentHandler();
    // The chain is always invoked, even when our own handler failed.
    const bool chain_ok = (next_ == nullptr) || next_->BeginDocument(title);
    return chain_ok && self_ok;
  }
  return false;
}
// Renders the current page of `api`: bumps the image counter, runs this
// renderer's handler and then forwards the call down the chain.
// Returns false immediately (without forwarding) if this renderer is already
// unhappy; otherwise true only if the handler and the chain both succeeded.
bool TessResultRenderer::AddImage(TessBaseAPI *api) {
  if (happy_) {
    ++imagenum_;
    const bool self_ok = AddImageHandler(api);
    // The chain is always invoked, even when our own handler failed.
    const bool chain_ok = (next_ == nullptr) || next_->AddImage(api);
    return chain_ok && self_ok;
  }
  return false;
}
// Finishes the document: runs this renderer's handler and then forwards the
// call down the chain.
// Returns false immediately (without forwarding) if this renderer is already
// unhappy; otherwise true only if the handler and the chain both succeeded.
bool TessResultRenderer::EndDocument() {
  if (happy_) {
    const bool self_ok = EndDocumentHandler();
    // The chain is always invoked, even when our own handler failed.
    const bool chain_ok = (next_ == nullptr) || next_->EndDocument();
    return chain_ok && self_ok;
  }
  return false;
}
// Writes the NUL-terminated string s (without its terminator) to the output.
void TessResultRenderer::AppendString(const char *s) {
  const size_t byte_count = strlen(s);
  AppendData(s, byte_count);
}
// Writes len bytes starting at s to the output and flushes it. A failed
// write marks the renderer as unhappy; the flush happens either way.
void TessResultRenderer::AppendData(const char *s, int len) {
  const bool written = tesseract::Serialize(fout_, s, len);
  if (!written) {
    happy_ = false;
  }
  fflush(fout_);
}
// Base-class handler invoked from BeginDocument(): does no work of its own
// and simply reports the renderer's current health flag.
bool TessResultRenderer::BeginDocumentHandler() {
return happy_;
}
// Base-class handler invoked from EndDocument(): does no work of its own
// and simply reports the renderer's current health flag.
bool TessResultRenderer::EndDocumentHandler() {
return happy_;
}
/**********************************************************************
* UTF8 Text Renderer interface implementation
**********************************************************************/
// Renderer writing UTF-8 text to "<outputbase>.txt".
TessTextRenderer::TessTextRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "txt") {}

// Appends the page's UTF-8 text followed by the configured "page_separator"
// string (when that variable exists and is non-empty).
// Returns false if the API yields no text.
bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_text(api->GetUTF8Text());
  if (!page_text) {
    return false;
  }
  AppendString(page_text.get());

  const char *separator = api->GetStringVariable("page_separator");
  const bool has_separator = separator != nullptr && separator[0] != '\0';
  if (has_separator) {
    AppendString(separator);
  }
  return true;
}
/**********************************************************************
* TSV Text Renderer interface implementation
**********************************************************************/
// Renderer writing TSV to "<outputbase>.tsv" with font info switched off.
TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessTsvRenderer(outputbase, false) {}

// Renderer writing TSV to "<outputbase>.tsv"; font_info is stored in
// font_info_.
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "tsv") {
  font_info_ = font_info;
}
// Writes the TSV header row naming the twelve output columns. Always
// returns true.
bool TessTsvRenderer::BeginDocumentHandler() {
  static const char kColumnHeadings[] =
      "level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
      "num\tleft\ttop\twidth\theight\tconf\ttext\n";
  AppendString(kColumnHeadings);
  return true;
}
// Nothing is written at the end of a TSV document; always reports success.
bool TessTsvRenderer::EndDocumentHandler() {
return true;
}
// Appends the TSV text of the current page (identified by imagenum()).
// Returns false if the API yields no text.
bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_tsv(api->GetTSVText(imagenum()));
  if (!page_tsv) {
    return false;
  }
  AppendString(page_tsv.get());
  return true;
}
/**********************************************************************
* UNLV Text Renderer interface implementation
**********************************************************************/
// Renderer writing UNLV text to "<outputbase>.unlv".
TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "unlv") {}

// Appends the UNLV text of the current page.
// Returns false if the API yields no text.
bool TessUnlvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_unlv(api->GetUNLVText());
  if (!page_unlv) {
    return false;
  }
  AppendString(page_unlv.get());
  return true;
}
/**********************************************************************
* BoxText Renderer interface implementation
**********************************************************************/
// Renderer writing box-file text to "<outputbase>.box".
TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {}

// Appends the box text of the current page (identified by imagenum()).
// Returns false if the API yields no text.
bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_boxes(api->GetBoxText(imagenum()));
  if (!page_boxes) {
    return false;
  }
  AppendString(page_boxes.get());
  return true;
}
#ifndef DISABLED_LEGACY_ENGINE
/**********************************************************************
* Osd Text Renderer interface implementation
**********************************************************************/
// Renderer writing OSD text to "<outputbase>.osd".
TessOsdRenderer::TessOsdRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "osd") {}

// Appends the OSD text of the current page (identified by imagenum()).
// Returns false if the API yields no text.
bool TessOsdRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_osd(api->GetOsdText(imagenum()));
  if (!page_osd) {
    return false;
  }
  AppendString(page_osd.get());
  return true;
}
#endif // ndef DISABLED_LEGACY_ENGINE
} // namespace tesseract

View File

@ -0,0 +1,106 @@
/**********************************************************************
* File: wordstrboxrenderer.cpp
* Description: Renderer for creating box file with WordStr strings.
* based on the tsv renderer.
*
* (C) Copyright 2019, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <tesseract/baseapi.h> // for TessBaseAPI
#include <tesseract/renderer.h>
#include "tesseractclass.h" // for Tesseract
namespace tesseract {
/**
* Create a UTF8 box file with WordStr strings from the internal data
* structures. page_number is a 0-base page index that will appear in the box
* file. Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
return nullptr;
}
std::string wordstr_box_str;
int left = 0, top = 0, right = 0, bottom = 0;
bool first_line = true;
LTRResultIterator *res_it = GetLTRIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
if (!first_line) {
wordstr_box_str += "\n\t " + std::to_string(right + 1);
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
wordstr_box_str += " " + std::to_string(right + 5);
wordstr_box_str += " " + std::to_string(image_height_ - top);
wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
wordstr_box_str += "\n";
} else {
first_line = false;
}
// Use bounding box for whole line for WordStr
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
wordstr_box_str += "WordStr " + std::to_string(left);
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
wordstr_box_str += " " + std::to_string(right);
wordstr_box_str += " " + std::to_string(image_height_ - top);
wordstr_box_str += " " + std::to_string(page_number); // word
wordstr_box_str += " #";
}
do {
wordstr_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
wordstr_box_str += " ";
res_it->Next(RIL_WORD);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
}
if (left != 0 && top != 0 && right != 0 && bottom != 0) {
wordstr_box_str += "\n\t " + std::to_string(right + 1);
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
wordstr_box_str += " " + std::to_string(right + 5);
wordstr_box_str += " " + std::to_string(image_height_ - top);
wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
wordstr_box_str += "\n";
}
char *ret = new char[wordstr_box_str.length() + 1];
strcpy(ret, wordstr_box_str.c_str());
delete res_it;
return ret;
}
/**********************************************************************
* WordStrBox Renderer interface implementation
**********************************************************************/
// Renderer writing WordStr box-file text to "<outputbase>.box".
TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {}

// Appends the WordStr box text of the current page (identified by
// imagenum()). Returns false if the API yields no text.
bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_wordstr(api->GetWordStrBoxText(imagenum()));
  if (!page_wordstr) {
    return false;
  }
  AppendString(page_wordstr.get());
  return true;
}
} // namespace tesseract.

View File

@ -0,0 +1,30 @@
///////////////////////////////////////////////////////////////////////
// File: dotproduct.cpp
// Description: Native dot product function.
//
// (C) Copyright 2018, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#include "dotproduct.h"
namespace tesseract {
// Portable dot product: returns the sum of u[k] * v[k] for k in [0, n),
// accumulated in index order. Yields 0.0 when n is not positive.
double DotProductNative(const double *u, const double *v, int n) {
  double dot = 0.0;
  int remaining = n;
  while (remaining > 0) {
    dot += *u++ * *v++;
    --remaining;
  }
  return dot;
}
} // namespace tesseract

View File

@ -0,0 +1,36 @@
///////////////////////////////////////////////////////////////////////
// File: dotproduct.h
// Description: Native dot product function.
//
// (C) Copyright 2018, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_
namespace tesseract {
// All functions below share one contract: u and v point to at least n
// doubles each, and the return value is the dot product of the first n
// elements of u and v.

// Computes and returns the dot product of the n-vectors u and v.
// Portable implementation without SIMD intrinsics.
double DotProductNative(const double *u, const double *v, int n);
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);
// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);
} // namespace tesseract.
#endif // TESSERACT_ARCH_DOTPRODUCT_H_

View File

@ -0,0 +1,63 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductavx.cpp
// Description: Architecture-specific dot-product function.
// Author: Ray Smith
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__AVX__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for AVX capable architectures
# endif
#else
# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
// The main loop consumes 8 doubles per iteration as two 4-wide unaligned
// loads feeding two independent accumulators; the n % 8 leftover elements
// are handled by a scalar loop at the end.
double DotProductAVX(const double *u, const double *v, int n) {
// Number of full 8-element iterations and number of leftover elements.
const unsigned quot = n / 8;
const unsigned rem = n % 8;
// Two running sums of 4 lanes each, both starting at zero.
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
for (unsigned k = 0; k < quot; k++) {
// First 4 elements of this iteration go into t0.
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
f0 = _mm256_mul_pd(f0, f1);
t0 = _mm256_add_pd(t0, f0);
u += 4;
v += 4;
// Next 4 elements go into t1.
__m256d f2 = _mm256_loadu_pd(u);
__m256d f3 = _mm256_loadu_pd(v);
f2 = _mm256_mul_pd(f2, f3);
t1 = _mm256_add_pd(t1, f2);
u += 4;
v += 4;
}
// Combine the two accumulators with a horizontal add, spill the 4 lanes to
// a 32-byte aligned buffer (required by the aligned store) and sum them.
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
// Scalar tail; u and v already point past the vectorized part.
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
} // namespace tesseract.
#endif

View File

@ -0,0 +1,61 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductfma.cpp
// Description: Architecture-specific dot-product function.
// Author: Stefan Weil
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__FMA__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for FMA capable architectures
# endif
#else
# include <immintrin.h>
# include <cstdint>
# include "dotproduct.h"
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
// The main loop consumes 8 doubles per iteration as two 4-wide unaligned
// loads, each folded into its own accumulator with a fused multiply-add; the
// n % 8 leftover elements are handled by a scalar loop at the end.
double DotProductFMA(const double *u, const double *v, int n) {
// Number of full 8-element iterations and number of leftover elements.
const unsigned quot = n / 8;
const unsigned rem = n % 8;
// Two running sums of 4 lanes each, both starting at zero.
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
for (unsigned k = 0; k < quot; k++) {
// t0 += u[0..3] * v[0..3]
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
t0 = _mm256_fmadd_pd(f0, f1, t0);
u += 4;
v += 4;
// t1 += u[4..7] * v[4..7]
__m256d f2 = _mm256_loadu_pd(u);
__m256d f3 = _mm256_loadu_pd(v);
t1 = _mm256_fmadd_pd(f2, f3, t1);
u += 4;
v += 4;
}
// Combine the two accumulators with a horizontal add, spill the 4 lanes to
// a 32-byte aligned buffer (required by the aligned store) and sum them.
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
// Scalar tail; u and v already point past the vectorized part.
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
} // namespace tesseract.
#endif

View File

@ -0,0 +1,84 @@
///////////////////////////////////////////////////////////////////////
// File: dotproductsse.cpp
// Description: Architecture-specific dot-product function.
// Author: Ray Smith
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__SSE4_1__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for SSE 4.1 capable architectures
# endif
#else
# include <emmintrin.h>
# include <smmintrin.h>
# include <cstdint>
# include "dotproduct.h"
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
// Elements are processed two at a time; an odd trailing element (or the
// whole input when n < 2) is handled by the scalar loop at the end.
double DotProductSSE(const double *u, const double *v, int n) {
// Largest offset at which a full pair can still be loaded.
int max_offset = n - 2;
int offset = 0;
// Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
// v, and multiplying them together in parallel.
__m128d sum = _mm_setzero_pd();
// Only enter the SIMD path when at least one full pair exists.
if (offset <= max_offset) {
// The first pair initializes sum directly, so the loops start at 2.
offset = 2;
// Aligned load is reputedly faster but requires 16 byte aligned input.
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
// Use aligned load.
sum = _mm_load_pd(u);
__m128d floats2 = _mm_load_pd(v);
// Multiply.
sum = _mm_mul_pd(sum, floats2);
while (offset <= max_offset) {
__m128d floats1 = _mm_load_pd(u + offset);
floats2 = _mm_load_pd(v + offset);
offset += 2;
floats1 = _mm_mul_pd(floats1, floats2);
sum = _mm_add_pd(sum, floats1);
}
} else {
// Use unaligned load.
sum = _mm_loadu_pd(u);
__m128d floats2 = _mm_loadu_pd(v);
// Multiply.
sum = _mm_mul_pd(sum, floats2);
while (offset <= max_offset) {
__m128d floats1 = _mm_loadu_pd(u + offset);
floats2 = _mm_loadu_pd(v + offset);
offset += 2;
floats1 = _mm_mul_pd(floats1, floats2);
sum = _mm_add_pd(sum, floats1);
}
}
}
// Add the 2 sums in sum horizontally.
sum = _mm_hadd_pd(sum, sum);
// Extract the low result.
double result = _mm_cvtsd_f64(sum);
// Add on any left-over products.
// offset is 0 here if the SIMD path was skipped, so this loop then covers
// the entire input.
while (offset < n) {
result += u[offset] * v[offset];
++offset;
}
return result;
}
} // namespace tesseract.
#endif

View File

@ -0,0 +1,94 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrix.cpp
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author: Ray Smith
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#include "intsimdmatrix.h"
#include "matrix.h" // for GENERIC_2D_ARRAY
#include "simddetect.h" // for SIMDDetect
namespace tesseract {
// Definition of the static IntSimdMatrix::intSimdMatrix pointer; it starts
// out null.
const IntSimdMatrix *IntSimdMatrix::intSimdMatrix = nullptr;
// Computes a reshaped copy of the weight matrix w.
void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
int32_t &rounded_num_out) const {
const int num_out = w.dim1();
const int num_in = w.dim2() - 1;
// The rounded-up sizes of the reshaped weight matrix, excluding biases.
int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
rounded_num_out = RoundOutputs(num_out);
// Add the bias and compute the required size.
shaped_w.resize((rounded_num_in + 1) * rounded_num_out, 0);
int shaped_index = 0;
int output = 0;
// Each number of registers needs a different format! Iterates over the
// different numbers of registers (each a power of 2).
for (int num_registers = max_output_registers_; num_registers >= 1; num_registers /= 2) {
// The number of outputs that we will generate with this many registers.
int num_outputs_per_register_set = num_registers * num_outputs_per_register_;
// Use the max number of registers until we have to go fewer.
while (output + num_outputs_per_register_set <= rounded_num_out) {
// Accumulating outputs in registers saves iterating over the inputs, so
// we only have to do it once per output register set.
for (int input = 0; input < num_in; input += num_inputs_per_group_) {
// Iterate over the number of outputs in a register set.
for (int j = 0; j < num_outputs_per_register_set; ++j) {
// Inner-most loop corresponds to the number of inputs in an input
// group.
for (int i = 0; i < num_inputs_per_group_; ++i) {
int8_t weight = 0;
if (output + j < num_out && input + i < num_in) {
weight = w(output + j, input + i);
}
shaped_w[shaped_index++] = weight;
}
}
}
// Append the bias weights for the register set.
for (int j = 0; j < num_outputs_per_register_set; ++j) {
int8_t weight = 0;
if (output + j < num_out) {
weight = w(output + j, num_in);
}
shaped_w[shaped_index++] = weight;
}
output += num_outputs_per_register_set;
}
}
}
// Reference (non-SIMD) matrix-vector product v = Wu.
// u holds W.dim2() - 1 inputs and v receives W.dim1() outputs. The last
// column of W is the bias: u is treated as if it had one more element, which
// the code realizes by multiplying the bias weight by INT8_MAX. Each integer
// row result is then multiplied by that row's entry in scales.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
                                    const std::vector<double> &scales, const int8_t *u, double *v) {
  const int row_count = w.dim1();
  const int input_count = w.dim2() - 1;
  for (int row = 0; row < row_count; ++row) {
    const int8_t *weights = w[row];
    int dot = 0;
    for (int col = 0; col < input_count; ++col) {
      dot += weights[col] * u[col];
    }
    // Add in the bias and correct for integer values.
    const int with_bias = dot + weights[input_count] * INT8_MAX;
    v[row] = with_bias * scales[row];
  }
}
} // namespace tesseract

View File

@ -0,0 +1,123 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrix.h
// Description: Base class for 8-bit int SIMD matrix multipliers.
// Author: Ray Smith
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
#define TESSERACT_ARCH_INTSIMDMATRIX_H_
#include <tesseract/export.h>
#include <cstdint>
#include <vector>
namespace tesseract {
template <class T>
class GENERIC_2D_ARRAY;
// Base class for a SIMD function to multiply a matrix by a vector, with sources
// of 8-bit signed integer, and result in a double, after appropriate scaling.
// Assumes a specific method of multiplication that can be applied to any size
// and number of SIMD registers as follows:
// int32_t results are computed with num_outputs_per_register_ in each of
// max_output_registers_ result registers, repeatedly until it would make too
// many results, then the number of registers is halved, and so-on down to a
// single result register. The last calculation only outputs the required number
// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
// num_outputs_per_register_ = 4, and max_output_registers_ = 8,
// Step 1: 8x4=32 results are computed,
// Step 2: 8x4=32 again, total 64,
// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
// Step 4: 1x3, total 75.
// Each step above is computed using a PartialFunc, which runs over the input
// vector once. The input is read one registerful of num_inputs_per_register_
// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
// Since it is slow (on Intel at least) to horizontally add in a register,
// provision is made to process num_inputs_per_group_ inputs at a time, with
// the group being replicated num_input_groups_ times and multiplied by a
// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
// This is most convenient if num_inputs_per_group_ is 4, and the product
// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
// results in the process, but it doesn't have to be implemented that way.
// The weights are re-ordered by Init() to be used sequentially by the above
// algorithm, followed by the biases, so they can be added at the end.
// The base class computes the base C++ implementation.
// NOTE that, although the subclasses execute on different SIMD hardware, no
// virtual methods are needed, as the constructor sets up everything that
// is required to allow the base class implementation to do all the work.
// Descriptor for one 8-bit integer matrix.vector multiplier: a function
// pointer plus the register-geometry parameters that Init() and the rounding
// helpers use. The per-architecture instances declared at the bottom are
// brace-initialized positionally, so the order of the non-static data members
// must not change.
struct TESS_API IntSimdMatrix {
// Computes a reshaped copy of the weight matrix w, written to shaped_w in
// the order the optimized implementation reads it (weights for each register
// set, followed by that set's bias weights).
// NOTE(review): rounded_num_out is an output parameter; presumably the
// output count rounded up to a whole register - confirm in the definition.
void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
int32_t &rounded_num_out) const;
// Rounds the size up to a multiple of the input register size (in int8_t).
int RoundInputs(int size) const {
return Roundup(size, num_inputs_per_register_);
}
// Rounds the size up to a multiple of the output register size (in int32_t).
int RoundOutputs(int size) const {
return Roundup(size, num_outputs_per_register_);
}
// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Computes the base C++ implementation.
// scales holds one multiplier per output, applied to the integer result.
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
const int8_t *u, double *v);
// Rounds the input up to a multiple of the given factor.
// Uses truncating integer division, so it is intended for non-negative input
// and a positive factor.
static int Roundup(int input, int factor) {
return (input + factor - 1) / factor * factor;
}
// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Uses an optimized implementation with partial funcs.
// NOTE: The size of the input vector (u) must be padded using
// RoundInputs above.
// The input will be over-read to the extent of the padding. There are no
// alignment requirements.
// Arguments, in order: dim1 (outputs), dim2 (inputs + 1 for the bias),
// reshaped weights, per-output scales, input vector u, output vector v.
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
double *);
MatrixDotVectorFunction matrixDotVectorFunction;
// Number of 32 bit outputs held in each register.
int num_outputs_per_register_;
// Maximum number of registers that we will use to hold outputs.
int max_output_registers_;
// Number of 8 bit inputs in the inputs register.
int num_inputs_per_register_;
// Number of inputs in each weight group.
int num_inputs_per_group_;
// The number of input groups to be broadcast is not stored as a member; it
// is derived as num_inputs_per_register_ / num_inputs_per_group_.

// Pointer to the implementation in use.
// NOTE(review): presumably assigned by the SIMD detection code at startup -
// confirm where it is set.
static const IntSimdMatrix *intSimdMatrix;
// Only available with NEON.
static const IntSimdMatrix intSimdMatrixNEON;
// Only available with AVX2 / SSE.
static const IntSimdMatrix intSimdMatrixAVX2;
static const IntSimdMatrix intSimdMatrixSSE;
};
} // namespace tesseract
#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_

View File

@ -0,0 +1,348 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrixavx2.cpp
// Description: matrix-vector product for 8-bit data on avx2.
// Author: Ray Smith
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__AVX2__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for AVX2 capable architectures
# endif
#else
# include "intsimdmatrix.h"
# include <immintrin.h>
# include <algorithm>
# include <cstdint>
# include <vector>
namespace tesseract {
// Register geometry for the AVX2 implementation (256-bit registers).
// Number of outputs held in each register. 8 x 32 bit ints.
constexpr int kNumOutputsPerRegister = 8;
// Maximum number of registers that we will use.
// 8 registers x 8 outputs gives the largest partial function, 64 outputs.
constexpr int kMaxOutputRegisters = 8;
// Number of inputs in the inputs register. 32 x 8 bit ints.
constexpr int kNumInputsPerRegister = 32;
// Number of inputs in each weight group.
constexpr int kNumInputsPerGroup = 4;
// Number of groups of inputs to be broadcast (32 / 4 = 8 per input register).
constexpr int kNumInputGroups = kNumInputsPerRegister / kNumInputsPerGroup;
// Functions to compute part of a matrix.vector multiplication. The weights
// are in a very specific order (see above) in w, which is multiplied by
// u of length num_in, to produce output v after scaling the integer results
// by the corresponding member of scales.
// The amount of w and scales consumed is fixed and not available to the
// caller. The number of outputs written to v will be at most num_out.
// Computes one set of 4x8 products of inputs and weights, adding to result.
// Horizontally adds 4 adjacent results, making 8x32-bit results.
// rep_input is assumed to be an 8x replicated set of 4x8-bit signed integers.
// Note that wi must previously have been re-organized with blocks of 4x8
// weights in contiguous memory.
// ones is a register of 16x16-bit values all equal to 1.
// Note: wi is incremented by the amount of data read.
// weights and reps are scratch registers.
// This function must be inlined with references in order for the compiler to
// correctly use the registers declared in the caller.
// Accumulates one block of 32 weights (4 inputs x 8 outputs) into result.
// On return wi has advanced by kNumInputsPerRegister bytes, and weights and
// reps hold intermediate values (they are scratch and get overwritten).
static inline void MultiplyGroup(const __m256i &rep_input, const __m256i &ones, const int8_t *&wi,
__m256i &weights, __m256i &reps, __m256i &result) {
// Load a 4x8 block of weights.
weights = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(wi));
wi += kNumInputsPerRegister;
// Normalize the signs on rep_input, weights, so weights is always +ve.
// This is needed because _mm256_maddubs_epi16 below interprets its first
// operand as unsigned bytes and its second as signed bytes: the sign of each
// weight is transferred onto the matching input, leaving the product
// unchanged.
reps = _mm256_sign_epi8(rep_input, weights);
weights = _mm256_sign_epi8(weights, weights);
// Multiply 32x8-bit reps by 32x8-bit weights to make 16x16-bit results,
// with adjacent pairs added.
weights = _mm256_maddubs_epi16(weights, reps);
// Multiply 16x16-bit result by 16x16-bit ones to make 8x32-bit results,
// with adjacent pairs added. What we really want is a horizontal add of
// 16+16=32 bit result, but there is no such instruction, so multiply by
// 16-bit ones instead. It is probably faster than all the sign-extending,
// permuting and adding that would otherwise be required.
weights = _mm256_madd_epi16(weights, ones);
// Each 32-bit lane now holds the sum of 4 input*weight products for one
// output; add it to the running total for that output.
result = _mm256_add_epi32(result, weights);
}
// Loads 64 bits (8 x int8_t) starting at wi_ into the low half of a 128-bit
// register; the upper 64 bits are zero.
// wi_ points into a byte array with no alignment guarantee, so the dedicated
// 64-bit load intrinsic is used instead of dereferencing the pointer as
// int64_t, which would be a misaligned, type-punned access.
static inline __m128i load64_to_128(const int8_t *wi_) {
  return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(wi_));
}
// Finishes one register of 8 int32 accumulators.
// For each lane k in [0, 8): v[k] = (result[k] + 127 * wi[k]) * scales[k],
// where wi[0..7] are the int8 bias weights. Always writes 8 doubles to v.
static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
                                   double *v) {
  // Sign-extend the 8 bias bytes to 32 bits and scale them by 127.
  const __m256i bias32 = _mm256_cvtepi8_epi32(load64_to_128(wi));
  const __m256i scaled_bias = _mm256_mullo_epi32(bias32, _mm256_set1_epi32(127));
  const __m256d scale_lo = _mm256_loadu_pd(scales);
  const __m256d scale_hi = _mm256_loadu_pd(scales + 4);
  result = _mm256_add_epi32(result, scaled_bias);
  // Convert each 128-bit half (4 x int32) to 4 doubles and apply the scales.
  const __m256d out_lo =
      _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(result)), scale_lo);
  const __m256d out_hi =
      _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(result, 1)), scale_hi);
  _mm256_storeu_pd(v, out_lo);
  _mm256_storeu_pd(v + 4, out_hi);
}
// Finishes two registers (16 int32 accumulators in total).
// For each lane k in [0, 16): v[k] = (acc[k] + 127 * wi[k]) * scales[k],
// where acc is result0 followed by result1 and wi[0..15] are the int8 bias
// weights. wi, scales and v are taken by reference and each is advanced by 16
// on return.
static inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,
                                    const double *&scales, double *&v) {
  const __m256i bias_scale = _mm256_set1_epi32(127);
  // One 128-bit load fetches all 16 bias bytes.
  const __m128i bias_bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));

  // Outputs 0..7: the low 8 bias bytes, sign-extended to 32 bits.
  __m256i scaled_bias = _mm256_mullo_epi32(_mm256_cvtepi8_epi32(bias_bytes), bias_scale);
  __m256d scale_lo = _mm256_loadu_pd(scales);
  __m256d scale_hi = _mm256_loadu_pd(scales + 4);
  result0 = _mm256_add_epi32(result0, scaled_bias);
  __m256d out_lo = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(result0)), scale_lo);
  __m256d out_hi =
      _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(result0, 1)), scale_hi);
  _mm256_storeu_pd(v, out_lo);
  _mm256_storeu_pd(v + 4, out_hi);

  // Outputs 8..15: move bias bytes 8..15 down to the bottom of the register.
  const __m128i bias_upper = _mm_shuffle_epi32(bias_bytes, 2 + (3 << 2));
  scaled_bias = _mm256_mullo_epi32(_mm256_cvtepi8_epi32(bias_upper), bias_scale);
  scale_lo = _mm256_loadu_pd(scales + 8);
  scale_hi = _mm256_loadu_pd(scales + 12);
  result1 = _mm256_add_epi32(result1, scaled_bias);
  out_lo = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(result1)), scale_lo);
  out_hi = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extracti128_si256(result1, 1)), scale_hi);
  _mm256_storeu_pd(v + 8, out_lo);
  _mm256_storeu_pd(v + 12, out_hi);

  wi += 16;
  scales += 16;
  v += 16;
}
// Computes 64 outputs of the matrix.vector product v = Wu.
// Layout required of wi: for every group of kNumInputsPerGroup inputs, the
// weights of all 64 outputs for that group (kNumInputsPerGroup bytes per
// output), and after the last input group the 64 bias weights.
// num_in must be a multiple of kNumInputsPerGroup, with u zero-padded to
// match. u is read a whole 32-byte register at a time, so it may be over-read
// up to the next multiple of kNumInputsPerRegister.
static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
                                     int num_in, double *v) {
  // 16-bit ones, used to widen and pairwise-add 16-bit products to 32 bits.
  const __m256i ones = _mm256_set1_epi16(1);
  // Lane permutation that rotates the register down by one 32-bit lane.
  const __m256i rotate_lanes = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // One accumulator register per 8 outputs.
  __m256i acc0 = _mm256_setzero_si256();
  __m256i acc1 = _mm256_setzero_si256();
  __m256i acc2 = _mm256_setzero_si256();
  __m256i acc3 = _mm256_setzero_si256();
  __m256i acc4 = _mm256_setzero_si256();
  __m256i acc5 = _mm256_setzero_si256();
  __m256i acc6 = _mm256_setzero_si256();
  __m256i acc7 = _mm256_setzero_si256();
  int consumed = 0;
  while (consumed < num_in) {
    // Fetch the next registerful (32 bytes) of inputs.
    __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + consumed));
    // Work through it one group of kNumInputsPerGroup inputs at a time.
    for (int group = 0; group < kNumInputGroups && consumed < num_in; ++group) {
      // Broadcast the lowest 4 inputs to all 8 lanes.
      const __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(chunk));
      // Bring the following 4 inputs down to the lowest lane.
      chunk = _mm256_permutevar8x32_epi32(chunk, rotate_lanes);
      __m256i weights, reps;
      // Each call consumes 32 weights and updates 8 outputs.
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc0);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc1);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc2);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc3);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc4);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc5);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc6);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc7);
      consumed += kNumInputsPerGroup;
    }
  }
  // wi now points at the bias weights; add them, scale and store.
  ExtractResults16(acc0, acc1, wi, scales, v);
  ExtractResults16(acc2, acc3, wi, scales, v);
  ExtractResults16(acc4, acc5, wi, scales, v);
  ExtractResults16(acc6, acc7, wi, scales, v);
}
// Computes 32 outputs of the matrix.vector product v = Wu.
// Same weight layout and padding requirements as PartialMatrixDotVector64,
// with 32 outputs per register set instead of 64.
static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
                                     int num_in, double *v) {
  // 16-bit ones, used to widen and pairwise-add 16-bit products to 32 bits.
  const __m256i ones = _mm256_set1_epi16(1);
  // Lane permutation that rotates the register down by one 32-bit lane.
  const __m256i rotate_lanes = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // One accumulator register per 8 outputs.
  __m256i acc0 = _mm256_setzero_si256();
  __m256i acc1 = _mm256_setzero_si256();
  __m256i acc2 = _mm256_setzero_si256();
  __m256i acc3 = _mm256_setzero_si256();
  int consumed = 0;
  while (consumed < num_in) {
    // Fetch the next registerful (32 bytes) of inputs.
    __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + consumed));
    // Work through it one group of kNumInputsPerGroup inputs at a time.
    for (int group = 0; group < kNumInputGroups && consumed < num_in; ++group) {
      // Broadcast the lowest 4 inputs to all 8 lanes.
      const __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(chunk));
      // Bring the following 4 inputs down to the lowest lane.
      chunk = _mm256_permutevar8x32_epi32(chunk, rotate_lanes);
      __m256i weights, reps;
      // Each call consumes 32 weights and updates 8 outputs.
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc0);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc1);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc2);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc3);
      consumed += kNumInputsPerGroup;
    }
  }
  // wi now points at the bias weights; add them, scale and store.
  ExtractResults16(acc0, acc1, wi, scales, v);
  ExtractResults16(acc2, acc3, wi, scales, v);
}
// Computes 16 outputs of the matrix.vector product v = Wu.
// Same weight layout and padding requirements as PartialMatrixDotVector64,
// with 16 outputs per register set instead of 64.
static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
                                     int num_in, double *v) {
  // 16-bit ones, used to widen and pairwise-add 16-bit products to 32 bits.
  const __m256i ones = _mm256_set1_epi16(1);
  // Lane permutation that rotates the register down by one 32-bit lane.
  const __m256i rotate_lanes = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // One accumulator register per 8 outputs.
  __m256i acc0 = _mm256_setzero_si256();
  __m256i acc1 = _mm256_setzero_si256();
  int consumed = 0;
  while (consumed < num_in) {
    // Fetch the next registerful (32 bytes) of inputs.
    __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + consumed));
    // Work through it one group of kNumInputsPerGroup inputs at a time.
    for (int group = 0; group < kNumInputGroups && consumed < num_in; ++group) {
      // Broadcast the lowest 4 inputs to all 8 lanes.
      const __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(chunk));
      // Bring the following 4 inputs down to the lowest lane.
      chunk = _mm256_permutevar8x32_epi32(chunk, rotate_lanes);
      __m256i weights, reps;
      // Each call consumes 32 weights and updates 8 outputs.
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc0);
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc1);
      consumed += kNumInputsPerGroup;
    }
  }
  // wi now points at the bias weights; add them, scale and store.
  ExtractResults16(acc0, acc1, wi, scales, v);
}
// Computes 8 outputs of the matrix.vector product v = Wu.
// Same weight layout and padding requirements as PartialMatrixDotVector64,
// with 8 outputs per register set instead of 64.
static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
                                           int num_in, double *v) {
  // 16-bit ones, used to widen and pairwise-add 16-bit products to 32 bits.
  const __m256i ones = _mm256_set1_epi16(1);
  // Lane permutation that rotates the register down by one 32-bit lane.
  const __m256i rotate_lanes = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // A single accumulator register holds all 8 outputs.
  __m256i acc0 = _mm256_setzero_si256();
  int consumed = 0;
  while (consumed < num_in) {
    // Fetch the next registerful (32 bytes) of inputs.
    __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + consumed));
    // Work through it one group of kNumInputsPerGroup inputs at a time.
    for (int group = 0; group < kNumInputGroups && consumed < num_in; ++group) {
      // Broadcast the lowest 4 inputs to all 8 lanes.
      const __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(chunk));
      // Bring the following 4 inputs down to the lowest lane.
      chunk = _mm256_permutevar8x32_epi32(chunk, rotate_lanes);
      __m256i weights, reps;
      // Consumes 32 weights and updates the 8 outputs.
      MultiplyGroup(rep_input, ones, wi, weights, reps, acc0);
      consumed += kNumInputsPerGroup;
    }
  }
  // wi now points at the bias weights; add them, scale and store.
  ExtractResults8(acc0, wi, scales, v);
}
// AVX2 entry point for v = Wu on reshaped weights.
// dim1 is the number of outputs and dim2 the number of inputs plus one (the
// bias column). Outputs are produced in register sets of 64 for as long as
// that fits within the output count rounded up to a multiple of 8, then at
// most one set each of 32, 16 and 8. Every partial function writes its full
// set, so v and scales must provide room for the rounded-up output count.
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
                            const int8_t *u, double *v) {
  const int rounded_num_in = IntSimdMatrix::Roundup(dim2 - 1, kNumInputsPerGroup);
  const int rounded_num_out = IntSimdMatrix::Roundup(dim1, kNumOutputsPerRegister);
  // Each output owns rounded_num_in weights plus one bias weight.
  const int weights_per_output = rounded_num_in + 1;
  int produced = 0;
  // Steps all cursors past a register set of the given size.
  auto advance = [&](int set_size) {
    wi += weights_per_output * set_size;
    scales += set_size;
    v += set_size;
    produced += set_size;
  };

  constexpr int kLargestSet = kNumOutputsPerRegister * kMaxOutputRegisters;
  // Use the largest set for as long as it does not overshoot, then fall back
  // to progressively smaller ones.
  while (produced + kLargestSet <= rounded_num_out) {
    PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
    advance(kLargestSet);
  }
  if (produced + kLargestSet / 2 <= rounded_num_out) {
    PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
    advance(kLargestSet / 2);
  }
  if (produced + kLargestSet / 4 <= rounded_num_out) {
    PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
    advance(kLargestSet / 4);
  }
  if (produced + kLargestSet / 8 <= rounded_num_out) {
    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
  }
}
// AVX2 descriptor. The values are positional and must stay in the same order
// as the non-static data members declared in IntSimdMatrix.
const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
// Function: the AVX2 matrixDotVector defined in this file.
matrixDotVector,
// Number of 32 bit outputs held in each register.
kNumOutputsPerRegister,
// Maximum number of registers that we will use to hold outputs.
kMaxOutputRegisters,
// Number of 8 bit inputs in the inputs register.
kNumInputsPerRegister,
// Number of inputs in each weight group.
kNumInputsPerGroup};
} // namespace tesseract.
#endif

View File

@ -0,0 +1,203 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrixneon.cpp
// Description: matrix-vector product for 8-bit data on neon.
// Author: Robin Watts (from the AVX2 original by Ray Smith)
//
// (C) Copyright 2017, Google Inc.
// (C) Copyright 2020, Artifex Software Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if defined(__ARM_NEON)
# include "intsimdmatrix.h"
# include <algorithm>
# include <cstdint>
# include <vector>
# include "arm_neon.h"
namespace tesseract {
// Register geometry for the NEON implementation.
// Number of outputs held in each register. (Actually, we use a
// pair of 4x32 registers, so 8 x 32 bit ints).
constexpr int kNumOutputsPerRegister = 8;
// Maximum number of registers that we will use.
// With a single register set, every partial call produces at most 8 outputs.
constexpr int kMaxOutputRegisters = 1;
// Number of inputs in the inputs register. 8 x 8 bit ints.
constexpr int kNumInputsPerRegister = 8;
// Number of inputs in each weight group (a whole input register at a time).
constexpr int kNumInputsPerGroup = 8;
// Function to compute part of a matrix.vector multiplication. The weights
// are in a very specific order (see above) in w, which is multiplied by
// u of length num_in, to produce output v after scaling the integer results
// by the corresponding member of scales.
// The amount of w and scales consumed is fixed and not available to the
// caller.
// Computes part of matrix.vector v = Wu. Computes N=8 results.
// The weights *must* be arranged so that consecutive reads from wi
// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
// (kNumInputsPerGroup inputs))). After that there must be N consecutive
// bias weights, before continuing with any more weights.
// u must be padded out with zeros to
// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
// num_out is the number of results actually stored to v: the first result is
// always stored, results 2..8 only while num_out allows. scales is read for
// exactly the results that are stored. num_in is consumed 8 inputs at a time.
static inline void PartialMatrixDotVector8(const int8_t *__restrict wi,
const double *__restrict scales,
const int8_t *__restrict u, int num_in,
double *__restrict v, int num_out) {
// Initialize all the results to 0.
int32x4_t result0123 = {0, 0, 0, 0};
int32x4_t result4567 = {0, 0, 0, 0};
// The bias weight is multiplied by 127 before being added to the sums.
int8x8_t bias_scale = {127, 127, 127, 127, 127, 127, 127, 127};
// Iterate over the input (u), one registerful at a time.
for (int j = 0; j < num_in; j += 8) {
int8x8_t vu = vld1_s8(u); // vu = u0 u1 u2 u3 u4 u5 u6 u7
int8x16_t vw01 = vld1q_s8(wi); // vw01 = w00 w01 w02 w03 w04 w05 w06 w07
// w10 w11 w12 w13 w14 w15 w16 w17
int8x16_t vw23 = vld1q_s8(wi + 8 * 2); // vw23 = w20 w21 w22 w23 w24 w25 w26 w27
// w30 w31 w32 w33 w34 w35 w36 w37
int8x16_t vw45 = vld1q_s8(wi + 8 * 4); // vw45 = w40 w41 w42 w43 w44 w45 w46 w47
// w50 w51 w52 w53 w54 w55 w56 w57
int8x16_t vw67 = vld1q_s8(wi + 8 * 6); // vw67 = w60 w61 w62 w63 w64 w65 w66 w67
// w70 w71 w72 w73 w74 w75 w76 w77
// Widening multiply: 8 x (int8 * int8) -> 8 x int16, one row per output.
int16x8_t vrow0q = vmull_s8(vget_low_s8(vw01), vu); // vrow0q = w00.u0 w01.u1 w02.u2
// w03.u3 w04.u4 w05.u5 w06.u6 w07.u7
int16x8_t vrow1q = vmull_s8(vget_high_s8(vw01),
vu); // vrow1q = w10.u0 w11.u1 w12.u2 w13.u3
// w14.u4 w15.u5 w16.u6 w17.u7
int16x8_t vrow2q = vmull_s8(vget_low_s8(vw23), vu); // vrow2q = w20.u0 w21.u1 w22.u2
// w23.u3 w24.u4 w25.u5 w26.u6 w27.u7
int16x8_t vrow3q = vmull_s8(vget_high_s8(vw23),
vu); // vrow3q = w30.u0 w31.u1 w32.u2 w33.u3
// w34.u4 w35.u5 w36.u6 w37.u7
int16x8_t vrow4q = vmull_s8(vget_low_s8(vw45), vu); // vrow4q = w40.u0 w41.u1 w42.u2
// w43.u3 w44.u4 w45.u5 w46.u6 w47.u7
int16x8_t vrow5q = vmull_s8(vget_high_s8(vw45),
vu); // vrow5q = w50.u0 w51.u1 w52.u2 w53.u3
// w54.u4 w55.u5 w56.u6 w57.u7
int16x8_t vrow6q = vmull_s8(vget_low_s8(vw67), vu); // vrow6q = w60.u0 w61.u1 w62.u2
// w63.u3 w64.u4 w65.u5 w66.u6 w67.u7
int16x8_t vrow7q = vmull_s8(vget_high_s8(vw67),
vu); // vrow7q = w70.u0 w71.u1 w72.u2 w73.u3
// w74.u4 w75.u5 w76.u6 w77.u7
// Pairwise widening add: 8 x int16 -> 4 x int32 per row.
int32x4_t vrow0q2 = vpaddlq_s16(vrow0q); // vrow0q2 = w00.u0+w01.u1 w02.u2+w03.u3
// w04.u4+w05.u5 w06.u6+w07.u7
int32x4_t vrow1q2 = vpaddlq_s16(vrow1q); // vrow1q2 = w10.u0+w11.u1 w12.u2+w13.u3
// w14.u4+w15.u5 w16.u6+w17.u7
int32x4_t vrow2q2 = vpaddlq_s16(vrow2q); // vrow2q2 = w20.u0+w21.u1 w22.u2+w23.u3
// w24.u4+w25.u5 w26.u6+w27.u7
int32x4_t vrow3q2 = vpaddlq_s16(vrow3q); // vrow3q2 = w30.u0+w31.u1 w32.u2+w33.u3
// w34.u4+w35.u5 w36.u6+w37.u7
int32x4_t vrow4q2 = vpaddlq_s16(vrow4q); // vrow4q2 = w40.u0+w41.u1 w42.u2+w43.u3
// w44.u4+w45.u5 w46.u6+w47.u7
int32x4_t vrow5q2 = vpaddlq_s16(vrow5q); // vrow5q2 = w50.u0+w51.u1 w52.u2+w53.u3
// w54.u4+w55.u5 w56.u6+w57.u7
int32x4_t vrow6q2 = vpaddlq_s16(vrow6q); // vrow6q2 = w60.u0+w61.u1 w62.u2+w63.u3
// w64.u4+w65.u5 w66.u6+w67.u7
int32x4_t vrow7q2 = vpaddlq_s16(vrow7q); // vrow7q2 = w70.u0+w71.u1 w72.u2+w73.u3
// w74.u4+w75.u5 w76.u6+w77.u7
// First reduction: each register ends up holding two half-sums for each of
// two adjacent rows.
vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
vpadd_s32(vget_low_s32(vrow1q2), vget_high_s32(vrow1q2)));
// vrow0q2 = w00.u0+...+w03.u3 w04.u4+...+w07.u7 w10.u0+...+w13.u3
// w14.u4+...+w17.u7
vrow2q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)),
vpadd_s32(vget_low_s32(vrow3q2), vget_high_s32(vrow3q2)));
// vrow2q2 = w20.u0+...+w23.u3 w24.u4+...+w27.u7 w30.u0+...+w33.u3
// w34.u4+...+w37.u7
vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
vpadd_s32(vget_low_s32(vrow5q2), vget_high_s32(vrow5q2)));
// vrow4q2 = w40.u0+...+w43.u3 w44.u4+...+w47.u7 w50.u0+...+w53.u3
// w54.u4+...+w57.u7
vrow6q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)),
vpadd_s32(vget_low_s32(vrow7q2), vget_high_s32(vrow7q2)));
// vrow6q2 = w60.u0+...+w63.u3 w64.u4+...+w67.u7 w70.u0+...+w73.u3
// w74.u4+...+w77.u7
// Second reduction: one complete 8-term dot product per 32-bit lane.
vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)));
// vrow0q2 = w00.u0+...+w07.u7 w10.u0+...+w17.u7 w20.u0+...+w27.u7
// w30.u0+...+w37.u7
vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)));
// vrow4q2 = w40.u0+...+w47.u7 w50.u0+...+w57.u7 w60.u0+...+w67.u7
// w70.u0+...+w77.u7
result0123 = vaddq_s32(result0123, vrow0q2);
result4567 = vaddq_s32(result4567, vrow4q2);
// Advance to the next 8 inputs and the next 8x8 block of weights.
u += 8;
wi += 64;
}
{
// wi now points at the 8 bias weights for this group of outputs.
int8x8_t bias = vld1_s8(wi); // bias = b0 b1 b2 b3 b4 b5 b6 b7
int16x8_t scaled_bias = vmull_s8(bias, bias_scale);
result0123 = vaddw_s16(result0123, vget_low_s16(scaled_bias));
result4567 = vaddw_s16(result4567, vget_high_s16(scaled_bias));
// Store results one lane at a time so that no more than num_out values are
// written to v (the first is written unconditionally).
*v++ = vget_lane_s32(vget_low_s32(result0123), 0) * *scales++;
if (num_out > 1)
*v++ = vget_lane_s32(vget_low_s32(result0123), 1) * *scales++;
if (num_out > 2)
*v++ = vget_lane_s32(vget_high_s32(result0123), 0) * *scales++;
if (num_out > 3)
*v++ = vget_lane_s32(vget_high_s32(result0123), 1) * *scales++;
if (num_out > 4)
*v++ = vget_lane_s32(vget_low_s32(result4567), 0) * *scales++;
if (num_out > 5)
*v++ = vget_lane_s32(vget_low_s32(result4567), 1) * *scales++;
if (num_out > 6)
*v++ = vget_lane_s32(vget_high_s32(result4567), 0) * *scales++;
if (num_out > 7)
*v = vget_lane_s32(vget_high_s32(result4567), 1) * *scales;
}
}
// NEON entry point for v = Wu on reshaped weights.
// dim1 is the number of outputs and dim2 the number of inputs plus one (the
// bias column). Full groups of 8 outputs are computed first; a final partial
// group stores only the outputs that remain, so v is never written past dim1
// elements.
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
                            const int8_t *u, double *v) {
  const int num_out = dim1;
  const int rounded_num_in = IntSimdMatrix::Roundup(dim2 - 1, kNumInputsPerGroup);
  constexpr int kGroupSize = kNumOutputsPerRegister * kMaxOutputRegisters;
  // Each group owns rounded_num_in weights plus one bias weight per output.
  const int weights_per_group = (rounded_num_in + 1) * kGroupSize;
  int produced = 0;
  while (produced + kGroupSize <= num_out) {
    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v, kNumOutputsPerRegister);
    wi += weights_per_group;
    scales += kGroupSize;
    v += kGroupSize;
    produced += kGroupSize;
  }
  // Whatever is left is fewer than a full group of outputs.
  const int leftover = num_out - produced;
  if (leftover > 0) {
    PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v, leftover);
  }
}
// NEON descriptor. The values are positional and must stay in the same order
// as the non-static data members declared in IntSimdMatrix.
const IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {
// Function: the NEON matrixDotVector defined in this file.
matrixDotVector,
// Number of 32 bit outputs held in each register.
kNumOutputsPerRegister,
// Maximum number of registers that we will use to hold outputs.
kMaxOutputRegisters,
// Number of 8 bit inputs in the inputs register.
kNumInputsPerRegister,
// Number of inputs in each weight group.
kNumInputsPerGroup};
} // namespace tesseract.
#endif /* __ARM_NEON */

View File

@ -0,0 +1,106 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrixsse.cpp
// Description: SSE implementation of 8-bit int SIMD matrix multiply.
// Author: Ray Smith
//
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#if !defined(__SSE4_1__)
# if defined(__i686__) || defined(__x86_64__)
# error Implementation only for SSE 4.1 capable architectures
# endif
#else
# include "intsimdmatrix.h"
# include <emmintrin.h>
# include <smmintrin.h>
# include <cstdint>
namespace tesseract {
// Returns the dot product of the int8 n-vectors u and v as a 32-bit integer.
// Whole blocks of 8 elements are processed with SSE4.1 intrinsics; any
// remaining elements (all of them when n < 8) are handled by a scalar loop.
static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
  // Highest offset at which a full block of 8 elements still fits.
  const int last_block_start = n - 8;
  int pos = 0;
  int32_t result = 0;
  if (last_block_start >= 0) {
    // Four 32-bit partial sums.
    __m128i acc = _mm_setzero_si128();
    for (; pos <= last_block_start; pos += 8) {
      // Load 8 bytes from each vector and sign-extend them to 16 bits.
      const __m128i a16 =
          _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(u + pos)));
      const __m128i b16 =
          _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(v + pos)));
      // _mm_madd_epi16 multiplies the 8 pairs of 16-bit values into 32-bit
      // products and adds adjacent products, giving 4 32-bit sums.
      acc = _mm_add_epi32(acc, _mm_madd_epi16(a16, b16));
    }
    // Horizontally add the 4 partial sums and take the low lane.
    acc = _mm_hadd_epi32(acc, acc);
    acc = _mm_hadd_epi32(acc, acc);
    result = _mm_cvtsi128_si32(acc);
  }
  // Scalar tail.
  for (; pos < n; ++pos) {
    result += u[pos] * v[pos];
  }
  return result;
}
// Computes a single output element of the matrix-vector product v = Wu.
//
// wi:     one weight row: num_in weights followed by one bias weight.
// scales: points at the scale factor for this row.
// u:      input vector with num_in entries.
// num_in: number of inputs (the bias is excluded).
// v:      receives the scaled result for this row.
static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
                                    int num_in, double *v) {
  const double dot = IntDotProductSSE(u, wi, num_in);
  // The bias weight sits right after the inputs; it is multiplied by INT8_MAX
  // before the row scale is applied to the whole sum.
  const int bias = wi[num_in] * INT8_MAX;
  *v = (dot + bias) * scales[0];
}
// Computes v = Wu one output row at a time.
//
// dim1:   number of outputs (rows of W).
// dim2:   row stride of W; each row holds dim2 - 1 input weights plus a bias.
// wi:     weights, dim1 rows of dim2 entries.
// scales: one scale factor per output row.
// u:      input vector.
// v:      output vector with dim1 entries.
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
                            const int8_t *u, double *v) {
  const int num_in = dim2 - 1;
  // Step the weight row, its scale and the output slot together.
  for (int row = 0; row < dim1; ++row, wi += dim2, ++scales, ++v) {
    PartialMatrixDotVector1(wi, scales, u, num_in, v);
  }
}
// Descriptor for the SSE implementation: the matrix-vector function defined
// in this file followed by four layout parameters, all set to 1.
// NOTE(review): all-ones presumably means the weights need no regrouping for
// this implementation - confirm against the IntSimdMatrix declaration.
const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
matrixDotVector,
// Number of 32 bit outputs held in each register.
1,
// Maximum number of registers that we will use to hold outputs.
1,
// Number of 8 bit inputs in the inputs register.
1,
// Number of inputs in each weight group.
1};
} // namespace tesseract.
#endif

View File

@ -0,0 +1,283 @@
///////////////////////////////////////////////////////////////////////
// File: simddetect.cpp
// Description: Architecture detector.
// Author: Stefan Weil (based on code from Ray Smith)
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifdef HAVE_CONFIG_H
# include "config_auto.h" // for HAVE_AVX, ...
#endif
#include <numeric> // for std::inner_product
#include "dotproduct.h"
#include "intsimdmatrix.h" // for IntSimdMatrix
#include "params.h" // for STRING_VAR
#include "simddetect.h"
#include "tprintf.h" // for tprintf
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
# define HAS_CPUID
#endif
#if defined(HAS_CPUID)
# if defined(__GNUC__)
# include <cpuid.h>
# elif defined(_WIN32)
# include <intrin.h>
# endif
#endif
#if defined(HAVE_NEON) && !defined(__aarch64__)
# ifdef ANDROID
# include <cpu-features.h>
# else
/* Assume linux */
# include <asm/hwcap.h>
# include <sys/auxv.h>
# endif
#endif
namespace tesseract {
// Computes and returns the dot product of the two n-vectors u and v.
// Note: because the order of addition is different among the different dot
// product functions, the results can (and do) vary slightly (although they
// agree to within about 4e-15). This produces different results when running
// training, despite all random inputs being precisely equal.
// To get consistent results, use just one of these dot product functions.
// On a test multi-layer network, serial is 57% slower than SSE, and AVX
// is about 8% faster than SSE. This suggests that the time is memory
// bandwidth constrained and could benefit from holding the reused vector
// in AVX registers.
// Currently selected dot product implementation. Assigned by SetDotProduct().
DotProductFunction DotProduct;
// Config variable naming the dot product implementation; defaults to "auto".
static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
// The single detector instance; its constructor performs the detection.
SIMDDetect SIMDDetect::detector;
// Only the availability flags relevant for the target architecture are
// defined below.
#if defined(__aarch64__)
// ARMv8 always has NEON.
bool SIMDDetect::neon_available_ = true;
#elif defined(HAVE_NEON)
// If true, then Neon has been detected.
bool SIMDDetect::neon_available_;
#else
// If true, then AVX has been detected.
bool SIMDDetect::avx_available_;
// If true, then AVX2 has been detected.
bool SIMDDetect::avx2_available_;
// If true, then AVX512 Foundation has been detected.
bool SIMDDetect::avx512F_available_;
// If true, then AVX512 BW has been detected.
bool SIMDDetect::avx512BW_available_;
// If true, then FMA has been detected.
bool SIMDDetect::fma_available_;
// If true, then SSE4.1 has been detected.
bool SIMDDetect::sse_available_;
#endif
// Portable dot product: returns the sum of u[i] * v[i] for i in [0, n).
// The products are accumulated in index order, starting from 0.0.
static double DotProductGeneric(const double *u, const double *v, int n) {
  double sum = 0.0;
  int i = 0;
  while (i < n) {
    sum += u[i] * v[i];
    ++i;
  }
  return sum;
}
// Dot product of the two n-vectors u and v, delegated to std::inner_product
// with a double accumulator starting at 0.0.
static double DotProductStdInnerProduct(const double *u, const double *v, int n) {
  const double *const u_end = u + n;
  const double initial = 0.0;
  return std::inner_product(u, u_end, v, initial);
}
// Installs the floating point dot product function f together with the
// integer matrix implementation m. m defaults to nullptr, which is what the
// generic selections pass.
static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
  IntSimdMatrix::intSimdMatrix = m;
  DotProduct = f;
}
// Constructor.
// Tests the architecture in a system-dependent way to detect AVX, SSE and
// any other available SIMD equipment.
// __GNUC__ is also defined by compilers that include GNU extensions such as
// clang.
//
// Steps:
//  1. Install the generic dot product as the fallback.
//  2. Fill in the *_available_ flags (CPUID on x86, OS queries for 32-bit
//     NEON); which flags are probed depends on the HAVE_* build macros.
//  3. Pick the first available implementation in the order AVX2, AVX, SSE,
//     NEON. If none applies, the generic fallback from step 1 stays active.
SIMDDetect::SIMDDetect() {
// The fallback is a generic dot product calculation.
SetDotProduct(DotProductGeneric);
#if defined(HAS_CPUID)
# if defined(__GNUC__)
unsigned int eax, ebx, ecx, edx;
// CPUID leaf 1 carries the SSE4.1, OSXSAVE, FMA and AVX feature bits in ecx.
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
// Note that these tests all use hex because the older compilers don't have
// the newer flags.
# if defined(HAVE_SSE4_1)
sse_available_ = (ecx & 0x00080000) != 0;
# endif
# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
// Reads the low 32 bits of extended control register 0 (XCR0).
auto xgetbv = []() {
uint32_t xcr0;
__asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
return xcr0;
};
if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
// OSXSAVE bit is set, XMM state and YMM state are fine.
# if defined(HAVE_FMA)
fma_available_ = (ecx & 0x00001000) != 0;
# endif
# if defined(HAVE_AVX)
avx_available_ = (ecx & 0x10000000) != 0;
// In this branch AVX2/AVX512 are only probed when AVX is present and the
// probe is compiled under HAVE_AVX (the _WIN32 branch uses HAVE_AVX2).
if (avx_available_) {
// There is supposed to be a __get_cpuid_count function, but this is all
// there is in my cpuid.h. It is a macro for an asm statement and cannot
// be used inside an if.
__cpuid_count(7, 0, eax, ebx, ecx, edx);
avx2_available_ = (ebx & 0x00000020) != 0;
avx512F_available_ = (ebx & 0x00010000) != 0;
avx512BW_available_ = (ebx & 0x40000000) != 0;
}
# endif
}
# endif
}
# elif defined(_WIN32)
// cpuInfo holds the four result registers in the order eax, ebx, ecx, edx.
int cpuInfo[4];
int max_function_id;
// Leaf 0 reports the highest supported CPUID leaf in eax.
__cpuid(cpuInfo, 0);
max_function_id = cpuInfo[0];
if (max_function_id >= 1) {
__cpuid(cpuInfo, 1);
# if defined(HAVE_SSE4_1)
sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
# endif
# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
// OSXSAVE bit is set, XMM state and YMM state are fine.
# if defined(HAVE_FMA)
fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
# endif
# if defined(HAVE_AVX)
avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
# endif
# if defined(HAVE_AVX2)
// Leaf 7 is only queried when the CPU reports that it exists.
if (max_function_id >= 7) {
__cpuid(cpuInfo, 7);
avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
}
# endif
}
# endif
}
# else
# error "I don't know how to test for SIMD with this compiler"
# endif
#endif
// 32-bit ARM: NEON is optional and has to be queried from the OS.
#if defined(HAVE_NEON) && !defined(__aarch64__)
# ifdef ANDROID
{
AndroidCpuFamily family = android_getCpuFamily();
if (family == ANDROID_CPU_FAMILY_ARM)
neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
}
# else
/* Assume linux */
neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
# endif
#endif
// Select code for calculation of dot product based on autodetection.
// The chain below is assembled by the preprocessor; each #if block
// contributes one "else if" branch.
if (false) {
// This is a dummy to support conditional compilation.
#if defined(HAVE_AVX2)
} else if (avx2_available_) {
// AVX2 detected.
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
#endif
#if defined(HAVE_AVX)
} else if (avx_available_) {
// AVX detected.
// Note that the integer matrix code used here is the SSE variant.
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
#endif
#if defined(HAVE_SSE4_1)
} else if (sse_available_) {
// SSE detected.
SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
#endif
#if defined(HAVE_NEON) || defined(__aarch64__)
} else if (neon_available_) {
// NEON detected.
// DotProduct still holds DotProductGeneric (set at the top), so only the
// integer matrix implementation changes here.
SetDotProduct(DotProduct, &IntSimdMatrix::intSimdMatrixNEON);
#endif
}
}
// Re-selects the dot product implementation after the "dotproduct" config
// variable has been set.
//
// Recognized values are "auto" (keep whatever is currently installed),
// "generic", "native", "std::inner_product" and, depending on the HAVE_*
// build macros, "avx2", "avx", "fma" and "sse". Any other value is reported
// with a warning that lists the values this build accepts, and the current
// selection is left untouched.
//
// On return the config variable is rewritten with the name of the selected
// method.
// NOTE(review): for "auto" and for unsupported values the variable is
// rewritten to "generic" although no generic function is installed in those
// branches - confirm that this is intended, since a later call to Update()
// would then switch to the generic code.
void SIMDDetect::Update() {
  // Select code for calculation of dot product based on the
  // value of the config variable if that value is not empty.
  const char *dotproduct_method = "generic";
  if (!strcmp(dotproduct.c_str(), "auto")) {
    // Automatic detection. Nothing to be done.
  } else if (!strcmp(dotproduct.c_str(), "generic")) {
    // Generic code selected by config variable.
    SetDotProduct(DotProductGeneric);
    dotproduct_method = "generic";
  } else if (!strcmp(dotproduct.c_str(), "native")) {
    // Native optimized code selected by config variable.
    SetDotProduct(DotProductNative);
    dotproduct_method = "native";
#if defined(HAVE_AVX2)
  } else if (!strcmp(dotproduct.c_str(), "avx2")) {
    // AVX2 selected by config variable.
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
    dotproduct_method = "avx2";
#endif
#if defined(HAVE_AVX)
  } else if (!strcmp(dotproduct.c_str(), "avx")) {
    // AVX selected by config variable.
    SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
    dotproduct_method = "avx";
#endif
#if defined(HAVE_FMA)
  } else if (!strcmp(dotproduct.c_str(), "fma")) {
    // FMA selected by config variable.
    // The integer matrix implementation that is currently installed is kept.
    SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
    dotproduct_method = "fma";
#endif
#if defined(HAVE_SSE4_1)
  } else if (!strcmp(dotproduct.c_str(), "sse")) {
    // SSE selected by config variable.
    SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
    dotproduct_method = "sse";
#endif
  } else if (!strcmp(dotproduct.c_str(), "std::inner_product")) {
    // std::inner_product selected by config variable.
    SetDotProduct(DotProductStdInnerProduct);
    dotproduct_method = "std::inner_product";
  } else {
    // Unsupported value of config variable.
    tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
            dotproduct.c_str());
    // List every value that the branches above accept in this build, so the
    // hint stays in sync with the HAVE_* guarded cases (avx2 and fma
    // included).
    tprintf(
        "Supported values for dotproduct: auto generic native"
#if defined(HAVE_AVX2)
        " avx2"
#endif
#if defined(HAVE_AVX)
        " avx"
#endif
#if defined(HAVE_FMA)
        " fma"
#endif
#if defined(HAVE_SSE4_1)
        " sse"
#endif
        " std::inner_product.\n");
  }
  dotproduct.set_value(dotproduct_method);
}
} // namespace tesseract

View File

@ -0,0 +1,87 @@
///////////////////////////////////////////////////////////////////////
// File: simddetect.h
// Description: Architecture detector.
// Author: Stefan Weil (based on code from Ray Smith)
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_ARCH_SIMDDETECT_H_
#define TESSERACT_ARCH_SIMDDETECT_H_
#include <tesseract/export.h>
namespace tesseract {
// Function pointer for best calculation of dot product.
using DotProductFunction = double (*)(const double *, const double *, int);
extern DotProductFunction DotProduct;
// Detects which SIMD instruction sets can be used and exposes the result
// through static query functions. New architectures for SIMD-based faster
// dot product functions should be detected here as well. Meant to exist as
// one static object, though additional instances do no real harm.
class SIMDDetect {
public:
  // True if AVX is available on this system.
  static bool IsAVXAvailable() { return detector.avx_available_; }
  // True if AVX2 (integer support) is available on this system.
  static bool IsAVX2Available() { return detector.avx2_available_; }
  // True if AVX512 Foundation (float) is available on this system.
  static bool IsAVX512FAvailable() { return detector.avx512F_available_; }
  // True if AVX512 integer is available on this system.
  static bool IsAVX512BWAvailable() { return detector.avx512BW_available_; }
  // True if FMA is available on this system.
  static bool IsFMAAvailable() { return detector.fma_available_; }
  // True if SSE4.1 is available on this system.
  static bool IsSSEAvailable() { return detector.sse_available_; }
  // True if NEON is available on this system.
  static bool IsNEONAvailable() { return detector.neon_available_; }

  // Update settings after config variable was set.
  static TESS_API void Update();

private:
  // Constructor, must set all static member variables.
  SIMDDetect();

  // Singleton.
  static SIMDDetect detector;

  // Detection results, one flag per instruction set.
  static TESS_API bool avx_available_;      // AVX
  static TESS_API bool avx2_available_;     // AVX2
  static TESS_API bool avx512F_available_;  // AVX512 Foundation
  static TESS_API bool avx512BW_available_; // AVX512 BW
  static TESS_API bool fma_available_;      // FMA
  static TESS_API bool sse_available_;      // SSE4.1
  static TESS_API bool neon_available_;     // NEON
};
} // namespace tesseract
#endif // TESSERACT_ARCH_SIMDDETECT_H_

View File

@ -0,0 +1,120 @@
/**********************************************************************
* File: adaptions.cpp (Formerly adaptions.c)
* Description: Functions used to adapt to blobs already confidently
* identified
* Author: Chris Newton
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <cctype>
#include <cstring>
#include "control.h"
#include "reject.h"
#include "stopper.h"
#include "tesseractclass.h"
#include "tessvars.h"
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
namespace tesseract {
// Decides whether a recognized word may be used for adaption.
//
// word: the word result to examine.
// mode: bit mask of checks to apply (see MODES for the bit positions);
//       0 disables adaption altogether.
// Returns true only if at least one of the ADAPTABLE_WERD / ACCEPTABLE_WERD
// checks accepted the word and none of the enabled veto checks rejected it.
bool Tesseract::word_adaptable( // should we adapt?
    WERD_RES *word, uint16_t mode) {
  // Bit positions inside mode, starting at the lowest bit.
  enum MODES {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  if (tessedit_adaption_debug) {
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            word->best_choice->unichar_string().c_str(), word->best_choice->rating(),
            word->best_choice->certainty());
  }

  // Mode 0: no adaption at all.
  if (mode == 0) {
    if (tessedit_adaption_debug) {
      tprintf("adaption disabled\n");
    }
    return false;
  }

  const std::bitset<16> flags(mode);
  bool status = false;

  // Accepting checks: either of them can switch the status on.
  if (flags.test(ADAPTABLE_WERD)) {
    // Result of Classify::AdaptableWord().
    status = status || word->tess_would_adapt;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }
  if (flags.test(ACCEPTABLE_WERD)) {
    status = status || word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }
  // Nothing accepted the word, so the veto checks are irrelevant.
  if (!status) {
    return false;
  }

  // Veto checks: any enabled check that fires rejects the word.
  if (flags.test(CHECK_DAWGS)) {
    const auto permuter = word->best_choice->permuter();
    const bool from_dawg = permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
                           permuter == USER_DAWG_PERM || permuter == NUMBER_PERM;
    if (!from_dawg) {
      if (tessedit_adaption_debug) {
        tprintf("word not in dawgs\n");
      }
      return false;
    }
  }
  if (flags.test(CHECK_ONE_ELL_CONFLICT) && one_ell_conflict(word, false)) {
    if (tessedit_adaption_debug) {
      tprintf("word has ell conflict\n");
    }
    return false;
  }
  if (flags.test(CHECK_SPACES)) {
    const char *text = word->best_choice->unichar_string().c_str();
    if (strchr(text, ' ') != nullptr) {
      if (tessedit_adaption_debug) {
        tprintf("word contains spaces\n");
      }
      return false;
    }
  }
  if (flags.test(CHECK_AMBIG_WERD) && word->best_choice->dangerous_ambig_found()) {
    if (tessedit_adaption_debug) {
      tprintf("word is ambiguous\n");
    }
    return false;
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}
} // namespace tesseract

View File

@ -0,0 +1,781 @@
/**********************************************************************
* File: applybox.cpp (Formerly applybox.c)
* Description: Re segment rows according to box file data
* Author: Phil Cheatle
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef DISABLED_LEGACY_ENGINE
# include <allheaders.h>
# include <cctype>
# include <cerrno>
# include <cstring>
# include "boxread.h"
#endif // ndef DISABLED_LEGACY_ENGINE
#include <tesseract/unichar.h>
#include "pageres.h"
#include "tesseractclass.h"
#include "unicharset.h"
// Tuning constants; only compiled when the legacy engine is enabled.
#ifndef DISABLED_LEGACY_ENGINE
/** Max number of blobs to classify together in FindSegmentation. */
const int kMaxGroupSize = 4;
/// Max fraction of median allowed as deviation in xheight before switching
/// to median. Used by PreenXHeights: rows whose x-height differs from the
/// median by more than this fraction of the median get the median instead.
const double kMaxXHeightDeviationFraction = 0.125;
#endif // ndef DISABLED_LEGACY_ENGINE
/**
* The box file is assumed to contain box definitions, one per line, of the
* following format for blob-level boxes:
* @verbatim
* <UTF8 str> <left> <bottom> <right> <top> <page id>
* @endverbatim
* and for word/line-level boxes:
* @verbatim
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
* @endverbatim
* NOTES:
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
*
* <page id> is 0-based, and the page number is used for multipage input (tiff).
*
* In the blob-level form, each line represents a recognizable unit, which may
* be several UTF-8 bytes, but there is a bounding box around each recognizable
* unit, and no classifier is needed to train in this mode (bootstrapping.)
*
* In the word/line-level form, the line begins with the literal "WordStr", and
* the bounding box bounds either a whole line or a whole word. The recognizable
* units in the word/line are listed after the # at the end of the line and
* are space delimited, ignoring any original spaces on the line.
* Eg.
* @verbatim
* word -> #w o r d
* multi word line -> #m u l t i w o r d l i n e
* @endverbatim
* The recognizable units must be space-delimited in order to allow multiple
* unicodes to be used for a single recognizable unit, eg Hindi.
*
* In this mode, the classifier must have been pre-trained with the desired
* character set, or it will not be able to find the character segmentations.
*/
namespace tesseract {
#ifndef DISABLED_LEGACY_ENGINE
// Resets the text of every word in every row of every block to the empty
// string.
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        WERD *word = words.data();
        word->set_text("");
      }
    }
  }
}
// Applies the box file that belongs to the image name filename and
// resegments the words in block_list (the page):
//   blob-mode: one blob per line in the box file, words as input.
//   word/line-mode: one blob per space-delimited unit after the #, and one
//     word per line in the box file. (See the box file format comment above.)
// With find_segmentation true (word/line mode) the classifier is used to
// re-segment words/lines so that they match the space-delimited truth string
// of each box. The input box may then cover a word or even a whole text
// line, and the output words contain multiple blobs corresponding to the
// space-delimited input string.
// With find_segmentation false no classifier is needed, but the chopper can
// still be used to segment touching characters correctly with the help of
// the input boxes.
// The WERD_RES entries of the returned PAGE_RES are set up as normal
// classification would leave them, i.e. with a word, chopped_word,
// rebuild_word, seam_array, denorm, box_word and best_state, but WITHOUT
// best_choice or raw_choice, since those would need a UNICHARSET, which is
// to be avoided here. The correct_text member of WERD_RES is set instead and
// may later be turned into a best_choice by CorrectClassifyWords.
// CorrectClassifyWords is not required before calling ApplyBoxTraining.
// Returns nullptr if the boxes cannot be read.
PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
                                BLOCK_LIST *block_list) {
  std::vector<TBOX> boxes;
  std::vector<std::string> texts, full_texts;
  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
    return nullptr; // Can't do it.
  }

  const int box_count = boxes.size();
  int box_failures = 0;

  // Blob mode works on the existing words, maximally chopped up front.
  // Word mode builds a word per box first and sets up the page afterwards.
  PAGE_RES *page_res = nullptr;
  if (!find_segmentation) {
    page_res = SetupApplyBoxes(boxes, block_list);
  }
  clear_any_old_text(block_list);

  for (int i = 0; i < box_count; i++) {
    const TBOX *next_box = (i + 1 < box_count) ? &boxes[i + 1] : nullptr;
    bool foundit = false;
    if (page_res == nullptr) {
      foundit = ResegmentWordBox(block_list, boxes[i], next_box, texts[i].c_str());
    } else {
      const TBOX *prev_box = (i > 0) ? &boxes[i - 1] : nullptr;
      foundit = ResegmentCharBox(page_res, prev_box, boxes[i], next_box, full_texts[i].c_str());
    }
    if (foundit) {
      continue;
    }
    box_failures++;
    ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
  }

  if (page_res == nullptr) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }

  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf(" Boxes read from boxfile: %6d\n", box_count);
    if (box_failures > 0) {
      tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
    }
  }
  TidyUp(page_res);
  return page_res;
}
// Helper computes median xheight in the image.
//
// Collects the rounded x-height of every row of every block into a histogram
// whose range runs from 0 to the bounding box height of the first block, and
// returns the median of that histogram.
// Returns 0.0 for an empty block list: there is no first block to size the
// histogram from, and reading the current element of an iterator over an
// empty list is not valid.
static double MedianXHeight(BLOCK_LIST *block_list) {
  if (block_list->empty()) {
    return 0.0;
  }
  BLOCK_IT block_it(block_list);
  STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
    }
  }
  return xheights.median();
}
/// Replaces outlier row x-heights by the median x-height of the page.
/// A row counts as an outlier when its x-height differs from the median by
/// more than kMaxXHeightDeviationFraction times the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  const double median_xheight = MedianXHeight(block_list);
  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
  // Visit every row of every block and pull outliers back to the median.
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW *row = row_it.data();
      const double deviation = fabs(row->x_height() - median_xheight);
      if (deviation > max_deviation) {
        if (applybox_debug) {
          tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
        }
        row->set_x_height(static_cast<float>(median_xheight));
      }
    }
  }
}
/// Prepares a PAGE_RES for ApplyBoxes from block_list: x-heights are
/// preened, words without blobs are deleted, the fuzzy space flags of the
/// remaining words are cleared, and every word is maximally chopped.
/// The caller receives the newly allocated PAGE_RES.
PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
  PreenXHeights(block_list);

  // Strip all fuzzy space markers to simplify the PAGE_RES.
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      WERD_IT word_it(row_it.data()->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
        WERD *word = word_it.data();
        if (!word->cblob_list()->empty()) {
          word->set_flag(W_FUZZY_SP, false);
          word->set_flag(W_FUZZY_NON, false);
        } else {
          // A word without blobs is removed from its row and destroyed.
          delete word_it.extract();
        }
      }
    }
  }

  auto *page_res = new PAGE_RES(false, block_list, nullptr);
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
       pr_it.forward(), word_res = pr_it.word()) {
    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
  }
  return page_res;
}
/// Tests the chopper by exhaustively running chop_one_blob.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
WERD_RES *word_res) {
if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
classify_bln_numeric_mode, textord_use_cjk_fp_model,
poly_allow_detailed_fx, row, block)) {
word_res->CloneChoppedToRebuild();
return;
}
if (chop_debug) {
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
std::vector<BLOB_CHOICE *> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
auto rating = static_cast<float>(INT8_MAX);
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with INT8_MAX, subtract 1/8 for each blob
// in here, and then divide by e each time they are chopped, which
// should guarantee a set of unequal values for the whole tree of blobs
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
const double e = exp(1.0); // The base of natural logs.
int blob_number;
int right_chop_index = 0;
if (!assume_fixed_pitch_char_segment) {
// We only chop if the language is not fixed pitch like CJK.
SEAM *seam = nullptr;
while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
word_res->InsertSeam(blob_number, seam);
BLOB_CHOICE *left_choice = blob_choices[blob_number];
rating = left_choice->rating() / e;
left_choice->set_rating(rating);
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
0.0f, 0.0f, BCC_FAKE);
blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
}
}
word_res->CloneChoppedToRebuild();
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
/// Computes the metric used to resolve disputes over a blob.
/// The goal is to hand a disputed blob to the boxfile box that suits it
/// best. That is usually obvious, but not when two boxfile boxes overlap
/// significantly. When a small and a large boxfile box both take most of the
/// blob, the small one should win, unless the small box is much smaller than
/// the blob, in which case it should not.
/// Details:
/// For a box of area A and a blob of area B with overlap area C, the miss
/// metric is (A-C)(B-C)/(AB); the box with the minimum miss metric gets the
/// blob.
/// Both boxes must have a non-zero area.
static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
  const int area1 = box1.area();
  const int area2 = box2.area();
  const int shared_area = box1.intersection(box2).area();
  ASSERT_HOST(area1 != 0 && area2 != 0);
  const int missed1 = area1 - shared_area;
  const int missed2 = area2 - shared_area;
  return 1.0 * missed1 * missed2 / area1 / area2;
}
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
/// @param page_res the page whose words are searched for matching blobs.
/// @param prev_box the preceding boxfile box, or nullptr if there is none.
/// @param box the boxfile box to find blobs for.
/// @param next_box the following boxfile box, or nullptr if there is none.
/// @param correct_text the text stored for the matched blobs.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
const TBOX *next_box, const char *correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
}
PAGE_RES_IT page_res_it(page_res);
WERD_RES *word_res;
// Only words whose bounding box substantially overlaps the box are examined.
for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
if (!word_res->box_word->bounding_box().major_overlap(box)) {
continue;
}
if (applybox_debug > 1) {
tprintf("Checking word box:");
word_res->box_word->bounding_box().print();
}
int word_len = word_res->box_word->length();
// Try each blob index i as the start of a run of blobs that match the box.
for (int i = 0; i < word_len; ++i) {
// Union of the blob boxes collected for the run starting at i.
TBOX char_box = TBOX();
int blob_count = 0;
// Extend the run until a blob fails one of the tests below.
for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
if (!blob_box.major_overlap(box)) {
break;
}
if (word_res->correct_text[i + blob_count].length() > 0) {
break; // Blob is claimed already.
}
if (next_box != nullptr) {
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric) {
break; // Blob is a better match for next box.
}
}
char_box += blob_box;
}
if (blob_count > 0) {
if (applybox_debug > 1) {
tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
}
// Give up on this box if the collected blobs do not closely match it and
// the box overlaps one of its neighbours horizontally (negative x gap).
if (!char_box.almost_equal(box, 3) &&
((next_box != nullptr && box.x_gap(*next_box) < -3) ||
(prev_box != nullptr && prev_box->x_gap(box) < -3))) {
return false;
}
// We refine just the box_word, best_state and correct_text here.
// The rebuild_word is made in TidyUp.
// blob_count blobs are put together to match the box. Merge the
// box_word boxes, save the blob_count in the state and the text.
word_res->box_word->MergeBoxes(i, i + blob_count);
word_res->best_state[i] = blob_count;
word_res->correct_text[i] = correct_text;
if (applybox_debug > 2) {
tprintf("%d Blobs match: blob box:", blob_count);
word_res->box_word->BlobBox(i).print();
tprintf("Matches box:");
box.print();
if (next_box != nullptr) {
tprintf("With next box:");
next_box->print();
}
}
// Eliminate the best_state and correct_text entries for the consumed
// blobs. The entry at i + 1 is erased blob_count - 1 times, so only the
// merged entry at i remains for the whole run.
for (int j = 1; j < blob_count; ++j) {
word_res->best_state.erase(word_res->best_state.begin() + i + 1);
word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
}
// Assume that no box spans multiple source words, so we are done with
// this box.
if (applybox_debug > 1) {
tprintf("Best state = ");
for (auto best_state : word_res->best_state) {
tprintf("%d ", best_state);
}
tprintf("\n");
tprintf("Correct text = [[ ");
for (auto &it : word_res->correct_text) {
tprintf("%s ", it.c_str());
}
tprintf("]]\n");
}
return true;
}
}
}
// No word contained a usable run of blobs for this box.
if (applybox_debug > 0) {
tprintf("FAIL!\n");
}
return false; // Failure.
}
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
///
/// The search descends block -> row -> word -> blob, skipping any level whose
/// bounding box has no major overlap with box, and skipping words that
/// already carry a non-empty text label.
/// @param block_list blocks to search for unlabelled source blobs.
/// @param box the box to be filled with blobs.
/// @param next_box the following box, or nullptr. A blob whose miss metric
/// against next_box is lower than against box is left for next_box.
/// @param correct_text label given to the newly created word.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
const char *correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
}
// Created lazily on the first matching blob; stays nullptr if nothing matches.
WERD *new_word = nullptr;
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK *block = b_it.data();
if (!box.major_overlap(block->pdblk.bounding_box())) {
continue;
}
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
ROW *row = r_it.data();
if (!box.major_overlap(row->bounding_box())) {
continue;
}
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD *word = w_it.data();
if (applybox_debug > 2) {
tprintf("Checking word:");
word->bounding_box().print();
}
if (word->text() != nullptr && word->text()[0] != '\0') {
continue; // Ignore words that are already done.
}
if (!box.major_overlap(word->bounding_box())) {
continue;
}
C_BLOB_IT blob_it(word->cblob_list());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
C_BLOB *blob = blob_it.data();
TBOX blob_box = blob->bounding_box();
if (!blob_box.major_overlap(box)) {
continue;
}
// With a following box present, the blob goes to whichever box it
// misses least; ties go to the current box.
if (next_box != nullptr) {
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric) {
continue; // Blob is a better match for next box.
}
}
if (applybox_debug > 2) {
tprintf("Blob match: blob:");
blob_box.print();
tprintf("Matches box:");
box.print();
if (next_box != nullptr) {
tprintf("With next box:");
next_box->print();
}
}
if (new_word == nullptr) {
// Make a new word with a single blob.
// The new word is a shallow copy of the first source word that yields
// a blob, labelled with correct_text and appended to that word's list.
new_word = word->shallow_copy();
new_word->set_text(correct_text);
w_it.add_to_end(new_word);
}
// Move (not copy) the blob out of the source word into the new word.
C_BLOB_IT new_blob_it(new_word->cblob_list());
new_blob_it.add_to_end(blob_it.extract());
}
}
}
}
if (new_word == nullptr && applybox_debug > 0) {
tprintf("FAIL!\n");
}
return new_word != nullptr;
}
/// Runs the classifier over every labelled word, trying to recover the
/// segmentation that yields the word's ground-truth text.
/// Words with no text label are left untouched. A labelled word is reported
/// and deleted from page_res when its text cannot be converted to unichar ids
/// or when no matching segmentation can be found.
void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr;
       pr_it.forward(), word_res = pr_it.word()) {
    const WERD *word = word_res->word;
    const bool has_label = word->text() != nullptr && word->text()[0] != '\0';
    if (has_label) {
      // Convert the correct text to a vector of UNICHAR_ID
      std::vector<UNICHAR_ID> target_text;
      if (!ConvertStringToUnichars(word->text(), &target_text)) {
        tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
        pr_it.DeleteCurrentWord();
      } else if (!FindSegmentation(target_text, word_res)) {
        tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
        pr_it.DeleteCurrentWord();
      }
    }
  }
}
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) {
const char *next_space = strchr(utf8, ' ');
if (next_space == nullptr) {
next_space = utf8 + strlen(utf8);
}
step = next_space - utf8;
UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
if (class_id == INVALID_UNICHAR_ID) {
return false;
}
while (utf8[step] == ' ') {
++step;
}
class_ids->push_back(class_id);
}
return true;
}
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
// Classify all required combinations of blobs and save results in choices.
const int word_length = word_res->box_word->length();
auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST *match_result =
classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
word_res->blamer_bundle);
if (applybox_debug > 2) {
tprintf("%d+%d:", i, j);
print_ratings_list("Segment:", match_result, unicharset);
}
choices[i].push_back(match_result);
}
}
// Search the segmentation graph for the target text. Must be an exact
// match. Using wildcards makes it difficult to find the correct
// segmentation even when it is there.
word_res->best_state.clear();
std::vector<int> search_segmentation;
float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
&word_res->best_state);
for (int i = 0; i < word_length; ++i) {
for (auto choice : choices[i]) {
delete choice;
}
}
delete[] choices;
if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the
// truth, assume it will do.
int blob_count = 1;
for (auto s : word_res->seam_array) {
SEAM *seam = s;
if (!seam->HasAnySplits()) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
} else {
++blob_count;
}
}
word_res->best_state.push_back(blob_count);
if (word_res->best_state.size() != target_text.size()) {
word_res->best_state.clear(); // No good. Original segmentation bad size.
return false;
}
}
word_res->correct_text.clear();
for (auto &text : target_text) {
word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
}
return true;
}
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// At each level every available group length at choices_pos is tried; a
/// group matches when one of its choices is target_text[text_index] itself or
/// has a 1-1 entry in the dangerous-ambiguity table whose correction is that
/// target. A path is complete when it consumes all choices positions and all
/// of target_text at the same time.
/// @param choices is an array of vectors of length choices_length,
/// with each element representing a starting position in the word, and the
/// #vector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos starting position in choices to match from.
/// @param choices_length number of starting positions in choices.
/// @param target_text unichar ids that must be matched exactly, in order.
/// @param text_index index in target_text of the id to match at this level.
/// @param rating sum of the choice ratings on the path that led here.
/// @param segmentation working stack of the group lengths on the current
/// path; each pushed length is popped again before the next one is tried.
/// @param best_rating in/out: total rating of the best complete match so far.
/// @param best_segmentation out: group lengths of the best complete match;
/// replaced when it is empty or a complete match has a lower total rating.
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const std::vector<UNICHAR_ID> &target_text,
int text_index, float rating, std::vector<int> *segmentation,
float *best_rating, std::vector<int> *best_segmentation) {
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
// length is the number of consecutive blobs grouped at choices_pos.
for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
// Rating of matching choice or worst choice if no match.
float choice_rating = 0.0f;
// Find the corresponding best BLOB_CHOICE.
BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
const BLOB_CHOICE *choice = choice_it.data();
choice_rating = choice->rating();
UNICHAR_ID class_id = choice->unichar_id();
if (class_id == target_text[text_index]) {
break;
}
// Search ambigs table.
if (class_id < table.size() && table[class_id] != nullptr) {
AmbigSpec_IT spec_it(table[class_id]);
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
const AmbigSpec *ambig_spec = spec_it.data();
// We'll only do 1-1.
if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
ambig_spec->correct_ngram_id == target_text[text_index]) {
break;
}
}
// Leaving the loop early (list not fully cycled) means an entry matched.
if (!spec_it.cycled_list()) {
break; // Found an ambig.
}
}
}
// A fully cycled iterator means no choice in this group matched.
if (choice_it.cycled_list()) {
continue; // No match.
}
segmentation->push_back(length);
if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
// This is a complete match. If the rating is good record a new best.
if (applybox_debug > 2) {
tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
rating + choice_rating, *best_rating, segmentation->size(),
best_segmentation->size());
}
if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
*best_segmentation = *segmentation;
*best_rating = rating + choice_rating;
}
} else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
// Both blobs and text remain: continue the search after this group.
if (applybox_debug > 3) {
tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]),
choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
choices_pos, length);
}
SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
rating + choice_rating, segmentation, best_rating, best_segmentation);
if (applybox_debug > 3) {
tprintf("End recursion for %d=%s\n", target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]));
}
}
// Backtrack: drop this group's length before trying the next one.
segmentation->resize(segmentation->size() - 1);
}
}
/// - Counts up the labelled words and the blobs within.
/// - Deletes all unused or emptied words, counting the unused ones.
/// - Resets W_BOL and W_EOL flags correctly.
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
/// @param page_res page results to tidy; words with no labelled blob are
/// deleted from it.
void Tesseract::TidyUp(PAGE_RES *page_res) {
  int ok_blob_count = 0;
  int bad_blob_count = 0;
  int ok_word_count = 0;
  int unlabelled_words = 0;
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res;
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    int ok_in_word = 0;
    int blob_count = word_res->correct_text.size();
    auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
    word_choice->set_permuter(TOP_CHOICE_PERM);
    for (int c = 0; c < blob_count; ++c) {
      // A blob counts as labelled when its correct_text entry is non-empty.
      if (word_res->correct_text[c].length() > 0) {
        ++ok_in_word;
      }
      // Since we only need a fake word_res->best_choice, the actual
      // unichar_ids do not matter. Which is fortunate, since TidyUp()
      // can be called while training Tesseract, at the stage where
      // unicharset is not meaningful yet.
      word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
                                                     1.0f, -1.0f);
    }
    if (ok_in_word > 0) {
      // Count the kept word so the debug summary below reports a real total;
      // the counter was previously never updated and always printed 0.
      ++ok_word_count;
      ok_blob_count += ok_in_word;
      bad_blob_count += word_res->correct_text.size() - ok_in_word;
      word_res->LogNewRawChoice(word_choice);
      word_res->LogNewCookedChoice(1, false, word_choice);
    } else {
      ++unlabelled_words;
      if (applybox_debug > 0) {
        tprintf("APPLY_BOXES: Unlabelled word at :");
        word_res->word->bounding_box().print();
      }
      pr_it.DeleteCurrentWord();
      // The choice was never logged to the word, so it is still owned here.
      delete word_choice;
    }
  }
  pr_it.restart_page();
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
    // Denormalize back to a BoxWord.
    word_res->RebuildBestState();
    word_res->SetupBoxWord();
    // A word starts/ends a line when its neighbour lies in a different row.
    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
  }
  if (applybox_debug > 0) {
    tprintf("   Found %d good blobs.\n", ok_blob_count);
    if (bad_blob_count > 0) {
      tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
    }
    if (unlabelled_words > 0) {
      tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
    }
  }
}
/**
 * Logs a bad box, identified by its line in the box file and its coordinates.
 * @param boxfile_lineno zero-based line index; printed as a one-based number.
 * @param box the offending box, printed as ((left,bottom),(right,top)).
 * @param box_ch text of the box.
 * @param err_msg description of the failure.
 */
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
                                const char *err_msg) {
  const int displayed_line = boxfile_lineno + 1;
  const int left = box.left();
  const int bottom = box.bottom();
  const int right = box.right();
  const int top = box.top();
  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", displayed_line, box_ch,
          left, bottom, right, top, err_msg);
}
/// Feeds every word of page_res to #LearnWord, which extracts features for
/// the labelled blobs within the word. Features are stored in an internal
/// buffer. The number of words processed is reported at the end.
/// @param fontname font name passed through to #LearnWord.
/// @param page_res page results whose words are learned.
void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  int word_count = 0;
  WERD_RES *word_res = pr_it.word();
  while (word_res != nullptr) {
    LearnWord(fontname.c_str(), word_res);
    ++word_count;
    word_res = pr_it.forward();
  }
  tprintf("Generated training data for %d words\n", word_count);
}
#endif // ndef DISABLED_LEGACY_ENGINE
/**
 * Creates a fake best_choice entry in each WERD_RES with the correct text.
 * For every word a new WERD_CHOICE is built with one entry per correct_text
 * element, using the matching best_state value as the blob count, and is
 * logged as both the raw and the cooked choice after clearing earlier ones.
 * @param page_res page results whose words receive the fake choice.
 */
void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
    auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
    // Indexed loop: correct_text[i] and best_state[i] describe the same blob.
    for (size_t i = 0; i < word_res->correct_text.size(); ++i) {
      // The part before the first space is the real ground truth, and the
      // rest is the bounding box location and page number.
      std::vector<std::string> tokens = split(word_res->correct_text[i], ' ');
      // NOTE(review): assumes split() can return no tokens for an empty or
      // all-space label - confirm against split(). Such a blob gets the
      // invalid id instead of reading tokens[0] out of range.
      UNICHAR_ID char_id = INVALID_UNICHAR_ID;
      if (!tokens.empty()) {
        char_id = unicharset.unichar_to_id(tokens[0].c_str());
      }
      choice->append_unichar_id_space_allocated(char_id, word_res->best_state[i], 0.0f, 0.0f);
    }
    word_res->ClearWordChoices();
    word_res->LogNewRawChoice(choice);
    word_res->LogNewCookedChoice(1, false, choice);
  }
}
} // namespace tesseract

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,37 @@
/**********************************************************************
* File: control.h (Formerly control.h)
* Description: Module-independent matcher controller.
* Author: Ray Smith
* Created: Thu Apr 23 11:09:58 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
/**
* @file control.h
* Module-independent matcher controller.
*/
#ifndef CONTROL_H
#define CONTROL_H
/// Categories describing whether, and in what case pattern, a word string is
/// acceptable. AC_UNACCEPTABLE is the "not acceptable" value; every other
/// enumerator names an acceptable pattern.
enum ACCEPTABLE_WERD_TYPE {
AC_UNACCEPTABLE, ///< Unacceptable word
AC_LOWER_CASE, ///< All characters lower case
AC_UPPER_CASE, ///< All characters upper case
AC_INITIAL_CAP, ///< Initial capital, all other characters lower case
AC_LC_ABBREV, ///< Lower-case abbreviation of the form a.b.c.
AC_UC_ABBREV ///< Upper-case abbreviation of the form A.B.C.
};
#endif

View File

@ -0,0 +1,932 @@
/******************************************************************
* File: docqual.cpp (Formerly docqual.c)
* Description: Document Quality Metrics
* Author: Phil Cheatle
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "docqual.h"
#include <cctype>
#include "reject.h"
#include "tesseractclass.h"
#include "tessvars.h"
namespace tesseract {
/// Callback that counts one matched blob per call.
/// @param match_count running total, incremented by one.
/// The blob index supplied by the caller is not needed and is ignored.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count += 1;
}
/// Callback that counts one matched blob per call and, separately, those
/// matched blobs whose reject map entry is accepted.
/// @param word word whose reject_map is consulted.
/// @param match_count running total of matched blobs; always incremented.
/// @param accepted_match_count running total of matched blobs that are also
/// accepted; incremented only when reject_map[index] is accepted.
/// @param index position of the matched blob in the word's reject map.
static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
                               int index) {
  const bool is_accepted = word->reject_map[index].accepted();
  accepted_match_count += is_accepted ? 1 : 0;
  match_count += 1;
}
/// Callback that marks reject_map[index] as quality-accepted when that entry
/// reports it may be accepted on good quality; otherwise does nothing.
/// @param word word whose reject map is updated.
/// @param index position of the matched blob in the word's reject map.
static void acceptIfGoodQuality(WERD_RES *word, int index) {
  if (!word->reject_map[index].accept_if_good_quality()) {
    return;
  }
  word->reject_map[index].setrej_quality_accept();
}
/*************************************************************************
 * word_blob_quality()
 * How many blobs in the box_word are identical to those of the inword?
 * ASSUME blobs in both initial word and box_word are in ascending order of
 * left hand blob edge.
 *
 * Returns 0 when the word has no bln_boxes, no rebuild_word, or a
 * rebuild_word with no blobs.
 *************************************************************************/
int16_t Tesseract::word_blob_quality(WERD_RES *word) {
  int16_t match_count = 0;
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
    using namespace std::placeholders; // for _1
    // std::bind stores a copy of each bound argument, so the counter is
    // wrapped in std::ref; otherwise the callback would increment the copy
    // and this function would always return 0.
    word->bln_boxes->ProcessMatchedBlobs(
        *word->rebuild_word, std::bind(countMatchingBlobs, std::ref(match_count), _1));
  }
  return match_count;
}
/*************************************************************************
 * word_outline_errs()
 * Sums count_outline_errs() over the blobs of the rebuild_word, pairing the
 * b-th blob with the b-th char of the best choice's unichar string.
 * Returns 0 when the word has no rebuild_word.
 *************************************************************************/
int16_t Tesseract::word_outline_errs(WERD_RES *word) {
  if (word->rebuild_word == nullptr) {
    return 0;
  }
  int16_t total_errs = 0;
  int16_t char_index = 0;
  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b, ++char_index) {
    TBLOB *current_blob = word->rebuild_word->blobs[b];
    const char expected_char = word->best_choice->unichar_string()[char_index];
    total_errs += count_outline_errs(expected_char, current_blob->NumOutlines());
  }
  return total_errs;
}
/*************************************************************************
 * word_char_quality()
 * Combination of blob quality and outline quality - how many good chars are
 * there? - I.e chars which pass the blob AND outline tests.
 *
 * match_count receives the number of matched blobs and accepted_match_count
 * the number of matched blobs whose reject map entry is accepted. Both are
 * set to 0 when the word has no bln_boxes, no rebuild_word, or a
 * rebuild_word with no blobs.
 *************************************************************************/
void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
                                  int16_t *accepted_match_count) {
  *match_count = 0;
  *accepted_match_count = 0;
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
    using namespace std::placeholders; // for _1
    // std::bind stores copies of its bound arguments, so both counters are
    // wrapped in std::ref; otherwise the callback would update the copies
    // and the caller's outputs would always stay 0.
    word->bln_boxes->ProcessMatchedBlobs(
        *word->rebuild_word, std::bind(countAcceptedBlobs, word, std::ref(*match_count),
                                       std::ref(*accepted_match_count), _1));
  }
}
/*************************************************************************
 * unrej_good_chs()
 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
 *
 * Does nothing when the word has no bln_boxes, no rebuild_word, or a
 * rebuild_word with no blobs.
 *************************************************************************/
void Tesseract::unrej_good_chs(WERD_RES *word) {
  // The blob list must be non-empty, as in word_blob_quality() and
  // word_char_quality(); the check was previously inverted, so matching was
  // only attempted when there were no blobs to match.
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
    using namespace std::placeholders; // for _1
    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
                                         std::bind(acceptIfGoodQuality, word, _1));
  }
}
/*************************************************************************
 * count_outline_errs()
 * Returns how far outline_count is from the number of outlines expected for
 * char c: 2 for chars listed in outlines_2, 1 otherwise. Chars listed in
 * outlines_odd are not checked and always score 0.
 *************************************************************************/
int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
  if (outlines_odd.contains(c)) {
    return 0; // Don't use this char
  }
  const int expected_outline_count = outlines_2.contains(c) ? 2 : 1;
  return abs(outline_count - expected_outline_count);
}
/*************************************************************************
 * quality_based_rejection()
 * Runs the quality-driven rejection passes in order:
 *  - unrejects good-quality words when tessedit_good_quality_unrej is set
 *    and the doc is of good quality,
 *  - applies document/block rejection,
 *  - runs the tilde crunch and tilde delete passes when
 *    unlv_tilde_crunching is set.
 *************************************************************************/
void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
  const bool unreject_first = tessedit_good_quality_unrej && good_quality_doc;
  if (unreject_first) {
    unrej_good_quality_words(page_res_it);
  }
  doc_and_block_rejection(page_res_it, good_quality_doc);
  if (!unlv_tilde_crunching) {
    return;
  }
  tilde_crunch(page_res_it);
  tilde_delete(page_res_it);
}
/*************************************************************************
 * unrej_good_quality_words()
 * Accept potential rejects in words which pass the following checks:
 *    - Contains a potential reject
 *    - Word looks like a sensible alpha word.
 *    - Word segmentation is the same as the original image
 *    - All characters have the expected number of outlines
 * With bland_unrej set, every potential reject is accepted without checks.
 * Rows whose reject fraction exceeds quality_rowrej_pc are skipped whole.
 * NOTE - the rejection counts are recalculated after unrejection
 *      - CAN'T do it in a single pass without a bit of fiddling
 *      - keep it simple but inefficient
 *************************************************************************/
void Tesseract::unrej_good_quality_words( // unreject potential
    PAGE_RES_IT &page_res_it) {
  // Pass 1: unreject, one word (or one dodgy row) per iteration.
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      WERD_RES *bland_word = page_res_it.word();
      for (int k = 0; k < bland_word->reject_map.length(); k++) {
        acceptIfGoodQuality(bland_word, k);
      }
      page_res_it.forward();
    } else if ((page_res_it.row()->char_count > 0) &&
               ((page_res_it.row()->rej_count /
                 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
      WERD_RES *candidate = page_res_it.word();
      const bool worth_checking =
          candidate->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
           acceptable_word_string(*candidate->uch_set,
                                  candidate->best_choice->unichar_string().c_str(),
                                  candidate->best_choice->unichar_lengths().c_str()) !=
               AC_UNACCEPTABLE);
      if (worth_checking) {
        unrej_good_chs(candidate);
      }
      page_res_it.forward();
    } else {
      // Skip to end of dodgy row.
      ROW_RES *dodgy_row = page_res_it.row();
      while ((page_res_it.word() != nullptr) && (page_res_it.row() == dodgy_row)) {
        page_res_it.forward();
      }
    }
    check_debug_pt(page_res_it.word(), 110);
  }
  // Pass 2: rebuild the page, block and row reject statistics from scratch.
  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  BLOCK_RES *active_block = nullptr;
  ROW_RES *active_row = nullptr;
  for (; page_res_it.word() != nullptr; page_res_it.forward()) {
    if (active_block != page_res_it.block()) {
      active_block = page_res_it.block();
      active_block->char_count = 0;
      active_block->rej_count = 0;
    }
    if (active_row != page_res_it.row()) {
      active_row = page_res_it.row();
      active_row->char_count = 0;
      active_row->rej_count = 0;
      active_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
  }
}
/*************************************************************************
 * doc_and_block_rejection()
 *
 * If the page has too many rejects - reject all of it.
 * If any block has too many rejects - reject all words in the block
 * Otherwise each row of the block is tested and rejected on its own.
 *
 * page_res_it    iterator over the page; restarted and advanced here.
 * good_quality_doc  when true and tessedit_row_rej_good_docs is off, words
 *                in a rejected row are kept unless their own reject
 *                fraction exceeds tessedit_good_doc_still_rowrej_wd.
 *************************************************************************/
void Tesseract::doc_and_block_rejection( // reject big chunks
PAGE_RES_IT &page_res_it, bool good_quality_doc) {
int16_t block_no = 0;
int16_t row_no = 0;
BLOCK_RES *current_block;
ROW_RES *current_row;
bool rej_word;
bool prev_word_rejected;
int16_t char_quality = 0;
int16_t accepted_char_quality;
// Page level: compare the percentage of rejected chars with the doc limit.
if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
tessedit_reject_doc_percent) {
reject_whole_page(page_res_it);
if (tessedit_debug_doc_rejection) {
tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
page_res_it.page_res->rej_count);
}
} else {
if (tessedit_debug_doc_rejection) {
tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
page_res_it.page_res->rej_count);
}
/* Walk blocks testing for block rejection */
page_res_it.restart_page();
WERD_RES *word;
while ((word = page_res_it.word()) != nullptr) {
current_block = page_res_it.block();
block_no = current_block->block->pdblk.index();
if (current_block->char_count > 0 &&
(current_block->rej_count * 100.0 / current_block->char_count) >
tessedit_reject_block_percent) {
if (tessedit_debug_block_rejection) {
tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
current_block->char_count, current_block->rej_count);
}
prev_word_rejected = false;
// Visit every word of the rejected block and decide whether to keep it.
while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
if (tessedit_preserve_blk_rej_perfect_wds) {
// Words with no rejects and at least the minimum length are preserved.
rej_word = word->reject_map.reject_count() > 0 ||
word->reject_map.length() < tessedit_preserve_min_wd_len;
// An acceptable-looking word of sufficient length is still kept if
// every char of it passes the matched-blob check.
if (rej_word && tessedit_dont_blkrej_good_wds &&
word->reject_map.length() >= tessedit_preserve_min_wd_len &&
acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
word->best_choice->unichar_lengths().c_str()) !=
AC_UNACCEPTABLE) {
word_char_quality(word, &char_quality, &accepted_char_quality);
rej_word = char_quality != word->reject_map.length();
}
} else {
rej_word = true;
}
if (rej_word) {
/*
Reject spacing if both current and prev words are rejected.
NOTE - this is NOT restricted to FUZZY spaces. - When tried this
generated more space errors.
*/
if (tessedit_use_reject_spaces && prev_word_rejected &&
page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
word->reject_spaces = true;
}
word->reject_map.rej_word_block_rej();
}
prev_word_rejected = rej_word;
page_res_it.forward();
}
} else {
if (tessedit_debug_block_rejection) {
tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
page_res_it.block()->char_count, page_res_it.block()->rej_count);
}
/* Walk rows in block testing for row rejection */
row_no = 0;
while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
current_row = page_res_it.row();
row_no++;
/* Reject whole row if:
fraction of chars on row which are rejected exceed a limit AND
fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
limit
*/
if (current_row->char_count > 0 &&
(current_row->rej_count * 100.0 / current_row->char_count) >
tessedit_reject_row_percent &&
(current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
tessedit_whole_wd_rej_row_percent) {
if (tessedit_debug_block_rejection) {
tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
current_row->char_count, current_row->rej_count);
}
prev_word_rejected = false;
// Visit every word of the rejected row and decide whether to keep it.
while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
/* Preserve words on good docs unless they are mostly rejected*/
if (!tessedit_row_rej_good_docs && good_quality_doc) {
rej_word = word->reject_map.reject_count() /
static_cast<float>(word->reject_map.length()) >
tessedit_good_doc_still_rowrej_wd;
} else if (tessedit_preserve_row_rej_perfect_wds) {
/* Preserve perfect words anyway */
rej_word = word->reject_map.reject_count() > 0 ||
word->reject_map.length() < tessedit_preserve_min_wd_len;
if (rej_word && tessedit_dont_rowrej_good_wds &&
word->reject_map.length() >= tessedit_preserve_min_wd_len &&
acceptable_word_string(
*word->uch_set, word->best_choice->unichar_string().c_str(),
word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
word_char_quality(word, &char_quality, &accepted_char_quality);
rej_word = char_quality != word->reject_map.length();
}
} else {
rej_word = true;
}
if (rej_word) {
/*
Reject spacing if both current and prev words are rejected.
NOTE - this is NOT restricted to FUZZY spaces. - When tried
this generated more space errors.
*/
if (tessedit_use_reject_spaces && prev_word_rejected &&
page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
word->reject_spaces = true;
}
word->reject_map.rej_word_row_rej();
}
prev_word_rejected = rej_word;
page_res_it.forward();
}
} else {
if (tessedit_debug_block_rejection) {
tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
current_row->char_count, current_row->rej_count);
}
// Row is kept: step past all of its words without changing them.
while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
page_res_it.forward();
}
}
}
}
}
}
}
/*************************************************************************
 * reject_whole_page()
 * Don't believe any of it - apply the document-level reject to the reject
 * map of every word on the page, then flag the page itself as rejected.
 *************************************************************************/
void reject_whole_page(PAGE_RES_IT &page_res_it) {
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    WERD_RES *rejected_word = page_res_it.word();
    rejected_word->reject_map.rej_word_doc_rej();
  }
  // whole page is rejected
  page_res_it.page_res->rejected = true;
}
/*************************************************************************
 * tilde_crunch()
 * Walks every word on the page and sets unlv_crunch_mode = CR_KEEP_SPACE on
 * words judged to be garbage. Words in non-text poly blocks are skipped.
 *  - A word with any accepted char ends the current run and is left alone.
 *  - A "terrible" word is crunched, together with any run of "potential"
 *    words immediately before it.
 *  - A "potential" word is crunched at once if a terrible word came earlier
 *    in the run; otherwise the start of the run is remembered in copy_it.
 *************************************************************************/
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
WERD_RES *word;
GARBAGE_LEVEL garbage_level;
// Marks the first word of a pending run of potential crunches.
PAGE_RES_IT copy_it;
bool prev_potential_marked = false;
bool found_terrible_word = false;
bool ok_dict_word;
page_res_it.restart_page();
while (page_res_it.word() != nullptr) {
POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
if (pb != nullptr && !pb->IsText()) {
page_res_it.forward();
continue;
}
word = page_res_it.word();
if (crunch_early_convert_bad_unlv_chs) {
convert_bad_unlv_chs(word);
}
if (crunch_early_merge_tess_fails) {
word->merge_tess_fails();
}
if (word->reject_map.accept_count() != 0) {
found_terrible_word = false;
// Forget earlier potential crunches
prev_potential_marked = false;
} else {
ok_dict_word = safe_dict_word(word);
garbage_level = garbage_word(word, ok_dict_word);
if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
if (crunch_debug > 0) {
tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
}
word->unlv_crunch_mode = CR_KEEP_SPACE;
if (prev_potential_marked) {
// Crunch the pending potential words from copy_it up to this word.
while (copy_it.word() != word) {
if (crunch_debug > 0) {
tprintf("P1 CRUNCHING: \"%s\"\n",
copy_it.word()->best_choice->unichar_string().c_str());
}
copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
copy_it.forward();
}
prev_potential_marked = false;
}
found_terrible_word = true;
} else if ((garbage_level != G_NEVER_CRUNCH) &&
(potential_word_crunch(word, garbage_level, ok_dict_word))) {
if (found_terrible_word) {
if (crunch_debug > 0) {
tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
}
word->unlv_crunch_mode = CR_KEEP_SPACE;
} else if (!prev_potential_marked) {
// First potential word of a run: remember where the run starts.
copy_it = page_res_it;
prev_potential_marked = true;
if (crunch_debug > 1) {
tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
}
}
} else {
found_terrible_word = false;
// Forget earlier potential crunches
prev_potential_marked = false;
if (crunch_debug > 2) {
tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
}
}
}
page_res_it.forward();
}
}
/*************************************************************************
 * terrible_word_crunch()
 * Decides whether a word is bad enough to be crunched outright.
 * Returns true when any of these holds (the number is the mode reported in
 * the debug output):
 *   1 - the best choice string is empty or consists only of spaces
 *   2 - rating per char exceeds crunch_terrible_rating
 *   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
 *   4 - certainty is below crunch_poor_garbage_cert and the word is not G_OK
 *   5 - rating per char exceeds crunch_poor_garbage_rate and the word is
 *       not G_OK
 * The word length used for the per-char rating is capped at
 * crunch_rating_max.
 *************************************************************************/
bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  int crunch_mode = 0;
  const auto &best_text = word->best_choice->unichar_string();
  const bool blank_text =
      best_text.empty() || (strspn(best_text.c_str(), " ") == best_text.size());
  if (blank_text) {
    crunch_mode = 1;
  } else {
    int adjusted_len = word->reject_map.length();
    if (adjusted_len > crunch_rating_max) {
      adjusted_len = crunch_rating_max;
    }
    const float rating_per_ch = word->best_choice->rating() / adjusted_len;
    const bool not_ok = garbage_level != G_OK;
    if (rating_per_ch > crunch_terrible_rating) {
      crunch_mode = 2;
    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      crunch_mode = 3;
    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) && not_ok) {
      crunch_mode = 4;
    } else if ((rating_per_ch > crunch_poor_garbage_rate) && not_ok) {
      crunch_mode = 5;
    }
  }
  if (crunch_mode <= 0) {
    return false;
  }
  if (crunch_debug > 2) {
    tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
            word->best_choice->unichar_string().c_str());
  }
  return true;
}
/*************************************************************************
 * potential_word_crunch()
 * Counts the indicators that a word is poor and returns true when the count
 * reaches crunch_pot_indicators. The indicators are:
 *   - rating per char (length capped at 10) above crunch_pot_poor_rate
 *   - the word is crunchable and its certainty is below crunch_pot_poor_cert
 *   - garbage_level is not G_OK
 * A word is crunchable unless crunch_leave_accept_strings is set, it has at
 * least 3 chars, and it is either an acceptable word string or an ok
 * dictionary word.
 *************************************************************************/
bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  const bool word_crunchable =
      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
      (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);

  const int raw_len = word->reject_map.length();
  const int adjusted_len = raw_len > 10 ? 10 : raw_len;
  const float rating_per_ch = word->best_choice->rating() / adjusted_len;
  const bool verbose = crunch_debug > 2;

  int poor_indicator_count = 0;
  if (rating_per_ch > crunch_pot_poor_rate) {
    if (verbose) {
      tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++poor_indicator_count;
  }
  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (verbose) {
      tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++poor_indicator_count;
  }
  if (garbage_level != G_OK) {
    if (verbose) {
      tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++poor_indicator_count;
  }
  return poor_indicator_count >= crunch_pot_indicators;
}
/*************************************************************************
 * tilde_delete()
 * Walks every word on the page and sets unlv_crunch_mode to the mode given
 * by word_deletable() for deletable words at the start or end of a line:
 *  - a deletable word flagged W_BOL, or following an unbroken run of
 *    deletable words from one, is marked at once;
 *  - a deletable word flagged W_EOL is marked, together with any run of
 *    deletable words immediately before it (remembered in copy_it);
 *  - any other deletable word only starts or extends a pending run;
 *  - a non-deletable word cancels both kinds of run.
 * If crunch_early_merge_tess_fails is off, merge_tess_fails() is applied to
 * each word here, after its deletability has been assessed.
 *************************************************************************/
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
WERD_RES *word;
// Marks the first word of a pending run of deletable mid-line words.
PAGE_RES_IT copy_it;
bool deleting_from_bol = false;
bool marked_delete_point = false;
int16_t debug_delete_mode;
CRUNCH_MODE delete_mode;
int16_t x_debug_delete_mode;
CRUNCH_MODE x_delete_mode;
page_res_it.restart_page();
while (page_res_it.word() != nullptr) {
word = page_res_it.word();
delete_mode = word_deletable(word, debug_delete_mode);
if (delete_mode != CR_NONE) {
if (word->word->flag(W_BOL) || deleting_from_bol) {
if (crunch_debug > 0) {
tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
word->best_choice->unichar_string().c_str());
}
word->unlv_crunch_mode = delete_mode;
deleting_from_bol = true;
} else if (word->word->flag(W_EOL)) {
if (marked_delete_point) {
// Mark the pending run from copy_it up to, but not including, this word.
while (copy_it.word() != word) {
x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
if (crunch_debug > 0) {
tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
copy_it.word()->best_choice->unichar_string().c_str());
}
copy_it.word()->unlv_crunch_mode = x_delete_mode;
copy_it.forward();
}
}
if (crunch_debug > 0) {
tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
word->best_choice->unichar_string().c_str());
}
word->unlv_crunch_mode = delete_mode;
deleting_from_bol = false;
marked_delete_point = false;
} else {
// Mid-line deletable word: remember where the run starts, mark nothing yet.
if (!marked_delete_point) {
copy_it = page_res_it;
marked_delete_point = true;
}
}
} else {
deleting_from_bol = false;
// Forget earlier potential crunches
marked_delete_point = false;
}
/*
The following step has been left till now as the tess fails are used to
determine if the word is deletable.
*/
if (!crunch_early_merge_tess_fails) {
word->merge_tess_fails();
}
page_res_it.forward();
}
}
/**
 * Replaces characters that are not wanted in the output: every "~" in the best
 * choice becomes "-" and every "^" becomes " ". A replaced character that was
 * still accepted in the reject map is flagged with setrej_unlv_rej().
 *
 * @param word_res word whose best_choice and reject_map are updated in place
 */
void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  const UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
  const UNICHAR_ID space_id = word_res->uch_set->unichar_to_id(" ");
  const UNICHAR_ID tilde_id = word_res->uch_set->unichar_to_id("~");
  const UNICHAR_ID caret_id = word_res->uch_set->unichar_to_id("^");

  // Overwrites the character at pos and rejects it if it was still accepted.
  auto substitute = [word_res](int pos, UNICHAR_ID replacement) {
    word_res->best_choice->set_unichar_id(replacement, pos);
    if (word_res->reject_map[pos].accepted()) {
      word_res->reject_map[pos].setrej_unlv_rej();
    }
  };

  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    // The two tests are deliberately sequential (not else-if): the second one
    // looks at the character as left by the first.
    if (word_res->best_choice->unichar_id(pos) == tilde_id) {
      substitute(pos, dash_id);
    }
    if (word_res->best_choice->unichar_id(pos) == caret_id) {
      substitute(pos, space_id);
    }
  }
}
/**
 * Classifies the text of a word as G_NEVER_CRUNCH, G_OK, G_DODGY or
 * G_TERRIBLE.
 *
 * The word is scanned one unichar at a time with a small state machine that
 * tracks runs of upper case, lower case and digits. From the scan it derives
 * the number of isolated digits/letters, tess rejects (single spaces), other
 * "bad" characters, the longest upper/lower case runs and the longest
 * repetition of a single letter. These counts feed the heuristics at the end.
 *
 * Fix relative to the previous revision: the scan used to advance `str` and
 * `lengths` themselves, so the later acceptable_word_string() and strpbrk()
 * calls always saw an empty string. The scan now uses its own cursors and the
 * later checks see the whole word.
 *
 * @param word         word to classify; best_choice, reject_map and uch_set are read
 * @param ok_dict_word true if the caller regards the word as a dictionary word
 * @return the garbage level of the word
 */
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
  enum STATES {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  // Start of the word's text and of its per-unichar byte lengths. These must
  // keep pointing at the start: they are used again after the scan.
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  // Scan cursors: current unichar and its byte length.
  const char *ch = str;
  const char *ch_len = lengths;
  for (; *ch != '\0'; ch += *(ch_len++)) {
    len++;
    if (word->uch_set->get_isupper(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count) {
            longest_upper_run_len = upper_string_count;
          }
          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Fall through.
        default:
          state = FIRST_UPPER;
          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_islower(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count) {
            longest_lower_run_len = lower_string_count;
          }
          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Fall through.
        default:
          state = FIRST_LOWER;
          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_isdigit(ch, *ch_len)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          // Fall through.
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Fall through.
        default:
          state = FIRST_NUM;
          break;
      }
    } else {
      // Neither letter nor digit: a single space is a tess reject, anything
      // else counts as a bad character.
      if (*ch_len == 1 && *ch == ' ') {
        tess_rejs++;
      } else {
        bad_char_count++;
      }
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Fall through.
        default:
          break;
      }
      state = JUNK;
    }
  }

  // Account for a run of length one that ends the word.
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
      // Fall through.
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  // Mostly alphabetic words without long repetitions are never crunched when
  // they are acceptable or contain a long enough single-case run.
  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
      longest_alpha_repetition_count < crunch_long_repetitions) {
    if ((crunch_accept_ok &&
         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
        longest_lower_run_len > crunch_leave_lc_strings ||
        longest_upper_run_len > crunch_leave_uc_strings) {
      return G_NEVER_CRUNCH;
    }
  }

  // Words without rejects (spaces) that came from a dictionary/number
  // permuter, or are otherwise acceptable, are OK.
  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
       word->best_choice->permuter() == FREQ_DAWG_PERM ||
       word->best_choice->permuter() == USER_DAWG_PERM ||
       word->best_choice->permuter() == NUMBER_PERM ||
       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
    return G_OK;
  }

  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
    tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n", len, bad_char_count,
            isolated_digits, isolated_alphas, tess_rejs);
  }

  if (bad_char_count == 0 && tess_rejs == 0 &&
      (len > isolated_digits + isolated_alphas || len <= 2)) {
    return G_OK;
  }

  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
    return G_TERRIBLE;
  }

  if (len > 4) {
    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  } else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  }
}
/*************************************************************************
 * word_deletable()
 * Decides whether a crunched word should be deleted.
 *
 * The checks are made in this order; the first one that matches wins. The
 * number in brackets is the value stored in delete_mode for debugging.
 *   [0]  word is not crunched (unlv_crunch_mode == CR_NONE)  -> CR_NONE
 *   [1]  reject map is empty                                 -> CR_DELETE
 *   [4]  height < crunch_del_min_ht * xht                    -> CR_DELETE
 *   [5]  noise_outlines() reports the word as noise          -> CR_DELETE
 *   [2]  failure_count * 1.5 > word length                   -> CR_LOOSE_SPACE
 *   [7]  certainty < crunch_del_cert                         -> CR_LOOSE_SPACE
 *   [8]  rating per char > crunch_del_rating                 -> CR_LOOSE_SPACE
 *   [9]  TOP of word below baseline by crunch_del_low_word * xht
 *                                                            -> CR_LOOSE_SPACE
 *   [10] BOTTOM of word above baseline by crunch_del_high_word * xht
 *                                                            -> CR_LOOSE_SPACE
 *   [11] height > crunch_del_max_ht * xht                    -> CR_LOOSE_SPACE
 *   [3]  width < crunch_del_min_width * xht                  -> CR_LOOSE_SPACE
 *   [0]  otherwise                                           -> CR_NONE
 * Checks [4] and [5] are only made when word->rebuild_word is not nullptr.
 * NOTE(review): when rebuild_word is nullptr the box used by checks
 * [9], [10], [11] and [3] is a default-constructed TBOX - confirm intended.
 *************************************************************************/
CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
  // Records the debug code and hands back the verdict in one step.
  auto decide = [&delete_mode](int16_t code, CRUNCH_MODE verdict) {
    delete_mode = code;
    return verdict;
  };

  if (word->unlv_crunch_mode == CR_NONE) {
    return decide(0, CR_NONE);
  }

  const int char_count = word->reject_map.length();
  if (char_count == 0) {
    return decide(1, CR_DELETE);
  }

  TBOX word_box; // BB of word
  if (word->rebuild_word != nullptr) {
    // Cube leaves rebuild_word nullptr.
    word_box = word->rebuild_word->bounding_box();
    if (word_box.height() < crunch_del_min_ht * kBlnXHeight) {
      return decide(4, CR_DELETE);
    }
    if (noise_outlines(word->rebuild_word)) {
      return decide(5, CR_DELETE);
    }
  }

  if ((failure_count(word) * 1.5) > char_count) {
    return decide(2, CR_LOOSE_SPACE);
  }

  if (word->best_choice->certainty() < crunch_del_cert) {
    return decide(7, CR_LOOSE_SPACE);
  }

  const float per_char_rating = word->best_choice->rating() / char_count;
  if (per_char_rating > crunch_del_rating) {
    return decide(8, CR_LOOSE_SPACE);
  }

  if (word_box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    return decide(9, CR_LOOSE_SPACE);
  }

  if (word_box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    return decide(10, CR_LOOSE_SPACE);
  }

  if (word_box.height() > crunch_del_max_ht * kBlnXHeight) {
    return decide(11, CR_LOOSE_SPACE);
  }

  if (word_box.width() < crunch_del_min_width * kBlnXHeight) {
    return decide(3, CR_LOOSE_SPACE);
  }

  return decide(0, CR_NONE);
}
/**
 * Counts the space characters in the word's best choice string. A space in
 * the string stands for a character that tess failed on (see the tess_rejs
 * counting in garbage_word()).
 *
 * @param word word whose best_choice string is scanned
 * @return number of ' ' bytes before the terminating NUL
 */
int16_t Tesseract::failure_count(WERD_RES *word) {
  int spaces = 0;
  const char *cursor = word->best_choice->unichar_string().c_str();
  while (*cursor != '\0') {
    if (*cursor == ' ') {
      ++spaces;
    }
    ++cursor;
  }
  return spaces;
}
/**
 * Tests whether a word consists only of small outlines.
 *
 * An outline is "small" when the longer side of its bounding box is below
 * kBlnXHeight * crunch_small_outlines_size.
 *
 * @param word word whose blobs and outlines are inspected
 * @return true when the number of small outlines is at least the total number
 *         of outlines (which is also the case for a word without outlines)
 */
bool Tesseract::noise_outlines(TWERD *word) {
  const float size_threshold = kBlnXHeight * crunch_small_outlines_size;
  int16_t total_outlines = 0;
  int16_t small_outlines = 0;

  for (int blob_index = 0; blob_index < word->NumBlobs(); ++blob_index) {
    for (TESSLINE *outline = word->blobs[blob_index]->outlines; outline != nullptr;
         outline = outline->next) {
      ++total_outlines;
      const TBOX outline_box = outline->bounding_box(); // BB of outline
      const int16_t longest_side = outline_box.height() > outline_box.width()
                                       ? outline_box.height()
                                       : outline_box.width();
      if (longest_side < size_threshold) {
        ++small_outlines;
      }
    }
  }
  return small_outlines >= total_outlines;
}
} // namespace tesseract

View File

@ -0,0 +1,37 @@
/******************************************************************
* File: docqual.h (Formerly docqual.h)
* Description: Document Quality Metrics
* Author: Phil Cheatle
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef DOCQUAL_H
#define DOCQUAL_H
#include <cstdint> // for int16_t
namespace tesseract {
// Forward declarations of types that are defined in other headers.
class PAGE_RES_IT;
class ROW;
class WERD_RES;
// Garbage classification of a word's text, as returned by
// Tesseract::garbage_word(). G_NEVER_CRUNCH marks a word that must not be
// crunched, G_OK an acceptable word, G_DODGY a doubtful one and G_TERRIBLE a
// word dominated by rejects.
enum GARBAGE_LEVEL { G_NEVER_CRUNCH, G_OK, G_DODGY, G_TERRIBLE };
// NOTE(review): defined outside this header; presumably returns a quality
// measure based on the blobs of `word` - confirm against the implementation.
int16_t word_blob_quality(WERD_RES *word);
// NOTE(review): defined outside this header; presumably marks every word
// reachable through `page_res_it` as rejected - confirm against the
// implementation.
void reject_whole_page(PAGE_RES_IT &page_res_it);
} // namespace tesseract
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,250 @@
///////////////////////////////////////////////////////////////////////
// File: equationdetect.h
// Description: The equation detection class that inherits equationdetectbase.
// Author: Zongyi (Joe) Liu (joeliu@google.com)
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_
#define TESSERACT_CCMAIN_EQUATIONDETECT_H_
#include <tesseract/unichar.h> // for UNICHAR_ID
#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
#include "equationdetectbase.h" // for EquationDetectBase
#include "tesseractclass.h" // for Tesseract
class TBOX;
class UNICHARSET;
namespace tesseract {
class Tesseract;
class ColPartition;
class ColPartitionGrid;
class ColPartitionSet;
// Equation detector: implements the EquationDetectBase interface
// (LabelSpecialText and FindEquationParts). It owns a Tesseract engine of its
// own (equ_tesseract_) and borrows the OCR engine, the partition grid and the
// column sets from the caller.
class TESS_API EquationDetect : public EquationDetectBase {
public:
EquationDetect(const char *equ_datapath, const char *equ_language);
~EquationDetect() override;
// Kind of indentation reported by IsIndented(). INDENT_TYPE_COUNT is the
// number of real values, not a value itself.
enum IndentType { NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT, INDENT_TYPE_COUNT };
// Reset the lang_tesseract_ pointer. This function should be called before we
// do any detector work.
void SetLangTesseract(Tesseract *lang_tesseract);
// Iterate over the blobs inside to_block, and set the blobs that we want to
// process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
// returns 0 upon success.
int LabelSpecialText(TO_BLOCK *to_block) override;
// Find possible equation partitions from part_grid. Should be called
// after the special_text_type of blobs are set.
// It returns 0 upon success.
int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) override;
// Reset the resolution of the processing image. TEST only function.
void SetResolution(const int resolution);
protected:
// Identify the special text type for one blob, and update its field. When
// height_th is set (> 0), we will label the blob as BSTT_NONE if its height
// is less than height_th.
void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
// Estimate the type for one unichar.
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset,
const UNICHAR_ID id) const;
// Compute the special text type for each blob in part_grid_.
void IdentifySpecialText();
// Identify blobs that we want to skip during special blob type
// classification.
void IdentifyBlobsToSkip(ColPartition *part);
// The ColPartitions in part_grid_ may be over-segmented, particularly in the
// block equation regions. So we like to identify these partitions and merge
// them before we do the searching.
void MergePartsByLocation();
// Starting from the seed center, we do a radius search. For partitions that
// have large overlaps with seed, we remove them from part_grid_ and add them
// into parts_overlap. Note: this function may update the part_grid_, so if
// the caller is also running ColPartitionGridSearch, use the
// RepositionIterator to continue.
void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);
// Insert part back into part_grid_, after it absorbs some other parts.
void InsertPartAfterAbsorb(ColPartition *part);
// Identify the colpartitions in part_grid_, label them as PT_EQUATION, and
// save them into cp_seeds_.
void IdentifySeedParts();
// Check the blobs count for a seed region candidate.
bool CheckSeedBlobsCount(ColPartition *part);
// Compute the foreground pixel density for a tbox area.
float ComputeForegroundDensity(const TBOX &tbox);
// Check if part from seed2 label: with low math density and left indented. We
// are using two checks:
// 1. If its left is aligned with any coordinates in indented_texts_left,
// which we assume have been sorted.
// 2. If its foreground density is over foreground_density_th.
bool CheckForSeed2(const std::vector<int> &indented_texts_left,
const float foreground_density_th, ColPartition *part);
// Count the number of values in sorted_vec that are close to val, used to
// check if a partition is aligned with text partitions.
int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;
// Check for a seed candidate using the foreground pixel density. And we
// return true if the density is below a certain threshold, because characters
// in equation regions usually are apart with more white spaces.
bool CheckSeedFgDensity(const float density_th, ColPartition *part);
// A light version of SplitCPHor: instead of really doing the part split, we
// simply compute the union bounding box of each split part.
void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);
// Split the part (horizontally), and save the split result into
// parts_splitted. Note that it is the caller's responsibility to release the
// memory owned by parts_splitted. On the other hand, the part is unchanged
// during this process and still owns the blobs, so do NOT call DeleteBoxes
// when freeing the colpartitions in parts_splitted.
void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);
// Check the density for a seed candidate (part) using its math density and
// italic density, returns true if the check passed.
bool CheckSeedDensity(const float math_density_high, const float math_density_low,
const ColPartition *part) const;
// Check if part is indented.
IndentType IsIndented(ColPartition *part);
// Identify inline partitions from cp_seeds_, and re-label them.
void IdentifyInlineParts();
// Compute the super bounding box for all colpartitions inside part_grid_.
void ComputeCPsSuperBBox();
// Identify inline partitions from cp_seeds_ using the horizontal search.
void IdentifyInlinePartsHorizontal();
// Estimate the line spacing between two text partitions. Returns -1 if not
// enough data.
int EstimateTextPartLineSpacing();
// Identify inline partitions from cp_seeds_ using vertical search.
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing);
// Check if part is an inline equation zone. This should be called after we
// identified the seed regions.
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part);
// For a given seed partition, we search the part_grid_ and see if there is
// any partition that can be merged with it. It returns true if the seed has
// been expanded.
bool ExpandSeed(ColPartition *seed);
// Starting from the seed position, we search the part_grid_
// horizontally/vertically, find all partitions that can be
// merged with seed, remove them from part_grid_, and put them into
// parts_to_merge.
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
std::vector<ColPartition *> *parts_to_merge);
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
std::vector<ColPartition *> *parts_to_merge);
// Check if a part_box is the small neighbor of seed_box.
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;
// Perform the density check for part, which we assume is nearing a seed
// partition. It returns true if the check passed.
bool CheckSeedNeighborDensity(const ColPartition *part) const;
// After identifying the math blocks, we do one more scan over all text
// partitions, and check if any of them is the satellite of:
// math blocks: here a p is the satellite of q if:
// 1. q is the nearest vertical neighbor of p, and
// 2. y_gap(p, q) is less than a threshold, and
// 3. x_overlap(p, q) is over a threshold.
// Note that p can be the satellite of two blocks: its top neighbor and
// bottom neighbor.
void ProcessMathBlockSatelliteParts();
// Check if part is the satellite of one/two math blocks. If it is, we return
// true, and save the blocks into math_blocks.
bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);
// Search the nearest neighbor of part in one vertical direction as defined in
// search_bottom. It returns the neighbor found that has major x overlap with
// it, or nullptr when not found.
ColPartition *SearchNNVertical(const bool search_bottom, const ColPartition *part);
// Check if the neighbor with vertical distance of y_gap is a near and math
// block partition.
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
// Generate the tiff file name for output/debug file.
void GetOutputTiffName(const char *name, std::string &image_name) const;
// Debugger function that renders ColPartitions on the input image, where:
// parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
// will be painted in green, and other parts will be painted in blue.
void PaintColParts(const std::string &outfile) const;
// Debugger function that renders the blobs in part_grid_ over the input
// image.
void PaintSpecialTexts(const std::string &outfile) const;
// Debugger function that prints the math blobs density values for a
// ColPartition object.
void PrintSpecialBlobsDensity(const ColPartition *part) const;
// The tesseract engine initialized from equation training data.
Tesseract equ_tesseract_;
// The tesseract engine used for OCR. This pointer is passed in by the caller,
// so do NOT destroy it in this class.
// NOTE(review): no in-class initializer, unlike part_grid_ and best_columns_;
// presumably set by the constructor - confirm.
Tesseract *lang_tesseract_;
// The ColPartitionGrid that we are processing. This pointer is passed in from
// the caller, so do NOT destroy it in the class.
ColPartitionGrid *part_grid_ = nullptr;
// A simple array of pointers to the best assigned column division at
// each grid y coordinate. This pointer is passed in from the caller, so do
// NOT destroy it in the class.
ColPartitionSet **best_columns_ = nullptr;
// The super bounding box of all cps in the part_grid_.
// NOTE(review): raw pointer without an in-class initializer; ownership is not
// visible in this header - confirm who allocates and frees it.
TBOX *cps_super_bbox_;
// The seed ColPartition for equation region.
std::vector<ColPartition *> cp_seeds_;
// The resolution (dpi) of the processing image.
// NOTE(review): no in-class initializer; presumably set by the constructor -
// confirm.
int resolution_;
// The number of pages we have processed.
// NOTE(review): no in-class initializer; presumably set by the constructor -
// confirm.
int page_count_;
};
} // namespace tesseract
#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_

View File

@ -0,0 +1,870 @@
/******************************************************************
* File: fixspace.cpp (Formerly fixspace.c)
* Description: Implements a pass over the page res, exploring the alternative
* spacing possibilities, trying to use context to improve the
* word spacing
* Author: Phil Cheatle
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "fixspace.h"
#include "blobs.h" // for TWERD, TBLOB, TESSLINE
#include "boxword.h" // for BoxWord
#include "errcode.h" // for ASSERT_HOST
#include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
#include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
#include "params.h" // for IntParam, StringParam, BoolParam, DoubleParam, ...
#include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
#include "rect.h" // for TBOX
#include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
#include "tessvars.h" // for debug_fp
#include "tprintf.h" // for tprintf
#include "unicharset.h" // for UNICHARSET
#include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
#include <tesseract/ocrclass.h> // for ETEXT_DESC
#include <tesseract/unichar.h> // for UNICHAR_ID
#include <cstdint> // for INT16_MAX, int16_t, int32_t
namespace tesseract {
class BLOCK;
class ROW;
#define PERFECT_WERDS 999
/**********************************************************************
* c_blob_comparator()
*
* Blob comparator used to sort a blob list so that blobs are in increasing
* order of left edge.
**********************************************************************/
static int c_blob_comparator( // sort blobs
const void *blob1p, // ptr to ptr to blob1
const void *blob2p // ptr to ptr to blob2
) {
const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p);
const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p);
return blob1->bounding_box().left() - blob2->bounding_box().left();
}
/**
 * @name fix_fuzzy_spaces()
 * Walks every row of the page looking for sequences of words that are joined
 * by fuzzy spaces (W_FUZZY_NON / W_FUZZY_SP). Each sequence is extracted as a
 * sublist, handed to fix_fuzzy_space_list() to find the best arrangement of
 * spaces, and then put back into the row. Words outside such sequences are
 * passed to fix_sp_fp_word() one at a time. Sequences that contain a word
 * without blobs are skipped.
 *
 * The monitor, if given, is updated after each word; the function returns
 * early when the deadline is exceeded or the cancel callback asks to stop.
 *
 * @param monitor progress monitor, may be nullptr
 * @param word_count count of words in doc, used to scale the progress value
 * @param[out] page_res page results whose word lists are updated in place
 */
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
  BLOCK_RES_IT block_it;
  ROW_RES_IT row_it;
  WERD_RES_IT run_begin; // first word of the current fuzzy run
  WERD_RES_IT run_end;   // last word of the current fuzzy run
  WERD_RES_LIST fuzzy_run;
  int32_t words_seen = 0; // current word

  // Publishes progress to the monitor; true means the caller must stop.
  auto stop_requested = [&]() -> bool {
    if (monitor == nullptr) {
      return false;
    }
    monitor->ocr_alive = true;
    monitor->progress = 90 + 5 * words_seen / word_count;
    return monitor->deadline_exceeded() ||
           (monitor->cancel != nullptr &&
            (*monitor->cancel)(monitor->cancel_this, stats_.dict_words));
  };
  // Turns on verbose fix-space debugging when the word is at the debug point.
  auto enable_debug_at_test_point = [&](WERD_RES *candidate) {
    if (check_debug_pt(candidate, 60)) {
      debug_fix_space_level.set_value(10);
    }
  };
  // True when the following word is attached by a fuzzy space.
  auto next_is_fuzzy = [](WERD_RES_IT &it) -> bool {
    return it.data_relative(1)->word->flag(W_FUZZY_NON) ||
           it.data_relative(1)->word->flag(W_FUZZY_SP);
  };

  block_it.set_to_list(&page_res->block_res_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    row_it.set_to_list(&block_it.data()->row_res_list);
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      run_begin.set_to_list(&row_it.data()->word_res_list);
      while (!run_begin.at_last()) {
        WERD_RES *current = run_begin.data();

        // Words that do not start a fuzzy run are processed individually.
        while (!run_begin.at_last() && !(current->combination || next_is_fuzzy(run_begin))) {
          fix_sp_fp_word(run_begin, row_it.data()->row, block_it.data()->block);
          current = run_begin.forward();
          ++words_seen;
          if (stop_requested()) {
            return;
          }
        }

        if (!run_begin.at_last()) {
          run_end = run_begin;
          // DON'T process blobless wds
          bool run_has_blobless_word = current->word->cblob_list()->empty();
          enable_debug_at_test_point(current);

          run_end.forward();
          ++words_seen;
          if (stop_requested()) {
            return;
          }

          // Extend the run while the next word is still fuzzily attached.
          while (!run_end.at_last() && next_is_fuzzy(run_end)) {
            enable_debug_at_test_point(current);
            if (current->word->cblob_list()->empty()) {
              run_has_blobless_word = true;
            }
            current = run_end.forward();
          }
          enable_debug_at_test_point(current);
          if (current->word->cblob_list()->empty()) {
            run_has_blobless_word = true;
          }

          if (run_has_blobless_word) {
            run_begin = run_end;
          } else {
            fuzzy_run.assign_to_sublist(&run_begin, &run_end);
            fix_fuzzy_space_list(fuzzy_run, row_it.data()->row, block_it.data()->block);
            int16_t remaining = fuzzy_run.length();
            run_begin.add_list_before(&fuzzy_run);
            // Step over the words that were just put back.
            while (!run_begin.at_last() && remaining > 0) {
              run_begin.forward();
              --remaining;
            }
          }
          if (test_pt) {
            debug_fix_space_level.set_value(0);
          }
        }
        fix_sp_fp_word(run_begin, row_it.data()->row, block_it.data()->block);
        // Last word in row
      }
    }
  }
}
/**
 * Searches the possible space arrangements of a run of fuzzily spaced words
 * and leaves the best scoring one in best_perm.
 *
 * The list as given is scored first. Unless it is already perfect
 * (PERFECT_WERDS), a working copy is created with initialise_search() and
 * repeatedly classified, scored and transformed to the next permutation until
 * a perfect score is found or no permutation is left. Whenever a permutation
 * beats the best score so far, best_perm is replaced by a deep copy of it.
 *
 * @param best_perm in: words to improve; out: best arrangement found
 * @param row       row containing the words
 * @param block     block containing the row
 */
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  bool found_better = false;
  int16_t top_score = eval_word_spacing(best_perm); // default score
  dump_words(best_perm, top_score, 1, found_better);

  WERD_RES_LIST candidate;
  if (top_score != PERFECT_WERDS) {
    initialise_search(best_perm, candidate);
  }

  while (top_score != PERFECT_WERDS && !candidate.empty()) {
    match_current_words(candidate, row, block);
    const int16_t candidate_score = eval_word_spacing(candidate);
    dump_words(candidate, candidate_score, 2, found_better);
    if (candidate_score > top_score) {
      best_perm.clear();
      best_perm.deep_copy(&candidate, &WERD_RES::deep_copy);
      top_score = candidate_score;
      found_better = true;
    }
    if (candidate_score < PERFECT_WERDS) {
      transform_to_next_perm(candidate);
    }
  }
  dump_words(best_perm, top_score, 3, found_better);
}
/**
 * Builds the starting permutation for the fuzzy space search: new_list
 * receives a deep copy of every word of src_list that is not a combination.
 * The copies have their combination and part_of_combo flags cleared.
 *
 * @param src_list words to copy from (not modified)
 * @param new_list list the copies are appended to, in the same order
 */
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT from_it(&src_list);
  WERD_RES_IT to_it(&new_list);

  for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) {
    WERD_RES *original = from_it.data();
    if (original->combination) {
      continue; // combinations are not carried over
    }
    WERD_RES *clone = WERD_RES::deep_copy(original);
    clone->combination = false;
    clone->part_of_combo = false;
    to_it.add_after_then_move(clone);
  }
}
/**
 * Classifies every word of the list that has not been classified yet, i.e.
 * every word that is not part of a combination and has no box_word.
 *
 * @param words words of the current permutation
 * @param row   row containing the words
 * @param block block containing the row
 */
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
  // Since we are not using PAGE_RES to iterate over words, we need to update
  // prev_word_best_choice_ before calling classify_word_pass2().
  prev_word_best_choice_ = nullptr;

  WERD_RES_IT it(&words);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    WERD_RES *current = it.data();
    const bool needs_classifying = !current->part_of_combo && current->box_word == nullptr;
    if (needs_classifying) {
      WordData word_data(block, row, current);
      SetupWordPassN(2, &word_data);
      classify_word_and_language(2, nullptr, &word_data);
    }
    prev_word_best_choice_ = current->best_choice;
  }
}
/**
 * @name eval_word_spacing()
 * The basic measure is the number of characters in contextually confirmed
 * words. (I.e. the word is done.)
 * If all words are contextually confirmed the evaluation is deemed perfect.
 *
 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
 * the same as "56163", though given our knowledge that the space is fuzzy, and
 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
 * is preferred.
 *
 * The solution is to NOT COUNT the score of any word which has a digit at one
 * end and a "1Il" as the character the other side of the space.
 *
 * Conversely, any character next to a "1" within a word is counted as a
 * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1
 * side of the "1" joined). "56163" would score 7 - all chars in a numeric word
 * + 2 sides of a "1" joined.
 *
 * The joined 1 rule is applied to any word REGARDLESS of contextual
 * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually
 * confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
 *
 * When tessedit_prefer_joined_punct is set, the same "joined" bonus is given
 * for the punctuation characters !"`',.:;
 *
 * Characters are located in the UTF-8 string through the per-unichar byte
 * lengths, so multi-byte characters are handled correctly by all rules.
 *
 * @param word_res_list words to evaluate; must not be empty
 * @return PERFECT_WERDS if every word is done, otherwise the total score
 */
int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_res_it(&word_res_list);
  int16_t total_score = 0;
  int16_t word_count = 0;
  int16_t done_word_count = 0;
  int16_t i;
  int16_t offset;
  int16_t prev_word_score = 0;
  bool prev_word_done = false;
  bool prev_char_1 = false;     // prev ch a "1/I/l"?
  bool prev_char_digit = false; // prev ch 2..9 or 0
  const char *punct_chars = "!\"`',.:;";
  bool prev_char_punct = false;

  do {
    // current word
    WERD_RES *word = word_res_it.data();
    bool word_done = fixspace_thinks_word_done(word);
    word_count++;
    if (word->tess_failed) {
      total_score += prev_word_score;
      if (prev_word_done) {
        done_word_count++;
      }
      prev_word_score = 0;
      prev_char_1 = false;
      prev_char_digit = false;
      prev_word_done = false;
    } else {
      /*
        Can we add the prev word score and potentially count this word?
        Yes IF it didn't end in a 1 when the first char of this word is a digit
          AND it didn't end in a digit when the first char of this word is a 1
      */
      auto word_len = word->reject_map.length();
      bool current_word_ok_so_far = false;
      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
            (prev_char_digit &&
             ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
               word->best_choice->unichar_string()[0] == '1') ||
              (!word_done &&
               conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done) {
          done_word_count++;
        }
        current_word_ok_so_far = word_done;
      }

      if (current_word_ok_so_far) {
        prev_word_done = true;
        prev_word_score = word_len;
      } else {
        prev_word_done = false;
        prev_word_score = 0;
      }

      /* Add 1 to total score for every joined 1 regardless of context and
         rejtn. The string is indexed by byte offset, not by unichar index,
         so that multi-byte characters in front of a "1" do not shift the
         test onto the wrong byte. */
      for (i = 0, offset = 0, prev_char_1 = false; i < word_len;
           offset += word->best_choice->unichar_lengths()[i++]) {
        bool current_char_1 = word->best_choice->unichar_string()[offset] == '1';
        if (prev_char_1 || (current_char_1 && (i > 0))) {
          total_score++;
        }
        prev_char_1 = current_char_1;
      }

      /* Add 1 to total score for every joined punctuation regardless of context
        and rejtn */
      if (tessedit_prefer_joined_punct) {
        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
             offset += word->best_choice->unichar_lengths()[i++]) {
          bool current_char_punct =
              strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
          if (prev_char_punct || (current_char_punct && i > 0)) {
            total_score++;
          }
          prev_char_punct = current_char_punct;
        }
      }

      // Remember how this word ends for the test against the next word.
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
      // Find the byte offset of the last unichar.
      for (i = 0, offset = 0; i < word_len - 1;
           offset += word->best_choice->unichar_lengths()[i++]) {
        ;
      }
      prev_char_1 =
          ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
           (!word_done &&
            conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
    }
    /* Find next word */
    do {
      word_res_it.forward();
    } while (word_res_it.data()->part_of_combo);
  } while (!word_res_it.at_first());

  // The score of the final word is still pending.
  total_score += prev_word_score;
  if (prev_word_done) {
    done_word_count++;
  }
  if (done_word_count == word_count) {
    return PERFECT_WERDS;
  } else {
    return total_score;
  }
}
// Returns true if the character at char_position in the word's best choice is
// a digit according to the word's unicharset, or - when the best choice came
// from the number permuter - if its first byte is one of the
// numeric_punctuation characters.
bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  auto *choice = word->best_choice;
  // Walk the per-character byte lengths to find where the requested character
  // starts inside the string.
  int index = 0;
  int byte_offset = 0;
  while (index < char_position) {
    byte_offset += choice->unichar_lengths()[index];
    ++index;
  }
  const char *text = choice->unichar_string().c_str();
  if (word->uch_set->get_isdigit(text + byte_offset, choice->unichar_lengths()[index])) {
    return true;
  }
  return choice->permuter() == NUMBER_PERM && numeric_punctuation.contains(text[byte_offset]);
}
/**
* @name transform_to_next_perm()
* Examines the current word list to find the smallest word gap size. Then walks
* the word list closing any gaps of this size by either inserting new
* combination words, or extending existing ones.
*
* The routine COULD be limited to stop it building words longer than N blobs.
*
* If there are no more gaps then it DELETES the entire list and returns the
* empty list to cause termination.
*/
void transform_to_next_perm(WERD_RES_LIST &words) {
WERD_RES_IT word_it(&words);
WERD_RES_IT prev_word_it(&words);
WERD_RES *word;
WERD_RES *prev_word;
WERD_RES *combo;
WERD *copy_word;
// -INT16_MAX is used as the "no previous word seen yet" marker.
int16_t prev_right = -INT16_MAX;
TBOX box;
int16_t gap;
// INT16_MAX means "no gap found"; it survives the first pass only when there
// are fewer than two words that are not part of a combination.
int16_t min_gap = INT16_MAX;
// Pass 1: find the smallest horizontal gap between consecutive words, ignoring
// words that are already part of a combination.
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
word = word_it.data();
if (!word->part_of_combo) {
box = word->word->bounding_box();
if (prev_right > -INT16_MAX) {
gap = box.left() - prev_right;
if (gap < min_gap) {
min_gap = gap;
}
}
prev_right = box.right();
}
}
if (min_gap < INT16_MAX) {
// Pass 2: close every gap that is no wider than min_gap.
prev_right = -INT16_MAX; // back to start
word_it.set_to_list(&words);
// Note: we can't use cycle_pt due to inserted combos at start of list.
for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
word = word_it.data();
if (!word->part_of_combo) {
box = word->word->bounding_box();
if (prev_right > -INT16_MAX) {
gap = box.left() - prev_right;
if (gap <= min_gap) {
// Join the current word onto the word (or combination) that
// prev_word_it refers to.
prev_word = prev_word_it.data();
if (prev_word->combination) {
combo = prev_word;
} else {
/* Make a new combination and insert before
* the first word being joined. */
copy_word = new WERD;
*copy_word = *(prev_word->word);
// deep copy
combo = new WERD_RES(copy_word);
combo->combination = true;
combo->x_height = prev_word->x_height;
prev_word->part_of_combo = true;
prev_word_it.add_before_then_move(combo);
}
// The combination takes over the end-of-line flag of the word being joined.
combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
if (word->combination) {
combo->word->join_on(word->word);
// Move blobs to combo
// old combo no longer needed
delete word_it.extract();
} else {
// Copy current wd to combo
combo->copy_on(word);
word->part_of_combo = true;
}
// The combination has changed, so its done flag and results are reset.
combo->done = false;
combo->ClearResults();
} else {
prev_word_it = word_it; // catch up
}
}
prev_right = box.right();
}
}
} else {
words.clear(); // signal termination
}
}
// Debug output for the spacing search, controlled by debug_fix_space_level.
//   perm     - the word list to print (entries that are part_of_combo are
//              not printed).
//   score    - score reported alongside the words at debug level > 1.
//   mode     - 1 = EXTRACTED, 2 = TESTED, 3 = RETURNED. Mode 1 also saves the
//              joined text in stats_.dump_words_str for a later "FIX SPACING"
//              line.
//   improved - at debug level 1, words are printed only when this is true.
void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
  WERD_RES_IT word_res_it(&perm);
  if (debug_fix_space_level <= 0) {
    return;
  }

  if (mode == 1) {
    // Remember the starting text so an improvement can be reported against it.
    stats_.dump_words_str = "";
    for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
      WERD_RES *entry = word_res_it.data();
      if (!entry->part_of_combo) {
        stats_.dump_words_str += entry->best_choice->unichar_string();
        stats_.dump_words_str += ' ';
      }
    }
  }

  const bool verbose = debug_fix_space_level > 1;
  if (!verbose && !improved) {
    return;
  }

  // Prints each word as "text/permuter " and then closes the quoted line.
  auto print_words = [&word_res_it]() {
    for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
      WERD_RES *entry = word_res_it.data();
      if (!entry->part_of_combo) {
        tprintf("%s/%1d ", entry->best_choice->unichar_string().c_str(),
                static_cast<int>(entry->best_choice->permuter()));
      }
    }
    tprintf("\"\n");
  };

  if (verbose) {
    if (mode == 1) {
      tprintf("EXTRACTED (%d): \"", score);
    } else if (mode == 2) {
      tprintf("TESTED (%d): \"", score);
    } else if (mode == 3) {
      tprintf("RETURNED (%d): \"", score);
    }
  } else {
    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
  }
  print_words();
}
// Decides whether a word counts as "done" for the purposes of space fixing.
// A word is done if its done flag is already set, or if all of the following
// hold:
//   - fixsp_done_mode is positive;
//   - the word is tess_accepted, or the mode is 2 and nothing is rejected,
//     or the mode is 3;
//   - the best choice text contains no space;
//   - the best choice came from the system, frequent-word or user dawg, or
//     from the number permuter.
// Unlike the ordinary pass 2 acceptance test, ambiguity is deliberately not
// checked: for spacing it does not matter whether the word is of/at, on/an
// etc.
bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
  if (word->done) {
    return true;
  }
  if (!(fixsp_done_mode > 0)) {
    return false;
  }

  const bool accepted_for_mode =
      word->tess_accepted ||
      (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || fixsp_done_mode == 3;
  if (!accepted_for_mode) {
    return false;
  }

  if (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr) {
    return false;
  }

  const auto source = word->best_choice->permuter();
  return source == SYSTEM_DAWG_PERM || source == FREQ_DAWG_PERM || source == USER_DAWG_PERM ||
         source == NUMBER_PERM;
}
/**
* @name fix_sp_fp_word()
* Test the current word to see if it can be split by deleting noise blobs. If
* so, do the business.
* Return with the iterator pointing to the same place if the word is unchanged,
* or the last of the replacement words.
*/
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
  WERD_RES *current = word_res_it.data();
  WERD *werd = current->word;

  // Only plain words flagged W_DONT_CHOP are candidates: repeated-character
  // words, combinations and parts of combinations are left alone.
  if (werd->flag(W_REP_CHAR) || current->combination || current->part_of_combo ||
      !werd->flag(W_DONT_CHOP)) {
    return;
  }

  // Nothing to do unless the word holds a blob that qualifies as noise; the
  // score itself is not needed here.
  float unused_score;
  if (worst_noise_blob(current, &unused_score) < 0) {
    return;
  }

  if (debug_fix_space_level > 1) {
    tprintf("FP fixspace working on \"%s\"\n", current->best_choice->unichar_string().c_str());
  }

  werd->rej_cblob_list()->sort(c_blob_comparator);

  // Move the word into a private list, let the noise splitter rewrite that
  // list, then splice the result back in front of the iterator position.
  WERD_RES_LIST sub_word_list;
  WERD_RES_IT sub_word_list_it(&sub_word_list);
  sub_word_list_it.add_after_stay_put(word_res_it.extract());
  fix_noisy_space_list(sub_word_list, row, block);

  int16_t remaining = sub_word_list.length();
  word_res_it.add_list_before(&sub_word_list);

  // Step forward once per extra replacement word (list length minus one),
  // stopping early if the end of the list is reached.
  while (!word_res_it.at_last() && remaining > 1) {
    word_res_it.forward();
    remaining--;
  }
}
// Searches for a better split of a fixed pitch word by repeatedly deleting
// the noisiest blob and re-recognising the pieces.
// best_perm holds the word on entry. It is replaced by a candidate split
// whenever that candidate scores strictly higher with fp_eval_word_spacing().
// row and block are passed through to match_current_words().
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  bool improved = false;

  // The score of the list as given is the one to beat.
  int16_t best_score = fp_eval_word_spacing(best_perm);
  dump_words(best_perm, best_score, 1, improved);

  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES *original = best_perm_it.data();

  // Seed the working list with a copy of the original word. deep_copy only
  // copies the underlying WERD when the combination flag is set, so the flag
  // is raised for the duration of the copy and then restored.
  WERD_RES_LIST current_perm;
  WERD_RES_IT current_perm_it(&current_perm);
  original->combination = true;
  current_perm_it.add_to_end(WERD_RES::deep_copy(original));
  original->combination = false;

  break_noisiest_blob_word(current_perm);

  // break_noisiest_blob_word() empties the list when nothing is left to
  // split, which ends the search.
  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    const int16_t candidate_score = fp_eval_word_spacing(current_perm);
    dump_words(current_perm, candidate_score, 2, improved);

    if (candidate_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = candidate_score;
      improved = true;
    }

    if (candidate_score < PERFECT_WERDS) {
      break_noisiest_blob_word(current_perm);
    }
  }

  dump_words(best_perm, best_score, 3, improved);
}
/**
* break_noisiest_blob_word()
* Find the word with the blob which looks like the worst noise.
* Break the word into two, deleting the noise blob.
*/
// Splits one word of the list in two at its noisiest blob, which is deleted.
// If no word has a candidate noise blob the whole list is cleared, which
// callers use as the signal to stop searching.
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
WERD_RES_IT word_it(&words);
WERD_RES_IT worst_word_it;
// Lower scores are noisier: a word replaces the current worst only when its
// score is below the best (lowest) score seen so far.
float worst_noise_score = 9999;
int worst_blob_index = -1; // Noisiest blob of noisiest wd
int blob_index; // of wds noisiest blob
float noise_score; // of wds noisiest blob
WERD_RES *word_res;
C_BLOB_IT blob_it;
C_BLOB_IT rej_cblob_it;
C_BLOB_LIST new_blob_list;
C_BLOB_IT new_blob_it;
C_BLOB_IT new_rej_cblob_it;
WERD *new_word;
int16_t start_of_noise_blob;
int16_t i;
// Find the word whose worst blob has the lowest noise score.
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
blob_index = worst_noise_blob(word_it.data(), &noise_score);
if (blob_index > -1 && worst_noise_score > noise_score) {
worst_noise_score = noise_score;
worst_blob_index = blob_index;
worst_word_it = word_it;
}
}
if (worst_blob_index < 0) {
words.clear(); // signal termination
return;
}
/* Now split the worst_word_it */
word_res = worst_word_it.data();
/* Move blobs before noise blob to a new bloblist */
new_blob_it.set_to_list(&new_blob_list);
blob_it.set_to_list(word_res->word->cblob_list());
for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
new_blob_it.add_after_then_move(blob_it.extract());
}
// blob_it now refers to the noise blob. Remember where it started so the
// rejected blobs can be divided at the same x position.
start_of_noise_blob = blob_it.data()->bounding_box().left();
delete blob_it.extract(); // throw out noise blob
// new_word receives the blobs that preceded the noise blob; the original
// word keeps the blobs that followed it.
new_word = new WERD(&new_blob_list, word_res->word);
new_word->set_flag(W_EOL, false);
word_res->word->set_flag(W_BOL, false);
word_res->word->set_blanks(1); // After break
// Move the rejected blobs that start left of the noise blob to the new word.
// The loop stops at the first rejected blob that does not; presumably this
// relies on the rejected blob list being sorted left to right - TODO confirm.
new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
for (; (!rej_cblob_it.empty() &&
(rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
rej_cblob_it.forward()) {
new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
}
auto *new_word_res = new WERD_RES(new_word);
// NOTE(review): presumably flagged as a combination so that later deep
// copies also copy new_word - confirm against WERD_RES::deep_copy.
new_word_res->combination = true;
worst_word_it.add_before_then_move(new_word_res);
// The remaining half has lost blobs, so its old results are cleared.
word_res->ClearResults();
}
// Finds the blob of word_res that looks most like noise and is far enough
// from both ends of the word.
//
// Each blob gets a noise score: non_noise_limit if its reject map entry is
// accepted, otherwise blob_noise_score(). Lower scores are noisier. The
// candidate range is bounded on each side by skipping fixsp_non_noise_limit
// blobs whose score is at least non_noise_limit.
//
// Returns the index of the lowest-scoring blob in the candidate range whose
// score is below kBlnXHeight * fixsp_small_outlines_size, and stores that
// score in *worst_noise_score. Returns -1 if there is no such blob, if the
// word has no rebuild_word, or if it has fewer than 5 blobs. On the early -1
// returns *worst_noise_score is left untouched.
int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  float noise_score[512];
  int i;
  int min_noise_blob; // 1st contender
  int max_noise_blob; // last contender
  int non_noise_count;
  int worst_noise_blob; // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == nullptr) {
    return -1; // Can't handle cube words.
  }

  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5) {
    return -1; // too short to split
  }

  /* Get the noise scores for all blobs */

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5) {
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().c_str());
  }
#endif

  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
    TBLOB *blob = word_res->rebuild_word->blobs[i];
    if (word_res->reject_map[i].accepted()) {
      noise_score[i] = non_noise_limit;
    } else {
      noise_score[i] = blob_noise_score(blob);
    }

    if (debug_fix_space_level > 5) {
      tprintf("%1.1f ", noise_score[i]);
    }
  }
  if (debug_fix_space_level > 5) {
    tprintf("\n");
  }

  // The loop above stops early when rebuild_word holds fewer blobs than
  // box_word. Only the scored prefix of noise_score[] is initialised, so limit
  // every scan below to it instead of reading indeterminate values. In the
  // normal case the two counts agree and this changes nothing.
  blob_count = i;

  /* Now find the worst one which is far enough away from the end of the word */

  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit) {
    return -1;
  }

  min_noise_blob = i;

  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit) {
    return -1;
  }

  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob) {
    return -1;
  }

  // Only blobs scoring below small_limit can be chosen.
  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}
// Returns a noise score for a blob; smaller values look more like noise.
// The score starts as the largest width or height of any outline of the blob.
// It is doubled when the blob has more than 5 outlines, and halved when the
// blob's bounding box lies high (bottom above 4 * kBlnBaselineOffset) or low
// (top below kBlnBaselineOffset / 2).
float Tesseract::blob_noise_score(TBLOB *blob) {
  int16_t num_outlines = 0;
  int16_t biggest_extent = 0;

  for (TESSLINE *outline = blob->outlines; outline != nullptr; outline = outline->next) {
    ++num_outlines;
    TBOX outline_box = outline->bounding_box();
    // Larger of the outline's two dimensions.
    int16_t extent;
    if (outline_box.height() > outline_box.width()) {
      extent = outline_box.height();
    } else {
      extent = outline_box.width();
    }
    if (biggest_extent < extent) {
      biggest_extent = extent;
    }
  }

  // Many outlines make the blob less likely to be picked as noise.
  if (num_outlines > 5) {
    biggest_extent *= 2;
  }

  // Blobs sitting high or low are treated more leniently as noise candidates.
  TBOX blob_box = blob->bounding_box();
  const bool sits_high = blob_box.bottom() > kBlnBaselineOffset * 4;
  if (sits_high || blob_box.top() < kBlnBaselineOffset / 2) {
    biggest_extent /= 2;
  }

  return biggest_extent;
}
// Prints debug information about a word: its bounding box, best choice text,
// blob counts, reject map, and the tess_accepted and done flags.
void fixspace_dbg(WERD_RES *word) {
  // Compile-time switch for a per-character dump of the reject map.
  const bool dump_each_char = false;

  TBOX word_box = word->word->bounding_box();
  word_box.print();

  const auto &text = word->best_choice->unichar_string();
  tprintf(" \"%s\" ", text.c_str());
  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
          word->rebuild_word->NumBlobs(), word->box_word->length());
  word->reject_map.print(debug_fp);
  tprintf("\n");

  if (dump_each_char) {
    tprintf("\"%s\"\n", text.c_str());
    for (int16_t pos = 0; text[pos] != '\0'; pos++) {
      tprintf("**** \"%c\" ****\n", text[pos]);
      word->reject_map[pos].full_print(debug_fp);
    }
  }

  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
}
/**
* fp_eval_word_spacing()
* Evaluation function for fixed pitch word lists.
*
* Basically, count the number of "nice" characters - those which are in tess
* acceptable words or in dict words and are not rejected.
* Penalise any potential noise chars
*/
int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  // Blobs scoring below this are treated as potential noise.
  const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  int16_t score = 0;

  WERD_RES_IT word_it(&word_res_list);
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD_RES *word = word_it.data();
    if (word->rebuild_word == nullptr) {
      continue; // Can't handle cube words.
    }

    // Only words that are done, accepted, or found in a dictionary contribute.
    const bool countable =
        word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0;
    if (!countable) {
      continue;
    }

    int num_blobs = word->rebuild_word->NumBlobs();
    UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
    for (int16_t pos = 0; pos < word->best_choice->length() && pos < num_blobs; ++pos) {
      TBLOB *blob = word->rebuild_word->blobs[pos];
      const bool suspect =
          word->best_choice->unichar_id(pos) == space || blob_noise_score(blob) < small_limit;
      if (suspect) {
        score -= 1; // penalise possibly erroneous non-space
      } else if (word->reject_map[pos].accepted()) {
        score++;
      }
    }
  }

  // The score never goes below zero.
  if (score < 0) {
    score = 0;
  }
  return score;
}
} // namespace tesseract

View File

@ -0,0 +1,36 @@
/******************************************************************
* File: fixspace.h (Formerly fixspace.h)
* Description: Implements a pass over the page res, exploring the alternative
* spacing possibilities, trying to use context to improve the
* word spacing
* Author: Phil Cheatle
* Created: Thu Oct 21 11:38:43 BST 1993
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef FIXSPACE_H
#define FIXSPACE_H
namespace tesseract {
// Forward declarations so this header does not need the full definitions.
class WERD_RES;
class WERD_RES_LIST;
// NOTE(review): presumably sets up new_list from src_list as the starting
// point of the spacing search - confirm against the definition.
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
// Closes the smallest word gap(s) in the list by creating or extending
// combination words. Clears the list when there are no gaps left, which
// signals termination to the caller.
void transform_to_next_perm(WERD_RES_LIST &words);
// Prints debug information about a word (bounding box, best choice text, blob
// counts, reject map and acceptance flags).
void fixspace_dbg(WERD_RES *word);
} // namespace tesseract
#endif

View File

@ -0,0 +1,215 @@
/**********************************************************************
* File: fixxht.cpp (Formerly fixxht.c)
* Description: Improve x_ht and look out for case inconsistencies
* Author: Phil Cheatle
* Created: Thu Aug 5 14:11:08 BST 1993
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "float2int.h"
#include "params.h"
#include "tesseractclass.h"
#include <algorithm>
#include <cctype>
#include <cstring>
namespace tesseract {
// Fixxht overview.
// Premise: Initial estimate of x-height is adequate most of the time, but
// occasionally it is incorrect. Most notable causes of failure are:
// 1. Small caps, where the top of the caps is the same as the body text
// xheight. For small caps words the xheight needs to be reduced to correctly
// recognize the caps in the small caps word.
// 2. All xheight lines, such as summer. Here the initial estimate will have
// guessed that the blob tops are caps and will have placed the xheight too low.
// 3. Noise/logos beside words, or changes in font size on a line. Such
// things can blow the statistics and cause an incorrect estimate.
// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
// In this case the x-height is often still correct.
//
// Algorithm.
// Compare the vertical position (top only) of alphanumerics in a word with
// the range of positions in training data (in the unicharset).
// See CountMisfitTops. If any characters disagree sufficiently with the
// initial xheight estimate, then recalculate the xheight, re-run OCR on
// the word, and if the number of vertical misfits goes down, along with
// either the word rating or certainty, then keep the new xheight.
// The new xheight is calculated as follows (see ComputeCompatibleXheight):
// For each alphanumeric character that has a vertically misplaced top
// (a misfit), yet its bottom is within the acceptable range (ie it is not
// likely a sub-or super-script) calculate the range of acceptable xheight
// positions from its range of tops, and give each value in the range a
// number of votes equal to the distance of its top from its acceptance range.
// The x-height position with the median of the votes becomes the new
// x-height. This assumes that most characters will be correctly recognized
// even if the x-height is incorrect. This is not a terrible assumption, but
// it is not great. An improvement would be to use a classifier that does
// not care about vertical position or scaling at all.
// Separately collect stats on shifted baselines and apply the same logic to
// computing a best-fit shift to fix the error. If the baseline needs to be
// shifted, but the x-height is OK, returns the original x-height along with
// the baseline shift to indicate that recognition needs to re-run.
// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
// then the char top cannot be used to judge misfits or suggest a new top.
constexpr int kMaxCharTopRange = 48;
// Returns the number of misfit blob tops in this word.
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
  int misfit_count = 0;
  const int blob_total = word_res->rebuild_word->NumBlobs();

  for (int index = 0; index < blob_total; ++index) {
    TBLOB *blob = word_res->rebuild_word->blobs[index];
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(index);

    // Only letters and digits are judged.
    if (!unicharset.get_isalpha(class_id) && !unicharset.get_isdigit(class_id)) {
      continue;
    }

    // Clip the top to the upper limit of the feature range.
    const int top = std::min<int>(blob->bounding_box().top(), INT_FEAT_RANGE - 1);

    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);

    // Characters whose expected top range is too wide cannot be judged.
    if (max_top - min_top > kMaxCharTopRange) {
      continue;
    }

    const bool below_range = top < min_top - x_ht_acceptance_tolerance;
    const bool bad = below_range || top > max_top + x_ht_acceptance_tolerance;
    if (bad) {
      ++misfit_count;
    }

    if (debug_x_ht_level >= 1) {
      tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
              unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
              static_cast<int>(x_ht_acceptance_tolerance));
    }
  }

  return misfit_count;
}
// Returns a new x-height maximally compatible with the result in word_res.
// See comment above for overall algorithm.
// word_res:       the recognised word; its rebuild_word blobs and best_choice
//                 are read.
// baseline_shift: output; always set to -bottom_shift / denorm.y_scale().
// Returns the new x-height (median vote divided by denorm.y_scale()) when it
// differs from kBlnXHeight by at least x_ht_min_change. Otherwise returns
// word_res->x_height if a non-zero bottom shift was chosen, or 0.0f if not.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
// Votes for candidate x-heights and for candidate bottom shifts.
STATS top_stats(0, UINT8_MAX);
STATS shift_stats(-UINT8_MAX, UINT8_MAX);
int bottom_shift = 0;
int num_blobs = word_res->rebuild_word->NumBlobs();
// Votes go into shift_stats only while bottom_shift == 0, so a repeat pass
// with a non-zero shift only re-collects top_stats with the shift applied.
do {
top_stats.clear();
shift_stats.clear();
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top() + bottom_shift;
// Clip the top to the limit of normalized feature space.
if (top >= INT_FEAT_RANGE) {
top = INT_FEAT_RANGE - 1;
}
int bottom = blob->bounding_box().bottom() + bottom_shift;
int min_bottom, max_bottom, min_top, max_top;
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
// Chars with a wild top range would mess up the result so ignore them.
if (max_top - min_top > kMaxCharTopRange) {
continue;
}
// Positive when the top lies outside the tolerated [min_top, max_top] range;
// the value is how far outside it is.
int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
top - (max_top + x_ht_acceptance_tolerance));
int height = top - kBlnBaselineOffset;
if (debug_x_ht_level >= 2) {
tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
max_top, bottom, top);
}
// Use only chars that fit in the expected bottom range, and where
// the range of tops is sensibly near the xheight.
if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
// Compute the x-height position using proportionality between the
// actual height and expected height.
int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
if (debug_x_ht_level >= 2) {
tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
}
// The range of expected heights gets a vote equal to the distance
// of the actual top from the expected top.
for (int y = min_xht; y <= max_xht; ++y) {
top_stats.add(y, misfit_dist);
}
} else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
bottom - x_ht_acceptance_tolerance > max_bottom) &&
bottom_shift == 0) {
// Get the range of required bottom shift.
int min_shift = min_bottom - bottom;
int max_shift = max_bottom - bottom;
if (debug_x_ht_level >= 2) {
tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
}
// The range of expected shifts gets a vote equal to the min distance
// of the actual bottom from the expected bottom, spread over the
// range of its acceptance.
int misfit_weight = abs(min_shift);
if (max_shift > min_shift) {
// Integer division: the per-value weight is truncated.
misfit_weight /= max_shift - min_shift;
}
for (int y = min_shift; y <= max_shift; ++y) {
shift_stats.add(y, misfit_weight);
}
} else {
if (bottom_shift == 0) {
// Things with bottoms that are already ok need to say so, on the
// 1st iteration only.
shift_stats.add(0, kBlnBaselineOffset);
}
if (debug_x_ht_level >= 2) {
tprintf(" already OK\n");
}
}
}
}
// When the shift votes outweigh the x-height votes, adopt the median shift.
if (shift_stats.get_total() > top_stats.get_total()) {
bottom_shift = IntCastRounded(shift_stats.median());
if (debug_x_ht_level >= 2) {
tprintf("Applying bottom shift=%d\n", bottom_shift);
}
}
} while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
// Baseline shift is opposite sign to the bottom shift.
*baseline_shift = -bottom_shift / word_res->denorm.y_scale();
if (debug_x_ht_level >= 2) {
tprintf("baseline shift=%g\n", *baseline_shift);
}
// No x-height votes at all: report the existing x-height if only the baseline
// needs to move, otherwise 0.
if (top_stats.get_total() == 0) {
return bottom_shift != 0 ? word_res->x_height : 0.0f;
}
// The new xheight is just the median vote, which is then scaled out
// of BLN space back to pixel space to get the x-height in pixel space.
float new_xht = top_stats.median();
if (debug_x_ht_level >= 2) {
tprintf("Median xht=%f\n", new_xht);
tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
new_xht / word_res->denorm.y_scale());
}
// The xheight must change by at least x_ht_min_change to be used.
if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
return new_xht / word_res->denorm.y_scale();
} else {
return bottom_shift != 0 ? word_res->x_height : 0.0f;
}
}
} // namespace tesseract

View File

@ -0,0 +1,314 @@
///////////////////////////////////////////////////////////////////////
// File: linerec.cpp
// Description: Top-level line-based recognition module for Tesseract.
// Author: Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#include "tesseractclass.h"
#include <allheaders.h>
#include "boxread.h"
#include "imagedata.h" // for ImageData
#include "lstmrecognizer.h"
#include "pageres.h"
#include "recodebeam.h"
#include "tprintf.h"
#include <algorithm>
namespace tesseract {
// Multiplier that makes certainty values more comparable to Tesseract's.
constexpr float kCertaintyScale = 7.0f;
// Lowest certainty that is still acceptable for a dictionary word.
constexpr float kWorstDictCertainty = -25.0f;
// Generates training data for training a line recognizer, eg LSTM.
// Breaks the page into lines, according to the boxes, and writes them to a
// serialized DocumentData based on output_basename.
// Return true if successful, false if an error occurred.
bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
BLOCK_LIST *block_list) {
std::string lstmf_name = output_basename + ".lstmf";
DocumentData images(lstmf_name);
if (applybox_page > 0) {
// Load existing document for the previous pages.
if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
return false;
}
}
std::vector<TBOX> boxes;
std::vector<std::string> texts;
// Get the boxes for this page, if there are any.
if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
boxes.empty()) {
tprintf("Failed to read boxes from %s\n", input_imagename);
return false;
}
TrainFromBoxes(boxes, texts, block_list, &images);
if (images.PagesSize() == 0) {
tprintf("Failed to read pages from %s\n", input_imagename);
return false;
}
images.Shuffle();
if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
return false;
}
return true;
}
// Generates training data for training a line recognizer, eg LSTM.
// Breaks the boxes into lines, normalizes them, converts to ImageData and
// appends them to the given training_data.
void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
BLOCK_LIST *block_list, DocumentData *training_data) {
auto box_count = boxes.size();
// Process all the text lines in this page, as defined by the boxes.
unsigned end_box = 0;
// Don't let \t, which marks newlines in the box file, get into the line
// content, as that makes the line unusable in training.
while (end_box < texts.size() && texts[end_box] == "\t") {
++end_box;
}
for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
// Find the textline of boxes starting at start and their bounding box.
TBOX line_box = boxes[start_box];
std::string line_str = texts[start_box];
for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
line_box += boxes[end_box];
line_str += texts[end_box];
}
// Find the most overlapping block.
BLOCK *best_block = nullptr;
int best_overlap = 0;
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK *block = b_it.data();
if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
continue; // Not a text block.
}
TBOX block_box = block->pdblk.bounding_box();
block_box.rotate(block->re_rotation());
if (block_box.major_overlap(line_box)) {
TBOX overlap_box = line_box.intersection(block_box);
if (overlap_box.area() > best_overlap) {
best_overlap = overlap_box.area();
best_block = block;
}
}
}
ImageData *imagedata = nullptr;
if (best_block == nullptr) {
tprintf("No block overlapping textline: %s\n", line_str.c_str());
} else {
imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
}
if (imagedata != nullptr) {
training_data->AddPageToDocument(imagedata);
}
// Don't let \t, which marks newlines in the box file, get into the line
// content, as that makes the line unusable in training.
while (end_box < texts.size() && texts[end_box] == "\t") {
++end_box;
}
}
}
// Returns an Imagedata containing the image of the given box,
// and ground truth boxes/truth text if available in the input.
// The image is not normalized in any way.
ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
                                  const std::vector<std::string> &texts, int start_box, int end_box,
                                  const BLOCK &block) {
  TBOX revised_box;
  ImageData *line_image = GetRectImage(line_box, block, kImagePadding, &revised_box);
  if (line_image == nullptr) {
    return nullptr;
  }
  line_image->set_page_number(applybox_page);

  // Each box in [start_box, end_box) is rotated by the block's re_rotation
  // with the y component negated, then shifted so that it is relative to the
  // bottom-left corner of the extracted image.
  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
  ICOORD shift = -revised_box.botleft();

  std::vector<TBOX> line_boxes;
  std::vector<std::string> line_texts;
  for (int index = start_box; index < end_box; ++index) {
    line_boxes.push_back(boxes[index]);
    TBOX &placed = line_boxes.back();
    placed.rotate(block_rotation);
    placed.move(shift);
    line_texts.push_back(texts[index]);
  }

  // Every box of the line belongs to the page currently being processed.
  std::vector<int> page_numbers(line_boxes.size(), static_cast<int>(applybox_page));
  line_image->AddBoxes(line_boxes, line_texts, page_numbers);
  return line_image;
}
// Helper that clips the image of a rectangle out of BestPix(), using
// block.re_rotation() if needed to get to the image, and rotating the result
// back to horizontal layout. (CJK characters will be on their left sides.)
// The vertical flag passed to the returned ImageData is true when one or three
// 90 degree rotations were required. *revised_box receives the padded box,
// clipped to the image, which enables calculation of output bounding boxes.
// Returns nullptr if the clipped box is empty or the clip itself fails.
// The returned ImageData is allocated with new.
ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
                                   TBOX *revised_box) const {
  // Copy before writing to *revised_box: LSTMRecognizeWord passes the same
  // TBOX as both box and revised_box.
  TBOX padded_box = box;
  padded_box.pad(padding, padding);
  *revised_box = padded_box;

  const FCOORD re_rot = block.re_rotation();
  // Number of clockwise 90 degree rotations needed to get back to tesseract
  // coords from the clipped image.
  int num_rotations = 0;
  if (re_rot.y() > 0.0f) {
    num_rotations = 1;
  } else if (re_rot.x() < 0.0f) {
    num_rotations = 2;
  } else if (re_rot.y() < 0.0f) {
    num_rotations = 3;
  }

  // Handle two cases automatically: 1 the box came from the block, 2 the box
  // came from a box file, and refers to the image, which the block may not.
  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
    revised_box->rotate(re_rot);
  }

  // Now revised_box always refers to the image.
  // BestPix is never colormapped, but may be of any depth.
  Image pix = BestPix();
  const int width = pixGetWidth(pix);
  const int height = pixGetHeight(pix);
  // Clip to image bounds.
  *revised_box &= TBOX(0, 0, width, height);
  if (revised_box->null_box()) {
    return nullptr;
  }

  // The y origin is flipped when building the clip rectangle.
  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
                            revised_box->height());
  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
  boxDestroy(&clip_box);
  if (box_pix == nullptr) {
    return nullptr;
  }

  if (num_rotations > 0) {
    Image upright_pix = pixRotateOrth(box_pix, num_rotations);
    box_pix.destroy();
    box_pix = upright_pix;
  }

  // Convert sub-8-bit images to 8 bit.
  if (pixGetDepth(box_pix) < 8) {
    Image grey_pix = pixConvertTo8(box_pix, false);
    box_pix.destroy();
    box_pix = grey_pix;
  }

  if (num_rotations > 0) {
    // Rotate the clipped revised box back to internal coordinates.
    revised_box->rotate(FCOORD(re_rot.x(), -re_rot.y()));
  }
  // A half turn (2 rotations) keeps the text horizontal.
  const bool vertical_text = num_rotations == 1 || num_rotations == 3;
  return new ImageData(vertical_text, box_pix);
}
// Recognizes a word or group of words, converting to WERD_RES in *words.
// Analogous to classify_word_pass1, but can handle a group of words as well.
void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
PointerVector<WERD_RES> *words) {
TBOX word_box = word->word->bounding_box();
// Get the word image - no frills.
if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
// In single word mode, use the whole image without any other row/word
// interpretation.
word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
} else {
float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
if (baseline + row->descenders() < word_box.bottom()) {
word_box.set_bottom(baseline + row->descenders());
}
if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
word_box.set_top(baseline + row->x_height() + row->ascenders());
}
}
ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
if (im_data == nullptr) {
return;
}
bool do_invert = tessedit_do_invert;
lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
kWorstDictCertainty / kCertaintyScale, word_box, words,
lstm_choice_mode, lstm_choice_iterations);
delete im_data;
SearchWords(words);
}
// Finalizes the words produced by the LSTM recognizer.
// A word without a best_choice is a dud and is replaced by a fake word.
// For every other word the best_state is filled from the best_choice states,
// the reject map is initialised, the done/accepted flags are set, the
// certainty is rescaled by kCertaintyScale and tess_accepted is finally
// decided by the stopper dictionary's AcceptableResult.
// The dictionary of the LSTM recognizer is used as the stopper dictionary,
// falling back to getDict() when the recognizer has none.
void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
  const Dict *stopper_dict = lstm_recognizer_->GetDict();
  if (stopper_dict == nullptr) {
    stopper_dict = &getDict();
  }
  // NOTE: an earlier scan that computed an "any_nonspace_delimited" flag was
  // removed here; its result was never read.
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES *word = (*words)[w];
    if (word->best_choice == nullptr) {
      // It is a dud.
      word->SetupFake(lstm_recognizer_->GetUnicharset());
      continue;
    }
    // Set the best state.
    for (int i = 0; i < word->best_choice->length(); ++i) {
      int length = word->best_choice->state(i);
      word->best_state.push_back(length);
    }
    word->reject_map.initialise(word->best_choice->length());
    word->tess_failed = false;
    word->tess_accepted = true;
    word->tess_would_adapt = false;
    word->done = true;
    word->tesseract = this;
    // The word is only as certain as the weaker of its text and its
    // preceding space.
    float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
    word_certainty *= kCertaintyScale;
    if (getDict().stopper_debug_level >= 1) {
      tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
              word->best_choice->certainty(), word->space_certainty,
              std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
              word_certainty);
      word->best_choice->print();
    }
    word->best_choice->set_certainty(word_certainty);
    // Overrides the provisional "accepted" set above.
    word->tess_accepted = stopper_dict->AcceptableResult(word);
  }
}
} // namespace tesseract.

View File

@ -0,0 +1,507 @@
///////////////////////////////////////////////////////////////////////
// File: ltrresultiterator.cpp
// Description: Iterator for tesseract results in strict left-to-right
// order that avoids using tesseract internal data structures.
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <tesseract/ltrresultiterator.h>
#include "pageres.h"
#include "tesseractclass.h"
#include <allheaders.h>
namespace tesseract {
// Constructor. All arguments are forwarded unchanged to the PageIterator base
// class; the line separator and the paragraph separator both start as "\n".
LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
int scaled_yres, int rect_left, int rect_top, int rect_width,
int rect_height)
: PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top, rect_width,
rect_height)
, line_separator_("\n")
, paragraph_separator_("\n") {}
// Destructor (defaulted).
// It is defined out of line here, so the compiler can create a single vtable
// in this translation unit instead of weak vtables in every compilation unit.
LTRResultIterator::~LTRResultIterator() = default;
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level, or nullptr when the iterator is already at the
// end. For levels above RIL_WORD the words of a line are joined by single
// spaces, each line is followed by the line separator and each finished
// paragraph by the paragraph separator.
// The result is allocated with new[]; use delete [] to free after use.
char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == nullptr) {
    return nullptr; // Already at the end!
  }
  // Work on a copy so the position of this iterator is not disturbed.
  PAGE_RES_IT cursor(*it_);
  WERD_CHOICE *choice = cursor.word()->best_choice;
  ASSERT_HOST(choice != nullptr);

  std::string utf8;
  if (level == RIL_SYMBOL) {
    utf8 = cursor.word()->BestUTF8(blob_index_, false);
  } else if (level == RIL_WORD) {
    utf8 = choice->unichar_string();
  } else {
    bool line_finished = false;
    bool para_finished = false;
    do {     // for each paragraph in a block
      do {   // for each text line in a paragraph
        do { // for each word in a text line
          choice = cursor.word()->best_choice;
          ASSERT_HOST(choice != nullptr);
          utf8 += choice->unichar_string();
          utf8 += " ";
          cursor.forward();
          line_finished = cursor.row() != cursor.prev_row();
        } while (!line_finished);
        // Drop the space that followed the last word of the line.
        utf8.pop_back();
        utf8 += line_separator_;
        // The block test comes first so the row pointers are only
        // dereferenced while still inside the same block.
        para_finished = cursor.block() != cursor.prev_block() ||
                        cursor.row()->row->para() != cursor.prev_row()->row->para();
      } while (level != RIL_TEXTLINE && !para_finished);
      if (para_finished) {
        utf8 += paragraph_separator_;
      }
    } while (level == RIL_BLOCK && cursor.block() == cursor.prev_block());
  }

  // Hand the text out as a heap-allocated C string.
  const int buffer_size = utf8.length() + 1;
  char *buffer = new char[buffer_size];
  strncpy(buffer, utf8.c_str(), buffer_size);
  return buffer;
}
// Sets the string inserted at the end of each text line by GetUTF8Text.
// "\n" by default.
// NOTE(review): whether new_line is copied or merely referenced depends on the
// type of line_separator_, which is declared in the header - confirm before
// passing a pointer to a short-lived buffer.
void LTRResultIterator::SetLineSeparator(const char *new_line) {
line_separator_ = new_line;
}
// Sets the string inserted at the end of each paragraph by GetUTF8Text.
// "\n" by default.
// NOTE(review): whether new_para is copied or merely referenced depends on the
// type of paragraph_separator_, which is declared in the header - confirm
// before passing a pointer to a short-lived buffer.
void LTRResultIterator::SetParagraphSeparator(const char *new_para) {
paragraph_separator_ = new_para;
}
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
// It is computed as 100 + 5 * (mean certainty of the words involved), clipped
// to [0, 100]. Returns 0.0f when the iterator is already at the end.
float LTRResultIterator::Confidence(PageIteratorLevel level) const {
  if (it_->word() == nullptr) {
    return 0.0f; // Already at the end!
  }
  // Work on a copy so the position of this iterator is not disturbed.
  PAGE_RES_IT cursor(*it_);
  WERD_CHOICE *first_choice = cursor.word()->best_choice;
  ASSERT_HOST(first_choice != nullptr);

  float certainty_sum = 0.0f;
  int samples = 0;
  // Adds the certainty of the word under the cursor, then steps to the next
  // word.
  auto consume_word = [&cursor, &certainty_sum, &samples]() {
    WERD_CHOICE *choice = cursor.word()->best_choice;
    ASSERT_HOST(choice != nullptr);
    certainty_sum += choice->certainty();
    ++samples;
    cursor.forward();
  };

  switch (level) {
    case RIL_BLOCK:
      do {
        consume_word();
      } while (cursor.block() == cursor.prev_block());
      break;
    case RIL_PARA:
      do {
        consume_word();
      } while (cursor.block() == cursor.prev_block() &&
               cursor.row()->row->para() == cursor.prev_row()->row->para());
      break;
    case RIL_TEXTLINE:
      do {
        consume_word();
      } while (cursor.row() == cursor.prev_row());
      break;
    case RIL_WORD:
      certainty_sum += first_choice->certainty();
      ++samples;
      break;
    case RIL_SYMBOL:
      certainty_sum += first_choice->certainty(blob_index_);
      ++samples;
  }

  if (samples <= 0) {
    return 0.0f;
  }
  const float mean_certainty = certainty_sum / samples;
  return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);
}
// Reports the vertical metrics of the current row:
//   *row_height = x_height + ascenders - descenders
//   *descenders = descenders of the row
//   *ascenders  = ascenders of the row
// No end-of-iteration check is made; the current row is dereferenced directly.
void LTRResultIterator::RowAttributes(float *row_height, float *descenders,
                                      float *ascenders) const {
  auto *current_row = it_->row()->row;
  *row_height = current_row->x_height() + current_row->ascenders() - current_row->descenders();
  *descenders = current_row->descenders();
  *ascenders = current_row->ascenders();
}
// Returns the font attributes of the current word. If iterating at a higher
// level object than words, eg textlines, then this will return the
// attributes of the first word in that textline.
// The actual return value is a string representing a font name. It points
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
// the iterator itself, ie rendered invalid by various members of
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
// Pointsize is returned in printers points (1/72 inch.)
// When no font name is available (end of iteration, no font information, or
// a build with DISABLED_LEGACY_ENGINE) nullptr is returned, every flag is set
// to false and *font_id to -1.
const char *LTRResultIterator::WordFontAttributes(bool *is_bold, bool *is_italic,
                                                  bool *is_underlined, bool *is_monospace,
                                                  bool *is_serif, bool *is_smallcaps,
                                                  int *pointsize, int *font_id) const {
  const char *font_name = nullptr;
  if (it_->word() != nullptr) {
    auto *current_row = it_->row()->row;
    const float row_height =
        current_row->x_height() + current_row->ascenders() - current_row->descenders();
    // Convert from pixels to printers points.
    *pointsize =
        scaled_yres_ > 0 ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5) : 0;
#ifndef DISABLED_LEGACY_ENGINE
    const FontInfo *font_info = it_->word()->fontinfo;
    if (font_info) {
      // Font information available.
      *font_id = font_info->universal_id;
      *is_bold = font_info->is_bold();
      *is_italic = font_info->is_italic();
      *is_underlined = false; // TODO(rays) fix this!
      *is_monospace = font_info->is_fixed_pitch();
      *is_serif = font_info->is_serif();
      font_name = font_info->name;
    }
#endif // ndef DISABLED_LEGACY_ENGINE
    *is_smallcaps = it_->word()->small_caps;
  } else {
    // Already at the end!
    *pointsize = 0;
  }

  if (!font_name) {
    // No font: report neutral attributes (this also resets *is_smallcaps).
    *is_bold = false;
    *is_italic = false;
    *is_underlined = false;
    *is_monospace = false;
    *is_serif = false;
    *is_smallcaps = false;
    *font_id = -1;
  }
  return font_name;
}
// Returns the name of the language used to recognize this word, or nullptr
// when there is no current word or the word has no recognizer attached.
const char *LTRResultIterator::WordRecognitionLanguage() const {
  auto *current_word = it_->word();
  if (current_word == nullptr) {
    return nullptr;
  }
  if (current_word->tesseract == nullptr) {
    return nullptr;
  }
  return current_word->tesseract->lang.c_str();
}
// Returns the overall directionality of this word: right-to-left or
// left-to-right when only that kind of character is present, DIR_MIX when
// both are, and DIR_NEUTRAL when neither is or there is no current word.
StrongScriptDirection LTRResultIterator::WordDirection() const {
  if (it_->word() == nullptr) {
    return DIR_NEUTRAL;
  }
  const bool contains_rtl = it_->word()->AnyRtlCharsInWord();
  const bool contains_ltr = it_->word()->AnyLtrCharsInWord();
  if (contains_rtl) {
    return contains_ltr ? DIR_MIX : DIR_RIGHT_TO_LEFT;
  }
  return contains_ltr ? DIR_LEFT_TO_RIGHT : DIR_NEUTRAL;
}
// Returns true if the current word was found in a dictionary, i.e. the
// permuter of its best choice is the system, frequent-word or user dawg.
// Returns false when there is no current word.
bool LTRResultIterator::WordIsFromDictionary() const {
  if (it_->word() == nullptr) {
    return false; // Already at the end!
  }
  switch (it_->word()->best_choice->permuter()) {
    case SYSTEM_DAWG_PERM:
    case FREQ_DAWG_PERM:
    case USER_DAWG_PERM:
      return true;
    default:
      return false;
  }
}
// Returns the number of blanks before the current word, or 1 when there is no
// current word.
int LTRResultIterator::BlanksBeforeWord() const {
  return it_->word() == nullptr ? 1 : it_->word()->word->space();
}
// Returns true if the current word is numeric, i.e. the permuter of its best
// choice is NUMBER_PERM. Returns false when there is no current word.
bool LTRResultIterator::WordIsNumeric() const {
  // Short-circuit keeps the end-of-iteration case from dereferencing null.
  return it_->word() != nullptr && it_->word()->best_choice->permuter() == NUMBER_PERM;
}
// Returns true if the current word has a blamer bundle that holds debug
// information. Returns false when there is no word or no bundle.
bool LTRResultIterator::HasBlamerInfo() const {
  if (it_->word() == nullptr) {
    return false;
  }
  if (it_->word()->blamer_bundle == nullptr) {
    return false;
  }
  return it_->word()->blamer_bundle->HasDebugInfo();
}
#ifndef DISABLED_LEGACY_ENGINE
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
// of the current word, or nullptr when there is no current word or the word
// has no BlamerBundle.
const void *LTRResultIterator::GetParamsTrainingBundle() const {
  if (it_->word() == nullptr || it_->word()->blamer_bundle == nullptr) {
    return nullptr;
  }
  return &(it_->word()->blamer_bundle->params_training_bundle());
}
#endif // ndef DISABLED_LEGACY_ENGINE
// Returns the pointer to the string with blamer information for this word.
// Precondition: there is a current word and its blamer_bundle is not nullptr;
// neither is checked here (see HasBlamerInfo).
const char *LTRResultIterator::GetBlamerDebug() const {
  auto *bundle = it_->word()->blamer_bundle;
  return bundle->debug().c_str();
}
// Returns the pointer to the string with misadaption information for this
// word.
// Precondition: there is a current word and its blamer_bundle is not nullptr;
// neither is checked here (see HasBlamerInfo).
const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
  auto *bundle = it_->word()->blamer_bundle;
  return bundle->misadaption_debug().c_str();
}
// Returns true if a truth string was recorded for the current word: there
// must be a current word, it must have a blamer bundle, and that bundle must
// not report NoTruth().
bool LTRResultIterator::HasTruthString() const {
  auto *current_word = it_->word();
  // Evaluation order matters: each test guards the dereference after it.
  return current_word != nullptr && current_word->blamer_bundle != nullptr &&
         !current_word->blamer_bundle->NoTruth();
}
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
if (!HasTruthString()) {
return false;
}
ASSERT_HOST(it_->word()->uch_set != nullptr);
WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
}
// Returns the null terminated UTF-8 encoded truth string for the current word,
// or nullptr when no truth string is available.
// The result is allocated with new[]; use delete [] to free after use.
char *LTRResultIterator::WordTruthUTF8Text() const {
  if (!HasTruthString()) {
    return nullptr;
  }
  const std::string truth = it_->word()->blamer_bundle->TruthString();
  // Room for the text plus its terminating NUL.
  const int buffer_size = truth.length() + 1;
  char *buffer = new char[buffer_size];
  strncpy(buffer, truth.c_str(), buffer_size);
  return buffer;
}
// Returns the null terminated UTF-8 encoded normalized OCR string for the
// current word. Use delete [] to free after use.
char *LTRResultIterator::WordNormedUTF8Text() const {
if (it_->word() == nullptr) {
return nullptr; // Already at the end!
}
std::string ocr_text;
WERD_CHOICE *best_choice = it_->word()->best_choice;
const UNICHARSET *unicharset = it_->word()->uch_set;
ASSERT_HOST(best_choice != nullptr);
for (int i = 0; i < best_choice->length(); ++i) {
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
}
int length = ocr_text.length() + 1;
char *result = new char[length];
strncpy(result, ocr_text.c_str(), length);
return result;
}
// Returns a pointer to serialized choice lattice.
// Fills lattice_size with the number of bytes in lattice data.
// When there is no current word, or the word has no blamer bundle, nullptr is
// returned and *lattice_size is set to 0 so that the caller never reads an
// unwritten size. (A null lattice_size is tolerated on that failure path
// only; on success it is written unconditionally, as before.)
const char *LTRResultIterator::WordLattice(int *lattice_size) const {
  auto *current_word = it_->word();
  if (current_word == nullptr || current_word->blamer_bundle == nullptr) {
    // Already at the end, or nothing recorded for this word.
    if (lattice_size != nullptr) {
      *lattice_size = 0;
    }
    return nullptr;
  }
  *lattice_size = current_word->blamer_bundle->lattice_size();
  return current_word->blamer_bundle->lattice_data();
}
// Returns true if the current symbol is a superscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
// Always false when cblob_it_ is set or there is no current word.
bool LTRResultIterator::SymbolIsSuperscript() const {
  if (cblob_it_ != nullptr || it_->word() == nullptr) {
    return false;
  }
  return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
}
// Returns true if the current symbol is a subscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
// Always false when cblob_it_ is set or there is no current word.
bool LTRResultIterator::SymbolIsSubscript() const {
  if (cblob_it_ != nullptr || it_->word() == nullptr) {
    return false;
  }
  return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
}
// Returns true if the current symbol is a dropcap.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
// Always false when cblob_it_ is set or there is no current word.
bool LTRResultIterator::SymbolIsDropcap() const {
  if (cblob_it_ != nullptr || it_->word() == nullptr) {
    return false;
  }
  return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
}
// Sets up iteration over the alternative choices for the symbol that
// result_it currently points at. result_it must be on a word.
// With LSTM data, the choices come from the word's CTC_symbol_choices (with
// entries equal to " " filtered out); otherwise, or when lstm_choice_mode is
// off, they come from the word's blob choices when a ratings matrix exists.
ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
  ASSERT_HOST(result_it.it_->word() != nullptr);
  word_res_ = result_it.it_->word();
  auto *engine = word_res_->tesseract;
  oemLSTM_ = engine->AnyLSTMLang();
  // Is there legacy engine related trained data?
  const bool has_legacy_data = engine->AnyTessLang();
  // Is lstm_choice_mode activated?
  const bool choice_mode_active = engine->lstm_choice_mode;
  rating_coefficient_ = engine->lstm_rating_coefficient;
  blanks_before_word_ = result_it.BlanksBeforeWord();
  tstep_index_ = &result_it.blob_index_;

  if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {
    auto &symbol_choices = word_res_->CTC_symbol_choices;
    // No leading-blank offset if the first recorded choice is not a space.
    if (!symbol_choices[0].empty() && strcmp(symbol_choices[0][0].first, " ") != 0) {
      blanks_before_word_ = 0;
    }
    auto index = *tstep_index_;
    index += blanks_before_word_;
    if (index < symbol_choices.size()) {
      LSTM_choices_ = &symbol_choices[index];
      filterSpaces();
    }
  }

  BLOB_CHOICE_LIST *blob_choices = nullptr;
  if ((has_legacy_data || !choice_mode_active) && word_res_->ratings != nullptr) {
    blob_choices = word_res_->GetBlobChoices(result_it.blob_index_);
  }
  if (blob_choices == nullptr || blob_choices->empty()) {
    choice_it_ = nullptr;
  } else {
    choice_it_ = new BLOB_CHOICE_IT(blob_choices);
    choice_it_->mark_cycle_pt();
  }

  if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
    LSTM_choice_it_ = LSTM_choices_->begin();
  }
}
// Releases the BLOB_CHOICE_IT allocated by the constructor; choice_it_ is
// nullptr when no blob choices were available, which delete accepts.
ChoiceIterator::~ChoiceIterator() {
delete choice_it_;
}
// Moves to the next choice for the symbol and returns false if there
// are none left.
// In the LSTM case the iterator is left on the last choice when false is
// returned, so the current choice stays readable. It is never advanced to or
// beyond end(): an iterator that already equals end() also yields false
// (previously that state fell through to the increment).
bool ChoiceIterator::Next() {
  if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
    if (LSTM_choice_it_ == LSTM_choices_->end() ||
        LSTM_choice_it_ + 1 == LSTM_choices_->end()) {
      return false;
    }
    ++LSTM_choice_it_;
    return true;
  }
  if (choice_it_ == nullptr) {
    return false;
  }
  choice_it_->forward();
  return !choice_it_->cycled_list();
}
// Returns the null terminated UTF-8 encoded text string for the current
// choice. Do NOT use delete [] to free after use.
const char *ChoiceIterator::GetUTF8Text() const {
if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
std::pair<const char *, float> choice = *LSTM_choice_it_;
return choice.first;
} else {
if (choice_it_ == nullptr) {
return nullptr;
}
UNICHAR_ID id = choice_it_->data()->unichar_id();
return word_res_->uch_set->id_to_unichar_ext(id);
}
}
// Returns the confidence of the current choice, clipped to [0.0f, 100.0f].
// For LSTM choices it is 100 - rating_coefficient_ * (value stored with the
// choice); for legacy choices it is 100 + 5 * certainty and each value stands
// on its own. Returns 0.0f when there are no choices at all.
float ChoiceIterator::Confidence() const {
  const bool use_lstm = oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty();
  float percent;
  if (use_lstm) {
    percent = 100 - rating_coefficient_ * LSTM_choice_it_->second;
  } else if (choice_it_ == nullptr) {
    return 0.0f;
  } else {
    percent = 100 + 5 * choice_it_->data()->certainty();
  }
  return ClipToRange(percent, 0.0f, 100.0f);
}
// Returns the set of timesteps which belong to the current symbol, i.e. the
// entry of the word's segmented_timesteps at (symbol index + blanks before
// the word). Returns nullptr without LSTM data or when that index is out of
// range.
std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
  int offset = *tstep_index_ + blanks_before_word_;
  if (!oemLSTM_ || offset >= word_res_->segmented_timesteps.size()) {
    return nullptr;
  }
  return &word_res_->segmented_timesteps[offset];
}
// Removes every choice whose text is exactly " " from the current list of
// LSTM choices. LSTM_choices_ must not be nullptr.
void ChoiceIterator::filterSpaces() {
  if (LSTM_choices_->empty()) {
    return;
  }
  auto position = LSTM_choices_->begin();
  while (position != LSTM_choices_->end()) {
    if (strcmp(position->first, " ") == 0) {
      // erase() returns the iterator following the removed element.
      position = LSTM_choices_->erase(position);
    } else {
      ++position;
    }
  }
}
} // namespace tesseract.

View File

@ -0,0 +1,24 @@
///////////////////////////////////////////////////////////////////////
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "mutableiterator.h"
namespace tesseract {
// Destructor (defaulted).
// It is defined out of line here, so the compiler can create a single vtable
// in this translation unit instead of weak vtables in every compilation unit.
MutableIterator::~MutableIterator() = default;
} // namespace tesseract.

View File

@ -0,0 +1,62 @@
///////////////////////////////////////////////////////////////////////
// File: mutableiterator.h
// Description: Iterator for tesseract results providing access to
// both high-level API and Tesseract internal data structures.
// Author: David Eger
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_
#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_
#include <tesseract/resultiterator.h>
class BLOB_CHOICE_IT;
namespace tesseract {
class Tesseract;
// Class to iterate over tesseract results, providing access to all levels
// of the page hierarchy and, unlike its base classes, to the internal
// PAGE_RES_IT as well.
// WARNING! This class points to data held within the TessBaseAPI class, and
// therefore can only be used while the TessBaseAPI class still exists and
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
// DetectOS, or anything else that changes the internal PAGE_RES.
// See tesseract/publictypes.h for the definition of PageIteratorLevel.
// See also base class PageIterator, which contains the bulk of the interface.
// ResultIterator adds text-specific methods for access to OCR output.
// MutableIterator adds access to internal data structures.
class TESS_API MutableIterator : public ResultIterator {
public:
// See argument descriptions in ResultIterator().
// The arguments are used to build a temporary LTRResultIterator from which
// the ResultIterator base is constructed.
MutableIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
int rect_left, int rect_top, int rect_width, int rect_height)
: ResultIterator(LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
rect_top, rect_width, rect_height)) {}
// Defined out of line in mutableiterator.cpp.
~MutableIterator() override;
// See PageIterator and ResultIterator for most calls.
// Return access to Tesseract internals: the underlying PAGE_RES_IT (it_).
// The iterator is returned as pointer-to-const and remains owned by this
// object.
const PAGE_RES_IT *PageResIt() const {
return it_;
}
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_

View File

@ -0,0 +1,581 @@
///////////////////////////////////////////////////////////////////////
// File: osdetect.cpp
// Description: Orientation and script detection.
// Author: Samuel Charron
// Ranjith Unnikrishnan
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <tesseract/osdetect.h>
#include "blobbox.h"
#include "blread.h"
#include "colfind.h"
#include "fontinfo.h"
#include "imagefind.h"
#include "linefind.h"
#include "oldlist.h"
#include "qrsequence.h"
#include "ratngs.h"
#include "tabvector.h"
#include "tesseractclass.h"
#include "textord.h"
#include <algorithm>
#include <cmath> // for std::fabs
#include <memory>
namespace tesseract {
// Tuning constants for orientation and script detection.
// NOTE(review): except for kScriptAcceptRatio, the uses of these constants
// lie outside the portion of the file reviewed here; the names suggest
// blob-size filtering and script disambiguation thresholds - confirm.
const float kSizeRatioToReject = 2.0;
const int kMinAcceptableBlobHeight = 10;
// Ratio of best to second-best script score at which
// OSResults::update_best_script reports a script confidence of exactly 1.
const float kScriptAcceptRatio = 1.3;
const float kHanRatioInKorean = 0.7;
const float kHanRatioInJapanese = 0.3;
const float kNonAmbiguousMargin = 1.0;
// General scripts
static const char *han_script = "Han";
static const char *latin_script = "Latin";
static const char *katakana_script = "Katakana";
static const char *hiragana_script = "Hiragana";
static const char *hangul_script = "Hangul";
// Pseudo-scripts Name
// (definitions of the static members declared in ScriptDetector)
const char *ScriptDetector::korean_script_ = "Korean";
const char *ScriptDetector::japanese_script_ = "Japanese";
const char *ScriptDetector::fraktur_script_ = "Fraktur";
// Finds the highest and second-highest of the four orientation scores.
// best_result.orientation_id becomes the index of the highest score (the
// lowest index wins a tie) and best_result.oconfidence the difference between
// the top two scores.
void OSResults::update_best_orientation() {
  // Seed the top two from the first pair of scores.
  int best_id = orientations[0] < orientations[1] ? 1 : 0;
  float best_score = orientations[best_id];
  float runner_up = orientations[1 - best_id];
  for (int id = 2; id < 4; ++id) {
    const float score = orientations[id];
    if (score > best_score) {
      runner_up = best_score;
      best_score = score;
      best_id = id;
    } else if (score > runner_up) {
      runner_up = score;
    }
  }
  best_result.orientation_id = best_id;
  // Store difference of top two orientation scores.
  best_result.oconfidence = best_score - runner_up;
}
// Forces the best orientation to orientation_id, bypassing the scores, and
// resets the orientation confidence to 0.
void OSResults::set_best_orientation(int orientation_id) {
best_result.orientation_id = orientation_id;
best_result.oconfidence = 0;
}
// Finds the best and second-best script scores for the given orientation,
// ignoring index 0 (the "Common" script). best_result.script_id becomes the
// index of the best score and best_result.sconfidence is
//   2.0 when the second-best score is 0, otherwise
//   (best / second - 1) / (kScriptAcceptRatio - 1).
void OSResults::update_best_script(int orientation) {
  const auto &scores = scripts_na[orientation];
  // Seed the top two from indices 1 and 2; index 0 is skipped.
  int best_id = scores[1] < scores[2] ? 2 : 1;
  float best_score = scores[best_id];
  float runner_up = scores[3 - best_id];
  for (int id = 3; id < kMaxNumberOfScripts; ++id) {
    const float score = scores[id];
    if (score > best_score) {
      runner_up = best_score;
      best_score = score;
      best_id = id;
    } else if (score > runner_up) {
      runner_up = score;
    }
  }
  best_result.script_id = best_id;
  if (runner_up == 0.0f) {
    // No competitor at all: avoid dividing by zero.
    best_result.sconfidence = 2.0f;
  } else {
    best_result.sconfidence = (best_score / runner_up - 1.0) / (kScriptAcceptRatio - 1.0);
  }
}
int OSResults::get_best_script(int orientation_id) const {
int max_id = -1;
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
const char *script = unicharset->get_script_from_script_id(j);
if (strcmp(script, "Common") && strcmp(script, "NULL")) {
if (max_id == -1 || scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id]) {
max_id = j;
}
}
}
return max_id;
}
// Prints the script scores for each of the four candidate orientations, each
// group preceded by its orientation id.
void OSResults::print_scores() const {
  for (int orientation = 0; orientation < 4; ++orientation) {
    tprintf("Orientation id #%d", orientation);
    print_scores(orientation);
  }
}
// Print the script scores for the given candidate orientation.
void OSResults::print_scores(int orientation_id) const {
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
if (scripts_na[orientation_id][j]) {
tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
scripts_na[orientation_id][j]);
}
}
}
// Accumulate scores with given OSResults instance and update the best script.
void OSResults::accumulate(const OSResults &osr) {
for (int i = 0; i < 4; ++i) {
orientations[i] += osr.orientations[i];
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
scripts_na[i][j] += osr.scripts_na[i][j];
}
}
unicharset = osr.unicharset;
update_best_orientation();
update_best_script(best_result.orientation_id);
}
// Detect and erase horizontal/vertical lines and picture regions from the
// image, so that non-text blobs are removed from consideration, then find the
// connected components of tess->pix_binary() into blocks/to_blocks.
// A resolution below kMinCredibleResolution is replaced by that minimum, with
// a warning.
static void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
                                   TO_BLOCK_LIST *to_blocks) {
  Image pix = tess->pix_binary();
  ASSERT_HOST(pix != nullptr);

  const int image_resolution = pixGetXRes(pix);
  int resolution = image_resolution;
  if (kMinCredibleResolution > image_resolution) {
    resolution = kMinCredibleResolution;
    tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", image_resolution,
            resolution);
  }

  // Outputs of the line finder; they are not used any further here.
  int vertical_x = 0;
  int vertical_y = 1;
  tesseract::TabVector_LIST v_lines;
  tesseract::TabVector_LIST h_lines;
  tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, &vertical_x, &vertical_y,
                                            nullptr, &v_lines, &h_lines);

  Image image_mask = tesseract::ImageFind::FindImages(pix, nullptr);
  if (image_mask != nullptr) {
    // Remove the detected picture regions in place.
    pixSubtract(pix, pix, image_mask);
    image_mask.destroy();
  }
  tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks);
}
// Find connected components in the page and process a subset until finished or
// a stopping criterion is met.
//
// filename: path of the input image. Its extension (everything from the last
//           '.') is removed and the result is passed to read_unlv_file as the
//           base name of an optional zone file.
// osr:      receives the accumulated orientation/script results.
// tess:     engine whose binary image (pix_binary) must already be set.
//
// Returns the number of blobs used in making the estimate. 0 implies failure.
int orientation_and_script_detection(const char *filename, OSResults *osr,
                                     tesseract::Tesseract *tess) {
  std::string name = filename; // truncated name
  // Really shorten the string. Overwriting the '.' with '\0' would leave the
  // std::string at its old length with an embedded NUL followed by the old
  // extension, which corrupts anything later appended to the name.
  const std::string::size_type lastdot = name.rfind('.');
  if (lastdot != std::string::npos) {
    name.erase(lastdot);
  }
  ASSERT_HOST(tess->pix_binary() != nullptr);
  int width = pixGetWidth(tess->pix_binary());
  int height = pixGetHeight(tess->pix_binary());
  BLOCK_LIST blocks;
  // Without a zone file, treat the whole page as a single block.
  if (!read_unlv_file(name, width, height, &blocks)) {
    FullPageBlock(width, height, &blocks);
  }
  // Try to remove non-text regions from consideration.
  TO_BLOCK_LIST land_blocks, port_blocks;
  remove_nontext_regions(tess, &blocks, &port_blocks);
  if (port_blocks.empty()) {
    // page segmentation did not succeed, so we need to find_components first.
    tess->mutable_textord()->find_components(tess->pix_binary(), &blocks, &port_blocks);
  } else {
    TBOX page_box(0, 0, width, height);
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    tess->mutable_textord()->filter_blobs(page_box.topright(), &port_blocks, true);
  }
  return os_detect(&port_blocks, osr, tess);
}
// Collects the blobs of all text blocks that look like plausible characters
// and hands them to os_detect_blobs.
// A blob is dropped when its bounding box has zero width, when its aspect
// ratio (taken as the value >= 1) exceeds kSizeRatioToReject, or when it is
// shorter than kMinAcceptableBlobHeight.
// Returns a non-zero number of blobs if the page was successfully processed,
// or zero if the page had too few characters to be reliable.
int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, tesseract::Tesseract *tess) {
  int blobs_total = 0; // Counted before filtering; only of use for debugging.
  BLOBNBOX_CLIST filtered_list;
  BLOBNBOX_C_IT filtered_it(&filtered_list);

  TO_BLOCK_IT block_it;
  block_it.set_to_list(port_blocks);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    TO_BLOCK *to_block = block_it.data();
    // Blocks with a polygon outline that is not text are ignored.
    auto *poly = to_block->block->pdblk.poly_block();
    if (poly && !poly->IsText()) {
      continue;
    }

    BLOBNBOX_IT bbox_it;
    bbox_it.set_to_list(&to_block->blobs);
    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
      BLOBNBOX *bbox = bbox_it.data();
      TBOX box = bbox->cblob()->bounding_box();
      ++blobs_total;

      // A zero-width box is illegal and would cause a division by zero below.
      if (box.width() == 0) {
        continue;
      }
      // TODO: Can height and width be negative? If not, remove fabs.
      const float y_x = std::fabs((box.height() * 1.0f) / box.width());
      const float x_y = 1.0f / y_x;
      // Whichever of the two ratios is >= 1.0.
      const float ratio = x_y > y_x ? x_y : y_x;

      const bool too_elongated = ratio > kSizeRatioToReject; // Blob is ambiguous.
      if (too_elongated || box.height() < kMinAcceptableBlobHeight) {
        continue;
      }
      filtered_it.add_to_end(bbox);
    }
  }
  return os_detect_blobs(nullptr, &filtered_list, osr, tess);
}
// Detect orientation and script from a list of blobs.
// Returns a non-zero number of blobs if the list was successfully processed, or
// zero if the list had too few characters to be reliable.
// If allowed_scripts is non-null and non-empty, it is a list of scripts that
// constrains both orientation and script detection to consider only scripts
// from the list.
int os_detect_blobs(const std::vector<int> *allowed_scripts, BLOBNBOX_CLIST *blob_list,
OSResults *osr, tesseract::Tesseract *tess) {
OSResults osr_;
int minCharactersToTry = tess->min_characters_to_try;
int maxCharactersToTry = 5 * minCharactersToTry;
if (osr == nullptr) {
osr = &osr_;
}
osr->unicharset = &tess->unicharset;
OrientationDetector o(allowed_scripts, osr);
ScriptDetector s(allowed_scripts, osr, tess);
BLOBNBOX_C_IT filtered_it(blob_list);
int real_max = std::min(filtered_it.length(), maxCharactersToTry);
// tprintf("Total blobs found = %d\n", blobs_total);
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
// tprintf("Number of blobs to try = %d\n", real_max);
// If there are too few characters, skip this page entirely.
if (real_max < minCharactersToTry / 2) {
tprintf("Too few characters. Skipping this page\n");
return 0;
}
auto **blobs = new BLOBNBOX *[filtered_it.length()];
int number_of_blobs = 0;
for (filtered_it.mark_cycle_pt(); !filtered_it.cycled_list(); filtered_it.forward()) {
blobs[number_of_blobs++] = filtered_it.data();
}
QRSequenceGenerator sequence(number_of_blobs);
int num_blobs_evaluated = 0;
for (int i = 0; i < real_max; ++i) {
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess) && i > minCharactersToTry) {
break;
}
++num_blobs_evaluated;
}
delete[] blobs;
// Make sure the best_result is up-to-date
int orientation = o.get_orientation();
osr->update_best_script(orientation);
return num_blobs_evaluated;
}
// Classifies one blob in each of the four 90-degree rotations and feeds the
// resulting choice lists to the orientation and script detectors.
// Returns true if the estimates of both orientation and script satisfy their
// stopping criteria.
bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, OSResults *osr,
                    tesseract::Tesseract *tess) {
  tess->tess_cn_matching.set_value(true); // turn it on
  tess->tess_bn_matching.set_value(false);

  TBLOB *tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, bbox->cblob());
  TBOX box = tblob->bounding_box();
  FCOORD current_rotation(1.0f, 0.0f);
  FCOORD rotation90(0.0f, 1.0f);
  BLOB_CHOICE_LIST ratings[4];

  // Test the 4 orientations
  for (int i = 0; i < 4; ++i) {
    // Normalize the blob. The origin is set to the place we want to be the
    // bottom-middle after rotation, and the scaling makes the rotated height
    // the x-height.
    float scaling;
    float x_origin;
    float y_origin;
    if (i % 2 == 0) {
      // Rotation is 0 or 180: scale by the height, anchor on bottom or top.
      scaling = static_cast<float>(kBlnXHeight) / box.height();
      x_origin = (box.left() + box.right()) / 2.0f;
      y_origin = i == 0 ? box.bottom() : box.top();
    } else {
      // Rotation is 90 or 270: scale by the width, anchor on left or right.
      scaling = static_cast<float>(kBlnXHeight) / box.width();
      x_origin = i == 1 ? box.left() : box.right();
      y_origin = (box.bottom() + box.top()) / 2.0f;
    }

    std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));
    rotated_blob->Normalize(nullptr, &current_rotation, nullptr, x_origin, y_origin, scaling,
                            scaling, 0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);
    tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);
    current_rotation.rotate(rotation90);
  }
  delete tblob;

  const bool orientation_done = o->detect_blob(ratings);
  s->detect_blob(ratings);
  // must_stop is always called: it also refreshes the best script.
  const bool script_done = s->must_stop(o->get_orientation());
  return script_done && orientation_done;
}
// Stores the (optional) list of allowed scripts and the results object that
// detect_blob accumulates into. Neither pointer is owned by the detector.
OrientationDetector::OrientationDetector(const std::vector<int> *allowed_scripts, OSResults *osr) {
  allowed_scripts_ = allowed_scripts;
  osr_ = osr;
}
// Scores the given blob: scores points to 4 choice lists, one per candidate
// orientation. The per-orientation scores are normalized and their logs are
// added to the aggregated orientation scores in the results object.
// Intended to return true once the orientation is certain; currently there
// is no early-exit test, so it always returns false.
bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
  float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  float total_blob_o_score = 0.0f;
  const bool restrict_scripts = allowed_scripts_ != nullptr && !allowed_scripts_->empty();

  for (int i = 0; i < 4; ++i) {
    BLOB_CHOICE_IT choice_it(scores + i);
    if (choice_it.empty()) {
      continue;
    }

    BLOB_CHOICE *choice = nullptr;
    if (!restrict_scripts) {
      // No restriction: take the first choice in the list.
      choice = choice_it.data();
    } else {
      // Find the top choice in an allowed script.
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
        BLOB_CHOICE *candidate = choice_it.data();
        const int candidate_script = candidate->script_id();
        if (std::find(allowed_scripts_->begin(), allowed_scripts_->end(), candidate_script) !=
            allowed_scripts_->end()) {
          choice = candidate;
          break;
        }
      }
    }
    if (choice == nullptr) {
      continue;
    }

    // The certainty score ranges between [-20,0]. This is converted here to
    // [0,1], with 1 indicating best match.
    blob_o_score[i] = 1 + 0.05 * choice->certainty();
    total_blob_o_score += blob_o_score[i];
  }

  if (total_blob_o_score == 0.0) {
    return false;
  }

  // Fill in any blanks with the worst score of the others. This is better than
  // picking an arbitrary probability for it and way better than -inf.
  int num_good_scores = 0;
  float worst_score = 0.0f;
  for (int i = 0; i < 4; ++i) {
    const float score = blob_o_score[i];
    if (score <= 0.0f) {
      continue;
    }
    ++num_good_scores;
    if (worst_score == 0.0f || score < worst_score) {
      worst_score = score;
    }
  }
  if (num_good_scores == 1) {
    // Lower worst if there is only one.
    worst_score /= 2.0f;
  }
  for (int i = 0; i < 4; ++i) {
    if (blob_o_score[i] == 0.0f) {
      blob_o_score[i] = worst_score;
      total_blob_o_score += worst_score;
    }
  }

  // Normalize the orientation scores for the blob and use them to
  // update the aggregated orientation score.
  if (total_blob_o_score != 0) {
    for (int i = 0; i < 4; ++i) {
      osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
    }
  }

  // TODO(ranjith) Add an early exit test, based on min_orientation_margin,
  // as used in pagesegmain.cpp.
  return false;
}
int OrientationDetector::get_orientation() {
osr_->update_best_orientation();
return osr_->best_result.orientation_id;
}
// Stores the detector's inputs and looks up (via add_script) the ids of the
// scripts that detect_blob treats specially.
ScriptDetector::ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
                               tesseract::Tesseract *tess) {
  allowed_scripts_ = allowed_scripts;
  osr_ = osr;
  tess_ = tess;

  // The add_script calls are kept in their original order.
  auto &charset = tess_->unicharset;
  katakana_id_ = charset.add_script(katakana_script);
  hiragana_id_ = charset.add_script(hiragana_script);
  han_id_ = charset.add_script(han_script);
  hangul_id_ = charset.add_script(hangul_script);
  japanese_id_ = charset.add_script(japanese_script_);
  korean_id_ = charset.add_script(korean_script_);
  latin_id_ = charset.add_script(latin_script);
  fraktur_id_ = charset.add_script(fraktur_script_);
}
// Scores the given blob for script detection. scores points to 4 choice
// lists, one per candidate orientation. For each orientation, if the blob's
// choices are unambiguous about the script, the score of that script is
// incremented in osr_->scripts_na. Nothing is returned; use must_stop() to
// query whether the script is now certain.
void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
for (int i = 0; i < 4; ++i) {
// Tracks which script ids have already been seen for this orientation.
bool done[kMaxNumberOfScripts] = {false};
BLOB_CHOICE_IT choice_it;
choice_it.set_to_list(scores + i);
// Data of the first accepted choice; prev_score < 0 means "none yet".
float prev_score = -1;
int script_count = 0;
int prev_id = -1;
int prev_fontinfo_id = -1;
const char *prev_unichar = "";
const char *unichar = "";
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
BLOB_CHOICE *choice = choice_it.data();
int id = choice->script_id();
if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
// Check that the choice is in an allowed script.
int s = 0;
for (s = 0; s < allowed_scripts_->size(); ++s) {
if ((*allowed_scripts_)[s] == id) {
break;
}
}
if (s == allowed_scripts_->size()) {
continue; // Not found in list.
}
}
// Script already processed before.
if (done[id]) {
continue;
}
done[id] = true;
unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
// Save data from the first match
if (prev_score < 0) {
prev_score = -choice->certainty();
script_count = 1;
prev_id = id;
prev_unichar = unichar;
prev_fontinfo_id = choice->fontinfo_id();
} else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
// A choice in another script scores within the margin of the first match.
++script_count;
}
// Stop looking at further choices when the first match is a single-byte
// string and the current choice starts with an ASCII digit.
if (strlen(prev_unichar) == 1) {
if (unichar[0] >= '0' && unichar[0] <= '9') {
break;
}
}
// if script_count is >= 2, character is ambiguous, skip other matches
// since they are useless.
if (script_count >= 2) {
break;
}
}
// Character is non ambiguous
if (script_count == 1) {
// Update the score of the winning script
osr_->scripts_na[i][prev_id] += 1.0;
// Workaround for Fraktur
// A Latin result whose font is flagged as Fraktur is moved from the Latin
// score to the Fraktur score.
if (prev_id == latin_id_) {
if (prev_fontinfo_id >= 0) {
const tesseract::FontInfo &fi = tess_->get_fontinfo_table().at(prev_fontinfo_id);
// printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
// fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
// fi.is_serif(), fi.is_fraktur(),
// prev_unichar);
if (fi.is_fraktur()) {
osr_->scripts_na[i][prev_id] -= 1.0;
osr_->scripts_na[i][fraktur_id_] += 1.0;
}
}
}
// Update Japanese / Korean pseudo-scripts
// Katakana and Hiragana count fully towards Japanese, Hangul fully towards
// Korean, and Han is shared between both using the kHanRatioIn* weights.
if (prev_id == katakana_id_) {
osr_->scripts_na[i][japanese_id_] += 1.0;
}
if (prev_id == hiragana_id_) {
osr_->scripts_na[i][japanese_id_] += 1.0;
}
if (prev_id == hangul_id_) {
osr_->scripts_na[i][korean_id_] += 1.0;
}
if (prev_id == han_id_) {
osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
}
}
} // iterate over each orientation
}
// Refreshes the best script for the given orientation and reports whether
// the resulting script confidence exceeds 1.
bool ScriptDetector::must_stop(int orientation) const {
  osr_->update_best_script(orientation);
  const bool confident = osr_->best_result.sconfidence > 1;
  return confident;
}
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must be
// applied for the text to be upright (readable).
// Ids 0..3 map to 0, 270, 180 and 90 respectively; any other id yields -1.
int OrientationIdToValue(const int &id) {
  static const int kDegreesById[] = {0, 270, 180, 90};
  if (id < 0 || id > 3) {
    return -1;
  }
  return kDegreesById[id];
}
} // namespace tesseract

View File

@ -0,0 +1,416 @@
/******************************************************************
* File: output.cpp (Formerly output.c)
* Description: Output pass
* Author: Phil Cheatle
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "output.h"
#include "control.h"
#include "tesseractclass.h"
#include "tessvars.h"
#ifndef DISABLED_LEGACY_ENGINE
# include "docqual.h"
# include "reject.h"
#endif
#include "helpers.h"
#include <cctype>
#include <cerrno>
#include <cstring>
#define CTRL_NEWLINE '\012' // newline
#define CTRL_HARDLINE '\015' // cr
namespace tesseract {
// Output pass: walks every word of the page from the start and calls
// write_results for it, passing the kind of newline that follows the word
// and whether an end of line must be forced.
// If target_word_box is non-null, only words whose bounding-box centre lies
// inside that box are processed.
void Tesseract::output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word = nullptr;

  page_res_it.restart_page();
  for (; page_res_it.word() != nullptr; page_res_it.forward()) {
    check_debug_pt(page_res_it.word(), 120);

    if (target_word_box) {
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
      FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
                       (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt)) {
        continue; // Outside the requested area.
      }
    }

    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
      block_of_last_word = page_res_it.block();
    }

    // Force an end of line on the last word of the page, and at block
    // boundaries when block separators are requested.
    const bool force_eol =
        (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
        (page_res_it.next_word() == nullptr);

    // Word and block that follow the current word (null at the end).
    WERD *nextword =
        page_res_it.next_word() != nullptr ? page_res_it.next_word()->word : nullptr;
    BLOCK *nextblock =
        page_res_it.next_block() != nullptr ? page_res_it.next_block()->block : nullptr;

    // regardless of tilde crunching
    write_results(page_res_it,
                  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
                                         nextword, nextblock),
                  force_eol);
  }
}
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. For the word the iterator
 * is positioned at, this function updates the output bookkeeping in stats_
 * (tilde / newline / empty-block flags) and finalises the word's reject map
 * (unlv suspects plus the zero/minimal rejection overrides).
 *
 * NOTE(review): the visible code writes no output files itself; the original
 * header described .txt/.raw/.map outputs and epchoice/inset lists.
 *
 * page_res_it  - iterator positioned at the word to process
 * newline_type - result of determine_newline_type(); non-zero if a newline
 *                follows the word
 * force_eol    - true if an end of line must be forced after this word
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
char newline_type, // type of newline
bool force_eol) { // override tilde crunch?
WERD_RES *word = page_res_it.word();
const UNICHARSET &uchset = *word->uch_set;
int i;
bool need_reject = false;
// Id of the space character; a space in best_choice is compared against it
// below when checking for rejected ("tilde") positions.
UNICHAR_ID space = uchset.unichar_to_id(" ");
// Crunched or empty words take this branch (unless one of the two override
// modes is on): only the bookkeeping flags are updated, then we return.
if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
!tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
if ((word->unlv_crunch_mode != CR_DELETE) &&
(!stats_.tilde_crunch_written ||
((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
!word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
// A definite space before a non-line-start word resets the tilde state.
if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
!word->word->flag(W_FUZZY_SP)) {
stats_.last_char_was_tilde = false;
}
need_reject = true;
}
if ((need_reject && !stats_.last_char_was_tilde) ||
(force_eol && stats_.write_results_empty_block)) {
/* Write a reject char - mark as rejected unless zero_rejection mode */
stats_.last_char_was_tilde = true;
stats_.tilde_crunch_written = true;
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = false;
}
// An end of line (from the word flag or forced) resets the tilde state.
if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
stats_.tilde_crunch_written = false;
stats_.last_char_was_newline = true;
stats_.last_char_was_tilde = false;
}
if (force_eol) {
stats_.write_results_empty_block = true;
}
return;
}
/* NORMAL PROCESSING of non tilde crunched words */
stats_.tilde_crunch_written = false;
if (newline_type) {
stats_.last_char_was_newline = true;
} else {
stats_.last_char_was_newline = false;
}
stats_.write_results_empty_block = force_eol; // about to write a real word
if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
(word->best_choice->unichar_id(0) == space)) {
/* Prevent adjacent tilde across words - we know that adjacent tildes within
words have been removed */
word->MergeAdjacentBlobs(0);
}
// Record whether this word ends in a tilde, i.e. whether the best-choice
// character at index reject_map.length() - 1 is the space character.
if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
stats_.last_char_was_tilde = false;
} else {
if (word->reject_map.length() > 0) {
if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
stats_.last_char_was_tilde = true;
} else {
stats_.last_char_was_tilde = false;
}
} else if (word->word->space() > 0) {
stats_.last_char_was_tilde = false;
}
/* else it is unchanged as there are no output chars */
}
// The reject map must have exactly one entry per best-choice character.
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
set_unlv_suspects(word);
check_debug_pt(word, 120);
if (tessedit_rejection_debug) {
tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
dict_word(*(word->best_choice)));
}
// The overrides below are skipped for repeated-character words when rep
// codes are being written.
if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
if (tessedit_zero_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
// Zero rejection: every rejected position is unrejected.
for (i = 0; i < word->best_choice->length(); ++i) {
if (word->reject_map[i].rejected()) {
word->reject_map[i].setrej_minimal_rej_accept();
}
}
}
if (tessedit_minimal_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
// Minimal rejection: as above, except positions whose character is the
// space character stay rejected.
for (i = 0; i < word->best_choice->length(); ++i) {
if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
word->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
}
/**********************************************************************
 * determine_newline_type
 *
 * Decide what kind of line break follows a word.
 * Returns 0 (false) if the word is not at the end of a line, CTRL_NEWLINE
 * if there is no next word, no next block, or the next word is in another
 * block, and CTRL_HARDLINE if the next word is preceded by space or would
 * have fitted in the gap left at the end of this line. Otherwise the break
 * is a wrapping CTRL_NEWLINE.
 **********************************************************************/
char determine_newline_type( // test line ends
    WERD *word,              // word to do
    BLOCK *block,            // current block
    WERD *next_word,         // next word
    BLOCK *next_block        // block of next word
) {
  if (!word->flag(W_EOL)) {
    return '\0'; // not end of line
  }
  const bool last_in_block = next_word == nullptr || next_block == nullptr || block != next_block;
  if (last_in_block) {
    return CTRL_NEWLINE;
  }
  if (next_word->space() > 0) {
    return CTRL_HARDLINE; // it is tabbed
  }

  TBOX word_box = word->bounding_box();
  TBOX next_box = next_word->bounding_box();
  TBOX block_box = block->pdblk.bounding_box();

  // Gap between the word and the right edge of the block, less one space.
  int16_t end_gap = block_box.right() - word_box.right();
  end_gap -= static_cast<int32_t>(block->space());
  // Width of the word that starts the next line.
  int16_t width = next_box.right() - next_box.left();

  // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
  //         block_box.right(),word_box.right(),end_gap,
  //         next_box.right(),next_box.left(),width,
  //         end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
  if (end_gap > width) {
    return CTRL_HARDLINE; // The next word would have fitted: a deliberate break.
  }
  return CTRL_NEWLINE;
}
/*************************************************************************
 * get_rep_char()
 * Return the first accepted (non-rejected) character of the repetition
 * string. This is the character which is repeated - as determined earlier by
 * fix_rep_char(). If every character is rejected, the id of the
 * unrecognised_char string is returned instead.
 *************************************************************************/
UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
  for (int i = 0; i < word->reject_map.length(); ++i) {
    if (!word->reject_map[i].rejected()) {
      return word->best_choice->unichar_id(i);
    }
  }
  // No accepted character found.
  return word->uch_set->unichar_to_id(unrecognised_char.c_str());
}
/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
 *
 * set_unlv_suspects() applies the current suspect_level to one word by
 * unrejecting (setrej_minimal_rej_accept) selected entries of its reject
 * map. Level 0 unrejects everything, level 3 and above changes nothing, and
 * levels 1 and 2 go through the cascade of rules below; level 2 stops
 * before the last two rules.
 *************************************************************************/
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
int len = word_res->reject_map.length();
const WERD_CHOICE &word = *(word_res->best_choice);
const UNICHARSET &uchset = *word.unicharset();
int i;
float rating_per_ch;
// Level 0: accept every rejected character.
if (suspect_level == 0) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
return;
}
if (suspect_level >= 3) {
return; // Use defaults
}
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
/* Unreject alphas in dictionary words */
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
// Average rating per character.
// NOTE(review): this divides by the reject map length, which is not checked
// for zero here - confirm callers never pass an empty word.
rating_per_ch = word.rating() / word_res->reject_map.length();
if (rating_per_ch >= suspect_rating_per_ch) {
return; // Don't touch bad ratings
}
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
// Unreject characters that carry a document, block or row reject flag.
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if (word_res->reject_map[i].flag(R_DOC_REJ)) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
if (word_res->reject_map[i].flag(R_ROW_REJ)) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
// Level 2 stops here; the remaining rules apply to level 1 only.
if (suspect_level == 2) {
return;
}
// Unreject 1/I/l conflicts (and, when 1Il is unconstrained, R_MM_REJECT
// rejects) if 1Il is unconstrained or the word is short.
if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
word_res->reject_map[i].flag(R_POSTNN_1IL))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
// For acceptable word or number strings longer than suspect_short_words,
// unreject everything that is not permanently rejected, plus the 1Il and
// R_MM_REJECT rejects.
if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
if (word_res->reject_map.length() > suspect_short_words) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
word_res->reject_map[i].flag(R_POSTNN_1IL) ||
word_res->reject_map[i].flag(R_MM_REJECT))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
}
// Returns the number of characters in the word that the word's unicharset
// classifies as alphabetic.
int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
  const auto *charset = word.unicharset();
  int alphas = 0;
  for (int i = 0; i < word.length(); ++i) {
    alphas += charset->get_isalpha(word.unichar_id(i)) ? 1 : 0;
  }
  return alphas;
}
// Returns the number of characters in the word that the word's unicharset
// classifies as alphabetic or as a digit.
int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
  const auto *charset = word.unicharset();
  int alphanums = 0;
  for (int i = 0; i < word.length(); ++i) {
    const bool is_alnum =
        charset->get_isalpha(word.unichar_id(i)) || charset->get_isdigit(word.unichar_id(i));
    if (is_alnum) {
      ++alphanums;
    }
  }
  return alphanums;
}
// Returns true if the string looks like a number.
//
// s       - the UTF-8 text of the word (NUL terminated).
// lengths - parallel NUL-terminated array holding the byte length of each
//           character of s.
//
// Accepted shape: an optional '(', an optional one of '$' '.' '+' '-', then
// digits in which each digit may be followed by a single '.', ',' or '-'.
// The string may end with '%' or ')' or "%)" directly after a digit. An
// empty string is accepted.
bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
  bool prev_digit = false;

  // Skip the optional prefix characters. Both cursors advance together:
  // moving only s would leave lengths one entry behind for the rest of the
  // scan, so multi-byte characters would be read with the wrong length.
  if (*lengths == 1 && *s == '(') {
    s++;
    lengths++;
  }
  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
    s++;
    lengths++;
  }

  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths)) {
      prev_digit = true;
    } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
      // A single separator is allowed after a digit, but not two in a row.
      prev_digit = false;
    } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
               ((*s == '%') || (*s == ')'))) {
      // Trailing '%' or ')' as the very last character.
      return true;
    } else if (prev_digit && *lengths == 1 && (*s == '%') &&
               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
               (*(s + *lengths + *(lengths + 1)) == '\0')) {
      // Trailing "%)" as the last two characters.
      return true;
    } else {
      return false;
    }
  }
  return true;
}
} // namespace tesseract

View File

@ -0,0 +1,37 @@
/******************************************************************
* File: output.h (Formerly output.h)
* Description: Output pass
* Author: Phil Cheatle
* Created: Thu Aug 4 10:56:08 BST 1994
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef OUTPUT_H
#define OUTPUT_H
namespace tesseract {
class BLOCK;
class WERD;
/**
 * Test line ends: decide what kind of line break follows a word.
 * The definition in output.cpp returns 0 when the word is not at the end of
 * a line, and otherwise one of the CTRL_NEWLINE / CTRL_HARDLINE characters
 * defined there.
 */
char determine_newline_type(WERD *word, ///< word to do
BLOCK *block, ///< current block
WERD *next_word, ///< next word
BLOCK *next_block ///< block of next word
);
} // namespace tesseract
#endif

View File

@ -0,0 +1,652 @@
///////////////////////////////////////////////////////////////////////
// File: pageiterator.cpp
// Description: Iterator for tesseract page structure that avoids using
// tesseract internal data structures.
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <allheaders.h>
#include <tesseract/pageiterator.h>
#include "helpers.h"
#include "pageres.h"
#include "tesseractclass.h"
#include <algorithm>
namespace tesseract {
/**
 * Constructs an iterator over the given page results and positions it at the
 * start of the page.
 * page_res and tesseract are stored as given; the visible destructor does
 * not delete them. scale, scaled_yres and the rect_* values are stored
 * unchanged for later use by the class.
 */
PageIterator::PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
int rect_left, int rect_top, int rect_width, int rect_height)
: page_res_(page_res)
, tesseract_(tesseract)
, word_(nullptr)
, word_length_(0)
, blob_index_(0)
, cblob_it_(nullptr)
, include_upper_dots_(false)
, include_lower_dots_(false)
, scale_(scale)
, scaled_yres_(scaled_yres)
, rect_left_(rect_left)
, rect_top_(rect_top)
, rect_width_(rect_width)
, rect_height_(rect_height) {
// The word iterator is owned by this object and released in the destructor.
it_ = new PAGE_RES_IT(page_res);
// Explicitly qualified call to this class's own Begin().
PageIterator::Begin();
}
/** Releases the two iterators owned by this object. */
PageIterator::~PageIterator() {
  delete it_;        // word-level iterator allocated by the constructors
  delete cblob_it_;  // may be null; deleting a null pointer is a no-op
}
/**
 * PageIterators may be copied! This makes it possible to iterate over
 * all the objects at a lower level, while maintaining an iterator to
 * objects at a higher level.
 *
 * The copy gets its own PAGE_RES_IT cloned from the source, so the two
 * iterators can be moved independently. word_ and cblob_it_ start out null
 * and BeginWord() is then called with the source's blob index.
 */
PageIterator::PageIterator(const PageIterator &src)
: page_res_(src.page_res_)
, tesseract_(src.tesseract_)
, word_(nullptr)
, word_length_(src.word_length_)
, blob_index_(src.blob_index_)
, cblob_it_(nullptr)
, include_upper_dots_(src.include_upper_dots_)
, include_lower_dots_(src.include_lower_dots_)
, scale_(src.scale_)
, scaled_yres_(src.scaled_yres_)
, rect_left_(src.rect_left_)
, rect_top_(src.rect_top_)
, rect_width_(src.rect_width_)
, rect_height_(src.rect_height_) {
// Deep copy of the word iterator; owned by this object.
it_ = new PAGE_RES_IT(*src.it_);
BeginWord(src.blob_index_);
}
/**
 * Copy assignment: makes this iterator refer to the same page, settings and
 * position as src, with its own private copy of the word iterator.
 * Safe for self-assignment.
 */
const PageIterator &PageIterator::operator=(const PageIterator &src) {
  // Clone the source's word iterator BEFORE releasing ours. On
  // self-assignment src.it_ is our own it_, so deleting first would copy
  // from freed memory; cloning first also leaves this object untouched if
  // the allocation throws.
  PAGE_RES_IT *cloned_it = new PAGE_RES_IT(*src.it_);

  page_res_ = src.page_res_;
  tesseract_ = src.tesseract_;
  include_upper_dots_ = src.include_upper_dots_;
  include_lower_dots_ = src.include_lower_dots_;
  scale_ = src.scale_;
  scaled_yres_ = src.scaled_yres_;
  rect_left_ = src.rect_left_;
  rect_top_ = src.rect_top_;
  rect_width_ = src.rect_width_;
  rect_height_ = src.rect_height_;

  delete it_;
  it_ = cloned_it;
  // Re-derive the word/blob state for the new position.
  BeginWord(src.blob_index_);
  return *this;
}
/**
 * Returns true if this iterator's word iterator and other compare equal, or
 * if both are null.
 */
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT *other) const {
  if (it_ == nullptr || other == nullptr) {
    // With at least one side missing, they match only if both are missing.
    return it_ == other;
  }
  return *it_ == *other;
}
// ============= Moving around within the page ============.
/** Resets the iterator to point to the start of the page. */
void PageIterator::Begin() {
it_->restart_page_with_empties();
BeginWord(0);
}
void PageIterator::RestartParagraph() {
if (it_->block() == nullptr) {
return; // At end of the document.
}
PAGE_RES_IT para(page_res_);
PAGE_RES_IT next_para(para);
next_para.forward_paragraph();
while (next_para.cmp(*it_) <= 0) {
para = next_para;
next_para.forward_paragraph();
}
*it_ = para;
BeginWord(0);
}
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
PageIterator p_start(*this);
p_start.RestartParagraph();
return p_start.it_->row() == it_->row();
}
void PageIterator::RestartRow() {
it_->restart_row();
BeginWord(0);
}
/**
 * Moves to the start of the next object at the given level in the
 * page hierarchy, and returns false if the end of the page was reached.
 * NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
 * non-text block at least once.
 * Think of non text blocks as containing a single para, with at least one
 * line, with a single imaginary word, containing a single symbol.
 * The bounding boxes mark out any polygonal nature of the block, and
 * PTIsTextType(BlockType()) is false for non-text blocks.
 * Calls to Next with different levels may be freely intermixed.
 * This function iterates words in right-to-left scripts correctly, if
 * the appropriate language has been loaded into Tesseract.
 */
bool PageIterator::Next(PageIteratorLevel level) {
  if (it_->block() == nullptr) {
    return false; // Already at the end!
  }
  // A position without a word is always stepped over as a whole block.
  const PageIteratorLevel step = (it_->word() == nullptr) ? RIL_BLOCK : level;

  switch (step) {
    case RIL_BLOCK:
      it_->forward_block();
      break;
    case RIL_PARA:
      it_->forward_paragraph();
      break;
    case RIL_TEXTLINE:
      // Advance at least once, then keep going until the row changes.
      do {
        it_->forward_with_empties();
      } while (it_->row() == it_->prev_row());
      break;
    case RIL_WORD:
      it_->forward_with_empties();
      break;
    case RIL_SYMBOL:
      if (cblob_it_ != nullptr) {
        cblob_it_->forward();
      }
      ++blob_index_;
      if (blob_index_ < word_length_) {
        return true; // Still inside the same word.
      }
      it_->forward_with_empties(); // Ran off the word: move to the next one.
      break;
  }
  BeginWord(0);
  return it_->block() != nullptr;
}
/**
 * Returns true if the iterator is at the start of an object at the given
 * level. Possible uses include determining if a call to Next(RIL_WORD)
 * moved to the start of a RIL_PARA.
 * Always false at the end of the page, and always true when the current
 * position has no word (an image block).
 */
bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == nullptr) {
    return false; // Already at the end!
  }
  if (it_->word() == nullptr) {
    return true; // In an image block.
  }
  // Every symbol position is the start of a symbol.
  if (level == RIL_SYMBOL) {
    return true;
  }
  // All coarser levels require being on the first blob of a word.
  const bool at_word_start = blob_index_ == 0;
  switch (level) {
    case RIL_BLOCK:
      return at_word_start && it_->block() != it_->prev_block();
    case RIL_PARA:
      if (!at_word_start) {
        return false;
      }
      if (it_->block() != it_->prev_block()) {
        return true;
      }
      return it_->row()->row->para() != it_->prev_row()->row->para();
    case RIL_TEXTLINE:
      return at_word_start && it_->row() != it_->prev_row();
    case RIL_WORD:
      return at_word_start;
    case RIL_SYMBOL:
      return true;
  }
  return false;
}
/**
 * Returns whether the iterator is positioned at the last element in a
 * given level. (e.g. the last word in a line, the last line in a block)
 * Also returns true when there is no object of type element at the current
 * position.
 */
bool PageIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
  if (Empty(element)) {
    return true; // Already at the end!
  }
  // Step a copy forward by one element. The answer is true if that copy is
  // at the end of the page, or at the beginning of *all* levels in
  // [level, element).
  // When element and level are more than one level apart, a single step
  // (e.g. one symbol) may still leave us inside the first word of a line,
  // so each intermediate level has to be checked as well.
  PageIterator probe(*this);
  probe.Next(element);
  if (probe.Empty(element)) {
    return true; // Reached the end of the page.
  }
  const int coarsest = static_cast<int>(level);
  for (int lvl = static_cast<int>(element) - 1; lvl >= coarsest; --lvl) {
    if (!probe.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl))) {
      return false;
    }
  }
  return true;
}
/**
 * Returns whether this iterator is positioned
 *   before other:   -1
 *   equal to other:  0
 *   after other:     1
 * The word positions are compared first; the blob index within the word is
 * only used to break a tie.
 */
int PageIterator::Cmp(const PageIterator &other) const {
  const int word_order = it_->cmp(*other.it_);
  if (word_order != 0) {
    return word_order;
  }
  // Same word: order by position within the word.
  if (blob_index_ == other.blob_index_) {
    return 0;
  }
  return (blob_index_ < other.blob_index_) ? -1 : 1;
}
// ============= Accessing data ==============.
// Coordinate system:
// Integer coordinates are at the cracks between the pixels.
// The top-left corner of the top-left pixel in the image is at (0,0).
// The bottom-right corner of the bottom-right pixel in the image is at
// (width, height).
// Every bounding box goes from the top-left of the top-left contained
// pixel to the bottom-right of the bottom-right contained pixel, so
// the bounding box of the single top-left pixel in the image is:
// (0,0)->(1,1).
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.
/**
 * Returns the bounding rectangle of the current object at the given level in
 * the coordinates of the working image that is pix_binary().
 * See comment on coordinate system above.
 * Returns false if there is no such object at the current position.
 *
 * @param level   hierarchy level of the object whose box is wanted.
 * @param left    out: left edge, clipped to [0, width of pix_binary()].
 * @param top     out: top edge, clipped to [0, height of pix_binary()].
 * @param right   out: right edge, clipped to [*left, width of pix_binary()].
 * @param bottom  out: bottom edge, clipped to [*top, height of pix_binary()].
 * The output parameters are only written when the function returns true.
 */
bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right,
int *bottom) const {
if (Empty(level)) {
return false;
}
TBOX box;
// Only set for RIL_PARA; used below to find the other rows of the paragraph.
PARA *para = nullptr;
switch (level) {
case RIL_BLOCK:
box = it_->block()->block->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
break;
case RIL_PARA:
para = it_->row()->row->para();
// Fall through.
// A paragraph box starts out as the box of the current row and is grown
// to cover the remaining rows of the paragraph after the switch.
case RIL_TEXTLINE:
box = it_->row()->row->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
break;
case RIL_WORD:
box = it_->word()->word->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
break;
case RIL_SYMBOL:
// Without a cblob iterator the box comes from the word's box_word,
// otherwise from the current cblob.
if (cblob_it_ == nullptr) {
box = it_->word()->box_word->BlobBox(blob_index_);
} else {
box = cblob_it_->data()->bounding_box();
}
}
if (level == RIL_PARA) {
// Walk every text line of the page with a copy of this iterator and union
// in the (unrestricted) bounding box of each row that is in the same block
// and belongs to the same paragraph as the current row.
PageIterator other = *this;
other.Begin();
do {
if (other.it_->block() && other.it_->block()->block == it_->block()->block &&
other.it_->row() && other.it_->row()->row && other.it_->row()->row->para() == para) {
box = box.bounding_union(other.it_->row()->row->bounding_box());
}
} while (other.Next(RIL_TEXTLINE));
}
// The box_word symbol box is the only case that is not rotated by the
// block's re_rotation.
// NOTE(review): presumably box_word boxes are already in image orientation - confirm.
if (level != RIL_SYMBOL || cblob_it_ != nullptr) {
box.rotate(it_->block()->block->re_rotation());
}
// Now we have a box in tesseract coordinates relative to the image rectangle,
// we have to convert the coords to a top-down system.
const int pix_height = pixGetHeight(tesseract_->pix_binary());
const int pix_width = pixGetWidth(tesseract_->pix_binary());
// y is flipped (pix_height - y); right and bottom are clipped against the
// already clipped left and top so that the result is never inverted.
*left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
*top = ClipToRange(pix_height - box.top(), 0, pix_height);
*right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
*bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
return true;
}
/**
 * Returns the bounding rectangle of the current object at the given level in
 * coordinates of the original image.
 * See comment on coordinate system above.
 * Returns false if there is no such object at the current position.
 * Equivalent to the padded overload with a padding of zero.
 */
bool PageIterator::BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
                               int *bottom) const {
  constexpr int kNoPadding = 0;
  return BoundingBox(level, kNoPadding, left, top, right, bottom);
}
/**
 * Same as BoundingBox(level, left, top, right, bottom), but grows the box by
 * padding on every side. The result is clipped to the image rectangle
 * [rect_left_, rect_left_ + rect_width_] x [rect_top_, rect_top_ + rect_height_].
 * Returns false (leaving the conversion undone) if BoundingBoxInternal
 * reports that there is no object at the given level.
 */
bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding, int *left, int *top,
                               int *right, int *bottom) const {
  if (!BoundingBoxInternal(level, left, top, right, bottom)) {
    return false;
  }
  // Limits of the image rectangle in original image coordinates.
  const int rect_right = rect_left_ + rect_width_;
  const int rect_bottom = rect_top_ + rect_height_;
  // Convert to the coordinate system of the original image: left/top are
  // divided rounding down, right/bottom rounding up, then the padding is
  // applied and the result is clipped.
  *left = ClipToRange(*left / scale_ + rect_left_ - padding, rect_left_, rect_right);
  *top = ClipToRange(*top / scale_ + rect_top_ - padding, rect_top_, rect_bottom);
  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding, *left, rect_right);
  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding, *top, rect_bottom);
  return true;
}
/**
 * Returns true if there is no object at the given level at the current
 * position: the iterator is past the last block, the block has no word and
 * something finer than a block was asked for, or a symbol was asked for and
 * the blob index is at or beyond the word length.
 */
bool PageIterator::Empty(PageIteratorLevel level) const {
  if (it_->block() == nullptr) {
    return true; // Already at the end!
  }
  return (it_->word() == nullptr && level != RIL_BLOCK) ||   // image block
         (level == RIL_SYMBOL && blob_index_ >= word_length_); // zero length word, or at its end
}
/**
 * Returns the type of the current block.
 * See tesseract/publictypes.h for PolyBlockType.
 * PT_UNKNOWN is returned when there is no current block, PT_FLOWING_TEXT
 * when the block carries no polygon.
 */
PolyBlockType PageIterator::BlockType() const {
  auto *block_res = it_->block();
  if (block_res == nullptr || block_res->block == nullptr) {
    return PT_UNKNOWN; // Already at the end!
  }
  auto *poly = block_res->block->pdblk.poly_block();
  // No layout analysis used - assume text.
  return (poly == nullptr) ? PT_FLOWING_TEXT : poly->isA();
}
/**
 * Returns the polygon outline of the current block. The returned Pta must
 * be ptaDestroy-ed after use.
 * Returns nullptr when there is no current block or the block has no
 * polygon attached.
 */
Pta *PageIterator::BlockPolygon() const {
  auto *block_res = it_->block();
  if (block_res == nullptr || block_res->block == nullptr) {
    return nullptr; // Already at the end!
  }
  auto *block = block_res->block;
  POLY_BLOCK *internal_poly = block->pdblk.poly_block();
  if (internal_poly == nullptr) {
    return nullptr; // No layout analysis used - no polygon.
  }
  // Work on a private copy of the vertices so that the block's own polygon
  // is not modified by the rotation to image coordinates.
  ICOORDELT_LIST vertices;
  vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
  POLY_BLOCK poly(&vertices, internal_poly->isA());
  poly.rotate(block->re_rotation());

  ICOORDELT_IT vertex_it(poly.points());
  Pta *pta = ptaCreate(vertex_it.length());
  for (vertex_it.mark_cycle_pt(); !vertex_it.cycled_list(); vertex_it.forward()) {
    ICOORD *pt = vertex_it.data();
    // Convert to top-down coords within the input image.
    int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
    int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
    // Keep the vertex inside the image rectangle.
    x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);
    y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);
    ptaAddPt(pta, x, y);
  }
  return pta;
}
/**
 * Returns a binary image of the current object at the given level.
 * The position and size match the return from BoundingBoxInternal, and so this
 * could be upscaled with respect to the original input image.
 * Use pixDestroy to delete the image after use.
 * Returns nullptr if there is no object at the given level.
 * The following methods are used to generate the images:
 * RIL_BLOCK: mask the page image with the block polygon.
 * RIL_PARA: clip the paragraph box and mask it with the block polygon.
 * RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
 * TODO(rays) fix this to generate and use a line polygon.
 * RIL_WORD: Clip the rectangle of the word box from the page image.
 * RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
 * to recognition) or the bounding box otherwise.
 * A reconstruction of the original image (using xor to check for double
 * representation) should be reasonably accurate,
 * apart from removed noise, at the block level. Below the block level, the
 * reconstruction will be missing images and line separators.
 * At the symbol level, kerned characters will invade the bounding box
 * if rendered after recognition, making an xor reconstruction inaccurate, but
 * an or construction better. Before recognition, symbol-level reconstruction
 * should be good, even with xor, since the images come from the connected
 * components.
 */
Pix *PageIterator::GetBinaryImage(PageIteratorLevel level) const {
  int left;
  int top;
  int right;
  int bottom;
  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) {
    return nullptr;
  }
  // Before recognition a symbol is a cblob, which can render itself as long
  // as it has a non-zero area.
  const bool symbol_is_cblob = (level == RIL_SYMBOL && cblob_it_ != nullptr);
  if (symbol_is_cblob && cblob_it_->data()->area() != 0) {
    return cblob_it_->data()->render();
  }
  // Everything else starts as a rectangular clip of the binary page image.
  Box *clip_box = boxCreate(left, top, right - left, bottom - top);
  Image clipped = pixClipRectangle(tesseract_->pix_binary(), clip_box, nullptr);
  boxDestroy(&clip_box);
  if (level == RIL_BLOCK || level == RIL_PARA) {
    // Clip to the block polygon as well.
    TBOX mask_box;
    Image mask = it_->block()->block->render_mask(&mask_box);
    // Offset of the clip rectangle relative to the mask, in top-down coords.
    const int mask_x = left - mask_box.left();
    const int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
    // AND the mask and the clip, putting the result in the clip.
    pixRasterop(clipped, std::max(0, -mask_x), std::max(0, -mask_y), pixGetWidth(clipped),
                pixGetHeight(clipped), PIX_SRC & PIX_DST, mask, std::max(0, mask_x),
                std::max(0, mask_y));
    mask.destroy();
  }
  return clipped;
}
/**
 * Returns an image of the current object at the given level in greyscale
 * if available in the input. To guarantee a binary image use BinaryImage.
 * NOTE that in order to give the best possible image, the bounds are
 * expanded slightly over the binary connected component, by the supplied
 * padding, so the top-left position of the returned image is returned
 * in (left,top). These will most likely not match the coordinates
 * returned by BoundingBox.
 * If you do not supply an original image, you will get a binary one.
 * Use pixDestroy to delete the image after use.
 *
 * @param level         hierarchy level of the object to extract.
 * @param padding       number of pixels added on every side of the box.
 * @param original_img  image to clip from; nullptr selects GetBinaryImage.
 * @param left          out: x of the top-left corner of the returned image.
 * @param top           out: y of the top-left corner of the returned image.
 * @return the clipped image, or nullptr if there is no object at level.
 */
Pix *PageIterator::GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left,
int *top) const {
int right, bottom;
if (!BoundingBox(level, left, top, &right, &bottom)) {
return nullptr;
}
if (original_img == nullptr) {
// No source image supplied: fall back to the binary image. In this case
// (left, top) hold the unpadded BoundingBox values.
return GetBinaryImage(level);
}
// Expand the box.
// NOTE(review): right/bottom are limited by rect_width_/rect_height_ while
// BoundingBox clips against rect_left_ + rect_width_ and
// rect_top_ + rect_height_ - confirm this is intended when the image
// rectangle does not start at (0, 0).
*left = std::max(*left - padding, 0);
*top = std::max(*top - padding, 0);
right = std::min(right + padding, rect_width_);
bottom = std::min(bottom + padding, rect_height_);
Box *box = boxCreate(*left, *top, right - *left, bottom - *top);
Image grey_pix = pixClipRectangle(original_img, box, nullptr);
boxDestroy(&box);
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Image mask = it_->block()->block->render_mask(&mask_box);
// Copy the mask registered correctly into an image the size of grey_pix.
// NOTE(review): mask_box is used here without dividing by scale_ -
// presumably assumes scale_ == 1 when an original image is given; confirm.
int mask_x = *left - mask_box.left();
int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
int width = pixGetWidth(grey_pix);
int height = pixGetHeight(grey_pix);
Image resized_mask = pixCreate(width, height, 1);
pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height, PIX_SRC,
mask, std::max(0, mask_x), std::max(0, mask_y));
mask.destroy();
// Grow the mask by the padding, then invert it so that it selects the
// pixels outside the (padded) polygon; those are set to UINT32_MAX.
pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, 2 * padding + 1);
pixInvert(resized_mask, resized_mask);
pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
resized_mask.destroy();
}
return grey_pix;
}
/**
 * Returns the baseline of the current object at the given level.
 * The baseline is the line that passes through (x1, y1) and (x2, y2).
 * For RIL_WORD and RIL_SYMBOL the end points are taken at the left and right
 * edge of the word box, for all other levels at the edges of the row box.
 * Returns false (leaving the outputs untouched) if there is no current word.
 * WARNING: with vertical text, baselines may be vertical!
 */
bool PageIterator::Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const {
  if (it_->word() == nullptr) {
    return false; // No word at the current position.
  }
  ROW *row = it_->row()->row;
  WERD *word = it_->word()->word;
  const bool use_word_box = (level == RIL_WORD || level == RIL_SYMBOL);
  TBOX box = use_word_box ? word->bounding_box() : row->bounding_box();
  // Evaluate the row's baseline at both horizontal extremes of the box,
  // rounding to the nearest integer.
  const int left = box.left();
  const int right = box.right();
  ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));
  ICOORD endpt(right, static_cast<int16_t>(row->base_line(right) + 0.5));
  // Rotate to image coordinates and convert to global image coords.
  const auto &re_rotation = it_->block()->block->re_rotation();
  startpt.rotate(re_rotation);
  endpt.rotate(re_rotation);
  *x1 = startpt.x() / scale_ + rect_left_;
  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
  *x2 = endpt.x() / scale_ + rect_left_;
  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
  return true;
}
/**
 * Reports the orientation of the current block.
 *
 * @param orientation        out: page orientation derived from where the
 *                           "up" vector ends up after undoing the block's
 *                           classify rotation and applying its re-rotation.
 * @param writing_direction  out: top-to-bottom for vertical text, otherwise
 *                           right-to-left or left-to-right as recorded on
 *                           the block.
 * @param textline_order     out: right-to-left for vertical text, otherwise
 *                           top-to-bottom.
 * @param deskew_angle       out: negated angle of the block's skew vector.
 *
 * If the iterator has no current block (e.g. it is at the end of the page)
 * nothing can be computed, so the defaults PAGE_UP / LEFT_TO_RIGHT /
 * TOP_TO_BOTTOM / 0 are returned instead of dereferencing a null block.
 */
void PageIterator::Orientation(tesseract::Orientation *orientation,
                               tesseract::WritingDirection *writing_direction,
                               tesseract::TextlineOrder *textline_order,
                               float *deskew_angle) const {
  auto *block_res = it_->block();
  if (block_res == nullptr || block_res->block == nullptr) {
    // Nothing can be done, so return default values.
    *orientation = ORIENTATION_PAGE_UP;
    *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
    *textline_order = TEXTLINE_ORDER_TOP_TO_BOTTOM;
    *deskew_angle = 0.0F;
    return;
  }
  BLOCK *block = block_res->block;

  // Orientation
  FCOORD up_in_image(0.0, 1.0);
  up_in_image.unrotate(block->classify_rotation());
  up_in_image.rotate(block->re_rotation());

  if (up_in_image.x() == 0.0F) {
    if (up_in_image.y() > 0.0F) {
      *orientation = ORIENTATION_PAGE_UP;
    } else {
      *orientation = ORIENTATION_PAGE_DOWN;
    }
  } else if (up_in_image.x() > 0.0F) {
    *orientation = ORIENTATION_PAGE_RIGHT;
  } else {
    *orientation = ORIENTATION_PAGE_LEFT;
  }

  // Writing direction
  bool is_vertical_text = (block->classify_rotation().x() == 0.0);
  bool right_to_left = block->right_to_left();
  *writing_direction = is_vertical_text ? WRITING_DIRECTION_TOP_TO_BOTTOM
                                        : (right_to_left ? WRITING_DIRECTION_RIGHT_TO_LEFT
                                                         : WRITING_DIRECTION_LEFT_TO_RIGHT);

  // Textline Order
  const bool is_mongolian = false; // TODO(eger): fix me
  *textline_order = is_vertical_text ? (is_mongolian ? TEXTLINE_ORDER_LEFT_TO_RIGHT
                                                     : TEXTLINE_ORDER_RIGHT_TO_LEFT)
                                     : TEXTLINE_ORDER_TOP_TO_BOTTOM;

  // Deskew angle
  FCOORD skew = block->skew(); // true horizontal for textlines
  *deskew_angle = -skew.angle();
}
/**
 * Reports the properties of the paragraph that the current row belongs to.
 *
 * @param just               out: justification from the paragraph model;
 *                           always written, JUSTIFICATION_UNKNOWN when no
 *                           model is available.
 * @param is_list_item       out: paragraph's is_list_item flag.
 * @param is_crown           out: paragraph's is_very_first_or_continuation flag.
 * @param first_line_indent  out: model first_indent() minus body_indent().
 *
 * When there is no row, paragraph or paragraph model at the current
 * position only *just is written; the other outputs are left untouched.
 */
void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just, bool *is_list_item,
                                 bool *is_crown, int *first_line_indent) const {
  *just = tesseract::JUSTIFICATION_UNKNOWN;
  auto *row_res = it_->row();
  if (row_res == nullptr || row_res->row == nullptr) {
    return;
  }
  PARA *para = row_res->row->para();
  if (para == nullptr || para->model == nullptr) {
    return;
  }
  *is_list_item = para->is_list_item;
  *is_crown = para->is_very_first_or_continuation;
  *first_line_indent = para->model->first_indent() - para->model->body_indent();
  *just = para->model->justification();
}
/**
 * Sets up the internal data for iterating the blobs of a new word, then
 * moves the iterator to the given offset.
 *
 * After the call word_length_ holds the number of symbols of the current
 * word (0 for a block without a word) and blob_index_ == offset (0 for a
 * block without a word). cblob_it_ is non-null only while iterating the
 * cblobs of a word that has no best_choice yet.
 *
 * @param offset  number of symbols to skip from the start of the word.
 */
void PageIterator::BeginWord(int offset) {
WERD_RES *word_res = it_->word();
if (word_res == nullptr) {
// This is a non-text block, so there is no word.
// NOTE: cblob_it_ is left as it is on this path.
word_length_ = 0;
blob_index_ = 0;
word_ = nullptr;
return;
}
if (word_res->best_choice != nullptr) {
// Recognition has been done, so we are using the box_word, which
// is already baseline denormalized.
word_length_ = word_res->best_choice->length();
if (word_res->box_word != nullptr) {
// The box_word must describe exactly as many symbols as the best choice;
// print some diagnostics before the assertion below fires.
if (word_res->box_word->length() != word_length_) {
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ", word_length_,
word_res->best_choice->unichar_string().c_str(), word_res->box_word->length());
word_res->box_word->bounding_box().print();
}
ASSERT_HOST(word_res->box_word->length() == word_length_);
}
word_ = nullptr;
// We will be iterating the box_word.
// The cblob iterator is owned by this object and no longer needed.
delete cblob_it_;
cblob_it_ = nullptr;
} else {
// No recognition yet, so a "symbol" is a cblob.
word_ = word_res->word;
ASSERT_HOST(word_->cblob_list() != nullptr);
word_length_ = word_->cblob_list()->length();
// Allocate the cblob iterator lazily and re-use it from word to word.
if (cblob_it_ == nullptr) {
cblob_it_ = new C_BLOB_IT;
}
cblob_it_->set_to_list(word_->cblob_list());
}
// Advance to the requested symbol, keeping the cblob iterator (if any) in
// step with blob_index_.
for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
if (cblob_it_ != nullptr) {
cblob_it_->forward();
}
}
}
/**
 * Stores blamer_bundle in the blamer_bundle field of the current word.
 * Returns false, without storing anything, if there is no current word.
 */
bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
  auto *word_res = it_->word();
  if (word_res == nullptr) {
    return false;
  }
  word_res->blamer_bundle = blamer_bundle;
  return true;
}
} // namespace tesseract.

View File

@ -0,0 +1,414 @@
/**********************************************************************
* File: pagesegmain.cpp
* Description: Top-level page segmenter for Tesseract.
* Author: Ray Smith
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _WIN32
# ifndef unlink
# include <io.h>
# endif
#else
# include <unistd.h>
#endif // _WIN32
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include <allheaders.h>
#include "blobbox.h"
#include "blread.h"
#include "colfind.h"
#include "debugpixa.h"
#ifndef DISABLED_LEGACY_ENGINE
# include "equationdetect.h"
#endif
#include <tesseract/osdetect.h>
#include "imagefind.h"
#include "linefind.h"
#include "makerow.h"
#include "tabvector.h"
#include "tesseractclass.h"
#include "tessvars.h"
#include "textord.h"
#include "tordmain.h"
#include "wordseg.h"
namespace tesseract {
// Max erosions to perform in removing an enclosing circle.
const int kMaxCircleErosions = 8;

// Helper to remove an enclosing circle from an image.
// If there isn't one, then the image will most likely get badly mangled.
// The mask of pixels to keep is eroded step by step; the masked image with
// the smallest connected component count seen after the count last rose is
// kept as the result.
// The returned pix must be pixDestroyed after use. nullptr may be returned
// if the image doesn't meet the trivial conditions that it uses to determine
// success.
static Image RemoveEnclosingCircle(Image pixs) {
  // Build the keep-mask: seed-fill the inverted image from a one pixel
  // border, then invert the filled result.
  Image inverted = pixInvert(nullptr, pixs);
  Image keep_mask = pixCreateTemplate(pixs);
  pixSetOrClearBorder(keep_mask, 1, 1, 1, 1, PIX_SET);
  pixSeedfillBinary(keep_mask, keep_mask, inverted, 4);
  pixInvert(keep_mask, keep_mask);
  inverted.destroy();

  Image masked = pixs & keep_mask;
  l_int32 max_count;
  pixCountConnComp(masked, 8, &max_count);
  // The count has to go up before we start looking for the minimum.
  l_int32 min_count = INT32_MAX;
  Image best = nullptr;
  for (int erosion = 1; erosion < kMaxCircleErosions; erosion++) {
    masked.destroy();
    pixErodeBrick(keep_mask, keep_mask, 3, 3);
    masked = pixs & keep_mask;
    l_int32 count;
    pixCountConnComp(masked, 8, &count);
    if (erosion == 1 || count > max_count) {
      // Still climbing (or first erosion): restart the search for a minimum.
      max_count = count;
      min_count = count;
      continue;
    }
    if (count >= min_count) {
      break; // We have passed by the best.
    }
    // New minimum.
    min_count = count;
    best.destroy();
    best = masked.copy(); // Save the best.
  }
  masked.destroy();
  keep_mask.destroy();
  return best;
}
/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be nullptr.
 * On return the blocks list owns all the constructed page layout.
 *
 * @param input_file  name of the input image; when column finding is not
 *                    enabled its name without the extension is handed to
 *                    read_unlv_file to look for a UNLV zone file. May be
 *                    nullptr or empty.
 * @param blocks      in/out list of blocks.
 * @param osd_tess    Tesseract instance passed on to AutoPageSeg.
 * @param osr         orientation/script result passed on to AutoPageSeg.
 * @return the value returned by AutoPageSeg (0 if it was not run), 0 for an
 *         empty page, or -1 if AutoPageSeg failed.
 */
int Tesseract::SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess,
                           OSResults *osr) {
  ASSERT_HOST(pix_binary_ != nullptr);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
    std::string name = input_file;
    // Cut the name at the last '.'. The string has to be resized: writing a
    // '\0' into it would keep its length, so the old extension would still
    // be part of the std::string handed to read_unlv_file.
    const auto lastdot = name.rfind('.');
    if (lastdot != std::string::npos) {
      name.resize(lastdot);
    }
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  // The diacritic_blobs holds noise blobs that may be diacritics. They
  // are separated out on areas of the image that seem noisy and short-circuit
  // the layout process, going straight from the initial partition creation
  // right through to after word segmentation, where they are added to the
  // rej_cblobs list of the most appropriate word. From there classification
  // will determine whether they are used.
  BLOBNBOX_LIST diacritic_blobs;
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val =
        AutoPageSeg(pageseg_mode, blocks, &to_blocks,
                    enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY) {
      return auto_page_seg_ret_val;
    }
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    // No layout analysis: use identity deskew/reskew vectors.
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      // Replace the binary image by the cleaned one if the circle removal
      // produced a result.
      Image pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != nullptr) {
        pix_binary_.destroy();
        pix_binary_ = pixcleaned;
      }
    }
  }

  if (auto_page_seg_ret_val < 0) {
    return -1;
  }

  if (blocks->empty()) {
    if (textord_debug_tabfind) {
      tprintf("Empty page\n");
    }
    return 0; // AutoPageSeg found an empty page.
  }
  bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  bool cjk_mode = textord_use_cjk_fp_model;

  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
                       pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
  return auto_page_seg_ret_val;
}
/**
 * Auto page segmentation. Divide the page image into blocks of uniform
 * text linespacing and images.
 *
 * Resolution (in ppi) is derived from the input image.
 *
 * The output goes in the blocks list with corresponding TO_BLOCKs in the
 * to_blocks list.
 *
 * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
 * the image into columns, but multiple blocks are still made if the text is
 * of non-uniform linespacing.
 *
 * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
 * confuse layout analysis by causing textline overlap, are placed there,
 * with the expectation that they will be reassigned to words later and
 * noise/diacriticness determined via classification.
 *
 * osd_tess and osr are handed to SetupPageSegAndDetectOrientation, which
 * uses them for orientation and script detection when pageseg_mode enables
 * it; the results are output into osr (orientation and script result).
 *
 * Returns the result of ColumnFinder::FindBlocks, or 0 if no ColumnFinder
 * was created. On a negative result blocks is left unchanged.
 */
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
                           BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) {
  Image photomask_pix = nullptr;
  Image musicmask_pix = nullptr;
  // The blocks made by the ColumnFinder. Moved to blocks before return.
  BLOCK_LIST found_blocks;
  TO_BLOCK_LIST temp_blocks;
  // A music mask is only requested when pageseg_apply_music_mask is set.
  Image *music_mask_out = nullptr;
  if (pageseg_apply_music_mask) {
    music_mask_out = &musicmask_pix;
  }
  ColumnFinder *column_finder = SetupPageSegAndDetectOrientation(
      pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix, music_mask_out);
  int status = 0;
  if (column_finder != nullptr) {
    TO_BLOCK_IT to_block_it(&temp_blocks);
    TO_BLOCK *to_block = to_block_it.data();
    if (musicmask_pix != nullptr) {
      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
      // blocks separately. For now combine with photomask_pix.
      photomask_pix |= musicmask_pix;
    }
#ifndef DISABLED_LEGACY_ENGINE
    if (equ_detect_) {
      column_finder->SetEquationDetect(equ_detect_);
    }
#endif // ndef DISABLED_LEGACY_ENGINE
    status = column_finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
                                       photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
                                       &found_blocks, diacritic_blobs, to_blocks);
    if (status >= 0) {
      column_finder->GetDeskewVectors(&deskew_, &reskew_);
    }
    delete column_finder;
  }
  // The masks are owned here, whether or not a ColumnFinder was created.
  photomask_pix.destroy();
  musicmask_pix.destroy();
  if (status >= 0) {
    // Replace the input blocks by the found blocks.
    blocks->clear();
    BLOCK_IT block_it(blocks);
    block_it.add_list_after(&found_blocks);
  }
  return status;
}
// Helper appends to allowed_ids, for every script of sid_set except its null
// script, the id that osd_set assigns to the script of the same name.
static void AddAllScriptsConverted(const UNICHARSET &sid_set, const UNICHARSET &osd_set,
                                   std::vector<int> *allowed_ids) {
  const int script_count = sid_set.get_script_table_size();
  const int null_script = sid_set.null_sid();
  for (int sid = 0; sid < script_count; ++sid) {
    if (sid == null_script) {
      continue;
    }
    // Scripts are matched between the two unicharsets by name.
    const char *script_name = sid_set.get_script_from_script_id(sid);
    allowed_ids->push_back(osd_set.get_script_id_from_name(script_name));
  }
}
/**
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a null Image that will be filled on
 * return with the leptonica photo mask, which must be destroyed by the
 * caller. music_mask_pix is passed through to the line finder and may be
 * nullptr. to_blocks is an empty list that will be filled with (usually a
 * single) block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 * nullptr is returned if the estimated line size is below 2, or (legacy
 * engine builds only) if pageseg_mode is PSM_OSD_ONLY after OSD was run.
 */
ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,
BLOCK_LIST *blocks, Tesseract *osd_tess,
OSResults *osr, TO_BLOCK_LIST *to_blocks,
Image *photo_mask_pix,
Image *music_mask_pix) {
int vertical_x = 0;
int vertical_y = 1;
TabVector_LIST v_lines;
TabVector_LIST h_lines;
// NOTE(review): bleft is not referenced again in this function.
ICOORD bleft(0, 0);
ASSERT_HOST(pix_binary_ != nullptr);
if (tessedit_dump_pageseg_images) {
pixa_debug_.AddPix(pix_binary_, "PageSegInput");
}
// Leptonica is used to find the rule/separator lines in the input.
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
if (tessedit_dump_pageseg_images) {
pixa_debug_.AddPix(pix_binary_, "NoLines");
}
// Leptonica is used to find a mask of the photo regions in the input.
*photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
if (tessedit_dump_pageseg_images) {
// Debug output only: the binary image with the photo regions subtracted
// (or a clone of the binary image if no photo mask was found).
Image pix_no_image_ = nullptr;
if (*photo_mask_pix != nullptr) {
pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
} else {
pix_no_image_ = pix_binary_.clone();
}
pixa_debug_.AddPix(pix_no_image_, "NoImages");
pix_no_image_.destroy();
}
// Vertical lines are discarded when column finding is disabled.
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
v_lines.clear();
}
// The rest of the algorithm uses the usual connected components.
textord_.find_components(pix_binary_, blocks, to_blocks);
TO_BLOCK_IT to_block_it(to_blocks);
// There must be exactly one input block.
// TODO(rays) handle new textline finding with a UNLV zone file.
ASSERT_HOST(to_blocks->singleton());
TO_BLOCK *to_block = to_block_it.data();
TBOX blkbox = to_block->block->pdblk.bounding_box();
ColumnFinder *finder = nullptr;
int estimated_resolution = source_resolution_;
// Only when the source resolution equals kMinCredibleResolution is an
// estimate from the line size attempted; it is used only if it is larger
// than the source resolution and below kMaxCredibleResolution.
if (source_resolution_ == kMinCredibleResolution) {
// Try to estimate resolution from typical body text size.
int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
if (res > estimated_resolution && res < kMaxCredibleResolution) {
estimated_resolution = res;
tprintf("Estimating resolution as %d\n", estimated_resolution);
}
}
// No ColumnFinder is created (and nullptr is returned) for a line size
// below 2.
if (to_block->line_size >= 2) {
finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
vertical_y);
finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
// Everything from here to the matching #endif, including the call to
// CorrectOrientation, is compiled out when DISABLED_LEGACY_ENGINE is defined.
#ifndef DISABLED_LEGACY_ENGINE
if (equ_detect_) {
equ_detect_->LabelSpecialText(to_block);
}
BLOBNBOX_CLIST osd_blobs;
// osd_orientation is the number of 90 degree rotations to make the
// characters upright. (See tesseract/osdetect.h for precise definition.)
// We want the text lines horizontal, (vertical text indicates vertical
// textlines) which may conflict (eg vertically written CJK).
int osd_orientation = 0;
bool vertical_text =
textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
&osd_blobs);
}
if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
// An empty osd_scripts list is passed on when osd_tess is this object.
std::vector<int> osd_scripts;
if (osd_tess != this) {
// We are running osd as part of layout analysis, so constrain the
// scripts to those allowed by *this.
AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
for (auto &lang : sub_langs_) {
AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);
}
}
os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
if (pageseg_mode == PSM_OSD_ONLY) {
// Only the OSD result in osr was wanted; no layout analysis follows.
delete finder;
return nullptr;
}
osd_orientation = osr->best_result.orientation_id;
double osd_score = osr->orientations[osd_orientation];
// osd_margin ends up as the smallest score difference between the best
// orientation and any other one, capped at 2 * min_orientation_margin.
double osd_margin = min_orientation_margin * 2;
for (int i = 0; i < 4; ++i) {
if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
osd_margin = osd_score - osr->orientations[i];
}
}
int best_script_id = osr->best_result.script_id;
const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
// The script counts as CJK either by script id (Han, Hiragana, Katakana)
// or by script name (Japanese, Korean, Hangul).
bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
best_script_id == osd_tess->unicharset.hiragana_sid() ||
best_script_id == osd_tess->unicharset.katakana_sid() ||
strcmp("Japanese", best_script_str) == 0 ||
strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
if (cjk) {
finder->set_cjk_script(true);
}
if (osd_margin < min_orientation_margin) {
// The margin is weak.
if (!cjk && !vertical_text && osd_orientation == 2) {
// upside down latin text is improbable with such a weak margin.
tprintf(
"OSD: Weak margin (%.2f), horiz textlines, not CJK: "
"Don't rotate.\n",
osd_margin);
osd_orientation = 0;
} else {
tprintf(
"OSD: Weak margin (%.2f) for %d blob text block, "
"but using orientation anyway: %d\n",
osd_margin, osd_blobs.length(), osd_orientation);
}
}
}
osd_blobs.shallow_clear();
finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
#endif // ndef DISABLED_LEGACY_ENGINE
}
return finder;
}
} // namespace tesseract.

View File

@ -0,0 +1,42 @@
/**********************************************************************
* File: pagewalk.cpp (Formerly walkers.c)
* Description: Block list processors
* Author: Phil Cheatle
* Created: Thu Oct 10 16:25:24 BST 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "pageres.h"
#include "tesseractclass.h"
namespace tesseract {
/**
 * @name process_selected_words()
 *
 * Visit the words of page_res in iteration order and call word_processor (a
 * pointer to a Tesseract member function) on every word whose bounding box
 * overlaps selection_box. The walk ends early as soon as one of those calls
 * returns false.
 */
void Tesseract::process_selected_words(
    PAGE_RES *page_res, // blocks to check
    TBOX &selection_box, bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) {
  PAGE_RES_IT cursor(page_res);
  while (cursor.word() != nullptr) {
    WERD *candidate = cursor.word()->word;
    const bool selected = candidate->bounding_box().overlap(selection_box);
    if (selected && !(this->*word_processor)(&cursor)) {
      return; // The processor asked us to stop.
    }
    cursor.forward();
  }
}
} // namespace tesseract

View File

@ -0,0 +1,70 @@
///////////////////////////////////////////////////////////////////////
// File: par_control.cpp
// Description: Control code for parallel implementation.
// Author: Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "tesseractclass.h"
#ifdef _OPENMP
# include <omp.h>
#endif // _OPENMP
namespace tesseract {
// One unit of pre-classification work: a chopped blob, the Tesseract instance
// that should classify it, and the ratings-matrix cell that receives the
// resulting choice list.
struct BlobData {
  TBLOB *blob = nullptr;
  Tesseract *tesseract = nullptr;
  BLOB_CHOICE_LIST **choices = nullptr;

  BlobData() = default;
  // Picks blob number `index` out of `word` together with the diagonal
  // ratings cell (index, index) that belongs to it.
  BlobData(int index, Tesseract *tess, const WERD_RES &word)
      : blob{word.chopped_word->blobs[index]}
      , tesseract{tess}
      , choices{&(*word.ratings)(index, index)} {}
};
void Tesseract::PrerecAllWordsPar(const std::vector<WordData> &words) {
// Prepare all the blobs.
std::vector<BlobData> blobs;
for (const auto &w : words) {
if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {
for (int s = 0; s < w.lang_words.size(); ++s) {
Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
const WERD_RES &word = *w.lang_words[s];
for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
blobs.emplace_back(b, sub, word);
}
}
}
}
// Pre-classify all the blobs.
if (tessedit_parallelize > 1) {
#ifdef _OPENMP
# pragma omp parallel for num_threads(10)
#endif // _OPENMP
// NOLINTNEXTLINE(modernize-loop-convert)
for (size_t b = 0; b < blobs.size(); ++b) {
*blobs[b].choices =
blobs[b].tesseract->classify_blob(blobs[b].blob, "par", ScrollView::WHITE, nullptr);
}
} else {
// TODO(AMD) parallelize this.
for (auto &blob : blobs) {
*blob.choices = blob.tesseract->classify_blob(blob.blob, "par", ScrollView::WHITE, nullptr);
}
}
}
} // namespace tesseract.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,104 @@
/**********************************************************************
* File: paragraphs.h
* Description: Paragraph Detection data structures.
* Author: David Eger
* Created: 25 February 2011
*
* (C) Copyright 2011, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
#include <list>
#include <string>
#include "rect.h" // for TBOX
namespace tesseract {
class MutableIterator;
class ParagraphModel;
class PARA_LIST;
struct PARA;
// This structure captures all information needed about a text line for the
// purposes of paragraph detection. It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of
// Tesseract.
//
// Every scalar member carries a default member initializer so that a
// default-constructed RowInfo never holds indeterminate values. The defaults
// are the zero/false values that value-initialization (for example
// std::vector<RowInfo>(n)) already produced.
class RowInfo {
public:
  // Constant data derived from Tesseract output.
  std::string text;     // the full UTF-8 text of the line.
  bool ltr = false;     // whether the majority of the text is left-to-right
                        // TODO(eger) make this more fine-grained.
  bool has_leaders = false;  // does the line contain leader dots (.....)?
  bool has_drop_cap = false; // does the line have a drop cap?
  int pix_ldistance = 0;     // distance to the left pblock boundary in pixels
  int pix_rdistance = 0;     // distance to the right pblock boundary in pixels
  float pix_xheight = 0.0f;  // guessed xheight for the line
  int average_interword_space = 0; // average space between words in pixels.
  int num_words = 0;         // number of words on the line
  TBOX lword_box; // in normalized (horiz text rows) space
  TBOX rword_box; // in normalized (horiz text rows) space
  std::string lword_text; // the UTF-8 text of the leftmost werd
  std::string rword_text; // the UTF-8 text of the rightmost werd

  // The text of a paragraph typically starts with the start of an idea and
  // ends with the end of an idea. Here we define paragraph as something that
  // may have a first line indent and a body indent which may be different.
  // Typical words that start an idea are:
  // 1. Words in western scripts that start with
  // a capital letter, for example "The"
  // 2. Bulleted or numbered list items, for
  // example "2."
  // Typical words which end an idea are words ending in punctuation marks. In
  // this vocabulary, each list item is represented as a paragraph.
  bool lword_indicates_list_item = false;
  bool lword_likely_starts_idea = false;
  bool lword_likely_ends_idea = false;
  bool rword_indicates_list_item = false;
  bool rword_likely_starts_idea = false;
  bool rword_likely_ends_idea = false;
};
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// split them into paragraphs. See http://goto/paragraphstalk
// NOTE(review): that URL looks like an internal short link - confirm whether
// a public reference exists.
//
// Input:
// debug_level - presumably the verbosity of diagnostic output (confirm
// against the implementation).
// row_infos - one RowInfo per text line, as described above.
//
// Output:
// row_owners - one pointer for each row, to the paragraph it belongs to.
// paragraphs - this is the actual list of PARA objects.
// models - the list of paragraph models referenced by the PARA objects.
// caller is responsible for deleting the models.
TESS_API
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<ParagraphModel *> *models);
// Iterator-based overload of DetectParagraphs.
//
// Given a MutableIterator to the start of a block, run DetectParagraphs on
// that block and commit the results to the underlying ROW and BLOCK structs,
// saving the ParagraphModels in models. Caller owns the models.
// We use unicharset during the function to answer questions such as "is the
// first letter of this word upper case?"
//
// after_text_recognition - presumably tells the function whether recognized
// text is already available for the block (confirm against the
// implementation).
TESS_API
void DetectParagraphs(int debug_level, bool after_text_recognition,
const MutableIterator *block_start, std::vector<ParagraphModel *> *models);
} // namespace tesseract
#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_

View File

@ -0,0 +1,309 @@
/**********************************************************************
* File: paragraphs_internal.h
* Description: Paragraph Detection internal data structures.
* Author: David Eger
*
* (C) Copyright 2011, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
#include <tesseract/publictypes.h> // for ParagraphJustification
#include "paragraphs.h"
// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
namespace tesseract {
class UNICHARSET;
class WERD_CHOICE;
// Return whether the given word is likely to be a list item start word.
TESS_API
bool AsciiLikelyListItem(const std::string &word);
// Return the first Unicode Codepoint from werd[pos].
// u is the unicharset used to interpret werd.
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
// Set right word attributes given either a unicharset and werd or a utf8
// string.
// The three bool pointers (is_list, starts_idea, ends_idea) are presumably
// the outputs - confirm against the implementation, which is not part of this
// header.
TESS_API
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
bool *is_list, bool *starts_idea, bool *ends_idea);
// Set left word attributes given either a unicharset and werd or a utf8 string.
// Same parameter layout as RightWordAttributes.
TESS_API
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
bool *is_list, bool *starts_idea, bool *ends_idea);
// Classification of a text line with respect to paragraph structure.
// Each enumerator's value is a printable ASCII character.
enum LineType {
LT_START = 'S', // First line of a paragraph.
LT_BODY = 'C', // Continuation line of a paragraph.
LT_UNKNOWN = 'U', // No clues.
LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
};
// The first paragraph in a page of body text is often un-indented.
// This is a typographic convention which is common to indicate either that:
// (1) The paragraph is the continuation of a previous paragraph, or
// (2) The paragraph is the first paragraph in a chapter.
//
// I refer to such paragraphs as "crown"s, and the output of the paragraph
// detection algorithm attempts to give them the same paragraph model as
// the rest of the body text.
//
// Nonetheless, while building hypotheses, it is useful to mark the lines
// of crown paragraphs temporarily as crowns, either aligned left or right.
//
// The two pointers below are defined outside this header. StrongModel()
// compares a model pointer against them by address.
extern const ParagraphModel *kCrownLeft;
extern const ParagraphModel *kCrownRight;
// A model counts as "strong" when it is a real model: neither null nor one of
// the two crown placeholder pointers.
inline bool StrongModel(const ParagraphModel *model) {
  if (model == nullptr) {
    return false;
  }
  const bool is_crown = (model == kCrownLeft) || (model == kCrownRight);
  return !is_crown;
}
struct LineHypothesis {
LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}
LineHypothesis(const LineHypothesis &other) = default;
// Copy assignment operator.
LineHypothesis &operator=(const LineHypothesis &other) = default;
bool operator==(const LineHypothesis &other) const {
return ty == other.ty && model == other.model;
}
LineType ty;
const ParagraphModel *model;
};
class ParagraphTheory; // Forward Declaration
using SetOfModels = std::vector<const ParagraphModel *>;
// Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input.
class RowScratchRegisters {
public:
  // The row handed in is presumed to outlive this object.
  void Init(const RowInfo &row);

  LineType GetLineType() const;
  LineType GetLineType(const ParagraphModel *model) const;

  // Mark this row as a paragraph start line without naming a model. Useful
  // for the initial marking of probable body lines or paragraph start lines.
  void SetStartLine();

  // Mark this row as a paragraph body line without naming a model. Useful
  // for the initial marking of probable body lines or paragraph start lines.
  void SetBodyLine();

  // Record that this row fits as a paragraph start line in the given model.
  void AddStartLine(const ParagraphModel *model);
  // Record that this row fits as a paragraph body line in the given model.
  void AddBodyLine(const ParagraphModel *model);

  // Forget every hypothesis held about this line.
  void SetUnknown() {
    hypotheses_.clear();
  }

  // Append all hypotheses of strong models that match this row as a start.
  void StartHypotheses(SetOfModels *models) const;

  // Append all hypotheses of strong models matching this row.
  void StrongHypotheses(SetOfModels *models) const;

  // Append all hypotheses for this row.
  void NonNullHypotheses(SetOfModels *models) const;

  // Discard any hypotheses whose model is not in the given list.
  void DiscardNonMatchingHypotheses(const SetOfModels &models);

  // If the only hypothesis is that this line is a paragraph start line of a
  // certain model, return that model. Else return nullptr.
  const ParagraphModel *UniqueStartHypothesis() const;

  // If the only hypothesis is that this line is a paragraph body line of a
  // certain model, return that model. Else return nullptr.
  const ParagraphModel *UniqueBodyHypothesis() const;

  // Indentation on the side opposite to the one the text is aligned to.
  // For any justification other than left or right, the larger of the two
  // indents is returned.
  int OffsideIndent(tesseract::ParagraphJustification just) const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return lindent_;
    }
    if (just == tesseract::JUSTIFICATION_LEFT) {
      return rindent_;
    }
    return rindent_ < lindent_ ? lindent_ : rindent_;
  }

  // Indentation on the side the text is aligned to.
  // For any justification other than left or right, the larger of the two
  // indents is returned.
  int AlignsideIndent(tesseract::ParagraphJustification just) const {
    if (just == tesseract::JUSTIFICATION_RIGHT) {
      return rindent_;
    }
    if (just == tesseract::JUSTIFICATION_LEFT) {
      return lindent_;
    }
    return rindent_ < lindent_ ? lindent_ : rindent_;
  }

  // Append header fields to a vector of row headings.
  static void AppendDebugHeaderFields(std::vector<std::string> &header);

  // Append data for this row to a vector of debug strings.
  void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;

  const RowInfo *ri_;

  // These four values form a horizontal box model for the white space
  // on the edges of each line. At each point in the algorithm, the following
  // shall hold:
  // ri_->pix_ldistance = lmargin_ + lindent_
  // ri_->pix_rdistance = rindent_ + rmargin_
  int lmargin_;
  int lindent_;
  int rindent_;
  int rmargin_;

private:
  // Hypotheses of either LT_START or LT_BODY
  std::vector<LineHypothesis> hypotheses_;
};
// A collection of convenience functions for wrapping the set of
// Paragraph Models we believe correctly model the paragraphs in the image.
class ParagraphTheory {
public:
// We presume models will outlive us, and that models will take ownership
// of any ParagraphModel *'s we add.
// Only the pointer to the caller's vector is stored (see models_).
explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {}
// Mutable access to the caller-supplied model vector.
std::vector<ParagraphModel *> &models() {
return *models_;
}
// Read-only access to the caller-supplied model vector.
const std::vector<ParagraphModel *> &models() const {
return *models_;
}
// Return an existing model if one that is Comparable() can be found.
// Else, allocate a new copy of model to save and return a pointer to it.
const ParagraphModel *AddModel(const ParagraphModel &model);
// Discard any models we've made that are not in the list of used models.
void DiscardUnusedModels(const SetOfModels &used_models);
// Return the set of all non-centered models.
void NonCenteredModels(SetOfModels *models);
// If any of the non-centered paragraph models we know about fit
// rows[start, end), return it. Else nullptr.
const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
int end) const;
// NOTE(review): presumably the position of model within models(); the value
// returned for an unknown model is not visible from this header - confirm.
int IndexOf(const ParagraphModel *model) const;
private:
// The caller-owned vector of models passed to the constructor.
std::vector<ParagraphModel *> *models_;
// Presumably the subset of models that this theory allocated itself (see
// AddModel / DiscardUnusedModels) - confirm against the implementation.
std::vector<ParagraphModel *> models_we_added_;
};
// Row/model compatibility predicates. Their implementations are not part of
// this header; the notes below are inferred from the names only and should be
// confirmed against the definitions.
//
// Presumably: would rows[row] be acceptable as the first line of a paragraph
// under model?
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model);
// Presumably: would rows[row] be acceptable as a body line of a paragraph
// under model?
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model);
// Presumably: are rows a and b compatible with each other as lines of a crown
// paragraph under model?
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
const ParagraphModel *model);
// A class for smearing Paragraph Model hypotheses to surrounding rows.
// The idea here is that StrongEvidenceClassify first marks only exceedingly
// obvious start and body rows and constructs models of them. Thereafter,
// we may have left over unmarked lines (mostly end-of-paragraph lines) which
// were too short to have much confidence about, but which fit the models we've
// constructed perfectly and which we ought to mark. This class is used to
// "smear" our models over the text.
class ParagraphModelSmearer {
public:
// rows, row_start, row_end and theory describe the run of rows to work on and
// the theory whose models are smeared; see the matching members below.
ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
ParagraphTheory *theory);
// Smear forward paragraph models from existing row markings to subsequent
// text lines if they fit, and mark any thereafter still unmodeled rows
// with any model in the theory that fits them.
void Smear();
private:
// Record in open_models_ for rows [start_row, end_row) the list of models
// currently open at each row.
// A model is still open in a row if some previous row has said model as a
// start hypothesis, and all rows since (including this row) would fit as
// either a body or start line in that model.
void CalculateOpenModels(int row_start, int row_end);
// Returns the open-model set for the given row. The +1 offset exists because
// slot 0 of open_models_ belongs to row (row_start_ - 1); see the comment on
// open_models_ below. No bounds checking is performed here.
SetOfModels &OpenModels(int row) {
return open_models_[row - row_start_ + 1];
}
ParagraphTheory *theory_;
std::vector<RowScratchRegisters> *rows_;
int row_start_;
int row_end_;
// open_models_ corresponds to rows[start_row_ - 1, end_row_]
//
// open_models_: Contains models which there was an active (open) paragraph
// as of the previous line and for which the left and right
// indents admit the possibility that this text line continues
// to fit the same model.
// TODO(eger): Think about whether we can get rid of "Open" models and just
// use the current hypotheses on RowScratchRegisters.
std::vector<SetOfModels> open_models_;
};
// Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of
// rows.
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
int end, int percentile);
// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
// justification supplies the known alignment.
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
tesseract::ParagraphJustification justification);
// Return whether the first word on the after line can fit in the space at
// the end of the before line (not knowing the text alignment).
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
const ParagraphModel *model);
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
} // namespace tesseract
#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

View File

@ -0,0 +1,358 @@
///////////////////////////////////////////////////////////////////////
// File: paramsd.cpp
// Description: Tesseract parameter Editor
// Author: Joern Wanke
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//
// The parameters editor is used to edit all the parameters used within
// tesseract from the ui.
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#ifndef GRAPHICS_DISABLED
# include "params.h" // for ParamsVectors, StringParam, BoolParam
# include "paramsd.h"
# include "scrollview.h" // for SVEvent, ScrollView, SVET_POPUP
# include "svmnode.h" // for SVMenuNode
# include "tesseractclass.h" // for Tesseract
# include <cstdio> // for fclose, fopen, fprintf, sprintf, FILE
# include <cstdlib> // for atoi
# include <cstring> // for strcmp, strcspn, strlen, strncpy
# include <locale> // for std::locale::classic
# include <map> // for map, _Rb_tree_iterator, map<>::iterator
# include <memory> // for unique_ptr
# include <sstream> // for std::stringstream
# include <utility> // for pair
namespace tesseract {
# define VARDIR "configs/" /*parameters files */
# define MAX_ITEMS_IN_SUBMENU 30
// The following variables should remain static globals, since they
// are used by debug editor, which uses a single Tesseract instance.
//
// Contains the mappings from unique VC ids to their actual pointers.
// Every non-default ParamContent constructor registers itself here.
static std::map<int, ParamContent *> vcMap;
// Next id to hand out; each ParamContent constructor takes the current value
// as its id and then increments it.
static int nrParams = 0;
// Menu command ids for writing a config file, assigned in the ParamsEditor
// constructor and checked in ParamsEditor::Notify:
// [0] = write all parameters, [1] = write changed parameters only.
static int writeCommands[2];
// Wraps a string parameter: takes the next free id from nrParams and
// registers this object in vcMap under that id.
ParamContent::ParamContent(tesseract::StringParam *it)
    : my_id_(nrParams++), param_type_(VT_STRING), sIt(it) {
  vcMap[my_id_] = this;
}
// Wraps an integer parameter: takes the next free id from nrParams and
// registers this object in vcMap under that id.
ParamContent::ParamContent(tesseract::IntParam *it)
    : my_id_(nrParams++), param_type_(VT_INTEGER), iIt(it) {
  vcMap[my_id_] = this;
}
// Wraps a boolean parameter: takes the next free id from nrParams and
// registers this object in vcMap under that id.
ParamContent::ParamContent(tesseract::BoolParam *it)
    : my_id_(nrParams++), param_type_(VT_BOOLEAN), bIt(it) {
  vcMap[my_id_] = this;
}
// Wraps a double parameter: takes the next free id from nrParams and
// registers this object in vcMap under that id.
ParamContent::ParamContent(tesseract::DoubleParam *it)
    : my_id_(nrParams++), param_type_(VT_DOUBLE), dIt(it) {
  vcMap[my_id_] = this;
}
// Gets a VC object identified by its ID.
// Returns nullptr when no object is registered under id. The lookup uses
// find() rather than operator[] so that an unknown id does not insert a null
// entry into vcMap (WriteParams walks the whole map and uses every entry).
ParamContent *ParamContent::GetParamContentById(int id) {
  const auto found = vcMap.find(id);
  return found != vcMap.end() ? found->second : nullptr;
}
// Copy the first N words from the source string to the target string.
// Words are delimited by "_"; the delimiter that ends each copied word is
// copied as well. If the source holds fewer than N words, all of it is copied.
// t must have room for strlen(s) + 1 characters.
void ParamsEditor::GetFirstWords(const char *s, // source string
                                 int n,         // number of words
                                 char *t        // target string
) {
  const size_t full_length = strlen(s);
  size_t reqd_len = 0; // No. of chars required
  while ((n > 0) && reqd_len < full_length) {
    // Measure the next word from where the previous one ended. reqd_len is
    // already the cumulative offset into s, so it must be added to s itself,
    // not to a pointer that has been advanced before.
    reqd_len += strcspn(s + reqd_len, "_") + 1;
    n--;
  }
  // The "+ 1" above also counts a delimiter after the last word even when
  // the string simply ends there; never copy past the end of the source.
  if (reqd_len > full_length) {
    reqd_len = full_length;
  }
  strncpy(t, s, reqd_len);
  t[reqd_len] = '\0'; // ensure null terminal
}
// Returns the name of the wrapped parameter, or a fixed error string when the
// stored parameter type is not one of the four known kinds.
const char *ParamContent::GetName() const {
  switch (param_type_) {
    case VT_INTEGER:
      return iIt->name_str();
    case VT_BOOLEAN:
      return bIt->name_str();
    case VT_DOUBLE:
      return dIt->name_str();
    case VT_STRING:
      return sIt->name_str();
    default:
      return "ERROR: ParamContent::GetName()";
  }
}
// Returns the info string of the wrapped parameter, or nullptr when the
// stored parameter type is not one of the four known kinds.
const char *ParamContent::GetDescription() const {
  switch (param_type_) {
    case VT_INTEGER:
      return iIt->info_str();
    case VT_BOOLEAN:
      return bIt->info_str();
    case VT_DOUBLE:
      return dIt->info_str();
    case VT_STRING:
      return sIt->info_str();
    default:
      return nullptr;
  }
}
// Returns the current value of the wrapped parameter as text. Numeric and
// boolean values go through std::to_string; an unknown parameter type yields
// an empty string.
std::string ParamContent::GetValue() const {
  std::string text;
  switch (param_type_) {
    case VT_INTEGER:
      text = std::to_string(*iIt);
      break;
    case VT_BOOLEAN:
      text = std::to_string(*bIt);
      break;
    case VT_DOUBLE:
      text = std::to_string(*dIt);
      break;
    case VT_STRING:
      text = sIt->c_str();
      break;
    default:
      break;
  }
  return text;
}
// Parses val according to the wrapped parameter's type, stores it in the
// parameter and marks this object as changed. Integers and booleans are read
// with atoi; doubles are read through a stream using the classic "C" locale.
void ParamContent::SetValue(const char *val) {
  // TODO (wanke) Test if the values actually are properly converted.
  // (Quickly visible impacts?)
  changed_ = true;
  switch (param_type_) {
    case VT_INTEGER:
      iIt->set_value(atoi(val));
      break;
    case VT_BOOLEAN:
      bIt->set_value(atoi(val));
      break;
    case VT_DOUBLE: {
      std::stringstream parser(val);
      // Use "C" locale for reading double value.
      parser.imbue(std::locale::classic());
      double parsed = 0;
      parser >> parsed;
      dIt->set_value(parsed);
      break;
    }
    case VT_STRING:
      sIt->set_value(val);
      break;
    default:
      break;
  }
}
// Extracts up to three cumulative prefixes of s (words split by "_") by
// asking GetFirstWords for the first one, two and three words in turn.
// The results go to level_one, level_two and level_three respectively.
void ParamsEditor::GetPrefixes(const char *s, std::string *level_one, std::string *level_two,
                               std::string *level_three) {
  std::unique_ptr<char[]> scratch(new char[1024]);
  std::string *const targets[] = {level_one, level_two, level_three};
  int word_count = 0;
  for (std::string *target : targets) {
    ++word_count;
    GetFirstWords(s, word_count, scratch.get());
    *target = scratch.get();
  }
}
// Compare two VC objects by their name.
int ParamContent::Compare(const void *v1, const void *v2) {
const ParamContent *one = *static_cast<const ParamContent *const *>(v1);
const ParamContent *two = *static_cast<const ParamContent *const *>(v2);
return strcmp(one->GetName(), two->GetName());
}
// Find all editable parameters used within tesseract and create a
// SVMenuNode tree from it.
// Parameters come from GlobalParams() and, when present, tess->params().
// A parameter whose first-level prefix is unique goes under "OTHER";
// otherwise it goes into a submenu named after that prefix, and into a
// sub-submenu when the submenu would hold more than MAX_ITEMS_IN_SUBMENU
// entries and the second-level prefix is shared.
// Returns the newly allocated root node.
// NOTE(review): the ParamContent objects allocated here are registered in
// vcMap by their constructors and also linked into the local vclist; confirm
// that destroying vclist does not delete objects vcMap still points to.
// TODO (wanke): This is actually sort of hackish.
SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
  auto *mr = new SVMenuNode();
  ParamContent_LIST vclist;
  ParamContent_IT vc_it(&vclist);
  // Amount counts the number of entries for a specific prefix. It is keyed by
  // the prefix text itself: keying by tag.c_str() would compare the addresses
  // of per-iteration temporaries instead of their contents.
  // TODO(rays) get rid of the use of std::map.
  std::map<std::string, int> amount;

  // Add all parameters to a list.
  int num_iterations = (tess->params() == nullptr) ? 1 : 2;
  for (int v = 0; v < num_iterations; ++v) {
    tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
    for (auto &param : vec->int_params) {
      vc_it.add_after_then_move(new ParamContent(param));
    }
    for (auto &param : vec->bool_params) {
      vc_it.add_after_then_move(new ParamContent(param));
    }
    for (auto &param : vec->string_params) {
      vc_it.add_after_then_move(new ParamContent(param));
    }
    for (auto &param : vec->double_params) {
      vc_it.add_after_then_move(new ParamContent(param));
    }
  }

  // Count the # of entries starting with a specific prefix.
  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
    ParamContent *vc = vc_it.data();
    std::string tag;
    std::string tag2;
    std::string tag3;
    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
    // Short names yield identical prefixes at several levels; count each
    // entry only once per distinct prefix.
    ++amount[tag];
    if (tag2 != tag) {
      ++amount[tag2];
    }
    if (tag3 != tag && tag3 != tag2) {
      ++amount[tag3];
    }
  }

  vclist.sort(ParamContent::Compare); // Sort the list alphabetically.

  SVMenuNode *other = mr->AddChild("OTHER");

  // go through the list again and this time create the menu structure.
  vc_it.move_to_first();
  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
    ParamContent *vc = vc_it.data();
    std::string tag;
    std::string tag2;
    std::string tag3;
    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);

    const int tag_count = amount[tag];
    if (tag_count == 1) {
      other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
    } else { // More than one would use this submenu -> create submenu.
      SVMenuNode *sv = mr->AddChild(tag.c_str());
      if ((tag_count <= MAX_ITEMS_IN_SUBMENU) || (amount[tag2] <= 1)) {
        sv->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
      } else { // Make subsubmenus.
        SVMenuNode *sv2 = sv->AddChild(tag2.c_str());
        sv2->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
      }
    }
  }
  return mr;
}
// Event listener. Waits for SVET_POPUP events and processes them.
// The two "write config file" command ids trigger WriteParams with the
// event's parameter as the file name. Any other command id is looked up as a
// parameter id and the event's parameter becomes that parameter's new value.
// A command id that matches no registered parameter is reported in the
// window and otherwise ignored.
void ParamsEditor::Notify(const SVEvent *sve) {
  if (sve->type != SVET_POPUP) { // only catch SVET_POPUP!
    return;
  }
  char *param = sve->parameter;
  if (sve->command_id == writeCommands[0]) {
    WriteParams(param, false);
  } else if (sve->command_id == writeCommands[1]) {
    WriteParams(param, true);
  } else {
    ParamContent *vc = ParamContent::GetParamContentById(sve->command_id);
    if (vc == nullptr) {
      // No parameter is registered under this id; do not dereference.
      sv_window_->AddMessage("Unknown parameter id %d", static_cast<int>(sve->command_id));
      return;
    }
    vc->SetValue(param);
    sv_window_->AddMessage("Setting %s to %s", vc->GetName(), vc->GetValue().c_str());
  }
}
// Integrate the parameters editor as popupmenu into the existing scrollview
// window (usually the pg editor). If sv == null, create a new empty window
// and attach the parameters editor to that window (ugly).
ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {
  sv_window_ = (sv != nullptr) ? sv : new ScrollView("ParamEditorMAIN", 1, 1, 200, 200, 300, 200);

  // Only one event handler per window.
  // sv->AddEventHandler((SVEventHandler*) this);

  SVMenuNode *menu_root = BuildListOfAllLeaves(tess);

  // Default file name offered by both "write config" entries:
  // <datadir> + parameters dir + actual name.
  std::string default_config;
  default_config = tess->datadir;
  default_config.append(VARDIR).append("edited");

  // The write-command ids are placed just above the ids handed out so far.
  SVMenuNode *config_menu = menu_root->AddChild("Build Config File");
  writeCommands[0] = nrParams + 1;
  config_menu->AddChild("All Parameters", writeCommands[0], default_config.c_str(),
                        "Config file name?");
  writeCommands[1] = nrParams + 2;
  config_menu->AddChild("changed_ Parameters Only", writeCommands[1], default_config.c_str(),
                        "Config file name?");

  menu_root->BuildMenu(sv_window_, false);
}
// Write all (changed_) parameters to a config file.
void ParamsEditor::WriteParams(char *filename, bool changes_only) {
FILE *fp; // input file
char msg_str[255];
// if file exists
if ((fp = fopen(filename, "rb")) != nullptr) {
fclose(fp);
sprintf(msg_str,
"Overwrite file "
"%s"
"? (Y/N)",
filename);
int a = sv_window_->ShowYesNoDialog(msg_str);
if (a == 'n') {
return;
} // don't write
}
fp = fopen(filename, "wb"); // can we write to it?
if (fp == nullptr) {
sv_window_->AddMessage(
"Can't write to file "
"%s"
"",
filename);
return;
}
for (auto &iter : vcMap) {
ParamContent *cur = iter.second;
if (!changes_only || cur->HasChanged()) {
fprintf(fp, "%-25s %-12s # %s\n", cur->GetName(), cur->GetValue().c_str(),
cur->GetDescription());
}
}
fclose(fp);
}
} // namespace tesseract
#endif // !GRAPHICS_DISABLED

View File

@ -0,0 +1,130 @@
///////////////////////////////////////////////////////////////////////
// File: paramsd.h
// Description: Tesseract parameter editor
// Author: Joern Wanke
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//
// Tesseract parameter editor is used to edit all the parameters used
// within tesseract from the ui.
#ifndef TESSERACT_CCMAIN_PARAMSD_H_
#define TESSERACT_CCMAIN_PARAMSD_H_
#ifndef GRAPHICS_DISABLED
# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
# include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
namespace tesseract {
class SVMenuNode;
class BoolParam;
class DoubleParam;
class IntParam;
class StringParam;
class Tesseract;
// A list of all possible parameter types used.
// ParamContent stores one of these to record which member of its pointer
// union is the active one.
enum ParamType { VT_INTEGER, VT_BOOLEAN, VT_STRING, VT_DOUBLE };
// A rather hackish helper structure which can take any kind of parameter input
// (defined by ParamType) and do a couple of common operations on them, like
// comparison or getting its value. It is used in the context of the
// ParamsEditor as a bridge from the internal tesseract parameters to the
// ones displayed by the ScrollView server.
class ParamContent : public ELIST_LINK {
public:
// Compare two VC objects by their name.
// v1 and v2 are untyped pointers, as expected of a sort callback.
static int Compare(const void *v1, const void *v2);
// Gets a VC object identified by its ID.
static ParamContent *GetParamContentById(int id);
// Constructors for the various ParamTypes.
// NOTE: the default constructor initializes only changed_; my_id_,
// param_type_ and the pointer union have no initializers and stay
// indeterminate on a default-constructed object.
ParamContent() = default;
explicit ParamContent(tesseract::StringParam *it);
explicit ParamContent(tesseract::IntParam *it);
explicit ParamContent(tesseract::BoolParam *it);
explicit ParamContent(tesseract::DoubleParam *it);
// Getters and Setters.
// SetValue takes the new value as text; GetValue returns the current value
// as text.
void SetValue(const char *val);
std::string GetValue() const;
const char *GetName() const;
const char *GetDescription() const;
// Returns the unique ID of this VC object.
int GetId() const {
return my_id_;
}
// Returns whether the parameter has been marked as changed.
bool HasChanged() const {
return changed_;
}
private:
// The unique ID of this VC object.
int my_id_;
// Whether the parameter was changed and thus needs to be rewritten.
bool changed_ = false;
// The actual ParamType of this VC object.
ParamType param_type_;
// Pointer to the wrapped parameter; param_type_ tells which member is valid.
union {
tesseract::StringParam *sIt;
tesseract::IntParam *iIt;
tesseract::BoolParam *bIt;
tesseract::DoubleParam *dIt;
};
};
ELISTIZEH(ParamContent)
// The parameters editor enables the user to edit all the parameters used within
// tesseract. It can be invoked on its own, but is supposed to be invoked by
// the program editor.
class ParamsEditor : public SVEventHandler {
public:
// Integrate the parameters editor as popupmenu into the existing scrollview
// window (usually the pg editor). If sv == null, create a new
// empty window and attach the parameter editor to that window (ugly).
// The first argument is the Tesseract instance whose parameters are edited.
explicit ParamsEditor(tesseract::Tesseract *, ScrollView *sv = nullptr);
// Event listener. Waits for SVET_POPUP events and processes them.
void Notify(const SVEvent *sve) override;
private:
// Gets the up to the first 3 prefixes from s (split by _).
// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
// The three results are written through the level_* output pointers.
void GetPrefixes(const char *s, std::string *level_one, std::string *level_two, std::string *level_three);
// Gets the first n words (split by _) and puts them in t.
// For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
// NOTE(review): t is a raw output buffer with no size argument; the caller
// presumably guarantees it is large enough - confirm at the call sites.
void GetFirstWords(const char *s, // source string
int n, // number of words
char *t); // target string
// Find all editable parameters used within tesseract and create a
// SVMenuNode tree from it.
SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
// Write all (changed_) parameters to a config file.
// If changes_only is true, presumably only parameters flagged as changed are
// written - confirm in the implementation.
void WriteParams(char *filename, bool changes_only);
// The ScrollView window this editor is attached to.
ScrollView *sv_window_;
};
} // namespace tesseract
#endif // !GRAPHICS_DISABLED
#endif // TESSERACT_CCMAIN_PARAMSD_H_

View File

@ -0,0 +1,958 @@
/**********************************************************************
* File: pgedit.cpp (Formerly pgeditor.c)
* Description: Page structure file editor
* Author: Phil Cheatle
*
*(C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0(the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include "pgedit.h"
#include "blread.h"
#include "control.h"
#include "pageres.h"
#include "paramsd.h"
#include "scrollview.h"
#include "statistc.h"
#include "svmnode.h"
#include "tesseractclass.h"
#include "tordmain.h"
#include "werdit.h"
#include <cctype>
#include <cmath>
#ifndef GRAPHICS_DISABLED
namespace tesseract {
// Vertical positions of the four reference lines drawn by display_bln_lines(),
// expressed in terms of kBlnBaselineOffset and kBlnXHeight.
// ASC_HEIGHT: ascender limit, X_HEIGHT: x-height line, BL_HEIGHT: baseline,
// DESC_HEIGHT: descender limit.
# define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
# define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
# define BL_HEIGHT kBlnBaselineOffset
# define DESC_HEIGHT 0
// Command identifiers for the editor menu. build_menu_new() attaches these
// values to menu entries and process_cmd_win_event() dispatches on them.
// A subset (CHANGE_DISP, DUMP_WERD, SHOW_POINT, SHOW_BLN_WERD, DEBUG_WERD,
// RECOG_WERDS, RECOG_PSEUDO, SHOW_BLOB_FEATURES) is also stored in the
// file-level `mode` variable to select what process_image_event() does with a
// selection.
enum CMD_EVENTS {
NULL_CMD_EVENT,
CHANGE_DISP_CMD_EVENT,
DUMP_WERD_CMD_EVENT,
SHOW_POINT_CMD_EVENT,
SHOW_BLN_WERD_CMD_EVENT,
DEBUG_WERD_CMD_EVENT,
BLAMER_CMD_EVENT,
BOUNDING_BOX_CMD_EVENT,
CORRECT_TEXT_CMD_EVENT,
POLYGONAL_CMD_EVENT,
BL_NORM_CMD_EVENT,
BITMAP_CMD_EVENT,
IMAGE_CMD_EVENT,
BLOCKS_CMD_EVENT,
BASELINES_CMD_EVENT,
UNIFORM_DISP_CMD_EVENT,
REFRESH_CMD_EVENT,
QUIT_CMD_EVENT,
RECOG_WERDS,
RECOG_PSEUDO,
SHOW_BLOB_FEATURES,
SHOW_SUBSCRIPT_CMD_EVENT,
SHOW_SUPERSCRIPT_CMD_EVENT,
SHOW_ITALIC_CMD_EVENT,
SHOW_BOLD_CMD_EVENT,
SHOW_UNDERLINE_CMD_EVENT,
SHOW_FIXEDPITCH_CMD_EVENT,
SHOW_SERIF_CMD_EVENT,
SHOW_SMALLCAPS_CMD_EVENT,
SHOW_DROPCAPS_CMD_EVENT,
};
// Selects which word attribute word_display() highlights.
// CM_RAINBOW is the default; process_cmd_win_event() resets color_mode to it
// on every command and switches to one of the other values for the
// corresponding SHOW_*_CMD_EVENT. In any non-rainbow mode word_display()
// draws per-blob boxes, red where the attribute applies and green otherwise.
enum ColorationMode {
CM_RAINBOW,
CM_SUBSCRIPT,
CM_SUPERSCRIPT,
CM_ITALIC,
CM_BOLD,
CM_UNDERLINE,
CM_FIXEDPITCH,
CM_SERIF,
CM_SMALLCAPS,
CM_DROPCAPS
};
/*
*
* Some global data
*
*/
// Main editor window; (re)created by build_image_window().
static ScrollView *image_win;
// Parameters editor created in pgeditor_main(); receives SVET_POPUP events
// forwarded by PGEventHandler::Notify().
static ParamsEditor *pe;
// Set to true by pgeditor_main() and to false when an SVET_EXIT event arrives.
static bool stillRunning = false;
static ScrollView *bln_word_window = nullptr; // baseline norm words
static CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op
static bool recog_done = false; // recog_all_words was called
// These variables should remain global, since they are only used for the
// debug mode (in which only a single Tesseract thread/instance will exist).
// Bit set indexed by the DF_* display flags; copied onto each word by
// word_set_display().
static std::bitset<16> word_display_mode;
static ColorationMode color_mode = CM_RAINBOW;
// Toggles consulted by do_re_display().
static bool display_image = false;
static bool display_blocks = false;
static bool display_baselines = false;
// Page results currently shown in the editor; set by pgeditor_main().
static PAGE_RES *current_page_res = nullptr;
// Parameters for the editor image window.
STRING_VAR(editor_image_win_name, "EditorImage", "Editor image window name");
INT_VAR(editor_image_xpos, 590, "Editor image X Pos");
INT_VAR(editor_image_ypos, 10, "Editor image Y Pos");
static INT_VAR(editor_image_menuheight, 50, "Add to image height for menu bar");
INT_VAR(editor_image_word_bb_color, ScrollView::BLUE, "Word bounding box colour");
INT_VAR(editor_image_blob_bb_color, ScrollView::YELLOW, "Blob bounding box colour");
INT_VAR(editor_image_text_color, ScrollView::WHITE, "Correct text colour");
// Parameters for the editor debug window.
STRING_VAR(editor_dbwin_name, "EditorDBWin", "Editor debug window name");
INT_VAR(editor_dbwin_xpos, 50, "Editor debug window X Pos");
INT_VAR(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
INT_VAR(editor_dbwin_height, 24, "Editor debug window height");
INT_VAR(editor_dbwin_width, 80, "Editor debug window width");
// Parameters for the baseline-normalized word window created by
// bln_word_window_handle().
STRING_VAR(editor_word_name, "BlnWords", "BL normalized word window");
INT_VAR(editor_word_xpos, 60, "Word window X Pos");
INT_VAR(editor_word_ypos, 510, "Word window Y Pos");
INT_VAR(editor_word_height, 240, "Word window height");
INT_VAR(editor_word_width, 655, "Word window width");
/**
 * show_point()
 *
 * Show coords of point, blob bounding box, word bounding box and offset from
 * row baseline.
 *
 * Builds a single status line describing the point (x, y) and posts it to the
 * image window: the point itself, the baseline value of every row whose
 * bounding box contains it, and the bounding box of every word and blob that
 * contains it.
 *
 * @param page_res page results to search
 * @param x, y     coordinates of the point of interest
 */
static void show_point(PAGE_RES *page_res, float x, float y) {
  FCOORD pt(x, y);
  PAGE_RES_IT pr_it(page_res);

  // Each fragment is formatted with snprintf into a bounded scratch buffer
  // and appended to a growable string, so a point covered by many words or
  // blobs cannot overrun a fixed-size message buffer.
  char piece[128];
  std::string msg;
  snprintf(piece, sizeof(piece), "Pt:(%0.3f, %0.3f) ", x, y);
  msg += piece;

  for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
    // Report the baseline only when the iterator has just entered a new row.
    if (pr_it.row() != pr_it.prev_row() && pr_it.row()->row->bounding_box().contains(pt)) {
      snprintf(piece, sizeof(piece), "BL(x)=%0.3f ", pr_it.row()->row->base_line(x));
      msg += piece;
    }
    if (!word->word->bounding_box().contains(pt)) {
      continue;
    }
    TBOX box = word->word->bounding_box();
    snprintf(piece, sizeof(piece), "Wd(%d, %d)/(%d, %d) ", box.left(), box.bottom(), box.right(),
             box.top());
    msg += piece;
    C_BLOB_IT cblob_it(word->word->cblob_list());
    for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
      C_BLOB *cblob = cblob_it.data();
      box = cblob->bounding_box();
      if (box.contains(pt)) {
        snprintf(piece, sizeof(piece), "CBlb(%d, %d)/(%d, %d) ", box.left(), box.bottom(),
                 box.right(), box.top());
        msg += piece;
      }
    }
  }
  image_win->AddMessage(msg.c_str());
}
/**
 * pgeditor_msg()
 *
 * Post a message to the message area of the editor image window.
 *
 * @param msg text to display
 */
static void pgeditor_msg(const char *msg) {
  image_win->AddMessage(msg);
}
// Event handler attached to the baseline-normalized word window.
// A destroy event clears the cached window pointer so that
// bln_word_window_handle() builds a fresh window next time; a click reports
// the clicked point through show_point(). All other events are ignored.
class BlnEventHandler : public SVEventHandler {
public:
  void Notify(const SVEvent *sv_event) override {
    switch (sv_event->type) {
      case SVET_DESTROY:
        bln_word_window = nullptr;
        break;
      case SVET_CLICK:
        show_point(current_page_res, sv_event->x, sv_event->y);
        break;
      default:
        break;
    }
  }
};
/**
 * bln_word_window_handle()
 *
 * Lazily creates the baseline-normalized word window on first use and
 * registers a BlnEventHandler on it.
 *
 * @return a WINDOW for the word window, creating it if necessary
 */
static ScrollView *bln_word_window_handle() {
  // Already open: nothing to do.
  if (bln_word_window != nullptr) {
    return bln_word_window;
  }
  pgeditor_msg("Creating BLN word window...");
  bln_word_window = new ScrollView(editor_word_name.c_str(), editor_word_xpos, editor_word_ypos,
                                   editor_word_width, editor_word_height, 4000, 4000, true);
  bln_word_window->AddEventHandler(new BlnEventHandler());
  pgeditor_msg("Creating BLN word window...Done");
  return bln_word_window;
}
/**
* build_image_window()
*
* Destroy the existing image window if there is one. Work out how big the
* new window needs to be. Create it and re-display.
*/
static void build_image_window(int width, int height) {
delete image_win;
image_win = new ScrollView(editor_image_win_name.c_str(), editor_image_xpos, editor_image_ypos,
width + 1, height + editor_image_menuheight + 1, width, height, true);
}
/**
 * display_bln_lines()
 *
 * Display normalized baseline, x-height, ascender limit and descender limit
 * as four horizontal lines from minx to maxx, drawn in the given colour.
 * Each line sits at y_offset + scale_factor * <reference height>.
 */
static void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor,
                              float y_offset, float minx, float maxx) {
  // Draws one horizontal line at height y across [minx, maxx].
  const auto horizontal = [&](auto y) { window->Line(minx, y, maxx, y); };

  window->Pen(colour);
  horizontal(y_offset + scale_factor * DESC_HEIGHT);
  horizontal(y_offset + scale_factor * BL_HEIGHT);
  horizontal(y_offset + scale_factor * X_HEIGHT);
  horizontal(y_offset + scale_factor * ASC_HEIGHT);
}
/**
 * notify()
 *
 * Event handler that routes incoming events:
 *  - popup events go to the ParamsEditor,
 *  - exit events clear the running flag,
 *  - menu events go to process_cmd_win_event() with the toggle state encoded
 *    as 'T' ("true"), 'F' ("false") or '0' (anything else),
 *  - everything else goes to process_image_event().
 */
void PGEventHandler::Notify(const SVEvent *event) {
  switch (event->type) {
    case SVET_POPUP:
      // These are handled by ParamsEditor.
      pe->Notify(event);
      break;
    case SVET_EXIT:
      stillRunning = false;
      break;
    case SVET_MENU: {
      char toggle_state = '0';
      if (strcmp(event->parameter, "true") == 0) {
        toggle_state = 'T';
      } else if (strcmp(event->parameter, "false") == 0) {
        toggle_state = 'F';
      }
      tess_->process_cmd_win_event(event->command_id, &toggle_state);
      break;
    }
    default:
      tess_->process_image_event(*event);
      break;
  }
}
/**
 * build_menu_new()
 *
 * Construct the menu tree used by the command window: a root node with the
 * three sub-menus MODES, DISPLAY and OTHER. Entries are listed in tables and
 * added in table order; toggle entries carry their initial state.
 *
 * @return the newly allocated root node
 */
SVMenuNode *Tesseract::build_menu_new() {
  struct PlainEntry {
    const char *label;
    int command;
  };
  struct ToggleEntry {
    const char *label;
    int command;
    bool initial;
  };

  const PlainEntry mode_entries[] = {
      {"Change Display", CHANGE_DISP_CMD_EVENT},
      {"Dump Word", DUMP_WERD_CMD_EVENT},
      {"Show Point", SHOW_POINT_CMD_EVENT},
      {"Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT},
      {"Config Words", DEBUG_WERD_CMD_EVENT},
      {"Recog Words", RECOG_WERDS},
      {"Recog Blobs", RECOG_PSEUDO},
      {"Show Blob Features", SHOW_BLOB_FEATURES},
  };
  const ToggleEntry display_toggles[] = {
      {"Blamer", BLAMER_CMD_EVENT, false},
      {"Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false},
      {"Correct Text", CORRECT_TEXT_CMD_EVENT, false},
      {"Polygonal Approx", POLYGONAL_CMD_EVENT, false},
      {"Baseline Normalized", BL_NORM_CMD_EVENT, false},
      {"Edge Steps", BITMAP_CMD_EVENT, true},
  };
  const PlainEntry display_entries[] = {
      {"Subscripts", SHOW_SUBSCRIPT_CMD_EVENT},
      {"Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT},
      {"Italics", SHOW_ITALIC_CMD_EVENT},
      {"Bold", SHOW_BOLD_CMD_EVENT},
      {"Underline", SHOW_UNDERLINE_CMD_EVENT},
      {"FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT},
      {"Serifs", SHOW_SERIF_CMD_EVENT},
      {"SmallCaps", SHOW_SMALLCAPS_CMD_EVENT},
      {"DropCaps", SHOW_DROPCAPS_CMD_EVENT},
  };
  const ToggleEntry other_toggles[] = {
      {"Show Image", IMAGE_CMD_EVENT, false},
      {"ShowBlock Outlines", BLOCKS_CMD_EVENT, false},
      {"Show Baselines", BASELINES_CMD_EVENT, false},
  };
  const PlainEntry other_entries[] = {
      {"Uniform Display", UNIFORM_DISP_CMD_EVENT},
      {"Refresh Display", REFRESH_CMD_EVENT},
  };

  auto *root = new SVMenuNode();

  SVMenuNode *modes_menu = root->AddChild("MODES");
  for (const PlainEntry &entry : mode_entries) {
    modes_menu->AddChild(entry.label, entry.command);
  }

  SVMenuNode *display_menu = root->AddChild("DISPLAY");
  for (const ToggleEntry &entry : display_toggles) {
    display_menu->AddChild(entry.label, entry.command, entry.initial);
  }
  for (const PlainEntry &entry : display_entries) {
    display_menu->AddChild(entry.label, entry.command);
  }

  SVMenuNode *other_menu = root->AddChild("OTHER");
  other_menu->AddChild("Quit", QUIT_CMD_EVENT);
  for (const ToggleEntry &entry : other_toggles) {
    other_menu->AddChild(entry.label, entry.command, entry.initial);
  }
  for (const PlainEntry &entry : other_entries) {
    other_menu->AddChild(entry.label, entry.command);
  }

  return root;
}
/**
 * do_re_display()
 *
 * Redisplay page: clear the image window, optionally draw the binary image,
 * then call word_painter for every word on the current page. Row baselines
 * and block outlines are added when their display toggles are set, once per
 * row / block as the iterator enters it.
 *
 * @param word_painter member function invoked for each word
 */
void Tesseract::do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) {
  image_win->Clear();
  if (display_image) {
    image_win->Draw(pix_binary_, 0, 0);
  }
  image_win->Brush(ScrollView::NONE);

  int next_block_index = 1;
  PAGE_RES_IT page_it(current_page_res);
  for (WERD_RES *cursor = page_it.word(); cursor != nullptr; cursor = page_it.forward()) {
    (this->*word_painter)(&page_it);
    if (display_baselines) {
      if (page_it.row() != page_it.prev_row()) {
        page_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
      }
    }
    if (display_blocks) {
      if (page_it.block() != page_it.prev_block()) {
        page_it.block()->block->pdblk.plot(image_win, next_block_index++, ScrollView::RED);
      }
    }
  }
  image_win->Update();
}
/**
* pgeditor_main()
*
* Top level editor operation:
* Setup a new window and an according event handler
*
*/
void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
current_page_res = page_res;
if (current_page_res->block_res_list.empty()) {
return;
}
recog_done = false;
stillRunning = true;
build_image_window(width, height);
word_display_mode.set(DF_EDGE_STEP);
do_re_display(&tesseract::Tesseract::word_set_display);
# ifndef GRAPHICS_DISABLED
pe = new ParamsEditor(this, image_win);
# endif
PGEventHandler pgEventHandler(this);
image_win->AddEventHandler(&pgEventHandler);
image_win->AddMessageBox();
SVMenuNode *svMenuRoot = build_menu_new();
svMenuRoot->BuildMenu(image_win);
image_win->SetVisible(true);
image_win->AwaitEvent(SVET_DESTROY);
image_win->AddEventHandler(nullptr);
}
/**
* process_cmd_win_event()
*
* Process a command returned from the command window
* (Just call the appropriate command handler)
*/
bool Tesseract::process_cmd_win_event( // UI command semantics
int32_t cmd_event, // which menu item?
char *new_value // any prompt data
) {
char msg[160];
bool exit = false;
color_mode = CM_RAINBOW;
// Run recognition on the full page if needed.
switch (cmd_event) {
case BLAMER_CMD_EVENT:
case SHOW_SUBSCRIPT_CMD_EVENT:
case SHOW_SUPERSCRIPT_CMD_EVENT:
case SHOW_ITALIC_CMD_EVENT:
case SHOW_BOLD_CMD_EVENT:
case SHOW_UNDERLINE_CMD_EVENT:
case SHOW_FIXEDPITCH_CMD_EVENT:
case SHOW_SERIF_CMD_EVENT:
case SHOW_SMALLCAPS_CMD_EVENT:
case SHOW_DROPCAPS_CMD_EVENT:
if (!recog_done) {
recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
recog_done = true;
}
break;
default:
break;
}
char *parameter;
switch (cmd_event) {
case NULL_CMD_EVENT:
break;
case CHANGE_DISP_CMD_EVENT:
case DUMP_WERD_CMD_EVENT:
case SHOW_POINT_CMD_EVENT:
case SHOW_BLN_WERD_CMD_EVENT:
case RECOG_WERDS:
case RECOG_PSEUDO:
case SHOW_BLOB_FEATURES:
mode = static_cast<CMD_EVENTS>(cmd_event);
break;
case DEBUG_WERD_CMD_EVENT:
mode = DEBUG_WERD_CMD_EVENT;
parameter = image_win->ShowInputDialog("Config File Name");
word_config_ = parameter;
delete[] parameter;
break;
case BOUNDING_BOX_CMD_EVENT:
if (new_value[0] == 'T') {
word_display_mode.set(DF_BOX);
} else {
word_display_mode.reset(DF_BOX);
}
mode = CHANGE_DISP_CMD_EVENT;
break;
case BLAMER_CMD_EVENT:
if (new_value[0] == 'T') {
word_display_mode.set(DF_BLAMER);
} else {
word_display_mode.reset(DF_BLAMER);
}
do_re_display(&tesseract::Tesseract::word_display);
mode = CHANGE_DISP_CMD_EVENT;
break;
case CORRECT_TEXT_CMD_EVENT:
if (new_value[0] == 'T') {
word_display_mode.set(DF_TEXT);
} else {
word_display_mode.reset(DF_TEXT);
}
mode = CHANGE_DISP_CMD_EVENT;
break;
case POLYGONAL_CMD_EVENT:
if (new_value[0] == 'T') {
word_display_mode.set(DF_POLYGONAL);
} else {
word_display_mode.reset(DF_POLYGONAL);
}
mode = CHANGE_DISP_CMD_EVENT;
break;
case BL_NORM_CMD_EVENT:
if (new_value[0] == 'T') {
word_display_mode.set(DF_BN_POLYGONAL);
} else {
word_display_mode.reset(DF_BN_POLYGONAL);
}
mode = CHANGE_DISP_CMD_EVENT;
break;
case BITMAP_CMD_EVENT:
if (new_value[0] == 'T') {
word_display_mode.set(DF_EDGE_STEP);
} else {
word_display_mode.reset(DF_EDGE_STEP);
}
mode = CHANGE_DISP_CMD_EVENT;
break;
case UNIFORM_DISP_CMD_EVENT:
do_re_display(&tesseract::Tesseract::word_set_display);
break;
case IMAGE_CMD_EVENT:
display_image = (new_value[0] == 'T');
do_re_display(&tesseract::Tesseract::word_display);
break;
case BLOCKS_CMD_EVENT:
display_blocks = (new_value[0] == 'T');
do_re_display(&tesseract::Tesseract::word_display);
break;
case BASELINES_CMD_EVENT:
display_baselines = (new_value[0] == 'T');
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_SUBSCRIPT_CMD_EVENT:
color_mode = CM_SUBSCRIPT;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_SUPERSCRIPT_CMD_EVENT:
color_mode = CM_SUPERSCRIPT;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_ITALIC_CMD_EVENT:
color_mode = CM_ITALIC;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_BOLD_CMD_EVENT:
color_mode = CM_BOLD;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_UNDERLINE_CMD_EVENT:
color_mode = CM_UNDERLINE;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_FIXEDPITCH_CMD_EVENT:
color_mode = CM_FIXEDPITCH;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_SERIF_CMD_EVENT:
color_mode = CM_SERIF;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_SMALLCAPS_CMD_EVENT:
color_mode = CM_SMALLCAPS;
do_re_display(&tesseract::Tesseract::word_display);
break;
case SHOW_DROPCAPS_CMD_EVENT:
color_mode = CM_DROPCAPS;
do_re_display(&tesseract::Tesseract::word_display);
break;
case REFRESH_CMD_EVENT:
do_re_display(&tesseract::Tesseract::word_display);
break;
case QUIT_CMD_EVENT:
exit = true;
ScrollView::Exit();
break;
default:
snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
image_win->AddMessage(msg);
break;
}
return exit;
}
/**
 * process_image_event()
 *
 * User has done something in the image window. Only selection events are
 * acted on; every other event type is ignored.
 *
 * For a selection, the box spanned by (x, y) and (x + x_size, y + y_size) is
 * built and the operation defined by the current mode is applied to it. In
 * SHOW_POINT mode the point (x, y) is reported instead and the box is unused.
 *
 * @param event the event received from the image window
 */
void Tesseract::process_image_event(const SVEvent &event) {
  // The following variable should remain static, since it is used by
  // debug editor, which uses a single Tesseract instance.
  static ICOORD down;

  if (event.type != SVET_SELECTION) {
    return;
  }

  down.set_x(event.x + event.x_size);
  down.set_y(event.y + event.y_size);
  if (mode == SHOW_POINT_CMD_EVENT) {
    show_point(current_page_res, event.x, event.y);
  }

  ICOORD up;
  up.set_x(event.x);
  up.set_y(event.y);
  TBOX selection_box(down, up);

  switch (mode) {
    case CHANGE_DISP_CMD_EVENT:
      process_selected_words(current_page_res, selection_box,
                             &tesseract::Tesseract::word_blank_and_set_display);
      break;
    case DUMP_WERD_CMD_EVENT:
      process_selected_words(current_page_res, selection_box,
                             &tesseract::Tesseract::word_dumper);
      break;
    case SHOW_BLN_WERD_CMD_EVENT:
      process_selected_words(current_page_res, selection_box,
                             &tesseract::Tesseract::word_bln_display);
      break;
    case DEBUG_WERD_CMD_EVENT:
      debug_word(current_page_res, selection_box);
      break;
    case SHOW_POINT_CMD_EVENT:
      break; // already reported above
    case RECOG_WERDS:
#  ifndef DISABLED_LEGACY_ENGINE
      image_win->AddMessage("Recogging selected words");
      this->process_selected_words(current_page_res, selection_box,
                                   &Tesseract::recog_interactive);
#  endif // ndef DISABLED_LEGACY_ENGINE
      break;
    case RECOG_PSEUDO:
      image_win->AddMessage("Recogging selected blobs");
      recog_pseudo_word(current_page_res, selection_box);
      break;
    case SHOW_BLOB_FEATURES:
      blob_feature_display(current_page_res, selection_box);
      break;
    default: {
      char msg[80];
      // Bounded formatting; the enum is cast so that it matches %d.
      snprintf(msg, sizeof(msg), "Mode %d not yet implemented", static_cast<int>(mode));
      image_win->AddMessage(msg);
      break;
    }
  }
}
/**
 * debug_word
 *
 * Process the whole image, but load word_config_ for the selected word(s).
 * When the legacy engine is compiled in, the adaptive classifier is reset
 * first.
 */
void Tesseract::debug_word(PAGE_RES *page_res, const TBOX &selection_box) {
#  ifndef DISABLED_LEGACY_ENGINE
  ResetAdaptiveClassifier();
#  endif
  const char *config_path = word_config_.c_str();
  recog_all_words(page_res, nullptr, &selection_box, config_path, 0);
}
/**********************************************************************
* WERD PROCESSOR FUNCTIONS
* ========================
*
* These routines are invoked by one or more of:
* process_all_words()
* process_selected_words()
* or
* process_all_words_it()
* process_selected_words_it()
* for each word to be processed
**********************************************************************/
/**
 * word_blank_and_set_display() Word processor
 *
 * Paint the word's bounding box black to blank it, then redisplay the word
 * according to the current display mode settings.
 *
 * @return the result of word_set_display()
 */
bool Tesseract::word_blank_and_set_display(PAGE_RES_IT *pr_it) {
  TBOX word_box = pr_it->word()->word->bounding_box();
  word_box.plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
  return word_set_display(pr_it);
}
/**
* word_bln_display()
*
* Normalize word and display in word window
*
* Sets up the word for recognition if it has no chopped_word yet, clears the
* baseline-normalized word window, draws the four reference lines and then
* plots each blob of the word in a cycling colour.
*
* @return always true
*/
bool Tesseract::word_bln_display(PAGE_RES_IT *pr_it) {
WERD_RES *word_res = pr_it->word();
if (word_res->chopped_word == nullptr) {
// Setup word normalization parameters.
word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
classify_bln_numeric_mode, textord_use_cjk_fp_model,
poly_allow_detailed_fx, pr_it->row()->row, pr_it->block()->block);
}
// The window handle is looked up on every use rather than cached;
// bln_word_window_handle() recreates the window if it has been closed.
bln_word_window_handle()->Clear();
display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, 1.0, 0.0f, -1000.0f, 1000.0f);
C_BLOB_IT it(word_res->word->cblob_list());
// Each blob gets the next colour in the WERD::NextColor sequence.
ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN, bln_word_window_handle());
color = WERD::NextColor(color);
}
bln_word_window_handle()->Update();
return true;
}
/**
 * word_display()  Word Processor
 *
 * Display a word according to its display modes.
 *
 * Two paths:
 *  - Attribute colouring (color_mode != CM_RAINBOW and the word has a
 *    box_word): every blob box is drawn red if the selected attribute applies
 *    and green otherwise. Returns false if the word has no font info or if
 *    the legacy engine is disabled.
 *  - Normal display: draws whatever the word's DF_* display flags request
 *    (bounding boxes, edge steps, polygonal approximation, correct text,
 *    blamer information). If nothing was drawn, the word bounding box is
 *    drawn anyway.
 *
 * @param pr_it iterator positioned on the word to display
 * @return true if the word was drawn, false as described above
 */
bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
  WERD_RES *word_res = pr_it->word();
  WERD *word = word_res->word;
  bool displayed_something = false;

  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
#  ifndef DISABLED_LEGACY_ENGINE
    BoxWord *box_word = word_res->box_word;
    WERD_CHOICE *best_choice = word_res->best_choice;
    // best_choice is treated as nullable further down in this function, so it
    // must not be dereferenced unchecked here; without a choice, the
    // position-based attributes simply never match.
    const bool have_choice = best_choice != nullptr;
    int length = box_word->length();
    if (word_res->fontinfo == nullptr) {
      return false;
    }
    const FontInfo &font_info = *word_res->fontinfo;
    for (int i = 0; i < length; ++i) {
      ScrollView::Color color = ScrollView::GREEN;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          if (have_choice && best_choice->BlobPosition(i) == SP_SUBSCRIPT) {
            color = ScrollView::RED;
          }
          break;
        case CM_SUPERSCRIPT:
          if (have_choice && best_choice->BlobPosition(i) == SP_SUPERSCRIPT) {
            color = ScrollView::RED;
          }
          break;
        case CM_ITALIC:
          if (font_info.is_italic()) {
            color = ScrollView::RED;
          }
          break;
        case CM_BOLD:
          if (font_info.is_bold()) {
            color = ScrollView::RED;
          }
          break;
        case CM_FIXEDPITCH:
          if (font_info.is_fixed_pitch()) {
            color = ScrollView::RED;
          }
          break;
        case CM_SERIF:
          if (font_info.is_serif()) {
            color = ScrollView::RED;
          }
          break;
        case CM_SMALLCAPS:
          if (word_res->small_caps) {
            color = ScrollView::RED;
          }
          break;
        case CM_DROPCAPS:
          if (have_choice && best_choice->BlobPosition(i) == SP_DROPCAP) {
            color = ScrollView::RED;
          }
          break;
        // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(color);
      TBOX box = box_word->BlobBox(i);
      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
    }
    return true;
#  else
    return false;
#  endif // ndef DISABLED_LEGACY_ENGINE
  }

  /*
    Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
    etc. are to keep the compiler happy.
  */
  // display bounding box
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(image_win,
                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));

    auto c = static_cast<ScrollView::Color>((int32_t)editor_image_blob_bb_color);
    image_win->Pen(c);
    // cblob iterator
    C_BLOB_IT c_it(word->cblob_list());
    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
      c_it.data()->bounding_box().plot(image_win);
    }
    displayed_something = true;
  }

  // display edge steps
  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
    word->plot(image_win);                // rainbow colors
    displayed_something = true;
  }

  // display poly approx
  if (word->display_flag(DF_POLYGONAL)) {
    // need to convert
    TWERD *tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
    tword->plot(image_win);
    delete tword;
    displayed_something = true;
  }

  // Display correct text and blamer information.
  std::string text;
  std::string blame;
  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
    text = word->text();
  }
  // Blamer output replaces the correct text unless the blamer says the result
  // is correct: "<truth> -> <best choice>" plus the reason in brackets.
  if (word->display_flag(DF_BLAMER) &&
      !(word_res->blamer_bundle != nullptr &&
        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
    text = "";
    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
    if (blamer_bundle == nullptr) {
      text += "NULL";
    } else {
      text = blamer_bundle->TruthString();
    }
    text += " -> ";
    std::string best_choice_str;
    if (word_res->best_choice == nullptr) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
    }
    text += best_choice_str;
    IncorrectResultReason reason =
        (blamer_bundle == nullptr) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  if (text.length() > 0) {
    const TBOX word_bb = word->bounding_box(); // word bounding box
    image_win->Pen(ScrollView::RED);
    const int word_height = word_bb.height(); // ht of word BB
    // Text is half the word height, capped at 20.
    int text_height = 0.50 * word_height;
    if (text_height > 20) {
      text_height = 20;
    }
    image_win->TextAttributes("Arial", text_height, false, false, false);
    // Horizontal shift from bottom left, applied only to words wider than tall.
    const float shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str());
    if (blame.length() > 0) {
      image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.c_str());
    }
    displayed_something = true;
  }

  if (!displayed_something) { // display BBox anyway
    word->bounding_box().plot(image_win,
                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
                              static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
  }
  return true;
}
} // namespace tesseract
#endif // !GRAPHICS_DISABLED
namespace tesseract {
/**
 * word_dumper()
 *
 * Print the block (if present), row and word under the iterator through
 * tprintf, followed by the blamer debug string when wordrec_debug_blamer is
 * set and the blamer does not report a correct result.
 *
 * @return always true
 */
bool Tesseract::word_dumper(PAGE_RES_IT *pr_it) {
  auto *block = pr_it->block()->block;
  if (block != nullptr) {
    tprintf("\nBlock data...\n");
    block->print(nullptr, false);
  }

  tprintf("\nRow data...\n");
  pr_it->row()->row->print(nullptr);

  tprintf("\nWord data...\n");
  WERD_RES *current = pr_it->word();
  current->word->print();

  auto *blamer = current->blamer_bundle;
  const bool dump_blamer = blamer != nullptr && wordrec_debug_blamer &&
                           blamer->incorrect_result_reason() != IRR_CORRECT;
  if (dump_blamer) {
    tprintf("Current blamer debug: %s\n", blamer->debug().c_str());
  }
  return true;
}
#ifndef GRAPHICS_DISABLED
/**
 * word_set_display() Word processor
 *
 * Copy the current global display mode bits onto the word's display flags,
 * then display the word.
 *
 * @return the result of word_display()
 */
bool Tesseract::word_set_display(PAGE_RES_IT *pr_it) {
  WERD *target = pr_it->word()->word;
  // Flags mirrored from word_display_mode, in the order they are applied.
  const decltype(DF_BOX) mirrored_flags[] = {DF_BOX,       DF_TEXT,         DF_POLYGONAL,
                                             DF_EDGE_STEP, DF_BN_POLYGONAL, DF_BLAMER};
  for (const auto flag : mirrored_flags) {
    target->set_display_flag(flag, word_display_mode[flag]);
  }
  return word_display(pr_it);
}
// Builds a pseudo word from the blobs inside selection_box, extracts the
// features of its first blob and shows the baseline-normalized and
// character-normalized feature sets in two separate windows. The pseudo word
// is deleted again afterwards. Does nothing if no pseudo word can be made or
// if the legacy engine is disabled.
// page_res is non-const because the iterator doesn't know if you are going
// to change the items it points to! Really a const here though.
void Tesseract::blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box) {
#  ifndef DISABLED_LEGACY_ENGINE
  PAGE_RES_IT *pseudo_it = make_pseudo_word(page_res, selection_box);
  if (pseudo_it == nullptr) {
    return;
  }

  WERD_RES *pseudo_word = pseudo_it->word();
  pseudo_word->x_height = pseudo_it->row()->row->x_height();
  pseudo_word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
                                   classify_bln_numeric_mode, textord_use_cjk_fp_model,
                                   poly_allow_detailed_fx, pseudo_it->row()->row,
                                   pseudo_it->block()->block);
  TBLOB *first_blob = pseudo_word->chopped_word->blobs[0];

  INT_FX_RESULT_STRUCT fx_info;
  std::vector<INT_FEATURE_STRUCT> bl_features;
  std::vector<INT_FEATURE_STRUCT> cn_features;
  Classify::ExtractFeatures(*first_blob, classify_nonlinear_norm, &bl_features, &cn_features,
                            &fx_info, nullptr);

  // Opens a feature-space window, clears it for the given normalization and
  // renders every feature in green.
  const auto show_features = [&](const char *title, auto norm,
                                 std::vector<INT_FEATURE_STRUCT> &features) {
    ScrollView *win = CreateFeatureSpaceWindow(title, 512, 0);
    ClearFeatureSpaceWindow(norm, win);
    for (auto &feature : features) {
      RenderIntFeature(win, &feature, ScrollView::GREEN);
    }
    win->Update();
  };
  // Display baseline features.
  show_features("BL Features", baseline, bl_features);
  // Display cn features.
  show_features("CN Features", character, cn_features);

  pseudo_it->DeleteCurrentWord();
  delete pseudo_it;
#  endif // ndef DISABLED_LEGACY_ENGINE
}
#endif // !GRAPHICS_DISABLED
} // namespace tesseract

View File

@ -0,0 +1,68 @@
///////////////////////////////////////////////////////////////////////
// File: pgedit.h
// Description: Page structure file editor
// Author: Joern Wanke
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef PGEDIT_H
#define PGEDIT_H
#include "params.h" // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam
#include "scrollview.h" // for SVEvent (ptr only), SVEventHandler, ScrollView
namespace tesseract {
class BLOCK_LIST;
class PAGE_RES;
class Tesseract;
#ifndef GRAPHICS_DISABLED
// A small event handler class to process incoming events to
// this window.
// It keeps a pointer to the Tesseract instance whose editor entry points are
// called when events arrive; the pointer is stored but not owned.
class PGEventHandler : public SVEventHandler {
public:
// NOTE(review): this single-argument constructor is not marked explicit,
// unlike ParamsEditor's - confirm whether implicit conversion is intended.
PGEventHandler(tesseract::Tesseract *tess) : tess_(tess) {}
// Called for every event; dispatches according to the event type.
void Notify(const SVEvent *sve) override;
private:
// Tesseract instance that handles the events.
tesseract::Tesseract *tess_;
};
#endif // !GRAPHICS_DISABLED
// Declarations of the page editor globals and parameters.
// NOTE(review): current_block_list, editor_image_height, editor_image_width
// and editor_smd_scale_factor have no matching definition in the part of
// pgedit.cpp reviewed here - confirm they are defined elsewhere or remove
// the stale declarations.
extern BLOCK_LIST *current_block_list;
// Editor image window parameters.
extern STRING_VAR_H(editor_image_win_name, "EditorImage", "Editor image window name");
extern INT_VAR_H(editor_image_xpos, 590, "Editor image X Pos");
extern INT_VAR_H(editor_image_ypos, 10, "Editor image Y Pos");
extern INT_VAR_H(editor_image_height, 680, "Editor image height");
extern INT_VAR_H(editor_image_width, 655, "Editor image width");
extern INT_VAR_H(editor_image_word_bb_color, BLUE, "Word bounding box colour");
extern INT_VAR_H(editor_image_blob_bb_color, YELLOW, "Blob bounding box colour");
extern INT_VAR_H(editor_image_text_color, WHITE, "Correct text colour");
// Editor debug window parameters.
extern STRING_VAR_H(editor_dbwin_name, "EditorDBWin", "Editor debug window name");
extern INT_VAR_H(editor_dbwin_xpos, 50, "Editor debug window X Pos");
extern INT_VAR_H(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
extern INT_VAR_H(editor_dbwin_height, 24, "Editor debug window height");
extern INT_VAR_H(editor_dbwin_width, 80, "Editor debug window width");
// Baseline-normalized word window parameters.
extern STRING_VAR_H(editor_word_name, "BlnWords", "BL normalised word window");
extern INT_VAR_H(editor_word_xpos, 60, "Word window X Pos");
extern INT_VAR_H(editor_word_ypos, 510, "Word window Y Pos");
extern INT_VAR_H(editor_word_height, 240, "Word window height");
extern INT_VAR_H(editor_word_width, 655, "Word window width");
extern double_VAR_H(editor_smd_scale_factor, 1.0, "Scaling for smd image");
} // namespace tesseract
#endif

View File

@ -0,0 +1,228 @@
///////////////////////////////////////////////////////////////////////
// File: recogtraining.cpp
// Description: Functions for ambiguity and parameter training.
// Author: Daria Antonova
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "tesseractclass.h"
#include "boxread.h"
#include "control.h"
#include "host.h" // for NearlyEqual
#include "ratngs.h"
#ifndef DISABLED_LEGACY_ENGINE
# include "reject.h"
#endif
#include "stopper.h"
namespace tesseract {
// Tolerance passed to NearlyEqual() when an edge of a Tesseract-identified
// word box is compared with the same edge of a box read from the box file.
const int16_t kMaxBoxEdgeDiff = 2;
// Sets flags necessary for recognition in the training mode.
// Opens and returns the pointer to the output file.
//
// The output file name is `filename` with everything from its last '.'
// onwards replaced by ".txt" (or with ".txt" appended when there is no '.').
// The file is opened in "a+" mode. If it cannot be opened an error is
// printed and ASSERT_HOST is triggered.
FILE *Tesseract::init_recog_training(const char *filename) {
  if (tessedit_ambigs_training) {
    tessedit_tess_adaption_mode.set_value(0);  // turn off adaption
    tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
    // Explore all segmentations.
    getDict().stopper_no_acceptable_choices.set_value(true);
  }

  std::string output_fname = filename;
  // Remove the extension by actually shortening the string. Storing '\0'
  // into a std::string leaves its length unchanged, so a suffix appended
  // afterwards would sit behind the embedded NUL and be invisible to
  // fopen(), which stops reading the name at the first NUL.
  const auto lastdot = output_fname.rfind('.');
  if (lastdot != std::string::npos) {
    output_fname.erase(lastdot);
  }
  output_fname += ".txt";

  FILE *output_file = fopen(output_fname.c_str(), "a+");
  if (output_file == nullptr) {
    tprintf("Error: Could not open file %s\n", output_fname.c_str());
    ASSERT_HOST(output_file);
  }
  return output_file;
}
// Steps page_res_it forward while it still has a block but no word, then
// copies the bounding box of the word it stopped on into *tbox.
// Returns false if the iterator has no word to offer, true otherwise.
static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
    page_res_it->forward();
  }
  if (page_res_it->word() == nullptr) {
    return false;
  }

  *tbox = page_res_it->word()->word->bounding_box();
  // A negative left edge means the training image has vertical text, with
  // all page_res bounding boxes rotated by 90 degrees counterclockwise.
  // Rotate the box back so it can be compared with the box-file boxes.
  const bool rotated = tbox->left() < 0;
  if (rotated) {
    tbox->rotate(FCOORD(0.0, -1.0));
  }
  return true;
}
// This function takes tif/box pair of files and runs recognition on the image,
// while making sure that the word bounds that tesseract identified roughly
// match to those specified by the input box file. For each word (ngram in a
// single bounding box from the input box file) it outputs the ocred result,
// the correct label, rating and certainty.
//
// filename    - image file name; the box file name is derived from it by
//               replacing everything from the last '.' onwards with ".box".
// page_res    - page results whose words are matched against the box file.
// monitor     - not referenced in this function body.
// output_file - forwarded to ambigs_classify_and_output() for each match.
void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
                                         volatile ETEXT_DESC *monitor, FILE *output_file) {
  std::string box_fname = filename;
  // Shorten the string to drop the extension. Storing '\0' into a
  // std::string keeps its length, so ".box" would end up behind an embedded
  // NUL and fopen() would be handed the bare base name instead.
  const auto lastdot = box_fname.rfind('.');
  if (lastdot != std::string::npos) {
    box_fname.erase(lastdot);
  }
  box_fname += ".box";
  // ReadNextBox() will close box_file
  FILE *box_file = fopen(box_fname.c_str(), "r");
  if (box_file == nullptr) {
    tprintf("Error: Could not open file %s\n", box_fname.c_str());
    ASSERT_HOST(box_file);
  }

  PAGE_RES_IT page_res_it;
  page_res_it.page_res = page_res;
  page_res_it.restart_page();
  std::string label;

  // Process all the words on this page.
  TBOX tbox; // tesseract-identified box
  TBOX bbox; // box from the box file
  bool keep_going;
  int line_number = 0;
  int examined_words = 0;
  do {
    keep_going = read_t(&page_res_it, &tbox);
    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
    // Align bottom left points of the TBOXes.
    while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
      if (bbox.bottom() < tbox.bottom()) {
        page_res_it.forward();
        keep_going = read_t(&page_res_it, &tbox);
      } else {
        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
      }
    }
    while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
      if (bbox.left() > tbox.left()) {
        page_res_it.forward();
        keep_going = read_t(&page_res_it, &tbox);
      } else {
        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
      }
    }
    // OCR the word if top right points of the TBOXes are similar.
    if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
      examined_words++;
    }
    page_res_it.forward();
  } while (keep_going);

  // Set up scripts on all of the words that did not get sent to
  // ambigs_classify_and_output. They all should have, but if all the
  // werd_res's don't get uch_sets, tesseract will crash when you try
  // to iterate over them. :-(
  int total_words = 0;
  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
    if (page_res_it.word()) {
      if (page_res_it.word()->uch_set == nullptr) {
        page_res_it.word()->SetupFake(unicharset);
      }
      total_words++;
    }
  }
  // Warn when fewer than 85% of the words on the page were examined.
  if (examined_words < 0.85 * total_words) {
    tprintf(
        "TODO(antonova): clean up recog_training_segmented; "
        " It examined only a small fraction of the ambigs image.\n");
  }
  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
}
// Helper prints the given set of blob choices.
static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
const char *label, FILE *output_file) {
float rating = 0.0f;
float certainty = 0.0f;
for (int i = 0; i < length; ++i) {
const BLOB_CHOICE *blob_choice = blob_choices[i];
fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
rating += blob_choice->rating();
if (certainty > blob_choice->certainty()) {
certainty = blob_choice->certainty();
}
}
fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
}
// Helper recursively prints all paths through the ratings matrix, starting
// at column col.
static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
const char *label, FILE *output_file) {
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
if (ratings.get(col, row) != NOT_CLASSIFIED) {
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
blob_choices[length] = bc_it.data();
if (row + 1 < dim) {
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
output_file);
} else {
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
}
}
}
}
}
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
// raw choice as a result of the classification. For words labeled with a
// single unichar also outputs all alternatives from blob_choices of the
// best choice.
void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
FILE *output_file) {
// Classify word.
fflush(stdout);
WordData word_data(*pr_it);
SetupWordPassN(1, &word_data);
classify_word_and_language(1, pr_it, &word_data);
WERD_RES *werd_res = word_data.word;
WERD_CHOICE *best_choice = werd_res->best_choice;
ASSERT_HOST(best_choice != nullptr);
// Compute the number of unichars in the label.
std::vector<UNICHAR_ID> encoding;
if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
tprintf("Not outputting illegal unichar %s\n", label);
return;
}
// Dump all paths through the ratings matrix (which is normally small).
int dim = werd_res->ratings->dimension();
const auto **blob_choices = new const BLOB_CHOICE *[dim];
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
delete[] blob_choices;
}
} // namespace tesseract

View File

@ -0,0 +1,785 @@
/**********************************************************************
* File: reject.cpp (Formerly reject.c)
* Description: Rejection functions used in tessedit
* Author: Phil Cheatle
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include "reject.h"
#ifdef DISABLED_LEGACY_ENGINE
# include "tesseractclass.h"
namespace tesseract {
int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
const WERD_CHOICE &word = *werd_res->best_choice;
int dict_word_type = werd_res->tesseract->dict_word(word);
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
}
} // namespace tesseract
#else
# include "control.h"
# include "docqual.h"
# include "tesseractclass.h"
# include "tessvars.h"
# include "helpers.h"
# include <algorithm> // for std::sort
# include <cctype>
# include <cerrno>
# include <cstring>
# include <vector> // for std::vector
namespace tesseract {
/*************************************************************************
 * set_done()
 *
 * Decide whether the word counts as done. It starts as done when tess
 * accepted it and its text contains no blank. It is then un-done if, on
 * pass 1, a non-dictionary or ambiguous word has a 1/I/l conflict, or if
 * the word is ambiguous or is neither a dictionary nor a number word.
 *************************************************************************/
void Tesseract::set_done(WERD_RES *word, int16_t pass) {
  WERD_CHOICE *choice = word->best_choice;

  word->done = word->tess_accepted && (strchr(choice->unichar_string().c_str(), ' ') == nullptr);

  const bool ambiguous = choice->dangerous_ambig_found();
  const bool from_dict = choice->permuter() == SYSTEM_DAWG_PERM ||
                         choice->permuter() == FREQ_DAWG_PERM ||
                         choice->permuter() == USER_DAWG_PERM;
  const bool suspicious = !from_dict || ambiguous;

  if (word->done && (pass == 1) && suspicious && one_ell_conflict(word, false)) {
    if (tessedit_rejection_debug) {
      tprintf("one_ell_conflict detected\n");
    }
    word->done = false;
  }

  if (word->done) {
    const bool not_dict_or_number = !from_dict && word->best_choice->permuter() != NUMBER_PERM;
    if (not_dict_or_number || ambiguous) {
      if (tessedit_rejection_debug) {
        tprintf("non-dict or ambig word detected\n");
      }
      word->done = false;
    }
  }

  if (tessedit_rejection_debug) {
    tprintf("set_done(): done=%d\n", word->done);
    word->best_choice->print("");
  }
}
/*************************************************************************
* make_reject_map()
*
* Sets the done flag to indicate whether the result is acceptable.
*
* Sets a reject map for the word.
*
* word - word whose done flag and reject_map are updated.
* row  - not referenced in this function body.
* pass - forwarded to set_done().
*
* Only tessedit_reject_mode 0 and 5 are handled; any other value prints an
* error and trips ASSERT_HOST.
*************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
int i;
int offset;
// Apply the 0/O flips first so everything below sees the adjusted choice.
flip_0O(word);
check_debug_pt(word, -1); // For trap only
set_done(word, pass); // Set acceptance
// One reject map entry per unichar of the best choice.
word->reject_map.initialise(word->best_choice->unichar_lengths().length());
reject_blanks(word);
/*
0: Rays original heuristic - the baseline
*/
if (tessedit_reject_mode == 0) {
if (!word->done) {
reject_poor_matches(word);
}
} else if (tessedit_reject_mode == 5) {
/*
5: Reject I/1/l from words where there is no strong contextual confirmation;
the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
and the whole of any words which are very small
*/
if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
word->reject_map.rej_word_small_xht();
} else {
// Called for its effect on the reject map; the return value is ignored.
one_ell_conflict(word, true);
/*
Originally the code here just used the done flag. Now I have duplicated
and unpacked the conditions for setting the done flag so that each
mechanism can be turned on or off independently. This works WITHOUT
affecting the done flag setting.
*/
if (rej_use_tess_accepted && !word->tess_accepted) {
word->reject_map.rej_word_not_tess_accepted();
}
if (rej_use_tess_blanks &&
(strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
word->reject_map.rej_word_contains_blanks();
}
WERD_CHOICE *best_choice = word->best_choice;
if (rej_use_good_perm) {
// Dictionary permuters pass, optionally only if the string is also an
// acceptable word; number permuter words may have their alphas rejected;
// everything else is rejected as a whole.
if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
best_choice->permuter() == FREQ_DAWG_PERM ||
best_choice->permuter() == USER_DAWG_PERM) &&
(!rej_use_sensible_wd ||
acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
// PASSED TEST
} else if (best_choice->permuter() == NUMBER_PERM) {
if (rej_alphas_in_number_perm) {
// i counts unichars, offset is the byte position of unichar i.
for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
offset += best_choice->unichar_lengths()[i++]) {
if (word->reject_map[i].accepted() &&
word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
best_choice->unichar_lengths()[i])) {
word->reject_map[i].setrej_bad_permuter();
}
// rej alpha
}
}
} else {
word->reject_map.rej_word_bad_permuter();
}
}
/* Ambig word rejection was here once !!*/
}
} else {
tprintf("BAD tessedit_reject_mode\n");
// The comparison of a string literal with nullptr is always false, so this
// assertion always fails; the literal only carries the message.
ASSERT_HOST("Fatal error encountered!" == nullptr);
}
if (tessedit_image_border > -1) {
reject_edge_blobs(word);
}
check_debug_pt(word, 10);
if (tessedit_rejection_debug) {
tprintf("Permuter Type = %d\n", word->best_choice->permuter());
tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
word->best_choice->rating());
tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
}
flip_hyphens(word);
check_debug_pt(word, 20);
}
// Marks every unichar of the best choice that starts with a space as a
// tess failure in the word's reject map.
void reject_blanks(WERD_RES *word) {
  WERD_CHOICE *choice = word->best_choice;
  int16_t blob = 0;     // unichar index
  int16_t byte_pos = 0; // byte position of that unichar in the string
  while (choice->unichar_string()[byte_pos] != '\0') {
    const bool is_blank = choice->unichar_string()[byte_pos] == ' ';
    if (is_blank) {
      // rej unrecognised blobs
      word->reject_map[blob].setrej_tess_failure();
    }
    byte_pos += choice->unichar_lengths()[blob];
    blob += 1;
  }
}
// Flags every unichar of the best choice whose first byte is in
// conflict_set_I_l_1 as a 1/I/l conflict in the word's reject map.
void Tesseract::reject_I_1_L(WERD_RES *word) {
  WERD_CHOICE *choice = word->best_choice;
  int16_t blob = 0;     // unichar index
  int16_t byte_pos = 0; // byte position of that unichar in the string
  while (choice->unichar_string()[byte_pos] != '\0') {
    if (conflict_set_I_l_1.contains(choice->unichar_string()[byte_pos])) {
      // rej 1Il conflict
      word->reject_map[blob].setrej_1Il_conflict();
    }
    byte_pos += choice->unichar_lengths()[blob];
    blob += 1;
  }
}
// Rejects individual unichars of the best choice: spaces are marked as tess
// failures, and any other unichar whose certainty falls below the threshold
// from compute_reject_threshold() is marked as a poor match.
void reject_poor_matches(WERD_RES *word) {
  const float cutoff = compute_reject_threshold(word->best_choice);
  for (int pos = 0; pos < word->best_choice->length(); ++pos) {
    if (word->best_choice->unichar_id(pos) == UNICHAR_SPACE) {
      word->reject_map[pos].setrej_tess_failure();
      continue;
    }
    if (word->best_choice->certainty(pos) < cutoff) {
      word->reject_map[pos].setrej_poor_match();
    }
  }
}
/**********************************************************************
 * compute_reject_threshold
 *
 * Set a rejection threshold for this word.
 * Initially this is a trivial function which looks for the largest
 * gap in the certainty value.
 *
 * The per-blob certainties are sorted. With three or more blobs the
 * threshold is the midpoint of the widest gap between neighbouring
 * certainties. With one or two blobs (or no positive gap) it is the lowest
 * certainty minus one. A word without blobs yields 0.
 **********************************************************************/
float compute_reject_threshold(WERD_CHOICE *word) {
  const int blob_count = word->length();
  if (blob_count <= 0) {
    // No certainties to inspect; reading the first sorted entry below would
    // index an empty vector.
    return 0.0f;
  }

  std::vector<float> ratings;
  ratings.reserve(blob_count);
  for (int i = 0; i < blob_count; ++i) {
    ratings.push_back(word->certainty(i));
  }
  std::sort(ratings.begin(), ratings.end());

  float bestgap = 0.0f;              // biggest gap
  float gapstart = ratings[0] - 1;   // bottom of gap; default sits below every certainty
  if (blob_count >= 3) {
    for (int index = 0; index < blob_count - 1; index++) {
      const float gap = ratings[index + 1] - ratings[index];
      if (gap > bestgap) {
        // find biggest
        bestgap = gap;
        gapstart = ratings[index];
      }
    }
  }
  return gapstart + bestgap / 2;
}
/*************************************************************************
 * reject_edge_blobs()
 *
 * When the word's bounding box comes within tessedit_image_border of any
 * image edge, mark each blob that is itself that close to an edge as an
 * edge character, since it could be clipped.
 *************************************************************************/
void Tesseract::reject_edge_blobs(WERD_RES *word) {
  TBOX word_box = word->word->bounding_box();
  // Use the box_word as it is already denormed back to image coordinates.
  int blobcount = word->box_word->length();

  const bool word_clear_of_edges =
      !(word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
        word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
        word_box.top() + tessedit_image_border > ImageHeight() - 1);
  if (word_clear_of_edges) {
    return;
  }

  ASSERT_HOST(word->reject_map.length() == blobcount);
  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
    TBOX blob_box = word->box_word->BlobBox(blobindex);
    const bool blob_near_edge =
        blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
        blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
        blob_box.top() + tessedit_image_border > ImageHeight() - 1;
    if (!blob_near_edge) {
      continue;
    }
    // Close to edge
    word->reject_map[blobindex].setrej_edge_char();
  }
}
/**********************************************************************
* one_ell_conflict()
*
* Identify words where there is a potential I/l/1 error.
* - A bundle of contextual heuristics!
*
* word_res   - word to examine.
* update_map - when true, the affected entries of word_res->reject_map are
*              marked as 1Il conflicts; when false the map is left alone.
* Returns true if a conflict is detected, false otherwise.
**********************************************************************/
bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
const char *word;
const char *lengths;
int16_t word_len; // its length
int16_t first_alphanum_index_;
int16_t first_alphanum_offset_;
int16_t i;
int16_t offset;
bool non_conflict_set_char; // non conf set a/n?
bool conflict = false;
bool allow_1s;
ACCEPTABLE_WERD_TYPE word_type;
bool dict_perm_type;
bool dict_word_ok;
int dict_word_type;
// word and lengths point into the strings owned by best_choice.
word = word_res->best_choice->unichar_string().c_str();
lengths = word_res->best_choice->unichar_lengths().c_str();
// Number of unichars: one length byte per unichar.
word_len = strlen(lengths);
/*
If there are no occurrences of the conflict set characters then the word
is OK.
*/
if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
return false;
}
/*
There is a conflict if there are NO other (confirmed) alphanumerics apart
from those in the conflict set.
*/
for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
offset += lengths[i++]) {
non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
!conflict_set_I_l_1.contains(word[offset]);
}
if (!non_conflict_set_char) {
if (update_map) {
reject_I_1_L(word_res);
}
return true;
}
/*
If the word is accepted by a dawg permuter, and the first alpha character
is "I" or "l", check to see if the alternative is also a dawg word. If it
is, then there is a potential error otherwise the word is ok.
*/
dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
(word_res->best_choice->permuter() == USER_DAWG_PERM) ||
(rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
(word_res->best_choice->permuter() == FREQ_DAWG_PERM);
dict_word_type = dict_word(*(word_res->best_choice));
dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
(dict_perm_type && dict_word_ok)) {
// NOTE(review): first_alphanum_index()/first_alphanum_offset() return -1
// when they find nothing, and both results are used as array indices below
// without a check. They test against this->unicharset while the loop above
// used word_res->uch_set - confirm -1 cannot occur here.
first_alphanum_index_ = first_alphanum_index(word, lengths);
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
// Leading single-byte 'I': try the word with 'l' instead, then put 'I' back.
if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (safe_dict_word(word_res) > 0) {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (update_map) {
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
}
return true;
} else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
return false;
}
}
// Leading single-byte 'l': try the word with 'I' instead, then put 'l' back.
if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (safe_dict_word(word_res) > 0) {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (update_map) {
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
}
return true;
} else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
return false;
}
}
return false;
}
/*
NEW 1Il code. The old code relied on permuter types too much. In fact,
tess will use TOP_CHOICE permute for good things like "palette".
In this code the string is examined independently to see if it looks like
a well formed word.
*/
/*
REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
dictionary word.
*/
first_alphanum_index_ = first_alphanum_index(word, lengths);
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
// Unlike the dawg branch above, the character written into the string is
// only restored when the flipped word is NOT found; on success the function
// returns false with the flipped character still in place.
if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
if (safe_dict_word(word_res) > 0) {
return false;
} else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
}
} else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
if (safe_dict_word(word_res) > 0) {
return false;
} else {
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
}
}
/*
For strings containing digits:
If there are no alphas OR the numeric permuter liked the word,
reject any non 1 conflict chs
Else reject all conflict chs
*/
if (word_contains_non_1_digit(word, lengths)) {
allow_1s =
(alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
// This declaration shadows the function-level offset declared at the top.
int16_t offset;
conflict = false;
for (i = 0, offset = 0; word[offset] != '\0';
offset += word_res->best_choice->unichar_lengths()[i++]) {
if ((!allow_1s || (word[offset] != '1')) &&
conflict_set_I_l_1.contains(word[offset])) {
if (update_map) {
word_res->reject_map[i].setrej_1Il_conflict();
}
conflict = true;
}
}
return conflict;
}
/*
For anything else. See if it conforms to an acceptable word type. If so,
treat accordingly.
*/
word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
// Only the first alphanumeric is checked for these word types.
first_alphanum_index_ = first_alphanum_index(word, lengths);
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
if (update_map) {
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
}
return true;
} else {
return false;
}
} else if (word_type == AC_UPPER_CASE) {
return false;
} else {
// Any other word type: every conflict-set character is a conflict.
if (update_map) {
reject_I_1_L(word_res);
}
return true;
}
}
int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
int16_t i;
int16_t offset;
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
unicharset.get_isdigit(word + offset, word_lengths[i])) {
return i;
}
}
return -1;
}
int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
int16_t i;
int16_t offset;
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
unicharset.get_isdigit(word + offset, word_lengths[i])) {
return offset;
}
}
return -1;
}
// Counts the unichars in word that unicharset reports as alphabetic.
// word_lengths holds the byte length of each unichar.
int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
  int16_t alphas = 0;
  int16_t index = 0;
  int16_t offset = 0;
  while (word[offset] != '\0') {
    if (unicharset.get_isalpha(word + offset, word_lengths[index])) {
      alphas++;
    }
    offset += word_lengths[index++];
  }
  return alphas;
}
// Returns true if word contains a digit unichar (per unicharset) other
// than the single-byte '1'. word_lengths holds the byte length of each
// unichar.
bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
  int16_t index = 0;
  int16_t offset = 0;
  while (word[offset] != '\0') {
    if (unicharset.get_isdigit(word + offset, word_lengths[index])) {
      const bool is_plain_one = word_lengths[index] == 1 && word[offset] == '1';
      if (!is_plain_one) {
        return true;
      }
    }
    offset += word_lengths[index++];
  }
  return false;
}
/*************************************************************************
 * dont_allow_1Il()
 * Don't unreject LONE accepted 1Il conflict set chars: if the only accepted
 * alphanumerics of the word are conflict set characters, reject them.
 *************************************************************************/
void Tesseract::dont_allow_1Il(WERD_RES *word) {
  const int word_len = word->reject_map.length();
  const char *s = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();

  // First pass: look for an accepted conflict-set char, and bail out as
  // soon as any other accepted alphanumeric is found.
  bool accepted_1Il = false;
  int offset = 0;
  for (int i = 0; i < word_len; ++i) {
    if (word->reject_map[i].accepted()) {
      if (conflict_set_I_l_1.contains(s[offset])) {
        accepted_1Il = true;
      } else if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
                 word->uch_set->get_isdigit(s + offset, lengths[i])) {
        return; // >=1 non 1Il ch accepted
      }
    }
    offset += word->best_choice->unichar_lengths()[i];
  }
  if (!accepted_1Il) {
    return; // Nothing to worry about
  }

  // Second pass: reject every accepted conflict-set char.
  offset = 0;
  for (int i = 0; i < word_len; ++i) {
    if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
      word->reject_map[i].setrej_postNN_1Il();
    }
    offset += word->best_choice->unichar_lengths()[i];
  }
}
// Counts the positions of the word that are accepted in the reject map and
// whose best-choice unichar is alphabetic or a digit.
int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
  const WERD_CHOICE *choice = word_res->best_choice;
  int alphanums = 0;
  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    if (!word_res->reject_map[pos].accepted()) {
      continue;
    }
    if (word_res->uch_set->get_isalpha(choice->unichar_id(pos)) ||
        word_res->uch_set->get_isdigit(choice->unichar_id(pos))) {
      alphanums++;
    }
  }
  return alphanums;
}
// Rejects the whole word when the fraction of its reject map entries that
// are already rejected reaches rej_whole_of_mostly_reject_word_fract.
void Tesseract::reject_mostly_rejects(WERD_RES *word) {
  const float rejected_fraction =
      static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length();
  if (rejected_fraction >= rej_whole_of_mostly_reject_word_fract) {
    word->reject_map.rej_word_mostly_rej();
  }
}
// Returns true if the word is a run of two or more copies of one unichar
// whose first byte is listed in ok_repeated_ch_non_alphanum_wds, and the
// counts reported by word_char_quality() both equal the number of unichars.
// The row argument is not referenced.
bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
  WERD_CHOICE *choice = word->best_choice;
  if (choice->unichar_lengths().length() <= 1 ||
      !ok_repeated_ch_non_alphanum_wds.contains(choice->unichar_string()[0])) {
    return false;
  }

  // Every unichar must match the first one.
  const UNICHAR_ID first_id = choice->unichar_id(0);
  for (int pos = 1; pos < choice->length(); ++pos) {
    if (choice->unichar_id(pos) != first_id) {
      return false;
    }
  }

  int16_t char_quality;
  int16_t accepted_char_quality;
  word_char_quality(word, &char_quality, &accepted_char_quality);
  return (word->best_choice->unichar_lengths().length() == char_quality) &&
         (char_quality == accepted_char_quality);
}
int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
const WERD_CHOICE &word = *werd_res->best_choice;
int dict_word_type = werd_res->tesseract->dict_word(word);
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
}
// Uses blob aspect ratios to swap "." for "-" in the best choice and to
// adjust the hyphen-related flags of the reject map.
// Does nothing when tessedit_lower_flip_hyphen <= 1.
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
// Right edge of the previous blob; the sentinel lets the first blob pass.
int prev_right = -9999;
// Left edge of the next blob; set to 9999 for the last blob.
int next_left;
TBOX out_box;
float aspect_ratio;
if (tessedit_lower_flip_hyphen <= 1) {
return;
}
int num_blobs = word_res->rebuild_word->NumBlobs();
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
TBLOB *blob = word_res->rebuild_word->blobs[i];
out_box = blob->bounding_box();
if (i + 1 == num_blobs) {
next_left = 9999;
} else {
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
}
// Don't touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
(out_box.right() < next_left)) {
// Width divided by height of the blob's bounding box.
aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
// A "." at or above the upper ratio is replaced by "-", provided the
// unicharset contains "-" and has it enabled.
if (aspect_ratio >= tessedit_upper_flip_hyphen &&
word_res->uch_set->contains_unichar_id(unichar_dash) &&
word_res->uch_set->get_enabled(unichar_dash)) {
/* Certain HYPHEN */
best_choice->set_unichar_id(unichar_dash, i);
if (word_res->reject_map[i].rejected()) {
word_res->reject_map[i].setrej_hyphen_accept();
}
}
if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
// Suspected HYPHEN
word_res->reject_map[i].setrej_hyphen();
}
} else if (best_choice->unichar_id(i) == unichar_dash) {
// Certain HYPHEN: a rejected "-" at or above the upper ratio is accepted.
if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
word_res->reject_map[i].setrej_hyphen_accept();
}
// Certain HYPHEN
if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
// Suspected HYPHEN
word_res->reject_map[i].setrej_hyphen();
}
}
}
prev_right = out_box.right();
}
}
// Rewrites "0" and "O" in the best choice according to the neighbouring
// characters: "O" between upper-case letters, "0" next to digits.
// Does nothing when tessedit_flip_0O is off, when an upper-case or digit
// blob fails the height test below, or when "0" or "O" is missing from or
// disabled in the unicharset.
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_0O(WERD_RES *word_res) {
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
TBOX out_box;
if (!tessedit_flip_0O) {
return;
}
int num_blobs = word_res->rebuild_word->NumBlobs();
// Check the bounding box of every upper-case or digit blob first.
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
TBLOB *blob = word_res->rebuild_word->blobs[i];
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
out_box = blob->bounding_box();
if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
(out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
return; // Beware words with sub/superscripts
}
}
}
UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
return; // 0 or O are not present/enabled in unicharset
}
// Start at 1: every pattern below looks at the character before i.
// Several patterns advance i themselves after rewriting a run.
for (i = 1; i < best_choice->length(); ++i) {
if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
/* A0A */
if ((i + 1) < best_choice->length() &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
best_choice->set_unichar_id(unichar_O, i);
}
/* A00A */
if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 1) < best_choice->length() &&
(best_choice->unichar_id(i + 1) == unichar_0 ||
best_choice->unichar_id(i + 1) == unichar_O) &&
(i + 2) < best_choice->length() &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
best_choice->set_unichar_id(unichar_O, i);
i++;
}
/* AA0<non digit or end of word> */
if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(((i + 1) < best_choice->length() &&
!word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
!word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
!word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
(i == best_choice->length() - 1))) {
best_choice->set_unichar_id(unichar_O, i);
}
/* 9O9 */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 1) < best_choice->length() &&
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
best_choice->set_unichar_id(unichar_0, i);
}
/* 9OOO */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 2) < best_choice->length() &&
(best_choice->unichar_id(i + 1) == unichar_0 ||
best_choice->unichar_id(i + 1) == unichar_O) &&
(best_choice->unichar_id(i + 2) == unichar_0 ||
best_choice->unichar_id(i + 2) == unichar_O)) {
best_choice->set_unichar_id(unichar_0, i);
best_choice->set_unichar_id(unichar_0, i + 1);
best_choice->set_unichar_id(unichar_0, i + 2);
i += 2;
}
/* 9OO<non upper> */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 2) < best_choice->length() &&
(best_choice->unichar_id(i + 1) == unichar_0 ||
best_choice->unichar_id(i + 1) == unichar_O) &&
!word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
best_choice->set_unichar_id(unichar_0, i);
best_choice->set_unichar_id(unichar_0, i + 1);
i++;
}
/* 9O<non upper> */
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
(i + 1) < best_choice->length() &&
!word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
best_choice->set_unichar_id(unichar_0, i);
}
/* 9[.,]OOO.. */
if ((i > 1) &&
(word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
(word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
best_choice->unichar_id(i - 2) == unichar_O)) {
if (best_choice->unichar_id(i - 2) == unichar_O) {
best_choice->set_unichar_id(unichar_0, i - 2);
}
// Turn the whole run of 0/O after the separator into 0.
while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
best_choice->unichar_id(i) == unichar_0)) {
best_choice->set_unichar_id(unichar_0, i);
i++;
}
// Step back so the for loop's ++i resumes right after the run.
i--;
}
}
}
}
/// Returns true when unichar_id is flagged upper-case in ch_set and is not
/// the letter "O".
bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
  if (!ch_set.get_isupper(unichar_id)) {
    return false;
  }
  return !ch_set.eq(unichar_id, "O");
}
/// Returns true when unichar_id is flagged as a digit in ch_set and is not
/// the digit "0".
bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
  if (!ch_set.get_isdigit(unichar_id)) {
    return false;
  }
  return !ch_set.eq(unichar_id, "0");
}
} // namespace tesseract
#endif // def DISABLED_LEGACY_ENGINE

View File

@ -0,0 +1,39 @@
/**********************************************************************
* File: reject.h
* Description: Rejection functions used in tessedit
* Author: Phil Cheatle
* Created: Wed Sep 23 16:50:21 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef REJECT_H
#define REJECT_H
namespace tesseract {
// Forward declarations: the prototypes below only use pointers to these
// classes, so the full definitions are not required here.
class WERD_CHOICE;
class WERD_RES;
// Prototypes for the rejection functions used in tessedit.
// NOTE(review): no definitions are visible from this header. A member
// function Tesseract::non_0_digit(const UNICHARSET &, UNICHAR_ID) exists in
// the project with a different signature from the free function declared
// below -- confirm that each of these free-function prototypes still has a
// matching definition.
void reject_blanks(WERD_RES *word);
void reject_poor_matches(WERD_RES *word);
float compute_reject_threshold(WERD_CHOICE *word);
bool word_contains_non_1_digit(const char *word, const char *word_lengths);
void dont_allow_1Il(WERD_RES *word);
void flip_hyphens(WERD_RES *word);
void flip_0O(WERD_RES *word);
bool non_0_digit(const char *str, int length);
} // namespace tesseract
#endif

View File

@ -0,0 +1,789 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.cpp
// Description: Iterator for tesseract results that is capable of
// iterating in proper reading order over Bi Directional
// (e.g. mixed Hebrew and English) text.
// Author: David Eger
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <tesseract/resultiterator.h>
#include "pageres.h"
#include "tesseractclass.h"
#include "unicharset.h"
#include <allheaders.h>
#include <set>
#include <vector>
// Unicode directional marks that the ResultIterator text-building functions
// in this file (e.g. AppendSuffixMarks, AppendUTF8WordText) add to the
// strings they produce.
static const char *const kLRM = "\u200E"; // Left-to-Right Mark
static const char *const kRLM = "\u200F"; // Right-to-Left Mark
namespace tesseract {
/// Builds a reading-order iterator from a strictly left-to-right one.
/// Reads the "preserve_interword_spaces" parameter (defaulting to false when
/// it cannot be found), determines the direction of the current paragraph
/// and positions the iterator at the logical start of the current text line.
ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) {
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;
  auto *preserve_spaces_param = ParamUtils::FindParam<BoolParam>(
      "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
  if (preserve_spaces_param == nullptr) {
    preserve_interword_spaces_ = false;
  } else {
    preserve_interword_spaces_ = static_cast<bool>(*preserve_spaces_param);
  }
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}
/// Factory: returns a ResultIterator allocated with new and constructed from
/// resit. The object is not freed here, so the caller is responsible for
/// deleting it.
ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) {
  auto *paragraph_it = new ResultIterator(resit);
  return paragraph_it;
}
/// Returns the cached direction of the current paragraph (true = LTR), as
/// last computed by CurrentParagraphIsLtr().
bool ResultIterator::ParagraphIsLtr() const {
  const bool is_ltr = current_paragraph_is_ltr_;
  return is_ltr;
}
bool ResultIterator::CurrentParagraphIsLtr() const {
if (!it_->word()) {
return true; // doesn't matter.
}
LTRResultIterator it(*this);
it.RestartParagraph();
// Try to figure out the ltr-ness of the paragraph. The rules below
// make more sense in the context of a difficult paragraph example.
// Here we denote {ltr characters, RTL CHARACTERS}:
//
// "don't go in there!" DAIS EH
// EHT OTNI DEPMUJ FELSMIH NEHT DNA
// .GNIDLIUB GNINRUB
//
// On the first line, the left-most word is LTR and the rightmost word
// is RTL. Thus, we are better off taking the majority direction for
// the whole paragraph contents. So instead of "the leftmost word is LTR"
// indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
// would not do: Typically an RTL paragraph would *not* start with an LTR
// word. So our heuristics are as follows:
//
// (1) If the first text line has an RTL word in the left-most position
// it is RTL.
// (2) If the first text line has an LTR word in the right-most position
// it is LTR.
// (3) If neither of the above is true, take the majority count for the
// paragraph -- if there are more rtl words, it is RTL. If there
// are more LTR words, it's LTR.
bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
int num_ltr, num_rtl;
num_rtl = leftmost_rtl ? 1 : 0;
num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
it.Next(RIL_WORD)) {
StrongScriptDirection dir = it.WordDirection();
rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
num_ltr += rightmost_ltr ? 1 : 0;
}
if (leftmost_rtl) {
return false;
}
if (rightmost_ltr) {
return true;
}
// First line is ambiguous. Take statistics on the whole paragraph.
if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {
do {
StrongScriptDirection dir = it.WordDirection();
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
} while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
}
return num_ltr >= num_rtl;
}
// Sentinel values stored in the reading-order vectors produced by
// CalculateTextlineOrder alongside real word indices. They are negative so
// that code scanning those vectors can tell them apart from word indices
// with a simple "< 0" test.
const int ResultIterator::kMinorRunStart = -1;
const int ResultIterator::kMinorRunEnd = -2;
const int ResultIterator::kComplexWord = -3;
// Fills *blob_indices with the indices of the current word's blobs in the
// order in which they should be read. The vector is always cleared first and
// is left empty when there is no current word. On return (for a non-empty
// word) it holds word_length_ entries.
void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
// The effective reading direction is the paragraph direction, flipped while
// we are inside a minor-direction run.
bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
blob_indices->clear();
if (Empty(RIL_WORD)) {
return;
}
if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
// Easy! just return the blobs in order;
for (int i = 0; i < word_length_; i++) {
blob_indices->push_back(i);
}
return;
}
// The blobs are in left-to-right order, but the current reading context
// is right-to-left.
const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
// Step 1: Scan for and mark European Number sequences
// [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
std::vector<int> letter_types;
letter_types.reserve(word_length_);
for (int i = 0; i < word_length_; i++) {
letter_types.push_back(it_->word()->SymbolDirection(i));
}
// Convert a single separator sandwiched between two EN's into an EN.
for (int i = 0; i + 2 < word_length_; i++) {
if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
(letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {
letter_types[i + 1] = U_EURO_NUM;
}
}
// Scan for sequences of European Number Terminators around ENs and convert
// them to ENs.
for (int i = 0; i < word_length_; i++) {
if (letter_types[i] == U_EURO_NUM_TERM) {
// Look right: a run of terminators immediately followed by an EN.
int j = i + 1;
while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
j++;
}
if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
// The sequence [i..j] should be converted to all European Numbers.
for (int k = i; k < j; k++) {
letter_types[k] = U_EURO_NUM;
}
}
// Look left: a run of terminators immediately preceded by an EN.
j = i - 1;
while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
j--;
}
if (j > -1 && letter_types[j] == U_EURO_NUM) {
// The sequence [j..i] should be converted to all European Numbers.
for (int k = j; k <= i; k++) {
letter_types[k] = U_EURO_NUM;
}
}
}
}
// Step 2: Convert all remaining types to either L or R.
// Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
// All other are R.
for (int i = 0; i < word_length_;) {
int ti = letter_types[i];
if (ti == U_LTR || ti == U_EURO_NUM) {
// Left to right sequence; scan to the end of it.
// last_good tracks the last L/EN seen, so trailing separators/neutrals
// that are not followed by another L/EN stay outside the L sequence.
int last_good = i;
for (int j = i + 1; j < word_length_; j++) {
int tj = letter_types[j];
if (tj == U_LTR || tj == U_EURO_NUM) {
last_good = j;
} else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
// do nothing.
} else {
break;
}
}
// [i..last_good] is the L sequence
for (int k = i; k <= last_good; k++) {
letter_types[k] = U_LTR;
}
i = last_good + 1;
} else {
letter_types[i] = U_RTL;
i++;
}
}
// At this point, letter_types is entirely U_LTR or U_RTL.
// Emit indices from right to left; each LTR run is emitted as a unit in
// ascending (left-to-right) index order.
for (int i = word_length_ - 1; i >= 0;) {
if (letter_types[i] == U_RTL) {
blob_indices->push_back(i);
i--;
} else {
// left to right sequence. scan to the beginning.
int j = i - 1;
for (; j >= 0 && letter_types[j] != U_RTL; j--) {
} // pass
// Now (j, i] is LTR
for (int k = j + 1; k <= i; k++) {
blob_indices->push_back(k);
}
i = j;
}
}
// Every blob must have been emitted exactly once.
ASSERT_HOST(blob_indices->size() == word_length_);
}
static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
for (auto dir : dirs) {
switch (dir) {
case DIR_NEUTRAL:
tprintf("N ");
break;
case DIR_LEFT_TO_RIGHT:
tprintf("L ");
break;
case DIR_RIGHT_TO_LEFT:
tprintf("R ");
break;
case DIR_MIX:
tprintf("Z ");
break;
default:
tprintf("? ");
break;
}
}
tprintf("\n");
}
/// Convenience overload: computes the reading order of the words on the text
/// line of resit into *word_indices, discarding the per-word directions that
/// the four-argument overload also produces.
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
                                            std::vector<int> *word_indices) const {
  std::vector<StrongScriptDirection> discarded_directions;
  CalculateTextlineOrder(paragraph_is_ltr, resit, &discarded_directions, word_indices);
}
/// Collects the strong script direction of every word on the text line that
/// resit is positioned in (scanning strictly left to right) and converts
/// them to a reading order.
///
/// @param paragraph_is_ltr  direction of the enclosing paragraph.
/// @param resit             iterator positioned somewhere on the text line.
/// @param dirs_arg          optional output (may be nullptr): per-word
///                          directions in left-to-right word order.
/// @param word_indices      output: reading order as produced by the static
///                          CalculateTextlineOrder overload.
///
/// Both outputs are cleared up front, so they are empty (rather than holding
/// whatever the caller passed in) when the row has no words.
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
                                            std::vector<StrongScriptDirection> *dirs_arg,
                                            std::vector<int> *word_indices) const {
  std::vector<StrongScriptDirection> dirs;
  std::vector<StrongScriptDirection> *directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
  directions->clear();
  // Clear before the early return below so stale indices never leak out.
  word_indices->clear();
  // A LTRResultIterator goes strictly left-to-right word order.
  LTRResultIterator ltr_it(resit);
  ltr_it.RestartRow();
  if (ltr_it.Empty(RIL_WORD)) {
    return;
  }
  do {
    directions->push_back(ltr_it.WordDirection());
  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
}
// Converts per-word directions (given in left-to-right word order) into a
// reading order. *reading_order is cleared and then filled with word indices
// interleaved with the negative markers kMinorRunStart / kMinorRunEnd
// (bracketing a run of words in the non-paragraph direction) and
// kComplexWord (pushed right after the index of a DIR_MIX word).
// Leaves *reading_order empty when word_dirs is empty.
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
const std::vector<StrongScriptDirection> &word_dirs,
std::vector<int> *reading_order) {
reading_order->clear();
if (word_dirs.empty()) {
return;
}
// Take all of the runs of minor direction words and insert them
// in reverse order.
// [start, end) is traversed with stride major_step: ascending for an LTR
// paragraph, descending for an RTL paragraph.
int minor_direction, major_direction, major_step, start, end;
if (paragraph_is_ltr) {
start = 0;
end = word_dirs.size();
major_step = 1;
major_direction = DIR_LEFT_TO_RIGHT;
minor_direction = DIR_RIGHT_TO_LEFT;
} else {
start = word_dirs.size() - 1;
end = -1;
major_step = -1;
major_direction = DIR_RIGHT_TO_LEFT;
minor_direction = DIR_LEFT_TO_RIGHT;
// Special rule: if there are neutral words at the right most side
// of a line adjacent to a left-to-right word in the middle of the
// line, we interpret the end of the line as a single LTR sequence.
if (word_dirs[start] == DIR_NEUTRAL) {
// Skip leftwards over the trailing neutral words.
int neutral_end = start;
while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
neutral_end--;
}
if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
// LTR followed by neutrals.
// Scan for the beginning of the minor left-to-right run.
int left = neutral_end;
for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
left = i;
}
}
// Emit [left, end of line] as one minor run, in ascending index order.
reading_order->push_back(kMinorRunStart);
for (unsigned i = left; i < word_dirs.size(); i++) {
reading_order->push_back(i);
if (word_dirs[i] == DIR_MIX) {
reading_order->push_back(kComplexWord);
}
}
reading_order->push_back(kMinorRunEnd);
// The main loop below resumes just left of the run already emitted.
start = left - 1;
}
}
}
for (int i = start; i != end;) {
if (word_dirs[i] == minor_direction) {
// Find the far end j of the minor run: advance until a major-direction
// word (or the end of the line), then back up to the last word that is
// actually in the minor direction.
int j = i;
while (j != end && word_dirs[j] != major_direction) {
j += major_step;
}
if (j == end) {
j -= major_step;
}
while (j != i && word_dirs[j] != minor_direction) {
j -= major_step;
}
// [j..i] is a minor direction run.
// It is emitted from j back towards i, i.e. reversed relative to the
// major traversal direction.
reading_order->push_back(kMinorRunStart);
for (int k = j; k != i; k -= major_step) {
reading_order->push_back(k);
}
reading_order->push_back(i);
reading_order->push_back(kMinorRunEnd);
i = j + major_step;
} else {
reading_order->push_back(i);
if (word_dirs[i] == DIR_MIX) {
reading_order->push_back(kComplexWord);
}
i += major_step;
}
}
}
int ResultIterator::LTRWordIndex() const {
int this_word_index = 0;
LTRResultIterator textline(*this);
textline.RestartRow();
while (!textline.PositionedAtSameWord(it_)) {
this_word_index++;
textline.Next(RIL_WORD);
}
return this_word_index;
}
/// Positions the iterator on the blob of the current word that is read
/// first. Does nothing when that blob is already blob 0 or when no blob
/// order could be computed.
void ResultIterator::MoveToLogicalStartOfWord() {
  if (word_length_ == 0) {
    BeginWord(0);
    return;
  }
  std::vector<int> reading_order;
  CalculateBlobOrder(&reading_order);
  if (!reading_order.empty() && reading_order[0] != 0) {
    BeginWord(reading_order[0]);
  }
}
/// Returns true when the current blob is the last one of the current word
/// in reading order. Also returns true when there is no current word or no
/// blob order could be computed.
bool ResultIterator::IsAtFinalSymbolOfWord() const {
  if (it_->word() == nullptr) {
    return true;
  }
  std::vector<int> reading_order;
  CalculateBlobOrder(&reading_order);
  if (reading_order.empty()) {
    return true;
  }
  return reading_order.back() == blob_index_;
}
/// Returns true when the current blob is the first one of the current word
/// in reading order. Also returns true when there is no current word or no
/// blob order could be computed.
bool ResultIterator::IsAtFirstSymbolOfWord() const {
  if (it_->word() == nullptr) {
    return true;
  }
  std::vector<int> reading_order;
  CalculateBlobOrder(&reading_order);
  if (reading_order.empty()) {
    return true;
  }
  return reading_order.front() == blob_index_;
}
/// Appends a directional mark to *text if the textline ordering placed a
/// marker right after the current word:
///  - kComplexWord: the mark of the current reading direction;
///  - kMinorRunEnd: the mark of the paragraph direction.
/// When several markers follow the word, the last one decides. Nothing is
/// appended when there is no current word or the word is not found in the
/// computed order.
void ResultIterator::AppendSuffixMarks(std::string *text) const {
  if (it_->word() == nullptr) {
    return;
  }
  const bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
  std::vector<int> line_order;
  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &line_order);
  const int current_word = LTRWordIndex();
  // Locate the current word inside the reading order.
  size_t pos = 0;
  while (pos < line_order.size() && line_order[pos] != current_word) {
    ++pos;
  }
  if (pos == line_order.size()) {
    return;
  }
  // Scan the (negative) markers that immediately follow it.
  int trailing_marker = 0;
  for (++pos; pos < line_order.size() && line_order[pos] < 0; ++pos) {
    trailing_marker = line_order[pos];
  }
  if (trailing_marker == kComplexWord) {
    *text += reading_direction_is_ltr ? kLRM : kRLM;
  } else if (trailing_marker == kMinorRunEnd) {
    *text += current_paragraph_is_ltr_ ? kLRM : kRLM;
  }
}
/// Moves to the word of the current text line that is read first, and to
/// the first-read blob within it. Updates in_minor_direction_ from any run
/// markers that precede that word, and sets at_beginning_of_minor_run_ if
/// the line starts inside a minor run. Note that in_minor_direction_ is not
/// reset here; callers reset it beforehand where needed.
void ResultIterator::MoveToLogicalStartOfTextline() {
  std::vector<int> line_order;
  RestartRow();
  CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this),
                         &line_order);
  // Consume the leading markers, tracking whether we end up in a minor run.
  unsigned pos = 0;
  while (pos < line_order.size() && line_order[pos] < 0) {
    const int marker = line_order[pos];
    if (marker == kMinorRunStart) {
      in_minor_direction_ = true;
    } else if (marker == kMinorRunEnd) {
      in_minor_direction_ = false;
    }
    ++pos;
  }
  if (in_minor_direction_) {
    at_beginning_of_minor_run_ = true;
  }
  if (pos >= line_order.size()) {
    return;
  }
  // Step from the left edge of the row to the first-read word.
  for (int steps_left = line_order[pos]; steps_left > 0; --steps_left) {
    PageIterator::Next(RIL_WORD);
  }
  MoveToLogicalStartOfWord();
}
/// Restarts iteration: rewinds via LTRResultIterator::Begin(), recomputes
/// the paragraph direction, clears the minor-run state and moves to the
/// logical start of the first text line.
void ResultIterator::Begin() {
  LTRResultIterator::Begin();
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  at_beginning_of_minor_run_ = false;
  in_minor_direction_ = false;
  MoveToLogicalStartOfTextline();
}
// Advances to the next element at the given level in reading order.
// Returns false when already at the end (no current block); the block,
// paragraph and textline levels also return false when
// PageIterator::Next fails or leaves no current block.
// Symbol and word steps that run off the end of their word / text line
// escalate to the next higher level.
bool ResultIterator::Next(PageIteratorLevel level) {
if (it_->block() == nullptr) {
return false; // already at end!
}
switch (level) {
case RIL_BLOCK: // explicit fall-through
case RIL_PARA: // explicit fall-through
case RIL_TEXTLINE:
if (!PageIterator::Next(level)) {
return false;
}
if (IsWithinFirstTextlineOfParagraph()) {
// if we've advanced to a new paragraph,
// recalculate current_paragraph_is_ltr_
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
}
in_minor_direction_ = false;
MoveToLogicalStartOfTextline();
return it_->block() != nullptr;
case RIL_SYMBOL: {
std::vector<int> blob_order;
CalculateBlobOrder(&blob_order);
// Find the current blob's position in reading order, then step past it.
int next_blob = 0;
while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
next_blob++;
}
next_blob++;
if (next_blob < blob_order.size()) {
// we're in the same word; simply advance one blob.
BeginWord(blob_order[next_blob]);
at_beginning_of_minor_run_ = false;
return true;
}
level = RIL_WORD; // we've fallen through to the next word.
}
// Fall through.
case RIL_WORD: // explicit fall-through.
{
if (it_->word() == nullptr) {
return Next(RIL_BLOCK);
}
std::vector<int> word_indices;
int this_word_index = LTRWordIndex();
CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
// Index of the last real (non-marker) entry in the reading order.
int final_real_index = word_indices.size() - 1;
while (final_real_index > 0 && word_indices[final_real_index] < 0) {
final_real_index--;
}
// Only entries before final_real_index can have a successor on this line.
for (int i = 0; i < final_real_index; i++) {
if (word_indices[i] == this_word_index) {
// Skip the markers between this word and the next, updating the
// minor-direction state as they are crossed.
int j = i + 1;
for (; j < final_real_index && word_indices[j] < 0; j++) {
if (word_indices[j] == kMinorRunStart) {
in_minor_direction_ = true;
}
if (word_indices[j] == kMinorRunEnd) {
in_minor_direction_ = false;
}
}
at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
// awesome, we move to word_indices[j]
if (BidiDebug(3)) {
tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
}
// Reposition by rewinding to the left edge of the row and stepping
// forward word_indices[j] words.
PageIterator::RestartRow();
for (int k = 0; k < word_indices[j]; k++) {
PageIterator::Next(RIL_WORD);
}
MoveToLogicalStartOfWord();
return true;
}
}
if (BidiDebug(3)) {
tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
}
// we're going off the end of the text line.
return Next(RIL_TEXTLINE);
}
}
ASSERT_HOST(false); // shouldn't happen.
return false;
}
/// Returns true when the iterator is positioned at the (reading-order) start
/// of an element of the given level. Returns false at the end of the page,
/// and true for any level when positioned in a block without a word.
bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == nullptr) {
    return false; // Already at the end!
  }
  if (it_->word() == nullptr) {
    return true; // In an image block.
  }
  if (level == RIL_SYMBOL) {
    return true; // Always at beginning of a symbol.
  }
  const bool starts_word = IsAtFirstSymbolOfWord();
  if (level == RIL_WORD) {
    return starts_word;
  }
  // Compare against a copy moved to the first-read word of this line.
  ResultIterator probe(*this);
  probe.MoveToLogicalStartOfTextline();
  const bool starts_line = starts_word && *probe.it_ == *it_;
  if (level == RIL_TEXTLINE) {
    return starts_line;
  }
  // Block/paragraph boundaries are judged from the left-most word of the row.
  probe.RestartRow();
  const bool starts_block = starts_line && probe.it_->block() != probe.it_->prev_block();
  if (level == RIL_BLOCK) {
    return starts_block;
  }
  const bool starts_para =
      starts_block ||
      (starts_line && probe.it_->row()->row->para() != probe.it_->prev_row()->row->para());
  if (level == RIL_PARA) {
    return starts_para;
  }
  ASSERT_HOST(false); // shouldn't happen.
  return false;
}
/**
 * NOTE! This mirrors PageIterator::IsAtFinalElement, except that the
 * look-ahead iterator is a ResultIterator instead of a PageIterator.
 *
 * Returns true if the current element is the last `element` within the
 * enclosing `level`, or if the iterator is already at the end.
 */
bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
  if (Empty(element)) {
    return true; // Already at the end!
  }
  // Step a copy forward by one element. We are at the final element if that
  // lands at the end of the page, or at the beginning of *every* level in
  // [level, element). Checking every level matters when level and element
  // are more than one apart: stepping one symbol can still leave us on the
  // first word of a line, so we must also be on the first symbol of a word.
  ResultIterator next(*this);
  next.Next(element);
  if (next.Empty(element)) {
    return true; // Reached the end of the page.
  }
  for (int lvl = element - 1; lvl >= level; --lvl) {
    if (!next.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl))) {
      return false;
    }
  }
  return true;
}
// Returns the number of blanks before the current word. For LTR paragraphs
// this defers to LTRResultIterator; otherwise it is 0 at the beginning of a
// text line and 1 elsewhere. The paragraph direction is recomputed via
// CurrentParagraphIsLtr() rather than read from the cached flag.
int ResultIterator::BlanksBeforeWord() const {
  if (!CurrentParagraphIsLtr()) {
    return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
  }
  return LTRResultIterator::BlanksBeforeWord();
}
/**
 * Returns the null terminated UTF-8 encoded text string for the current
 * object at the given level. Use delete [] to free after use.
 * Returns nullptr when the iterator is not positioned on a word.
 *
 * At RIL_SYMBOL level the symbol is preceded by a directional mark when the
 * iterator sits at the beginning of a minor-direction run, matching what
 * AppendUTF8WordText does at word level.
 */
char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == nullptr) {
    return nullptr; // Already at the end!
  }
  std::string text;
  switch (level) {
    case RIL_BLOCK: {
      // Concatenate every paragraph that belongs to the current block.
      ResultIterator pp(*this);
      do {
        pp.AppendUTF8ParagraphText(&text);
      } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
    } break;
    case RIL_PARA:
      AppendUTF8ParagraphText(&text);
      break;
    case RIL_TEXTLINE: {
      ResultIterator it(*this);
      it.MoveToLogicalStartOfTextline();
      it.IterateAndAppendUTF8TextlineText(&text);
    } break;
    case RIL_WORD:
      AppendUTF8WordText(&text);
      break;
    case RIL_SYMBOL: {
      bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
      if (at_beginning_of_minor_run_) {
        text += reading_direction_is_ltr ? kLRM : kRLM;
      }
      // Append rather than assign: assigning here used to throw away the
      // directional mark that was just added above.
      text += it_->word()->BestUTF8(blob_index_, false);
      if (IsAtFinalSymbolOfWord()) {
        AppendSuffixMarks(&text);
      }
    } break;
  }
  // Copy into a new[]-allocated buffer, including the terminating NUL.
  const size_t length = text.length() + 1;
  char *result = new char[length];
  strncpy(result, text.c_str(), length);
  return result;
}
/// Returns a pointer to the current word's segmented_timesteps member, or
/// nullptr when there is no current word. The pointer refers to data held
/// by the word result; nothing is allocated here.
std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
    *ResultIterator::GetRawLSTMTimesteps() const {
  if (it_->word() == nullptr) {
    return nullptr;
  }
  return &it_->word()->segmented_timesteps;
}
/// Returns a pointer to the current word's CTC_symbol_choices member, or
/// nullptr when there is no current word. The pointer refers to data held
/// by the word result; nothing is allocated here.
std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
    const {
  if (it_->word() == nullptr) {
    return nullptr;
  }
  return &it_->word()->CTC_symbol_choices;
}
/// Appends the text of the current word to *text, blob by blob in reading
/// order. A directional mark is emitted first when the word opens a
/// minor-direction run, and any suffix marks are appended afterwards.
/// Does nothing when there is no current word.
void ResultIterator::AppendUTF8WordText(std::string *text) const {
  if (it_->word() == nullptr) {
    return;
  }
  ASSERT_HOST(it_->word()->best_choice != nullptr);
  const bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
  if (at_beginning_of_minor_run_) {
    *text += reading_direction_is_ltr ? kLRM : kRLM;
  }
  std::vector<int> reading_order;
  CalculateBlobOrder(&reading_order);
  for (size_t n = 0; n < reading_order.size(); ++n) {
    *text += it_->word()->BestUTF8(reading_order[n], false);
  }
  AppendSuffixMarks(text);
}
/// Appends the text of the current text line to *text, word by word in
/// reading order, followed by line_separator_ (and paragraph_separator_ if
/// the iterator then sits at the start of a paragraph). The iterator itself
/// is advanced past the line. When there is no current word, only a single
/// Next(RIL_WORD) step is performed and nothing is appended.
void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
  if (Empty(RIL_WORD)) {
    Next(RIL_WORD);
    return;
  }
  if (BidiDebug(1)) {
    std::vector<int> textline_order;
    std::vector<StrongScriptDirection> dirs;
    CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);
    tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
            current_paragraph_is_ltr_ ? "ltr" : "rtl");
    PrintScriptDirs(dirs);
    tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
            current_paragraph_is_ltr_ ? "ltr" : "rtl");
    for (size_t n = 0; n < textline_order.size(); ++n) {
      tprintf("%d ", textline_order[n]);
    }
    tprintf("\n");
  }
  int words_appended = 0;
  do {
    // Either honour the recorded inter-word spacing, or put exactly one
    // space between consecutive words.
    int num_spaces;
    if (preserve_interword_spaces_) {
      num_spaces = it_->word()->word->space();
    } else {
      num_spaces = (words_appended > 0);
    }
    for (int remaining = num_spaces; remaining > 0; --remaining) {
      *text += " ";
    }
    AppendUTF8WordText(text);
    ++words_appended;
    if (BidiDebug(2)) {
      tprintf("Num spaces=%d, text=%s\n", num_spaces, text->c_str());
    }
  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
  if (BidiDebug(1)) {
    tprintf("%d words printed\n", words_appended);
  }
  *text += line_separator_;
  // If we just finished a paragraph, add an extra newline.
  if (IsAtBeginningOf(RIL_PARA)) {
    *text += paragraph_separator_;
  }
}
void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {
ResultIterator it(*this);
it.RestartParagraph();
it.MoveToLogicalStartOfTextline();
if (it.Empty(RIL_WORD)) {
return;
}
do {
it.IterateAndAppendUTF8TextlineText(text);
} while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
}
/// Returns true when the "bidi_debug" parameter is at least min_level.
/// If the parameter cannot be found, a level of 1 is assumed.
bool ResultIterator::BidiDebug(int min_level) const {
  auto *level_param = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
                                                      tesseract_->params()->int_params);
  int debug_level = 1;
  if (level_param != nullptr) {
    debug_level = static_cast<int32_t>(*level_param);
  }
  return min_level <= debug_level;
}
} // namespace tesseract.

View File

@ -0,0 +1,592 @@
/******************************************************************
* File: superscript.cpp
* Description: Correction pass to fix superscripts and subscripts.
* Author: David Eger
*
* (C) Copyright 2012, Google, Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "normalis.h"
#include "tesseractclass.h"
namespace tesseract {
/// Returns the sum of the first num_unichars entries of word->best_state,
/// i.e. how many chopped pieces make up the leading num_unichars rebuilt
/// blobs. No bounds checking is done on num_unichars.
static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
  int chopped_total = 0;
  int idx = 0;
  while (idx < num_unichars) {
    chopped_total += word->best_state[idx];
    ++idx;
  }
  return chopped_total;
}
/// Returns the sum of the last num_unichars entries of word->best_state,
/// i.e. how many chopped pieces make up the trailing num_unichars rebuilt
/// blobs. No bounds checking is done on num_unichars.
static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
  int chopped_total = 0;
  for (int taken = 0; taken < num_unichars; ++taken) {
    const auto last_index = word->best_state.size() - 1;
    chopped_total += word->best_state[last_index - taken];
  }
  return chopped_total;
}
/**
 * Given a recognized blob, see if a contiguous collection of sub-pieces
 * (chopped blobs) starting at its left might qualify as being a subscript
 * or superscript letter based only on y position. Also do this for the
 * right side.
 *
 * A piece counts as superscript when its bounding-box bottom is at or above
 * super_y_bottom, and as subscript when its top is at or below sub_y_top.
 * Any of the four output pointers may be nullptr, in which case that result
 * is discarded.
 */
static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom,
int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers,
ScriptPos *trailing_pos, int *num_trailing_outliers) {
// Redirect null outputs to local dummies so the code below can write
// through every pointer unconditionally.
ScriptPos sp_unused1, sp_unused2;
int unused1, unused2;
if (!leading_pos) {
leading_pos = &sp_unused1;
}
if (!num_leading_outliers) {
num_leading_outliers = &unused1;
}
if (!trailing_pos) {
trailing_pos = &sp_unused2;
}
if (!num_trailing_outliers) {
num_trailing_outliers = &unused2;
}
*num_leading_outliers = *num_trailing_outliers = 0;
*leading_pos = *trailing_pos = SP_NORMAL;
// The chopped pieces of this rebuilt blob start after the pieces of all
// preceding rebuilt blobs.
int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
int num_chopped_pieces = word->best_state[rebuilt_blob_index];
ScriptPos last_pos = SP_NORMAL;
// Length of the run of same-position outlier pieces ending at the piece
// most recently examined.
int trailing_outliers = 0;
for (int i = 0; i < num_chopped_pieces; i++) {
TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
ScriptPos pos = SP_NORMAL;
if (box.bottom() >= super_y_bottom) {
pos = SP_SUPERSCRIPT;
} else if (box.top() <= sub_y_top) {
pos = SP_SUBSCRIPT;
}
if (pos == SP_NORMAL) {
// A normal piece ends the current run. If that run covered every piece
// seen so far (its length equals i), it is the leading outlier run.
if (trailing_outliers == i) {
*num_leading_outliers = trailing_outliers;
*leading_pos = last_pos;
}
trailing_outliers = 0;
} else {
// Extend the run if the position is unchanged; otherwise start a new one.
if (pos == last_pos) {
trailing_outliers++;
} else {
trailing_outliers = 1;
}
}
last_pos = pos;
}
// Whatever run is still open at the end is the trailing outlier run.
*num_trailing_outliers = trailing_outliers;
*trailing_pos = last_pos;
}
/**
 * Attempt to split off any high (or low) bits at the ends of the word with poor
 * certainty and recognize them separately. If the certainty gets much better
 * and other sanity checks pass, accept.
 *
 * This superscript fix is meant to be called in the second pass of recognition
 * when we have tried once and already have a preliminary answer for word.
 *
 * Nothing is attempted (and false is returned) when recognition of the word
 * failed, the word is flagged W_REP_CHAR, or it has no best_choice.
 *
 * @param word The word to examine; its results are replaced on success.
 * @return Whether we modified the given word.
 */
bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
  if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
    return false;
  }
  int num_leading, num_trailing;
  ScriptPos sp_leading, sp_trailing;
  float leading_certainty, trailing_certainty;
  float avg_certainty, unlikely_threshold;
  // Calculate the number of whole suspicious characters at the edges.
  GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
                                 &sp_trailing, &trailing_certainty, &avg_certainty,
                                 &unlikely_threshold);
  // Labels used only in the debug output below.
  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
  int num_blobs = word->best_choice->length();
  // Calculate the remainder (partial characters) at the edges.
  // This accounts for us having classified the best version of
  // a word as [speaker?'] when it was instead [speaker.^{21}]
  // (that is we accidentally thought the 2 was attached to the period).
  int num_remainder_leading = 0, num_remainder_trailing = 0;
  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
    int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
    int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
    // Examine the last character that is not already a whole-blob candidate.
    int last_word_char = num_blobs - 1 - num_trailing;
    float last_char_certainty = word->best_choice->certainty(last_word_char);
    if (word->best_choice->unichar_id(last_word_char) != 0 &&
        last_char_certainty <= unlikely_threshold) {
      ScriptPos rpos;
      YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
                     &num_remainder_trailing);
      // Partial pieces must agree with the whole trailing blobs already found.
      if (num_trailing > 0 && rpos != sp_trailing) {
        num_remainder_trailing = 0;
      }
      if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
        trailing_certainty = last_char_certainty;
      }
    }
    // Do not carve pieces off both ends of one and the same remaining blob.
    bool another_blob_available =
        (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
    // Kept as float (like last_char_certainty above): storing the certainty
    // in an int would truncate it toward zero before it is compared with
    // unlikely_threshold and copied into leading_certainty.
    float first_char_certainty = word->best_choice->certainty(num_leading);
    if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
        first_char_certainty <= unlikely_threshold) {
      ScriptPos lpos;
      YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
                     nullptr, nullptr);
      if (num_leading > 0 && lpos != sp_leading) {
        num_remainder_leading = 0;
      }
      if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
        leading_certainty = first_char_certainty;
      }
    }
  }
  // If nothing to do, bail now.
  if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
    return false;
  }
  if (superscript_debug >= 1) {
    tprintf("Candidate for superscript detection: %s (",
            word->best_choice->unichar_string().c_str());
    if (num_leading || num_remainder_leading) {
      tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
    }
    if (num_trailing || num_remainder_trailing) {
      tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
    }
    tprintf(")\n");
  }
  if (superscript_debug >= 3) {
    word->best_choice->print();
  }
  if (superscript_debug >= 2) {
    tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty,
            unlikely_threshold);
    if (num_leading) {
      tprintf("Orig. leading (min): %.2f ", leading_certainty);
    }
    if (num_trailing) {
      tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
    }
    tprintf("\n");
  }
  // We've now calculated the number of rebuilt blobs we want to carve off.
  // However, split_word() works from TBLOBs in chopped_word, so we need to
  // convert to those.
  int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
  int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
  int retry_leading = 0;
  int retry_trailing = 0;
  bool is_good = false;
  WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
                                           num_chopped_trailing, trailing_certainty, sp_trailing,
                                           word, &is_good, &retry_leading, &retry_trailing);
  if (is_good) {
    word->ConsumeWordResults(revised);
  } else if (retry_leading || retry_trailing) {
    // The first attempt suggested different split sizes: retry once, this
    // time splitting the revised word.
    int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
    int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
    WERD_RES *revised2 = TrySuperscriptSplits(
        retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
        trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
    if (is_good) {
      word->ConsumeWordResults(revised2);
    }
    delete revised2;
  }
  delete revised;
  return is_good;
}
/**
* Determine how many characters (rebuilt blobs) on each end of a given word
* might plausibly be superscripts so SubAndSuperscriptFix can try to
* re-recognize them. Even if we find no whole blobs at either end,
* we will set *unlikely_threshold to a certainty that might be used to
* select "bad enough" outlier characters. If *unlikely_threshold is set to 0,
* though, there's really no hope.
*
* Every output is reset on entry (counts and certainties to 0, positions to
* SP_NORMAL). The function returns early, with both counts still 0, when no
* normally placed blob contributed to the average or when neither end of the
* word has a run of outliers.
*
* @param[in] word The word to examine.
* @param[out] num_rebuilt_leading the number of rebuilt blobs at the start
* of the word which are all up or down and
* seem badly classified.
* @param[out] leading_pos the ScriptPos shared by the leading run of
* outliers; SP_NORMAL if there is no such run.
* @param[out] leading_certainty the worst (lowest) certainty in the leading
* blobs that were counted.
* @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end
* of the word which are all up or down and
* seem badly classified.
* @param[out] trailing_pos the ScriptPos of the last blob in the word.
* @param[out] trailing_certainty the worst (lowest) certainty in the trailing
* blobs that were counted.
* @param[out] avg_certainty the average certainty of "normal" blobs in
* the word (the single worst one is dropped
* when there are at least three).
* @param[out] unlikely_threshold the threshold (on certainty) we used to
* select "bad enough" outlier characters:
* superscript_worse_certainty * avg_certainty.
*/
void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,
ScriptPos *leading_pos, float *leading_certainty,
int *num_rebuilt_trailing, ScriptPos *trailing_pos,
float *trailing_certainty, float *avg_certainty,
float *unlikely_threshold) {
// Reset all numeric outputs so that every early return leaves them defined.
*avg_certainty = *unlikely_threshold = 0.0f;
*num_rebuilt_leading = *num_rebuilt_trailing = 0;
*leading_certainty = *trailing_certainty = 0.0f;
// Vertical limits: a blob whose bottom is at or above super_y_bottom is
// treated as a superscript, one whose top is at or below sub_y_top as a
// subscript.
int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
// Step one: Get an average certainty for "normally placed" characters.
// Counts here are of blobs in the rebuild_word / unichars in best_choice.
*leading_pos = *trailing_pos = SP_NORMAL;
int leading_outliers = 0;
// Length of the current run of consecutive outliers that share one position.
int trailing_outliers = 0;
int num_normal = 0;
float normal_certainty_total = 0.0f;
// Starts at 0 and only ever decreases, so it tracks the lowest certainty seen.
float worst_normal_certainty = 0.0f;
ScriptPos last_pos = SP_NORMAL;
int num_blobs = word->rebuild_word->NumBlobs();
for (int b = 0; b < num_blobs; ++b) {
TBOX box = word->rebuild_word->blobs[b]->bounding_box();
ScriptPos pos = SP_NORMAL;
if (box.bottom() >= super_y_bottom) {
pos = SP_SUPERSCRIPT;
} else if (box.top() <= sub_y_top) {
pos = SP_SUBSCRIPT;
}
if (pos == SP_NORMAL) {
// Blobs whose unichar id is 0 are left out of the certainty statistics.
if (word->best_choice->unichar_id(b) != 0) {
float char_certainty = word->best_choice->certainty(b);
if (char_certainty < worst_normal_certainty) {
worst_normal_certainty = char_certainty;
}
num_normal++;
normal_certainty_total += char_certainty;
}
// The run length equals b only if every blob before this one belongs to
// a single run of same-position outliers, i.e. this is the first normal
// blob and the run in front of it is the leading run.
if (trailing_outliers == b) {
leading_outliers = trailing_outliers;
*leading_pos = last_pos;
}
trailing_outliers = 0;
} else {
// Extend the current run, or start a new one when the position changes.
if (last_pos == pos) {
trailing_outliers++;
} else {
trailing_outliers = 1;
}
}
last_pos = pos;
}
// Whatever run is still open at the end of the word is the trailing run.
*trailing_pos = last_pos;
if (num_normal >= 3) { // throw out the worst as an outlier.
num_normal--;
normal_certainty_total -= worst_normal_certainty;
}
if (num_normal > 0) {
*avg_certainty = normal_certainty_total / num_normal;
*unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
}
// Nothing to compare against, or no outliers at either end: leave the
// counts at 0.
if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
return;
}
// Step two: Try to split off bits of the word that are both outliers
// and have much lower certainty than average
// Calculate num_leading and leading_certainty.
// Walk inwards from the start and stop at the first outlier whose certainty
// is above the threshold.
for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
(*num_rebuilt_leading)++) {
float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
if (char_certainty > *unlikely_threshold) {
break;
}
if (char_certainty < *leading_certainty) {
*leading_certainty = char_certainty;
}
}
// Calculate num_trailing and trailing_certainty.
// Same walk, but inwards from the last blob of the word.
for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
*num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
float char_certainty = word->best_choice->certainty(blob_idx);
if (char_certainty > *unlikely_threshold) {
break;
}
if (char_certainty < *trailing_certainty) {
*trailing_certainty = char_certainty;
}
}
}
/**
 * Try splitting off the given number of (chopped) blobs from the front and
 * back of the given word and recognizing the pieces.
 *
 * The input word itself is not modified: the pieces are built from copies.
 *
 * @param[in] num_chopped_leading   how many chopped blobs from the left
 *               end of the word to chop off and try recognizing as a
 *               superscript (or subscript)
 * @param[in] leading_certainty     the (minimum) certainty had by the
 *               characters in the original leading section.
 * @param[in] leading_pos           script position applied to the re-recognized
 *               prefix (also printed in debug output)
 * @param[in] num_chopped_trailing  how many chopped blobs from the right
 *               end of the word to chop off and try recognizing as a
 *               superscript (or subscript)
 * @param[in] trailing_certainty    the (minimum) certainty had by the
 *               characters in the original trailing section.
 * @param[in] trailing_pos          script position applied to the re-recognized
 *               suffix (also printed in debug output)
 * @param[in] word                  the word to try to chop up.
 * @param[out] is_good              do we believe our result?
 * @param[out] retry_rebuild_leading, retry_rebuild_trailing
 *         If non-zero, and !is_good, then the caller may have luck trying
 *         to split the returned word with this number of (rebuilt) leading
 *         and trailing blobs / unichars.
 * @return A newly allocated word which is the result of re-recognizing as
 *         asked (the caller deletes it), or nullptr when the result is not
 *         believable and there is nothing worth retrying.
 */
WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,
                                          ScriptPos leading_pos, int num_chopped_trailing,
                                          float trailing_certainty, ScriptPos trailing_pos,
                                          WERD_RES *word, bool *is_good, int *retry_rebuild_leading,
                                          int *retry_rebuild_trailing) {
  int num_chopped = word->chopped_word->NumBlobs();

  *retry_rebuild_leading = *retry_rebuild_trailing = 0;

  // Chop apart the word into up to three pieces.
  // bb0 / bb1 receive whatever split_word() hands back through its last
  // argument; they are later passed to join_words(), or released here if we
  // bail out before joining.
  BlamerBundle *bb0 = nullptr;
  BlamerBundle *bb1 = nullptr;
  WERD_RES *prefix = nullptr;
  WERD_RES *core = nullptr;
  WERD_RES *suffix = nullptr;
  if (num_chopped_leading > 0) {
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }

  if (num_chopped_trailing > 0) {
    // core no longer contains the leading blobs, so the split point is
    // relative to what is left.
    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
    split_word(core, split_pt, &suffix, &bb1);
  }

  // Recognize the pieces in turn.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    // Adjust our expectations about the baseline for this prefix.
    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().c_str());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n",
            num_chopped - num_chopped_leading - num_chopped_trailing);
  }

  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().c_str());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Evaluate whether we think the results are believably better
  // than what we already had.
  bool good_prefix =
      !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
                                       superscript_bettered_certainty * leading_certainty,
                                       retry_rebuild_leading, nullptr);
  bool good_suffix =
      !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
                                       superscript_bettered_certainty * trailing_certainty, nullptr,
                                       retry_rebuild_trailing);

  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    // Nothing gets joined on this path, so every piece and both bundles
    // returned by split_word() have to be released here. (bb0 used to be
    // forgotten, leaking whenever a prefix had been split off.)
    delete core;
    delete prefix;
    delete suffix;
    delete bb0;
    delete bb1;
    return nullptr;
  }
  // The core is only recognized once we know the result is worth keeping.
  recog_word_recursive(core);

  // Now paste the results together into core.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    // After the join the combined word lives in prefix.
    core = prefix;
    prefix = nullptr;
  }

  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().c_str());
  }
  return core;
}
/**
* Return whether this is believable superscript or subscript text.
*
* We insist that:
* + there are no punctuation marks.
* + there are no italics.
* + no normal-sized character is smaller than superscript_scaledown_ratio
* of what it ought to be, and
* + each character is at least as certain as certainty_threshold.
*
* @param[in] debug If true, spew debug output
* @param[in] word The word whose best_choice we're evaluating
* @param[in] certainty_threshold If any of the characters have less
* certainty than this, reject.
* @param[out] left_ok How many characters at the left end were ok before
* the first rejected one. May be nullptr. Only written
* when the word as a whole is rejected.
* @param[out] right_ok How many characters at the right end were ok after
* the last rejected one. May be nullptr. Only written
* when the word as a whole is rejected.
* @return Whether the complete best choice is believable as a superscript.
*/
bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
int *left_ok, int *right_ok) const {
// Length of the run of acceptable characters at the very start of the word.
int initial_ok_run_count = 0;
// Length of the current run of acceptable characters; after the loop this
// is the run that ends the word.
int ok_run_count = 0;
// Lowest certainty seen; only used for the debug message on acceptance.
float worst_certainty = 0.0f;
const WERD_CHOICE &wc = *word.best_choice;
const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
for (int i = 0; i < wc.length(); i++) {
TBLOB *blob = word.rebuild_word->blobs[i];
UNICHAR_ID unichar_id = wc.unichar_id(i);
float char_certainty = wc.certainty(i);
bool bad_certainty = char_certainty < certainty_threshold;
bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
// Default to the word-level font, if there is one.
bool is_italic = word.fontinfo && word.fontinfo->is_italic();
BLOB_CHOICE *choice = word.GetBlobChoice(i);
if (choice && fontinfo_table.size() > 0) {
// Get better information from the specific choice, if available.
// The character counts as italic only if its first font is italic and
// its second font is either absent (negative id) or italic as well.
int font_id1 = choice->fontinfo_id();
bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;
int font_id2 = choice->fontinfo_id2();
is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());
}
// A fraction of 1 means "no complaint about the height".
float height_fraction = 1.0f;
float char_height = blob->bounding_box().height();
float normal_height = char_height;
if (wc.unicharset()->top_bottom_useful()) {
int min_bot, max_bot, min_top, max_top;
wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
// Expected height: the mean of the two heights derived from the
// unicharset's top/bottom ranges.
float hi_height = max_top - max_bot;
float lo_height = min_top - min_bot;
normal_height = (hi_height + lo_height) / 2;
if (normal_height >= kBlnXHeight) {
// Only ding characters that we have decent information for because
// they're supposed to be normal sized, not tiny specks or dashes.
height_fraction = char_height / normal_height;
}
}
bool bad_height = height_fraction < superscript_scaledown_ratio;
if (debug) {
// One message per failed criterion, for every character.
if (is_italic) {
tprintf(" Rejecting: superscript is italic.\n");
}
if (is_punc) {
tprintf(" Rejecting: punctuation present.\n");
}
const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
if (bad_certainty) {
tprintf(
" Rejecting: don't believe character %s with certainty %.2f "
"which is less than threshold %.2f\n",
char_str, char_certainty, certainty_threshold);
}
if (bad_height) {
tprintf(
" Rejecting: character %s seems too small @ %.2f versus "
"expected %.2f\n",
char_str, char_height, normal_height);
}
}
if (bad_certainty || bad_height || is_punc || is_italic) {
// ok_run_count == i means no character before this one was rejected, so
// the run that just ended is the initial one.
if (ok_run_count == i) {
initial_ok_run_count = ok_run_count;
}
ok_run_count = 0;
} else {
ok_run_count++;
}
if (char_certainty < worst_certainty) {
worst_certainty = char_certainty;
}
}
// The final run covers the whole word only if nothing was rejected.
bool all_ok = ok_run_count == wc.length();
if (all_ok && debug) {
tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
}
if (!all_ok) {
// Report the acceptable runs at either end through the optional outputs.
if (left_ok) {
*left_ok = initial_ok_run_count;
}
if (right_ok) {
*right_ok = ok_run_count;
}
}
return all_ok;
}
} // namespace tesseract

View File

@ -0,0 +1,76 @@
/**********************************************************************
* File: tessbox.cpp (Formerly tessbox.c)
* Description: Black boxed Tess for developing a resaljet.
* Author: Ray Smith
* Created: Thu Apr 23 11:03:36 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfoutline.h"
#include "tesseractclass.h"
/**
* @name tess_segment_pass_n
*
* Segment a word using the pass_n conditions of the tess segmenter.
* @param pass_n pass number
* @param word word to do
*/
namespace tesseract {
void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
int saved_enable_assoc = 0;
int saved_chop_enable = 0;
if (word->word->flag(W_DONT_CHOP)) {
saved_enable_assoc = wordrec_enable_assoc;
saved_chop_enable = chop_enable;
wordrec_enable_assoc.set_value(false);
chop_enable.set_value(false);
}
if (pass_n == 1) {
set_pass1();
} else {
set_pass2();
}
recog_word(word);
if (word->best_choice == nullptr) {
word->SetupFake(*word->uch_set);
}
if (word->word->flag(W_DONT_CHOP)) {
wordrec_enable_assoc.set_value(saved_enable_assoc);
chop_enable.set_value(saved_chop_enable);
}
}
/**
* @name tess_acceptable_word
*
* Thin wrapper that forwards the word to the dictionary's AcceptableResult().
*
* @param word the word result to judge
* @return true if the word is regarded as "good enough".
*/
bool Tesseract::tess_acceptable_word(WERD_RES *word) {
return getDict().AcceptableResult(word);
}
/**
* @name tess_add_doc_word
*
* Add the given word to the document dictionary
* (forwards to the dictionary's add_document_word()).
*
* @param word_choice the word to add; it is dereferenced unconditionally,
* so it must not be null.
*/
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
getDict().add_document_word(*word_choice);
}
} // namespace tesseract

View File

@ -0,0 +1,463 @@
/**********************************************************************
* File: tessedit.cpp (Formerly tessedit.c)
* Description: (Previously) Main program for merge of tess and editor.
* Now just code to load the language model and various
* engine-specific data files.
* Author: Ray Smith
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include "control.h"
#include "matchdefs.h"
#include "pageres.h"
#include "params.h"
#include "stopper.h"
#include "tesseractclass.h"
#include "tessvars.h"
#include "tprintf.h"
#ifndef DISABLED_LEGACY_ENGINE
# include "chop.h"
# include "intmatcher.h"
# include "reject.h"
#endif
#include "lstmrecognizer.h"
namespace tesseract {
// Reads a "config" file, i.e. a set of variable/value pairs, into this
// instance's parameters.
// The file is looked up as <datadir>configs/<filename>, then as
// <datadir>tessconfigs/<filename>; if neither can be opened, filename is
// used as given (relative or absolute path).
void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
  const char *const subdirs[] = {"configs/", "tessconfigs/"};

  std::string resolved;
  bool located = false;
  for (const char *subdir : subdirs) {
    resolved = datadir;
    resolved += subdir;
    resolved += filename;
    // The file is only opened to find out whether it exists.
    FILE *probe = fopen(resolved.c_str(), "rb");
    if (probe != nullptr) {
      fclose(probe);
      located = true;
      break;
    }
  }
  if (!located) {
    resolved = filename;
  }
  ParamUtils::ReadParamsFile(resolved.c_str(), constraint, this->params());
}
// Loads the language data and parameters for a single language.
// Returns false if the traineddata file could not be opened, if the
// unicharset needed by the legacy engine is missing or invalid, if the
// unicharset is larger than MAX_NUM_CLASSES, or if a params model is present
// but fails to load.
// This function initializes TessdataManager. After TessdataManager is
// no longer needed, TessdataManager::End() should be called.
//
// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
// it is OEM_DEFAULT, in which case the mode is first chosen from the engines
// available in [lang].traineddata and may then be overridden by the
// language-specific config file (stored in [lang].traineddata), by the config
// files given in configs, or by vars_vec / vars_values.
// When built with DISABLED_LEGACY_ENGINE the oem argument is not consulted
// and the mode starts out as OEM_LSTM_ONLY.
//
// vars_vec and vars_values are parallel vectors: (*vars_values)[i] is read
// for every index of vars_vec, so vars_values must be at least as long.
bool Tesseract::init_tesseract_lang_data(const std::string &arg0, const std::string &textbase,
const std::string &language, OcrEngineMode oem,
char **configs, int configs_size,
const std::vector<std::string> *vars_vec,
const std::vector<std::string> *vars_values,
bool set_only_non_debug_params, TessdataManager *mgr) {
// Set the basename, compute the data directory.
main_setup(arg0, textbase);
// Set the language data path prefix
// An empty language falls back to "eng".
lang = !language.empty() ? language : "eng";
language_data_path_prefix = datadir;
language_data_path_prefix += lang;
language_data_path_prefix += ".";
// Initialize TessdataManager.
// A manager that is already loaded is used as is; otherwise the traineddata
// file is opened from the computed path.
std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
tprintf("Error opening data file %s\n", tessdata_path.c_str());
tprintf(
"Please make sure the TESSDATA_PREFIX environment variable is set"
" to your \"tessdata\" directory.\n");
return false;
}
#ifdef DISABLED_LEGACY_ENGINE
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
#else
if (oem == OEM_DEFAULT) {
// Set the engine mode from availability, which can then be overridden by
// the config file when we read it below.
if (!mgr->IsLSTMAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
} else if (!mgr->IsBaseAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
} else {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
}
}
#endif // ndef DISABLED_LEGACY_ENGINE
// If a language specific config file (lang.config) exists, load it in.
TFile fp;
if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
}
// The "non-debug only" restriction applies to the config files and
// vars_vec below, not to the language config read above.
SetParamConstraint set_params_constraint =
set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
// Load tesseract variables from config files. This is done after loading
// language-specific variables from [lang].traineddata file, so that custom
// config files can override values in [lang].traineddata file.
for (int i = 0; i < configs_size; ++i) {
read_config_file(configs[i], set_params_constraint);
}
// Set params specified in vars_vec (done after setting params from config
// files, so that params in vars_vec can override those from files).
if (vars_vec != nullptr && vars_values != nullptr) {
for (unsigned i = 0; i < vars_vec->size(); ++i) {
if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
set_params_constraint, this->params())) {
tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
}
}
}
// Optionally dump all parameters to the file named by
// tessedit_write_params_to_file; failure to open it is reported but is not
// fatal.
if (!tessedit_write_params_to_file.empty()) {
FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
if (params_file != nullptr) {
ParamUtils::PrintParams(params_file, this->params());
fclose(params_file);
} else {
tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
}
}
#ifndef DISABLED_LEGACY_ENGINE
// Determine which ocr engine(s) should be loaded and used for recognition.
// An explicit oem wins over anything the config files set above.
if (oem != OEM_DEFAULT) {
tessedit_ocr_engine_mode.set_value(oem);
}
#endif
// If we are only loading the config file (and so not planning on doing any
// recognition) then there's nothing else to do here.
if (tessedit_init_config_only) {
return true;
}
// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
// which engine-specific data files need to be loaded. If LSTM_ONLY is
// requested, the base Tesseract files are *Not* required.
// Both preprocessor branches below open the same block, which is closed
// after the LSTM loading code.
#ifdef DISABLED_LEGACY_ENGINE
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // ndef DISABLED_LEGACY_ENGINE
if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
// NOTE(review): Load() is called inside ASSERT_HOST, so this relies on the
// macro evaluating its argument in every build configuration - confirm.
ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
} else {
// No LSTM data in the traineddata: fall back to the legacy engine mode.
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
}
}
// Load the unicharset
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
// Avoid requiring a unicharset when we aren't running base tesseract.
unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
}
#ifndef DISABLED_LEGACY_ENGINE
else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
tprintf(
"Error: Tesseract (legacy) engine requested, but components are "
"not present in %s!!\n",
tessdata_path.c_str());
return false;
}
#endif // ndef DISABLED_LEGACY_ENGINE
if (unicharset.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
return false;
}
right_to_left_ = unicharset.major_right_to_left();
#ifndef DISABLED_LEGACY_ENGINE
// Setup initial unichar ambigs table and read universal ambigs.
// A copy of the unicharset taken before the ambigs are loaded is passed
// alongside the live unicharset to the loaders below.
UNICHARSET encoder_unicharset;
encoder_unicharset.CopyFrom(unicharset);
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
use_ambigs_for_adaption, &unicharset);
}
// Init ParamsModel.
// Load pass1 and pass2 weights (for now these two sets are the same, but in
// the future separate sets of weights can be generated).
for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
// A missing params model component is fine; one that fails to load is not.
if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
return false;
}
}
}
#endif // ndef DISABLED_LEGACY_ENGINE
return true;
}
// Helper: reports whether str is equal to one of the elements of str_list.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto it = str_list.begin();
  // Advance until a match is found or the list is exhausted.
  while (it != str_list.end() && *it != str) {
    ++it;
  }
  return it != str_list.end();
}
// Splits a language string of the form [~]<lang>[+[~]<lang>]* into codes.
// A code without prefix is appended to to_load, a code prefixed with ~ is
// appended to not_to_load; in both cases only if it is not already present.
// NOTE: an empty code (e.g. from a trailing '+' or a lone '~') is treated
// like any other code and gets appended as an empty string.
void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
                                    std::vector<std::string> *not_to_load) {
  std::string pending(lang_str);
  while (!pending.empty()) {
    const char *cursor = pending.c_str();
    // Skip any separators in front of the next code.
    for (; *cursor == '+'; ++cursor) {
    }
    // A leading '~' redirects the code to the exclusion list.
    const bool excluded = (*cursor == '~');
    if (excluded) {
      ++cursor;
    }
    std::vector<std::string> *dest = excluded ? not_to_load : to_load;

    // The code runs up to the next '+', or to the end of the string.
    const char *separator = strchr(cursor, '+');
    int code_len = strlen(cursor);
    if (separator != nullptr) {
      code_len = separator - cursor;
    }
    std::string code(cursor, code_len);

    // Copy the tail before assigning: cursor points into pending's buffer.
    std::string tail(cursor + code_len);
    pending = tail;

    if (!IsStrInList(code, *dest)) {
      dest->push_back(code);
    }
  }
}
// Initialize for potentially a set of languages defined by the language
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
const std::string &language, OcrEngineMode oem, char **configs,
int configs_size, const std::vector<std::string> *vars_vec,
const std::vector<std::string> *vars_values,
bool set_only_non_debug_params, TessdataManager *mgr) {
std::vector<std::string> langs_to_load;
std::vector<std::string> langs_not_to_load;
ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
for (auto *lang : sub_langs_) {
delete lang;
}
sub_langs_.clear();
// Find the first loadable lang and load into this.
// Add any languages that this language requires
bool loaded_primary = false;
// Load the rest into sub_langs_.
for (unsigned lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
const char *lang_str = langs_to_load[lang_index].c_str();
Tesseract *tess_to_init;
if (!loaded_primary) {
tess_to_init = this;
} else {
tess_to_init = new Tesseract;
}
int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
configs_size, vars_vec, vars_values,
set_only_non_debug_params, mgr);
// Forget that language, but keep any reader we were given.
mgr->Clear();
if (!loaded_primary) {
if (result < 0) {
tprintf("Failed loading language '%s'\n", lang_str);
} else {
ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(), &langs_to_load,
&langs_not_to_load);
loaded_primary = true;
}
} else {
if (result < 0) {
tprintf("Failed loading language '%s'\n", lang_str);
delete tess_to_init;
} else {
sub_langs_.push_back(tess_to_init);
// Add any languages that this language requires
ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(), &langs_to_load,
&langs_not_to_load);
}
}
}
}
if (!loaded_primary) {
tprintf("Tesseract couldn't load any languages!\n");
return -1; // Couldn't load any language!
}
#ifndef DISABLED_LEGACY_ENGINE
if (!sub_langs_.empty()) {
// In multilingual mode word ratings have to be directly comparable,
// so use the same language model weights for all languages:
// use the primary language's params model if
// tessedit_use_primary_params_model is set,
// otherwise use default language model weights.
if (tessedit_use_primary_params_model) {
for (auto &sub_lang : sub_langs_) {
sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
}
tprintf("Using params model of the primary language\n");
} else {
this->language_model_->getParamsModel().Clear();
for (auto &sub_lang : sub_langs_) {
sub_lang->language_model_->getParamsModel().Clear();
}
}
}
SetupUniversalFontIds();
#endif // ndef DISABLED_LEGACY_ENGINE
return 0;
}
// Common initialization for a single language.
// arg0 is the datapath for the tessdata directory, which could be the
// path of the tessdata directory with no trailing /, or (if tessdata
// lives in the same directory as the executable, the path of the executable,
// hence the name arg0.
// textbase is an optional output file basename (used only for training)
// language is the language code to load.
// oem controls which engine(s) will operate on the image
// configs (argv) is an array of config filenames to load variables from.
// May be nullptr.
// configs_size (argc) is the number of elements in configs.
// vars_vec is an optional vector of variables to set.
// vars_values is an optional corresponding vector of values for the variables
// in vars_vec.
// If set_only_non_debug_params is true, only params that do not contain
// "debug" in the name will be set.
int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
const std::string &language, OcrEngineMode oem,
char **configs, int configs_size,
const std::vector<std::string> *vars_vec,
const std::vector<std::string> *vars_values,
bool set_only_non_debug_params, TessdataManager *mgr) {
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec,
vars_values, set_only_non_debug_params, mgr)) {
return -1;
}
if (tessedit_init_config_only) {
return 0;
}
// If only LSTM will be used, skip loading Tesseract classifier's
// pre-trained templates and dictionary.
bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
return 0; // Normal exit
}
#ifndef DISABLED_LEGACY_ENGINE
// Helper: appends every font of new_fonts, in order, to the all_fonts table.
static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
                         UnicityTable<FontInfo> *all_fonts) {
  int font_idx = 0;
  while (font_idx < new_fonts.size()) {
    // UnicityTable uniques as we go.
    all_fonts->push_back(new_fonts.at(font_idx));
    ++font_idx;
  }
}
// Helper: stores in each font of lang_fonts, as its universal_id, the id
// that all_fonts reports for that font.
static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
  int font_idx = 0;
  while (font_idx < lang_fonts->size()) {
    auto &font = lang_fonts->at(font_idx);
    font.universal_id = all_fonts.get_id(font);
    ++font_idx;
  }
}
// Set the universal_id member of each font to be unique among all
// instances of the same font loaded.
void Tesseract::SetupUniversalFontIds() {
// Note that we can get away with bitwise copying FontInfo in
// all_fonts, as it is a temporary structure and we avoid setting the
// delete callback.
UnicityTable<FontInfo> all_fonts;
// Create the universal ID table.
CollectFonts(get_fontinfo_table(), &all_fonts);
for (auto &sub_lang : sub_langs_) {
CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
}
// Assign ids from the table to each font table.
AssignIds(all_fonts, &get_fontinfo_table());
for (auto &sub_lang : sub_langs_) {
AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
}
font_table_size_ = all_fonts.size();
}
// Initializes only the language model component: loads the language data
// for the legacy engine mode (no config files, no extra variables) and then
// loads the dictionary. Returns 0 on success, -1 if the language data could
// not be loaded.
int Tesseract::init_tesseract_lm(const std::string &arg0, const std::string &textbase,
                                 const std::string &language, TessdataManager *mgr) {
  const bool lang_data_ok = init_tesseract_lang_data(
      arg0, textbase, language, OEM_TESSERACT_ONLY, nullptr, 0, nullptr, nullptr, false, mgr);
  if (!lang_data_ok) {
    return -1;
  }
  // getDict() is queried afresh for every step, as before.
  getDict().SetupForLoad(Dict::GlobalDawgCache());
  getDict().Load(lang, mgr);
  getDict().FinishLoad();
  return 0;
}
#endif // ndef DISABLED_LEGACY_ENGINE
// Thin wrapper: all the work is delegated to end_recog().
void Tesseract::end_tesseract() {
end_recog();
}
/* Define command type identifiers */
// NOTE(review): none of these enumerators is referenced in the part of the
// file visible here - confirm that the enum is still needed.
enum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT };
} // namespace tesseract

View File

@ -0,0 +1,574 @@
///////////////////////////////////////////////////////////////////////
// File: tesseractclass.cpp
// Description: The Tesseract class. It holds/owns everything needed
// to run Tesseract on a single language, and also a set of
// sub-Tesseracts to run sub-languages. For thread safety, *every*
// variable that was previously global or static (except for
// constant data, and some visual debugging flags) has been moved
// in here, directly, or indirectly.
// This makes it safe to run multiple Tesseracts in different
// threads in parallel, and keeps the different language
// instances separate.
// Some global functions remain, but they are isolated re-entrant
// functions that operate on their arguments. Functions that work
// on variable data have been moved to an appropriate class based
// mostly on the directory hierarchy. For more information see
// slide 6 of "2ArchitectureAndDataStructures" in
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
// Some global data and related functions still exist in the
// training-related code, but they don't interfere with normal
// recognition operation.
// Author: Ray Smith
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
# include "config_auto.h"
#endif
#include "tesseractclass.h"
#include <allheaders.h>
#include "edgblob.h"
#ifndef DISABLED_LEGACY_ENGINE
# include "equationdetect.h"
#endif
#include "lstmrecognizer.h"
namespace tesseract {
// Default constructor.
// The member initializer list has two parts:
//  1. One *_MEMBER / *_INIT_MEMBER macro per tunable parameter, each given the
//     parameter name, its default value, a help string and this->params()
//     (presumably the parameter collection the entry is attached to — the
//     macro definitions are not visible here, confirm there).
//  2. The remaining, non-parameter members, which start out empty: null
//     images and pointers, zero resolution and counters, scaled_factor_ of -1
//     and (1, 0) for both skew vectors.
// The constructor body itself is empty.
Tesseract::Tesseract()
: BOOL_MEMBER(tessedit_resegment_from_boxes, false,
"Take segmentation and labeling from box file", this->params())
, BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
"Conversion of word/line box file to char box file", this->params())
, BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars",
this->params())
, BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars",
this->params())
, BOOL_MEMBER(tessedit_train_line_recognizer, false,
"Break input into lines and remap boxes if present", this->params())
, BOOL_MEMBER(tessedit_dump_pageseg_images, false,
"Dump intermediate images made during page segmentation", this->params())
, BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
this->params())
,
// The default for pageseg_mode is the old behaviour, so as not to
// upset anything that relies on that.
INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, "
"4=column,"
" 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
"11=sparse_text, 12=sparse_text+osd, 13=raw_line"
" (Values from PageSegMode enum in tesseract/publictypes.h)",
this->params())
, INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
"Which OCR engine(s) to run (Tesseract, LSTM, both)."
" Defaults to loading and running the most accurate"
" available.",
this->params())
, STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize",
this->params())
, STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params())
, STRING_MEMBER(tessedit_char_unblacklist, "",
"List of chars to override tessedit_char_blacklist", this->params())
, BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities",
this->params())
, INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
"Whether to use the top-line splitting process for Devanagari "
"documents while performing page-segmentation.",
this->params())
, INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
"Whether to use the top-line splitting process for Devanagari "
"documents while performing ocr.",
this->params())
, STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.",
this->params())
, BOOL_MEMBER(tessedit_adaption_debug, false,
"Generate and print debug"
" information for adaption",
this->params())
, INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params())
, INT_MEMBER(applybox_debug, 1, "Debug level", this->params())
, INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params())
, STRING_MEMBER(applybox_exposure_pattern, ".exp",
"Exposure value follows"
" this pattern in the image filename. The name of the image"
" files are expected to be in the form"
" [lang].[fontname].exp[num].tif",
this->params())
, BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
"Learn both character fragments (as is done in the"
" special low exposure mode) as well as unfragmented"
" characters.",
this->params())
, BOOL_MEMBER(applybox_learn_ngrams_mode, false,
"Each bounding box"
" is assumed to contain ngrams. Only learn the ngrams"
" whose outlines overlap horizontally.",
this->params())
, BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params())
, BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params())
, BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params())
, BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params())
, BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility",
this->params())
, BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params())
, BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",
this->params())
, BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())
, INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())
, BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())
, BOOL_MEMBER(tessedit_enable_bigram_correction, true,
"Enable correction based on the word bigram dictionary.", this->params())
, BOOL_MEMBER(tessedit_enable_dict_correction, false,
"Enable single word correction based on the dictionary.", this->params())
, INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.",
this->params())
, BOOL_MEMBER(enable_noise_removal, true,
"Remove and conditionally reassign small outlines when they"
" confuse layout analysis, determining diacritics vs noise",
this->params())
, INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params())
,
// Worst (min) certainty, for which a diacritic is allowed to make the
// base
// character worse and still be included.
double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params())
,
// Worst (min) certainty, for which a non-overlapping diacritic is allowed
// to make the base character worse and still be included.
double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params())
,
// Worst (min) certainty, for which a diacritic is allowed to make a new
// stand-alone blob.
double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params())
,
// Factor of certainty margin for adding diacritics to not count as worse.
double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint",
this->params())
, INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params())
, INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params())
, INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params())
, STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params())
, STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params())
, STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params())
, double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params())
, double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params())
, double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit",
this->params())
, double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params())
, INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params())
, INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess",
this->params())
, BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output",
this->params())
, BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params())
, BOOL_MEMBER(test_pt, false, "Test for point", this->params())
, double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params())
, double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params())
, INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params())
, INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params())
, BOOL_MEMBER(paragraph_text_based, true,
"Run paragraph detection on the post-text-recognition "
"(more accurate)",
this->params())
, BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params())
, STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params())
, STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params())
, BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs",
this->params())
, BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params())
, double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc",
this->params())
, double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block",
this->params())
, double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row",
this->params())
, double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
"Number of row rejects in whole word rejects"
" which prevents whole row rejection",
this->params())
, BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
"Only rej partially rejected words in block rejection", this->params())
, BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
"Only rej partially rejected words in row rejection", this->params())
, BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric",
this->params())
, BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric",
this->params())
, INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this",
this->params())
, BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs",
this->params())
, double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
"rej good doc wd if more than this fraction rejected", this->params())
, BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params())
, BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params())
, BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file",
this->params())
, BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params())
, double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params())
, BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params())
, BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
, BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
this->params())
, BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
, BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
, double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
, BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params())
, double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params())
, double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params())
, double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params())
, double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params())
, double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params())
, double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params())
, double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params())
, double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params())
, double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params())
, double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params())
, double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params())
, double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params())
, INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params())
, INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params())
, BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params())
, BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params())
, BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings",
this->params())
, BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params())
, INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings",
this->params())
// NOTE(review): the help text of crunch_leave_uc_strings below is identical to
// the one for crunch_leave_lc_strings; going by the name it presumably should
// read "upper case" — confirm before changing the runtime string.
, INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings",
this->params())
, INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params())
, INT_MEMBER(crunch_debug, 0, "As it says", this->params())
, INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params())
, double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params())
, BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params())
, INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params())
, INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params())
, STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params())
, INT_MEMBER(x_ht_acceptance_tolerance, 8,
"Max allowed deviation of blob top outside of font data", this->params())
, INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params())
, INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params())
, double_MEMBER(superscript_worse_certainty, 2.0,
"How many times worse "
"certainty does a superscript position glyph need to be for "
"us to try classifying it as a char with a different "
"baseline?",
this->params())
, double_MEMBER(superscript_bettered_certainty, 0.97,
"What reduction in "
"badness do we think sufficient to choose a superscript "
"over what we'd thought. For example, a value of 0.6 means "
"we want to reduce badness of certainty by at least 40%",
this->params())
, double_MEMBER(superscript_scaledown_ratio, 0.4,
"A superscript scaled down more than this is unbelievably "
"small. For example, 0.3 means we expect the font size to "
"be no smaller than 30% of the text line font size.",
this->params())
, double_MEMBER(subscript_max_y_top, 0.5,
"Maximum top of a character measured as a multiple of "
"x-height above the baseline for us to reconsider whether "
"it's a subscript.",
this->params())
, double_MEMBER(superscript_min_y_bottom, 0.3,
"Minimum bottom of a character measured as a multiple of "
"x-height above the baseline for us to reconsider whether "
"it's a superscript.",
this->params())
, BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output",
this->params())
, BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params())
, BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params())
, BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
, BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
, BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
, BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
this->params())
, BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
, BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
this->params())
, BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params())
, BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
this->params())
, INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
, INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
, INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
this->params())
, STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params())
, INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params())
, INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params())
, BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params())
, double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params())
, double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params())
, BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params())
, BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params())
, BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD",
this->params())
, BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL",
this->params())
, INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params())
, BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params())
, BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params())
, double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params())
, double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params())
, BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params())
, BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params())
, BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params())
, BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params())
, BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params())
, BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params())
, BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params())
, BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params())
, double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params())
, INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params())
, STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params())
, STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params())
, INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params())
, BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params())
, INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process",
this->params())
, BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params())
, BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params())
, STRING_MEMBER(file_type, ".tif", "Filename extension", this->params())
, BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params())
, STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one",
this->params())
, BOOL_MEMBER(tessedit_use_primary_params_model, false,
"In multilingual mode use params model of the"
" primary language",
this->params())
, double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin",
this->params())
, BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
, BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
, BOOL_MEMBER(poly_allow_detailed_fx, false,
"Allow feature extractors to see the original outline", this->params())
, BOOL_INIT_MEMBER(tessedit_init_config_only, false,
"Only initialize with the config file. Useful if the "
"instance is not going to be used for OCR but say only "
"for layout analysis.",
this->params())
, BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params())
, BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params())
, BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode",
this->params())
, double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page "
"mode",
this->params())
, double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.", this->params())
, INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params())
, BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces",
this->params())
, STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)",
this->params())
, INT_MEMBER(lstm_choice_mode, 0,
"Allows to include alternative symbols choices in the hOCR output. "
"Valid input values are 0, 1 and 2. 0 is the default value. "
"With 1 the alternative symbol choices per timestep are included. "
"With 2 alternative symbol choices are extracted from the CTC "
"process instead of the lattice. The choices are mapped per "
"character.",
this->params())
, INT_MEMBER(lstm_choice_iterations, 5,
"Sets the number of cascading iterations for the Beamsearch in "
"lstm_choice_mode. Note that lstm_choice_mode must be set to a "
"value greater than 0 to produce results.",
this->params())
, double_MEMBER(lstm_rating_coefficient, 5,
"Sets the rating coefficient for the lstm choices. The smaller the "
"coefficient, the better are the ratings for each choice and less "
"information is lost due to the cut off at 0. The standard value is "
"5",
this->params())
, BOOL_MEMBER(pageseg_apply_music_mask, true,
"Detect music staff and remove intersecting components", this->params())
,
// Non-parameter state follows: everything starts out empty/null.
backup_config_file_(nullptr)
, pix_binary_(nullptr)
, pix_grey_(nullptr)
, pix_original_(nullptr)
, pix_thresholds_(nullptr)
, source_resolution_(0)
, textord_(this)
, right_to_left_(false)
, scaled_color_(nullptr)
, scaled_factor_(-1)
, deskew_(1.0f, 0.0f)
, reskew_(1.0f, 0.0f)
, most_recently_used_(this)
, font_table_size_(0)
, equ_detect_(nullptr)
, lstm_recognizer_(nullptr)
, train_line_page_num_(0) {}
// Destructor. In order: calls Clear(), destroys the original image, calls
// end_tesseract(), deletes every sub-language Tesseract held in sub_langs_,
// and finally deletes the LSTM recognizer.
Tesseract::~Tesseract() {
Clear();
// Clear() leaves pix_original_ alone, so it is released here.
pix_original_.destroy();
end_tesseract();
// sub_langs_ stores raw pointers; each one is deleted by this instance.
for (auto *lang : sub_langs_) {
delete lang;
}
delete lstm_recognizer_;
lstm_recognizer_ = nullptr;
}
// Returns the dictionary to use. The LSTM recognizer's dictionary is returned
// only when all of the following hold: the Classify dictionary has no dawgs,
// AnyLSTMLang() is true, and an LSTM recognizer with a dictionary exists.
// In every other case the Classify dictionary is returned.
Dict &Tesseract::getDict() {
  const bool classify_dict_empty = (Classify::getDict().NumDawgs() == 0);
  if (classify_dict_empty && AnyLSTMLang() && lstm_recognizer_) {
    if (auto *lstm_dict = lstm_recognizer_->GetDict()) {
      return *lstm_dict;
    }
  }
  return Classify::getDict();
}
void Tesseract::Clear() {
std::string debug_name = imagebasename + "_debug.pdf";
pixa_debug_.WritePDF(debug_name.c_str());
pix_binary_.destroy();
pix_grey_.destroy();
pix_thresholds_.destroy();
scaled_color_.destroy();
deskew_ = FCOORD(1.0f, 0.0f);
reskew_ = FCOORD(1.0f, 0.0f);
splitter_.Clear();
scaled_factor_ = -1;
for (auto &sub_lang : sub_langs_) {
sub_lang->Clear();
}
}
#ifndef DISABLED_LEGACY_ENGINE
// Installs `detector` as the equation detector for this instance and tells the
// detector which Tesseract it belongs to.
// Passing nullptr clears the current detector (equ_detect_ also starts out as
// nullptr in the constructor); previously that dereferenced a null pointer.
void Tesseract::SetEquationDetect(EquationDetect *detector) {
  equ_detect_ = detector;
  if (equ_detect_ != nullptr) {
    equ_detect_->SetLangTesseract(this);
  }
}
// Clear all memory of adaption for this and all subclassifiers.
void Tesseract::ResetAdaptiveClassifier() {
ResetAdaptiveClassifierInternal();
for (auto &sub_lang : sub_langs_) {
sub_lang->ResetAdaptiveClassifierInternal();
}
}
#endif // ndef DISABLED_LEGACY_ENGINE
// Clear the document dictionary for this and all subclassifiers.
void Tesseract::ResetDocumentDictionary() {
getDict().ResetDocumentDictionary();
for (auto &sub_lang : sub_langs_) {
sub_lang->getDict().ResetDocumentDictionary();
}
}
void Tesseract::SetBlackAndWhitelist() {
// Set the white and blacklists (if any)
unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
tessedit_char_whitelist.c_str(),
tessedit_char_unblacklist.c_str());
if (lstm_recognizer_) {
UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset();
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
tessedit_char_whitelist.c_str(),
tessedit_char_unblacklist.c_str());
}
// Black and white lists should apply to all loaded classifiers.
for (auto &sub_lang : sub_langs_) {
sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
tessedit_char_whitelist.c_str(),
tessedit_char_unblacklist.c_str());
if (sub_lang->lstm_recognizer_) {
UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset();
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
tessedit_char_whitelist.c_str(),
tessedit_char_unblacklist.c_str());
}
}
}
// Perform steps to prepare underlying binary image/other data structures for
// page segmentation.
void Tesseract::PrepareForPageseg() {
textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
// Find the max splitter strategy over all langs.
auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
static_cast<int32_t>(pageseg_devanagari_split_strategy));
for (auto &sub_lang : sub_langs_) {
auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));
if (pageseg_strategy > max_pageseg_strategy) {
max_pageseg_strategy = pageseg_strategy;
}
sub_lang->pix_binary_.destroy();
sub_lang->pix_binary_ = pix_binary().clone();
}
// Perform shiro-rekha (top-line) splitting and replace the current image by
// the newly split image.
splitter_.set_orig_pix(pix_binary());
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
if (splitter_.Split(true, &pixa_debug_)) {
ASSERT_HOST(splitter_.splitted_image());
pix_binary_.destroy();
pix_binary_ = splitter_.splitted_image().clone();
}
}
// Gets the binary image and related structures ready for OCR. The current
// segmentation (block_list) is required.
// Note: pix_binary_ is reset to the original binarized image here, which can
// differ from the image actually used for OCR depending on the value of
// ocr_devanagari_split_strategy.
// osd_tess and osr are not used by this method.
void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) {
  using Strategy = ShiroRekhaSplitter::SplitStrategy;

  // Start from our own strategy and raise it to the maximum over sub-languages.
  Strategy strongest = static_cast<Strategy>(static_cast<int32_t>(ocr_devanagari_split_strategy));
  for (auto *sub : sub_langs_) {
    const auto candidate =
        static_cast<Strategy>(static_cast<int32_t>(sub->ocr_devanagari_split_strategy));
    if (strongest < candidate) {
      strongest = candidate;
    }
  }

  // Feed the splitter the existing segmentation and run it in OCR mode.
  splitter_.set_segmentation_block_list(block_list);
  splitter_.set_ocr_split_strategy(strongest);
  const bool ocr_image_was_split = splitter_.Split(false, &pixa_debug_);

  // Put the binarized original back into pix_binary_ for future reference.
  ASSERT_HOST(splitter_.orig_pix());
  pix_binary_.destroy();
  pix_binary_ = splitter_.orig_pix().clone();

  // When pageseg and OCR used different strategies, the block list from the
  // last SegmentImage call is refreshed with blobs from the image that will
  // really be used for OCR.
  if (splitter_.HasDifferentSplitStrategies()) {
    BLOCK scratch_block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
                        pixGetHeight(pix_binary_));
    Image ocr_pix = ocr_image_was_split ? splitter_.splitted_image() : splitter_.orig_pix();
    extract_edges(ocr_pix, &scratch_block);
    splitter_.RefreshSegmentationWithNewBlobs(scratch_block.blob_list());
  }

  // The splitter is no longer needed after this point; clear it to save memory.
  splitter_.Clear();
}
} // namespace tesseract

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,24 @@
/**********************************************************************
* File: tessvars.cpp (Formerly tessvars.c)
* Description: Variables and other globals for tessedit.
* Author: Ray Smith
* Created: Mon Apr 13 13:13:23 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <cstdio>
#include "tessvars.h"
// Definition of the global debug output stream; it starts out pointing at
// stderr.
FILE *debug_fp = stderr; // write debug stuff here

View File

@ -0,0 +1,27 @@
/**********************************************************************
* File: tessvars.h (Formerly tessvars.h)
* Description: Variables and other globals for tessedit.
* Author: Ray Smith
* Created: Mon Apr 13 13:13:23 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSVARS_H
#define TESSVARS_H
#include <cstdio>
// Global debug output stream. Only declared here; the definition (initialized
// to stderr) is in tessvars.cpp.
extern FILE *debug_fp; // write debug stuff here
#endif // TESSVARS_H

View File

@ -0,0 +1,306 @@
/**********************************************************************
* File: tfacepp.cpp (Formerly tface++.c)
* Description: C++ side of the C/C++ Tess/Editor interface.
* Author: Ray Smith
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <cmath>
#include "blamer.h"
#include "errcode.h"
#include "ratngs.h"
#include "reject.h"
#include "tesseractclass.h"
#include "werd.h"
// Largest blob count that recog_word_recursive passes to cc_recog in one
// piece; words with more blobs are first cut by split_and_recog_word.
#define MAX_UNDIVIDED_LENGTH 24
/**********************************************************************
* recog_word
*
* Convert the word to tess form and pass it to the tess segmenter.
* Convert the output back to editor form.
**********************************************************************/
namespace tesseract {
// Recognizes one word and sanity-checks the outcome.
//
// Steps:
//  1. If wordrec_skip_no_truth_words is set and the word has no blamer
//     bundle (or the bundle reports IRR_NO_TRUTH), mark the word as failed
//     and return without running recognition.
//  2. Run recog_word_recursive() and rebuild the box word.
//  3. Assert that best_choice and box_word have the same length and that
//     all segmentation states are valid for the ratings matrix.
//  4. If tessedit_override_permuter is set, replace a non-dictionary
//     permuter with the one reported by dict_word() when that is a
//     dictionary permuter and the string contains at least one alpha.
//  5. Set word->tess_failed; on failure also reset the reject map.
//
// word: the word to recognize; its chopped_word must contain blobs.
void Tesseract::recog_word(WERD_RES *word) {
  if (wordrec_skip_no_truth_words &&
      (word->blamer_bundle == nullptr ||
       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
    if (classify_debug_level) {
      tprintf("No truth for word - skipping\n");
    }
    word->tess_failed = true;
    return;
  }
  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();
  // Print the details before the assert below fires, so the failure can be
  // diagnosed from the log.
  if (word->best_choice->length() != word->box_word->length()) {
    tprintf(
        "recog_word ASSERT FAIL String:\"%s\"; "
        "Strlen=%d; #Blobs=%d\n",
        word->best_choice->debug_string().c_str(), word->best_choice->length(),
        word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
  // Check that the ratings matrix size matches the sum of all the
  // segmentation states.
  if (!word->StatesAllValid()) {
    // Terminate the line so the word-choice dump that follows does not run
    // on directly behind this message.
    tprintf("Not all words have valid states relative to ratings matrix!!\n");
    word->DebugWordChoices(true, nullptr);
    ASSERT_HOST(word->StatesAllValid());
  }
  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    uint8_t perm_type = word->best_choice->permuter();
    if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&
        (perm_type != USER_DAWG_PERM)) {
      uint8_t real_dict_perm_type = dict_word(*word->best_choice);
      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||
           (real_dict_perm_type == USER_DAWG_PERM)) &&
          (alpha_count(word->best_choice->unichar_string().c_str(),
                       word->best_choice->unichar_lengths().c_str()) > 0)) {
        word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
      }
    }
    if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
    }
  }
  // Factored out from control.cpp
  // NOTE(review): best_choice has already been dereferenced above, so the
  // nullptr tests below cannot protect this function - confirm whether they
  // are still needed.
  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
  // A result that is missing, empty, or made only of spaces counts as a
  // recognition failure.
  if (word->best_choice == nullptr || word->best_choice->empty() ||
      static_cast<int>(strspn(word->best_choice->unichar_string().c_str(), " ")) ==
          word->best_choice->length()) {
    word->tess_failed = true;
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  } else {
    word->tess_failed = false;
  }
}
/**********************************************************************
 * recog_word_recursive
 *
 * Recognize a word, dividing it first when it has more than
 * MAX_UNDIVIDED_LENGTH blobs. After recognition the best choice is
 * reconciled with the number of rebuilt blobs: a choice that is too long
 * is marked bad and reported, a choice that is too short is padded with
 * space unichars.
 **********************************************************************/
void Tesseract::recog_word_recursive(WERD_RES *word) {
  const int input_blob_count = word->chopped_word->NumBlobs();
  if (input_blob_count > MAX_UNDIVIDED_LENGTH) {
    // Too long to handle in one go: split, recognize the parts, re-join.
    split_and_recog_word(word);
    return;
  }
  cc_recog(word);
  const int output_blob_count = word->rebuild_word->NumBlobs();
  // More characters than blobs should never happen; flag it loudly.
  if (word->best_choice->length() > output_blob_count) {
    word->best_choice->make_bad();
    tprintf(
        "recog_word: Discarded long string \"%s\""
        " (%d characters vs %d blobs)\n",
        word->best_choice->unichar_string().c_str(), word->best_choice->length(),
        output_blob_count);
    tprintf("Word is at:");
    word->word->bounding_box().print();
  }
  // Fewer characters than blobs: pad with spaces, reusing the certainty the
  // choice has at the moment of each append.
  if (word->best_choice->length() < output_blob_count) {
    const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
    do {
      word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());
    } while (word->best_choice->length() < output_blob_count);
  }
}
/**********************************************************************
 * split_and_recog_word
 *
 * Cut the word in two at the widest horizontal gap between neighbouring
 * blobs, recognize both halves (recursively, so each half may be cut
 * again) and merge the results back into the original WERD_RES.
 **********************************************************************/
void Tesseract::split_and_recog_word(WERD_RES *word) {
  // Scan neighbouring blob pairs for the widest gap; the first widest one
  // wins because only a strictly larger gap replaces the current best.
  int widest_gap = -INT32_MAX;
  int split_index = 0;
  const int num_blobs = word->chopped_word->NumBlobs();
  for (int idx = 1; idx < num_blobs; ++idx) {
    const TBOX left_box = word->chopped_word->blobs[idx - 1]->bounding_box();
    const TBOX right_box = word->chopped_word->blobs[idx]->bounding_box();
    const int gap = right_box.left() - left_box.right();
    if (gap > widest_gap) {
      widest_gap = gap;
      split_index = idx;
    }
  }
  ASSERT_HOST(split_index > 0);

  WERD_RES *right_half = nullptr;
  BlamerBundle *original_bundle = nullptr;
  split_word(word, split_index, &right_half, &original_bundle);

  // After the split, word holds the left half.
  recog_word_recursive(word);
  recog_word_recursive(right_half);

  // join_words deletes right_half and puts original_bundle back into word.
  join_words(word, right_half, original_bundle);
}
/**********************************************************************
* split_word
*
* Split a given WERD_RES in place into two smaller words for recognition.
* split_pt is the index of the first blob to go in the second word.
* The underlying word is left alone, only the TWERD (and subsequent data)
* are split up. orig_blamer_bundle is set to a copy of the original blamer
* bundle (or nullptr if the word had none), and is owned by the caller.
* New blamer bundles are forged for the two pieces.
*
* word:               in/out; keeps the blobs [0, split_pt).
* split_pt:           must satisfy 0 < split_pt < number of chopped blobs.
* right_piece:        out; newly allocated WERD_RES holding the blobs
*                     [split_pt, end).
* orig_blamer_bundle: out; see above.
**********************************************************************/
void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
BlamerBundle **orig_blamer_bundle) const {
ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
// Save a copy of the blamer bundle so we can try to reconstruct it below.
BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
auto *word2 = new WERD_RES(*word);
// blow away the copied chopped_word, as we want to work with
// the blobs from the input chopped_word so seam_arrays can be merged.
TWERD *chopped = word->chopped_word;
auto *chopped2 = new TWERD;
// Hand the blob pointers from split_pt onwards to the new TWERD, then
// shrink the original vector so it keeps only the left part.
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
chopped2->blobs.push_back(chopped->blobs[i]);
}
chopped->blobs.resize(split_pt);
// Detach both chopped words before ClearResults() runs; they are attached
// again right after. Presumably this keeps ClearResults() from touching
// them - confirm against WERD_RES::ClearResults.
word->chopped_word = nullptr;
delete word2->chopped_word;
word2->chopped_word = nullptr;
// Note: this local shadows the Tesseract::unicharset member.
const UNICHARSET &unicharset = *word->uch_set;
word->ClearResults();
word2->ClearResults();
word->chopped_word = chopped;
word2->chopped_word = chopped2;
word->SetupBasicsFromChoppedWord(unicharset);
word2->SetupBasicsFromChoppedWord(unicharset);
// Try to adjust the blamer bundle.
if (orig_bb != nullptr) {
// TODO(rays) Looks like a leak to me.
// orig_bb should take, rather than copy.
word->blamer_bundle = new BlamerBundle();
word2->blamer_bundle = new BlamerBundle();
// The split is described by the right edge of the last left blob and the
// left edge of the first right blob.
orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
word->blamer_bundle, word2->blamer_bundle);
}
*right_piece = word2;
*orig_blamer_bundle = orig_bb;
}
/**********************************************************************
* join_words
*
* The opposite of split_word():
* join word2 (including any recognized data / seam array / etc)
* onto the right of word and then delete word2.
* Also, if orig_bb is provided, stitch it back into word.
*
* word:    in/out; left piece, receives the combined result.
* word2:   right piece; deleted before this function returns.
* orig_bb: may be nullptr; otherwise it replaces word->blamer_bundle after
*          the blames of both pieces have been joined into it.
**********************************************************************/
void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {
// Remember the boxes on either side of the join before the blob vectors
// are merged; they are needed for the seam position below.
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
// Tack the word2 outputs onto the end of the word outputs.
word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
// The pointers now live in word; empty word2's vectors (presumably so that
// deleting word2 does not free the moved blobs - confirm).
word2->chopped_word->blobs.clear();
word2->rebuild_word->blobs.clear();
// Seam location: horizontally midway between the two pieces, vertically
// the mean of the four top/bottom edges.
TPOINT split_pt;
split_pt.x = (prev_box.right() + blob_box.left()) / 2;
split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
// Move the word2 seams onto the end of the word1 seam_array.
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
word->seam_array.push_back(new SEAM(0.0f, split_pt));
word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
word2->seam_array.clear();
// Fix widths and gaps.
word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
// Fix the ratings matrix.
int rat1 = word->ratings->dimension();
int rat2 = word2->ratings->dimension();
word->ratings->AttachOnCorner(word2->ratings);
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
// Append the word choices.
*word->raw_choice += *word2->raw_choice;
// How many alt choices from each should we try to get?
const int kAltsPerPiece = 2;
// When do we start throwing away extra alt choices?
const int kTooManyAltChoices = 100;
// Construct the cartesian product of the best_choices of word(1) and word2.
WERD_CHOICE_LIST joined_choices;
WERD_CHOICE_IT jc_it(&joined_choices);
WERD_CHOICE_IT bc1_it(&word->best_choices);
WERD_CHOICE_IT bc2_it(&word2->best_choices);
int num_word1_choices = word->best_choices.length();
int total_joined_choices = num_word1_choices;
// Nota Bene: For the main loop here, we operate only on the 2nd and greater
// word2 choices, and put them in the joined_choices list. The 1st word2
// choice gets added to the original word1 choices in-place after we have
// finished with them.
int bc2_index = 1;
// Start at the second word2 choice; the loop ends once the iterator is
// back at the first element.
for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
// Once the total is large, only the first few word2 choices are combined.
if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
break;
}
int bc1_index = 0;
for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
break;
}
// New choice = copy of a word1 choice with the current word2 choice
// appended.
auto *wc = new WERD_CHOICE(*bc1_it.data());
*wc += *bc2_it.data();
jc_it.add_after_then_move(wc);
++total_joined_choices;
}
}
// Now that we've filled in as many alternates as we want, paste the best
// choice for word2 onto the original word alt_choices.
bc1_it.move_to_first();
bc2_it.move_to_first();
for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
*bc1_it.data() += *bc2_it.data();
}
// The extra combinations go behind the in-place extended word1 choices.
bc1_it.move_to_last();
bc1_it.add_list_after(&joined_choices);
// Restore the pointer to original blamer bundle and combine blamer
// information recorded in the splits.
if (orig_bb != nullptr) {
orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
delete word->blamer_bundle;
word->blamer_bundle = orig_bb;
}
// Rebuild the box word for the merged blobs and size the reject map to it.
word->SetupBoxWord();
word->reject_map.initialise(word->box_word->length());
delete word2;
}
} // namespace tesseract

View File

@ -0,0 +1,331 @@
///////////////////////////////////////////////////////////////////////
// File: thresholder.cpp
// Description: Base API for thresholding images in tesseract.
// Author: Ray Smith
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <allheaders.h>
#include <cstdint> // for uint32_t
#include <cstring>
#include "otsuthr.h"
#include "thresholder.h"
#include "tprintf.h" // for tprintf
#if defined(USE_OPENCL)
# include "openclwrapper.h" // for OpenclDevice
#endif
namespace tesseract {
// Builds an empty thresholder: no image, zero sizes, scale 1, both
// resolution values at 300 and an empty (all-zero) processing rectangle.
ImageThresholder::ImageThresholder()
    : pix_(nullptr)
    , image_width_(0)
    , image_height_(0)
    , pix_channels_(0)
    , pix_wpl_(0)
    , scale_(1)
    , yres_(300)
    , estimated_res_(300)
    , rect_left_(0)
    , rect_top_(0)
    , rect_width_(0)
    , rect_height_(0) {}
// Releases the held image by calling Clear().
ImageThresholder::~ImageThresholder() {
Clear();
}
// Destroy the Pix if there is one, freeing memory.
// Only pix_ is touched; the stored sizes, resolutions and rectangle keep
// their values.
void ImageThresholder::Clear() {
pix_.destroy();
}
// Return true if no image has been set, i.e. pix_ compares equal to nullptr.
bool ImageThresholder::IsEmpty() const {
return pix_ == nullptr;
}
// SetImage makes a copy of all the image data, so it may be deleted
// immediately after this call.
// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
// Palette color images will not work properly and must be converted to
// 24 bit.
// Binary images of 1 bit per pixel may also be given but they must be
// byte packed with the MSB of the first byte being the first pixel, and a
// one pixel is WHITE. For binary images set bytes_per_pixel=0.
void ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height,
int bytes_per_pixel, int bytes_per_line) {
int bpp = bytes_per_pixel * 8;
if (bpp == 0) {
bpp = 1;
}
Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
l_uint32 *data = pixGetData(pix);
int wpl = pixGetWpl(pix);
switch (bpp) {
case 1:
for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
for (int x = 0; x < width; ++x) {
if (imagedata[x / 8] & (0x80 >> (x % 8))) {
CLEAR_DATA_BIT(data, x);
} else {
SET_DATA_BIT(data, x);
}
}
}
break;
case 8:
// Greyscale just copies the bytes in the right order.
for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
for (int x = 0; x < width; ++x) {
SET_DATA_BYTE(data, x, imagedata[x]);
}
}
break;
case 24:
// Put the colors in the correct places in the line buffer.
for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
for (int x = 0; x < width; ++x, ++data) {
SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
}
}
break;
case 32:
// Maintain byte order consistency across different endianness.
for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
for (int x = 0; x < width; ++x) {
data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
(imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
}
}
break;
default:
tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
}
SetImage(pix);
pix.destroy();
}
// Remember which part of the image later calls should work on.
// Nothing is thresholded and nothing is validated here; the four values
// are stored exactly as given.
void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
  // Horizontal extent.
  rect_left_ = left;
  rect_width_ = width;
  // Vertical extent.
  rect_top_ = top;
  rect_height_ = height;
}
// Get enough parameters to be able to rebuild bounding boxes in the
// original image (not just within the rectangle).
// Left and top are enough with top-down coordinates, but
// the height of the rectangle and the image are needed for bottom-up.
// All six arguments are output pointers and are written unconditionally,
// so none of them may be null.
void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
int *imageheight) {
// Current processing rectangle.
*left = rect_left_;
*top = rect_top_;
*width = rect_width_;
*height = rect_height_;
// Size of the whole source image.
*imagewidth = image_width_;
*imageheight = image_height_;
}
// Pix vs raw, which to use? Pix is the preferred input for efficiency,
// since raw buffers are copied.
// SetImage for Pix stores its own copy of the input (converted where
// needed, see below), so the source pix may be pixDestroyed immediately
// after this call.
// Also resets scale_ to 1, takes both resolution values from the stored
// Pix and calls Init().
void ImageThresholder::SetImage(const Image pix) {
// Release any image held from an earlier call.
if (pix_ != nullptr) {
pix_.destroy();
}
Image src = pix;
int depth;
pixGetDimensions(src, &image_width_, &image_height_, &depth);
// Convert the image as necessary so it is one of binary, plain RGB, or
// 8 bit with no colormap. Guarantee that we always end up with our own copy,
// not just a clone of the input.
if (pixGetColormap(src)) {
Image tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);
depth = pixGetDepth(tmp);
if (depth > 1 && depth < 8) {
// Depths strictly between 1 and 8 are widened to 8 bit; the intermediate
// image is no longer needed afterwards.
pix_ = pixConvertTo8(tmp, false);
tmp.destroy();
} else {
// tmp is kept as the stored image, so it must not be destroyed here.
pix_ = tmp;
}
} else if (depth > 1 && depth < 8) {
pix_ = pixConvertTo8(src, false);
} else {
pix_ = src.copy();
}
// Derive the remaining members from the image that was actually stored.
depth = pixGetDepth(pix_);
// Integer division: a 1 bit image gives 0 channels, which IsBinary() tests.
pix_channels_ = depth / 8;
pix_wpl_ = pixGetWpl(pix_);
scale_ = 1;
estimated_res_ = yres_ = pixGetYRes(pix_);
Init();
}
// Produce the binary output image for the current rectangle.
// *pix receives a newly created Pix which the caller has to destroy.
// pageseg_mode is not used by this implementation.
/// Returns false (and leaves *pix alone) if the image is wider or taller
/// than INT16_MAX, true otherwise.
bool ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Image *pix) {
  const bool too_wide = image_width_ > INT16_MAX;
  const bool too_tall = image_height_ > INT16_MAX;
  if (too_wide || too_tall) {
    tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
    return false;
  }
  if (pix_channels_ != 0) {
    // Grey or color input: Otsu-threshold the rectangle.
    OtsuThresholdRectToPix(pix_, pix);
    return true;
  }
  // Binary input still has to be copied, because the caller is allowed to
  // modify what it gets back.
  Image rect_pix = GetPixRect();
  *pix = rect_pix.copy();
  rect_pix.destroy();
  return true;
}
// Build an 8 bit image that holds a threshold value for every pixel of the
// current rectangle. Subclasses may return a reduced-size image, from
// which the scale factor can be inferred; this implementation returns one
// with the rectangle's size, filled with a single value: the Otsu
// threshold of the first channel of the grey rectangle, or 128 when that
// threshold is not positive.
// Returns nullptr if the input is binary. PixDestroy after use.
Image ImageThresholder::GetPixRectThresholds() {
  if (IsBinary()) {
    return nullptr;
  }
  Image grey = GetPixRectGrey();
  const int grey_width = pixGetWidth(grey);
  const int grey_height = pixGetHeight(grey);
  std::vector<int> thresholds;
  std::vector<int> hi_values;
  OtsuThreshold(grey, 0, 0, grey_width, grey_height, thresholds, hi_values);
  grey.destroy();

  Image result = pixCreate(grey_width, grey_height, 8);
  int fill_value = 128;
  if (thresholds[0] > 0) {
    fill_value = thresholds[0];
  }
  pixSetAllArbitrary(result, fill_value);
  return result;
}
// Common initialization shared between SetImage methods.
// Resets the processing rectangle so it covers the whole image.
void ImageThresholder::Init() {
SetRectangle(0, 0, image_width_, image_height_);
}
// Return the part of the source image selected by the current rectangle.
// When the rectangle covers the whole image a clone of pix_ is returned,
// otherwise a cropped image. Either way the caller has to destroy the
// result.
// Page layout analysis is the intended user; since that needs Leptonica
// anyway, no raw-buffer variant is offered.
Image ImageThresholder::GetPixRect() {
  if (!IsFullImage()) {
    // Crop to the given rectangle.
    Box *clip_box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
    Image cropped = pixClipRectangle(pix_, clip_box, nullptr);
    boxDestroy(&clip_box);
    return cropped;
  }
  // Whole image requested: a clone is enough.
  return pix_.clone();
}
// Return the current rectangle of the source image as 8 bit greyscale, at
// the same resolution as the output binary.
// An image that already is 8 bit deep is returned as obtained from
// GetPixRect(); anything else is converted (24 bit goes through 32 bit
// first). The caller has to destroy the result.
// Provided to the classifier to extract features from the greyscale image.
Image ImageThresholder::GetPixRectGrey() {
  auto rect_pix = GetPixRect();
  const int depth = pixGetDepth(rect_pix);
  if (depth == 8) {
    return rect_pix;
  }
  if (depth == 24) {
    auto widened = pixConvert24To32(rect_pix);
    rect_pix.destroy();
    rect_pix = widened;
  }
  auto grey = pixConvertTo8(rect_pix, false);
  rect_pix.destroy();
  return grey;
}
// Otsu thresholds the rectangle, taking the rectangle from *this.
// src_pix: image to threshold.
// out_pix: receives the binary result.
void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const {
std::vector<int> thresholds;
std::vector<int> hi_values;
// Compute the per-channel thresholds for the rectangle; the return value is
// the channel count passed on below.
int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, rect_height_,
thresholds, hi_values);
// only use opencl if compiled w/ OpenCL and selected device is opencl
#ifdef USE_OPENCL
OpenclDevice od;
// The OpenCL path is only taken for 4 channel input with a rectangle that
// starts at the image origin.
if (num_channels == 4 && od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0) {
od.ThresholdRectToPixOCL((unsigned char *)pixGetData(src_pix), num_channels,
pixGetWpl(src_pix) * 4, &thresholds[0], &hi_values[0], out_pix /*pix_OCL*/,
rect_height_, rect_width_, rect_top_, rect_left_);
} else {
#endif
// CPU path; without USE_OPENCL this is the only statement that remains.
ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
#ifdef USE_OPENCL
}
#endif
}
/// Binarize the current rectangle of src_pix into a new 1 bit image that
/// is stored in *pix, carrying over the x and y resolution of src_pix.
/// num_channels is both the number of entries used from thresholds and
/// hi_values and the number of bytes per pixel in src_pix.
/// A channel with a negative hi_values entry is ignored. Otherwise the
/// output bit is set as soon as one channel has
/// (value > threshold) == (hi_value == 0); if no channel does, the bit is
/// cleared.
void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
                                          const std::vector<int> &hi_values, Image *pix) const {
  *pix = pixCreate(rect_width_, rect_height_, 1);
  uint32_t *dst_base = pixGetData(*pix);
  const int dst_wpl = pixGetWpl(*pix);
  const int src_wpl = pixGetWpl(src_pix);
  uint32_t *src_base = pixGetData(src_pix);
  pixSetXRes(*pix, pixGetXRes(src_pix));
  pixSetYRes(*pix, pixGetYRes(src_pix));
  for (int row = 0; row < rect_height_; ++row) {
    // Source rows are offset by the rectangle's top, destination rows are not.
    const uint32_t *src_line = src_base + (row + rect_top_) * src_wpl;
    uint32_t *dst_line = dst_base + row * dst_wpl;
    for (int col = 0; col < rect_width_; ++col) {
      // Stop looking at further channels once one of them decides the pixel.
      bool set_bit = false;
      for (int ch = 0; ch < num_channels && !set_bit; ++ch) {
        const int value = GET_DATA_BYTE(src_line, (col + rect_left_) * num_channels + ch);
        set_bit = hi_values[ch] >= 0 && (value > thresholds[ch]) == (hi_values[ch] == 0);
      }
      if (set_bit) {
        SET_DATA_BIT(dst_line, col);
      } else {
        CLEAR_DATA_BIT(dst_line, col);
      }
    }
  }
}
} // namespace tesseract.

View File

@ -0,0 +1,190 @@
///////////////////////////////////////////////////////////////////////
// File: thresholder.h
// Description: Base API for thresholding images in tesseract.
// Author: Ray Smith
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_THRESHOLDER_H_
#define TESSERACT_CCMAIN_THRESHOLDER_H_
#include <tesseract/export.h>
#include <tesseract/publictypes.h>
#include <vector> // for std::vector
struct Pix;
namespace tesseract {
/// Base class for all tesseract image thresholding classes.
/// Specific classes can add new thresholding methods by
/// overriding ThresholdToPix.
/// Each instance deals with a single image, but the design is intended to
/// be useful for multiple calls to SetRectangle and ThresholdTo* if
/// desired.
class TESS_API ImageThresholder {
public:
/// Creates a thresholder without an image.
ImageThresholder();
/// Releases the held image.
virtual ~ImageThresholder();
/// Destroy the Pix if there is one, freeing memory.
virtual void Clear();
/// Return true if no image has been set.
bool IsEmpty() const;
/// SetImage makes a copy of all the image data, so it may be deleted
/// immediately after this call.
/// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
/// Palette color images will not work properly and must be converted to
/// 24 bit.
/// Binary images of 1 bit per pixel may also be given but they must be
/// byte packed with the MSB of the first byte being the first pixel, and a
/// one pixel is WHITE. For binary images set bytes_per_pixel=0.
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel,
int bytes_per_line);
/// Store the coordinates of the rectangle to process for later use.
/// Doesn't actually do any thresholding.
void SetRectangle(int left, int top, int width, int height);
/// Get enough parameters to be able to rebuild bounding boxes in the
/// original image (not just within the rectangle).
/// Left and top are enough with top-down coordinates, but
/// the height of the rectangle and the image are needed for bottom-up.
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
int *imageheight);
/// Return true if the source image is color.
bool IsColor() const {
return pix_channels_ >= 3;
}
/// Returns true if the source image is binary.
bool IsBinary() const {
return pix_channels_ == 0;
}
/// Returns the scale factor from the original image (scale_).
int GetScaleFactor() const {
return scale_;
}
/// Set the resolution of the source image in pixels per inch.
/// This should be called right after SetImage(), and will let us return
/// appropriate font sizes for the text.
/// The estimated resolution is set to the same value.
void SetSourceYResolution(int ppi) {
yres_ = ppi;
estimated_res_ = ppi;
}
/// Returns the y resolution of the source image in pixels per inch.
int GetSourceYResolution() const {
return yres_;
}
/// Returns the source y resolution multiplied by the scale factor.
int GetScaledYResolution() const {
return scale_ * yres_;
}
/// Set the resolution of the source image in pixels per inch, as estimated
/// by the thresholder from the text size found during thresholding.
/// This value will be used to set internal size thresholds during recognition
/// and will not influence the output "point size." The default value is
/// the same as the source resolution. (yres_)
void SetEstimatedResolution(int ppi) {
estimated_res_ = ppi;
}
/// Returns the estimated resolution, including any active scaling.
/// This value will be used to set internal size thresholds during recognition.
int GetScaledEstimatedResolution() const {
return scale_ * estimated_res_;
}
/// Pix vs raw, which to use? Pix is the preferred input for efficiency,
/// since raw buffers are copied.
/// SetImage for Pix keeps its own copy of the input, so the source pix may
/// be pixDestroyed immediately after.
void SetImage(const Image pix);
/// Threshold the source image as efficiently as possible to the output Pix.
/// Creates a Pix and sets pix to point to the resulting pointer.
/// Caller must use pixDestroy to free the created Pix.
/// Returns false on error.
virtual bool ThresholdToPix(PageSegMode pageseg_mode, Image *pix);
/// Gets a pix that contains an 8 bit threshold value at each pixel. The
/// returned pix may be an integer reduction of the binary image such that
/// the scale factor may be inferred from the ratio of the sizes, even down
/// to the extreme of a 1x1 pixel thresholds image.
/// Ideally the 8 bit threshold should be the exact threshold used to generate
/// the binary image in ThresholdToPix, but this is not a hard constraint.
/// Returns nullptr if the input is binary. PixDestroy after use.
virtual Image GetPixRectThresholds();
/// Get a clone/copy of the source image rectangle.
/// The returned Pix must be pixDestroyed.
/// This function will be used in the future by the page layout analysis, and
/// the layout analysis that uses it will only be available with Leptonica,
/// so there is no raw equivalent.
Image GetPixRect();
/// Get a clone/copy of the source image rectangle, reduced to greyscale,
/// and at the same resolution as the output binary.
/// The returned Pix must be pixDestroyed.
/// Provided to the classifier to extract features from the greyscale image.
virtual Image GetPixRectGrey();
protected:
// ----------------------------------------------------------------------
// Utility functions that may be useful components for other thresholders.
/// Common initialization shared between SetImage methods.
virtual void Init();
/// Return true if we are processing the full image, i.e. the rectangle
/// starts at (0, 0) and has the size of the image.
bool IsFullImage() const {
return rect_left_ == 0 && rect_top_ == 0 && rect_width_ == image_width_ &&
rect_height_ == image_height_;
}
/// Otsu thresholds the rectangle, taking the rectangle from *this.
void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const;
/// Threshold the rectangle, taking everything except the src_pix
/// from the class, using thresholds/hi_values to the output pix.
/// NOTE that num_channels is the size of the thresholds and hi_values
/// arrays and also the bytes per pixel in src_pix.
void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
const std::vector <int> &hi_values, Image *pix) const;
protected:
/// Clone or other copy of the source Pix.
/// The pix will always be PixDestroy()ed on destruction of the class.
Image pix_;
int image_width_; ///< Width of source pix_.
int image_height_; ///< Height of source pix_.
int pix_channels_; ///< Number of 8-bit channels in pix_.
int pix_wpl_; ///< Words per line of pix_.
int scale_; ///< Scale factor from original image.
int yres_; ///< y pixels/inch in source image.
int estimated_res_; ///< Resolution estimate from text size.
// Limits of image rectangle to be processed.
int rect_left_;
int rect_top_;
int rect_width_;
int rect_height_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_THRESHOLDER_H_

View File

@ -0,0 +1,68 @@
/**********************************************************************
* File: werdit.cpp (Formerly wordit.c)
* Description: An iterator for passing over all the words in a document.
* Author: Ray Smith
* Created: Mon Apr 27 08:51:22 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "werdit.h"
#include "errcode.h" // for ASSERT_HOST
#include "pageres.h" // for PAGE_RES_IT, PAGE_RES (ptr only), WERD_RES
#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
#include "werd.h" // for WERD
namespace tesseract {
/**********************************************************************
 * make_pseudo_word
 *
 * Build a fake word from blobs that overlap selection_box.
 * Words are visited in page order. For the first word that overlaps the
 * box and contributes at least one overlapping blob, deep copies of those
 * blobs are turned into a new WERD, which is inserted next to that word.
 * Blobs of later words are not considered.
 *
 * Returns a heap-allocated PAGE_RES_IT positioned on the new word, or
 * nullptr if no blob overlaps the box. After use, call
 * it->DeleteCurrentWord() to delete the fake word, and then
 * delete it to get rid of the iterator itself.
 **********************************************************************/
PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box) {
  PAGE_RES_IT word_it(page_res);
  C_BLOB_LIST gathered;               // copies of the selected blobs
  C_BLOB_IT gathered_it = &gathered;  // appends to gathered
  for (WERD_RES *candidate = word_it.word(); candidate != nullptr;
       candidate = word_it.forward()) {
    WERD *werd = candidate->word;
    if (!werd->bounding_box().overlap(selection_box)) {
      continue;
    }
    // Collect copies of this word's blobs that touch the selection.
    C_BLOB_IT src_it(werd->cblob_list());
    for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
      C_BLOB *blob = src_it.data();
      if (blob->bounding_box().overlap(selection_box)) {
        gathered_it.add_after_then_move(C_BLOB::deep_copy(blob));
      }
    }
    if (gathered.empty()) {
      continue;
    }
    WERD *pseudo_word = new WERD(&gathered, 1, nullptr);
    candidate = word_it.InsertSimpleCloneWord(*candidate, pseudo_word);
    // Walk a fresh iterator up to the inserted word and hand it out.
    auto *result = new PAGE_RES_IT(page_res);
    while (result->word() != candidate && result->word() != nullptr) {
      result->forward();
    }
    ASSERT_HOST(result->word() == candidate);
    return result;
  }
  return nullptr;
}
} // namespace tesseract

View File

@ -0,0 +1,34 @@
/**********************************************************************
* File: werdit.h
* Description: An iterator for passing over all the words in a document.
* Author: Ray Smith
* Created: Mon Apr 27 08:51:22 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef WERDIT_H
#define WERDIT_H
#include "rect.h" // for TBOX
namespace tesseract {
// Forward declarations: this header only uses pointers to these types.
class PAGE_RES;
class PAGE_RES_IT;
// Gathers copies of the blobs that overlap selection_box into a single
// pseudo word inserted into page_res.
// Returns a newly allocated iterator positioned on that word, or nullptr
// when no overlapping blob is found. After use, call
// it->DeleteCurrentWord() to remove the pseudo word and then delete the
// iterator.
PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box);
} // namespace tesseract
#endif

View File

@ -0,0 +1,578 @@
///////////////////////////////////////////////////////////////////////
// File: blamer.cpp
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "blamer.h"
#include "blobs.h" // for TPOINT, TWERD, TBLOB
#include "errcode.h" // for ASSERT_HOST
#if !defined(DISABLED_LEGACY_ENGINE)
# include "lm_pain_points.h" // for LMPainPoints
#endif
#include "matrix.h" // for MATRIX
#include "normalis.h" // for DENORM
#include "pageres.h" // for WERD_RES
#include "unicharset.h" // for UNICHARSET
#include <cmath> // for abs
#include <cstdlib> // for abs
namespace tesseract {
// Names for each value of IncorrectResultReason enum. Keep in sync.
// Each string is the short display name of one enum value.
const char kBlameCorrect[] = "corr";
const char kBlameClassifier[] = "cl";
const char kBlameChopper[] = "chop";
const char kBlameClassLMTradeoff[] = "cl/LM";
const char kBlamePageLayout[] = "pglt";
const char kBlameSegsearchHeur[] = "ss_heur";
const char kBlameSegsearchPP[] = "ss_pp";
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
const char kBlameAdaption[] = "adapt";
const char kBlameNoTruthSplit[] = "no_tr_spl";
const char kBlameNoTruth[] = "no_tr";
const char kBlameUnknown[] = "unkn";
// Lookup table indexed directly by an IncorrectResultReason value, so the
// entries must stay in the same order as the enumerators, with one entry
// for every value below IRR_NUM_REASONS.
const char *const kIncorrectResultReasonNames[] = {
kBlameCorrect, kBlameClassifier, kBlameChopper, kBlameClassLMTradeoff,
kBlamePageLayout, kBlameSegsearchHeur, kBlameSegsearchPP, kBlameClassOldLMTradeoff,
kBlameAdaption, kBlameNoTruthSplit, kBlameNoTruth, kBlameUnknown};
// Returns the short display name for irr by indexing
// kIncorrectResultReasonNames. No range check is made on irr.
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
  const char *const reason_name = kIncorrectResultReasonNames[irr];
  return reason_name;
}
const char *BlamerBundle::IncorrectReason() const {
return kIncorrectResultReasonNames[incorrect_result_reason_];
}
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void BlamerBundle::SetWordTruth(const UNICHARSET &unicharset, const char *truth_str,
const TBOX &word_box) {
truth_word_.InsertBox(0, word_box);
truth_has_char_boxes_ = false;
// Encode the string as UNICHAR_IDs.
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
int total_length = 0;
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
std::string uch(truth_str + total_length);
uch.resize(lengths[i] - total_length);
UNICHAR_ID id = encoding[i];
if (id != INVALID_UNICHAR_ID) {
uch = unicharset.get_normed_unichar(id);
}
truth_text_.push_back(uch);
}
}
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void BlamerBundle::SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str,
const TBOX &char_box) {
std::string symbol_str(char_str);
UNICHAR_ID id = unicharset.unichar_to_id(char_str);
if (id != INVALID_UNICHAR_ID) {
std::string normed_uch(unicharset.get_normed_unichar(id));
if (normed_uch.length() > 0) {
symbol_str = normed_uch;
}
}
int length = truth_word_.length();
truth_text_.push_back(symbol_str);
truth_word_.InsertBox(length, char_box);
if (length == 0) {
truth_has_char_boxes_ = true;
} else if (truth_word_.BlobBox(length - 1) == char_box) {
truth_has_char_boxes_ = false;
}
}
// Records that the truth text is unusable (for example because it contains
// reject characters): char boxes are switched off and the reason becomes
// IRR_NO_TRUTH.
void BlamerBundle::SetRejectedTruth() {
  truth_has_char_boxes_ = false;
  incorrect_result_reason_ = IRR_NO_TRUTH;
}
// Returns true if the provided word_choice is correct.
bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE *word_choice) const {
if (word_choice == nullptr) {
return false;
}
const UNICHARSET *uni_set = word_choice->unicharset();
std::string normed_choice_str;
for (int i = 0; i < word_choice->length(); ++i) {
normed_choice_str += uni_set->get_normed_unichar(word_choice->unichar_id(i));
}
std::string truth_str = TruthString();
return truth_str == normed_choice_str;
}
void BlamerBundle::FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug) {
debug += "Truth ";
for (auto &text : this->truth_text_) {
debug += text;
}
if (!this->truth_has_char_boxes_) {
debug += " (no char boxes)";
}
if (choice != nullptr) {
debug += " Choice ";
std::string choice_str;
choice->string_and_lengths(&choice_str, nullptr);
debug += choice_str;
}
if (msg.length() > 0) {
debug += "\n";
debug += msg;
}
debug += "\n";
}
// Sets up the norm_truth_word from truth_word using the given DENORM.
void BlamerBundle::SetupNormTruthWord(const DENORM &denorm) {
// TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
TPOINT topleft;
TPOINT botright;
TPOINT norm_topleft;
TPOINT norm_botright;
for (int b = 0; b < truth_word_.length(); ++b) {
const TBOX &box = truth_word_.BlobBox(b);
topleft.x = box.left();
topleft.y = box.top();
botright.x = box.right();
botright.y = box.bottom();
denorm.NormTransform(nullptr, topleft, &norm_topleft);
denorm.NormTransform(nullptr, botright, &norm_botright);
TBOX norm_box(norm_topleft.x, norm_botright.y, norm_botright.x, norm_topleft.y);
norm_truth_word_.InsertBox(b, norm_box);
}
}
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge/ of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
BlamerBundle *bundle2) const {
std::string debug_str;
// Find truth boxes that correspond to the split in the blobs.
int b;
int begin2_truth_index = -1;
if (incorrect_result_reason_ != IRR_NO_TRUTH && truth_has_char_boxes_) {
debug_str = "Looking for truth split at";
debug_str += " end1_x " + std::to_string(word1_right);
debug_str += " begin2_x " + std::to_string(word2_left);
debug_str += "\nnorm_truth_word boxes:\n";
if (norm_truth_word_.length() > 1) {
norm_truth_word_.BlobBox(0).print_to_str(debug_str);
for (b = 1; b < norm_truth_word_.length(); ++b) {
norm_truth_word_.BlobBox(b).print_to_str(debug_str);
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < norm_box_tolerance_) &&
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) < norm_box_tolerance_)) {
begin2_truth_index = b;
debug_str += "Split found";
break;
}
}
debug_str += '\n';
}
}
// Populate truth information in word and word2 with the first and second
// part of the original truth.
if (begin2_truth_index > 0) {
bundle1->truth_has_char_boxes_ = true;
bundle1->norm_box_tolerance_ = norm_box_tolerance_;
bundle2->truth_has_char_boxes_ = true;
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
BlamerBundle *curr_bb = bundle1;
for (b = 0; b < norm_truth_word_.length(); ++b) {
if (b == begin2_truth_index) {
curr_bb = bundle2;
}
curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));
curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));
curr_bb->truth_text_.push_back(truth_text_[b]);
}
} else if (incorrect_result_reason_ == IRR_NO_TRUTH) {
bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
} else {
debug_str += "Truth split not found";
debug_str += truth_has_char_boxes_ ? "\n" : " (no truth char boxes)\n";
bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);
bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);
}
}
// "Joins" the blames from bundle1 and bundle2 into *this.
void BlamerBundle::JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2,
bool debug) {
std::string debug_str;
IncorrectResultReason irr = incorrect_result_reason_;
if (irr != IRR_NO_TRUTH_SPLIT) {
debug_str = "";
}
if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
debug_str += "Blame from part 1: ";
debug_str += bundle1.debug_;
irr = bundle1.incorrect_result_reason_;
}
if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
debug_str += "Blame from part 2: ";
debug_str += bundle2.debug_;
if (irr == IRR_CORRECT) {
irr = bundle2.incorrect_result_reason_;
} else if (irr != bundle2.incorrect_result_reason_) {
irr = IRR_UNKNOWN;
}
}
incorrect_result_reason_ = irr;
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
SetBlame(irr, debug_str, nullptr, debug);
}
}
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character
// blames character classifier for incorrect answer.
void BlamerBundle::BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
const BLOB_CHOICE_LIST &choices, bool debug) {
if (!truth_has_char_boxes_ || incorrect_result_reason_ != IRR_CORRECT) {
return; // Nothing to do here.
}
for (int b = 0; b < norm_truth_word_.length(); ++b) {
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
// Note that we are more strict on the bounding box boundaries here
// than in other places (chopper, segmentation search), since we do
// not have the ability to check the previous and next bounding box.
if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_ / 2)) {
bool found = false;
bool incorrect_adapted = false;
UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
const char *truth_str = truth_text_[b].c_str();
// We promise not to modify the list or its contents, using a
// const BLOB_CHOICE* below.
BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST *>(&choices));
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) {
const BLOB_CHOICE *choice = choices_it.data();
if (strcmp(truth_str, unicharset.get_normed_unichar(choice->unichar_id())) == 0) {
found = true;
break;
} else if (choice->IsAdapted()) {
incorrect_adapted = true;
incorrect_adapted_id = choice->unichar_id();
}
} // end choices_it for loop
if (!found) {
std::string debug_str = "unichar ";
debug_str += truth_str;
debug_str += " not found in classification list";
SetBlame(IRR_CLASSIFIER, debug_str, nullptr, debug);
} else if (incorrect_adapted) {
std::string debug_str = "better rating for adapted ";
debug_str += unicharset.id_to_unichar(incorrect_adapted_id);
debug_str += " than for correct ";
debug_str += truth_str;
SetBlame(IRR_ADAPTION, debug_str, nullptr, debug);
}
break;
}
} // end iterating over blamer_bundle->norm_truth_word
}
// Checks whether chops were made at all the character bounding box
// boundaries in word->truth_word. If not - blames the chopper for an
// incorrect answer.
// Blobs and truth boxes are walked left to right in parallel, comparing
// right edges within norm_box_tolerance_:
//  - a blob ending left of the truth edge is an extra chop and is skipped;
//  - a blob ending right of the truth edge means the chop is missing;
//  - otherwise the chop for this truth box exists and both indices advance.
// Does nothing if there is no truth, no truth char boxes, or no blobs.
void BlamerBundle::SetChopperBlame(const WERD_RES *word, bool debug) {
  if (NoTruth() || !truth_has_char_boxes_ || word->chopped_word->blobs.empty()) {
    return;
  }
  bool missing_chop = false;
  int num_blobs = word->chopped_word->blobs.size();
  int box_index = 0;
  int blob_index = 0;
  int16_t truth_x = -1;
  // Bound the walk by norm_truth_word_, which is the list that is indexed
  // here and checked after the loop.
  while (box_index < norm_truth_word_.length() && blob_index < num_blobs) {
    truth_x = norm_truth_word_.BlobBox(box_index).right();
    TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
    if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {
      ++blob_index;
      continue; // encountered an extra chop, keep looking
    } else if (curr_blob->bounding_box().right() > truth_x + norm_box_tolerance_) {
      missing_chop = true;
      break;
    } else {
      // The chop for this truth box was found: go on to the next truth box.
      // Without advancing box_index every word with truth boxes would be
      // reported as having missing chops.
      ++blob_index;
      ++box_index;
    }
  }
  if (missing_chop || box_index < norm_truth_word_.length()) {
    std::string debug_str;
    if (missing_chop) {
      debug_str += "Detected missing chop (tolerance=" + std::to_string(norm_box_tolerance_);
      debug_str += ") at Bounding Box=";
      // blob_index is still in range here: the loop was left via break.
      TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
      curr_blob->bounding_box().print_to_str(debug_str);
      debug_str += "\nNo chop for truth at x=" + std::to_string(truth_x);
    } else {
      // The blobs ran out before every truth box was matched.
      debug_str += "Missing chops for last " + std::to_string(norm_truth_word_.length() - box_index);
      debug_str += " truth box(es)";
    }
    debug_str += "\nMaximally chopped word boxes:\n";
    for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
      TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
      curr_blob->bounding_box().print_to_str(debug_str);
      debug_str += '\n';
    }
    debug_str += "Truth bounding boxes:\n";
    for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) {
      norm_truth_word_.BlobBox(box_index).print_to_str(debug_str);
      debug_str += '\n';
    }
    SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
  }
}
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has been yet set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlamerBundle::BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
bool valid_permuter, bool debug) {
if (valid_permuter) {
// Find out whether best choice is a top choice.
best_choice_is_dict_and_top_choice_ = true;
for (int i = 0; i < word->best_choice->length(); ++i) {
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
ASSERT_HOST(!blob_choice_it.empty());
BLOB_CHOICE *first_choice = nullptr;
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) { // find first non-fragment choice
if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {
first_choice = blob_choice_it.data();
break;
}
}
ASSERT_HOST(first_choice != nullptr);
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
best_choice_is_dict_and_top_choice_ = false;
break;
}
}
}
std::string debug_str;
if (best_choice_is_dict_and_top_choice_) {
debug_str = "Best choice is: incorrect, top choice, dictionary word";
debug_str += " with permuter ";
debug_str += word->best_choice->permuter_name();
} else {
debug_str = "Classifier/Old LM tradeoff is to blame";
}
SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF,
debug_str, word->best_choice, debug);
}
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
// Walks the blobs of word from left to right and groups consecutive blobs so
// that the right edge of every group matches the right edge of the next
// truth box within norm_box_tolerance_. For each group one entry is pushed
// to correct_segmentation_cols_ (index of the first blob of the group) and
// correct_segmentation_rows_ (index of the last blob of the group).
// If the blobs cannot be matched to all truth boxes, IRR_UNKNOWN is blamed
// and both vectors are cleared.
void BlamerBundle::SetupCorrectSegmentation(const TWERD *word, bool debug) {
#ifndef DISABLED_LEGACY_ENGINE
// Done before the early returns below, so it happens on every call.
params_training_bundle_.StartHypothesisList();
#endif // ndef DISABLED_LEGACY_ENGINE
if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) {
return; // Nothing to do here.
}
std::string debug_str = "Blamer computing correct_segmentation_cols\n";
// curr_box_col: index of the first blob of the group being built.
// next_box_col: one past the index of the blob currently examined.
int curr_box_col = 0;
int next_box_col = 0;
int num_blobs = word->NumBlobs();
if (num_blobs == 0) {
return; // No blobs to play with.
}
int blob_index = 0;
int16_t next_box_x = word->blobs[blob_index]->bounding_box().right();
// truth_idx only advances when a group is completed; blob_index advances on
// every iteration.
for (int truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length();
++blob_index) {
++next_box_col;
int16_t curr_box_x = next_box_x;
// Look ahead to the right edge of the following blob, if there is one.
if (blob_index + 1 < num_blobs) {
next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
}
int16_t truth_x = norm_truth_word_.BlobBox(truth_idx).right();
debug_str += "Box x coord vs. truth: " + std::to_string(curr_box_x);
debug_str += " " + std::to_string(truth_x);
debug_str += "\n";
if (curr_box_x > (truth_x + norm_box_tolerance_)) {
// Leaving here skips the ++blob_index, so the check after the loop sees
// trailing blobs and reports the failure.
break; // failed to find a matching box
} else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
(blob_index + 1 >= num_blobs || // next box can't be included
next_box_x > truth_x + norm_box_tolerance_)) {
// The group ends at this blob: record it and start a new group.
correct_segmentation_cols_.push_back(curr_box_col);
correct_segmentation_rows_.push_back(next_box_col - 1);
++truth_idx;
debug_str += "col=" + std::to_string(curr_box_col);
debug_str += " row=" + std::to_string(next_box_col - 1);
debug_str += "\n";
curr_box_col = next_box_col;
}
}
// Success requires that all blobs were consumed and that one group was
// recorded per truth box.
if (blob_index < num_blobs || // trailing blobs
correct_segmentation_cols_.size() != norm_truth_word_.length()) {
debug_str +=
"Blamer failed to find correct segmentation"
" (tolerance=" +
std::to_string(norm_box_tolerance_);
if (blob_index >= num_blobs) {
debug_str += " blob == nullptr";
}
debug_str += ")\n";
debug_str += " path length " + std::to_string(correct_segmentation_cols_.size());
debug_str += " vs. truth " + std::to_string(norm_truth_word_.length());
debug_str += "\n";
SetBlame(IRR_UNKNOWN, debug_str, nullptr, debug);
correct_segmentation_cols_.clear();
correct_segmentation_rows_.clear();
}
}
// Returns true if a guided segmentation search is needed: no blame has been
// set yet, no guided search is already running, truth char boxes exist and
// best_choice is not correct.
bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
  if (incorrect_result_reason_ != IRR_CORRECT) {
    return false;
  }
  if (segsearch_is_looking_for_blame_) {
    return false;
  }
  if (!truth_has_char_boxes_) {
    return false;
  }
  return !ChoiceIsCorrect(best_choice);
}
#if !defined(DISABLED_LEGACY_ENGINE)
// Setup ready to guide the segmentation search to the correct segmentation.
// Every cell of the correct segmentation that is not classified yet in
// ratings gets a blamer pain point. If a pain point cannot be generated the
// guided search is abandoned and IRR_SEGSEARCH_HEUR is blamed.
void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings,
                                    UNICHAR_ID wildcard_id, bool debug, std::string &debug_str,
                                    tesseract::LMPainPoints *pain_points, double max_char_wh_ratio,
                                    WERD_RES *word_res) {
  segsearch_is_looking_for_blame_ = true;
  if (debug) {
    tprintf("segsearch starting to look for blame\n");
  }
  debug_str += "Correct segmentation:\n";
  for (int cell = 0; cell < correct_segmentation_cols_.size(); ++cell) {
    const auto col = correct_segmentation_cols_[cell];
    const auto row = correct_segmentation_rows_[cell];
    debug_str += "col=" + std::to_string(col);
    debug_str += " row=" + std::to_string(row);
    debug_str += "\n";
    if (ratings->Classified(col, row, wildcard_id)) {
      continue; // already classified, no pain point needed
    }
    if (pain_points->GeneratePainPoint(col, row, tesseract::LM_PPTYPE_BLAMER, 0.0, false,
                                       max_char_wh_ratio, word_res)) {
      continue; // pain point queued for this cell
    }
    // The cell can never be classified, so the guided search is pointless.
    segsearch_is_looking_for_blame_ = false;
    debug_str += "\nFailed to insert pain point\n";
    SetBlame(IRR_SEGSEARCH_HEUR, debug_str, best_choice, debug);
    break;
  }
}
#endif // !defined(DISABLED_LEGACY_ENGINE)
// Returns true if the guided segsearch is in progress.
// The flag is set by InitForSegSearch and cleared again by InitForSegSearch
// (when a pain point cannot be generated) or by FinishSegSearch.
bool BlamerBundle::GuidedSegsearchStillGoing() const {
return segsearch_is_looking_for_blame_;
}
// The segmentation search has ended. Sets the blame appropriately.
// Nothing happens unless the guided search is still looking for blame (i.e.
// best_choice is incorrect, but a path representing the correct segmentation
// could be constructed). In that case, in order of precedence:
//  - the classifier is blamed when best choice is incorrect, but it is a
//    dictionary word and the classifier's top choice;
//  - segmentation search pain point prioritization is blamed when the rating
//    of the path for the correct segmentation is better than that of
//    best_choice (the language model would have done the correct thing, but
//    the correct segmentation was never explored);
//  - otherwise the tradeoff between the language model and the classifier is
//    blamed, since even after exploring the correct segmentation the
//    incorrect best_choice would have been chosen.
void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str) {
  if (!segsearch_is_looking_for_blame_) {
    return;
  }
  segsearch_is_looking_for_blame_ = false;
  if (best_choice_is_dict_and_top_choice_) {
    // Note that this branch replaces debug_str instead of appending to it.
    debug_str = "Best choice is: incorrect, top choice, dictionary word";
    debug_str += " with permuter ";
    debug_str += best_choice->permuter_name();
    SetBlame(IRR_CLASSIFIER, debug_str, best_choice, debug);
    return;
  }
  if (best_correctly_segmented_rating_ < best_choice->rating()) {
    debug_str += "Correct segmentation state was not explored";
    SetBlame(IRR_SEGSEARCH_PP, debug_str, best_choice, debug);
    return;
  }
  if (best_correctly_segmented_rating_ >= WERD_CHOICE::kBadRating) {
    debug_str += "Correct segmentation paths were pruned by LM\n";
  } else {
    debug_str += "Best correct segmentation rating " +
                 std::to_string(best_correctly_segmented_rating_);
    debug_str += " vs. best choice rating " + std::to_string(best_choice->rating());
  }
  SetBlame(IRR_CLASS_LM_TRADEOFF, debug_str, best_choice, debug);
}
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
void BlamerBundle::LastChanceBlame(bool debug, WERD_RES *word) {
if (word->blamer_bundle == nullptr) {
word->blamer_bundle = new BlamerBundle();
word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame", word->best_choice, debug);
} else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", word->best_choice, debug);
} else {
bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);
IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;
if (irr == IRR_CORRECT && !correct) {
std::string debug_str = "Choice is incorrect after recognition";
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice, debug);
} else if (irr != IRR_CORRECT && correct) {
if (debug) {
tprintf("Corrected %s\n", word->blamer_bundle->debug_.c_str());
}
word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;
word->blamer_bundle->debug_ = "";
}
}
}
// Sets the misadaption debug if this word is incorrect, as this word is
// being adapted to. Nothing is recorded when there is no truth or when
// best_choice is correct.
void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug) {
  if (incorrect_result_reason_ == IRR_NO_TRUTH || ChoiceIsCorrect(best_choice)) {
    return;
  }
  misadaption_debug_ = "misadapt to word (";
  misadaption_debug_ += best_choice->permuter_name();
  misadaption_debug_ += "): ";
  // Append the truth and choice details to the message.
  FillDebugString("", best_choice, misadaption_debug_);
  if (debug) {
    tprintf("%s\n", misadaption_debug_.c_str());
  }
}
} // namespace tesseract

View File

@ -0,0 +1,350 @@
///////////////////////////////////////////////////////////////////////
// File: blamer.h
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
#define TESSERACT_CCSTRUCT_BLAMER_H_
#ifdef HAVE_CONFIG_H
# include "config_auto.h" // DISABLED_LEGACY_ENGINE
#endif
#include "boxword.h" // for BoxWord
#ifndef DISABLED_LEGACY_ENGINE
# include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
#endif // ndef DISABLED_LEGACY_ENGINE
#include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only)
#include "rect.h" // for TBOX
#include "tprintf.h" // for tprintf
#include <tesseract/unichar.h> // for UNICHAR_ID
#include <cstdint> // for int16_t
#include <cstring> // for memcpy
#include <vector> // for std::vector
namespace tesseract {
class DENORM;
class MATRIX;
class UNICHARSET;
class WERD_RES;
struct MATRIX_COORD;
struct TWERD;
class LMPainPoints;
// Unscaled tolerance for comparing box edges in the blamer.
// BlamerBundle::SetupNormTruthWord multiplies it by the x scale of the
// DENORM to obtain norm_box_tolerance_.
static const int16_t kBlamerBoxTolerance = 5;
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
// The enumerator values are used as indices into that table, so the order of
// the enumerators matters.
enum IncorrectResultReason {
// The text recorded in best choice == truth text
IRR_CORRECT,
// Either: Top choice is incorrect and is a dictionary word (language model
// is unlikely to help correct such errors, so blame the classifier).
// Or: the correct unichar was not included in shortlist produced by the
// classifier at all.
IRR_CLASSIFIER,
// Chopper has not found one or more splits that correspond to the correct
// character bounding boxes recorded in BlamerBundle::truth_word.
IRR_CHOPPER,
// Classifier did include correct unichars for each blob in the correct
// segmentation, however its rating could have been too bad to allow the
// language model to pull out the correct choice. On the other hand the
// strength of the language model might have been too weak to favor the
// correct answer, thus we call this case a classifier-language model
// tradeoff error.
IRR_CLASS_LM_TRADEOFF,
// Page layout failed to produce the correct bounding box. Blame page layout
// if the truth was not found for the word, which implies that the bounding
// box of the word was incorrect (no truth word had a similar bounding box).
IRR_PAGE_LAYOUT,
// SegSearch heuristic prevented one or more blobs from the correct
// segmentation state to be classified (e.g. the blob was too wide).
IRR_SEGSEARCH_HEUR,
// The correct segmentation state was not explored because of poor SegSearch
// pain point prioritization. We blame SegSearch pain point prioritization
// if the best rating of a choice constructed from correct segmentation is
// better than that of the best choice (i.e. if we got to explore the correct
// segmentation state, language model would have picked the correct choice).
IRR_SEGSEARCH_PP,
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
// and thus use the old language model (permuters).
// TODO(antonova): integrate the new language model with chopper
IRR_CLASS_OLD_LM_TRADEOFF,
// If there is an incorrect adaptive template match with a better score than
// a correct one (either pre-trained or adapted), mark this as adaption error.
IRR_ADAPTION,
// split_and_recog_word() failed to find a suitable split in truth.
IRR_NO_TRUTH_SPLIT,
// Truth is not available for this word (e.g. when words in corrected content
// file are turned into ~~~~ because an appropriate alignment was not found).
IRR_NO_TRUTH,
// The text recorded in best choice != truth text, but none of the above
// reasons are set.
IRR_UNKNOWN,
// Number of reasons above; not a reason itself.
IRR_NUM_REASONS
};
// Blamer-related information to determine the source of errors.
struct BlamerBundle {
// Returns the short display name for irr.
static const char *IncorrectReasonName(IncorrectResultReason irr);
// Creates a bundle without char boxes, with reason IRR_CORRECT and without
// lattice data. These three members are initialized first because
// ClearResults() reads the reason (via NoTruth()) and frees lattice_data_.
BlamerBundle()
: truth_has_char_boxes_(false)
, incorrect_result_reason_(IRR_CORRECT)
, lattice_data_(nullptr) {
ClearResults();
}
// Copy constructor: copies the truth of other and then its results.
// CopyResults() overwrites the reason that CopyTruth() has just set.
// NOTE(review): lattice_data_ is not initialized before CopyResults() runs,
// so CopyResults() must assign it without reading or freeing the old value.
// NOTE(review): no copy assignment operator is visible in this part of the
// struct; confirm that bundles are never copy-assigned, since the default
// one would copy the lattice_data_ pointer only.
BlamerBundle(const BlamerBundle &other) {
this->CopyTruth(other);
this->CopyResults(other);
}
// Frees the lattice buffer held by this bundle.
~BlamerBundle() {
delete[] lattice_data_;
}
// Accessors.
std::string TruthString() const {
std::string truth_str;
for (auto &text : truth_text_) {
truth_str += text;
}
return truth_str;
}
// Returns the reason currently recorded in this bundle.
IncorrectResultReason incorrect_result_reason() const {
return incorrect_result_reason_;
}
// Returns true if the recorded reason is IRR_NO_TRUTH or IRR_PAGE_LAYOUT.
bool NoTruth() const {
return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;
}
// Returns true if either of the two debug strings is non-empty.
bool HasDebugInfo() const {
return debug_.length() > 0 || misadaption_debug_.length() > 0;
}
// Returns the blame debug string.
const std::string &debug() const {
return debug_;
}
// Returns the debug string written by SetMisAdaptionDebug.
const std::string &misadaption_debug() const {
return misadaption_debug_;
}
// Keeps the smallest rating seen so far in best_correctly_segmented_rating_.
void UpdateBestRating(float rating) {
if (rating < best_correctly_segmented_rating_) {
best_correctly_segmented_rating_ = rating;
}
}
// Returns the number of entries in the correct segmentation path.
int correct_segmentation_length() const {
return correct_segmentation_cols_.size();
}
// Returns true if the given ratings matrix col,row position is included
// in the correct segmentation path at the given index.
// No range check is made on index.
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) {
return correct_segmentation_cols_[index] == coord.col &&
correct_segmentation_rows_[index] == coord.row;
}
// Overrides the flag that BlameClassifierOrLangModel and FinishSegSearch use
// to decide whether the classifier is blamed.
void set_best_choice_is_dict_and_top_choice(bool value) {
best_choice_is_dict_and_top_choice_ = value;
}
// Returns the lattice buffer held by this bundle, or nullptr if none is set.
const char *lattice_data() const {
return lattice_data_;
}
int lattice_size() const {
return lattice_size_; // size of lattice_data in bytes
}
// Replaces the lattice buffer with a private copy of the first size bytes
// of data.
// The copy is made before the old buffer is released, so the bundle is left
// untouched if the allocation throws, and data may point into the buffer
// that is currently held.
void set_lattice_data(const char *data, int size) {
  char *fresh_copy = new char[size];
  memcpy(fresh_copy, data, size);
  delete[] lattice_data_;
  lattice_data_ = fresh_copy;
  lattice_size_ = size;
}
// The two members below exist only when DISABLED_LEGACY_ENGINE is not
// defined.
#ifndef DISABLED_LEGACY_ENGINE
// Returns the params training bundle held by this blamer bundle.
const tesseract::ParamsTrainingBundle &params_training_bundle() const {
return params_training_bundle_;
}
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) {
params_training_bundle_.AddHypothesis(hypo);
}
#endif // ndef DISABLED_LEGACY_ENGINE
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box);
// Marks that there is something wrong with the truth text, like it contains
// reject characters.
void SetRejectedTruth();
// Returns true if the provided word_choice is correct.
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;
// Resets everything that was computed during recognition and keeps the
// truth. The reason is reset to IRR_CORRECT unless NoTruth() holds, in which
// case it is preserved.
void ClearResults() {
  // Normalized truth.
  norm_truth_word_.DeleteAllBoxes();
  norm_box_tolerance_ = 0;
  // Blame state.
  const bool has_truth = !NoTruth();
  if (has_truth) {
    incorrect_result_reason_ = IRR_CORRECT;
  }
  debug_.clear();
  // Segmentation search state.
  segsearch_is_looking_for_blame_ = false;
  best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
  correct_segmentation_cols_.clear();
  correct_segmentation_rows_.clear();
  best_choice_is_dict_and_top_choice_ = false;
  // Lattice buffer.
  delete[] lattice_data_;
  lattice_data_ = nullptr;
  lattice_size_ = 0;
}
// Copies the truth (boxes, text and the char box flag) from other. The
// reason is taken from other only when other has no truth; otherwise it is
// reset to IRR_CORRECT.
void CopyTruth(const BlamerBundle &other) {
  truth_has_char_boxes_ = other.truth_has_char_boxes_;
  truth_word_ = other.truth_word_;
  truth_text_ = other.truth_text_;
  if (other.NoTruth()) {
    incorrect_result_reason_ = other.incorrect_result_reason_;
  } else {
    incorrect_result_reason_ = IRR_CORRECT;
  }
}
// Copies the recognition results (normalized truth, reason, segmentation
// search state and lattice buffer) from other. The debug strings are not
// copied.
// The previous lattice_data_ is deliberately neither read nor freed: the
// copy constructor calls this function while lattice_data_ is still
// uninitialized.
void CopyResults(const BlamerBundle &other) {
  norm_truth_word_ = other.norm_truth_word_;
  norm_box_tolerance_ = other.norm_box_tolerance_;
  incorrect_result_reason_ = other.incorrect_result_reason_;
  segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
  best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
  correct_segmentation_cols_ = other.correct_segmentation_cols_;
  correct_segmentation_rows_ = other.correct_segmentation_rows_;
  best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;
  if (other.lattice_data_ != nullptr) {
    lattice_data_ = new char[other.lattice_size_];
    memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
    lattice_size_ = other.lattice_size_;
  } else {
    lattice_data_ = nullptr;
    // Without this the size would stay uninitialized in a bundle that was
    // copy-constructed from a bundle without lattice data.
    lattice_size_ = 0;
  }
}
const char *IncorrectReason() const;
// Appends choice and truth details to the given debug string.
void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);
// Sets up the norm_truth_word from truth_word using the given DENORM.
void SetupNormTruthWord(const DENORM &denorm);
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
BlamerBundle *bundle2) const;
// "Joins" the blames from bundle1 and bundle2 into *this.
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character
// blames character classifier for incorrect answer.
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
const BLOB_CHOICE_LIST &choices, bool debug);
// Checks whether chops were made at all the character bounding box
// boundaries in word->truth_word. If not - blames the chopper for an
// incorrect answer.
void SetChopperBlame(const WERD_RES *word, bool debug);
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has been yet set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
bool valid_permuter, bool debug);
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
void SetupCorrectSegmentation(const TWERD *word, bool debug);
// Returns true if a guided segmentation search is needed.
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
// Setup ready to guide the segmentation search to the correct segmentation.
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,
bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,
double max_char_wh_ratio, WERD_RES *word_res);
// Returns true if the guided segsearch is in progress.
bool GuidedSegsearchStillGoing() const;
// The segmentation search has ended. Sets the blame appropriately.
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
static void LastChanceBlame(bool debug, WERD_RES *word);
// Sets the misadaption debug if this word is incorrect, as this word is
// being adapted to.
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
private:
// Copy assignment operator (currently unused, therefore private).
BlamerBundle &operator=(const BlamerBundle &other) = delete;
// Stores irr as the blame reason and rebuilds debug_ as
// "<IncorrectReason()> to blame: " followed by whatever FillDebugString()
// appends for msg and choice. When debug is true the text is also printed.
void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,
              bool debug) {
  incorrect_result_reason_ = irr;
  // Queried after the assignment above; presumably IncorrectReason() reads
  // the reason that was just stored.
  debug_.assign(IncorrectReason());
  debug_.append(" to blame: ");
  FillDebugString(msg, choice, debug_);
  if (!debug) {
    return;
  }
  tprintf("SetBlame(): %s", debug_.c_str());
}
private:
// Set to true when bounding boxes for individual unichars are recorded.
bool truth_has_char_boxes_;
// Variables used by the segmentation search when looking for the blame.
// Set to true while segmentation search is continued after the usual
// termination condition in order to look for the blame.
bool segsearch_is_looking_for_blame_;
// Set to true if best choice is a dictionary word and
// classifier's top choice.
bool best_choice_is_dict_and_top_choice_;
// Tolerance for bounding box comparisons in normalized space.
int norm_box_tolerance_;
// The truth_word (in the original image coordinate space) contains ground
// truth bounding boxes for this WERD_RES.
tesseract::BoxWord truth_word_;
// Same as above, but in normalized coordinates
// (filled in by WERD_RES::SetupForRecognition()).
tesseract::BoxWord norm_truth_word_;
// Contains ground truth unichar for each of the bounding boxes in truth_word.
std::vector<std::string> truth_text_;
// The reason for incorrect OCR result.
IncorrectResultReason incorrect_result_reason_;
// Debug text associated with the blame.
std::string debug_;
// Misadaption debug information (filled in if this word was misadapted to).
std::string misadaption_debug_;
// Vectors populated by SegSearch to indicate column and row indices that
// correspond to blobs with correct bounding boxes.
std::vector<int> correct_segmentation_cols_;
std::vector<int> correct_segmentation_rows_;
// Best rating for correctly segmented path
// (set and used by SegSearch when looking for blame).
float best_correctly_segmented_rating_;
int lattice_size_; // size of lattice_data in bytes
// Serialized segmentation search lattice.
char *lattice_data_;
// Information about hypotheses (paths) explored by the segmentation search.
#ifndef DISABLED_LEGACY_ENGINE
tesseract::ParamsTrainingBundle params_training_bundle_;
#endif // ndef DISABLED_LEGACY_ENGINE
};
} // namespace tesseract
#endif // TESSERACT_CCSTRUCT_BLAMER_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,853 @@
/**********************************************************************
* File: blobbox.h (Formerly blobnbox.h)
* Description: Code for the textord blob class.
* Author: Ray Smith
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef BLOBBOX_H
#define BLOBBOX_H
#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
#include "elst2.h" // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
#include "errcode.h" // for ASSERT_HOST
#include "ocrblock.h" // for BLOCK
#include "params.h" // for DoubleParam, double_VAR_H
#include "pdblock.h" // for PDBLK
#include "points.h" // for FCOORD, ICOORD, ICOORDELT_LIST
#include "quspline.h" // for QSPLINE
#include "rect.h" // for TBOX
#include "scrollview.h" // for ScrollView, ScrollView::Color
#include "statistc.h" // for STATS
#include "stepblob.h" // for C_BLOB
#include "tprintf.h" // for tprintf
#include "werd.h" // for WERD_LIST
#include <cinttypes> // for PRId32
#include <cmath> // for std::sqrt
#include <cstdint> // for int16_t, int32_t
struct Pix;
namespace tesseract {
class C_OUTLINE;
// Kind/strength of a fixed-vs-proportional pitch decision. Stored in the
// pitch_decision members of TO_ROW and TO_BLOCK.
enum PITCH_TYPE {
PITCH_DUNNO, // insufficient data
PITCH_DEF_FIXED, // definitely fixed
PITCH_MAYBE_FIXED, // could be
PITCH_DEF_PROP, // presumably "definitely proportional" -- TODO confirm
PITCH_MAYBE_PROP, // presumably "could be proportional" -- TODO confirm
PITCH_CORR_FIXED, // NOTE(review): meaning of CORR is not visible here -- confirm
PITCH_CORR_PROP // NOTE(review): meaning of CORR is not visible here -- confirm
};
// The possible tab-stop types of each side of a BLOBNBOX.
// The ordering is important, as it is used for deleting dead-ends in the
// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
// non-aligned, unset, or deleted members.
enum TabType {
TT_NONE, // Not a tab.
TT_DELETED, // Not a tab after detailed analysis.
TT_MAYBE_RAGGED, // Initial designation of a tab-stop candidate.
TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
TT_CONFIRMED, // Aligned with neighbours.
TT_VLINE // Detected as a vertical line.
};
// The possible region types of a BLOBNBOX.
// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
// *Type static functions below.
enum BlobRegionType {
BRT_NOISE, // Neither text nor image.
BRT_HLINE, // Horizontal separator line.
BRT_VLINE, // Vertical separator line.
BRT_RECTIMAGE, // Rectangular image.
BRT_POLYIMAGE, // Non-rectangular image.
BRT_UNKNOWN, // Not determined yet.
BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
BRT_TEXT, // Convincing text.
BRT_COUNT // Number of possibilities.
};
// enum for elements of arrays that refer to neighbours.
// NOTE: keep in this order, so ^2 can be used to flip direction: the values
// are LEFT=0, BELOW=1, RIGHT=2, ABOVE=3, so XOR with 2 swaps LEFT<->RIGHT and
// BELOW<->ABOVE. BND_COUNT is the number of directions.
enum BlobNeighbourDir { BND_LEFT, BND_BELOW, BND_RIGHT, BND_ABOVE, BND_COUNT };
// enum for special type of text characters, such as math symbol or italic.
enum BlobSpecialTextType {
  BSTT_NONE,    // No special.
  BSTT_ITALIC,  // Italic style.
  BSTT_DIGIT,   // Digit symbols.
  BSTT_MATH,    // Mathematical symbols (not including digit).
  BSTT_UNCLEAR, // Characters with low recognition rate.
  BSTT_SKIP,    // Characters that we skip labeling (usually too small).
  BSTT_COUNT    // Number of special text types above.
};
// Returns the direction opposite to dir. Depends on the enumerator order of
// BlobNeighbourDir: opposite directions differ exactly in the bit of value 2.
inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
  const int flipped = static_cast<int>(dir) ^ 2;
  return static_cast<BlobNeighbourDir>(flipped);
}
// BlobTextFlowType indicates the quality of neighbouring information
// related to a chain of connected components, either horizontally or
// vertically. Also used by ColPartition for the collection of blobs
// within, which should all have the same value in most cases.
enum BlobTextFlowType {
  BTFT_NONE,          // No text flow set yet.
  BTFT_NONTEXT,       // Flow too poor to be likely text.
  BTFT_NEIGHBOURS,    // Neighbours support flow in this direction.
  BTFT_CHAIN,         // There is a weak chain of text in this direction.
  BTFT_STRONG_CHAIN,  // There is a strong chain of text in this direction.
  BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
  BTFT_LEADER,        // Leader dots/dashes etc.
  BTFT_COUNT          // Number of flow types above.
};
// Returns true if type1 dominates type2 in a merge.
// BTFT_LEADER is the special case: as type1 it never dominates (not even
// another LEADER), and as type2 it is dominated by every non-LEADER type.
// For all other pairs the enum ordering decides: type1 dominates when it is
// greater than or equal to type2, so two equal non-LEADER types yield true.
// Callers should not rely on the result for equal types.
inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
  const bool first_is_leader = type1 == BTFT_LEADER;
  const bool second_is_leader = type2 == BTFT_LEADER;
  if (first_is_leader || second_is_leader) {
    // LEADER as type1 always loses; otherwise the LEADER is type2 and type1
    // wins.
    return !first_is_leader;
  }
  // No LEADER involved: plain enum ordering.
  return type1 >= type2;
}
class ColPartition;
class BLOBNBOX;
ELISTIZEH(BLOBNBOX)
class BLOBNBOX : public ELIST_LINK {
public:
BLOBNBOX() {
ReInit();
}
// Wraps srcblob: takes its bounding box and enclosed area and initializes
// all derived state through ReInit(). srcblob must not be null. The new
// BLOBNBOX is not marked as owning the blob (owns_cblob_ keeps its default).
explicit BLOBNBOX(C_BLOB *srcblob) {
  box = srcblob->bounding_box();
  // Record the blob and its area before calling ReInit(): ReInit() derives
  // area_stroke_width_ from them and skips that step while the blob pointer
  // is null or the area is 0, which was always the case when ReInit() ran
  // first.
  cblob_ptr = srcblob;
  area = static_cast<int>(srcblob->area());
  ReInit();
}
// Deletes the underlying C_BLOB only when this BLOBNBOX was marked as its
// owner via set_owns_cblob(true).
~BLOBNBOX() {
  if (!owns_cblob_) {
    return;
  }
  delete cblob_ptr;
}
// Detaches and deletes the C_BLOB of every BLOBNBOX on boxes. The BLOBNBOX
// elements themselves are left on the list.
static void clear_blobnboxes(BLOBNBOX_LIST *boxes) {
  // A BLOBNBOX generally doesn't own its blob, so the blobs have to be
  // deleted explicitly.
  BLOBNBOX_IT blob_it = boxes;
  blob_it.mark_cycle_pt();
  while (!blob_it.cycled_list()) {
    // TODO: remove next line, currently still needed for resultiterator_test.
    C_BLOB *detached = blob_it.data()->remove_cblob();
    delete detached;
    blob_it.forward();
  }
}
// Allocates a C_BLOB from outline and a BLOBNBOX wrapping it. Nothing is
// freed here, and the returned BLOBNBOX is not marked as owning the blob
// (owns_cblob_ keeps its default of false).
static BLOBNBOX *RealBlob(C_OUTLINE *outline) {
  return new BLOBNBOX(new C_BLOB(outline));
}
// Rotates the box and the underlying blob.
void rotate(FCOORD rotation);
// Methods that act on the box without touching the underlying blob.
// Reflect the box in the y-axis, leaving the underlying blob untouched.
void reflect_box_in_y_axis();
// Rotates the box by the angle given by rotation.
// If the blob is a diacritic, then only small rotations for skew
// correction can be applied.
void rotate_box(FCOORD rotation);
// Moves just the box by the given vector; the underlying blob is untouched.
// For a diacritic the recorded base-char limits are shifted by the same
// vertical amount; otherwise they are reset to the moved box.
void translate_box(ICOORD v) {
  // Must be decided before the move, while box and base limits still agree
  // (or disagree) as they did on entry.
  const bool was_diacritic = IsDiacritic();
  box.move(v);
  if (!was_diacritic) {
    set_diacritic_box(box);
    return;
  }
  base_char_top_ += v.y();
  base_char_bottom_ += v.y();
}
void merge(BLOBNBOX *nextblob);
void really_merge(BLOBNBOX *other);
void chop( // fake chop blob
BLOBNBOX_IT *start_it, // location of this
BLOBNBOX_IT *blob_it, // iterator
FCOORD rotation, // for landscape
float xheight); // line height
void NeighbourGaps(int gaps[BND_COUNT]) const;
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const;
void CleanNeighbours();
// Returns positive if there is at least one side neighbour that has a
// similar stroke width and is not on the other side of a rule line.
int GoodTextBlob() const;
// Returns the number of side neighbours that are of type BRT_NOISE.
int NoisyNeighbours() const;
// Returns true if the blob is noise and has no owner.
bool DeletableNoise() const {
return owner() == nullptr && region_type() == BRT_NOISE;
}
// Returns true, and sets vert_possible/horz_possible if the blob has some
// feature that makes it individually appear to flow one way.
// eg if it has a high aspect ratio, yet has a complex shape, such as a
// joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
bool DefiniteIndividualFlow();
// Returns true if there is no tabstop violation in merging this and other.
bool ConfirmNoTabViolation(const BLOBNBOX &other) const;
// Returns true if other has a similar stroke width to this.
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,
double constant_tolerance) const;
// Returns a bounding box of the outline contained within the
// given horizontal range.
TBOX BoundsWithinLimits(int left, int right);
// Estimates and stores the baseline position based on the shape of the
// outline.
void EstimateBaselinePosition();
// Simple accessors.
const TBOX &bounding_box() const {
return box;
}
// Set the bounding box. Use with caution.
// Normally use compute_bounding_box instead.
void set_bounding_box(const TBOX &new_box) {
box = new_box;
base_char_top_ = box.top();
base_char_bottom_ = box.bottom();
}
// Recomputes box from the underlying C_BLOB and resets the base-char limits
// and the baseline estimate to the new box.
// cblob_ptr is dereferenced unconditionally, so it must not be null here.
void compute_bounding_box() {
box = cblob_ptr->bounding_box();
base_char_top_ = box.top();
base_char_bottom_ = box.bottom();
baseline_y_ = box.bottom();
}
const TBOX &reduced_box() const {
return red_box;
}
void set_reduced_box(TBOX new_box) {
red_box = new_box;
reduced = true;
}
int32_t enclosed_area() const {
return area;
}
bool joined_to_prev() const {
return joined;
}
bool red_box_set() const {
return reduced;
}
int repeated_set() const {
return repeated_set_;
}
void set_repeated_set(int set_id) {
repeated_set_ = set_id;
}
C_BLOB *cblob() const {
return cblob_ptr;
}
// Detaches the underlying C_BLOB and returns it (may be nullptr). After the
// call this BLOBNBOX holds no blob and no longer claims ownership, so the
// destructor will not delete anything.
C_BLOB *remove_cblob() {
  C_BLOB *released = cblob_ptr;
  owns_cblob_ = false;
  cblob_ptr = nullptr;
  return released;
}
TabType left_tab_type() const {
return left_tab_type_;
}
void set_left_tab_type(TabType new_type) {
left_tab_type_ = new_type;
}
TabType right_tab_type() const {
return right_tab_type_;
}
void set_right_tab_type(TabType new_type) {
right_tab_type_ = new_type;
}
BlobRegionType region_type() const {
return region_type_;
}
void set_region_type(BlobRegionType new_type) {
region_type_ = new_type;
}
BlobSpecialTextType special_text_type() const {
return spt_type_;
}
void set_special_text_type(BlobSpecialTextType new_type) {
spt_type_ = new_type;
}
BlobTextFlowType flow() const {
return flow_;
}
void set_flow(BlobTextFlowType value) {
flow_ = value;
}
bool vert_possible() const {
return vert_possible_;
}
void set_vert_possible(bool value) {
vert_possible_ = value;
}
bool horz_possible() const {
return horz_possible_;
}
void set_horz_possible(bool value) {
horz_possible_ = value;
}
int left_rule() const {
return left_rule_;
}
void set_left_rule(int new_left) {
left_rule_ = new_left;
}
int right_rule() const {
return right_rule_;
}
void set_right_rule(int new_right) {
right_rule_ = new_right;
}
int left_crossing_rule() const {
return left_crossing_rule_;
}
void set_left_crossing_rule(int new_left) {
left_crossing_rule_ = new_left;
}
int right_crossing_rule() const {
return right_crossing_rule_;
}
void set_right_crossing_rule(int new_right) {
right_crossing_rule_ = new_right;
}
float horz_stroke_width() const {
return horz_stroke_width_;
}
void set_horz_stroke_width(float width) {
horz_stroke_width_ = width;
}
float vert_stroke_width() const {
return vert_stroke_width_;
}
void set_vert_stroke_width(float width) {
vert_stroke_width_ = width;
}
float area_stroke_width() const {
return area_stroke_width_;
}
tesseract::ColPartition *owner() const {
return owner_;
}
void set_owner(tesseract::ColPartition *new_owner) {
owner_ = new_owner;
}
bool leader_on_left() const {
return leader_on_left_;
}
void set_leader_on_left(bool flag) {
leader_on_left_ = flag;
}
bool leader_on_right() const {
return leader_on_right_;
}
void set_leader_on_right(bool flag) {
leader_on_right_ = flag;
}
BLOBNBOX *neighbour(BlobNeighbourDir n) const {
return neighbours_[n];
}
bool good_stroke_neighbour(BlobNeighbourDir n) const {
return good_stroke_neighbours_[n];
}
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good) {
neighbours_[n] = neighbour;
good_stroke_neighbours_[n] = good;
}
// A blob counts as a diacritic when its recorded base-char top/bottom does
// not coincide with the top/bottom of its own box.
bool IsDiacritic() const {
  return !(base_char_top_ == box.top() && base_char_bottom_ == box.bottom());
}
int base_char_top() const {
return base_char_top_;
}
int base_char_bottom() const {
return base_char_bottom_;
}
int baseline_position() const {
return baseline_y_;
}
int line_crossings() const {
return line_crossings_;
}
void set_line_crossings(int value) {
line_crossings_ = value;
}
void set_diacritic_box(const TBOX &diacritic_box) {
base_char_top_ = diacritic_box.top();
base_char_bottom_ = diacritic_box.bottom();
}
BLOBNBOX *base_char_blob() const {
return base_char_blob_;
}
void set_base_char_blob(BLOBNBOX *blob) {
base_char_blob_ = blob;
}
void set_owns_cblob(bool value) {
owns_cblob_ = value;
}
bool UniquelyVertical() const {
return vert_possible_ && !horz_possible_;
}
bool UniquelyHorizontal() const {
return horz_possible_ && !vert_possible_;
}
// Returns true if the region type is text (BRT_TEXT or BRT_VERT_TEXT).
static bool IsTextType(BlobRegionType type) {
  switch (type) {
    case BRT_TEXT:
    case BRT_VERT_TEXT:
      return true;
    default:
      return false;
  }
}
// Returns true if the region type is image (BRT_RECTIMAGE or BRT_POLYIMAGE).
static bool IsImageType(BlobRegionType type) {
  switch (type) {
    case BRT_RECTIMAGE:
    case BRT_POLYIMAGE:
      return true;
    default:
      return false;
  }
}
// Returns true if the region type is a separator line (BRT_HLINE or
// BRT_VLINE).
static bool IsLineType(BlobRegionType type) {
  switch (type) {
    case BRT_HLINE:
    case BRT_VLINE:
      return true;
    default:
      return false;
  }
}
// Returns true if the region type cannot be merged.
static bool UnMergeableType(BlobRegionType type) {
return IsLineType(type) || IsImageType(type);
}
// Helper to call CleanNeighbours on all blobs on the list.
static void CleanNeighbours(BLOBNBOX_LIST *blobs);
// Helper to delete all the deletable blobs on the list.
static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs);
// Helper to compute edge offsets for all the blobs on the list.
// See coutln.h for an explanation of edge offsets.
static void ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs);
#ifndef GRAPHICS_DISABLED
// Helper to draw all the blobs on the list in the given body_colour,
// with child outlines in the child_colour.
static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
ScrollView::Color child_colour, ScrollView *win);
// Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
// given list in the given body_colour, with child outlines in the
// child_colour.
static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
ScrollView::Color child_colour, ScrollView *win);
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type);
// Keep in sync with BlobRegionType.
ScrollView::Color BoxColor() const;
void plot(ScrollView *window, // window to draw in
ScrollView::Color blob_colour, // for outer bits
ScrollView::Color child_colour); // for holes
#endif
// Initializes members set by StrokeWidth and beyond, without discarding
// stored area and strokewidth values, which are expensive to calculate.
// box, area, cblob_ptr and the horizontal/vertical stroke widths are left
// as they are.
void ReInit() {
  // Flags and classification.
  joined = reduced = false;
  repeated_set_ = 0;
  left_tab_type_ = right_tab_type_ = TT_NONE;
  region_type_ = BRT_UNKNOWN;
  flow_ = BTFT_NONE;
  spt_type_ = BSTT_SKIP;
  horz_possible_ = vert_possible_ = false;
  leader_on_left_ = leader_on_right_ = false;

  // Rule lines and line crossings.
  left_rule_ = right_rule_ = 0;
  left_crossing_rule_ = right_crossing_rule_ = 0;
  line_crossings_ = 0;

  // Derive the area stroke width only when it has not been computed yet and
  // both a positive area and a blob with non-zero perimeter are available.
  if (area_stroke_width_ == 0.0f && area > 0) {
    if (cblob() != nullptr && cblob()->perimeter() != 0) {
      area_stroke_width_ = 2.0f * area / cblob()->perimeter();
    }
  }

  // Ownership and diacritic/baseline state follow the current box.
  owner_ = nullptr;
  base_char_blob_ = nullptr;
  base_char_top_ = box.top();
  base_char_bottom_ = baseline_y_ = box.bottom();

  ClearNeighbours();
}
void ClearNeighbours() {
for (int n = 0; n < BND_COUNT; ++n) {
neighbours_[n] = nullptr;
good_stroke_neighbours_[n] = false;
}
}
private:
// NOTE: the members below without an in-class initializer (spt_type_, the
// crossing rules, base_char_*, baseline_y_, line_crossings_,
// base_char_blob_, owner_, the neighbour arrays and the flow/leader flags)
// are all assigned in ReInit(), which both constructors call.
C_BLOB *cblob_ptr = nullptr; // edgestep blob
TBOX box; // bounding box
TBOX red_box; // reduced bounding box, see set_reduced_box()
int32_t area = 0; // enclosed area
int32_t repeated_set_ = 0; // id of the set of repeated blobs
TabType left_tab_type_ = TT_NONE; // Indicates tab-stop assessment
TabType right_tab_type_ = TT_NONE; // Indicates tab-stop assessment
BlobRegionType region_type_ = BRT_UNKNOWN; // Type of region this blob belongs to
BlobTextFlowType flow_ = BTFT_NONE; // Quality of text flow.
BlobSpecialTextType spt_type_; // Special text type.
bool joined = false; // joined to prev
bool reduced = false; // reduced box set
int16_t left_rule_ = 0; // x-coord of nearest but not crossing rule line
int16_t right_rule_ = 0; // x-coord of nearest but not crossing rule line
int16_t left_crossing_rule_; // x-coord of nearest or crossing rule line
int16_t right_crossing_rule_; // x-coord of nearest or crossing rule line
int16_t base_char_top_; // y-coord of top/bottom of diacritic base,
int16_t base_char_bottom_; // if it exists else top/bottom of this blob.
int16_t baseline_y_; // Estimate of baseline position.
int32_t line_crossings_; // Number of line intersections touched.
BLOBNBOX *base_char_blob_; // The blob that was the base char.
tesseract::ColPartition *owner_; // Who will delete me when I am not needed
// Neighbour in each direction, indexed by BlobNeighbourDir.
BLOBNBOX *neighbours_[BND_COUNT];
float horz_stroke_width_ = 0.0f; // Median horizontal stroke width
float vert_stroke_width_ = 0.0f; // Median vertical stroke width
float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.
// The "good" flag passed to set_neighbour(), indexed by BlobNeighbourDir.
bool good_stroke_neighbours_[BND_COUNT];
bool horz_possible_; // Could be part of horizontal flow.
bool vert_possible_; // Could be part of vertical flow.
bool leader_on_left_; // There is a leader to the left.
bool leader_on_right_; // There is a leader to the right.
// Iff true, then the destructor should delete the cblob_ptr.
// TODO(rays) migrate all uses to correctly setting this flag instead of
// deleting the C_BLOB before deleting the BLOBNBOX.
bool owns_cblob_ = false;
};
class TO_ROW : public ELIST2_LINK {
public:
static const int kErrorWeight = 3;
TO_ROW() {
clear();
} // empty
TO_ROW( // constructor
BLOBNBOX *blob, // from first blob
float top, // of row //target height
float bottom, float row_size);
void print() const;
float max_y() const { // access function
return y_max;
}
float min_y() const {
return y_min;
}
float mean_y() const {
return (y_min + y_max) / 2.0f;
}
float initial_min_y() const {
return initial_y_min;
}
float line_m() const { // access to line fit
return m;
}
float line_c() const {
return c;
}
float line_error() const {
return error;
}
float parallel_c() const {
return para_c;
}
float parallel_error() const {
return para_error;
}
float believability() const { // baseline goodness
return credibility;
}
float intercept() const { // real parallel_c
return y_origin;
}
void add_blob( // put in row
BLOBNBOX *blob, // blob to add
float top, // of row //target height
float bottom, float row_size);
void insert_blob( // put in row in order
BLOBNBOX *blob);
BLOBNBOX_LIST *blob_list() { // get list
return &blobs;
}
void set_line( // set line spec
float new_m, // line to set
float new_c, float new_error) {
m = new_m;
c = new_c;
error = new_error;
}
// Stores the fit of this row to a line of the given page gradient.
//   gradient:  page gradient of the constrained line
//   new_c:     intercept of the constrained fit
//   new_error: error of the constrained fit
// Also updates credibility (blob count minus kErrorWeight * new_error) and
// y_origin (new_c divided by sqrt(1 + gradient^2), the "real" intercept).
void set_parallel_line( // set fixed gradient line
    float gradient,     // page gradient
    float new_c, float new_error) {
  y_origin = new_c / std::sqrt(1 + gradient * gradient);
  credibility = blobs.length() - kErrorWeight * new_error;
  para_error = new_error;
  para_c = new_c;
}
void set_limits( // set min,max
float new_min, // bottom and
float new_max) { // top of row
y_min = new_min;
y_max = new_max;
}
void compute_vertical_projection();
// get projection
// Returns true once this row has been searched for repeated blobs, i.e.
// num_repeated_sets_ no longer holds the -1 "not searched yet" marker.
bool rep_chars_marked() const {
return num_repeated_sets_ != -1;
}
void clear_rep_chars_marked() {
num_repeated_sets_ = -1;
}
int num_repeated_sets() const {
return num_repeated_sets_;
}
void set_num_repeated_sets(int num_sets) {
num_repeated_sets_ = num_sets;
}
// Public working data of the row.
// NOTE(review): only merged has an in-class initializer; the other members
// are presumably set by clear() ("clear all values to reasonable defaults")
// -- confirm in the definition of clear().
// true when dead
bool merged = false;
bool all_caps; // had no ascenders
bool used_dm_model; // in guessing pitch
int16_t projection_left; // start of projection
int16_t projection_right; // presumably end (right limit) of projection -- TODO confirm
PITCH_TYPE pitch_decision; // how strong is decision
float fixed_pitch; // pitch or 0
float fp_space; // sp if fixed pitch
float fp_nonsp; // nonsp if fixed pitch
float pr_space; // sp if prop
float pr_nonsp; // non sp if prop
float spacing; // to "next" row
float xheight; // of line
int xheight_evidence; // number of blobs of height xheight
float ascrise; // ascenders
float descdrop; // descenders
float body_size; // of CJK characters. Assumed to be
// xheight+ascrise for non-CJK text.
int32_t min_space; // min size for real space
int32_t max_nonspace; // max size of non-space
int32_t space_threshold; // space vs nonspace
float kern_size; // average non-space
float space_size; // average space
WERD_LIST rep_words; // repeated chars
ICOORDELT_LIST char_cells; // fixed pitch cells
QSPLINE baseline; // curved baseline
STATS projection; // vertical projection
private:
void clear(); // clear all values to reasonable defaults
BLOBNBOX_LIST blobs; // blobs in row
float y_min; // coords
float y_max;
float initial_y_min;
float m, c; // line spec
float error; // line error
float para_c; // constrained fit
float para_error;
float y_origin; // rotated para_c;
float credibility; // baseline believability
int num_repeated_sets_; // number of sets of repeated blobs
// set to -1 if we have not searched
// for repeated blobs in this row yet
};
ELIST2IZEH(TO_ROW)
class TESS_API TO_BLOCK : public ELIST_LINK {
public:
TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
clear();
} // empty
TO_BLOCK( // constructor
BLOCK *src_block); // real block
~TO_BLOCK();
void clear(); // clear all scalar members.
TO_ROW_LIST *get_rows() { // access function
return &row_list;
}
// Rotate all the blobnbox lists and the underlying block. Then update the
// median size statistic from the blobs list.
void rotate(const FCOORD &rotation) {
BLOBNBOX_LIST *blobnbox_list[] = {&blobs, &underlines, &noise_blobs,
&small_blobs, &large_blobs, nullptr};
for (BLOBNBOX_LIST **list = blobnbox_list; *list != nullptr; ++list) {
BLOBNBOX_IT it(*list);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
it.data()->rotate(rotation);
}
}
// Rotate the block
ASSERT_HOST(block->pdblk.poly_block() != nullptr);
block->rotate(rotation);
// Update the median size statistic from the blobs list.
STATS widths(0, block->pdblk.bounding_box().width());
STATS heights(0, block->pdblk.bounding_box().height());
BLOBNBOX_IT blob_it(&blobs);
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
widths.add(blob_it.data()->bounding_box().width(), 1);
heights.add(blob_it.data()->bounding_box().height(), 1);
}
block->set_median_size(static_cast<int>(widths.median() + 0.5),
static_cast<int>(heights.median() + 0.5));
}
void print_rows() { // debug info
TO_ROW_IT row_it = &row_list;
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
auto row = row_it.data();
tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n", row->min_y(), row->max_y(),
row->parallel_c(), row->blob_list()->length());
}
}
// Reorganizes the blob lists with a different definition of small, medium
// and large, compared to the original definition.
// Height is still the primary filter key, but medium width blobs of small
// height become medium, and very wide blobs of small height stay small.
void ReSetAndReFilterBlobs();
// Deletes noise blobs from all lists where not owned by a ColPartition.
void DeleteUnownedNoise();
// Computes and stores the edge offsets on each blob for use in feature
// extraction, using greyscale if the supplied grey and thresholds pixes
// are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
// edge step outlines.
// Thresholds must either be the same size as grey or an integer down-scale
// of grey.
// See coutln.h for an explanation of edge offsets.
void ComputeEdgeOffsets(Image thresholds, Image grey);
#ifndef GRAPHICS_DISABLED
// Draw the noise blobs from all lists in red.
void plot_noise_blobs(ScrollView *to_win);
// Draw the blobs on the various lists in the block in different colors.
void plot_graded_blobs(ScrollView *to_win);
#endif
BLOBNBOX_LIST blobs; // medium size
BLOBNBOX_LIST underlines; // underline blobs
BLOBNBOX_LIST noise_blobs; // very small
BLOBNBOX_LIST small_blobs; // fairly small
BLOBNBOX_LIST large_blobs; // big blobs
BLOCK *block; // real block
PITCH_TYPE pitch_decision; // how strong is decision
float line_spacing; // estimate
// line_size is a lower-bound estimate of the font size in pixels of
// the text in the block (with ascenders and descenders), being a small
// (1.25) multiple of the median height of filtered blobs.
// In most cases the font size will be bigger, but it will be closer
// if the text is allcaps, or in a no-x-height script.
float line_size; // estimate
float max_blob_size; // line assignment limit
float baseline_offset; // phase shift
float xheight; // median blob size
float fixed_pitch; // pitch or 0
float kern_size; // average non-space
float space_size; // average space
int32_t min_space; // min definite space
int32_t max_nonspace; // max definite
float fp_space; // sp if fixed pitch
float fp_nonsp; // nonsp if fixed pitch
float pr_space; // sp if prop
float pr_nonsp; // non sp if prop
TO_ROW *key_row; // starting row
private:
TO_ROW_LIST row_list; // temporary rows
};
ELISTIZEH(TO_BLOCK)
extern double_VAR_H(textord_error_weight, 3, "Weighting for error in believability");
void find_cblob_limits( // get y limits
C_BLOB *blob, // blob to search
float leftx, // x limits
float rightx,
FCOORD rotation, // for landscape
float &ymin, // output y limits
float &ymax);
void find_cblob_vlimits( // get y limits
C_BLOB *blob, // blob to search
float leftx, // x limits
float rightx,
float &ymin, // output y limits
float &ymax);
void find_cblob_hlimits( // get x limits
C_BLOB *blob, // blob to search
float bottomy, // y limits
float topy,
float &xmin, // output x limits
float &xymax);
C_BLOB *crotate_cblob( // rotate it
C_BLOB *blob, // blob to search
FCOORD rotation // for landscape
);
// Returns a bounding box obtained through the given blob iterator.
// NOTE(review): the name suggests the iterator is advanced as a side effect
// -- confirm in the definition.
TBOX box_next( // get bounding box
BLOBNBOX_IT *it // iterator to blobs
);
// Returns a bounding box obtained through the given blob iterator.
// NOTE(review): presumably the variant of box_next() for blobs that were
// already chopped -- confirm in the definition.
TBOX box_next_pre_chopped( // get bounding box
BLOBNBOX_IT *it // iterator to blobs
);
void vertical_cblob_projection( // project outlines
C_BLOB *blob, // blob to project
STATS *stats // output
);
void vertical_coutline_projection( // project outlines
C_OUTLINE *outline, // outline to project
STATS *stats // output
);
#ifndef GRAPHICS_DISABLED
void plot_blob_list(ScrollView *win, // window to draw in
BLOBNBOX_LIST *list, // blob list
ScrollView::Color body_colour, // colour to draw
ScrollView::Color child_colour); // colour of child
#endif // !GRAPHICS_DISABLED
} // namespace tesseract
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,476 @@
/******************************************************************************
*
* File: blobs.h
* Description: Blob definition
* Author: Mark Seaman, OCR Technology
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*****************************************************************************/
#ifndef BLOBS_H
#define BLOBS_H
#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
#include "normalis.h" // for DENORM
#include "points.h" // for FCOORD, ICOORD
#include "rect.h" // for TBOX
#include "scrollview.h" // for ScrollView, ScrollView::Color
#include <tesseract/publictypes.h> // for OcrEngineMode
#include <cstdint> // for int16_t
struct Pix;
namespace tesseract {
class BLOCK;
class C_BLOB;
class C_OUTLINE;
class LLSQ;
class ROW;
class WERD;
/*----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------*/
// Integer 2-D point (also used as a vector) with 16-bit coordinates.
struct TPOINT {
  // Default point is the origin.
  TPOINT() : TPOINT(0, 0) {}
  TPOINT(int16_t vx, int16_t vy) : x(vx), y(vy) {}
  // Implicit conversion from an ICOORD.
  TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}

  // Component-wise addition in place.
  void operator+=(const TPOINT &other) {
    x = static_cast<int16_t>(x + other.x);
    y = static_cast<int16_t>(y + other.y);
  }
  // Component-wise integer division in place.
  void operator/=(int divisor) {
    x = static_cast<int16_t>(x / divisor);
    y = static_cast<int16_t>(y / divisor);
  }
  // Two points are equal when both coordinates match.
  bool operator==(const TPOINT &other) const {
    if (x != other.x) {
      return false;
    }
    return y == other.y;
  }

  // Returns true when the two line segments cross each other.
  // (Moved from outlines.cpp).
  static bool IsCrossed(const TPOINT &a0, const TPOINT &a1, const TPOINT &b0, const TPOINT &b1);

  // Sets *this to the component-wise difference p1 - p2.
  void diff(const TPOINT &p1, const TPOINT &p2) {
    x = static_cast<int16_t>(p1.x - p2.x);
    y = static_cast<int16_t>(p1.y - p2.y);
  }

  // Returns the z-component of the cross product of *this with other.
  int cross(const TPOINT &other) const {
    const int forward = x * other.y;
    const int backward = y * other.x;
    return forward - backward;
  }

  // Returns the scalar (dot) product of *this with other.
  int dot(const TPOINT &other) const {
    const int x_term = x * other.x;
    const int y_term = y * other.y;
    return x_term + y_term;
  }

  // Returns x*x + y*y, i.e. the squared length of the vector; no square
  // root is taken.
  int length() const {
    const int x_sq = x * x;
    const int y_sq = y * y;
    return x_sq + y_sq;
  }

  int16_t x; // absolute x coord.
  int16_t y; // absolute y coord.
};
using VECTOR = TPOINT; // structure for coordinates.
struct EDGEPT {
EDGEPT() = default;
EDGEPT(const EDGEPT &src) : next(nullptr), prev(nullptr) {
CopyFrom(src);
}
EDGEPT &operator=(const EDGEPT &src) {
CopyFrom(src);
return *this;
}
// Copies the data elements, but leaves the pointers untouched.
void CopyFrom(const EDGEPT &src) {
pos = src.pos;
vec = src.vec;
is_hidden = src.is_hidden;
runlength = src.runlength;
dir = src.dir;
fixed = src.fixed;
src_outline = src.src_outline;
start_step = src.start_step;
step_count = src.step_count;
}
// Returns the squared distance between the points, with the x-component
// weighted by x_factor.
int WeightedDistance(const EDGEPT &other, int x_factor) const {
int x_dist = pos.x - other.pos.x;
int y_dist = pos.y - other.pos.y;
return x_dist * x_dist * x_factor + y_dist * y_dist;
}
// Returns true if the positions are equal.
bool EqualPos(const EDGEPT &other) const {
return pos == other.pos;
}
// Returns the bounding box of the outline segment from *this to *end.
// Ignores hidden edge flags.
TBOX SegmentBox(const EDGEPT *end) const {
TBOX box(pos.x, pos.y, pos.x, pos.y);
const EDGEPT *pt = this;
do {
pt = pt->next;
if (pt->pos.x < box.left()) {
box.set_left(pt->pos.x);
}
if (pt->pos.x > box.right()) {
box.set_right(pt->pos.x);
}
if (pt->pos.y < box.bottom()) {
box.set_bottom(pt->pos.y);
}
if (pt->pos.y > box.top()) {
box.set_top(pt->pos.y);
}
} while (pt != end && pt != this);
return box;
}
// Returns the area of the outline segment from *this to *end.
// Ignores hidden edge flags.
int SegmentArea(const EDGEPT *end) const {
int area = 0;
const EDGEPT *pt = this->next;
do {
TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y);
area += origin_vec.cross(pt->vec);
pt = pt->next;
} while (pt != end && pt != this);
return area;
}
// Returns true if the number of points in the outline segment from *this to
// *end is less that min_points and false if we get back to *this first.
// Ignores hidden edge flags.
bool ShortNonCircularSegment(int min_points, const EDGEPT *end) const {
int count = 0;
const EDGEPT *pt = this;
do {
if (pt == end) {
return true;
}
pt = pt->next;
++count;
} while (pt != this && count <= min_points);
return false;
}
// Accessors to hide or reveal a cut edge from feature extractors.
void Hide() {
is_hidden = true;
}
void Reveal() {
is_hidden = false;
}
bool IsHidden() const {
return is_hidden;
}
void MarkChop() {
dir = 1;
}
bool IsChopPt() const {
return dir != 0;
}
TPOINT pos; // position
VECTOR vec; // vector to next point
bool is_hidden = false;
uint8_t runlength = 0;
int8_t dir = 0;
int8_t fixed = 0;
EDGEPT *next = nullptr; // anticlockwise element
EDGEPT *prev = nullptr; // clockwise element
C_OUTLINE *src_outline = nullptr; // Outline it came from.
// The following fields are not used if src_outline is nullptr.
int start_step = 0; // Location of pos in src_outline.
int step_count = 0; // Number of steps used (may wrap around).
};
// For use in chop and findseam to keep a list of which EDGEPTs were inserted.
CLISTIZEH(EDGEPT)
struct TESSLINE {
TESSLINE() : is_hole(false), loop(nullptr), next(nullptr) {}
TESSLINE(const TESSLINE &src) : loop(nullptr), next(nullptr) {
CopyFrom(src);
}
~TESSLINE() {
Clear();
}
TESSLINE &operator=(const TESSLINE &src) {
CopyFrom(src);
return *this;
}
// Consume the circular list of EDGEPTs to make a TESSLINE.
static TESSLINE *BuildFromOutlineList(EDGEPT *outline);
// Copies the data and the outline, but leaves next untouched.
void CopyFrom(const TESSLINE &src);
// Deletes owned data.
void Clear();
// Normalize in-place using the DENORM.
void Normalize(const DENORM &denorm);
// Rotates by the given rotation in place.
void Rotate(const FCOORD rotation);
// Moves by the given vec in place.
void Move(const ICOORD vec);
// Scales by the given factor in place.
void Scale(float factor);
// Sets up the start and vec members of the loop from the pos members.
void SetupFromPos();
// Recomputes the bounding box from the points in the loop.
void ComputeBoundingBox();
// Computes the min and max cross product of the outline points with the
// given vec and returns the results in min_xp and max_xp. Geometrically
// this is the left and right edge of the outline perpendicular to the
// given direction, but to get the distance units correct, you would
// have to divide by the modulus of vec.
void MinMaxCrossProduct(const TPOINT vec, int *min_xp, int *max_xp) const;
TBOX bounding_box() const;
// Returns true if *this and other have equal bounding boxes.
bool SameBox(const TESSLINE &other) const {
return topleft == other.topleft && botright == other.botright;
}
// Returns true if the given line segment crosses any outline of this blob.
bool SegmentCrosses(const TPOINT &pt1, const TPOINT &pt2) const {
if (Contains(pt1) && Contains(pt2)) {
EDGEPT *pt = loop;
do {
if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) {
return true;
}
pt = pt->next;
} while (pt != loop);
}
return false;
}
// Returns true if the point is contained within the outline box.
bool Contains(const TPOINT &pt) const {
return topleft.x <= pt.x && pt.x <= botright.x && botright.y <= pt.y && pt.y <= topleft.y;
}
#ifndef GRAPHICS_DISABLED
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);
#endif // !GRAPHICS_DISABLED
// Returns the first outline point that has a different src_outline to its
// predecessor, or, if all the same, the lowest indexed point.
EDGEPT *FindBestStartPt() const;
int BBArea() const {
return (botright.x - topleft.x) * (topleft.y - botright.y);
}
TPOINT topleft; // Top left of loop.
TPOINT botright; // Bottom right of loop.
TPOINT start; // Start of loop.
bool is_hole; // True if this is a hole/child outline.
EDGEPT *loop; // Edgeloop.
TESSLINE *next; // Next outline in blob.
}; // Outline structure.
struct TBLOB {
TBLOB() : outlines(nullptr) {}
TBLOB(const TBLOB &src) : outlines(nullptr) {
CopyFrom(src);
}
~TBLOB() {
Clear();
}
TBLOB &operator=(const TBLOB &src) {
CopyFrom(src);
return *this;
}
// Factory to build a TBLOB from a C_BLOB with polygonal approximation along
// the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB
// contain pointers to the input C_OUTLINEs that enable higher-resolution
// feature extraction that does not use the polygonal approximation.
static TBLOB *PolygonalCopy(bool allow_detailed_fx, C_BLOB *src);
// Factory builds a blob with no outlines, but copies the other member data.
static TBLOB *ShallowCopy(const TBLOB &src);
// Normalizes the blob for classification only if needed.
// (Normally this means a non-zero classify rotation.)
// If no Normalization is needed, then nullptr is returned, and the input blob
// can be used directly. Otherwise a new TBLOB is returned which must be
// deleted after use.
TBLOB *ClassifyNormalizeIfNeeded() const;
// Copies the data and the outlines, but leaves next untouched.
void CopyFrom(const TBLOB &src);
// Deletes owned data.
void Clear();
// Sets up the built-in DENORM and normalizes the blob in-place.
// For parameters see DENORM::SetupNormalization, plus the inverse flag for
// this blob and the Pix for the full image.
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor,
float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift,
float final_yshift, bool inverse, Image pix);
// Rotates by the given rotation in place.
void Rotate(const FCOORD rotation);
// Moves by the given vec in place.
void Move(const ICOORD vec);
// Scales by the given factor in place.
void Scale(float factor);
// Recomputes the bounding boxes of the outlines.
void ComputeBoundingBoxes();
// Returns the number of outlines.
int NumOutlines() const;
TBOX bounding_box() const;
// Returns true if the given line segment crosses any outline of this blob.
bool SegmentCrossesOutline(const TPOINT &pt1, const TPOINT &pt2) const {
for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
if (outline->SegmentCrosses(pt1, pt2)) {
return true;
}
}
return false;
}
// Returns true if the point is contained within any of the outline boxes.
bool Contains(const TPOINT &pt) const {
for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
if (outline->Contains(pt)) {
return true;
}
}
return false;
}
// Finds and deletes any duplicate outlines in this blob, without deleting
// their EDGEPTs.
void EliminateDuplicateOutlines();
// Swaps the outlines of *this and next if needed to keep the centers in
// increasing x.
void CorrectBlobOrder(TBLOB *next);
const DENORM &denorm() const {
return denorm_;
}
#ifndef GRAPHICS_DISABLED
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);
#endif // !GRAPHICS_DISABLED
int BBArea() const {
int total_area = 0;
for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
total_area += outline->BBArea();
}
return total_area;
}
// Computes the center of mass and second moments for the old baseline and
// 2nd moment normalizations. Returns the outline length.
// The input denorm should be the normalizations that have been applied from
// the image to the current state of this TBLOB.
int ComputeMoments(FCOORD *center, FCOORD *second_moments) const;
// Computes the precise bounding box of the coords that are generated by
// GetEdgeCoords. This may be different from the bounding box of the polygon.
void GetPreciseBoundingBox(TBOX *precise_box) const;
// Adds edges to the given vectors.
// For all the edge steps in all the outlines, or polygonal approximation
// where there are no edge steps, collects the steps into x_coords/y_coords.
// x_coords is a collection of the x-coords of vertical edges for each
// y-coord starting at box.bottom().
// y_coords is a collection of the y-coords of horizontal edges for each
// x-coord starting at box.left().
// Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.
// Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.
void GetEdgeCoords(const TBOX &box, std::vector<std::vector<int>> &x_coords,
std::vector<std::vector<int>> &y_coords) const;
TESSLINE *outlines; // List of outlines in blob.
private: // TODO(rays) Someday the data members will be private too.
// For all the edge steps in all the outlines, or polygonal approximation
// where there are no edge steps, collects the steps into the bounding_box,
// llsq and/or the x_coords/y_coords. Both are used in different kinds of
// normalization.
// For a description of x_coords, y_coords, see GetEdgeCoords above.
void CollectEdges(const TBOX &box, TBOX *bounding_box, LLSQ *llsq,
std::vector<std::vector<int>> *x_coords,
std::vector<std::vector<int>> *y_coords) const;
private:
// DENORM indicating the transformations that this blob has undergone so far.
DENORM denorm_;
}; // Blob structure.
// A word: a vector of TBLOB pointers plus a flag saying whether the word is
// in a latin-based script.
struct TWERD {
TWERD() : latin_script(false) {}
// Takes its content from src via CopyFrom.
// NOTE(review): latin_script is not initialised here before CopyFrom runs;
// presumably CopyFrom assigns it - confirm in the definition.
TWERD(const TWERD &src) {
CopyFrom(src);
}
// Releases owned data through Clear().
~TWERD() {
Clear();
}
TWERD &operator=(const TWERD &src) {
CopyFrom(src);
return *this;
}
// Factory to build a TWERD from a (C_BLOB) WERD, with polygonal
// approximation along the way.
static TWERD *PolygonalCopy(bool allow_detailed_fx, WERD *src);
// Baseline normalizes the blobs in-place, recording the normalization in the
// DENORMs in the blobs.
void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height,
float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint,
const TBOX *norm_box, DENORM *word_denorm);
// Copies the data and the blobs, but leaves next untouched.
void CopyFrom(const TWERD &src);
// Deletes owned data.
void Clear();
// Recomputes the bounding boxes of the blobs.
void ComputeBoundingBoxes();
// Returns the number of blobs in the word (size of the blobs vector,
// converted to int).
int NumBlobs() const {
return blobs.size();
}
TBOX bounding_box() const;
// Merges the blobs from start to end, not including end, and deletes
// the blobs between start and end.
void MergeBlobs(int start, int end);
#ifndef GRAPHICS_DISABLED
void plot(ScrollView *window);
#endif // !GRAPHICS_DISABLED
std::vector<TBLOB *> blobs; // Blobs in word.
bool latin_script; // This word is in a latin-based script.
};
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.
// Both functions are only declared here; their definitions are elsewhere.
// NOTE(review): presumably divisible_blob reports whether blob can be divided
// and writes the dividing position to *location, and divide_blobs then splits
// blob at that location into blob and other_blob - confirm in the definitions.
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location);
void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, const TPOINT &location);
} // namespace tesseract
#endif

View File

@ -0,0 +1,74 @@
/**********************************************************************
* File: blread.cpp (Formerly pdread.c)
* Description: Friend function of BLOCK to read the uscan pd file.
* Author: Ray Smith
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "blread.h"
#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
#include "scanutils.h" // for tfscanf
#include <cstdio> // for fclose, fopen, FILE
namespace tesseract {
#define UNLV_EXT ".uzn" // unlv zone file
/**********************************************************************
 * read_unlv_file
 *
 * Read a whole unlv zone file to make a list of blocks.
 *
 * UNLV_EXT is appended to name in place, so the caller's string holds the
 * full zone file name afterwards. Every line that yields four integers
 * (x, y, width, height in top-down coordinates) adds one rectangular BLOCK
 * to the end of blocks, with the y values flipped using ysize.
 * Returns false if the zone file cannot be opened, true otherwise.
 **********************************************************************/
bool read_unlv_file(    // read zones into blocks
    std::string &name,  // basename of file, extended in place
    int32_t xsize,      // image size (not used in this function)
    int32_t ysize,      // image size, used to flip the y coords
    BLOCK_LIST *blocks  // output list
) {
  BLOCK_IT block_it = blocks; // block iterator
  name += UNLV_EXT;           // add extension

  FILE *zone_file = fopen(name.c_str(), "rb");
  if (zone_file == nullptr) {
    return false; // didn't read one
  }

  int x; // current top-down coords
  int y;
  int width; // of current block
  int height;
  while (tfscanf(zone_file, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
    // Corner values in the order the BLOCK constructor receives them.
    const auto x0 = static_cast<int16_t>(x);
    const auto y0 = static_cast<int16_t>(ysize - y - height);
    const auto x1 = static_cast<int16_t>(x + width);
    const auto y1 = static_cast<int16_t>(ysize - y);
    // make rect block and put it on the end of the list
    block_it.add_to_end(new BLOCK(name.c_str(), true, 0, 0, x0, y0, x1, y1));
  }
  fclose(zone_file);

  tprintf("UZN file %s loaded.\n", name.c_str());
  return true;
}
void FullPageBlock(int width, int height, BLOCK_LIST *blocks) {
BLOCK_IT block_it(blocks);
auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
block_it.add_to_end(block);
}
} // namespace tesseract

View File

@ -0,0 +1,40 @@
/**********************************************************************
* File: blread.h (Formerly pdread.h)
* Description: Friend function of BLOCK to read the uscan pd file.
* Author: Ray Smith
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef BLREAD_H
#define BLREAD_H
#include <cstdint> // for int32_t
#include <string> // for std::string
namespace tesseract {
class BLOCK_LIST;
// Reads a whole unlv zone file to make a list of blocks.
// The zone file extension is appended to name in place. Returns false if the
// file cannot be opened.
bool read_unlv_file( // read unlv zone file
std::string &name, // basename of file
int32_t xsize, // image size
int32_t ysize, // image size
BLOCK_LIST *blocks // output list
);
// Adds one block with the coordinates (0, 0, width, height) to blocks.
void FullPageBlock(int width, int height, BLOCK_LIST *blocks);
} // namespace tesseract
#endif

View File

@ -0,0 +1,282 @@
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "boxread.h"
#include "errcode.h" // for ERRCODE, TESSEXIT
#include "fileerr.h" // for CANTOPENFILE
#include "rect.h" // for TBOX
#include "tprintf.h" // for tprintf
#include <tesseract/unichar.h> // for UNICHAR
#include "helpers.h" // for chomp_string
#include <climits> // for INT_MAX
#include <cstring> // for strchr, strcmp
#include <fstream> // for std::ifstream
#include <locale> // for std::locale::classic
#include <sstream> // for std::stringstream
#include <string> // for std::string
namespace tesseract {
// Special char code used to identify multi-blob labels.
static const char *kMultiBlobLabelCode = "WordStr";
// Returns the box file name corresponding to the given image_filename.
// A trailing ".bin.png" or ".nrm.png" is removed as a whole; otherwise the
// last extension of the file name is removed. ".box" is then appended.
// Only a dot that comes after the last path separator ('/' or '\\') counts as
// the start of an extension, so "dir.v1/image" maps to "dir.v1/image.box"
// instead of being cut back to "dir.box".
static std::string BoxFileName(const char *image_filename) {
  std::string box_filename = image_filename;
  const size_t length = box_filename.length();
  const std::string last = (length > 8) ? box_filename.substr(length - 8) : "";
  if (last == ".bin.png" || last == ".nrm.png") {
    box_filename.resize(length - 8);
  } else {
    const size_t lastdot = box_filename.find_last_of('.');
    const size_t lastsep = box_filename.find_last_of("/\\");
    // Ignore dots that belong to a directory component.
    const bool dot_in_basename =
        lastdot != std::string::npos && (lastsep == std::string::npos || lastdot > lastsep);
    if (dot_in_basename) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
// Open the boxfile based on the given image filename.
FILE *OpenBoxFile(const char *fname) {
std::string filename = BoxFileName(fname);
FILE *box_file = nullptr;
if (!(box_file = fopen(filename.c_str(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s", filename.c_str());
}
return box_file;
}
// Reads all boxes from the box file that belongs to the given filename.
// Reads a specific target_page number if >= 0, or all pages otherwise.
// Skips blanks if skip_blanks is true.
// The UTF-8 label of the box is put in texts, and the full box definition as
// a string is put in box_texts, with the corresponding page number in pages.
// Each of the output vectors is optional (may be nullptr).
// Returns false if the file yields no data or no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
                  std::vector<std::string> *texts, std::vector<std::string> *box_texts,
                  std::vector<int> *pages) {
  const std::string box_path = BoxFileName(filename);
  std::ifstream input(box_path.c_str(), std::ios::in | std::ios::binary);

  // Slurp the whole file into memory.
  std::vector<char> contents;
  contents.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
  if (contents.empty()) {
    return false;
  }

  // Terminate the bytes so they can be handed to the parser as a C string.
  contents.push_back('\0');
  const bool continue_on_failure = true;
  return ReadMemBoxes(target_page, skip_blanks, contents.data(), continue_on_failure, boxes, texts,
                      box_texts, pages);
}
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
// box_data is split into lines and each line is parsed with ParseBoxFileStr.
// With continue_on_failure, lines that do not parse are skipped; without it,
// the first such line makes the function return false.
// Boxes on other pages than target_page (when target_page >= 0) and, with
// skip_blanks, boxes labelled with a single space or tab are left out.
// Each of the output vectors is optional (may be nullptr).
// Returns true if at least one box was accepted.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
                  std::vector<TBOX> *boxes, std::vector<std::string> *texts,
                  std::vector<std::string> *box_texts, std::vector<int> *pages) {
  std::string box_str(box_data);
  std::vector<std::string> lines = split(box_str, '\n');
  if (lines.empty()) {
    return false;
  }
  int num_boxes = 0;
  for (auto &line : lines) {
    int page = 0;
    std::string utf8_str;
    TBOX box;
    if (!ParseBoxFileStr(line.c_str(), &page, utf8_str, &box)) {
      if (continue_on_failure) {
        continue;
      } else {
        return false;
      }
    }
    if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) {
      continue;
    }
    if (target_page >= 0 && page != target_page) {
      continue;
    }
    if (boxes != nullptr) {
      boxes->push_back(box);
    }
    if (texts != nullptr) {
      texts->push_back(utf8_str);
    }
    if (box_texts != nullptr) {
      std::string full_text;
      // Use the page parsed from the line, not target_page: they are equal
      // when a page was requested, and when all pages are read
      // (target_page < 0) the box definition must carry its real page number
      // rather than the "-1" request value.
      MakeBoxFileStr(utf8_str.c_str(), box, page, full_text);
      box_texts->push_back(full_text);
    }
    if (pages != nullptr) {
      pages->push_back(page);
    }
    ++num_boxes;
  }
  return num_boxes > 0;
}
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
// Box files are used ONLY DURING TRAINING, but by both processes of
// creating tr files with tesseract, and unicharset_extractor.
// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
// for valid utf-8 and allows space or tab between fields.
// utf8_str is set with the unichar string, and bounding box with the box.
// If there are page numbers in the file, it reads them all.
// Implemented by calling the page-selecting overload with a target page
// of -1.
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) {
return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
}
// As ReadNextBox above, but get a specific page number. (0-based)
// Use -1 to read any page number. Files without page number all
// read as if they are page 0.
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
TBOX *bounding_box) {
int page = 0;
char buff[kBoxReadBufSize]; // boxfile read buffer
char *buffptr = buff;
while (fgets(buff, sizeof(buff) - 1, box_file)) {
(*line_number)++;
buffptr = buff;
const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
buffptr += 3; // Skip unicode file designation.
}
// Check for blank lines in box file
if (*buffptr == '\n' || *buffptr == '\0') {
continue;
}
// Skip blank boxes.
if (*buffptr == ' ' || *buffptr == '\t') {
continue;
}
if (*buffptr != '\0') {
if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) {
tprintf("Box file format error on line %i; ignored\n", *line_number);
continue;
}
if (target_page >= 0 && target_page != page) {
continue; // Not on the appropriate page.
}
return true; // Successfully read a box.
}
}
fclose(box_file);
return false; // EOF
}
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
// The box file is assumed to contain box definitions, one per line, of the
// following format for blob-level boxes:
// <UTF8 str> <left> <bottom> <right> <top> <page id>
// and for word/line-level boxes:
// WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
// See applybox.cpp for more information.
// All four coordinates must be present and ordered (left <= right and
// bottom <= top); the page id is optional and reads as 0 when missing.
// On failure utf8_str is left empty and *bounding_box is left as a
// default-constructed TBOX.
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
                     TBOX *bounding_box) {
  *bounding_box = TBOX(); // Initialize it to empty.
  utf8_str = "";
  char uch[kBoxReadBufSize];
  const char *buffptr = boxfile_str;
  // Read the unichar without messing up on Tibetan.
  // According to issue 253 the utf-8 surrogates 85 and A0 are treated
  // as whitespace by sscanf, so it is more reliable to just find
  // ascii space and tab.
  int uch_len = 0;
  // Skip unicode file designation, if present.
  const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);
  if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
    buffptr += 3;
  }
  // Allow a single blank as the UTF-8 string. Check for empty string and
  // then blindly eat the first character.
  if (*buffptr == '\0') {
    return false;
  }
  do {
    uch[uch_len++] = *buffptr++;
  } while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
           uch_len < kBoxReadBufSize - 1);
  uch[uch_len] = '\0';
  if (*buffptr != '\0') {
    ++buffptr;
  }
  int x_min = INT_MAX;
  int y_min = INT_MAX;
  int x_max = INT_MIN;
  int y_max = INT_MIN;
  *page_number = 0;
  std::stringstream stream(buffptr);
  stream.imbue(std::locale::classic());
  stream >> x_min >> y_min >> x_max >> y_max;
  // Remember whether all four coordinates were really read. A failed
  // extraction stores 0 in its target, which could otherwise slip through
  // the ordering test below (e.g. "a -1 -1 0").
  const bool coords_read = !stream.fail();
  // The page id is optional, so a failure here is not an error; the value
  // then stays/becomes 0.
  stream >> *page_number;
  if (!coords_read || x_max < x_min || y_max < y_min) {
    tprintf("Bad box coordinates in boxfile string! %s\n", boxfile_str);
    return false;
  }
  // Test for long space-delimited string label.
  if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != nullptr) {
    strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
    uch[kBoxReadBufSize - 1] = '\0'; // Prevent buffer overrun.
    chomp_string(uch);
    uch_len = strlen(uch);
  }
  // Validate UTF8 by making unichars with it.
  int used = 0;
  while (used < uch_len) {
    tesseract::UNICHAR ch(uch + used, uch_len - used);
    int new_used = ch.utf8_len();
    if (new_used == 0) {
      // Print the offending byte as unsigned so that bytes >= 0x80 are not
      // sign-extended in the diagnostic.
      tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used,
              static_cast<unsigned char>(uch[used]), used + 1);
      return false;
    }
    used += new_used;
  }
  utf8_str = uch;
  // The coordinates are known to be ordered at this point (checked above).
  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
  return true; // Successfully read a box.
}
// Creates a box file string from a unichar string, TBOX and page number.
// The result, written to box_str, is the unichar string followed by the
// left, bottom, right and top of the box and the page number, each preceded
// by a single space.
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str) {
  box_str.assign(unichar_str);
  box_str.append(" ").append(std::to_string(box.left()));
  box_str.append(" ").append(std::to_string(box.bottom()));
  box_str.append(" ").append(std::to_string(box.right()));
  box_str.append(" ").append(std::to_string(box.top()));
  box_str.append(" ").append(std::to_string(page_num));
}
} // namespace tesseract

View File

@ -0,0 +1,89 @@
/**********************************************************************
* File: boxread.h
* Description: Read data from a box file.
* Author: Ray Smith
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_BOXREAD_H_
#define TESSERACT_CCUTIL_BOXREAD_H_
#include <cstdio> // for FILE
#include <string> // for std::string
#include <vector> // for std::vector
#include <tesseract/export.h> // for TESS_API
namespace tesseract {
class TBOX;
// Size of buffer used to read a line from a box file.
const int kBoxReadBufSize = 1024;
// Open the boxfile based on the given image filename.
// Returns nullptr if the box file cannot be opened.
// NOTE(review): the definition reports an open failure through
// CANTOPENFILE.error with the TESSEXIT action, so the nullptr return may
// never be reached - confirm what TESSEXIT does.
TESS_API
FILE *OpenBoxFile(const char *filename);
// Reads all boxes from the given filename.
// Reads a specific target_page number if >= 0, or all pages otherwise.
// Skips blanks if skip_blanks is true.
// The UTF-8 label of the box is put in texts, and the full box definition as
// a string is put in box_texts, with the corresponding page number in pages.
// Each of the output vectors is optional (may be nullptr).
// Returns false if no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
std::vector<std::string> *texts, std::vector<std::string> *box_texts,
std::vector<int> *pages);
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
// continue_on_failure allows reading to continue even if an invalid box is
// encountered and will return true if it succeeds in reading some boxes.
// It otherwise gives up and returns false on encountering an invalid box.
TESS_API
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
std::vector<TBOX> *boxes, std::vector<std::string> *texts,
std::vector<std::string> *box_texts, std::vector<int> *pages);
// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
// for valid utf-8 and allows space or tab between fields.
// utf8_str is set with the unichar string, and bounding box with the box.
// If there are page numbers in the file, it reads them all.
// *line_number is incremented for each line read.
TESS_API
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);
// As ReadNextBox above, but get a specific page number. (0-based)
// Use -1 to read any page number. Files without page number all
// read as if they are page 0.
TESS_API
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
TBOX *bounding_box);
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
TESS_API
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
TBOX *bounding_box);
// Creates a box file string from a unichar string, TBOX and page number.
// The result is written to box_str.
TESS_API
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str);
} // namespace tesseract
#endif // TESSERACT_CCUTIL_BOXREAD_H_

View File

@ -0,0 +1,205 @@
///////////////////////////////////////////////////////////////////////
// File: boxword.cpp
// Description: Class to represent the bounding boxes of the output.
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "boxword.h"
#include "blobs.h"
#include "host.h" // for NearlyEqual
#include "normalis.h"
#include "ocrblock.h"
#include "pageres.h"
namespace tesseract {
// Clip output boxes to input blob boxes for bounds that are within this
// tolerance. Otherwise, the blob may be chopped and we have to just use
// the word bounding box.
// Used by BoxWord::ClipToOriginalWord as the tolerance argument of
// NearlyEqual, applied separately to each of the four box edges.
const int kBoxClipTolerance = 2;
// Constructs an empty BoxWord: length_ is zero, bbox_ and boxes_ are
// default-constructed.
BoxWord::BoxWord() : length_(0) {}
// Copy constructor. Delegates to CopyFrom, which assigns bbox_, length_ and
// boxes_ from src.
BoxWord::BoxWord(const BoxWord &src) {
CopyFrom(src);
}
// Copy assignment. Replaces the contents of *this with a copy of src and
// returns *this.
// Self-assignment is a no-op: copying an object onto itself must not go
// through CopyFrom's clear-then-copy sequence, which would empty the very
// vector it is about to read from.
BoxWord &BoxWord::operator=(const BoxWord &src) {
  if (this != &src) {
    CopyFrom(src);
  }
  return *this;
}
void BoxWord::CopyFrom(const BoxWord &src) {
bbox_ = src.bbox_;
length_ = src.length_;
boxes_.clear();
boxes_.reserve(length_);
for (int i = 0; i < length_; ++i) {
boxes_.push_back(src.boxes_[i]);
}
}
// Factory: builds a heap-allocated BoxWord with one box per blob of tessword.
// Each box is the bounding box of the blob's outline points after they have
// been mapped back through the blob's DENORM. An edge point contributes
// unless both it and its predecessor are hidden.
// The caller receives ownership of the returned object.
BoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) {
  auto *result = new BoxWord();
  result->length_ = tessword->NumBlobs();
  result->boxes_.reserve(result->length_);
  for (int blob_index = 0; blob_index < result->length_; ++blob_index) {
    TBLOB *blob = tessword->blobs[blob_index];
    TBOX denormed_box;
    TESSLINE *outline = blob->outlines;
    while (outline != nullptr) {
      // Walk the circular list of edge points exactly once.
      EDGEPT *pt = outline->loop;
      do {
        const bool contributes = !pt->IsHidden() || !pt->prev->IsHidden();
        if (contributes) {
          TPOINT denormed;
          blob->denorm().DenormTransform(nullptr, pt->pos, &denormed);
          ICOORD image_pos(pt->pos.x, pt->pos.y);
          image_pos.set_x(denormed.x);
          image_pos.set_y(denormed.y);
          // Grow the blob box by the single denormalized point.
          denormed_box += TBOX(image_pos, image_pos);
        }
        pt = pt->next;
      } while (pt != outline->loop);
      outline = outline->next;
    }
    result->boxes_.push_back(denormed_box);
  }
  result->ComputeBoundingBox();
  return result;
}
// Clean up the bounding boxes from the polygonal approximation by
// expanding slightly, then clipping to the blobs from the original_word
// that overlap. If not null, the block provides the inverse rotation.
void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {
for (int i = 0; i < length_; ++i) {
TBOX box = boxes_[i];
// Expand by a single pixel, as the poly approximation error is 1 pixel.
box = TBOX(box.left() - 1, box.bottom() - 1, box.right() + 1, box.top() + 1);
// Now find the original box that matches.
TBOX original_box;
C_BLOB_IT b_it(original_word->cblob_list());
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
TBOX blob_box = b_it.data()->bounding_box();
if (block != nullptr) {
blob_box.rotate(block->re_rotation());
}
if (blob_box.major_overlap(box)) {
original_box += blob_box;
}
}
if (!original_box.null_box()) {
if (NearlyEqual<int>(original_box.left(), box.left(), kBoxClipTolerance)) {
box.set_left(original_box.left());
}
if (NearlyEqual<int>(original_box.right(), box.right(), kBoxClipTolerance)) {
box.set_right(original_box.right());
}
if (NearlyEqual<int>(original_box.top(), box.top(), kBoxClipTolerance)) {
box.set_top(original_box.top());
}
if (NearlyEqual<int>(original_box.bottom(), box.bottom(), kBoxClipTolerance)) {
box.set_bottom(original_box.bottom());
}
}
original_box = original_word->bounding_box();
if (block != nullptr) {
original_box.rotate(block->re_rotation());
}
boxes_[i] = box.intersection(original_box);
}
ComputeBoundingBox();
}
// Merges the boxes from start to end, not including end, and deletes
// the boxes between start and end.
void BoxWord::MergeBoxes(int start, int end) {
start = ClipToRange(start, 0, length_);
end = ClipToRange(end, 0, length_);
if (end <= start + 1) {
return;
}
for (int i = start + 1; i < end; ++i) {
boxes_[start] += boxes_[i];
}
int shrinkage = end - 1 - start;
length_ -= shrinkage;
for (int i = start + 1; i < length_; ++i) {
boxes_[i] = boxes_[i + shrinkage];
}
boxes_.resize(length_);
}
// Inserts box before position index, or appends it when index is at or
// beyond the current length. Updates length_ and recomputes the word
// bounding box.
void BoxWord::InsertBox(int index, const TBOX &box) {
  const auto position = (index < length_) ? boxes_.begin() + index : boxes_.end();
  boxes_.insert(position, box);
  length_ = boxes_.size();
  ComputeBoundingBox();
}
// Changes the box at the given index to the new box.
// Recomputes the bounding box.
// The index is not range-checked here (unlike DeleteBox), so the caller must
// pass a valid index.
void BoxWord::ChangeBox(int index, const TBOX &box) {
boxes_[index] = box;
ComputeBoundingBox();
}
// Deletes the box with the given index, and shuffles up the rest.
// Recomputes the bounding box.
// index must satisfy 0 <= index < length_; this is enforced with ASSERT_HOST.
void BoxWord::DeleteBox(int index) {
ASSERT_HOST(0 <= index && index < length_);
boxes_.erase(boxes_.begin() + index);
--length_;
ComputeBoundingBox();
}
// Removes every box and resets the word bounding box to a default TBOX.
void BoxWord::DeleteAllBoxes() {
  boxes_.clear();
  bbox_ = TBOX();
  length_ = 0;
}
// Computes the bounding box of the word.
void BoxWord::ComputeBoundingBox() {
bbox_ = TBOX();
for (int i = 0; i < length_; ++i) {
bbox_ += boxes_[i];
}
}
// This and other are putatively the same word. Calls cb(i) for every blob
// index i, below both this length and other.NumBlobs(), at which the
// bounding box of other's blob equals the stored box.
// cb is a std::function taken by value; nothing is deleted by this function.
void BoxWord::ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const {
  for (int i = 0; i < length_; ++i) {
    if (i >= other.NumBlobs()) {
      break;
    }
    TBOX blob_box = other.blobs[i]->bounding_box();
    if (blob_box == boxes_[i]) {
      cb(i);
    }
  }
}
} // namespace tesseract.

View File

@ -0,0 +1,97 @@
///////////////////////////////////////////////////////////////////////
// File: boxword.h
// Description: Class to represent the bounding boxes of the output.
// Author: Ray Smith
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CSTRUCT_BOXWORD_H_
#define TESSERACT_CSTRUCT_BOXWORD_H_
#include "rect.h" // for TBOX
#include <functional> // for std::function
namespace tesseract {
class BLOCK;
class WERD;
struct TWERD;
// Class to hold an array of bounding boxes for an output word and
// the bounding box of the whole word.
class BoxWord {
public:
// Creates an empty word.
BoxWord();
// Copy constructor. It is declared explicit, so copies must be written as
// direct initialization, e.g. BoxWord copy(original).
explicit BoxWord(const BoxWord &src);
~BoxWord() = default;
// Copy assignment; returns *this.
BoxWord &operator=(const BoxWord &src);
// Replaces the contents of this with a copy of src.
void CopyFrom(const BoxWord &src);
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
// switch back to original image coordinates.
// Returns a raw pointer to a new object.
static BoxWord *CopyFromNormalized(TWERD *tessword);
// Clean up the bounding boxes from the polygonal approximation by
// expanding slightly, then clipping to the blobs from the original_word
// that overlap. If not null, the block provides the inverse rotation.
void ClipToOriginalWord(const BLOCK *block, WERD *original_word);
// Merges the boxes from start to end, not including end, and deletes
// the boxes between start and end.
void MergeBoxes(int start, int end);
// Inserts a new box before the given index.
// Recomputes the bounding box.
void InsertBox(int index, const TBOX &box);
// Changes the box at the given index to the new box.
// Recomputes the bounding box.
void ChangeBox(int index, const TBOX &box);
// Deletes the box with the given index, and shuffles up the rest.
// Recomputes the bounding box.
void DeleteBox(int index);
// Deletes all the boxes stored in BoxWord.
void DeleteAllBoxes();
// This and other putatively are the same, so call the callback
// for each blob index where the bounding boxes match.
// The callback is a std::function passed by value.
void ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const;
// Bounding box of the whole word.
const TBOX &bounding_box() const {
return bbox_;
}
// Number of boxes held.
int length() const {
return length_;
}
// Box at the given index. The index is not range-checked.
const TBOX &BlobBox(int index) const {
return boxes_[index];
}
private:
// Recomputes bbox_ from the stored boxes.
void ComputeBoundingBox();
// Bounding box of the whole word.
TBOX bbox_;
// Number of boxes; kept alongside boxes_ by the member functions.
int length_;
// One bounding box per blob.
std::vector<TBOX> boxes_;
};
} // namespace tesseract.
#endif // TESSERACT_CSTRUCT_BOXWORD_H_

View File

@ -0,0 +1,36 @@
///////////////////////////////////////////////////////////////////////
// File: ccstruct.cpp
// Description: ccstruct class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "ccstruct.h"
namespace tesseract {
// APPROXIMATIONS of the fractions of the character cell taken by
// the descenders, ascenders, and x-height.
// The three fractions sum to 1.0.
const double CCStruct::kDescenderFraction = 0.25;
const double CCStruct::kXHeightFraction = 0.5;
const double CCStruct::kAscenderFraction = 0.25;
// x-height as a fraction of (x-height + ascender): 0.5 / 0.75 with the
// values above. The two operands are defined just above in this same file.
const double CCStruct::kXHeightCapRatio =
CCStruct::kXHeightFraction / (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction);
// Destructor.
// It is defined here, so the compiler can create a single vtable
// instead of weak vtables in every compilation unit.
// The destructor itself does no work beyond the defaulted behaviour.
CCStruct::~CCStruct() = default;
} // namespace tesseract

View File

@ -0,0 +1,41 @@
///////////////////////////////////////////////////////////////////////
// File: ccstruct.h
// Description: ccstruct class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H_
#define TESSERACT_CCSTRUCT_CCSTRUCT_H_
#include "ccutil.h" // for CCUtil
namespace tesseract {
// Extends CCUtil with a few globally accessible layout constants.
class TESS_API CCStruct : public CCUtil {
public:
CCStruct() = default;
// Declared here only; the defaulted definition lives outside this header.
~CCStruct() override;
// Globally accessible constants.
// APPROXIMATIONS of the fractions of the character cell taken by
// the descenders, ascenders, and x-height.
// The trailing comments give the values assigned in the out-of-line
// definitions.
static const double kDescenderFraction; // = 0.25;
static const double kXHeightFraction; // = 0.5;
static const double kAscenderFraction; // = 0.25;
// Derived value giving the x-height as a fraction of cap-height.
static const double kXHeightCapRatio; // = XHeight/(XHeight + Ascender).
};
} // namespace tesseract
#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,297 @@
/**********************************************************************
* File: coutln.h
* Description: Code for the C_OUTLINE class.
* Author: Ray Smith
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef COUTLN_H
#define COUTLN_H
#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
#include "mod128.h" // for DIR128, DIRBITS
#include "points.h" // for ICOORD, FCOORD
#include "rect.h" // for TBOX
#include "scrollview.h" // for ScrollView, ScrollView::Color
#include <tesseract/export.h> // for DLLSYM
#include <cstdint> // for int16_t, int32_t
#include <bitset> // for std::bitset<16>
struct Pix;
namespace tesseract {
class CRACKEDGE;
class DENORM;
// Sentinel used where no winding number can be given.
// NOTE(review): presumably what C_OUTLINE::winding_number returns when the
// test point lies on the outline -- confirm in coutln.cpp.
#define INTERSECTING INT16_MAX // no winding number
// mask to get step
// Isolates one 2-bit step code after the packed step byte has been shifted.
#define STEP_MASK 3
// Flag indices for C_OUTLINE::flag and C_OUTLINE::set_flag. Each enumerator
// is used as a bit position in the outline's std::bitset<16>.
enum C_OUTLINE_FLAGS {
COUT_INVERSE // White on black blob
};
// Simple struct to hold the 3 values needed to compute a more precise edge
// position and direction. The offset_numerator is the difference between the
// grey threshold and the mean pixel value. pixel_diff is the difference between
// the pixels in the edge. Consider the following row of pixels: p1 p2 p3 p4 p5
// Say the image was thresholded at threshold t, making p1, p2, p3 black
// and p4, p5 white (p1, p2, p3 < t, and p4, p5 >= t), but suppose that
// max(p[i+1] - p[i]) is p3 - p2. Then the extrapolated position of the edge,
// based on the maximum gradient, is at the crack between p2 and p3 plus the
// offset (t - (p2+p3)/2)/(p3 - p2). We store the pixel difference p3-p2
// denominator in pixel_diff and the offset numerator, relative to the original
// binary edge (t - (p2+p3)/2) - (p3 -p2) in offset_numerator.
// The sign of offset_numerator and pixel_diff are manipulated to ensure
// that the pixel_diff, which will be used as a weight, is always positive.
// The direction stores the quantized feature direction for the given step
// computed from the edge gradient. (Using binary_angle_plus_pi.)
// If the pixel_diff is zero, it means that the direction of the gradient
// is in conflict with the step direction, so this step is to be ignored.
struct EdgeOffset {
// Signed numerator of the sub-pixel offset; divided by pixel_diff when used.
int8_t offset_numerator;
// Denominator of the offset and weight of the step; 0 means "ignore step".
uint8_t pixel_diff;
// Quantized feature direction of the step.
uint8_t direction;
};
class C_OUTLINE; // forward declaration
ELISTIZEH(C_OUTLINE)
// A closed outline stored as a start position plus a chain of unit steps.
// Each step is a 2-bit chain code (0-3) and four steps are packed into each
// byte of `steps`. An outline may own child outlines (`children`) and an
// optional array of sub-pixel EdgeOffsets (`offsets`).
// NOTE(review): the class frees `offsets` in its destructor and declares
// operator=, but no copy constructor is declared here, so the implicit one
// would copy the `offsets` pointer. Use deep_copy() or operator= on a
// default-constructed object to duplicate an outline.
class C_OUTLINE : public ELIST_LINK {
public:
  // Creates an empty outline with no steps and no edge offsets.
  C_OUTLINE() {
    stepcount = 0;
    offsets = nullptr;
  }
  // Builds an outline from the output of the edge detector.
  C_OUTLINE(              // constructor
      CRACKEDGE *startpt, // from edge detector
      ICOORD bot_left,    // bounding box
      ICOORD top_right,   // bounding box
      int16_t length);    // length of loop
  // Builds an outline from a start point and an array of step directions.
  C_OUTLINE(ICOORD startpt,    // start of loop
            DIR128 *new_steps, // steps in loop
            int16_t length);   // length of loop
  // Builds a rotated copy of srcline.
  C_OUTLINE(C_OUTLINE *srcline, // outline to copy
            FCOORD rotation);   // and rotate

  // Build a fake outline, given just a bounding box and append to the list.
  static void FakeOutline(const TBOX &box, C_OUTLINE_LIST *outlines);

  // Releases the edge-offset array, if any.
  ~C_OUTLINE() {
    delete[] offsets;
  }

  // Returns the value of the flag bit selected by mask.
  bool flag(C_OUTLINE_FLAGS mask) const {
    return flags[mask];
  }
  // Sets the flag bit selected by mask to value.
  void set_flag(C_OUTLINE_FLAGS mask, bool value) {
    flags.set(mask, value);
  }

  // Returns the list of child outlines.
  C_OUTLINE_LIST *child() {
    return &children;
  }

  // Returns the stored bounding box.
  const TBOX &bounding_box() const {
    return box;
  }

  // Stores the chain code stepdir at position stepindex. Only the low two
  // bits of stepdir are kept; the other three steps sharing the same byte
  // are preserved.
  void set_step(int16_t stepindex, // index of step
                int8_t stepdir) {  // chain code
    int shift = stepindex % 4 * 2;
    uint8_t mask = 3 << shift;
    // Squeeze 4 steps into each byte.
    steps[stepindex / 4] = ((stepdir << shift) & mask) | (steps[stepindex / 4] & ~mask);
  }
  // As above, taking a DIR128: its top two direction bits become the chain
  // code.
  void set_step(int16_t stepindex, // index of step
                DIR128 stepdir) {  // direction
    int8_t chaindir = stepdir.get_dir() >> (DIRBITS - 2);
    set_step(stepindex, chaindir);
  }

  // Returns the number of steps in the outline.
  int32_t pathlength() const {
    return stepcount;
  }
  // Return step at a given index as a DIR128.
  DIR128 step_dir(int index) const {
    return DIR128(
        static_cast<int16_t>(((steps[index / 4] >> (index % 4 * 2)) & STEP_MASK) << (DIRBITS - 2)));
  }
  // Return the step vector for the given outline position.
  ICOORD step(int index) const {
    return step_coords[chain_code(index)];
  }
  // Returns the start position of the outline.
  const ICOORD &start_pos() const {
    return start;
  }
  // Returns the position at the given index on the outline.
  // NOT to be used lightly, as it has to iterate the outline to find out.
  ICOORD position_at_index(int index) const {
    ICOORD pos = start;
    for (int i = 0; i < index; ++i) {
      pos += step(i);
    }
    return pos;
  }
  // Returns the sub-pixel accurate position given the integer position pos
  // at the given index on the outline. pos may be a return value of
  // position_at_index, or computed by repeatedly adding step to the
  // start_pos() in the usual way.
  // The base result is the midpoint of the step. When an edge offset with
  // non-zero pixel_diff exists, it is applied perpendicular to the step.
  FCOORD sub_pixel_pos_at_index(const ICOORD &pos, int index) const {
    const ICOORD &step_to_next(step(index));
    FCOORD f_pos(pos.x() + step_to_next.x() / 2.0f, pos.y() + step_to_next.y() / 2.0f);
    if (offsets != nullptr && offsets[index].pixel_diff > 0) {
      float offset = offsets[index].offset_numerator;
      offset /= offsets[index].pixel_diff;
      if (step_to_next.x() != 0) {
        // Horizontal step: the refinement moves the point vertically.
        f_pos.set_y(f_pos.y() + offset);
      } else {
        f_pos.set_x(f_pos.x() + offset);
      }
    }
    return f_pos;
  }
  // Returns the step direction for the given index or -1 if there is none.
  int direction_at_index(int index) const {
    if (offsets != nullptr && offsets[index].pixel_diff > 0) {
      return offsets[index].direction;
    }
    return -1;
  }
  // Returns the edge strength for the given index.
  // If there are no recorded edge strengths, returns 1 (assuming the image
  // is binary). Returns 0 if the gradient direction conflicts with the
  // step direction, indicating that this position could be skipped.
  int edge_strength_at_index(int index) const {
    if (offsets != nullptr) {
      return offsets[index].pixel_diff;
    }
    return 1;
  }
  // Return the step as a chain code (0-3) related to the standard feature
  // direction of binary_angle_plus_pi by:
  //   chain_code * 64 = feature direction.
  int chain_code(int index) const {
    return (steps[index / 4] >> (index % 4 * 2)) & STEP_MASK;
  }

  int32_t area() const;       // Returns area of self and 1st level children.
  int32_t perimeter() const;  // Total perimeter of self and 1st level children.
  int32_t outer_area() const; // Returns area of self only.
  int32_t count_transitions(  // count maxima
      int32_t threshold);     // size threshold

  // Containment test.
  bool operator<(const C_OUTLINE &other) const;
  // Containment test, expressed through operator<. The argument is taken by
  // const reference, because it is only read, so that const outlines can be
  // compared too.
  bool operator>(const C_OUTLINE &other) const {
    return other < *this;
  }

  int16_t winding_number(   // get winding number
      ICOORD testpt) const; // around this point
  // get direction
  int16_t turn_direction() const;
  void reverse(); // reverse direction

  void move(             // reposition outline
      const ICOORD vec); // by vector

  // Returns true if *this and its children are legally nested.
  // The outer area of a child should have the opposite sign to the
  // parent. If not, it means we have discarded an outline in between
  // (probably due to excessive length).
  bool IsLegallyNested() const;

  // If this outline is smaller than the given min_size, delete this and
  // remove from its list, via *it, after checking that *it points to this.
  // Otherwise, if any children of this are too small, delete them.
  // On entry, *it must be an iterator pointing to this. If this gets deleted
  // then this is extracted from *it, so an iteration can continue.
  void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it);

  // Adds sub-pixel resolution EdgeOffsets for the outline if the supplied
  // pix is 8-bit. Does nothing otherwise.
  void ComputeEdgeOffsets(int threshold, Image pix);
  // Adds sub-pixel resolution EdgeOffsets for the outline using only
  // a binary image source.
  void ComputeBinaryOffsets();

  // Renders the outline to the given pix, with left and top being
  // the coords of the upper-left corner of the pix.
  void render(int left, int top, Image pix) const;

  // Renders just the outline to the given pix (no fill), with left and top
  // being the coords of the upper-left corner of the pix.
  void render_outline(int left, int top, Image pix) const;

#ifndef GRAPHICS_DISABLED
  void plot(                           // draw one
      ScrollView *window,              // window to draw in
      ScrollView::Color colour) const; // colour to draw it
  // Draws the outline in the given colour, normalized using the given denorm,
  // making use of sub-pixel accurate information if available.
  void plot_normed(const DENORM &denorm, ScrollView::Color colour, ScrollView *window) const;
#endif // !GRAPHICS_DISABLED

  C_OUTLINE &operator=(const C_OUTLINE &source);

  // Returns a new heap-allocated outline assigned from *src via operator=.
  // The caller owns the result.
  static C_OUTLINE *deep_copy(const C_OUTLINE *src) {
    auto *outline = new C_OUTLINE;
    *outline = *src;
    return outline;
  }

  static ICOORD chain_step(int chaindir);

  // The maximum length of any outline. The stepcount is stored as 16 bits,
  // but it is probably not a good idea to increase this constant by much
  // and switch to 32 bits, as it plays an important role in keeping huge
  // outlines invisible, which prevents bad speed behavior.
  static const int kMaxOutlineLength = 16000;

private:
  // Helper for ComputeBinaryOffsets. Increments pos, dir_counts, pos_totals
  // by the step, increment, and vertical step ? x : y position * increment
  // at step s Mod stepcount respectively. Used to add or subtract the
  // direction and position to/from accumulators of a small neighbourhood.
  void increment_step(int s, int increment, ICOORD *pos, int *dir_counts, int *pos_totals) const;
  // Number of bytes needed to hold stepcount 2-bit steps.
  int step_mem() const {
    return (stepcount + 3) / 4;
  }

  TBOX box;                   // bounding box
  ICOORD start;               // start coord
  int16_t stepcount;          // no of steps
  std::bitset<16> flags;      // flags about outline
  std::vector<uint8_t> steps; // step array
  EdgeOffset *offsets;        // Higher precision edge.
  C_OUTLINE_LIST children;    // child elements
  static ICOORD step_coords[4];
};
} // namespace tesseract
#endif

View File

@ -0,0 +1,42 @@
/**********************************************************************
* File: crakedge.h (Formerly: crkedge.h)
* Description: Structures for the Crack following edge detector.
* Author: Ray Smith
* Created: Fri Mar 22 16:06:38 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef CRAKEDGE_H
#define CRAKEDGE_H
#include "mod128.h"
#include "points.h"
namespace tesseract {
// One node of a doubly linked chain of crack edges, as used by the crack
// following edge detector. Plain data holder with public members.
// NOTE(review): the defaulted constructor does not initialise stepx, stepy,
// stepdir, prev or next; the creator is expected to fill them in.
class CRACKEDGE {
public:
CRACKEDGE() = default;
ICOORD pos; /*position of crack */
int8_t stepx; // edge step
int8_t stepy;
int8_t stepdir; // chaincode
CRACKEDGE *prev; /*previous point */
CRACKEDGE *next; /*next point */
};
} // namespace tesseract
#endif

View File

@ -0,0 +1,58 @@
#ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_
#define TESSERACT_CCSTRUCT_DEBUGPIXA_H_
#include "image.h"
#include <allheaders.h>
namespace tesseract {
// Class to hold a Pixa collection of debug images with captions and save them
// to a PDF file.
class DebugPixa {
public:
// TODO(rays) add another constructor with size control.
DebugPixa() {
pixa_ = pixaCreate(0);
#ifdef TESSERACT_DISABLE_DEBUG_FONTS
fonts_ = NULL;
#else
fonts_ = bmfCreate(nullptr, 14);
#endif
}
// If the filename_ has been set and there are any debug images, they are
// written to the set filename_.
~DebugPixa() {
pixaDestroy(&pixa_);
bmfDestroy(&fonts_);
}
// Adds the given pix to the set of pages in the PDF file, with the given
// caption added to the top.
void AddPix(const Image pix, const char *caption) {
int depth = pixGetDepth(pix);
int color = depth < 8 ? 1 : (depth > 8 ? 0x00ff0000 : 0x80);
Image pix_debug =
pixAddSingleTextblock(pix, fonts_, caption, color, L_ADD_BELOW, nullptr);
pixaAddPix(pixa_, pix_debug, L_INSERT);
}
// Sets the destination filename and enables images to be written to a PDF
// on destruction.
void WritePDF(const char *filename) {
if (pixaGetCount(pixa_) > 0) {
pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, "AllDebugImages", filename);
pixaClear(pixa_);
}
}
private:
// The collection of images to put in the PDF.
Pixa *pixa_;
// The fonts used to draw text captions.
L_Bmf *fonts_;
};
} // namespace tesseract
#endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_

View File

@ -0,0 +1,302 @@
///////////////////////////////////////////////////////////////////////
// File: detlinefit.cpp
// Description: Deterministic least median squares line fitting.
// Author: Ray Smith
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "detlinefit.h"
#include "helpers.h" // for IntCastRounded
#include "statistc.h"
#include "tprintf.h"
#include <algorithm>
#include <cfloat> // for FLT_MAX
namespace tesseract {
// Number of candidate points taken from each end of the point list when
// searching for the best line.
constexpr int kNumEndPoints = 3;
// Minimum number of distances needed before a badly fitted line is scored by
// its count of mis-fitted points rather than by its distance error.
// To ensure a sensible error metric, kMinPointsForErrorCount should be at
// least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
// ComputeUpperQuartileError.
constexpr int kMinPointsForErrorCount = 16;
// Largest real distance used as the error before switching to the number of
// mis-fitted points, which will get square-rooted for true distance.
constexpr int kMaxRealDistance = 2;
// Constructs an empty fitter with square_length_ set to zero.
DetLineFit::DetLineFit() : square_length_(0.0) {}
// Delete all Added points.
// Also discards the distances computed by any previous fit.
void DetLineFit::Clear() {
pts_.clear();
distances_.clear();
}
// Add a new point. Takes a copy - the pt doesn't need to stay in scope.
// The point is stored with a half-width of 0.
void DetLineFit::Add(const ICOORD &pt) {
pts_.emplace_back(pt, 0);
}
// Adds a new point together with a half-width.
// If a point overlaps the previous point by more than half the width, and
// its distance is further than the previous point, then the more distant
// point is ignored in the distance calculation. Useful for ignoring i dots
// and other diacritics.
// This function only stores the pair; the half-width is applied later, when
// distances are computed.
void DetLineFit::Add(const ICOORD &pt, int halfwidth) {
pts_.emplace_back(pt, halfwidth);
}
// Fits a line to the points, ignoring the skip_first initial points and the
// skip_last final points, returning the fitted line as a pair of points,
// and the upper quartile error.
double DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) {
// Do something sensible with no points.
if (pts_.empty()) {
pt1->set_x(0);
pt1->set_y(0);
*pt2 = *pt1;
return 0.0;
}
// Count the points and find the first and last kNumEndPoints.
int pt_count = pts_.size();
ICOORD *starts[kNumEndPoints];
if (skip_first >= pt_count) {
skip_first = pt_count - 1;
}
int start_count = 0;
int end_i = std::min(skip_first + kNumEndPoints, pt_count);
for (int i = skip_first; i < end_i; ++i) {
starts[start_count++] = &pts_[i].pt;
}
ICOORD *ends[kNumEndPoints];
if (skip_last >= pt_count) {
skip_last = pt_count - 1;
}
int end_count = 0;
end_i = std::max(0, pt_count - kNumEndPoints - skip_last);
for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
ends[end_count++] = &pts_[i].pt;
}
// 1 or 2 points need special treatment.
if (pt_count <= 2) {
*pt1 = *starts[0];
if (pt_count > 1) {
*pt2 = *ends[0];
} else {
*pt2 = *pt1;
}
return 0.0;
}
// Although with between 2 and 2*kNumEndPoints-1 points, there will be
// overlap in the starts, ends sets, this is OK and taken care of by the
// if (*start != *end) test below, which also tests for equal input points.
double best_uq = -1.0;
// Iterate each pair of points and find the best fitting line.
for (int i = 0; i < start_count; ++i) {
ICOORD *start = starts[i];
for (int j = 0; j < end_count; ++j) {
ICOORD *end = ends[j];
if (*start != *end) {
ComputeDistances(*start, *end);
// Compute the upper quartile error from the line.
double dist = EvaluateLineFit();
if (dist < best_uq || best_uq < 0.0) {
best_uq = dist;
*pt1 = *start;
*pt2 = *end;
}
}
}
}
// Finally compute the square root to return the true distance.
return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
}
// Constrained fit with a supplied direction vector.
// Chooses as *line_pt the supplied point with the median cross product with
// direction, considering only points whose cross product lies within
// [min_dist, max_dist]. Returns the error metric computed over that same
// reduced set of points. With no points or no distances in range, sets
// *line_pt to the origin and returns 0.0.
// If debug is true, the choice and all distances are printed with tprintf.
// *Makes use of floating point arithmetic*
double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist,
                                  bool debug, ICOORD *line_pt) {
  ComputeConstrainedDistances(direction, min_dist, max_dist);
  if (pts_.empty() || distances_.empty()) {
    line_pt->set_x(0);
    line_pt->set_y(0);
    return 0.0;
  }

  // Partition so that the median element sits at its sorted position.
  auto median_index = distances_.size() / 2;
  const auto median_it = distances_.begin() + median_index;
  std::nth_element(distances_.begin(), median_it, distances_.end());
  *line_pt = median_it->data();

  if (debug) {
    tprintf("Constrained fit to dir %g, %g = %d, %d :%zu distances:\n", direction.x(), direction.y(),
            line_pt->x(), line_pt->y(), distances_.size());
    for (int i = 0; i < distances_.size(); ++i) {
      const auto &entry = distances_[i];
      tprintf("%d: %d, %d -> %g\n", i, entry.data().x(), entry.data().y(), entry.key());
    }
    tprintf("Result = %zu\n", median_index);
  }

  // Re-centre every distance on the chosen point before scoring the fit.
  double dist_origin = direction * *line_pt;
  for (auto &entry : distances_) {
    entry.key() -= dist_origin;
  }
  return sqrt(EvaluateLineFit());
}
// Returns true if there were enough points at the last call to Fit or
// ConstrainedFit for the fitted points to be used on a badly fitted line.
// "Enough" means at least kMinPointsForErrorCount entries in distances_.
bool DetLineFit::SufficientPointsForIndependentFit() const {
return distances_.size() >= kMinPointsForErrorCount;
}
// Backwards compatible fit returning a gradient and constant.
// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
// function in preference to the LMS class.
double DetLineFit::Fit(float *m, float *c) {
ICOORD start, end;
double error = Fit(&start, &end);
if (end.x() != start.x()) {
*m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
*c = start.y() - *m * start.x();
} else {
*m = 0.0f;
*c = 0.0f;
}
return error;
}
// Backwards compatible constrained fit with a supplied gradient.
// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
// to avoid potential difficulties with infinite gradients.
double DetLineFit::ConstrainedFit(double m, float *c) {
// Do something sensible with no points.
if (pts_.empty()) {
*c = 0.0f;
return 0.0;
}
double cos = 1.0 / sqrt(1.0 + m * m);
FCOORD direction(cos, m * cos);
ICOORD line_pt;
double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt);
*c = line_pt.y() - line_pt.x() * m;
return error;
}
// Computes and returns the squared evaluation metric for a line fit.
// Normally this is the squared upper-quartile error. When there are at least
// kMinPointsForErrorCount distances and that error exceeds
// kMaxRealDistance squared, the number of mis-fitted points is returned
// instead, as this gives a better measure of fit for badly fitted lines
// where more than a quarter of the points are badly fitted.
double DetLineFit::EvaluateLineFit() {
  const double quartile_error = ComputeUpperQuartileError();
  const bool enough_points = distances_.size() >= kMinPointsForErrorCount;
  const bool badly_fitted = quartile_error > kMaxRealDistance * kMaxRealDistance;
  if (!enough_points || !badly_fitted) {
    return quartile_error;
  }
  // Distances are stored scaled by the line length, so scale the limit too.
  const double threshold = kMaxRealDistance * sqrt(square_length_);
  return NumberOfMisfittedPoints(threshold);
}
// Replaces every stored distance by its absolute value, then returns the
// squared upper-quartile error distance (0 when there are no distances or
// when square_length_ is not positive).
// Note that distances_ is modified: negative keys are negated and the
// elements are partially re-ordered by std::nth_element.
double DetLineFit::ComputeUpperQuartileError() {
  const int count = distances_.size();
  if (count == 0) {
    return 0.0;
  }
  // Get the absolute values of the errors.
  for (auto &entry : distances_) {
    if (entry.key() < 0) {
      entry.key() = -entry.key();
    }
  }
  // Now get the upper quartile distance.
  const auto quartile_pos = 3 * count / 4;
  const auto quartile_it = distances_.begin() + quartile_pos;
  std::nth_element(distances_.begin(), quartile_it, distances_.end());
  const double dist = quartile_it->key();
  // The true distance is the square root of the dist squared / square_length.
  // Don't bother with the square root. Just return the square distance.
  if (square_length_ > 0.0) {
    return dist * dist / square_length_;
  }
  return 0.0;
}
// Counts the stored distances whose key is strictly greater than threshold
// and returns that count. The keys are compared as stored; no absolute
// value is taken here.
int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
  int misfit_count = 0;
  for (const auto &entry : distances_) {
    if (entry.key() > threshold) {
      ++misfit_count;
    }
  }
  return misfit_count;
}
// Computes all the cross product distances of the points from the line,
// storing the actual (signed) cross products in distances.
// Ignores distances of points that are further away than the previous point,
// and overlaps the previous point by at least half.
void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {
distances_.clear();
ICOORD line_vector = end;
line_vector -= start;
square_length_ = line_vector.sqlength();
int line_length = IntCastRounded(sqrt(square_length_));
// Compute the distance of each point from the line.
int prev_abs_dist = 0;
int prev_dot = 0;
for (int i = 0; i < pts_.size(); ++i) {
ICOORD pt_vector = pts_[i].pt;
pt_vector -= start;
int dot = line_vector % pt_vector;
// Compute |line_vector||pt_vector|sin(angle between)
int dist = line_vector * pt_vector;
int abs_dist = dist < 0 ? -dist : dist;
if (abs_dist > prev_abs_dist && i > 0) {
// Ignore this point if it overlaps the previous one.
int separation = abs(dot - prev_dot);
if (separation < line_length * pts_[i].halfwidth ||
separation < line_length * pts_[i - 1].halfwidth) {
continue;
}
}
distances_.emplace_back(dist, pts_[i].pt);
prev_abs_dist = abs_dist;
prev_dot = dot;
}
}
// Computes all the cross product distances of the points perpendicular to
// the given direction, ignoring distances outside of the give distance range,
// storing the actual (signed) cross products in distances_.
void DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist,
double max_dist) {
distances_.clear();
square_length_ = direction.sqlength();
// Compute the distance of each point from the line.
for (auto &pt : pts_) {
FCOORD pt_vector = pt.pt;
// Compute |line_vector||pt_vector|sin(angle between)
double dist = direction * pt_vector;
if (min_dist <= dist && dist <= max_dist) {
distances_.emplace_back(dist, pt.pt);
}
}
}
} // namespace tesseract.

View File

@ -0,0 +1,157 @@
///////////////////////////////////////////////////////////////////////
// File: detlinefit.h
// Description: Deterministic least upper-quartile squares line fitting.
// Author: Ray Smith
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_DETLINEFIT_H_
#define TESSERACT_CCSTRUCT_DETLINEFIT_H_
#include "kdpair.h"
#include "points.h"
namespace tesseract {
// This class fits a line to a set of ICOORD points.
// There is no restriction on the direction of the line, as it
// uses a vector method, ie no concern over infinite gradients.
// The fitted line has the least upper quartile of squares of perpendicular
// distances of all source points from the line, subject to the constraint
// that the line is made from one of the pairs of [{p1,p2,p3},{pn-2, pn-1, pn}]
// i.e. the 9 combinations of one of the first 3 and last 3 points.
// A fundamental assumption of this algorithm is that one of the first 3 and
// one of the last 3 points are near the best line fit.
// The points must be Added in line order for the algorithm to work properly.
// No floating point calculations are needed* to make an accurate fit,
// and no random numbers are needed** so the algorithm is deterministic,
// architecture-stable, and compiler-stable as well as stable to minor
// changes in the input.
// *A single floating point division is used to compute each line's distance.
// This is unlikely to result in choice of a different line, but if it does,
// it would be easy to replace with a 64 bit integer calculation.
// **Random numbers are used in the nth_item function, but the worst
// non-determinism that can result is picking a different result among equals,
// and that wouldn't make any difference to the end-result distance, so the
// randomness does not affect the determinism of the algorithm. The random
// numbers are only there to guarantee average linear time.
// Fitting time is linear, but with a high constant, as it tries 9 different
// lines and computes the distance of all points each time.
// This class is aimed at replacing the LLSQ (linear least squares) and
// LMS (least median of squares) classes that are currently used for most
// of the line fitting in Tesseract.
class DetLineFit {
public:
  DetLineFit();
  ~DetLineFit() = default;

  // Removes every point that has been Added.
  void Clear();

  // Appends a point. The point is copied, so pt need not stay in scope.
  // Points must be Added in sequence along the line.
  void Add(const ICOORD &pt);
  // Appends a point together with a half-width. If a point overlaps the
  // previous point by more than half the width, and its distance is further
  // than the previous point, then the more distant point is ignored in the
  // distance calculation. Useful for ignoring i dots and other diacritics.
  void Add(const ICOORD &pt, int halfwidth);

  // Fits a line to all the points. The fitted line is returned as the pair
  // of points *pt1, *pt2, and the return value is the upper quartile error.
  double Fit(ICOORD *pt1, ICOORD *pt2) {
    return Fit(0, 0, pt1, pt2);
  }
  // As above, but ignores the skip_first initial points and the skip_last
  // final points. Returns the fitted line as a pair of points, and the upper
  // quartile error.
  double Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2);

  // Constrained fit with a supplied direction vector. Finds the best line_pt,
  // that is one of the supplied points having the median cross product with
  // direction, ignoring points that have a cross product outside of the range
  // [min_dist, max_dist]. Returns the resulting error metric using the same
  // reduced set of points.
  // *Makes use of floating point arithmetic*
  double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug,
                        ICOORD *line_pt);

  // Returns true if there were enough points at the last call to Fit or
  // ConstrainedFit for the fitted points to be used on a badly fitted line.
  bool SufficientPointsForIndependentFit() const;

  // Backwards compatible fit returning a gradient (*m) and constant (*c).
  // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
  // function in preference to the LMS class.
  double Fit(float *m, float *c);

  // Backwards compatible constrained fit with a supplied gradient m,
  // returning the constant in *c.
  // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
  // to avoid potential difficulties with infinite gradients.
  double ConstrainedFit(double m, float *c);

private:
  // An ICOORD point plus a halfwidth, representing half the "width"
  // (supposedly approximately parallel to the direction of the line) of the
  // point, such that distant points can be discarded when they overlap
  // nearer points. (Think i dot and other diacritics or noise.)
  struct PointWidth {
    // The default is the origin with zero halfwidth.
    PointWidth() : PointWidth(ICOORD(0, 0), 0) {}
    PointWidth(const ICOORD &pt0, int halfwidth0) : pt(pt0), halfwidth(halfwidth0) {}

    ICOORD pt;
    int halfwidth;
  };
  // Pairs the distance of a point from the fitted line with the point
  // itself. Use of double allows integer distances from ICOORDs to be stored
  // exactly, and also the floating point results from ConstrainedFit.
  using DistPointPair = KDPairInc<double, ICOORD>;

  // Computes and returns the squared evaluation metric for a line fit.
  double EvaluateLineFit();

  // Converts the precomputed distances_ to absolute values, and returns the
  // squared upper-quartile error distance.
  double ComputeUpperQuartileError();

  // Returns the number of sample points whose error is more than threshold.
  int NumberOfMisfittedPoints(double threshold) const;

  // Computes all the cross product distances of the points from the line
  // through start and end, storing the actual (signed) cross products in
  // distances_.
  // Ignores distances of points that are further away than the previous
  // point, and overlap the previous point by at least half.
  void ComputeDistances(const ICOORD &start, const ICOORD &end);

  // Computes all the cross product distances of the points perpendicular to
  // the given direction, ignoring distances outside of the given distance
  // range, storing the actual (signed) cross products in distances_.
  void ComputeConstrainedDistances(const FCOORD &direction, double min_dist, double max_dist);

  // All the source points, in the order they were given, with their
  // halfwidths, if any.
  std::vector<PointWidth> pts_;
  // The computed perpendicular distances of (some of) the pts_ from a
  // given vector (assuming it goes through the origin, making it a line).
  // Since the distances may be a subset of the input points, and get
  // re-ordered when the quartile distance is selected, the original point
  // is stored along side the distance.
  std::vector<DistPointPair> distances_; // Distances of points.
  // The squared length of the vector used to compute distances_.
  double square_length_;
};
} // namespace tesseract.
#endif // TESSERACT_CCSTRUCT_DETLINEFIT_H_

View File

@ -0,0 +1,99 @@
/**********************************************************************
* File: dppoint.cpp
* Description: Simple generic dynamic programming class.
* Author: Ray Smith
* Created: Wed Mar 25 19:08:01 PDT 2009
*
* (C) Copyright 2009, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "dppoint.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Runs the dynamic program over the array `points` of length `size`, using
// cost_func to evaluate each candidate predecessor.
// Steps backwards are limited to being between min_step and max_step
// inclusive.
// Returns the tail of the best path, chosen as the cheapest of the last
// min_step points, or nullptr for degenerate arguments (size <= 0,
// max_step < min_step, or min_step >= size).
DPPoint *DPPoint::Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,
                        DPPoint *points) {
  const bool degenerate = size <= 0 || max_step < min_step || min_step >= size;
  if (degenerate) {
    return nullptr; // Degenerate, but not necessarily an error.
  }
  ASSERT_HOST(min_step > 0); // Infinite loop possible if this is not true.
  if (debug) {
    tprintf("min = %d, max=%d\n", min_step, max_step);
  }
  // Evaluate the total cost at each point.
  for (int idx = 0; idx < size; ++idx) {
    DPPoint &current = points[idx];
    for (int step = min_step; step <= max_step; ++step) {
      // A step that reaches before the start of the array has no predecessor.
      DPPoint *predecessor = nullptr;
      if (step <= idx) {
        predecessor = points + idx - step;
      }
      const int64_t candidate_cost = (current.*cost_func)(predecessor);
      const bool past_first_minimum = current.best_prev_ != nullptr && step > min_step * 2 &&
                                      candidate_cost > current.total_cost_;
      if (past_first_minimum) {
        break; // Find only the first minimum if going over twice the min.
      }
    }
    // The cost functions exclude the local cost, so add it in now.
    current.total_cost_ += current.local_cost_;
    if (debug) {
      tprintf("At point %d, local cost=%d, total_cost=%d, steps=%d\n", idx, current.local_cost_,
              current.total_cost_, current.total_steps_);
    }
  }
  // Now find the end of the best path and return it.
  DPPoint *best = points + (size - 1);
  for (int end = size - 2; end >= size - min_step; --end) {
    if (points[end].total_cost_ < best->total_cost_) {
      best = points + end;
    }
  }
  return best;
}
// A CostFunc that takes the variance of step into account in the cost.
// With a null prev (or prev == this) the point is offered a fresh path of
// cost 0 and 0 is returned. Otherwise the step from prev to this is added to
// prev's running sums, the cost is computed as
//   (sig_xsq - sig_x * sig_x / n) / n + prev->total_cost_
// and offered to UpdateIfBetter. The computed cost is returned whether or
// not it was accepted.
int64_t DPPoint::CostWithVariance(const DPPoint *prev) {
  if (prev == nullptr || prev == this) {
    UpdateIfBetter(0, 1, nullptr, 0, 0, 0);
    return 0;
  }
  // Step size: the distance in array elements between prev and this.
  const int delta = this - prev;
  const int32_t n = prev->n_ + 1;
  const int32_t sig_x = prev->sig_x_ + delta;
  // Widen to 64 bits BEFORE multiplying. The products delta * delta and
  // sig_x * sig_x would otherwise be evaluated in 32-bit int and overflow
  // (undefined behaviour) once the operand exceeds about 46340.
  const int64_t sig_xsq = prev->sig_xsq_ + static_cast<int64_t>(delta) * delta;
  const int64_t sig_x_squared = static_cast<int64_t>(sig_x) * sig_x;
  int64_t cost = (sig_xsq - sig_x_squared / n) / n;
  cost += prev->total_cost_;
  UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq);
  return cost;
}
// Records a candidate path into this point, but only when its cost is
// strictly lower than the current total_cost_. On acceptance the total cost,
// step count, predecessor and the variance sums are all overwritten.
void DPPoint::UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n,
                             int32_t sig_x, int64_t sig_xsq) {
  if (cost >= total_cost_) {
    return; // Not an improvement: keep the existing path.
  }
  total_cost_ = static_cast<int32_t>(cost);
  total_steps_ = steps;
  best_prev_ = prev;
  n_ = n;
  sig_x_ = sig_x;
  sig_xsq_ = sig_xsq;
}
} // namespace tesseract.

View File

@ -0,0 +1,105 @@
/**********************************************************************
* File: dppoint.h
* Description: Simple generic dynamic programming class.
* Author: Ray Smith
* Created: Wed Mar 25 18:57:01 PDT 2009
*
* (C) Copyright 2009, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCSTRUCT_DPPOINT_H_
#define TESSERACT_CCSTRUCT_DPPOINT_H_
#include <cstdint>
namespace tesseract {
// A simple class to provide a dynamic programming solution to a class of
// 1st-order problems in which the cost is dependent only on the current
// step and the best cost to that step, with a possible special case
// of using the variance of the steps, and only the top choice is required.
// Useful for problems such as finding the optimal cut points in a fixed-pitch
// (vertical or horizontal) situation.
// Skeletal Example:
// DPPoint* array = new DPPoint[width];
// for (int i = 0; i < width; i++) {
// array[i].AddLocalCost(cost_at_i)
// }
// DPPoint* best_end = DPPoint::Solve(..., array);
// while (best_end != nullptr) {
// int cut_index = best_end - array;
// best_end = best_end->best_prev();
// }
// delete [] array;
class DPPoint {
public:
  // The cost function evaluates the total cost at this (excluding this's
  // local_cost) and if it beats this's total_cost, then
  // replace the appropriate values in this.
  using CostFunc = int64_t (DPPoint::*)(const DPPoint *);

  // A fresh point has zero local cost, a total cost of INT32_MAX, a path
  // length of 1 and no predecessor (see the member initializers below).
  DPPoint() = default;

  // Solve the dynamic programming problem for the given array of points, with
  // the given size and cost function.
  // Steps backwards are limited to being between min_step and max_step
  // inclusive.
  // The return value is the tail of the best path.
  static DPPoint *Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,
                        DPPoint *points);

  // A CostFunc that takes the variance of step into account in the cost.
  int64_t CostWithVariance(const DPPoint *prev);

  // Accessors.
  int total_cost() const {
    return total_cost_;
  }
  int Pathlength() const {
    return total_steps_;
  }
  const DPPoint *best_prev() const {
    return best_prev_;
  }
  // Accumulates new_cost into this point's own (local) cost.
  void AddLocalCost(int new_cost) {
    local_cost_ += new_cost;
  }

private:
  // Code common to different cost functions.
  // Update the other members if the cost is lower.
  void UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n, int32_t sig_x,
                      int64_t sig_xsq);

  int32_t local_cost_ = 0;         // Cost of this point on its own.
  int32_t total_cost_ = INT32_MAX; // Sum of all costs in best path to here.
                                   // During cost calculations local_cost is excluded.
  int32_t total_steps_ = 1;        // Number of steps in best path to here.
  const DPPoint *best_prev_ = nullptr; // Pointer to prev point in best path from here.
  // Information for computing the variance part of the cost.
  int32_t n_ = 0;       // Number of steps in best path to here for variance.
  int32_t sig_x_ = 0;   // Sum of step sizes for computing variance.
  int64_t sig_xsq_ = 0; // Sum of squares of steps for computing variance.
};
} // namespace tesseract.
#endif // TESSERACT_CCSTRUCT_DPPOINT_H_

Some files were not shown because too many files have changed in this diff Show More