feat: 集成Tesseract源码到项目中
Description: 由于仓库中的Tesseract不是最新版本导致产生了一个bug,因此将Tesseract源码集成到项目中 Log: no Change-Id: I088de95d6c6ab670406daa8d47ed2ed46929c2c0
This commit is contained in:
parent
40c90fc3c7
commit
0cfed22ed4
848
3rdparty/tesseract_ocr/tesseract/include/tesseract/baseapi.h
vendored
Normal file
848
3rdparty/tesseract_ocr/tesseract/include/tesseract/baseapi.h
vendored
Normal file
|
@ -0,0 +1,848 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: baseapi.h
|
||||
// Description: Simple API for calling tesseract.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_API_BASEAPI_H_
|
||||
#define TESSERACT_API_BASEAPI_H_
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h" // DISABLED_LEGACY_ENGINE
|
||||
#endif
|
||||
|
||||
#include "export.h"
|
||||
#include "pageiterator.h"
|
||||
#include "publictypes.h"
|
||||
#include "resultiterator.h"
|
||||
#include "unichar.h"
|
||||
|
||||
#include "3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <tuple> // for std::tuple
|
||||
#include <vector> // for std::vector
|
||||
|
||||
struct Pix;
|
||||
struct Pixa;
|
||||
struct Boxa;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class PAGE_RES;
|
||||
class ParagraphModel;
|
||||
class BLOCK_LIST;
|
||||
class ETEXT_DESC;
|
||||
struct OSResults;
|
||||
class UNICHARSET;
|
||||
|
||||
class Dawg;
|
||||
class Dict;
|
||||
class EquationDetect;
|
||||
class PageIterator;
|
||||
class ImageThresholder;
|
||||
class LTRResultIterator;
|
||||
class ResultIterator;
|
||||
class MutableIterator;
|
||||
class TessResultRenderer;
|
||||
class Tesseract;
|
||||
|
||||
// Function to read a std::vector<char> from a whole file.
|
||||
// Returns false on failure.
|
||||
using FileReader = bool (*)(const char *filename, std::vector<char> *data);
|
||||
|
||||
// Pointer to a const member function of Dict with the signature
// int (void *, const UNICHARSET &, UNICHAR_ID, bool). This is the callback
// type accepted by TessBaseAPI::SetDictFunc.
using DictFunc =
    int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, bool) const;
|
||||
// Pointer to a (non-const) member function of Dict with the signature
// double (const char *, const char *, int, const char *, int). This is the
// callback type accepted by TessBaseAPI::SetProbabilityInContextFunc.
using ProbabilityInContextFunc =
    double (Dict::*)(const char *, const char *, int, const char *, int);
|
||||
|
||||
/**
|
||||
* Base class for all tesseract APIs.
|
||||
* Specific classes can add ability to work on different inputs or produce
|
||||
* different outputs.
|
||||
* This class is mostly an interface layer on top of the Tesseract instance
|
||||
* class to hide the data types so that users of this class don't have to
|
||||
* include any other Tesseract headers.
|
||||
*/
|
||||
class TESS_API TessBaseAPI {
|
||||
public:
|
||||
TessBaseAPI();
|
||||
virtual ~TessBaseAPI();
|
||||
// Copy constructor and assignment operator are currently unsupported.
|
||||
TessBaseAPI(TessBaseAPI const &) = delete;
|
||||
TessBaseAPI &operator=(TessBaseAPI const &) = delete;
|
||||
|
||||
/**
|
||||
* Returns the version identifier as a static string. Do not delete.
|
||||
*/
|
||||
static const char *Version();
|
||||
|
||||
/**
|
||||
* If compiled with OpenCL AND an available OpenCL
|
||||
* device is deemed faster than serial code, then
|
||||
* "device" is populated with the cl_device_id
|
||||
* and returns sizeof(cl_device_id)
|
||||
* otherwise *device=nullptr and returns 0.
|
||||
*/
|
||||
static size_t getOpenCLDevice(void **device);
|
||||
|
||||
/**
|
||||
* Set the name of the input file. Needed for training and
|
||||
* reading a UNLV zone file, and for searchable PDF output.
|
||||
*/
|
||||
void SetInputName(const char *name);
|
||||
/**
|
||||
* These functions are required for searchable PDF output.
|
||||
* We need our hands on the input file so that we can include
|
||||
* it in the PDF without transcoding. If that is not possible,
|
||||
* we need the original image. Finally, resolution metadata
|
||||
* is stored in the PDF so we need that as well.
|
||||
*/
|
||||
const char *GetInputName();
|
||||
// Takes ownership of the input pix.
|
||||
void SetInputImage(Pix *pix);
|
||||
Pix *GetInputImage();
|
||||
int GetSourceYResolution();
|
||||
const char *GetDatapath();
|
||||
|
||||
/** Set the name of the bonus output files. Needed only for debugging. */
|
||||
void SetOutputName(const char *name);
|
||||
|
||||
/**
|
||||
* Set the value of an internal "parameter."
|
||||
* Supply the name of the parameter and the value as a string, just as
|
||||
* you would in a config file.
|
||||
* Returns false if the name lookup failed.
|
||||
* Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
|
||||
* Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
|
||||
* SetVariable may be used before Init, but settings will revert to
|
||||
* defaults on End().
|
||||
*
|
||||
* Note: Must be called after Init(). Only works for non-init variables
|
||||
* (init variables should be passed to Init()).
|
||||
*/
|
||||
bool SetVariable(const char *name, const char *value);
|
||||
bool SetDebugVariable(const char *name, const char *value);
|
||||
|
||||
/**
|
||||
* Returns true if the parameter was found among Tesseract parameters.
|
||||
* Fills in value with the value of the parameter.
|
||||
*/
|
||||
bool GetIntVariable(const char *name, int *value) const;
|
||||
bool GetBoolVariable(const char *name, bool *value) const;
|
||||
bool GetDoubleVariable(const char *name, double *value) const;
|
||||
|
||||
/**
|
||||
* Returns the pointer to the string that represents the value of the
|
||||
* parameter if it was found among Tesseract parameters.
|
||||
*/
|
||||
const char *GetStringVariable(const char *name) const;
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/**
|
||||
* Print Tesseract fonts table to the given file.
|
||||
*/
|
||||
void PrintFontsTable(FILE* fp) const;
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Print Tesseract parameters to the given file.
|
||||
*/
|
||||
void PrintVariables(FILE *fp) const;
|
||||
|
||||
/**
|
||||
* Get value of named variable as a string, if it exists.
|
||||
*/
|
||||
bool GetVariableAsString(const char *name, std::string *val) const;
|
||||
|
||||
/**
|
||||
* Instances are now mostly thread-safe and totally independent,
|
||||
* but some global parameters remain. Basically it is safe to use multiple
|
||||
* TessBaseAPIs in different threads in parallel, UNLESS:
|
||||
* you use SetVariable on some of the Params in classify and textord.
|
||||
* If you do, then the effect will be to change it for all your instances.
|
||||
*
|
||||
* Start tesseract. Returns zero on success and -1 on failure.
|
||||
* NOTE that the only members that may be called before Init are those
|
||||
* listed above here in the class definition.
|
||||
*
|
||||
* The datapath must be the name of the tessdata directory.
|
||||
* The language is (usually) an ISO 639-3 string or nullptr will default to
|
||||
* eng. It is entirely safe (and eventually will be efficient too) to call
|
||||
* Init multiple times on the same instance to change language, or just
|
||||
* to reset the classifier.
|
||||
* The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
|
||||
* that multiple languages are to be loaded. Eg hin+eng will load Hindi and
|
||||
* English. Languages may specify internally that they want to be loaded
|
||||
* with one or more other languages, so the ~ sign is available to override
|
||||
* that. Eg if hin were set to load eng by default, then hin+~eng would force
|
||||
* loading only hin. The number of loaded languages is limited only by
|
||||
* memory, with the caveat that loading additional languages will impact
|
||||
* both speed and accuracy, as there is more work to do to decide on the
|
||||
* applicable language, and there is more chance of hallucinating incorrect
|
||||
* words.
|
||||
* WARNING: On changing languages, all Tesseract parameters are reset
|
||||
* back to their default values. (Which may vary between languages.)
|
||||
* If you have a rare need to set a Variable that controls
|
||||
* initialization for a second call to Init you should explicitly
|
||||
* call End() and then use SetVariable before Init. This is only a very
|
||||
* rare use case, since there are very few uses that require any parameters
|
||||
* to be set before Init.
|
||||
*
|
||||
* If set_only_non_debug_params is true, only params that do not contain
|
||||
* "debug" in the name will be set.
|
||||
*/
|
||||
int Init(const char *datapath, const char *language, OcrEngineMode mode,
|
||||
char **configs, int configs_size,
|
||||
const std::vector<std::string> *vars_vec,
|
||||
const std::vector<std::string> *vars_values,
|
||||
bool set_only_non_debug_params);
|
||||
int Init(const char *datapath, const char *language, OcrEngineMode oem) {
|
||||
return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
|
||||
}
|
||||
int Init(const char *datapath, const char *language) {
|
||||
return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
|
||||
false);
|
||||
}
|
||||
// In-memory version reads the traineddata file directly from the given
|
||||
// data[data_size] array, and/or reads data via a FileReader.
|
||||
int Init(const char *data, int data_size, const char *language,
|
||||
OcrEngineMode mode, char **configs, int configs_size,
|
||||
const std::vector<std::string> *vars_vec,
|
||||
const std::vector<std::string> *vars_values,
|
||||
bool set_only_non_debug_params, FileReader reader);
|
||||
|
||||
/**
|
||||
* Returns the languages string used in the last valid initialization.
|
||||
* If the last initialization specified "deu+hin" then that will be
|
||||
* returned. If hin loaded eng automatically as well, then that will
|
||||
* not be included in this list. To find the languages actually
|
||||
* loaded use GetLoadedLanguagesAsVector.
|
||||
* The returned string should NOT be deleted.
|
||||
*/
|
||||
const char *GetInitLanguagesAsString() const;
|
||||
|
||||
/**
|
||||
* Returns the loaded languages in the vector of std::string.
|
||||
* Includes all languages loaded by the last Init, including those loaded
|
||||
* as dependencies of other loaded languages.
|
||||
*/
|
||||
void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;
|
||||
|
||||
/**
|
||||
* Returns the available languages in the sorted vector of std::string.
|
||||
*/
|
||||
void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;
|
||||
|
||||
/**
|
||||
* Init only the lang model component of Tesseract. The only functions
|
||||
* that work after this init are SetVariable and IsValidWord.
|
||||
* WARNING: temporary! This function will be removed from here and placed
|
||||
* in a separate API at some future time.
|
||||
*/
|
||||
int InitLangMod(const char *datapath, const char *language);
|
||||
|
||||
/**
|
||||
* Init only for page layout analysis. Use only for calls to SetImage and
|
||||
* AnalysePage. Calls that attempt recognition will generate an error.
|
||||
*/
|
||||
void InitForAnalysePage();
|
||||
|
||||
/**
|
||||
* Read a "config" file containing a set of param, value pairs.
|
||||
* Searches the standard places: tessdata/configs, tessdata/tessconfigs
|
||||
* and also accepts a relative or absolute path name.
|
||||
* Note: only non-init params will be set (init params are set by Init()).
|
||||
*/
|
||||
void ReadConfigFile(const char *filename);
|
||||
/** Same as above, but only set debug params from the given config file. */
|
||||
void ReadDebugConfigFile(const char *filename);
|
||||
|
||||
/**
|
||||
* Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
|
||||
* The mode is stored as an IntParam so it can also be modified by
|
||||
* ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
|
||||
*/
|
||||
void SetPageSegMode(PageSegMode mode);
|
||||
|
||||
/** Return the current page segmentation mode. */
|
||||
PageSegMode GetPageSegMode() const;
|
||||
|
||||
/**
|
||||
* Recognize a rectangle from an image and return the result as a string.
|
||||
* May be called many times for a single Init.
|
||||
* Currently has no error checking.
|
||||
* Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
|
||||
* Palette color images will not work properly and must be converted to
|
||||
* 24 bit.
|
||||
* Binary images of 1 bit per pixel may also be given but they must be
|
||||
* byte packed with the MSB of the first byte being the first pixel, and a
|
||||
* 1 represents WHITE. For binary images set bytes_per_pixel=0.
|
||||
* The recognized text is returned as a char* which is coded
|
||||
* as UTF8 and must be freed with the delete [] operator.
|
||||
*
|
||||
* Note that TesseractRect is the simplified convenience interface.
|
||||
* For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
|
||||
* and one or more of the Get*Text functions below.
|
||||
*/
|
||||
char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
|
||||
int bytes_per_line, int left, int top, int width,
|
||||
int height);
|
||||
|
||||
/**
|
||||
* Call between pages or documents etc to free up memory and forget
|
||||
* adaptive data.
|
||||
*/
|
||||
void ClearAdaptiveClassifier();
|
||||
|
||||
/**
|
||||
* @defgroup AdvancedAPI Advanced API
|
||||
* The following methods break TesseractRect into pieces, so you can
|
||||
* get hold of the thresholded image, get the text in different formats,
|
||||
* get bounding boxes, confidences etc.
|
||||
*/
|
||||
/* @{ */
|
||||
|
||||
/**
|
||||
* Provide an image for Tesseract to recognize. Format is as
|
||||
* TesseractRect above. Copies the image buffer and converts to Pix.
|
||||
* SetImage clears all recognition results, and sets the rectangle to the
|
||||
* full image, so it may be followed immediately by a GetUTF8Text, and it
|
||||
* will automatically perform recognition.
|
||||
*/
|
||||
void SetImage(const unsigned char *imagedata, int width, int height,
|
||||
int bytes_per_pixel, int bytes_per_line);
|
||||
|
||||
/**
|
||||
* Provide an image for Tesseract to recognize. As with SetImage above,
|
||||
* Tesseract takes its own copy of the image, so it need not persist until
|
||||
* after Recognize.
|
||||
* Pix vs raw, which to use?
|
||||
* Use Pix where possible. Tesseract uses Pix as its internal representation
|
||||
* and it is therefore more efficient to provide a Pix directly.
|
||||
*/
|
||||
void SetImage(Pix *pix);
|
||||
|
||||
/**
|
||||
* Set the resolution of the source image in pixels per inch so font size
|
||||
* information can be calculated in results. Call this after SetImage().
|
||||
*/
|
||||
void SetSourceResolution(int ppi);
|
||||
|
||||
/**
|
||||
* Restrict recognition to a sub-rectangle of the image. Call after SetImage.
|
||||
* Each SetRectangle clears the recognition results so multiple rectangles
|
||||
* can be recognized with the same image.
|
||||
*/
|
||||
void SetRectangle(int left, int top, int width, int height);
|
||||
|
||||
/**
|
||||
* Get a copy of the internal thresholded image from Tesseract.
|
||||
* Caller takes ownership of the Pix and must pixDestroy it.
|
||||
* May be called any time after SetImage, or after TesseractRect.
|
||||
*/
|
||||
Pix *GetThresholdedImage();
|
||||
|
||||
/**
|
||||
* Get the result of page layout analysis as a leptonica-style
|
||||
* Boxa, Pixa pair, in reading order.
|
||||
* Can be called before or after Recognize.
|
||||
*/
|
||||
Boxa *GetRegions(Pixa **pixa);
|
||||
|
||||
/**
|
||||
* Get the textlines as a leptonica-style
|
||||
* Boxa, Pixa pair, in reading order.
|
||||
* Can be called before or after Recognize.
|
||||
* If raw_image is true, then extract from the original image instead of the
|
||||
* thresholded image and pad by raw_padding pixels.
|
||||
* If blockids is not nullptr, the block-id of each line is also returned as
|
||||
* an array of one element per line. delete [] after use. If paraids is not
|
||||
* nullptr, the paragraph-id of each line within its block is also returned as
|
||||
* an array of one element per line. delete [] after use.
|
||||
*/
|
||||
Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
|
||||
int **blockids, int **paraids);
|
||||
/*
|
||||
Helper method to extract from the thresholded image. (most common usage)
|
||||
*/
|
||||
Boxa *GetTextlines(Pixa **pixa, int **blockids) {
|
||||
return GetTextlines(false, 0, pixa, blockids, nullptr);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
|
||||
* pair, in reading order. Enables downstream handling of non-rectangular
|
||||
* regions.
|
||||
* Can be called before or after Recognize.
|
||||
* If blockids is not nullptr, the block-id of each line is also returned as
|
||||
* an array of one element per line. delete [] after use.
|
||||
*/
|
||||
Boxa *GetStrips(Pixa **pixa, int **blockids);
|
||||
|
||||
/**
|
||||
* Get the words as a leptonica-style
|
||||
* Boxa, Pixa pair, in reading order.
|
||||
* Can be called before or after Recognize.
|
||||
*/
|
||||
Boxa *GetWords(Pixa **pixa);
|
||||
|
||||
/**
|
||||
* Gets the individual connected (text) components (created
|
||||
* after the page segmentation step, but before recognition)
|
||||
* as a leptonica-style Boxa, Pixa pair, in reading order.
|
||||
* Can be called before or after Recognize.
|
||||
* Note: the caller is responsible for calling boxaDestroy()
|
||||
* on the returned Boxa array and pixaDestroy() on cc array.
|
||||
*/
|
||||
Boxa *GetConnectedComponents(Pixa **cc);
|
||||
|
||||
/**
|
||||
* Get the given level kind of components (block, textline, word etc.) as a
|
||||
* leptonica-style Boxa, Pixa pair, in reading order.
|
||||
* Can be called before or after Recognize.
|
||||
* If blockids is not nullptr, the block-id of each component is also returned
|
||||
* as an array of one element per component. delete [] after use.
|
||||
* If paraids is not nullptr, the paragraph-id of each component within its
|
||||
* block is also returned as an array of one element per component. delete []
|
||||
* after use. If raw_image is true, then portions of the original image are
|
||||
* extracted instead of the thresholded image and padded with raw_padding. If
|
||||
* text_only is true, then only text components are returned.
|
||||
*/
|
||||
Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
|
||||
bool raw_image, int raw_padding, Pixa **pixa,
|
||||
int **blockids, int **paraids);
|
||||
// Helper function to get binary images with no padding (most common usage).
|
||||
Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
|
||||
Pixa **pixa, int **blockids) {
|
||||
return GetComponentImages(level, text_only, false, 0, pixa, blockids,
|
||||
nullptr);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the scale factor of the thresholded image that would be returned by
|
||||
* GetThresholdedImage() and the various GetX() methods that call
|
||||
* GetComponentImages().
|
||||
* Returns 0 if no thresholder has been set.
|
||||
*/
|
||||
int GetThresholdedImageScaleFactor() const;
|
||||
|
||||
/**
|
||||
* Runs page layout analysis in the mode set by SetPageSegMode.
|
||||
* May optionally be called prior to Recognize to get access to just
|
||||
* the page layout results. Returns an iterator to the results.
|
||||
* If merge_similar_words is true, words are combined where suitable for use
|
||||
* with a line recognizer. Use if you want to use AnalyseLayout to find the
|
||||
* textlines, and then want to process textline fragments with an external
|
||||
* line recognizer.
|
||||
* Returns nullptr on error or an empty page.
|
||||
* The returned iterator must be deleted after use.
|
||||
* WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
* therefore can only be used while the TessBaseAPI class still exists and
|
||||
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
|
||||
* DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
*/
|
||||
PageIterator *AnalyseLayout();
|
||||
PageIterator *AnalyseLayout(bool merge_similar_words);
|
||||
|
||||
/**
|
||||
* Recognize the image from SetAndThresholdImage, generating Tesseract
|
||||
* internal structures. Returns 0 on success.
|
||||
* Optional. The Get*Text functions below will call Recognize if needed.
|
||||
* After Recognize, the output is kept internally until the next SetImage.
|
||||
*/
|
||||
int Recognize(ETEXT_DESC *monitor);
|
||||
|
||||
/**
|
||||
* Methods to retrieve information after SetAndThresholdImage(),
|
||||
* Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
|
||||
*/
|
||||
|
||||
/**
|
||||
* Turns images into symbolic text.
|
||||
*
|
||||
* filename can point to a single image, a multi-page TIFF,
|
||||
* or a plain text list of image filenames.
|
||||
*
|
||||
* retry_config is useful for debugging. If not nullptr, you can fall
|
||||
* back to an alternate configuration if a page fails for some
|
||||
* reason.
|
||||
*
|
||||
* timeout_millisec terminates processing if any single page
|
||||
* takes too long. Set to 0 for unlimited time.
|
||||
*
|
||||
* renderer is responsible for creating the output. For example,
|
||||
* use the TessTextRenderer if you want plaintext output, or
|
||||
* the TessPDFRenderer to produce searchable PDF.
|
||||
*
|
||||
* If tessedit_page_number is non-negative, will only process that
|
||||
* single page. Works for multi-page tiff file, or filelist.
|
||||
*
|
||||
* Returns true if successful, false on error.
|
||||
*/
|
||||
bool ProcessPages(const char *filename, const char *retry_config,
|
||||
int timeout_millisec, TessResultRenderer *renderer);
|
||||
// Does the real work of ProcessPages.
|
||||
bool ProcessPagesInternal(const char *filename, const char *retry_config,
|
||||
int timeout_millisec, TessResultRenderer *renderer);
|
||||
|
||||
/**
|
||||
* Turn a single image into symbolic text.
|
||||
*
|
||||
* The pix is the image processed. filename and page_index are
|
||||
* metadata used by side-effect processes, such as reading a box
|
||||
* file or formatting as hOCR.
|
||||
*
|
||||
* See ProcessPages for descriptions of other parameters.
|
||||
*/
|
||||
bool ProcessPage(Pix *pix, int page_index, const char *filename,
|
||||
const char *retry_config, int timeout_millisec,
|
||||
TessResultRenderer *renderer);
|
||||
|
||||
/**
|
||||
* Get a reading-order iterator to the results of LayoutAnalysis and/or
|
||||
* Recognize. The returned iterator must be deleted after use.
|
||||
* WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
* therefore can only be used while the TessBaseAPI class still exists and
|
||||
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
|
||||
* DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
*/
|
||||
ResultIterator *GetIterator();
|
||||
|
||||
/**
|
||||
* Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
|
||||
* The returned iterator must be deleted after use.
|
||||
* WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
* therefore can only be used while the TessBaseAPI class still exists and
|
||||
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
|
||||
* DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
*/
|
||||
MutableIterator *GetMutableIterator();
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded
|
||||
* as UTF8 and must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetUTF8Text();
|
||||
|
||||
size_t GetNumberOfTables() const;
|
||||
|
||||
/// Return the i-th table bounding box coordinates
|
||||
///
|
||||
/// Gives the (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
|
||||
/// coordinates of the i-th table.
|
||||
std::tuple<int, int, int, int> GetTableBoundingBox(
|
||||
unsigned
|
||||
i ///< Index of the table, for upper limit \see GetNumberOfTables()
|
||||
);
|
||||
|
||||
/// Get bounding boxes of the rows of a table
|
||||
/// return values are (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
|
||||
std::vector<std::tuple<int, int, int, int> > GetTableRows(
|
||||
unsigned
|
||||
i ///< Index of the table, for upper limit \see GetNumberOfTables()
|
||||
);
|
||||
|
||||
/// Get bounding boxes of the cols of a table
|
||||
/// return values are (top_left.x, top_left.y, bottom_right.x, bottom_right.y)
|
||||
std::vector<std::tuple<int, int, int, int> > GetTableCols(
|
||||
unsigned
|
||||
i ///< Index of the table, for upper limit \see GetNumberOfTables()
|
||||
);
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* monitor can be used to
|
||||
* cancel the recognition
|
||||
* receive progress callbacks
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetHOCRText(int page_number);
|
||||
|
||||
/**
|
||||
* Make an XML-formatted string with Alto markup from the internal
|
||||
* data structures.
|
||||
*/
|
||||
char *GetAltoText(ETEXT_DESC *monitor, int page_number);
|
||||
|
||||
/**
|
||||
* Make an XML-formatted string with Alto markup from the internal
|
||||
* data structures.
|
||||
*/
|
||||
char *GetAltoText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetTSVText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a box file for LSTM training from the internal data structures.
|
||||
* Constructs coordinates in the original image - not just the rectangle.
|
||||
* page_number is a 0-based page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetLSTMBoxText(int page_number);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded in the same
|
||||
* format as a box file used in training.
|
||||
* Constructs coordinates in the original image - not just the rectangle.
|
||||
* page_number is a 0-based page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetBoxText(int page_number);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded in the same
|
||||
* format as a WordStr box file used in training.
|
||||
* page_number is a 0-based page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetWordStrBoxText(int page_number);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded
|
||||
* as UNLV format Latin-1 with specific reject and suspect codes.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetUNLVText();
|
||||
|
||||
/**
|
||||
* Detect the orientation of the input image and apparent script (alphabet).
|
||||
* orient_deg is the detected clockwise rotation of the input image in degrees
|
||||
* (0, 90, 180, 270)
|
||||
* orient_conf is the confidence (15.0 is reasonably confident)
|
||||
* script_name is an ASCII string, the name of the script, e.g. "Latin"
|
||||
* script_conf is confidence level in the script
|
||||
* Returns true on success and writes values to each parameter as an output
|
||||
*/
|
||||
bool DetectOrientationScript(int *orient_deg, float *orient_conf,
|
||||
const char **script_name, float *script_conf);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded
|
||||
* as UTF8 and must be freed with the delete [] operator.
|
||||
* page_number is a 0-based page index that will appear in the osd file.
|
||||
*/
|
||||
char *GetOsdText(int page_number);
|
||||
|
||||
/** Returns the (average) confidence value between 0 and 100. */
|
||||
int MeanTextConf();
|
||||
/**
|
||||
* Returns all word confidences (between 0 and 100) in an array, terminated
|
||||
* by -1. The calling function must delete [] after use.
|
||||
* The number of confidences should correspond to the number of space-
|
||||
* delimited words in GetUTF8Text.
|
||||
*/
|
||||
int *AllWordConfidences();
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
/**
|
||||
* Applies the given word to the adaptive classifier if possible.
|
||||
* The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
|
||||
* tell the boundaries of the graphemes.
|
||||
* Assumes that SetImage/SetRectangle have been used to set the image
|
||||
* to the given word. The mode arg should be PSM_SINGLE_WORD or
|
||||
* PSM_CIRCLE_WORD, as that will be used to control layout analysis.
|
||||
* The currently set PageSegMode is preserved.
|
||||
* Returns false if adaption was not possible for some reason.
|
||||
*/
|
||||
bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/**
|
||||
* Free up recognition results and any stored image data, without actually
|
||||
* freeing any recognition data that would be time-consuming to reload.
|
||||
* Afterwards, you must call SetImage or TesseractRect before doing
|
||||
* any Recognize or Get* operation.
|
||||
*/
|
||||
void Clear();
|
||||
|
||||
/**
|
||||
* Close down tesseract and free up all memory. End() is equivalent to
|
||||
* destructing and reconstructing your TessBaseAPI.
|
||||
* Once End() has been used, none of the other API functions may be used
|
||||
* other than Init and anything declared above it in the class definition.
|
||||
*/
|
||||
void End();
|
||||
|
||||
/**
|
||||
* Clear any library-level memory caches.
|
||||
* There are a variety of expensive-to-load constant data structures (mostly
|
||||
* language dictionaries) that are cached globally -- surviving the Init()
|
||||
* and End() of individual TessBaseAPI's. This function allows the clearing
|
||||
* of these caches.
|
||||
**/
|
||||
static void ClearPersistentCache();
|
||||
|
||||
/**
|
||||
* Check whether a word is valid according to Tesseract's language model
|
||||
* @return 0 if the word is invalid, non-zero if valid.
|
||||
* @warning temporary! This function will be removed from here and placed
|
||||
* in a separate API at some future time.
|
||||
*/
|
||||
int IsValidWord(const char *word) const;
|
||||
// Returns true if utf8_character is defined in the UniCharset.
|
||||
bool IsValidCharacter(const char *utf8_character) const;
|
||||
|
||||
bool GetTextDirection(int *out_offset, float *out_slope);
|
||||
|
||||
/** Sets Dict::letter_is_okay_ function to point to the given function. */
|
||||
void SetDictFunc(DictFunc f);
|
||||
|
||||
/** Sets Dict::probability_in_context_ function to point to the given
|
||||
* function.
|
||||
*/
|
||||
void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
|
||||
|
||||
/**
|
||||
* Estimates the Orientation And Script of the image.
|
||||
* @return true if the image was processed successfully.
|
||||
*/
|
||||
bool DetectOS(OSResults *);
|
||||
|
||||
/**
|
||||
* Return text orientation of each block as determined by an earlier run
|
||||
* of layout analysis.
|
||||
*/
|
||||
void GetBlockTextOrientations(int **block_orientation,
|
||||
bool **vertical_writing);
|
||||
|
||||
/** This method returns the string form of the specified unichar. */
|
||||
const char *GetUnichar(int unichar_id) const;
|
||||
|
||||
/** Return the pointer to the i-th dawg loaded into tesseract_ object. */
|
||||
const Dawg *GetDawg(int i) const;
|
||||
|
||||
/** Return the number of dawgs loaded into tesseract_ object. */
|
||||
int NumDawgs() const;
|
||||
|
||||
/** Accessor: returns the tesseract_ member pointer as stored (no copy of the pointee is made). */
Tesseract *tesseract() const {
|
||||
return tesseract_;
|
||||
}
|
||||
|
||||
/** Accessor: returns last_oem_requested_, i.e. the most recently requested OcrEngineMode. */
OcrEngineMode oem() const {
|
||||
return last_oem_requested_;
|
||||
}
|
||||
|
||||
void set_min_orientation_margin(double margin);
|
||||
/* @} */
|
||||
|
||||
protected:
|
||||
/** Common code for setting the image. Returns true if Init has been called.
|
||||
*/
|
||||
bool InternalSetImage();
|
||||
|
||||
/**
|
||||
* Run the thresholder to make the thresholded image. If pix is not nullptr,
|
||||
* the source is thresholded to pix instead of the internal IMAGE.
|
||||
*/
|
||||
virtual bool Threshold(Pix **pix);
|
||||
|
||||
/**
|
||||
* Find lines from the image making the BLOCK_LIST.
|
||||
* @return 0 on success.
|
||||
*/
|
||||
int FindLines();
|
||||
|
||||
/** Delete the pageres and block list ready for a new page. */
|
||||
void ClearResults();
|
||||
|
||||
/**
|
||||
* Return an LTR Result Iterator -- used only for training, as we really want
|
||||
* to ignore all BiDi smarts at that point.
|
||||
* Delete the returned iterator once you're done with it.
|
||||
*/
|
||||
LTRResultIterator *GetLTRIterator();
|
||||
|
||||
/**
|
||||
* Return the length of the output text string, as UTF8, assuming
|
||||
* one newline per line and one per block, with a terminator,
|
||||
* and assuming a single character reject marker for each rejected character.
|
||||
* Also return the number of recognized blobs in blob_count.
|
||||
*/
|
||||
int TextLength(int *blob_count) const;
|
||||
|
||||
//// paragraphs.cpp ////////////////////////////////////////////////////
|
||||
void DetectParagraphs(bool after_text_recognition);
|
||||
|
||||
/** Read-only accessor: returns the page_res_ member as a pointer-to-const. */
const PAGE_RES *GetPageRes() const {
|
||||
return page_res_;
|
||||
}
|
||||
|
||||
protected:
|
||||
Tesseract *tesseract_; ///< The underlying data object.
|
||||
Tesseract *osd_tesseract_; ///< For orientation & script detection.
|
||||
EquationDetect *equ_detect_; ///< The equation detector.
|
||||
FileReader reader_; ///< Reads files from any filesystem.
|
||||
ImageThresholder *thresholder_; ///< Image thresholding module.
|
||||
std::vector<ParagraphModel *> *paragraph_models_;
|
||||
BLOCK_LIST *block_list_; ///< The page layout.
|
||||
PAGE_RES *page_res_; ///< The page-level data.
|
||||
std::string input_file_; ///< Name used by training code.
|
||||
std::string output_file_; ///< Name used by debug code.
|
||||
std::string datapath_; ///< Current location of tessdata.
|
||||
std::string language_; ///< Last initialized language.
|
||||
OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested.
|
||||
bool recognition_done_; ///< page_res_ contains recognition data.
|
||||
|
||||
/**
|
||||
* @defgroup ThresholderParams Thresholder Parameters
|
||||
* Parameters saved from the Thresholder. Needed to rebuild coordinates.
|
||||
*/
|
||||
/* @{ */
|
||||
int rect_left_;
|
||||
int rect_top_;
|
||||
int rect_width_;
|
||||
int rect_height_;
|
||||
int image_width_;
|
||||
int image_height_;
|
||||
/* @} */
|
||||
|
||||
private:
|
||||
// A list of image filenames gets special consideration
|
||||
bool ProcessPagesFileList(FILE *fp, std::string *buf,
|
||||
const char *retry_config, int timeout_millisec,
|
||||
TessResultRenderer *renderer,
|
||||
int tessedit_page_number);
|
||||
// TIFF supports multipage so gets special consideration.
|
||||
bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
|
||||
const char *filename, const char *retry_config,
|
||||
int timeout_millisec,
|
||||
TessResultRenderer *renderer,
|
||||
int tessedit_page_number);
|
||||
}; // class TessBaseAPI.
|
||||
|
||||
/** Escape a char string - replace &<>"' with HTML codes. */
|
||||
std::string HOcrEscape(const char *text);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_API_BASEAPI_H_
|
482
3rdparty/tesseract_ocr/tesseract/include/tesseract/capi.h
vendored
Normal file
482
3rdparty/tesseract_ocr/tesseract/include/tesseract/capi.h
vendored
Normal file
|
@ -0,0 +1,482 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: capi.h
|
||||
// Description: C-API TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef API_CAPI_H_
|
||||
#define API_CAPI_H_
|
||||
|
||||
#include "export.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
# include <tesseract/baseapi.h>
|
||||
# include <tesseract/ocrclass.h>
|
||||
# include <tesseract/pageiterator.h>
|
||||
# include <tesseract/renderer.h>
|
||||
# include <tesseract/resultiterator.h>
|
||||
#endif
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Fallback boolean type for the C API: BOOL is an int, with TRUE = 1 and
 * FALSE = 0. All three macros are guarded by BOOL alone, so TRUE and FALSE
 * are NOT defined here when another header has already defined BOOL. */
#ifndef BOOL
|
||||
# define BOOL int
|
||||
# define TRUE 1
|
||||
# define FALSE 0
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
/* C++ build: each C API type name is a plain alias of the corresponding
 * class or enum in namespace tesseract. */
typedef tesseract::TessResultRenderer TessResultRenderer;
|
||||
typedef tesseract::TessBaseAPI TessBaseAPI;
|
||||
typedef tesseract::PageIterator TessPageIterator;
|
||||
typedef tesseract::ResultIterator TessResultIterator;
|
||||
typedef tesseract::MutableIterator TessMutableIterator;
|
||||
typedef tesseract::ChoiceIterator TessChoiceIterator;
|
||||
typedef tesseract::OcrEngineMode TessOcrEngineMode;
|
||||
typedef tesseract::PageSegMode TessPageSegMode;
|
||||
typedef tesseract::PageIteratorLevel TessPageIteratorLevel;
|
||||
typedef tesseract::Orientation TessOrientation;
|
||||
typedef tesseract::ParagraphJustification TessParagraphJustification;
|
||||
typedef tesseract::WritingDirection TessWritingDirection;
|
||||
typedef tesseract::TextlineOrder TessTextlineOrder;
|
||||
typedef tesseract::PolyBlockType TessPolyBlockType;
|
||||
typedef tesseract::ETEXT_DESC ETEXT_DESC;
|
||||
#else
|
||||
/* C build: the class types are only forward-declared as structs. This header
 * declares no members for them, so C callers handle them through pointers. */
typedef struct TessResultRenderer TessResultRenderer;
|
||||
typedef struct TessBaseAPI TessBaseAPI;
|
||||
typedef struct TessPageIterator TessPageIterator;
|
||||
typedef struct TessResultIterator TessResultIterator;
|
||||
typedef struct TessMutableIterator TessMutableIterator;
|
||||
typedef struct TessChoiceIterator TessChoiceIterator;
|
||||
/* C-only declaration of the OCR engine mode (this is the non-__cplusplus
 * branch; in C++ the name aliases tesseract::OcrEngineMode instead).
 * Enumerators are implicitly numbered from 0.
 * NOTE(review): presumably the order must match tesseract::OcrEngineMode --
 * confirm against publictypes.h before reordering. */
typedef enum TessOcrEngineMode {
|
||||
OEM_TESSERACT_ONLY,
|
||||
OEM_LSTM_ONLY,
|
||||
OEM_TESSERACT_LSTM_COMBINED,
|
||||
OEM_DEFAULT
|
||||
} TessOcrEngineMode;
|
||||
/* C-only declaration of the page segmentation mode (in C++ the name aliases
 * tesseract::PageSegMode instead). Enumerators are implicitly numbered from
 * 0, so the trailing PSM_COUNT evaluates to 14, the number of modes listed
 * before it.
 * NOTE(review): presumably the order must match tesseract::PageSegMode --
 * confirm against publictypes.h before reordering. */
typedef enum TessPageSegMode {
|
||||
PSM_OSD_ONLY,
|
||||
PSM_AUTO_OSD,
|
||||
PSM_AUTO_ONLY,
|
||||
PSM_AUTO,
|
||||
PSM_SINGLE_COLUMN,
|
||||
PSM_SINGLE_BLOCK_VERT_TEXT,
|
||||
PSM_SINGLE_BLOCK,
|
||||
PSM_SINGLE_LINE,
|
||||
PSM_SINGLE_WORD,
|
||||
PSM_CIRCLE_WORD,
|
||||
PSM_SINGLE_CHAR,
|
||||
PSM_SPARSE_TEXT,
|
||||
PSM_SPARSE_TEXT_OSD,
|
||||
PSM_RAW_LINE,
|
||||
PSM_COUNT
|
||||
} TessPageSegMode;
|
||||
/* C-only declaration of the iterator level passed as the `level` argument of
 * the TessPageIterator / TessResultIterator functions in this header (in C++
 * the name aliases tesseract::PageIteratorLevel instead). Enumerators are
 * implicitly numbered from 0.
 * NOTE(review): presumably the order must match tesseract::PageIteratorLevel
 * -- confirm against publictypes.h before reordering. */
typedef enum TessPageIteratorLevel {
|
||||
RIL_BLOCK,
|
||||
RIL_PARA,
|
||||
RIL_TEXTLINE,
|
||||
RIL_WORD,
|
||||
RIL_SYMBOL
|
||||
} TessPageIteratorLevel;
|
||||
/* C-only declaration of the block type returned by
 * TessPageIteratorBlockType (in C++ the name aliases
 * tesseract::PolyBlockType instead). Enumerators are implicitly numbered from
 * 0, so the trailing PT_COUNT evaluates to 15, the number of types listed
 * before it.
 * NOTE(review): presumably the order must match tesseract::PolyBlockType --
 * confirm against publictypes.h before reordering. */
typedef enum TessPolyBlockType {
|
||||
PT_UNKNOWN,
|
||||
PT_FLOWING_TEXT,
|
||||
PT_HEADING_TEXT,
|
||||
PT_PULLOUT_TEXT,
|
||||
PT_EQUATION,
|
||||
PT_INLINE_EQUATION,
|
||||
PT_TABLE,
|
||||
PT_VERTICAL_TEXT,
|
||||
PT_CAPTION_TEXT,
|
||||
PT_FLOWING_IMAGE,
|
||||
PT_HEADING_IMAGE,
|
||||
PT_PULLOUT_IMAGE,
|
||||
PT_HORZ_LINE,
|
||||
PT_VERT_LINE,
|
||||
PT_NOISE,
|
||||
PT_COUNT
|
||||
} TessPolyBlockType;
|
||||
/* C-only declaration of the page orientation written through the
 * `orientation` out-parameter of TessPageIteratorOrientation (in C++ the name
 * aliases tesseract::Orientation instead). Enumerators are implicitly
 * numbered from 0.
 * NOTE(review): presumably the order must match tesseract::Orientation --
 * confirm against publictypes.h before reordering. */
typedef enum TessOrientation {
|
||||
ORIENTATION_PAGE_UP,
|
||||
ORIENTATION_PAGE_RIGHT,
|
||||
ORIENTATION_PAGE_DOWN,
|
||||
ORIENTATION_PAGE_LEFT
|
||||
} TessOrientation;
|
||||
/* C-only declaration of the paragraph justification written through the
 * `justification` out-parameter of TessPageIteratorParagraphInfo (in C++ the
 * name aliases tesseract::ParagraphJustification instead). Enumerators are
 * implicitly numbered from 0.
 * NOTE(review): presumably the order must match
 * tesseract::ParagraphJustification -- confirm against publictypes.h. */
typedef enum TessParagraphJustification {
|
||||
JUSTIFICATION_UNKNOWN,
|
||||
JUSTIFICATION_LEFT,
|
||||
JUSTIFICATION_CENTER,
|
||||
JUSTIFICATION_RIGHT
|
||||
} TessParagraphJustification;
|
||||
/* C-only declaration of the writing direction written through the
 * `writing_direction` out-parameter of TessPageIteratorOrientation (in C++
 * the name aliases tesseract::WritingDirection instead). Enumerators are
 * implicitly numbered from 0.
 * NOTE(review): presumably the order must match tesseract::WritingDirection
 * -- confirm against publictypes.h before reordering. */
typedef enum TessWritingDirection {
|
||||
WRITING_DIRECTION_LEFT_TO_RIGHT,
|
||||
WRITING_DIRECTION_RIGHT_TO_LEFT,
|
||||
WRITING_DIRECTION_TOP_TO_BOTTOM
|
||||
} TessWritingDirection;
|
||||
/* C-only declaration of the text-line order written through the
 * `textline_order` out-parameter of TessPageIteratorOrientation (in C++ the
 * name aliases tesseract::TextlineOrder instead). Enumerators are implicitly
 * numbered from 0.
 * NOTE(review): presumably the order must match tesseract::TextlineOrder --
 * confirm against publictypes.h before reordering. */
typedef enum TessTextlineOrder {
|
||||
TEXTLINE_ORDER_LEFT_TO_RIGHT,
|
||||
TEXTLINE_ORDER_RIGHT_TO_LEFT,
|
||||
TEXTLINE_ORDER_TOP_TO_BOTTOM
|
||||
} TessTextlineOrder;
|
||||
typedef struct ETEXT_DESC ETEXT_DESC;
|
||||
#endif
|
||||
|
||||
/* Callback signatures accepted by TessMonitorSetCancelFunc and
 * TessMonitorSetProgressFunc, which are declared later in this header.
 * Both return bool (available in C through the <stdbool.h> include above),
 * not the BOOL macro used by most other functions in this header. */
typedef bool (*TessCancelFunc)(void *cancel_this, int words);
|
||||
typedef bool (*TessProgressFunc)(ETEXT_DESC *ths, int left, int right, int top,
|
||||
int bottom);
|
||||
|
||||
struct Pix;
|
||||
struct Boxa;
|
||||
struct Pixa;
|
||||
|
||||
/* General free functions */
|
||||
|
||||
TESS_API const char *TessVersion();
|
||||
TESS_API void TessDeleteText(const char *text);
|
||||
TESS_API void TessDeleteTextArray(char **arr);
|
||||
TESS_API void TessDeleteIntArray(const int *arr);
|
||||
|
||||
/* Renderer API */
|
||||
TESS_API TessResultRenderer *TessTextRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessHOcrRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase,
|
||||
BOOL font_info);
|
||||
TESS_API TessResultRenderer *TessAltoRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessTsvRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessPDFRendererCreate(const char *outputbase,
|
||||
const char *datadir,
|
||||
BOOL textonly);
|
||||
TESS_API TessResultRenderer *TessUnlvRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase);
|
||||
TESS_API TessResultRenderer *TessWordStrBoxRendererCreate(
|
||||
const char *outputbase);
|
||||
|
||||
TESS_API void TessDeleteResultRenderer(TessResultRenderer *renderer);
|
||||
TESS_API void TessResultRendererInsert(TessResultRenderer *renderer,
|
||||
TessResultRenderer *next);
|
||||
TESS_API TessResultRenderer *TessResultRendererNext(
|
||||
TessResultRenderer *renderer);
|
||||
TESS_API BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer,
|
||||
const char *title);
|
||||
TESS_API BOOL TessResultRendererAddImage(TessResultRenderer *renderer,
|
||||
TessBaseAPI *api);
|
||||
TESS_API BOOL TessResultRendererEndDocument(TessResultRenderer *renderer);
|
||||
|
||||
TESS_API const char *TessResultRendererExtention(TessResultRenderer *renderer);
|
||||
TESS_API const char *TessResultRendererTitle(TessResultRenderer *renderer);
|
||||
TESS_API int TessResultRendererImageNum(TessResultRenderer *renderer);
|
||||
|
||||
/* Base API */
|
||||
|
||||
TESS_API TessBaseAPI *TessBaseAPICreate();
|
||||
TESS_API void TessBaseAPIDelete(TessBaseAPI *handle);
|
||||
|
||||
TESS_API size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI *handle, void **device);
|
||||
|
||||
TESS_API void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name);
|
||||
TESS_API const char *TessBaseAPIGetInputName(TessBaseAPI *handle);
|
||||
|
||||
TESS_API void TessBaseAPISetInputImage(TessBaseAPI *handle, struct Pix *pix);
|
||||
TESS_API struct Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle);
|
||||
|
||||
TESS_API int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle);
|
||||
TESS_API const char *TessBaseAPIGetDatapath(TessBaseAPI *handle);
|
||||
|
||||
TESS_API void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name);
|
||||
|
||||
TESS_API BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name,
|
||||
const char *value);
|
||||
TESS_API BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name,
|
||||
const char *value);
|
||||
|
||||
TESS_API BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle,
|
||||
const char *name, int *value);
|
||||
TESS_API BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle,
|
||||
const char *name, BOOL *value);
|
||||
TESS_API BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle,
|
||||
const char *name, double *value);
|
||||
TESS_API const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle,
|
||||
const char *name);
|
||||
|
||||
TESS_API void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp);
|
||||
TESS_API BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle,
|
||||
const char *filename);
|
||||
|
||||
TESS_API int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath,
|
||||
const char *language, TessOcrEngineMode oem,
|
||||
char **configs, int configs_size);
|
||||
TESS_API int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath,
|
||||
const char *language, TessOcrEngineMode oem);
|
||||
TESS_API int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath,
|
||||
const char *language);
|
||||
|
||||
TESS_API int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath,
|
||||
const char *language, TessOcrEngineMode mode,
|
||||
char **configs, int configs_size, char **vars_vec,
|
||||
char **vars_values, size_t vars_vec_size,
|
||||
BOOL set_only_non_debug_params);
|
||||
|
||||
TESS_API const char *TessBaseAPIGetInitLanguagesAsString(
|
||||
const TessBaseAPI *handle);
|
||||
TESS_API char **TessBaseAPIGetLoadedLanguagesAsVector(
|
||||
const TessBaseAPI *handle);
|
||||
TESS_API char **TessBaseAPIGetAvailableLanguagesAsVector(
|
||||
const TessBaseAPI *handle);
|
||||
|
||||
TESS_API int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath,
|
||||
const char *language);
|
||||
TESS_API void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle);
|
||||
|
||||
TESS_API void TessBaseAPIReadConfigFile(TessBaseAPI *handle,
|
||||
const char *filename);
|
||||
TESS_API void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle,
|
||||
const char *filename);
|
||||
|
||||
TESS_API void TessBaseAPISetPageSegMode(TessBaseAPI *handle,
|
||||
TessPageSegMode mode);
|
||||
TESS_API TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle);
|
||||
|
||||
TESS_API char *TessBaseAPIRect(TessBaseAPI *handle,
|
||||
const unsigned char *imagedata,
|
||||
int bytes_per_pixel, int bytes_per_line,
|
||||
int left, int top, int width, int height);
|
||||
|
||||
TESS_API void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle);
|
||||
|
||||
TESS_API void TessBaseAPISetImage(TessBaseAPI *handle,
|
||||
const unsigned char *imagedata, int width,
|
||||
int height, int bytes_per_pixel,
|
||||
int bytes_per_line);
|
||||
TESS_API void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix);
|
||||
|
||||
TESS_API void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi);
|
||||
|
||||
TESS_API void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top,
|
||||
int width, int height);
|
||||
|
||||
TESS_API struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle);
|
||||
TESS_API struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle,
|
||||
struct Pixa **pixa);
|
||||
TESS_API struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle,
|
||||
struct Pixa **pixa,
|
||||
int **blockids);
|
||||
TESS_API struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle,
|
||||
BOOL raw_image, int raw_padding,
|
||||
struct Pixa **pixa,
|
||||
int **blockids, int **paraids);
|
||||
TESS_API struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle,
|
||||
struct Pixa **pixa, int **blockids);
|
||||
TESS_API struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle,
|
||||
struct Pixa **pixa);
|
||||
TESS_API struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle,
|
||||
struct Pixa **cc);
|
||||
TESS_API struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle,
|
||||
TessPageIteratorLevel level,
|
||||
BOOL text_only,
|
||||
struct Pixa **pixa,
|
||||
int **blockids);
|
||||
TESS_API struct Boxa *TessBaseAPIGetComponentImages1(
|
||||
TessBaseAPI *handle, TessPageIteratorLevel level, BOOL text_only,
|
||||
BOOL raw_image, int raw_padding, struct Pixa **pixa, int **blockids,
|
||||
int **paraids);
|
||||
|
||||
TESS_API int TessBaseAPIGetThresholdedImageScaleFactor(
|
||||
const TessBaseAPI *handle);
|
||||
|
||||
TESS_API TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle);
|
||||
|
||||
TESS_API int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor);
|
||||
|
||||
TESS_API BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename,
|
||||
const char *retry_config,
|
||||
int timeout_millisec,
|
||||
TessResultRenderer *renderer);
|
||||
TESS_API BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix,
|
||||
int page_index, const char *filename,
|
||||
const char *retry_config,
|
||||
int timeout_millisec,
|
||||
TessResultRenderer *renderer);
|
||||
|
||||
TESS_API TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle);
|
||||
TESS_API TessMutableIterator *TessBaseAPIGetMutableIterator(
|
||||
TessBaseAPI *handle);
|
||||
|
||||
TESS_API char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle);
|
||||
TESS_API char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number);
|
||||
|
||||
TESS_API char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number);
|
||||
TESS_API char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number);
|
||||
|
||||
TESS_API char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number);
|
||||
TESS_API char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number);
|
||||
TESS_API char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle,
|
||||
int page_number);
|
||||
|
||||
TESS_API char *TessBaseAPIGetUNLVText(TessBaseAPI *handle);
|
||||
TESS_API int TessBaseAPIMeanTextConf(TessBaseAPI *handle);
|
||||
|
||||
TESS_API int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle);
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
TESS_API BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle,
|
||||
TessPageSegMode mode,
|
||||
const char *wordstr);
|
||||
#endif // #ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
TESS_API void TessBaseAPIClear(TessBaseAPI *handle);
|
||||
TESS_API void TessBaseAPIEnd(TessBaseAPI *handle);
|
||||
|
||||
TESS_API int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word);
|
||||
TESS_API BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset,
|
||||
float *out_slope);
|
||||
|
||||
TESS_API const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id);
|
||||
|
||||
TESS_API void TessBaseAPIClearPersistentCache(TessBaseAPI *handle);
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
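// Call TessDeleteText(*script_name) to free memory allocated by this
|
||||
// function. NOTE(review): this comment previously named *best_script_name,
// which matches no parameter; confirm in the implementation that *script_name
// is heap-allocated before freeing it.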
|
||||
TESS_API BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle,
|
||||
int *orient_deg,
|
||||
float *orient_conf,
|
||||
const char **script_name,
|
||||
float *script_conf);
|
||||
#endif // #ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
TESS_API void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle,
|
||||
double margin);
|
||||
|
||||
TESS_API int TessBaseAPINumDawgs(const TessBaseAPI *handle);
|
||||
|
||||
TESS_API TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle);
|
||||
|
||||
TESS_API void TessBaseGetBlockTextOrientations(TessBaseAPI *handle,
|
||||
int **block_orientation,
|
||||
bool **vertical_writing);
|
||||
|
||||
/* Page iterator */
|
||||
|
||||
TESS_API void TessPageIteratorDelete(TessPageIterator *handle);
|
||||
|
||||
TESS_API TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle);
|
||||
|
||||
TESS_API void TessPageIteratorBegin(TessPageIterator *handle);
|
||||
|
||||
TESS_API BOOL TessPageIteratorNext(TessPageIterator *handle,
|
||||
TessPageIteratorLevel level);
|
||||
|
||||
TESS_API BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level);
|
||||
|
||||
TESS_API BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level,
|
||||
TessPageIteratorLevel element);
|
||||
|
||||
TESS_API BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level,
|
||||
int *left, int *top, int *right,
|
||||
int *bottom);
|
||||
|
||||
TESS_API TessPolyBlockType
|
||||
TessPageIteratorBlockType(const TessPageIterator *handle);
|
||||
|
||||
TESS_API struct Pix *TessPageIteratorGetBinaryImage(
|
||||
const TessPageIterator *handle, TessPageIteratorLevel level);
|
||||
|
||||
TESS_API struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level,
|
||||
int padding,
|
||||
struct Pix *original_image,
|
||||
int *left, int *top);
|
||||
|
||||
TESS_API BOOL TessPageIteratorBaseline(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level, int *x1,
|
||||
int *y1, int *x2, int *y2);
|
||||
|
||||
TESS_API void TessPageIteratorOrientation(
|
||||
TessPageIterator *handle, TessOrientation *orientation,
|
||||
TessWritingDirection *writing_direction, TessTextlineOrder *textline_order,
|
||||
float *deskew_angle);
|
||||
|
||||
TESS_API void TessPageIteratorParagraphInfo(
|
||||
TessPageIterator *handle, TessParagraphJustification *justification,
|
||||
BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
|
||||
|
||||
/* Result iterator */
|
||||
|
||||
TESS_API void TessResultIteratorDelete(TessResultIterator *handle);
|
||||
TESS_API TessResultIterator *TessResultIteratorCopy(
|
||||
const TessResultIterator *handle);
|
||||
TESS_API TessPageIterator *TessResultIteratorGetPageIterator(
|
||||
TessResultIterator *handle);
|
||||
TESS_API const TessPageIterator *TessResultIteratorGetPageIteratorConst(
|
||||
const TessResultIterator *handle);
|
||||
TESS_API TessChoiceIterator *TessResultIteratorGetChoiceIterator(
|
||||
const TessResultIterator *handle);
|
||||
|
||||
TESS_API BOOL TessResultIteratorNext(TessResultIterator *handle,
|
||||
TessPageIteratorLevel level);
|
||||
TESS_API char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle,
|
||||
TessPageIteratorLevel level);
|
||||
TESS_API float TessResultIteratorConfidence(const TessResultIterator *handle,
|
||||
TessPageIteratorLevel level);
|
||||
TESS_API const char *TessResultIteratorWordRecognitionLanguage(
|
||||
const TessResultIterator *handle);
|
||||
TESS_API const char *TessResultIteratorWordFontAttributes(
|
||||
const TessResultIterator *handle, BOOL *is_bold, BOOL *is_italic,
|
||||
BOOL *is_underlined, BOOL *is_monospace, BOOL *is_serif, BOOL *is_smallcaps,
|
||||
int *pointsize, int *font_id);
|
||||
|
||||
TESS_API BOOL
|
||||
TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle);
|
||||
TESS_API BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle);
|
||||
TESS_API BOOL
|
||||
TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle);
|
||||
TESS_API BOOL
|
||||
TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle);
|
||||
TESS_API BOOL
|
||||
TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle);
|
||||
|
||||
TESS_API void TessChoiceIteratorDelete(TessChoiceIterator *handle);
|
||||
TESS_API BOOL TessChoiceIteratorNext(TessChoiceIterator *handle);
|
||||
TESS_API const char *TessChoiceIteratorGetUTF8Text(
|
||||
const TessChoiceIterator *handle);
|
||||
TESS_API float TessChoiceIteratorConfidence(const TessChoiceIterator *handle);
|
||||
|
||||
/* Progress monitor */
|
||||
|
||||
TESS_API ETEXT_DESC *TessMonitorCreate();
|
||||
TESS_API void TessMonitorDelete(ETEXT_DESC *monitor);
|
||||
TESS_API void TessMonitorSetCancelFunc(ETEXT_DESC *monitor,
|
||||
TessCancelFunc cancelFunc);
|
||||
TESS_API void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis);
|
||||
TESS_API void *TessMonitorGetCancelThis(ETEXT_DESC *monitor);
|
||||
TESS_API void TessMonitorSetProgressFunc(ETEXT_DESC *monitor,
|
||||
TessProgressFunc progressFunc);
|
||||
TESS_API int TessMonitorGetProgress(ETEXT_DESC *monitor);
|
||||
TESS_API void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // API_CAPI_H_
|
39
3rdparty/tesseract_ocr/tesseract/include/tesseract/export.h
vendored
Normal file
39
3rdparty/tesseract_ocr/tesseract/include/tesseract/export.h
vendored
Normal file
|
@ -0,0 +1,39 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: export.h
|
||||
// Description: Place holder
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_PLATFORM_H_
|
||||
#define TESSERACT_PLATFORM_H_
|
||||
|
||||
/* TESS_API is the decoration placed on the declarations that use it.
 * It is only defined here when the build has not already defined it:
 *  - _WIN32 or __CYGWIN__: __declspec(dllexport) if TESS_EXPORTS is defined,
 *    __declspec(dllimport) if TESS_IMPORTS is defined, otherwise empty.
 *  - any other platform: __attribute__((visibility("default"))) if either
 *    TESS_EXPORTS or TESS_IMPORTS is defined, otherwise empty. */
#ifndef TESS_API
|
||||
# if defined(_WIN32) || defined(__CYGWIN__)
|
||||
# if defined(TESS_EXPORTS)
|
||||
# define TESS_API __declspec(dllexport)
|
||||
# elif defined(TESS_IMPORTS)
|
||||
# define TESS_API __declspec(dllimport)
|
||||
# else
|
||||
# define TESS_API
|
||||
# endif
|
||||
# else
|
||||
# if defined(TESS_EXPORTS) || defined(TESS_IMPORTS)
|
||||
# define TESS_API __attribute__((visibility("default")))
|
||||
# else
|
||||
# define TESS_API
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif // TESSERACT_PLATFORM_H_
|
241
3rdparty/tesseract_ocr/tesseract/include/tesseract/ltrresultiterator.h
vendored
Normal file
241
3rdparty/tesseract_ocr/tesseract/include/tesseract/ltrresultiterator.h
vendored
Normal file
|
@ -0,0 +1,241 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ltrresultiterator.h
|
||||
// Description: Iterator for tesseract results in strict left-to-right
|
||||
// order that avoids using tesseract internal data structures.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
|
||||
#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
|
||||
|
||||
#include "export.h" // for TESS_API
|
||||
#include "pageiterator.h" // for PageIterator
|
||||
#include "publictypes.h" // for PageIteratorLevel
|
||||
#include "unichar.h" // for StrongScriptDirection
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOB_CHOICE_IT;
|
||||
class PAGE_RES;
|
||||
class WERD_RES;
|
||||
|
||||
class Tesseract;
|
||||
|
||||
// Class to iterate over tesseract results, providing access to all levels
|
||||
// of the page hierarchy, without including any tesseract headers or having
|
||||
// to handle any tesseract structures.
|
||||
// WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
// therefore can only be used while the TessBaseAPI class still exists and
|
||||
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
|
||||
// DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
// See tesseract/publictypes.h for the definition of PageIteratorLevel.
|
||||
// See also base class PageIterator, which contains the bulk of the interface.
|
||||
// LTRResultIterator adds text-specific methods for access to OCR output.
|
||||
|
||||
class TESS_API LTRResultIterator : public PageIterator {
|
||||
friend class ChoiceIterator;
|
||||
|
||||
public:
|
||||
// page_res and tesseract come directly from the BaseAPI.
|
||||
// The rectangle parameters are copied indirectly from the Thresholder,
|
||||
// via the BaseAPI. They represent the coordinates of some rectangle in an
|
||||
// original image (in top-left-origin coordinates) and therefore the top-left
|
||||
// needs to be added to any output boxes in order to specify coordinates
|
||||
// in the original image. See TessBaseAPI::SetRectangle.
|
||||
// The scale and scaled_yres are in case the Thresholder scaled the image
|
||||
// rectangle prior to thresholding. Any coordinates in tesseract's image
|
||||
// must be divided by scale before adding (rect_left, rect_top).
|
||||
// The scaled_yres indicates the effective resolution of the binary image
|
||||
// that tesseract has been given by the Thresholder.
|
||||
// After the constructor, Begin has already been called.
|
||||
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
|
||||
int scaled_yres, int rect_left, int rect_top,
|
||||
int rect_width, int rect_height);
|
||||
|
||||
~LTRResultIterator() override;
|
||||
|
||||
// LTRResultIterators may be copied! This makes it possible to iterate over
|
||||
// all the objects at a lower level, while maintaining an iterator to
|
||||
// objects at a higher level. These constructors DO NOT CALL Begin, so
|
||||
// iterations will continue from the location of src.
|
||||
// TODO: For now the copy constructor and operator= only need the base class
|
||||
// versions, but if new data members are added, don't forget to add them!
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
|
||||
// See PageIterator.
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// object at the given level. Use delete [] to free after use.
|
||||
char *GetUTF8Text(PageIteratorLevel level) const;
|
||||
|
||||
// Set the string inserted at the end of each text line. "\n" by default.
|
||||
void SetLineSeparator(const char *new_line);
|
||||
|
||||
// Set the string inserted at the end of each paragraph. "\n" by default.
|
||||
void SetParagraphSeparator(const char *new_para);
|
||||
|
||||
// Returns the mean confidence of the current object at the given level.
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float Confidence(PageIteratorLevel level) const;
|
||||
|
||||
// Returns the attributes of the current row.
|
||||
void RowAttributes(float *row_height, float *descenders,
|
||||
float *ascenders) const;
|
||||
|
||||
// ============= Functions that refer to words only ============.
|
||||
|
||||
// Returns the font attributes of the current word. If iterating at a higher
|
||||
// level object than words, eg textlines, then this will return the
|
||||
// attributes of the first word in that textline.
|
||||
// The actual return value is a string representing a font name. It points
|
||||
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
|
||||
// the iterator itself, ie rendered invalid by various members of
|
||||
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
|
||||
// Pointsize is returned in printers points (1/72 inch.)
|
||||
const char *WordFontAttributes(bool *is_bold, bool *is_italic,
|
||||
bool *is_underlined, bool *is_monospace,
|
||||
bool *is_serif, bool *is_smallcaps,
|
||||
int *pointsize, int *font_id) const;
|
||||
|
||||
// Return the name of the language used to recognize this word.
|
||||
// On error, nullptr. Do not delete this pointer.
|
||||
const char *WordRecognitionLanguage() const;
|
||||
|
||||
// Return the overall directionality of this word.
|
||||
StrongScriptDirection WordDirection() const;
|
||||
|
||||
// Returns true if the current word was found in a dictionary.
|
||||
bool WordIsFromDictionary() const;
|
||||
|
||||
// Returns the number of blanks before the current word.
|
||||
int BlanksBeforeWord() const;
|
||||
|
||||
// Returns true if the current word is numeric.
|
||||
bool WordIsNumeric() const;
|
||||
|
||||
// Returns true if the word contains blamer information.
|
||||
bool HasBlamerInfo() const;
|
||||
|
||||
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
|
||||
// of the current word.
|
||||
const void *GetParamsTrainingBundle() const;
|
||||
|
||||
// Returns a pointer to the string with blamer information for this word.
|
||||
// Assumes that the word's blamer_bundle is not nullptr.
|
||||
const char *GetBlamerDebug() const;
|
||||
|
||||
// Returns a pointer to the string with misadaption information for this word.
|
||||
// Assumes that the word's blamer_bundle is not nullptr.
|
||||
const char *GetBlamerMisadaptionDebug() const;
|
||||
|
||||
// Returns true if a truth string was recorded for the current word.
|
||||
bool HasTruthString() const;
|
||||
|
||||
// Returns true if the given string is equivalent to the truth string for
|
||||
// the current word.
|
||||
bool EquivalentToTruth(const char *str) const;
|
||||
|
||||
// Returns a null terminated UTF-8 encoded truth string for the current word.
|
||||
// Use delete [] to free after use.
|
||||
char *WordTruthUTF8Text() const;
|
||||
|
||||
// Returns a null terminated UTF-8 encoded normalized OCR string for the
|
||||
// current word. Use delete [] to free after use.
|
||||
char *WordNormedUTF8Text() const;
|
||||
|
||||
// Returns a pointer to serialized choice lattice.
|
||||
// Fills lattice_size with the number of bytes in lattice data.
|
||||
const char *WordLattice(int *lattice_size) const;
|
||||
|
||||
// ============= Functions that refer to symbols only ============.
|
||||
|
||||
// Returns true if the current symbol is a superscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool SymbolIsSuperscript() const;
|
||||
// Returns true if the current symbol is a subscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool SymbolIsSubscript() const;
|
||||
// Returns true if the current symbol is a dropcap.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool SymbolIsDropcap() const;
|
||||
|
||||
protected:
|
||||
const char *line_separator_;
|
||||
const char *paragraph_separator_;
|
||||
};
|
||||
|
||||
// Class to iterate over the classifier choices for a single RIL_SYMBOL.
|
||||
class TESS_API ChoiceIterator {
|
||||
public:
|
||||
// Construction is from a LTRResultIterator that points to the symbol of
|
||||
// interest. The ChoiceIterator allows a one-shot iteration over the
|
||||
// choices for this symbol and after that is is useless.
|
||||
explicit ChoiceIterator(const LTRResultIterator &result_it);
|
||||
~ChoiceIterator();
|
||||
|
||||
// Moves to the next choice for the symbol and returns false if there
|
||||
// are none left.
|
||||
bool Next();
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// choice.
|
||||
// NOTE: Unlike LTRResultIterator::GetUTF8Text, the return points to an
|
||||
// internal structure and should NOT be delete[]ed to free after use.
|
||||
const char *GetUTF8Text() const;
|
||||
|
||||
// Returns the confidence of the current choice depending on the used language
|
||||
// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
|
||||
// choices for one symbol should roughly add up to 1.0f.
|
||||
// If only traineddata of the legacy engine is used, the number should be
|
||||
// interpreted as a percent probability. (0.0f-100.0f) In this case
|
||||
// probabilities won't add up to 100. Each one stands on its own.
|
||||
float Confidence() const;
|
||||
|
||||
// Returns a vector containing all timesteps, which belong to the currently
|
||||
// selected symbol. A timestep is a vector containing pairs of symbols and
|
||||
// floating point numbers. The number states the probability for the
|
||||
// corresponding symbol.
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *Timesteps() const;
|
||||
|
||||
private:
|
||||
// clears the remaining spaces out of the results and adapt the probabilities
|
||||
void filterSpaces();
|
||||
// Pointer to the WERD_RES object owned by the API.
|
||||
WERD_RES *word_res_;
|
||||
// Iterator over the blob choices.
|
||||
BLOB_CHOICE_IT *choice_it_;
|
||||
std::vector<std::pair<const char *, float>> *LSTM_choices_ = nullptr;
|
||||
std::vector<std::pair<const char *, float>>::iterator LSTM_choice_it_;
|
||||
|
||||
const int *tstep_index_;
|
||||
// regulates the rating granularity
|
||||
double rating_coefficient_;
|
||||
// leading blanks
|
||||
int blanks_before_word_;
|
||||
// true when there is lstm engine related trained data
|
||||
bool oemLSTM_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_
|
157
3rdparty/tesseract_ocr/tesseract/include/tesseract/ocrclass.h
vendored
Normal file
157
3rdparty/tesseract_ocr/tesseract/include/tesseract/ocrclass.h
vendored
Normal file
|
@ -0,0 +1,157 @@
|
|||
/**********************************************************************
|
||||
* File: ocrclass.h
|
||||
* Description: Class definitions and constants for the OCR API.
|
||||
* Author: Hewlett-Packard Co
|
||||
*
|
||||
* (C) Copyright 1996, Hewlett-Packard Co.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
/**********************************************************************
|
||||
* This file contains typedefs for all the structures used by
|
||||
* the HP OCR interface.
|
||||
* The structures are designed to allow them to be used with any
|
||||
* structure alignment up to 8.
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef CCUTIL_OCRCLASS_H_
|
||||
#define CCUTIL_OCRCLASS_H_
|
||||
|
||||
#include <chrono>
#include <cstdint>
#include <ctime>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**********************************************************************
|
||||
* EANYCODE_CHAR
|
||||
* Description of a single character. The character code is defined by
|
||||
* the character set of the current font.
|
||||
* Output text is sent as an array of these structures.
|
||||
* Spaces and line endings in the output are represented in the
|
||||
* structures of the surrounding characters. They are not directly
|
||||
* represented as characters.
|
||||
* The first character in a word has a positive value of blanks.
|
||||
* Missing information should be set to the defaults in the comments.
|
||||
* If word bounds are known, but not character bounds, then the top and
|
||||
* bottom of each character should be those of the word. The left of the
|
||||
* first and right of the last char in each word should be set. All other
|
||||
* lefts and rights should be set to -1.
|
||||
* If set, the values of right and bottom are left+width and top+height.
|
||||
* Most of the members come directly from the parameters to ocr_append_char.
|
||||
* The formatting member uses the enhancement parameter and combines the
|
||||
* line direction stuff into the top 3 bits.
|
||||
* The coding is 0=RL char, 1=LR char, 2=DR NL, 3=UL NL, 4=DR Para,
|
||||
* 5=UL Para, 6=TB char, 7=BT char. API users do not need to know what
|
||||
* the coding is, only that it is backwards compatible with the previous
|
||||
* version.
|
||||
**********************************************************************/
|
||||
|
||||
// Description of a single output character. The value in parentheses at the
// end of each member comment is the default to use when the information is
// missing.
struct EANYCODE_CHAR { /*single character */
  // It should be noted that the format for char_code for version 2.0 and beyond
  // is UTF8 which means that ASCII characters will come out as one structure
  // but other characters will be returned in two or more instances of this
  // structure with a single byte of the UTF8 code in each, but each will have
  // the same bounding box. Programs which want to handle languages with
  // different character sets will need to handle extended characters
  // appropriately, but *all* code needs to be prepared to receive UTF8 coded
  // characters for characters such as bullet and fancy quotes.
  uint16_t char_code; /*character itself */
  int16_t left;       /*of char (-1) */
  int16_t right;      /*of char (-1) */
  int16_t top;        /*of char (-1) */
  int16_t bottom;     /*of char (-1) */
  int16_t font_index; /*what font (0) */
  uint8_t confidence; /*0=perfect, 100=reject (0/100) */
  uint8_t point_size; /*of char, 72=1 inch, (10) */
  int8_t blanks;      /*no of spaces before this char (1) */
  uint8_t formatting; /*char formatting (0) */
};
|
||||
|
||||
/**********************************************************************
|
||||
* ETEXT_DESC
|
||||
* Description of the output of the OCR engine.
|
||||
* This structure is used as both a progress monitor and the final
|
||||
* output header, since it needs to be a valid progress monitor while
|
||||
* the OCR engine is storing its output to shared memory.
|
||||
* During progress, all the buffer info is -1.
|
||||
* Progress starts at 0 and increases to 100 during OCR. No other constraint.
|
||||
* Additionally the progress callback contains the bounding box of the word that
|
||||
* is currently being processed.
|
||||
* Every progress callback, the OCR engine must set ocr_alive to 1.
|
||||
* The HP side will set ocr_alive to 0. Repeated failure to reset
|
||||
* to 1 indicates that the OCR engine is dead.
|
||||
* If the cancel function is not null then it is called with the number of
|
||||
* user words found. If it returns true then operation is cancelled.
|
||||
**********************************************************************/
|
||||
class ETEXT_DESC;
|
||||
|
||||
using CANCEL_FUNC = bool (*)(void *, int);
|
||||
using PROGRESS_FUNC = bool (*)(int, int, int, int, int);
|
||||
using PROGRESS_FUNC2 = bool (*)(ETEXT_DESC *, int, int, int, int);
|
||||
|
||||
class ETEXT_DESC { // output header
|
||||
public:
|
||||
int16_t count{0}; /// chars in this buffer(0)
|
||||
int16_t progress{0}; /// percent complete increasing (0-100)
|
||||
/** Progress monitor covers word recognition and it does not cover layout
|
||||
* analysis.
|
||||
* See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */
|
||||
int8_t more_to_come{0}; /// true if not last
|
||||
volatile int8_t ocr_alive{0}; /// ocr sets to 1, HP 0
|
||||
int8_t err_code{0}; /// for errcode use
|
||||
CANCEL_FUNC cancel{nullptr}; /// returns true to cancel
|
||||
PROGRESS_FUNC progress_callback{
|
||||
nullptr}; /// called whenever progress increases
|
||||
PROGRESS_FUNC2 progress_callback2; /// monitor-aware progress callback
|
||||
void *cancel_this{nullptr}; /// this or other data for cancel
|
||||
std::chrono::steady_clock::time_point end_time;
|
||||
/// Time to stop. Expected to be set only
|
||||
/// by call to set_deadline_msecs().
|
||||
EANYCODE_CHAR text[1]{}; /// character data
|
||||
|
||||
ETEXT_DESC() : progress_callback2(&default_progress_func) {
|
||||
end_time = std::chrono::time_point<std::chrono::steady_clock,
|
||||
std::chrono::milliseconds>();
|
||||
}
|
||||
|
||||
// Sets the end time to be deadline_msecs milliseconds from now.
|
||||
void set_deadline_msecs(int32_t deadline_msecs) {
|
||||
if (deadline_msecs > 0) {
|
||||
end_time = std::chrono::steady_clock::now() +
|
||||
std::chrono::milliseconds(deadline_msecs);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns false if we've not passed the end_time, or have not set a deadline.
|
||||
bool deadline_exceeded() const {
|
||||
if (end_time.time_since_epoch() ==
|
||||
std::chrono::steady_clock::duration::zero()) {
|
||||
return false;
|
||||
}
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
return (now > end_time);
|
||||
}
|
||||
|
||||
private:
|
||||
static bool default_progress_func(ETEXT_DESC *ths, int left, int right,
|
||||
int top, int bottom) {
|
||||
if (ths->progress_callback != nullptr) {
|
||||
return (*(ths->progress_callback))(ths->progress, left, right, top,
|
||||
bottom);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // CCUTIL_OCRCLASS_H_
|
141
3rdparty/tesseract_ocr/tesseract/include/tesseract/osdetect.h
vendored
Normal file
141
3rdparty/tesseract_ocr/tesseract/include/tesseract/osdetect.h
vendored
Normal file
|
@ -0,0 +1,141 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: osdetect.h
|
||||
// Description: Orientation and script detection.
|
||||
// Author: Samuel Charron
|
||||
// Ranjith Unnikrishnan
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_OSDETECT_H_
|
||||
#define TESSERACT_CCMAIN_OSDETECT_H_
|
||||
|
||||
#include "export.h" // for TESS_API
|
||||
|
||||
#include <vector> // for std::vector
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOBNBOX;
|
||||
class BLOBNBOX_CLIST;
|
||||
class BLOB_CHOICE_LIST;
|
||||
class TO_BLOCK_LIST;
|
||||
class UNICHARSET;
|
||||
|
||||
class Tesseract;
|
||||
|
||||
// Max number of scripts in ICU + "NULL" + Japanese and Korean + Fraktur
const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;
|
||||
|
||||
// Holds a single orientation/script estimate together with one confidence
// value for each. All members start at zero.
struct OSBestResult {
  OSBestResult()
      : orientation_id(0), script_id(0), sconfidence(0.0), oconfidence(0.0) {}
  int orientation_id;
  int script_id;
  float sconfidence; // confidence paired with script_id
  float oconfidence; // confidence paired with orientation_id
};
|
||||
|
||||
struct OSResults {
|
||||
OSResults() : unicharset(nullptr) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
|
||||
scripts_na[i][j] = 0;
|
||||
}
|
||||
orientations[i] = 0;
|
||||
}
|
||||
}
|
||||
void update_best_orientation();
|
||||
// Set the estimate of the orientation to the given id.
|
||||
void set_best_orientation(int orientation_id);
|
||||
// Update/Compute the best estimate of the script assuming the given
|
||||
// orientation id.
|
||||
void update_best_script(int orientation_id);
|
||||
// Return the index of the script with the highest score for this orientation.
|
||||
TESS_API int get_best_script(int orientation_id) const;
|
||||
// Accumulate scores with given OSResults instance and update the best script.
|
||||
void accumulate(const OSResults &osr);
|
||||
|
||||
// Print statistics.
|
||||
void print_scores(void) const;
|
||||
void print_scores(int orientation_id) const;
|
||||
|
||||
// Array holding scores for each orientation id [0,3].
|
||||
// Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
|
||||
// page respectively, where the values refer to the amount of clockwise
|
||||
// rotation to be applied to the page for the text to be upright and readable.
|
||||
float orientations[4];
|
||||
// Script confidence scores for each of 4 possible orientations.
|
||||
float scripts_na[4][kMaxNumberOfScripts];
|
||||
|
||||
UNICHARSET *unicharset;
|
||||
OSBestResult best_result;
|
||||
};
|
||||
|
||||
class OrientationDetector {
|
||||
public:
|
||||
OrientationDetector(const std::vector<int> *allowed_scripts,
|
||||
OSResults *results);
|
||||
bool detect_blob(BLOB_CHOICE_LIST *scores);
|
||||
int get_orientation();
|
||||
|
||||
private:
|
||||
OSResults *osr_;
|
||||
const std::vector<int> *allowed_scripts_;
|
||||
};
|
||||
|
||||
class ScriptDetector {
|
||||
public:
|
||||
ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
|
||||
tesseract::Tesseract *tess);
|
||||
void detect_blob(BLOB_CHOICE_LIST *scores);
|
||||
bool must_stop(int orientation) const;
|
||||
|
||||
private:
|
||||
OSResults *osr_;
|
||||
static const char *korean_script_;
|
||||
static const char *japanese_script_;
|
||||
static const char *fraktur_script_;
|
||||
int korean_id_;
|
||||
int japanese_id_;
|
||||
int katakana_id_;
|
||||
int hiragana_id_;
|
||||
int han_id_;
|
||||
int hangul_id_;
|
||||
int latin_id_;
|
||||
int fraktur_id_;
|
||||
tesseract::Tesseract *tess_;
|
||||
const std::vector<int> *allowed_scripts_;
|
||||
};
|
||||
|
||||
int orientation_and_script_detection(const char *filename, OSResults *,
|
||||
tesseract::Tesseract *);
|
||||
|
||||
int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr,
|
||||
tesseract::Tesseract *tess);
|
||||
|
||||
int os_detect_blobs(const std::vector<int> *allowed_scripts,
|
||||
BLOBNBOX_CLIST *blob_list, OSResults *osr,
|
||||
tesseract::Tesseract *tess);
|
||||
|
||||
bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s,
|
||||
OSResults *, tesseract::Tesseract *tess);
|
||||
|
||||
// Helper method to convert an orientation index to its value in degrees.
|
||||
// The value represents the amount of clockwise rotation in degrees that must be
|
||||
// applied for the text to be upright (readable).
|
||||
TESS_API int OrientationIdToValue(const int &id);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCMAIN_OSDETECT_H_
|
362
3rdparty/tesseract_ocr/tesseract/include/tesseract/pageiterator.h
vendored
Normal file
362
3rdparty/tesseract_ocr/tesseract/include/tesseract/pageiterator.h
vendored
Normal file
|
@ -0,0 +1,362 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pageiterator.h
|
||||
// Description: Iterator for tesseract page structure that avoids using
|
||||
// tesseract internal data structures.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
|
||||
#define TESSERACT_CCMAIN_PAGEITERATOR_H_
|
||||
|
||||
#include "export.h"
|
||||
#include "publictypes.h"
|
||||
|
||||
struct Pix;
|
||||
struct Pta;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct BlamerBundle;
|
||||
class C_BLOB_IT;
|
||||
class PAGE_RES;
|
||||
class PAGE_RES_IT;
|
||||
class WERD;
|
||||
|
||||
class Tesseract;
|
||||
|
||||
/**
|
||||
* Class to iterate over tesseract page structure, providing access to all
|
||||
* levels of the page hierarchy, without including any tesseract headers or
|
||||
* having to handle any tesseract structures.
|
||||
* WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
* therefore can only be used while the TessBaseAPI class still exists and
|
||||
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
|
||||
* DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
* See tesseract/publictypes.h for the definition of PageIteratorLevel.
|
||||
* See also ResultIterator, derived from PageIterator, which adds in the
|
||||
* ability to access OCR output with text-specific methods.
|
||||
*/
|
||||
|
||||
class TESS_API PageIterator {
|
||||
public:
|
||||
/**
|
||||
* page_res and tesseract come directly from the BaseAPI.
|
||||
* The rectangle parameters are copied indirectly from the Thresholder,
|
||||
* via the BaseAPI. They represent the coordinates of some rectangle in an
|
||||
* original image (in top-left-origin coordinates) and therefore the top-left
|
||||
* needs to be added to any output boxes in order to specify coordinates
|
||||
* in the original image. See TessBaseAPI::SetRectangle.
|
||||
* The scale and scaled_yres are in case the Thresholder scaled the image
|
||||
* rectangle prior to thresholding. Any coordinates in tesseract's image
|
||||
* must be divided by scale before adding (rect_left, rect_top).
|
||||
* The scaled_yres indicates the effective resolution of the binary image
|
||||
* that tesseract has been given by the Thresholder.
|
||||
* After the constructor, Begin has already been called.
|
||||
*/
|
||||
PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
|
||||
int scaled_yres, int rect_left, int rect_top, int rect_width,
|
||||
int rect_height);
|
||||
virtual ~PageIterator();
|
||||
|
||||
/**
|
||||
* Page/ResultIterators may be copied! This makes it possible to iterate over
|
||||
* all the objects at a lower level, while maintaining an iterator to
|
||||
* objects at a higher level. These constructors DO NOT CALL Begin, so
|
||||
* iterations will continue from the location of src.
|
||||
*/
|
||||
PageIterator(const PageIterator &src);
|
||||
const PageIterator &operator=(const PageIterator &src);
|
||||
|
||||
/** Are we positioned at the same location as other? */
|
||||
bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
|
||||
/**
|
||||
* Moves the iterator to point to the start of the page to begin an
|
||||
* iteration.
|
||||
*/
|
||||
virtual void Begin();
|
||||
|
||||
/**
|
||||
* Moves the iterator to the beginning of the paragraph.
|
||||
* This class implements this functionality by moving it to the zero indexed
|
||||
* blob of the first (leftmost) word on the first row of the paragraph.
|
||||
*/
|
||||
virtual void RestartParagraph();
|
||||
|
||||
/**
|
||||
* Return whether this iterator points anywhere in the first textline of a
|
||||
* paragraph.
|
||||
*/
|
||||
bool IsWithinFirstTextlineOfParagraph() const;
|
||||
|
||||
/**
|
||||
* Moves the iterator to the beginning of the text line.
|
||||
* This class implements this functionality by moving it to the zero indexed
|
||||
* blob of the first (leftmost) word of the row.
|
||||
*/
|
||||
virtual void RestartRow();
|
||||
|
||||
/**
|
||||
* Moves to the start of the next object at the given level in the
|
||||
* page hierarchy, and returns false if the end of the page was reached.
|
||||
* NOTE that RIL_SYMBOL will skip non-text blocks, but all other
|
||||
* PageIteratorLevel level values will visit each non-text block once.
|
||||
* Think of non text blocks as containing a single para, with a single line,
|
||||
* with a single imaginary word.
|
||||
* Calls to Next with different levels may be freely intermixed.
|
||||
* This function iterates words in right-to-left scripts correctly, if
|
||||
* the appropriate language has been loaded into Tesseract.
|
||||
*/
|
||||
virtual bool Next(PageIteratorLevel level);
|
||||
|
||||
/**
|
||||
* Returns true if the iterator is at the start of an object at the given
|
||||
* level.
|
||||
*
|
||||
* For instance, suppose an iterator it is pointed to the first symbol of the
|
||||
* first word of the third line of the second paragraph of the first block in
|
||||
* a page, then:
|
||||
* it.IsAtBeginningOf(RIL_BLOCK) = false
|
||||
* it.IsAtBeginningOf(RIL_PARA) = false
|
||||
* it.IsAtBeginningOf(RIL_TEXTLINE) = true
|
||||
* it.IsAtBeginningOf(RIL_WORD) = true
|
||||
* it.IsAtBeginningOf(RIL_SYMBOL) = true
|
||||
*/
|
||||
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
|
||||
|
||||
/**
|
||||
* Returns whether the iterator is positioned at the last element in a
|
||||
* given level. (e.g. the last word in a line, the last line in a block)
|
||||
*
|
||||
* Here's some two-paragraph example
|
||||
* text. It starts off innocuously
|
||||
* enough but quickly turns bizarre.
|
||||
* The author inserts a cornucopia
|
||||
* of words to guard against confused
|
||||
* references.
|
||||
*
|
||||
* Now take an iterator it pointed to the start of "bizarre."
|
||||
* it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
|
||||
* it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
|
||||
* it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
|
||||
*/
|
||||
virtual bool IsAtFinalElement(PageIteratorLevel level,
|
||||
PageIteratorLevel element) const;
|
||||
|
||||
/**
|
||||
* Returns whether this iterator is positioned
|
||||
* before other: -1
|
||||
* equal to other: 0
|
||||
* after other: 1
|
||||
*/
|
||||
int Cmp(const PageIterator &other) const;
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
// Coordinate system:
|
||||
// Integer coordinates are at the cracks between the pixels.
|
||||
// The top-left corner of the top-left pixel in the image is at (0,0).
|
||||
// The bottom-right corner of the bottom-right pixel in the image is at
|
||||
// (width, height).
|
||||
// Every bounding box goes from the top-left of the top-left contained
|
||||
// pixel to the bottom-right of the bottom-right contained pixel, so
|
||||
// the bounding box of the single top-left pixel in the image is:
|
||||
// (0,0)->(1,1).
|
||||
// If an image rectangle has been set in the API, then returned coordinates
|
||||
// relate to the original (full) image, rather than the rectangle.
|
||||
|
||||
/**
|
||||
* Controls what to include in a bounding box. Bounding boxes of all levels
|
||||
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
|
||||
* Between layout analysis and recognition, it isn't known where all
|
||||
* diacritics belong, so this control is used to include or exclude some
|
||||
* diacritics that are above or below the main body of the word. In most cases
|
||||
* where the placement is obvious, and after recognition, it doesn't make as
|
||||
* much difference, as the diacritics will already be included in the word.
|
||||
*/
|
||||
void SetBoundingBoxComponents(bool include_upper_dots,
|
||||
bool include_lower_dots) {
|
||||
include_upper_dots_ = include_upper_dots;
|
||||
include_lower_dots_ = include_lower_dots;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the bounding rectangle of the current object at the given level.
|
||||
* See comment on coordinate system above.
|
||||
* Returns false if there is no such object at the current position.
|
||||
* The returned bounding box is guaranteed to match the size and position
|
||||
* of the image returned by GetBinaryImage, but may clip foreground pixels
|
||||
* from a grey image. The padding argument to GetImage can be used to expand
|
||||
* the image to include more foreground pixels. See GetImage below.
|
||||
*/
|
||||
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
|
||||
int *bottom) const;
|
||||
bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
|
||||
int *right, int *bottom) const;
|
||||
/**
|
||||
* Returns the bounding rectangle of the object in a coordinate system of the
|
||||
* working image rectangle having its origin at (rect_left_, rect_top_) with
|
||||
* respect to the original image and is scaled by a factor scale_.
|
||||
*/
|
||||
bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
|
||||
int *right, int *bottom) const;
|
||||
|
||||
/** Returns whether there is no object of a given level. */
|
||||
bool Empty(PageIteratorLevel level) const;
|
||||
|
||||
/**
|
||||
* Returns the type of the current block.
|
||||
* See tesseract/publictypes.h for PolyBlockType.
|
||||
*/
|
||||
PolyBlockType BlockType() const;
|
||||
|
||||
/**
|
||||
* Returns the polygon outline of the current block. The returned Pta must
|
||||
* be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
|
||||
* of the polygon, and the last edge is the line segment between the last
|
||||
* point and the first point. nullptr will be returned if the iterator is
|
||||
* at the end of the document or layout analysis was not used.
|
||||
*/
|
||||
Pta *BlockPolygon() const;
|
||||
|
||||
/**
|
||||
* Returns a binary image of the current object at the given level.
|
||||
* The position and size match the return from BoundingBoxInternal, and so
|
||||
* this could be upscaled with respect to the original input image.
|
||||
* Use pixDestroy to delete the image after use.
|
||||
*/
|
||||
Pix *GetBinaryImage(PageIteratorLevel level) const;
|
||||
|
||||
/**
|
||||
* Returns an image of the current object at the given level in greyscale
|
||||
* if available in the input. To guarantee a binary image use GetBinaryImage.
|
||||
* NOTE that in order to give the best possible image, the bounds are
|
||||
* expanded slightly over the binary connected component, by the supplied
|
||||
* padding, so the top-left position of the returned image is returned
|
||||
* in (left,top). These will most likely not match the coordinates
|
||||
* returned by BoundingBox.
|
||||
* If you do not supply an original image, you will get a binary one.
|
||||
* Use pixDestroy to delete the image after use.
|
||||
*/
|
||||
Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
|
||||
int *left, int *top) const;
|
||||
|
||||
/**
|
||||
* Returns the baseline of the current object at the given level.
|
||||
* The baseline is the line that passes through (x1, y1) and (x2, y2).
|
||||
* WARNING: with vertical text, baselines may be vertical!
|
||||
* Returns false if there is no baseline at the current position.
|
||||
*/
|
||||
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
|
||||
int *y2) const;
|
||||
|
||||
/**
|
||||
* Returns orientation for the block the iterator points to.
|
||||
* orientation, writing_direction, textline_order: see publictypes.h
|
||||
* deskew_angle: after rotating the block so the text orientation is
|
||||
* upright, how many radians does one have to rotate the
|
||||
* block anti-clockwise for it to be level?
|
||||
* -Pi/4 <= deskew_angle <= Pi/4
|
||||
*/
|
||||
void Orientation(tesseract::Orientation *orientation,
|
||||
tesseract::WritingDirection *writing_direction,
|
||||
tesseract::TextlineOrder *textline_order,
|
||||
float *deskew_angle) const;
|
||||
|
||||
/**
|
||||
* Returns information about the current paragraph, if available.
|
||||
*
|
||||
* justification -
|
||||
* LEFT if ragged right, or fully justified and script is left-to-right.
|
||||
* RIGHT if ragged left, or fully justified and script is right-to-left.
|
||||
* unknown if it looks like source code or we have very few lines.
|
||||
* is_list_item -
|
||||
* true if we believe this is a member of an ordered or unordered list.
|
||||
* is_crown -
|
||||
* true if the first line of the paragraph is aligned with the other
|
||||
* lines of the paragraph even though subsequent paragraphs have first
|
||||
* line indents. This typically indicates that this is the continuation
|
||||
* of a previous paragraph or that it is the very first paragraph in
|
||||
* the chapter.
|
||||
* first_line_indent -
|
||||
* For LEFT aligned paragraphs, the first text line of paragraphs of
|
||||
* this kind are indented this many pixels from the left edge of the
|
||||
* rest of the paragraph.
|
||||
* for RIGHT aligned paragraphs, the first text line of paragraphs of
|
||||
* this kind are indented this many pixels from the right edge of the
|
||||
* rest of the paragraph.
|
||||
* NOTE 1: This value may be negative.
|
||||
* NOTE 2: if *is_crown == true, the first line of this paragraph is
|
||||
* actually flush, and first_line_indent is set to the "common"
|
||||
* first_line_indent for subsequent paragraphs in this block
|
||||
* of text.
|
||||
*/
|
||||
void ParagraphInfo(tesseract::ParagraphJustification *justification,
|
||||
bool *is_list_item, bool *is_crown,
|
||||
int *first_line_indent) const;
|
||||
|
||||
// If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
|
||||
// of the current word to the given pointer (takes ownership of the pointer)
|
||||
// and returns true.
|
||||
// Can only be used when iterating on the word level.
|
||||
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Sets up the internal data for iterating the blobs of a new word, then
|
||||
* moves the iterator to the given offset.
|
||||
*/
|
||||
void BeginWord(int offset);
|
||||
|
||||
/** Pointer to the page_res owned by the API. */
|
||||
PAGE_RES *page_res_;
|
||||
/** Pointer to the Tesseract object owned by the API. */
|
||||
Tesseract *tesseract_;
|
||||
/**
|
||||
* The iterator to the page_res_. Owned by this ResultIterator.
|
||||
* A pointer just to avoid dragging in Tesseract includes.
|
||||
*/
|
||||
PAGE_RES_IT *it_;
|
||||
/**
|
||||
* The current input WERD being iterated. If there is an output from OCR,
|
||||
* then word_ is nullptr. Owned by the API
|
||||
*/
|
||||
WERD *word_;
|
||||
/** The length of the current word_. */
|
||||
int word_length_;
|
||||
/** The current blob index within the word. */
|
||||
int blob_index_;
|
||||
/**
|
||||
* Iterator to the blobs within the word. If nullptr, then we are iterating
|
||||
* OCR results in the box_word.
|
||||
* Owned by this ResultIterator.
|
||||
*/
|
||||
C_BLOB_IT *cblob_it_;
|
||||
/** Control over what to include in bounding boxes. */
|
||||
bool include_upper_dots_;
|
||||
bool include_lower_dots_;
|
||||
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
|
||||
int scale_;
|
||||
int scaled_yres_;
|
||||
int rect_left_;
|
||||
int rect_top_;
|
||||
int rect_width_;
|
||||
int rect_height_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
|
283
3rdparty/tesseract_ocr/tesseract/include/tesseract/publictypes.h
vendored
Normal file
283
3rdparty/tesseract_ocr/tesseract/include/tesseract/publictypes.h
vendored
Normal file
|
@ -0,0 +1,283 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: publictypes.h
|
||||
// Description: Types used in both the API and internally
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_
|
||||
#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// This file contains types that are used both by the API and internally
|
||||
// to Tesseract. In order to decouple the API from Tesseract and prevent cyclic
|
||||
// dependencies, THIS FILE SHOULD NOT DEPEND ON ANY OTHER PART OF TESSERACT.
|
||||
// Restated: It is OK for low-level Tesseract files to include publictypes.h,
|
||||
// but not for the low-level tesseract code to include top-level API code.
|
||||
// This file should not use other Tesseract types, as that would drag
|
||||
// their includes into the API-level.
|
||||
|
||||
/** Number of printers' points in an inch. The unit of the pointsize return. */
constexpr int kPointsPerInch = 72;
/**
 * Minimum believable resolution. Used as a default if there is no other
 * information, as it is safer to under-estimate than over-estimate.
 */
constexpr int kMinCredibleResolution = 70;
/** Maximum believable resolution. */
constexpr int kMaxCredibleResolution = 2400;
/**
 * Ratio between median blob size and likely resolution. Used to estimate
 * resolution when none is provided. This is basically 1/usual text size in
 * inches.
 */
constexpr int kResolutionEstimationFactor = 10;
|
||||
|
||||
/**
|
||||
* Possible types for a POLY_BLOCK or ColPartition.
|
||||
* Must be kept in sync with kPBColors in polyblk.cpp and PTIs*Type functions
|
||||
* below, as well as kPolyBlockNames in layout_test.cc.
|
||||
* Used extensively by ColPartition, and POLY_BLOCK.
|
||||
*/
|
||||
enum PolyBlockType {
  PT_UNKNOWN,         // Type is not yet known. Keep as the first element.
  PT_FLOWING_TEXT,    // Text that lives inside a column.
  PT_HEADING_TEXT,    // Text that spans more than one column.
  PT_PULLOUT_TEXT,    // Text that is in a cross-column pull-out region.
  PT_EQUATION,        // Partition belonging to an equation region.
  PT_INLINE_EQUATION, // Partition has inline equation.
  PT_TABLE,           // Partition belonging to a table region.
  PT_VERTICAL_TEXT,   // Text-line runs vertically.
  PT_CAPTION_TEXT,    // Text that belongs to an image.
  PT_FLOWING_IMAGE,   // Image that lives inside a column.
  PT_HEADING_IMAGE,   // Image that spans more than one column.
  PT_PULLOUT_IMAGE,   // Image that is in a cross-column pull-out region.
  PT_HORZ_LINE,       // Horizontal Line.
  PT_VERT_LINE,       // Vertical Line.
  PT_NOISE,           // Lies outside of any column.
  PT_COUNT            // Number of real types; not itself a block type.
};

/** Returns true if PolyBlockType is a line type (horizontal or vertical). */
inline bool PTIsLineType(PolyBlockType type) {
  return type == PT_HORZ_LINE || type == PT_VERT_LINE;
}

/** Returns true if PolyBlockType is of image type (flowing, heading or pullout). */
inline bool PTIsImageType(PolyBlockType type) {
  return type == PT_FLOWING_IMAGE || type == PT_HEADING_IMAGE ||
         type == PT_PULLOUT_IMAGE;
}

/**
 * Returns true if PolyBlockType is of text type. Tables, vertical text,
 * captions and inline equations count as text; PT_EQUATION does not.
 */
inline bool PTIsTextType(PolyBlockType type) {
  return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
         type == PT_PULLOUT_TEXT || type == PT_TABLE ||
         type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
         type == PT_INLINE_EQUATION;
}

/** Returns true if PolyBlockType is of pullout (inter-column) type. */
inline bool PTIsPulloutType(PolyBlockType type) {
  return type == PT_PULLOUT_IMAGE || type == PT_PULLOUT_TEXT;
}
|
||||
|
||||
/**
|
||||
* +------------------+ Orientation Example:
|
||||
* | 1 Aaaa Aaaa Aaaa | ====================
|
||||
* | Aaa aa aaa aa | To left is a diagram of some (1) English and
|
||||
* | aaaaaa A aa aaa. | (2) Chinese text and a (3) photo credit.
|
||||
* | 2 |
|
||||
* | ####### c c C | Upright Latin characters are represented as A and a.
|
||||
* | ####### c c c | '<' represents a latin character rotated
|
||||
* | < ####### c c c | anti-clockwise 90 degrees.
|
||||
* | < ####### c c |
|
||||
* | < ####### . c | Upright Chinese characters are represented C and c.
|
||||
* | 3 ####### c |
|
||||
* +------------------+ NOTA BENE: enum values here should match goodoc.proto
|
||||
|
||||
* If you orient your head so that "up" aligns with Orientation,
|
||||
* then the characters will appear "right side up" and readable.
|
||||
*
|
||||
* In the example above, both the English and Chinese paragraphs are oriented
|
||||
* so their "up" is the top of the page (page up). The photo credit is read
|
||||
* with one's head turned leftward ("up" is to page left).
|
||||
*
|
||||
* The values of this enum match the convention of Tesseract's osdetect.h
|
||||
*/
|
||||
// Direction that counts as "up" for a block of text, relative to the page.
// Explicit values are part of the public contract; do not renumber.
enum Orientation {
  ORIENTATION_PAGE_UP = 0,
  ORIENTATION_PAGE_RIGHT = 1,
  ORIENTATION_PAGE_DOWN = 2,
  ORIENTATION_PAGE_LEFT = 3,
};
|
||||
|
||||
/**
|
||||
* The grapheme clusters within a line of text are laid out logically
|
||||
* in this direction, judged when looking at the text line rotated so that
|
||||
* its Orientation is "page up".
|
||||
*
|
||||
* For English text, the writing direction is left-to-right. For the
|
||||
* Chinese text in the above example, the writing direction is top-to-bottom.
|
||||
*/
|
||||
// Logical layout direction of grapheme clusters within one text line.
// Explicit values are part of the public contract; do not renumber.
enum WritingDirection {
  WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
  WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
  WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
};
|
||||
|
||||
/**
|
||||
* The text lines are read in the given sequence.
|
||||
*
|
||||
* In English, the order is top-to-bottom.
|
||||
* In Chinese, vertical text lines are read right-to-left. Mongolian is
|
||||
* written in vertical columns top to bottom like Chinese, but the lines
|
||||
* order left-to-right.
|
||||
*
|
||||
* Note that only some combinations make sense. For example,
|
||||
* WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
|
||||
*/
|
||||
// Sequence in which successive text lines are read.
// Explicit values are part of the public contract; do not renumber.
enum TextlineOrder {
  TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
  TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
  TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
};
|
||||
|
||||
/**
|
||||
* Possible modes for page layout analysis. These *must* be kept in order
|
||||
* of decreasing amount of layout analysis to be done, except for OSD_ONLY,
|
||||
* so that the inequality test functions below work.
|
||||
*/
|
||||
enum PageSegMode {
  PSM_OSD_ONLY = 0,      ///< Orientation and script detection only.
  PSM_AUTO_OSD = 1,      ///< Automatic page segmentation with orientation and
                         ///< script detection. (OSD)
  PSM_AUTO_ONLY = 2,     ///< Automatic page segmentation, but no OSD, or OCR.
  PSM_AUTO = 3,          ///< Fully automatic page segmentation, but no OSD.
  PSM_SINGLE_COLUMN = 4, ///< Assume a single column of text of variable sizes.
  PSM_SINGLE_BLOCK_VERT_TEXT = 5, ///< Assume a single uniform block of
                                  ///< vertically aligned text.
  PSM_SINGLE_BLOCK = 6, ///< Assume a single uniform block of text. (Default.)
  PSM_SINGLE_LINE = 7,  ///< Treat the image as a single text line.
  PSM_SINGLE_WORD = 8,  ///< Treat the image as a single word.
  PSM_CIRCLE_WORD = 9,  ///< Treat the image as a single word in a circle.
  PSM_SINGLE_CHAR = 10, ///< Treat the image as a single character.
  PSM_SPARSE_TEXT = 11, ///< Find as much text as possible in no particular
                        ///< order.
  PSM_SPARSE_TEXT_OSD = 12, ///< Sparse text with orientation and script det.
  PSM_RAW_LINE = 13, ///< Treat the image as a single text line, bypassing
                     ///< hacks that are Tesseract-specific.

  PSM_COUNT ///< Number of enum entries.
};

/**
 * Inline functions that act on a PageSegMode to determine whether components of
 * layout analysis are enabled.
 * *Depend critically on the order of elements of PageSegMode.*
 * NOTE that arg is an int for compatibility with INT_PARAM.
 */

/** True for PSM_OSD_ONLY, PSM_AUTO_OSD and PSM_SPARSE_TEXT_OSD. */
inline bool PSM_OSD_ENABLED(int pageseg_mode) {
  return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
}

/** True for every mode up to and including PSM_AUTO, plus PSM_SPARSE_TEXT_OSD. */
inline bool PSM_ORIENTATION_ENABLED(int pageseg_mode) {
  return pageseg_mode <= PSM_AUTO || pageseg_mode == PSM_SPARSE_TEXT_OSD;
}

/** True for the modes in [PSM_AUTO_OSD, PSM_AUTO]. */
inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
}

/** True for the two sparse-text modes. */
inline bool PSM_SPARSE(int pageseg_mode) {
  return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
}

/** True for the modes in [PSM_AUTO_OSD, PSM_SINGLE_COLUMN]. */
inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
}

/** True for the modes in [PSM_AUTO_OSD, PSM_SINGLE_BLOCK]. */
inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
}

/** True for the modes in [PSM_AUTO_OSD, PSM_SINGLE_LINE] and both sparse modes. */
inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
  return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
         pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
}
|
||||
|
||||
/**
|
||||
* enum of the elements of the page hierarchy, used in ResultIterator
|
||||
* to provide functions that operate on each level without having to
|
||||
* have 5x as many functions.
|
||||
*/
|
||||
// Levels of the page hierarchy, listed from coarsest to finest.
enum PageIteratorLevel {
  RIL_BLOCK,    // Block of text/image/separator line.
  RIL_PARA,     // Paragraph within a block.
  RIL_TEXTLINE, // Line within a paragraph.
  RIL_WORD,     // Word within a textline.
  RIL_SYMBOL    // Symbol/character within a word.
};
|
||||
|
||||
/**
|
||||
* JUSTIFICATION_UNKNOWN
|
||||
* The alignment is not clearly one of the other options. This could happen
|
||||
* for example if there are only one or two lines of text or the text looks
|
||||
* like source code or poetry.
|
||||
*
|
||||
* NOTA BENE: Fully justified paragraphs (text aligned to both left and right
|
||||
* margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
|
||||
* is written with a left-to-right script and with JUSTIFICATION_RIGHT if
|
||||
* their text is written in a right-to-left script.
|
||||
*
|
||||
* Interpretation for text read in vertical lines:
|
||||
* "Left" is wherever the starting reading position is.
|
||||
*
|
||||
* JUSTIFICATION_LEFT
|
||||
* Each line, except possibly the first, is flush to the same left tab stop.
|
||||
*
|
||||
* JUSTIFICATION_CENTER
|
||||
* The text lines of the paragraph are centered about a line going
|
||||
* down through their middle of the text lines.
|
||||
*
|
||||
* JUSTIFICATION_RIGHT
|
||||
* Each line, except possibly the first, is flush to the same right tab stop.
|
||||
*/
|
||||
// Paragraph alignment classes; JUSTIFICATION_UNKNOWN is the zero value.
enum ParagraphJustification {
  JUSTIFICATION_UNKNOWN,
  JUSTIFICATION_LEFT,
  JUSTIFICATION_CENTER,
  JUSTIFICATION_RIGHT,
};
|
||||
|
||||
/**
|
||||
* When Tesseract/Cube is initialized we can choose to instantiate/load/run
|
||||
* only the Tesseract part, only the Cube part or both along with the combiner.
|
||||
* The preference of which engine to use is stored in tessedit_ocr_engine_mode.
|
||||
*
|
||||
* ATTENTION: When modifying this enum, please make sure to make the
|
||||
* appropriate changes to all the enums mirroring it (e.g. OCREngine in
|
||||
* cityblock/workflow/detection/detection_storage.proto). Such enums will
|
||||
* mention the connection to OcrEngineMode in the comments.
|
||||
*/
|
||||
enum OcrEngineMode {
  OEM_TESSERACT_ONLY,          // Run Tesseract only - fastest; deprecated
  OEM_LSTM_ONLY,               // Run just the LSTM line recognizer.
  OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback
                               // to Tesseract when things get difficult.
                               // deprecated
  OEM_DEFAULT,                 // Specify this mode when calling init_*(),
                               // to indicate that any of the above modes
                               // should be automatically inferred from the
                               // variables in the language-specific config,
                               // command-line configs, or if not specified
                               // in any of the above should be set to the
                               // default OEM_TESSERACT_ONLY.
  OEM_COUNT                    // Number of OEMs
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_
|
310
3rdparty/tesseract_ocr/tesseract/include/tesseract/renderer.h
vendored
Normal file
310
3rdparty/tesseract_ocr/tesseract/include/tesseract/renderer.h
vendored
Normal file
|
@ -0,0 +1,310 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: renderer.h
|
||||
// Description: Rendering interface to inject into TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_API_RENDERER_H_
|
||||
#define TESSERACT_API_RENDERER_H_
|
||||
|
||||
#include "export.h"
|
||||
|
||||
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
|
||||
// complexity of includes here. Use forward declarations wherever possible
|
||||
// and hide includes of complex types in baseapi.cpp.
|
||||
#include <string> // for std::string
|
||||
#include <vector> // for std::vector
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class TessBaseAPI;
|
||||
|
||||
/**
|
||||
* Interface for rendering tesseract results into a document, such as text,
|
||||
* HOCR or pdf. This class is abstract. Specific classes handle individual
|
||||
* formats. This interface is then used to inject the renderer class into
|
||||
* tesseract when processing images.
|
||||
*
|
||||
* For simplicity implementing this with tesseract version 3.01,
|
||||
* the renderer contains document state that is cleared from document
|
||||
* to document just as the TessBaseAPI is. This way the base API can just
|
||||
* delegate its rendering functionality to injected renderers, and the
|
||||
* renderers can manage the associated state needed for the specific formats
|
||||
* in addition to the heuristics for producing it.
|
||||
*/
|
||||
class TESS_API TessResultRenderer {
|
||||
public:
|
||||
virtual ~TessResultRenderer();
|
||||
|
||||
// Takes ownership of pointer so must be new'd instance.
|
||||
// Renderers aren't ordered, but appends the sequences of next parameter
|
||||
// and existing next(). The renderers should be unique across both lists.
|
||||
void insert(TessResultRenderer *next);
|
||||
|
||||
// Returns the next renderer or nullptr.
|
||||
TessResultRenderer *next() {
|
||||
return next_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Starts a new document with the given title.
|
||||
* This clears the contents of the output data.
|
||||
* Title should use UTF-8 encoding.
|
||||
*/
|
||||
bool BeginDocument(const char *title);
|
||||
|
||||
/**
|
||||
* Adds the recognized text from the source image to the current document.
|
||||
* Invalid if BeginDocument not yet called.
|
||||
*
|
||||
* Note that this API is a bit weird but is designed to fit into the
|
||||
* current TessBaseAPI implementation where the api has lots of state
|
||||
* information that we might want to add in.
|
||||
*/
|
||||
bool AddImage(TessBaseAPI *api);
|
||||
|
||||
/**
|
||||
* Finishes the document and finalizes the output data
|
||||
* Invalid if BeginDocument not yet called.
|
||||
*/
|
||||
bool EndDocument();
|
||||
|
||||
const char *file_extension() const {
|
||||
return file_extension_;
|
||||
}
|
||||
const char *title() const {
|
||||
return title_.c_str();
|
||||
}
|
||||
|
||||
// Is everything fine? Otherwise something went wrong.
|
||||
bool happy() const {
|
||||
return happy_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the last image given to AddImage
|
||||
* (i.e. images are incremented whether the image succeeded or not)
|
||||
*
|
||||
* This is always defined. It means either the number of the
|
||||
* current image, the last image ended, or in the completed document
|
||||
* depending on when in the document lifecycle you are looking at it.
|
||||
* Will return -1 if a document was never started.
|
||||
*/
|
||||
int imagenum() const {
|
||||
return imagenum_;
|
||||
}
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Called by concrete classes.
|
||||
*
|
||||
* outputbase is the name of the output file excluding
|
||||
* extension. For example, "/path/to/chocolate-chip-cookie-recipe"
|
||||
*
|
||||
* extension indicates the file extension to be used for output
|
||||
* files. For example "pdf" will produce a .pdf file, and "hocr"
|
||||
* will produce .hocr files.
|
||||
*/
|
||||
TessResultRenderer(const char *outputbase, const char *extension);
|
||||
|
||||
// Hook for specialized handling in BeginDocument()
|
||||
virtual bool BeginDocumentHandler();
|
||||
|
||||
// This must be overridden to render the OCR'd results
|
||||
virtual bool AddImageHandler(TessBaseAPI *api) = 0;
|
||||
|
||||
// Hook for specialized handling in EndDocument()
|
||||
virtual bool EndDocumentHandler();
|
||||
|
||||
// Renderers can call this to append '\0' terminated strings into
|
||||
// the output string returned by GetOutput.
|
||||
// This method will grow the output buffer if needed.
|
||||
void AppendString(const char *s);
|
||||
|
||||
// Renderers can call this to append binary byte sequences into
|
||||
// the output string returned by GetOutput. Note that s is not necessarily
|
||||
// '\0' terminated (and can contain '\0' within it).
|
||||
// This method will grow the output buffer if needed.
|
||||
void AppendData(const char *s, int len);
|
||||
|
||||
private:
|
||||
const char *file_extension_; // standard extension for generated output
|
||||
std::string title_; // title of document being rendered
|
||||
int imagenum_; // index of last image added
|
||||
|
||||
FILE *fout_; // output file pointer
|
||||
TessResultRenderer *next_; // Can link multiple renderers together
|
||||
bool happy_; // I get grumpy when the disk fills up, etc.
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string
|
||||
*/
|
||||
class TESS_API TessTextRenderer : public TessResultRenderer {
public:
  // outputbase: presumably the output file name without extension, as
  // described on TessResultRenderer's constructor - confirm in the .cpp.
  explicit TessTextRenderer(const char *outputbase);

protected:
  // Implements the pure-virtual per-image hook of TessResultRenderer.
  bool AddImageHandler(TessBaseAPI *api) override;
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into an hocr text string
|
||||
*/
|
||||
class TESS_API TessHOcrRenderer : public TessResultRenderer {
public:
  // font_info selects whether font information is printed (see font_info_).
  explicit TessHOcrRenderer(const char *outputbase, bool font_info);
  explicit TessHOcrRenderer(const char *outputbase);

protected:
  // Overrides of the three TessResultRenderer document-lifecycle hooks.
  bool BeginDocumentHandler() override;
  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;

private:
  bool font_info_; // whether to print font information
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into an alto text string
|
||||
*/
|
||||
class TESS_API TessAltoRenderer : public TessResultRenderer {
public:
  explicit TessAltoRenderer(const char *outputbase);

protected:
  // Overrides of the three TessResultRenderer document-lifecycle hooks.
  bool BeginDocumentHandler() override;
  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;
};
|
||||
|
||||
/**
|
||||
* Renders Tesseract output into a TSV string
|
||||
*/
|
||||
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
  // font_info selects whether font information is printed (see font_info_).
  explicit TessTsvRenderer(const char *outputbase, bool font_info);
  explicit TessTsvRenderer(const char *outputbase);

protected:
  // Overrides of the three TessResultRenderer document-lifecycle hooks.
  bool BeginDocumentHandler() override;
  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;

private:
  bool font_info_; // whether to print font information
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into searchable PDF
|
||||
*/
|
||||
class TESS_API TessPDFRenderer : public TessResultRenderer {
public:
  // datadir is the location of the TESSDATA. We need it because
  // we load a custom PDF font from this location.
  // textonly defaults to false; when set, images are skipped (see textonly_).
  TessPDFRenderer(const char *outputbase, const char *datadir,
                  bool textonly = false);

protected:
  // Overrides of the three TessResultRenderer document-lifecycle hooks.
  bool BeginDocumentHandler() override;
  bool AddImageHandler(TessBaseAPI *api) override;
  bool EndDocumentHandler() override;

private:
  // We don't want to have every image in memory at once,
  // so we store some metadata as we go along producing
  // PDFs one page at a time. At the end, that metadata is
  // used to make everything that isn't easily handled in a
  // streaming fashion.
  long int obj_;                  // counter for PDF objects
  std::vector<long int> offsets_; // offset of every PDF object in bytes
  std::vector<long int> pages_;   // object number for every /Page object
  std::string datadir_;           // where to find the custom font
  bool textonly_;                 // skip images if set

  // Bookkeeping only. DIY = Do It Yourself.
  void AppendPDFObjectDIY(size_t objectsize);

  // Bookkeeping + emit data.
  void AppendPDFObject(const char *data);

  // Create the /Contents object for an entire page.
  // NOTE(review): returns a raw char*; ownership of the buffer is not
  // visible from this header - confirm who frees it.
  char *GetPDFTextObjects(TessBaseAPI *api, double width, double height);

  // Turn an image into a PDF object. Only transcode if we have to.
  static bool imageToPDFObj(Pix *pix, const char *filename, long int objnum,
                            char **pdf_object, long int *pdf_object_size,
                            int jpg_quality);
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string
|
||||
*/
|
||||
class TESS_API TessUnlvRenderer : public TessResultRenderer {
public:
  explicit TessUnlvRenderer(const char *outputbase);

protected:
  // Implements the pure-virtual per-image hook of TessResultRenderer.
  bool AddImageHandler(TessBaseAPI *api) override;
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string for LSTMBox
|
||||
*/
|
||||
class TESS_API TessLSTMBoxRenderer : public TessResultRenderer {
public:
  explicit TessLSTMBoxRenderer(const char *outputbase);

protected:
  // Implements the pure-virtual per-image hook of TessResultRenderer.
  bool AddImageHandler(TessBaseAPI *api) override;
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string
|
||||
*/
|
||||
class TESS_API TessBoxTextRenderer : public TessResultRenderer {
public:
  explicit TessBoxTextRenderer(const char *outputbase);

protected:
  // Implements the pure-virtual per-image hook of TessResultRenderer.
  bool AddImageHandler(TessBaseAPI *api) override;
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string in WordStr format
|
||||
*/
|
||||
class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
public:
  explicit TessWordStrBoxRenderer(const char *outputbase);

protected:
  // Implements the pure-virtual per-image hook of TessResultRenderer.
  bool AddImageHandler(TessBaseAPI *api) override;
};
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/**
|
||||
* Renders tesseract output into an osd text string
|
||||
*/
|
||||
class TESS_API TessOsdRenderer : public TessResultRenderer {
public:
  explicit TessOsdRenderer(const char *outputbase);

protected:
  // Implements the pure-virtual per-image hook of TessResultRenderer.
  bool AddImageHandler(TessBaseAPI *api) override;
};
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_API_RENDERER_H_
|
252
3rdparty/tesseract_ocr/tesseract/include/tesseract/resultiterator.h
vendored
Normal file
252
3rdparty/tesseract_ocr/tesseract/include/tesseract/resultiterator.h
vendored
Normal file
|
@ -0,0 +1,252 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: resultiterator.h
|
||||
// Description: Iterator for tesseract results that is capable of
|
||||
// iterating in proper reading order over Bi Directional
|
||||
// (e.g. mixed Hebrew and English) text.
|
||||
// Author: David Eger
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
|
||||
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
|
||||
|
||||
#include "export.h" // for TESS_API, TESS_LOCAL
|
||||
#include "ltrresultiterator.h" // for LTRResultIterator
|
||||
#include "publictypes.h" // for PageIteratorLevel
|
||||
#include "unichar.h" // for StrongScriptDirection
|
||||
|
||||
#include <set> // for std::pair
|
||||
#include <vector> // for std::vector
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
 * Iterator over recognition results that follows logical reading order,
 * including bi-directional (e.g. mixed Hebrew and English) text.
 */
class TESS_API ResultIterator : public LTRResultIterator {
public:
  /**
   * Builds a ResultIterator from an LTRResultIterator.
   * NOTE(review): judging by the name and by the constructor comment further
   * down, the result is positioned at the start of resit's paragraph --
   * confirm against the implementation.
   */
  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);

  /**
   * The compiler-generated copy constructor is sufficient, so a
   * ResultIterator may be copied freely.
   */
  ~ResultIterator() override = default;

  // ------------------- Moving around within the page -------------------

  /** Rewinds the iterator to the start of the page to begin an iteration. */
  void Begin() override;

  /**
   * Advances, in the appropriate reading order, to the start of the next
   * object at the given level of the page hierarchy.
   *
   * RIL_SYMBOL skips non-text blocks; every other level visits each non-text
   * block once (think of such a block as one paragraph holding one line
   * holding one imaginary word). Calls with different levels may be mixed
   * freely. Words of right-to-left scripts are iterated correctly provided
   * the appropriate language has been loaded into Tesseract.
   *
   * @return false if the end of the page was reached.
   */
  bool Next(PageIteratorLevel level) override;

  /**
   * Tells whether the iterator is at the logical (reading-order) beginning
   * of the given level, as opposed to the left-to-right, top-to-bottom
   * beginning. Apart from that it behaves like
   * PageIterator::IsAtBeginningOf(); see pageiterator.h for the details.
   */
  bool IsAtBeginningOf(PageIteratorLevel level) const override;

  /**
   * BiDi-aware version of PageIterator::IsAtFinalElement. For example,
   * IsAtFinalElement(RIL_PARA, RIL_WORD) tells whether the iterator points
   * at the last word of a paragraph. See PageIterator for the full contract.
   */
  bool IsAtFinalElement(PageIteratorLevel level,
                        PageIteratorLevel element) const override;

  // ------------------- Functions that refer to words only --------------

  /** Number of blanks in front of the current word. */
  int BlanksBeforeWord() const;

  // ------------------- Accessing data ----------------------------------

  /**
   * Text of the current object at the given level as a null terminated,
   * UTF-8 encoded string. The caller must release it with delete [].
   */
  virtual char *GetUTF8Text(PageIteratorLevel level) const;

  /** LSTM choices for every LSTM timestep of the current word. */
  virtual std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
      *GetRawLSTMTimesteps() const;
  virtual std::vector<std::vector<std::pair<const char *, float>>>
      *GetBestLSTMSymbolChoices() const;

  /**
   * True when the dominant reading direction of the current paragraph is
   * left-to-right, false when it is right-to-left.
   */
  bool ParagraphIsLtr() const;

  // ------------------- Exposed only for testing ------------------------

  /**
   * Produces the reading order of a set of words (given left-to-right) as a
   * sequence of word indices interleaved with optional meta-marks.
   * The meta-marks are negative values:
   *   kMinorRunStart  Start of minor direction text.
   *   kMinorRunEnd    End of minor direction text.
   *   kComplexWord    The next indexed word contains both left-to-right and
   *                   right-to-left characters and was treated as neutral.
   *
   * With five words indexed [0,1,2,3,4] from the left edge of the line, all
   * of the following are plausible values of reading_order:
   *
   *   Left-to-Right (in ltr paragraph):
   *     { 0, 1, 2, 3, 4 }
   *   Left-to-Right (in rtl paragraph):
   *     { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
   *   Right-to-Left (in rtl paragraph):
   *     { 4, 3, 2, 1, 0 }
   *   Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
   *     { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
   */
  static void CalculateTextlineOrder(
      bool paragraph_is_ltr,
      const std::vector<StrongScriptDirection> &word_dirs,
      std::vector<int> *reading_order);

  static const int kMinorRunStart;
  static const int kMinorRunEnd;
  static const int kComplexWord;

protected:
  /**
   * The data behind the given iterator is presumed to outlive this object.
   * NB: not public because it does something non-obvious: it resets to the
   * beginning of the paragraph instead of staying wherever resit pointed.
   */
  explicit ResultIterator(const LTRResultIterator &resit);

private:
  /**
   * Computes the dominant writing direction of the current paragraph.
   * Members should normally read current_paragraph_is_ltr_ instead.
   */
  bool CurrentParagraphIsLtr() const;

  /**
   * Computes the reading order of the words of a text line, given an
   * iterator into the middle of that line. Word indices are measured from
   * resit->RestartRow() = index 0. Besides non-negative word indices, these
   * negative values may be inserted:
   *   kMinorRunStart  Start of minor direction text.
   *   kMinorRunEnd    End of minor direction text.
   *   kComplexWord    The previous word contains both left-to-right and
   *                   right-to-left characters and was treated as neutral.
   */
  void CalculateTextlineOrder(bool paragraph_is_ltr,
                              const LTRResultIterator &resit,
                              std::vector<int> *indices) const;
  /** As above, additionally filling the caller's ssd when ssd != nullptr. */
  void CalculateTextlineOrder(bool paragraph_is_ltr,
                              const LTRResultIterator &resit,
                              std::vector<StrongScriptDirection> *ssd,
                              std::vector<int> *indices) const;

  /**
   * Index of the current word when the row is read strictly left-to-right.
   */
  int LTRWordIndex() const;

  /**
   * For an iterator pointing at a word, yields the logical reading order of
   * the blob indices of that word.
   */
  void CalculateBlobOrder(std::vector<int> *blob_indices) const;

  /** Precondition: current_paragraph_is_ltr_ is set. */
  void MoveToLogicalStartOfTextline();

  /**
   * Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set.
   */
  void MoveToLogicalStartOfWord();

  /** Is this the final symbol of the word, in reading order? */
  bool IsAtFinalSymbolOfWord() const;

  /** Is this the first symbol of the word, in reading order? */
  bool IsAtFirstSymbolOfWord() const;

  /**
   * Appends the extra marks that belong after this word when it is printed;
   * these are mostly Unicode BiDi control characters.
   */
  void AppendSuffixMarks(std::string *text) const;

  /** Appends the current word, in reading order, to the given buffer. */
  void AppendUTF8WordText(std::string *text) const;

  /**
   * Appends the text of the current text line, *assuming the iterator is
   * positioned at the beginning of that line*, and moves the iterator to the
   * first position past the line. Every text line is terminated by a single
   * newline; a line that ends a paragraph gets a second newline.
   */
  void IterateAndAppendUTF8TextlineText(std::string *text);

  /**
   * Appends the text of the current paragraph, in reading order, to the
   * given buffer. Every text line is terminated by a single newline and the
   * paragraph gets one extra newline at its end.
   */
  void AppendUTF8ParagraphText(std::string *text) const;

  /** Is the bidi_debug flag set to at least min_level? */
  bool BidiDebug(int min_level) const;

  bool current_paragraph_is_ltr_;

  /**
   * Does the current character sit at the beginning of a minor-direction
   * run?
   */
  bool at_beginning_of_minor_run_;

  /** Does the current character belong to a minor-direction sequence? */
  bool in_minor_direction_;

  /**
   * Keep detected inter-word spaces as they are, instead of "compressing"
   * them to a single space character (the default behavior)?
   */
  bool preserve_interword_spaces_;
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
|
177
3rdparty/tesseract_ocr/tesseract/include/tesseract/unichar.h
vendored
Normal file
177
3rdparty/tesseract_ocr/tesseract/include/tesseract/unichar.h
vendored
Normal file
|
@ -0,0 +1,177 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: unichar.h
|
||||
// Description: Unicode character/ligature class.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_UNICHAR_H_
|
||||
#define TESSERACT_CCUTIL_UNICHAR_H_
|
||||
|
||||
#include "export.h"
|
||||
|
||||
#include <memory.h>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Capacity of a UNICHAR's storage, counted in chars. Must be at least 4 and
|
||||
// must not exceed 31 unless the coding of the length is changed as well.
|
||||
#define UNICHAR_LEN 30
|
||||
|
||||
// NOTE: the declarations below are already inside namespace tesseract, so the
// former "TODO(rays) Move these to the tesseract namespace" is obsolete.
|
||||
// A UNICHAR_ID is the unique id of a unichar.
|
||||
using UNICHAR_ID = int;
|
||||
|
||||
// Sentinel id that marks an invalid or uninitialized unichar.
|
||||
static const int INVALID_UNICHAR_ID = -1;
|
||||
// The special unichar string that corresponds to INVALID_UNICHAR_ID.
|
||||
static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
|
||||
|
||||
// Classification of a piece of text by the strongly-directional characters
// it contains.
enum StrongScriptDirection {
|
||||
DIR_NEUTRAL = 0, // Text contains only neutral characters.
|
||||
DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
|
||||
DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
|
||||
DIR_MIX = 3, // Text contains a mixture of left-to-right
|
||||
// and right-to-left characters.
|
||||
};
|
||||
|
||||
// Type used for a single Unicode code point (see UNICHAR::UTF8ToUTF32).
using char32 = signed int;
|
||||
|
||||
// The UNICHAR class holds a single classification result. This may be
|
||||
// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
|
||||
// multiple Unicode characters representing the NFKC expansion of a ligature
|
||||
// such as fi, ffl etc. These are also stored as utf8.
|
||||
class TESS_API UNICHAR {
|
||||
public:
|
||||
UNICHAR() {
|
||||
memset(chars, 0, UNICHAR_LEN);
|
||||
}
|
||||
|
||||
// Construct from a utf8 string. If len<0 then the string is null terminated.
|
||||
// If the string is too long to fit in the UNICHAR then it takes only what
|
||||
// will fit.
|
||||
UNICHAR(const char *utf8_str, int len);
|
||||
|
||||
// Construct from a single UCS4 character.
|
||||
explicit UNICHAR(int unicode);
|
||||
|
||||
// Default copy constructor and operator= are OK.
|
||||
|
||||
// Get the first character as UCS-4.
|
||||
int first_uni() const;
|
||||
|
||||
// Get the length of the UTF8 string.
|
||||
int utf8_len() const {
|
||||
int len = chars[UNICHAR_LEN - 1];
|
||||
return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
|
||||
}
|
||||
|
||||
// Get a UTF8 string, but NOT nullptr terminated.
|
||||
const char *utf8() const {
|
||||
return chars;
|
||||
}
|
||||
|
||||
// Get a terminated UTF8 string: Must delete[] it after use.
|
||||
char *utf8_str() const;
|
||||
|
||||
// Get the number of bytes in the first character of the given utf8 string.
|
||||
static int utf8_step(const char *utf8_str);
|
||||
|
||||
// A class to simplify iterating over and accessing elements of a UTF8
|
||||
// string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
|
||||
// take ownership of the underlying byte array. It also does not permit
|
||||
// modification of the array (as the name suggests).
|
||||
//
|
||||
// Example:
|
||||
// for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
|
||||
// it != UNICHAR::end(str, len);
|
||||
// ++it) {
|
||||
// tprintf("UCS-4 symbol code = %d\n", *it);
|
||||
// char buf[5];
|
||||
// int char_len = it.get_utf8(buf); buf[char_len] = '\0';
|
||||
// tprintf("Char = %s\n", buf);
|
||||
// }
|
||||
class TESS_API const_iterator {
|
||||
using CI = const_iterator;
|
||||
|
||||
public:
|
||||
// Step to the next UTF8 character.
|
||||
// If the current position is at an illegal UTF8 character, then print an
|
||||
// error message and step by one byte. If the current position is at a
|
||||
// nullptr value, don't step past it.
|
||||
const_iterator &operator++();
|
||||
|
||||
// Return the UCS-4 value at the current position.
|
||||
// If the current position is at an illegal UTF8 value, return a single
|
||||
// space character.
|
||||
int operator*() const;
|
||||
|
||||
// Store the UTF-8 encoding of the current codepoint into buf, which must be
|
||||
// at least 4 bytes long. Return the number of bytes written.
|
||||
// If the current position is at an illegal UTF8 value, writes a single
|
||||
// space character and returns 1.
|
||||
// Note that this method does not null-terminate the buffer.
|
||||
int get_utf8(char *buf) const;
|
||||
// Returns the number of bytes of the current codepoint. Returns 1 if the
|
||||
// current position is at an illegal UTF8 value.
|
||||
int utf8_len() const;
|
||||
// Returns true if the UTF-8 encoding at the current position is legal.
|
||||
bool is_legal() const;
|
||||
|
||||
// Return the pointer into the string at the current position.
|
||||
const char *utf8_data() const {
|
||||
return it_;
|
||||
}
|
||||
|
||||
// Iterator equality operators.
|
||||
friend bool operator==(const CI &lhs, const CI &rhs) {
|
||||
return lhs.it_ == rhs.it_;
|
||||
}
|
||||
friend bool operator!=(const CI &lhs, const CI &rhs) {
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
|
||||
private:
|
||||
friend class UNICHAR;
|
||||
explicit const_iterator(const char *it) : it_(it) {}
|
||||
|
||||
const char *it_; // Pointer into the string.
|
||||
};
|
||||
|
||||
// Create a start/end iterator pointing to a string. Note that these methods
|
||||
// are static and do NOT create a copy or take ownership of the underlying
|
||||
// array.
|
||||
static const_iterator begin(const char *utf8_str, int byte_length);
|
||||
static const_iterator end(const char *utf8_str, int byte_length);
|
||||
|
||||
// Converts a utf-8 string to a vector of unicodes.
|
||||
// Returns an empty vector if the input contains invalid UTF-8.
|
||||
static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
|
||||
// Converts a vector of unicodes to a utf8 string.
|
||||
// Returns an empty string if the input contains an invalid unicode.
|
||||
static std::string UTF32ToUTF8(const std::vector<char32> &str32);
|
||||
|
||||
private:
|
||||
// A UTF-8 representation of 1 or more Unicode characters.
|
||||
// The last element (chars[UNICHAR_LEN - 1]) is a length if
|
||||
// its value < UNICHAR_LEN, otherwise it is a genuine character.
|
||||
char chars[UNICHAR_LEN]{};
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_UNICHAR_H_
|
36
3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h
vendored
Normal file
36
3rdparty/tesseract_ocr/tesseract/include/tesseract/version.h
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: version.h
|
||||
// Description: Version information
|
||||
//
|
||||
// (C) Copyright 2018, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_API_VERSION_H_
|
||||
#define TESSERACT_API_VERSION_H_
|
||||
|
||||
// clang-format off
|
||||
|
||||
// Individual components of the library version.
#define TESSERACT_MAJOR_VERSION 5
|
||||
#define TESSERACT_MINOR_VERSION 0
|
||||
#define TESSERACT_MICRO_VERSION 0
|
||||
|
||||
// The three components packed into a single integer: major shifted left by
// 16 bits, minor shifted left by 8 bits, micro in the lowest bits. The fields
// only stay separate while minor and micro are below 256.
#define TESSERACT_VERSION \
|
||||
(TESSERACT_MAJOR_VERSION << 16 | \
|
||||
TESSERACT_MINOR_VERSION << 8 | \
|
||||
TESSERACT_MICRO_VERSION)
|
||||
|
||||
// Human-readable version string.
#define TESSERACT_VERSION_STR "5.0.0-alpha-20210401-98-g176d"
|
||||
|
||||
// clang-format on
|
||||
|
||||
#endif // TESSERACT_API_VERSION_H_
|
245
3rdparty/tesseract_ocr/tesseract/src/api/altorenderer.cpp
vendored
Normal file
245
3rdparty/tesseract_ocr/tesseract/src/api/altorenderer.cpp
vendored
Normal file
|
@ -0,0 +1,245 @@
|
|||
// File: altorenderer.cpp
|
||||
// Description: ALTO rendering interface
|
||||
// Author: Jake Sebright
|
||||
|
||||
// (C) Copyright 2018
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/renderer.h>
|
||||
|
||||
#include <memory>
|
||||
#include <sstream> // for std::stringstream
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/// Add coordinates to specified TextBlock, TextLine or String bounding box.
|
||||
/// Add word confidence if adding to a String bounding box.
|
||||
///
|
||||
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
|
||||
std::stringstream &alto_str) {
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
|
||||
int hpos = left;
|
||||
int vpos = top;
|
||||
int height = bottom - top;
|
||||
int width = right - left;
|
||||
|
||||
alto_str << " HPOS=\"" << hpos << "\"";
|
||||
alto_str << " VPOS=\"" << vpos << "\"";
|
||||
alto_str << " WIDTH=\"" << width << "\"";
|
||||
alto_str << " HEIGHT=\"" << height << "\"";
|
||||
|
||||
if (level == RIL_WORD) {
|
||||
int wc = it->Confidence(RIL_WORD);
|
||||
alto_str << " WC=\"0." << wc << "\"";
|
||||
} else {
|
||||
alto_str << ">";
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the beginning of the document
|
||||
///
|
||||
bool TessAltoRenderer::BeginDocumentHandler() {
|
||||
AppendString(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
|
||||
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
|
||||
"xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
|
||||
"xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
|
||||
"http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
|
||||
"\t<Description>\n"
|
||||
"\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
|
||||
"\t\t<sourceImageInformation>\n"
|
||||
"\t\t\t<fileName>");
|
||||
|
||||
AppendString(title());
|
||||
|
||||
AppendString(
|
||||
"</fileName>\n"
|
||||
"\t\t</sourceImageInformation>\n"
|
||||
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
|
||||
"\t\t\t<ocrProcessingStep>\n"
|
||||
"\t\t\t\t<processingSoftware>\n"
|
||||
"\t\t\t\t\t<softwareName>tesseract ");
|
||||
AppendString(TessBaseAPI::Version());
|
||||
AppendString(
|
||||
"</softwareName>\n"
|
||||
"\t\t\t\t</processingSoftware>\n"
|
||||
"\t\t\t</ocrProcessingStep>\n"
|
||||
"\t\t</OCRProcessing>\n"
|
||||
"\t</Description>\n"
|
||||
"\t<Layout>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the layout of the image
|
||||
///
|
||||
bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
|
||||
const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
|
||||
if (text == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
AppendString(text.get());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the end of the document: closes the <Layout> and
/// <alto> elements. Always returns true.
|
||||
///
|
||||
bool TessAltoRenderer::EndDocumentHandler() {
|
||||
AppendString("\t</Layout>\n</alto>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Constructs an ALTO renderer by handing `outputbase` and the string "xml"
/// (presumably the output file extension) to the TessResultRenderer base.
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
|
||||
: TessResultRenderer(outputbase, "xml") {}
|
||||
|
||||
///
|
||||
/// Make an XML-formatted string with ALTO markup from the internal
|
||||
/// data structures. Convenience overload: forwards to
/// GetAltoText(monitor, page_number) with a null monitor.
|
||||
///
|
||||
char *TessBaseAPI::GetAltoText(int page_number) {
|
||||
return GetAltoText(nullptr, page_number);
|
||||
}
|
||||
|
||||
///
|
||||
/// Make an XML-formatted string with ALTO markup from the internal
|
||||
/// data structures.
|
||||
///
|
||||
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
|
||||
|
||||
if (input_file_.empty()) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
|
||||
int utf8_len =
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
std::stringstream alto_str;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
alto_str.imbue(std::locale::classic());
|
||||
alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
|
||||
<< "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
|
||||
<< " ID=\"page_" << page_number << "\">\n"
|
||||
<< "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
|
||||
<< " WIDTH=\"" << rect_width_ << "\""
|
||||
<< " HEIGHT=\"" << rect_height_ << "\">\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_PARA, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_WORD, alto_str);
|
||||
alto_str << " CONTENT=\"";
|
||||
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
|
||||
int left, top, right, bottom;
|
||||
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
|
||||
do {
|
||||
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
alto_str << HOcrEscape(grapheme.get()).c_str();
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
|
||||
alto_str << "\"/>";
|
||||
|
||||
wcnt++;
|
||||
|
||||
if (last_word_in_line) {
|
||||
alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
|
||||
lcnt++;
|
||||
} else {
|
||||
int hpos = right;
|
||||
int vpos = top;
|
||||
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
int width = left - hpos;
|
||||
alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
|
||||
<< "\"/>\n";
|
||||
}
|
||||
|
||||
if (last_word_in_tblock) {
|
||||
alto_str << "\t\t\t\t\t</TextBlock>\n";
|
||||
tcnt++;
|
||||
}
|
||||
|
||||
if (last_word_in_cblock) {
|
||||
alto_str << "\t\t\t\t</ComposedBlock>\n";
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
|
||||
alto_str << "\t\t\t</PrintSpace>\n"
|
||||
<< "\t\t</Page>\n";
|
||||
const std::string &text = alto_str.str();
|
||||
|
||||
char *result = new char[text.length() + 1];
|
||||
strcpy(result, text.c_str());
|
||||
delete res_it;
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
2419
3rdparty/tesseract_ocr/tesseract/src/api/baseapi.cpp
vendored
Normal file
2419
3rdparty/tesseract_ocr/tesseract/src/api/baseapi.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
689
3rdparty/tesseract_ocr/tesseract/src/api/capi.cpp
vendored
Normal file
689
3rdparty/tesseract_ocr/tesseract/src/api/capi.cpp
vendored
Normal file
|
@ -0,0 +1,689 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: capi.cpp
|
||||
// Description: C-API TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#include <cstring> // for strdup
|
||||
|
||||
// C API: returns the string provided by TessBaseAPI::Version().
const char *TessVersion() {
|
||||
return TessBaseAPI::Version();
|
||||
}
|
||||
|
||||
// Frees `text` with delete[]; it must therefore have been allocated with
// new[] (or be nullptr).
void TessDeleteText(const char *text) {
|
||||
delete[] text;
|
||||
}
|
||||
|
||||
// Frees a nullptr-terminated array of strings: every string is released with
// delete[] and then the array itself. Passing nullptr is a no-op, matching
// the other TessDelete* helpers whose plain delete[] already accepts nullptr.
void TessDeleteTextArray(char **arr) {
  if (arr == nullptr) {
    return; // The scan below would otherwise dereference a null pointer.
  }
  for (char **pos = arr; *pos != nullptr; ++pos) {
    delete[] *pos;
  }
  delete[] arr;
}
|
||||
|
||||
// Frees `arr` with delete[]; it must therefore have been allocated with
// new[] (or be nullptr).
void TessDeleteIntArray(const int *arr) {
|
||||
delete[] arr;
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessTextRenderer for `outputbase` with new; release
// it with TessDeleteResultRenderer.
TessResultRenderer *TessTextRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessTextRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessHOcrRenderer for `outputbase` with new; release
// it with TessDeleteResultRenderer.
TessResultRenderer *TessHOcrRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessHOcrRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Like TessHOcrRendererCreate, but also passes `font_info` (any non-zero
// value becomes true) as the renderer's second constructor argument.
TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, BOOL font_info) {
|
||||
return new tesseract::TessHOcrRenderer(outputbase, font_info != 0);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessAltoRenderer for `outputbase` with new; release
// it with TessDeleteResultRenderer.
TessResultRenderer *TessAltoRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessAltoRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessTsvRenderer for `outputbase` with new; release
// it with TessDeleteResultRenderer.
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessTsvRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessPDFRenderer with new, forwarding `outputbase`,
// `datadir` and `textonly` (any non-zero value becomes true); release it with
// TessDeleteResultRenderer.
TessResultRenderer *TessPDFRendererCreate(const char *outputbase, const char *datadir,
|
||||
BOOL textonly) {
|
||||
return new tesseract::TessPDFRenderer(outputbase, datadir, textonly != 0);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessUnlvRenderer for `outputbase` with new; release
// it with TessDeleteResultRenderer.
TessResultRenderer *TessUnlvRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessUnlvRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessBoxTextRenderer for `outputbase` with new;
// release it with TessDeleteResultRenderer.
TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessBoxTextRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessWordStrBoxRenderer for `outputbase` with new;
// release it with TessDeleteResultRenderer.
TessResultRenderer *TessWordStrBoxRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessWordStrBoxRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Allocates a tesseract::TessLSTMBoxRenderer for `outputbase` with new;
// release it with TessDeleteResultRenderer.
TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessLSTMBoxRenderer(outputbase);
|
||||
}
|
||||
|
||||
// Destroys `renderer` with delete (counterpart of the Tess*RendererCreate
// functions, which allocate with new).
void TessDeleteResultRenderer(TessResultRenderer *renderer) {
|
||||
delete renderer;
|
||||
}
|
||||
|
||||
// Forwards to renderer->insert(next). `renderer` is dereferenced
// unconditionally and must not be null.
void TessResultRendererInsert(TessResultRenderer *renderer, TessResultRenderer *next) {
|
||||
renderer->insert(next);
|
||||
}
|
||||
|
||||
// Returns renderer->next(). `renderer` must not be null.
TessResultRenderer *TessResultRendererNext(TessResultRenderer *renderer) {
|
||||
return renderer->next();
|
||||
}
|
||||
|
||||
// Calls renderer->BeginDocument(title) and returns its result converted to
// int. `renderer` must not be null.
BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, const char *title) {
|
||||
return static_cast<int>(renderer->BeginDocument(title));
|
||||
}
|
||||
|
||||
// Calls renderer->AddImage(api) and returns its result converted to int.
// `renderer` must not be null.
BOOL TessResultRendererAddImage(TessResultRenderer *renderer, TessBaseAPI *api) {
|
||||
return static_cast<int>(renderer->AddImage(api));
|
||||
}
|
||||
|
||||
// Calls renderer->EndDocument() and returns its result converted to int.
// `renderer` must not be null.
BOOL TessResultRendererEndDocument(TessResultRenderer *renderer) {
|
||||
return static_cast<int>(renderer->EndDocument());
|
||||
}
|
||||
|
||||
// Returns renderer->file_extension(). The spelling "Extention" is part of the
// exported function name. `renderer` must not be null.
const char *TessResultRendererExtention(TessResultRenderer *renderer) {
|
||||
return renderer->file_extension();
|
||||
}
|
||||
|
||||
// Returns renderer->title(). `renderer` must not be null.
const char *TessResultRendererTitle(TessResultRenderer *renderer) {
|
||||
return renderer->title();
|
||||
}
|
||||
|
||||
// Returns renderer->imagenum(). `renderer` must not be null.
int TessResultRendererImageNum(TessResultRenderer *renderer) {
|
||||
return renderer->imagenum();
|
||||
}
|
||||
|
||||
// Allocates a TessBaseAPI with new; release it with TessBaseAPIDelete.
TessBaseAPI *TessBaseAPICreate() {
|
||||
return new TessBaseAPI;
|
||||
}
|
||||
|
||||
// Destroys `handle` with delete (counterpart of TessBaseAPICreate).
void TessBaseAPIDelete(TessBaseAPI *handle) {
|
||||
delete handle;
|
||||
}
|
||||
|
||||
// Forwards to TessBaseAPI::getOpenCLDevice(device). The handle parameter is
// unused.
size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI * /*handle*/, void **device) {
|
||||
return TessBaseAPI::getOpenCLDevice(device);
|
||||
}
|
||||
|
||||
// Forwards to handle->SetInputName(name). `handle` must not be null.
void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name) {
|
||||
handle->SetInputName(name);
|
||||
}
|
||||
|
||||
// Returns handle->GetInputName(). `handle` must not be null.
const char *TessBaseAPIGetInputName(TessBaseAPI *handle) {
|
||||
return handle->GetInputName();
|
||||
}
|
||||
|
||||
// Forwards to handle->SetInputImage(pix). `handle` must not be null.
void TessBaseAPISetInputImage(TessBaseAPI *handle, Pix *pix) {
|
||||
handle->SetInputImage(pix);
|
||||
}
|
||||
|
||||
// Returns handle->GetInputImage(). `handle` must not be null.
Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle) {
|
||||
return handle->GetInputImage();
|
||||
}
|
||||
|
||||
// Returns the value reported by handle->GetSourceYResolution().
int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle) {
  const int y_resolution = handle->GetSourceYResolution();
  return y_resolution;
}
|
||||
|
||||
// Returns the string reported by handle->GetDatapath().
const char *TessBaseAPIGetDatapath(TessBaseAPI *handle) {
  const char *datapath = handle->GetDatapath();
  return datapath;
}
|
||||
|
||||
// Thin wrapper: forwards to handle->SetOutputName(name).
void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name) {
  handle->SetOutputName(name);
}
|
||||
|
||||
// Calls handle->SetVariable(name, value) and returns its result cast to int.
BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, const char *value) {
  const auto accepted = handle->SetVariable(name, value);
  return static_cast<int>(accepted);
}
|
||||
|
||||
// Calls handle->SetDebugVariable(name, value) and returns its result cast to int.
BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, const char *value) {
  const auto accepted = handle->SetDebugVariable(name, value);
  return static_cast<int>(accepted);
}
|
||||
|
||||
// Calls handle->GetIntVariable(name, value) and returns its result cast to int.
BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, const char *name, int *value) {
  const auto found = handle->GetIntVariable(name, value);
  return static_cast<int>(found);
}
|
||||
|
||||
// Looks up a bool variable through handle->GetBoolVariable(name, ...).
//
// The lookup writes into a local `bool` because the caller's output is a
// BOOL (int); on success the local is converted and stored into *value.
// *value is left untouched when the lookup fails. A null `value` is tolerated
// (the store is skipped), matching the null-tolerant out-parameters of the
// other wrappers in this file.
//
// Returns the lookup result cast to int.
BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, const char *name, BOOL *value) {
  bool boolValue = false;
  const bool result = handle->GetBoolVariable(name, &boolValue);
  if (result && value != nullptr) {
    *value = static_cast<int>(boolValue);
  }
  return static_cast<int>(result);
}
|
||||
|
||||
// Calls handle->GetDoubleVariable(name, value) and returns its result cast to int.
BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, const char *name, double *value) {
  const auto found = handle->GetDoubleVariable(name, value);
  return static_cast<int>(found);
}
|
||||
|
||||
const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, const char *name) {
|
||||
return handle->GetStringVariable(name);
|
||||
}
|
||||
|
||||
// Thin wrapper: forwards to handle->PrintVariables(fp).
void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp) {
  handle->PrintVariables(fp);
}
|
||||
|
||||
// Opens `filename` for writing ("w"), passes the stream to
// handle->PrintVariables and closes it again.
// Returns FALSE when the file cannot be opened, TRUE otherwise.
BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, const char *filename) {
  FILE *out = fopen(filename, "w");
  if (out == nullptr) {
    return FALSE;
  }
  handle->PrintVariables(out);
  fclose(out);
  return TRUE;
}
|
||||
|
||||
// Initialises `handle` via TessBaseAPI::Init, converting the C-style
// variable arrays into std::vector<std::string>.
//
// vars_vec / vars_values: parallel arrays of vars_vec_size C strings. They are
//   copied only when BOTH pointers are non-null; otherwise two empty vectors
//   are passed on.
// set_only_non_debug_params: any non-zero value is passed on as `true`.
//
// Returns whatever handle->Init(...) returns.
int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode mode, char **configs, int configs_size, char **vars_vec,
                     char **vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params) {
  std::vector<std::string> varNames;
  std::vector<std::string> varValues;
  if (vars_vec != nullptr && vars_values != nullptr) {
    // The final size is known up front, so allocate once instead of growing.
    varNames.reserve(vars_vec_size);
    varValues.reserve(vars_vec_size);
    for (size_t i = 0; i < vars_vec_size; i++) {
      varNames.emplace_back(vars_vec[i]);
      varValues.emplace_back(vars_values[i]);
    }
  }

  return handle->Init(datapath, language, mode, configs, configs_size, &varNames, &varValues,
                      set_only_non_debug_params != 0);
}
|
||||
|
||||
// Initialises `handle` with config files only: calls handle->Init with null
// variable vectors and `false` as the final argument, returning its result.
int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode oem, char **configs, int configs_size) {
  const int status =
      handle->Init(datapath, language, oem, configs, configs_size, nullptr, nullptr, false);
  return status;
}
|
||||
|
||||
// Returns the result of handle->Init(datapath, language, oem).
int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode oem) {
  const int status = handle->Init(datapath, language, oem);
  return status;
}
|
||||
|
||||
// Returns the result of handle->Init(datapath, language).
int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, const char *language) {
  const int status = handle->Init(datapath, language);
  return status;
}
|
||||
|
||||
const char *TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI *handle) {
|
||||
return handle->GetInitLanguagesAsString();
|
||||
}
|
||||
|
||||
// Returns the languages reported by handle->GetLoadedLanguagesAsVector() as a
// nullptr-terminated array of C strings.
//
// The array itself is allocated with new[]; each entry is a strdup() copy.
// NOTE(review): strdup() memory belongs to the malloc family while the array
// comes from new[] - confirm the routine callers use to release this result
// frees each piece with the matching deallocator.
char **TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI *handle) {
  std::vector<std::string> languages;
  handle->GetLoadedLanguagesAsVector(&languages);
  const size_t count = languages.size();
  char **arr = new char *[count + 1];
  for (size_t i = 0; i < count; ++i) {
    arr[i] = strdup(languages[i].c_str());
  }
  arr[count] = nullptr; // terminating sentinel
  return arr;
}
|
||||
|
||||
// Returns the languages reported by handle->GetAvailableLanguagesAsVector() as
// a nullptr-terminated array of C strings.
//
// The array itself is allocated with new[]; each entry is a strdup() copy.
// NOTE(review): strdup() memory belongs to the malloc family while the array
// comes from new[] - confirm the routine callers use to release this result
// frees each piece with the matching deallocator.
char **TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI *handle) {
  std::vector<std::string> languages;
  handle->GetAvailableLanguagesAsVector(&languages);
  const size_t count = languages.size();
  char **arr = new char *[count + 1];
  for (size_t i = 0; i < count; ++i) {
    arr[i] = strdup(languages[i].c_str());
  }
  arr[count] = nullptr; // terminating sentinel
  return arr;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
// Returns the result of handle->InitLangMod(datapath, language).
int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath, const char *language) {
  const int status = handle->InitLangMod(datapath, language);
  return status;
}
|
||||
#endif
|
||||
|
||||
// Thin wrapper: forwards to handle->InitForAnalysePage().
void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle) {
  handle->InitForAnalysePage();
}
|
||||
|
||||
// Thin wrapper: forwards to handle->ReadConfigFile(filename).
void TessBaseAPIReadConfigFile(TessBaseAPI *handle, const char *filename) {
  handle->ReadConfigFile(filename);
}
|
||||
|
||||
// Thin wrapper: forwards to handle->ReadDebugConfigFile(filename).
void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, const char *filename) {
  handle->ReadDebugConfigFile(filename);
}
|
||||
|
||||
// Thin wrapper: forwards to handle->SetPageSegMode(mode).
void TessBaseAPISetPageSegMode(TessBaseAPI *handle, TessPageSegMode mode) {
  handle->SetPageSegMode(mode);
}
|
||||
|
||||
// Returns the mode reported by handle->GetPageSegMode().
TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle) {
  const TessPageSegMode mode = handle->GetPageSegMode();
  return mode;
}
|
||||
|
||||
// Returns the result of handle->TesseractRect(...) called with the image
// buffer description and rectangle passed in, in the same argument order.
char *TessBaseAPIRect(TessBaseAPI *handle, const unsigned char *imagedata, int bytes_per_pixel,
                      int bytes_per_line, int left, int top, int width, int height) {
  char *text =
      handle->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width, height);
  return text;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
// Thin wrapper: forwards to handle->ClearAdaptiveClassifier().
void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle) {
  handle->ClearAdaptiveClassifier();
}
|
||||
#endif
|
||||
|
||||
// Thin wrapper: forwards the raw image buffer description to
// handle->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line).
void TessBaseAPISetImage(TessBaseAPI *handle, const unsigned char *imagedata, int width, int height,
                         int bytes_per_pixel, int bytes_per_line) {
  handle->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
}
|
||||
|
||||
// Thin wrapper: forwards to the Pix overload, handle->SetImage(pix).
void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix) {
  handle->SetImage(pix);
}
|
||||
|
||||
// Thin wrapper: forwards to handle->SetSourceResolution(ppi).
void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi) {
  handle->SetSourceResolution(ppi);
}
|
||||
|
||||
// Thin wrapper: forwards to handle->SetRectangle(left, top, width, height).
void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, int width, int height) {
  handle->SetRectangle(left, top, width, height);
}
|
||||
|
||||
// Returns the pointer reported by handle->GetThresholdedImage().
struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle) {
  struct Pix *thresholded = handle->GetThresholdedImage();
  return thresholded;
}
|
||||
|
||||
// Calls the static TessBaseAPI::ClearPersistentCache(); the handle argument is
// accepted but not used.
void TessBaseAPIClearPersistentCache(TessBaseAPI * /*handle*/) {
  TessBaseAPI::ClearPersistentCache();
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Calls handle->DetectOrientationScript with the four output pointers and
// returns its result cast to BOOL.
BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, int *orient_deg, float *orient_conf,
                                        const char **script_name, float *script_conf) {
  return static_cast<BOOL>(
      handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf));
}
|
||||
|
||||
#endif
|
||||
|
||||
// Returns the result of handle->GetRegions(pixa).
struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, struct Pixa **pixa) {
  struct Boxa *boxes = handle->GetRegions(pixa);
  return boxes;
}
|
||||
|
||||
// Returns the result of handle->GetTextlines(pixa, blockids).
struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
  struct Boxa *boxes = handle->GetTextlines(pixa, blockids);
  return boxes;
}
|
||||
|
||||
// Returns the result of the five-argument handle->GetTextlines overload.
// raw_image is a BOOL; any non-zero value is passed on as `true`.
struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, const BOOL raw_image,
                                      const int raw_padding, struct Pixa **pixa, int **blockids,
                                      int **paraids) {
  const bool use_raw_image = raw_image != 0;
  return handle->GetTextlines(use_raw_image, raw_padding, pixa, blockids, paraids);
}
|
||||
|
||||
// Returns the result of handle->GetStrips(pixa, blockids).
struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
  struct Boxa *boxes = handle->GetStrips(pixa, blockids);
  return boxes;
}
|
||||
|
||||
// Returns the result of handle->GetWords(pixa).
struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, struct Pixa **pixa) {
  struct Boxa *boxes = handle->GetWords(pixa);
  return boxes;
}
|
||||
|
||||
// Returns the result of handle->GetConnectedComponents(cc).
struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, struct Pixa **cc) {
  struct Boxa *boxes = handle->GetConnectedComponents(cc);
  return boxes;
}
|
||||
|
||||
// Returns the result of the four-argument handle->GetComponentImages overload.
// text_only is a BOOL and is converted to bool before being passed on.
struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, TessPageIteratorLevel level,
                                           BOOL text_only, struct Pixa **pixa, int **blockids) {
  const bool only_text = static_cast<bool>(text_only);
  return handle->GetComponentImages(level, only_text, pixa, blockids);
}
|
||||
|
||||
// Returns the result of the seven-argument handle->GetComponentImages overload.
// The BOOL flags text_only and raw_image are converted to bool (non-zero means
// true) before being passed on.
struct Boxa *TessBaseAPIGetComponentImages1(TessBaseAPI *handle, const TessPageIteratorLevel level,
                                            const BOOL text_only, const BOOL raw_image,
                                            const int raw_padding, struct Pixa **pixa,
                                            int **blockids, int **paraids) {
  const bool only_text = static_cast<bool>(text_only);
  const bool use_raw_image = raw_image != 0;
  return handle->GetComponentImages(level, only_text, use_raw_image, raw_padding, pixa, blockids,
                                    paraids);
}
|
||||
|
||||
int TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI *handle) {
|
||||
return handle->GetThresholdedImageScaleFactor();
|
||||
}
|
||||
|
||||
// Returns the iterator produced by handle->AnalyseLayout().
TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle) {
  TessPageIterator *layout = handle->AnalyseLayout();
  return layout;
}
|
||||
|
||||
// Returns the result of handle->Recognize(monitor).
int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor) {
  const int status = handle->Recognize(monitor);
  return status;
}
|
||||
|
||||
// Calls handle->ProcessPages(filename, retry_config, timeout_millisec, renderer)
// and returns its result cast to int.
BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, const char *retry_config,
                             int timeout_millisec, TessResultRenderer *renderer) {
  const auto processed = handle->ProcessPages(filename, retry_config, timeout_millisec, renderer);
  return static_cast<int>(processed);
}
|
||||
|
||||
// Calls handle->ProcessPage(pix, page_index, filename, retry_config,
// timeout_millisec, renderer) and returns its result cast to int.
BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, int page_index,
                            const char *filename, const char *retry_config, int timeout_millisec,
                            TessResultRenderer *renderer) {
  const auto processed =
      handle->ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, renderer);
  return static_cast<int>(processed);
}
|
||||
|
||||
// Returns the iterator produced by handle->GetIterator().
TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle) {
  TessResultIterator *results = handle->GetIterator();
  return results;
}
|
||||
|
||||
// Returns the iterator produced by handle->GetMutableIterator().
TessMutableIterator *TessBaseAPIGetMutableIterator(TessBaseAPI *handle) {
  TessMutableIterator *results = handle->GetMutableIterator();
  return results;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetUTF8Text().
char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle) {
  char *text = handle->GetUTF8Text();
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetHOCRText(nullptr, page_number);
// nullptr is passed as the first argument of that overload.
char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetHOCRText(nullptr, page_number);
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetAltoText(page_number).
char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetAltoText(page_number);
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetTSVText(page_number).
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetTSVText(page_number);
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetBoxText(page_number).
char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetBoxText(page_number);
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetWordStrBoxText(page_number).
char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetWordStrBoxText(page_number);
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetLSTMBoxText(page_number).
char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetLSTMBoxText(page_number);
  return text;
}
|
||||
|
||||
// Returns the buffer produced by handle->GetUNLVText().
char *TessBaseAPIGetUNLVText(TessBaseAPI *handle) {
  char *text = handle->GetUNLVText();
  return text;
}
|
||||
|
||||
// Returns the value reported by handle->MeanTextConf().
int TessBaseAPIMeanTextConf(TessBaseAPI *handle) {
  const int mean_confidence = handle->MeanTextConf();
  return mean_confidence;
}
|
||||
|
||||
// Returns the array produced by handle->AllWordConfidences().
int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle) {
  int *confidences = handle->AllWordConfidences();
  return confidences;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
// Calls handle->AdaptToWordStr(mode, wordstr) and returns its result cast to int.
BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, TessPageSegMode mode, const char *wordstr) {
  const auto adapted = handle->AdaptToWordStr(mode, wordstr);
  return static_cast<int>(adapted);
}
|
||||
#endif
|
||||
|
||||
// Thin wrapper: forwards to handle->Clear().
void TessBaseAPIClear(TessBaseAPI *handle) {
  handle->Clear();
}
|
||||
|
||||
// Thin wrapper: forwards to handle->End().
void TessBaseAPIEnd(TessBaseAPI *handle) {
  handle->End();
}
|
||||
|
||||
// Returns the value reported by handle->IsValidWord(word).
int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word) {
  const int verdict = handle->IsValidWord(word);
  return verdict;
}
|
||||
|
||||
// Calls handle->GetTextDirection(out_offset, out_slope) and returns its result
// cast to int.
BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, float *out_slope) {
  const auto found = handle->GetTextDirection(out_offset, out_slope);
  return static_cast<int>(found);
}
|
||||
|
||||
// Returns the string reported by handle->GetUnichar(unichar_id).
const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id) {
  const char *unichar = handle->GetUnichar(unichar_id);
  return unichar;
}
|
||||
|
||||
// Thin wrapper: forwards to handle->set_min_orientation_margin(margin).
void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, double margin) {
  handle->set_min_orientation_margin(margin);
}
|
||||
|
||||
int TessBaseAPINumDawgs(const TessBaseAPI *handle) {
|
||||
return handle->NumDawgs();
|
||||
}
|
||||
|
||||
// Returns the engine mode reported by handle->oem().
TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle) {
  const TessOcrEngineMode engine_mode = handle->oem();
  return engine_mode;
}
|
||||
|
||||
// Thin wrapper: forwards both output pointers to
// handle->GetBlockTextOrientations(block_orientation, vertical_writing).
void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, int **block_orientation,
                                      bool **vertical_writing) {
  handle->GetBlockTextOrientations(block_orientation, vertical_writing);
}
|
||||
|
||||
// Destroys the iterator behind `handle` with `delete`.
void TessPageIteratorDelete(TessPageIterator *handle) {
  delete handle;
}
|
||||
|
||||
// Returns a new TessPageIterator copy-constructed from *handle (allocated
// with `new`).
TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle) {
  TessPageIterator *duplicate = new TessPageIterator(*handle);
  return duplicate;
}
|
||||
|
||||
// Thin wrapper: forwards to handle->Begin().
void TessPageIteratorBegin(TessPageIterator *handle) {
  handle->Begin();
}
|
||||
|
||||
// Calls handle->Next(level) and returns its result cast to int.
BOOL TessPageIteratorNext(TessPageIterator *handle, TessPageIteratorLevel level) {
  const auto advanced = handle->Next(level);
  return static_cast<int>(advanced);
}
|
||||
|
||||
// Calls handle->IsAtBeginningOf(level) and returns its result cast to int.
BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, TessPageIteratorLevel level) {
  const auto at_beginning = handle->IsAtBeginningOf(level);
  return static_cast<int>(at_beginning);
}
|
||||
|
||||
// Calls handle->IsAtFinalElement(level, element) and returns its result cast
// to int.
BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, TessPageIteratorLevel level,
                                      TessPageIteratorLevel element) {
  const auto at_final = handle->IsAtFinalElement(level, element);
  return static_cast<int>(at_final);
}
|
||||
|
||||
// Calls handle->BoundingBox(level, left, top, right, bottom) and returns its
// result cast to int.
BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, TessPageIteratorLevel level,
                                 int *left, int *top, int *right, int *bottom) {
  const auto has_box = handle->BoundingBox(level, left, top, right, bottom);
  return static_cast<int>(has_box);
}
|
||||
|
||||
// Returns the block type reported by handle->BlockType().
TessPolyBlockType TessPageIteratorBlockType(const TessPageIterator *handle) {
  const TessPolyBlockType block_type = handle->BlockType();
  return block_type;
}
|
||||
|
||||
struct Pix *TessPageIteratorGetBinaryImage(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level) {
|
||||
return handle->GetBinaryImage(level);
|
||||
}
|
||||
|
||||
struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, TessPageIteratorLevel level,
|
||||
int padding, struct Pix *original_image, int *left, int *top) {
|
||||
return handle->GetImage(level, padding, original_image, left, top);
|
||||
}
|
||||
|
||||
// Calls handle->Baseline(level, x1, y1, x2, y2) and returns its result cast
// to int.
BOOL TessPageIteratorBaseline(const TessPageIterator *handle, TessPageIteratorLevel level, int *x1,
                              int *y1, int *x2, int *y2) {
  const auto has_baseline = handle->Baseline(level, x1, y1, x2, y2);
  return static_cast<int>(has_baseline);
}
|
||||
|
||||
// Thin wrapper: forwards the four output pointers to
// handle->Orientation(orientation, writing_direction, textline_order, deskew_angle).
void TessPageIteratorOrientation(TessPageIterator *handle, TessOrientation *orientation,
                                 TessWritingDirection *writing_direction,
                                 TessTextlineOrder *textline_order, float *deskew_angle) {
  handle->Orientation(orientation, writing_direction, textline_order, deskew_angle);
}
|
||||
|
||||
// Queries handle->ParagraphInfo(...) and converts its bool outputs to BOOL.
//
// justification and first_line_indent are passed straight through to the
// callee. is_list_item and is_crown may each be nullptr, in which case that
// output is skipped.
void TessPageIteratorParagraphInfo(TessPageIterator *handle,
                                   TessParagraphJustification *justification, BOOL *is_list_item,
                                   BOOL *is_crown, int *first_line_indent) {
  // Start from a defined value: the results are copied out unconditionally
  // below, so they must not be indeterminate if ParagraphInfo() returns
  // without writing to them.
  bool bool_is_list_item = false;
  bool bool_is_crown = false;
  handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);
  if (is_list_item != nullptr) {
    *is_list_item = static_cast<int>(bool_is_list_item);
  }
  if (is_crown != nullptr) {
    *is_crown = static_cast<int>(bool_is_crown);
  }
}
|
||||
|
||||
// Destroys the iterator behind `handle` with `delete`.
void TessResultIteratorDelete(TessResultIterator *handle) {
  delete handle;
}
|
||||
|
||||
// Returns a new TessResultIterator copy-constructed from *handle (allocated
// with `new`).
TessResultIterator *TessResultIteratorCopy(const TessResultIterator *handle) {
  TessResultIterator *duplicate = new TessResultIterator(*handle);
  return duplicate;
}
|
||||
|
||||
// Returns the same object as a TessPageIterator pointer (implicit pointer
// conversion); nothing is allocated.
TessPageIterator *TessResultIteratorGetPageIterator(TessResultIterator *handle) {
  TessPageIterator *as_page_iterator = handle;
  return as_page_iterator;
}
|
||||
|
||||
// Const variant: returns the same object as a const TessPageIterator pointer
// (implicit pointer conversion); nothing is allocated.
const TessPageIterator *TessResultIteratorGetPageIteratorConst(const TessResultIterator *handle) {
  const TessPageIterator *as_page_iterator = handle;
  return as_page_iterator;
}
|
||||
|
||||
// Returns a new TessChoiceIterator constructed from *handle (allocated with
// `new`).
TessChoiceIterator *TessResultIteratorGetChoiceIterator(const TessResultIterator *handle) {
  TessChoiceIterator *choices = new TessChoiceIterator(*handle);
  return choices;
}
|
||||
|
||||
// Calls handle->Next(level) and returns its result cast to int.
BOOL TessResultIteratorNext(TessResultIterator *handle, TessPageIteratorLevel level) {
  const auto advanced = handle->Next(level);
  return static_cast<int>(advanced);
}
|
||||
|
||||
char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, TessPageIteratorLevel level) {
|
||||
return handle->GetUTF8Text(level);
|
||||
}
|
||||
|
||||
float TessResultIteratorConfidence(const TessResultIterator *handle, TessPageIteratorLevel level) {
|
||||
return handle->Confidence(level);
|
||||
}
|
||||
|
||||
const char *TessResultIteratorWordRecognitionLanguage(const TessResultIterator *handle) {
|
||||
return handle->WordRecognitionLanguage();
|
||||
}
|
||||
|
||||
const char *TessResultIteratorWordFontAttributes(const TessResultIterator *handle, BOOL *is_bold,
|
||||
BOOL *is_italic, BOOL *is_underlined,
|
||||
BOOL *is_monospace, BOOL *is_serif,
|
||||
BOOL *is_smallcaps, int *pointsize, int *font_id) {
|
||||
bool bool_is_bold;
|
||||
bool bool_is_italic;
|
||||
bool bool_is_underlined;
|
||||
bool bool_is_monospace;
|
||||
bool bool_is_serif;
|
||||
bool bool_is_smallcaps;
|
||||
const char *ret = handle->WordFontAttributes(&bool_is_bold, &bool_is_italic, &bool_is_underlined,
|
||||
&bool_is_monospace, &bool_is_serif,
|
||||
&bool_is_smallcaps, pointsize, font_id);
|
||||
if (is_bold != nullptr) {
|
||||
*is_bold = static_cast<int>(bool_is_bold);
|
||||
}
|
||||
if (is_italic != nullptr) {
|
||||
*is_italic = static_cast<int>(bool_is_italic);
|
||||
}
|
||||
if (is_underlined != nullptr) {
|
||||
*is_underlined = static_cast<int>(bool_is_underlined);
|
||||
}
|
||||
if (is_monospace != nullptr) {
|
||||
*is_monospace = static_cast<int>(bool_is_monospace);
|
||||
}
|
||||
if (is_serif != nullptr) {
|
||||
*is_serif = static_cast<int>(bool_is_serif);
|
||||
}
|
||||
if (is_smallcaps != nullptr) {
|
||||
*is_smallcaps = static_cast<int>(bool_is_smallcaps);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Calls handle->WordIsFromDictionary() and returns its result cast to int.
BOOL TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle) {
  const auto from_dictionary = handle->WordIsFromDictionary();
  return static_cast<int>(from_dictionary);
}
|
||||
|
||||
// Calls handle->WordIsNumeric() and returns its result cast to int.
BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle) {
  const auto numeric = handle->WordIsNumeric();
  return static_cast<int>(numeric);
}
|
||||
|
||||
// Calls handle->SymbolIsSuperscript() and returns its result cast to int.
BOOL TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle) {
  const auto superscript = handle->SymbolIsSuperscript();
  return static_cast<int>(superscript);
}
|
||||
|
||||
// Calls handle->SymbolIsSubscript() and returns its result cast to int.
BOOL TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle) {
  const auto subscript = handle->SymbolIsSubscript();
  return static_cast<int>(subscript);
}
|
||||
|
||||
// Calls handle->SymbolIsDropcap() and returns its result cast to int.
BOOL TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle) {
  const auto dropcap = handle->SymbolIsDropcap();
  return static_cast<int>(dropcap);
}
|
||||
|
||||
// Destroys the iterator behind `handle` with `delete`.
void TessChoiceIteratorDelete(TessChoiceIterator *handle) {
  delete handle;
}
|
||||
|
||||
// Calls handle->Next() and returns its result cast to int.
BOOL TessChoiceIteratorNext(TessChoiceIterator *handle) {
  const auto advanced = handle->Next();
  return static_cast<int>(advanced);
}
|
||||
|
||||
const char *TessChoiceIteratorGetUTF8Text(const TessChoiceIterator *handle) {
|
||||
return handle->GetUTF8Text();
|
||||
}
|
||||
|
||||
float TessChoiceIteratorConfidence(const TessChoiceIterator *handle) {
|
||||
return handle->Confidence();
|
||||
}
|
||||
|
||||
// Allocates a value-initialised ETEXT_DESC with `new` and returns it.
ETEXT_DESC *TessMonitorCreate() {
  ETEXT_DESC *created = new ETEXT_DESC();
  return created;
}
|
||||
|
||||
// Destroys the object behind `monitor` with `delete`.
void TessMonitorDelete(ETEXT_DESC *monitor) {
  delete monitor;
}
|
||||
|
||||
// Stores cancelFunc in monitor->cancel.
void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, TessCancelFunc cancelFunc) {
  monitor->cancel = cancelFunc;
}
|
||||
|
||||
// Stores cancelThis in monitor->cancel_this.
void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis) {
  monitor->cancel_this = cancelThis;
}
|
||||
|
||||
// Returns the pointer currently held in monitor->cancel_this.
void *TessMonitorGetCancelThis(ETEXT_DESC *monitor) {
  void *cancel_context = monitor->cancel_this;
  return cancel_context;
}
|
||||
|
||||
// Stores progressFunc in monitor->progress_callback2.
void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, TessProgressFunc progressFunc) {
  monitor->progress_callback2 = progressFunc;
}
|
||||
|
||||
// Returns the current value of monitor->progress.
int TessMonitorGetProgress(ETEXT_DESC *monitor) {
  const int progress = monitor->progress;
  return progress;
}
|
||||
|
||||
// Thin wrapper: forwards to monitor->set_deadline_msecs(deadline).
void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline) {
  monitor->set_deadline_msecs(deadline);
}
|
489
3rdparty/tesseract_ocr/tesseract/src/api/hocrrenderer.cpp
vendored
Normal file
489
3rdparty/tesseract_ocr/tesseract/src/api/hocrrenderer.cpp
vendored
Normal file
|
@ -0,0 +1,489 @@
|
|||
/**********************************************************************
|
||||
* File: hocrrenderer.cpp
|
||||
* Description: Simple API for calling tesseract.
|
||||
* Author: Ray Smith (original code from baseapi.cpp)
|
||||
* Author: Stefan Weil (moved to separate file and cleaned code)
|
||||
*
|
||||
* (C) Copyright 2006, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <tesseract/baseapi.h> // for TessBaseAPI
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <memory> // for std::unique_ptr
|
||||
#include <sstream> // for std::stringstream
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
#include <tesseract/renderer.h>
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Gets the block orientation at the current iterator position.
|
||||
*/
|
||||
static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
|
||||
tesseract::Orientation orientation;
|
||||
tesseract::WritingDirection writing_direction;
|
||||
tesseract::TextlineOrder textline_order;
|
||||
float deskew_angle;
|
||||
it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
|
||||
return orientation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fits a line to the baseline at the given level, and appends its coefficients
|
||||
* to the hOCR string.
|
||||
* NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
|
||||
* rotated textlines. For this reason, on textlines that are not upright, this
|
||||
* method currently only inserts a 'textangle' property to indicate the rotation
|
||||
* direction and does not add any baseline information to the hocr string.
|
||||
*/
|
||||
static void AddBaselineCoordsTohOCR(const PageIterator *it, PageIteratorLevel level,
|
||||
std::stringstream &hocr_str) {
|
||||
tesseract::Orientation orientation = GetBlockTextOrientation(it);
|
||||
if (orientation != ORIENTATION_PAGE_UP) {
|
||||
hocr_str << "; textangle " << 360 - orientation * 90;
|
||||
return;
|
||||
}
|
||||
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
|
||||
// Try to get the baseline coordinates at this level.
|
||||
int x1, y1, x2, y2;
|
||||
if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
|
||||
return;
|
||||
}
|
||||
// Following the description of this field of the hOCR spec, we convert the
|
||||
// baseline coordinates so that "the bottom left of the bounding box is the
|
||||
// origin".
|
||||
x1 -= left;
|
||||
x2 -= left;
|
||||
y1 -= bottom;
|
||||
y2 -= bottom;
|
||||
|
||||
// Now fit a line through the points so we can extract coefficients for the
|
||||
// equation: y = p1 x + p0
|
||||
if (x1 == x2) {
|
||||
// Problem computing the polynomial coefficients.
|
||||
return;
|
||||
}
|
||||
double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
|
||||
double p0 = y1 - p1 * x1;
|
||||
|
||||
hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " << round(p0 * 1000.0) / 1000.0;
|
||||
}
|
||||
|
||||
static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
|
||||
std::stringstream &hocr_str) {
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
// This is the only place we use double quotes instead of single quotes,
|
||||
// but it may too late to change for consistency
|
||||
hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " << bottom;
|
||||
// Add baseline coordinates & heights for textlines only.
|
||||
if (level == RIL_TEXTLINE) {
|
||||
AddBaselineCoordsTohOCR(it, level, hocr_str);
|
||||
// add custom height measures
|
||||
float row_height, descenders, ascenders; // row attributes
|
||||
it->RowAttributes(&row_height, &descenders, &ascenders);
|
||||
// TODO(rays): Do we want to limit these to a single decimal place?
|
||||
hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders << "; x_ascenders "
|
||||
<< ascenders;
|
||||
}
|
||||
hocr_str << "\">";
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Image name/input_file_ can be set by SetInputName before calling
|
||||
* GetHOCRText
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *TessBaseAPI::GetHOCRText(int page_number) {
|
||||
return GetHOCRText(nullptr, page_number);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Image name/input_file_ can be set by SetInputName before calling
|
||||
* GetHOCRText
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
|
||||
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
||||
bool para_is_ltr = true; // Default direction is LTR
|
||||
const char *paragraph_lang = nullptr;
|
||||
bool font_info = false;
|
||||
bool hocr_boxes = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
|
||||
|
||||
if (input_file_.empty()) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
|
||||
int utf8_len =
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
std::stringstream hocr_str;
|
||||
// Use "C" locale (needed for double values x_size and x_descenders).
|
||||
hocr_str.imbue(std::locale::classic());
|
||||
// Use 8 digits for double values.
|
||||
hocr_str.precision(8);
|
||||
hocr_str << " <div class='ocr_page'"
|
||||
<< " id='"
|
||||
<< "page_" << page_id << "'"
|
||||
<< " title='image \"";
|
||||
if (!input_file_.empty()) {
|
||||
hocr_str << HOcrEscape(input_file_.c_str());
|
||||
} else {
|
||||
hocr_str << "unknown";
|
||||
}
|
||||
hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " << rect_width_ << " "
|
||||
<< rect_height_ << "; ppageno " << page_number << "'>\n";
|
||||
|
||||
std::unique_ptr<ResultIterator> res_it(GetIterator());
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Open any new block/paragraph/textline.
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
para_is_ltr = true; // reset to default direction
|
||||
hocr_str << " <div class='ocr_carea'"
|
||||
<< " id='"
|
||||
<< "block_" << page_id << "_" << bcnt << "'";
|
||||
AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
hocr_str << "\n <p class='ocr_par'";
|
||||
para_is_ltr = res_it->ParagraphIsLtr();
|
||||
if (!para_is_ltr) {
|
||||
hocr_str << " dir='rtl'";
|
||||
}
|
||||
hocr_str << " id='"
|
||||
<< "par_" << page_id << "_" << pcnt << "'";
|
||||
paragraph_lang = res_it->WordRecognitionLanguage();
|
||||
if (paragraph_lang) {
|
||||
hocr_str << " lang='" << paragraph_lang << "'";
|
||||
}
|
||||
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
hocr_str << "\n <span class='";
|
||||
switch (res_it->BlockType()) {
|
||||
case PT_HEADING_TEXT:
|
||||
hocr_str << "ocr_header";
|
||||
break;
|
||||
case PT_PULLOUT_TEXT:
|
||||
hocr_str << "ocr_textfloat";
|
||||
break;
|
||||
case PT_CAPTION_TEXT:
|
||||
hocr_str << "ocr_caption";
|
||||
break;
|
||||
default:
|
||||
hocr_str << "ocr_line";
|
||||
}
|
||||
hocr_str << "' id='"
|
||||
<< "line_" << page_id << "_" << lcnt << "'";
|
||||
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
|
||||
}
|
||||
|
||||
// Now, process the word...
|
||||
int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
|
||||
std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *rawTimestepMap = nullptr;
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
|
||||
if (lstm_choice_mode) {
|
||||
CTCMap = res_it->GetBestLSTMSymbolChoices();
|
||||
rawTimestepMap = res_it->GetRawLSTMTimesteps();
|
||||
}
|
||||
hocr_str << "\n <span class='ocrx_word'"
|
||||
<< " id='"
|
||||
<< "word_" << page_id << "_" << wcnt << "'";
|
||||
int left, top, right, bottom;
|
||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||
int pointsize, font_id;
|
||||
const char *font_name;
|
||||
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
|
||||
&smallcaps, &pointsize, &font_id);
|
||||
hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom
|
||||
<< "; x_wconf " << static_cast<int>(res_it->Confidence(RIL_WORD));
|
||||
if (font_info) {
|
||||
if (font_name) {
|
||||
hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
|
||||
}
|
||||
hocr_str << "; x_fsize " << pointsize;
|
||||
}
|
||||
hocr_str << "'";
|
||||
const char *lang = res_it->WordRecognitionLanguage();
|
||||
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
|
||||
hocr_str << " lang='" << lang << "'";
|
||||
}
|
||||
switch (res_it->WordDirection()) {
|
||||
// Only emit direction if different from current paragraph direction
|
||||
case DIR_LEFT_TO_RIGHT:
|
||||
if (!para_is_ltr) {
|
||||
hocr_str << " dir='ltr'";
|
||||
}
|
||||
break;
|
||||
case DIR_RIGHT_TO_LEFT:
|
||||
if (para_is_ltr) {
|
||||
hocr_str << " dir='rtl'";
|
||||
}
|
||||
break;
|
||||
case DIR_MIX:
|
||||
case DIR_NEUTRAL:
|
||||
default: // Do nothing.
|
||||
break;
|
||||
}
|
||||
hocr_str << ">";
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
if (bold) {
|
||||
hocr_str << "<strong>";
|
||||
}
|
||||
if (italic) {
|
||||
hocr_str << "<em>";
|
||||
}
|
||||
do {
|
||||
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " << left << " " << top
|
||||
<< " " << right << " " << bottom << "; x_conf " << res_it->Confidence(RIL_SYMBOL)
|
||||
<< "'>";
|
||||
}
|
||||
hocr_str << HOcrEscape(grapheme.get()).c_str();
|
||||
if (hocr_boxes) {
|
||||
hocr_str << "</span>";
|
||||
tesseract::ChoiceIterator ci(*res_it);
|
||||
if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *symbol = ci.Timesteps();
|
||||
hocr_str << "\n <span class='ocr_symbol'"
|
||||
<< " id='"
|
||||
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
|
||||
for (auto timestep : *symbol) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
for (auto conf : timestep) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << int(conf.second * 100) << "'>"
|
||||
<< HOcrEscape(conf.first).c_str() << "</span>";
|
||||
++ccnt;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
++tcnt;
|
||||
}
|
||||
hocr_str << "\n </span>";
|
||||
++scnt;
|
||||
} else if (lstm_choice_mode == 2) {
|
||||
tesseract::ChoiceIterator ci(*res_it);
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
do {
|
||||
const char *choice = ci.GetUTF8Text();
|
||||
float choiceconf = ci.Confidence();
|
||||
if (choice != nullptr) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << choiceconf << "'>" << HOcrEscape(choice).c_str()
|
||||
<< "</span>";
|
||||
ccnt++;
|
||||
}
|
||||
} while (ci.Next());
|
||||
hocr_str << "\n </span>";
|
||||
tcnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
if (italic) {
|
||||
hocr_str << "</em>";
|
||||
}
|
||||
if (bold) {
|
||||
hocr_str << "</strong>";
|
||||
}
|
||||
// If the lstm choice mode is required it is added here
|
||||
if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
|
||||
for (auto symbol : *rawTimestepMap) {
|
||||
hocr_str << "\n <span class='ocr_symbol'"
|
||||
<< " id='"
|
||||
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
|
||||
for (auto timestep : symbol) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
for (auto conf : timestep) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << int(conf.second * 100) << "'>"
|
||||
<< HOcrEscape(conf.first).c_str() << "</span>";
|
||||
++ccnt;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
++tcnt;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
++scnt;
|
||||
}
|
||||
} else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
|
||||
for (auto timestep : *CTCMap) {
|
||||
if (timestep.size() > 0) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
for (auto &j : timestep) {
|
||||
float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
|
||||
if (conf < 0.0f) {
|
||||
conf = 0.0f;
|
||||
}
|
||||
if (conf > 100.0f) {
|
||||
conf = 100.0f;
|
||||
}
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << conf << "'>" << HOcrEscape(j.first).c_str()
|
||||
<< "</span>";
|
||||
ccnt++;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
tcnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Close ocrx_word.
|
||||
if (hocr_boxes || lstm_choice_mode > 0) {
|
||||
hocr_str << "\n ";
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
tcnt = 1;
|
||||
ccnt = 1;
|
||||
wcnt++;
|
||||
// Close any ending block/paragraph/textline.
|
||||
if (last_word_in_line) {
|
||||
hocr_str << "\n </span>";
|
||||
lcnt++;
|
||||
}
|
||||
if (last_word_in_para) {
|
||||
hocr_str << "\n </p>\n";
|
||||
pcnt++;
|
||||
para_is_ltr = true; // back to default direction
|
||||
}
|
||||
if (last_word_in_block) {
|
||||
hocr_str << " </div>\n";
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
hocr_str << " </div>\n";
|
||||
|
||||
const std::string &text = hocr_str.str();
|
||||
char *result = new char[text.length() + 1];
|
||||
strcpy(result, text.c_str());
|
||||
return result;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* HOcr Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Convenience constructor: same as the two-argument form with font
// information disabled (font_info_ == false).
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessHOcrRenderer(outputbase, false) {}
|
||||
|
||||
// Constructs an hOCR renderer on top of TessResultRenderer (second base
// argument "hocr") and records whether font information should be advertised
// in the document header (see BeginDocumentHandler).
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}
|
||||
|
||||
bool TessHOcrRenderer::BeginDocumentHandler() {
|
||||
AppendString(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
|
||||
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
|
||||
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
|
||||
"lang=\"en\">\n <head>\n <title>");
|
||||
AppendString(title());
|
||||
AppendString(
|
||||
"</title>\n"
|
||||
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
|
||||
"charset=utf-8\"/>\n"
|
||||
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
|
||||
"' />\n"
|
||||
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
|
||||
" ocr_line ocrx_word ocrp_wconf");
|
||||
if (font_info_) {
|
||||
AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
|
||||
}
|
||||
AppendString(
|
||||
"'/>\n"
|
||||
" </head>\n"
|
||||
" <body>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Closes the <body> and <html> elements opened by BeginDocumentHandler.
// Always returns true.
bool TessHOcrRenderer::EndDocumentHandler() {
  const char *const kEpilogue = " </body>\n</html>\n";
  AppendString(kEpilogue);
  return true;
}
|
||||
|
||||
// Appends the hOCR markup of the current image to the output.
// Returns false when the API yields no hOCR text, true otherwise.
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
  // The unique_ptr<const char[]> releases the returned buffer with delete[].
  const std::unique_ptr<const char[]> page_markup(api->GetHOCRText(imagenum()));
  if (!page_markup) {
    return false;
  }
  AppendString(page_markup.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract
|
107
3rdparty/tesseract_ocr/tesseract/src/api/lstmboxrenderer.cpp
vendored
Normal file
107
3rdparty/tesseract_ocr/tesseract/src/api/lstmboxrenderer.cpp
vendored
Normal file
|
@ -0,0 +1,107 @@
|
|||
/**********************************************************************
|
||||
* File: lstmboxrenderer.cpp
|
||||
* Description: Renderer for creating box file for LSTM training.
|
||||
* based on the tsv renderer.
|
||||
*
|
||||
* (C) Copyright 2019, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <tesseract/baseapi.h> // for TessBaseAPI
|
||||
#include <tesseract/renderer.h>
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Create a UTF8 box file for LSTM training from the internal data structures.
|
||||
* page_number is a 0-base page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
// Appends the four trailing box-file fields to `text`, each preceded by a
// single space: (image_height - bottom), (right + 5), (image_height - top)
// and page_num. The left coordinate is written by the caller beforehand.
static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
                         std::string &text) {
  const int fields[] = {image_height - bottom, right + 5, image_height - top, page_num};
  for (const int field : fields) {
    text += ' ';
    text += std::to_string(field);
  }
}
|
||||
|
||||
char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::string lstm_box_str;
|
||||
bool first_word = true;
|
||||
int left = 0, top = 0, right = 0, bottom = 0;
|
||||
|
||||
LTRResultIterator *res_it = GetLTRIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_SYMBOL)) {
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
continue;
|
||||
}
|
||||
if (!first_word) {
|
||||
if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
|
||||
if (res_it->IsAtBeginningOf(RIL_WORD)) {
|
||||
lstm_box_str += " " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for word
|
||||
} // word
|
||||
} else {
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
lstm_box_str += "\t " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for line
|
||||
} // line
|
||||
}
|
||||
} // not first word
|
||||
first_word = false;
|
||||
// Use bounding box for whole line for everything
|
||||
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||
do {
|
||||
lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
|
||||
lstm_box_str += " " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for symbol
|
||||
}
|
||||
if (!first_word) { // if first_word is true => empty page
|
||||
lstm_box_str += "\t " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of PAGE
|
||||
}
|
||||
char *ret = new char[lstm_box_str.length() + 1];
|
||||
strcpy(ret, lstm_box_str.c_str());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* LSTMBox Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Constructs the LSTM box renderer; all state lives in the TessResultRenderer
// base, which receives outputbase and "box" (presumably the output file
// extension -- confirm against TessResultRenderer).
TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
|
||||
: TessResultRenderer(outputbase, "box") {}
|
||||
|
||||
// Appends the LSTM box text of the current image to the output.
// Returns false when the API yields no box text, true otherwise.
bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  // The unique_ptr<const char[]> releases the returned buffer with delete[].
  const std::unique_ptr<const char[]> box_text(api->GetLSTMBoxText(imagenum()));
  if (!box_text) {
    return false;
  }
  AppendString(box_text.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract.
|
63
3rdparty/tesseract_ocr/tesseract/src/api/pdf_ttf.h
vendored
Normal file
63
3rdparty/tesseract_ocr/tesseract/src/api/pdf_ttf.h
vendored
Normal file
|
@ -0,0 +1,63 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pdf_ttf.h
|
||||
// Description: pdf.ttf (GlyphLessFont) replacement.
|
||||
// Generated with: "bin2cpp pdf.ttf pdf_ttf cpp17"
|
||||
// Author: Zdenko Podobny
|
||||
//
|
||||
// (C) Copyright 2020, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef pdf_ttf__H
|
||||
#define pdf_ttf__H
|
||||
|
||||
#include <cstdint> // uint8_t
|
||||
|
||||
// Raw bytes of the pdf.ttf replacement font (see the file header: generated
// with bin2cpp). Generated data -- regenerate instead of editing by hand.
// Declared static, so every translation unit that includes this header gets
// its own copy of the table.
static const uint8_t pdf_ttf[] = {
|
||||
0x0, 0x1, 0x0, 0x0, 0x0, 0xa, 0x0, 0x80, 0x0, 0x3, 0x0, 0x20, 0x4f, 0x53, 0x2f, 0x32,
|
||||
0x56, 0xde, 0xc8, 0x94, 0x0, 0x0, 0x1, 0x28, 0x0, 0x0, 0x0, 0x60, 0x63, 0x6d, 0x61, 0x70,
|
||||
0x0, 0xa, 0x0, 0x34, 0x0, 0x0, 0x1, 0x90, 0x0, 0x0, 0x0, 0x1e, 0x67, 0x6c, 0x79, 0x66,
|
||||
0x15, 0x22, 0x41, 0x24, 0x0, 0x0, 0x1, 0xb8, 0x0, 0x0, 0x0, 0x18, 0x68, 0x65, 0x61, 0x64,
|
||||
0xb, 0x78, 0xf1, 0x65, 0x0, 0x0, 0x0, 0xac, 0x0, 0x0, 0x0, 0x36, 0x68, 0x68, 0x65, 0x61,
|
||||
0xc, 0x2, 0x4, 0x2, 0x0, 0x0, 0x0, 0xe4, 0x0, 0x0, 0x0, 0x24, 0x68, 0x6d, 0x74, 0x78,
|
||||
0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x88, 0x0, 0x0, 0x0, 0x8, 0x6c, 0x6f, 0x63, 0x61,
|
||||
0x0, 0xc, 0x0, 0x0, 0x0, 0x0, 0x1, 0xb0, 0x0, 0x0, 0x0, 0x6, 0x6d, 0x61, 0x78, 0x70,
|
||||
0x0, 0x4, 0x0, 0x5, 0x0, 0x0, 0x1, 0x8, 0x0, 0x0, 0x0, 0x20, 0x6e, 0x61, 0x6d, 0x65,
|
||||
0xf2, 0xeb, 0x16, 0xda, 0x0, 0x0, 0x1, 0xd0, 0x0, 0x0, 0x0, 0x4b, 0x70, 0x6f, 0x73, 0x74,
|
||||
0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x2, 0x1c, 0x0, 0x0, 0x0, 0x20, 0x0, 0x1, 0x0, 0x0,
|
||||
0x0, 0x1, 0x0, 0x0, 0xb0, 0x94, 0x71, 0x10, 0x5f, 0xf, 0x3c, 0xf5, 0x4, 0x7, 0x8, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0xcf, 0x9a, 0xfc, 0x6e, 0x0, 0x0, 0x0, 0x0, 0xd4, 0xc3, 0xa7, 0xf2,
|
||||
0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, 0x2, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x8, 0x0, 0xff, 0xff, 0x0, 0x0, 0x4, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, 0x2, 0x0, 0x4,
|
||||
0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x1, 0x90, 0x0, 0x5,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x47, 0x4f, 0x4f, 0x47, 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0xff, 0xff,
|
||||
0x0, 0x0, 0x0, 0x1, 0x0, 0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x14, 0x0, 0x3, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x14, 0x0, 0x6, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0xc, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0,
|
||||
0x8, 0x0, 0x0, 0x3, 0x0, 0x0, 0x31, 0x21, 0x11, 0x21, 0x4, 0x0, 0xfc, 0x0, 0x8, 0x0,
|
||||
0x0, 0x0, 0x0, 0x3, 0x0, 0x2a, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x5, 0x0, 0x16,
|
||||
0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0xb, 0x0, 0x16, 0x0, 0x3,
|
||||
0x0, 0x1, 0x4, 0x9, 0x0, 0x5, 0x0, 0x16, 0x0, 0x0, 0x0, 0x56, 0x0, 0x65, 0x0, 0x72,
|
||||
0x0, 0x73, 0x0, 0x69, 0x0, 0x6f, 0x0, 0x6e, 0x0, 0x20, 0x0, 0x31, 0x0, 0x2e, 0x0, 0x30,
|
||||
0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x30, 0x0, 0x0, 0x1, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
|
||||
|
||||
#endif
|
969
3rdparty/tesseract_ocr/tesseract/src/api/pdfrenderer.cpp
vendored
Normal file
969
3rdparty/tesseract_ocr/tesseract/src/api/pdfrenderer.cpp
vendored
Normal file
|
@ -0,0 +1,969 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pdfrenderer.cpp
|
||||
// Description: PDF rendering interface to inject into TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include "pdf_ttf.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/renderer.h>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <fstream> // for std::ifstream
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <sstream> // for std::stringstream
|
||||
#include "helpers.h" // for Swap
|
||||
|
||||
/*
|
||||
|
||||
Design notes from Ken Sharp, with light editing.
|
||||
|
||||
We think one solution is a font with a single glyph (.notdef) and a
|
||||
CIDToGIDMap which maps all the CIDs to 0. That map would then be
|
||||
stored as a stream in the PDF file, and when flat compressed should
|
||||
be pretty small. The font, of course, will be approximately the same
|
||||
size as the one you currently use.
|
||||
|
||||
I'm working on such a font now, the CIDToGIDMap is trivial, you just
|
||||
create a stream object which contains 128k bytes (2 bytes per possible
|
||||
CID and your CIDs range from 0 to 65535) and where you currently have
|
||||
"/CIDToGIDMap /Identity" you would have "/CIDToGIDMap <object> 0 R".
|
||||
|
||||
Note that if, in future, you were to use a different (ie not 2 byte)
|
||||
CMap for character codes you could trivially extend the CIDToGIDMap.
|
||||
|
||||
The following is an explanation of how some of the font stuff works,
|
||||
this may be too simple for you in which case please accept my
|
||||
apologies, it's hard to know how much knowledge someone has. You can
|
||||
skip all this anyway, it's just for information.
|
||||
|
||||
The font embedded in a PDF file is usually intended just to be
|
||||
rendered, but extensions allow for at least some ability to locate (or
|
||||
copy) text from a document. This isn't something which was an original
|
||||
goal of the PDF format, but it's been retro-fitted, presumably due to
|
||||
popular demand.
|
||||
|
||||
To do this reliably the PDF file must contain a ToUnicode CMap, a
|
||||
device for mapping character codes to Unicode code points. If one of
|
||||
these is present, then this will be used to convert the character
|
||||
codes into Unicode values. If it's not present then the reader will
|
||||
fall back through a series of heuristics to try and guess the
|
||||
result. This is, as you would expect, prone to failure.
|
||||
|
||||
This doesn't concern you of course, since you always write a ToUnicode
|
||||
CMap, so because you are writing the text in text rendering mode 3 it
|
||||
would seem that you don't really need to worry about this, but in the
|
||||
PDF spec you cannot have an isolated ToUnicode CMap, it has to be
|
||||
attached to a font, so in order to get even copy/paste to work you
|
||||
need to define a font.
|
||||
|
||||
This is what leads to problems, tools like pdfwrite assume that they
|
||||
are going to be able to (or even have to) modify the font entries, so
|
||||
they require that the font being embedded be valid, and to be honest
|
||||
the font Tesseract embeds isn't valid (for this purpose).
|
||||
|
||||
|
||||
To see why, let's look at how text is specified in a PDF file:
|
||||
|
||||
(Test) Tj
|
||||
|
||||
Now that looks like text but actually it isn't. Each of those bytes is
|
||||
a 'character code'. When it comes to rendering the text a complex
|
||||
sequence of events takes place, which converts the character code into
|
||||
'something' which the font understands. Its entirely possible via
|
||||
character mappings to have that text render as 'Sftu'
|
||||
|
||||
For simple fonts (PostScript type 1), we use the character code as the
|
||||
index into an Encoding array (256 elements), each element of which is
|
||||
a glyph name, so this gives us a glyph name. We then consult the
|
||||
CharStrings dictionary in the font, that's a complex object which
|
||||
contains pairs of keys and values, you can use the key to retrieve a
|
||||
given value. So we have a glyph name, we then use that as the key to
|
||||
the dictionary and retrieve the associated value. For a type 1 font,
|
||||
the value is a glyph program that describes how to draw the glyph.
|
||||
|
||||
For CIDFonts, its a little more complicated. Because CIDFonts can be
|
||||
large, using a glyph name as the key is unreasonable (it would also
|
||||
lead to unfeasibly large Encoding arrays), so instead we use a 'CID'
|
||||
as the key. CIDs are just numbers.
|
||||
|
||||
But.... We don't use the character code as the CID. What we do is use
|
||||
a CMap to convert the character code into a CID. We then use the CID
|
||||
to key the CharStrings dictionary and proceed as before. So the 'CMap'
|
||||
is the equivalent of the Encoding array, but its a more compact and
|
||||
flexible representation.
|
||||
|
||||
Note that you have to use the CMap just to find out how many bytes
|
||||
constitute a character code, and it can be variable. For example you
|
||||
can say if the first byte is 0x00->0x7f then its just one byte, if its
|
||||
0x80->0xf0 then its 2 bytes and if its 0xf0->0xff then its 3 bytes. I
|
||||
have seen CMaps defining character codes up to 5 bytes wide.
|
||||
|
||||
Now that's fine for 'PostScript' CIDFonts, but its not sufficient for
|
||||
TrueType CIDFonts. The thing is that TrueType fonts are accessed using
|
||||
a Glyph ID (GID) (and the LOCA table) which may well not be anything
|
||||
like the CID. So for this case PDF includes a CIDToGIDMap. That maps
|
||||
the CIDs to GIDs, and we can then use the GID to get the glyph
|
||||
description from the GLYF table of the font.
|
||||
|
||||
So for a TrueType CIDFont, character-code->CID->GID->glyf-program.
|
||||
|
||||
Looking at the PDF file I was supplied with we see that it contains
|
||||
text like :
|
||||
|
||||
<0x0075> Tj
|
||||
|
||||
So we start by taking the character code (117) and look it up in the
|
||||
CMap. Well you don't supply a CMap, you just use the Identity-H one
|
||||
which is predefined. So character code 117 maps to CID 117. Then we
|
||||
use the CIDToGIDMap, again you don't supply one, you just use the
|
||||
predefined 'Identity' map. So CID 117 maps to GID 117. But the font we
|
||||
were supplied with only contains 116 glyphs.
|
||||
|
||||
Now for Latin that's not a huge problem, you can just supply a bigger
|
||||
font. But for more complex languages that *is* going to be more of a
|
||||
problem. Either you need to supply a font which contains glyphs for
|
||||
all the possible CID->GID mappings, or we need to think laterally.
|
||||
|
||||
Our solution using a TrueType CIDFont is to intervene at the
|
||||
CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a
|
||||
font with just one glyph, the .notdef glyph at GID 0. This is what I'm
|
||||
looking into now.
|
||||
|
||||
It would also be possible to have a 'PostScript' (ie type 1 outlines)
|
||||
CIDFont which contained 1 glyph, and a CMap which mapped all character
|
||||
codes to CID 0. The effect would be the same.
|
||||
|
||||
Its possible (I haven't checked) that the PostScript CIDFont and
|
||||
associated CMap would be smaller than the TrueType font and associated
|
||||
CIDToGIDMap.
|
||||
|
||||
--- in a followup ---
|
||||
|
||||
OK there is a small problem there, if I use GID 0 then Acrobat gets
|
||||
upset about it and complains it cannot extract the font. If I set the
|
||||
CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
|
||||
mad......
|
||||
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// If the font is 10 pts, nominal character width is 5 pts
|
||||
static const int kCharWidth = 2;
|
||||
|
||||
// Used for memory allocation. A codepoint must take no more than this
|
||||
// many bytes, when written in the PDF way. e.g. "<0063>" for the
|
||||
// letter 'c'
|
||||
static const int kMaxBytesPerCodepoint = 20;
|
||||
|
||||
/**********************************************************************
|
||||
* PDF Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Constructs a PDF renderer on top of TessResultRenderer (second base
// argument "pdf"), remembering the data directory and the text-only flag.
TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
    : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
  textonly_ = textonly;
  // Seed the offset table with 0 so AppendPDFObjectDIY can always read
  // offsets_.back() when registering the first object.
  offsets_.push_back(0);
  // Object counter starts at zero; it is advanced once per appended object.
  obj_ = 0;
}
|
||||
|
||||
// Registers an object of `objectsize` bytes without writing any data: records
// the running offset (previous offset plus this object's size) and advances
// the object counter.
void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) {
  const auto end_offset = objectsize + offsets_.back();
  offsets_.push_back(end_offset);
  ++obj_;
}
|
||||
|
||||
// Registers a NUL-terminated PDF object in the offset table (its size is its
// string length) and then writes it to the output.
void TessPDFRenderer::AppendPDFObject(const char *data) {
  const size_t object_length = strlen(data);
  AppendPDFObjectDIY(object_length);
  AppendString(data);
}
|
||||
|
||||
// Helper function to prevent us from accidentally writing
|
||||
// scientific notation to an HOCR or PDF file. Besides, three
|
||||
// decimal places are all you really need.
|
||||
// Rounds x to three decimal places. A result that compares equal to zero
// (which includes negative zero) is returned as plain 0.
static double prec(double x) {
  const double scale = 1000.0;
  const double rounded = round(x * scale) / scale;
  // 0 == -0.0 holds in floating point, so this test normalises both zeros.
  return (rounded == 0) ? 0 : rounded;
}
|
||||
|
||||
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
// The deltas are widened to long before squaring: squaring in int overflows
// (undefined behaviour) once a delta exceeds roughly 46340.
static long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
|
||||
|
||||
// Viewers like evince can get really confused during copy-paste when
|
||||
// the baseline wanders around. So I've decided to project every word
|
||||
// onto the (straight) line baseline. All numbers are in the native
|
||||
// PDF coordinate system, which has the origin in the bottom left and
|
||||
// the unit is points, which is 1/72 inch. Tesseract reports baselines
|
||||
// left-to-right no matter what the reading order is. We need the
|
||||
// word baseline in reading order, so we do that conversion here. Returns
|
||||
// the word's baseline origin and length.
|
||||
static void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1,
|
||||
int word_x2, int word_y2, int line_x1, int line_y1, int line_x2,
|
||||
int line_y2, double *x0, double *y0, double *length) {
|
||||
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
|
||||
std::swap(word_x1, word_x2);
|
||||
std::swap(word_y1, word_y2);
|
||||
}
|
||||
double word_length;
|
||||
double x, y;
|
||||
{
|
||||
int px = word_x1;
|
||||
int py = word_y1;
|
||||
double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
|
||||
if (l2 == 0) {
|
||||
x = line_x1;
|
||||
y = line_y1;
|
||||
} else {
|
||||
double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2;
|
||||
x = line_x2 + t * (line_x2 - line_x1);
|
||||
y = line_y2 + t * (line_y2 - line_y1);
|
||||
}
|
||||
word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));
|
||||
word_length = word_length * 72.0 / ppi;
|
||||
x = x * 72 / ppi;
|
||||
y = height - (y * 72.0 / ppi);
|
||||
}
|
||||
*x0 = x;
|
||||
*y0 = y;
|
||||
*length = word_length;
|
||||
}
|
||||
|
||||
// Compute coefficients for an affine matrix describing the rotation
|
||||
// of the text. If the text is right-to-left such as Arabic or Hebrew,
|
||||
// we reflect over the Y-axis. This matrix will set the coordinate
|
||||
// system for placing text in the PDF file.
|
||||
//
|
||||
// RTL
|
||||
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
|
||||
// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ]
|
||||
static void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2,
|
||||
double *a, double *b, double *c, double *d) {
|
||||
double theta =
|
||||
atan2(static_cast<double>(line_y1 - line_y2), static_cast<double>(line_x2 - line_x1));
|
||||
*a = cos(theta);
|
||||
*b = sin(theta);
|
||||
*c = -sin(theta);
|
||||
*d = cos(theta);
|
||||
switch (writing_direction) {
|
||||
case WRITING_DIRECTION_RIGHT_TO_LEFT:
|
||||
*a = -*a;
|
||||
*b = -*b;
|
||||
break;
|
||||
case WRITING_DIRECTION_TOP_TO_BOTTOM:
|
||||
// TODO(jbreiden) Consider using the vertical PDF writing mode.
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// There are some really awkward PDF viewers in the wild, such as
|
||||
// 'Preview' which ships with the Mac. They do a better job with text
|
||||
// selection and highlighting when given perfectly flat baseline
|
||||
// instead of very slightly tilted. We clip small tilts to appease
|
||||
// these viewers. I chose this threshold large enough to absorb noise,
|
||||
// but small enough that lines probably won't cross each other if the
|
||||
// whole page is tilted at almost exactly the clipping threshold.
|
||||
// Copies the baseline (x1, y1)-(x2, y2) to the output parameters, flattening
// it to a horizontal line at the mean y when the tilt is small: the rise must
// be below 2 * ppi / 72 pixels while the run is above that same threshold.
static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,
                         int *line_x2, int *line_y2) {
  *line_x1 = x1;
  *line_x2 = x2;
  *line_y1 = y1;
  *line_y2 = y2;

  // Both sides are scaled by 72 so the comparison stays in integers.
  const int limit = 2 * ppi;
  const int rise = abs(y2 - y1) * 72;
  const int run = abs(x2 - x1) * 72;
  if (rise >= limit || run <= limit) {
    return; // too steep or too short: keep the baseline as reported
  }

  const int flat_y = (y1 + y2) / 2;
  *line_y1 = flat_y;
  *line_y2 = flat_y;
}
|
||||
|
||||
static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
|
||||
if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
|
||||
tprintf("Dropping invalid codepoint %d\n", code);
|
||||
return false;
|
||||
}
|
||||
if (code < 0x10000) {
|
||||
snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);
|
||||
} else {
|
||||
int a = code - 0x010000;
|
||||
int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
|
||||
int low_surrogate = (0x03FF & a) + 0xDC00;
|
||||
snprintf(utf16, kMaxBytesPerCodepoint, "%04X%04X", high_surrogate, low_surrogate);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds the (uncompressed) page content stream: the image placement
// operators followed by one invisible-ink text object per block.
//
// api     supplies the source resolution and the result iterator.
// width, height  are streamed into the "cm" operator and height is also
//         forwarded to GetWordBaseline().
//
// Returns a NUL-terminated buffer allocated with new[]; the caller owns it
// and must release it with delete[].
//
// If api->GetIterator() yields no iterator, the returned stream contains
// only the image placement line (no text objects) instead of dereferencing
// a null pointer.
char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
  double ppi = api->GetSourceYResolution();

  // These initial conditions are all arbitrary and will be overwritten
  double old_x = 0.0, old_y = 0.0;
  int old_fontsize = 0;
  tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
  bool new_block = true;
  int fontsize = 0;
  double a = 1;
  double b = 0;
  double c = 0;
  double d = 1;

  std::stringstream pdf_str;
  // Use "C" locale (needed for double values prec()).
  pdf_str.imbue(std::locale::classic());
  // Use 8 digits for double values.
  pdf_str.precision(8);

  // TODO(jbreiden) This marries the text and image together.
  // Slightly cleaner from an abstraction standpoint if this were to
  // live inside a separate text object.
  pdf_str << "q " << prec(width) << " 0 0 " << prec(height) << " 0 0 cm";
  if (!textonly_) {
    pdf_str << " /Im1 Do";
  }
  pdf_str << " Q\n";

  int line_x1 = 0;
  int line_y1 = 0;
  int line_x2 = 0;
  int line_y2 = 0;

  const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator());
  // The null test keeps a missing iterator from being dereferenced.
  while (res_it != nullptr && !res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
      old_fontsize = 0;      // Every block will declare its fontsize
      new_block = true;      // Every block will declare its affine matrix
    }

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      // Zero-initialised so ClipBaseline never reads indeterminate values.
      int x1 = 0, y1 = 0, x2 = 0, y2 = 0;
      res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
      ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
    }

    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    // Writing direction changes at a per-word granularity
    tesseract::WritingDirection writing_direction;
    {
      tesseract::Orientation orientation;
      tesseract::TextlineOrder textline_order;
      float deskew_angle;
      res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
        switch (res_it->WordDirection()) {
          case DIR_LEFT_TO_RIGHT:
            writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
            break;
          case DIR_RIGHT_TO_LEFT:
            writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
            break;
          default:
            // Any other word direction inherits the previous word's direction.
            writing_direction = old_writing_direction;
        }
      }
    }

    // Where is word origin and how long is it?
    double x, y, word_length;
    {
      int word_x1 = 0, word_y1 = 0, word_x2 = 0, word_y2 = 0;
      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
      GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
                      line_y1, line_x2, line_y2, &x, &y, &word_length);
    }

    if (writing_direction != old_writing_direction || new_block) {
      AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
      pdf_str << " " << prec(a) // . This affine matrix
              << " " << prec(b) // . sets the coordinate
              << " " << prec(c) // . system for all
              << " " << prec(d) // . text that follows.
              << " " << prec(x) // .
              << " " << prec(y) // .
              << (" Tm ");      // Place cursor absolutely
      new_block = false;
    } else {
      double dx = x - old_x;
      double dy = y - old_y;
      pdf_str << " " << prec(dx * a + dy * b) << " " << prec(dx * c + dy * d)
              << (" Td "); // Relative moveto
    }
    old_x = x;
    old_y = y;
    old_writing_direction = writing_direction;

    // Adjust font size on a per word granularity. Pay attention to
    // fontsize, old_fontsize, and pdf_str. We've found that for
    // Arabic, Tesseract will happily return a fontsize of zero,
    // so we make up a default number to protect ourselves.
    {
      bool bold, italic, underlined, monospace, serif, smallcaps;
      int font_id;
      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
                                 &fontsize, &font_id);
      const int kDefaultFontsize = 8;
      if (fontsize <= 0) {
        fontsize = kDefaultFontsize;
      }
      if (fontsize != old_fontsize) {
        pdf_str << "/f-0-0 " << fontsize << " Tf ";
        old_fontsize = fontsize;
      }
    }

    // Both flags must be sampled before the symbol loop advances the iterator.
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    std::string pdf_word;
    int pdf_word_len = 0;
    do {
      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != '\0') {
        std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
        char utf16[kMaxBytesPerCodepoint];
        for (char32 code : unicodes) {
          if (CodepointToUtf16be(code, utf16)) {
            pdf_word += utf16;
            pdf_word_len++;
          }
        }
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
    if (res_it->IsAtBeginningOf(RIL_WORD)) {
      // Another word follows: append a space (U+0020) to this one.
      pdf_word += "0020";
      pdf_word_len++;
    }
    if (word_length > 0 && pdf_word_len > 0) {
      double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
      pdf_str << h_stretch << " Tz" // horizontal stretch
              << " [ <" << pdf_word // UTF-16BE representation
              << "> ] TJ";          // show the text
    }
    if (last_word_in_line) {
      pdf_str << " \n";
    }
    if (last_word_in_block) {
      pdf_str << "ET\n"; // end the text object
    }
  }
  const std::string &text = pdf_str.str();
  char *result = new char[text.length() + 1];
  strcpy(result, text.c_str());
  return result;
}
|
||||
|
||||
bool TessPDFRenderer::BeginDocumentHandler() {
|
||||
AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
|
||||
|
||||
// CATALOG
|
||||
AppendPDFObject(
|
||||
"1 0 obj\n"
|
||||
"<<\n"
|
||||
" /Type /Catalog\n"
|
||||
" /Pages 2 0 R\n"
|
||||
">>\nendobj\n");
|
||||
|
||||
// We are reserving object #2 for the /Pages
|
||||
// object, which I am going to create and write
|
||||
// at the end of the PDF file.
|
||||
AppendPDFObject("");
|
||||
|
||||
// TYPE0 FONT
|
||||
AppendPDFObject(
|
||||
"3 0 obj\n"
|
||||
"<<\n"
|
||||
" /BaseFont /GlyphLessFont\n"
|
||||
" /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
|
||||
" /Encoding /Identity-H\n"
|
||||
" /Subtype /Type0\n"
|
||||
" /ToUnicode 6 0 R\n" // ToUnicode
|
||||
" /Type /Font\n"
|
||||
">>\n"
|
||||
"endobj\n");
|
||||
|
||||
// CIDFONTTYPE2
|
||||
std::stringstream stream;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
stream.imbue(std::locale::classic());
|
||||
stream << "4 0 obj\n"
|
||||
"<<\n"
|
||||
" /BaseFont /GlyphLessFont\n"
|
||||
" /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
|
||||
" /CIDSystemInfo\n"
|
||||
" <<\n"
|
||||
" /Ordering (Identity)\n"
|
||||
" /Registry (Adobe)\n"
|
||||
" /Supplement 0\n"
|
||||
" >>\n"
|
||||
" /FontDescriptor 7 0 R\n" // Font descriptor
|
||||
" /Subtype /CIDFontType2\n"
|
||||
" /Type /Font\n"
|
||||
" /DW "
|
||||
<< (1000 / kCharWidth)
|
||||
<< "\n"
|
||||
">>\n"
|
||||
"endobj\n";
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
|
||||
// CIDTOGIDMAP
|
||||
const int kCIDToGIDMapSize = 2 * (1 << 16);
|
||||
const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
|
||||
for (int i = 0; i < kCIDToGIDMapSize; i++) {
|
||||
cidtogidmap[i] = (i % 2) ? 1 : 0;
|
||||
}
|
||||
size_t len;
|
||||
unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
|
||||
stream.str("");
|
||||
stream << "5 0 obj\n"
|
||||
"<<\n"
|
||||
" /Length "
|
||||
<< len
|
||||
<< " /Filter /FlateDecode\n"
|
||||
">>\n"
|
||||
"stream\n";
|
||||
AppendString(stream.str().c_str());
|
||||
long objsize = stream.str().size();
|
||||
AppendData(reinterpret_cast<char *>(comp), len);
|
||||
objsize += len;
|
||||
lept_free(comp);
|
||||
const char *endstream_endobj =
|
||||
"endstream\n"
|
||||
"endobj\n";
|
||||
AppendString(endstream_endobj);
|
||||
objsize += strlen(endstream_endobj);
|
||||
AppendPDFObjectDIY(objsize);
|
||||
|
||||
const char stream2[] =
|
||||
"/CIDInit /ProcSet findresource begin\n"
|
||||
"12 dict begin\n"
|
||||
"begincmap\n"
|
||||
"/CIDSystemInfo\n"
|
||||
"<<\n"
|
||||
" /Registry (Adobe)\n"
|
||||
" /Ordering (UCS)\n"
|
||||
" /Supplement 0\n"
|
||||
">> def\n"
|
||||
"/CMapName /Adobe-Identify-UCS def\n"
|
||||
"/CMapType 2 def\n"
|
||||
"1 begincodespacerange\n"
|
||||
"<0000> <FFFF>\n"
|
||||
"endcodespacerange\n"
|
||||
"1 beginbfrange\n"
|
||||
"<0000> <FFFF> <0000>\n"
|
||||
"endbfrange\n"
|
||||
"endcmap\n"
|
||||
"CMapName currentdict /CMap defineresource pop\n"
|
||||
"end\n"
|
||||
"end\n";
|
||||
|
||||
// TOUNICODE
|
||||
stream.str("");
|
||||
stream << "6 0 obj\n"
|
||||
"<< /Length "
|
||||
<< (sizeof(stream2) - 1)
|
||||
<< " >>\n"
|
||||
"stream\n"
|
||||
<< stream2
|
||||
<< "endstream\n"
|
||||
"endobj\n";
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
|
||||
// FONT DESCRIPTOR
|
||||
stream.str("");
|
||||
stream << "7 0 obj\n"
|
||||
"<<\n"
|
||||
" /Ascent 1000\n"
|
||||
" /CapHeight 1000\n"
|
||||
" /Descent -1\n" // Spec says must be negative
|
||||
" /Flags 5\n" // FixedPitch + Symbolic
|
||||
" /FontBBox [ 0 0 "
|
||||
<< (1000 / kCharWidth)
|
||||
<< " 1000 ]\n"
|
||||
" /FontFile2 8 0 R\n"
|
||||
" /FontName /GlyphLessFont\n"
|
||||
" /ItalicAngle 0\n"
|
||||
" /StemV 80\n"
|
||||
" /Type /FontDescriptor\n"
|
||||
">>\n"
|
||||
"endobj\n";
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
|
||||
stream.str("");
|
||||
stream << datadir_.c_str() << "/pdf.ttf";
|
||||
const uint8_t *font;
|
||||
std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
|
||||
std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
|
||||
auto size = buffer.size();
|
||||
if (size) {
|
||||
font = buffer.data();
|
||||
} else {
|
||||
#if !defined(NDEBUG)
|
||||
tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
|
||||
#endif
|
||||
font = pdf_ttf;
|
||||
size = sizeof(pdf_ttf);
|
||||
}
|
||||
|
||||
// FONTFILE2
|
||||
stream.str("");
|
||||
stream << "8 0 obj\n"
|
||||
"<<\n"
|
||||
" /Length "
|
||||
<< size
|
||||
<< "\n"
|
||||
" /Length1 "
|
||||
<< size
|
||||
<< "\n"
|
||||
">>\n"
|
||||
"stream\n";
|
||||
AppendString(stream.str().c_str());
|
||||
objsize = stream.str().size();
|
||||
AppendData(reinterpret_cast<const char *>(font), size);
|
||||
objsize += size;
|
||||
AppendString(endstream_endobj);
|
||||
objsize += strlen(endstream_endobj);
|
||||
AppendPDFObjectDIY(objsize);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Serialises an image as a complete PDF image XObject.
//
// pix / filename   image source; at least one must be non-null.
// objnum           object number written at the start of the object.
// pdf_object       receives a new[]-allocated buffer holding the object
//                  (not NUL-terminated); the caller releases it with delete[].
// pdf_object_size  receives the number of bytes in *pdf_object.
// jpg_quality      forwarded to l_generateCIDataForPdf().
//
// Returns false (with *pdf_object == nullptr and *pdf_object_size == 0 when
// the output pointers are valid) if an output pointer is null, no source is
// given, compressed image data cannot be produced, or the encoding type or
// samples-per-pixel value is not one handled here.
bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum,
                                    char **pdf_object, long int *pdf_object_size,
                                    const int jpg_quality) {
  if (pdf_object_size == nullptr || pdf_object == nullptr) {
    return false;
  }
  *pdf_object = nullptr;
  *pdf_object_size = 0;
  if (filename == nullptr && pix == nullptr) {
    return false;
  }

  L_Compressed_Data *cid = nullptr;
  // Common failure exit: release whatever compressed data exists.
  const auto fail = [&cid]() {
    l_CIDataDestroy(&cid);
    return false;
  };

  // PNG input is flate-encoded directly; anything else (or a PNG for which
  // that produced nothing) goes through the generic generator.
  int status = 0;
  if (pixGetInputFormat(pix) == IFF_PNG) {
    status = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
  }
  if (cid == nullptr) {
    status = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
  }
  if (status != 0 || cid == nullptr) {
    return fail();
  }

  // Map the compression type to the PDF stream filter name.
  const char *filter = nullptr;
  const char *group4 = "";
  if (cid->type == L_FLATE_ENCODE) {
    filter = "/FlateDecode";
  } else if (cid->type == L_JPEG_ENCODE) {
    filter = "/DCTDecode";
  } else if (cid->type == L_G4_ENCODE) {
    filter = "/CCITTFaxDecode";
    group4 = " /K -1\n";
  } else if (cid->type == L_JP2K_ENCODE) {
    filter = "/JPXDecode";
  } else {
    return fail();
  }

  // Maybe someday we will accept RGBA but today is not that day.
  // It requires creating an /SMask for the alpha channel.
  // http://stackoverflow.com/questions/14220221
  std::stringstream colorspace;
  // Use "C" locale (needed for int values larger than 999).
  colorspace.imbue(std::locale::classic());
  if (cid->ncolors > 0) {
    colorspace << " /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) << " "
               << cid->cmapdatahex << " ]\n";
  } else if (cid->spp == 1) {
    if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
      colorspace << " /ColorSpace /DeviceGray\n"
                    " /Decode [1 0]\n";
    } else {
      colorspace << " /ColorSpace /DeviceGray\n";
    }
  } else if (cid->spp == 3) {
    colorspace << " /ColorSpace /DeviceRGB\n";
  } else {
    return fail();
  }

  const int predictor = (cid->predictor) ? 14 : 1;

  // IMAGE: dictionary text before the colour space entry.
  std::stringstream dict_head;
  // Use "C" locale (needed for int values larger than 999).
  dict_head.imbue(std::locale::classic());
  dict_head << objnum
            << " 0 obj\n"
               "<<\n"
               " /Length "
            << cid->nbytescomp
            << "\n"
               " /Subtype /Image\n";

  // Dictionary text after the colour space entry, up to the stream keyword.
  std::stringstream dict_tail;
  // Use "C" locale (needed for int values larger than 999).
  dict_tail.imbue(std::locale::classic());
  dict_tail << " /Width " << cid->w
            << "\n"
               " /Height "
            << cid->h
            << "\n"
               " /BitsPerComponent "
            << cid->bps
            << "\n"
               " /Filter "
            << filter
            << "\n"
               " /DecodeParms\n"
               " <<\n"
               " /Predictor "
            << predictor
            << "\n"
               " /Colors "
            << cid->spp << "\n"
            << group4 << " /Columns " << cid->w
            << "\n"
               " /BitsPerComponent "
            << cid->bps
            << "\n"
               " >>\n"
               ">>\n"
               "stream\n";

  const char *trailer =
      "endstream\n"
      "endobj\n";

  const std::string preamble = dict_head.str() + colorspace.str() + dict_tail.str();
  const size_t preamble_len = preamble.size();
  const size_t payload_len = cid->nbytescomp;
  const size_t trailer_len = strlen(trailer);

  *pdf_object_size = preamble_len + payload_len + trailer_len;
  *pdf_object = new char[*pdf_object_size];

  // Lay out: dictionary text, compressed image bytes, end markers.
  char *cursor = *pdf_object;
  memcpy(cursor, preamble.data(), preamble_len);
  cursor += preamble_len;
  memcpy(cursor, cid->datacomp, payload_len);
  cursor += payload_len;
  memcpy(cursor, trailer, trailer_len);

  l_CIDataDestroy(&cid);
  return true;
}
|
||||
|
||||
// Appends one page to the PDF: the page object, its compressed content
// stream and (unless textonly_ is set) the image XObject.
//
// Returns false when the API has no input image, reports a non-positive
// resolution, the content stream cannot be compressed, or the image cannot
// be converted to a PDF object.
//
// The content stream is generated and compressed before anything is
// written, so a compression failure leaves the output and the page list
// untouched.
bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
  Pix *pix = api->GetInputImage();
  const char *filename = api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0) {
    return false;
  }
  // Page size: pixel dimensions scaled by 72 / ppi.
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  // CONTENTS (prepared first, written after the page object below)
  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
  const size_t pdftext_len = strlen(pdftext.get());
  size_t len = 0;
  unsigned char *comp_pdftext =
      zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
  if (comp_pdftext == nullptr) {
    // Never pass a null buffer / unset length on to AppendData().
    tprintf("Error: could not compress the PDF page content stream.\n");
    return false;
  }
  long comp_pdftext_len = len;

  std::stringstream xobject;
  // Use "C" locale (needed for int values larger than 999).
  xobject.imbue(std::locale::classic());
  if (!textonly_) {
    // The image follows the page (obj_) and contents (obj_ + 1) objects.
    xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
  }

  // PAGE
  std::stringstream stream;
  // Use "C" locale (needed for double values width and height).
  stream.imbue(std::locale::classic());
  stream.precision(2);
  stream << std::fixed << obj_
         << " 0 obj\n"
            "<<\n"
            " /Type /Page\n"
            " /Parent 2 0 R\n" // Pages object
            " /MediaBox [0 0 "
         << width << " " << height
         << "]\n"
            " /Contents "
         << (obj_ + 1)
         << " 0 R\n" // Contents object
            " /Resources\n"
            " <<\n"
            " "
         << xobject.str() << // Image object
      " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
      " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
      " >>\n"
      ">>\n"
      "endobj\n";
  pages_.push_back(obj_);
  AppendPDFObject(stream.str().c_str());

  // CONTENTS
  stream.str("");
  stream << obj_
         << " 0 obj\n"
            "<<\n"
            " /Length "
         << comp_pdftext_len
         << " /Filter /FlateDecode\n"
            ">>\n"
            "stream\n";
  AppendString(stream.str().c_str());
  long objsize = stream.str().size();
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);
  const char *b2 =
      "endstream\n"
      "endobj\n";
  AppendString(b2);
  objsize += strlen(b2);
  // Written in pieces, so the object size is recorded by hand.
  AppendPDFObjectDIY(objsize);

  if (!textonly_) {
    char *pdf_object = nullptr;
    // Fallback used only if the parameter lookup below does not set it.
    // NOTE(review): 85 is believed to match Tesseract's built-in default
    // for jpg_quality; confirm against the parameter definition.
    int jpg_quality = 85;
    api->GetIntVariable("jpg_quality", &jpg_quality);
    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
      return false;
    }
    AppendData(pdf_object, objsize);
    AppendPDFObjectDIY(objsize);
    delete[] pdf_object;
  }
  return true;
}
|
||||
|
||||
bool TessPDFRenderer::EndDocumentHandler() {
|
||||
// We reserved the /Pages object number early, so that the /Page
|
||||
// objects could refer to their parent. We finally have enough
|
||||
// information to go fill it in. Using lower level calls to manipulate
|
||||
// the offset record in two spots, because we are placing objects
|
||||
// out of order in the file.
|
||||
|
||||
// PAGES
|
||||
const long int kPagesObjectNumber = 2;
|
||||
offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
|
||||
std::stringstream stream;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
stream.imbue(std::locale::classic());
|
||||
stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
|
||||
AppendString(stream.str().c_str());
|
||||
size_t pages_objsize = stream.str().size();
|
||||
for (const auto &page : pages_) {
|
||||
stream.str("");
|
||||
stream << page << " 0 R ";
|
||||
AppendString(stream.str().c_str());
|
||||
pages_objsize += stream.str().size();
|
||||
}
|
||||
stream.str("");
|
||||
stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
|
||||
AppendString(stream.str().c_str());
|
||||
pages_objsize += stream.str().size();
|
||||
offsets_.back() += pages_objsize; // manipulation #2
|
||||
|
||||
// INFO
|
||||
std::string utf16_title = "FEFF"; // byte_order_marker
|
||||
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
|
||||
char utf16[kMaxBytesPerCodepoint];
|
||||
for (char32 code : unicodes) {
|
||||
if (CodepointToUtf16be(code, utf16)) {
|
||||
utf16_title += utf16;
|
||||
}
|
||||
}
|
||||
|
||||
char *datestr = l_getFormattedDate();
|
||||
stream.str("");
|
||||
stream << obj_
|
||||
<< " 0 obj\n"
|
||||
"<<\n"
|
||||
" /Producer (Tesseract "
|
||||
<< tesseract::TessBaseAPI::Version()
|
||||
<< ")\n"
|
||||
" /CreationDate (D:"
|
||||
<< datestr
|
||||
<< ")\n"
|
||||
" /Title <"
|
||||
<< utf16_title.c_str()
|
||||
<< ">\n"
|
||||
">>\n"
|
||||
"endobj\n";
|
||||
lept_free(datestr);
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
stream.str("");
|
||||
stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
|
||||
AppendString(stream.str().c_str());
|
||||
for (int i = 1; i < obj_; i++) {
|
||||
stream.str("");
|
||||
stream.width(10);
|
||||
stream.fill('0');
|
||||
stream << offsets_[i] << " 00000 n \n";
|
||||
AppendString(stream.str().c_str());
|
||||
}
|
||||
stream.str("");
|
||||
stream << "trailer\n<<\n /Size " << obj_
|
||||
<< "\n"
|
||||
" /Root 1 0 R\n" // catalog
|
||||
" /Info "
|
||||
<< (obj_ - 1)
|
||||
<< " 0 R\n" // info
|
||||
">>\nstartxref\n"
|
||||
<< offsets_.back() << "\n%%EOF\n";
|
||||
AppendString(stream.str().c_str());
|
||||
return true;
|
||||
}
|
||||
} // namespace tesseract
|
241
3rdparty/tesseract_ocr/tesseract/src/api/renderer.cpp
vendored
Normal file
241
3rdparty/tesseract_ocr/tesseract/src/api/renderer.cpp
vendored
Normal file
|
@ -0,0 +1,241 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: renderer.cpp
|
||||
// Description: Rendering interface to inject into TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/renderer.h>
|
||||
#include <cstring>
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <string> // std::string
|
||||
#include "serialis.h" // Serialize
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**********************************************************************
|
||||
* Base Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Sets up a renderer that writes to stdout when outputbase is "-" or
// "stdout", and otherwise to the file "<outputbase>.<extension>" opened in
// binary write mode. If that file cannot be opened the renderer is marked
// unhappy (happy_ == false).
TessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)
    : file_extension_(extension)
    , title_("")
    , imagenum_(-1)
    , fout_(stdout)
    , next_(nullptr)
    , happy_(true) {
  const bool use_stdout = strcmp(outputbase, "-") == 0 || strcmp(outputbase, "stdout") == 0;
  if (use_stdout) {
    return;
  }

  std::string path(outputbase);
  path += ".";
  path += extension;
  fout_ = fopen(path.c_str(), "wb");
  happy_ = (fout_ != nullptr);
}
|
||||
|
||||
// Closes the output file (or, for stdout, only clears its error state) and
// deletes the next renderer in the chain.
TessResultRenderer::~TessResultRenderer() {
  if (fout_ == stdout) {
    // stdout is not ours to close.
    clearerr(fout_);
  } else if (fout_ != nullptr) {
    fclose(fout_);
  }
  delete next_;
}
|
||||
|
||||
// Splices `next` (which may itself be the head of a chain) in directly
// after this renderer. Whatever previously followed this renderer is
// re-attached behind the last element of the inserted chain.
// A null `next` is ignored.
void TessResultRenderer::insert(TessResultRenderer *next) {
  if (!next) {
    return;
  }

  TessResultRenderer *const old_successor = next_;
  next_ = next;
  if (old_successor == nullptr) {
    return;
  }

  // Walk to the tail of the inserted chain and hang the old successor there.
  TessResultRenderer *tail = next;
  while (tail->next_ != nullptr) {
    tail = tail->next_;
  }
  tail->next_ = old_successor;
}
|
||||
|
||||
bool TessResultRenderer::BeginDocument(const char *title) {
|
||||
if (!happy_) {
|
||||
return false;
|
||||
}
|
||||
title_ = title;
|
||||
imagenum_ = -1;
|
||||
bool ok = BeginDocumentHandler();
|
||||
if (next_) {
|
||||
ok = next_->BeginDocument(title) && ok;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Renders the current recognition result: increments the image counter,
// runs this renderer's AddImageHandler() and then forwards the call to the
// next renderer in the chain (if any).
// Returns false immediately when the renderer is unhappy; otherwise true
// only if this renderer and every chained renderer succeeded.
bool TessResultRenderer::AddImage(TessBaseAPI *api) {
  if (!happy_) {
    return false;
  }
  ++imagenum_;

  const bool handled = AddImageHandler(api);
  if (next_ == nullptr) {
    return handled;
  }
  // The chained renderer is always invoked, even if this one failed.
  const bool chained = next_->AddImage(api);
  return chained && handled;
}
|
||||
|
||||
bool TessResultRenderer::EndDocument() {
|
||||
if (!happy_) {
|
||||
return false;
|
||||
}
|
||||
bool ok = EndDocumentHandler();
|
||||
if (next_) {
|
||||
ok = next_->EndDocument() && ok;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Appends the NUL-terminated string `s` (without its terminator) to the
// output by delegating to AppendData().
void TessResultRenderer::AppendString(const char *s) {
  const size_t length = strlen(s);
  AppendData(s, length);
}
|
||||
|
||||
void TessResultRenderer::AppendData(const char *s, int len) {
|
||||
if (!tesseract::Serialize(fout_, s, len)) {
|
||||
happy_ = false;
|
||||
}
|
||||
fflush(fout_);
|
||||
}
|
||||
|
||||
bool TessResultRenderer::BeginDocumentHandler() {
|
||||
return happy_;
|
||||
}
|
||||
|
||||
bool TessResultRenderer::EndDocumentHandler() {
|
||||
return happy_;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* UTF8 Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates a renderer whose base class is set up with the "txt" extension.
TessTextRenderer::TessTextRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "txt") {
}
|
||||
|
||||
// Appends the UTF-8 text of the current page, followed by the value of the
// "page_separator" variable when that value is set and non-empty.
// Returns false when the API yields no text.
bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_text(api->GetUTF8Text());
  if (!page_text) {
    return false;
  }
  AppendString(page_text.get());

  const char *separator = api->GetStringVariable("page_separator");
  const bool has_separator = separator != nullptr && separator[0] != '\0';
  if (has_separator) {
    AppendString(separator);
  }
  return true;
}
|
||||
|
||||
/**********************************************************************
|
||||
* TSV Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates a renderer whose base class is set up with the "tsv" extension;
// font_info_ is switched off.
TessTsvRenderer::TessTsvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "tsv")
    , font_info_(false) {
}
|
||||
|
||||
// Creates a renderer whose base class is set up with the "tsv" extension;
// font_info_ is taken from the caller.
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "tsv")
    , font_info_(font_info) {
}
|
||||
|
||||
// Writes the tab-separated column heading row. Always returns true.
bool TessTsvRenderer::BeginDocumentHandler() {
  // Output TSV column headings
  static const char kColumnHeadings[] =
      "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num"
      "\tleft\ttop\twidth\theight\tconf\ttext\n";
  AppendString(kColumnHeadings);
  return true;
}
|
||||
|
||||
// Writes nothing at the end of the document. Always returns true.
bool TessTsvRenderer::EndDocumentHandler() {
  return true;
}
|
||||
|
||||
// Appends the TSV text of the current page (numbered by imagenum()).
// Returns false when the API yields no text.
bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_tsv(api->GetTSVText(imagenum()));
  if (!page_tsv) {
    return false;
  }
  AppendString(page_tsv.get());
  return true;
}
|
||||
|
||||
/**********************************************************************
|
||||
* UNLV Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates a renderer whose base class is set up with the "unlv" extension.
TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "unlv") {
}
|
||||
|
||||
// Appends the UNLV text of the current page.
// Returns false when the API yields no text.
bool TessUnlvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_unlv(api->GetUNLVText());
  if (!page_unlv) {
    return false;
  }
  AppendString(page_unlv.get());
  return true;
}
|
||||
|
||||
/**********************************************************************
|
||||
* BoxText Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates a renderer whose base class is set up with the "box" extension.
TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {
}
|
||||
|
||||
// Appends the box-file text of the current page (numbered by imagenum()).
// Returns false when the API yields no text.
bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_boxes(api->GetBoxText(imagenum()));
  if (!page_boxes) {
    return false;
  }
  AppendString(page_boxes.get());
  return true;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/**********************************************************************
|
||||
* Osd Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates a renderer whose base class is set up with the "osd" extension.
TessOsdRenderer::TessOsdRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "osd") {
}
|
||||
|
||||
// Appends the OSD text of the current page (numbered by imagenum()).
// Returns false when the API yields no text.
bool TessOsdRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_osd(api->GetOsdText(imagenum()));
  if (!page_osd) {
    return false;
  }
  AppendString(page_osd.get());
  return true;
}
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
} // namespace tesseract
|
106
3rdparty/tesseract_ocr/tesseract/src/api/wordstrboxrenderer.cpp
vendored
Normal file
106
3rdparty/tesseract_ocr/tesseract/src/api/wordstrboxrenderer.cpp
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
/**********************************************************************
|
||||
* File: wordstrboxrenderer.cpp
|
||||
* Description: Renderer for creating box file with WordStr strings.
|
||||
* based on the tsv renderer.
|
||||
*
|
||||
* (C) Copyright 2019, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <tesseract/baseapi.h> // for TessBaseAPI
|
||||
#include <tesseract/renderer.h>
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Create a UTF8 box file with WordStr strings from the internal data
|
||||
* structures. page_number is a 0-base page index that will appear in the box
|
||||
* file. Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
|
||||
char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::string wordstr_box_str;
|
||||
int left = 0, top = 0, right = 0, bottom = 0;
|
||||
|
||||
bool first_line = true;
|
||||
|
||||
LTRResultIterator *res_it = GetLTRIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
if (!first_line) {
|
||||
wordstr_box_str += "\n\t " + std::to_string(right + 1);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
|
||||
wordstr_box_str += " " + std::to_string(right + 5);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - top);
|
||||
wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
|
||||
wordstr_box_str += "\n";
|
||||
} else {
|
||||
first_line = false;
|
||||
}
|
||||
// Use bounding box for whole line for WordStr
|
||||
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||
wordstr_box_str += "WordStr " + std::to_string(left);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
|
||||
wordstr_box_str += " " + std::to_string(right);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - top);
|
||||
wordstr_box_str += " " + std::to_string(page_number); // word
|
||||
wordstr_box_str += " #";
|
||||
}
|
||||
do {
|
||||
wordstr_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
|
||||
wordstr_box_str += " ";
|
||||
res_it->Next(RIL_WORD);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
}
|
||||
|
||||
if (left != 0 && top != 0 && right != 0 && bottom != 0) {
|
||||
wordstr_box_str += "\n\t " + std::to_string(right + 1);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
|
||||
wordstr_box_str += " " + std::to_string(right + 5);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - top);
|
||||
wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
|
||||
wordstr_box_str += "\n";
|
||||
}
|
||||
char *ret = new char[wordstr_box_str.length() + 1];
|
||||
strcpy(ret, wordstr_box_str.c_str());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* WordStrBox Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Forwards the output base name to the base renderer together with the
// fixed "box" extension; no further state is set up here.
TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {
}
|
||||
|
||||
// Fetches the WordStr box text for the current image from api and appends it
// to the renderer output. Returns false when the API yields no text.
bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  // The text is returned as a new[]-allocated buffer; the smart pointer
  // releases it when this function returns.
  const std::unique_ptr<const char[]> box_text(api->GetWordStrBoxText(imagenum()));
  if (!box_text) {
    return false;
  }
  AppendString(box_text.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract.
|
30
3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.cpp
vendored
Normal file
30
3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.cpp
vendored
Normal file
|
@ -0,0 +1,30 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File:        dotproduct.cpp
|
||||
// Description: Native dot product function.
|
||||
//
|
||||
// (C) Copyright 2018, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Plain C++ dot product: returns the sum of u[k] * v[k] for k in [0, n).
// The products are accumulated in index order; a non-positive n yields 0.0.
double DotProductNative(const double *u, const double *v, int n) {
  double sum = 0.0;
  const double *lhs = u;
  const double *rhs = v;
  for (int remaining = n; remaining > 0; --remaining) {
    sum += *lhs++ * *rhs++;
  }
  return sum;
}
|
||||
|
||||
} // namespace tesseract
|
36
3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.h
vendored
Normal file
36
3rdparty/tesseract_ocr/tesseract/src/arch/dotproduct.h
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproduct.h
|
||||
// Description: Native dot product function.
|
||||
//
|
||||
// (C) Copyright 2018, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
|
||||
#define TESSERACT_ARCH_DOTPRODUCT_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
double DotProductNative(const double *u, const double *v, int n);
|
||||
|
||||
// Uses Intel AVX intrinsics to access the SIMD instruction set.
|
||||
double DotProductAVX(const double *u, const double *v, int n);
|
||||
|
||||
// Uses Intel FMA intrinsics to access the SIMD instruction set.
|
||||
double DotProductFMA(const double *u, const double *v, int n);
|
||||
|
||||
// Uses Intel SSE intrinsics to access the SIMD instruction set.
|
||||
double DotProductSSE(const double *u, const double *v, int n);
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_ARCH_DOTPRODUCT_H_
|
63
3rdparty/tesseract_ocr/tesseract/src/arch/dotproductavx.cpp
vendored
Normal file
63
3rdparty/tesseract_ocr/tesseract/src/arch/dotproductavx.cpp
vendored
Normal file
|
@ -0,0 +1,63 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproductavx.cpp
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__AVX__)
|
||||
# if defined(__i686__) || defined(__x86_64__)
|
||||
# error Implementation only for AVX capable architectures
|
||||
# endif
|
||||
#else
|
||||
|
||||
# include <immintrin.h>
|
||||
# include <cstdint>
|
||||
# include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel AVX intrinsics to access the SIMD instruction set.
|
||||
double DotProductAVX(const double *u, const double *v, int n) {
|
||||
const unsigned quot = n / 8;
|
||||
const unsigned rem = n % 8;
|
||||
__m256d t0 = _mm256_setzero_pd();
|
||||
__m256d t1 = _mm256_setzero_pd();
|
||||
for (unsigned k = 0; k < quot; k++) {
|
||||
__m256d f0 = _mm256_loadu_pd(u);
|
||||
__m256d f1 = _mm256_loadu_pd(v);
|
||||
f0 = _mm256_mul_pd(f0, f1);
|
||||
t0 = _mm256_add_pd(t0, f0);
|
||||
u += 4;
|
||||
v += 4;
|
||||
__m256d f2 = _mm256_loadu_pd(u);
|
||||
__m256d f3 = _mm256_loadu_pd(v);
|
||||
f2 = _mm256_mul_pd(f2, f3);
|
||||
t1 = _mm256_add_pd(t1, f2);
|
||||
u += 4;
|
||||
v += 4;
|
||||
}
|
||||
t0 = _mm256_hadd_pd(t0, t1);
|
||||
alignas(32) double tmp[4];
|
||||
_mm256_store_pd(tmp, t0);
|
||||
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
|
||||
for (unsigned k = 0; k < rem; k++) {
|
||||
result += *u++ * *v++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
61
3rdparty/tesseract_ocr/tesseract/src/arch/dotproductfma.cpp
vendored
Normal file
61
3rdparty/tesseract_ocr/tesseract/src/arch/dotproductfma.cpp
vendored
Normal file
|
@ -0,0 +1,61 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproductfma.cpp
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Stefan Weil
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__FMA__)
|
||||
# if defined(__i686__) || defined(__x86_64__)
|
||||
# error Implementation only for FMA capable architectures
|
||||
# endif
|
||||
#else
|
||||
|
||||
# include <immintrin.h>
|
||||
# include <cstdint>
|
||||
# include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel FMA intrinsics to access the SIMD instruction set.
|
||||
double DotProductFMA(const double *u, const double *v, int n) {
|
||||
const unsigned quot = n / 8;
|
||||
const unsigned rem = n % 8;
|
||||
__m256d t0 = _mm256_setzero_pd();
|
||||
__m256d t1 = _mm256_setzero_pd();
|
||||
for (unsigned k = 0; k < quot; k++) {
|
||||
__m256d f0 = _mm256_loadu_pd(u);
|
||||
__m256d f1 = _mm256_loadu_pd(v);
|
||||
t0 = _mm256_fmadd_pd(f0, f1, t0);
|
||||
u += 4;
|
||||
v += 4;
|
||||
__m256d f2 = _mm256_loadu_pd(u);
|
||||
__m256d f3 = _mm256_loadu_pd(v);
|
||||
t1 = _mm256_fmadd_pd(f2, f3, t1);
|
||||
u += 4;
|
||||
v += 4;
|
||||
}
|
||||
t0 = _mm256_hadd_pd(t0, t1);
|
||||
alignas(32) double tmp[4];
|
||||
_mm256_store_pd(tmp, t0);
|
||||
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
|
||||
for (unsigned k = 0; k < rem; k++) {
|
||||
result += *u++ * *v++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
84
3rdparty/tesseract_ocr/tesseract/src/arch/dotproductsse.cpp
vendored
Normal file
84
3rdparty/tesseract_ocr/tesseract/src/arch/dotproductsse.cpp
vendored
Normal file
|
@ -0,0 +1,84 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dotproductsse.cpp
|
||||
// Description: Architecture-specific dot-product function.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2015, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__SSE4_1__)
|
||||
# if defined(__i686__) || defined(__x86_64__)
|
||||
# error Implementation only for SSE 4.1 capable architectures
|
||||
# endif
|
||||
#else
|
||||
|
||||
# include <emmintrin.h>
|
||||
# include <smmintrin.h>
|
||||
# include <cstdint>
|
||||
# include "dotproduct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel SSE intrinsics to access the SIMD instruction set.
|
||||
double DotProductSSE(const double *u, const double *v, int n) {
|
||||
int max_offset = n - 2;
|
||||
int offset = 0;
|
||||
// Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
|
||||
// v, and multiplying them together in parallel.
|
||||
__m128d sum = _mm_setzero_pd();
|
||||
if (offset <= max_offset) {
|
||||
offset = 2;
|
||||
// Aligned load is reputedly faster but requires 16 byte aligned input.
|
||||
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
|
||||
// Use aligned load.
|
||||
sum = _mm_load_pd(u);
|
||||
__m128d floats2 = _mm_load_pd(v);
|
||||
// Multiply.
|
||||
sum = _mm_mul_pd(sum, floats2);
|
||||
while (offset <= max_offset) {
|
||||
__m128d floats1 = _mm_load_pd(u + offset);
|
||||
floats2 = _mm_load_pd(v + offset);
|
||||
offset += 2;
|
||||
floats1 = _mm_mul_pd(floats1, floats2);
|
||||
sum = _mm_add_pd(sum, floats1);
|
||||
}
|
||||
} else {
|
||||
// Use unaligned load.
|
||||
sum = _mm_loadu_pd(u);
|
||||
__m128d floats2 = _mm_loadu_pd(v);
|
||||
// Multiply.
|
||||
sum = _mm_mul_pd(sum, floats2);
|
||||
while (offset <= max_offset) {
|
||||
__m128d floats1 = _mm_loadu_pd(u + offset);
|
||||
floats2 = _mm_loadu_pd(v + offset);
|
||||
offset += 2;
|
||||
floats1 = _mm_mul_pd(floats1, floats2);
|
||||
sum = _mm_add_pd(sum, floats1);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Add the 2 sums in sum horizontally.
|
||||
sum = _mm_hadd_pd(sum, sum);
|
||||
// Extract the low result.
|
||||
double result = _mm_cvtsd_f64(sum);
|
||||
// Add on any left-over products.
|
||||
while (offset < n) {
|
||||
result += u[offset] * v[offset];
|
||||
++offset;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
94
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.cpp
vendored
Normal file
94
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.cpp
vendored
Normal file
|
@ -0,0 +1,94 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intsimdmatrix.cpp
|
||||
// Description: Base class for 8-bit int SIMD matrix multipliers.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "intsimdmatrix.h"
|
||||
#include "matrix.h" // for GENERIC_2D_ARRAY
|
||||
#include "simddetect.h" // for SIMDDetect
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Definition of the static pointer declared in IntSimdMatrix; it starts out
// null. NOTE(review): presumably assigned by the SIMD detection code once the
// available instruction set is known - confirm against simddetect.
const IntSimdMatrix *IntSimdMatrix::intSimdMatrix = nullptr;
|
||||
|
||||
// Computes a reshaped copy of the weight matrix w.
|
||||
void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
|
||||
int32_t &rounded_num_out) const {
|
||||
const int num_out = w.dim1();
|
||||
const int num_in = w.dim2() - 1;
|
||||
// The rounded-up sizes of the reshaped weight matrix, excluding biases.
|
||||
int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
|
||||
rounded_num_out = RoundOutputs(num_out);
|
||||
// Add the bias and compute the required size.
|
||||
shaped_w.resize((rounded_num_in + 1) * rounded_num_out, 0);
|
||||
int shaped_index = 0;
|
||||
int output = 0;
|
||||
// Each number of registers needs a different format! Iterates over the
|
||||
// different numbers of registers (each a power of 2).
|
||||
for (int num_registers = max_output_registers_; num_registers >= 1; num_registers /= 2) {
|
||||
// The number of outputs that we will generate with this many registers.
|
||||
int num_outputs_per_register_set = num_registers * num_outputs_per_register_;
|
||||
// Use the max number of registers until we have to go fewer.
|
||||
while (output + num_outputs_per_register_set <= rounded_num_out) {
|
||||
// Accumulating outputs in registers saves iterating over the inputs, so
|
||||
// we only have to do it once per output register set.
|
||||
for (int input = 0; input < num_in; input += num_inputs_per_group_) {
|
||||
// Iterate over the number of outputs in a register set.
|
||||
for (int j = 0; j < num_outputs_per_register_set; ++j) {
|
||||
// Inner-most loop corresponds to the number of inputs in an input
|
||||
// group.
|
||||
for (int i = 0; i < num_inputs_per_group_; ++i) {
|
||||
int8_t weight = 0;
|
||||
if (output + j < num_out && input + i < num_in) {
|
||||
weight = w(output + j, input + i);
|
||||
}
|
||||
shaped_w[shaped_index++] = weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Append the bias weights for the register set.
|
||||
for (int j = 0; j < num_outputs_per_register_set; ++j) {
|
||||
int8_t weight = 0;
|
||||
if (output + j < num_out) {
|
||||
weight = w(output + j, num_in);
|
||||
}
|
||||
shaped_w[shaped_index++] = weight;
|
||||
}
|
||||
output += num_outputs_per_register_set;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes matrix.vector v = Wu.
|
||||
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
|
||||
// u is imagined to have an extra element at the end with value 1, to
|
||||
// implement the bias, but it doesn't actually have it.
|
||||
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
|
||||
const std::vector<double> &scales, const int8_t *u, double *v) {
|
||||
int num_out = w.dim1();
|
||||
int num_in = w.dim2() - 1;
|
||||
// Base implementation.
|
||||
for (int i = 0; i < num_out; ++i) {
|
||||
const int8_t *wi = w[i];
|
||||
int total = 0;
|
||||
for (int j = 0; j < num_in; ++j) {
|
||||
total += wi[j] * u[j];
|
||||
}
|
||||
// Add in the bias and correct for integer values.
|
||||
v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
123
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.h
vendored
Normal file
123
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrix.h
vendored
Normal file
|
@ -0,0 +1,123 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intsimdmatrix.h
|
||||
// Description: Base class for 8-bit int SIMD matrix multipliers.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
|
||||
#define TESSERACT_ARCH_INTSIMDMATRIX_H_
|
||||
|
||||
#include <tesseract/export.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
template <class T>
|
||||
class GENERIC_2D_ARRAY;
|
||||
|
||||
// Base class for a SIMD function to multiply a matrix by a vector, with sources
|
||||
// of 8-bit signed integer, and result in a double, after appropriate scaling.
|
||||
// Assumes a specific method of multiplication that can be applied to any size
|
||||
// and number of SIMD registers as follows:
|
||||
// int32_t results are computed with num_outputs_per_register_ in each of
|
||||
// max_output_registers_ result registers, repeatedly until it would make too
|
||||
// many results, then the number of registers is halved, and so-on down to a
|
||||
// single result register. The last calculation only outputs the required number
|
||||
// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
|
||||
// num_outputs_per_register_ = 4, and max_output_registers_ = 8,
|
||||
// Step 1: 8x4=32 results are computed,
|
||||
// Step 2: 8x4=32 again, total 64,
|
||||
// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
|
||||
// Step 4: 1x3, total 75.
|
||||
// Each step above is computed using a PartialFunc, which runs over the input
|
||||
// vector once. The input is read one registerful of num_inputs_per_register_
|
||||
// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
|
||||
// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
|
||||
// Since it is slow (on Intel at least) to horizontally add in a register,
|
||||
// provision is made to process num_inputs_per_group_ inputs at a time, with
|
||||
// the group being replicated num_input_groups_ times and multiplied by a
|
||||
// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
|
||||
// This is most convenient if num_inputs_per_group_ is 4, and the product
|
||||
// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
|
||||
// results in the process, but it doesn't have to be implemented that way.
|
||||
// The weights are re-ordered by Init() to be used sequentially by the above
|
||||
// algorithm, followed by the biases, so they can be added at the end.
|
||||
// The base class computes the base C++ implementation.
|
||||
// NOTE that, although the subclasses execute on different SIMD hardware, no
|
||||
// virtual methods are needed, as the constructor sets up everything that
|
||||
// is required to allow the base class implementation to do all the work.
|
||||
struct TESS_API IntSimdMatrix {
|
||||
// Computes a reshaped copy of the weight matrix w.
|
||||
void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w,
|
||||
int32_t &rounded_num_out) const;
|
||||
|
||||
// Rounds the size up to a multiple of the input register size (in int8_t).
|
||||
int RoundInputs(int size) const {
|
||||
return Roundup(size, num_inputs_per_register_);
|
||||
}
|
||||
// Rounds the size up to a multiple of the output register size (in int32_t).
|
||||
int RoundOutputs(int size) const {
|
||||
return Roundup(size, num_outputs_per_register_);
|
||||
}
|
||||
|
||||
// Computes matrix.vector v = Wu.
|
||||
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
|
||||
// u is imagined to have an extra element at the end with value 1, to
|
||||
// implement the bias, but it doesn't actually have it.
|
||||
// Computes the base C++ implementation.
|
||||
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
|
||||
const int8_t *u, double *v);
|
||||
|
||||
// Rounds the input up to a multiple of the given factor.
|
||||
static int Roundup(int input, int factor) {
|
||||
return (input + factor - 1) / factor * factor;
|
||||
}
|
||||
|
||||
// Computes matrix.vector v = Wu.
|
||||
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
|
||||
// u is imagined to have an extra element at the end with value 1, to
|
||||
// implement the bias, but it doesn't actually have it.
|
||||
// Uses an optimized implementation with partial funcs.
|
||||
// NOTE: The size of the input vector (u) must be padded using
|
||||
// RoundInputs above.
|
||||
// The input will be over-read to the extent of the padding. There are no
|
||||
// alignment requirements.
|
||||
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
|
||||
double *);
|
||||
MatrixDotVectorFunction matrixDotVectorFunction;
|
||||
|
||||
// Number of 32 bit outputs held in each register.
|
||||
int num_outputs_per_register_;
|
||||
// Maximum number of registers that we will use to hold outputs.
|
||||
int max_output_registers_;
|
||||
// Number of 8 bit inputs in the inputs register.
|
||||
int num_inputs_per_register_;
|
||||
// Number of inputs in each weight group.
|
||||
int num_inputs_per_group_;
|
||||
// Number of groups of inputs to be broadcast.
|
||||
// num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_
|
||||
|
||||
static const IntSimdMatrix *intSimdMatrix;
|
||||
// Only available with NEON.
|
||||
static const IntSimdMatrix intSimdMatrixNEON;
|
||||
// Only available with AVX2 / SSE.
|
||||
static const IntSimdMatrix intSimdMatrixAVX2;
|
||||
static const IntSimdMatrix intSimdMatrixSSE;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_
|
348
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixavx2.cpp
vendored
Normal file
348
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixavx2.cpp
vendored
Normal file
|
@ -0,0 +1,348 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intsimdmatrixavx2.cpp
|
||||
// Description: matrix-vector product for 8-bit data on avx2.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
# if defined(__i686__) || defined(__x86_64__)
|
||||
# error Implementation only for AVX2 capable architectures
|
||||
# endif
|
||||
#else
|
||||
|
||||
# include "intsimdmatrix.h"
|
||||
|
||||
# include <immintrin.h>
|
||||
# include <algorithm>
|
||||
# include <cstdint>
|
||||
# include <vector>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Geometry of the AVX2 kernel.

// 32 bit integer outputs per register (8 x 32 bits).
constexpr int kNumOutputsPerRegister = 8;
// Largest number of output registers used at once.
constexpr int kMaxOutputRegisters = 8;
// 8 bit inputs per input register.
constexpr int kNumInputsPerRegister = 32;
// Inputs per weight group.
constexpr int kNumInputsPerGroup = 4;
// Input groups to be broadcast, i.e. groups per input register.
constexpr int kNumInputGroups = kNumInputsPerRegister / kNumInputsPerGroup;
|
||||
|
||||
// Functions to compute part of a matrix.vector multiplication. The weights
|
||||
// are in a very specific order (see above) in w, which is multiplied by
|
||||
// u of length num_in, to produce output v after scaling the integer results
|
||||
// by the corresponding member of scales.
|
||||
// The amount of w and scales consumed is fixed and not available to the
|
||||
// caller. The number of outputs written to v will be at most num_out.
|
||||
|
||||
// Computes one set of 4x8 products of inputs and weights, adding to result.
|
||||
// Horizontally adds 4 adjacent results, making 8x32-bit results.
|
||||
// rep_input is assumed to be an 8x replicated set of 4x8-bit signed integers.
|
||||
// Note that wi must previously have been re-organized with blocks of 4x8
|
||||
// weights in contiguous memory.
|
||||
// ones is a register of 16x16-bit values all equal to 1.
|
||||
// Note: wi is incremented by the amount of data read.
|
||||
// weights and reps are scratch registers.
|
||||
// This function must be inlined with references in order for the compiler to
|
||||
// correctly use the registers declared in the caller.
|
||||
static inline void MultiplyGroup(const __m256i &rep_input, const __m256i &ones, const int8_t *&wi,
|
||||
__m256i &weights, __m256i &reps, __m256i &result) {
|
||||
// Load a 4x8 block of weights.
|
||||
weights = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(wi));
|
||||
wi += kNumInputsPerRegister;
|
||||
// Normalize the signs on rep_input, weights, so weights is always +ve.
|
||||
reps = _mm256_sign_epi8(rep_input, weights);
|
||||
weights = _mm256_sign_epi8(weights, weights);
|
||||
// Multiply 32x8-bit reps by 32x8-bit weights to make 16x16-bit results,
|
||||
// with adjacent pairs added.
|
||||
weights = _mm256_maddubs_epi16(weights, reps);
|
||||
// Multiply 16x16-bit result by 16x16-bit ones to make 8x32-bit results,
|
||||
// with adjacent pairs added. What we really want is a horizontal add of
|
||||
// 16+16=32 bit result, but there is no such instruction, so multiply by
|
||||
// 16-bit ones instead. It is probably faster than all the sign-extending,
|
||||
// permuting and adding that would otherwise be required.
|
||||
weights = _mm256_madd_epi16(weights, ones);
|
||||
result = _mm256_add_epi32(result, weights);
|
||||
}
|
||||
|
||||
// Loads the 8 bytes at wi_ into the low 64 bits of a 128-bit register and
// zeroes the upper 64 bits.
// The dedicated 64-bit load intrinsic is used instead of reading the bytes
// through an int64_t pointer: that read violated strict aliasing and assumed
// 8-byte alignment of the int8_t data. wi_ needs no particular alignment.
static inline __m128i load64_to_128(const int8_t *wi_) {
  return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(wi_));
}
|
||||
|
||||
static inline void ExtractResults8(__m256i result, const int8_t *wi, const double *scales,
|
||||
double *v) {
|
||||
__m128i w128 = load64_to_128(wi); // 8x8bit vals in bottom of 128bit reg
|
||||
__m256i w256 = _mm256_cvtepi8_epi32(w128); // 8x32bit vals in 256bit reg
|
||||
__m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
|
||||
__m256d scale0123 = _mm256_loadu_pd(scales);
|
||||
__m256d scale4567 = _mm256_loadu_pd(scales + 4);
|
||||
w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
|
||||
result = _mm256_add_epi32(result, w256); // result += bias * 127
|
||||
__m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
|
||||
result = _mm256_permute4x64_epi64(result, 2 + (3 << 2));
|
||||
__m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result));
|
||||
res0123 = _mm256_mul_pd(res0123, scale0123);
|
||||
res4567 = _mm256_mul_pd(res4567, scale4567);
|
||||
_mm256_storeu_pd(v, res0123);
|
||||
_mm256_storeu_pd(v + 4, res4567);
|
||||
}
|
||||
|
||||
static inline void ExtractResults16(__m256i result0, __m256i result1, const int8_t *&wi,
|
||||
const double *&scales, double *&v) {
|
||||
__m128i w8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(wi));
|
||||
// 8x8bit vals in bottom of 128bit reg
|
||||
const __m256i bias_scale = _mm256_set_epi32(127, 127, 127, 127, 127, 127, 127, 127);
|
||||
__m256i w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
|
||||
__m256d scale0123 = _mm256_loadu_pd(scales);
|
||||
__m256d scale4567 = _mm256_loadu_pd(scales + 4);
|
||||
w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
|
||||
result0 = _mm256_add_epi32(result0, w256); // result += bias * 127
|
||||
__m256d res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
|
||||
result0 = _mm256_permute4x64_epi64(result0, 2 + (3 << 2));
|
||||
__m256d res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result0));
|
||||
res0123 = _mm256_mul_pd(res0123, scale0123);
|
||||
res4567 = _mm256_mul_pd(res4567, scale4567);
|
||||
_mm256_storeu_pd(v, res0123);
|
||||
_mm256_storeu_pd(v + 4, res4567);
|
||||
w8 = _mm_shuffle_epi32(w8, 2 + (3 << 2));
|
||||
w256 = _mm256_cvtepi8_epi32(w8); // 8x32bit vals in 256bit reg
|
||||
scale0123 = _mm256_loadu_pd(scales + 8);
|
||||
scale4567 = _mm256_loadu_pd(scales + 12);
|
||||
w256 = _mm256_mullo_epi32(w256, bias_scale); // 8x32 <bias * 127>
|
||||
result1 = _mm256_add_epi32(result1, w256); // result += bias * 127
|
||||
res0123 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
|
||||
result1 = _mm256_permute4x64_epi64(result1, 2 + (3 << 2));
|
||||
res4567 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(result1));
|
||||
res0123 = _mm256_mul_pd(res0123, scale0123);
|
||||
res4567 = _mm256_mul_pd(res4567, scale4567);
|
||||
_mm256_storeu_pd(v + 8, res0123);
|
||||
_mm256_storeu_pd(v + 12, res4567);
|
||||
wi += 16;
|
||||
scales += 16;
|
||||
v += 16;
|
||||
}
|
||||
|
||||
// Computes part of matrix.vector v = Wu. Computes N=64 results.
|
||||
// The weights *must* be arranged so that consecutive reads from wi
|
||||
// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
|
||||
// (kNumInputsPerGroup inputs))). After that there must be N consecutive
|
||||
// bias weights, before continuing with any more weights.
|
||||
// u must be padded out with zeros to
|
||||
// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
|
||||
static void PartialMatrixDotVector64(const int8_t *wi, const double *scales, const int8_t *u,
|
||||
int num_in, double *v) {
|
||||
// Register containing 16-bit ones for horizontal add with 16->32 bit
|
||||
// conversion.
|
||||
__m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
|
||||
__m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
|
||||
// Initialize all the results to 0.
|
||||
__m256i result0 = _mm256_setzero_si256();
|
||||
__m256i result1 = _mm256_setzero_si256();
|
||||
__m256i result2 = _mm256_setzero_si256();
|
||||
__m256i result3 = _mm256_setzero_si256();
|
||||
__m256i result4 = _mm256_setzero_si256();
|
||||
__m256i result5 = _mm256_setzero_si256();
|
||||
__m256i result6 = _mm256_setzero_si256();
|
||||
__m256i result7 = _mm256_setzero_si256();
|
||||
// Iterate over the input (u), one registerful at a time.
|
||||
for (int j = 0; j < num_in;) {
|
||||
__m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
|
||||
// Inputs are processed in groups of kNumInputsPerGroup, replicated
|
||||
// kNumInputGroups times.
|
||||
for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
|
||||
// Replicate the low 32 bits (4 inputs) 8 times.
|
||||
__m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
|
||||
// Rotate the inputs in groups of 4, so the next 4 inputs are ready.
|
||||
inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
|
||||
__m256i weights, reps;
|
||||
// Mul-add, with horizontal add of the 4 inputs to each of the results.
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result4);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result5);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result6);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result7);
|
||||
}
|
||||
}
|
||||
ExtractResults16(result0, result1, wi, scales, v);
|
||||
ExtractResults16(result2, result3, wi, scales, v);
|
||||
ExtractResults16(result4, result5, wi, scales, v);
|
||||
ExtractResults16(result6, result7, wi, scales, v);
|
||||
}
|
||||
|
||||
// Computes part of matrix.vector v = Wu. Computes N=32 results.
|
||||
// For details see PartialMatrixDotVector64 with N=32.
|
||||
static void PartialMatrixDotVector32(const int8_t *wi, const double *scales, const int8_t *u,
|
||||
int num_in, double *v) {
|
||||
// Register containing 16-bit ones for horizontal add with 16->32 bit
|
||||
// conversion.
|
||||
__m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
|
||||
__m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
|
||||
// Initialize all the results to 0.
|
||||
__m256i result0 = _mm256_setzero_si256();
|
||||
__m256i result1 = _mm256_setzero_si256();
|
||||
__m256i result2 = _mm256_setzero_si256();
|
||||
__m256i result3 = _mm256_setzero_si256();
|
||||
// Iterate over the input (u), one registerful at a time.
|
||||
for (int j = 0; j < num_in;) {
|
||||
__m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
|
||||
// Inputs are processed in groups of kNumInputsPerGroup, replicated
|
||||
// kNumInputGroups times.
|
||||
for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
|
||||
// Replicate the low 32 bits (4 inputs) 8 times.
|
||||
__m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
|
||||
// Rotate the inputs in groups of 4, so the next 4 inputs are ready.
|
||||
inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
|
||||
__m256i weights, reps;
|
||||
// Mul-add, with horizontal add of the 4 inputs to each of the results.
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result2);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result3);
|
||||
}
|
||||
}
|
||||
ExtractResults16(result0, result1, wi, scales, v);
|
||||
ExtractResults16(result2, result3, wi, scales, v);
|
||||
}
|
||||
|
||||
// Computes part of matrix.vector v = Wu. Computes N=16 results.
|
||||
// For details see PartialMatrixDotVector64 with N=16.
|
||||
static void PartialMatrixDotVector16(const int8_t *wi, const double *scales, const int8_t *u,
|
||||
int num_in, double *v) {
|
||||
// Register containing 16-bit ones for horizontal add with 16->32 bit
|
||||
// conversion.
|
||||
__m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
|
||||
__m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
|
||||
// Initialize all the results to 0.
|
||||
__m256i result0 = _mm256_setzero_si256();
|
||||
__m256i result1 = _mm256_setzero_si256();
|
||||
// Iterate over the input (u), one registerful at a time.
|
||||
for (int j = 0; j < num_in;) {
|
||||
__m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
|
||||
// Inputs are processed in groups of kNumInputsPerGroup, replicated
|
||||
// kNumInputGroups times.
|
||||
for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
|
||||
// Replicate the low 32 bits (4 inputs) 8 times.
|
||||
__m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
|
||||
// Rotate the inputs in groups of 4, so the next 4 inputs are ready.
|
||||
inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
|
||||
__m256i weights, reps;
|
||||
// Mul-add, with horizontal add of the 4 inputs to each of the results.
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result1);
|
||||
}
|
||||
}
|
||||
ExtractResults16(result0, result1, wi, scales, v);
|
||||
}
|
||||
|
||||
// Computes part of matrix.vector v = Wu. Computes N=8 results.
|
||||
// For details see PartialMatrixDotVector64 with N=8.
|
||||
static inline void PartialMatrixDotVector8(const int8_t *wi, const double *scales, const int8_t *u,
|
||||
int num_in, double *v) {
|
||||
// Register containing 16-bit ones for horizontal add with 16->32 bit
|
||||
// conversion.
|
||||
__m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
|
||||
__m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
|
||||
// Initialize all the results to 0.
|
||||
__m256i result0 = _mm256_setzero_si256();
|
||||
// Iterate over the input (u), one registerful at a time.
|
||||
for (int j = 0; j < num_in;) {
|
||||
__m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(u + j));
|
||||
// Inputs are processed in groups of kNumInputsPerGroup, replicated
|
||||
// kNumInputGroups times.
|
||||
for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
|
||||
// Replicate the low 32 bits (4 inputs) 8 times.
|
||||
__m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
|
||||
// Rotate the inputs in groups of 4, so the next 4 inputs are ready.
|
||||
inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
|
||||
__m256i weights, reps;
|
||||
// Mul-add, with horizontal add of the 4 inputs to each of the results.
|
||||
MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
|
||||
}
|
||||
}
|
||||
ExtractResults8(result0, wi, scales, v);
|
||||
}
|
||||
|
||||
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
|
||||
const int8_t *u, double *v) {
|
||||
const int num_out = dim1;
|
||||
const int num_in = dim2 - 1;
|
||||
// Each call to a partial_func_ produces group_size outputs, except the
|
||||
// last one, which can produce less.
|
||||
const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
|
||||
const int rounded_num_out = IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
|
||||
int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
|
||||
int output = 0;
|
||||
|
||||
int w_step = (rounded_num_in + 1) * group_size;
|
||||
|
||||
// Run with this group size, until it would produce too much output, then
|
||||
// switch to a smaller size.
|
||||
for (; output + group_size <= rounded_num_out; output += group_size) {
|
||||
PartialMatrixDotVector64(wi, scales, u, rounded_num_in, v);
|
||||
wi += w_step;
|
||||
scales += group_size;
|
||||
v += group_size;
|
||||
}
|
||||
group_size /= 2;
|
||||
w_step /= 2;
|
||||
|
||||
if (output + group_size <= rounded_num_out) {
|
||||
PartialMatrixDotVector32(wi, scales, u, rounded_num_in, v);
|
||||
wi += w_step;
|
||||
scales += group_size;
|
||||
v += group_size;
|
||||
output += group_size;
|
||||
}
|
||||
group_size /= 2;
|
||||
w_step /= 2;
|
||||
|
||||
if (output + group_size <= rounded_num_out) {
|
||||
PartialMatrixDotVector16(wi, scales, u, rounded_num_in, v);
|
||||
wi += w_step;
|
||||
scales += group_size;
|
||||
v += group_size;
|
||||
output += group_size;
|
||||
}
|
||||
group_size /= 2;
|
||||
w_step /= 2;
|
||||
|
||||
if (output + group_size <= rounded_num_out) {
|
||||
PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
|
||||
}
|
||||
}
|
||||
|
||||
// Description of the AVX2 implementation: its multiply function followed by
// the register layout constants it was written for.
const IntSimdMatrix IntSimdMatrix::intSimdMatrixAVX2 = {
    matrixDotVector,         // Function.
    kNumOutputsPerRegister,  // Number of 32 bit outputs held in each register.
    kMaxOutputRegisters,     // Maximum number of registers used to hold outputs.
    kNumInputsPerRegister,   // Number of 8 bit inputs in the inputs register.
    kNumInputsPerGroup};     // Number of inputs in each weight group.
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
203
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixneon.cpp
vendored
Normal file
203
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixneon.cpp
vendored
Normal file
|
@ -0,0 +1,203 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intsimdmatrixneon.cpp
|
||||
// Description: matrix-vector product for 8-bit data on neon.
|
||||
// Author: Robin Watts (from the AVX2 original by Ray Smith)
|
||||
//
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// (C) Copyright 2020, Artifex Software Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
# include "intsimdmatrix.h"
|
||||
|
||||
# include <algorithm>
|
||||
# include <cstdint>
|
||||
# include <vector>
|
||||
# include "arm_neon.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Outputs held per "register". A pair of 4x32-bit registers is used, which
// gives 8 x 32-bit ints.
constexpr int kNumOutputsPerRegister = 8;
// Upper bound on the number of such registers used.
constexpr int kMaxOutputRegisters = 1;
// Inputs held in the inputs register.
constexpr int kNumInputsPerRegister = 8;
// Inputs in each weight group.
constexpr int kNumInputsPerGroup = 8;
|
||||
|
||||
// Function to compute part of a matrix.vector multiplication. The weights
|
||||
// are in a very specific order (see above) in w, which is multiplied by
|
||||
// u of length num_in, to produce output v after scaling the integer results
|
||||
// by the corresponding member of scales.
|
||||
// The amount of w and scales consumed is fixed and not available to the
|
||||
// caller.
|
||||
|
||||
// Computes part of matrix.vector v = Wu. Computes N=8 results.
|
||||
// The weights *must* be arranged so that consecutive reads from wi
|
||||
// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
|
||||
// (kNumInputsPerGroup inputs))). After that there must be N consecutive
|
||||
// bias weights, before continuing with any more weights.
|
||||
// u must be padded out with zeros to
|
||||
// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
|
||||
static inline void PartialMatrixDotVector8(const int8_t *__restrict wi,
                                           const double *__restrict scales,
                                           const int8_t *__restrict u, int num_in,
                                           double *__restrict v, int num_out) {
  // Widening multiply of one 8-weight row by the 8 inputs, followed by a
  // pairwise widening add: four 32-bit sums of two products each.
  const auto row_partials = [](int8x8_t row, int8x8_t in) -> int32x4_t {
    return vpaddlq_s16(vmull_s8(row, in));
  };
  // Adds adjacent lanes of a and of b: {a0+a1, a2+a3, b0+b1, b2+b3}.
  const auto fold_pair = [](int32x4_t a, int32x4_t b) -> int32x4_t {
    return vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                        vpadd_s32(vget_low_s32(b), vget_high_s32(b)));
  };

  // Running sums for outputs 0..3 and 4..7.
  int32x4_t acc_lo = vdupq_n_s32(0);
  int32x4_t acc_hi = vdupq_n_s32(0);

  // Each pass consumes 8 inputs and 8 rows of 8 weights (64 bytes of wi).
  for (int remaining = num_in; remaining > 0; remaining -= 8) {
    const int8x8_t in = vld1_s8(u);
    // Every 16-byte load holds two consecutive weight rows.
    const int8x16_t rows01 = vld1q_s8(wi);
    const int8x16_t rows23 = vld1q_s8(wi + 8 * 2);
    const int8x16_t rows45 = vld1q_s8(wi + 8 * 4);
    const int8x16_t rows67 = vld1q_s8(wi + 8 * 6);

    // First fold: per row, two sums of four products each.
    const int32x4_t half01 = fold_pair(row_partials(vget_low_s8(rows01), in),
                                       row_partials(vget_high_s8(rows01), in));
    const int32x4_t half23 = fold_pair(row_partials(vget_low_s8(rows23), in),
                                       row_partials(vget_high_s8(rows23), in));
    const int32x4_t half45 = fold_pair(row_partials(vget_low_s8(rows45), in),
                                       row_partials(vget_high_s8(rows45), in));
    const int32x4_t half67 = fold_pair(row_partials(vget_low_s8(rows67), in),
                                       row_partials(vget_high_s8(rows67), in));

    // Second fold: one complete 8-product dot product per row.
    acc_lo = vaddq_s32(acc_lo, fold_pair(half01, half23));
    acc_hi = vaddq_s32(acc_hi, fold_pair(half45, half67));

    u += 8;
    wi += 64;
  }

  // The 8 bias values follow the weights; each one is weighted by 127.
  const int16x8_t scaled_bias = vmull_s8(vld1_s8(wi), vdup_n_s8(127));
  acc_lo = vaddw_s16(acc_lo, vget_low_s16(scaled_bias));
  acc_hi = vaddw_s16(acc_hi, vget_high_s16(scaled_bias));

  int32_t sums[8];
  vst1q_s32(sums, acc_lo);
  vst1q_s32(sums + 4, acc_hi);

  // The first output is always written; further ones only up to num_out,
  // and never more than 8.
  const int count = std::max(1, std::min(num_out, 8));
  for (int i = 0; i < count; ++i) {
    v[i] = sums[i] * scales[i];
  }
}
|
||||
|
||||
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
|
||||
const int8_t *u, double *v) {
|
||||
const int num_out = dim1;
|
||||
const int num_in = dim2 - 1;
|
||||
// Each call to a partial_func_ produces group_size outputs, except the
|
||||
// last one, which can produce less.
|
||||
const int rounded_num_in = IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
|
||||
int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
|
||||
int output = 0;
|
||||
|
||||
int w_step = (rounded_num_in + 1) * group_size;
|
||||
|
||||
for (; output + group_size <= num_out; output += group_size) {
|
||||
PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v, kNumOutputsPerRegister);
|
||||
wi += w_step;
|
||||
scales += group_size;
|
||||
v += group_size;
|
||||
}
|
||||
if (output < num_out)
|
||||
PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v,
|
||||
num_out & (kNumOutputsPerRegister - 1));
|
||||
}
|
||||
|
||||
// Description of the NEON implementation: its multiply function followed by
// the register layout constants it was written for.
const IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {
    matrixDotVector,         // Function.
    kNumOutputsPerRegister,  // Number of 32 bit outputs held in each register.
    kMaxOutputRegisters,     // Maximum number of registers used to hold outputs.
    kNumInputsPerRegister,   // Number of 8 bit inputs in the inputs register.
    kNumInputsPerGroup};     // Number of inputs in each weight group.
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif /* __ARM_NEON */
|
106
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixsse.cpp
vendored
Normal file
106
3rdparty/tesseract_ocr/tesseract/src/arch/intsimdmatrixsse.cpp
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intsimdmatrixsse.cpp
|
||||
// Description: SSE implementation of 8-bit int SIMD matrix multiply.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !defined(__SSE4_1__)
|
||||
# if defined(__i686__) || defined(__x86_64__)
|
||||
# error Implementation only for SSE 4.1 capable architectures
|
||||
# endif
|
||||
#else
|
||||
|
||||
# include "intsimdmatrix.h"
|
||||
|
||||
# include <emmintrin.h>
|
||||
# include <smmintrin.h>
|
||||
# include <cstdint>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the n-vectors u and v.
|
||||
// Uses Intel SSE intrinsics to access the SIMD instruction set.
|
||||
static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
|
||||
int max_offset = n - 8;
|
||||
int offset = 0;
|
||||
// Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
|
||||
// values, extending to 16 bit, multiplying to make 32 bit results.
|
||||
int32_t result = 0;
|
||||
if (offset <= max_offset) {
|
||||
offset = 8;
|
||||
__m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u));
|
||||
__m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v));
|
||||
__m128i sum = _mm_cvtepi8_epi16(packed1);
|
||||
packed2 = _mm_cvtepi8_epi16(packed2);
|
||||
// The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit
|
||||
// ints to make 32 bit results, which are then horizontally added in pairs
|
||||
// to make 4 32 bit results that still fit in a 128 bit register.
|
||||
sum = _mm_madd_epi16(sum, packed2);
|
||||
while (offset <= max_offset) {
|
||||
packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u + offset));
|
||||
packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v + offset));
|
||||
offset += 8;
|
||||
packed1 = _mm_cvtepi8_epi16(packed1);
|
||||
packed2 = _mm_cvtepi8_epi16(packed2);
|
||||
packed1 = _mm_madd_epi16(packed1, packed2);
|
||||
sum = _mm_add_epi32(sum, packed1);
|
||||
}
|
||||
// Sum the 4 packed 32 bit sums and extract the low result.
|
||||
sum = _mm_hadd_epi32(sum, sum);
|
||||
sum = _mm_hadd_epi32(sum, sum);
|
||||
result = _mm_cvtsi128_si32(sum);
|
||||
}
|
||||
while (offset < n) {
|
||||
result += u[offset] * v[offset];
|
||||
++offset;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Computes part of matrix.vector v = Wu. Computes 1 result.
|
||||
static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
|
||||
int num_in, double *v) {
|
||||
double total = IntDotProductSSE(u, wi, num_in);
|
||||
// Add in the bias and correct for integer values.
|
||||
*v = (total + wi[num_in] * INT8_MAX) * *scales;
|
||||
}
|
||||
|
||||
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
|
||||
const int8_t *u, double *v) {
|
||||
const int num_out = dim1;
|
||||
const int num_in = dim2 - 1;
|
||||
int output = 0;
|
||||
|
||||
for (; output < num_out; output++) {
|
||||
PartialMatrixDotVector1(wi, scales, u, num_in, v);
|
||||
wi += dim2;
|
||||
scales++;
|
||||
v++;
|
||||
}
|
||||
}
|
||||
|
||||
// Description of the SSE implementation: its multiply function followed by
// the register layout values, all 1 for this implementation.
const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
    matrixDotVector,
    1,   // Number of 32 bit outputs held in each register.
    1,   // Maximum number of registers used to hold outputs.
    1,   // Number of 8 bit inputs in the inputs register.
    1};  // Number of inputs in each weight group.
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
283
3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.cpp
vendored
Normal file
283
3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.cpp
vendored
Normal file
|
@ -0,0 +1,283 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: simddetect.cpp
|
||||
// Description: Architecture detector.
|
||||
// Author: Stefan Weil (based on code from Ray Smith)
|
||||
//
|
||||
// (C) Copyright 2014, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h" // for HAVE_AVX, ...
|
||||
#endif
|
||||
#include <numeric> // for std::inner_product
|
||||
#include "dotproduct.h"
|
||||
#include "intsimdmatrix.h" // for IntSimdMatrix
|
||||
#include "params.h" // for STRING_VAR
|
||||
#include "simddetect.h"
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
|
||||
# define HAS_CPUID
|
||||
#endif
|
||||
|
||||
#if defined(HAS_CPUID)
|
||||
# if defined(__GNUC__)
|
||||
# include <cpuid.h>
|
||||
# elif defined(_WIN32)
|
||||
# include <intrin.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON) && !defined(__aarch64__)
|
||||
# ifdef ANDROID
|
||||
# include <cpu-features.h>
|
||||
# else
|
||||
/* Assume linux */
|
||||
# include <asm/hwcap.h>
|
||||
# include <sys/auxv.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Computes and returns the dot product of the two n-vectors u and v.
|
||||
// Note: because the order of addition is different among the different dot
|
||||
// product functions, the results can (and do) vary slightly (although they
|
||||
// agree to within about 4e-15). This produces different results when running
|
||||
// training, despite all random inputs being precisely equal.
|
||||
// To get consistent results, use just one of these dot product functions.
|
||||
// On a test multi-layer network, serial is 57% slower than SSE, and AVX
|
||||
// is about 8% faster than SSE. This suggests that the time is memory
|
||||
// bandwidth constrained and could benefit from holding the reused vector
|
||||
// in AVX registers.
|
||||
DotProductFunction DotProduct;
|
||||
|
||||
static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
|
||||
|
||||
SIMDDetect SIMDDetect::detector;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
// ARMv8 always has NEON.
|
||||
bool SIMDDetect::neon_available_ = true;
|
||||
#elif defined(HAVE_NEON)
|
||||
// If true, then Neon has been detected.
|
||||
bool SIMDDetect::neon_available_;
|
||||
#else
|
||||
// If true, then AVX has been detected.
|
||||
bool SIMDDetect::avx_available_;
|
||||
bool SIMDDetect::avx2_available_;
|
||||
bool SIMDDetect::avx512F_available_;
|
||||
bool SIMDDetect::avx512BW_available_;
|
||||
// If true, then FMA has been detected.
|
||||
bool SIMDDetect::fma_available_;
|
||||
// If true, then SSe4.1 has been detected.
|
||||
bool SIMDDetect::sse_available_;
|
||||
#endif
|
||||
|
||||
// Computes and returns the dot product of the two n-vectors u and v.
|
||||
static double DotProductGeneric(const double *u, const double *v, int n) {
  // Accumulate strictly front to back so that the order of the floating
  // point additions is fixed.
  double sum = 0.0;
  for (; n > 0; --n, ++u, ++v) {
    sum += *u * *v;
  }
  return sum;
}
|
||||
|
||||
// Compute dot product using std::inner_product.
|
||||
static double DotProductStdInnerProduct(const double *u, const double *v, int n) {
  // The 0.0 seed makes std::inner_product accumulate in double.
  const double *const u_end = u + n;
  const double seed = 0.0;
  return std::inner_product(u, u_end, v, seed);
}
|
||||
|
||||
// Installs f as the active dot product function and m as the active integer
// SIMD matrix implementation. m defaults to nullptr, which stores a null
// pointer in IntSimdMatrix::intSimdMatrix.
static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
  IntSimdMatrix::intSimdMatrix = m;
  DotProduct = f;
}
|
||||
|
||||
// Constructor.
|
||||
// Tests the architecture in a system-dependent way to detect AVX, SSE and
|
||||
// any other available SIMD equipment.
|
||||
// __GNUC__ is also defined by compilers that include GNU extensions such as
|
||||
// clang.
|
||||
SIMDDetect::SIMDDetect() {
|
||||
// The fallback is a generic dot product calculation.
|
||||
SetDotProduct(DotProductGeneric);
|
||||
|
||||
#if defined(HAS_CPUID)
|
||||
# if defined(__GNUC__)
|
||||
unsigned int eax, ebx, ecx, edx;
|
||||
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
|
||||
// Note that these tests all use hex because the older compilers don't have
|
||||
// the newer flags.
|
||||
# if defined(HAVE_SSE4_1)
|
||||
sse_available_ = (ecx & 0x00080000) != 0;
|
||||
# endif
|
||||
# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
|
||||
auto xgetbv = []() {
|
||||
uint32_t xcr0;
|
||||
__asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
|
||||
return xcr0;
|
||||
};
|
||||
if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
|
||||
// OSXSAVE bit is set, XMM state and YMM state are fine.
|
||||
# if defined(HAVE_FMA)
|
||||
fma_available_ = (ecx & 0x00001000) != 0;
|
||||
# endif
|
||||
# if defined(HAVE_AVX)
|
||||
avx_available_ = (ecx & 0x10000000) != 0;
|
||||
if (avx_available_) {
|
||||
// There is supposed to be a __get_cpuid_count function, but this is all
|
||||
// there is in my cpuid.h. It is a macro for an asm statement and cannot
|
||||
// be used inside an if.
|
||||
__cpuid_count(7, 0, eax, ebx, ecx, edx);
|
||||
avx2_available_ = (ebx & 0x00000020) != 0;
|
||||
avx512F_available_ = (ebx & 0x00010000) != 0;
|
||||
avx512BW_available_ = (ebx & 0x40000000) != 0;
|
||||
}
|
||||
# endif
|
||||
}
|
||||
# endif
|
||||
}
|
||||
# elif defined(_WIN32)
|
||||
int cpuInfo[4];
|
||||
int max_function_id;
|
||||
__cpuid(cpuInfo, 0);
|
||||
max_function_id = cpuInfo[0];
|
||||
if (max_function_id >= 1) {
|
||||
__cpuid(cpuInfo, 1);
|
||||
# if defined(HAVE_SSE4_1)
|
||||
sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
|
||||
# endif
|
||||
# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
|
||||
if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
|
||||
// OSXSAVE bit is set, XMM state and YMM state are fine.
|
||||
# if defined(HAVE_FMA)
|
||||
fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
|
||||
# endif
|
||||
# if defined(HAVE_AVX)
|
||||
avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
|
||||
# endif
|
||||
# if defined(HAVE_AVX2)
|
||||
if (max_function_id >= 7) {
|
||||
__cpuid(cpuInfo, 7);
|
||||
avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
|
||||
avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
|
||||
avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
|
||||
}
|
||||
# endif
|
||||
}
|
||||
# endif
|
||||
}
|
||||
# else
|
||||
# error "I don't know how to test for SIMD with this compiler"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON) && !defined(__aarch64__)
|
||||
# ifdef ANDROID
|
||||
{
|
||||
AndroidCpuFamily family = android_getCpuFamily();
|
||||
if (family == ANDROID_CPU_FAMILY_ARM)
|
||||
neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
|
||||
}
|
||||
# else
|
||||
/* Assume linux */
|
||||
neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Select code for calculation of dot product based on autodetection.
|
||||
if (false) {
|
||||
// This is a dummy to support conditional compilation.
|
||||
#if defined(HAVE_AVX2)
|
||||
} else if (avx2_available_) {
|
||||
// AVX2 detected.
|
||||
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
|
||||
#endif
|
||||
#if defined(HAVE_AVX)
|
||||
} else if (avx_available_) {
|
||||
// AVX detected.
|
||||
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
|
||||
#endif
|
||||
#if defined(HAVE_SSE4_1)
|
||||
} else if (sse_available_) {
|
||||
// SSE detected.
|
||||
SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
|
||||
#endif
|
||||
#if defined(HAVE_NEON) || defined(__aarch64__)
|
||||
} else if (neon_available_) {
|
||||
// NEON detected.
|
||||
SetDotProduct(DotProduct, &IntSimdMatrix::intSimdMatrixNEON);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void SIMDDetect::Update() {
|
||||
// Select code for calculation of dot product based on the
|
||||
// value of the config variable if that value is not empty.
|
||||
const char *dotproduct_method = "generic";
|
||||
if (!strcmp(dotproduct.c_str(), "auto")) {
|
||||
// Automatic detection. Nothing to be done.
|
||||
} else if (!strcmp(dotproduct.c_str(), "generic")) {
|
||||
// Generic code selected by config variable.
|
||||
SetDotProduct(DotProductGeneric);
|
||||
dotproduct_method = "generic";
|
||||
} else if (!strcmp(dotproduct.c_str(), "native")) {
|
||||
// Native optimized code selected by config variable.
|
||||
SetDotProduct(DotProductNative);
|
||||
dotproduct_method = "native";
|
||||
#if defined(HAVE_AVX2)
|
||||
} else if (!strcmp(dotproduct.c_str(), "avx2")) {
|
||||
// AVX2 selected by config variable.
|
||||
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
|
||||
dotproduct_method = "avx2";
|
||||
#endif
|
||||
#if defined(HAVE_AVX)
|
||||
} else if (!strcmp(dotproduct.c_str(), "avx")) {
|
||||
// AVX selected by config variable.
|
||||
SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
|
||||
dotproduct_method = "avx";
|
||||
#endif
|
||||
#if defined(HAVE_FMA)
|
||||
} else if (!strcmp(dotproduct.c_str(), "fma")) {
|
||||
// FMA selected by config variable.
|
||||
SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
|
||||
dotproduct_method = "fma";
|
||||
#endif
|
||||
#if defined(HAVE_SSE4_1)
|
||||
} else if (!strcmp(dotproduct.c_str(), "sse")) {
|
||||
// SSE selected by config variable.
|
||||
SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
|
||||
dotproduct_method = "sse";
|
||||
#endif
|
||||
} else if (!strcmp(dotproduct.c_str(), "std::inner_product")) {
|
||||
// std::inner_product selected by config variable.
|
||||
SetDotProduct(DotProductStdInnerProduct);
|
||||
dotproduct_method = "std::inner_product";
|
||||
} else {
|
||||
// Unsupported value of config variable.
|
||||
tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
|
||||
dotproduct.c_str());
|
||||
tprintf(
|
||||
"Support values for dotproduct: auto generic native"
|
||||
#if defined(HAVE_AVX)
|
||||
" avx"
|
||||
#endif
|
||||
#if defined(HAVE_SSE4_1)
|
||||
" sse"
|
||||
#endif
|
||||
" std::inner_product.\n");
|
||||
}
|
||||
|
||||
dotproduct.set_value(dotproduct_method);
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
87
3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.h
vendored
Normal file
87
3rdparty/tesseract_ocr/tesseract/src/arch/simddetect.h
vendored
Normal file
|
@ -0,0 +1,87 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: simddetect.h
|
||||
// Description: Architecture detector.
|
||||
// Author: Stefan Weil (based on code from Ray Smith)
|
||||
//
|
||||
// (C) Copyright 2014, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
#ifndef TESSERACT_ARCH_SIMDDETECT_H_
|
||||
#define TESSERACT_ARCH_SIMDDETECT_H_
|
||||
|
||||
#include <tesseract/export.h>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Signature shared by all dot product implementations: two input vectors and
// their common length.
using DotProductFunction = double (*)(const double *, const double *, int);

// The dot product implementation that is currently selected as the best one.
extern DotProductFunction DotProduct;
|
||||
|
||||
// Architecture detector. Add code here to detect any other architectures for
|
||||
// SIMD-based faster dot product functions. Intended to be a single static
|
||||
// object, but it does no real harm to have more than one.
|
||||
class SIMDDetect {
|
||||
public:
|
||||
// Returns true if AVX is available on this system.
|
||||
static inline bool IsAVXAvailable() {
|
||||
return detector.avx_available_;
|
||||
}
|
||||
// Returns true if AVX2 (integer support) is available on this system.
|
||||
static inline bool IsAVX2Available() {
|
||||
return detector.avx2_available_;
|
||||
}
|
||||
// Returns true if AVX512 Foundation (float) is available on this system.
|
||||
static inline bool IsAVX512FAvailable() {
|
||||
return detector.avx512F_available_;
|
||||
}
|
||||
// Returns true if AVX512 integer is available on this system.
|
||||
static inline bool IsAVX512BWAvailable() {
|
||||
return detector.avx512BW_available_;
|
||||
}
|
||||
// Returns true if FMA is available on this system.
|
||||
static inline bool IsFMAAvailable() {
|
||||
return detector.fma_available_;
|
||||
}
|
||||
// Returns true if SSE4.1 is available on this system.
|
||||
static inline bool IsSSEAvailable() {
|
||||
return detector.sse_available_;
|
||||
}
|
||||
// Returns true if NEON is available on this system.
|
||||
static inline bool IsNEONAvailable() {
|
||||
return detector.neon_available_;
|
||||
}
|
||||
|
||||
// Update settings after config variable was set.
|
||||
static TESS_API void Update();
|
||||
|
||||
private:
|
||||
// Constructor, must set all static member variables.
|
||||
SIMDDetect();
|
||||
|
||||
private:
|
||||
// Singleton.
|
||||
static SIMDDetect detector;
|
||||
// If true, then AVX has been detected.
|
||||
static TESS_API bool avx_available_;
|
||||
static TESS_API bool avx2_available_;
|
||||
static TESS_API bool avx512F_available_;
|
||||
static TESS_API bool avx512BW_available_;
|
||||
// If true, then FMA has been detected.
|
||||
static TESS_API bool fma_available_;
|
||||
// If true, then SSe4.1 has been detected.
|
||||
static TESS_API bool sse_available_;
|
||||
// If true, then NEON has been detected.
|
||||
static TESS_API bool neon_available_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_ARCH_SIMDDETECT_H_
|
120
3rdparty/tesseract_ocr/tesseract/src/ccmain/adaptions.cpp
vendored
Normal file
120
3rdparty/tesseract_ocr/tesseract/src/ccmain/adaptions.cpp
vendored
Normal file
|
@ -0,0 +1,120 @@
|
|||
/**********************************************************************
|
||||
* File: adaptions.cpp (Formerly adaptions.c)
|
||||
* Description: Functions used to adapt to blobs already confidently
|
||||
* identified
|
||||
* Author: Chris Newton
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
#include "control.h"
|
||||
#include "reject.h"
|
||||
#include "stopper.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "tessvars.h"
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
namespace tesseract {
|
||||
// Decides whether the classifier should adapt to the given word.
//
// mode is a bit mask: bit i enables the check named by the i-th MODES
// enumerator below. A mode of 0 disables adaption altogether.
// The word has to pass at least one of the enabled ADAPTABLE_WERD /
// ACCEPTABLE_WERD tests; after that, every other enabled check can only veto
// the decision. Each outcome is traced when tessedit_adaption_debug is set.
bool Tesseract::word_adaptable(WERD_RES *word, uint16_t mode) {
  if (tessedit_adaption_debug) {
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            word->best_choice->unichar_string().c_str(), word->best_choice->rating(),
            word->best_choice->certainty());
  }

  // Bit positions within mode.
  enum MODES {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  // 0: NO adaption
  if (mode == 0) {
    if (tessedit_adaption_debug) {
      tprintf("adaption disabled\n");
    }
    return false;
  }

  const std::bitset<16> enabled(mode);
  bool adaptable = false;

  if (enabled[ADAPTABLE_WERD]) {
    // Result of Classify::AdaptableWord().
    if (word->tess_would_adapt) {
      adaptable = true;
    }
    if (tessedit_adaption_debug && !adaptable) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (enabled[ACCEPTABLE_WERD]) {
    if (word->tess_accepted) {
      adaptable = true;
    }
    if (tessedit_adaption_debug && !adaptable) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  // Neither basic test passed, so the remaining checks are irrelevant.
  if (!adaptable) {
    return false;
  }

  if (enabled[CHECK_DAWGS]) {
    // The word must have come from one of the dictionaries or be a number.
    const auto permuter = word->best_choice->permuter();
    const bool from_dawg = permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
                           permuter == USER_DAWG_PERM || permuter == NUMBER_PERM;
    if (!from_dawg) {
      if (tessedit_adaption_debug) {
        tprintf("word not in dawgs\n");
      }
      return false;
    }
  }

  if (enabled[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict(word, false)) {
    if (tessedit_adaption_debug) {
      tprintf("word has ell conflict\n");
    }
    return false;
  }

  if (enabled[CHECK_SPACES]) {
    const bool has_space = strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr;
    if (has_space) {
      if (tessedit_adaption_debug) {
        tprintf("word contains spaces\n");
      }
      return false;
    }
  }

  if (enabled[CHECK_AMBIG_WERD] && word->best_choice->dangerous_ambig_found()) {
    if (tessedit_adaption_debug) {
      tprintf("word is ambiguous\n");
    }
    return false;
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", adaptable);
  }
  return adaptable;
}
|
||||
|
||||
} // namespace tesseract
|
781
3rdparty/tesseract_ocr/tesseract/src/ccmain/applybox.cpp
vendored
Normal file
781
3rdparty/tesseract_ocr/tesseract/src/ccmain/applybox.cpp
vendored
Normal file
|
@ -0,0 +1,781 @@
|
|||
/**********************************************************************
|
||||
* File: applybox.cpp (Formerly applybox.c)
|
||||
* Description: Re segment rows according to box file data
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
* (C) Copyright 1993, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include <allheaders.h>
|
||||
# include <cctype>
|
||||
# include <cerrno>
|
||||
# include <cstring>
|
||||
# include "boxread.h"
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
#include <tesseract/unichar.h>
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
/** Max number of blobs to classify together in FindSegmentation. */
constexpr int kMaxGroupSize = 4;
/// Largest deviation of a row xheight from the median xheight, expressed as
/// a fraction of the median, that is tolerated before the row is switched
/// to the median.
constexpr double kMaxXHeightDeviationFraction = 0.125;
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/**
|
||||
* The box file is assumed to contain box definitions, one per line, of the
|
||||
* following format for blob-level boxes:
|
||||
* @verbatim
|
||||
* <UTF8 str> <left> <bottom> <right> <top> <page id>
|
||||
* @endverbatim
|
||||
* and for word/line-level boxes:
|
||||
* @verbatim
|
||||
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
||||
* @endverbatim
|
||||
* NOTES:
|
||||
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
|
||||
*
|
||||
* <page id> is 0-based, and the page number is used for multipage input (tiff).
|
||||
*
|
||||
* In the blob-level form, each line represents a recognizable unit, which may
|
||||
* be several UTF-8 bytes, but there is a bounding box around each recognizable
|
||||
* unit, and no classifier is needed to train in this mode (bootstrapping.)
|
||||
*
|
||||
* In the word/line-level form, the line begins with the literal "WordStr", and
|
||||
* the bounding box bounds either a whole line or a whole word. The recognizable
|
||||
* units in the word/line are listed after the # at the end of the line and
|
||||
* are space delimited, ignoring any original spaces on the line.
|
||||
* Eg.
|
||||
* @verbatim
|
||||
* word -> #w o r d
|
||||
* multi word line -> #m u l t i w o r d l i n e
|
||||
* @endverbatim
|
||||
* The recognizable units must be space-delimited in order to allow multiple
|
||||
* unicodes to be used for a single recognizable unit, eg Hindi.
|
||||
*
|
||||
* In this mode, the classifier must have been pre-trained with the desired
|
||||
* character set, or it will not be able to find the character segmentations.
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
/// Sets the text of every word in every row of every block of block_list
/// to the empty string.
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        WERD *word = words.data();
        word->set_text("");
      }
    }
  }
}
|
||||
|
||||
// Applies the box file based on the image name filename, and resegments
|
||||
// the words in the block_list (page), with:
|
||||
// blob-mode: one blob per line in the box file, words as input.
|
||||
// word/line-mode: one blob per space-delimited unit after the #, and one word
|
||||
// per line in the box file. (See comment above for box file format.)
|
||||
// If find_segmentation is true, (word/line mode) then the classifier is used
|
||||
// to re-segment words/lines to match the space-delimited truth string for
|
||||
// each box. In this case, the input box may be for a word or even a whole
|
||||
// text line, and the output words will contain multiple blobs corresponding
|
||||
// to the space-delimited input string.
|
||||
// With find_segmentation false, no classifier is needed, but the chopper
|
||||
// can still be used to correctly segment touching characters with the help
|
||||
// of the input boxes.
|
||||
// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
|
||||
// from normal classification, ie. with a word, chopped_word, rebuild_word,
|
||||
// seam_array, denorm, box_word, and best_state, but NO best_choice or
|
||||
// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
|
||||
// Instead, the correct_text member of WERD_RES is set, and this may be later
|
||||
// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
|
||||
// is not required before calling ApplyBoxTraining.
|
||||
PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
|
||||
BLOCK_LIST *block_list) {
|
||||
std::vector<TBOX> boxes;
|
||||
std::vector<std::string> texts, full_texts;
|
||||
if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
|
||||
return nullptr; // Can't do it.
|
||||
}
|
||||
|
||||
const int box_count = boxes.size();
|
||||
int box_failures = 0;
|
||||
|
||||
// In word mode, we use the boxes to make a word for each box, but
|
||||
// in blob mode we use the existing words and maximally chop them first.
|
||||
PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
|
||||
clear_any_old_text(block_list);
|
||||
|
||||
for (int i = 0; i < box_count; i++) {
|
||||
bool foundit = false;
|
||||
if (page_res != nullptr) {
|
||||
foundit =
|
||||
ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
|
||||
(i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
|
||||
} else {
|
||||
foundit = ResegmentWordBox(block_list, boxes[i],
|
||||
(i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
|
||||
}
|
||||
if (!foundit) {
|
||||
box_failures++;
|
||||
ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
|
||||
}
|
||||
}
|
||||
|
||||
if (page_res == nullptr) {
|
||||
// In word/line mode, we now maximally chop all the words and resegment
|
||||
// them with the classifier.
|
||||
page_res = SetupApplyBoxes(boxes, block_list);
|
||||
ReSegmentByClassification(page_res);
|
||||
}
|
||||
if (applybox_debug > 0) {
|
||||
tprintf("APPLY_BOXES:\n");
|
||||
tprintf(" Boxes read from boxfile: %6d\n", box_count);
|
||||
if (box_failures > 0) {
|
||||
tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
|
||||
}
|
||||
}
|
||||
TidyUp(page_res);
|
||||
return page_res;
|
||||
}
|
||||
|
||||
// Helper computes median xheight in the image.
|
||||
static double MedianXHeight(BLOCK_LIST *block_list) {
|
||||
BLOCK_IT block_it(block_list);
|
||||
STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
|
||||
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
|
||||
ROW_IT row_it(block_it.data()->row_list());
|
||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||
xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
|
||||
}
|
||||
}
|
||||
return xheights.median();
|
||||
}
|
||||
|
||||
/// Any row xheight that is significantly different from the median is set
|
||||
/// to the median.
|
||||
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
|
||||
const double median_xheight = MedianXHeight(block_list);
|
||||
const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
|
||||
// Strip all fuzzy space markers to simplify the PAGE_RES.
|
||||
BLOCK_IT b_it(block_list);
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
BLOCK *block = b_it.data();
|
||||
ROW_IT r_it(block->row_list());
|
||||
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
|
||||
ROW *row = r_it.data();
|
||||
const double diff = fabs(row->x_height() - median_xheight);
|
||||
if (diff > max_deviation) {
|
||||
if (applybox_debug) {
|
||||
tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
|
||||
}
|
||||
row->set_x_height(static_cast<float>(median_xheight));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
|
||||
/// All fuzzy spaces are removed, and all the words are maximally chopped.
|
||||
PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
|
||||
PreenXHeights(block_list);
|
||||
// Strip all fuzzy space markers to simplify the PAGE_RES.
|
||||
BLOCK_IT b_it(block_list);
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
BLOCK *block = b_it.data();
|
||||
ROW_IT r_it(block->row_list());
|
||||
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
|
||||
ROW *row = r_it.data();
|
||||
WERD_IT w_it(row->word_list());
|
||||
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
||||
WERD *word = w_it.data();
|
||||
if (word->cblob_list()->empty()) {
|
||||
delete w_it.extract();
|
||||
} else {
|
||||
word->set_flag(W_FUZZY_SP, false);
|
||||
word->set_flag(W_FUZZY_NON, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
auto *page_res = new PAGE_RES(false, block_list, nullptr);
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
WERD_RES *word_res;
|
||||
while ((word_res = pr_it.word()) != nullptr) {
|
||||
MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
|
||||
pr_it.forward();
|
||||
}
|
||||
return page_res;
|
||||
}
|
||||
|
||||
/// Tests the chopper by exhaustively running chop_one_blob.
|
||||
/// The word_res will contain filled chopped_word, seam_array, denorm,
|
||||
/// box_word and best_state for the maximally chopped word.
|
||||
void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
|
||||
WERD_RES *word_res) {
|
||||
if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
|
||||
classify_bln_numeric_mode, textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx, row, block)) {
|
||||
word_res->CloneChoppedToRebuild();
|
||||
return;
|
||||
}
|
||||
if (chop_debug) {
|
||||
tprintf("Maximally chopping word at:");
|
||||
word_res->word->bounding_box().print();
|
||||
}
|
||||
std::vector<BLOB_CHOICE *> blob_choices;
|
||||
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
|
||||
auto rating = static_cast<float>(INT8_MAX);
|
||||
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
|
||||
// The rating and certainty are not quite arbitrary. Since
|
||||
// select_blob_to_chop uses the worst certainty to choose, they all have
|
||||
// to be different, so starting with INT8_MAX, subtract 1/8 for each blob
|
||||
// in here, and then divide by e each time they are chopped, which
|
||||
// should guarantee a set of unequal values for the whole tree of blobs
|
||||
// produced, however much chopping is required. The chops are thus only
|
||||
// limited by the ability of the chopper to find suitable chop points,
|
||||
// and not by the value of the certainties.
|
||||
auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
|
||||
blob_choices.push_back(choice);
|
||||
rating -= 0.125f;
|
||||
}
|
||||
const double e = exp(1.0); // The base of natural logs.
|
||||
int blob_number;
|
||||
int right_chop_index = 0;
|
||||
if (!assume_fixed_pitch_char_segment) {
|
||||
// We only chop if the language is not fixed pitch like CJK.
|
||||
SEAM *seam = nullptr;
|
||||
while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
|
||||
word_res->InsertSeam(blob_number, seam);
|
||||
BLOB_CHOICE *left_choice = blob_choices[blob_number];
|
||||
rating = left_choice->rating() / e;
|
||||
left_choice->set_rating(rating);
|
||||
left_choice->set_certainty(-rating);
|
||||
// combine confidence w/ serial #
|
||||
auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
|
||||
0.0f, 0.0f, BCC_FAKE);
|
||||
blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
|
||||
}
|
||||
}
|
||||
word_res->CloneChoppedToRebuild();
|
||||
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
|
||||
}
|
||||
|
||||
/// Helper to compute the dispute resolution metric.
|
||||
/// Disputed blob resolution. The aim is to give the blob to the most
|
||||
/// appropriate boxfile box. Most of the time it is obvious, but if
|
||||
/// two boxfile boxes overlap significantly it is not. If a small boxfile
|
||||
/// box takes most of the blob, and a large boxfile box does too, then
|
||||
/// we want the small boxfile box to get it, but if the small box
|
||||
/// is much smaller than the blob, we don't want it to get it.
|
||||
/// Details of the disputed blob resolution:
|
||||
/// Given a box with area A, and a blob with area B, with overlap area C,
|
||||
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
|
||||
/// miss metric gets the blob.
|
||||
static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
|
||||
const int overlap_area = box1.intersection(box2).area();
|
||||
const int a = box1.area();
|
||||
const int b = box2.area();
|
||||
ASSERT_HOST(a != 0 && b != 0);
|
||||
return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
|
||||
}
|
||||
|
||||
/// Gather consecutive blobs that match the given box into the best_state
|
||||
/// and corresponding correct_text.
|
||||
///
|
||||
/// Fights over which box owns which blobs are settled by pre-chopping and
|
||||
/// applying the blobs to box or next_box with the least non-overlap.
|
||||
/// @return false if the box was in error, which can only be caused by
|
||||
/// failing to find an appropriate blob for a box.
|
||||
///
|
||||
/// This means that occasionally, blobs may be incorrectly segmented if the
|
||||
/// chopper fails to find a suitable chop point.
|
||||
bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
|
||||
const TBOX *next_box, const char *correct_text) {
|
||||
if (applybox_debug > 1) {
|
||||
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
|
||||
}
|
||||
PAGE_RES_IT page_res_it(page_res);
|
||||
WERD_RES *word_res;
|
||||
for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
|
||||
if (!word_res->box_word->bounding_box().major_overlap(box)) {
|
||||
continue;
|
||||
}
|
||||
if (applybox_debug > 1) {
|
||||
tprintf("Checking word box:");
|
||||
word_res->box_word->bounding_box().print();
|
||||
}
|
||||
int word_len = word_res->box_word->length();
|
||||
for (int i = 0; i < word_len; ++i) {
|
||||
TBOX char_box = TBOX();
|
||||
int blob_count = 0;
|
||||
for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
|
||||
TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
|
||||
if (!blob_box.major_overlap(box)) {
|
||||
break;
|
||||
}
|
||||
if (word_res->correct_text[i + blob_count].length() > 0) {
|
||||
break; // Blob is claimed already.
|
||||
}
|
||||
if (next_box != nullptr) {
|
||||
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
|
||||
const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("Checking blob:");
|
||||
blob_box.print();
|
||||
tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
|
||||
next_box_miss_metric);
|
||||
}
|
||||
if (current_box_miss_metric > next_box_miss_metric) {
|
||||
break; // Blob is a better match for next box.
|
||||
}
|
||||
}
|
||||
char_box += blob_box;
|
||||
}
|
||||
if (blob_count > 0) {
|
||||
if (applybox_debug > 1) {
|
||||
tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
|
||||
}
|
||||
if (!char_box.almost_equal(box, 3) &&
|
||||
((next_box != nullptr && box.x_gap(*next_box) < -3) ||
|
||||
(prev_box != nullptr && prev_box->x_gap(box) < -3))) {
|
||||
return false;
|
||||
}
|
||||
// We refine just the box_word, best_state and correct_text here.
|
||||
// The rebuild_word is made in TidyUp.
|
||||
// blob_count blobs are put together to match the box. Merge the
|
||||
// box_word boxes, save the blob_count in the state and the text.
|
||||
word_res->box_word->MergeBoxes(i, i + blob_count);
|
||||
word_res->best_state[i] = blob_count;
|
||||
word_res->correct_text[i] = correct_text;
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("%d Blobs match: blob box:", blob_count);
|
||||
word_res->box_word->BlobBox(i).print();
|
||||
tprintf("Matches box:");
|
||||
box.print();
|
||||
if (next_box != nullptr) {
|
||||
tprintf("With next box:");
|
||||
next_box->print();
|
||||
}
|
||||
}
|
||||
// Eliminated best_state and correct_text entries for the consumed
|
||||
// blobs.
|
||||
for (int j = 1; j < blob_count; ++j) {
|
||||
word_res->best_state.erase(word_res->best_state.begin() + i + 1);
|
||||
word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
|
||||
}
|
||||
// Assume that no box spans multiple source words, so we are done with
|
||||
// this box.
|
||||
if (applybox_debug > 1) {
|
||||
tprintf("Best state = ");
|
||||
for (auto best_state : word_res->best_state) {
|
||||
tprintf("%d ", best_state);
|
||||
}
|
||||
tprintf("\n");
|
||||
tprintf("Correct text = [[ ");
|
||||
for (auto &it : word_res->correct_text) {
|
||||
tprintf("%s ", it.c_str());
|
||||
}
|
||||
tprintf("]]\n");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (applybox_debug > 0) {
|
||||
tprintf("FAIL!\n");
|
||||
}
|
||||
return false; // Failure.
|
||||
}
|
||||
|
||||
/// Consume all source blobs that strongly overlap the given box,
|
||||
/// putting them into a new word, with the correct_text label.
|
||||
/// Fights over which box owns which blobs are settled by
|
||||
/// applying the blobs to box or next_box with the least non-overlap.
|
||||
/// @return false if the box was in error, which can only be caused by
|
||||
/// failing to find an overlapping blob for a box.
|
||||
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
|
||||
const char *correct_text) {
|
||||
if (applybox_debug > 1) {
|
||||
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
|
||||
}
|
||||
WERD *new_word = nullptr;
|
||||
BLOCK_IT b_it(block_list);
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
BLOCK *block = b_it.data();
|
||||
if (!box.major_overlap(block->pdblk.bounding_box())) {
|
||||
continue;
|
||||
}
|
||||
ROW_IT r_it(block->row_list());
|
||||
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
|
||||
ROW *row = r_it.data();
|
||||
if (!box.major_overlap(row->bounding_box())) {
|
||||
continue;
|
||||
}
|
||||
WERD_IT w_it(row->word_list());
|
||||
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
||||
WERD *word = w_it.data();
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("Checking word:");
|
||||
word->bounding_box().print();
|
||||
}
|
||||
if (word->text() != nullptr && word->text()[0] != '\0') {
|
||||
continue; // Ignore words that are already done.
|
||||
}
|
||||
if (!box.major_overlap(word->bounding_box())) {
|
||||
continue;
|
||||
}
|
||||
C_BLOB_IT blob_it(word->cblob_list());
|
||||
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
|
||||
C_BLOB *blob = blob_it.data();
|
||||
TBOX blob_box = blob->bounding_box();
|
||||
if (!blob_box.major_overlap(box)) {
|
||||
continue;
|
||||
}
|
||||
if (next_box != nullptr) {
|
||||
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
|
||||
const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("Checking blob:");
|
||||
blob_box.print();
|
||||
tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
|
||||
next_box_miss_metric);
|
||||
}
|
||||
if (current_box_miss_metric > next_box_miss_metric) {
|
||||
continue; // Blob is a better match for next box.
|
||||
}
|
||||
}
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("Blob match: blob:");
|
||||
blob_box.print();
|
||||
tprintf("Matches box:");
|
||||
box.print();
|
||||
if (next_box != nullptr) {
|
||||
tprintf("With next box:");
|
||||
next_box->print();
|
||||
}
|
||||
}
|
||||
if (new_word == nullptr) {
|
||||
// Make a new word with a single blob.
|
||||
new_word = word->shallow_copy();
|
||||
new_word->set_text(correct_text);
|
||||
w_it.add_to_end(new_word);
|
||||
}
|
||||
C_BLOB_IT new_blob_it(new_word->cblob_list());
|
||||
new_blob_it.add_to_end(blob_it.extract());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (new_word == nullptr && applybox_debug > 0) {
|
||||
tprintf("FAIL!\n");
|
||||
}
|
||||
return new_word != nullptr;
|
||||
}
|
||||
|
||||
/// Resegments the words by running the classifier in an attempt to find the
|
||||
/// correct segmentation that produces the required string.
|
||||
void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
WERD_RES *word_res;
|
||||
for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
|
||||
const WERD *word = word_res->word;
|
||||
if (word->text() == nullptr || word->text()[0] == '\0') {
|
||||
continue; // Ignore words that have no text.
|
||||
}
|
||||
// Convert the correct text to a vector of UNICHAR_ID
|
||||
std::vector<UNICHAR_ID> target_text;
|
||||
if (!ConvertStringToUnichars(word->text(), &target_text)) {
|
||||
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
|
||||
pr_it.DeleteCurrentWord();
|
||||
continue;
|
||||
}
|
||||
if (!FindSegmentation(target_text, word_res)) {
|
||||
tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
|
||||
pr_it.DeleteCurrentWord();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
||||
/// @return false if an invalid UNICHAR_ID is encountered.
|
||||
bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
|
||||
for (int step = 0; *utf8 != '\0'; utf8 += step) {
|
||||
const char *next_space = strchr(utf8, ' ');
|
||||
if (next_space == nullptr) {
|
||||
next_space = utf8 + strlen(utf8);
|
||||
}
|
||||
step = next_space - utf8;
|
||||
UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
|
||||
if (class_id == INVALID_UNICHAR_ID) {
|
||||
return false;
|
||||
}
|
||||
while (utf8[step] == ' ') {
|
||||
++step;
|
||||
}
|
||||
class_ids->push_back(class_id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Resegments the word to achieve the target_text from the classifier.
|
||||
/// Returns false if the re-segmentation fails.
|
||||
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
|
||||
/// applies a full search on the classifier results to find the best classified
|
||||
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||
/// substitutions ARE used.
|
||||
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
|
||||
// Classify all required combinations of blobs and save results in choices.
|
||||
const int word_length = word_res->box_word->length();
|
||||
auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
|
||||
for (int i = 0; i < word_length; ++i) {
|
||||
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
|
||||
BLOB_CHOICE_LIST *match_result =
|
||||
classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
|
||||
word_res->blamer_bundle);
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("%d+%d:", i, j);
|
||||
print_ratings_list("Segment:", match_result, unicharset);
|
||||
}
|
||||
choices[i].push_back(match_result);
|
||||
}
|
||||
}
|
||||
// Search the segmentation graph for the target text. Must be an exact
|
||||
// match. Using wildcards makes it difficult to find the correct
|
||||
// segmentation even when it is there.
|
||||
word_res->best_state.clear();
|
||||
std::vector<int> search_segmentation;
|
||||
float best_rating = 0.0f;
|
||||
SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
|
||||
&word_res->best_state);
|
||||
for (int i = 0; i < word_length; ++i) {
|
||||
for (auto choice : choices[i]) {
|
||||
delete choice;
|
||||
}
|
||||
}
|
||||
delete[] choices;
|
||||
if (word_res->best_state.empty()) {
|
||||
// Build the original segmentation and if it is the same length as the
|
||||
// truth, assume it will do.
|
||||
int blob_count = 1;
|
||||
for (auto s : word_res->seam_array) {
|
||||
SEAM *seam = s;
|
||||
if (!seam->HasAnySplits()) {
|
||||
word_res->best_state.push_back(blob_count);
|
||||
blob_count = 1;
|
||||
} else {
|
||||
++blob_count;
|
||||
}
|
||||
}
|
||||
word_res->best_state.push_back(blob_count);
|
||||
if (word_res->best_state.size() != target_text.size()) {
|
||||
word_res->best_state.clear(); // No good. Original segmentation bad size.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
word_res->correct_text.clear();
|
||||
for (auto &text : target_text) {
|
||||
word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Recursive helper to find a match to the target_text (from text_index
|
||||
/// position) in the choices (from choices_pos position).
|
||||
/// @param choices is an array of vectors of length choices_length,
|
||||
/// with each element representing a starting position in the word, and the
|
||||
/// #vector holding classification results for a sequence of consecutive
|
||||
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||
/// @param choices_pos
|
||||
/// @param choices_length
|
||||
/// @param target_text
|
||||
/// @param text_index
|
||||
/// @param rating
|
||||
/// @param segmentation
|
||||
/// @param best_rating
|
||||
/// @param best_segmentation
|
||||
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||
int choices_length, const std::vector<UNICHAR_ID> &target_text,
|
||||
int text_index, float rating, std::vector<int> *segmentation,
|
||||
float *best_rating, std::vector<int> *best_segmentation) {
|
||||
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
|
||||
for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
|
||||
// Rating of matching choice or worst choice if no match.
|
||||
float choice_rating = 0.0f;
|
||||
// Find the corresponding best BLOB_CHOICE.
|
||||
BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
|
||||
const BLOB_CHOICE *choice = choice_it.data();
|
||||
choice_rating = choice->rating();
|
||||
UNICHAR_ID class_id = choice->unichar_id();
|
||||
if (class_id == target_text[text_index]) {
|
||||
break;
|
||||
}
|
||||
// Search ambigs table.
|
||||
if (class_id < table.size() && table[class_id] != nullptr) {
|
||||
AmbigSpec_IT spec_it(table[class_id]);
|
||||
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
|
||||
const AmbigSpec *ambig_spec = spec_it.data();
|
||||
// We'll only do 1-1.
|
||||
if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
|
||||
ambig_spec->correct_ngram_id == target_text[text_index]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!spec_it.cycled_list()) {
|
||||
break; // Found an ambig.
|
||||
}
|
||||
}
|
||||
}
|
||||
if (choice_it.cycled_list()) {
|
||||
continue; // No match.
|
||||
}
|
||||
segmentation->push_back(length);
|
||||
if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
|
||||
// This is a complete match. If the rating is good record a new best.
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
|
||||
rating + choice_rating, *best_rating, segmentation->size(),
|
||||
best_segmentation->size());
|
||||
}
|
||||
if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
|
||||
*best_segmentation = *segmentation;
|
||||
*best_rating = rating + choice_rating;
|
||||
}
|
||||
} else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
|
||||
if (applybox_debug > 3) {
|
||||
tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
|
||||
unicharset.id_to_unichar(target_text[text_index]),
|
||||
choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
|
||||
choices_pos, length);
|
||||
}
|
||||
SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
|
||||
rating + choice_rating, segmentation, best_rating, best_segmentation);
|
||||
if (applybox_debug > 3) {
|
||||
tprintf("End recursion for %d=%s\n", target_text[text_index],
|
||||
unicharset.id_to_unichar(target_text[text_index]));
|
||||
}
|
||||
}
|
||||
segmentation->resize(segmentation->size() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/// - Counts up the labelled words and the blobs within.
|
||||
/// - Deletes all unused or emptied words, counting the unused ones.
|
||||
/// - Resets W_BOL and W_EOL flags correctly.
|
||||
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
|
||||
void Tesseract::TidyUp(PAGE_RES *page_res) {
|
||||
int ok_blob_count = 0;
|
||||
int bad_blob_count = 0;
|
||||
int ok_word_count = 0;
|
||||
int unlabelled_words = 0;
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
WERD_RES *word_res;
|
||||
for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
|
||||
int ok_in_word = 0;
|
||||
int blob_count = word_res->correct_text.size();
|
||||
auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
|
||||
word_choice->set_permuter(TOP_CHOICE_PERM);
|
||||
for (int c = 0; c < blob_count; ++c) {
|
||||
if (word_res->correct_text[c].length() > 0) {
|
||||
++ok_in_word;
|
||||
}
|
||||
// Since we only need a fake word_res->best_choice, the actual
|
||||
// unichar_ids do not matter. Which is fortunate, since TidyUp()
|
||||
// can be called while training Tesseract, at the stage where
|
||||
// unicharset is not meaningful yet.
|
||||
word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
|
||||
1.0f, -1.0f);
|
||||
}
|
||||
if (ok_in_word > 0) {
|
||||
ok_blob_count += ok_in_word;
|
||||
bad_blob_count += word_res->correct_text.size() - ok_in_word;
|
||||
word_res->LogNewRawChoice(word_choice);
|
||||
word_res->LogNewCookedChoice(1, false, word_choice);
|
||||
} else {
|
||||
++unlabelled_words;
|
||||
if (applybox_debug > 0) {
|
||||
tprintf("APPLY_BOXES: Unlabelled word at :");
|
||||
word_res->word->bounding_box().print();
|
||||
}
|
||||
pr_it.DeleteCurrentWord();
|
||||
delete word_choice;
|
||||
}
|
||||
}
|
||||
pr_it.restart_page();
|
||||
for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
|
||||
// Denormalize back to a BoxWord.
|
||||
word_res->RebuildBestState();
|
||||
word_res->SetupBoxWord();
|
||||
word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
|
||||
word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
|
||||
}
|
||||
if (applybox_debug > 0) {
|
||||
tprintf(" Found %d good blobs.\n", ok_blob_count);
|
||||
if (bad_blob_count > 0) {
|
||||
tprintf(" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
|
||||
}
|
||||
if (unlabelled_words > 0) {
|
||||
tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Logs a bad box by line in the box file and box coords.*/
|
||||
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
|
||||
const char *err_msg) {
|
||||
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
|
||||
box.left(), box.bottom(), box.right(), box.top(), err_msg);
|
||||
}
|
||||
|
||||
/// Calls #LearnWord to extract features for labelled blobs within each word.
|
||||
/// Features are stored in an internal buffer.
|
||||
void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
int word_count = 0;
|
||||
for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
|
||||
LearnWord(fontname.c_str(), word_res);
|
||||
++word_count;
|
||||
}
|
||||
tprintf("Generated training data for %d words\n", word_count);
|
||||
}
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
|
||||
void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
|
||||
auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
|
||||
for (auto &correct_text : word_res->correct_text) {
|
||||
// The part before the first space is the real ground truth, and the
|
||||
// rest is the bounding box location and page number.
|
||||
std::vector<std::string> tokens = split(correct_text, ' ');
|
||||
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
|
||||
choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
|
||||
}
|
||||
word_res->ClearWordChoices();
|
||||
word_res->LogNewRawChoice(choice);
|
||||
word_res->LogNewCookedChoice(1, false, choice);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
2087
3rdparty/tesseract_ocr/tesseract/src/ccmain/control.cpp
vendored
Normal file
2087
3rdparty/tesseract_ocr/tesseract/src/ccmain/control.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
37
3rdparty/tesseract_ocr/tesseract/src/ccmain/control.h
vendored
Normal file
37
3rdparty/tesseract_ocr/tesseract/src/ccmain/control.h
vendored
Normal file
|
@ -0,0 +1,37 @@
|
|||
/**********************************************************************
|
||||
* File: control.h (Formerly control.h)
|
||||
* Description: Module-independent matcher controller.
|
||||
* Author: Ray Smith
|
||||
* Created: Thu Apr 23 11:09:58 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
/**
|
||||
* @file control.h
|
||||
* Module-independent matcher controller.
|
||||
*/
|
||||
|
||||
#ifndef CONTROL_H
|
||||
#define CONTROL_H
|
||||
|
||||
/// Letter-case category of a word; AC_UNACCEPTABLE marks a word that fits
/// none of the accepted patterns.
enum ACCEPTABLE_WERD_TYPE {
  AC_UNACCEPTABLE = 0, ///< Unacceptable word
  AC_LOWER_CASE = 1,   ///< All lower case
  AC_UPPER_CASE = 2,   ///< All upper case
  AC_INITIAL_CAP = 3,  ///< All lower case except the initial character
  AC_LC_ABBREV = 4,    ///< Lower-case abbreviation, e.g. a.b.c.
  AC_UC_ABBREV = 5     ///< Upper-case abbreviation, e.g. A.B.C.
};
|
||||
|
||||
#endif
|
932
3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.cpp
vendored
Normal file
932
3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.cpp
vendored
Normal file
|
@ -0,0 +1,932 @@
|
|||
/******************************************************************
|
||||
* File: docqual.cpp (Formerly docqual.c)
|
||||
* Description: Document Quality Metrics
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "docqual.h"
|
||||
#include <cctype>
|
||||
#include "reject.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "tessvars.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Blob-match callback: adds one to match_count for each matched blob.
// The blob index is not needed.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count += 1;
}
|
||||
|
||||
// Blob-match callback: counts every matched blob in match_count and, in
// addition, those whose reject_map entry is accepted in accepted_match_count.
static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
                               int index) {
  const bool accepted = word->reject_map[index].accepted();
  ++match_count;
  if (accepted) {
    ++accepted_match_count;
  }
}
|
||||
|
||||
// Blob-match callback: marks the reject_map entry at index as
// quality-accepted when the entry reports accept_if_good_quality().
static void acceptIfGoodQuality(WERD_RES *word, int index) {
  auto &&entry = word->reject_map[index];
  if (!entry.accept_if_good_quality()) {
    return;
  }
  entry.setrej_quality_accept();
}
|
||||
|
||||
/*************************************************************************
|
||||
* word_blob_quality()
|
||||
* How many blobs in the box_word are identical to those of the inword?
|
||||
* ASSUME blobs in both initial word and box_word are in ascending order of
|
||||
* left hand blob edge.
|
||||
*************************************************************************/
|
||||
int16_t Tesseract::word_blob_quality(WERD_RES *word) {
|
||||
int16_t match_count = 0;
|
||||
if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
|
||||
!word->rebuild_word->blobs.empty()) {
|
||||
using namespace std::placeholders; // for _1
|
||||
word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
|
||||
std::bind(countMatchingBlobs, match_count, _1));
|
||||
}
|
||||
return match_count;
|
||||
}
|
||||
|
||||
// Sums count_outline_errs() over the blobs of the rebuild_word, pairing the
// blob at position n with the n-th char of the best_choice unichar_string.
// Returns 0 when the word has no rebuild_word.
int16_t Tesseract::word_outline_errs(WERD_RES *word) {
  if (word->rebuild_word == nullptr) {
    return 0;
  }
  int16_t total_errs = 0;
  int16_t char_index = 0;
  for (int blob_index = 0; blob_index < word->rebuild_word->NumBlobs(); ++blob_index) {
    TBLOB *blob = word->rebuild_word->blobs[blob_index];
    const char ch = word->best_choice->unichar_string()[char_index];
    total_errs += count_outline_errs(ch, blob->NumOutlines());
    char_index++;
  }
  return total_errs;
}
|
||||
|
||||
/*************************************************************************
|
||||
* word_char_quality()
|
||||
* Combination of blob quality and outline quality - how many good chars are
|
||||
* there? - I.e chars which pass the blob AND outline tests.
|
||||
*************************************************************************/
|
||||
void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
|
||||
int16_t *accepted_match_count) {
|
||||
*match_count = 0;
|
||||
*accepted_match_count = 0;
|
||||
if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
|
||||
!word->rebuild_word->blobs.empty()) {
|
||||
using namespace std::placeholders; // for _1
|
||||
word->bln_boxes->ProcessMatchedBlobs(
|
||||
*word->rebuild_word,
|
||||
std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* unrej_good_chs()
|
||||
* Unreject POTENTIAL rejects if the blob passes the blob and outline checks
|
||||
*************************************************************************/
|
||||
void Tesseract::unrej_good_chs(WERD_RES *word) {
|
||||
if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
|
||||
word->rebuild_word->blobs.empty()) {
|
||||
using namespace std::placeholders; // for _1
|
||||
word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
|
||||
std::bind(acceptIfGoodQuality, word, _1));
|
||||
}
|
||||
}
|
||||
|
||||
int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
|
||||
int expected_outline_count;
|
||||
|
||||
if (outlines_odd.contains(c)) {
|
||||
return 0; // Don't use this char
|
||||
} else if (outlines_2.contains(c)) {
|
||||
expected_outline_count = 2;
|
||||
} else {
|
||||
expected_outline_count = 1;
|
||||
}
|
||||
return abs(outline_count - expected_outline_count);
|
||||
}
|
||||
|
||||
// Runs the quality-driven rejection passes over the page, in order:
// optional unrejection of good-quality words (good documents only),
// document/block/row rejection, then optional tilde crunching and deletion.
void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
  const bool unreject_first = tessedit_good_quality_unrej && good_quality_doc;
  if (unreject_first) {
    unrej_good_quality_words(page_res_it);
  }
  doc_and_block_rejection(page_res_it, good_quality_doc);
  if (!unlv_tilde_crunching) {
    return;
  }
  tilde_crunch(page_res_it);
  tilde_delete(page_res_it);
}
|
||||
|
||||
/*************************************************************************
|
||||
* unrej_good_quality_words()
|
||||
* Accept potential rejects in words which pass the following checks:
|
||||
* - Contains a potential reject
|
||||
* - Word looks like a sensible alpha word.
|
||||
* - Word segmentation is the same as the original image
|
||||
* - All characters have the expected number of outlines
|
||||
* NOTE - the rejection counts are recalculated after unrejection
|
||||
* - CAN'T do it in a single pass without a bit of fiddling
|
||||
* - keep it simple but inefficient
|
||||
*************************************************************************/
|
||||
void Tesseract::unrej_good_quality_words( // unreject potential
|
||||
PAGE_RES_IT &page_res_it) {
|
||||
WERD_RES *word;
|
||||
ROW_RES *current_row;
|
||||
BLOCK_RES *current_block;
|
||||
int i;
|
||||
|
||||
page_res_it.restart_page();
|
||||
while (page_res_it.word() != nullptr) {
|
||||
check_debug_pt(page_res_it.word(), 100);
|
||||
if (bland_unrej) {
|
||||
word = page_res_it.word();
|
||||
for (i = 0; i < word->reject_map.length(); i++) {
|
||||
if (word->reject_map[i].accept_if_good_quality()) {
|
||||
word->reject_map[i].setrej_quality_accept();
|
||||
}
|
||||
}
|
||||
page_res_it.forward();
|
||||
} else if ((page_res_it.row()->char_count > 0) &&
|
||||
((page_res_it.row()->rej_count /
|
||||
static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
|
||||
word = page_res_it.word();
|
||||
if (word->reject_map.quality_recoverable_rejects() &&
|
||||
(tessedit_unrej_any_wd ||
|
||||
acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
|
||||
word->best_choice->unichar_lengths().c_str()) !=
|
||||
AC_UNACCEPTABLE)) {
|
||||
unrej_good_chs(word);
|
||||
}
|
||||
page_res_it.forward();
|
||||
} else {
|
||||
// Skip to end of dodgy row.
|
||||
current_row = page_res_it.row();
|
||||
while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
|
||||
page_res_it.forward();
|
||||
}
|
||||
}
|
||||
check_debug_pt(page_res_it.word(), 110);
|
||||
}
|
||||
page_res_it.restart_page();
|
||||
page_res_it.page_res->char_count = 0;
|
||||
page_res_it.page_res->rej_count = 0;
|
||||
current_block = nullptr;
|
||||
current_row = nullptr;
|
||||
while (page_res_it.word() != nullptr) {
|
||||
if (current_block != page_res_it.block()) {
|
||||
current_block = page_res_it.block();
|
||||
current_block->char_count = 0;
|
||||
current_block->rej_count = 0;
|
||||
}
|
||||
if (current_row != page_res_it.row()) {
|
||||
current_row = page_res_it.row();
|
||||
current_row->char_count = 0;
|
||||
current_row->rej_count = 0;
|
||||
current_row->whole_word_rej_count = 0;
|
||||
}
|
||||
page_res_it.rej_stat_word();
|
||||
page_res_it.forward();
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* doc_and_block_rejection()
|
||||
*
|
||||
* If the page has too many rejects - reject all of it.
|
||||
* If any block has too many rejects - reject all words in the block
|
||||
*************************************************************************/
|
||||
|
||||
void Tesseract::doc_and_block_rejection( // reject big chunks
|
||||
PAGE_RES_IT &page_res_it, bool good_quality_doc) {
|
||||
int16_t block_no = 0;
|
||||
int16_t row_no = 0;
|
||||
BLOCK_RES *current_block;
|
||||
ROW_RES *current_row;
|
||||
|
||||
bool rej_word;
|
||||
bool prev_word_rejected;
|
||||
int16_t char_quality = 0;
|
||||
int16_t accepted_char_quality;
|
||||
|
||||
if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
|
||||
tessedit_reject_doc_percent) {
|
||||
reject_whole_page(page_res_it);
|
||||
if (tessedit_debug_doc_rejection) {
|
||||
tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
|
||||
page_res_it.page_res->rej_count);
|
||||
}
|
||||
} else {
|
||||
if (tessedit_debug_doc_rejection) {
|
||||
tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
|
||||
page_res_it.page_res->rej_count);
|
||||
}
|
||||
|
||||
/* Walk blocks testing for block rejection */
|
||||
|
||||
page_res_it.restart_page();
|
||||
WERD_RES *word;
|
||||
while ((word = page_res_it.word()) != nullptr) {
|
||||
current_block = page_res_it.block();
|
||||
block_no = current_block->block->pdblk.index();
|
||||
if (current_block->char_count > 0 &&
|
||||
(current_block->rej_count * 100.0 / current_block->char_count) >
|
||||
tessedit_reject_block_percent) {
|
||||
if (tessedit_debug_block_rejection) {
|
||||
tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
|
||||
current_block->char_count, current_block->rej_count);
|
||||
}
|
||||
prev_word_rejected = false;
|
||||
while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
|
||||
if (tessedit_preserve_blk_rej_perfect_wds) {
|
||||
rej_word = word->reject_map.reject_count() > 0 ||
|
||||
word->reject_map.length() < tessedit_preserve_min_wd_len;
|
||||
if (rej_word && tessedit_dont_blkrej_good_wds &&
|
||||
word->reject_map.length() >= tessedit_preserve_min_wd_len &&
|
||||
acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
|
||||
word->best_choice->unichar_lengths().c_str()) !=
|
||||
AC_UNACCEPTABLE) {
|
||||
word_char_quality(word, &char_quality, &accepted_char_quality);
|
||||
rej_word = char_quality != word->reject_map.length();
|
||||
}
|
||||
} else {
|
||||
rej_word = true;
|
||||
}
|
||||
if (rej_word) {
|
||||
/*
|
||||
Reject spacing if both current and prev words are rejected.
|
||||
NOTE - this is NOT restricted to FUZZY spaces. - When tried this
|
||||
generated more space errors.
|
||||
*/
|
||||
if (tessedit_use_reject_spaces && prev_word_rejected &&
|
||||
page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
|
||||
word->reject_spaces = true;
|
||||
}
|
||||
word->reject_map.rej_word_block_rej();
|
||||
}
|
||||
prev_word_rejected = rej_word;
|
||||
page_res_it.forward();
|
||||
}
|
||||
} else {
|
||||
if (tessedit_debug_block_rejection) {
|
||||
tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
|
||||
page_res_it.block()->char_count, page_res_it.block()->rej_count);
|
||||
}
|
||||
|
||||
/* Walk rows in block testing for row rejection */
|
||||
row_no = 0;
|
||||
while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
|
||||
current_row = page_res_it.row();
|
||||
row_no++;
|
||||
/* Reject whole row if:
|
||||
fraction of chars on row which are rejected exceed a limit AND
|
||||
fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
|
||||
limit
|
||||
*/
|
||||
if (current_row->char_count > 0 &&
|
||||
(current_row->rej_count * 100.0 / current_row->char_count) >
|
||||
tessedit_reject_row_percent &&
|
||||
(current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
|
||||
tessedit_whole_wd_rej_row_percent) {
|
||||
if (tessedit_debug_block_rejection) {
|
||||
tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
|
||||
current_row->char_count, current_row->rej_count);
|
||||
}
|
||||
prev_word_rejected = false;
|
||||
while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
|
||||
/* Preserve words on good docs unless they are mostly rejected*/
|
||||
if (!tessedit_row_rej_good_docs && good_quality_doc) {
|
||||
rej_word = word->reject_map.reject_count() /
|
||||
static_cast<float>(word->reject_map.length()) >
|
||||
tessedit_good_doc_still_rowrej_wd;
|
||||
} else if (tessedit_preserve_row_rej_perfect_wds) {
|
||||
/* Preserve perfect words anyway */
|
||||
rej_word = word->reject_map.reject_count() > 0 ||
|
||||
word->reject_map.length() < tessedit_preserve_min_wd_len;
|
||||
if (rej_word && tessedit_dont_rowrej_good_wds &&
|
||||
word->reject_map.length() >= tessedit_preserve_min_wd_len &&
|
||||
acceptable_word_string(
|
||||
*word->uch_set, word->best_choice->unichar_string().c_str(),
|
||||
word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
|
||||
word_char_quality(word, &char_quality, &accepted_char_quality);
|
||||
rej_word = char_quality != word->reject_map.length();
|
||||
}
|
||||
} else {
|
||||
rej_word = true;
|
||||
}
|
||||
if (rej_word) {
|
||||
/*
|
||||
Reject spacing if both current and prev words are rejected.
|
||||
NOTE - this is NOT restricted to FUZZY spaces. - When tried
|
||||
this generated more space errors.
|
||||
*/
|
||||
if (tessedit_use_reject_spaces && prev_word_rejected &&
|
||||
page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
|
||||
word->reject_spaces = true;
|
||||
}
|
||||
word->reject_map.rej_word_row_rej();
|
||||
}
|
||||
prev_word_rejected = rej_word;
|
||||
page_res_it.forward();
|
||||
}
|
||||
} else {
|
||||
if (tessedit_debug_block_rejection) {
|
||||
tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
|
||||
current_row->char_count, current_row->rej_count);
|
||||
}
|
||||
while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
|
||||
page_res_it.forward();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* reject_whole_page()
|
||||
* Don't believe any of it - set the reject map to 00..00 in all words
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
void reject_whole_page(PAGE_RES_IT &page_res_it) {
|
||||
page_res_it.restart_page();
|
||||
while (page_res_it.word() != nullptr) {
|
||||
page_res_it.word()->reject_map.rej_word_doc_rej();
|
||||
page_res_it.forward();
|
||||
}
|
||||
// whole page is rejected
|
||||
page_res_it.page_res->rejected = true;
|
||||
}
|
||||
|
||||
// Walks every word on the page and marks the ones to be "crunched" by setting
// their unlv_crunch_mode to CR_KEEP_SPACE.
//
// Words inside a non-text poly block are skipped. A word with at least one
// accepted character is never crunched and resets the running state. For the
// remaining words:
//  * a "terrible" word is crunched immediately, together with any run of
//    "potential" words that directly precedes it;
//  * a "potential" word is crunched if it directly follows a terrible word,
//    otherwise the start of the run of potential words is remembered;
//  * any other word resets the running state.
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;        // First word of the pending run of potential crunches.
  bool potential_run_open = false; // True while run_start_it marks such a run.
  bool last_was_terrible = false;  // True while we are right after a terrible word.

  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    POLY_BLOCK *poly = page_res_it.block()->block->pdblk.poly_block();
    if (poly != nullptr && !poly->IsText()) {
      continue; // Not a text region: leave the word untouched.
    }
    WERD_RES *word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs) {
      convert_bad_unlv_chs(word);
    }
    if (crunch_early_merge_tess_fails) {
      word->merge_tess_fails();
    }

    if (word->reject_map.accept_count() != 0) {
      // Something in this word was accepted: forget earlier potential crunches.
      last_was_terrible = false;
      potential_run_open = false;
      continue;
    }

    const bool ok_dict_word = safe_dict_word(word);
    const GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
    const bool may_crunch = garbage_level != G_NEVER_CRUNCH;

    if (may_crunch && terrible_word_crunch(word, garbage_level)) {
      if (crunch_debug > 0) {
        tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
      }
      word->unlv_crunch_mode = CR_KEEP_SPACE;
      if (potential_run_open) {
        // Crunch every pending potential word leading up to this one.
        for (; run_start_it.word() != word; run_start_it.forward()) {
          if (crunch_debug > 0) {
            tprintf("P1 CRUNCHING: \"%s\"\n",
                    run_start_it.word()->best_choice->unichar_string().c_str());
          }
          run_start_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        potential_run_open = false;
      }
      last_was_terrible = true;
    } else if (may_crunch && potential_word_crunch(word, garbage_level, ok_dict_word)) {
      if (last_was_terrible) {
        if (crunch_debug > 0) {
          tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
      } else if (!potential_run_open) {
        // Remember where this run of potential crunches begins.
        run_start_it = page_res_it;
        potential_run_open = true;
        if (crunch_debug > 1) {
          tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
      }
    } else {
      // Neither terrible nor potential: forget earlier potential crunches.
      last_was_terrible = false;
      potential_run_open = false;
      if (crunch_debug > 2) {
        tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
      }
    }
  }
}
|
||||
|
||||
// Decides whether a word is bad enough to be crunched outright.
//
// Returns true when any of the following holds (the number is the reason code
// printed when crunch_debug > 2):
//  1. the best-choice string is empty or consists only of spaces;
//  2. the rating per character exceeds crunch_terrible_rating;
//  3. crunch_terrible_garbage is set and garbage_level is G_TERRIBLE;
//  4. the certainty is below crunch_poor_garbage_cert and the word is not G_OK;
//  5. the rating per character exceeds crunch_poor_garbage_rate and the word
//     is not G_OK.
// The per-character rating divides by the reject-map length, capped at
// crunch_rating_max.
bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  const auto &text = word->best_choice->unichar_string();
  int reason = 0;

  if (text.empty() || strspn(text.c_str(), " ") == text.size()) {
    reason = 1;
  } else {
    int capped_len = word->reject_map.length();
    if (capped_len > crunch_rating_max) {
      capped_len = crunch_rating_max;
    }
    const float rating_per_ch = word->best_choice->rating() / capped_len;
    const bool not_ok = garbage_level != G_OK;

    if (rating_per_ch > crunch_terrible_rating) {
      reason = 2;
    } else if (crunch_terrible_garbage && garbage_level == G_TERRIBLE) {
      reason = 3;
    } else if (word->best_choice->certainty() < crunch_poor_garbage_cert && not_ok) {
      reason = 4;
    } else if (rating_per_ch > crunch_poor_garbage_rate && not_ok) {
      reason = 5;
    }
  }

  if (reason == 0) {
    return false;
  }
  if (crunch_debug > 2) {
    tprintf("Terrible_word_crunch (%d) on \"%s\"\n", reason, text.c_str());
  }
  return true;
}
|
||||
|
||||
// Decides whether a word is a candidate for crunching, to be confirmed later
// by its neighbours.
//
// Up to three "poor" indicators are counted:
//  * rating per character above crunch_pot_poor_rate (the divisor is the
//    reject-map length capped at 10);
//  * certainty below crunch_pot_poor_cert, counted only if the word is
//    crunchable (see below);
//  * garbage_level other than G_OK.
// The word is a potential crunch when the count reaches crunch_pot_indicators.
//
// A word is crunchable when crunch_leave_accept_strings is off, or it has
// fewer than 3 characters, or it is neither an acceptable word string nor an
// ok dictionary word.
bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  const char *text = word->best_choice->unichar_string().c_str();
  const char *char_lengths = word->best_choice->unichar_lengths().c_str();

  const bool word_crunchable =
      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
      (acceptable_word_string(*word->uch_set, text, char_lengths) == AC_UNACCEPTABLE &&
       !ok_dict_word);

  int capped_len = word->reject_map.length();
  if (capped_len > 10) {
    capped_len = 10;
  }
  const float rating_per_ch = word->best_choice->rating() / capped_len;
  const bool verbose = crunch_debug > 2;

  int indicators = 0;

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (verbose) {
      tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++indicators;
  }

  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (verbose) {
      tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++indicators;
  }

  if (garbage_level != G_OK) {
    if (verbose) {
      tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++indicators;
  }

  return indicators >= crunch_pot_indicators;
}
|
||||
|
||||
// Walks every word on the page and upgrades the crunch mode of deletable
// words that sit at the beginning or end of a line.
//
//  * From a deletable word flagged W_BOL onwards, each consecutive deletable
//    word receives the mode returned by word_deletable().
//  * When a deletable word flagged W_EOL is reached, it and the unbroken run
//    of deletable words leading up to it receive their word_deletable() mode.
//  * A deletable word in the middle of a line only marks the possible start
//    of such a run; a non-deletable word cancels any pending run.
// If tess fails were not merged early, they are merged here for every word,
// after the word has been assessed.
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;         // First word of the pending deletable run.
  bool run_marked = false;          // True while run_start_it marks such a run.
  bool deleting_from_bol = false;   // True while deleting forward from a line start.
  int16_t debug_mode;               // Reason code reported by word_deletable().

  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();
    const CRUNCH_MODE delete_mode = word_deletable(word, debug_mode);

    if (delete_mode == CR_NONE) {
      // Forget earlier potential crunches.
      deleting_from_bol = false;
      run_marked = false;
    } else if (word->word->flag(W_BOL) || deleting_from_bol) {
      if (crunch_debug > 0) {
        tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_mode,
                word->best_choice->unichar_string().c_str());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = true;
    } else if (word->word->flag(W_EOL)) {
      if (run_marked) {
        // Apply the deletion to every pending word up to this one.
        for (; run_start_it.word() != word; run_start_it.forward()) {
          int16_t pending_debug_mode;
          const CRUNCH_MODE pending_mode = word_deletable(run_start_it.word(), pending_debug_mode);
          if (crunch_debug > 0) {
            tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", pending_debug_mode,
                    run_start_it.word()->best_choice->unichar_string().c_str());
          }
          run_start_it.word()->unlv_crunch_mode = pending_mode;
        }
      }
      if (crunch_debug > 0) {
        tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_mode,
                word->best_choice->unichar_string().c_str());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = false;
      run_marked = false;
    } else if (!run_marked) {
      // Mid-line deletable word: remember where the run begins.
      run_start_it = page_res_it;
      run_marked = true;
    }

    // This step is deliberately left until now because the tess fails are
    // used above to determine whether the word is deletable.
    if (!crunch_early_merge_tess_fails) {
      word->merge_tess_fails();
    }
  }
}
|
||||
|
||||
// Replaces characters in the best choice of word_res: every "~" becomes "-"
// and every "^" becomes " ". Each replaced position that was still accepted
// in the reject map is marked with setrej_unlv_rej().
void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  const auto &charset = *word_res->uch_set;
  const UNICHAR_ID dash_id = charset.unichar_to_id("-");
  const UNICHAR_ID space_id = charset.unichar_to_id(" ");
  const UNICHAR_ID tilde_id = charset.unichar_to_id("~");
  const UNICHAR_ID caret_id = charset.unichar_to_id("^");

  // If position `index` currently holds `from`, overwrite it with `to` and
  // make sure that position ends up rejected.
  const auto substitute = [word_res](int index, UNICHAR_ID from, UNICHAR_ID to) {
    if (word_res->best_choice->unichar_id(index) != from) {
      return;
    }
    word_res->best_choice->set_unichar_id(to, index);
    if (word_res->reject_map[index].accepted()) {
      word_res->reject_map[index].setrej_unlv_rej();
    }
  };

  for (int i = 0; i < word_res->reject_map.length(); ++i) {
    // The second check re-reads the position after the first substitution.
    substitute(i, tilde_id, dash_id);
    substitute(i, caret_id, space_id);
  }
}
|
||||
|
||||
// Classifies how garbage-like the best choice of a word looks.
//
// The best-choice string is scanned one unichar at a time with a small state
// machine that tracks runs of upper-case letters, lower-case letters and
// digits. The scan counts isolated digits, isolated letters, tess rejects
// (single-byte ' '), other non-alphanumeric characters, and the longest runs
// and repetitions of letters.
//
// Returns:
//  G_NEVER_CRUNCH - crunch_leave_ok_strings is set, the word has at least 4
//                   unichars, is mostly non-isolated alphas with no long
//                   repetition, and is either an acceptable word string (when
//                   crunch_accept_ok is set) or has a long enough lower-case
//                   or upper-case run.
//  G_OK           - the word looks fine (see the individual checks below).
//  G_TERRIBLE     - tess rejects dominate the word.
//  G_DODGY        - too many dodgy characters for the word length.
//
// NOTE: the scan uses its own cursors so that `str` and `lengths` still point
// at the START of the word when they are handed to acceptable_word_string()
// and strpbrk() afterwards. Reusing the advanced pointers would make those
// checks look at an empty string.
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
  enum STATES {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  const char *const str = word->best_choice->unichar_string().c_str();
  const char *const lengths = word->best_choice->unichar_lengths().c_str();
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  // Handles one letter of a given case. `first`/`subsequent` are the states
  // for that case; `run_len`/`longest_run` are that case's run counters.
  const auto note_alpha = [&](const char *ch, int ch_bytes, STATES first, STATES subsequent,
                              int &run_len, int &longest_run) {
    const UNICHAR_ID id = word->uch_set->unichar_to_id(ch, ch_bytes);
    total_alpha_count++;
    if (state == first || state == subsequent) {
      // Continuing a run of the same case.
      state = subsequent;
      run_len++;
      if (longest_run < run_len) {
        longest_run = run_len;
      }
      if (last_char == id) {
        alpha_repetition_count++;
        if (longest_alpha_repetition_count < alpha_repetition_count) {
          longest_alpha_repetition_count = alpha_repetition_count;
        }
      } else {
        last_char = id;
        alpha_repetition_count = 1;
      }
    } else {
      // Starting a new run; a single digit just before it was isolated.
      if (state == FIRST_NUM) {
        isolated_digits++;
      }
      state = first;
      last_char = id;
      alpha_repetition_count = 1;
      run_len = 1;
    }
  };

  // Accounts for a run of length one that ends at a junk character or at the
  // end of the word.
  const auto close_run = [&]() {
    switch (state) {
      case FIRST_NUM:
        isolated_digits++;
        break;
      case FIRST_UPPER:
      case FIRST_LOWER:
        isolated_alphas++;
        break;
      default:
        break;
    }
  };

  const char *ch = str;          // Scan cursor into the string.
  const char *ch_bytes = lengths; // Scan cursor into the per-unichar byte lengths.
  for (; *ch != '\0'; ch += *(ch_bytes++)) {
    len++;
    if (word->uch_set->get_isupper(ch, *ch_bytes)) {
      note_alpha(ch, *ch_bytes, FIRST_UPPER, SUBSEQUENT_UPPER, upper_string_count,
                 longest_upper_run_len);
    } else if (word->uch_set->get_islower(ch, *ch_bytes)) {
      note_alpha(ch, *ch_bytes, FIRST_LOWER, SUBSEQUENT_LOWER, lower_string_count,
                 longest_lower_run_len);
    } else if (word->uch_set->get_isdigit(ch, *ch_bytes)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          break;
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Fall through.
        default:
          state = FIRST_NUM;
          break;
      }
    } else {
      if (*ch_bytes == 1 && *ch == ' ') {
        tess_rejs++;
      } else {
        bad_char_count++;
      }
      close_run();
      state = JUNK;
    }
  }
  close_run();

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
      longest_alpha_repetition_count < crunch_long_repetitions) {
    if ((crunch_accept_ok &&
         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
        longest_lower_run_len > crunch_leave_lc_strings ||
        longest_upper_run_len > crunch_leave_uc_strings) {
      return G_NEVER_CRUNCH;
    }
  }
  // A multi-character word without tess rejects that came from a dictionary
  // or number permuter, or is otherwise acceptable, is fine.
  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
       word->best_choice->permuter() == FREQ_DAWG_PERM ||
       word->best_choice->permuter() == USER_DAWG_PERM ||
       word->best_choice->permuter() == NUMBER_PERM ||
       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
    return G_OK;
  }

  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
    tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
            isolated_digits, isolated_alphas, tess_rejs);
  }
  if (bad_char_count == 0 && tess_rejs == 0 &&
      (len > isolated_digits + isolated_alphas || len <= 2)) {
    return G_OK;
  }

  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
    return G_TERRIBLE;
  }

  if (len > 4) {
    // Longer words: isolated characters count as dodgy too.
    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  } else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  }
}
|
||||
|
||||
/*************************************************************************
|
||||
* word_deletable()
|
||||
* DELETE WERDS AT ENDS OF ROWS IF
|
||||
* Word is crunched &&
|
||||
* ( string length = 0 OR
|
||||
* > 50% of chars are "|" (before merging) OR
|
||||
* certainty < -10 OR
|
||||
* rating /char > 60 OR
|
||||
* TOP of word is more than 0.5 xht BELOW baseline OR
|
||||
* BOTTOM of word is more than 0.5 xht ABOVE xht OR
|
||||
* length of word < 3xht OR
|
||||
* height of word < 0.7 xht OR
|
||||
* height of word > 3.0 xht OR
|
||||
* >75% of the outline BBs have longest dimension < 0.5xht
|
||||
*************************************************************************/
|
||||
|
||||
CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
|
||||
int word_len = word->reject_map.length();
|
||||
float rating_per_ch;
|
||||
TBOX box; // BB of word
|
||||
|
||||
if (word->unlv_crunch_mode == CR_NONE) {
|
||||
delete_mode = 0;
|
||||
return CR_NONE;
|
||||
}
|
||||
|
||||
if (word_len == 0) {
|
||||
delete_mode = 1;
|
||||
return CR_DELETE;
|
||||
}
|
||||
|
||||
if (word->rebuild_word != nullptr) {
|
||||
// Cube leaves rebuild_word nullptr.
|
||||
box = word->rebuild_word->bounding_box();
|
||||
if (box.height() < crunch_del_min_ht * kBlnXHeight) {
|
||||
delete_mode = 4;
|
||||
return CR_DELETE;
|
||||
}
|
||||
|
||||
if (noise_outlines(word->rebuild_word)) {
|
||||
delete_mode = 5;
|
||||
return CR_DELETE;
|
||||
}
|
||||
}
|
||||
|
||||
if ((failure_count(word) * 1.5) > word_len) {
|
||||
delete_mode = 2;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
if (word->best_choice->certainty() < crunch_del_cert) {
|
||||
delete_mode = 7;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
rating_per_ch = word->best_choice->rating() / word_len;
|
||||
|
||||
if (rating_per_ch > crunch_del_rating) {
|
||||
delete_mode = 8;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
|
||||
delete_mode = 9;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
|
||||
delete_mode = 10;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
if (box.height() > crunch_del_max_ht * kBlnXHeight) {
|
||||
delete_mode = 11;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
if (box.width() < crunch_del_min_width * kBlnXHeight) {
|
||||
delete_mode = 3;
|
||||
return CR_LOOSE_SPACE;
|
||||
}
|
||||
|
||||
delete_mode = 0;
|
||||
return CR_NONE;
|
||||
}
|
||||
|
||||
// Returns the number of ' ' bytes in the best-choice string of the word.
// Callers in this file treat each such space as a tess failure.
int16_t Tesseract::failure_count(WERD_RES *word) {
  int space_total = 0;
  const char *cursor = word->best_choice->unichar_string().c_str();
  while (*cursor != '\0') {
    if (*cursor == ' ') {
      ++space_total;
    }
    ++cursor;
  }
  return space_total;
}
|
||||
|
||||
// Returns true when every outline of every blob in the word is "small", i.e.
// the longer side of its bounding box is below
// kBlnXHeight * crunch_small_outlines_size. A word with no outlines at all
// also yields true (0 >= 0).
bool Tesseract::noise_outlines(TWERD *word) {
  const float small_limit = kBlnXHeight * crunch_small_outlines_size;
  int16_t total_outlines = 0;
  int16_t small_outlines = 0;

  for (int blob_index = 0; blob_index < word->NumBlobs(); ++blob_index) {
    for (TESSLINE *outline = word->blobs[blob_index]->outlines; outline != nullptr;
         outline = outline->next) {
      ++total_outlines;
      TBOX outline_box = outline->bounding_box(); // BB of outline
      const int16_t longest_side = outline_box.height() > outline_box.width()
                                       ? outline_box.height()
                                       : outline_box.width();
      if (longest_side < small_limit) {
        ++small_outlines;
      }
    }
  }
  return small_outlines >= total_outlines;
}
|
||||
|
||||
} // namespace tesseract
|
37
3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.h
vendored
Normal file
37
3rdparty/tesseract_ocr/tesseract/src/ccmain/docqual.h
vendored
Normal file
|
@ -0,0 +1,37 @@
|
|||
/******************************************************************
|
||||
* File: docqual.h (Formerly docqual.h)
|
||||
* Description: Document Quality Metrics
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef DOCQUAL_H
|
||||
#define DOCQUAL_H
|
||||
|
||||
#include <cstdint> // for int16_t
|
||||
|
||||
namespace tesseract {

// Only pointers and references to these types appear below, so forward
// declarations are sufficient.
class PAGE_RES_IT;
class ROW;
class WERD_RES;

// Verdict on how garbage-like a word looks. G_NEVER_CRUNCH exempts the word
// from crunching; the remaining values run from acceptable to terrible.
enum GARBAGE_LEVEL {
  G_NEVER_CRUNCH,
  G_OK,
  G_DODGY,
  G_TERRIBLE
};

// Quality metric for the blobs of a word.
// NOTE(review): the definition is not in this header - confirm the exact
// meaning of the returned count against the implementation.
int16_t word_blob_quality(WERD_RES *word);

// Marks every word on the page, and the page itself, as rejected.
void reject_whole_page(PAGE_RES_IT &page_res_it);

} // namespace tesseract
|
||||
|
||||
#endif
|
1451
3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.cpp
vendored
Normal file
1451
3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
250
3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.h
vendored
Normal file
250
3rdparty/tesseract_ocr/tesseract/src/ccmain/equationdetect.h
vendored
Normal file
|
@ -0,0 +1,250 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: equationdetect.h
|
||||
// Description: The equation detection class that inherits equationdetectbase.
|
||||
// Author: Zongyi (Joe) Liu (joeliu@google.com)
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_
|
||||
#define TESSERACT_CCMAIN_EQUATIONDETECT_H_
|
||||
|
||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||
#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
|
||||
#include "equationdetectbase.h" // for EquationDetectBase
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
class TBOX;
|
||||
class UNICHARSET;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class Tesseract;
|
||||
class ColPartition;
|
||||
class ColPartitionGrid;
|
||||
class ColPartitionSet;
|
||||
|
||||
// Equation detector: labels special text in blobs and finds equation
// partitions in a ColPartitionGrid. Implements the EquationDetectBase
// interface (LabelSpecialText / FindEquationParts).
class TESS_API EquationDetect : public EquationDetectBase {
public:
  EquationDetect(const char *equ_datapath, const char *equ_language);
  ~EquationDetect() override;

  enum IndentType { NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT, INDENT_TYPE_COUNT };

  // Resets the lang_tesseract_ pointer. Must be called before any detector
  // work is done.
  void SetLangTesseract(Tesseract *lang_tesseract);

  // Iterates over the blobs inside to_block and sets the blobs we want to
  // process to BSTT_NONE (by default they should be BSTT_SKIP).
  // Returns 0 upon success.
  int LabelSpecialText(TO_BLOCK *to_block) override;

  // Finds possible equation partitions in part_grid. Should be called after
  // the special_text_type of the blobs has been set.
  // Returns 0 upon success.
  int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) override;

  // Resets the resolution of the processing image. TEST only function.
  void SetResolution(const int resolution);

protected:
  // Identifies the special text type of one blob and updates its field. When
  // height_th is set (> 0), the blob is labelled BSTT_NONE if its height is
  // less than height_th.
  void IdentifySpecialText(BLOBNBOX *blob, const int height_th);

  // Estimates the type of one unichar.
  BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset,
                                             const UNICHAR_ID id) const;

  // Computes the special text type for each blob in part_grid_.
  void IdentifySpecialText();

  // Identifies blobs that we want to skip during special blob type
  // classification.
  void IdentifyBlobsToSkip(ColPartition *part);

  // The ColPartitions in part_grid_ may be over-segmented, particularly in
  // block equation regions, so such partitions are identified and merged
  // before the search is done.
  void MergePartsByLocation();

  // Starting from the seed center, does a radius search. Partitions that have
  // a large overlap with seed are removed from part_grid_ and added to
  // parts_overlap. Note: this function may update part_grid_, so if the
  // caller is also running a ColPartitionGridSearch, use RepositionIterator
  // to continue.
  void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);

  // Inserts part back into part_grid_ after it has absorbed other parts.
  void InsertPartAfterAbsorb(ColPartition *part);

  // Identifies the ColPartitions in part_grid_, labels them as PT_EQUATION
  // and saves them into cp_seeds_.
  void IdentifySeedParts();

  // Checks the blob count for a seed region candidate.
  bool CheckSeedBlobsCount(ColPartition *part);

  // Computes the foreground pixel density for a tbox area.
  float ComputeForegroundDensity(const TBOX &tbox);

  // Checks whether part qualifies for the seed2 label: low math density and
  // left indented. Two checks are used:
  // 1. Whether its left is aligned with any coordinate in
  //    indented_texts_left, which is assumed to be sorted.
  // 2. Whether its foreground density is over foreground_density_th.
  bool CheckForSeed2(const std::vector<int> &indented_texts_left,
                     const float foreground_density_th, ColPartition *part);

  // Counts the number of values in sorted_vec that are close to val. Used to
  // check whether a partition is aligned with text partitions.
  int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;

  // Checks a seed candidate using the foreground pixel density. Returns true
  // if the density is below a certain threshold, because characters in
  // equation regions are usually further apart, with more white space.
  bool CheckSeedFgDensity(const float density_th, ColPartition *part);

  // A light version of SplitCPHor: instead of really splitting the part, only
  // the union bounding box of each split part is computed.
  void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);

  // Splits the part horizontally and saves the result into parts_splitted.
  // It is the caller's responsibility to release the memory owned by
  // parts_splitted. The part itself is unchanged by this process and still
  // owns the blobs, so do NOT call DeleteBoxes when freeing the ColPartitions
  // in parts_splitted.
  void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);

  // Checks the density of a seed candidate (part) using its math density and
  // italic density. Returns true if the check passed.
  bool CheckSeedDensity(const float math_density_high, const float math_density_low,
                        const ColPartition *part) const;

  // Checks whether part is indented.
  IndentType IsIndented(ColPartition *part);

  // Identifies inline partitions from cp_seeds_ and re-labels them.
  void IdentifyInlineParts();

  // Computes the super bounding box of all ColPartitions inside part_grid_.
  void ComputeCPsSuperBBox();

  // Identifies inline partitions from cp_seeds_ using a horizontal search.
  void IdentifyInlinePartsHorizontal();

  // Estimates the line spacing between two text partitions. Returns -1 if
  // there is not enough data.
  int EstimateTextPartLineSpacing();

  // Identifies inline partitions from cp_seeds_ using a vertical search.
  void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing);

  // Checks whether part is an inline equation zone. Should be called after
  // the seed regions have been identified.
  bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part);

  // For a given seed partition, searches part_grid_ for partitions that can
  // be merged with it. Returns true if the seed has been expanded.
  bool ExpandSeed(ColPartition *seed);

  // Starting from the seed position, searches part_grid_
  // horizontally/vertically, finds all partitions that can be merged with
  // seed, removes them from part_grid_ and puts them into parts_to_merge.
  void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
                            std::vector<ColPartition *> *parts_to_merge);
  void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
                          std::vector<ColPartition *> *parts_to_merge);

  // Checks whether part_box is the small neighbor of seed_box.
  bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;

  // Performs the density check for part, which is assumed to be near a seed
  // partition. Returns true if the check passed.
  bool CheckSeedNeighborDensity(const ColPartition *part) const;

  // After the math blocks have been identified, scans all text partitions
  // once more and checks whether any of them is the satellite of a math
  // block. Here p is the satellite of q if:
  // 1. q is the nearest vertical neighbor of p, and
  // 2. y_gap(p, q) is less than a threshold, and
  // 3. x_overlap(p, q) is over a threshold.
  // Note that p can be the satellite of two blocks: its top neighbor and its
  // bottom neighbor.
  void ProcessMathBlockSatelliteParts();

  // Checks whether part is the satellite of one or two math blocks. If it is,
  // returns true and saves the blocks into math_blocks.
  bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);

  // Searches for the nearest neighbor of part in one vertical direction, as
  // selected by search_bottom. Returns the neighbor found that has major x
  // overlap with part, or nullptr when none is found.
  ColPartition *SearchNNVertical(const bool search_bottom, const ColPartition *part);

  // Checks whether the neighbor at a vertical distance of y_gap is a near and
  // math block partition.
  bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;

  // Generates the tiff file name for an output/debug file.
  void GetOutputTiffName(const char *name, std::string &image_name) const;

  // Debug function that renders ColPartitions on the input image: parts
  // labelled PT_EQUATION are painted in red, PT_INLINE_EQUATION in green and
  // all other parts in blue.
  void PaintColParts(const std::string &outfile) const;

  // Debug function that renders the blobs in part_grid_ over the input image.
  void PaintSpecialTexts(const std::string &outfile) const;

  // Debug function that prints the math blob density values of a
  // ColPartition object.
  void PrintSpecialBlobsDensity(const ColPartition *part) const;

  // The tesseract engine initialized from equation training data.
  Tesseract equ_tesseract_;

  // The tesseract engine used for OCR. This pointer is passed in by the
  // caller, so do NOT destroy it in this class. Defaults to nullptr so that it
  // never holds an indeterminate value before SetLangTesseract() is called.
  Tesseract *lang_tesseract_ = nullptr;

  // The ColPartitionGrid being processed. This pointer is passed in by the
  // caller, so do NOT destroy it in this class.
  ColPartitionGrid *part_grid_ = nullptr;

  // A simple array of pointers to the best assigned column division at each
  // grid y coordinate. This pointer is passed in by the caller, so do NOT
  // destroy it in this class.
  ColPartitionSet **best_columns_ = nullptr;

  // The super bounding box of all ColPartitions in part_grid_.
  TBOX *cps_super_bbox_ = nullptr;

  // The seed ColPartitions for equation regions.
  std::vector<ColPartition *> cp_seeds_;

  // The resolution (dpi) of the processing image.
  int resolution_ = 0;

  // The number of pages we have processed.
  int page_count_ = 0;
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_
|
870
3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.cpp
vendored
Normal file
870
3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.cpp
vendored
Normal file
|
@ -0,0 +1,870 @@
|
|||
/******************************************************************
|
||||
* File: fixspace.cpp (Formerly fixspace.c)
|
||||
* Description: Implements a pass over the page res, exploring the alternative
|
||||
* spacing possibilities, trying to use context to improve the
|
||||
* word spacing
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
* (C) Copyright 1993, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "fixspace.h"
|
||||
|
||||
#include "blobs.h" // for TWERD, TBLOB, TESSLINE
|
||||
#include "boxword.h" // for BoxWord
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
|
||||
#include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
|
||||
#include "params.h" // for IntParam, StringParam, BoolParam, DoubleParam, ...
|
||||
#include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
|
||||
#include "rect.h" // for TBOX
|
||||
#include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
|
||||
#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
|
||||
#include "tessvars.h" // for debug_fp
|
||||
#include "tprintf.h" // for tprintf
|
||||
#include "unicharset.h" // for UNICHARSET
|
||||
#include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
|
||||
|
||||
#include <tesseract/ocrclass.h> // for ETEXT_DESC
|
||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||
|
||||
#include <cstdint> // for INT16_MAX, int16_t, int32_t
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOCK;
|
||||
class ROW;
|
||||
|
||||
#define PERFECT_WERDS 999
|
||||
|
||||
/**********************************************************************
|
||||
* c_blob_comparator()
|
||||
*
|
||||
* Blob comparator used to sort a blob list so that blobs are in increasing
|
||||
* order of left edge.
|
||||
**********************************************************************/
|
||||
|
||||
static int c_blob_comparator( // sort blobs
|
||||
const void *blob1p, // ptr to ptr to blob1
|
||||
const void *blob2p // ptr to ptr to blob2
|
||||
) {
|
||||
const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p);
|
||||
const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p);
|
||||
|
||||
return blob1->bounding_box().left() - blob2->bounding_box().left();
|
||||
}
|
||||
|
||||
/**
|
||||
* @name fix_fuzzy_spaces()
|
||||
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
|
||||
* them as a sublist, process the sublist to find the optimal arrangement of
|
||||
* spaces then replace the sublist in the ROW_RES.
|
||||
*
|
||||
* @param monitor progress monitor
|
||||
* @param word_count count of words in doc
|
||||
* @param[out] page_res
|
||||
*/
|
||||
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
|
||||
BLOCK_RES_IT block_res_it;
|
||||
ROW_RES_IT row_res_it;
|
||||
WERD_RES_IT word_res_it_from;
|
||||
WERD_RES_IT word_res_it_to;
|
||||
WERD_RES *word_res;
|
||||
WERD_RES_LIST fuzzy_space_words;
|
||||
int16_t new_length;
|
||||
bool prevent_null_wd_fixsp; // DON'T process blobless wds
|
||||
int32_t word_index; // current word
|
||||
|
||||
block_res_it.set_to_list(&page_res->block_res_list);
|
||||
word_index = 0;
|
||||
for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
|
||||
row_res_it.set_to_list(&block_res_it.data()->row_res_list);
|
||||
for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
|
||||
word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
|
||||
while (!word_res_it_from.at_last()) {
|
||||
word_res = word_res_it_from.data();
|
||||
while (!word_res_it_from.at_last() &&
|
||||
!(word_res->combination ||
|
||||
word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
|
||||
word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
|
||||
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
|
||||
word_res = word_res_it_from.forward();
|
||||
word_index++;
|
||||
if (monitor != nullptr) {
|
||||
monitor->ocr_alive = true;
|
||||
monitor->progress = 90 + 5 * word_index / word_count;
|
||||
if (monitor->deadline_exceeded() ||
|
||||
(monitor->cancel != nullptr &&
|
||||
(*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!word_res_it_from.at_last()) {
|
||||
word_res_it_to = word_res_it_from;
|
||||
prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
|
||||
if (check_debug_pt(word_res, 60)) {
|
||||
debug_fix_space_level.set_value(10);
|
||||
}
|
||||
word_res_it_to.forward();
|
||||
word_index++;
|
||||
if (monitor != nullptr) {
|
||||
monitor->ocr_alive = true;
|
||||
monitor->progress = 90 + 5 * word_index / word_count;
|
||||
if (monitor->deadline_exceeded() ||
|
||||
(monitor->cancel != nullptr &&
|
||||
(*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
while (!word_res_it_to.at_last() &&
|
||||
(word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
|
||||
word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
|
||||
if (check_debug_pt(word_res, 60)) {
|
||||
debug_fix_space_level.set_value(10);
|
||||
}
|
||||
if (word_res->word->cblob_list()->empty()) {
|
||||
prevent_null_wd_fixsp = true;
|
||||
}
|
||||
word_res = word_res_it_to.forward();
|
||||
}
|
||||
if (check_debug_pt(word_res, 60)) {
|
||||
debug_fix_space_level.set_value(10);
|
||||
}
|
||||
if (word_res->word->cblob_list()->empty()) {
|
||||
prevent_null_wd_fixsp = true;
|
||||
}
|
||||
if (prevent_null_wd_fixsp) {
|
||||
word_res_it_from = word_res_it_to;
|
||||
} else {
|
||||
fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
|
||||
fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
|
||||
block_res_it.data()->block);
|
||||
new_length = fuzzy_space_words.length();
|
||||
word_res_it_from.add_list_before(&fuzzy_space_words);
|
||||
for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
|
||||
word_res_it_from.forward();
|
||||
}
|
||||
}
|
||||
if (test_pt) {
|
||||
debug_fix_space_level.set_value(0);
|
||||
}
|
||||
}
|
||||
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
|
||||
// Last word in row
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * fix_fuzzy_space_list()
 * Searches for the best spacing arrangement of a list of words joined by
 * fuzzy spaces.
 *
 * The incoming list is scored with eval_word_spacing(). Unless it is already
 * perfect, a working copy is built with initialise_search() and repeatedly
 * re-classified, scored and transformed with transform_to_next_perm() until a
 * perfect score is reached or the working copy becomes empty. Whenever a
 * permutation scores higher than the best so far, best_perm is replaced by a
 * deep copy of it.
 *
 * @param[in,out] best_perm words to improve; holds the best permutation found
 * @param row   row passed on to match_current_words()
 * @param block block passed on to match_current_words()
 */
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  int16_t best_score;
  WERD_RES_LIST current_perm;
  int16_t current_score;
  bool improved = false;

  best_score = eval_word_spacing(best_perm); // default score
  dump_words(best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS) {
    initialise_search(best_perm, current_perm);
  }

  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = true;
    }
    if (current_score < PERFECT_WERDS) {
      transform_to_next_perm(current_perm);
    }
  }
  dump_words(best_perm, best_score, 3, improved);
}
|
||||
|
||||
/**
 * initialise_search()
 * Fills new_list with deep copies of every word in src_list that is not a
 * combination. The copies have both their combination and part_of_combo
 * flags cleared and keep the order of src_list.
 */
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT source_it(&src_list);
  WERD_RES_IT dest_it(&new_list);

  for (source_it.mark_cycle_pt(); !source_it.cycled_list(); source_it.forward()) {
    WERD_RES *original = source_it.data();
    if (original->combination) {
      continue; // combinations are not carried over
    }
    WERD_RES *duplicate = WERD_RES::deep_copy(original);
    duplicate->combination = false;
    duplicate->part_of_combo = false;
    dest_it.add_after_then_move(duplicate);
  }
}
|
||||
|
||||
/**
 * match_current_words()
 * Runs pass-2 classification on every word of the list that is not part of a
 * combination and has no box_word yet.
 *
 * Because the words are not iterated through PAGE_RES, prev_word_best_choice_
 * is maintained here: it is reset before the first word and set to each
 * word's best_choice after that word has been handled.
 */
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
  prev_word_best_choice_ = nullptr;

  WERD_RES_IT it(&words);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    WERD_RES *current = it.data();
    const bool needs_classifying = !current->part_of_combo && current->box_word == nullptr;
    if (needs_classifying) {
      WordData word_data(block, row, current);
      SetupWordPassN(2, &word_data);
      classify_word_and_language(2, nullptr, &word_data);
    }
    prev_word_best_choice_ = current->best_choice;
  }
}
|
||||
|
||||
/**
|
||||
* @name eval_word_spacing()
|
||||
* The basic measure is the number of characters in contextually confirmed
|
||||
* words. (I.e the word is done)
|
||||
* If all words are contextually confirmed the evaluation is deemed perfect.
|
||||
*
|
||||
* Some fiddles are done to handle "1"s as these are VERY frequent causes of
|
||||
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
|
||||
* the same as "56163", though given our knowledge that the space is fuzzy, and
|
||||
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
|
||||
* is preferred.
|
||||
*
|
||||
* The solution is to NOT COUNT the score of any word which has a digit at one
|
||||
* end and a "1Il" as the character the other side of the space.
|
||||
*
|
||||
* Conversely, any character next to a "1" within a word is counted as a
|
||||
* positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1
|
||||
* side of the "1" joined). "56163" would score 7 - all chars in a numeric word
|
||||
* + 2 sides of a "1" joined.
|
||||
*
|
||||
* The joined 1 rule is applied to any word REGARDLESS of contextual
|
||||
* confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contexutally
|
||||
* confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2.
|
||||
*
|
||||
*/
|
||||
int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
WERD_RES_IT word_res_it(&word_res_list);
|
||||
int16_t total_score = 0;
|
||||
int16_t word_count = 0;
|
||||
int16_t done_word_count = 0;
|
||||
int16_t i;
|
||||
int16_t offset;
|
||||
int16_t prev_word_score = 0;
|
||||
bool prev_word_done = false;
|
||||
bool prev_char_1 = false; // prev ch a "1/I/l"?
|
||||
bool prev_char_digit = false; // prev ch 2..9 or 0
|
||||
const char *punct_chars = "!\"`',.:;";
|
||||
bool prev_char_punct = false;
|
||||
|
||||
do {
|
||||
// current word
|
||||
WERD_RES *word = word_res_it.data();
|
||||
bool word_done = fixspace_thinks_word_done(word);
|
||||
word_count++;
|
||||
if (word->tess_failed) {
|
||||
total_score += prev_word_score;
|
||||
if (prev_word_done) {
|
||||
done_word_count++;
|
||||
}
|
||||
prev_word_score = 0;
|
||||
prev_char_1 = false;
|
||||
prev_char_digit = false;
|
||||
prev_word_done = false;
|
||||
} else {
|
||||
/*
|
||||
Can we add the prev word score and potentially count this word?
|
||||
Yes IF it didn't end in a 1 when the first char of this word is a digit
|
||||
AND it didn't end in a digit when the first char of this word is a 1
|
||||
*/
|
||||
auto word_len = word->reject_map.length();
|
||||
bool current_word_ok_so_far = false;
|
||||
if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
|
||||
(prev_char_digit &&
|
||||
((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
|
||||
word->best_choice->unichar_string()[0] == '1') ||
|
||||
(!word_done &&
|
||||
conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
|
||||
total_score += prev_word_score;
|
||||
if (prev_word_done) {
|
||||
done_word_count++;
|
||||
}
|
||||
current_word_ok_so_far = word_done;
|
||||
}
|
||||
|
||||
if (current_word_ok_so_far) {
|
||||
prev_word_done = true;
|
||||
prev_word_score = word_len;
|
||||
} else {
|
||||
prev_word_done = false;
|
||||
prev_word_score = 0;
|
||||
}
|
||||
|
||||
/* Add 1 to total score for every joined 1 regardless of context and
|
||||
rejtn */
|
||||
for (i = 0, prev_char_1 = false; i < word_len; i++) {
|
||||
bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
|
||||
if (prev_char_1 || (current_char_1 && (i > 0))) {
|
||||
total_score++;
|
||||
}
|
||||
prev_char_1 = current_char_1;
|
||||
}
|
||||
|
||||
/* Add 1 to total score for every joined punctuation regardless of context
|
||||
and rejtn */
|
||||
if (tessedit_prefer_joined_punct) {
|
||||
for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
|
||||
offset += word->best_choice->unichar_lengths()[i++]) {
|
||||
bool current_char_punct =
|
||||
strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
|
||||
if (prev_char_punct || (current_char_punct && i > 0)) {
|
||||
total_score++;
|
||||
}
|
||||
prev_char_punct = current_char_punct;
|
||||
}
|
||||
}
|
||||
prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
|
||||
for (i = 0, offset = 0; i < word_len - 1;
|
||||
offset += word->best_choice->unichar_lengths()[i++]) {
|
||||
;
|
||||
}
|
||||
prev_char_1 =
|
||||
((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
|
||||
(!word_done &&
|
||||
conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
|
||||
}
|
||||
/* Find next word */
|
||||
do {
|
||||
word_res_it.forward();
|
||||
} while (word_res_it.data()->part_of_combo);
|
||||
} while (!word_res_it.at_first());
|
||||
total_score += prev_word_score;
|
||||
if (prev_word_done) {
|
||||
done_word_count++;
|
||||
}
|
||||
if (done_word_count == word_count) {
|
||||
return PERFECT_WERDS;
|
||||
} else {
|
||||
return total_score;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * digit_or_numeric_punct()
 * Tells whether the character at char_position of the word's best choice is
 * a digit according to the word's unicharset, or, when the best choice was
 * produced by the number permuter, is one of the numeric_punctuation
 * characters.
 *
 * char_position counts characters; the matching byte offset in the string is
 * found by summing the per-character byte lengths that precede it.
 */
bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  auto *choice = word->best_choice;

  int char_index = 0;
  int byte_offset = 0;
  while (char_index < char_position) {
    byte_offset += choice->unichar_lengths()[char_index];
    ++char_index;
  }

  if (word->uch_set->get_isdigit(choice->unichar_string().c_str() + byte_offset,
                                 choice->unichar_lengths()[char_index])) {
    return true;
  }
  return choice->permuter() == NUMBER_PERM &&
         numeric_punctuation.contains(choice->unichar_string().c_str()[byte_offset]);
}
|
||||
|
||||
/**
|
||||
* @name transform_to_next_perm()
|
||||
* Examines the current word list to find the smallest word gap size. Then walks
|
||||
* the word list closing any gaps of this size by either inserted new
|
||||
* combination words, or extending existing ones.
|
||||
*
|
||||
* The routine COULD be limited to stop it building words longer than N blobs.
|
||||
*
|
||||
* If there are no more gaps then it DELETES the entire list and returns the
|
||||
* empty list to cause termination.
|
||||
*/
|
||||
void transform_to_next_perm(WERD_RES_LIST &words) {
|
||||
WERD_RES_IT word_it(&words);
|
||||
WERD_RES_IT prev_word_it(&words);
|
||||
WERD_RES *word;
|
||||
WERD_RES *prev_word;
|
||||
WERD_RES *combo;
|
||||
WERD *copy_word;
|
||||
int16_t prev_right = -INT16_MAX;
|
||||
TBOX box;
|
||||
int16_t gap;
|
||||
int16_t min_gap = INT16_MAX;
|
||||
|
||||
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
||||
word = word_it.data();
|
||||
if (!word->part_of_combo) {
|
||||
box = word->word->bounding_box();
|
||||
if (prev_right > -INT16_MAX) {
|
||||
gap = box.left() - prev_right;
|
||||
if (gap < min_gap) {
|
||||
min_gap = gap;
|
||||
}
|
||||
}
|
||||
prev_right = box.right();
|
||||
}
|
||||
}
|
||||
if (min_gap < INT16_MAX) {
|
||||
prev_right = -INT16_MAX; // back to start
|
||||
word_it.set_to_list(&words);
|
||||
// Note: we can't use cycle_pt due to inserted combos at start of list.
|
||||
for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
|
||||
word = word_it.data();
|
||||
if (!word->part_of_combo) {
|
||||
box = word->word->bounding_box();
|
||||
if (prev_right > -INT16_MAX) {
|
||||
gap = box.left() - prev_right;
|
||||
if (gap <= min_gap) {
|
||||
prev_word = prev_word_it.data();
|
||||
if (prev_word->combination) {
|
||||
combo = prev_word;
|
||||
} else {
|
||||
/* Make a new combination and insert before
|
||||
* the first word being joined. */
|
||||
copy_word = new WERD;
|
||||
*copy_word = *(prev_word->word);
|
||||
// deep copy
|
||||
combo = new WERD_RES(copy_word);
|
||||
combo->combination = true;
|
||||
combo->x_height = prev_word->x_height;
|
||||
prev_word->part_of_combo = true;
|
||||
prev_word_it.add_before_then_move(combo);
|
||||
}
|
||||
combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
|
||||
if (word->combination) {
|
||||
combo->word->join_on(word->word);
|
||||
// Move blobs to combo
|
||||
// old combo no longer needed
|
||||
delete word_it.extract();
|
||||
} else {
|
||||
// Copy current wd to combo
|
||||
combo->copy_on(word);
|
||||
word->part_of_combo = true;
|
||||
}
|
||||
combo->done = false;
|
||||
combo->ClearResults();
|
||||
} else {
|
||||
prev_word_it = word_it; // catch up
|
||||
}
|
||||
}
|
||||
prev_right = box.right();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
words.clear(); // signal termination
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * dump_words()
 * Debug output for the fix-space search. Does nothing unless
 * debug_fix_space_level is above 0.
 *
 * @param perm     word list to report; words flagged part_of_combo are omitted
 * @param score    score printed alongside the list
 * @param mode     1 = list as extracted (its text is also remembered in
 *                 stats_.dump_words_str), 2 = list just tested,
 *                 3 = list being returned
 * @param improved when the debug level is exactly 1, the list is only printed
 *                 if this is true
 */
void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
  WERD_RES_IT word_res_it(&perm);

  if (debug_fix_space_level <= 0) {
    return;
  }

  // Prints each visible word with its permuter, then closes the quote.
  auto print_visible_words = [&word_res_it]() {
    for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
      if (!word_res_it.data()->part_of_combo) {
        tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
                static_cast<int>(word_res_it.data()->best_choice->permuter()));
      }
    }
    tprintf("\"\n");
  };

  if (mode == 1) {
    // Remember the original text so that an improvement can be reported later.
    stats_.dump_words_str = "";
    for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
      if (!word_res_it.data()->part_of_combo) {
        stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
        stats_.dump_words_str += ' ';
      }
    }
  }

  if (debug_fix_space_level > 1) {
    switch (mode) {
      case 1:
        tprintf("EXTRACTED (%d): \"", score);
        break;
      case 2:
        tprintf("TESTED (%d): \"", score);
        break;
      case 3:
        tprintf("RETURNED (%d): \"", score);
        break;
    }
    print_visible_words();
  } else if (improved) {
    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
    print_visible_words();
  }
}
|
||||
|
||||
/**
 * fixspace_thinks_word_done()
 * Decides whether a word counts as "done" for spacing purposes.
 *
 * A word already marked done is accepted immediately. Otherwise the standard
 * pass 2 conditions for mode 5 in set_done() in reject.c are used, BUT
 * ambiguous words are not rejected - for spacing we don't care whether we
 * have of/at on/an etc.
 *
 * The word must satisfy all of the following:
 *  - fixsp_done_mode is above 0;
 *  - it is tess_accepted, or fixsp_done_mode is 2 and nothing in the reject
 *    map is rejected, or fixsp_done_mode is 3;
 *  - its best choice contains no space character;
 *  - its best choice came from the system, frequent or user dawg permuter,
 *    or from the number permuter.
 */
bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
  if (word->done) {
    return true;
  }
  if (!(fixsp_done_mode > 0)) {
    return false;
  }

  const bool acceptable = word->tess_accepted ||
                          (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
                          fixsp_done_mode == 3;
  if (!acceptable) {
    return false;
  }

  if (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr) {
    return false;
  }

  const auto permuter = word->best_choice->permuter();
  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
         permuter == USER_DAWG_PERM || permuter == NUMBER_PERM;
}
|
||||
|
||||
/**
|
||||
* @name fix_sp_fp_word()
|
||||
* Test the current word to see if it can be split by deleting noise blobs. If
|
||||
* so, do the business.
|
||||
* Return with the iterator pointing to the same place if the word is unchanged,
|
||||
* or the last of the replacement words.
|
||||
*/
|
||||
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
|
||||
WERD_RES *word_res;
|
||||
WERD_RES_LIST sub_word_list;
|
||||
WERD_RES_IT sub_word_list_it(&sub_word_list);
|
||||
int16_t blob_index;
|
||||
int16_t new_length;
|
||||
float junk;
|
||||
|
||||
word_res = word_res_it.data();
|
||||
if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
|
||||
!word_res->word->flag(W_DONT_CHOP)) {
|
||||
return;
|
||||
}
|
||||
|
||||
blob_index = worst_noise_blob(word_res, &junk);
|
||||
if (blob_index < 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (debug_fix_space_level > 1) {
|
||||
tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
|
||||
}
|
||||
word_res->word->rej_cblob_list()->sort(c_blob_comparator);
|
||||
sub_word_list_it.add_after_stay_put(word_res_it.extract());
|
||||
fix_noisy_space_list(sub_word_list, row, block);
|
||||
new_length = sub_word_list.length();
|
||||
word_res_it.add_list_before(&sub_word_list);
|
||||
for (; !word_res_it.at_last() && new_length > 1; new_length--) {
|
||||
word_res_it.forward();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * fix_noisy_space_list()
 * Searches for a better arrangement of a word by repeatedly breaking it at
 * its noisiest blob.
 *
 * The incoming list is scored with fp_eval_word_spacing(). A deep copy of its
 * first word is then split with break_noisiest_blob_word(); each resulting
 * permutation is re-classified and scored, and replaces best_perm when it
 * scores higher. The search ends when a perfect score is reached or when
 * break_noisiest_blob_word() empties the working list.
 *
 * @param[in,out] best_perm list holding the word to improve; receives the
 *                          best permutation found
 * @param row   row passed on to match_current_words()
 * @param block block passed on to match_current_words()
 */
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  int16_t best_score;
  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES_LIST current_perm;
  WERD_RES_IT current_perm_it(&current_perm);
  WERD_RES *old_word_res;
  int16_t current_score;
  bool improved = false;

  best_score = fp_eval_word_spacing(best_perm); // default score

  dump_words(best_perm, best_score, 1, improved);

  old_word_res = best_perm_it.data();
  // Even deep_copy doesn't copy the underlying WERD unless its combination
  // flag is true!.
  old_word_res->combination = true; // Kludge to force deep copy
  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
  old_word_res->combination = false; // Undo kludge

  break_noisiest_blob_word(current_perm);

  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = fp_eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = true;
    }
    if (current_score < PERFECT_WERDS) {
      break_noisiest_blob_word(current_perm);
    }
  }
  dump_words(best_perm, best_score, 3, improved);
}
|
||||
|
||||
/**
|
||||
* break_noisiest_blob_word()
|
||||
* Find the word with the blob which looks like the worst noise.
|
||||
* Break the word into two, deleting the noise blob.
|
||||
*/
|
||||
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
|
||||
WERD_RES_IT word_it(&words);
|
||||
WERD_RES_IT worst_word_it;
|
||||
float worst_noise_score = 9999;
|
||||
int worst_blob_index = -1; // Noisiest blob of noisiest wd
|
||||
int blob_index; // of wds noisiest blob
|
||||
float noise_score; // of wds noisiest blob
|
||||
WERD_RES *word_res;
|
||||
C_BLOB_IT blob_it;
|
||||
C_BLOB_IT rej_cblob_it;
|
||||
C_BLOB_LIST new_blob_list;
|
||||
C_BLOB_IT new_blob_it;
|
||||
C_BLOB_IT new_rej_cblob_it;
|
||||
WERD *new_word;
|
||||
int16_t start_of_noise_blob;
|
||||
int16_t i;
|
||||
|
||||
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
||||
blob_index = worst_noise_blob(word_it.data(), &noise_score);
|
||||
if (blob_index > -1 && worst_noise_score > noise_score) {
|
||||
worst_noise_score = noise_score;
|
||||
worst_blob_index = blob_index;
|
||||
worst_word_it = word_it;
|
||||
}
|
||||
}
|
||||
if (worst_blob_index < 0) {
|
||||
words.clear(); // signal termination
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now split the worst_word_it */
|
||||
|
||||
word_res = worst_word_it.data();
|
||||
|
||||
/* Move blobs before noise blob to a new bloblist */
|
||||
|
||||
new_blob_it.set_to_list(&new_blob_list);
|
||||
blob_it.set_to_list(word_res->word->cblob_list());
|
||||
for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
|
||||
new_blob_it.add_after_then_move(blob_it.extract());
|
||||
}
|
||||
start_of_noise_blob = blob_it.data()->bounding_box().left();
|
||||
delete blob_it.extract(); // throw out noise blob
|
||||
|
||||
new_word = new WERD(&new_blob_list, word_res->word);
|
||||
new_word->set_flag(W_EOL, false);
|
||||
word_res->word->set_flag(W_BOL, false);
|
||||
word_res->word->set_blanks(1); // After break
|
||||
|
||||
new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
|
||||
rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
|
||||
for (; (!rej_cblob_it.empty() &&
|
||||
(rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
|
||||
rej_cblob_it.forward()) {
|
||||
new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
|
||||
}
|
||||
|
||||
auto *new_word_res = new WERD_RES(new_word);
|
||||
new_word_res->combination = true;
|
||||
worst_word_it.add_before_then_move(new_word_res);
|
||||
|
||||
word_res->ClearResults();
|
||||
}
|
||||
|
||||
/**
 * worst_noise_blob()
 * Finds the blob of a word that looks most like noise and at which the word
 * could be split.
 *
 * Each blob gets a noise score: accepted characters get the non-noise limit,
 * all others the result of blob_noise_score(). Candidates are restricted to
 * the range that has at least fixsp_non_noise_limit non-noise blobs on either
 * side, and only blobs scoring below kBlnXHeight * fixsp_small_outlines_size
 * qualify.
 *
 * @param word_res the word to examine
 * @param[out] worst_noise_score score of the returned blob; only written when
 *             the candidate range is non-empty
 * @return index of the noisiest qualifying blob, or -1 when the word has no
 *         rebuild_word, has fewer than 5 blobs, or has no qualifying blob
 */
int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  float noise_score[512];
  int i;
  int min_noise_blob; // 1st contender
  int max_noise_blob; // last contender
  int non_noise_count;
  int worst_noise_blob; // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == nullptr) {
    return -1; // Can't handle cube words.
  }

  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5) {
    return -1; // too short to split
  }

    /* Get the noise scores for all blobs */

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5) {
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().c_str());
  }
#endif

  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
    TBLOB *blob = word_res->rebuild_word->blobs[i];
    if (word_res->reject_map[i].accepted()) {
      noise_score[i] = non_noise_limit;
    } else {
      noise_score[i] = blob_noise_score(blob);
    }

    if (debug_fix_space_level > 5) {
      tprintf("%1.1f ", noise_score[i]);
    }
  }
  // The loop above stops early when the rebuilt word has fewer blobs than the
  // box word. The scans below read every entry up to blob_count, so give the
  // unscored entries a defined value. Treating them as non-noise means they
  // can never be selected as a split point.
  for (; i < blob_count; i++) {
    noise_score[i] = non_noise_limit;
  }
  if (debug_fix_space_level > 5) {
    tprintf("\n");
  }

  /* Now find the worst one which is far enough away from the end of the word */

  // Skip forward past the first fixsp_non_noise_limit non-noise blobs.
  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit) {
    return -1;
  }

  min_noise_blob = i;

  // Same again from the other end of the word.
  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit) {
    return -1;
  }

  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob) {
    return -1;
  }

  // Lowest score below small_limit inside the candidate range wins.
  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}
|
||||
|
||||
/**
 * blob_noise_score()
 * Scores a blob by the larger side of its largest outline bounding box.
 *
 * The score is doubled when the blob has more than 5 outlines, and halved
 * when the blob's bounding box sits high (bottom above 4 * kBlnBaselineOffset)
 * or low (top below kBlnBaselineOffset / 2).
 */
float Tesseract::blob_noise_score(TBLOB *blob) {
  int16_t outline_total = 0;
  int16_t biggest_side = 0;

  for (TESSLINE *outline = blob->outlines; outline != nullptr; outline = outline->next) {
    outline_total++;
    const TBOX outline_box = outline->bounding_box();
    const int16_t longer_side = (outline_box.height() > outline_box.width())
                                    ? outline_box.height()
                                    : outline_box.width();
    if (biggest_side < longer_side) {
      biggest_side = longer_side;
    }
  }

  if (outline_total > 5) {
    // penalise LOTS of blobs
    biggest_side *= 2;
  }

  const TBOX blob_box = blob->bounding_box();
  const bool sits_high = blob_box.bottom() > kBlnBaselineOffset * 4;
  if (sits_high || blob_box.top() < kBlnBaselineOffset / 2) {
    // Lax blob is if high or low
    biggest_side /= 2;
  }

  return biggest_side;
}
|
||||
|
||||
/**
 * fixspace_dbg()
 * Prints debugging information about a word: its bounding box, best choice
 * text, blob counts, reject map, and its tess_accepted and done flags.
 * The per-character reject map detail is compiled in but switched off.
 */
void fixspace_dbg(WERD_RES *word) {
  const bool show_map_detail = false;

  TBOX word_box = word->word->bounding_box();
  word_box.print();
  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
          word->rebuild_word->NumBlobs(), word->box_word->length());
  word->reject_map.print(debug_fp);
  tprintf("\n");

  if (show_map_detail) {
    tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
    int16_t pos = 0;
    while (word->best_choice->unichar_string()[pos] != '\0') {
      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[pos]);
      word->reject_map[pos].full_print(debug_fp);
      pos++;
    }
  }

  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
}
|
||||
|
||||
/**
|
||||
* fp_eval_word_spacing()
|
||||
* Evaluation function for fixed pitch word lists.
|
||||
*
|
||||
* Basically, count the number of "nice" characters - those which are in tess
|
||||
* acceptable words or in dict words and are not rejected.
|
||||
* Penalise any potential noise chars
|
||||
*/
|
||||
int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
WERD_RES_IT word_it(&word_res_list);
|
||||
WERD_RES *word;
|
||||
int16_t score = 0;
|
||||
int16_t i;
|
||||
float small_limit = kBlnXHeight * fixsp_small_outlines_size;
|
||||
|
||||
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
||||
word = word_it.data();
|
||||
if (word->rebuild_word == nullptr) {
|
||||
continue; // Can't handle cube words.
|
||||
}
|
||||
if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
|
||||
int num_blobs = word->rebuild_word->NumBlobs();
|
||||
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
|
||||
for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB *blob = word->rebuild_word->blobs[i];
|
||||
if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
|
||||
score -= 1; // penalise possibly erroneous non-space
|
||||
} else if (word->reject_map[i].accepted()) {
|
||||
score++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (score < 0) {
|
||||
score = 0;
|
||||
}
|
||||
return score;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
36
3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.h
vendored
Normal file
36
3rdparty/tesseract_ocr/tesseract/src/ccmain/fixspace.h
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
/******************************************************************
|
||||
* File: fixspace.h (Formerly fixspace.h)
|
||||
* Description: Implements a pass over the page res, exploring the alternative
|
||||
* spacing possibilities, trying to use context to improve the
|
||||
* word spacing
|
||||
* Author: Phil Cheatle
|
||||
* Created: Thu Oct 21 11:38:43 BST 1993
|
||||
*
|
||||
* (C) Copyright 1993, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef FIXSPACE_H
|
||||
#define FIXSPACE_H
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Forward declarations: the functions below only take these types by
// reference or pointer, so no full definition is required in this header.
class WERD_RES;
|
||||
class WERD_RES_LIST;
|
||||
|
||||
// NOTE(review): the definitions of the next two functions are not visible
// from this header; judging by the names they set up and step through the
// word-spacing permutation search - confirm against fixspace.cpp.
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
|
||||
void transform_to_next_perm(WERD_RES_LIST &words);
|
||||
// Prints a debug summary of the given word (bounding box, best choice text,
// blob counts, reject map and the accepted/done flags) through tprintf.
void fixspace_dbg(WERD_RES *word);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
215
3rdparty/tesseract_ocr/tesseract/src/ccmain/fixxht.cpp
vendored
Normal file
215
3rdparty/tesseract_ocr/tesseract/src/ccmain/fixxht.cpp
vendored
Normal file
|
@ -0,0 +1,215 @@
|
|||
/**********************************************************************
|
||||
* File: fixxht.cpp (Formerly fixxht.c)
|
||||
* Description: Improve x_ht and look out for case inconsistencies
|
||||
* Author: Phil Cheatle
|
||||
* Created: Thu Aug 5 14:11:08 BST 1993
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "float2int.h"
|
||||
#include "params.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Fixxht overview.
|
||||
// Premise: Initial estimate of x-height is adequate most of the time, but
|
||||
// occasionally it is incorrect. Most notable causes of failure are:
|
||||
// 1. Small caps, where the top of the caps is the same as the body text
|
||||
// xheight. For small caps words the xheight needs to be reduced to correctly
|
||||
// recognize the caps in the small caps word.
|
||||
// 2. All xheight lines, such as summer. Here the initial estimate will have
|
||||
// guessed that the blob tops are caps and will have placed the xheight too low.
|
||||
// 3. Noise/logos beside words, or changes in font size on a line. Such
|
||||
// things can blow the statistics and cause an incorrect estimate.
|
||||
// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
|
||||
// In this case the x-height is often still correct.
|
||||
//
|
||||
// Algorithm.
|
||||
// Compare the vertical position (top only) of alphanumerics in a word with
|
||||
// the range of positions in training data (in the unicharset).
|
||||
// See CountMisfitTops. If any characters disagree sufficiently with the
|
||||
// initial xheight estimate, then recalculate the xheight, re-run OCR on
|
||||
// the word, and if the number of vertical misfits goes down, along with
|
||||
// either the word rating or certainty, then keep the new xheight.
|
||||
// The new xheight is calculated as follows (see ComputeCompatibleXheight):
|
||||
// For each alphanumeric character that has a vertically misplaced top
|
||||
// (a misfit), yet its bottom is within the acceptable range (ie it is not
|
||||
// likely a sub-or super-script) calculate the range of acceptable xheight
|
||||
// positions from its range of tops, and give each value in the range a
|
||||
// number of votes equal to the distance of its top from its acceptance range.
|
||||
// The x-height position with the median of the votes becomes the new
|
||||
// x-height. This assumes that most characters will be correctly recognized
|
||||
// even if the x-height is incorrect. This is not a terrible assumption, but
|
||||
// it is not great. An improvement would be to use a classifier that does
|
||||
// not care about vertical position or scaling at all.
|
||||
// Separately collect stats on shifted baselines and apply the same logic to
|
||||
// computing a best-fit shift to fix the error. If the baseline needs to be
|
||||
// shifted, but the x-height is OK, returns the original x-height along with
|
||||
// the baseline shift to indicate that recognition needs to re-run.
|
||||
|
||||
// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
|
||||
// then the char top cannot be used to judge misfits or suggest a new top.
|
||||
constexpr int kMaxCharTopRange = 48;
|
||||
|
||||
// Returns the number of misfit blob tops in this word.
|
||||
// Counts the alphanumeric blobs of word_res whose top lies outside the
// [min_top, max_top] range that the unicharset reports for the recognized
// character, widened by x_ht_acceptance_tolerance on both sides.
// Characters whose top range is wider than kMaxCharTopRange are not judged.
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
  const int blob_total = word_res->rebuild_word->NumBlobs();
  int misfit_count = 0;
  for (int idx = 0; idx < blob_total; ++idx) {
    TBLOB *blob = word_res->rebuild_word->blobs[idx];
    const UNICHAR_ID class_id = word_res->best_choice->unichar_id(idx);
    // Only letters and digits are checked.
    if (!unicharset.get_isalpha(class_id) && !unicharset.get_isdigit(class_id)) {
      continue;
    }

    // Clip the blob top so it stays below INT_FEAT_RANGE.
    int top = blob->bounding_box().top();
    if (top >= INT_FEAT_RANGE) {
      top = INT_FEAT_RANGE - 1;
    }

    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
    if (max_top - min_top > kMaxCharTopRange) {
      continue; // Top range too wide to be a useful reference.
    }

    const bool too_low = top < min_top - x_ht_acceptance_tolerance;
    const bool too_high = top > max_top + x_ht_acceptance_tolerance;
    const bool bad = too_low || too_high;
    misfit_count += bad ? 1 : 0;

    if (debug_x_ht_level >= 1) {
      tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
              unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
              static_cast<int>(x_ht_acceptance_tolerance));
    }
  }
  return misfit_count;
}
|
||||
|
||||
// Returns a new x-height maximally compatible with the result in word_res.
|
||||
// See comment above for overall algorithm.
|
||||
// Votes for a new x-height (and, separately, for a baseline shift) using the
// alphanumeric blobs of word_res.
//   word_res        word whose rebuild_word blobs and best_choice are examined.
//   baseline_shift  output: minus the chosen bottom shift, divided by
//                   word_res->denorm.y_scale().
// Returns the median x-height vote divided by denorm.y_scale() when it
// differs from kBlnXHeight by at least x_ht_min_change. Otherwise returns
// word_res->x_height if a bottom shift was chosen, or 0 if nothing changed.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
  STATS top_stats(0, UINT8_MAX);
  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
  const int blob_total = word_res->rebuild_word->NumBlobs();
  int bottom_shift = 0;
  do {
    top_stats.clear();
    shift_stats.clear();
    for (int idx = 0; idx < blob_total; ++idx) {
      TBLOB *blob = word_res->rebuild_word->blobs[idx];
      const UNICHAR_ID class_id = word_res->best_choice->unichar_id(idx);
      // Only letters and digits take part in the vote.
      if (!unicharset.get_isalpha(class_id) && !unicharset.get_isdigit(class_id)) {
        continue;
      }

      int top = blob->bounding_box().top() + bottom_shift;
      // Clip the top to the limit of normalized feature space.
      if (top >= INT_FEAT_RANGE) {
        top = INT_FEAT_RANGE - 1;
      }
      const int bottom = blob->bounding_box().bottom() + bottom_shift;

      int min_bottom, max_bottom, min_top, max_top;
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
      // Chars with a wild top range would mess up the result so ignore them.
      if (max_top - min_top > kMaxCharTopRange) {
        continue;
      }

      // Positive only when the top lies outside the tolerated range.
      const int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
                                       top - (max_top + x_ht_acceptance_tolerance));
      const int height = top - kBlnBaselineOffset;
      if (debug_x_ht_level >= 2) {
        tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
                unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
                max_top, bottom, top);
      }

      // The bottom is acceptable when it is within tolerance of its range.
      const bool bottom_fits = min_bottom <= bottom + x_ht_acceptance_tolerance &&
                               bottom - x_ht_acceptance_tolerance <= max_bottom;
      // The range of tops must be sensibly near the xheight.
      const bool tops_near_xheight =
          min_top > kBlnBaselineOffset && max_top - kBlnBaselineOffset >= kBlnXHeight;

      if (bottom_fits && tops_near_xheight && misfit_dist > 0) {
        // Compute the x-height position using proportionality between the
        // actual height and expected height.
        const int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
        const int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
        if (debug_x_ht_level >= 2) {
          tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
        }
        // Every x-height in the range gets misfit_dist votes.
        for (int xht = min_xht; xht <= max_xht; ++xht) {
          top_stats.add(xht, misfit_dist);
        }
      } else if (!bottom_fits && bottom_shift == 0) {
        // Get the range of required bottom shift.
        const int min_shift = min_bottom - bottom;
        const int max_shift = max_bottom - bottom;
        if (debug_x_ht_level >= 2) {
          tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
        }
        // The weight is |min_shift|, spread over the width of the shift range
        // when that range is not empty.
        int misfit_weight = abs(min_shift);
        if (max_shift > min_shift) {
          misfit_weight /= max_shift - min_shift;
        }
        for (int shift = min_shift; shift <= max_shift; ++shift) {
          shift_stats.add(shift, misfit_weight);
        }
      } else {
        if (bottom_shift == 0) {
          // Things with bottoms that are already ok need to say so, on the
          // 1st iteration only.
          shift_stats.add(0, kBlnBaselineOffset);
        }
        if (debug_x_ht_level >= 2) {
          tprintf(" already OK\n");
        }
      }
    }

    // A bottom shift is adopted only when its votes outweigh the top votes.
    if (shift_stats.get_total() > top_stats.get_total()) {
      bottom_shift = IntCastRounded(shift_stats.median());
      if (debug_x_ht_level >= 2) {
        tprintf("Applying bottom shift=%d\n", bottom_shift);
      }
    }
  } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());

  // Baseline shift is opposite sign to the bottom shift.
  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
  if (debug_x_ht_level >= 2) {
    tprintf("baseline shift=%g\n", *baseline_shift);
  }

  // Result used whenever the x-height itself is left alone: the existing
  // x-height if a bottom shift was chosen, otherwise 0.
  const float unchanged_xht = bottom_shift != 0 ? word_res->x_height : 0.0f;
  if (top_stats.get_total() == 0) {
    return unchanged_xht;
  }

  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  const float new_xht = top_stats.median();
  if (debug_x_ht_level >= 2) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
            new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
    return new_xht / word_res->denorm.y_scale();
  }
  return unchanged_xht;
}
|
||||
|
||||
} // namespace tesseract
|
314
3rdparty/tesseract_ocr/tesseract/src/ccmain/linerec.cpp
vendored
Normal file
314
3rdparty/tesseract_ocr/tesseract/src/ccmain/linerec.cpp
vendored
Normal file
|
@ -0,0 +1,314 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: linerec.cpp
|
||||
// Description: Top-level line-based recognition module for Tesseract.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
#include "boxread.h"
|
||||
#include "imagedata.h" // for ImageData
|
||||
#include "lstmrecognizer.h"
|
||||
#include "pageres.h"
|
||||
#include "recodebeam.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Scale factor to make certainty more comparable to Tesseract.
|
||||
constexpr float kCertaintyScale = 7.0f;
|
||||
// Worst acceptable certainty for a dictionary word.
|
||||
constexpr float kWorstDictCertainty = -25.0f;
|
||||
|
||||
// Generates training data for training a line recognizer, eg LSTM.
|
||||
// Breaks the page into lines, according to the boxes, and writes them to a
|
||||
// serialized DocumentData based on output_basename.
|
||||
// Return true if successful, false if an error occurred.
|
||||
bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
|
||||
BLOCK_LIST *block_list) {
|
||||
std::string lstmf_name = output_basename + ".lstmf";
|
||||
DocumentData images(lstmf_name);
|
||||
if (applybox_page > 0) {
|
||||
// Load existing document for the previous pages.
|
||||
if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
|
||||
tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
std::vector<TBOX> boxes;
|
||||
std::vector<std::string> texts;
|
||||
// Get the boxes for this page, if there are any.
|
||||
if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
|
||||
boxes.empty()) {
|
||||
tprintf("Failed to read boxes from %s\n", input_imagename);
|
||||
return false;
|
||||
}
|
||||
TrainFromBoxes(boxes, texts, block_list, &images);
|
||||
if (images.PagesSize() == 0) {
|
||||
tprintf("Failed to read pages from %s\n", input_imagename);
|
||||
return false;
|
||||
}
|
||||
images.Shuffle();
|
||||
if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
|
||||
tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Generates training data for training a line recognizer, eg LSTM.
|
||||
// Breaks the boxes into lines, normalizes them, converts to ImageData and
|
||||
// appends them to the given training_data.
|
||||
void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
|
||||
BLOCK_LIST *block_list, DocumentData *training_data) {
|
||||
auto box_count = boxes.size();
|
||||
// Process all the text lines in this page, as defined by the boxes.
|
||||
unsigned end_box = 0;
|
||||
// Don't let \t, which marks newlines in the box file, get into the line
|
||||
// content, as that makes the line unusable in training.
|
||||
while (end_box < texts.size() && texts[end_box] == "\t") {
|
||||
++end_box;
|
||||
}
|
||||
for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
|
||||
// Find the textline of boxes starting at start and their bounding box.
|
||||
TBOX line_box = boxes[start_box];
|
||||
std::string line_str = texts[start_box];
|
||||
for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
|
||||
line_box += boxes[end_box];
|
||||
line_str += texts[end_box];
|
||||
}
|
||||
// Find the most overlapping block.
|
||||
BLOCK *best_block = nullptr;
|
||||
int best_overlap = 0;
|
||||
BLOCK_IT b_it(block_list);
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
BLOCK *block = b_it.data();
|
||||
if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
|
||||
continue; // Not a text block.
|
||||
}
|
||||
TBOX block_box = block->pdblk.bounding_box();
|
||||
block_box.rotate(block->re_rotation());
|
||||
if (block_box.major_overlap(line_box)) {
|
||||
TBOX overlap_box = line_box.intersection(block_box);
|
||||
if (overlap_box.area() > best_overlap) {
|
||||
best_overlap = overlap_box.area();
|
||||
best_block = block;
|
||||
}
|
||||
}
|
||||
}
|
||||
ImageData *imagedata = nullptr;
|
||||
if (best_block == nullptr) {
|
||||
tprintf("No block overlapping textline: %s\n", line_str.c_str());
|
||||
} else {
|
||||
imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
|
||||
}
|
||||
if (imagedata != nullptr) {
|
||||
training_data->AddPageToDocument(imagedata);
|
||||
}
|
||||
// Don't let \t, which marks newlines in the box file, get into the line
|
||||
// content, as that makes the line unusable in training.
|
||||
while (end_box < texts.size() && texts[end_box] == "\t") {
|
||||
++end_box;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns an Imagedata containing the image of the given box,
|
||||
// and ground truth boxes/truth text if available in the input.
|
||||
// The image is not normalized in any way.
|
||||
ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
                                  const std::vector<std::string> &texts, int start_box, int end_box,
                                  const BLOCK &block) {
  // Cut the padded line out of the page; revised_box receives the box that
  // was actually used.
  TBOX revised_box;
  ImageData *line_image = GetRectImage(line_box, block, kImagePadding, &revised_box);
  if (line_image == nullptr) {
    return nullptr;
  }
  line_image->set_page_number(applybox_page);

  // Copy boxes [start_box, end_box), rotate each by the block's re_rotation
  // with its y component negated, and shift it so it is relative to the image.
  const FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
  const ICOORD shift = -revised_box.botleft();
  std::vector<TBOX> line_boxes;
  std::vector<std::string> line_texts;
  for (int idx = start_box; idx < end_box; ++idx) {
    line_boxes.push_back(boxes[idx]);
    TBOX &placed = line_boxes.back();
    placed.rotate(block_rotation);
    placed.move(shift);
    line_texts.push_back(texts[idx]);
  }

  // Every box of the line carries the same page number.
  std::vector<int> page_numbers(line_boxes.size(), applybox_page);
  line_image->AddBoxes(line_boxes, line_texts, page_numbers);
  return line_image;
}
|
||||
|
||||
// Helper gets the image of a rectangle, using the block.re_rotation() if
|
||||
// needed to get to the image, and rotating the result back to horizontal
|
||||
// layout. (CJK characters will be on their left sides) The vertical text flag
|
||||
// is set in the returned ImageData if the text was originally vertical, which
|
||||
// can be used to invoke a different CJK recognition engine. The revised_box
|
||||
// is also returned to enable calculation of output bounding boxes.
|
||||
ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
                                   TBOX *revised_box) const {
  // Pad a private copy first and only then write the output: box and
  // *revised_box may be the same object.
  TBOX padded_box(box);
  padded_box.pad(padding, padding);
  *revised_box = padded_box;

  // Number of clockwise 90 degree rotations needed to get back to tesseract
  // coords from the clipped image. The order of the tests is significant.
  int num_rotations;
  if (block.re_rotation().y() > 0.0f) {
    num_rotations = 1;
  } else if (block.re_rotation().x() < 0.0f) {
    num_rotations = 2;
  } else if (block.re_rotation().y() < 0.0f) {
    num_rotations = 3;
  } else {
    num_rotations = 0;
  }

  // Handle two cases automatically: 1 the box came from the block, 2 the box
  // came from a box file, and refers to the image, which the block may not.
  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
    revised_box->rotate(block.re_rotation());
  }

  // Now revised_box always refers to the image.
  // BestPix is never colormapped, but may be of any depth.
  Image pix = BestPix();
  const int width = pixGetWidth(pix);
  const int height = pixGetHeight(pix);

  // Clip to image bounds; nothing left means there is no image to return.
  *revised_box &= TBOX(0, 0, width, height);
  if (revised_box->null_box()) {
    return nullptr;
  }

  // The clip rectangle's y is measured as height minus the box top.
  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
                            revised_box->height());
  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
  boxDestroy(&clip_box);
  if (box_pix == nullptr) {
    return nullptr;
  }

  const bool rotated = num_rotations > 0;
  if (rotated) {
    Image rot_pix = pixRotateOrth(box_pix, num_rotations);
    box_pix.destroy();
    box_pix = rot_pix;
  }

  // Convert sub-8-bit images to 8 bit.
  if (pixGetDepth(box_pix) < 8) {
    Image grey = pixConvertTo8(box_pix, false);
    box_pix.destroy();
    box_pix = grey;
  }

  if (rotated) {
    // Rotate the clipped revised box back to internal coordinates.
    const FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
    revised_box->rotate(rotation);
  }
  // One or three quarter turns set the vertical text flag.
  const bool vertical_text = rotated && num_rotations != 2;
  return new ImageData(vertical_text, box_pix);
}
|
||||
|
||||
// Recognizes a word or group of words, converting to WERD_RES in *words.
|
||||
// Analogous to classify_word_pass1, but can handle a group of words as well.
|
||||
void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
|
||||
PointerVector<WERD_RES> *words) {
|
||||
TBOX word_box = word->word->bounding_box();
|
||||
// Get the word image - no frills.
|
||||
if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
|
||||
// In single word mode, use the whole image without any other row/word
|
||||
// interpretation.
|
||||
word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
|
||||
} else {
|
||||
float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
|
||||
if (baseline + row->descenders() < word_box.bottom()) {
|
||||
word_box.set_bottom(baseline + row->descenders());
|
||||
}
|
||||
if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
|
||||
word_box.set_top(baseline + row->x_height() + row->ascenders());
|
||||
}
|
||||
}
|
||||
ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
|
||||
if (im_data == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool do_invert = tessedit_do_invert;
|
||||
lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
|
||||
kWorstDictCertainty / kCertaintyScale, word_box, words,
|
||||
lstm_choice_mode, lstm_choice_iterations);
|
||||
delete im_data;
|
||||
SearchWords(words);
|
||||
}
|
||||
|
||||
// Apply segmentation search to the given set of words, within the constraints
|
||||
// of the existing ratings matrix. If there is already a best_choice on a word
|
||||
// leaves it untouched and just sets the done/accepted etc flags.
|
||||
void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
  // Prefer the LSTM recognizer's own dictionary for the acceptability check
  // and fall back to this Tesseract's dictionary when it has none.
  const Dict *stopper_dict = lstm_recognizer_->GetDict();
  if (stopper_dict == nullptr) {
    stopper_dict = &getDict();
  }
  // The former pre-pass that computed any_nonspace_delimited has been
  // dropped: its result was never read anywhere in this function.
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES *word = (*words)[w];
    if (word->best_choice == nullptr) {
      // It is a dud.
      word->SetupFake(lstm_recognizer_->GetUnicharset());
    } else {
      // Set the best state.
      for (int i = 0; i < word->best_choice->length(); ++i) {
        int length = word->best_choice->state(i);
        word->best_state.push_back(length);
      }
      word->reject_map.initialise(word->best_choice->length());
      word->tess_failed = false;
      word->tess_accepted = true;
      word->tess_would_adapt = false;
      word->done = true;
      word->tesseract = this;
      // The word certainty is the worse of the space certainty and the best
      // choice certainty, scaled by kCertaintyScale.
      float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
      word_certainty *= kCertaintyScale;
      if (getDict().stopper_debug_level >= 1) {
        tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
                word->best_choice->certainty(), word->space_certainty,
                std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
                word_certainty);
        word->best_choice->print();
      }
      word->best_choice->set_certainty(word_certainty);

      // The final acceptance decision belongs to the stopper dictionary and
      // replaces the provisional 'true' set above.
      word->tess_accepted = stopper_dict->AcceptableResult(word);
    }
  }
}
|
||||
|
||||
} // namespace tesseract.
|
507
3rdparty/tesseract_ocr/tesseract/src/ccmain/ltrresultiterator.cpp
vendored
Normal file
507
3rdparty/tesseract_ocr/tesseract/src/ccmain/ltrresultiterator.cpp
vendored
Normal file
|
@ -0,0 +1,507 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ltrresultiterator.cpp
|
||||
// Description: Iterator for tesseract results in strict left-to-right
|
||||
// order that avoids using tesseract internal data structures.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <tesseract/ltrresultiterator.h>
|
||||
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Forwards every argument unchanged to the PageIterator base and starts with
// "\n" as both the line separator and the paragraph separator.
LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
                                     int scaled_yres, int rect_left, int rect_top, int rect_width,
                                     int rect_height)
    : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top, rect_width,
                   rect_height),
      line_separator_("\n"),
      paragraph_separator_("\n") {
}
|
||||
|
||||
// Out-of-line, compiler-generated destructor.
|
||||
// It is defined in this source file rather than in the class body so the
|
||||
// compiler can emit a single vtable here instead of weak vtables in every
// compilation unit.
LTRResultIterator::~LTRResultIterator() = default;
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// object at the given level. Use delete [] to free after use.
|
||||
char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == nullptr) {
    return nullptr; // Already at the end!
  }

  // Work on a copy of the iterator so the current position is untouched.
  PAGE_RES_IT res_it(*it_);
  WERD_CHOICE *best_choice = res_it.word()->best_choice;
  ASSERT_HOST(best_choice != nullptr);

  std::string text;
  switch (level) {
    case RIL_SYMBOL:
      text = res_it.word()->BestUTF8(blob_index_, false);
      break;
    case RIL_WORD:
      text = best_choice->unichar_string();
      break;
    default: {
      // Text line, paragraph or block: words are joined with single spaces,
      // each line ends with the line separator and each paragraph
      // additionally ends with the paragraph separator.
      bool eol = false; // end of line?
      bool eop = false; // end of paragraph?
      do {     // for each paragraph in a block
        do {   // for each text line in a paragraph
          do { // for each word in a text line
            best_choice = res_it.word()->best_choice;
            ASSERT_HOST(best_choice != nullptr);
            text.append(best_choice->unichar_string());
            text.append(" ");
            res_it.forward();
            eol = res_it.row() != res_it.prev_row();
          } while (!eol);
          // Drop the space that followed the last word of the line.
          text.pop_back();
          text += line_separator_;
          eop = res_it.block() != res_it.prev_block() ||
                res_it.row()->row->para() != res_it.prev_row()->row->para();
        } while (level != RIL_TEXTLINE && !eop);
        if (eop) {
          text += paragraph_separator_;
        }
      } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
      break;
    }
  }

  // Hand back a heap copy including the terminator; the caller frees it
  // with delete [].
  int buffer_size = text.length() + 1;
  char *utf8_copy = new char[buffer_size];
  strncpy(utf8_copy, text.c_str(), buffer_size);
  return utf8_copy;
}
|
||||
|
||||
// Set the string inserted at the end of each text line. "\n" by default.
|
||||
void LTRResultIterator::SetLineSeparator(const char *new_line) {
|
||||
// Replaces the separator that GetUTF8Text appends after each text line.
// NOTE(review): the declared type of line_separator_ is not visible here, so
// it is unclear whether the text is copied or only the pointer is kept -
// confirm before passing a buffer that may be freed.
line_separator_ = new_line;
|
||||
}
|
||||
|
||||
// Set the string inserted at the end of each paragraph. "\n" by default.
|
||||
void LTRResultIterator::SetParagraphSeparator(const char *new_para) {
|
||||
// Replaces the separator that GetUTF8Text appends at the end of a paragraph.
// NOTE(review): the declared type of paragraph_separator_ is not visible
// here, so it is unclear whether the text is copied or only the pointer is
// kept - confirm before passing a buffer that may be freed.
paragraph_separator_ = new_para;
|
||||
}
|
||||
|
||||
// Returns the mean confidence of the current object at the given level.
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float LTRResultIterator::Confidence(PageIteratorLevel level) const {
  if (it_->word() == nullptr) {
    return 0.0f; // Already at the end!
  }

  float mean_certainty = 0.0f;
  int certainty_count = 0;
  // Work on a copy of the iterator so the current position is untouched.
  PAGE_RES_IT res_it(*it_);
  WERD_CHOICE *best_choice = res_it.word()->best_choice;
  ASSERT_HOST(best_choice != nullptr);

  // Adds the certainty of the word under res_it to the running sum and then
  // steps res_it to the next word.
  const auto add_word_and_advance = [&]() {
    best_choice = res_it.word()->best_choice;
    ASSERT_HOST(best_choice != nullptr);
    mean_certainty += best_choice->certainty();
    ++certainty_count;
    res_it.forward();
  };

  switch (level) {
    case RIL_BLOCK:
      // Every word until the block changes.
      do {
        add_word_and_advance();
      } while (res_it.block() == res_it.prev_block());
      break;
    case RIL_PARA:
      // Every word until the block or the paragraph changes.
      do {
        add_word_and_advance();
      } while (res_it.block() == res_it.prev_block() &&
               res_it.row()->row->para() == res_it.prev_row()->row->para());
      break;
    case RIL_TEXTLINE:
      // Every word until the row changes.
      do {
        add_word_and_advance();
      } while (res_it.row() == res_it.prev_row());
      break;
    case RIL_WORD:
      mean_certainty += best_choice->certainty();
      ++certainty_count;
      break;
    case RIL_SYMBOL:
      mean_certainty += best_choice->certainty(blob_index_);
      ++certainty_count;
      break;
  }

  if (certainty_count <= 0) {
    return 0.0f;
  }
  // Map the mean certainty to a percentage and clip it to [0, 100].
  mean_certainty /= certainty_count;
  return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);
}
|
||||
|
||||
void LTRResultIterator::RowAttributes(float *row_height, float *descenders,
|
||||
float *ascenders) const {
|
||||
*row_height =
|
||||
it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();
|
||||
*descenders = it_->row()->row->descenders();
|
||||
*ascenders = it_->row()->row->ascenders();
|
||||
}
|
||||
|
||||
// Returns the font attributes of the current word. If iterating at a higher
|
||||
// level object than words, eg textlines, then this will return the
|
||||
// attributes of the first word in that textline.
|
||||
// The actual return value is a string representing a font name. It points
|
||||
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
|
||||
// the iterator itself, ie rendered invalid by various members of
|
||||
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
|
||||
// Pointsize is returned in printers points (1/72 inch.)
|
||||
const char *LTRResultIterator::WordFontAttributes(bool *is_bold, bool *is_italic,
|
||||
bool *is_underlined, bool *is_monospace,
|
||||
bool *is_serif, bool *is_smallcaps,
|
||||
int *pointsize, int *font_id) const {
|
||||
const char *result = nullptr;
|
||||
|
||||
if (it_->word() == nullptr) {
|
||||
// Already at the end!
|
||||
*pointsize = 0;
|
||||
} else {
|
||||
float row_height =
|
||||
it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();
|
||||
// Convert from pixels to printers points.
|
||||
*pointsize =
|
||||
scaled_yres_ > 0 ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5) : 0;
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
const FontInfo *font_info = it_->word()->fontinfo;
|
||||
if (font_info) {
|
||||
// Font information available.
|
||||
*font_id = font_info->universal_id;
|
||||
*is_bold = font_info->is_bold();
|
||||
*is_italic = font_info->is_italic();
|
||||
*is_underlined = false; // TODO(rays) fix this!
|
||||
*is_monospace = font_info->is_fixed_pitch();
|
||||
*is_serif = font_info->is_serif();
|
||||
result = font_info->name;
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
*is_smallcaps = it_->word()->small_caps;
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
*is_bold = false;
|
||||
*is_italic = false;
|
||||
*is_underlined = false;
|
||||
*is_monospace = false;
|
||||
*is_serif = false;
|
||||
*is_smallcaps = false;
|
||||
*font_id = -1;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the name of the language used to recognize this word.
|
||||
const char *LTRResultIterator::WordRecognitionLanguage() const {
|
||||
if (it_->word() == nullptr || it_->word()->tesseract == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
return it_->word()->tesseract->lang.c_str();
|
||||
}
|
||||
|
||||
// Return the overall directionality of this word.
|
||||
StrongScriptDirection LTRResultIterator::WordDirection() const {
|
||||
if (it_->word() == nullptr) {
|
||||
return DIR_NEUTRAL;
|
||||
}
|
||||
bool has_rtl = it_->word()->AnyRtlCharsInWord();
|
||||
bool has_ltr = it_->word()->AnyLtrCharsInWord();
|
||||
if (has_rtl && !has_ltr) {
|
||||
return DIR_RIGHT_TO_LEFT;
|
||||
}
|
||||
if (has_ltr && !has_rtl) {
|
||||
return DIR_LEFT_TO_RIGHT;
|
||||
}
|
||||
if (!has_ltr && !has_rtl) {
|
||||
return DIR_NEUTRAL;
|
||||
}
|
||||
return DIR_MIX;
|
||||
}
|
||||
|
||||
// Returns true if the current word was found in a dictionary.
|
||||
bool LTRResultIterator::WordIsFromDictionary() const {
|
||||
if (it_->word() == nullptr) {
|
||||
return false; // Already at the end!
|
||||
}
|
||||
int permuter = it_->word()->best_choice->permuter();
|
||||
return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM || permuter == USER_DAWG_PERM;
|
||||
}
|
||||
|
||||
// Returns the number of blanks before the current word.
|
||||
int LTRResultIterator::BlanksBeforeWord() const {
|
||||
if (it_->word() == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
return it_->word()->word->space();
|
||||
}
|
||||
|
||||
// Returns true if the current word is numeric.
|
||||
bool LTRResultIterator::WordIsNumeric() const {
|
||||
if (it_->word() == nullptr) {
|
||||
return false; // Already at the end!
|
||||
}
|
||||
int permuter = it_->word()->best_choice->permuter();
|
||||
return permuter == NUMBER_PERM;
|
||||
}
|
||||
|
||||
// Returns true if the word contains blamer information.
|
||||
bool LTRResultIterator::HasBlamerInfo() const {
|
||||
return it_->word() != nullptr && it_->word()->blamer_bundle != nullptr &&
|
||||
it_->word()->blamer_bundle->HasDebugInfo();
|
||||
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
|
||||
// of the current word.
|
||||
const void *LTRResultIterator::GetParamsTrainingBundle() const {
|
||||
return (it_->word() != nullptr && it_->word()->blamer_bundle != nullptr)
|
||||
? &(it_->word()->blamer_bundle->params_training_bundle())
|
||||
: nullptr;
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Returns the pointer to the string with blamer information for this word.
|
||||
// Assumes that the word's blamer_bundle is not nullptr.
|
||||
const char *LTRResultIterator::GetBlamerDebug() const {
|
||||
return it_->word()->blamer_bundle->debug().c_str();
|
||||
}
|
||||
|
||||
// Returns the pointer to the string with misadaption information for this word.
|
||||
// Assumes that the word's blamer_bundle is not nullptr.
|
||||
const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
|
||||
return it_->word()->blamer_bundle->misadaption_debug().c_str();
|
||||
}
|
||||
|
||||
// Returns true if a truth string was recorded for the current word.
|
||||
bool LTRResultIterator::HasTruthString() const {
|
||||
if (it_->word() == nullptr) {
|
||||
return false; // Already at the end!
|
||||
}
|
||||
if (it_->word()->blamer_bundle == nullptr || it_->word()->blamer_bundle->NoTruth()) {
|
||||
return false; // no truth information for this word
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if the given string is equivalent to the truth string for
|
||||
// the current word.
|
||||
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
|
||||
if (!HasTruthString()) {
|
||||
return false;
|
||||
}
|
||||
ASSERT_HOST(it_->word()->uch_set != nullptr);
|
||||
WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
|
||||
return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded truth string for the current word.
|
||||
// Use delete [] to free after use.
|
||||
char *LTRResultIterator::WordTruthUTF8Text() const {
|
||||
if (!HasTruthString()) {
|
||||
return nullptr;
|
||||
}
|
||||
std::string truth_text = it_->word()->blamer_bundle->TruthString();
|
||||
int length = truth_text.length() + 1;
|
||||
char *result = new char[length];
|
||||
strncpy(result, truth_text.c_str(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded normalized OCR string for the
|
||||
// current word. Use delete [] to free after use.
|
||||
char *LTRResultIterator::WordNormedUTF8Text() const {
|
||||
if (it_->word() == nullptr) {
|
||||
return nullptr; // Already at the end!
|
||||
}
|
||||
std::string ocr_text;
|
||||
WERD_CHOICE *best_choice = it_->word()->best_choice;
|
||||
const UNICHARSET *unicharset = it_->word()->uch_set;
|
||||
ASSERT_HOST(best_choice != nullptr);
|
||||
for (int i = 0; i < best_choice->length(); ++i) {
|
||||
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
|
||||
}
|
||||
int length = ocr_text.length() + 1;
|
||||
char *result = new char[length];
|
||||
strncpy(result, ocr_text.c_str(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns a pointer to serialized choice lattice.
|
||||
// Fills lattice_size with the number of bytes in lattice data.
|
||||
const char *LTRResultIterator::WordLattice(int *lattice_size) const {
|
||||
if (it_->word() == nullptr) {
|
||||
return nullptr; // Already at the end!
|
||||
}
|
||||
if (it_->word()->blamer_bundle == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
*lattice_size = it_->word()->blamer_bundle->lattice_size();
|
||||
return it_->word()->blamer_bundle->lattice_data();
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a superscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsSuperscript() const {
|
||||
if (cblob_it_ == nullptr && it_->word() != nullptr) {
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a subscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsSubscript() const {
|
||||
if (cblob_it_ == nullptr && it_->word() != nullptr) {
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a dropcap.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsDropcap() const {
|
||||
if (cblob_it_ == nullptr && it_->word() != nullptr) {
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Builds a choice iterator for the symbol result_it currently points at.
// result_it must be positioned on a word (asserted). Depending on the loaded
// engines the iterator is backed by the word's CTC symbol choices (LSTM), by
// a BLOB_CHOICE list (legacy / lstm_choice_mode off), or by both.
// Note that filterSpaces() edits the word's CTC choice list in place.
ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
  ASSERT_HOST(result_it.it_->word() != nullptr);
  word_res_ = result_it.it_->word();

  auto *engine = word_res_->tesseract;
  oemLSTM_ = engine->AnyLSTMLang();
  // Is there legacy engine related trained data?
  const bool has_legacy_data = engine->AnyTessLang();
  // Is lstm_choice_mode activated?
  const bool lstm_choice_mode = engine->lstm_choice_mode;
  rating_coefficient_ = engine->lstm_rating_coefficient;
  blanks_before_word_ = result_it.BlanksBeforeWord();
  tstep_index_ = &result_it.blob_index_;

  auto &ctc_choices = word_res_->CTC_symbol_choices;
  if (oemLSTM_ && !ctc_choices.empty()) {
    // If the first recorded choice is not a space, the choice list carries no
    // leading blank entries that would have to be skipped.
    if (!ctc_choices[0].empty() && strcmp(ctc_choices[0][0].first, " ") != 0) {
      blanks_before_word_ = 0;
    }
    auto index = *tstep_index_;
    index += blanks_before_word_;
    if (index < ctc_choices.size()) {
      LSTM_choices_ = &ctc_choices[index];
      filterSpaces();
    }
  }

  BLOB_CHOICE_LIST *blob_choices = nullptr;
  if ((has_legacy_data || !lstm_choice_mode) && word_res_->ratings != nullptr) {
    blob_choices = word_res_->GetBlobChoices(result_it.blob_index_);
  }

  choice_it_ = nullptr;
  if (blob_choices != nullptr && !blob_choices->empty()) {
    choice_it_ = new BLOB_CHOICE_IT(blob_choices);
    choice_it_->mark_cycle_pt();
  }

  if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
    LSTM_choice_it_ = LSTM_choices_->begin();
  }
}
|
||||
// Releases the BLOB_CHOICE_IT allocated by the constructor, if any.
// Deleting a nullptr is a no-op, so no check is needed.
ChoiceIterator::~ChoiceIterator() {
  delete this->choice_it_;
}
|
||||
|
||||
// Moves to the next choice for the symbol and returns false if there
|
||||
// are none left.
|
||||
bool ChoiceIterator::Next() {
|
||||
if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
|
||||
if (LSTM_choice_it_ != LSTM_choices_->end() && next(LSTM_choice_it_) == LSTM_choices_->end()) {
|
||||
return false;
|
||||
} else {
|
||||
++LSTM_choice_it_;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (choice_it_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
choice_it_->forward();
|
||||
return !choice_it_->cycled_list();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// choice. Do NOT use delete [] to free after use.
|
||||
const char *ChoiceIterator::GetUTF8Text() const {
|
||||
if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
|
||||
std::pair<const char *, float> choice = *LSTM_choice_it_;
|
||||
return choice.first;
|
||||
} else {
|
||||
if (choice_it_ == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
UNICHAR_ID id = choice_it_->data()->unichar_id();
|
||||
return word_res_->uch_set->id_to_unichar_ext(id);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the confidence of the current choice depending on the used language
|
||||
// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
|
||||
// choices for one symbol should roughly add up to 1.0f.
|
||||
// If only traineddata of the legacy engine is used, the number should be
|
||||
// interpreted as a percent probability. (0.0f-100.0f) In this case
|
||||
// probabilities won't add up to 100. Each one stands on its own.
|
||||
float ChoiceIterator::Confidence() const {
|
||||
float confidence;
|
||||
if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
|
||||
std::pair<const char *, float> choice = *LSTM_choice_it_;
|
||||
confidence = 100 - rating_coefficient_ * choice.second;
|
||||
} else {
|
||||
if (choice_it_ == nullptr) {
|
||||
return 0.0f;
|
||||
}
|
||||
confidence = 100 + 5 * choice_it_->data()->certainty();
|
||||
}
|
||||
return ClipToRange(confidence, 0.0f, 100.0f);
|
||||
}
|
||||
|
||||
// Returns the set of timesteps which belong to the current symbol
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
|
||||
int offset = *tstep_index_ + blanks_before_word_;
|
||||
if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
|
||||
return nullptr;
|
||||
}
|
||||
return &word_res_->segmented_timesteps[offset];
|
||||
}
|
||||
|
||||
// Removes every entry whose text is exactly " " from the current LSTM
// choice list. The list is modified in place. LSTM_choices_ must be set.
void ChoiceIterator::filterSpaces() {
  if (LSTM_choices_->empty()) {
    return;
  }
  auto entry = LSTM_choices_->begin();
  while (entry != LSTM_choices_->end()) {
    if (strcmp(entry->first, " ") == 0) {
      // erase() hands back the iterator following the removed element.
      entry = LSTM_choices_->erase(entry);
    } else {
      ++entry;
    }
  }
}
|
||||
} // namespace tesseract.
|
24
3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.cpp
vendored
Normal file
24
3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.cpp
vendored
Normal file
|
@ -0,0 +1,24 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "mutableiterator.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Destructor.
|
||||
// It is defined here, so the compiler can create a single vtable
|
||||
// instead of weak vtables in every compilation unit.
|
||||
// Compiler-generated body: only member and base-class destruction runs.
MutableIterator::~MutableIterator() = default;
|
||||
|
||||
} // namespace tesseract.
|
62
3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.h
vendored
Normal file
62
3rdparty/tesseract_ocr/tesseract/src/ccmain/mutableiterator.h
vendored
Normal file
|
@ -0,0 +1,62 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: mutableiterator.h
|
||||
// Description: Iterator for tesseract results providing access to
|
||||
// both high-level API and Tesseract internal data structures.
|
||||
// Author: David Eger
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_
|
||||
#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_
|
||||
|
||||
#include <tesseract/resultiterator.h>
|
||||
|
||||
class BLOB_CHOICE_IT;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class Tesseract;
|
||||
|
||||
// Class to iterate over tesseract results, providing access to all levels
|
||||
// of the page hierarchy, without including any tesseract headers or having
|
||||
// to handle any tesseract structures.
|
||||
// WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
// therefore can only be used while the TessBaseAPI class still exists and
|
||||
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
|
||||
// DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
// See tesseract/publictypes.h for the definition of PageIteratorLevel.
|
||||
// See also base class PageIterator, which contains the bulk of the interface.
|
||||
// ResultIterator adds text-specific methods for access to OCR output.
|
||||
// MutableIterator adds access to internal data structures.
|
||||
|
||||
class TESS_API MutableIterator : public ResultIterator {
public:
  // See argument descriptions in ResultIterator().
  // The base class is initialized from a temporary LTRResultIterator that is
  // constructed with exactly the arguments passed in here.
  MutableIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
                  int rect_left, int rect_top, int rect_width, int rect_height)
      : ResultIterator(LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
                                         rect_top, rect_width, rect_height)) {}
  // Only declared here; the definition lives outside the class body.
  ~MutableIterator() override;

  // See PageIterator and ResultIterator for most calls.

  // Return access to Tesseract internals: the underlying PAGE_RES_IT that
  // this iterator walks. The pointer is returned as pointer-to-const.
  const PAGE_RES_IT *PageResIt() const {
    return it_;
  }
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_
|
581
3rdparty/tesseract_ocr/tesseract/src/ccmain/osdetect.cpp
vendored
Normal file
581
3rdparty/tesseract_ocr/tesseract/src/ccmain/osdetect.cpp
vendored
Normal file
|
@ -0,0 +1,581 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: osdetect.cpp
|
||||
// Description: Orientation and script detection.
|
||||
// Author: Samuel Charron
|
||||
// Ranjith Unnikrishnan
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <tesseract/osdetect.h>
|
||||
|
||||
#include "blobbox.h"
|
||||
#include "blread.h"
|
||||
#include "colfind.h"
|
||||
#include "fontinfo.h"
|
||||
#include "imagefind.h"
|
||||
#include "linefind.h"
|
||||
#include "oldlist.h"
|
||||
#include "qrsequence.h"
|
||||
#include "ratngs.h"
|
||||
#include "tabvector.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "textord.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath> // for std::fabs
|
||||
#include <memory>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Blobs whose height/width ratio (taken as a value >= 1) exceeds this are
// skipped by os_detect.
const float kSizeRatioToReject = 2.0;
// Blobs whose bounding box is lower than this are skipped by os_detect.
const int kMinAcceptableBlobHeight = 10;

// Used by OSResults::update_best_script to scale the script confidence:
// sconfidence = (first / second - 1) / (kScriptAcceptRatio - 1).
const float kScriptAcceptRatio = 1.3;

// NOTE(review): not referenced in the part of the file reviewed here;
// presumably thresholds for the share of Han characters - confirm at the use site.
const float kHanRatioInKorean = 0.7;
const float kHanRatioInJapanese = 0.3;

// NOTE(review): not referenced in the part of the file reviewed here -
// confirm the meaning at the use site.
const float kNonAmbiguousMargin = 1.0;

// General scripts
static const char *han_script = "Han";
static const char *latin_script = "Latin";
static const char *katakana_script = "Katakana";
static const char *hiragana_script = "Hiragana";
static const char *hangul_script = "Hangul";

// Pseudo-scripts Name
const char *ScriptDetector::korean_script_ = "Korean";
const char *ScriptDetector::japanese_script_ = "Japanese";
const char *ScriptDetector::fraktur_script_ = "Fraktur";
|
||||
|
||||
void OSResults::update_best_orientation() {
|
||||
float first = orientations[0];
|
||||
float second = orientations[1];
|
||||
best_result.orientation_id = 0;
|
||||
if (orientations[0] < orientations[1]) {
|
||||
first = orientations[1];
|
||||
second = orientations[0];
|
||||
best_result.orientation_id = 1;
|
||||
}
|
||||
for (int i = 2; i < 4; ++i) {
|
||||
if (orientations[i] > first) {
|
||||
second = first;
|
||||
first = orientations[i];
|
||||
best_result.orientation_id = i;
|
||||
} else if (orientations[i] > second) {
|
||||
second = orientations[i];
|
||||
}
|
||||
}
|
||||
// Store difference of top two orientation scores.
|
||||
best_result.oconfidence = first - second;
|
||||
}
|
||||
|
||||
// Forces the best orientation to orientation_id and resets the orientation
// confidence to 0.
void OSResults::set_best_orientation(int orientation_id) {
  best_result.oconfidence = 0;
  best_result.orientation_id = orientation_id;
}
|
||||
|
||||
void OSResults::update_best_script(int orientation) {
|
||||
// We skip index 0 to ignore the "Common" script.
|
||||
float first = scripts_na[orientation][1];
|
||||
float second = scripts_na[orientation][2];
|
||||
best_result.script_id = 1;
|
||||
if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
|
||||
first = scripts_na[orientation][2];
|
||||
second = scripts_na[orientation][1];
|
||||
best_result.script_id = 2;
|
||||
}
|
||||
for (int i = 3; i < kMaxNumberOfScripts; ++i) {
|
||||
if (scripts_na[orientation][i] > first) {
|
||||
best_result.script_id = i;
|
||||
second = first;
|
||||
first = scripts_na[orientation][i];
|
||||
} else if (scripts_na[orientation][i] > second) {
|
||||
second = scripts_na[orientation][i];
|
||||
}
|
||||
}
|
||||
best_result.sconfidence =
|
||||
(second == 0.0f) ? 2.0f : (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
|
||||
}
|
||||
|
||||
int OSResults::get_best_script(int orientation_id) const {
|
||||
int max_id = -1;
|
||||
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
|
||||
const char *script = unicharset->get_script_from_script_id(j);
|
||||
if (strcmp(script, "Common") && strcmp(script, "NULL")) {
|
||||
if (max_id == -1 || scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id]) {
|
||||
max_id = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
return max_id;
|
||||
}
|
||||
|
||||
// Print the script scores for all possible orientations.
|
||||
void OSResults::print_scores(void) const {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
tprintf("Orientation id #%d", i);
|
||||
print_scores(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Print the script scores for the given candidate orientation.
|
||||
void OSResults::print_scores(int orientation_id) const {
|
||||
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
|
||||
if (scripts_na[orientation_id][j]) {
|
||||
tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
|
||||
scripts_na[orientation_id][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate scores with given OSResults instance and update the best script.
|
||||
void OSResults::accumulate(const OSResults &osr) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
orientations[i] += osr.orientations[i];
|
||||
for (int j = 0; j < kMaxNumberOfScripts; ++j) {
|
||||
scripts_na[i][j] += osr.scripts_na[i][j];
|
||||
}
|
||||
}
|
||||
unicharset = osr.unicharset;
|
||||
update_best_orientation();
|
||||
update_best_script(best_result.orientation_id);
|
||||
}
|
||||
|
||||
// Detect and erase horizontal/vertical lines and picture regions from the
|
||||
// image, so that non-text blobs are removed from consideration.
|
||||
static void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
|
||||
TO_BLOCK_LIST *to_blocks) {
|
||||
Image pix = tess->pix_binary();
|
||||
ASSERT_HOST(pix != nullptr);
|
||||
int vertical_x = 0;
|
||||
int vertical_y = 1;
|
||||
tesseract::TabVector_LIST v_lines;
|
||||
tesseract::TabVector_LIST h_lines;
|
||||
int resolution;
|
||||
if (kMinCredibleResolution > pixGetXRes(pix)) {
|
||||
resolution = kMinCredibleResolution;
|
||||
tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", pixGetXRes(pix), resolution);
|
||||
} else {
|
||||
resolution = pixGetXRes(pix);
|
||||
}
|
||||
|
||||
tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, &vertical_x, &vertical_y,
|
||||
nullptr, &v_lines, &h_lines);
|
||||
Image im_pix = tesseract::ImageFind::FindImages(pix, nullptr);
|
||||
if (im_pix != nullptr) {
|
||||
pixSubtract(pix, pix, im_pix);
|
||||
im_pix.destroy();
|
||||
}
|
||||
tess->mutable_textord()->find_components(tess->pix_binary(), blocks, to_blocks);
|
||||
}
|
||||
|
||||
// Find connected components in the page and process a subset until finished or
|
||||
// a stopping criterion is met.
|
||||
// Returns the number of blobs used in making the estimate. 0 implies failure.
|
||||
int orientation_and_script_detection(const char *filename, OSResults *osr,
|
||||
tesseract::Tesseract *tess) {
|
||||
std::string name = filename; // truncated name
|
||||
|
||||
const char *lastdot = strrchr(name.c_str(), '.');
|
||||
if (lastdot != nullptr) {
|
||||
name[lastdot - name.c_str()] = '\0';
|
||||
}
|
||||
|
||||
ASSERT_HOST(tess->pix_binary() != nullptr);
|
||||
int width = pixGetWidth(tess->pix_binary());
|
||||
int height = pixGetHeight(tess->pix_binary());
|
||||
|
||||
BLOCK_LIST blocks;
|
||||
if (!read_unlv_file(name, width, height, &blocks)) {
|
||||
FullPageBlock(width, height, &blocks);
|
||||
}
|
||||
|
||||
// Try to remove non-text regions from consideration.
|
||||
TO_BLOCK_LIST land_blocks, port_blocks;
|
||||
remove_nontext_regions(tess, &blocks, &port_blocks);
|
||||
|
||||
if (port_blocks.empty()) {
|
||||
// page segmentation did not succeed, so we need to find_components first.
|
||||
tess->mutable_textord()->find_components(tess->pix_binary(), &blocks, &port_blocks);
|
||||
} else {
|
||||
TBOX page_box(0, 0, width, height);
|
||||
// Filter_blobs sets up the TO_BLOCKs the same as find_components does.
|
||||
tess->mutable_textord()->filter_blobs(page_box.topright(), &port_blocks, true);
|
||||
}
|
||||
|
||||
return os_detect(&port_blocks, osr, tess);
|
||||
}
|
||||
|
||||
// Filter and sample the blobs.
|
||||
// Returns a non-zero number of blobs if the page was successfully processed, or
|
||||
// zero if the page had too few characters to be reliable
|
||||
int os_detect(TO_BLOCK_LIST *port_blocks, OSResults *osr, tesseract::Tesseract *tess) {
|
||||
int blobs_total = 0;
|
||||
TO_BLOCK_IT block_it;
|
||||
block_it.set_to_list(port_blocks);
|
||||
|
||||
BLOBNBOX_CLIST filtered_list;
|
||||
BLOBNBOX_C_IT filtered_it(&filtered_list);
|
||||
|
||||
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
|
||||
TO_BLOCK *to_block = block_it.data();
|
||||
if (to_block->block->pdblk.poly_block() && !to_block->block->pdblk.poly_block()->IsText()) {
|
||||
continue;
|
||||
}
|
||||
BLOBNBOX_IT bbox_it;
|
||||
bbox_it.set_to_list(&to_block->blobs);
|
||||
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
|
||||
BLOBNBOX *bbox = bbox_it.data();
|
||||
C_BLOB *blob = bbox->cblob();
|
||||
TBOX box = blob->bounding_box();
|
||||
++blobs_total;
|
||||
|
||||
// Catch illegal value of box width and avoid division by zero.
|
||||
if (box.width() == 0) {
|
||||
continue;
|
||||
}
|
||||
// TODO: Can height and width be negative? If not, remove fabs.
|
||||
float y_x = std::fabs((box.height() * 1.0f) / box.width());
|
||||
float x_y = 1.0f / y_x;
|
||||
// Select a >= 1.0 ratio
|
||||
float ratio = x_y > y_x ? x_y : y_x;
|
||||
// Blob is ambiguous
|
||||
if (ratio > kSizeRatioToReject) {
|
||||
continue;
|
||||
}
|
||||
if (box.height() < kMinAcceptableBlobHeight) {
|
||||
continue;
|
||||
}
|
||||
filtered_it.add_to_end(bbox);
|
||||
}
|
||||
}
|
||||
return os_detect_blobs(nullptr, &filtered_list, osr, tess);
|
||||
}
|
||||
|
||||
// Detect orientation and script from a list of blobs.
|
||||
// Returns a non-zero number of blobs if the list was successfully processed, or
|
||||
// zero if the list had too few characters to be reliable.
|
||||
// If allowed_scripts is non-null and non-empty, it is a list of scripts that
|
||||
// constrains both orientation and script detection to consider only scripts
|
||||
// from the list.
|
||||
int os_detect_blobs(const std::vector<int> *allowed_scripts, BLOBNBOX_CLIST *blob_list,
|
||||
OSResults *osr, tesseract::Tesseract *tess) {
|
||||
OSResults osr_;
|
||||
int minCharactersToTry = tess->min_characters_to_try;
|
||||
int maxCharactersToTry = 5 * minCharactersToTry;
|
||||
if (osr == nullptr) {
|
||||
osr = &osr_;
|
||||
}
|
||||
|
||||
osr->unicharset = &tess->unicharset;
|
||||
OrientationDetector o(allowed_scripts, osr);
|
||||
ScriptDetector s(allowed_scripts, osr, tess);
|
||||
|
||||
BLOBNBOX_C_IT filtered_it(blob_list);
|
||||
int real_max = std::min(filtered_it.length(), maxCharactersToTry);
|
||||
// tprintf("Total blobs found = %d\n", blobs_total);
|
||||
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
|
||||
// tprintf("Number of blobs to try = %d\n", real_max);
|
||||
|
||||
// If there are too few characters, skip this page entirely.
|
||||
if (real_max < minCharactersToTry / 2) {
|
||||
tprintf("Too few characters. Skipping this page\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto **blobs = new BLOBNBOX *[filtered_it.length()];
|
||||
int number_of_blobs = 0;
|
||||
for (filtered_it.mark_cycle_pt(); !filtered_it.cycled_list(); filtered_it.forward()) {
|
||||
blobs[number_of_blobs++] = filtered_it.data();
|
||||
}
|
||||
QRSequenceGenerator sequence(number_of_blobs);
|
||||
int num_blobs_evaluated = 0;
|
||||
for (int i = 0; i < real_max; ++i) {
|
||||
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess) && i > minCharactersToTry) {
|
||||
break;
|
||||
}
|
||||
++num_blobs_evaluated;
|
||||
}
|
||||
delete[] blobs;
|
||||
|
||||
// Make sure the best_result is up-to-date
|
||||
int orientation = o.get_orientation();
|
||||
osr->update_best_script(orientation);
|
||||
return num_blobs_evaluated;
|
||||
}
|
||||
|
||||
// Processes a single blob to estimate script and orientation.
|
||||
// Return true if estimate of orientation and script satisfies stopping
|
||||
// criteria.
|
||||
bool os_detect_blob(BLOBNBOX *bbox, OrientationDetector *o, ScriptDetector *s, OSResults *osr,
|
||||
tesseract::Tesseract *tess) {
|
||||
tess->tess_cn_matching.set_value(true); // turn it on
|
||||
tess->tess_bn_matching.set_value(false);
|
||||
C_BLOB *blob = bbox->cblob();
|
||||
TBLOB *tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
|
||||
TBOX box = tblob->bounding_box();
|
||||
FCOORD current_rotation(1.0f, 0.0f);
|
||||
FCOORD rotation90(0.0f, 1.0f);
|
||||
BLOB_CHOICE_LIST ratings[4];
|
||||
// Test the 4 orientations
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
// Normalize the blob. Set the origin to the place we want to be the
|
||||
// bottom-middle after rotation.
|
||||
// Scaling is to make the rotated height the x-height.
|
||||
float scaling = static_cast<float>(kBlnXHeight) / box.height();
|
||||
float x_origin = (box.left() + box.right()) / 2.0f;
|
||||
float y_origin = (box.bottom() + box.top()) / 2.0f;
|
||||
if (i == 0 || i == 2) {
|
||||
// Rotation is 0 or 180.
|
||||
y_origin = i == 0 ? box.bottom() : box.top();
|
||||
} else {
|
||||
// Rotation is 90 or 270.
|
||||
scaling = static_cast<float>(kBlnXHeight) / box.width();
|
||||
x_origin = i == 1 ? box.left() : box.right();
|
||||
}
|
||||
std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));
|
||||
rotated_blob->Normalize(nullptr, ¤t_rotation, nullptr, x_origin, y_origin, scaling,
|
||||
scaling, 0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);
|
||||
tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);
|
||||
current_rotation.rotate(rotation90);
|
||||
}
|
||||
delete tblob;
|
||||
|
||||
bool stop = o->detect_blob(ratings);
|
||||
s->detect_blob(ratings);
|
||||
int orientation = o->get_orientation();
|
||||
stop = s->must_stop(orientation) && stop;
|
||||
return stop;
|
||||
}
|
||||
|
||||
// Stores the (possibly null) allowed-script list and the results object that
// detect_blob() accumulates into. Neither pointer is copied or owned here.
OrientationDetector::OrientationDetector(const std::vector<int> *allowed_scripts, OSResults *osr) {
  allowed_scripts_ = allowed_scripts;
  osr_ = osr;
}
|
||||
|
||||
// Score the given blob and return true if it is now sure of the orientation
|
||||
// after adding this block.
|
||||
bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
|
||||
float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
float total_blob_o_score = 0.0f;
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
BLOB_CHOICE_IT choice_it(scores + i);
|
||||
if (!choice_it.empty()) {
|
||||
BLOB_CHOICE *choice = nullptr;
|
||||
if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
|
||||
// Find the top choice in an allowed script.
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;
|
||||
choice_it.forward()) {
|
||||
int choice_script = choice_it.data()->script_id();
|
||||
int s = 0;
|
||||
for (s = 0; s < allowed_scripts_->size(); ++s) {
|
||||
if ((*allowed_scripts_)[s] == choice_script) {
|
||||
choice = choice_it.data();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
choice = choice_it.data();
|
||||
}
|
||||
if (choice != nullptr) {
|
||||
// The certainty score ranges between [-20,0]. This is converted here to
|
||||
// [0,1], with 1 indicating best match.
|
||||
blob_o_score[i] = 1 + 0.05 * choice->certainty();
|
||||
total_blob_o_score += blob_o_score[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (total_blob_o_score == 0.0) {
|
||||
return false;
|
||||
}
|
||||
// Fill in any blanks with the worst score of the others. This is better than
|
||||
// picking an arbitrary probability for it and way better than -inf.
|
||||
float worst_score = 0.0f;
|
||||
int num_good_scores = 0;
|
||||
for (float f : blob_o_score) {
|
||||
if (f > 0.0f) {
|
||||
++num_good_scores;
|
||||
if (worst_score == 0.0f || f < worst_score) {
|
||||
worst_score = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (num_good_scores == 1) {
|
||||
// Lower worst if there is only one.
|
||||
worst_score /= 2.0f;
|
||||
}
|
||||
for (float &f : blob_o_score) {
|
||||
if (f == 0.0f) {
|
||||
f = worst_score;
|
||||
total_blob_o_score += worst_score;
|
||||
}
|
||||
}
|
||||
// Normalize the orientation scores for the blob and use them to
|
||||
// update the aggregated orientation score.
|
||||
for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
|
||||
osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
|
||||
}
|
||||
|
||||
// TODO(ranjith) Add an early exit test, based on min_orientation_margin,
|
||||
// as used in pagesegmain.cpp.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Refreshes the best orientation in the results object and returns its id.
int OrientationDetector::get_orientation() {
  osr_->update_best_orientation();
  const auto &best = osr_->best_result;
  return best.orientation_id;
}
|
||||
|
||||
// Stores the collaborators and registers every script name this detector
// refers to with the classifier's unicharset, caching the returned ids.
// The add_script() calls are kept in their original order.
ScriptDetector::ScriptDetector(const std::vector<int> *allowed_scripts, OSResults *osr,
                               tesseract::Tesseract *tess) {
  osr_ = osr;
  tess_ = tess;
  allowed_scripts_ = allowed_scripts;

  auto &charset = tess_->unicharset;
  katakana_id_ = charset.add_script(katakana_script);
  hiragana_id_ = charset.add_script(hiragana_script);
  han_id_ = charset.add_script(han_script);
  hangul_id_ = charset.add_script(hangul_script);
  japanese_id_ = charset.add_script(japanese_script_);
  korean_id_ = charset.add_script(korean_script_);
  latin_id_ = charset.add_script(latin_script);
  fraktur_id_ = charset.add_script(fraktur_script_);
}
|
||||
|
||||
// Score the given blob and return true if it is now sure of the script after
|
||||
// adding this blob.
|
||||
void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
bool done[kMaxNumberOfScripts] = {false};
|
||||
|
||||
BLOB_CHOICE_IT choice_it;
|
||||
choice_it.set_to_list(scores + i);
|
||||
|
||||
float prev_score = -1;
|
||||
int script_count = 0;
|
||||
int prev_id = -1;
|
||||
int prev_fontinfo_id = -1;
|
||||
const char *prev_unichar = "";
|
||||
const char *unichar = "";
|
||||
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
|
||||
BLOB_CHOICE *choice = choice_it.data();
|
||||
int id = choice->script_id();
|
||||
if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
|
||||
// Check that the choice is in an allowed script.
|
||||
int s = 0;
|
||||
for (s = 0; s < allowed_scripts_->size(); ++s) {
|
||||
if ((*allowed_scripts_)[s] == id) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (s == allowed_scripts_->size()) {
|
||||
continue; // Not found in list.
|
||||
}
|
||||
}
|
||||
// Script already processed before.
|
||||
if (done[id]) {
|
||||
continue;
|
||||
}
|
||||
done[id] = true;
|
||||
|
||||
unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
|
||||
// Save data from the first match
|
||||
if (prev_score < 0) {
|
||||
prev_score = -choice->certainty();
|
||||
script_count = 1;
|
||||
prev_id = id;
|
||||
prev_unichar = unichar;
|
||||
prev_fontinfo_id = choice->fontinfo_id();
|
||||
} else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
|
||||
++script_count;
|
||||
}
|
||||
|
||||
if (strlen(prev_unichar) == 1) {
|
||||
if (unichar[0] >= '0' && unichar[0] <= '9') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// if script_count is >= 2, character is ambiguous, skip other matches
|
||||
// since they are useless.
|
||||
if (script_count >= 2) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Character is non ambiguous
|
||||
if (script_count == 1) {
|
||||
// Update the score of the winning script
|
||||
osr_->scripts_na[i][prev_id] += 1.0;
|
||||
|
||||
// Workaround for Fraktur
|
||||
if (prev_id == latin_id_) {
|
||||
if (prev_fontinfo_id >= 0) {
|
||||
const tesseract::FontInfo &fi = tess_->get_fontinfo_table().at(prev_fontinfo_id);
|
||||
// printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
|
||||
// fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
|
||||
// fi.is_serif(), fi.is_fraktur(),
|
||||
// prev_unichar);
|
||||
if (fi.is_fraktur()) {
|
||||
osr_->scripts_na[i][prev_id] -= 1.0;
|
||||
osr_->scripts_na[i][fraktur_id_] += 1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update Japanese / Korean pseudo-scripts
|
||||
if (prev_id == katakana_id_) {
|
||||
osr_->scripts_na[i][japanese_id_] += 1.0;
|
||||
}
|
||||
if (prev_id == hiragana_id_) {
|
||||
osr_->scripts_na[i][japanese_id_] += 1.0;
|
||||
}
|
||||
if (prev_id == hangul_id_) {
|
||||
osr_->scripts_na[i][korean_id_] += 1.0;
|
||||
}
|
||||
if (prev_id == han_id_) {
|
||||
osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
|
||||
osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
|
||||
}
|
||||
}
|
||||
} // iterate over each orientation
|
||||
}
|
||||
|
||||
// Refreshes the best script for the given orientation and reports whether its
// script confidence exceeds 1.
bool ScriptDetector::must_stop(int orientation) const {
  osr_->update_best_script(orientation);
  const auto &best = osr_->best_result;
  return best.sconfidence > 1;
}
|
||||
|
||||
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must be
// applied for the text to be upright (readable).
// Returns -1 for any id outside 0..3.
int OrientationIdToValue(const int &id) {
  static const int degrees_by_id[4] = {0, 270, 180, 90};
  if (id < 0 || id > 3) {
    return -1;
  }
  return degrees_by_id[id];
}
|
||||
|
||||
} // namespace tesseract
|
416
3rdparty/tesseract_ocr/tesseract/src/ccmain/output.cpp
vendored
Normal file
416
3rdparty/tesseract_ocr/tesseract/src/ccmain/output.cpp
vendored
Normal file
|
@ -0,0 +1,416 @@
|
|||
/******************************************************************
|
||||
* File: output.cpp (Formerly output.c)
|
||||
* Description: Output pass
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "output.h"
|
||||
|
||||
#include "control.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "tessvars.h"
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include "docqual.h"
|
||||
# include "reject.h"
|
||||
#endif
|
||||
|
||||
#include "helpers.h"
|
||||
|
||||
#include <cctype>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
|
||||
#define CTRL_NEWLINE '\012' // newline
|
||||
#define CTRL_HARDLINE '\015' // cr
|
||||
|
||||
namespace tesseract {
|
||||
/**
 * Output pass: walks every word of the page from the start and calls
 * write_results() on it.
 *
 * @param page_res_it      iterator over the page results; restarted here and
 *                         left at the end of the page.
 * @param target_word_box  optional filter; when non-null, only words whose
 *                         bounding-box centre lies inside it are processed.
 */
void Tesseract::output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word = nullptr;

  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    check_debug_pt(page_res_it.word(), 120);

    if (target_word_box) {
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
      FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
                       (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt)) {
        continue; // outside the requested region
      }
    }
    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
      block_of_last_word = page_res_it.block();
    }

    // Force an end of line at the last word of the page and, when block
    // separators are written, at the last word of each block.
    const bool force_eol =
        (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
        (page_res_it.next_word() == nullptr);

    // Word and block that follow the current one, or null at the end.
    WERD *nextword =
        page_res_it.next_word() != nullptr ? page_res_it.next_word()->word : nullptr;
    BLOCK *nextblock =
        page_res_it.next_block() != nullptr ? page_res_it.next_block()->block : nullptr;

    // regardless of tilde crunching
    write_results(page_res_it,
                  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
                                         nextword, nextblock),
                  force_eol);
  }
}
|
||||
|
||||
/*************************************************************************
|
||||
* write_results()
|
||||
*
|
||||
* All recognition and rejection has now been done. Generate the following:
|
||||
* .txt file - giving the final best choices with NO highlighting
|
||||
* .raw file - giving the tesseract top choice output for each word
|
||||
* .map file - showing how the .txt file has been rejected in the .ep file
|
||||
* epchoice list - a list of one element per word, containing the text for the
|
||||
* epaper. Reject strings are inserted.
|
||||
* inset list - a list of bounding boxes of reject insets - indexed by the
|
||||
* reject strings in the epchoice text.
|
||||
*************************************************************************/
|
||||
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
||||
char newline_type, // type of newline
|
||||
bool force_eol) { // override tilde crunch?
|
||||
WERD_RES *word = page_res_it.word();
|
||||
const UNICHARSET &uchset = *word->uch_set;
|
||||
int i;
|
||||
bool need_reject = false;
|
||||
UNICHAR_ID space = uchset.unichar_to_id(" ");
|
||||
|
||||
if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
|
||||
!tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
|
||||
if ((word->unlv_crunch_mode != CR_DELETE) &&
|
||||
(!stats_.tilde_crunch_written ||
|
||||
((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
|
||||
!word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
|
||||
if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
|
||||
!word->word->flag(W_FUZZY_SP)) {
|
||||
stats_.last_char_was_tilde = false;
|
||||
}
|
||||
need_reject = true;
|
||||
}
|
||||
if ((need_reject && !stats_.last_char_was_tilde) ||
|
||||
(force_eol && stats_.write_results_empty_block)) {
|
||||
/* Write a reject char - mark as rejected unless zero_rejection mode */
|
||||
stats_.last_char_was_tilde = true;
|
||||
stats_.tilde_crunch_written = true;
|
||||
stats_.last_char_was_newline = false;
|
||||
stats_.write_results_empty_block = false;
|
||||
}
|
||||
|
||||
if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
|
||||
stats_.tilde_crunch_written = false;
|
||||
stats_.last_char_was_newline = true;
|
||||
stats_.last_char_was_tilde = false;
|
||||
}
|
||||
|
||||
if (force_eol) {
|
||||
stats_.write_results_empty_block = true;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* NORMAL PROCESSING of non tilde crunched words */
|
||||
|
||||
stats_.tilde_crunch_written = false;
|
||||
if (newline_type) {
|
||||
stats_.last_char_was_newline = true;
|
||||
} else {
|
||||
stats_.last_char_was_newline = false;
|
||||
}
|
||||
stats_.write_results_empty_block = force_eol; // about to write a real word
|
||||
|
||||
if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
|
||||
!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
|
||||
(word->best_choice->unichar_id(0) == space)) {
|
||||
/* Prevent adjacent tilde across words - we know that adjacent tildes within
|
||||
words have been removed */
|
||||
word->MergeAdjacentBlobs(0);
|
||||
}
|
||||
if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
|
||||
stats_.last_char_was_tilde = false;
|
||||
} else {
|
||||
if (word->reject_map.length() > 0) {
|
||||
if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
|
||||
stats_.last_char_was_tilde = true;
|
||||
} else {
|
||||
stats_.last_char_was_tilde = false;
|
||||
}
|
||||
} else if (word->word->space() > 0) {
|
||||
stats_.last_char_was_tilde = false;
|
||||
}
|
||||
/* else it is unchanged as there are no output chars */
|
||||
}
|
||||
|
||||
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
|
||||
|
||||
set_unlv_suspects(word);
|
||||
check_debug_pt(word, 120);
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
|
||||
dict_word(*(word->best_choice)));
|
||||
}
|
||||
if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
|
||||
if (tessedit_zero_rejection) {
|
||||
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
||||
for (i = 0; i < word->best_choice->length(); ++i) {
|
||||
if (word->reject_map[i].rejected()) {
|
||||
word->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (tessedit_minimal_rejection) {
|
||||
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
||||
for (i = 0; i < word->best_choice->length(); ++i) {
|
||||
if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
|
||||
word->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* determine_newline_type
|
||||
*
|
||||
* Find whether we have a wrapping or hard newline.
|
||||
* Return false if not at end of line.
|
||||
**********************************************************************/
|
||||
|
||||
char determine_newline_type( // test line ends
|
||||
WERD *word, // word to do
|
||||
BLOCK *block, // current block
|
||||
WERD *next_word, // next word
|
||||
BLOCK *next_block // block of next word
|
||||
) {
|
||||
int16_t end_gap; // to right edge
|
||||
int16_t width; // of next word
|
||||
TBOX word_box; // bounding
|
||||
TBOX next_box; // next word
|
||||
TBOX block_box; // block bounding
|
||||
|
||||
if (!word->flag(W_EOL)) {
|
||||
return false; // not end of line
|
||||
}
|
||||
if (next_word == nullptr || next_block == nullptr || block != next_block) {
|
||||
return CTRL_NEWLINE;
|
||||
}
|
||||
if (next_word->space() > 0) {
|
||||
return CTRL_HARDLINE; // it is tabbed
|
||||
}
|
||||
word_box = word->bounding_box();
|
||||
next_box = next_word->bounding_box();
|
||||
block_box = block->pdblk.bounding_box();
|
||||
// gap to eol
|
||||
end_gap = block_box.right() - word_box.right();
|
||||
end_gap -= static_cast<int32_t>(block->space());
|
||||
width = next_box.right() - next_box.left();
|
||||
// tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
|
||||
// block_box.right(),word_box.right(),end_gap,
|
||||
// next_box.right(),next_box.left(),width,
|
||||
// end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
|
||||
return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* get_rep_char()
|
||||
* Return the first accepted character from the repetition string. This is the
|
||||
* character which is repeated - as determined earlier by fix_rep_char()
|
||||
*************************************************************************/
|
||||
UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
|
||||
int i;
|
||||
for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
|
||||
;
|
||||
}
|
||||
|
||||
if (i < word->reject_map.length()) {
|
||||
return word->best_choice->unichar_id(i);
|
||||
} else {
|
||||
return word->uch_set->unichar_to_id(unrecognised_char.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* SUSPECT LEVELS
|
||||
*
|
||||
* 0 - don't reject ANYTHING
|
||||
* 1,2 - partial rejection
|
||||
* 3 - BEST
|
||||
*
|
||||
* NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
|
||||
* tessedit_minimal_rejection.
|
||||
*************************************************************************/
|
||||
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
|
||||
int len = word_res->reject_map.length();
|
||||
const WERD_CHOICE &word = *(word_res->best_choice);
|
||||
const UNICHARSET &uchset = *word.unicharset();
|
||||
int i;
|
||||
float rating_per_ch;
|
||||
|
||||
if (suspect_level == 0) {
|
||||
for (i = 0; i < len; i++) {
|
||||
if (word_res->reject_map[i].rejected()) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (suspect_level >= 3) {
|
||||
return; // Use defaults
|
||||
}
|
||||
|
||||
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
|
||||
|
||||
if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
|
||||
/* Unreject alphas in dictionary words */
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rating_per_ch = word.rating() / word_res->reject_map.length();
|
||||
|
||||
if (rating_per_ch >= suspect_rating_per_ch) {
|
||||
return; // Don't touch bad ratings
|
||||
}
|
||||
|
||||
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
|
||||
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
|
||||
for (i = 0; i < len; ++i) {
|
||||
if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
if (word_res->reject_map[i].rejected()) {
|
||||
if (word_res->reject_map[i].flag(R_DOC_REJ)) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
if (word_res->reject_map[i].flag(R_ROW_REJ)) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (suspect_level == 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
|
||||
for (i = 0; i < len; i++) {
|
||||
if (word_res->reject_map[i].rejected()) {
|
||||
if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
|
||||
word_res->reject_map[i].flag(R_POSTNN_1IL))) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
|
||||
if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
|
||||
word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
|
||||
acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
|
||||
if (word_res->reject_map.length() > suspect_short_words) {
|
||||
for (i = 0; i < len; i++) {
|
||||
if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
|
||||
word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
|
||||
word_res->reject_map[i].flag(R_POSTNN_1IL) ||
|
||||
word_res->reject_map[i].flag(R_MM_REJECT))) {
|
||||
word_res->reject_map[i].setrej_minimal_rej_accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
|
||||
int count = 0;
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
|
||||
int count = 0;
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
|
||||
word.unicharset()->get_isdigit(word.unichar_id(i))) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
|
||||
bool prev_digit = false;
|
||||
|
||||
if (*lengths == 1 && *s == '(') {
|
||||
s++;
|
||||
}
|
||||
|
||||
if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
|
||||
s++;
|
||||
}
|
||||
|
||||
for (; *s != '\0'; s += *(lengths++)) {
|
||||
if (unicharset.get_isdigit(s, *lengths)) {
|
||||
prev_digit = true;
|
||||
} else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
|
||||
prev_digit = false;
|
||||
} else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
|
||||
((*s == '%') || (*s == ')'))) {
|
||||
return true;
|
||||
} else if (prev_digit && *lengths == 1 && (*s == '%') &&
|
||||
(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
|
||||
(*(s + *lengths + *(lengths + 1)) == '\0')) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace tesseract
|
37
3rdparty/tesseract_ocr/tesseract/src/ccmain/output.h
vendored
Normal file
37
3rdparty/tesseract_ocr/tesseract/src/ccmain/output.h
vendored
Normal file
|
@ -0,0 +1,37 @@
|
|||
/******************************************************************
|
||||
* File: output.h (Formerly output.h)
|
||||
* Description: Output pass
|
||||
* Author: Phil Cheatle
|
||||
* Created: Thu Aug 4 10:56:08 BST 1994
|
||||
*
|
||||
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef OUTPUT_H
|
||||
#define OUTPUT_H
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOCK;
|
||||
class WERD;
|
||||
|
||||
/** test line ends */
|
||||
char determine_newline_type(WERD *word, ///< word to do
|
||||
BLOCK *block, ///< current block
|
||||
WERD *next_word, ///< next word
|
||||
BLOCK *next_block ///< block of next word
|
||||
);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
652
3rdparty/tesseract_ocr/tesseract/src/ccmain/pageiterator.cpp
vendored
Normal file
652
3rdparty/tesseract_ocr/tesseract/src/ccmain/pageiterator.cpp
vendored
Normal file
|
@ -0,0 +1,652 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pageiterator.cpp
|
||||
// Description: Iterator for tesseract page structure that avoids using
|
||||
// tesseract internal data structures.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/pageiterator.h>
|
||||
#include "helpers.h"
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
 * Builds an iterator over page_res and positions it at the start of the page.
 * The scale, resolution and rectangle arguments are stored as given; a fresh
 * PAGE_RES_IT is allocated and owned by this object.
 */
PageIterator::PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres,
                           int rect_left, int rect_top, int rect_width, int rect_height)
    : page_res_(page_res),
      tesseract_(tesseract),
      word_(nullptr),
      word_length_(0),
      blob_index_(0),
      cblob_it_(nullptr),
      include_upper_dots_(false),
      include_lower_dots_(false),
      scale_(scale),
      scaled_yres_(scaled_yres),
      rect_left_(rect_left),
      rect_top_(rect_top),
      rect_width_(rect_width),
      rect_height_(rect_height) {
  it_ = new PAGE_RES_IT(page_res);
  // Explicitly qualified call to this class's own Begin().
  PageIterator::Begin();
}
|
||||
|
||||
/** Releases the two iterators owned by this object. */
PageIterator::~PageIterator() {
  delete it_;
  delete cblob_it_;
}
|
||||
|
||||
/**
|
||||
* PageIterators may be copied! This makes it possible to iterate over
|
||||
* all the objects at a lower level, while maintaining an iterator to
|
||||
* objects at a higher level.
|
||||
*/
|
||||
PageIterator::PageIterator(const PageIterator &src)
|
||||
: page_res_(src.page_res_)
|
||||
, tesseract_(src.tesseract_)
|
||||
, word_(nullptr)
|
||||
, word_length_(src.word_length_)
|
||||
, blob_index_(src.blob_index_)
|
||||
, cblob_it_(nullptr)
|
||||
, include_upper_dots_(src.include_upper_dots_)
|
||||
, include_lower_dots_(src.include_lower_dots_)
|
||||
, scale_(src.scale_)
|
||||
, scaled_yres_(src.scaled_yres_)
|
||||
, rect_left_(src.rect_left_)
|
||||
, rect_top_(src.rect_top_)
|
||||
, rect_width_(src.rect_width_)
|
||||
, rect_height_(src.rect_height_) {
|
||||
it_ = new PAGE_RES_IT(*src.it_);
|
||||
BeginWord(src.blob_index_);
|
||||
}
|
||||
|
||||
/**
 * Copy assignment: takes over the source's settings, replaces the owned
 * PAGE_RES_IT with a clone of the source's, and calls BeginWord() with the
 * source's blob index.
 *
 * Safe for self-assignment: the clone is made before the old iterator is
 * deleted, so src.it_ is never read after being freed. If the allocation
 * throws, this object is left unmodified.
 */
const PageIterator &PageIterator::operator=(const PageIterator &src) {
  // Clone first; for `x = x` src.it_ and it_ are the same object.
  PAGE_RES_IT *cloned_it = new PAGE_RES_IT(*src.it_);

  page_res_ = src.page_res_;
  tesseract_ = src.tesseract_;
  include_upper_dots_ = src.include_upper_dots_;
  include_lower_dots_ = src.include_lower_dots_;
  scale_ = src.scale_;
  scaled_yres_ = src.scaled_yres_;
  rect_left_ = src.rect_left_;
  rect_top_ = src.rect_top_;
  rect_width_ = src.rect_width_;
  rect_height_ = src.rect_height_;

  delete it_;
  it_ = cloned_it;
  BeginWord(src.blob_index_);
  return *this;
}
|
||||
|
||||
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT *other) const {
|
||||
return (it_ == nullptr && it_ == other) ||
|
||||
((other != nullptr) && (it_ != nullptr) && (*it_ == *other));
|
||||
}
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
|
||||
/** Resets the iterator to point to the start of the page. */
|
||||
void PageIterator::Begin() {
|
||||
it_->restart_page_with_empties();
|
||||
BeginWord(0);
|
||||
}
|
||||
|
||||
void PageIterator::RestartParagraph() {
|
||||
if (it_->block() == nullptr) {
|
||||
return; // At end of the document.
|
||||
}
|
||||
PAGE_RES_IT para(page_res_);
|
||||
PAGE_RES_IT next_para(para);
|
||||
next_para.forward_paragraph();
|
||||
while (next_para.cmp(*it_) <= 0) {
|
||||
para = next_para;
|
||||
next_para.forward_paragraph();
|
||||
}
|
||||
*it_ = para;
|
||||
BeginWord(0);
|
||||
}
|
||||
|
||||
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
|
||||
PageIterator p_start(*this);
|
||||
p_start.RestartParagraph();
|
||||
return p_start.it_->row() == it_->row();
|
||||
}
|
||||
|
||||
/** Moves the iterator back to the start of the current row. */
void PageIterator::RestartRow() {
  it_->restart_row();
  BeginWord(0); // Reset the blob state for the word now under the iterator.
}
|
||||
|
||||
/**
|
||||
* Moves to the start of the next object at the given level in the
|
||||
* page hierarchy, and returns false if the end of the page was reached.
|
||||
* NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
|
||||
* non-text block at least once.
|
||||
* Think of non text blocks as containing a single para, with at least one
|
||||
* line, with a single imaginary word, containing a single symbol.
|
||||
* The bounding boxes mark out any polygonal nature of the block, and
|
||||
* PTIsTextType(BLockType()) is false for non-text blocks.
|
||||
* Calls to Next with different levels may be freely intermixed.
|
||||
* This function iterates words in right-to-left scripts correctly, if
|
||||
* the appropriate language has been loaded into Tesseract.
|
||||
*/
|
||||
bool PageIterator::Next(PageIteratorLevel level) {
|
||||
if (it_->block() == nullptr) {
|
||||
return false; // Already at the end!
|
||||
}
|
||||
if (it_->word() == nullptr) {
|
||||
level = RIL_BLOCK;
|
||||
}
|
||||
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
it_->forward_block();
|
||||
break;
|
||||
case RIL_PARA:
|
||||
it_->forward_paragraph();
|
||||
break;
|
||||
case RIL_TEXTLINE:
|
||||
for (it_->forward_with_empties(); it_->row() == it_->prev_row();
|
||||
it_->forward_with_empties()) {
|
||||
;
|
||||
}
|
||||
break;
|
||||
case RIL_WORD:
|
||||
it_->forward_with_empties();
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
if (cblob_it_ != nullptr) {
|
||||
cblob_it_->forward();
|
||||
}
|
||||
++blob_index_;
|
||||
if (blob_index_ >= word_length_) {
|
||||
it_->forward_with_empties();
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
BeginWord(0);
|
||||
return it_->block() != nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the iterator is at the start of an object at the given
|
||||
* level. Possible uses include determining if a call to Next(RIL_WORD)
|
||||
* moved to the start of a RIL_PARA.
|
||||
*/
|
||||
bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
|
||||
if (it_->block() == nullptr) {
|
||||
return false; // Already at the end!
|
||||
}
|
||||
if (it_->word() == nullptr) {
|
||||
return true; // In an image block.
|
||||
}
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
return blob_index_ == 0 && it_->block() != it_->prev_block();
|
||||
case RIL_PARA:
|
||||
return blob_index_ == 0 && (it_->block() != it_->prev_block() ||
|
||||
it_->row()->row->para() != it_->prev_row()->row->para());
|
||||
case RIL_TEXTLINE:
|
||||
return blob_index_ == 0 && it_->row() != it_->prev_row();
|
||||
case RIL_WORD:
|
||||
return blob_index_ == 0;
|
||||
case RIL_SYMBOL:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the iterator is positioned at the last element in a
|
||||
* given level. (e.g. the last word in a line, the last line in a block)
|
||||
*/
|
||||
bool PageIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
|
||||
if (Empty(element)) {
|
||||
return true; // Already at the end!
|
||||
}
|
||||
// The result is true if we step forward by element and find we are
|
||||
// at the the end of the page or at beginning of *all* levels in:
|
||||
// [level, element).
|
||||
// When there is more than one level difference between element and level,
|
||||
// we could for instance move forward one symbol and still be at the first
|
||||
// word on a line, so we also have to be at the first symbol in a word.
|
||||
PageIterator next(*this);
|
||||
next.Next(element);
|
||||
if (next.Empty(element)) {
|
||||
return true; // Reached the end of the page.
|
||||
}
|
||||
while (element > level) {
|
||||
element = static_cast<PageIteratorLevel>(element - 1);
|
||||
if (!next.IsAtBeginningOf(element)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether this iterator is positioned
|
||||
* before other: -1
|
||||
* equal to other: 0
|
||||
* after other: 1
|
||||
*/
|
||||
int PageIterator::Cmp(const PageIterator &other) const {
|
||||
int word_cmp = it_->cmp(*other.it_);
|
||||
if (word_cmp != 0) {
|
||||
return word_cmp;
|
||||
}
|
||||
if (blob_index_ < other.blob_index_) {
|
||||
return -1;
|
||||
}
|
||||
if (blob_index_ == other.blob_index_) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
// Coordinate system:
|
||||
// Integer coordinates are at the cracks between the pixels.
|
||||
// The top-left corner of the top-left pixel in the image is at (0,0).
|
||||
// The bottom-right corner of the bottom-right pixel in the image is at
|
||||
// (width, height).
|
||||
// Every bounding box goes from the top-left of the top-left contained
|
||||
// pixel to the bottom-right of the bottom-right contained pixel, so
|
||||
// the bounding box of the single top-left pixel in the image is:
|
||||
// (0,0)->(1,1).
|
||||
// If an image rectangle has been set in the API, then returned coordinates
|
||||
// relate to the original (full) image, rather than the rectangle.
|
||||
|
||||
/**
|
||||
* Returns the bounding rectangle of the current object at the given level in
|
||||
* the coordinates of the working image that is pix_binary().
|
||||
* See comment on coordinate system above.
|
||||
* Returns false if there is no such object at the current position.
|
||||
*/
|
||||
bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right,
|
||||
int *bottom) const {
|
||||
if (Empty(level)) {
|
||||
return false;
|
||||
}
|
||||
TBOX box;
|
||||
PARA *para = nullptr;
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
box = it_->block()->block->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
|
||||
break;
|
||||
case RIL_PARA:
|
||||
para = it_->row()->row->para();
|
||||
// Fall through.
|
||||
case RIL_TEXTLINE:
|
||||
box = it_->row()->row->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
|
||||
break;
|
||||
case RIL_WORD:
|
||||
box = it_->word()->word->restricted_bounding_box(include_upper_dots_, include_lower_dots_);
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
if (cblob_it_ == nullptr) {
|
||||
box = it_->word()->box_word->BlobBox(blob_index_);
|
||||
} else {
|
||||
box = cblob_it_->data()->bounding_box();
|
||||
}
|
||||
}
|
||||
if (level == RIL_PARA) {
|
||||
PageIterator other = *this;
|
||||
other.Begin();
|
||||
do {
|
||||
if (other.it_->block() && other.it_->block()->block == it_->block()->block &&
|
||||
other.it_->row() && other.it_->row()->row && other.it_->row()->row->para() == para) {
|
||||
box = box.bounding_union(other.it_->row()->row->bounding_box());
|
||||
}
|
||||
} while (other.Next(RIL_TEXTLINE));
|
||||
}
|
||||
if (level != RIL_SYMBOL || cblob_it_ != nullptr) {
|
||||
box.rotate(it_->block()->block->re_rotation());
|
||||
}
|
||||
// Now we have a box in tesseract coordinates relative to the image rectangle,
|
||||
// we have to convert the coords to a top-down system.
|
||||
const int pix_height = pixGetHeight(tesseract_->pix_binary());
|
||||
const int pix_width = pixGetWidth(tesseract_->pix_binary());
|
||||
*left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
|
||||
*top = ClipToRange(pix_height - box.top(), 0, pix_height);
|
||||
*right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
|
||||
*bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the bounding rectangle of the current object at the given level in
|
||||
* coordinates of the original image.
|
||||
* See comment on coordinate system above.
|
||||
* Returns false if there is no such object at the current position.
|
||||
*/
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
|
||||
int *bottom) const {
|
||||
return BoundingBox(level, 0, left, top, right, bottom);
|
||||
}
|
||||
|
||||
/**
 * Returns the bounding rectangle of the current object at the given level,
 * grown by padding on every side, in coordinates of the original image.
 * The result is clipped to the image rectangle
 * [rect_left_, rect_left_ + rect_width_] x [rect_top_, rect_top_ + rect_height_].
 * Returns false if there is no such object at the current position.
 */
bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding, int *left, int *top,
                               int *right, int *bottom) const {
  if (!BoundingBoxInternal(level, left, top, right, bottom)) {
    return false;
  }
  // Convert to the coordinate system of the original image.
  const auto x_limit = rect_left_ + rect_width_;
  const auto y_limit = rect_top_ + rect_height_;
  *left = ClipToRange(*left / scale_ + rect_left_ - padding, rect_left_, x_limit);
  *top = ClipToRange(*top / scale_ + rect_top_ - padding, rect_top_, y_limit);
  // The far edges round up when dividing by the scale and are kept no
  // smaller than the near edges computed above.
  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding, *left, x_limit);
  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding, *top, y_limit);
  return true;
}
|
||||
|
||||
/** Return that there is no such object at a given level. */
|
||||
bool PageIterator::Empty(PageIteratorLevel level) const {
|
||||
if (it_->block() == nullptr) {
|
||||
return true; // Already at the end!
|
||||
}
|
||||
if (it_->word() == nullptr && level != RIL_BLOCK) {
|
||||
return true; // image block
|
||||
}
|
||||
if (level == RIL_SYMBOL && blob_index_ >= word_length_) {
|
||||
return true; // Zero length word, or already at the end of it.
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns the type of the current block.
|
||||
* See tesseract/publictypes.h for PolyBlockType. */
|
||||
PolyBlockType PageIterator::BlockType() const {
|
||||
if (it_->block() == nullptr || it_->block()->block == nullptr) {
|
||||
return PT_UNKNOWN; // Already at the end!
|
||||
}
|
||||
if (it_->block()->block->pdblk.poly_block() == nullptr) {
|
||||
return PT_FLOWING_TEXT; // No layout analysis used - assume text.
|
||||
}
|
||||
return it_->block()->block->pdblk.poly_block()->isA();
|
||||
}
|
||||
|
||||
/** Returns the polygon outline of the current block. The returned Pta must
|
||||
* be ptaDestroy-ed after use. */
|
||||
Pta *PageIterator::BlockPolygon() const {
|
||||
if (it_->block() == nullptr || it_->block()->block == nullptr) {
|
||||
return nullptr; // Already at the end!
|
||||
}
|
||||
if (it_->block()->block->pdblk.poly_block() == nullptr) {
|
||||
return nullptr; // No layout analysis used - no polygon.
|
||||
}
|
||||
// Copy polygon, so we can unrotate it to image coordinates.
|
||||
POLY_BLOCK *internal_poly = it_->block()->block->pdblk.poly_block();
|
||||
ICOORDELT_LIST vertices;
|
||||
vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
|
||||
POLY_BLOCK poly(&vertices, internal_poly->isA());
|
||||
poly.rotate(it_->block()->block->re_rotation());
|
||||
ICOORDELT_IT it(poly.points());
|
||||
Pta *pta = ptaCreate(it.length());
|
||||
int num_pts = 0;
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++num_pts) {
|
||||
ICOORD *pt = it.data();
|
||||
// Convert to top-down coords within the input image.
|
||||
int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
|
||||
int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
|
||||
x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);
|
||||
y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);
|
||||
ptaAddPt(pta, x, y);
|
||||
}
|
||||
return pta;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a binary image of the current object at the given level.
|
||||
* The position and size match the return from BoundingBoxInternal, and so this
|
||||
* could be upscaled with respect to the original input image.
|
||||
* Use pixDestroy to delete the image after use.
|
||||
* The following methods are used to generate the images:
|
||||
* RIL_BLOCK: mask the page image with the block polygon.
|
||||
* RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
|
||||
* TODO(rays) fix this to generate and use a line polygon.
|
||||
* RIL_WORD: Clip the rectangle of the word box from the page image.
|
||||
* RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
|
||||
* to recognition) or the bounding box otherwise.
|
||||
* A reconstruction of the original image (using xor to check for double
|
||||
* representation) should be reasonably accurate,
|
||||
* apart from removed noise, at the block level. Below the block level, the
|
||||
* reconstruction will be missing images and line separators.
|
||||
* At the symbol level, kerned characters will be invade the bounding box
|
||||
* if rendered after recognition, making an xor reconstruction inaccurate, but
|
||||
* an or construction better. Before recognition, symbol-level reconstruction
|
||||
* should be good, even with xor, since the images come from the connected
|
||||
* components.
|
||||
*/
|
||||
Pix *PageIterator::GetBinaryImage(PageIteratorLevel level) const {
|
||||
int left, top, right, bottom;
|
||||
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (level == RIL_SYMBOL && cblob_it_ != nullptr && cblob_it_->data()->area() != 0) {
|
||||
return cblob_it_->data()->render();
|
||||
}
|
||||
Box *box = boxCreate(left, top, right - left, bottom - top);
|
||||
Image pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
|
||||
boxDestroy(&box);
|
||||
if (level == RIL_BLOCK || level == RIL_PARA) {
|
||||
// Clip to the block polygon as well.
|
||||
TBOX mask_box;
|
||||
Image mask = it_->block()->block->render_mask(&mask_box);
|
||||
int mask_x = left - mask_box.left();
|
||||
int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
|
||||
// AND the mask and pix, putting the result in pix.
|
||||
pixRasterop(pix, std::max(0, -mask_x), std::max(0, -mask_y), pixGetWidth(pix),
|
||||
pixGetHeight(pix), PIX_SRC & PIX_DST, mask, std::max(0, mask_x),
|
||||
std::max(0, mask_y));
|
||||
mask.destroy();
|
||||
}
|
||||
return pix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an image of the current object at the given level in greyscale
|
||||
* if available in the input. To guarantee a binary image use BinaryImage.
|
||||
* NOTE that in order to give the best possible image, the bounds are
|
||||
* expanded slightly over the binary connected component, by the supplied
|
||||
* padding, so the top-left position of the returned image is returned
|
||||
* in (left,top). These will most likely not match the coordinates
|
||||
* returned by BoundingBox.
|
||||
* If you do not supply an original image, you will get a binary one.
|
||||
* Use pixDestroy to delete the image after use.
|
||||
*/
|
||||
Pix *PageIterator::GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left,
|
||||
int *top) const {
|
||||
int right, bottom;
|
||||
if (!BoundingBox(level, left, top, &right, &bottom)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (original_img == nullptr) {
|
||||
return GetBinaryImage(level);
|
||||
}
|
||||
|
||||
// Expand the box.
|
||||
*left = std::max(*left - padding, 0);
|
||||
*top = std::max(*top - padding, 0);
|
||||
right = std::min(right + padding, rect_width_);
|
||||
bottom = std::min(bottom + padding, rect_height_);
|
||||
Box *box = boxCreate(*left, *top, right - *left, bottom - *top);
|
||||
Image grey_pix = pixClipRectangle(original_img, box, nullptr);
|
||||
boxDestroy(&box);
|
||||
if (level == RIL_BLOCK || level == RIL_PARA) {
|
||||
// Clip to the block polygon as well.
|
||||
TBOX mask_box;
|
||||
Image mask = it_->block()->block->render_mask(&mask_box);
|
||||
// Copy the mask registered correctly into an image the size of grey_pix.
|
||||
int mask_x = *left - mask_box.left();
|
||||
int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
|
||||
int width = pixGetWidth(grey_pix);
|
||||
int height = pixGetHeight(grey_pix);
|
||||
Image resized_mask = pixCreate(width, height, 1);
|
||||
pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height, PIX_SRC,
|
||||
mask, std::max(0, mask_x), std::max(0, mask_y));
|
||||
mask.destroy();
|
||||
pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, 2 * padding + 1);
|
||||
pixInvert(resized_mask, resized_mask);
|
||||
pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
|
||||
resized_mask.destroy();
|
||||
}
|
||||
return grey_pix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the baseline of the current object at the given level.
|
||||
* The baseline is the line that passes through (x1, y1) and (x2, y2).
|
||||
* WARNING: with vertical text, baselines may be vertical!
|
||||
*/
|
||||
bool PageIterator::Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const {
|
||||
if (it_->word() == nullptr) {
|
||||
return false; // Already at the end!
|
||||
}
|
||||
ROW *row = it_->row()->row;
|
||||
WERD *word = it_->word()->word;
|
||||
TBOX box =
|
||||
(level == RIL_WORD || level == RIL_SYMBOL) ? word->bounding_box() : row->bounding_box();
|
||||
int left = box.left();
|
||||
ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));
|
||||
int right = box.right();
|
||||
ICOORD endpt(right, static_cast<int16_t>(row->base_line(right) + 0.5));
|
||||
// Rotate to image coordinates and convert to global image coords.
|
||||
startpt.rotate(it_->block()->block->re_rotation());
|
||||
endpt.rotate(it_->block()->block->re_rotation());
|
||||
*x1 = startpt.x() / scale_ + rect_left_;
|
||||
*y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
|
||||
*x2 = endpt.x() / scale_ + rect_left_;
|
||||
*y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Reports the orientation of the current block.
 * @param orientation       receives which way "up" in the text points in
 *                          the image.
 * @param writing_direction receives top-to-bottom for vertical text,
 *                          otherwise right-to-left or left-to-right as
 *                          reported by the block.
 * @param textline_order    receives right-to-left for vertical text,
 *                          otherwise top-to-bottom.
 * @param deskew_angle      receives the negated angle of the block's skew
 *                          vector.
 * If there is no current block (iterator at the end of the page) nothing
 * can be measured, so defaults are returned: page up, left-to-right,
 * top-to-bottom and a deskew angle of zero.
 */
void PageIterator::Orientation(tesseract::Orientation *orientation,
                               tesseract::WritingDirection *writing_direction,
                               tesseract::TextlineOrder *textline_order,
                               float *deskew_angle) const {
  auto *block_res = it_->block();
  if (block_res == nullptr || block_res->block == nullptr) {
    // Past the end of the page: dereferencing the block here used to crash.
    *orientation = ORIENTATION_PAGE_UP;
    *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
    *textline_order = TEXTLINE_ORDER_TOP_TO_BOTTOM;
    *deskew_angle = 0.0F;
    return;
  }
  BLOCK *block = block_res->block;

  // Orientation
  // Take the "up" unit vector through the inverse classify rotation and the
  // re-rotation; its resulting direction selects the orientation.
  FCOORD up_in_image(0.0, 1.0);
  up_in_image.unrotate(block->classify_rotation());
  up_in_image.rotate(block->re_rotation());

  if (up_in_image.x() == 0.0F) {
    if (up_in_image.y() > 0.0F) {
      *orientation = ORIENTATION_PAGE_UP;
    } else {
      *orientation = ORIENTATION_PAGE_DOWN;
    }
  } else if (up_in_image.x() > 0.0F) {
    *orientation = ORIENTATION_PAGE_RIGHT;
  } else {
    *orientation = ORIENTATION_PAGE_LEFT;
  }

  // Writing direction
  bool is_vertical_text = (block->classify_rotation().x() == 0.0);
  bool right_to_left = block->right_to_left();
  *writing_direction = is_vertical_text ? WRITING_DIRECTION_TOP_TO_BOTTOM
                                        : (right_to_left ? WRITING_DIRECTION_RIGHT_TO_LEFT
                                                         : WRITING_DIRECTION_LEFT_TO_RIGHT);

  // Textline Order
  const bool is_mongolian = false; // TODO(eger): fix me
  *textline_order = is_vertical_text ? (is_mongolian ? TEXTLINE_ORDER_LEFT_TO_RIGHT
                                                     : TEXTLINE_ORDER_RIGHT_TO_LEFT)
                                     : TEXTLINE_ORDER_TOP_TO_BOTTOM;

  // Deskew angle
  FCOORD skew = block->skew(); // true horizontal for textlines
  *deskew_angle = -skew.angle();
}
|
||||
|
||||
/**
 * Reports the properties of the paragraph that contains the current row.
 * *just is always written. When the row, its paragraph or the paragraph's
 * model is missing, *just is JUSTIFICATION_UNKNOWN and the other outputs
 * are left untouched.
 * @param just              receives the model's justification.
 * @param is_list_item      receives the paragraph's is_list_item flag.
 * @param is_crown          receives the paragraph's
 *                          is_very_first_or_continuation flag.
 * @param first_line_indent receives the model's first indent minus its
 *                          body indent.
 */
void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just, bool *is_list_item,
                                 bool *is_crown, int *first_line_indent) const {
  *just = tesseract::JUSTIFICATION_UNKNOWN;
  auto *row_res = it_->row();
  if (row_res == nullptr || row_res->row == nullptr) {
    return;
  }
  PARA *para = row_res->row->para();
  if (para == nullptr || para->model == nullptr) {
    return;
  }

  *is_list_item = para->is_list_item;
  *is_crown = para->is_very_first_or_continuation;
  *first_line_indent = para->model->first_indent() - para->model->body_indent();
  *just = para->model->justification();
}
|
||||
|
||||
/**
|
||||
* Sets up the internal data for iterating the blobs of a new word, then
|
||||
* moves the iterator to the given offset.
|
||||
*/
|
||||
void PageIterator::BeginWord(int offset) {
|
||||
WERD_RES *word_res = it_->word();
|
||||
if (word_res == nullptr) {
|
||||
// This is a non-text block, so there is no word.
|
||||
word_length_ = 0;
|
||||
blob_index_ = 0;
|
||||
word_ = nullptr;
|
||||
return;
|
||||
}
|
||||
if (word_res->best_choice != nullptr) {
|
||||
// Recognition has been done, so we are using the box_word, which
|
||||
// is already baseline denormalized.
|
||||
word_length_ = word_res->best_choice->length();
|
||||
if (word_res->box_word != nullptr) {
|
||||
if (word_res->box_word->length() != word_length_) {
|
||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ", word_length_,
|
||||
word_res->best_choice->unichar_string().c_str(), word_res->box_word->length());
|
||||
word_res->box_word->bounding_box().print();
|
||||
}
|
||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
||||
}
|
||||
word_ = nullptr;
|
||||
// We will be iterating the box_word.
|
||||
delete cblob_it_;
|
||||
cblob_it_ = nullptr;
|
||||
} else {
|
||||
// No recognition yet, so a "symbol" is a cblob.
|
||||
word_ = word_res->word;
|
||||
ASSERT_HOST(word_->cblob_list() != nullptr);
|
||||
word_length_ = word_->cblob_list()->length();
|
||||
if (cblob_it_ == nullptr) {
|
||||
cblob_it_ = new C_BLOB_IT;
|
||||
}
|
||||
cblob_it_->set_to_list(word_->cblob_list());
|
||||
}
|
||||
for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
|
||||
if (cblob_it_ != nullptr) {
|
||||
cblob_it_->forward();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Stores blamer_bundle in the current word's blamer_bundle field.
 * Returns false, changing nothing, if there is no current word.
 */
bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
  auto *current_word = it_->word();
  if (current_word == nullptr) {
    return false;
  }
  current_word->blamer_bundle = blamer_bundle;
  return true;
}
|
||||
|
||||
} // namespace tesseract.
|
414
3rdparty/tesseract_ocr/tesseract/src/ccmain/pagesegmain.cpp
vendored
Normal file
414
3rdparty/tesseract_ocr/tesseract/src/ccmain/pagesegmain.cpp
vendored
Normal file
|
@ -0,0 +1,414 @@
|
|||
/**********************************************************************
|
||||
* File: pagesegmain.cpp
|
||||
* Description: Top-level page segmenter for Tesseract.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 2008, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifdef _WIN32
|
||||
# ifndef unlink
|
||||
# include <io.h>
|
||||
# endif
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif // _WIN32
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include <allheaders.h>
|
||||
#include "blobbox.h"
|
||||
#include "blread.h"
|
||||
#include "colfind.h"
|
||||
#include "debugpixa.h"
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include "equationdetect.h"
|
||||
#endif
|
||||
#include <tesseract/osdetect.h>
|
||||
#include "imagefind.h"
|
||||
#include "linefind.h"
|
||||
#include "makerow.h"
|
||||
#include "tabvector.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "tessvars.h"
|
||||
#include "textord.h"
|
||||
#include "tordmain.h"
|
||||
#include "wordseg.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Upper bound on the erosion loop used when removing an enclosing circle.
constexpr int kMaxCircleErosions = 8;
|
||||
|
||||
// Helper to remove an enclosing circle from an image.
|
||||
// If there isn't one, then the image will most likely get badly mangled.
|
||||
// The returned pix must be pixDestroyed after use. nullptr may be returned
|
||||
// if the image doesn't meet the trivial conditions that it uses to determine
|
||||
// success.
|
||||
static Image RemoveEnclosingCircle(Image pixs) {
|
||||
Image pixsi = pixInvert(nullptr, pixs);
|
||||
Image pixc = pixCreateTemplate(pixs);
|
||||
pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
|
||||
pixSeedfillBinary(pixc, pixc, pixsi, 4);
|
||||
pixInvert(pixc, pixc);
|
||||
pixsi.destroy();
|
||||
Image pixt = pixs & pixc;
|
||||
l_int32 max_count;
|
||||
pixCountConnComp(pixt, 8, &max_count);
|
||||
// The count has to go up before we start looking for the minimum.
|
||||
l_int32 min_count = INT32_MAX;
|
||||
Image pixout = nullptr;
|
||||
for (int i = 1; i < kMaxCircleErosions; i++) {
|
||||
pixt.destroy();
|
||||
pixErodeBrick(pixc, pixc, 3, 3);
|
||||
pixt = pixs & pixc;
|
||||
l_int32 count;
|
||||
pixCountConnComp(pixt, 8, &count);
|
||||
if (i == 1 || count > max_count) {
|
||||
max_count = count;
|
||||
min_count = count;
|
||||
} else if (count < min_count) {
|
||||
min_count = count;
|
||||
pixout.destroy();
|
||||
pixout = pixt.copy(); // Save the best.
|
||||
} else if (count >= min_count) {
|
||||
break; // We have passed by the best.
|
||||
}
|
||||
}
|
||||
pixt.destroy();
|
||||
pixc.destroy();
|
||||
return pixout;
|
||||
}
|
||||
|
||||
/**
|
||||
* Segment the page according to the current value of tessedit_pageseg_mode.
|
||||
* pix_binary_ is used as the source image and should not be nullptr.
|
||||
* On return the blocks list owns all the constructed page layout.
|
||||
*/
|
||||
int Tesseract::SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess,
|
||||
OSResults *osr) {
|
||||
ASSERT_HOST(pix_binary_ != nullptr);
|
||||
int width = pixGetWidth(pix_binary_);
|
||||
int height = pixGetHeight(pix_binary_);
|
||||
// Get page segmentation mode.
|
||||
auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
|
||||
// If a UNLV zone file can be found, use that instead of segmentation.
|
||||
if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
|
||||
std::string name = input_file;
|
||||
const char *lastdot = strrchr(name.c_str(), '.');
|
||||
if (lastdot != nullptr) {
|
||||
name[lastdot - name.c_str()] = '\0';
|
||||
}
|
||||
read_unlv_file(name, width, height, blocks);
|
||||
}
|
||||
if (blocks->empty()) {
|
||||
// No UNLV file present. Work according to the PageSegMode.
|
||||
// First make a single block covering the whole image.
|
||||
BLOCK_IT block_it(blocks);
|
||||
auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
|
||||
block->set_right_to_left(right_to_left());
|
||||
block_it.add_to_end(block);
|
||||
} else {
|
||||
// UNLV file present. Use PSM_SINGLE_BLOCK.
|
||||
pageseg_mode = PSM_SINGLE_BLOCK;
|
||||
}
|
||||
// The diacritic_blobs holds noise blobs that may be diacritics. They
|
||||
// are separated out on areas of the image that seem noisy and short-circuit
|
||||
// the layout process, going straight from the initial partition creation
|
||||
// right through to after word segmentation, where they are added to the
|
||||
// rej_cblobs list of the most appropriate word. From there classification
|
||||
// will determine whether they are used.
|
||||
BLOBNBOX_LIST diacritic_blobs;
|
||||
int auto_page_seg_ret_val = 0;
|
||||
TO_BLOCK_LIST to_blocks;
|
||||
if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
|
||||
PSM_SPARSE(pageseg_mode)) {
|
||||
auto_page_seg_ret_val =
|
||||
AutoPageSeg(pageseg_mode, blocks, &to_blocks,
|
||||
enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
|
||||
if (pageseg_mode == PSM_OSD_ONLY) {
|
||||
return auto_page_seg_ret_val;
|
||||
}
|
||||
// To create blobs from the image region bounds uncomment this line:
|
||||
// to_blocks.clear(); // Uncomment to go back to the old mode.
|
||||
} else {
|
||||
deskew_ = FCOORD(1.0f, 0.0f);
|
||||
reskew_ = FCOORD(1.0f, 0.0f);
|
||||
if (pageseg_mode == PSM_CIRCLE_WORD) {
|
||||
Image pixcleaned = RemoveEnclosingCircle(pix_binary_);
|
||||
if (pixcleaned != nullptr) {
|
||||
pix_binary_.destroy();
|
||||
pix_binary_ = pixcleaned;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (auto_page_seg_ret_val < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (blocks->empty()) {
|
||||
if (textord_debug_tabfind) {
|
||||
tprintf("Empty page\n");
|
||||
}
|
||||
return 0; // AutoPageSeg found an empty page.
|
||||
}
|
||||
bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
|
||||
bool cjk_mode = textord_use_cjk_fp_model;
|
||||
|
||||
textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
|
||||
pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
|
||||
return auto_page_seg_ret_val;
|
||||
}
|
||||
|
||||
/**
|
||||
* Auto page segmentation. Divide the page image into blocks of uniform
|
||||
* text linespacing and images.
|
||||
*
|
||||
* Resolution (in ppi) is derived from the input image.
|
||||
*
|
||||
* The output goes in the blocks list with corresponding TO_BLOCKs in the
|
||||
* to_blocks list.
|
||||
*
|
||||
* If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
|
||||
* the image into columns, but multiple blocks are still made if the text is
|
||||
* of non-uniform linespacing.
|
||||
*
|
||||
* If diacritic_blobs is non-null, then diacritics/noise blobs, that would
|
||||
* confuse layout analysis by causing textline overlap, are placed there,
|
||||
* with the expectation that they will be reassigned to words later and
|
||||
* noise/diacriticness determined via classification.
|
||||
*
|
||||
* If osd (orientation and script detection) is true then that is performed
|
||||
* as well. If only_osd is true, then only orientation and script detection is
|
||||
* performed. If osd is desired, (osd or only_osd) then osr_tess must be
|
||||
* another Tesseract that was initialized especially for osd, and the results
|
||||
* will be output into osr (orientation and script result).
|
||||
*/
|
||||
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
|
||||
BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) {
|
||||
Image photomask_pix = nullptr;
|
||||
Image musicmask_pix = nullptr;
|
||||
// The blocks made by the ColumnFinder. Moved to blocks before return.
|
||||
BLOCK_LIST found_blocks;
|
||||
TO_BLOCK_LIST temp_blocks;
|
||||
|
||||
ColumnFinder *finder = SetupPageSegAndDetectOrientation(
|
||||
pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
|
||||
pageseg_apply_music_mask ? &musicmask_pix : nullptr);
|
||||
int result = 0;
|
||||
if (finder != nullptr) {
|
||||
TO_BLOCK_IT to_block_it(&temp_blocks);
|
||||
TO_BLOCK *to_block = to_block_it.data();
|
||||
if (musicmask_pix != nullptr) {
|
||||
// TODO(rays) pass the musicmask_pix into FindBlocks and mark music
|
||||
// blocks separately. For now combine with photomask_pix.
|
||||
photomask_pix |= musicmask_pix;
|
||||
}
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
if (equ_detect_) {
|
||||
finder->SetEquationDetect(equ_detect_);
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
|
||||
photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
|
||||
&found_blocks, diacritic_blobs, to_blocks);
|
||||
if (result >= 0) {
|
||||
finder->GetDeskewVectors(&deskew_, &reskew_);
|
||||
}
|
||||
delete finder;
|
||||
}
|
||||
photomask_pix.destroy();
|
||||
musicmask_pix.destroy();
|
||||
if (result < 0) {
|
||||
return result;
|
||||
}
|
||||
|
||||
blocks->clear();
|
||||
BLOCK_IT block_it(blocks);
|
||||
// Move the found blocks to the input/output blocks.
|
||||
block_it.add_list_after(&found_blocks);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper adds all the scripts from sid_set converted to ids from osd_set to
|
||||
// allowed_ids.
|
||||
static void AddAllScriptsConverted(const UNICHARSET &sid_set, const UNICHARSET &osd_set,
|
||||
std::vector<int> *allowed_ids) {
|
||||
for (int i = 0; i < sid_set.get_script_table_size(); ++i) {
|
||||
if (i != sid_set.null_sid()) {
|
||||
const char *script = sid_set.get_script_from_script_id(i);
|
||||
allowed_ids->push_back(osd_set.get_script_id_from_name(script));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets up auto page segmentation, determines the orientation, and corrects it.
|
||||
* Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
|
||||
* facilitate testing.
|
||||
* photo_mask_pix is a pointer to a nullptr pointer that will be filled on
|
||||
* return with the leptonica photo mask, which must be pixDestroyed by the
|
||||
* caller. to_blocks is an empty list that will be filled with (usually a
|
||||
* single) block that is used during layout analysis. This ugly API is required
|
||||
* because of the possibility of a unlv zone file.
|
||||
* TODO(rays) clean this up.
|
||||
* See AutoPageSeg for other arguments.
|
||||
* The returned ColumnFinder must be deleted after use.
|
||||
*/
|
||||
ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,
|
||||
BLOCK_LIST *blocks, Tesseract *osd_tess,
|
||||
OSResults *osr, TO_BLOCK_LIST *to_blocks,
|
||||
Image *photo_mask_pix,
|
||||
Image *music_mask_pix) {
|
||||
int vertical_x = 0;
|
||||
int vertical_y = 1;
|
||||
TabVector_LIST v_lines;
|
||||
TabVector_LIST h_lines;
|
||||
ICOORD bleft(0, 0);
|
||||
|
||||
ASSERT_HOST(pix_binary_ != nullptr);
|
||||
if (tessedit_dump_pageseg_images) {
|
||||
pixa_debug_.AddPix(pix_binary_, "PageSegInput");
|
||||
}
|
||||
// Leptonica is used to find the rule/separator lines in the input.
|
||||
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
|
||||
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
|
||||
if (tessedit_dump_pageseg_images) {
|
||||
pixa_debug_.AddPix(pix_binary_, "NoLines");
|
||||
}
|
||||
// Leptonica is used to find a mask of the photo regions in the input.
|
||||
*photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
|
||||
if (tessedit_dump_pageseg_images) {
|
||||
Image pix_no_image_ = nullptr;
|
||||
if (*photo_mask_pix != nullptr) {
|
||||
pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
|
||||
} else {
|
||||
pix_no_image_ = pix_binary_.clone();
|
||||
}
|
||||
pixa_debug_.AddPix(pix_no_image_, "NoImages");
|
||||
pix_no_image_.destroy();
|
||||
}
|
||||
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
|
||||
v_lines.clear();
|
||||
}
|
||||
|
||||
// The rest of the algorithm uses the usual connected components.
|
||||
textord_.find_components(pix_binary_, blocks, to_blocks);
|
||||
|
||||
TO_BLOCK_IT to_block_it(to_blocks);
|
||||
// There must be exactly one input block.
|
||||
// TODO(rays) handle new textline finding with a UNLV zone file.
|
||||
ASSERT_HOST(to_blocks->singleton());
|
||||
TO_BLOCK *to_block = to_block_it.data();
|
||||
TBOX blkbox = to_block->block->pdblk.bounding_box();
|
||||
ColumnFinder *finder = nullptr;
|
||||
int estimated_resolution = source_resolution_;
|
||||
if (source_resolution_ == kMinCredibleResolution) {
|
||||
// Try to estimate resolution from typical body text size.
|
||||
int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
|
||||
if (res > estimated_resolution && res < kMaxCredibleResolution) {
|
||||
estimated_resolution = res;
|
||||
tprintf("Estimating resolution as %d\n", estimated_resolution);
|
||||
}
|
||||
}
|
||||
|
||||
if (to_block->line_size >= 2) {
|
||||
finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
|
||||
blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
|
||||
textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
|
||||
vertical_y);
|
||||
|
||||
finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
if (equ_detect_) {
|
||||
equ_detect_->LabelSpecialText(to_block);
|
||||
}
|
||||
|
||||
BLOBNBOX_CLIST osd_blobs;
|
||||
// osd_orientation is the number of 90 degree rotations to make the
|
||||
// characters upright. (See tesseract/osdetect.h for precise definition.)
|
||||
// We want the text lines horizontal, (vertical text indicates vertical
|
||||
// textlines) which may conflict (eg vertically written CJK).
|
||||
int osd_orientation = 0;
|
||||
bool vertical_text =
|
||||
textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
|
||||
if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
|
||||
vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
|
||||
&osd_blobs);
|
||||
}
|
||||
if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
|
||||
std::vector<int> osd_scripts;
|
||||
if (osd_tess != this) {
|
||||
// We are running osd as part of layout analysis, so constrain the
|
||||
// scripts to those allowed by *this.
|
||||
AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
|
||||
for (auto &lang : sub_langs_) {
|
||||
AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);
|
||||
}
|
||||
}
|
||||
os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
|
||||
if (pageseg_mode == PSM_OSD_ONLY) {
|
||||
delete finder;
|
||||
return nullptr;
|
||||
}
|
||||
osd_orientation = osr->best_result.orientation_id;
|
||||
double osd_score = osr->orientations[osd_orientation];
|
||||
double osd_margin = min_orientation_margin * 2;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
|
||||
osd_margin = osd_score - osr->orientations[i];
|
||||
}
|
||||
}
|
||||
int best_script_id = osr->best_result.script_id;
|
||||
const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
|
||||
bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
|
||||
best_script_id == osd_tess->unicharset.hiragana_sid() ||
|
||||
best_script_id == osd_tess->unicharset.katakana_sid() ||
|
||||
strcmp("Japanese", best_script_str) == 0 ||
|
||||
strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
|
||||
if (cjk) {
|
||||
finder->set_cjk_script(true);
|
||||
}
|
||||
if (osd_margin < min_orientation_margin) {
|
||||
// The margin is weak.
|
||||
if (!cjk && !vertical_text && osd_orientation == 2) {
|
||||
// upside down latin text is improbable with such a weak margin.
|
||||
tprintf(
|
||||
"OSD: Weak margin (%.2f), horiz textlines, not CJK: "
|
||||
"Don't rotate.\n",
|
||||
osd_margin);
|
||||
osd_orientation = 0;
|
||||
} else {
|
||||
tprintf(
|
||||
"OSD: Weak margin (%.2f) for %d blob text block, "
|
||||
"but using orientation anyway: %d\n",
|
||||
osd_margin, osd_blobs.length(), osd_orientation);
|
||||
}
|
||||
}
|
||||
}
|
||||
osd_blobs.shallow_clear();
|
||||
finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
}
|
||||
|
||||
return finder;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
42
3rdparty/tesseract_ocr/tesseract/src/ccmain/pagewalk.cpp
vendored
Normal file
42
3rdparty/tesseract_ocr/tesseract/src/ccmain/pagewalk.cpp
vendored
Normal file
|
@ -0,0 +1,42 @@
|
|||
/**********************************************************************
|
||||
* File: pagewalk.cpp (Formerly walkers.c)
|
||||
* Description: Block list processors
|
||||
* Author: Phil Cheatle
|
||||
* Created: Thu Oct 10 16:25:24 BST 1991
|
||||
*
|
||||
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
namespace tesseract {
|
||||
/**
|
||||
* @name process_selected_words()
|
||||
*
|
||||
* Walk the current block list applying the specified word processor function
|
||||
* to each word that overlaps the selection_box.
|
||||
*/
|
||||
void Tesseract::process_selected_words(
|
||||
PAGE_RES *page_res, // blocks to check
|
||||
TBOX &selection_box, bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) {
|
||||
for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; page_res_it.forward()) {
|
||||
WERD *word = page_res_it.word()->word;
|
||||
if (word->bounding_box().overlap(selection_box)) {
|
||||
if (!(this->*word_processor)(&page_res_it)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace tesseract
|
70
3rdparty/tesseract_ocr/tesseract/src/ccmain/par_control.cpp
vendored
Normal file
70
3rdparty/tesseract_ocr/tesseract/src/ccmain/par_control.cpp
vendored
Normal file
|
@ -0,0 +1,70 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: par_control.cpp
|
||||
// Description: Control code for parallel implementation.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "tesseractclass.h"
|
||||
#ifdef _OPENMP
|
||||
# include <omp.h>
|
||||
#endif // _OPENMP
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct BlobData {
|
||||
BlobData() = default;
|
||||
BlobData(int index, Tesseract *tess, const WERD_RES &word)
|
||||
: blob(word.chopped_word->blobs[index])
|
||||
, tesseract(tess)
|
||||
, choices(&(*word.ratings)(index, index)) {}
|
||||
|
||||
TBLOB *blob = nullptr;
|
||||
Tesseract *tesseract = nullptr;
|
||||
BLOB_CHOICE_LIST **choices = nullptr;
|
||||
};
|
||||
|
||||
void Tesseract::PrerecAllWordsPar(const std::vector<WordData> &words) {
|
||||
// Prepare all the blobs.
|
||||
std::vector<BlobData> blobs;
|
||||
for (const auto &w : words) {
|
||||
if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {
|
||||
for (int s = 0; s < w.lang_words.size(); ++s) {
|
||||
Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
|
||||
const WERD_RES &word = *w.lang_words[s];
|
||||
for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
|
||||
blobs.emplace_back(b, sub, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Pre-classify all the blobs.
|
||||
if (tessedit_parallelize > 1) {
|
||||
#ifdef _OPENMP
|
||||
# pragma omp parallel for num_threads(10)
|
||||
#endif // _OPENMP
|
||||
// NOLINTNEXTLINE(modernize-loop-convert)
|
||||
for (size_t b = 0; b < blobs.size(); ++b) {
|
||||
*blobs[b].choices =
|
||||
blobs[b].tesseract->classify_blob(blobs[b].blob, "par", ScrollView::WHITE, nullptr);
|
||||
}
|
||||
} else {
|
||||
// TODO(AMD) parallelize this.
|
||||
for (auto &blob : blobs) {
|
||||
*blob.choices = blob.tesseract->classify_blob(blob.blob, "par", ScrollView::WHITE, nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
2628
3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.cpp
vendored
Normal file
2628
3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
104
3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.h
vendored
Normal file
104
3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs.h
vendored
Normal file
|
@ -0,0 +1,104 @@
|
|||
/**********************************************************************
|
||||
* File: paragraphs.h
|
||||
* Description: Paragraph Detection data structures.
|
||||
* Author: David Eger
|
||||
* Created: 25 February 2011
|
||||
*
|
||||
* (C) Copyright 2011, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
|
||||
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include "rect.h" // for TBOX
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class MutableIterator;
|
||||
class ParagraphModel;
|
||||
class PARA_LIST;
|
||||
struct PARA;
|
||||
|
||||
// This structure captures all information needed about a text line for the
|
||||
// purposes of paragraph detection. It is meant to be exceedingly light-weight
|
||||
// so that we can easily test paragraph detection independent of the rest of
|
||||
// Tesseract.
|
||||
class RowInfo {
|
||||
public:
|
||||
// Constant data derived from Tesseract output.
|
||||
std::string text; // the full UTF-8 text of the line.
|
||||
bool ltr; // whether the majority of the text is left-to-right
|
||||
// TODO(eger) make this more fine-grained.
|
||||
|
||||
bool has_leaders; // does the line contain leader dots (.....)?
|
||||
bool has_drop_cap; // does the line have a drop cap?
|
||||
int pix_ldistance; // distance to the left pblock boundary in pixels
|
||||
int pix_rdistance; // distance to the right pblock boundary in pixels
|
||||
float pix_xheight; // guessed xheight for the line
|
||||
int average_interword_space; // average space between words in pixels.
|
||||
|
||||
int num_words;
|
||||
TBOX lword_box; // in normalized (horiz text rows) space
|
||||
TBOX rword_box; // in normalized (horiz text rows) space
|
||||
|
||||
std::string lword_text; // the UTF-8 text of the leftmost werd
|
||||
std::string rword_text; // the UTF-8 text of the rightmost werd
|
||||
|
||||
// The text of a paragraph typically starts with the start of an idea and
|
||||
// ends with the end of an idea. Here we define paragraph as something that
|
||||
// may have a first line indent and a body indent which may be different.
|
||||
// Typical words that start an idea are:
|
||||
// 1. Words in western scripts that start with
|
||||
// a capital letter, for example "The"
|
||||
// 2. Bulleted or numbered list items, for
|
||||
// example "2."
|
||||
// Typical words which end an idea are words ending in punctuation marks. In
|
||||
// this vocabulary, each list item is represented as a paragraph.
|
||||
bool lword_indicates_list_item;
|
||||
bool lword_likely_starts_idea;
|
||||
bool lword_likely_ends_idea;
|
||||
|
||||
bool rword_indicates_list_item;
|
||||
bool rword_likely_starts_idea;
|
||||
bool rword_likely_ends_idea;
|
||||
};
|
||||
|
||||
// Main entry point for Paragraph Detection Algorithm.
|
||||
//
|
||||
// Given a set of equally spaced textlines (described by row_infos),
|
||||
// Split them into paragraphs. See http://goto/paragraphstalk
|
||||
//
|
||||
// Output:
|
||||
// row_owners - one pointer for each row, to the paragraph it belongs to.
|
||||
// paragraphs - this is the actual list of PARA objects.
|
||||
// models - the list of paragraph models referenced by the PARA objects.
|
||||
// caller is responsible for deleting the models.
|
||||
TESS_API
|
||||
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<ParagraphModel *> *models);
|
||||
|
||||
// Given a MutableIterator to the start of a block, run DetectParagraphs on
|
||||
// that block and commit the results to the underlying ROW and BLOCK structs,
|
||||
// saving the ParagraphModels in models. Caller owns the models.
|
||||
// We use unicharset during the function to answer questions such as "is the
|
||||
// first letter of this word upper case?"
|
||||
TESS_API
|
||||
void DetectParagraphs(int debug_level, bool after_text_recognition,
|
||||
const MutableIterator *block_start, std::vector<ParagraphModel *> *models);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
|
309
3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs_internal.h
vendored
Normal file
309
3rdparty/tesseract_ocr/tesseract/src/ccmain/paragraphs_internal.h
vendored
Normal file
|
@ -0,0 +1,309 @@
|
|||
/**********************************************************************
|
||||
* File: paragraphs_internal.h
|
||||
* Description: Paragraph Detection internal data structures.
|
||||
* Author: David Eger
|
||||
*
|
||||
* (C) Copyright 2011, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
|
||||
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
|
||||
|
||||
#include <tesseract/publictypes.h> // for ParagraphJustification
|
||||
#include "paragraphs.h"
|
||||
|
||||
// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
|
||||
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class UNICHARSET;
|
||||
class WERD_CHOICE;
|
||||
|
||||
// Return whether the given word is likely to be a list item start word.
|
||||
TESS_API
|
||||
bool AsciiLikelyListItem(const std::string &word);
|
||||
|
||||
// Return the first Unicode Codepoint from werd[pos].
|
||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
||||
|
||||
// Set right word attributes given either a unicharset and werd or a utf8
|
||||
// string.
|
||||
TESS_API
|
||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||
bool *is_list, bool *starts_idea, bool *ends_idea);
|
||||
|
||||
// Set left word attributes given either a unicharset and werd or a utf8 string.
|
||||
TESS_API
|
||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||
bool *is_list, bool *starts_idea, bool *ends_idea);
|
||||
|
||||
// Role of a text line within a paragraph. Every enumerator is given a
// character value.
enum LineType {
  LT_START = 'S',    // The line opens a paragraph.
  LT_BODY = 'C',     // The line continues a paragraph.
  LT_UNKNOWN = 'U',  // Nothing is known about the line.
  LT_MULTIPLE = 'M', // The line matches both LT_START and LT_BODY.
};
|
||||
|
||||
// The first paragraph in a page of body text is often un-indented.
|
||||
// This is a typographic convention which is common to indicate either that:
|
||||
// (1) The paragraph is the continuation of a previous paragraph, or
|
||||
// (2) The paragraph is the first paragraph in a chapter.
|
||||
//
|
||||
// I refer to such paragraphs as "crown"s, and the output of the paragraph
|
||||
// detection algorithm attempts to give them the same paragraph model as
|
||||
// the rest of the body text.
|
||||
//
|
||||
// Nonetheless, while building hypotheses, it is useful to mark the lines
|
||||
// of crown paragraphs temporarily as crowns, either aligned left or right.
|
||||
extern const ParagraphModel *kCrownLeft;
|
||||
extern const ParagraphModel *kCrownRight;
|
||||
|
||||
inline bool StrongModel(const ParagraphModel *model) {
|
||||
return model != nullptr && model != kCrownLeft && model != kCrownRight;
|
||||
}
|
||||
|
||||
struct LineHypothesis {
|
||||
LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
|
||||
LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}
|
||||
LineHypothesis(const LineHypothesis &other) = default;
|
||||
|
||||
// Copy assignment operator.
|
||||
LineHypothesis &operator=(const LineHypothesis &other) = default;
|
||||
|
||||
bool operator==(const LineHypothesis &other) const {
|
||||
return ty == other.ty && model == other.model;
|
||||
}
|
||||
|
||||
LineType ty;
|
||||
const ParagraphModel *model;
|
||||
};
|
||||
|
||||
class ParagraphTheory; // Forward Declaration
|
||||
|
||||
using SetOfModels = std::vector<const ParagraphModel *>;
|
||||
|
||||
// Row Scratch Registers are data generated by the paragraph detection
|
||||
// algorithm based on a RowInfo input.
|
||||
class RowScratchRegisters {
|
||||
public:
|
||||
// We presume row will outlive us.
|
||||
void Init(const RowInfo &row);
|
||||
|
||||
LineType GetLineType() const;
|
||||
|
||||
LineType GetLineType(const ParagraphModel *model) const;
|
||||
|
||||
// Mark this as a start line type, sans model. This is useful for the
|
||||
// initial marking of probable body lines or paragraph start lines.
|
||||
void SetStartLine();
|
||||
|
||||
// Mark this as a body line type, sans model. This is useful for the
|
||||
// initial marking of probably body lines or paragraph start lines.
|
||||
void SetBodyLine();
|
||||
|
||||
// Record that this row fits as a paragraph start line in the given model,
|
||||
void AddStartLine(const ParagraphModel *model);
|
||||
// Record that this row fits as a paragraph body line in the given model,
|
||||
void AddBodyLine(const ParagraphModel *model);
|
||||
|
||||
// Clear all hypotheses about this line.
|
||||
void SetUnknown() {
|
||||
hypotheses_.clear();
|
||||
}
|
||||
|
||||
// Append all hypotheses of strong models that match this row as a start.
|
||||
void StartHypotheses(SetOfModels *models) const;
|
||||
|
||||
// Append all hypotheses of strong models matching this row.
|
||||
void StrongHypotheses(SetOfModels *models) const;
|
||||
|
||||
// Append all hypotheses for this row.
|
||||
void NonNullHypotheses(SetOfModels *models) const;
|
||||
|
||||
// Discard any hypotheses whose model is not in the given list.
|
||||
void DiscardNonMatchingHypotheses(const SetOfModels &models);
|
||||
|
||||
// If we have only one hypothesis and that is that this line is a paragraph
|
||||
// start line of a certain model, return that model. Else return nullptr.
|
||||
const ParagraphModel *UniqueStartHypothesis() const;
|
||||
|
||||
// If we have only one hypothesis and that is that this line is a paragraph
|
||||
// body line of a certain model, return that model. Else return nullptr.
|
||||
const ParagraphModel *UniqueBodyHypothesis() const;
|
||||
|
||||
// Return the indentation for the side opposite of the aligned side.
|
||||
int OffsideIndent(tesseract::ParagraphJustification just) const {
|
||||
switch (just) {
|
||||
case tesseract::JUSTIFICATION_RIGHT:
|
||||
return lindent_;
|
||||
case tesseract::JUSTIFICATION_LEFT:
|
||||
return rindent_;
|
||||
default:
|
||||
return lindent_ > rindent_ ? lindent_ : rindent_;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the indentation for the side the text is aligned to.
|
||||
int AlignsideIndent(tesseract::ParagraphJustification just) const {
|
||||
switch (just) {
|
||||
case tesseract::JUSTIFICATION_RIGHT:
|
||||
return rindent_;
|
||||
case tesseract::JUSTIFICATION_LEFT:
|
||||
return lindent_;
|
||||
default:
|
||||
return lindent_ > rindent_ ? lindent_ : rindent_;
|
||||
}
|
||||
}
|
||||
|
||||
// Append header fields to a vector of row headings.
|
||||
static void AppendDebugHeaderFields(std::vector<std::string> &header);
|
||||
|
||||
// Append data for this row to a vector of debug strings.
|
||||
void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
|
||||
|
||||
const RowInfo *ri_;
|
||||
|
||||
// These four constants form a horizontal box model for the white space
|
||||
// on the edges of each line. At each point in the algorithm, the following
|
||||
// shall hold:
|
||||
// ri_->pix_ldistance = lmargin_ + lindent_
|
||||
// ri_->pix_rdistance = rindent_ + rmargin_
|
||||
int lmargin_;
|
||||
int lindent_;
|
||||
int rindent_;
|
||||
int rmargin_;
|
||||
|
||||
private:
|
||||
// Hypotheses of either LT_START or LT_BODY
|
||||
std::vector<LineHypothesis> hypotheses_;
|
||||
};
|
||||
|
||||
// A collection of convenience functions for wrapping the set of
|
||||
// Paragraph Models we believe correctly model the paragraphs in the image.
|
||||
class ParagraphTheory {
|
||||
public:
|
||||
// We presume models will outlive us, and that models will take ownership
|
||||
// of any ParagraphModel *'s we add.
|
||||
explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {}
|
||||
std::vector<ParagraphModel *> &models() {
|
||||
return *models_;
|
||||
}
|
||||
const std::vector<ParagraphModel *> &models() const {
|
||||
return *models_;
|
||||
}
|
||||
|
||||
// Return an existing model if one that is Comparable() can be found.
|
||||
// Else, allocate a new copy of model to save and return a pointer to it.
|
||||
const ParagraphModel *AddModel(const ParagraphModel &model);
|
||||
|
||||
// Discard any models we've made that are not in the list of used models.
|
||||
void DiscardUnusedModels(const SetOfModels &used_models);
|
||||
|
||||
// Return the set of all non-centered models.
|
||||
void NonCenteredModels(SetOfModels *models);
|
||||
|
||||
// If any of the non-centered paragraph models we know about fit
|
||||
// rows[start, end), return it. Else nullptr.
|
||||
const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
|
||||
int end) const;
|
||||
|
||||
int IndexOf(const ParagraphModel *model) const;
|
||||
|
||||
private:
|
||||
std::vector<ParagraphModel *> *models_;
|
||||
std::vector<ParagraphModel *> models_we_added_;
|
||||
};
|
||||
|
||||
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
|
||||
const ParagraphModel *model);
|
||||
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
|
||||
const ParagraphModel *model);
|
||||
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
|
||||
const ParagraphModel *model);
|
||||
|
||||
// A class for smearing Paragraph Model hypotheses to surrounding rows.
|
||||
// The idea here is that StrongEvidenceClassify first marks only exceedingly
|
||||
// obvious start and body rows and constructs models of them. Thereafter,
|
||||
// we may have left over unmarked lines (mostly end-of-paragraph lines) which
|
||||
// were too short to have much confidence about, but which fit the models we've
|
||||
// constructed perfectly and which we ought to mark. This class is used to
|
||||
// "smear" our models over the text.
|
||||
class ParagraphModelSmearer {
|
||||
public:
|
||||
ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
ParagraphTheory *theory);
|
||||
|
||||
// Smear forward paragraph models from existing row markings to subsequent
|
||||
// text lines if they fit, and mark any thereafter still unmodeled rows
|
||||
// with any model in the theory that fits them.
|
||||
void Smear();
|
||||
|
||||
private:
|
||||
// Record in open_models_ for rows [start_row, end_row) the list of models
|
||||
// currently open at each row.
|
||||
// A model is still open in a row if some previous row has said model as a
|
||||
// start hypothesis, and all rows since (including this row) would fit as
|
||||
// either a body or start line in that model.
|
||||
void CalculateOpenModels(int row_start, int row_end);
|
||||
|
||||
SetOfModels &OpenModels(int row) {
|
||||
return open_models_[row - row_start_ + 1];
|
||||
}
|
||||
|
||||
ParagraphTheory *theory_;
|
||||
std::vector<RowScratchRegisters> *rows_;
|
||||
int row_start_;
|
||||
int row_end_;
|
||||
|
||||
// open_models_ corresponds to rows[start_row_ - 1, end_row_]
|
||||
//
|
||||
// open_models_: Contains models which there was an active (open) paragraph
|
||||
// as of the previous line and for which the left and right
|
||||
// indents admit the possibility that this text line continues
|
||||
// to fit the same model.
|
||||
// TODO(eger): Think about whether we can get rid of "Open" models and just
|
||||
// use the current hypotheses on RowScratchRegisters.
|
||||
std::vector<SetOfModels> open_models_;
|
||||
};
|
||||
|
||||
// Clear all hypotheses about lines [start, end) and reset the margins to the
|
||||
// percentile (0..100) value of the left and right row edges for this run of
|
||||
// rows.
|
||||
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
|
||||
int end, int percentile);
|
||||
|
||||
// Return the median inter-word space in rows[row_start, row_end).
|
||||
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
|
||||
|
||||
// Return whether the first word on the after line can fit in the space at
|
||||
// the end of the before line (knowing which way the text is aligned and read).
|
||||
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
|
||||
tesseract::ParagraphJustification justification);
|
||||
|
||||
// Return whether the first word on the after line can fit in the space at
|
||||
// the end of the before line (not knowing the text alignment).
|
||||
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
|
||||
|
||||
// Do rows[start, end) form a single instance of the given paragraph model?
|
||||
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
|
||||
const ParagraphModel *model);
|
||||
|
||||
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
|
||||
// normalize each row_owner to point to an actual PARA, and output the
|
||||
// paragraphs in order onto paragraphs.
|
||||
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
|
358
3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.cpp
vendored
Normal file
358
3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.cpp
vendored
Normal file
|
@ -0,0 +1,358 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: paramsd.cpp
|
||||
// Description: Tesseract parameter Editor
|
||||
// Author: Joern Wanke
|
||||
//
|
||||
// (C) Copyright 2007, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The parameters editor is used to edit all the parameters used within
|
||||
// tesseract from the ui.
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
|
||||
# include "params.h" // for ParamsVectors, StringParam, BoolParam
|
||||
# include "paramsd.h"
|
||||
# include "scrollview.h" // for SVEvent, ScrollView, SVET_POPUP
|
||||
# include "svmnode.h" // for SVMenuNode
|
||||
# include "tesseractclass.h" // for Tesseract
|
||||
|
||||
# include <cstdio> // for fclose, fopen, fprintf, sprintf, FILE
|
||||
# include <cstdlib> // for atoi
|
||||
# include <cstring> // for strcmp, strcspn, strlen, strncpy
|
||||
# include <locale> // for std::locale::classic
|
||||
# include <map> // for map, _Rb_tree_iterator, map<>::iterator
|
||||
# include <memory> // for unique_ptr
|
||||
# include <sstream> // for std::stringstream
|
||||
# include <utility> // for pair
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
# define VARDIR "configs/" /*parameters files */
|
||||
# define MAX_ITEMS_IN_SUBMENU 30
|
||||
|
||||
// The following variables should remain static globals, since they
|
||||
// are used by debug editor, which uses a single Tesseract instance.
|
||||
//
|
||||
// Contains the mappings from unique VC ids to their actual pointers.
|
||||
static std::map<int, ParamContent *> vcMap;
|
||||
static int nrParams = 0;
|
||||
static int writeCommands[2];
|
||||
|
||||
// Constructors for the various ParamTypes.
|
||||
ParamContent::ParamContent(tesseract::StringParam *it) {
|
||||
my_id_ = nrParams;
|
||||
nrParams++;
|
||||
param_type_ = VT_STRING;
|
||||
sIt = it;
|
||||
vcMap[my_id_] = this;
|
||||
}
|
||||
// Constructors for the various ParamTypes.
|
||||
ParamContent::ParamContent(tesseract::IntParam *it) {
|
||||
my_id_ = nrParams;
|
||||
nrParams++;
|
||||
param_type_ = VT_INTEGER;
|
||||
iIt = it;
|
||||
vcMap[my_id_] = this;
|
||||
}
|
||||
// Constructors for the various ParamTypes.
|
||||
ParamContent::ParamContent(tesseract::BoolParam *it) {
|
||||
my_id_ = nrParams;
|
||||
nrParams++;
|
||||
param_type_ = VT_BOOLEAN;
|
||||
bIt = it;
|
||||
vcMap[my_id_] = this;
|
||||
}
|
||||
// Constructors for the various ParamTypes.
|
||||
ParamContent::ParamContent(tesseract::DoubleParam *it) {
|
||||
my_id_ = nrParams;
|
||||
nrParams++;
|
||||
param_type_ = VT_DOUBLE;
|
||||
dIt = it;
|
||||
vcMap[my_id_] = this;
|
||||
}
|
||||
|
||||
// Gets a VC object identified by its ID.
|
||||
ParamContent *ParamContent::GetParamContentById(int id) {
|
||||
return vcMap[id];
|
||||
}
|
||||
|
||||
// Copy the first N words from the source string to the target string.
|
||||
// Words are delimited by "_".
|
||||
void ParamsEditor::GetFirstWords(const char *s, // source string
|
||||
int n, // number of words
|
||||
char *t // target string
|
||||
) {
|
||||
int full_length = strlen(s);
|
||||
int reqd_len = 0; // No. of chars requird
|
||||
const char *next_word = s;
|
||||
|
||||
while ((n > 0) && reqd_len < full_length) {
|
||||
reqd_len += strcspn(next_word, "_") + 1;
|
||||
next_word += reqd_len;
|
||||
n--;
|
||||
}
|
||||
strncpy(t, s, reqd_len);
|
||||
t[reqd_len] = '\0'; // ensure null terminal
|
||||
}
|
||||
|
||||
// Getter for the name.
|
||||
const char *ParamContent::GetName() const {
|
||||
if (param_type_ == VT_INTEGER) {
|
||||
return iIt->name_str();
|
||||
} else if (param_type_ == VT_BOOLEAN) {
|
||||
return bIt->name_str();
|
||||
} else if (param_type_ == VT_DOUBLE) {
|
||||
return dIt->name_str();
|
||||
} else if (param_type_ == VT_STRING) {
|
||||
return sIt->name_str();
|
||||
} else {
|
||||
return "ERROR: ParamContent::GetName()";
|
||||
}
|
||||
}
|
||||
|
||||
// Getter for the description.
|
||||
const char *ParamContent::GetDescription() const {
|
||||
if (param_type_ == VT_INTEGER) {
|
||||
return iIt->info_str();
|
||||
} else if (param_type_ == VT_BOOLEAN) {
|
||||
return bIt->info_str();
|
||||
} else if (param_type_ == VT_DOUBLE) {
|
||||
return dIt->info_str();
|
||||
} else if (param_type_ == VT_STRING) {
|
||||
return sIt->info_str();
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Getter for the value.
|
||||
std::string ParamContent::GetValue() const {
|
||||
std::string result;
|
||||
if (param_type_ == VT_INTEGER) {
|
||||
result += std::to_string(*iIt);
|
||||
} else if (param_type_ == VT_BOOLEAN) {
|
||||
result += std::to_string(*bIt);
|
||||
} else if (param_type_ == VT_DOUBLE) {
|
||||
result += std::to_string(*dIt);
|
||||
} else if (param_type_ == VT_STRING) {
|
||||
result = sIt->c_str();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Setter for the value.
|
||||
void ParamContent::SetValue(const char *val) {
|
||||
// TODO (wanke) Test if the values actually are properly converted.
|
||||
// (Quickly visible impacts?)
|
||||
changed_ = true;
|
||||
if (param_type_ == VT_INTEGER) {
|
||||
iIt->set_value(atoi(val));
|
||||
} else if (param_type_ == VT_BOOLEAN) {
|
||||
bIt->set_value(atoi(val));
|
||||
} else if (param_type_ == VT_DOUBLE) {
|
||||
std::stringstream stream(val);
|
||||
// Use "C" locale for reading double value.
|
||||
stream.imbue(std::locale::classic());
|
||||
double d = 0;
|
||||
stream >> d;
|
||||
dIt->set_value(d);
|
||||
} else if (param_type_ == VT_STRING) {
|
||||
sIt->set_value(val);
|
||||
}
|
||||
}
|
||||
|
||||
// Gets the up to the first 3 prefixes from s (split by _).
|
||||
// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
|
||||
void ParamsEditor::GetPrefixes(const char *s, std::string *level_one, std::string *level_two,
|
||||
std::string *level_three) {
|
||||
std::unique_ptr<char[]> p(new char[1024]);
|
||||
GetFirstWords(s, 1, p.get());
|
||||
*level_one = p.get();
|
||||
GetFirstWords(s, 2, p.get());
|
||||
*level_two = p.get();
|
||||
GetFirstWords(s, 3, p.get());
|
||||
*level_three = p.get();
|
||||
}
|
||||
|
||||
// Compare two VC objects by their name.
|
||||
int ParamContent::Compare(const void *v1, const void *v2) {
|
||||
const ParamContent *one = *static_cast<const ParamContent *const *>(v1);
|
||||
const ParamContent *two = *static_cast<const ParamContent *const *>(v2);
|
||||
return strcmp(one->GetName(), two->GetName());
|
||||
}
|
||||
|
||||
// Find all editable parameters used within tesseract and create a
|
||||
// SVMenuNode tree from it.
|
||||
// TODO (wanke): This is actually sort of hackish.
|
||||
SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
|
||||
auto *mr = new SVMenuNode();
|
||||
ParamContent_LIST vclist;
|
||||
ParamContent_IT vc_it(&vclist);
|
||||
// Amount counts the number of entries for a specific char*.
|
||||
// TODO(rays) get rid of the use of std::map.
|
||||
std::map<const char *, int> amount;
|
||||
|
||||
// Add all parameters to a list.
|
||||
int num_iterations = (tess->params() == nullptr) ? 1 : 2;
|
||||
for (int v = 0; v < num_iterations; ++v) {
|
||||
tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
|
||||
for (auto ¶m : vec->int_params) {
|
||||
vc_it.add_after_then_move(new ParamContent(param));
|
||||
}
|
||||
for (auto ¶m : vec->bool_params) {
|
||||
vc_it.add_after_then_move(new ParamContent(param));
|
||||
}
|
||||
for (auto ¶m : vec->string_params) {
|
||||
vc_it.add_after_then_move(new ParamContent(param));
|
||||
}
|
||||
for (auto ¶m : vec->double_params) {
|
||||
vc_it.add_after_then_move(new ParamContent(param));
|
||||
}
|
||||
}
|
||||
|
||||
// Count the # of entries starting with a specific prefix.
|
||||
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
||||
ParamContent *vc = vc_it.data();
|
||||
std::string tag;
|
||||
std::string tag2;
|
||||
std::string tag3;
|
||||
|
||||
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
||||
amount[tag.c_str()]++;
|
||||
amount[tag2.c_str()]++;
|
||||
amount[tag3.c_str()]++;
|
||||
}
|
||||
|
||||
vclist.sort(ParamContent::Compare); // Sort the list alphabetically.
|
||||
|
||||
SVMenuNode *other = mr->AddChild("OTHER");
|
||||
|
||||
// go through the list again and this time create the menu structure.
|
||||
vc_it.move_to_first();
|
||||
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
||||
ParamContent *vc = vc_it.data();
|
||||
std::string tag;
|
||||
std::string tag2;
|
||||
std::string tag3;
|
||||
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
||||
|
||||
if (amount[tag.c_str()] == 1) {
|
||||
other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
|
||||
} else { // More than one would use this submenu -> create submenu.
|
||||
SVMenuNode *sv = mr->AddChild(tag.c_str());
|
||||
if ((amount[tag.c_str()] <= MAX_ITEMS_IN_SUBMENU) || (amount[tag2.c_str()] <= 1)) {
|
||||
sv->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
|
||||
} else { // Make subsubmenus.
|
||||
SVMenuNode *sv2 = sv->AddChild(tag2.c_str());
|
||||
sv2->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(), vc->GetDescription());
|
||||
}
|
||||
}
|
||||
}
|
||||
return mr;
|
||||
}
|
||||
|
||||
// Event listener. Waits for SVET_POPUP events and processes them.
|
||||
void ParamsEditor::Notify(const SVEvent *sve) {
|
||||
if (sve->type == SVET_POPUP) { // only catch SVET_POPUP!
|
||||
char *param = sve->parameter;
|
||||
if (sve->command_id == writeCommands[0]) {
|
||||
WriteParams(param, false);
|
||||
} else if (sve->command_id == writeCommands[1]) {
|
||||
WriteParams(param, true);
|
||||
} else {
|
||||
ParamContent *vc = ParamContent::GetParamContentById(sve->command_id);
|
||||
vc->SetValue(param);
|
||||
sv_window_->AddMessage("Setting %s to %s", vc->GetName(), vc->GetValue().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Integrate the parameters editor as popupmenu into the existing scrollview
|
||||
// window (usually the pg editor). If sv == null, create a new empty
|
||||
// empty window and attach the parameters editor to that window (ugly).
|
||||
ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {
|
||||
if (sv == nullptr) {
|
||||
const char *name = "ParamEditorMAIN";
|
||||
sv = new ScrollView(name, 1, 1, 200, 200, 300, 200);
|
||||
}
|
||||
|
||||
sv_window_ = sv;
|
||||
|
||||
// Only one event handler per window.
|
||||
// sv->AddEventHandler((SVEventHandler*) this);
|
||||
|
||||
SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess);
|
||||
|
||||
std::string paramfile;
|
||||
paramfile = tess->datadir;
|
||||
paramfile += VARDIR; // parameters dir
|
||||
paramfile += "edited"; // actual name
|
||||
|
||||
SVMenuNode *std_menu = svMenuRoot->AddChild("Build Config File");
|
||||
|
||||
writeCommands[0] = nrParams + 1;
|
||||
std_menu->AddChild("All Parameters", writeCommands[0], paramfile.c_str(), "Config file name?");
|
||||
|
||||
writeCommands[1] = nrParams + 2;
|
||||
std_menu->AddChild("changed_ Parameters Only", writeCommands[1], paramfile.c_str(),
|
||||
"Config file name?");
|
||||
|
||||
svMenuRoot->BuildMenu(sv, false);
|
||||
}
|
||||
|
||||
// Write all (changed_) parameters to a config file.
|
||||
void ParamsEditor::WriteParams(char *filename, bool changes_only) {
|
||||
FILE *fp; // input file
|
||||
char msg_str[255];
|
||||
// if file exists
|
||||
if ((fp = fopen(filename, "rb")) != nullptr) {
|
||||
fclose(fp);
|
||||
sprintf(msg_str,
|
||||
"Overwrite file "
|
||||
"%s"
|
||||
"? (Y/N)",
|
||||
filename);
|
||||
int a = sv_window_->ShowYesNoDialog(msg_str);
|
||||
if (a == 'n') {
|
||||
return;
|
||||
} // don't write
|
||||
}
|
||||
|
||||
fp = fopen(filename, "wb"); // can we write to it?
|
||||
if (fp == nullptr) {
|
||||
sv_window_->AddMessage(
|
||||
"Can't write to file "
|
||||
"%s"
|
||||
"",
|
||||
filename);
|
||||
return;
|
||||
}
|
||||
for (auto &iter : vcMap) {
|
||||
ParamContent *cur = iter.second;
|
||||
if (!changes_only || cur->HasChanged()) {
|
||||
fprintf(fp, "%-25s %-12s # %s\n", cur->GetName(), cur->GetValue().c_str(),
|
||||
cur->GetDescription());
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // !GRAPHICS_DISABLED
|
130
3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.h
vendored
Normal file
130
3rdparty/tesseract_ocr/tesseract/src/ccmain/paramsd.h
vendored
Normal file
|
@ -0,0 +1,130 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: paramsd.h
|
||||
// Description: Tesseract parameter editor
|
||||
// Author: Joern Wanke
|
||||
//
|
||||
// (C) Copyright 2007, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Tesseract parameter editor is used to edit all the parameters used
|
||||
// within tesseract from the ui.
|
||||
#ifndef TESSERACT_CCMAIN_PARAMSD_H_
|
||||
#define TESSERACT_CCMAIN_PARAMSD_H_
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
|
||||
# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
|
||||
# include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class SVMenuNode;
|
||||
|
||||
class BoolParam;
|
||||
class DoubleParam;
|
||||
class IntParam;
|
||||
class StringParam;
|
||||
class Tesseract;
|
||||
|
||||
// A list of all possible parameter types used.
enum ParamType {
  VT_INTEGER, // tesseract::IntParam
  VT_BOOLEAN, // tesseract::BoolParam
  VT_STRING,  // tesseract::StringParam
  VT_DOUBLE   // tesseract::DoubleParam
};
|
||||
|
||||
// A rather hackish helper structure which can take any kind of parameter input
|
||||
// (defined by ParamType) and do a couple of common operations on them, like
|
||||
// comparisond or getting its value. It is used in the context of the
|
||||
// ParamsEditor as a bridge from the internal tesseract parameters to the
|
||||
// ones displayed by the ScrollView server.
|
||||
class ParamContent : public ELIST_LINK {
|
||||
public:
|
||||
// Compare two VC objects by their name.
|
||||
static int Compare(const void *v1, const void *v2);
|
||||
|
||||
// Gets a VC object identified by its ID.
|
||||
static ParamContent *GetParamContentById(int id);
|
||||
|
||||
// Constructors for the various ParamTypes.
|
||||
ParamContent() = default;
|
||||
explicit ParamContent(tesseract::StringParam *it);
|
||||
explicit ParamContent(tesseract::IntParam *it);
|
||||
explicit ParamContent(tesseract::BoolParam *it);
|
||||
explicit ParamContent(tesseract::DoubleParam *it);
|
||||
|
||||
// Getters and Setters.
|
||||
void SetValue(const char *val);
|
||||
std::string GetValue() const;
|
||||
const char *GetName() const;
|
||||
const char *GetDescription() const;
|
||||
|
||||
int GetId() const {
|
||||
return my_id_;
|
||||
}
|
||||
bool HasChanged() const {
|
||||
return changed_;
|
||||
}
|
||||
|
||||
private:
|
||||
// The unique ID of this VC object.
|
||||
int my_id_;
|
||||
// Whether the parameter was changed_ and thus needs to be rewritten.
|
||||
bool changed_ = false;
|
||||
// The actual ParamType of this VC object.
|
||||
ParamType param_type_;
|
||||
|
||||
union {
|
||||
tesseract::StringParam *sIt;
|
||||
tesseract::IntParam *iIt;
|
||||
tesseract::BoolParam *bIt;
|
||||
tesseract::DoubleParam *dIt;
|
||||
};
|
||||
};
|
||||
|
||||
ELISTIZEH(ParamContent)
|
||||
|
||||
// The parameters editor enables the user to edit all the parameters used within
|
||||
// tesseract. It can be invoked on its own, but is supposed to be invoked by
|
||||
// the program editor.
|
||||
class ParamsEditor : public SVEventHandler {
|
||||
public:
|
||||
// Integrate the parameters editor as popupmenu into the existing scrollview
|
||||
// window (usually the pg editor). If sv == null, create a new empty
|
||||
// empty window and attach the parameter editor to that window (ugly).
|
||||
explicit ParamsEditor(tesseract::Tesseract *, ScrollView *sv = nullptr);
|
||||
|
||||
// Event listener. Waits for SVET_POPUP events and processes them.
|
||||
void Notify(const SVEvent *sve) override;
|
||||
|
||||
private:
|
||||
// Gets the up to the first 3 prefixes from s (split by _).
|
||||
// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
|
||||
void GetPrefixes(const char *s, std::string *level_one, std::string *level_two, std::string *level_three);
|
||||
|
||||
// Gets the first n words (split by _) and puts them in t.
|
||||
// For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
|
||||
void GetFirstWords(const char *s, // source string
|
||||
int n, // number of words
|
||||
char *t); // target string
|
||||
|
||||
// Find all editable parameters used within tesseract and create a
|
||||
// SVMenuNode tree from it.
|
||||
SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
|
||||
|
||||
// Write all (changed_) parameters to a config file.
|
||||
void WriteParams(char *filename, bool changes_only);
|
||||
|
||||
ScrollView *sv_window_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
#endif // TESSERACT_CCMAIN_PARAMSD_H_
|
958
3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.cpp
vendored
Normal file
958
3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.cpp
vendored
Normal file
|
@ -0,0 +1,958 @@
|
|||
/**********************************************************************
|
||||
* File: pgedit.cpp (Formerly pgeditor.c)
|
||||
* Description: Page structure file editor
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
*(C) Copyright 1991, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0(the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http:// www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include "pgedit.h"
|
||||
|
||||
#include "blread.h"
|
||||
#include "control.h"
|
||||
#include "pageres.h"
|
||||
#include "paramsd.h"
|
||||
#include "scrollview.h"
|
||||
#include "statistc.h"
|
||||
#include "svmnode.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "tordmain.h"
|
||||
#include "werdit.h"
|
||||
|
||||
#include <cctype>
|
||||
#include <cmath>
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
namespace tesseract {
|
||||
# define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
|
||||
# define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
|
||||
# define BL_HEIGHT kBlnBaselineOffset
|
||||
# define DESC_HEIGHT 0
|
||||
|
||||
// Command ids attached to the menu entries created in build_menu_new(); the
// caption of each entry is noted next to its id. The numeric values follow
// the declaration order.
enum CMD_EVENTS {
  NULL_CMD_EVENT,
  CHANGE_DISP_CMD_EVENT,      // "Change Display"
  DUMP_WERD_CMD_EVENT,        // "Dump Word"
  SHOW_POINT_CMD_EVENT,       // "Show Point"
  SHOW_BLN_WERD_CMD_EVENT,    // "Show BL Norm Word"
  DEBUG_WERD_CMD_EVENT,       // "Config Words"
  BLAMER_CMD_EVENT,           // "Blamer"
  BOUNDING_BOX_CMD_EVENT,     // "Bounding Boxes"
  CORRECT_TEXT_CMD_EVENT,     // "Correct Text"
  POLYGONAL_CMD_EVENT,        // "Polygonal Approx"
  BL_NORM_CMD_EVENT,          // "Baseline Normalized"
  BITMAP_CMD_EVENT,           // "Edge Steps"
  IMAGE_CMD_EVENT,            // "Show Image"
  BLOCKS_CMD_EVENT,           // "ShowBlock Outlines"
  BASELINES_CMD_EVENT,        // "Show Baselines"
  UNIFORM_DISP_CMD_EVENT,     // "Uniform Display"
  REFRESH_CMD_EVENT,          // "Refresh Display"
  QUIT_CMD_EVENT,             // "Quit"
  RECOG_WERDS,                // "Recog Words"
  RECOG_PSEUDO,               // "Recog Blobs"
  SHOW_BLOB_FEATURES,         // "Show Blob Features"
  SHOW_SUBSCRIPT_CMD_EVENT,   // "Subscripts"
  SHOW_SUPERSCRIPT_CMD_EVENT, // "Superscripts"
  SHOW_ITALIC_CMD_EVENT,      // "Italics"
  SHOW_BOLD_CMD_EVENT,        // "Bold"
  SHOW_UNDERLINE_CMD_EVENT,   // "Underline"
  SHOW_FIXEDPITCH_CMD_EVENT,  // "FixedPitch"
  SHOW_SERIF_CMD_EVENT,       // "Serifs"
  SHOW_SMALLCAPS_CMD_EVENT,   // "SmallCaps"
  SHOW_DROPCAPS_CMD_EVENT,    // "DropCaps"
};
|
||||
|
||||
// Values of the file-level color_mode setting, which starts as CM_RAINBOW.
// NOTE(review): the names suggest each remaining mode highlights one text
// attribute - confirm against the code that reads color_mode.
enum ColorationMode {
  CM_RAINBOW,
  CM_SUBSCRIPT,
  CM_SUPERSCRIPT,
  CM_ITALIC,
  CM_BOLD,
  CM_UNDERLINE,
  CM_FIXEDPITCH,
  CM_SERIF,
  CM_SMALLCAPS,
  CM_DROPCAPS
};
|
||||
|
||||
/*
|
||||
*
|
||||
* Some global data
|
||||
*
|
||||
*/
|
||||
|
||||
static ScrollView *image_win;
|
||||
static ParamsEditor *pe;
|
||||
static bool stillRunning = false;
|
||||
|
||||
static ScrollView *bln_word_window = nullptr; // baseline norm words
|
||||
|
||||
static CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op
|
||||
|
||||
static bool recog_done = false; // recog_all_words was called
|
||||
|
||||
// These variables should remain global, since they are only used for the
|
||||
// debug mode (in which only a single Tesseract thread/instance will exist).
|
||||
static std::bitset<16> word_display_mode;
|
||||
static ColorationMode color_mode = CM_RAINBOW;
|
||||
static bool display_image = false;
|
||||
static bool display_blocks = false;
|
||||
static bool display_baselines = false;
|
||||
|
||||
static PAGE_RES *current_page_res = nullptr;
|
||||
|
||||
STRING_VAR(editor_image_win_name, "EditorImage", "Editor image window name");
|
||||
INT_VAR(editor_image_xpos, 590, "Editor image X Pos");
|
||||
INT_VAR(editor_image_ypos, 10, "Editor image Y Pos");
|
||||
static INT_VAR(editor_image_menuheight, 50, "Add to image height for menu bar");
|
||||
INT_VAR(editor_image_word_bb_color, ScrollView::BLUE, "Word bounding box colour");
|
||||
INT_VAR(editor_image_blob_bb_color, ScrollView::YELLOW, "Blob bounding box colour");
|
||||
INT_VAR(editor_image_text_color, ScrollView::WHITE, "Correct text colour");
|
||||
|
||||
STRING_VAR(editor_dbwin_name, "EditorDBWin", "Editor debug window name");
|
||||
INT_VAR(editor_dbwin_xpos, 50, "Editor debug window X Pos");
|
||||
INT_VAR(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
|
||||
INT_VAR(editor_dbwin_height, 24, "Editor debug window height");
|
||||
INT_VAR(editor_dbwin_width, 80, "Editor debug window width");
|
||||
|
||||
STRING_VAR(editor_word_name, "BlnWords", "BL normalized word window");
|
||||
INT_VAR(editor_word_xpos, 60, "Word window X Pos");
|
||||
INT_VAR(editor_word_ypos, 510, "Word window Y Pos");
|
||||
INT_VAR(editor_word_height, 240, "Word window height");
|
||||
INT_VAR(editor_word_width, 655, "Word window width");
|
||||
|
||||
/**
|
||||
* show_point()
|
||||
*
|
||||
* Show coords of point, blob bounding box, word bounding box and offset from
|
||||
* row baseline
|
||||
*/
|
||||
|
||||
static void show_point(PAGE_RES *page_res, float x, float y) {
|
||||
FCOORD pt(x, y);
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
|
||||
const int kBufsize = 512;
|
||||
char msg[kBufsize];
|
||||
char *msg_ptr = msg;
|
||||
|
||||
msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);
|
||||
|
||||
for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
|
||||
if (pr_it.row() != pr_it.prev_row() && pr_it.row()->row->bounding_box().contains(pt)) {
|
||||
msg_ptr += sprintf(msg_ptr, "BL(x)=%0.3f ", pr_it.row()->row->base_line(x));
|
||||
}
|
||||
if (word->word->bounding_box().contains(pt)) {
|
||||
TBOX box = word->word->bounding_box();
|
||||
msg_ptr += sprintf(msg_ptr, "Wd(%d, %d)/(%d, %d) ", box.left(), box.bottom(), box.right(),
|
||||
box.top());
|
||||
C_BLOB_IT cblob_it(word->word->cblob_list());
|
||||
for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) {
|
||||
C_BLOB *cblob = cblob_it.data();
|
||||
box = cblob->bounding_box();
|
||||
if (box.contains(pt)) {
|
||||
msg_ptr += sprintf(msg_ptr, "CBlb(%d, %d)/(%d, %d) ", box.left(), box.bottom(),
|
||||
box.right(), box.top());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
image_win->AddMessage(msg);
|
||||
}
|
||||
|
||||
/**
|
||||
* pgeditor_msg()
|
||||
*
|
||||
* Display a message - in the command window if there is one, or to stdout
|
||||
*/
|
||||
|
||||
static void pgeditor_msg( // message display
|
||||
const char *msg) {
|
||||
image_win->AddMessage(msg);
|
||||
}
|
||||
|
||||
class BlnEventHandler : public SVEventHandler {
|
||||
public:
|
||||
void Notify(const SVEvent *sv_event) override {
|
||||
if (sv_event->type == SVET_DESTROY) {
|
||||
bln_word_window = nullptr;
|
||||
} else if (sv_event->type == SVET_CLICK) {
|
||||
show_point(current_page_res, sv_event->x, sv_event->y);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* bln_word_window_handle()
|
||||
*
|
||||
* @return a WINDOW for the word window, creating it if necessary
|
||||
*/
|
||||
static ScrollView *bln_word_window_handle() { // return handle
|
||||
// not opened yet
|
||||
if (bln_word_window == nullptr) {
|
||||
pgeditor_msg("Creating BLN word window...");
|
||||
bln_word_window = new ScrollView(editor_word_name.c_str(), editor_word_xpos, editor_word_ypos,
|
||||
editor_word_width, editor_word_height, 4000, 4000, true);
|
||||
auto *a = new BlnEventHandler();
|
||||
bln_word_window->AddEventHandler(a);
|
||||
pgeditor_msg("Creating BLN word window...Done");
|
||||
}
|
||||
return bln_word_window;
|
||||
}
|
||||
|
||||
/**
|
||||
* build_image_window()
|
||||
*
|
||||
* Destroy the existing image window if there is one. Work out how big the
|
||||
* new window needs to be. Create it and re-display.
|
||||
*/
|
||||
|
||||
static void build_image_window(int width, int height) {
|
||||
delete image_win;
|
||||
image_win = new ScrollView(editor_image_win_name.c_str(), editor_image_xpos, editor_image_ypos,
|
||||
width + 1, height + editor_image_menuheight + 1, width, height, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* display_bln_lines()
|
||||
*
|
||||
* Display normalized baseline, x-height, ascender limit and descender limit
|
||||
*/
|
||||
|
||||
static void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor,
|
||||
float y_offset, float minx, float maxx) {
|
||||
window->Pen(colour);
|
||||
window->Line(minx, y_offset + scale_factor * DESC_HEIGHT, maxx,
|
||||
y_offset + scale_factor * DESC_HEIGHT);
|
||||
window->Line(minx, y_offset + scale_factor * BL_HEIGHT, maxx,
|
||||
y_offset + scale_factor * BL_HEIGHT);
|
||||
window->Line(minx, y_offset + scale_factor * X_HEIGHT, maxx, y_offset + scale_factor * X_HEIGHT);
|
||||
window->Line(minx, y_offset + scale_factor * ASC_HEIGHT, maxx,
|
||||
y_offset + scale_factor * ASC_HEIGHT);
|
||||
}
|
||||
|
||||
/**
|
||||
* notify()
|
||||
*
|
||||
* Event handler that processes incoming events, either forwarding
|
||||
* them to process_cmd_win_event or process_image_event.
|
||||
*
|
||||
*/
|
||||
|
||||
void PGEventHandler::Notify(const SVEvent *event) {
|
||||
char myval = '0';
|
||||
if (event->type == SVET_POPUP) {
|
||||
pe->Notify(event);
|
||||
} // These are handled by ParamsEditor
|
||||
else if (event->type == SVET_EXIT) {
|
||||
stillRunning = false;
|
||||
} else if (event->type == SVET_MENU) {
|
||||
if (strcmp(event->parameter, "true") == 0) {
|
||||
myval = 'T';
|
||||
} else if (strcmp(event->parameter, "false") == 0) {
|
||||
myval = 'F';
|
||||
}
|
||||
tess_->process_cmd_win_event(event->command_id, &myval);
|
||||
} else {
|
||||
tess_->process_image_event(*event);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* build_menu()
|
||||
*
|
||||
* Construct the menu tree used by the command window
|
||||
*/
|
||||
SVMenuNode *Tesseract::build_menu_new() {
|
||||
SVMenuNode *parent_menu;
|
||||
auto *root_menu_item = new SVMenuNode();
|
||||
|
||||
SVMenuNode *modes_menu_item = root_menu_item->AddChild("MODES");
|
||||
|
||||
modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
|
||||
modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
|
||||
modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
|
||||
modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
|
||||
modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
|
||||
modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
|
||||
modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
|
||||
modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
|
||||
|
||||
parent_menu = root_menu_item->AddChild("DISPLAY");
|
||||
|
||||
parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
|
||||
parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
|
||||
parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
|
||||
parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
|
||||
parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
|
||||
parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
|
||||
parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
|
||||
parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
|
||||
parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
|
||||
parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
|
||||
|
||||
parent_menu = root_menu_item->AddChild("OTHER");
|
||||
|
||||
parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
|
||||
parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
|
||||
parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
|
||||
parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
|
||||
parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
|
||||
|
||||
return root_menu_item;
|
||||
}
|
||||
|
||||
/**
|
||||
* do_re_display()
|
||||
*
|
||||
* Redisplay page
|
||||
*/
|
||||
void Tesseract::do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) {
|
||||
int block_count = 1;
|
||||
|
||||
image_win->Clear();
|
||||
if (display_image) {
|
||||
image_win->Draw(pix_binary_, 0, 0);
|
||||
}
|
||||
|
||||
image_win->Brush(ScrollView::NONE);
|
||||
PAGE_RES_IT pr_it(current_page_res);
|
||||
for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
|
||||
(this->*word_painter)(&pr_it);
|
||||
if (display_baselines && pr_it.row() != pr_it.prev_row()) {
|
||||
pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
|
||||
}
|
||||
if (display_blocks && pr_it.block() != pr_it.prev_block()) {
|
||||
pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
|
||||
}
|
||||
}
|
||||
image_win->Update();
|
||||
}
|
||||
|
||||
/**
|
||||
* pgeditor_main()
|
||||
*
|
||||
* Top level editor operation:
|
||||
* Setup a new window and an according event handler
|
||||
*
|
||||
*/
|
||||
|
||||
void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
|
||||
current_page_res = page_res;
|
||||
if (current_page_res->block_res_list.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
recog_done = false;
|
||||
stillRunning = true;
|
||||
|
||||
build_image_window(width, height);
|
||||
word_display_mode.set(DF_EDGE_STEP);
|
||||
do_re_display(&tesseract::Tesseract::word_set_display);
|
||||
# ifndef GRAPHICS_DISABLED
|
||||
pe = new ParamsEditor(this, image_win);
|
||||
# endif
|
||||
PGEventHandler pgEventHandler(this);
|
||||
|
||||
image_win->AddEventHandler(&pgEventHandler);
|
||||
image_win->AddMessageBox();
|
||||
|
||||
SVMenuNode *svMenuRoot = build_menu_new();
|
||||
|
||||
svMenuRoot->BuildMenu(image_win);
|
||||
image_win->SetVisible(true);
|
||||
|
||||
image_win->AwaitEvent(SVET_DESTROY);
|
||||
image_win->AddEventHandler(nullptr);
|
||||
}
|
||||
|
||||
/**
|
||||
* process_cmd_win_event()
|
||||
*
|
||||
* Process a command returned from the command window
|
||||
* (Just call the appropriate command handler)
|
||||
*/
|
||||
|
||||
bool Tesseract::process_cmd_win_event( // UI command semantics
|
||||
int32_t cmd_event, // which menu item?
|
||||
char *new_value // any prompt data
|
||||
) {
|
||||
char msg[160];
|
||||
bool exit = false;
|
||||
|
||||
color_mode = CM_RAINBOW;
|
||||
|
||||
// Run recognition on the full page if needed.
|
||||
switch (cmd_event) {
|
||||
case BLAMER_CMD_EVENT:
|
||||
case SHOW_SUBSCRIPT_CMD_EVENT:
|
||||
case SHOW_SUPERSCRIPT_CMD_EVENT:
|
||||
case SHOW_ITALIC_CMD_EVENT:
|
||||
case SHOW_BOLD_CMD_EVENT:
|
||||
case SHOW_UNDERLINE_CMD_EVENT:
|
||||
case SHOW_FIXEDPITCH_CMD_EVENT:
|
||||
case SHOW_SERIF_CMD_EVENT:
|
||||
case SHOW_SMALLCAPS_CMD_EVENT:
|
||||
case SHOW_DROPCAPS_CMD_EVENT:
|
||||
if (!recog_done) {
|
||||
recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
|
||||
recog_done = true;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
char *parameter;
|
||||
|
||||
switch (cmd_event) {
|
||||
case NULL_CMD_EVENT:
|
||||
break;
|
||||
|
||||
case CHANGE_DISP_CMD_EVENT:
|
||||
case DUMP_WERD_CMD_EVENT:
|
||||
case SHOW_POINT_CMD_EVENT:
|
||||
case SHOW_BLN_WERD_CMD_EVENT:
|
||||
case RECOG_WERDS:
|
||||
case RECOG_PSEUDO:
|
||||
case SHOW_BLOB_FEATURES:
|
||||
mode = static_cast<CMD_EVENTS>(cmd_event);
|
||||
break;
|
||||
case DEBUG_WERD_CMD_EVENT:
|
||||
mode = DEBUG_WERD_CMD_EVENT;
|
||||
parameter = image_win->ShowInputDialog("Config File Name");
|
||||
word_config_ = parameter;
|
||||
delete[] parameter;
|
||||
break;
|
||||
case BOUNDING_BOX_CMD_EVENT:
|
||||
if (new_value[0] == 'T') {
|
||||
word_display_mode.set(DF_BOX);
|
||||
} else {
|
||||
word_display_mode.reset(DF_BOX);
|
||||
}
|
||||
mode = CHANGE_DISP_CMD_EVENT;
|
||||
break;
|
||||
case BLAMER_CMD_EVENT:
|
||||
if (new_value[0] == 'T') {
|
||||
word_display_mode.set(DF_BLAMER);
|
||||
} else {
|
||||
word_display_mode.reset(DF_BLAMER);
|
||||
}
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
mode = CHANGE_DISP_CMD_EVENT;
|
||||
break;
|
||||
case CORRECT_TEXT_CMD_EVENT:
|
||||
if (new_value[0] == 'T') {
|
||||
word_display_mode.set(DF_TEXT);
|
||||
} else {
|
||||
word_display_mode.reset(DF_TEXT);
|
||||
}
|
||||
mode = CHANGE_DISP_CMD_EVENT;
|
||||
break;
|
||||
case POLYGONAL_CMD_EVENT:
|
||||
if (new_value[0] == 'T') {
|
||||
word_display_mode.set(DF_POLYGONAL);
|
||||
} else {
|
||||
word_display_mode.reset(DF_POLYGONAL);
|
||||
}
|
||||
mode = CHANGE_DISP_CMD_EVENT;
|
||||
break;
|
||||
case BL_NORM_CMD_EVENT:
|
||||
if (new_value[0] == 'T') {
|
||||
word_display_mode.set(DF_BN_POLYGONAL);
|
||||
} else {
|
||||
word_display_mode.reset(DF_BN_POLYGONAL);
|
||||
}
|
||||
mode = CHANGE_DISP_CMD_EVENT;
|
||||
break;
|
||||
case BITMAP_CMD_EVENT:
|
||||
if (new_value[0] == 'T') {
|
||||
word_display_mode.set(DF_EDGE_STEP);
|
||||
} else {
|
||||
word_display_mode.reset(DF_EDGE_STEP);
|
||||
}
|
||||
mode = CHANGE_DISP_CMD_EVENT;
|
||||
break;
|
||||
case UNIFORM_DISP_CMD_EVENT:
|
||||
do_re_display(&tesseract::Tesseract::word_set_display);
|
||||
break;
|
||||
case IMAGE_CMD_EVENT:
|
||||
display_image = (new_value[0] == 'T');
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case BLOCKS_CMD_EVENT:
|
||||
display_blocks = (new_value[0] == 'T');
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case BASELINES_CMD_EVENT:
|
||||
display_baselines = (new_value[0] == 'T');
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_SUBSCRIPT_CMD_EVENT:
|
||||
color_mode = CM_SUBSCRIPT;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_SUPERSCRIPT_CMD_EVENT:
|
||||
color_mode = CM_SUPERSCRIPT;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_ITALIC_CMD_EVENT:
|
||||
color_mode = CM_ITALIC;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_BOLD_CMD_EVENT:
|
||||
color_mode = CM_BOLD;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_UNDERLINE_CMD_EVENT:
|
||||
color_mode = CM_UNDERLINE;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_FIXEDPITCH_CMD_EVENT:
|
||||
color_mode = CM_FIXEDPITCH;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_SERIF_CMD_EVENT:
|
||||
color_mode = CM_SERIF;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_SMALLCAPS_CMD_EVENT:
|
||||
color_mode = CM_SMALLCAPS;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case SHOW_DROPCAPS_CMD_EVENT:
|
||||
color_mode = CM_DROPCAPS;
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case REFRESH_CMD_EVENT:
|
||||
do_re_display(&tesseract::Tesseract::word_display);
|
||||
break;
|
||||
case QUIT_CMD_EVENT:
|
||||
exit = true;
|
||||
ScrollView::Exit();
|
||||
break;
|
||||
|
||||
default:
|
||||
snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
|
||||
image_win->AddMessage(msg);
|
||||
break;
|
||||
}
|
||||
return exit;
|
||||
}
|
||||
|
||||
/**
|
||||
* process_image_event()
|
||||
*
|
||||
* User has done something in the image window - mouse down or up. Work out
|
||||
* what it is and do something with it.
|
||||
* If DOWN - just remember where it was.
|
||||
* If UP - for each word in the selected area do the operation defined by
|
||||
* the current mode.
|
||||
*/
|
||||
void Tesseract::process_image_event( // action in image win
|
||||
const SVEvent &event) {
|
||||
// The following variable should remain static, since it is used by
|
||||
// debug editor, which uses a single Tesseract instance.
|
||||
static ICOORD down;
|
||||
ICOORD up;
|
||||
TBOX selection_box;
|
||||
char msg[80];
|
||||
|
||||
switch (event.type) {
|
||||
case SVET_SELECTION:
|
||||
if (event.type == SVET_SELECTION) {
|
||||
down.set_x(event.x + event.x_size);
|
||||
down.set_y(event.y + event.y_size);
|
||||
if (mode == SHOW_POINT_CMD_EVENT) {
|
||||
show_point(current_page_res, event.x, event.y);
|
||||
}
|
||||
}
|
||||
|
||||
up.set_x(event.x);
|
||||
up.set_y(event.y);
|
||||
|
||||
selection_box = TBOX(down, up);
|
||||
|
||||
switch (mode) {
|
||||
case CHANGE_DISP_CMD_EVENT:
|
||||
process_selected_words(current_page_res, selection_box,
|
||||
&tesseract::Tesseract::word_blank_and_set_display);
|
||||
break;
|
||||
case DUMP_WERD_CMD_EVENT:
|
||||
process_selected_words(current_page_res, selection_box,
|
||||
&tesseract::Tesseract::word_dumper);
|
||||
break;
|
||||
case SHOW_BLN_WERD_CMD_EVENT:
|
||||
process_selected_words(current_page_res, selection_box,
|
||||
&tesseract::Tesseract::word_bln_display);
|
||||
break;
|
||||
case DEBUG_WERD_CMD_EVENT:
|
||||
debug_word(current_page_res, selection_box);
|
||||
break;
|
||||
case SHOW_POINT_CMD_EVENT:
|
||||
break; // ignore up event
|
||||
|
||||
case RECOG_WERDS:
|
||||
# ifndef DISABLED_LEGACY_ENGINE
|
||||
image_win->AddMessage("Recogging selected words");
|
||||
this->process_selected_words(current_page_res, selection_box,
|
||||
&Tesseract::recog_interactive);
|
||||
# endif // ndef DISABLED_LEGACY_ENGINE
|
||||
break;
|
||||
case RECOG_PSEUDO:
|
||||
image_win->AddMessage("Recogging selected blobs");
|
||||
recog_pseudo_word(current_page_res, selection_box);
|
||||
break;
|
||||
case SHOW_BLOB_FEATURES:
|
||||
blob_feature_display(current_page_res, selection_box);
|
||||
break;
|
||||
|
||||
default:
|
||||
sprintf(msg, "Mode %d not yet implemented", mode);
|
||||
image_win->AddMessage(msg);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* debug_word
|
||||
*
|
||||
* Process the whole image, but load word_config_ for the selected word(s).
|
||||
*/
|
||||
void Tesseract::debug_word(PAGE_RES *page_res, const TBOX &selection_box) {
|
||||
# ifndef DISABLED_LEGACY_ENGINE
|
||||
ResetAdaptiveClassifier();
|
||||
# endif
|
||||
recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* WERD PROCESSOR FUNCTIONS
|
||||
* ========================
|
||||
*
|
||||
* These routines are invoked by one or more of:
|
||||
* process_all_words()
|
||||
* process_selected_words()
|
||||
* or
|
||||
* process_all_words_it()
|
||||
* process_selected_words_it()
|
||||
* for each word to be processed
|
||||
**********************************************************************/
|
||||
|
||||
/**
|
||||
* word_blank_and_set_display() Word processor
|
||||
*
|
||||
* Blank display of word then redisplay word according to current display mode
|
||||
* settings
|
||||
*/
|
||||
|
||||
bool Tesseract::word_blank_and_set_display(PAGE_RES_IT *pr_it) {
|
||||
pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
|
||||
return word_set_display(pr_it);
|
||||
}
|
||||
|
||||
/**
|
||||
* word_bln_display()
|
||||
*
|
||||
* Normalize word and display in word window
|
||||
*/
|
||||
bool Tesseract::word_bln_display(PAGE_RES_IT *pr_it) {
|
||||
WERD_RES *word_res = pr_it->word();
|
||||
if (word_res->chopped_word == nullptr) {
|
||||
// Setup word normalization parameters.
|
||||
word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
|
||||
classify_bln_numeric_mode, textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx, pr_it->row()->row, pr_it->block()->block);
|
||||
}
|
||||
bln_word_window_handle()->Clear();
|
||||
display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, 1.0, 0.0f, -1000.0f, 1000.0f);
|
||||
C_BLOB_IT it(word_res->word->cblob_list());
|
||||
ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
||||
it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN, bln_word_window_handle());
|
||||
color = WERD::NextColor(color);
|
||||
}
|
||||
bln_word_window_handle()->Update();
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* word_display() Word Processor
|
||||
*
|
||||
* Display a word according to its display modes
|
||||
*/
|
||||
bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
|
||||
WERD_RES *word_res = pr_it->word();
|
||||
WERD *word = word_res->word;
|
||||
TBOX word_bb; // word bounding box
|
||||
int word_height; // ht of word BB
|
||||
bool displayed_something = false;
|
||||
float shift; // from bot left
|
||||
|
||||
if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
|
||||
# ifndef DISABLED_LEGACY_ENGINE
|
||||
BoxWord *box_word = word_res->box_word;
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int length = box_word->length();
|
||||
if (word_res->fontinfo == nullptr) {
|
||||
return false;
|
||||
}
|
||||
const FontInfo &font_info = *word_res->fontinfo;
|
||||
for (int i = 0; i < length; ++i) {
|
||||
ScrollView::Color color = ScrollView::GREEN;
|
||||
switch (color_mode) {
|
||||
case CM_SUBSCRIPT:
|
||||
if (best_choice->BlobPosition(i) == SP_SUBSCRIPT) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_SUPERSCRIPT:
|
||||
if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_ITALIC:
|
||||
if (font_info.is_italic()) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_BOLD:
|
||||
if (font_info.is_bold()) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_FIXEDPITCH:
|
||||
if (font_info.is_fixed_pitch()) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_SERIF:
|
||||
if (font_info.is_serif()) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_SMALLCAPS:
|
||||
if (word_res->small_caps) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
case CM_DROPCAPS:
|
||||
if (best_choice->BlobPosition(i) == SP_DROPCAP) {
|
||||
color = ScrollView::RED;
|
||||
}
|
||||
break;
|
||||
// TODO(rays) underline is currently completely unsupported.
|
||||
case CM_UNDERLINE:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
image_win->Pen(color);
|
||||
TBOX box = box_word->BlobBox(i);
|
||||
image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
|
||||
}
|
||||
return true;
|
||||
# else
|
||||
return false;
|
||||
# endif // ndef DISABLED_LEGACY_ENGINE
|
||||
}
|
||||
/*
|
||||
Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
|
||||
etc. are to keep the compiler happy.
|
||||
*/
|
||||
// display bounding box
|
||||
if (word->display_flag(DF_BOX)) {
|
||||
word->bounding_box().plot(image_win,
|
||||
static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
|
||||
static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
|
||||
|
||||
auto c = static_cast<ScrollView::Color>((int32_t)editor_image_blob_bb_color);
|
||||
image_win->Pen(c);
|
||||
// cblob iterator
|
||||
C_BLOB_IT c_it(word->cblob_list());
|
||||
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
|
||||
c_it.data()->bounding_box().plot(image_win);
|
||||
}
|
||||
displayed_something = true;
|
||||
}
|
||||
|
||||
// display edge steps
|
||||
if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
|
||||
word->plot(image_win); // rainbow colors
|
||||
displayed_something = true;
|
||||
}
|
||||
|
||||
// display poly approx
|
||||
if (word->display_flag(DF_POLYGONAL)) {
|
||||
// need to convert
|
||||
TWERD *tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
|
||||
tword->plot(image_win);
|
||||
delete tword;
|
||||
displayed_something = true;
|
||||
}
|
||||
|
||||
// Display correct text and blamer information.
|
||||
std::string text;
|
||||
std::string blame;
|
||||
if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
|
||||
text = word->text();
|
||||
}
|
||||
if (word->display_flag(DF_BLAMER) &&
|
||||
!(word_res->blamer_bundle != nullptr &&
|
||||
word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
|
||||
text = "";
|
||||
const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
|
||||
if (blamer_bundle == nullptr) {
|
||||
text += "NULL";
|
||||
} else {
|
||||
text = blamer_bundle->TruthString();
|
||||
}
|
||||
text += " -> ";
|
||||
std::string best_choice_str;
|
||||
if (word_res->best_choice == nullptr) {
|
||||
best_choice_str = "NULL";
|
||||
} else {
|
||||
word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
|
||||
}
|
||||
text += best_choice_str;
|
||||
IncorrectResultReason reason =
|
||||
(blamer_bundle == nullptr) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
|
||||
ASSERT_HOST(reason < IRR_NUM_REASONS);
|
||||
blame += " [";
|
||||
blame += BlamerBundle::IncorrectReasonName(reason);
|
||||
blame += "]";
|
||||
}
|
||||
if (text.length() > 0) {
|
||||
word_bb = word->bounding_box();
|
||||
image_win->Pen(ScrollView::RED);
|
||||
word_height = word_bb.height();
|
||||
int text_height = 0.50 * word_height;
|
||||
if (text_height > 20) {
|
||||
text_height = 20;
|
||||
}
|
||||
image_win->TextAttributes("Arial", text_height, false, false, false);
|
||||
shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
|
||||
image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str());
|
||||
if (blame.length() > 0) {
|
||||
image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height,
|
||||
blame.c_str());
|
||||
}
|
||||
|
||||
displayed_something = true;
|
||||
}
|
||||
|
||||
if (!displayed_something) { // display BBox anyway
|
||||
word->bounding_box().plot(image_win,
|
||||
static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color),
|
||||
static_cast<ScrollView::Color>((int32_t)editor_image_word_bb_color));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace tesseract
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
namespace tesseract {
|
||||
/**
|
||||
* word_dumper()
|
||||
*
|
||||
* Dump members to the debug window
|
||||
*/
|
||||
bool Tesseract::word_dumper(PAGE_RES_IT *pr_it) {
|
||||
if (pr_it->block()->block != nullptr) {
|
||||
tprintf("\nBlock data...\n");
|
||||
pr_it->block()->block->print(nullptr, false);
|
||||
}
|
||||
tprintf("\nRow data...\n");
|
||||
pr_it->row()->row->print(nullptr);
|
||||
tprintf("\nWord data...\n");
|
||||
WERD_RES *word_res = pr_it->word();
|
||||
word_res->word->print();
|
||||
if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
|
||||
word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
|
||||
tprintf("Current blamer debug: %s\n", word_res->blamer_bundle->debug().c_str());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
/**
|
||||
* word_set_display() Word processor
|
||||
*
|
||||
* Display word according to current display mode settings
|
||||
*/
|
||||
bool Tesseract::word_set_display(PAGE_RES_IT *pr_it) {
|
||||
WERD *word = pr_it->word()->word;
|
||||
word->set_display_flag(DF_BOX, word_display_mode[DF_BOX]);
|
||||
word->set_display_flag(DF_TEXT, word_display_mode[DF_TEXT]);
|
||||
word->set_display_flag(DF_POLYGONAL, word_display_mode[DF_POLYGONAL]);
|
||||
word->set_display_flag(DF_EDGE_STEP, word_display_mode[DF_EDGE_STEP]);
|
||||
word->set_display_flag(DF_BN_POLYGONAL, word_display_mode[DF_BN_POLYGONAL]);
|
||||
word->set_display_flag(DF_BLAMER, word_display_mode[DF_BLAMER]);
|
||||
return word_display(pr_it);
|
||||
}
|
||||
|
||||
// page_res is non-const because the iterator doesn't know if you are going
|
||||
// to change the items it points to! Really a const here though.
|
||||
void Tesseract::blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box) {
|
||||
# ifndef DISABLED_LEGACY_ENGINE
|
||||
PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
|
||||
if (it != nullptr) {
|
||||
WERD_RES *word_res = it->word();
|
||||
word_res->x_height = it->row()->row->x_height();
|
||||
word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
|
||||
classify_bln_numeric_mode, textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx, it->row()->row, it->block()->block);
|
||||
TWERD *bln_word = word_res->chopped_word;
|
||||
TBLOB *bln_blob = bln_word->blobs[0];
|
||||
INT_FX_RESULT_STRUCT fx_info;
|
||||
std::vector<INT_FEATURE_STRUCT> bl_features;
|
||||
std::vector<INT_FEATURE_STRUCT> cn_features;
|
||||
Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features, &cn_features,
|
||||
&fx_info, nullptr);
|
||||
// Display baseline features.
|
||||
ScrollView *bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
|
||||
ClearFeatureSpaceWindow(baseline, bl_win);
|
||||
for (auto &bl_feature : bl_features) {
|
||||
RenderIntFeature(bl_win, &bl_feature, ScrollView::GREEN);
|
||||
}
|
||||
bl_win->Update();
|
||||
// Display cn features.
|
||||
ScrollView *cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
|
||||
ClearFeatureSpaceWindow(character, cn_win);
|
||||
for (auto &cn_feature : cn_features) {
|
||||
RenderIntFeature(cn_win, &cn_feature, ScrollView::GREEN);
|
||||
}
|
||||
cn_win->Update();
|
||||
|
||||
it->DeleteCurrentWord();
|
||||
delete it;
|
||||
}
|
||||
# endif // ndef DISABLED_LEGACY_ENGINE
|
||||
}
|
||||
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
} // namespace tesseract
|
68
3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.h
vendored
Normal file
68
3rdparty/tesseract_ocr/tesseract/src/ccmain/pgedit.h
vendored
Normal file
|
@ -0,0 +1,68 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pgedit.h
|
||||
// Description: Page structure file editor
|
||||
// Author: Joern Wanke
|
||||
//
|
||||
// (C) Copyright 2007, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef PGEDIT_H
|
||||
#define PGEDIT_H
|
||||
|
||||
#include "params.h" // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam
|
||||
#include "scrollview.h" // for SVEvent (ptr only), SVEventHandler, ScrollView
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOCK_LIST;
|
||||
class PAGE_RES;
|
||||
|
||||
class Tesseract;
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
// A small event handler class to process incoming events to
|
||||
// this window.
|
||||
class PGEventHandler : public SVEventHandler {
|
||||
public:
|
||||
PGEventHandler(tesseract::Tesseract *tess) : tess_(tess) {}
|
||||
void Notify(const SVEvent *sve) override;
|
||||
|
||||
private:
|
||||
tesseract::Tesseract *tess_;
|
||||
};
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
extern BLOCK_LIST *current_block_list;
|
||||
extern STRING_VAR_H(editor_image_win_name, "EditorImage", "Editor image window name");
|
||||
extern INT_VAR_H(editor_image_xpos, 590, "Editor image X Pos");
|
||||
extern INT_VAR_H(editor_image_ypos, 10, "Editor image Y Pos");
|
||||
extern INT_VAR_H(editor_image_height, 680, "Editor image height");
|
||||
extern INT_VAR_H(editor_image_width, 655, "Editor image width");
|
||||
extern INT_VAR_H(editor_image_word_bb_color, BLUE, "Word bounding box colour");
|
||||
extern INT_VAR_H(editor_image_blob_bb_color, YELLOW, "Blob bounding box colour");
|
||||
extern INT_VAR_H(editor_image_text_color, WHITE, "Correct text colour");
|
||||
extern STRING_VAR_H(editor_dbwin_name, "EditorDBWin", "Editor debug window name");
|
||||
extern INT_VAR_H(editor_dbwin_xpos, 50, "Editor debug window X Pos");
|
||||
extern INT_VAR_H(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
|
||||
extern INT_VAR_H(editor_dbwin_height, 24, "Editor debug window height");
|
||||
extern INT_VAR_H(editor_dbwin_width, 80, "Editor debug window width");
|
||||
extern STRING_VAR_H(editor_word_name, "BlnWords", "BL normalised word window");
|
||||
extern INT_VAR_H(editor_word_xpos, 60, "Word window X Pos");
|
||||
extern INT_VAR_H(editor_word_ypos, 510, "Word window Y Pos");
|
||||
extern INT_VAR_H(editor_word_height, 240, "Word window height");
|
||||
extern INT_VAR_H(editor_word_width, 655, "Word window width");
|
||||
extern double_VAR_H(editor_smd_scale_factor, 1.0, "Scaling for smd image");
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
228
3rdparty/tesseract_ocr/tesseract/src/ccmain/recogtraining.cpp
vendored
Normal file
228
3rdparty/tesseract_ocr/tesseract/src/ccmain/recogtraining.cpp
vendored
Normal file
|
@ -0,0 +1,228 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: recogtraining.cpp
|
||||
// Description: Functions for ambiguity and parameter training.
|
||||
// Author: Daria Antonova
|
||||
//
|
||||
// (C) Copyright 2009, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#include "boxread.h"
|
||||
#include "control.h"
|
||||
#include "host.h" // for NearlyEqual
|
||||
#include "ratngs.h"
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include "reject.h"
|
||||
#endif
|
||||
#include "stopper.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
const int16_t kMaxBoxEdgeDiff = 2;
|
||||
|
||||
// Sets flags necessary for recognition in the training mode.
|
||||
// Opens and returns the pointer to the output file.
|
||||
FILE *Tesseract::init_recog_training(const char *filename) {
|
||||
if (tessedit_ambigs_training) {
|
||||
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
|
||||
tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
|
||||
// Explore all segmentations.
|
||||
getDict().stopper_no_acceptable_choices.set_value(true);
|
||||
}
|
||||
|
||||
std::string output_fname = filename;
|
||||
const char *lastdot = strrchr(output_fname.c_str(), '.');
|
||||
if (lastdot != nullptr) {
|
||||
output_fname[lastdot - output_fname.c_str()] = '\0';
|
||||
}
|
||||
output_fname += ".txt";
|
||||
FILE *output_file = fopen(output_fname.c_str(), "a+");
|
||||
if (output_file == nullptr) {
|
||||
tprintf("Error: Could not open file %s\n", output_fname.c_str());
|
||||
ASSERT_HOST(output_file);
|
||||
}
|
||||
return output_file;
|
||||
}
|
||||
|
||||
// Copies the bounding box from page_res_it->word() to the given TBOX.
|
||||
static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
|
||||
while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
|
||||
page_res_it->forward();
|
||||
}
|
||||
|
||||
if (page_res_it->word() != nullptr) {
|
||||
*tbox = page_res_it->word()->word->bounding_box();
|
||||
|
||||
// If tbox->left() is negative, the training image has vertical text and
|
||||
// all the coordinates of bounding boxes of page_res are rotated by 90
|
||||
// degrees in a counterclockwise direction. We need to rotate the TBOX back
|
||||
// in order to compare with the TBOXes of box files.
|
||||
if (tbox->left() < 0) {
|
||||
tbox->rotate(FCOORD(0.0, -1.0));
|
||||
}
|
||||
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// This function takes tif/box pair of files and runs recognition on the image,
|
||||
// while making sure that the word bounds that tesseract identified roughly
|
||||
// match to those specified by the input box file. For each word (ngram in a
|
||||
// single bounding box from the input box file) it outputs the ocred result,
|
||||
// the correct label, rating and certainty.
|
||||
void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
|
||||
volatile ETEXT_DESC *monitor, FILE *output_file) {
|
||||
std::string box_fname = filename;
|
||||
const char *lastdot = strrchr(box_fname.c_str(), '.');
|
||||
if (lastdot != nullptr) {
|
||||
box_fname[lastdot - box_fname.c_str()] = '\0';
|
||||
}
|
||||
box_fname += ".box";
|
||||
// ReadNextBox() will close box_file
|
||||
FILE *box_file = fopen(box_fname.c_str(), "r");
|
||||
if (box_file == nullptr) {
|
||||
tprintf("Error: Could not open file %s\n", box_fname.c_str());
|
||||
ASSERT_HOST(box_file);
|
||||
}
|
||||
|
||||
PAGE_RES_IT page_res_it;
|
||||
page_res_it.page_res = page_res;
|
||||
page_res_it.restart_page();
|
||||
std::string label;
|
||||
|
||||
// Process all the words on this page.
|
||||
TBOX tbox; // tesseract-identified box
|
||||
TBOX bbox; // box from the box file
|
||||
bool keep_going;
|
||||
int line_number = 0;
|
||||
int examined_words = 0;
|
||||
do {
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||
// Align bottom left points of the TBOXes.
|
||||
while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
|
||||
if (bbox.bottom() < tbox.bottom()) {
|
||||
page_res_it.forward();
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
} else {
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||
}
|
||||
}
|
||||
while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
|
||||
if (bbox.left() > tbox.left()) {
|
||||
page_res_it.forward();
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
} else {
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||
}
|
||||
}
|
||||
// OCR the word if top right points of the TBOXes are similar.
|
||||
if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
|
||||
NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
|
||||
ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
|
||||
examined_words++;
|
||||
}
|
||||
page_res_it.forward();
|
||||
} while (keep_going);
|
||||
|
||||
// Set up scripts on all of the words that did not get sent to
|
||||
// ambigs_classify_and_output. They all should have, but if all the
|
||||
// werd_res's don't get uch_sets, tesseract will crash when you try
|
||||
// to iterate over them. :-(
|
||||
int total_words = 0;
|
||||
for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
|
||||
if (page_res_it.word()) {
|
||||
if (page_res_it.word()->uch_set == nullptr) {
|
||||
page_res_it.word()->SetupFake(unicharset);
|
||||
}
|
||||
total_words++;
|
||||
}
|
||||
}
|
||||
if (examined_words < 0.85 * total_words) {
|
||||
tprintf(
|
||||
"TODO(antonova): clean up recog_training_segmented; "
|
||||
" It examined only a small fraction of the ambigs image.\n");
|
||||
}
|
||||
tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
|
||||
}
|
||||
|
||||
// Helper prints the given set of blob choices.
|
||||
static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
|
||||
const char *label, FILE *output_file) {
|
||||
float rating = 0.0f;
|
||||
float certainty = 0.0f;
|
||||
for (int i = 0; i < length; ++i) {
|
||||
const BLOB_CHOICE *blob_choice = blob_choices[i];
|
||||
fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
|
||||
rating += blob_choice->rating();
|
||||
if (certainty > blob_choice->certainty()) {
|
||||
certainty = blob_choice->certainty();
|
||||
}
|
||||
}
|
||||
fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
|
||||
}
|
||||
|
||||
// Helper recursively prints all paths through the ratings matrix, starting
|
||||
// at column col.
|
||||
static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
|
||||
const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
|
||||
const char *label, FILE *output_file) {
|
||||
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
|
||||
if (ratings.get(col, row) != NOT_CLASSIFIED) {
|
||||
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
blob_choices[length] = bc_it.data();
|
||||
if (row + 1 < dim) {
|
||||
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
|
||||
output_file);
|
||||
} else {
|
||||
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
|
||||
// raw choice as a result of the classification. For words labeled with a
|
||||
// single unichar also outputs all alternatives from blob_choices of the
|
||||
// best choice.
|
||||
void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
|
||||
FILE *output_file) {
|
||||
// Classify word.
|
||||
fflush(stdout);
|
||||
WordData word_data(*pr_it);
|
||||
SetupWordPassN(1, &word_data);
|
||||
classify_word_and_language(1, pr_it, &word_data);
|
||||
WERD_RES *werd_res = word_data.word;
|
||||
WERD_CHOICE *best_choice = werd_res->best_choice;
|
||||
ASSERT_HOST(best_choice != nullptr);
|
||||
|
||||
// Compute the number of unichars in the label.
|
||||
std::vector<UNICHAR_ID> encoding;
|
||||
if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
|
||||
tprintf("Not outputting illegal unichar %s\n", label);
|
||||
return;
|
||||
}
|
||||
|
||||
// Dump all paths through the ratings matrix (which is normally small).
|
||||
int dim = werd_res->ratings->dimension();
|
||||
const auto **blob_choices = new const BLOB_CHOICE *[dim];
|
||||
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
|
||||
delete[] blob_choices;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
785
3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.cpp
vendored
Normal file
785
3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.cpp
vendored
Normal file
|
@ -0,0 +1,785 @@
|
|||
/**********************************************************************
|
||||
* File: reject.cpp (Formerly reject.c)
|
||||
* Description: Rejection functions used in tessedit
|
||||
* Author: Phil Cheatle
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include "reject.h"
|
||||
|
||||
#ifdef DISABLED_LEGACY_ENGINE
|
||||
|
||||
# include "tesseractclass.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
||||
const WERD_CHOICE &word = *werd_res->best_choice;
|
||||
int dict_word_type = werd_res->tesseract->dict_word(word);
|
||||
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
|
||||
}
|
||||
} // namespace tesseract
|
||||
|
||||
#else
|
||||
|
||||
# include "control.h"
|
||||
# include "docqual.h"
|
||||
# include "tesseractclass.h"
|
||||
# include "tessvars.h"
|
||||
|
||||
# include "helpers.h"
|
||||
|
||||
# include <algorithm> // for std::sort
|
||||
# include <cctype>
|
||||
# include <cerrno>
|
||||
# include <cstring>
|
||||
# include <vector> // for std::vector
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/*************************************************************************
|
||||
* set_done()
|
||||
*
|
||||
* Set the done flag based on the word acceptability criteria
|
||||
*************************************************************************/
|
||||
|
||||
void Tesseract::set_done(WERD_RES *word, int16_t pass) {
|
||||
word->done =
|
||||
word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
|
||||
bool word_is_ambig = word->best_choice->dangerous_ambig_found();
|
||||
bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
word->best_choice->permuter() == USER_DAWG_PERM;
|
||||
if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
|
||||
one_ell_conflict(word, false)) {
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("one_ell_conflict detected\n");
|
||||
}
|
||||
word->done = false;
|
||||
}
|
||||
if (word->done &&
|
||||
((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("non-dict or ambig word detected\n");
|
||||
}
|
||||
word->done = false;
|
||||
}
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("set_done(): done=%d\n", word->done);
|
||||
word->best_choice->print("");
|
||||
}
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* make_reject_map()
|
||||
*
|
||||
* Sets the done flag to indicate whether the resylt is acceptable.
|
||||
*
|
||||
* Sets a reject map for the word.
|
||||
*************************************************************************/
|
||||
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
|
||||
int i;
|
||||
int offset;
|
||||
|
||||
flip_0O(word);
|
||||
check_debug_pt(word, -1); // For trap only
|
||||
set_done(word, pass); // Set acceptance
|
||||
word->reject_map.initialise(word->best_choice->unichar_lengths().length());
|
||||
reject_blanks(word);
|
||||
/*
|
||||
0: Rays original heuristic - the baseline
|
||||
*/
|
||||
if (tessedit_reject_mode == 0) {
|
||||
if (!word->done) {
|
||||
reject_poor_matches(word);
|
||||
}
|
||||
} else if (tessedit_reject_mode == 5) {
|
||||
/*
|
||||
5: Reject I/1/l from words where there is no strong contextual confirmation;
|
||||
the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
|
||||
and the whole of any words which are very small
|
||||
*/
|
||||
if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
|
||||
word->reject_map.rej_word_small_xht();
|
||||
} else {
|
||||
one_ell_conflict(word, true);
|
||||
/*
|
||||
Originally the code here just used the done flag. Now I have duplicated
|
||||
and unpacked the conditions for setting the done flag so that each
|
||||
mechanism can be turned on or off independently. This works WITHOUT
|
||||
affecting the done flag setting.
|
||||
*/
|
||||
if (rej_use_tess_accepted && !word->tess_accepted) {
|
||||
word->reject_map.rej_word_not_tess_accepted();
|
||||
}
|
||||
|
||||
if (rej_use_tess_blanks &&
|
||||
(strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
|
||||
word->reject_map.rej_word_contains_blanks();
|
||||
}
|
||||
|
||||
WERD_CHOICE *best_choice = word->best_choice;
|
||||
if (rej_use_good_perm) {
|
||||
if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
||||
best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
best_choice->permuter() == USER_DAWG_PERM) &&
|
||||
(!rej_use_sensible_wd ||
|
||||
acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
|
||||
best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
|
||||
// PASSED TEST
|
||||
} else if (best_choice->permuter() == NUMBER_PERM) {
|
||||
if (rej_alphas_in_number_perm) {
|
||||
for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
|
||||
offset += best_choice->unichar_lengths()[i++]) {
|
||||
if (word->reject_map[i].accepted() &&
|
||||
word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
|
||||
best_choice->unichar_lengths()[i])) {
|
||||
word->reject_map[i].setrej_bad_permuter();
|
||||
}
|
||||
// rej alpha
|
||||
}
|
||||
}
|
||||
} else {
|
||||
word->reject_map.rej_word_bad_permuter();
|
||||
}
|
||||
}
|
||||
/* Ambig word rejection was here once !!*/
|
||||
}
|
||||
} else {
|
||||
tprintf("BAD tessedit_reject_mode\n");
|
||||
ASSERT_HOST("Fatal error encountered!" == nullptr);
|
||||
}
|
||||
|
||||
if (tessedit_image_border > -1) {
|
||||
reject_edge_blobs(word);
|
||||
}
|
||||
|
||||
check_debug_pt(word, 10);
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("Permuter Type = %d\n", word->best_choice->permuter());
|
||||
tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
|
||||
word->best_choice->rating());
|
||||
tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
|
||||
}
|
||||
|
||||
flip_hyphens(word);
|
||||
check_debug_pt(word, 20);
|
||||
}
|
||||
|
||||
// Marks every character of the best choice whose first byte is a space as
// a tess failure in the word's reject map.
void reject_blanks(WERD_RES *word) {
  WERD_CHOICE *choice = word->best_choice;
  int16_t index = 0;
  int16_t offset = 0;
  while (choice->unichar_string()[offset] != '\0') {
    if (choice->unichar_string()[offset] == ' ') {
      // rej unrecognised blobs
      word->reject_map[index].setrej_tess_failure();
    }
    // Step over the bytes of the current unichar.
    offset += choice->unichar_lengths()[index];
    index += 1;
  }
}
|
||||
|
||||
// Marks every character of the best choice whose first byte is in
// conflict_set_I_l_1 as a 1/I/l conflict in the word's reject map.
void Tesseract::reject_I_1_L(WERD_RES *word) {
  WERD_CHOICE *choice = word->best_choice;
  int16_t index = 0;
  int16_t offset = 0;
  while (choice->unichar_string()[offset] != '\0') {
    if (conflict_set_I_l_1.contains(choice->unichar_string()[offset])) {
      // rej 1Il conflict
      word->reject_map[index].setrej_1Il_conflict();
    }
    // Step over the bytes of the current unichar.
    offset += choice->unichar_lengths()[index];
    index += 1;
  }
}
|
||||
|
||||
// Rejects individual characters of the best choice: spaces are marked as
// tess failures, and any other character whose certainty is below the
// threshold from compute_reject_threshold() is marked as a poor match.
void reject_poor_matches(WERD_RES *word) {
  WERD_CHOICE *choice = word->best_choice;
  const float cutoff = compute_reject_threshold(choice);
  for (int pos = 0; pos < choice->length(); ++pos) {
    if (choice->unichar_id(pos) == UNICHAR_SPACE) {
      word->reject_map[pos].setrej_tess_failure();
      continue;
    }
    if (choice->certainty(pos) < cutoff) {
      word->reject_map[pos].setrej_poor_match();
    }
  }
}
|
||||
|
||||
/**********************************************************************
|
||||
* compute_reject_threshold
|
||||
*
|
||||
* Set a rejection threshold for this word.
|
||||
* Initially this is a trivial function which looks for the largest
|
||||
* gap in the certainty value.
|
||||
**********************************************************************/
|
||||
|
||||
float compute_reject_threshold(WERD_CHOICE *word) {
|
||||
float threshold; // rejection threshold
|
||||
float bestgap = 0.0f; // biggest gap
|
||||
float gapstart; // bottom of gap
|
||||
|
||||
int blob_count = word->length();
|
||||
std::vector<float> ratings;
|
||||
ratings.reserve(blob_count);
|
||||
for (int i = 0; i < blob_count; ++i) {
|
||||
ratings.push_back(word->certainty(i));
|
||||
}
|
||||
std::sort(ratings.begin(), ratings.end());
|
||||
gapstart = ratings[0] - 1; // all reject if none better
|
||||
if (blob_count >= 3) {
|
||||
for (int index = 0; index < blob_count - 1; index++) {
|
||||
if (ratings[index + 1] - ratings[index] > bestgap) {
|
||||
bestgap = ratings[index + 1] - ratings[index];
|
||||
// find biggest
|
||||
gapstart = ratings[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
threshold = gapstart + bestgap / 2;
|
||||
|
||||
return threshold;
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* reject_edge_blobs()
|
||||
*
|
||||
* If the word is perilously close to the edge of the image, reject those blobs
|
||||
* in the word which are too close to the edge as they could be clipped.
|
||||
*************************************************************************/
|
||||
void Tesseract::reject_edge_blobs(WERD_RES *word) {
|
||||
TBOX word_box = word->word->bounding_box();
|
||||
// Use the box_word as it is already denormed back to image coordinates.
|
||||
int blobcount = word->box_word->length();
|
||||
|
||||
if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
|
||||
word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
|
||||
word_box.top() + tessedit_image_border > ImageHeight() - 1) {
|
||||
ASSERT_HOST(word->reject_map.length() == blobcount);
|
||||
for (int blobindex = 0; blobindex < blobcount; blobindex++) {
|
||||
TBOX blob_box = word->box_word->BlobBox(blobindex);
|
||||
if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
|
||||
blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
|
||||
blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
|
||||
word->reject_map[blobindex].setrej_edge_char();
|
||||
// Close to edge
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* one_ell_conflict()
|
||||
*
|
||||
* Identify words where there is a potential I/l/1 error.
|
||||
* - A bundle of contextual heuristics!
|
||||
**********************************************************************/
|
||||
bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
||||
const char *word;
|
||||
const char *lengths;
|
||||
int16_t word_len; // its length
|
||||
int16_t first_alphanum_index_;
|
||||
int16_t first_alphanum_offset_;
|
||||
int16_t i;
|
||||
int16_t offset;
|
||||
bool non_conflict_set_char; // non conf set a/n?
|
||||
bool conflict = false;
|
||||
bool allow_1s;
|
||||
ACCEPTABLE_WERD_TYPE word_type;
|
||||
bool dict_perm_type;
|
||||
bool dict_word_ok;
|
||||
int dict_word_type;
|
||||
|
||||
word = word_res->best_choice->unichar_string().c_str();
|
||||
lengths = word_res->best_choice->unichar_lengths().c_str();
|
||||
word_len = strlen(lengths);
|
||||
/*
|
||||
If there are no occurrences of the conflict set characters then the word
|
||||
is OK.
|
||||
*/
|
||||
if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
There is a conflict if there are NO other (confirmed) alphanumerics apart
|
||||
from those in the conflict set.
|
||||
*/
|
||||
|
||||
for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
|
||||
offset += lengths[i++]) {
|
||||
non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
|
||||
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
|
||||
!conflict_set_I_l_1.contains(word[offset]);
|
||||
}
|
||||
if (!non_conflict_set_char) {
|
||||
if (update_map) {
|
||||
reject_I_1_L(word_res);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
If the word is accepted by a dawg permuter, and the first alpha character
|
||||
is "I" or "l", check to see if the alternative is also a dawg word. If it
|
||||
is, then there is a potential error otherwise the word is ok.
|
||||
*/
|
||||
|
||||
dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
|
||||
(word_res->best_choice->permuter() == USER_DAWG_PERM) ||
|
||||
(rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
|
||||
(word_res->best_choice->permuter() == FREQ_DAWG_PERM);
|
||||
dict_word_type = dict_word(*(word_res->best_choice));
|
||||
dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
|
||||
|
||||
if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
|
||||
(dict_perm_type && dict_word_ok)) {
|
||||
first_alphanum_index_ = first_alphanum_index(word, lengths);
|
||||
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
|
||||
if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
||||
if (safe_dict_word(word_res) > 0) {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
||||
if (update_map) {
|
||||
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
||||
if (safe_dict_word(word_res) > 0) {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
||||
if (update_map) {
|
||||
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
NEW 1Il code. The old code relied on permuter types too much. In fact,
|
||||
tess will use TOP_CHOICE permute for good things like "palette".
|
||||
In this code the string is examined independently to see if it looks like
|
||||
a well formed word.
|
||||
*/
|
||||
|
||||
/*
|
||||
REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
|
||||
dictionary word.
|
||||
*/
|
||||
first_alphanum_index_ = first_alphanum_index(word, lengths);
|
||||
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
|
||||
if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
||||
if (safe_dict_word(word_res) > 0) {
|
||||
return false;
|
||||
} else {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
||||
}
|
||||
} else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
||||
if (safe_dict_word(word_res) > 0) {
|
||||
return false;
|
||||
} else {
|
||||
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
||||
}
|
||||
}
|
||||
/*
|
||||
For strings containing digits:
|
||||
If there are no alphas OR the numeric permuter liked the word,
|
||||
reject any non 1 conflict chs
|
||||
Else reject all conflict chs
|
||||
*/
|
||||
if (word_contains_non_1_digit(word, lengths)) {
|
||||
allow_1s =
|
||||
(alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
|
||||
|
||||
int16_t offset;
|
||||
conflict = false;
|
||||
for (i = 0, offset = 0; word[offset] != '\0';
|
||||
offset += word_res->best_choice->unichar_lengths()[i++]) {
|
||||
if ((!allow_1s || (word[offset] != '1')) &&
|
||||
conflict_set_I_l_1.contains(word[offset])) {
|
||||
if (update_map) {
|
||||
word_res->reject_map[i].setrej_1Il_conflict();
|
||||
}
|
||||
conflict = true;
|
||||
}
|
||||
}
|
||||
return conflict;
|
||||
}
|
||||
/*
|
||||
For anything else. See if it conforms to an acceptable word type. If so,
|
||||
treat accordingly.
|
||||
*/
|
||||
word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
|
||||
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
|
||||
first_alphanum_index_ = first_alphanum_index(word, lengths);
|
||||
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
|
||||
if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
|
||||
if (update_map) {
|
||||
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (word_type == AC_UPPER_CASE) {
|
||||
return false;
|
||||
} else {
|
||||
if (update_map) {
|
||||
reject_I_1_L(word_res);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
|
||||
int16_t i;
|
||||
int16_t offset;
|
||||
|
||||
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
|
||||
if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
|
||||
unicharset.get_isdigit(word + offset, word_lengths[i])) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
|
||||
int16_t i;
|
||||
int16_t offset;
|
||||
|
||||
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
|
||||
if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
|
||||
unicharset.get_isdigit(word + offset, word_lengths[i])) {
|
||||
return offset;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
|
||||
int16_t i;
|
||||
int16_t offset;
|
||||
int16_t count = 0;
|
||||
|
||||
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
|
||||
if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
|
||||
int16_t i;
|
||||
int16_t offset;
|
||||
|
||||
for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
|
||||
if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
|
||||
(word_lengths[i] != 1 || word[offset] != '1')) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* dont_allow_1Il()
|
||||
* Don't unreject LONE accepted 1Il conflict set chars
|
||||
*************************************************************************/
|
||||
void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
||||
int i = 0;
|
||||
int offset;
|
||||
int word_len = word->reject_map.length();
|
||||
const char *s = word->best_choice->unichar_string().c_str();
|
||||
const char *lengths = word->best_choice->unichar_lengths().c_str();
|
||||
bool accepted_1Il = false;
|
||||
|
||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||
if (word->reject_map[i].accepted()) {
|
||||
if (conflict_set_I_l_1.contains(s[offset])) {
|
||||
accepted_1Il = true;
|
||||
} else {
|
||||
if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
|
||||
word->uch_set->get_isdigit(s + offset, lengths[i])) {
|
||||
return; // >=1 non 1Il ch accepted
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!accepted_1Il) {
|
||||
return; // Nothing to worry about
|
||||
}
|
||||
|
||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||
if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
|
||||
word->reject_map[i].setrej_postNN_1Il();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the number of positions in the word's reject map that are
// accepted and whose best-choice unichar is an alpha or a digit.
int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
  const WERD_CHOICE *best_choice = word_res->best_choice;
  int alphanums = 0;
  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    if (!word_res->reject_map[pos].accepted()) {
      continue;
    }
    const UNICHAR_ID id = best_choice->unichar_id(pos);
    if (word_res->uch_set->get_isalpha(id) || word_res->uch_set->get_isdigit(id)) {
      ++alphanums;
    }
  }
  return alphanums;
}
|
||||
|
||||
// reject all if most rejected.
|
||||
void Tesseract::reject_mostly_rejects(WERD_RES *word) {
|
||||
/* Reject the whole of the word if the fraction of rejects exceeds a limit */
|
||||
|
||||
if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
|
||||
rej_whole_of_mostly_reject_word_fract) {
|
||||
word->reject_map.rej_word_mostly_rej();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if the word is more than one character long, consists of a
// single unichar repeated, starts with a character listed in
// ok_repeated_ch_non_alphanum_wds, and word_char_quality() reports both
// quality counts equal to the number of characters.
// The row argument is not used by this function.
bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
  WERD_CHOICE *choice = word->best_choice;

  if (choice->unichar_lengths().length() <= 1) {
    return false;
  }

  if (!ok_repeated_ch_non_alphanum_wds.contains(choice->unichar_string()[0])) {
    return false;
  }

  // Every character must be the same unichar as the first one.
  const UNICHAR_ID first_id = choice->unichar_id(0);
  for (int pos = 1; pos < choice->length(); ++pos) {
    if (choice->unichar_id(pos) != first_id) {
      return false;
    }
  }

  int16_t char_quality;
  int16_t accepted_char_quality;
  word_char_quality(word, &char_quality, &accepted_char_quality);

  return (choice->unichar_lengths().length() == char_quality) &&
         (char_quality == accepted_char_quality);
}
|
||||
|
||||
int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
||||
const WERD_CHOICE &word = *werd_res->best_choice;
|
||||
int dict_word_type = werd_res->tesseract->dict_word(word);
|
||||
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->ratings
|
||||
// might not contain the right BLOB_CHOICE corresponding to each character
|
||||
// in word_res->best_choice.
|
||||
void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int i;
|
||||
int prev_right = -9999;
|
||||
int next_left;
|
||||
TBOX out_box;
|
||||
float aspect_ratio;
|
||||
|
||||
if (tessedit_lower_flip_hyphen <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
|
||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
||||
out_box = blob->bounding_box();
|
||||
if (i + 1 == num_blobs) {
|
||||
next_left = 9999;
|
||||
} else {
|
||||
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
|
||||
}
|
||||
// Don't touch small or touching blobs - it is too dangerous.
|
||||
if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
|
||||
(out_box.right() < next_left)) {
|
||||
aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
|
||||
if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
|
||||
if (aspect_ratio >= tessedit_upper_flip_hyphen &&
|
||||
word_res->uch_set->contains_unichar_id(unichar_dash) &&
|
||||
word_res->uch_set->get_enabled(unichar_dash)) {
|
||||
/* Certain HYPHEN */
|
||||
best_choice->set_unichar_id(unichar_dash, i);
|
||||
if (word_res->reject_map[i].rejected()) {
|
||||
word_res->reject_map[i].setrej_hyphen_accept();
|
||||
}
|
||||
}
|
||||
if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
|
||||
// Suspected HYPHEN
|
||||
word_res->reject_map[i].setrej_hyphen();
|
||||
}
|
||||
} else if (best_choice->unichar_id(i) == unichar_dash) {
|
||||
if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
|
||||
word_res->reject_map[i].setrej_hyphen_accept();
|
||||
}
|
||||
// Certain HYPHEN
|
||||
|
||||
if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
|
||||
// Suspected HYPHEN
|
||||
word_res->reject_map[i].setrej_hyphen();
|
||||
}
|
||||
}
|
||||
}
|
||||
prev_right = out_box.right();
|
||||
}
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->ratings
|
||||
// might not contain the right BLOB_CHOICE corresponding to each character
|
||||
// in word_res->best_choice.
|
||||
void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int i;
|
||||
TBOX out_box;
|
||||
|
||||
if (!tessedit_flip_0O) {
|
||||
return;
|
||||
}
|
||||
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
||||
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
|
||||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
|
||||
out_box = blob->bounding_box();
|
||||
if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
|
||||
(out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
|
||||
return; // Beware words with sub/superscripts
|
||||
}
|
||||
}
|
||||
}
|
||||
UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
|
||||
UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
|
||||
if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
|
||||
unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
|
||||
return; // 0 or O are not present/enabled in unicharset
|
||||
}
|
||||
for (i = 1; i < best_choice->length(); ++i) {
|
||||
if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
|
||||
/* A0A */
|
||||
if ((i + 1) < best_choice->length() &&
|
||||
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
|
||||
best_choice->set_unichar_id(unichar_O, i);
|
||||
}
|
||||
/* A00A */
|
||||
if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
(i + 1) < best_choice->length() &&
|
||||
(best_choice->unichar_id(i + 1) == unichar_0 ||
|
||||
best_choice->unichar_id(i + 1) == unichar_O) &&
|
||||
(i + 2) < best_choice->length() &&
|
||||
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
|
||||
best_choice->set_unichar_id(unichar_O, i);
|
||||
i++;
|
||||
}
|
||||
/* AA0<non digit or end of word> */
|
||||
if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
|
||||
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
(((i + 1) < best_choice->length() &&
|
||||
!word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
|
||||
!word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
|
||||
!word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
|
||||
(i == best_choice->length() - 1))) {
|
||||
best_choice->set_unichar_id(unichar_O, i);
|
||||
}
|
||||
/* 9O9 */
|
||||
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
(i + 1) < best_choice->length() &&
|
||||
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
|
||||
best_choice->set_unichar_id(unichar_0, i);
|
||||
}
|
||||
/* 9OOO */
|
||||
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
(i + 2) < best_choice->length() &&
|
||||
(best_choice->unichar_id(i + 1) == unichar_0 ||
|
||||
best_choice->unichar_id(i + 1) == unichar_O) &&
|
||||
(best_choice->unichar_id(i + 2) == unichar_0 ||
|
||||
best_choice->unichar_id(i + 2) == unichar_O)) {
|
||||
best_choice->set_unichar_id(unichar_0, i);
|
||||
best_choice->set_unichar_id(unichar_0, i + 1);
|
||||
best_choice->set_unichar_id(unichar_0, i + 2);
|
||||
i += 2;
|
||||
}
|
||||
/* 9OO<non upper> */
|
||||
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
(i + 2) < best_choice->length() &&
|
||||
(best_choice->unichar_id(i + 1) == unichar_0 ||
|
||||
best_choice->unichar_id(i + 1) == unichar_O) &&
|
||||
!word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
|
||||
best_choice->set_unichar_id(unichar_0, i);
|
||||
best_choice->set_unichar_id(unichar_0, i + 1);
|
||||
i++;
|
||||
}
|
||||
/* 9O<non upper> */
|
||||
if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
|
||||
(i + 1) < best_choice->length() &&
|
||||
!word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
|
||||
best_choice->set_unichar_id(unichar_0, i);
|
||||
}
|
||||
/* 9[.,]OOO.. */
|
||||
if ((i > 1) &&
|
||||
(word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
|
||||
word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
|
||||
(word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
|
||||
best_choice->unichar_id(i - 2) == unichar_O)) {
|
||||
if (best_choice->unichar_id(i - 2) == unichar_O) {
|
||||
best_choice->set_unichar_id(unichar_0, i - 2);
|
||||
}
|
||||
while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
|
||||
best_choice->unichar_id(i) == unichar_0)) {
|
||||
best_choice->set_unichar_id(unichar_0, i);
|
||||
i++;
|
||||
}
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
|
||||
return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
|
||||
}
|
||||
|
||||
bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
|
||||
return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
|
||||
}
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // def DISABLED_LEGACY_ENGINE
|
39
3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.h
vendored
Normal file
39
3rdparty/tesseract_ocr/tesseract/src/ccmain/reject.h
vendored
Normal file
|
@ -0,0 +1,39 @@
|
|||
/**********************************************************************
|
||||
* File: reject.h
|
||||
* Description: Rejection functions used in tessedit
|
||||
* Author: Phil Cheatle
|
||||
* Created: Wed Sep 23 16:50:21 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef REJECT_H
|
||||
#define REJECT_H
|
||||
|
||||
// Declarations of the rejection helpers used by tessedit (see file header).
namespace tesseract {
|
||||
|
||||
// Forward declarations: only pointers to these types appear below.
class WERD_CHOICE;
|
||||
class WERD_RES;
|
||||
|
||||
void reject_blanks(WERD_RES *word);
|
||||
void reject_poor_matches(WERD_RES *word);
|
||||
float compute_reject_threshold(WERD_CHOICE *word);
|
||||
bool word_contains_non_1_digit(const char *word, const char *word_lengths);
|
||||
void dont_allow_1Il(WERD_RES *word);
|
||||
void flip_hyphens(WERD_RES *word);
|
||||
void flip_0O(WERD_RES *word);
|
||||
// NOTE(review): the only non_0_digit definition visible alongside this header
// is the member Tesseract::non_0_digit(const UNICHARSET &, UNICHAR_ID), whose
// signature differs from this prototype. This declaration may be stale --
// confirm that a matching free function exists before relying on it.
bool non_0_digit(const char *str, int length);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
789
3rdparty/tesseract_ocr/tesseract/src/ccmain/resultiterator.cpp
vendored
Normal file
789
3rdparty/tesseract_ocr/tesseract/src/ccmain/resultiterator.cpp
vendored
Normal file
|
@ -0,0 +1,789 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: resultiterator.cpp
|
||||
// Description: Iterator for tesseract results that is capable of
|
||||
// iterating in proper reading order over Bi Directional
|
||||
// (e.g. mixed Hebrew and English) text.
|
||||
// Author: David Eger
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <tesseract/resultiterator.h>
|
||||
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
// Unicode directional marks that the text-building code in this file appends
// to returned UTF-8 strings at the start of minor-direction runs and after
// words (see AppendUTF8WordText and AppendSuffixMarks).
static const char *const kLRM = "\u200E"; // Left-to-Right Mark
|
||||
static const char *const kRLM = "\u200F"; // Right-to-Left Mark
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
 * Builds a bidi-aware iterator from a strictly left-to-right iterator and
 * positions it at the logical (reading order) start of the current text line.
 *
 * @param resit  left-to-right iterator whose state is copied.
 */
ResultIterator::ResultIterator(const LTRResultIterator &resit) : LTRResultIterator(resit) {
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;

  // Inter-word space preservation stays off unless the parameter is found
  // (global parameters and this engine's parameters are both searched).
  auto *spaces_param = ParamUtils::FindParam<BoolParam>(
      "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
  preserve_interword_spaces_ = (spaces_param != nullptr) && static_cast<bool>(*spaces_param);

  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}
|
||||
|
||||
/**
 * Factory wrapper around the constructor.
 *
 * @param resit  left-to-right iterator to copy.
 * @return a heap-allocated ResultIterator; it is created with `new`, so the
 *         caller is responsible for deleting it.
 */
ResultIterator *ResultIterator::StartOfParagraph(const LTRResultIterator &resit) {
  auto *created = new ResultIterator(resit);
  return created;
}
|
||||
|
||||
bool ResultIterator::ParagraphIsLtr() const {
|
||||
return current_paragraph_is_ltr_;
|
||||
}
|
||||
|
||||
bool ResultIterator::CurrentParagraphIsLtr() const {
|
||||
if (!it_->word()) {
|
||||
return true; // doesn't matter.
|
||||
}
|
||||
LTRResultIterator it(*this);
|
||||
it.RestartParagraph();
|
||||
// Try to figure out the ltr-ness of the paragraph. The rules below
|
||||
// make more sense in the context of a difficult paragraph example.
|
||||
// Here we denote {ltr characters, RTL CHARACTERS}:
|
||||
//
|
||||
// "don't go in there!" DAIS EH
|
||||
// EHT OTNI DEPMUJ FELSMIH NEHT DNA
|
||||
// .GNIDLIUB GNINRUB
|
||||
//
|
||||
// On the first line, the left-most word is LTR and the rightmost word
|
||||
// is RTL. Thus, we are better off taking the majority direction for
|
||||
// the whole paragraph contents. So instead of "the leftmost word is LTR"
|
||||
// indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
|
||||
// would not do: Typically an RTL paragraph would *not* start with an LTR
|
||||
// word. So our heuristics are as follows:
|
||||
//
|
||||
// (1) If the first text line has an RTL word in the left-most position
|
||||
// it is RTL.
|
||||
// (2) If the first text line has an LTR word in the right-most position
|
||||
// it is LTR.
|
||||
// (3) If neither of the above is true, take the majority count for the
|
||||
// paragraph -- if there are more rtl words, it is RTL. If there
|
||||
// are more LTR words, it's LTR.
|
||||
bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
|
||||
bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
|
||||
int num_ltr, num_rtl;
|
||||
num_rtl = leftmost_rtl ? 1 : 0;
|
||||
num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
|
||||
for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
|
||||
it.Next(RIL_WORD)) {
|
||||
StrongScriptDirection dir = it.WordDirection();
|
||||
rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
|
||||
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
|
||||
num_ltr += rightmost_ltr ? 1 : 0;
|
||||
}
|
||||
if (leftmost_rtl) {
|
||||
return false;
|
||||
}
|
||||
if (rightmost_ltr) {
|
||||
return true;
|
||||
}
|
||||
// First line is ambiguous. Take statistics on the whole paragraph.
|
||||
if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {
|
||||
do {
|
||||
StrongScriptDirection dir = it.WordDirection();
|
||||
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
|
||||
num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
|
||||
} while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
|
||||
}
|
||||
return num_ltr >= num_rtl;
|
||||
}
|
||||
|
||||
// Negative marker values that CalculateTextlineOrder interleaves with the
// (non-negative) word indices of a reading order; consumers in this file
// recognise markers by testing for entries < 0.
const int ResultIterator::kMinorRunStart = -1; // pushed before a minor-direction run
|
||||
const int ResultIterator::kMinorRunEnd = -2; // pushed after a minor-direction run
|
||||
const int ResultIterator::kComplexWord = -3; // pushed after a DIR_MIX word
|
||||
|
||||
void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
|
||||
bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
|
||||
blob_indices->clear();
|
||||
if (Empty(RIL_WORD)) {
|
||||
return;
|
||||
}
|
||||
if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
|
||||
// Easy! just return the blobs in order;
|
||||
for (int i = 0; i < word_length_; i++) {
|
||||
blob_indices->push_back(i);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// The blobs are in left-to-right order, but the current reading context
|
||||
// is right-to-left.
|
||||
const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
|
||||
const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
|
||||
const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
|
||||
const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
|
||||
const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
|
||||
const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
|
||||
const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
|
||||
|
||||
// Step 1: Scan for and mark European Number sequences
|
||||
// [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
|
||||
std::vector<int> letter_types;
|
||||
letter_types.reserve(word_length_);
|
||||
for (int i = 0; i < word_length_; i++) {
|
||||
letter_types.push_back(it_->word()->SymbolDirection(i));
|
||||
}
|
||||
// Convert a single separtor sandwiched between two EN's into an EN.
|
||||
for (int i = 0; i + 2 < word_length_; i++) {
|
||||
if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
|
||||
(letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {
|
||||
letter_types[i + 1] = U_EURO_NUM;
|
||||
}
|
||||
}
|
||||
// Scan for sequences of European Number Terminators around ENs and convert
|
||||
// them to ENs.
|
||||
for (int i = 0; i < word_length_; i++) {
|
||||
if (letter_types[i] == U_EURO_NUM_TERM) {
|
||||
int j = i + 1;
|
||||
while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
|
||||
j++;
|
||||
}
|
||||
if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
|
||||
// The sequence [i..j] should be converted to all European Numbers.
|
||||
for (int k = i; k < j; k++) {
|
||||
letter_types[k] = U_EURO_NUM;
|
||||
}
|
||||
}
|
||||
j = i - 1;
|
||||
while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
|
||||
j--;
|
||||
}
|
||||
if (j > -1 && letter_types[j] == U_EURO_NUM) {
|
||||
// The sequence [j..i] should be converted to all European Numbers.
|
||||
for (int k = j; k <= i; k++) {
|
||||
letter_types[k] = U_EURO_NUM;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Step 2: Convert all remaining types to either L or R.
|
||||
// Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
|
||||
// All other are R.
|
||||
for (int i = 0; i < word_length_;) {
|
||||
int ti = letter_types[i];
|
||||
if (ti == U_LTR || ti == U_EURO_NUM) {
|
||||
// Left to right sequence; scan to the end of it.
|
||||
int last_good = i;
|
||||
for (int j = i + 1; j < word_length_; j++) {
|
||||
int tj = letter_types[j];
|
||||
if (tj == U_LTR || tj == U_EURO_NUM) {
|
||||
last_good = j;
|
||||
} else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
|
||||
// do nothing.
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// [i..last_good] is the L sequence
|
||||
for (int k = i; k <= last_good; k++) {
|
||||
letter_types[k] = U_LTR;
|
||||
}
|
||||
i = last_good + 1;
|
||||
} else {
|
||||
letter_types[i] = U_RTL;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, letter_types is entirely U_LTR or U_RTL.
|
||||
for (int i = word_length_ - 1; i >= 0;) {
|
||||
if (letter_types[i] == U_RTL) {
|
||||
blob_indices->push_back(i);
|
||||
i--;
|
||||
} else {
|
||||
// left to right sequence. scan to the beginning.
|
||||
int j = i - 1;
|
||||
for (; j >= 0 && letter_types[j] != U_RTL; j--) {
|
||||
} // pass
|
||||
// Now (j, i] is LTR
|
||||
for (int k = j + 1; k <= i; k++) {
|
||||
blob_indices->push_back(k);
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
ASSERT_HOST(blob_indices->size() == word_length_);
|
||||
}
|
||||
|
||||
static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
|
||||
for (auto dir : dirs) {
|
||||
switch (dir) {
|
||||
case DIR_NEUTRAL:
|
||||
tprintf("N ");
|
||||
break;
|
||||
case DIR_LEFT_TO_RIGHT:
|
||||
tprintf("L ");
|
||||
break;
|
||||
case DIR_RIGHT_TO_LEFT:
|
||||
tprintf("R ");
|
||||
break;
|
||||
case DIR_MIX:
|
||||
tprintf("Z ");
|
||||
break;
|
||||
default:
|
||||
tprintf("? ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
|
||||
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
|
||||
std::vector<int> *word_indices) const {
|
||||
std::vector<StrongScriptDirection> directions;
|
||||
CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
|
||||
}
|
||||
|
||||
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
|
||||
std::vector<StrongScriptDirection> *dirs_arg,
|
||||
std::vector<int> *word_indices) const {
|
||||
std::vector<StrongScriptDirection> dirs;
|
||||
std::vector<StrongScriptDirection> *directions;
|
||||
directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
|
||||
directions->clear();
|
||||
|
||||
// A LTRResultIterator goes strictly left-to-right word order.
|
||||
LTRResultIterator ltr_it(resit);
|
||||
ltr_it.RestartRow();
|
||||
if (ltr_it.Empty(RIL_WORD)) {
|
||||
return;
|
||||
}
|
||||
do {
|
||||
directions->push_back(ltr_it.WordDirection());
|
||||
} while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
|
||||
|
||||
word_indices->clear();
|
||||
CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
|
||||
}
|
||||
|
||||
void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
|
||||
const std::vector<StrongScriptDirection> &word_dirs,
|
||||
std::vector<int> *reading_order) {
|
||||
reading_order->clear();
|
||||
if (word_dirs.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Take all of the runs of minor direction words and insert them
|
||||
// in reverse order.
|
||||
int minor_direction, major_direction, major_step, start, end;
|
||||
if (paragraph_is_ltr) {
|
||||
start = 0;
|
||||
end = word_dirs.size();
|
||||
major_step = 1;
|
||||
major_direction = DIR_LEFT_TO_RIGHT;
|
||||
minor_direction = DIR_RIGHT_TO_LEFT;
|
||||
} else {
|
||||
start = word_dirs.size() - 1;
|
||||
end = -1;
|
||||
major_step = -1;
|
||||
major_direction = DIR_RIGHT_TO_LEFT;
|
||||
minor_direction = DIR_LEFT_TO_RIGHT;
|
||||
// Special rule: if there are neutral words at the right most side
|
||||
// of a line adjacent to a left-to-right word in the middle of the
|
||||
// line, we interpret the end of the line as a single LTR sequence.
|
||||
if (word_dirs[start] == DIR_NEUTRAL) {
|
||||
int neutral_end = start;
|
||||
while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
|
||||
neutral_end--;
|
||||
}
|
||||
if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
|
||||
// LTR followed by neutrals.
|
||||
// Scan for the beginning of the minor left-to-right run.
|
||||
int left = neutral_end;
|
||||
for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
|
||||
if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
|
||||
left = i;
|
||||
}
|
||||
}
|
||||
reading_order->push_back(kMinorRunStart);
|
||||
for (unsigned i = left; i < word_dirs.size(); i++) {
|
||||
reading_order->push_back(i);
|
||||
if (word_dirs[i] == DIR_MIX) {
|
||||
reading_order->push_back(kComplexWord);
|
||||
}
|
||||
}
|
||||
reading_order->push_back(kMinorRunEnd);
|
||||
start = left - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = start; i != end;) {
|
||||
if (word_dirs[i] == minor_direction) {
|
||||
int j = i;
|
||||
while (j != end && word_dirs[j] != major_direction) {
|
||||
j += major_step;
|
||||
}
|
||||
if (j == end) {
|
||||
j -= major_step;
|
||||
}
|
||||
while (j != i && word_dirs[j] != minor_direction) {
|
||||
j -= major_step;
|
||||
}
|
||||
// [j..i] is a minor direction run.
|
||||
reading_order->push_back(kMinorRunStart);
|
||||
for (int k = j; k != i; k -= major_step) {
|
||||
reading_order->push_back(k);
|
||||
}
|
||||
reading_order->push_back(i);
|
||||
reading_order->push_back(kMinorRunEnd);
|
||||
i = j + major_step;
|
||||
} else {
|
||||
reading_order->push_back(i);
|
||||
if (word_dirs[i] == DIR_MIX) {
|
||||
reading_order->push_back(kComplexWord);
|
||||
}
|
||||
i += major_step;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int ResultIterator::LTRWordIndex() const {
|
||||
int this_word_index = 0;
|
||||
LTRResultIterator textline(*this);
|
||||
textline.RestartRow();
|
||||
while (!textline.PositionedAtSameWord(it_)) {
|
||||
this_word_index++;
|
||||
textline.Next(RIL_WORD);
|
||||
}
|
||||
return this_word_index;
|
||||
}
|
||||
|
||||
void ResultIterator::MoveToLogicalStartOfWord() {
|
||||
if (word_length_ == 0) {
|
||||
BeginWord(0);
|
||||
return;
|
||||
}
|
||||
std::vector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
if (blob_order.empty() || blob_order[0] == 0) {
|
||||
return;
|
||||
}
|
||||
BeginWord(blob_order[0]);
|
||||
}
|
||||
|
||||
bool ResultIterator::IsAtFinalSymbolOfWord() const {
|
||||
if (!it_->word()) {
|
||||
return true;
|
||||
}
|
||||
std::vector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
return blob_order.empty() || blob_order.back() == blob_index_;
|
||||
}
|
||||
|
||||
bool ResultIterator::IsAtFirstSymbolOfWord() const {
|
||||
if (!it_->word()) {
|
||||
return true;
|
||||
}
|
||||
std::vector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
return blob_order.empty() || blob_order[0] == blob_index_;
|
||||
}
|
||||
|
||||
void ResultIterator::AppendSuffixMarks(std::string *text) const {
|
||||
if (!it_->word()) {
|
||||
return;
|
||||
}
|
||||
bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
|
||||
// scan forward to see what meta-information the word ordering algorithm
|
||||
// left us.
|
||||
// If this word is at the *end* of a minor run, insert the other
|
||||
// direction's mark; else if this was a complex word, insert the
|
||||
// current reading order's mark.
|
||||
std::vector<int> textline_order;
|
||||
CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
|
||||
int this_word_index = LTRWordIndex();
|
||||
size_t i = 0;
|
||||
for (const auto word_index : textline_order) {
|
||||
if (word_index == this_word_index) {
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if (i == textline_order.size()) {
|
||||
return;
|
||||
}
|
||||
|
||||
int last_non_word_mark = 0;
|
||||
for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
|
||||
last_non_word_mark = textline_order[i];
|
||||
}
|
||||
if (last_non_word_mark == kComplexWord) {
|
||||
*text += reading_direction_is_ltr ? kLRM : kRLM;
|
||||
} else if (last_non_word_mark == kMinorRunEnd) {
|
||||
if (current_paragraph_is_ltr_) {
|
||||
*text += kLRM;
|
||||
} else {
|
||||
*text += kRLM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ResultIterator::MoveToLogicalStartOfTextline() {
|
||||
std::vector<int> word_indices;
|
||||
RestartRow();
|
||||
CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this),
|
||||
&word_indices);
|
||||
unsigned i = 0;
|
||||
for (; i < word_indices.size() && word_indices[i] < 0; i++) {
|
||||
if (word_indices[i] == kMinorRunStart) {
|
||||
in_minor_direction_ = true;
|
||||
} else if (word_indices[i] == kMinorRunEnd) {
|
||||
in_minor_direction_ = false;
|
||||
}
|
||||
}
|
||||
if (in_minor_direction_) {
|
||||
at_beginning_of_minor_run_ = true;
|
||||
}
|
||||
if (i >= word_indices.size()) {
|
||||
return;
|
||||
}
|
||||
int first_word_index = word_indices[i];
|
||||
for (int j = 0; j < first_word_index; j++) {
|
||||
PageIterator::Next(RIL_WORD);
|
||||
}
|
||||
MoveToLogicalStartOfWord();
|
||||
}
|
||||
|
||||
/**
 * Restarts iteration: rewinds via the base class, recomputes the paragraph
 * direction, clears the minor-run state and moves to the logical start of
 * the first text line.
 */
void ResultIterator::Begin() {
  LTRResultIterator::Begin();

  const bool paragraph_ltr = CurrentParagraphIsLtr();
  current_paragraph_is_ltr_ = paragraph_ltr;
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;

  MoveToLogicalStartOfTextline();
}
|
||||
|
||||
/**
 * Advances to the next element of the given level in reading order.
 *
 * Block, paragraph and text-line steps are delegated to PageIterator and
 * followed by a move to the logical start of the new line. A symbol step
 * stays inside the word when possible and otherwise becomes a word step. A
 * word step follows the line's reading order and becomes a text-line step
 * after the last word.
 *
 * @param level  granularity of the step.
 * @return false if the iterator is already at, or has moved past, the end.
 */
bool ResultIterator::Next(PageIteratorLevel level) {
  if (it_->block() == nullptr) {
    return false; // already at end!
  }
  switch (level) {
    case RIL_BLOCK:    // explicit fall-through
    case RIL_PARA:     // explicit fall-through
    case RIL_TEXTLINE: {
      if (!PageIterator::Next(level)) {
        return false;
      }
      // Entering a new paragraph invalidates the cached direction.
      if (IsWithinFirstTextlineOfParagraph()) {
        current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
      }
      in_minor_direction_ = false;
      MoveToLogicalStartOfTextline();
      return it_->block() != nullptr;
    }
    case RIL_SYMBOL: {
      std::vector<int> blob_order;
      CalculateBlobOrder(&blob_order);
      const int blob_count = static_cast<int>(blob_order.size());
      // Locate the current blob in reading order, then look one further.
      int position = 0;
      while (position < blob_count && blob_order[position] != blob_index_) {
        ++position;
      }
      const int successor = position + 1;
      if (successor < blob_count) {
        // Still inside the same word: advance a single blob.
        BeginWord(blob_order[successor]);
        at_beginning_of_minor_run_ = false;
        return true;
      }
      level = RIL_WORD; // ran out of symbols; continue as a word step.
    }
      // Fall through.
    case RIL_WORD: // explicit fall-through.
    {
      if (it_->word() == nullptr) {
        return Next(RIL_BLOCK);
      }
      std::vector<int> word_indices;
      const int this_word_index = LTRWordIndex();
      CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);

      // Index of the last real (non-marker) entry in the reading order.
      int final_real_index = static_cast<int>(word_indices.size()) - 1;
      while (final_real_index > 0 && word_indices[final_real_index] < 0) {
        --final_real_index;
      }

      for (int here = 0; here < final_real_index; ++here) {
        if (word_indices[here] != this_word_index) {
          continue;
        }
        // Skip the markers that follow, tracking minor-run state.
        int target = here + 1;
        while (target < final_real_index && word_indices[target] < 0) {
          if (word_indices[target] == kMinorRunStart) {
            in_minor_direction_ = true;
          }
          if (word_indices[target] == kMinorRunEnd) {
            in_minor_direction_ = false;
          }
          ++target;
        }
        at_beginning_of_minor_run_ = (word_indices[target - 1] == kMinorRunStart);
        // The next word to visit is word_indices[target].
        if (BidiDebug(3)) {
          tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[target]);
        }
        PageIterator::RestartRow();
        for (int remaining = word_indices[target]; remaining > 0; --remaining) {
          PageIterator::Next(RIL_WORD);
        }
        MoveToLogicalStartOfWord();
        return true;
      }
      if (BidiDebug(3)) {
        tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
      }
      // The current word was the last one: leave the text line.
      return Next(RIL_TEXTLINE);
    }
  }
  ASSERT_HOST(false); // shouldn't happen.
  return false;
}
|
||||
|
||||
/**
 * Tells whether the iterator sits at the (reading order) beginning of an
 * element of the given level.
 *
 * @param level  level to test.
 * @return false at the end of the page; true inside a block without words
 *         (image block) and always for RIL_SYMBOL; otherwise the result of
 *         the level-specific test.
 */
bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == nullptr) {
    return false; // Already at the end!
  }
  if (it_->word() == nullptr) {
    return true; // In an image block.
  }
  if (level == RIL_SYMBOL) {
    return true; // Always at beginning of a symbol.
  }

  const bool at_word_start = IsAtFirstSymbolOfWord();
  if (level == RIL_WORD) {
    return at_word_start;
  }

  // Compare against a copy moved to the first word (in reading order) of
  // this line.
  ResultIterator line_start(*this);
  line_start.MoveToLogicalStartOfTextline();
  const bool at_textline_start = at_word_start && *line_start.it_ == *it_;
  if (level == RIL_TEXTLINE) {
    return at_textline_start;
  }

  // For block/paragraph tests the copy goes to the left-most word of the row.
  line_start.RestartRow();
  const bool at_block_start =
      at_textline_start && line_start.it_->block() != line_start.it_->prev_block();
  if (level == RIL_BLOCK) {
    return at_block_start;
  }

  // A paragraph starts with every block, or when this row's paragraph differs
  // from the previous row's.
  bool at_para_start = at_block_start;
  if (!at_para_start && at_textline_start) {
    at_para_start =
        line_start.it_->row()->row->para() != line_start.it_->prev_row()->row->para();
  }
  if (level == RIL_PARA) {
    return at_para_start;
  }

  ASSERT_HOST(false); // shouldn't happen.
  return false;
}
|
||||
|
||||
/**
|
||||
* NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the
|
||||
* change that the variable next is now a ResultIterator instead of a
|
||||
* PageIterator.
|
||||
*/
|
||||
bool ResultIterator::IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const {
|
||||
if (Empty(element)) {
|
||||
return true; // Already at the end!
|
||||
}
|
||||
// The result is true if we step forward by element and find we are
|
||||
// at the the end of the page or at beginning of *all* levels in:
|
||||
// [level, element).
|
||||
// When there is more than one level difference between element and level,
|
||||
// we could for instance move forward one symbol and still be at the first
|
||||
// word on a line, so we also have to be at the first symbol in a word.
|
||||
ResultIterator next(*this);
|
||||
next.Next(element);
|
||||
if (next.Empty(element)) {
|
||||
return true; // Reached the end of the page.
|
||||
}
|
||||
while (element > level) {
|
||||
element = static_cast<PageIteratorLevel>(element - 1);
|
||||
if (!next.IsAtBeginningOf(element)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the number of blanks before the current word.
|
||||
int ResultIterator::BlanksBeforeWord() const {
|
||||
if (CurrentParagraphIsLtr()) {
|
||||
return LTRResultIterator::BlanksBeforeWord();
|
||||
}
|
||||
return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the null terminated UTF-8 encoded text string for the current
|
||||
* object at the given level. Use delete [] to free after use.
|
||||
*/
|
||||
char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
|
||||
if (it_->word() == nullptr) {
|
||||
return nullptr; // Already at the end!
|
||||
}
|
||||
std::string text;
|
||||
switch (level) {
|
||||
case RIL_BLOCK: {
|
||||
ResultIterator pp(*this);
|
||||
do {
|
||||
pp.AppendUTF8ParagraphText(&text);
|
||||
} while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
|
||||
} break;
|
||||
case RIL_PARA:
|
||||
AppendUTF8ParagraphText(&text);
|
||||
break;
|
||||
case RIL_TEXTLINE: {
|
||||
ResultIterator it(*this);
|
||||
it.MoveToLogicalStartOfTextline();
|
||||
it.IterateAndAppendUTF8TextlineText(&text);
|
||||
} break;
|
||||
case RIL_WORD:
|
||||
AppendUTF8WordText(&text);
|
||||
break;
|
||||
case RIL_SYMBOL: {
|
||||
bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
|
||||
if (at_beginning_of_minor_run_) {
|
||||
text += reading_direction_is_ltr ? kLRM : kRLM;
|
||||
}
|
||||
text = it_->word()->BestUTF8(blob_index_, false);
|
||||
if (IsAtFinalSymbolOfWord()) {
|
||||
AppendSuffixMarks(&text);
|
||||
}
|
||||
} break;
|
||||
}
|
||||
int length = text.length() + 1;
|
||||
char *result = new char[length];
|
||||
strncpy(result, text.c_str(), length);
|
||||
return result;
|
||||
}
|
||||
std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
|
||||
*ResultIterator::GetRawLSTMTimesteps() const {
|
||||
if (it_->word() != nullptr) {
|
||||
return &it_->word()->segmented_timesteps;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
|
||||
const {
|
||||
if (it_->word() != nullptr) {
|
||||
return &it_->word()->CTC_symbol_choices;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void ResultIterator::AppendUTF8WordText(std::string *text) const {
|
||||
if (!it_->word()) {
|
||||
return;
|
||||
}
|
||||
ASSERT_HOST(it_->word()->best_choice != nullptr);
|
||||
bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
|
||||
if (at_beginning_of_minor_run_) {
|
||||
*text += reading_direction_is_ltr ? kLRM : kRLM;
|
||||
}
|
||||
|
||||
std::vector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
for (int i : blob_order) {
|
||||
*text += it_->word()->BestUTF8(i, false);
|
||||
}
|
||||
AppendSuffixMarks(text);
|
||||
}
|
||||
|
||||
void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
|
||||
if (Empty(RIL_WORD)) {
|
||||
Next(RIL_WORD);
|
||||
return;
|
||||
}
|
||||
if (BidiDebug(1)) {
|
||||
std::vector<int> textline_order;
|
||||
std::vector<StrongScriptDirection> dirs;
|
||||
CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);
|
||||
tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
|
||||
current_paragraph_is_ltr_ ? "ltr" : "rtl");
|
||||
PrintScriptDirs(dirs);
|
||||
tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
|
||||
current_paragraph_is_ltr_ ? "ltr" : "rtl");
|
||||
for (int i : textline_order) {
|
||||
tprintf("%d ", i);
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
|
||||
int words_appended = 0;
|
||||
do {
|
||||
int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0);
|
||||
for (int i = 0; i < numSpaces; ++i) {
|
||||
*text += " ";
|
||||
}
|
||||
AppendUTF8WordText(text);
|
||||
words_appended++;
|
||||
if (BidiDebug(2)) {
|
||||
tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
|
||||
}
|
||||
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
|
||||
if (BidiDebug(1)) {
|
||||
tprintf("%d words printed\n", words_appended);
|
||||
}
|
||||
*text += line_separator_;
|
||||
// If we just finished a paragraph, add an extra newline.
|
||||
if (IsAtBeginningOf(RIL_PARA)) {
|
||||
*text += paragraph_separator_;
|
||||
}
|
||||
}
|
||||
|
||||
void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {
|
||||
ResultIterator it(*this);
|
||||
it.RestartParagraph();
|
||||
it.MoveToLogicalStartOfTextline();
|
||||
if (it.Empty(RIL_WORD)) {
|
||||
return;
|
||||
}
|
||||
do {
|
||||
it.IterateAndAppendUTF8TextlineText(text);
|
||||
} while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
|
||||
}
|
||||
|
||||
bool ResultIterator::BidiDebug(int min_level) const {
|
||||
int debug_level = 1;
|
||||
auto *p = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
|
||||
tesseract_->params()->int_params);
|
||||
if (p != nullptr) {
|
||||
debug_level = (int32_t)(*p);
|
||||
}
|
||||
return debug_level >= min_level;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
592
3rdparty/tesseract_ocr/tesseract/src/ccmain/superscript.cpp
vendored
Normal file
592
3rdparty/tesseract_ocr/tesseract/src/ccmain/superscript.cpp
vendored
Normal file
|
@ -0,0 +1,592 @@
|
|||
/******************************************************************
|
||||
* File: superscript.cpp
|
||||
* Description: Correction pass to fix superscripts and subscripts.
|
||||
* Author: David Eger
|
||||
*
|
||||
* (C) Copyright 2012, Google, Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "normalis.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
 * Sums the best_state entries of the first num_unichars unichars of word.
 * No bounds checking is done on num_unichars.
 *
 * @param word          word whose best_state is read.
 * @param num_unichars  number of leading unichars to include.
 * @return the sum of word->best_state[0 .. num_unichars - 1].
 */
static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
  int total = 0;
  int idx = 0;
  while (idx < num_unichars) {
    total += word->best_state[idx];
    ++idx;
  }
  return total;
}
|
||||
|
||||
/**
 * Sums the best_state entries of the last num_unichars unichars of word.
 * No bounds checking is done on num_unichars.
 *
 * @param word          word whose best_state is read.
 * @param num_unichars  number of trailing unichars to include.
 * @return the sum of the last num_unichars entries of word->best_state.
 */
static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
  int total = 0;
  for (int back = 0; back < num_unichars; ++back) {
    // back == 0 addresses the final entry, back == 1 the one before, ...
    const auto from_end = word->best_state.size() - 1 - back;
    total += word->best_state[from_end];
  }
  return total;
}
|
||||
|
||||
/**
|
||||
* Given a recognized blob, see if a contiguous collection of sub-pieces
|
||||
* (chopped blobs) starting at its left might qualify as being a subscript
|
||||
* or superscript letter based only on y position. Also do this for the
|
||||
* right side.
|
||||
*/
|
||||
static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom,
|
||||
int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers,
|
||||
ScriptPos *trailing_pos, int *num_trailing_outliers) {
|
||||
ScriptPos sp_unused1, sp_unused2;
|
||||
int unused1, unused2;
|
||||
if (!leading_pos) {
|
||||
leading_pos = &sp_unused1;
|
||||
}
|
||||
if (!num_leading_outliers) {
|
||||
num_leading_outliers = &unused1;
|
||||
}
|
||||
if (!trailing_pos) {
|
||||
trailing_pos = &sp_unused2;
|
||||
}
|
||||
if (!num_trailing_outliers) {
|
||||
num_trailing_outliers = &unused2;
|
||||
}
|
||||
|
||||
*num_leading_outliers = *num_trailing_outliers = 0;
|
||||
*leading_pos = *trailing_pos = SP_NORMAL;
|
||||
|
||||
int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
|
||||
int num_chopped_pieces = word->best_state[rebuilt_blob_index];
|
||||
ScriptPos last_pos = SP_NORMAL;
|
||||
int trailing_outliers = 0;
|
||||
for (int i = 0; i < num_chopped_pieces; i++) {
|
||||
TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
|
||||
ScriptPos pos = SP_NORMAL;
|
||||
if (box.bottom() >= super_y_bottom) {
|
||||
pos = SP_SUPERSCRIPT;
|
||||
} else if (box.top() <= sub_y_top) {
|
||||
pos = SP_SUBSCRIPT;
|
||||
}
|
||||
if (pos == SP_NORMAL) {
|
||||
if (trailing_outliers == i) {
|
||||
*num_leading_outliers = trailing_outliers;
|
||||
*leading_pos = last_pos;
|
||||
}
|
||||
trailing_outliers = 0;
|
||||
} else {
|
||||
if (pos == last_pos) {
|
||||
trailing_outliers++;
|
||||
} else {
|
||||
trailing_outliers = 1;
|
||||
}
|
||||
}
|
||||
last_pos = pos;
|
||||
}
|
||||
*num_trailing_outliers = trailing_outliers;
|
||||
*trailing_pos = last_pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to split off any high (or low) bits at the ends of the word with poor
|
||||
* certainty and recognize them separately. If the certainty gets much better
|
||||
* and other sanity checks pass, accept.
|
||||
*
|
||||
* This superscript fix is meant to be called in the second pass of recognition
|
||||
* when we have tried once and already have a preliminary answer for word.
|
||||
*
|
||||
* @return Whether we modified the given word.
|
||||
*/
|
||||
bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
|
||||
if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
|
||||
return false;
|
||||
}
|
||||
int num_leading, num_trailing;
|
||||
ScriptPos sp_leading, sp_trailing;
|
||||
float leading_certainty, trailing_certainty;
|
||||
float avg_certainty, unlikely_threshold;
|
||||
|
||||
// Calculate the number of whole suspicious characters at the edges.
|
||||
GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
|
||||
&sp_trailing, &trailing_certainty, &avg_certainty,
|
||||
&unlikely_threshold);
|
||||
|
||||
const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
|
||||
const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
|
||||
|
||||
int num_blobs = word->best_choice->length();
|
||||
|
||||
// Calculate the remainder (partial characters) at the edges.
|
||||
// This accounts for us having classified the best version of
|
||||
// a word as [speaker?'] when it was instead [speaker.^{21}]
|
||||
// (that is we accidentally thought the 2 was attached to the period).
|
||||
int num_remainder_leading = 0, num_remainder_trailing = 0;
|
||||
if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
|
||||
int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
|
||||
int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
|
||||
int last_word_char = num_blobs - 1 - num_trailing;
|
||||
float last_char_certainty = word->best_choice->certainty(last_word_char);
|
||||
if (word->best_choice->unichar_id(last_word_char) != 0 &&
|
||||
last_char_certainty <= unlikely_threshold) {
|
||||
ScriptPos rpos;
|
||||
YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
|
||||
&num_remainder_trailing);
|
||||
if (num_trailing > 0 && rpos != sp_trailing) {
|
||||
num_remainder_trailing = 0;
|
||||
}
|
||||
if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
|
||||
trailing_certainty = last_char_certainty;
|
||||
}
|
||||
}
|
||||
bool another_blob_available =
|
||||
(num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
|
||||
int first_char_certainty = word->best_choice->certainty(num_leading);
|
||||
if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
|
||||
first_char_certainty <= unlikely_threshold) {
|
||||
ScriptPos lpos;
|
||||
YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
|
||||
nullptr, nullptr);
|
||||
if (num_leading > 0 && lpos != sp_leading) {
|
||||
num_remainder_leading = 0;
|
||||
}
|
||||
if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
|
||||
leading_certainty = first_char_certainty;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If nothing to do, bail now.
|
||||
if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (superscript_debug >= 1) {
|
||||
tprintf("Candidate for superscript detection: %s (",
|
||||
word->best_choice->unichar_string().c_str());
|
||||
if (num_leading || num_remainder_leading) {
|
||||
tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
|
||||
}
|
||||
if (num_trailing || num_remainder_trailing) {
|
||||
tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
|
||||
}
|
||||
tprintf(")\n");
|
||||
}
|
||||
if (superscript_debug >= 3) {
|
||||
word->best_choice->print();
|
||||
}
|
||||
if (superscript_debug >= 2) {
|
||||
tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty,
|
||||
unlikely_threshold);
|
||||
if (num_leading) {
|
||||
tprintf("Orig. leading (min): %.2f ", leading_certainty);
|
||||
}
|
||||
if (num_trailing) {
|
||||
tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
|
||||
// We've now calculated the number of rebuilt blobs we want to carve off.
|
||||
// However, split_word() works from TBLOBs in chopped_word, so we need to
|
||||
// convert to those.
|
||||
int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
|
||||
int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
|
||||
|
||||
int retry_leading = 0;
|
||||
int retry_trailing = 0;
|
||||
bool is_good = false;
|
||||
WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
|
||||
num_chopped_trailing, trailing_certainty, sp_trailing,
|
||||
word, &is_good, &retry_leading, &retry_trailing);
|
||||
if (is_good) {
|
||||
word->ConsumeWordResults(revised);
|
||||
} else if (retry_leading || retry_trailing) {
|
||||
int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
|
||||
int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
|
||||
WERD_RES *revised2 = TrySuperscriptSplits(
|
||||
retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
|
||||
trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
|
||||
if (is_good) {
|
||||
word->ConsumeWordResults(revised2);
|
||||
}
|
||||
delete revised2;
|
||||
}
|
||||
delete revised;
|
||||
return is_good;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine how many characters (rebuilt blobs) on each end of a given word
|
||||
* might plausibly be superscripts so SubAndSuperscriptFix can try to
|
||||
* re-recognize them. Even if we find no whole blobs at either end,
|
||||
* we will set *unlikely_threshold to a certainty that might be used to
|
||||
* select "bad enough" outlier characters. If *unlikely_threshold is set to 0,
|
||||
* though, there's really no hope.
|
||||
*
|
||||
* @param[in] word The word to examine.
|
||||
* @param[out] num_rebuilt_leading the number of rebuilt blobs at the start
|
||||
* of the word which are all up or down and
|
||||
* seem badly classified.
|
||||
* @param[out] leading_pos "super" or "sub" (for debugging)
|
||||
* @param[out] leading_certainty the worst certainty in the leading blobs.
|
||||
* @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end
|
||||
* of the word which are all up or down and
|
||||
* seem badly classified.
|
||||
* @param[out] trailing_pos "super" or "sub" (for debugging)
|
||||
* @param[out] trailing_certainty the worst certainty in the trailing blobs.
|
||||
* @param[out] avg_certainty the average certainty of "normal" blobs in
|
||||
* the word.
|
||||
* @param[out] unlikely_threshold the threshold (on certainty) we used to
|
||||
* select "bad enough" outlier characters.
|
||||
*/
|
||||
void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,
|
||||
ScriptPos *leading_pos, float *leading_certainty,
|
||||
int *num_rebuilt_trailing, ScriptPos *trailing_pos,
|
||||
float *trailing_certainty, float *avg_certainty,
|
||||
float *unlikely_threshold) {
|
||||
*avg_certainty = *unlikely_threshold = 0.0f;
|
||||
*num_rebuilt_leading = *num_rebuilt_trailing = 0;
|
||||
*leading_certainty = *trailing_certainty = 0.0f;
|
||||
|
||||
int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
|
||||
int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
|
||||
|
||||
// Step one: Get an average certainty for "normally placed" characters.
|
||||
|
||||
// Counts here are of blobs in the rebuild_word / unichars in best_choice.
|
||||
*leading_pos = *trailing_pos = SP_NORMAL;
|
||||
int leading_outliers = 0;
|
||||
int trailing_outliers = 0;
|
||||
int num_normal = 0;
|
||||
float normal_certainty_total = 0.0f;
|
||||
float worst_normal_certainty = 0.0f;
|
||||
ScriptPos last_pos = SP_NORMAL;
|
||||
int num_blobs = word->rebuild_word->NumBlobs();
|
||||
for (int b = 0; b < num_blobs; ++b) {
|
||||
TBOX box = word->rebuild_word->blobs[b]->bounding_box();
|
||||
ScriptPos pos = SP_NORMAL;
|
||||
if (box.bottom() >= super_y_bottom) {
|
||||
pos = SP_SUPERSCRIPT;
|
||||
} else if (box.top() <= sub_y_top) {
|
||||
pos = SP_SUBSCRIPT;
|
||||
}
|
||||
if (pos == SP_NORMAL) {
|
||||
if (word->best_choice->unichar_id(b) != 0) {
|
||||
float char_certainty = word->best_choice->certainty(b);
|
||||
if (char_certainty < worst_normal_certainty) {
|
||||
worst_normal_certainty = char_certainty;
|
||||
}
|
||||
num_normal++;
|
||||
normal_certainty_total += char_certainty;
|
||||
}
|
||||
if (trailing_outliers == b) {
|
||||
leading_outliers = trailing_outliers;
|
||||
*leading_pos = last_pos;
|
||||
}
|
||||
trailing_outliers = 0;
|
||||
} else {
|
||||
if (last_pos == pos) {
|
||||
trailing_outliers++;
|
||||
} else {
|
||||
trailing_outliers = 1;
|
||||
}
|
||||
}
|
||||
last_pos = pos;
|
||||
}
|
||||
*trailing_pos = last_pos;
|
||||
if (num_normal >= 3) { // throw out the worst as an outlier.
|
||||
num_normal--;
|
||||
normal_certainty_total -= worst_normal_certainty;
|
||||
}
|
||||
if (num_normal > 0) {
|
||||
*avg_certainty = normal_certainty_total / num_normal;
|
||||
*unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
|
||||
}
|
||||
if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Step two: Try to split off bits of the word that are both outliers
|
||||
// and have much lower certainty than average
|
||||
// Calculate num_leading and leading_certainty.
|
||||
for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
|
||||
(*num_rebuilt_leading)++) {
|
||||
float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
|
||||
if (char_certainty > *unlikely_threshold) {
|
||||
break;
|
||||
}
|
||||
if (char_certainty < *leading_certainty) {
|
||||
*leading_certainty = char_certainty;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate num_trailing and trailing_certainty.
|
||||
for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
|
||||
*num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
|
||||
int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
|
||||
float char_certainty = word->best_choice->certainty(blob_idx);
|
||||
if (char_certainty > *unlikely_threshold) {
|
||||
break;
|
||||
}
|
||||
if (char_certainty < *trailing_certainty) {
|
||||
*trailing_certainty = char_certainty;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try splitting off the given number of (chopped) blobs from the front and
|
||||
* back of the given word and recognizing the pieces.
|
||||
*
|
||||
* @param[in] num_chopped_leading how many chopped blobs from the left
|
||||
* end of the word to chop off and try recognizing as a
|
||||
* superscript (or subscript)
|
||||
* @param[in] leading_certainty the (minimum) certainty had by the
|
||||
* characters in the original leading section.
|
||||
* @param[in] leading_pos "super" or "sub" (for debugging)
|
||||
* @param[in] num_chopped_trailing how many chopped blobs from the right
|
||||
* end of the word to chop off and try recognizing as a
|
||||
* superscript (or subscript)
|
||||
* @param[in] trailing_certainty the (minimum) certainty had by the
|
||||
* characters in the original trailing section.
|
||||
* @param[in] trailing_pos "super" or "sub" (for debugging)
|
||||
* @param[in] word the word to try to chop up.
|
||||
* @param[out] is_good do we believe our result?
|
||||
* @param[out] retry_rebuild_leading, retry_rebuild_trailing
|
||||
* If non-zero, and !is_good, then the caller may have luck trying
|
||||
* to split the returned word with this number of (rebuilt) leading
|
||||
* and trailing blobs / unichars.
|
||||
* @return A word which is the result of re-recognizing as asked.
|
||||
*/
|
||||
WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,
|
||||
ScriptPos leading_pos, int num_chopped_trailing,
|
||||
float trailing_certainty, ScriptPos trailing_pos,
|
||||
WERD_RES *word, bool *is_good, int *retry_rebuild_leading,
|
||||
int *retry_rebuild_trailing) {
|
||||
int num_chopped = word->chopped_word->NumBlobs();
|
||||
|
||||
*retry_rebuild_leading = *retry_rebuild_trailing = 0;
|
||||
|
||||
// Chop apart the word into up to three pieces.
|
||||
|
||||
BlamerBundle *bb0 = nullptr;
|
||||
BlamerBundle *bb1 = nullptr;
|
||||
WERD_RES *prefix = nullptr;
|
||||
WERD_RES *core = nullptr;
|
||||
WERD_RES *suffix = nullptr;
|
||||
if (num_chopped_leading > 0) {
|
||||
prefix = new WERD_RES(*word);
|
||||
split_word(prefix, num_chopped_leading, &core, &bb0);
|
||||
} else {
|
||||
core = new WERD_RES(*word);
|
||||
}
|
||||
|
||||
if (num_chopped_trailing > 0) {
|
||||
int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
|
||||
split_word(core, split_pt, &suffix, &bb1);
|
||||
}
|
||||
|
||||
// Recognize the pieces in turn.
|
||||
int saved_cp_multiplier = classify_class_pruner_multiplier;
|
||||
int saved_im_multiplier = classify_integer_matcher_multiplier;
|
||||
if (prefix) {
|
||||
// Turn off Tesseract's y-position penalties for the leading superscript.
|
||||
classify_class_pruner_multiplier.set_value(0);
|
||||
classify_integer_matcher_multiplier.set_value(0);
|
||||
|
||||
// Adjust our expectations about the baseline for this prefix.
|
||||
if (superscript_debug >= 3) {
|
||||
tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
|
||||
}
|
||||
recog_word_recursive(prefix);
|
||||
if (superscript_debug >= 2) {
|
||||
tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
|
||||
prefix->best_choice->unichar_string().c_str());
|
||||
}
|
||||
|
||||
// Restore the normal y-position penalties.
|
||||
classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
|
||||
classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
|
||||
}
|
||||
|
||||
if (superscript_debug >= 3) {
|
||||
tprintf(" recognizing middle %d chopped blobs\n",
|
||||
num_chopped - num_chopped_leading - num_chopped_trailing);
|
||||
}
|
||||
|
||||
if (suffix) {
|
||||
// Turn off Tesseract's y-position penalties for the trailing superscript.
|
||||
classify_class_pruner_multiplier.set_value(0);
|
||||
classify_integer_matcher_multiplier.set_value(0);
|
||||
|
||||
if (superscript_debug >= 3) {
|
||||
tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
|
||||
}
|
||||
recog_word_recursive(suffix);
|
||||
if (superscript_debug >= 2) {
|
||||
tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
|
||||
suffix->best_choice->unichar_string().c_str());
|
||||
}
|
||||
|
||||
// Restore the normal y-position penalties.
|
||||
classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
|
||||
classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
|
||||
}
|
||||
|
||||
// Evaluate whether we think the results are believably better
|
||||
// than what we already had.
|
||||
bool good_prefix =
|
||||
!prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
|
||||
superscript_bettered_certainty * leading_certainty,
|
||||
retry_rebuild_leading, nullptr);
|
||||
bool good_suffix =
|
||||
!suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
|
||||
superscript_bettered_certainty * trailing_certainty, nullptr,
|
||||
retry_rebuild_trailing);
|
||||
|
||||
*is_good = good_prefix && good_suffix;
|
||||
if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
|
||||
// None of it is any good. Quit now.
|
||||
delete core;
|
||||
delete prefix;
|
||||
delete suffix;
|
||||
delete bb1;
|
||||
return nullptr;
|
||||
}
|
||||
recog_word_recursive(core);
|
||||
|
||||
// Now paste the results together into core.
|
||||
if (suffix) {
|
||||
suffix->SetAllScriptPositions(trailing_pos);
|
||||
join_words(core, suffix, bb1);
|
||||
}
|
||||
if (prefix) {
|
||||
prefix->SetAllScriptPositions(leading_pos);
|
||||
join_words(prefix, core, bb0);
|
||||
core = prefix;
|
||||
prefix = nullptr;
|
||||
}
|
||||
|
||||
if (superscript_debug >= 1) {
|
||||
tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
|
||||
core->best_choice->unichar_string().c_str());
|
||||
}
|
||||
return core;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether this is believable superscript or subscript text.
|
||||
*
|
||||
* We insist that:
|
||||
* + there are no punctuation marks.
|
||||
* + there are no italics.
|
||||
* + no normal-sized character is smaller than superscript_scaledown_ratio
|
||||
* of what it ought to be, and
|
||||
* + each character is at least as certain as certainty_threshold.
|
||||
*
|
||||
* @param[in] debug If true, spew debug output
|
||||
* @param[in] word The word whose best_choice we're evaluating
|
||||
* @param[in] certainty_threshold If any of the characters have less
|
||||
* certainty than this, reject.
|
||||
* @param[out] left_ok How many left-side characters were ok?
|
||||
* @param[out] right_ok How many right-side characters were ok?
|
||||
* @return Whether the complete best choice is believable as a superscript.
|
||||
*/
|
||||
bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
|
||||
int *left_ok, int *right_ok) const {
|
||||
int initial_ok_run_count = 0;
|
||||
int ok_run_count = 0;
|
||||
float worst_certainty = 0.0f;
|
||||
const WERD_CHOICE &wc = *word.best_choice;
|
||||
|
||||
const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
|
||||
for (int i = 0; i < wc.length(); i++) {
|
||||
TBLOB *blob = word.rebuild_word->blobs[i];
|
||||
UNICHAR_ID unichar_id = wc.unichar_id(i);
|
||||
float char_certainty = wc.certainty(i);
|
||||
bool bad_certainty = char_certainty < certainty_threshold;
|
||||
bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
|
||||
bool is_italic = word.fontinfo && word.fontinfo->is_italic();
|
||||
BLOB_CHOICE *choice = word.GetBlobChoice(i);
|
||||
if (choice && fontinfo_table.size() > 0) {
|
||||
// Get better information from the specific choice, if available.
|
||||
int font_id1 = choice->fontinfo_id();
|
||||
bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;
|
||||
int font_id2 = choice->fontinfo_id2();
|
||||
is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());
|
||||
}
|
||||
|
||||
float height_fraction = 1.0f;
|
||||
float char_height = blob->bounding_box().height();
|
||||
float normal_height = char_height;
|
||||
if (wc.unicharset()->top_bottom_useful()) {
|
||||
int min_bot, max_bot, min_top, max_top;
|
||||
wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
|
||||
float hi_height = max_top - max_bot;
|
||||
float lo_height = min_top - min_bot;
|
||||
normal_height = (hi_height + lo_height) / 2;
|
||||
if (normal_height >= kBlnXHeight) {
|
||||
// Only ding characters that we have decent information for because
|
||||
// they're supposed to be normal sized, not tiny specks or dashes.
|
||||
height_fraction = char_height / normal_height;
|
||||
}
|
||||
}
|
||||
bool bad_height = height_fraction < superscript_scaledown_ratio;
|
||||
|
||||
if (debug) {
|
||||
if (is_italic) {
|
||||
tprintf(" Rejecting: superscript is italic.\n");
|
||||
}
|
||||
if (is_punc) {
|
||||
tprintf(" Rejecting: punctuation present.\n");
|
||||
}
|
||||
const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
|
||||
if (bad_certainty) {
|
||||
tprintf(
|
||||
" Rejecting: don't believe character %s with certainty %.2f "
|
||||
"which is less than threshold %.2f\n",
|
||||
char_str, char_certainty, certainty_threshold);
|
||||
}
|
||||
if (bad_height) {
|
||||
tprintf(
|
||||
" Rejecting: character %s seems too small @ %.2f versus "
|
||||
"expected %.2f\n",
|
||||
char_str, char_height, normal_height);
|
||||
}
|
||||
}
|
||||
if (bad_certainty || bad_height || is_punc || is_italic) {
|
||||
if (ok_run_count == i) {
|
||||
initial_ok_run_count = ok_run_count;
|
||||
}
|
||||
ok_run_count = 0;
|
||||
} else {
|
||||
ok_run_count++;
|
||||
}
|
||||
if (char_certainty < worst_certainty) {
|
||||
worst_certainty = char_certainty;
|
||||
}
|
||||
}
|
||||
bool all_ok = ok_run_count == wc.length();
|
||||
if (all_ok && debug) {
|
||||
tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
|
||||
}
|
||||
if (!all_ok) {
|
||||
if (left_ok) {
|
||||
*left_ok = initial_ok_run_count;
|
||||
}
|
||||
if (right_ok) {
|
||||
*right_ok = ok_run_count;
|
||||
}
|
||||
}
|
||||
return all_ok;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
76
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessbox.cpp
vendored
Normal file
76
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessbox.cpp
vendored
Normal file
|
@ -0,0 +1,76 @@
|
|||
/**********************************************************************
|
||||
* File: tessbox.cpp (Formerly tessbox.c)
|
||||
* Description: Black boxed Tess for developing a resaljet.
|
||||
* Author: Ray Smith
|
||||
* Created: Thu Apr 23 11:03:36 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "mfoutline.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
/**
|
||||
* @name tess_segment_pass_n
|
||||
*
|
||||
* Segment a word using the pass_n conditions of the tess segmenter.
|
||||
* @param pass_n pass number
|
||||
* @param word word to do
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
|
||||
int saved_enable_assoc = 0;
|
||||
int saved_chop_enable = 0;
|
||||
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
saved_enable_assoc = wordrec_enable_assoc;
|
||||
saved_chop_enable = chop_enable;
|
||||
wordrec_enable_assoc.set_value(false);
|
||||
chop_enable.set_value(false);
|
||||
}
|
||||
if (pass_n == 1) {
|
||||
set_pass1();
|
||||
} else {
|
||||
set_pass2();
|
||||
}
|
||||
recog_word(word);
|
||||
if (word->best_choice == nullptr) {
|
||||
word->SetupFake(*word->uch_set);
|
||||
}
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
wordrec_enable_assoc.set_value(saved_enable_assoc);
|
||||
chop_enable.set_value(saved_chop_enable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @name tess_acceptable_word
|
||||
*
|
||||
* @return true if the word is regarded as "good enough".
|
||||
* @param word_choice after context
|
||||
* @param raw_choice before context
|
||||
*/
|
||||
bool Tesseract::tess_acceptable_word(WERD_RES *word) {
|
||||
return getDict().AcceptableResult(word);
|
||||
}
|
||||
|
||||
/**
|
||||
* @name tess_add_doc_word
|
||||
*
|
||||
* Add the given word to the document dictionary
|
||||
*/
|
||||
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
|
||||
getDict().add_document_word(*word_choice);
|
||||
}
|
||||
} // namespace tesseract
|
463
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessedit.cpp
vendored
Normal file
463
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessedit.cpp
vendored
Normal file
|
@ -0,0 +1,463 @@
|
|||
/**********************************************************************
|
||||
* File: tessedit.cpp (Formerly tessedit.c)
|
||||
* Description: (Previously) Main program for merge of tess and editor.
|
||||
* Now just code to load the language model and various
|
||||
* engine-specific data files.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include "control.h"
|
||||
#include "matchdefs.h"
|
||||
#include "pageres.h"
|
||||
#include "params.h"
|
||||
#include "stopper.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "tessvars.h"
|
||||
#include "tprintf.h"
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include "chop.h"
|
||||
# include "intmatcher.h"
|
||||
# include "reject.h"
|
||||
#endif
|
||||
#include "lstmrecognizer.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Read a "config" file containing a set of variable, value pairs.
|
||||
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
|
||||
// and also accepts a relative or absolute path name.
|
||||
void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
|
||||
std::string path = datadir;
|
||||
path += "configs/";
|
||||
path += filename;
|
||||
FILE *fp;
|
||||
if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
|
||||
fclose(fp);
|
||||
} else {
|
||||
path = datadir;
|
||||
path += "tessconfigs/";
|
||||
path += filename;
|
||||
if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
|
||||
fclose(fp);
|
||||
} else {
|
||||
path = filename;
|
||||
}
|
||||
}
|
||||
ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
|
||||
}
|
||||
|
||||
// Returns false if a unicharset file for the specified language was not found
|
||||
// or was invalid.
|
||||
// This function initializes TessdataManager. After TessdataManager is
|
||||
// no longer needed, TessdataManager::End() should be called.
|
||||
//
|
||||
// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
|
||||
// it is OEM_DEFAULT, in which case the value of the variable will be obtained
|
||||
// from the language-specific config file (stored in [lang].traineddata), from
|
||||
// the config files specified on the command line or left as the default
|
||||
// OEM_TESSERACT_ONLY if none of the configs specify this variable.
|
||||
bool Tesseract::init_tesseract_lang_data(const std::string &arg0, const std::string &textbase,
|
||||
const std::string &language, OcrEngineMode oem,
|
||||
char **configs, int configs_size,
|
||||
const std::vector<std::string> *vars_vec,
|
||||
const std::vector<std::string> *vars_values,
|
||||
bool set_only_non_debug_params, TessdataManager *mgr) {
|
||||
// Set the basename, compute the data directory.
|
||||
main_setup(arg0, textbase);
|
||||
|
||||
// Set the language data path prefix
|
||||
lang = !language.empty() ? language : "eng";
|
||||
language_data_path_prefix = datadir;
|
||||
language_data_path_prefix += lang;
|
||||
language_data_path_prefix += ".";
|
||||
|
||||
// Initialize TessdataManager.
|
||||
std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
|
||||
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
|
||||
tprintf("Error opening data file %s\n", tessdata_path.c_str());
|
||||
tprintf(
|
||||
"Please make sure the TESSDATA_PREFIX environment variable is set"
|
||||
" to your \"tessdata\" directory.\n");
|
||||
return false;
|
||||
}
|
||||
#ifdef DISABLED_LEGACY_ENGINE
|
||||
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
|
||||
#else
|
||||
if (oem == OEM_DEFAULT) {
|
||||
// Set the engine mode from availability, which can then be overridden by
|
||||
// the config file when we read it below.
|
||||
if (!mgr->IsLSTMAvailable()) {
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
|
||||
} else if (!mgr->IsBaseAvailable()) {
|
||||
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
|
||||
} else {
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
|
||||
}
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// If a language specific config file (lang.config) exists, load it in.
|
||||
TFile fp;
|
||||
if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
|
||||
ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
|
||||
}
|
||||
|
||||
SetParamConstraint set_params_constraint =
|
||||
set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
|
||||
// Load tesseract variables from config files. This is done after loading
|
||||
// language-specific variables from [lang].traineddata file, so that custom
|
||||
// config files can override values in [lang].traineddata file.
|
||||
for (int i = 0; i < configs_size; ++i) {
|
||||
read_config_file(configs[i], set_params_constraint);
|
||||
}
|
||||
|
||||
// Set params specified in vars_vec (done after setting params from config
|
||||
// files, so that params in vars_vec can override those from files).
|
||||
if (vars_vec != nullptr && vars_values != nullptr) {
|
||||
for (unsigned i = 0; i < vars_vec->size(); ++i) {
|
||||
if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
|
||||
set_params_constraint, this->params())) {
|
||||
tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!tessedit_write_params_to_file.empty()) {
|
||||
FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
|
||||
if (params_file != nullptr) {
|
||||
ParamUtils::PrintParams(params_file, this->params());
|
||||
fclose(params_file);
|
||||
} else {
|
||||
tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
// Determine which ocr engine(s) should be loaded and used for recognition.
|
||||
if (oem != OEM_DEFAULT) {
|
||||
tessedit_ocr_engine_mode.set_value(oem);
|
||||
}
|
||||
#endif
|
||||
|
||||
// If we are only loading the config file (and so not planning on doing any
|
||||
// recognition) then there's nothing else to do here.
|
||||
if (tessedit_init_config_only) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
|
||||
// which engine-specific data files need to be loaded. If LSTM_ONLY is
|
||||
// requested, the base Tesseract files are *Not* required.
|
||||
#ifdef DISABLED_LEGACY_ENGINE
|
||||
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
|
||||
#else
|
||||
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
|
||||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
|
||||
lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
|
||||
ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
|
||||
} else {
|
||||
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
|
||||
}
|
||||
}
|
||||
|
||||
// Load the unicharset
|
||||
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
|
||||
// Avoid requiring a unicharset when we aren't running base tesseract.
|
||||
unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
|
||||
}
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
|
||||
tprintf(
|
||||
"Error: Tesseract (legacy) engine requested, but components are "
|
||||
"not present in %s!!\n",
|
||||
tessdata_path.c_str());
|
||||
return false;
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
if (unicharset.size() > MAX_NUM_CLASSES) {
|
||||
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
|
||||
return false;
|
||||
}
|
||||
right_to_left_ = unicharset.major_right_to_left();
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Setup initial unichar ambigs table and read universal ambigs.
|
||||
UNICHARSET encoder_unicharset;
|
||||
encoder_unicharset.CopyFrom(unicharset);
|
||||
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
|
||||
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
|
||||
|
||||
if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
|
||||
unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
|
||||
use_ambigs_for_adaption, &unicharset);
|
||||
}
|
||||
|
||||
// Init ParamsModel.
|
||||
// Load pass1 and pass2 weights (for now these two sets are the same, but in
|
||||
// the future separate sets of weights can be generated).
|
||||
for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
|
||||
language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
|
||||
if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
|
||||
if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reports whether str_list contains an element equal to str.
// An empty list never matches, so the result is false in that case.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto candidate = str_list.begin();
  while (candidate != str_list.end()) {
    if (*candidate == str) {
      return true;
    }
    ++candidate;
  }
  return false;
}
|
||||
|
||||
// Parse a string of the form [~]<lang>[+[~]<lang>]*.
|
||||
// Langs with no prefix get appended to to_load, provided they
|
||||
// are not in there already.
|
||||
// Langs with ~ prefix get appended to not_to_load, provided they are not in
|
||||
// there already.
|
||||
void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
|
||||
std::vector<std::string> *not_to_load) {
|
||||
std::string remains(lang_str);
|
||||
while (!remains.empty()) {
|
||||
// Find the start of the lang code and which vector to add to.
|
||||
const char *start = remains.c_str();
|
||||
while (*start == '+') {
|
||||
++start;
|
||||
}
|
||||
std::vector<std::string> *target = to_load;
|
||||
if (*start == '~') {
|
||||
target = not_to_load;
|
||||
++start;
|
||||
}
|
||||
// Find the index of the end of the lang code in string start.
|
||||
int end = strlen(start);
|
||||
const char *plus = strchr(start, '+');
|
||||
if (plus != nullptr && plus - start < end) {
|
||||
end = plus - start;
|
||||
}
|
||||
std::string lang_code(start);
|
||||
lang_code.resize(end);
|
||||
std::string next(start + end);
|
||||
remains = next;
|
||||
// Check whether lang_code is already in the target vector and add.
|
||||
if (!IsStrInList(lang_code, *target)) {
|
||||
target->push_back(lang_code);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize for potentially a set of languages defined by the language
|
||||
// string and recursively any additional languages required by any language
|
||||
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
|
||||
// See init_tesseract_internal for args.
|
||||
int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
|
||||
const std::string &language, OcrEngineMode oem, char **configs,
|
||||
int configs_size, const std::vector<std::string> *vars_vec,
|
||||
const std::vector<std::string> *vars_values,
|
||||
bool set_only_non_debug_params, TessdataManager *mgr) {
|
||||
std::vector<std::string> langs_to_load;
|
||||
std::vector<std::string> langs_not_to_load;
|
||||
ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
|
||||
|
||||
for (auto *lang : sub_langs_) {
|
||||
delete lang;
|
||||
}
|
||||
sub_langs_.clear();
|
||||
// Find the first loadable lang and load into this.
|
||||
// Add any languages that this language requires
|
||||
bool loaded_primary = false;
|
||||
// Load the rest into sub_langs_.
|
||||
for (unsigned lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
|
||||
if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
|
||||
const char *lang_str = langs_to_load[lang_index].c_str();
|
||||
Tesseract *tess_to_init;
|
||||
if (!loaded_primary) {
|
||||
tess_to_init = this;
|
||||
} else {
|
||||
tess_to_init = new Tesseract;
|
||||
}
|
||||
|
||||
int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
|
||||
configs_size, vars_vec, vars_values,
|
||||
set_only_non_debug_params, mgr);
|
||||
// Forget that language, but keep any reader we were given.
|
||||
mgr->Clear();
|
||||
|
||||
if (!loaded_primary) {
|
||||
if (result < 0) {
|
||||
tprintf("Failed loading language '%s'\n", lang_str);
|
||||
} else {
|
||||
ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(), &langs_to_load,
|
||||
&langs_not_to_load);
|
||||
loaded_primary = true;
|
||||
}
|
||||
} else {
|
||||
if (result < 0) {
|
||||
tprintf("Failed loading language '%s'\n", lang_str);
|
||||
delete tess_to_init;
|
||||
} else {
|
||||
sub_langs_.push_back(tess_to_init);
|
||||
// Add any languages that this language requires
|
||||
ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(), &langs_to_load,
|
||||
&langs_not_to_load);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!loaded_primary) {
|
||||
tprintf("Tesseract couldn't load any languages!\n");
|
||||
return -1; // Couldn't load any language!
|
||||
}
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
if (!sub_langs_.empty()) {
|
||||
// In multilingual mode word ratings have to be directly comparable,
|
||||
// so use the same language model weights for all languages:
|
||||
// use the primary language's params model if
|
||||
// tessedit_use_primary_params_model is set,
|
||||
// otherwise use default language model weights.
|
||||
if (tessedit_use_primary_params_model) {
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
|
||||
}
|
||||
tprintf("Using params model of the primary language\n");
|
||||
} else {
|
||||
this->language_model_->getParamsModel().Clear();
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
sub_lang->language_model_->getParamsModel().Clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SetupUniversalFontIds();
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Common initialization for a single language.
|
||||
// arg0 is the datapath for the tessdata directory, which could be the
|
||||
// path of the tessdata directory with no trailing /, or (if tessdata
|
||||
// lives in the same directory as the executable, the path of the executable,
|
||||
// hence the name arg0.
|
||||
// textbase is an optional output file basename (used only for training)
|
||||
// language is the language code to load.
|
||||
// oem controls which engine(s) will operate on the image
|
||||
// configs (argv) is an array of config filenames to load variables from.
|
||||
// May be nullptr.
|
||||
// configs_size (argc) is the number of elements in configs.
|
||||
// vars_vec is an optional vector of variables to set.
|
||||
// vars_values is an optional corresponding vector of values for the variables
|
||||
// in vars_vec.
|
||||
// If set_only_non_debug_params is true, only params that do not contain
|
||||
// "debug" in the name will be set.
|
||||
int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
|
||||
const std::string &language, OcrEngineMode oem,
|
||||
char **configs, int configs_size,
|
||||
const std::vector<std::string> *vars_vec,
|
||||
const std::vector<std::string> *vars_values,
|
||||
bool set_only_non_debug_params, TessdataManager *mgr) {
|
||||
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec,
|
||||
vars_values, set_only_non_debug_params, mgr)) {
|
||||
return -1;
|
||||
}
|
||||
if (tessedit_init_config_only) {
|
||||
return 0;
|
||||
}
|
||||
// If only LSTM will be used, skip loading Tesseract classifier's
|
||||
// pre-trained templates and dictionary.
|
||||
bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
|
||||
program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
|
||||
return 0; // Normal exit
|
||||
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Helper builds the all_fonts table by adding new fonts from new_fonts.
|
||||
static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
|
||||
UnicityTable<FontInfo> *all_fonts) {
|
||||
for (int i = 0; i < new_fonts.size(); ++i) {
|
||||
// UnicityTable uniques as we go.
|
||||
all_fonts->push_back(new_fonts.at(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Helper assigns an id to lang_fonts using the index in all_fonts table.
|
||||
static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
|
||||
for (int i = 0; i < lang_fonts->size(); ++i) {
|
||||
int index = all_fonts.get_id(lang_fonts->at(i));
|
||||
lang_fonts->at(i).universal_id = index;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the universal_id member of each font to be unique among all
|
||||
// instances of the same font loaded.
|
||||
void Tesseract::SetupUniversalFontIds() {
|
||||
// Note that we can get away with bitwise copying FontInfo in
|
||||
// all_fonts, as it is a temporary structure and we avoid setting the
|
||||
// delete callback.
|
||||
UnicityTable<FontInfo> all_fonts;
|
||||
|
||||
// Create the universal ID table.
|
||||
CollectFonts(get_fontinfo_table(), &all_fonts);
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
|
||||
}
|
||||
// Assign ids from the table to each font table.
|
||||
AssignIds(all_fonts, &get_fontinfo_table());
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
|
||||
}
|
||||
font_table_size_ = all_fonts.size();
|
||||
}
|
||||
|
||||
// init the LM component
|
||||
int Tesseract::init_tesseract_lm(const std::string &arg0, const std::string &textbase,
|
||||
const std::string &language, TessdataManager *mgr) {
|
||||
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, nullptr, 0, nullptr,
|
||||
nullptr, false, mgr)) {
|
||||
return -1;
|
||||
}
|
||||
getDict().SetupForLoad(Dict::GlobalDawgCache());
|
||||
getDict().Load(lang, mgr);
|
||||
getDict().FinishLoad();
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Thin wrapper: forwards to end_recog() and does nothing else.
void Tesseract::end_tesseract() {
  end_recog();
}
|
||||
|
||||
/* Define command type identifiers */
|
||||
|
||||
// The enumerators take consecutive values, starting at 0.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};
|
||||
} // namespace tesseract
|
574
3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.cpp
vendored
Normal file
574
3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.cpp
vendored
Normal file
|
@ -0,0 +1,574 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tesseractclass.cpp
|
||||
// Description: The Tesseract class. It holds/owns everything needed
|
||||
// to run Tesseract on a single language, and also a set of
|
||||
// sub-Tesseracts to run sub-languages. For thread safety, *every*
|
||||
// variable that was previously global or static (except for
|
||||
// constant data, and some visual debugging flags) has been moved
|
||||
// in here, directly, or indirectly.
|
||||
// This makes it safe to run multiple Tesseracts in different
|
||||
// threads in parallel, and keeps the different language
|
||||
// instances separate.
|
||||
// Some global functions remain, but they are isolated re-entrant
|
||||
// functions that operate on their arguments. Functions that work
|
||||
// on variable data have been moved to an appropriate class based
|
||||
// mostly on the directory hierarchy. For more information see
|
||||
// slide 6 of "2ArchitectureAndDataStructures" in
|
||||
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
|
||||
// Some global data and related functions still exist in the
|
||||
// training-related code, but they don't interfere with normal
|
||||
// recognition operation.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
#include "edgblob.h"
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include "equationdetect.h"
|
||||
#endif
|
||||
#include "lstmrecognizer.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
Tesseract::Tesseract()
|
||||
: BOOL_MEMBER(tessedit_resegment_from_boxes, false,
|
||||
"Take segmentation and labeling from box file", this->params())
|
||||
, BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
|
||||
"Conversion of word/line box file to char box file", this->params())
|
||||
, BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_train_line_recognizer, false,
|
||||
"Break input into lines and remap boxes if present", this->params())
|
||||
, BOOL_MEMBER(tessedit_dump_pageseg_images, false,
|
||||
"Dump intermediate images made during page segmentation", this->params())
|
||||
, BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
|
||||
this->params())
|
||||
,
|
||||
// The default for pageseg_mode is the old behaviour, so as not to
|
||||
// upset anything that relies on that.
|
||||
INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
|
||||
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, "
|
||||
"4=column,"
|
||||
" 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
|
||||
"11=sparse_text, 12=sparse_text+osd, 13=raw_line"
|
||||
" (Values from PageSegMode enum in tesseract/publictypes.h)",
|
||||
this->params())
|
||||
, INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
|
||||
"Which OCR engine(s) to run (Tesseract, LSTM, both)."
|
||||
" Defaults to loading and running the most accurate"
|
||||
" available.",
|
||||
this->params())
|
||||
, STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize",
|
||||
this->params())
|
||||
, STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params())
|
||||
, STRING_MEMBER(tessedit_char_unblacklist, "",
|
||||
"List of chars to override tessedit_char_blacklist", this->params())
|
||||
, BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities",
|
||||
this->params())
|
||||
, INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
|
||||
"Whether to use the top-line splitting process for Devanagari "
|
||||
"documents while performing page-segmentation.",
|
||||
this->params())
|
||||
, INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT,
|
||||
"Whether to use the top-line splitting process for Devanagari "
|
||||
"documents while performing ocr.",
|
||||
this->params())
|
||||
, STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_adaption_debug, false,
|
||||
"Generate and print debug"
|
||||
" information for adaption",
|
||||
this->params())
|
||||
, INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params())
|
||||
, INT_MEMBER(applybox_debug, 1, "Debug level", this->params())
|
||||
, INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params())
|
||||
, STRING_MEMBER(applybox_exposure_pattern, ".exp",
|
||||
"Exposure value follows"
|
||||
" this pattern in the image filename. The name of the image"
|
||||
" files are expected to be in the form"
|
||||
" [lang].[fontname].exp[num].tif",
|
||||
this->params())
|
||||
, BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
|
||||
"Learn both character fragments (as is done in the"
|
||||
" special low exposure mode) as well as unfragmented"
|
||||
" characters.",
|
||||
this->params())
|
||||
, BOOL_MEMBER(applybox_learn_ngrams_mode, false,
|
||||
"Each bounding box"
|
||||
" is assumed to contain ngrams. Only learn the ngrams"
|
||||
" whose outlines overlap horizontally.",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params())
|
||||
, BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params())
|
||||
, BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params())
|
||||
, BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params())
|
||||
, BOOL_MEMBER(tessedit_unrej_any_wd, false, "Don't bother with word plausibility",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params())
|
||||
, BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())
|
||||
, INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())
|
||||
, BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())
|
||||
, BOOL_MEMBER(tessedit_enable_bigram_correction, true,
|
||||
"Enable correction based on the word bigram dictionary.", this->params())
|
||||
, BOOL_MEMBER(tessedit_enable_dict_correction, false,
|
||||
"Enable single word correction based on the dictionary.", this->params())
|
||||
, INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.",
|
||||
this->params())
|
||||
, BOOL_MEMBER(enable_noise_removal, true,
|
||||
"Remove and conditionally reassign small outlines when they"
|
||||
" confuse layout analysis, determining diacritics vs noise",
|
||||
this->params())
|
||||
, INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", this->params())
|
||||
,
|
||||
// Worst (min) certainty, for which a diacritic is allowed to make the
|
||||
// base
|
||||
// character worse and still be included.
|
||||
double_MEMBER(noise_cert_basechar, -8.0, "Hingepoint for base char certainty", this->params())
|
||||
,
|
||||
// Worst (min) certainty, for which a non-overlapping diacritic is allowed
|
||||
// to make the base character worse and still be included.
|
||||
double_MEMBER(noise_cert_disjoint, -1.0, "Hingepoint for disjoint certainty", this->params())
|
||||
,
|
||||
// Worst (min) certainty, for which a diacritic is allowed to make a new
|
||||
// stand-alone blob.
|
||||
double_MEMBER(noise_cert_punc, -3.0, "Threshold for new punc char certainty", this->params())
|
||||
,
|
||||
// Factor of certainty margin for adding diacritics to not count as worse.
|
||||
double_MEMBER(noise_cert_factor, 0.375, "Scaling on certainty diff from Hingepoint",
|
||||
this->params())
|
||||
, INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", this->params())
|
||||
, INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", this->params())
|
||||
, INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params())
|
||||
, STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params())
|
||||
, STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params())
|
||||
, STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params())
|
||||
, double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params())
|
||||
, double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params())
|
||||
, double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit",
|
||||
this->params())
|
||||
, double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params())
|
||||
, INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params())
|
||||
, INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params())
|
||||
, BOOL_MEMBER(test_pt, false, "Test for point", this->params())
|
||||
, double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params())
|
||||
, double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params())
|
||||
, INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", this->params())
|
||||
, INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params())
|
||||
, BOOL_MEMBER(paragraph_text_based, true,
|
||||
"Run paragraph detection on the post-text-recognition "
|
||||
"(more accurate)",
|
||||
this->params())
|
||||
, BOOL_MEMBER(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm", this->params())
|
||||
, STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params())
|
||||
, STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params())
|
||||
, BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params())
|
||||
, double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc",
|
||||
this->params())
|
||||
, double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block",
|
||||
this->params())
|
||||
, double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row",
|
||||
this->params())
|
||||
, double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
|
||||
"Number of row rejects in whole word rejects"
|
||||
" which prevents whole row rejection",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
|
||||
"Only rej partially rejected words in block rejection", this->params())
|
||||
, BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
|
||||
"Only rej partially rejected words in row rejection", this->params())
|
||||
, BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric",
|
||||
this->params())
|
||||
, INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs",
|
||||
this->params())
|
||||
, double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
|
||||
"rej good doc wd if more than this fraction rejected", this->params())
|
||||
, BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params())
|
||||
, BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params())
|
||||
, BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file",
|
||||
this->params())
|
||||
, BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params())
|
||||
, double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params())
|
||||
, BOOL_MEMBER(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch", this->params())
|
||||
, BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
|
||||
, BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
|
||||
this->params())
|
||||
, BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
|
||||
, BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
|
||||
, double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
|
||||
, BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params())
|
||||
, double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params())
|
||||
, double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params())
|
||||
, double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", this->params())
|
||||
, double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params())
|
||||
, double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params())
|
||||
, double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params())
|
||||
, double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params())
|
||||
, double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params())
|
||||
, double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params())
|
||||
, double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params())
|
||||
, double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params())
|
||||
, double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params())
|
||||
, INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params())
|
||||
, INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params())
|
||||
, BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params())
|
||||
, BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params())
|
||||
, BOOL_MEMBER(crunch_leave_accept_strings, false, "Don't pot crunch sensible strings",
|
||||
this->params())
|
||||
, BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params())
|
||||
, INT_MEMBER(crunch_leave_lc_strings, 4, "Don't crunch words with long lower case strings",
|
||||
this->params())
|
||||
, INT_MEMBER(crunch_leave_uc_strings, 4, "Don't crunch words with long lower case strings",
|
||||
this->params())
|
||||
, INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params())
|
||||
, INT_MEMBER(crunch_debug, 0, "As it says", this->params())
|
||||
, INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params())
|
||||
, double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", this->params())
|
||||
, BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctuation joins", this->params())
|
||||
, INT_MEMBER(fixsp_done_mode, 1, "What constitutes done for spacing", this->params())
|
||||
, INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params())
|
||||
, STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params())
|
||||
, INT_MEMBER(x_ht_acceptance_tolerance, 8,
|
||||
"Max allowed deviation of blob top outside of font data", this->params())
|
||||
, INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params())
|
||||
, INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params())
|
||||
, double_MEMBER(superscript_worse_certainty, 2.0,
|
||||
"How many times worse "
|
||||
"certainty does a superscript position glyph need to be for "
|
||||
"us to try classifying it as a char with a different "
|
||||
"baseline?",
|
||||
this->params())
|
||||
, double_MEMBER(superscript_bettered_certainty, 0.97,
|
||||
"What reduction in "
|
||||
"badness do we think sufficient to choose a superscript "
|
||||
"over what we'd thought. For example, a value of 0.6 means "
|
||||
"we want to reduce badness of certainty by at least 40%",
|
||||
this->params())
|
||||
, double_MEMBER(superscript_scaledown_ratio, 0.4,
|
||||
"A superscript scaled down more than this is unbelievably "
|
||||
"small. For example, 0.3 means we expect the font size to "
|
||||
"be no smaller than 30% of the text line font size.",
|
||||
this->params())
|
||||
, double_MEMBER(subscript_max_y_top, 0.5,
|
||||
"Maximum top of a character measured as a multiple of "
|
||||
"x-height above the baseline for us to reconsider whether "
|
||||
"it's a subscript.",
|
||||
this->params())
|
||||
, double_MEMBER(superscript_min_y_bottom, 0.3,
|
||||
"Minimum bottom of a character measured as a multiple of "
|
||||
"x-height above the baseline for us to reconsider whether "
|
||||
"it's a superscript.",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params())
|
||||
, BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params())
|
||||
, BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
|
||||
this->params())
|
||||
, INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
|
||||
, INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
|
||||
, INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
|
||||
this->params())
|
||||
, STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params())
|
||||
, INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params())
|
||||
, INT_MEMBER(suspect_short_words, 2, "Don't suspect dict wds longer than this", this->params())
|
||||
, BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params())
|
||||
, double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params())
|
||||
, double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params())
|
||||
, BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params())
|
||||
, BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params())
|
||||
, BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Don't reject ANYTHING AT ALL",
|
||||
this->params())
|
||||
, INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params())
|
||||
, BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params())
|
||||
, BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params())
|
||||
, double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params())
|
||||
, double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params())
|
||||
, BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params())
|
||||
, BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params())
|
||||
, BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params())
|
||||
, BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params())
|
||||
, BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params())
|
||||
, BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params())
|
||||
, BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params())
|
||||
, BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params())
|
||||
, double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params())
|
||||
, INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params())
|
||||
, STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params())
|
||||
, STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params())
|
||||
, INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params())
|
||||
, INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages, else specific page to process",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params())
|
||||
, BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", this->params())
|
||||
, STRING_MEMBER(file_type, ".tif", "Filename extension", this->params())
|
||||
, BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params())
|
||||
, STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_use_primary_params_model, false,
|
||||
"In multilingual mode use params model of the"
|
||||
" primary language",
|
||||
this->params())
|
||||
, double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin",
|
||||
this->params())
|
||||
, BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
|
||||
, BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
|
||||
, BOOL_MEMBER(poly_allow_detailed_fx, false,
|
||||
"Allow feature extractors to see the original outline", this->params())
|
||||
, BOOL_INIT_MEMBER(tessedit_init_config_only, false,
|
||||
"Only initialize with the config file. Useful if the "
|
||||
"instance is not going to be used for OCR but say only "
|
||||
"for layout analysis.",
|
||||
this->params())
|
||||
, BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params())
|
||||
, BOOL_MEMBER(textord_tabfind_vertical_text, true, "Enable vertical detection", this->params())
|
||||
, BOOL_MEMBER(textord_tabfind_force_vertical_text, false, "Force using vertical text page mode",
|
||||
this->params())
|
||||
, double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
|
||||
"Fraction of textlines deemed vertical to use vertical page "
|
||||
"mode",
|
||||
this->params())
|
||||
, double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
|
||||
"Fraction of height used as a minimum gap for aligned blobs.", this->params())
|
||||
, INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params())
|
||||
, BOOL_MEMBER(preserve_interword_spaces, false, "Preserve multiple interword spaces",
|
||||
this->params())
|
||||
, STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)",
|
||||
this->params())
|
||||
, INT_MEMBER(lstm_choice_mode, 0,
|
||||
"Allows to include alternative symbols choices in the hOCR output. "
|
||||
"Valid input values are 0, 1 and 2. 0 is the default value. "
|
||||
"With 1 the alternative symbol choices per timestep are included. "
|
||||
"With 2 alternative symbol choices are extracted from the CTC "
|
||||
"process instead of the lattice. The choices are mapped per "
|
||||
"character.",
|
||||
this->params())
|
||||
, INT_MEMBER(lstm_choice_iterations, 5,
|
||||
"Sets the number of cascading iterations for the Beamsearch in "
|
||||
"lstm_choice_mode. Note that lstm_choice_mode must be set to a "
|
||||
"value greater than 0 to produce results.",
|
||||
this->params())
|
||||
, double_MEMBER(lstm_rating_coefficient, 5,
|
||||
"Sets the rating coefficient for the lstm choices. The smaller the "
|
||||
"coefficient, the better are the ratings for each choice and less "
|
||||
"information is lost due to the cut off at 0. The standard value is "
|
||||
"5",
|
||||
this->params())
|
||||
, BOOL_MEMBER(pageseg_apply_music_mask, true,
|
||||
"Detect music staff and remove intersecting components", this->params())
|
||||
,
|
||||
|
||||
backup_config_file_(nullptr)
|
||||
, pix_binary_(nullptr)
|
||||
, pix_grey_(nullptr)
|
||||
, pix_original_(nullptr)
|
||||
, pix_thresholds_(nullptr)
|
||||
, source_resolution_(0)
|
||||
, textord_(this)
|
||||
, right_to_left_(false)
|
||||
, scaled_color_(nullptr)
|
||||
, scaled_factor_(-1)
|
||||
, deskew_(1.0f, 0.0f)
|
||||
, reskew_(1.0f, 0.0f)
|
||||
, most_recently_used_(this)
|
||||
, font_table_size_(0)
|
||||
, equ_detect_(nullptr)
|
||||
, lstm_recognizer_(nullptr)
|
||||
, train_line_page_num_(0) {}
|
||||
|
||||
// Destructor. Clears per-image state, destroys the original image, calls
// end_tesseract(), then deletes every sub-language instance and the LSTM
// recognizer owned by this object.
Tesseract::~Tesseract() {
  Clear();
  pix_original_.destroy();
  end_tesseract();

  // The sub-language instances are deleted here, one by one.
  for (Tesseract *sub : sub_langs_) {
    delete sub;
  }

  delete lstm_recognizer_;
  lstm_recognizer_ = nullptr;
}
|
||||
|
||||
// Returns the dictionary to use for this instance.
// The LSTM recognizer's dictionary is returned only when all of these hold:
// the Classify dictionary reports zero dawgs, AnyLSTMLang() is true, and an
// LSTM recognizer with a non-null dictionary exists. Otherwise the Classify
// dictionary is returned.
Dict &Tesseract::getDict() {
  const bool use_lstm_dict = Classify::getDict().NumDawgs() == 0 && AnyLSTMLang() &&
                             lstm_recognizer_ && lstm_recognizer_->GetDict();
  return use_lstm_dict ? *lstm_recognizer_->GetDict() : Classify::getDict();
}
|
||||
|
||||
void Tesseract::Clear() {
|
||||
std::string debug_name = imagebasename + "_debug.pdf";
|
||||
pixa_debug_.WritePDF(debug_name.c_str());
|
||||
pix_binary_.destroy();
|
||||
pix_grey_.destroy();
|
||||
pix_thresholds_.destroy();
|
||||
scaled_color_.destroy();
|
||||
deskew_ = FCOORD(1.0f, 0.0f);
|
||||
reskew_ = FCOORD(1.0f, 0.0f);
|
||||
splitter_.Clear();
|
||||
scaled_factor_ = -1;
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
sub_lang->Clear();
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Stores the equation detector to use and passes this instance to it via
// SetLangTesseract().
//
// detector: the detector to install. A nullptr simply clears the stored
//           detector; previously a nullptr argument was dereferenced.
void Tesseract::SetEquationDetect(EquationDetect *detector) {
  equ_detect_ = detector;
  if (equ_detect_ != nullptr) {
    equ_detect_->SetLangTesseract(this);
  }
}
|
||||
|
||||
// Clear all memory of adaption for this and all subclassifiers.
|
||||
void Tesseract::ResetAdaptiveClassifier() {
|
||||
ResetAdaptiveClassifierInternal();
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
sub_lang->ResetAdaptiveClassifierInternal();
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Clear the document dictionary for this and all subclassifiers.
|
||||
void Tesseract::ResetDocumentDictionary() {
|
||||
getDict().ResetDocumentDictionary();
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
sub_lang->getDict().ResetDocumentDictionary();
|
||||
}
|
||||
}
|
||||
|
||||
void Tesseract::SetBlackAndWhitelist() {
|
||||
// Set the white and blacklists (if any)
|
||||
unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
|
||||
tessedit_char_whitelist.c_str(),
|
||||
tessedit_char_unblacklist.c_str());
|
||||
if (lstm_recognizer_) {
|
||||
UNICHARSET &lstm_unicharset = lstm_recognizer_->GetUnicharset();
|
||||
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
|
||||
tessedit_char_whitelist.c_str(),
|
||||
tessedit_char_unblacklist.c_str());
|
||||
}
|
||||
// Black and white lists should apply to all loaded classifiers.
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
sub_lang->unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
|
||||
tessedit_char_whitelist.c_str(),
|
||||
tessedit_char_unblacklist.c_str());
|
||||
if (sub_lang->lstm_recognizer_) {
|
||||
UNICHARSET &lstm_unicharset = sub_lang->lstm_recognizer_->GetUnicharset();
|
||||
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
|
||||
tessedit_char_whitelist.c_str(),
|
||||
tessedit_char_unblacklist.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform steps to prepare underlying binary image/other data structures for
|
||||
// page segmentation.
|
||||
void Tesseract::PrepareForPageseg() {
|
||||
textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
|
||||
// Find the max splitter strategy over all langs.
|
||||
auto max_pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
|
||||
static_cast<int32_t>(pageseg_devanagari_split_strategy));
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
auto pageseg_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
|
||||
static_cast<int32_t>(sub_lang->pageseg_devanagari_split_strategy));
|
||||
if (pageseg_strategy > max_pageseg_strategy) {
|
||||
max_pageseg_strategy = pageseg_strategy;
|
||||
}
|
||||
sub_lang->pix_binary_.destroy();
|
||||
sub_lang->pix_binary_ = pix_binary().clone();
|
||||
}
|
||||
// Perform shiro-rekha (top-line) splitting and replace the current image by
|
||||
// the newly split image.
|
||||
splitter_.set_orig_pix(pix_binary());
|
||||
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
||||
if (splitter_.Split(true, &pixa_debug_)) {
|
||||
ASSERT_HOST(splitter_.splitted_image());
|
||||
pix_binary_.destroy();
|
||||
pix_binary_ = splitter_.splitted_image().clone();
|
||||
}
|
||||
}
|
||||
|
||||
// Perform steps to prepare underlying binary image/other data structures for
|
||||
// OCR. The current segmentation is required by this method.
|
||||
// Note that this method resets pix_binary_ to the original binarized image,
|
||||
// which may be different from the image actually used for OCR depending on the
|
||||
// value of devanagari_ocr_split_strategy.
|
||||
void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) {
|
||||
// Find the max splitter strategy over all langs.
|
||||
auto max_ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
|
||||
static_cast<int32_t>(ocr_devanagari_split_strategy));
|
||||
for (auto &sub_lang : sub_langs_) {
|
||||
auto ocr_strategy = static_cast<ShiroRekhaSplitter::SplitStrategy>(
|
||||
static_cast<int32_t>(sub_lang->ocr_devanagari_split_strategy));
|
||||
if (ocr_strategy > max_ocr_strategy) {
|
||||
max_ocr_strategy = ocr_strategy;
|
||||
}
|
||||
}
|
||||
// Utilize the segmentation information available.
|
||||
splitter_.set_segmentation_block_list(block_list);
|
||||
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
||||
// Run the splitter for OCR
|
||||
bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
||||
// Restore pix_binary to the binarized original pix for future reference.
|
||||
ASSERT_HOST(splitter_.orig_pix());
|
||||
pix_binary_.destroy();
|
||||
pix_binary_ = splitter_.orig_pix().clone();
|
||||
// If the pageseg and ocr strategies are different, refresh the block list
|
||||
// (from the last SegmentImage call) with blobs from the real image to be used
|
||||
// for OCR.
|
||||
if (splitter_.HasDifferentSplitStrategies()) {
|
||||
BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_), pixGetHeight(pix_binary_));
|
||||
Image pix_for_ocr = split_for_ocr ? splitter_.splitted_image() : splitter_.orig_pix();
|
||||
extract_edges(pix_for_ocr, &block);
|
||||
splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
|
||||
}
|
||||
// The splitter isn't needed any more after this, so save memory by clearing.
|
||||
splitter_.Clear();
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
1081
3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.h
vendored
Normal file
1081
3rdparty/tesseract_ocr/tesseract/src/ccmain/tesseractclass.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
24
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.cpp
vendored
Normal file
24
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.cpp
vendored
Normal file
|
@ -0,0 +1,24 @@
|
|||
/**********************************************************************
|
||||
* File: tessvars.cpp (Formerly tessvars.c)
|
||||
* Description: Variables and other globals for tessedit.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Apr 13 13:13:23 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include "tessvars.h"
|
||||
|
||||
FILE *debug_fp = stderr; // Stream that debug output is written to; starts as stderr.
|
27
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.h
vendored
Normal file
27
3rdparty/tesseract_ocr/tesseract/src/ccmain/tessvars.h
vendored
Normal file
|
@ -0,0 +1,27 @@
|
|||
/**********************************************************************
|
||||
* File: tessvars.h (Formerly tessvars.h)
|
||||
* Description: Variables and other globals for tessedit.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Apr 13 13:13:23 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSVARS_H
|
||||
#define TESSVARS_H
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
extern FILE *debug_fp; // Stream that debug output is written to (defined in tessvars.cpp).
|
||||
|
||||
#endif
|
306
3rdparty/tesseract_ocr/tesseract/src/ccmain/tfacepp.cpp
vendored
Normal file
306
3rdparty/tesseract_ocr/tesseract/src/ccmain/tfacepp.cpp
vendored
Normal file
|
@ -0,0 +1,306 @@
|
|||
/**********************************************************************
|
||||
* File: tfacepp.cpp (Formerly tface++.c)
|
||||
* Description: C++ side of the C/C++ Tess/Editor interface.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "blamer.h"
|
||||
#include "errcode.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "werd.h"
|
||||
|
||||
// A word whose chopped form has more blobs than this is split into pieces
// before recognition (see recog_word_recursive).
#define MAX_UNDIVIDED_LENGTH 24
|
||||
|
||||
/**********************************************************************
|
||||
* recog_word
|
||||
*
|
||||
* Convert the word to tess form and pass it to the tess segmenter.
|
||||
* Convert the output back to editor form.
|
||||
**********************************************************************/
|
||||
namespace tesseract {
|
||||
// Recognizes one word and validates the result.
// Steps:
//  1. If wordrec_skip_no_truth_words is set and the word has no blamer bundle
//     or its reason is IRR_NO_TRUTH, mark it tess_failed and return.
//  2. Run recog_word_recursive() and build the box word.
//  3. Assert that best_choice and box_word have equal length and that all
//     segmentation states are valid for the ratings matrix.
//  4. If tessedit_override_permuter is set, replace a non-dawg permuter with
//     the permuter reported by dict_word() when that one is a dawg permuter
//     and the word contains at least one alpha.
//  5. Set tess_failed, and reject the whole word, when the choice is missing,
//     empty or consists of spaces only.
void Tesseract::recog_word(WERD_RES *word) {
  if (wordrec_skip_no_truth_words) {
    auto *bundle = word->blamer_bundle;
    if (bundle == nullptr || bundle->incorrect_result_reason() == IRR_NO_TRUTH) {
      if (classify_debug_level) {
        tprintf("No truth for word - skipping\n");
      }
      word->tess_failed = true;
      return;
    }
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // Report the mismatch before the assertion below aborts.
  if (word->best_choice->length() != word->box_word->length()) {
    tprintf(
        "recog_word ASSERT FAIL String:\"%s\"; "
        "Strlen=%d; #Blobs=%d\n",
        word->best_choice->debug_string().c_str(), word->best_choice->length(),
        word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // The ratings matrix size must match the sum of all segmentation states.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, nullptr);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    // True for the three dictionary (dawg) permuter types.
    const auto is_dawg_perm = [](uint8_t perm) {
      return perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == USER_DAWG_PERM;
    };

    auto &choice = *word->best_choice;
    const uint8_t perm_type = choice.permuter();
    if (!is_dawg_perm(perm_type)) {
      // A straight dictionary check may disagree with the recorded permuter.
      const uint8_t dict_perm_type = dict_word(choice);
      if (is_dawg_perm(dict_perm_type) &&
          alpha_count(choice.unichar_string().c_str(), choice.unichar_lengths().c_str()) > 0) {
        choice.set_permuter(dict_perm_type);  // use dict perm
      }
    }
    if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
  // best_choice has already been dereferenced above, so the nullptr test in
  // this condition is purely defensive.
  const bool recognition_failed =
      word->best_choice == nullptr || word->best_choice->empty() ||
      static_cast<int>(strspn(word->best_choice->unichar_string().c_str(), " ")) ==
          word->best_choice->length();
  word->tess_failed = recognition_failed;
  if (recognition_failed) {
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  }
}
|
||||
|
||||
/**********************************************************************
|
||||
* recog_word_recursive
|
||||
*
|
||||
* Convert the word to tess form and pass it to the tess segmenter.
|
||||
* Convert the output back to editor form.
|
||||
**********************************************************************/
|
||||
void Tesseract::recog_word_recursive(WERD_RES *word) {
  // Words with more blobs than MAX_UNDIVIDED_LENGTH are handed to the
  // split/recognize/join path instead of being recognized in one go.
  if (word->chopped_word->NumBlobs() > MAX_UNDIVIDED_LENGTH) {
    split_and_recog_word(word);
    return;
  }
  cc_recog(word);
  const int num_output_blobs = word->rebuild_word->NumBlobs(); // blobs in the output

  // Sanity checks and minor fixes on best_choice. The two checks are
  // deliberately sequential: the second one re-reads the length after the
  // first one has called make_bad().
  auto *best = word->best_choice;
  if (best->length() > num_output_blobs) {
    best->make_bad(); // should never happen
    tprintf(
        "recog_word: Discarded long string \"%s\""
        " (%d characters vs %d blobs)\n",
        best->unichar_string().c_str(), best->length(), num_output_blobs);
    tprintf("Word is at:");
    word->word->bounding_box().print();
  }
  if (best->length() < num_output_blobs) {
    // Pad with spaces until there is one unichar per output blob.
    const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
    do {
      best->append_unichar_id(space_id, 1, 0.0, best->certainty());
    } while (best->length() < num_output_blobs);
  }
}
|
||||
|
||||
/**********************************************************************
|
||||
* split_and_recog_word
|
||||
*
|
||||
* Split the word into 2 smaller pieces at the largest gap.
|
||||
* Recognize the pieces and stick the results back together.
|
||||
**********************************************************************/
|
||||
void Tesseract::split_and_recog_word(WERD_RES *word) {
  // Locate the widest horizontal gap between neighbouring blobs of the
  // chopped word. On ties the earliest gap wins (strict '>' comparison).
  TWERD *chopped = word->chopped_word;
  int widest_gap = -INT32_MAX;
  int split_index = 0;
  for (int right = 1; right < chopped->NumBlobs(); ++right) {
    const TBOX left_box = chopped->blobs[right - 1]->bounding_box();
    const TBOX right_box = chopped->blobs[right]->bounding_box();
    const int gap = right_box.left() - left_box.right();
    if (gap > widest_gap) {
      widest_gap = gap;
      split_index = right;
    }
  }
  ASSERT_HOST(split_index > 0);

  // Split in place: 'word' keeps the left part, 'right_half' receives the rest.
  WERD_RES *right_half = nullptr;
  BlamerBundle *saved_bundle = nullptr;
  split_word(word, split_index, &right_half, &saved_bundle);

  // Recognize each part (either may be split again), then stitch them together.
  recog_word_recursive(word);
  recog_word_recursive(right_half);

  join_words(word, right_half, saved_bundle);
}
|
||||
|
||||
/**********************************************************************
|
||||
* split_word
|
||||
*
|
||||
* Split a given WERD_RES in place into two smaller words for recognition.
|
||||
* split_pt is the index of the first blob to go in the second word.
|
||||
* The underlying word is left alone, only the TWERD (and subsequent data)
|
||||
* are split up. orig_blamer_bundle is set to the original blamer bundle,
|
||||
* and will now be owned by the caller. New blamer bundles are forged for the
|
||||
* two pieces.
|
||||
**********************************************************************/
|
||||
void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
                           BlamerBundle **orig_blamer_bundle) const {
  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());

  // Keep a copy of the blamer bundle so it can be handed back to the caller
  // and used to build the bundles of the two pieces below.
  BlamerBundle *saved_bundle = nullptr;
  if (word->blamer_bundle) {
    saved_bundle = new BlamerBundle(*word->blamer_bundle);
  }

  auto *right_word = new WERD_RES(*word);

  // Blow away the copied chopped_word, as we want to work with the blobs
  // from the input chopped_word so seam_arrays can be merged.
  TWERD *left_chopped = word->chopped_word;
  auto *right_chopped = new TWERD;
  // Blobs [split_pt, end) go to the right piece; the left piece is truncated.
  right_chopped->blobs.assign(left_chopped->blobs.begin() + split_pt, left_chopped->blobs.end());
  left_chopped->blobs.resize(split_pt);
  // Detach the chopped words while results are cleared, then reattach them.
  word->chopped_word = nullptr;
  delete right_word->chopped_word;
  right_word->chopped_word = nullptr;

  // Fetched before ClearResults() is called on the word.
  const UNICHARSET &word_unicharset = *word->uch_set;
  word->ClearResults();
  right_word->ClearResults();
  word->chopped_word = left_chopped;
  right_word->chopped_word = right_chopped;
  word->SetupBasicsFromChoppedWord(word_unicharset);
  right_word->SetupBasicsFromChoppedWord(word_unicharset);

  // Try to adjust the blamer bundle.
  if (saved_bundle != nullptr) {
    // TODO(rays) Looks like a leak to me.
    // orig_bb should take, rather than copy.
    word->blamer_bundle = new BlamerBundle();
    right_word->blamer_bundle = new BlamerBundle();
    const auto left_edge_of_gap = left_chopped->blobs.back()->bounding_box().right();
    const auto right_edge_of_gap = right_word->chopped_word->blobs[0]->bounding_box().left();
    saved_bundle->SplitBundle(left_edge_of_gap, right_edge_of_gap, wordrec_debug_blamer,
                              word->blamer_bundle, right_word->blamer_bundle);
  }

  *right_piece = right_word;
  *orig_blamer_bundle = saved_bundle;
}
|
||||
|
||||
/**********************************************************************
|
||||
* join_words
|
||||
*
|
||||
* The opposite of split_word():
|
||||
* join word2 (including any recognized data / seam array / etc)
|
||||
* onto the right of word and then delete word2.
|
||||
* Also, if orig_bb is provided, stitch it back into word.
|
||||
**********************************************************************/
|
||||
void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {
  // Appends every element of src to the end of dst.
  const auto append_all = [](auto &dst, const auto &src) {
    dst.insert(dst.end(), src.begin(), src.end());
  };

  // Boxes on either side of the join, read before the blob lists are merged.
  const TBOX left_box = word->chopped_word->blobs.back()->bounding_box();
  const TBOX right_box = word2->chopped_word->blobs[0]->bounding_box();

  // Tack the word2 outputs onto the end of the word outputs, then empty
  // word2's lists so the blobs are referenced from 'word' only.
  append_all(word->chopped_word->blobs, word2->chopped_word->blobs);
  append_all(word->rebuild_word->blobs, word2->rebuild_word->blobs);
  word2->chopped_word->blobs.clear();
  word2->rebuild_word->blobs.clear();

  // Location of the join: midway across the gap, at the mean of the four
  // vertical box edges.
  TPOINT split_pt;
  split_pt.x = (left_box.right() + right_box.left()) / 2;
  split_pt.y = (left_box.top() + left_box.bottom() + right_box.top() + right_box.bottom()) / 4;

  // Move the word2 seams onto the end of the word1 seam_array.
  // Since the seam list is one element short, an empty seam marking the
  // end of the last blob in the first word is needed first.
  word->seam_array.push_back(new SEAM(0.0f, split_pt));
  append_all(word->seam_array, word2->seam_array);
  word2->seam_array.clear();

  // Fix widths and gaps.
  append_all(word->blob_widths, word2->blob_widths);
  append_all(word->blob_gaps, word2->blob_gaps);

  // Fix the ratings matrix: the joined matrix must span both pieces.
  const int left_dim = word->ratings->dimension();
  const int right_dim = word2->ratings->dimension();
  word->ratings->AttachOnCorner(word2->ratings);
  ASSERT_HOST(word->ratings->dimension() == left_dim + right_dim);
  append_all(word->best_state, word2->best_state);

  // Append the word choices.
  *word->raw_choice += *word2->raw_choice;

  // How many alt choices from each should we try to get?
  constexpr int kAltsPerPiece = 2;
  // When do we start throwing away extra alt choices?
  constexpr int kTooManyAltChoices = 100;

  // Construct the cartesian product of the best_choices of word(1) and word2.
  WERD_CHOICE_LIST joined_choices;
  WERD_CHOICE_IT joined_it(&joined_choices);
  WERD_CHOICE_IT left_it(&word->best_choices);
  WERD_CHOICE_IT right_it(&word2->best_choices);
  const int num_left_choices = word->best_choices.length();
  int total_joined_choices = num_left_choices;

  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
  // word2 choices, and put them in the joined_choices list. The 1st word2
  // choice gets added to the original word1 choices in-place after we have
  // finished with them.
  right_it.forward();
  for (int right_index = 1; !right_it.at_first(); right_it.forward(), ++right_index) {
    if (total_joined_choices >= kTooManyAltChoices && right_index > kAltsPerPiece) {
      break;
    }
    left_it.move_to_first();
    for (int left_index = 0; left_index < num_left_choices; ++left_index, left_it.forward()) {
      if (total_joined_choices >= kTooManyAltChoices && left_index > kAltsPerPiece) {
        break;
      }
      auto *combined = new WERD_CHOICE(*left_it.data());
      *combined += *right_it.data();
      joined_it.add_after_then_move(combined);
      ++total_joined_choices;
    }
  }

  // Now that we've filled in as many alternates as we want, paste the best
  // choice for word2 onto the original word alt_choices.
  left_it.move_to_first();
  right_it.move_to_first();
  for (left_it.mark_cycle_pt(); !left_it.cycled_list(); left_it.forward()) {
    *left_it.data() += *right_it.data();
  }
  left_it.move_to_last();
  left_it.add_list_after(&joined_choices);

  // Restore the pointer to original blamer bundle and combine blamer
  // information recorded in the splits.
  if (orig_bb != nullptr) {
    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
    delete word->blamer_bundle;
    word->blamer_bundle = orig_bb;
  }

  word->SetupBoxWord();
  word->reject_map.initialise(word->box_word->length());
  delete word2;
}
|
||||
|
||||
} // namespace tesseract
|
331
3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.cpp
vendored
Normal file
331
3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.cpp
vendored
Normal file
|
@ -0,0 +1,331 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: thresholder.cpp
|
||||
// Description: Base API for thresholding images in tesseract.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <allheaders.h>
|
||||
|
||||
#include <cstdint> // for uint32_t
|
||||
#include <cstring>
|
||||
|
||||
#include "otsuthr.h"
|
||||
#include "thresholder.h"
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
# include "openclwrapper.h" // for OpenclDevice
|
||||
#endif
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Starts with no image, unit scale, 300 ppi for both source and estimated
// resolution, and an all-zero processing rectangle.
ImageThresholder::ImageThresholder()
    : pix_(nullptr),
      image_width_(0),
      image_height_(0),
      pix_channels_(0),
      pix_wpl_(0),
      scale_(1),
      yres_(300),
      estimated_res_(300),
      // Same values SetRectangle(0, 0, 0, 0) would store; listed in member
      // declaration order.
      rect_left_(0),
      rect_top_(0),
      rect_width_(0),
      rect_height_(0) {}
|
||||
|
||||
ImageThresholder::~ImageThresholder() {
  // Inside a destructor a virtual call already resolves to this class's
  // version; the qualification just makes that explicit.
  ImageThresholder::Clear();
}
|
||||
|
||||
// Destroy the Pix if there is one, freeing memory.
|
||||
void ImageThresholder::Clear() {
  // Release the held image; IsEmpty() tests pix_ against nullptr.
  pix_.destroy();
}
|
||||
|
||||
// Return true if no image has been set.
|
||||
bool ImageThresholder::IsEmpty() const {
|
||||
return pix_ == nullptr;
|
||||
}
|
||||
|
||||
// SetImage makes a copy of all the image data, so it may be deleted
|
||||
// immediately after this call.
|
||||
// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
|
||||
// Palette color images will not work properly and must be converted to
|
||||
// 24 bit.
|
||||
// Binary images of 1 bit per pixel may also be given but they must be
|
||||
// byte packed with the MSB of the first byte being the first pixel, and a
|
||||
// one pixel is WHITE. For binary images set bytes_per_pixel=0.
|
||||
void ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height,
|
||||
int bytes_per_pixel, int bytes_per_line) {
|
||||
int bpp = bytes_per_pixel * 8;
|
||||
if (bpp == 0) {
|
||||
bpp = 1;
|
||||
}
|
||||
Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
|
||||
l_uint32 *data = pixGetData(pix);
|
||||
int wpl = pixGetWpl(pix);
|
||||
switch (bpp) {
|
||||
case 1:
|
||||
for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
if (imagedata[x / 8] & (0x80 >> (x % 8))) {
|
||||
CLEAR_DATA_BIT(data, x);
|
||||
} else {
|
||||
SET_DATA_BIT(data, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 8:
|
||||
// Greyscale just copies the bytes in the right order.
|
||||
for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
SET_DATA_BYTE(data, x, imagedata[x]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 24:
|
||||
// Put the colors in the correct places in the line buffer.
|
||||
for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
|
||||
for (int x = 0; x < width; ++x, ++data) {
|
||||
SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
|
||||
SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
|
||||
SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case 32:
|
||||
// Maintain byte order consistency across different endianness.
|
||||
for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
|
||||
(imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
|
||||
}
|
||||
SetImage(pix);
|
||||
pix.destroy();
|
||||
}
|
||||
|
||||
// Store the coordinates of the rectangle to process for later use.
|
||||
// Doesn't actually do any thresholding.
|
||||
void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
  // Only records the region; no validation or thresholding happens here.
  rect_height_ = height;
  rect_width_ = width;
  rect_top_ = top;
  rect_left_ = left;
}
|
||||
|
||||
// Get enough parameters to be able to rebuild bounding boxes in the
|
||||
// original image (not just within the rectangle).
|
||||
// Left and top are enough with top-down coordinates, but
|
||||
// the height of the rectangle and the image are needed for bottom-up.
|
||||
void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
|
||||
int *imageheight) {
|
||||
*left = rect_left_;
|
||||
*top = rect_top_;
|
||||
*width = rect_width_;
|
||||
*height = rect_height_;
|
||||
*imagewidth = image_width_;
|
||||
*imageheight = image_height_;
|
||||
}
|
||||
|
||||
// Pix vs raw, which to use? Pix is the preferred input for efficiency,
|
||||
// since raw buffers are copied.
|
||||
// SetImage for Pix clones its input, so the source pix may be pixDestroyed
|
||||
// immediately after, but may not go away until after the Thresholder has
|
||||
// finished with it.
|
||||
void ImageThresholder::SetImage(const Image pix) {
|
||||
if (pix_ != nullptr) {
|
||||
pix_.destroy();
|
||||
}
|
||||
Image src = pix;
|
||||
int depth;
|
||||
pixGetDimensions(src, &image_width_, &image_height_, &depth);
|
||||
// Convert the image as necessary so it is one of binary, plain RGB, or
|
||||
// 8 bit with no colormap. Guarantee that we always end up with our own copy,
|
||||
// not just a clone of the input.
|
||||
if (pixGetColormap(src)) {
|
||||
Image tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);
|
||||
depth = pixGetDepth(tmp);
|
||||
if (depth > 1 && depth < 8) {
|
||||
pix_ = pixConvertTo8(tmp, false);
|
||||
tmp.destroy();
|
||||
} else {
|
||||
pix_ = tmp;
|
||||
}
|
||||
} else if (depth > 1 && depth < 8) {
|
||||
pix_ = pixConvertTo8(src, false);
|
||||
} else {
|
||||
pix_ = src.copy();
|
||||
}
|
||||
depth = pixGetDepth(pix_);
|
||||
pix_channels_ = depth / 8;
|
||||
pix_wpl_ = pixGetWpl(pix_);
|
||||
scale_ = 1;
|
||||
estimated_res_ = yres_ = pixGetYRes(pix_);
|
||||
Init();
|
||||
}
|
||||
|
||||
// Threshold the source image as efficiently as possible to the output Pix.
|
||||
// Creates a Pix and sets pix to point to the resulting pointer.
|
||||
// Caller must use pixDestroy to free the created Pix.
|
||||
/// Returns false on error.
|
||||
bool ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Image *pix) {
  // pageseg_mode is not consulted by this implementation.
  const bool too_large = image_width_ > INT16_MAX || image_height_ > INT16_MAX;
  if (too_large) {
    tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
    return false;
  }
  if (pix_channels_ != 0) {
    // Grey or color input: binarize the rectangle with Otsu thresholds.
    OtsuThresholdRectToPix(pix_, pix);
    return true;
  }
  // We have a binary image, but it still has to be copied, as this API
  // allows the caller to modify the output.
  Image rect_pix = GetPixRect();
  *pix = rect_pix.copy();
  rect_pix.destroy();
  return true;
}
|
||||
|
||||
// Gets a pix that contains an 8 bit threshold value at each pixel. The
|
||||
// returned pix may be an integer reduction of the binary image such that
|
||||
// the scale factor may be inferred from the ratio of the sizes, even down
|
||||
// to the extreme of a 1x1 pixel thresholds image.
|
||||
// Ideally the 8 bit threshold should be the exact threshold used to generate
|
||||
// the binary image in ThresholdToPix, but this is not a hard constraint.
|
||||
// Returns nullptr if the input is binary. PixDestroy after use.
|
||||
Image ImageThresholder::GetPixRectThresholds() {
  // Binary input has no grey levels to threshold.
  if (IsBinary()) {
    return nullptr;
  }
  // Compute a single Otsu threshold over the whole grey rectangle.
  Image pix_grey = GetPixRectGrey();
  int width = pixGetWidth(pix_grey);
  int height = pixGetHeight(pix_grey);
  std::vector<int> thresholds;
  std::vector<int> hi_values;
  OtsuThreshold(pix_grey, 0, 0, width, height, thresholds, hi_values);
  pix_grey.destroy();

  // The result is a uniform 8 bit image of the rectangle's size.
  Image pix_thresholds = pixCreate(width, height, 8);
  // Fall back to mid-grey (128) when Otsu gives no positive threshold.
  // The empty() check guards the case where OtsuThreshold left the vector
  // unfilled, which would otherwise make thresholds[0] an out-of-bounds read.
  // NOTE(review): presumably possible when no grey image could be produced
  // (e.g. a rectangle outside the image) -- confirm against OtsuThreshold.
  const bool has_threshold = !thresholds.empty() && thresholds[0] > 0;
  const int threshold = has_threshold ? thresholds[0] : 128;
  pixSetAllArbitrary(pix_thresholds, threshold);
  return pix_thresholds;
}
|
||||
|
||||
// Common initialization shared between SetImage methods.
|
||||
void ImageThresholder::Init() {
  // Default the processing rectangle to the whole image.
  const int full_width = image_width_;
  const int full_height = image_height_;
  SetRectangle(0, 0, full_width, full_height);
}
|
||||
|
||||
// Get a clone/copy of the source image rectangle.
|
||||
// The returned Pix must be pixDestroyed.
|
||||
// This function will be used in the future by the page layout analysis, and
|
||||
// the layout analysis that uses it will only be available with Leptonica,
|
||||
// so there is no raw equivalent.
|
||||
Image ImageThresholder::GetPixRect() {
  if (!IsFullImage()) {
    // Crop to the given rectangle; the temporary Box is released afterwards.
    Box *clip_box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
    Image cropped = pixClipRectangle(pix_, clip_box, nullptr);
    boxDestroy(&clip_box);
    return cropped;
  }
  // The rectangle covers everything: just clone the whole thing.
  return pix_.clone();
}
|
||||
|
||||
// Get a clone/copy of the source image rectangle, reduced to greyscale,
|
||||
// and at the same resolution as the output binary.
|
||||
// The returned Pix must be pixDestroyed.
|
||||
// Provided to the classifier to extract features from the greyscale image.
|
||||
Image ImageThresholder::GetPixRectGrey() {
  Image rect_pix = GetPixRect(); // May have to be reduced to grey.
  const int depth = pixGetDepth(rect_pix);
  if (depth == 8) {
    // Already 8 bit: hand the rectangle back unchanged.
    return rect_pix;
  }
  if (depth == 24) {
    // Widen 24 bit data to 32 bit before the 8 bit conversion.
    Image widened = pixConvert24To32(rect_pix);
    rect_pix.destroy();
    rect_pix = widened;
  }
  Image grey = pixConvertTo8(rect_pix, false);
  rect_pix.destroy();
  return grey;
}
|
||||
|
||||
// Otsu thresholds the rectangle, taking the rectangle from *this.
|
||||
void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const {
  // Per-channel thresholds and hi_values for the current rectangle.
  std::vector<int> thresholds;
  std::vector<int> hi_values;
  const int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_,
                                         rect_height_, thresholds, hi_values);

  // only use opencl if compiled w/ OpenCL and selected device is opencl
#ifdef USE_OPENCL
  OpenclDevice od;
  const bool rect_at_origin = rect_top_ == 0 && rect_left_ == 0;
  if (num_channels == 4 && od.selectedDeviceIsOpenCL() && rect_at_origin) {
    od.ThresholdRectToPixOCL(reinterpret_cast<unsigned char *>(pixGetData(src_pix)), num_channels,
                             pixGetWpl(src_pix) * 4, &thresholds[0], &hi_values[0],
                             out_pix /*pix_OCL*/, rect_height_, rect_width_, rect_top_,
                             rect_left_);
    return;
  }
#endif
  // CPU path.
  ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
}
|
||||
|
||||
/// Threshold the rectangle, taking everything except the src_pix
|
||||
/// from the class, using thresholds/hi_values to the output pix.
|
||||
/// NOTE that num_channels is the size of the thresholds and hi_values
|
||||
// arrays and also the bytes per pixel in src_pix.
|
||||
void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels,
                                          const std::vector<int> &thresholds,
                                          const std::vector<int> &hi_values, Image *pix) const {
  // The output is a 1 bit image of the rectangle's size that inherits the
  // source resolution.
  *pix = pixCreate(rect_width_, rect_height_, 1);
  uint32_t *dest_data = pixGetData(*pix);
  const int dest_wpl = pixGetWpl(*pix);
  const int source_wpl = pixGetWpl(src_pix);
  uint32_t *source_data = pixGetData(src_pix);
  pixSetXRes(*pix, pixGetXRes(src_pix));
  pixSetYRes(*pix, pixGetYRes(src_pix));

  for (int row = 0; row < rect_height_; ++row) {
    const uint32_t *source_line = source_data + (row + rect_top_) * source_wpl;
    uint32_t *dest_line = dest_data + row * dest_wpl;
    for (int col = 0; col < rect_width_; ++col) {
      // Byte offset of this pixel's first channel within the source line.
      const int first_byte = (col + rect_left_) * num_channels;
      // A pixel is white unless some channel says otherwise. Channels with a
      // negative hi_value are ignored; for the others the pixel is non-white
      // when (value > threshold) agrees with (hi_value == 0).
      bool is_white = true;
      for (int ch = 0; ch < num_channels && is_white; ++ch) {
        const int value = GET_DATA_BYTE(source_line, first_byte + ch);
        if (hi_values[ch] >= 0 && (value > thresholds[ch]) == (hi_values[ch] == 0)) {
          is_white = false;
        }
      }
      // White pixels clear the output bit, all others set it.
      if (is_white) {
        CLEAR_DATA_BIT(dest_line, col);
      } else {
        SET_DATA_BIT(dest_line, col);
      }
    }
  }
}
|
||||
|
||||
} // namespace tesseract.
|
190
3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.h
vendored
Normal file
190
3rdparty/tesseract_ocr/tesseract/src/ccmain/thresholder.h
vendored
Normal file
|
@ -0,0 +1,190 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: thresholder.h
|
||||
// Description: Base API for thresholding images in tesseract.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_THRESHOLDER_H_
|
||||
#define TESSERACT_CCMAIN_THRESHOLDER_H_
|
||||
|
||||
#include <tesseract/export.h>
|
||||
#include <tesseract/publictypes.h>
|
||||
|
||||
#include <vector> // for std::vector
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/// Base class for all tesseract image thresholding classes.
|
||||
/// Specific classes can add new thresholding methods by
|
||||
/// overriding ThresholdToPix.
|
||||
/// Each instance deals with a single image, but the design is intended to
|
||||
/// be useful for multiple calls to SetRectangle and ThresholdTo* if
|
||||
/// desired.
|
||||
class TESS_API ImageThresholder {
|
||||
public:
|
||||
ImageThresholder();
|
||||
virtual ~ImageThresholder();
|
||||
|
||||
/// Destroy the Pix if there is one, freeing memory.
|
||||
virtual void Clear();
|
||||
|
||||
/// Return true if no image has been set.
|
||||
bool IsEmpty() const;
|
||||
|
||||
/// SetImage makes a copy of all the image data, so it may be deleted
|
||||
/// immediately after this call.
|
||||
/// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
|
||||
/// Palette color images will not work properly and must be converted to
|
||||
/// 24 bit.
|
||||
/// Binary images of 1 bit per pixel may also be given but they must be
|
||||
/// byte packed with the MSB of the first byte being the first pixel, and a
|
||||
/// one pixel is WHITE. For binary images set bytes_per_pixel=0.
|
||||
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel,
|
||||
int bytes_per_line);
|
||||
|
||||
/// Store the coordinates of the rectangle to process for later use.
|
||||
/// Doesn't actually do any thresholding.
|
||||
void SetRectangle(int left, int top, int width, int height);
|
||||
|
||||
/// Get enough parameters to be able to rebuild bounding boxes in the
|
||||
/// original image (not just within the rectangle).
|
||||
/// Left and top are enough with top-down coordinates, but
|
||||
/// the height of the rectangle and the image are needed for bottom-up.
|
||||
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
|
||||
int *imageheight);
|
||||
|
||||
/// Return true if the source image is color.
|
||||
bool IsColor() const {
|
||||
return pix_channels_ >= 3;
|
||||
}
|
||||
|
||||
/// Returns true if the source image is binary.
|
||||
bool IsBinary() const {
|
||||
return pix_channels_ == 0;
|
||||
}
|
||||
|
||||
int GetScaleFactor() const {
|
||||
return scale_;
|
||||
}
|
||||
|
||||
// Set the resolution of the source image in pixels per inch.
|
||||
// This should be called right after SetImage(), and will let us return
|
||||
// appropriate font sizes for the text.
|
||||
void SetSourceYResolution(int ppi) {
|
||||
yres_ = ppi;
|
||||
estimated_res_ = ppi;
|
||||
}
|
||||
int GetSourceYResolution() const {
|
||||
return yres_;
|
||||
}
|
||||
int GetScaledYResolution() const {
|
||||
return scale_ * yres_;
|
||||
}
|
||||
// Set the resolution of the source image in pixels per inch, as estimated
|
||||
// by the thresholder from the text size found during thresholding.
|
||||
// This value will be used to set internal size thresholds during recognition
|
||||
// and will not influence the output "point size." The default value is
|
||||
// the same as the source resolution. (yres_)
|
||||
void SetEstimatedResolution(int ppi) {
|
||||
estimated_res_ = ppi;
|
||||
}
|
||||
// Returns the estimated resolution, including any active scaling.
|
||||
// This value will be used to set internal size thresholds during recognition.
|
||||
int GetScaledEstimatedResolution() const {
|
||||
return scale_ * estimated_res_;
|
||||
}
|
||||
|
||||
/// Pix vs raw, which to use? Pix is the preferred input for efficiency,
|
||||
/// since raw buffers are copied.
|
||||
/// SetImage for Pix clones its input, so the source pix may be pixDestroyed
|
||||
/// immediately after, but may not go away until after the Thresholder has
|
||||
/// finished with it.
|
||||
void SetImage(const Image pix);
|
||||
|
||||
/// Threshold the source image as efficiently as possible to the output Pix.
|
||||
/// Creates a Pix and sets pix to point to the resulting pointer.
|
||||
/// Caller must use pixDestroy to free the created Pix.
|
||||
/// Returns false on error.
|
||||
virtual bool ThresholdToPix(PageSegMode pageseg_mode, Image *pix);
|
||||
|
||||
// Gets a pix that contains an 8 bit threshold value at each pixel. The
|
||||
// returned pix may be an integer reduction of the binary image such that
|
||||
// the scale factor may be inferred from the ratio of the sizes, even down
|
||||
// to the extreme of a 1x1 pixel thresholds image.
|
||||
// Ideally the 8 bit threshold should be the exact threshold used to generate
|
||||
// the binary image in ThresholdToPix, but this is not a hard constraint.
|
||||
// Returns nullptr if the input is binary. PixDestroy after use.
|
||||
virtual Image GetPixRectThresholds();
|
||||
|
||||
/// Get a clone/copy of the source image rectangle.
|
||||
/// The returned Pix must be pixDestroyed.
|
||||
/// This function will be used in the future by the page layout analysis, and
|
||||
/// the layout analysis that uses it will only be available with Leptonica,
|
||||
/// so there is no raw equivalent.
|
||||
Image GetPixRect();
|
||||
|
||||
// Get a clone/copy of the source image rectangle, reduced to greyscale,
|
||||
// and at the same resolution as the output binary.
|
||||
// The returned Pix must be pixDestroyed.
|
||||
// Provided to the classifier to extract features from the greyscale image.
|
||||
virtual Image GetPixRectGrey();
|
||||
|
||||
protected:
|
||||
// ----------------------------------------------------------------------
|
||||
// Utility functions that may be useful components for other thresholders.
|
||||
|
||||
/// Common initialization shared between SetImage methods.
|
||||
virtual void Init();
|
||||
|
||||
/// Return true if we are processing the full image.
|
||||
bool IsFullImage() const {
|
||||
return rect_left_ == 0 && rect_top_ == 0 && rect_width_ == image_width_ &&
|
||||
rect_height_ == image_height_;
|
||||
}
|
||||
|
||||
// Otsu thresholds the rectangle, taking the rectangle from *this.
|
||||
void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const;
|
||||
|
||||
/// Threshold the rectangle, taking everything except the src_pix
|
||||
/// from the class, using thresholds/hi_values to the output pix.
|
||||
/// NOTE that num_channels is the size of the thresholds and hi_values
|
||||
// arrays and also the bytes per pixel in src_pix.
|
||||
void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
|
||||
const std::vector <int> &hi_values, Image *pix) const;
|
||||
|
||||
protected:
|
||||
/// Clone or other copy of the source Pix.
|
||||
/// The pix will always be PixDestroy()ed on destruction of the class.
|
||||
Image pix_;
|
||||
|
||||
int image_width_; ///< Width of source pix_.
|
||||
int image_height_; ///< Height of source pix_.
|
||||
int pix_channels_; ///< Number of 8-bit channels in pix_.
|
||||
int pix_wpl_; ///< Words per line of pix_.
|
||||
// Limits of image rectangle to be processed.
|
||||
int scale_; ///< Scale factor from original image.
|
||||
int yres_; ///< y pixels/inch in source image.
|
||||
int estimated_res_; ///< Resolution estimate from text size.
|
||||
int rect_left_;
|
||||
int rect_top_;
|
||||
int rect_width_;
|
||||
int rect_height_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCMAIN_THRESHOLDER_H_
|
68
3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.cpp
vendored
Normal file
68
3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.cpp
vendored
Normal file
|
@ -0,0 +1,68 @@
|
|||
/**********************************************************************
|
||||
* File: werdit.cpp (Formerly wordit.c)
|
||||
* Description: An iterator for passing over all the words in a document.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Apr 27 08:51:22 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "werdit.h"
|
||||
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#include "pageres.h" // for PAGE_RES_IT, PAGE_RES (ptr only), WERD_RES
|
||||
#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
|
||||
#include "werd.h" // for WERD
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**********************************************************************
|
||||
* make_pseudo_word
|
||||
*
|
||||
* Make all the blobs inside a selection into a single word.
|
||||
* The returned PAGE_RES_IT* it points to the new word. After use, call
|
||||
* it->DeleteCurrentWord() to delete the fake word, and then
|
||||
* delete it to get rid of the iterator itself.
|
||||
**********************************************************************/
|
||||
|
||||
PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
C_BLOB_LIST new_blobs; // list of gathered blobs
|
||||
C_BLOB_IT new_blob_it = &new_blobs; // iterator
|
||||
|
||||
for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
|
||||
WERD *word = word_res->word;
|
||||
if (word->bounding_box().overlap(selection_box)) {
|
||||
C_BLOB_IT blob_it(word->cblob_list());
|
||||
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
|
||||
C_BLOB *blob = blob_it.data();
|
||||
if (blob->bounding_box().overlap(selection_box)) {
|
||||
new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
|
||||
}
|
||||
}
|
||||
if (!new_blobs.empty()) {
|
||||
WERD *pseudo_word = new WERD(&new_blobs, 1, nullptr);
|
||||
word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
|
||||
auto *it = new PAGE_RES_IT(page_res);
|
||||
while (it->word() != word_res && it->word() != nullptr) {
|
||||
it->forward();
|
||||
}
|
||||
ASSERT_HOST(it->word() == word_res);
|
||||
return it;
|
||||
}
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
34
3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.h
vendored
Normal file
34
3rdparty/tesseract_ocr/tesseract/src/ccmain/werdit.h
vendored
Normal file
|
@ -0,0 +1,34 @@
|
|||
/**********************************************************************
|
||||
* File: werdit.h (Formerly wordit.h)
|
||||
* Description: An iterator for passing over all the words in a document.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Apr 27 08:51:22 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef WERDIT_H
|
||||
#define WERDIT_H
|
||||
|
||||
#include "rect.h" // for TBOX
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class PAGE_RES;
|
||||
class PAGE_RES_IT;
|
||||
|
||||
PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
578
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.cpp
vendored
Normal file
578
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.cpp
vendored
Normal file
|
@ -0,0 +1,578 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: blamer.cpp
|
||||
// Description: Module allowing precise error causes to be allocated.
|
||||
// Author: Rike Antonova
|
||||
// Refactored: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "blamer.h"
|
||||
|
||||
#include "blobs.h" // for TPOINT, TWERD, TBLOB
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#if !defined(DISABLED_LEGACY_ENGINE)
|
||||
# include "lm_pain_points.h" // for LMPainPoints
|
||||
#endif
|
||||
#include "matrix.h" // for MATRIX
|
||||
#include "normalis.h" // for DENORM
|
||||
#include "pageres.h" // for WERD_RES
|
||||
#include "unicharset.h" // for UNICHARSET
|
||||
|
||||
#include <cmath> // for abs
|
||||
#include <cstdlib> // for abs
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Names for each value of IncorrectResultReason enum. Keep in sync.
|
||||
const char kBlameCorrect[] = "corr";
|
||||
const char kBlameClassifier[] = "cl";
|
||||
const char kBlameChopper[] = "chop";
|
||||
const char kBlameClassLMTradeoff[] = "cl/LM";
|
||||
const char kBlamePageLayout[] = "pglt";
|
||||
const char kBlameSegsearchHeur[] = "ss_heur";
|
||||
const char kBlameSegsearchPP[] = "ss_pp";
|
||||
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
|
||||
const char kBlameAdaption[] = "adapt";
|
||||
const char kBlameNoTruthSplit[] = "no_tr_spl";
|
||||
const char kBlameNoTruth[] = "no_tr";
|
||||
const char kBlameUnknown[] = "unkn";
|
||||
|
||||
const char *const kIncorrectResultReasonNames[] = {
|
||||
kBlameCorrect, kBlameClassifier, kBlameChopper, kBlameClassLMTradeoff,
|
||||
kBlamePageLayout, kBlameSegsearchHeur, kBlameSegsearchPP, kBlameClassOldLMTradeoff,
|
||||
kBlameAdaption, kBlameNoTruthSplit, kBlameNoTruth, kBlameUnknown};
|
||||
|
||||
// Looks up the short name for the given reason by using the enum value
// directly as an index into kIncorrectResultReasonNames.
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
  const char *const reason_name = kIncorrectResultReasonNames[irr];
  return reason_name;
}
|
||||
|
||||
const char *BlamerBundle::IncorrectReason() const {
|
||||
return kIncorrectResultReasonNames[incorrect_result_reason_];
|
||||
}
|
||||
|
||||
// Functions to setup the blamer.
|
||||
// Whole word string, whole word bounding box.
|
||||
void BlamerBundle::SetWordTruth(const UNICHARSET &unicharset, const char *truth_str,
|
||||
const TBOX &word_box) {
|
||||
truth_word_.InsertBox(0, word_box);
|
||||
truth_has_char_boxes_ = false;
|
||||
// Encode the string as UNICHAR_IDs.
|
||||
std::vector<UNICHAR_ID> encoding;
|
||||
std::vector<char> lengths;
|
||||
unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
|
||||
int total_length = 0;
|
||||
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
|
||||
std::string uch(truth_str + total_length);
|
||||
uch.resize(lengths[i] - total_length);
|
||||
UNICHAR_ID id = encoding[i];
|
||||
if (id != INVALID_UNICHAR_ID) {
|
||||
uch = unicharset.get_normed_unichar(id);
|
||||
}
|
||||
truth_text_.push_back(uch);
|
||||
}
|
||||
}
|
||||
|
||||
// Single "character" string, "character" bounding box.
|
||||
// May be called multiple times to indicate the characters in a word.
|
||||
void BlamerBundle::SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str,
|
||||
const TBOX &char_box) {
|
||||
std::string symbol_str(char_str);
|
||||
UNICHAR_ID id = unicharset.unichar_to_id(char_str);
|
||||
if (id != INVALID_UNICHAR_ID) {
|
||||
std::string normed_uch(unicharset.get_normed_unichar(id));
|
||||
if (normed_uch.length() > 0) {
|
||||
symbol_str = normed_uch;
|
||||
}
|
||||
}
|
||||
int length = truth_word_.length();
|
||||
truth_text_.push_back(symbol_str);
|
||||
truth_word_.InsertBox(length, char_box);
|
||||
if (length == 0) {
|
||||
truth_has_char_boxes_ = true;
|
||||
} else if (truth_word_.BlobBox(length - 1) == char_box) {
|
||||
truth_has_char_boxes_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Marks that there is something wrong with the truth text, like it contains
|
||||
// reject characters.
|
||||
void BlamerBundle::SetRejectedTruth() {
|
||||
incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
truth_has_char_boxes_ = false;
|
||||
}
|
||||
|
||||
// Returns true if the provided word_choice is correct.
|
||||
bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE *word_choice) const {
|
||||
if (word_choice == nullptr) {
|
||||
return false;
|
||||
}
|
||||
const UNICHARSET *uni_set = word_choice->unicharset();
|
||||
std::string normed_choice_str;
|
||||
for (int i = 0; i < word_choice->length(); ++i) {
|
||||
normed_choice_str += uni_set->get_normed_unichar(word_choice->unichar_id(i));
|
||||
}
|
||||
std::string truth_str = TruthString();
|
||||
return truth_str == normed_choice_str;
|
||||
}
|
||||
|
||||
void BlamerBundle::FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug) {
|
||||
debug += "Truth ";
|
||||
for (auto &text : this->truth_text_) {
|
||||
debug += text;
|
||||
}
|
||||
if (!this->truth_has_char_boxes_) {
|
||||
debug += " (no char boxes)";
|
||||
}
|
||||
if (choice != nullptr) {
|
||||
debug += " Choice ";
|
||||
std::string choice_str;
|
||||
choice->string_and_lengths(&choice_str, nullptr);
|
||||
debug += choice_str;
|
||||
}
|
||||
if (msg.length() > 0) {
|
||||
debug += "\n";
|
||||
debug += msg;
|
||||
}
|
||||
debug += "\n";
|
||||
}
|
||||
|
||||
// Sets up the norm_truth_word from truth_word using the given DENORM.
|
||||
void BlamerBundle::SetupNormTruthWord(const DENORM &denorm) {
|
||||
// TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
|
||||
norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
|
||||
TPOINT topleft;
|
||||
TPOINT botright;
|
||||
TPOINT norm_topleft;
|
||||
TPOINT norm_botright;
|
||||
for (int b = 0; b < truth_word_.length(); ++b) {
|
||||
const TBOX &box = truth_word_.BlobBox(b);
|
||||
topleft.x = box.left();
|
||||
topleft.y = box.top();
|
||||
botright.x = box.right();
|
||||
botright.y = box.bottom();
|
||||
denorm.NormTransform(nullptr, topleft, &norm_topleft);
|
||||
denorm.NormTransform(nullptr, botright, &norm_botright);
|
||||
TBOX norm_box(norm_topleft.x, norm_botright.y, norm_botright.x, norm_topleft.y);
|
||||
norm_truth_word_.InsertBox(b, norm_box);
|
||||
}
|
||||
}
|
||||
|
||||
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
|
||||
// bundles) where the right edge/ of the left-hand word is word1_right,
|
||||
// and the left edge of the right-hand word is word2_left.
|
||||
void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
|
||||
BlamerBundle *bundle2) const {
|
||||
std::string debug_str;
|
||||
// Find truth boxes that correspond to the split in the blobs.
|
||||
int b;
|
||||
int begin2_truth_index = -1;
|
||||
if (incorrect_result_reason_ != IRR_NO_TRUTH && truth_has_char_boxes_) {
|
||||
debug_str = "Looking for truth split at";
|
||||
debug_str += " end1_x " + std::to_string(word1_right);
|
||||
debug_str += " begin2_x " + std::to_string(word2_left);
|
||||
debug_str += "\nnorm_truth_word boxes:\n";
|
||||
if (norm_truth_word_.length() > 1) {
|
||||
norm_truth_word_.BlobBox(0).print_to_str(debug_str);
|
||||
for (b = 1; b < norm_truth_word_.length(); ++b) {
|
||||
norm_truth_word_.BlobBox(b).print_to_str(debug_str);
|
||||
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < norm_box_tolerance_) &&
|
||||
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) < norm_box_tolerance_)) {
|
||||
begin2_truth_index = b;
|
||||
debug_str += "Split found";
|
||||
break;
|
||||
}
|
||||
}
|
||||
debug_str += '\n';
|
||||
}
|
||||
}
|
||||
// Populate truth information in word and word2 with the first and second
|
||||
// part of the original truth.
|
||||
if (begin2_truth_index > 0) {
|
||||
bundle1->truth_has_char_boxes_ = true;
|
||||
bundle1->norm_box_tolerance_ = norm_box_tolerance_;
|
||||
bundle2->truth_has_char_boxes_ = true;
|
||||
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
|
||||
BlamerBundle *curr_bb = bundle1;
|
||||
for (b = 0; b < norm_truth_word_.length(); ++b) {
|
||||
if (b == begin2_truth_index) {
|
||||
curr_bb = bundle2;
|
||||
}
|
||||
curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));
|
||||
curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));
|
||||
curr_bb->truth_text_.push_back(truth_text_[b]);
|
||||
}
|
||||
} else if (incorrect_result_reason_ == IRR_NO_TRUTH) {
|
||||
bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
} else {
|
||||
debug_str += "Truth split not found";
|
||||
debug_str += truth_has_char_boxes_ ? "\n" : " (no truth char boxes)\n";
|
||||
bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);
|
||||
bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, nullptr, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// "Joins" the blames from bundle1 and bundle2 into *this.
|
||||
void BlamerBundle::JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2,
|
||||
bool debug) {
|
||||
std::string debug_str;
|
||||
IncorrectResultReason irr = incorrect_result_reason_;
|
||||
if (irr != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str = "";
|
||||
}
|
||||
if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&
|
||||
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str += "Blame from part 1: ";
|
||||
debug_str += bundle1.debug_;
|
||||
irr = bundle1.incorrect_result_reason_;
|
||||
}
|
||||
if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&
|
||||
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str += "Blame from part 2: ";
|
||||
debug_str += bundle2.debug_;
|
||||
if (irr == IRR_CORRECT) {
|
||||
irr = bundle2.incorrect_result_reason_;
|
||||
} else if (irr != bundle2.incorrect_result_reason_) {
|
||||
irr = IRR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
incorrect_result_reason_ = irr;
|
||||
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
|
||||
SetBlame(irr, debug_str, nullptr, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// If a blob with the same bounding box as one of the truth character
|
||||
// bounding boxes is not classified as the corresponding truth character
|
||||
// blames character classifier for incorrect answer.
|
||||
void BlamerBundle::BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
|
||||
const BLOB_CHOICE_LIST &choices, bool debug) {
|
||||
if (!truth_has_char_boxes_ || incorrect_result_reason_ != IRR_CORRECT) {
|
||||
return; // Nothing to do here.
|
||||
}
|
||||
|
||||
for (int b = 0; b < norm_truth_word_.length(); ++b) {
|
||||
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
|
||||
// Note that we are more strict on the bounding box boundaries here
|
||||
// than in other places (chopper, segmentation search), since we do
|
||||
// not have the ability to check the previous and next bounding box.
|
||||
if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_ / 2)) {
|
||||
bool found = false;
|
||||
bool incorrect_adapted = false;
|
||||
UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
|
||||
const char *truth_str = truth_text_[b].c_str();
|
||||
// We promise not to modify the list or its contents, using a
|
||||
// const BLOB_CHOICE* below.
|
||||
BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST *>(&choices));
|
||||
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) {
|
||||
const BLOB_CHOICE *choice = choices_it.data();
|
||||
if (strcmp(truth_str, unicharset.get_normed_unichar(choice->unichar_id())) == 0) {
|
||||
found = true;
|
||||
break;
|
||||
} else if (choice->IsAdapted()) {
|
||||
incorrect_adapted = true;
|
||||
incorrect_adapted_id = choice->unichar_id();
|
||||
}
|
||||
} // end choices_it for loop
|
||||
if (!found) {
|
||||
std::string debug_str = "unichar ";
|
||||
debug_str += truth_str;
|
||||
debug_str += " not found in classification list";
|
||||
SetBlame(IRR_CLASSIFIER, debug_str, nullptr, debug);
|
||||
} else if (incorrect_adapted) {
|
||||
std::string debug_str = "better rating for adapted ";
|
||||
debug_str += unicharset.id_to_unichar(incorrect_adapted_id);
|
||||
debug_str += " than for correct ";
|
||||
debug_str += truth_str;
|
||||
SetBlame(IRR_ADAPTION, debug_str, nullptr, debug);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} // end iterating over blamer_bundle->norm_truth_word
|
||||
}
|
||||
|
||||
// Checks whether chops were made at all the character bounding box
|
||||
// boundaries in word->truth_word. If not - blames the chopper for an
|
||||
// incorrect answer.
|
||||
void BlamerBundle::SetChopperBlame(const WERD_RES *word, bool debug) {
|
||||
if (NoTruth() || !truth_has_char_boxes_ || word->chopped_word->blobs.empty()) {
|
||||
return;
|
||||
}
|
||||
std::string debug_str;
|
||||
bool missing_chop = false;
|
||||
int num_blobs = word->chopped_word->blobs.size();
|
||||
int box_index = 0;
|
||||
int blob_index = 0;
|
||||
int16_t truth_x = -1;
|
||||
while (box_index < truth_word_.length() && blob_index < num_blobs) {
|
||||
truth_x = norm_truth_word_.BlobBox(box_index).right();
|
||||
TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
|
||||
if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {
|
||||
++blob_index;
|
||||
continue; // encountered an extra chop, keep looking
|
||||
} else if (curr_blob->bounding_box().right() > truth_x + norm_box_tolerance_) {
|
||||
missing_chop = true;
|
||||
break;
|
||||
} else {
|
||||
++blob_index;
|
||||
}
|
||||
}
|
||||
if (missing_chop || box_index < norm_truth_word_.length()) {
|
||||
std::string debug_str;
|
||||
if (missing_chop) {
|
||||
debug_str += "Detected missing chop (tolerance=" + std::to_string(norm_box_tolerance_);
|
||||
debug_str += ") at Bounding Box=";
|
||||
TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
|
||||
curr_blob->bounding_box().print_to_str(debug_str);
|
||||
debug_str += "\nNo chop for truth at x=" + std::to_string(truth_x);
|
||||
} else {
|
||||
debug_str += "Missing chops for last " + std::to_string(norm_truth_word_.length() - box_index);
|
||||
debug_str += " truth box(es)";
|
||||
}
|
||||
debug_str += "\nMaximally chopped word boxes:\n";
|
||||
for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
|
||||
TBLOB *curr_blob = word->chopped_word->blobs[blob_index];
|
||||
curr_blob->bounding_box().print_to_str(debug_str);
|
||||
debug_str += '\n';
|
||||
}
|
||||
debug_str += "Truth bounding boxes:\n";
|
||||
for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) {
|
||||
norm_truth_word_.BlobBox(box_index).print_to_str(debug_str);
|
||||
debug_str += '\n';
|
||||
}
|
||||
SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// Blames the classifier or the language model if, after running only the
|
||||
// chopper, best_choice is incorrect and no blame has been yet set.
|
||||
// Blames the classifier if best_choice is classifier's top choice and is a
|
||||
// dictionary word (i.e. language model could not have helped).
|
||||
// Otherwise, blames the language model (formerly permuter word adjustment).
|
||||
void BlamerBundle::BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
|
||||
bool valid_permuter, bool debug) {
|
||||
if (valid_permuter) {
|
||||
// Find out whether best choice is a top choice.
|
||||
best_choice_is_dict_and_top_choice_ = true;
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
|
||||
ASSERT_HOST(!blob_choice_it.empty());
|
||||
BLOB_CHOICE *first_choice = nullptr;
|
||||
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
|
||||
blob_choice_it.forward()) { // find first non-fragment choice
|
||||
if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {
|
||||
first_choice = blob_choice_it.data();
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_HOST(first_choice != nullptr);
|
||||
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
|
||||
best_choice_is_dict_and_top_choice_ = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::string debug_str;
|
||||
if (best_choice_is_dict_and_top_choice_) {
|
||||
debug_str = "Best choice is: incorrect, top choice, dictionary word";
|
||||
debug_str += " with permuter ";
|
||||
debug_str += word->best_choice->permuter_name();
|
||||
} else {
|
||||
debug_str = "Classifier/Old LM tradeoff is to blame";
|
||||
}
|
||||
SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF,
|
||||
debug_str, word->best_choice, debug);
|
||||
}
|
||||
|
||||
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
|
||||
void BlamerBundle::SetupCorrectSegmentation(const TWERD *word, bool debug) {
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
params_training_bundle_.StartHypothesisList();
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) {
|
||||
return; // Nothing to do here.
|
||||
}
|
||||
|
||||
std::string debug_str = "Blamer computing correct_segmentation_cols\n";
|
||||
int curr_box_col = 0;
|
||||
int next_box_col = 0;
|
||||
int num_blobs = word->NumBlobs();
|
||||
if (num_blobs == 0) {
|
||||
return; // No blobs to play with.
|
||||
}
|
||||
int blob_index = 0;
|
||||
int16_t next_box_x = word->blobs[blob_index]->bounding_box().right();
|
||||
for (int truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length();
|
||||
++blob_index) {
|
||||
++next_box_col;
|
||||
int16_t curr_box_x = next_box_x;
|
||||
if (blob_index + 1 < num_blobs) {
|
||||
next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
|
||||
}
|
||||
int16_t truth_x = norm_truth_word_.BlobBox(truth_idx).right();
|
||||
debug_str += "Box x coord vs. truth: " + std::to_string(curr_box_x);
|
||||
debug_str += " " + std::to_string(truth_x);
|
||||
debug_str += "\n";
|
||||
if (curr_box_x > (truth_x + norm_box_tolerance_)) {
|
||||
break; // failed to find a matching box
|
||||
} else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
|
||||
(blob_index + 1 >= num_blobs || // next box can't be included
|
||||
next_box_x > truth_x + norm_box_tolerance_)) {
|
||||
correct_segmentation_cols_.push_back(curr_box_col);
|
||||
correct_segmentation_rows_.push_back(next_box_col - 1);
|
||||
++truth_idx;
|
||||
debug_str += "col=" + std::to_string(curr_box_col);
|
||||
debug_str += " row=" + std::to_string(next_box_col - 1);
|
||||
debug_str += "\n";
|
||||
curr_box_col = next_box_col;
|
||||
}
|
||||
}
|
||||
if (blob_index < num_blobs || // trailing blobs
|
||||
correct_segmentation_cols_.size() != norm_truth_word_.length()) {
|
||||
debug_str +=
|
||||
"Blamer failed to find correct segmentation"
|
||||
" (tolerance=" +
|
||||
std::to_string(norm_box_tolerance_);
|
||||
if (blob_index >= num_blobs) {
|
||||
debug_str += " blob == nullptr";
|
||||
}
|
||||
debug_str += ")\n";
|
||||
debug_str += " path length " + std::to_string(correct_segmentation_cols_.size());
|
||||
debug_str += " vs. truth " + std::to_string(norm_truth_word_.length());
|
||||
debug_str += "\n";
|
||||
SetBlame(IRR_UNKNOWN, debug_str, nullptr, debug);
|
||||
correct_segmentation_cols_.clear();
|
||||
correct_segmentation_rows_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if a guided segmentation search is needed.
|
||||
bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
|
||||
return incorrect_result_reason_ == IRR_CORRECT && !segsearch_is_looking_for_blame_ &&
|
||||
truth_has_char_boxes_ && !ChoiceIsCorrect(best_choice);
|
||||
}
|
||||
|
||||
#if !defined(DISABLED_LEGACY_ENGINE)
|
||||
// Setup ready to guide the segmentation search to the correct segmentation.
|
||||
void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings,
|
||||
UNICHAR_ID wildcard_id, bool debug, std::string &debug_str,
|
||||
tesseract::LMPainPoints *pain_points, double max_char_wh_ratio,
|
||||
WERD_RES *word_res) {
|
||||
segsearch_is_looking_for_blame_ = true;
|
||||
if (debug) {
|
||||
tprintf("segsearch starting to look for blame\n");
|
||||
}
|
||||
// Fill pain points for any unclassifed blob corresponding to the
|
||||
// correct segmentation state.
|
||||
debug_str += "Correct segmentation:\n";
|
||||
for (int idx = 0; idx < correct_segmentation_cols_.size(); ++idx) {
|
||||
debug_str += "col=" + std::to_string(correct_segmentation_cols_[idx]);
|
||||
debug_str += " row=" + std::to_string(correct_segmentation_rows_[idx]);
|
||||
debug_str += "\n";
|
||||
if (!ratings->Classified(correct_segmentation_cols_[idx], correct_segmentation_rows_[idx],
|
||||
wildcard_id) &&
|
||||
!pain_points->GeneratePainPoint(
|
||||
correct_segmentation_cols_[idx], correct_segmentation_rows_[idx],
|
||||
tesseract::LM_PPTYPE_BLAMER, 0.0, false, max_char_wh_ratio, word_res)) {
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
debug_str += "\nFailed to insert pain point\n";
|
||||
SetBlame(IRR_SEGSEARCH_HEUR, debug_str, best_choice, debug);
|
||||
break;
|
||||
}
|
||||
} // end for blamer_bundle->correct_segmentation_cols/rows
|
||||
}
|
||||
#endif // !defined(DISABLED_LEGACY_ENGINE)
|
||||
|
||||
// Returns true if the guided segsearch is in progress.
|
||||
bool BlamerBundle::GuidedSegsearchStillGoing() const {
|
||||
return segsearch_is_looking_for_blame_;
|
||||
}
|
||||
|
||||
// The segmentation search has ended. Sets the blame appropriately.
|
||||
void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str) {
|
||||
// If we are still looking for blame (i.e. best_choice is incorrect, but a
|
||||
// path representing the correct segmentation could be constructed), we can
|
||||
// blame segmentation search pain point prioritization if the rating of the
|
||||
// path corresponding to the correct segmentation is better than that of
|
||||
// best_choice (i.e. language model would have done the correct thing, but
|
||||
// because of poor pain point prioritization the correct segmentation was
|
||||
// never explored). Otherwise we blame the tradeoff between the language model
|
||||
// and the classifier, since even after exploring the path corresponding to
|
||||
// the correct segmentation incorrect best_choice would have been chosen.
|
||||
// One special case when we blame the classifier instead is when best choice
|
||||
// is incorrect, but it is a dictionary word and it classifier's top choice.
|
||||
if (segsearch_is_looking_for_blame_) {
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
if (best_choice_is_dict_and_top_choice_) {
|
||||
debug_str = "Best choice is: incorrect, top choice, dictionary word";
|
||||
debug_str += " with permuter ";
|
||||
debug_str += best_choice->permuter_name();
|
||||
SetBlame(IRR_CLASSIFIER, debug_str, best_choice, debug);
|
||||
} else if (best_correctly_segmented_rating_ < best_choice->rating()) {
|
||||
debug_str += "Correct segmentation state was not explored";
|
||||
SetBlame(IRR_SEGSEARCH_PP, debug_str, best_choice, debug);
|
||||
} else {
|
||||
if (best_correctly_segmented_rating_ >= WERD_CHOICE::kBadRating) {
|
||||
debug_str += "Correct segmentation paths were pruned by LM\n";
|
||||
} else {
|
||||
debug_str += "Best correct segmentation rating " +
|
||||
std::to_string(best_correctly_segmented_rating_);
|
||||
debug_str += " vs. best choice rating " + std::to_string(best_choice->rating());
|
||||
}
|
||||
SetBlame(IRR_CLASS_LM_TRADEOFF, debug_str, best_choice, debug);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the bundle is null or still does not indicate the correct result,
|
||||
// fix it and use some backup reason for the blame.
|
||||
void BlamerBundle::LastChanceBlame(bool debug, WERD_RES *word) {
|
||||
if (word->blamer_bundle == nullptr) {
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame", word->best_choice, debug);
|
||||
} else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", word->best_choice, debug);
|
||||
} else {
|
||||
bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);
|
||||
IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;
|
||||
if (irr == IRR_CORRECT && !correct) {
|
||||
std::string debug_str = "Choice is incorrect after recognition";
|
||||
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice, debug);
|
||||
} else if (irr != IRR_CORRECT && correct) {
|
||||
if (debug) {
|
||||
tprintf("Corrected %s\n", word->blamer_bundle->debug_.c_str());
|
||||
}
|
||||
word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;
|
||||
word->blamer_bundle->debug_ = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sets the misadaption debug if this word is incorrect, as this word is
|
||||
// being adapted to.
|
||||
void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug) {
|
||||
if (incorrect_result_reason_ != IRR_NO_TRUTH && !ChoiceIsCorrect(best_choice)) {
|
||||
misadaption_debug_ = "misadapt to word (";
|
||||
misadaption_debug_ += best_choice->permuter_name();
|
||||
misadaption_debug_ += "): ";
|
||||
FillDebugString("", best_choice, misadaption_debug_);
|
||||
if (debug) {
|
||||
tprintf("%s\n", misadaption_debug_.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
350
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.h
vendored
Normal file
350
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blamer.h
vendored
Normal file
|
@ -0,0 +1,350 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: blamer.h
|
||||
// Description: Module allowing precise error causes to be allocated.
|
||||
// Author: Rike Antonova
|
||||
// Refactored: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
|
||||
#define TESSERACT_CCSTRUCT_BLAMER_H_
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h" // DISABLED_LEGACY_ENGINE
|
||||
#endif
|
||||
#include "boxword.h" // for BoxWord
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
# include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
#include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only)
|
||||
#include "rect.h" // for TBOX
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||
|
||||
#include <cstdint> // for int16_t
|
||||
#include <cstring> // for memcpy
|
||||
#include <vector> // for std::vector
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class DENORM;
|
||||
class MATRIX;
|
||||
class UNICHARSET;
|
||||
class WERD_RES;
|
||||
|
||||
struct MATRIX_COORD;
|
||||
struct TWERD;
|
||||
|
||||
class LMPainPoints;
|
||||
|
||||
static const int16_t kBlamerBoxTolerance = 5;
|
||||
|
||||
// Enum for expressing the source of error.
|
||||
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
|
||||
enum IncorrectResultReason {
|
||||
// The text recorded in best choice == truth text
|
||||
IRR_CORRECT,
|
||||
// Either: Top choice is incorrect and is a dictionary word (language model
|
||||
// is unlikely to help correct such errors, so blame the classifier).
|
||||
// Or: the correct unichar was not included in shortlist produced by the
|
||||
// classifier at all.
|
||||
IRR_CLASSIFIER,
|
||||
// Chopper has not found one or more splits that correspond to the correct
|
||||
// character bounding boxes recorded in BlamerBundle::truth_word.
|
||||
IRR_CHOPPER,
|
||||
// Classifier did include correct unichars for each blob in the correct
|
||||
// segmentation, however its rating could have been too bad to allow the
|
||||
// language model to pull out the correct choice. On the other hand the
|
||||
// strength of the language model might have been too weak to favor the
|
||||
// correct answer; we call this case a classifier-language model
|
||||
// tradeoff error.
|
||||
IRR_CLASS_LM_TRADEOFF,
|
||||
// Page layout failed to produce the correct bounding box. Blame page layout
|
||||
// if the truth was not found for the word, which implies that the bounding
|
||||
// box of the word was incorrect (no truth word had a similar bounding box).
|
||||
IRR_PAGE_LAYOUT,
|
||||
// SegSearch heuristic prevented one or more blobs from the correct
|
||||
// segmentation state to be classified (e.g. the blob was too wide).
|
||||
IRR_SEGSEARCH_HEUR,
|
||||
// The correct segmentation state was not explored because of poor SegSearch
|
||||
// pain point prioritization. We blame SegSearch pain point prioritization
|
||||
// if the best rating of a choice constructed from correct segmentation is
|
||||
// better than that of the best choice (i.e. if we got to explore the correct
|
||||
// segmentation state, language model would have picked the correct choice).
|
||||
IRR_SEGSEARCH_PP,
|
||||
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
|
||||
// and thus use the old language model (permuters).
|
||||
// TODO(antonova): integrate the new language model with chopper
|
||||
IRR_CLASS_OLD_LM_TRADEOFF,
|
||||
// If there is an incorrect adaptive template match with a better score than
|
||||
// a correct one (either pre-trained or adapted), mark this as adaption error.
|
||||
IRR_ADAPTION,
|
||||
// split_and_recog_word() failed to find a suitable split in truth.
|
||||
IRR_NO_TRUTH_SPLIT,
|
||||
// Truth is not available for this word (e.g. when words in corrected content
|
||||
// file are turned into ~~~~ because an appropriate alignment was not found).
|
||||
IRR_NO_TRUTH,
|
||||
// The text recorded in best choice != truth text, but none of the above
|
||||
// reasons are set.
|
||||
IRR_UNKNOWN,
|
||||
|
||||
IRR_NUM_REASONS
|
||||
};
|
||||
|
||||
// Blamer-related information to determine the source of errors.
|
||||
struct BlamerBundle {
|
||||
static const char *IncorrectReasonName(IncorrectResultReason irr);
|
||||
BlamerBundle()
|
||||
: truth_has_char_boxes_(false)
|
||||
, incorrect_result_reason_(IRR_CORRECT)
|
||||
, lattice_data_(nullptr) {
|
||||
ClearResults();
|
||||
}
|
||||
BlamerBundle(const BlamerBundle &other) {
|
||||
this->CopyTruth(other);
|
||||
this->CopyResults(other);
|
||||
}
|
||||
~BlamerBundle() {
|
||||
delete[] lattice_data_;
|
||||
}
|
||||
|
||||
// Accessors.
|
||||
std::string TruthString() const {
|
||||
std::string truth_str;
|
||||
for (auto &text : truth_text_) {
|
||||
truth_str += text;
|
||||
}
|
||||
return truth_str;
|
||||
}
|
||||
IncorrectResultReason incorrect_result_reason() const {
|
||||
return incorrect_result_reason_;
|
||||
}
|
||||
bool NoTruth() const {
|
||||
return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;
|
||||
}
|
||||
bool HasDebugInfo() const {
|
||||
return debug_.length() > 0 || misadaption_debug_.length() > 0;
|
||||
}
|
||||
const std::string &debug() const {
|
||||
return debug_;
|
||||
}
|
||||
const std::string &misadaption_debug() const {
|
||||
return misadaption_debug_;
|
||||
}
|
||||
void UpdateBestRating(float rating) {
|
||||
if (rating < best_correctly_segmented_rating_) {
|
||||
best_correctly_segmented_rating_ = rating;
|
||||
}
|
||||
}
|
||||
int correct_segmentation_length() const {
|
||||
return correct_segmentation_cols_.size();
|
||||
}
|
||||
// Returns true if the given ratings matrix col,row position is included
|
||||
// in the correct segmentation path at the given index.
|
||||
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) {
|
||||
return correct_segmentation_cols_[index] == coord.col &&
|
||||
correct_segmentation_rows_[index] == coord.row;
|
||||
}
|
||||
void set_best_choice_is_dict_and_top_choice(bool value) {
|
||||
best_choice_is_dict_and_top_choice_ = value;
|
||||
}
|
||||
const char *lattice_data() const {
|
||||
return lattice_data_;
|
||||
}
|
||||
int lattice_size() const {
|
||||
return lattice_size_; // size of lattice_data in bytes
|
||||
}
|
||||
// Replaces the stored lattice with a private copy of `size` bytes of `data`.
void set_lattice_data(const char *data, int size) {
  // Build the replacement before releasing the current buffer: `data` may
  // point into lattice_data_, and if the allocation throws, the bundle keeps
  // its previous (still consistent) buffer and size.
  char *replacement = new char[size];
  if (size > 0) {
    // Skip the copy for an empty lattice so memcpy is never called with a
    // possibly null source.
    memcpy(replacement, data, size);
  }
  delete[] lattice_data_;
  lattice_data_ = replacement;
  lattice_size_ = size;
}
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
// Read-only access to the hypotheses recorded in params_training_bundle_.
// The accessor name had been corrupted ("&para" turned into a pilcrow);
// it is restored to params_training_bundle.
const tesseract::ParamsTrainingBundle &params_training_bundle() const {
  return params_training_bundle_;
}
|
||||
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
|
||||
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) {
|
||||
params_training_bundle_.AddHypothesis(hypo);
|
||||
}
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
// Functions to setup the blamer.
|
||||
// Whole word string, whole word bounding box.
|
||||
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);
|
||||
// Single "character" string, "character" bounding box.
|
||||
// May be called multiple times to indicate the characters in a word.
|
||||
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box);
|
||||
// Marks that there is something wrong with the truth text, like it contains
|
||||
// reject characters.
|
||||
void SetRejectedTruth();
|
||||
|
||||
// Returns true if the provided word_choice is correct.
|
||||
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;
|
||||
|
||||
void ClearResults() {
|
||||
norm_truth_word_.DeleteAllBoxes();
|
||||
norm_box_tolerance_ = 0;
|
||||
if (!NoTruth()) {
|
||||
incorrect_result_reason_ = IRR_CORRECT;
|
||||
}
|
||||
debug_ = "";
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
|
||||
correct_segmentation_cols_.clear();
|
||||
correct_segmentation_rows_.clear();
|
||||
best_choice_is_dict_and_top_choice_ = false;
|
||||
delete[] lattice_data_;
|
||||
lattice_data_ = nullptr;
|
||||
lattice_size_ = 0;
|
||||
}
|
||||
void CopyTruth(const BlamerBundle &other) {
|
||||
truth_has_char_boxes_ = other.truth_has_char_boxes_;
|
||||
truth_word_ = other.truth_word_;
|
||||
truth_text_ = other.truth_text_;
|
||||
incorrect_result_reason_ = (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
|
||||
}
|
||||
void CopyResults(const BlamerBundle &other) {
|
||||
norm_truth_word_ = other.norm_truth_word_;
|
||||
norm_box_tolerance_ = other.norm_box_tolerance_;
|
||||
incorrect_result_reason_ = other.incorrect_result_reason_;
|
||||
segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
|
||||
best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
|
||||
correct_segmentation_cols_ = other.correct_segmentation_cols_;
|
||||
correct_segmentation_rows_ = other.correct_segmentation_rows_;
|
||||
best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;
|
||||
if (other.lattice_data_ != nullptr) {
|
||||
lattice_data_ = new char[other.lattice_size_];
|
||||
memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
|
||||
lattice_size_ = other.lattice_size_;
|
||||
} else {
|
||||
lattice_data_ = nullptr;
|
||||
}
|
||||
}
|
||||
const char *IncorrectReason() const;
|
||||
|
||||
// Appends choice and truth details to the given debug string.
|
||||
void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);
|
||||
|
||||
// Sets up the norm_truth_word from truth_word using the given DENORM.
|
||||
void SetupNormTruthWord(const DENORM &denorm);
|
||||
|
||||
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
|
||||
// bundles) where the right edge of the left-hand word is word1_right,
|
||||
// and the left edge of the right-hand word is word2_left.
|
||||
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
|
||||
BlamerBundle *bundle2) const;
|
||||
// "Joins" the blames from bundle1 and bundle2 into *this.
|
||||
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);
|
||||
|
||||
// If a blob with the same bounding box as one of the truth character
|
||||
// bounding boxes is not classified as the corresponding truth character
|
||||
// blames character classifier for incorrect answer.
|
||||
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
|
||||
const BLOB_CHOICE_LIST &choices, bool debug);
|
||||
|
||||
// Checks whether chops were made at all the character bounding box
|
||||
// boundaries in word->truth_word. If not - blames the chopper for an
|
||||
// incorrect answer.
|
||||
void SetChopperBlame(const WERD_RES *word, bool debug);
|
||||
// Blames the classifier or the language model if, after running only the
|
||||
// chopper, best_choice is incorrect and no blame has been yet set.
|
||||
// Blames the classifier if best_choice is classifier's top choice and is a
|
||||
// dictionary word (i.e. language model could not have helped).
|
||||
// Otherwise, blames the language model (formerly permuter word adjustment).
|
||||
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
|
||||
bool valid_permuter, bool debug);
|
||||
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
|
||||
void SetupCorrectSegmentation(const TWERD *word, bool debug);
|
||||
|
||||
// Returns true if a guided segmentation search is needed.
|
||||
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
|
||||
// Setup ready to guide the segmentation search to the correct segmentation.
|
||||
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,
|
||||
bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,
|
||||
double max_char_wh_ratio, WERD_RES *word_res);
|
||||
// Returns true if the guided segsearch is in progress.
|
||||
bool GuidedSegsearchStillGoing() const;
|
||||
// The segmentation search has ended. Sets the blame appropriately.
|
||||
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);
|
||||
|
||||
// If the bundle is null or still does not indicate the correct result,
|
||||
// fix it and use some backup reason for the blame.
|
||||
static void LastChanceBlame(bool debug, WERD_RES *word);
|
||||
|
||||
// Sets the misadaption debug if this word is incorrect, as this word is
|
||||
// being adapted to.
|
||||
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
|
||||
|
||||
private:
|
||||
// Copy assignment operator (currently unused, therefore private).
|
||||
BlamerBundle &operator=(const BlamerBundle &other) = delete;
|
||||
void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,
|
||||
bool debug) {
|
||||
incorrect_result_reason_ = irr;
|
||||
debug_ = IncorrectReason();
|
||||
debug_ += " to blame: ";
|
||||
FillDebugString(msg, choice, debug_);
|
||||
if (debug) {
|
||||
tprintf("SetBlame(): %s", debug_.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Set to true when bounding boxes for individual unichars are recorded.
|
||||
bool truth_has_char_boxes_;
|
||||
// Variables used by the segmentation search when looking for the blame.
|
||||
// Set to true while segmentation search is continued after the usual
|
||||
// termination condition in order to look for the blame.
|
||||
bool segsearch_is_looking_for_blame_;
|
||||
// Set to true if best choice is a dictionary word and
|
||||
// classifier's top choice.
|
||||
bool best_choice_is_dict_and_top_choice_;
|
||||
// Tolerance for bounding box comparisons in normalized space.
|
||||
int norm_box_tolerance_;
|
||||
// The true_word (in the original image coordinate space) contains ground
|
||||
// truth bounding boxes for this WERD_RES.
|
||||
tesseract::BoxWord truth_word_;
|
||||
// Same as above, but in normalized coordinates
|
||||
// (filled in by WERD_RES::SetupForRecognition()).
|
||||
tesseract::BoxWord norm_truth_word_;
|
||||
// Contains ground truth unichar for each of the bounding boxes in truth_word.
|
||||
std::vector<std::string> truth_text_;
|
||||
// The reason for incorrect OCR result.
|
||||
IncorrectResultReason incorrect_result_reason_;
|
||||
// Debug text associated with the blame.
|
||||
std::string debug_;
|
||||
// Misadaption debug information (filled in if this word was misadapted to).
|
||||
std::string misadaption_debug_;
|
||||
// Vectors populated by SegSearch to indicate column and row indices that
|
||||
// correspond to blobs with correct bounding boxes.
|
||||
std::vector<int> correct_segmentation_cols_;
|
||||
std::vector<int> correct_segmentation_rows_;
|
||||
// Best rating for correctly segmented path
|
||||
// (set and used by SegSearch when looking for blame).
|
||||
float best_correctly_segmented_rating_;
|
||||
int lattice_size_; // size of lattice_data in bytes
|
||||
// Serialized segmentation search lattice.
|
||||
char *lattice_data_;
|
||||
// Information about hypotheses (paths) explored by the segmentation search.
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
tesseract::ParamsTrainingBundle params_training_bundle_;
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_BLAMER_H_
|
1081
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.cpp
vendored
Normal file
1081
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
853
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.h
vendored
Normal file
853
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobbox.h
vendored
Normal file
|
@ -0,0 +1,853 @@
|
|||
/**********************************************************************
|
||||
* File: blobbox.h (Formerly blobnbox.h)
|
||||
* Description: Code for the textord blob class.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef BLOBBOX_H
|
||||
#define BLOBBOX_H
|
||||
|
||||
#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
|
||||
#include "elst2.h" // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#include "ocrblock.h" // for BLOCK
|
||||
#include "params.h" // for DoubleParam, double_VAR_H
|
||||
#include "pdblock.h" // for PDBLK
|
||||
#include "points.h" // for FCOORD, ICOORD, ICOORDELT_LIST
|
||||
#include "quspline.h" // for QSPLINE
|
||||
#include "rect.h" // for TBOX
|
||||
#include "scrollview.h" // for ScrollView, ScrollView::Color
|
||||
#include "statistc.h" // for STATS
|
||||
#include "stepblob.h" // for C_BLOB
|
||||
#include "tprintf.h" // for tprintf
|
||||
#include "werd.h" // for WERD_LIST
|
||||
|
||||
#include <cinttypes> // for PRId32
|
||||
#include <cmath> // for std::sqrt
|
||||
#include <cstdint> // for int16_t, int32_t
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class C_OUTLINE;
|
||||
|
||||
enum PITCH_TYPE {
|
||||
PITCH_DUNNO, // insufficient data
|
||||
PITCH_DEF_FIXED, // definitely fixed
|
||||
PITCH_MAYBE_FIXED, // could be
|
||||
PITCH_DEF_PROP,
|
||||
PITCH_MAYBE_PROP,
|
||||
PITCH_CORR_FIXED,
|
||||
PITCH_CORR_PROP
|
||||
};
|
||||
|
||||
// The possible tab-stop types of each side of a BLOBNBOX.
|
||||
// The ordering is important, as it is used for deleting dead-ends in the
|
||||
// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
|
||||
// non-aligned, unset, or deleted members.
|
||||
enum TabType {
|
||||
TT_NONE, // Not a tab.
|
||||
TT_DELETED, // Not a tab after detailed analysis.
|
||||
TT_MAYBE_RAGGED, // Initial designation of a tab-stop candidate.
|
||||
TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
|
||||
TT_CONFIRMED, // Aligned with neighbours.
|
||||
TT_VLINE // Detected as a vertical line.
|
||||
};
|
||||
|
||||
// The possible region types of a BLOBNBOX.
|
||||
// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
|
||||
// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
|
||||
// *Type static functions below.
|
||||
enum BlobRegionType {
|
||||
BRT_NOISE, // Neither text nor image.
|
||||
BRT_HLINE, // Horizontal separator line.
|
||||
BRT_VLINE, // Vertical separator line.
|
||||
BRT_RECTIMAGE, // Rectangular image.
|
||||
BRT_POLYIMAGE, // Non-rectangular image.
|
||||
BRT_UNKNOWN, // Not determined yet.
|
||||
BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
|
||||
BRT_TEXT, // Convincing text.
|
||||
|
||||
BRT_COUNT // Number of possibilities.
|
||||
};
|
||||
|
||||
// enum for elements of arrays that refer to neighbours.
|
||||
// NOTE: keep in this order, so ^2 can be used to flip direction.
|
||||
enum BlobNeighbourDir { BND_LEFT, BND_BELOW, BND_RIGHT, BND_ABOVE, BND_COUNT };
|
||||
|
||||
// enum for special type of text characters, such as math symbol or italic.
|
||||
enum BlobSpecialTextType {
|
||||
BSTT_NONE, // No special.
|
||||
BSTT_ITALIC, // Italic style.
|
||||
BSTT_DIGIT, // Digit symbols.
|
||||
BSTT_MATH, // Mathematical symbols (not including digit).
|
||||
BSTT_UNCLEAR, // Characters with low recognition rate.
|
||||
BSTT_SKIP, // Characters that we skip labeling (usually too small).
|
||||
BSTT_COUNT
|
||||
};
|
||||
|
||||
inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
  // Opposite directions sit two places apart in BlobNeighbourDir, so
  // toggling bit 1 swaps LEFT<->RIGHT and BELOW<->ABOVE.
  const int flipped = dir ^ 2;
  return static_cast<BlobNeighbourDir>(flipped);
}
|
||||
|
||||
// BlobTextFlowType indicates the quality of neighbouring information
|
||||
// related to a chain of connected components, either horizontally or
|
||||
// vertically. Also used by ColPartition for the collection of blobs
|
||||
// within, which should all have the same value in most cases.
|
||||
enum BlobTextFlowType {
|
||||
BTFT_NONE, // No text flow set yet.
|
||||
BTFT_NONTEXT, // Flow too poor to be likely text.
|
||||
BTFT_NEIGHBOURS, // Neighbours support flow in this direction.
|
||||
BTFT_CHAIN, // There is a weak chain of text in this direction.
|
||||
BTFT_STRONG_CHAIN, // There is a strong chain of text in this direction.
|
||||
BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
|
||||
BTFT_LEADER, // Leader dots/dashes etc.
|
||||
BTFT_COUNT
|
||||
};
|
||||
|
||||
// Returns true if type1 dominates type2 in a merge. Mostly determined by the
|
||||
// ordering of the enum, LEADER is weak and dominates nothing.
|
||||
// The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
|
||||
// this cannot be true if t1 == t2, so the result is undefined.
|
||||
inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
  // A leader never dominates, and any non-leader dominates a leader.
  const bool leader_involved = type1 == BTFT_LEADER || type2 == BTFT_LEADER;
  if (leader_involved) {
    return type1 != BTFT_LEADER;
  }
  // No leader on either side: the enum ordering decides.
  return type1 >= type2;
}
|
||||
|
||||
class ColPartition;
|
||||
|
||||
class BLOBNBOX;
|
||||
ELISTIZEH(BLOBNBOX)
|
||||
class BLOBNBOX : public ELIST_LINK {
|
||||
public:
|
||||
BLOBNBOX() {
|
||||
ReInit();
|
||||
}
|
||||
explicit BLOBNBOX(C_BLOB *srcblob) {
|
||||
box = srcblob->bounding_box();
|
||||
ReInit();
|
||||
cblob_ptr = srcblob;
|
||||
area = static_cast<int>(srcblob->area());
|
||||
}
|
||||
~BLOBNBOX() {
|
||||
if (owns_cblob_) {
|
||||
delete cblob_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
static void clear_blobnboxes(BLOBNBOX_LIST *boxes) {
  // A BLOBNBOX generally doesn't own its blob, so each one is detached
  // from its box and deleted explicitly here.
  // TODO: remove the delete, currently still needed for resultiterator_test.
  BLOBNBOX_IT iter = boxes;
  iter.mark_cycle_pt();
  while (!iter.cycled_list()) {
    delete iter.data()->remove_cblob();
    iter.forward();
  }
}
|
||||
|
||||
static BLOBNBOX *RealBlob(C_OUTLINE *outline) {
|
||||
auto *blob = new C_BLOB(outline);
|
||||
return new BLOBNBOX(blob);
|
||||
}
|
||||
|
||||
// Rotates the box and the underlying blob.
|
||||
void rotate(FCOORD rotation);
|
||||
|
||||
// Methods that act on the box without touching the underlying blob.
|
||||
// Reflect the box in the y-axis, leaving the underlying blob untouched.
|
||||
void reflect_box_in_y_axis();
|
||||
// Rotates the box by the angle given by rotation.
|
||||
// If the blob is a diacritic, then only small rotations for skew
|
||||
// correction can be applied.
|
||||
void rotate_box(FCOORD rotation);
|
||||
// Moves just the box by the given vector.
|
||||
void translate_box(ICOORD v) {
  // Decide before moving: IsDiacritic() compares against the current box.
  const bool was_diacritic = IsDiacritic();
  box.move(v);
  if (was_diacritic) {
    // Shift the base character extent by the same vertical offset.
    base_char_top_ += v.y();
    base_char_bottom_ += v.y();
    return;
  }
  // Not a diacritic: the base character extent is reset from the moved box.
  set_diacritic_box(box);
}
|
||||
void merge(BLOBNBOX *nextblob);
|
||||
void really_merge(BLOBNBOX *other);
|
||||
void chop( // fake chop blob
|
||||
BLOBNBOX_IT *start_it, // location of this
|
||||
BLOBNBOX_IT *blob_it, // iterator
|
||||
FCOORD rotation, // for landscape
|
||||
float xheight); // line height
|
||||
|
||||
void NeighbourGaps(int gaps[BND_COUNT]) const;
|
||||
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const;
|
||||
void CleanNeighbours();
|
||||
// Returns positive if there is at least one side neighbour that has a
|
||||
// similar stroke width and is not on the other side of a rule line.
|
||||
int GoodTextBlob() const;
|
||||
// Returns the number of side neighbours that are of type BRT_NOISE.
|
||||
int NoisyNeighbours() const;
|
||||
|
||||
// Returns true if the blob is noise and has no owner.
|
||||
bool DeletableNoise() const {
|
||||
return owner() == nullptr && region_type() == BRT_NOISE;
|
||||
}
|
||||
|
||||
// Returns true, and sets vert_possible/horz_possible if the blob has some
|
||||
// feature that makes it individually appear to flow one way.
|
||||
// eg if it has a high aspect ratio, yet has a complex shape, such as a
|
||||
// joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
|
||||
bool DefiniteIndividualFlow();
|
||||
|
||||
// Returns true if there is no tabstop violation in merging this and other.
|
||||
bool ConfirmNoTabViolation(const BLOBNBOX &other) const;
|
||||
|
||||
// Returns true if other has a similar stroke width to this.
|
||||
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,
|
||||
double constant_tolerance) const;
|
||||
|
||||
// Returns a bounding box of the outline contained within the
|
||||
// given horizontal range.
|
||||
TBOX BoundsWithinLimits(int left, int right);
|
||||
|
||||
// Estimates and stores the baseline position based on the shape of the
|
||||
// outline.
|
||||
void EstimateBaselinePosition();
|
||||
|
||||
// Simple accessors.
|
||||
const TBOX &bounding_box() const {
|
||||
return box;
|
||||
}
|
||||
// Set the bounding box. Use with caution.
|
||||
// Normally use compute_bounding_box instead.
|
||||
void set_bounding_box(const TBOX &new_box) {
|
||||
box = new_box;
|
||||
base_char_top_ = box.top();
|
||||
base_char_bottom_ = box.bottom();
|
||||
}
|
||||
void compute_bounding_box() {
|
||||
box = cblob_ptr->bounding_box();
|
||||
base_char_top_ = box.top();
|
||||
base_char_bottom_ = box.bottom();
|
||||
baseline_y_ = box.bottom();
|
||||
}
|
||||
const TBOX &reduced_box() const {
|
||||
return red_box;
|
||||
}
|
||||
void set_reduced_box(TBOX new_box) {
|
||||
red_box = new_box;
|
||||
reduced = true;
|
||||
}
|
||||
int32_t enclosed_area() const {
|
||||
return area;
|
||||
}
|
||||
bool joined_to_prev() const {
|
||||
return joined;
|
||||
}
|
||||
bool red_box_set() const {
|
||||
return reduced;
|
||||
}
|
||||
int repeated_set() const {
|
||||
return repeated_set_;
|
||||
}
|
||||
void set_repeated_set(int set_id) {
|
||||
repeated_set_ = set_id;
|
||||
}
|
||||
C_BLOB *cblob() const {
|
||||
return cblob_ptr;
|
||||
}
|
||||
C_BLOB *remove_cblob() {
|
||||
auto blob = cblob_ptr;
|
||||
cblob_ptr = nullptr;
|
||||
owns_cblob_ = false;
|
||||
return blob;
|
||||
}
|
||||
TabType left_tab_type() const {
|
||||
return left_tab_type_;
|
||||
}
|
||||
void set_left_tab_type(TabType new_type) {
|
||||
left_tab_type_ = new_type;
|
||||
}
|
||||
TabType right_tab_type() const {
|
||||
return right_tab_type_;
|
||||
}
|
||||
void set_right_tab_type(TabType new_type) {
|
||||
right_tab_type_ = new_type;
|
||||
}
|
||||
BlobRegionType region_type() const {
|
||||
return region_type_;
|
||||
}
|
||||
void set_region_type(BlobRegionType new_type) {
|
||||
region_type_ = new_type;
|
||||
}
|
||||
BlobSpecialTextType special_text_type() const {
|
||||
return spt_type_;
|
||||
}
|
||||
void set_special_text_type(BlobSpecialTextType new_type) {
|
||||
spt_type_ = new_type;
|
||||
}
|
||||
BlobTextFlowType flow() const {
|
||||
return flow_;
|
||||
}
|
||||
void set_flow(BlobTextFlowType value) {
|
||||
flow_ = value;
|
||||
}
|
||||
bool vert_possible() const {
|
||||
return vert_possible_;
|
||||
}
|
||||
void set_vert_possible(bool value) {
|
||||
vert_possible_ = value;
|
||||
}
|
||||
bool horz_possible() const {
|
||||
return horz_possible_;
|
||||
}
|
||||
void set_horz_possible(bool value) {
|
||||
horz_possible_ = value;
|
||||
}
|
||||
int left_rule() const {
|
||||
return left_rule_;
|
||||
}
|
||||
void set_left_rule(int new_left) {
|
||||
left_rule_ = new_left;
|
||||
}
|
||||
int right_rule() const {
|
||||
return right_rule_;
|
||||
}
|
||||
void set_right_rule(int new_right) {
|
||||
right_rule_ = new_right;
|
||||
}
|
||||
int left_crossing_rule() const {
|
||||
return left_crossing_rule_;
|
||||
}
|
||||
void set_left_crossing_rule(int new_left) {
|
||||
left_crossing_rule_ = new_left;
|
||||
}
|
||||
int right_crossing_rule() const {
|
||||
return right_crossing_rule_;
|
||||
}
|
||||
void set_right_crossing_rule(int new_right) {
|
||||
right_crossing_rule_ = new_right;
|
||||
}
|
||||
float horz_stroke_width() const {
|
||||
return horz_stroke_width_;
|
||||
}
|
||||
void set_horz_stroke_width(float width) {
|
||||
horz_stroke_width_ = width;
|
||||
}
|
||||
float vert_stroke_width() const {
|
||||
return vert_stroke_width_;
|
||||
}
|
||||
void set_vert_stroke_width(float width) {
|
||||
vert_stroke_width_ = width;
|
||||
}
|
||||
float area_stroke_width() const {
|
||||
return area_stroke_width_;
|
||||
}
|
||||
tesseract::ColPartition *owner() const {
|
||||
return owner_;
|
||||
}
|
||||
void set_owner(tesseract::ColPartition *new_owner) {
|
||||
owner_ = new_owner;
|
||||
}
|
||||
bool leader_on_left() const {
|
||||
return leader_on_left_;
|
||||
}
|
||||
void set_leader_on_left(bool flag) {
|
||||
leader_on_left_ = flag;
|
||||
}
|
||||
bool leader_on_right() const {
|
||||
return leader_on_right_;
|
||||
}
|
||||
void set_leader_on_right(bool flag) {
|
||||
leader_on_right_ = flag;
|
||||
}
|
||||
BLOBNBOX *neighbour(BlobNeighbourDir n) const {
|
||||
return neighbours_[n];
|
||||
}
|
||||
bool good_stroke_neighbour(BlobNeighbourDir n) const {
|
||||
return good_stroke_neighbours_[n];
|
||||
}
|
||||
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good) {
|
||||
neighbours_[n] = neighbour;
|
||||
good_stroke_neighbours_[n] = good;
|
||||
}
|
||||
bool IsDiacritic() const {
|
||||
return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
|
||||
}
|
||||
int base_char_top() const {
|
||||
return base_char_top_;
|
||||
}
|
||||
int base_char_bottom() const {
|
||||
return base_char_bottom_;
|
||||
}
|
||||
int baseline_position() const {
|
||||
return baseline_y_;
|
||||
}
|
||||
int line_crossings() const {
|
||||
return line_crossings_;
|
||||
}
|
||||
void set_line_crossings(int value) {
|
||||
line_crossings_ = value;
|
||||
}
|
||||
void set_diacritic_box(const TBOX &diacritic_box) {
|
||||
base_char_top_ = diacritic_box.top();
|
||||
base_char_bottom_ = diacritic_box.bottom();
|
||||
}
|
||||
BLOBNBOX *base_char_blob() const {
|
||||
return base_char_blob_;
|
||||
}
|
||||
void set_base_char_blob(BLOBNBOX *blob) {
|
||||
base_char_blob_ = blob;
|
||||
}
|
||||
void set_owns_cblob(bool value) {
|
||||
owns_cblob_ = value;
|
||||
}
|
||||
|
||||
bool UniquelyVertical() const {
|
||||
return vert_possible_ && !horz_possible_;
|
||||
}
|
||||
bool UniquelyHorizontal() const {
|
||||
return horz_possible_ && !vert_possible_;
|
||||
}
|
||||
|
||||
// Returns true if the region type is text.
|
||||
static bool IsTextType(BlobRegionType type) {
|
||||
return type == BRT_TEXT || type == BRT_VERT_TEXT;
|
||||
}
|
||||
// Returns true if the region type is image.
|
||||
static bool IsImageType(BlobRegionType type) {
|
||||
return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
|
||||
}
|
||||
// Returns true if the region type is line.
|
||||
static bool IsLineType(BlobRegionType type) {
|
||||
return type == BRT_HLINE || type == BRT_VLINE;
|
||||
}
|
||||
// Returns true if the region type cannot be merged.
|
||||
static bool UnMergeableType(BlobRegionType type) {
|
||||
return IsLineType(type) || IsImageType(type);
|
||||
}
|
||||
// Helper to call CleanNeighbours on all blobs on the list.
|
||||
static void CleanNeighbours(BLOBNBOX_LIST *blobs);
|
||||
// Helper to delete all the deletable blobs on the list.
|
||||
static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs);
|
||||
// Helper to compute edge offsets for all the blobs on the list.
|
||||
// See coutln.h for an explanation of edge offsets.
|
||||
static void ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs);
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
// Helper to draw all the blobs on the list in the given body_colour,
|
||||
// with child outlines in the child_colour.
|
||||
static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
|
||||
ScrollView::Color child_colour, ScrollView *win);
|
||||
// Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
|
||||
// given list in the given body_colour, with child outlines in the
|
||||
// child_colour.
|
||||
static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
|
||||
ScrollView::Color child_colour, ScrollView *win);
|
||||
|
||||
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type);
|
||||
|
||||
// Keep in sync with BlobRegionType.
|
||||
ScrollView::Color BoxColor() const;
|
||||
|
||||
void plot(ScrollView *window, // window to draw in
|
||||
ScrollView::Color blob_colour, // for outer bits
|
||||
ScrollView::Color child_colour); // for holes
|
||||
#endif
|
||||
|
||||
// Initializes members set by StrokeWidth and beyond, without discarding
|
||||
// stored area and strokewidth values, which are expensive to calculate.
|
||||
void ReInit() {
|
||||
joined = false;
|
||||
reduced = false;
|
||||
repeated_set_ = 0;
|
||||
left_tab_type_ = TT_NONE;
|
||||
right_tab_type_ = TT_NONE;
|
||||
region_type_ = BRT_UNKNOWN;
|
||||
flow_ = BTFT_NONE;
|
||||
spt_type_ = BSTT_SKIP;
|
||||
left_rule_ = 0;
|
||||
right_rule_ = 0;
|
||||
left_crossing_rule_ = 0;
|
||||
right_crossing_rule_ = 0;
|
||||
if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr && cblob()->perimeter() != 0) {
|
||||
area_stroke_width_ = 2.0f * area / cblob()->perimeter();
|
||||
}
|
||||
owner_ = nullptr;
|
||||
base_char_top_ = box.top();
|
||||
base_char_bottom_ = box.bottom();
|
||||
baseline_y_ = box.bottom();
|
||||
line_crossings_ = 0;
|
||||
base_char_blob_ = nullptr;
|
||||
horz_possible_ = false;
|
||||
vert_possible_ = false;
|
||||
leader_on_left_ = false;
|
||||
leader_on_right_ = false;
|
||||
ClearNeighbours();
|
||||
}
|
||||
|
||||
void ClearNeighbours() {
|
||||
for (int n = 0; n < BND_COUNT; ++n) {
|
||||
neighbours_[n] = nullptr;
|
||||
good_stroke_neighbours_[n] = false;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
C_BLOB *cblob_ptr = nullptr; // edgestep blob
|
||||
TBOX box; // bounding box
|
||||
TBOX red_box; // bounding box
|
||||
int32_t area = 0; // enclosed area
|
||||
int32_t repeated_set_ = 0; // id of the set of repeated blobs
|
||||
TabType left_tab_type_ = TT_NONE; // Indicates tab-stop assessment
|
||||
TabType right_tab_type_ = TT_NONE; // Indicates tab-stop assessment
|
||||
BlobRegionType region_type_ = BRT_UNKNOWN; // Type of region this blob belongs to
|
||||
BlobTextFlowType flow_ = BTFT_NONE; // Quality of text flow.
|
||||
BlobSpecialTextType spt_type_; // Special text type.
|
||||
bool joined = false; // joined to prev
|
||||
bool reduced = false; // reduced box set
|
||||
int16_t left_rule_ = 0; // x-coord of nearest but not crossing rule line
|
||||
int16_t right_rule_ = 0; // x-coord of nearest but not crossing rule line
|
||||
int16_t left_crossing_rule_; // x-coord of nearest or crossing rule line
|
||||
int16_t right_crossing_rule_; // x-coord of nearest or crossing rule line
|
||||
int16_t base_char_top_; // y-coord of top/bottom of diacritic base,
|
||||
int16_t base_char_bottom_; // if it exists else top/bottom of this blob.
|
||||
int16_t baseline_y_; // Estimate of baseline position.
|
||||
int32_t line_crossings_; // Number of line intersections touched.
|
||||
BLOBNBOX *base_char_blob_; // The blob that was the base char.
|
||||
tesseract::ColPartition *owner_; // Who will delete me when I am not needed
|
||||
BLOBNBOX *neighbours_[BND_COUNT];
|
||||
float horz_stroke_width_ = 0.0f; // Median horizontal stroke width
|
||||
float vert_stroke_width_ = 0.0f; // Median vertical stroke width
|
||||
float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.
|
||||
bool good_stroke_neighbours_[BND_COUNT];
|
||||
bool horz_possible_; // Could be part of horizontal flow.
|
||||
bool vert_possible_; // Could be part of vertical flow.
|
||||
bool leader_on_left_; // There is a leader to the left.
|
||||
bool leader_on_right_; // There is a leader to the right.
|
||||
// Iff true, then the destructor should delete the cblob_ptr.
|
||||
// TODO(rays) migrate all uses to correctly setting this flag instead of
|
||||
// deleting the C_BLOB before deleting the BLOBNBOX.
|
||||
bool owns_cblob_ = false;
|
||||
};
|
||||
|
||||
class TO_ROW : public ELIST2_LINK {
|
||||
public:
|
||||
static const int kErrorWeight = 3;
|
||||
|
||||
TO_ROW() {
|
||||
clear();
|
||||
} // empty
|
||||
TO_ROW( // constructor
|
||||
BLOBNBOX *blob, // from first blob
|
||||
float top, // of row //target height
|
||||
float bottom, float row_size);
|
||||
|
||||
void print() const;
|
||||
float max_y() const { // access function
|
||||
return y_max;
|
||||
}
|
||||
float min_y() const {
|
||||
return y_min;
|
||||
}
|
||||
float mean_y() const {
|
||||
return (y_min + y_max) / 2.0f;
|
||||
}
|
||||
float initial_min_y() const {
|
||||
return initial_y_min;
|
||||
}
|
||||
float line_m() const { // access to line fit
|
||||
return m;
|
||||
}
|
||||
float line_c() const {
|
||||
return c;
|
||||
}
|
||||
float line_error() const {
|
||||
return error;
|
||||
}
|
||||
float parallel_c() const {
|
||||
return para_c;
|
||||
}
|
||||
float parallel_error() const {
|
||||
return para_error;
|
||||
}
|
||||
float believability() const { // baseline goodness
|
||||
return credibility;
|
||||
}
|
||||
float intercept() const { // real parallel_c
|
||||
return y_origin;
|
||||
}
|
||||
void add_blob( // put in row
|
||||
BLOBNBOX *blob, // blob to add
|
||||
float top, // of row //target height
|
||||
float bottom, float row_size);
|
||||
void insert_blob( // put in row in order
|
||||
BLOBNBOX *blob);
|
||||
|
||||
BLOBNBOX_LIST *blob_list() { // get list
|
||||
return &blobs;
|
||||
}
|
||||
|
||||
void set_line( // set line spec
|
||||
float new_m, // line to set
|
||||
float new_c, float new_error) {
|
||||
m = new_m;
|
||||
c = new_c;
|
||||
error = new_error;
|
||||
}
|
||||
void set_parallel_line( // set fixed gradient line
|
||||
float gradient, // page gradient
|
||||
float new_c, float new_error) {
|
||||
para_c = new_c;
|
||||
para_error = new_error;
|
||||
credibility = blobs.length() - kErrorWeight * new_error;
|
||||
y_origin = new_c / std::sqrt(1 + gradient * gradient);
|
||||
// real intercept
|
||||
}
|
||||
void set_limits( // set min,max
|
||||
float new_min, // bottom and
|
||||
float new_max) { // top of row
|
||||
y_min = new_min;
|
||||
y_max = new_max;
|
||||
}
|
||||
void compute_vertical_projection();
|
||||
// get projection
|
||||
|
||||
bool rep_chars_marked() const {
|
||||
return num_repeated_sets_ != -1;
|
||||
}
|
||||
void clear_rep_chars_marked() {
|
||||
num_repeated_sets_ = -1;
|
||||
}
|
||||
int num_repeated_sets() const {
|
||||
return num_repeated_sets_;
|
||||
}
|
||||
void set_num_repeated_sets(int num_sets) {
|
||||
num_repeated_sets_ = num_sets;
|
||||
}
|
||||
|
||||
// true when dead
|
||||
bool merged = false;
|
||||
bool all_caps; // had no ascenders
|
||||
bool used_dm_model; // in guessing pitch
|
||||
int16_t projection_left; // start of projection
|
||||
int16_t projection_right; // start of projection
|
||||
PITCH_TYPE pitch_decision; // how strong is decision
|
||||
float fixed_pitch; // pitch or 0
|
||||
float fp_space; // sp if fixed pitch
|
||||
float fp_nonsp; // nonsp if fixed pitch
|
||||
float pr_space; // sp if prop
|
||||
float pr_nonsp; // non sp if prop
|
||||
float spacing; // to "next" row
|
||||
float xheight; // of line
|
||||
int xheight_evidence; // number of blobs of height xheight
|
||||
float ascrise; // ascenders
|
||||
float descdrop; // descenders
|
||||
float body_size; // of CJK characters. Assumed to be
|
||||
// xheight+ascrise for non-CJK text.
|
||||
int32_t min_space; // min size for real space
|
||||
int32_t max_nonspace; // max size of non-space
|
||||
int32_t space_threshold; // space vs nonspace
|
||||
float kern_size; // average non-space
|
||||
float space_size; // average space
|
||||
WERD_LIST rep_words; // repeated chars
|
||||
ICOORDELT_LIST char_cells; // fixed pitch cells
|
||||
QSPLINE baseline; // curved baseline
|
||||
STATS projection; // vertical projection
|
||||
|
||||
private:
|
||||
void clear(); // clear all values to reasonable defaults
|
||||
|
||||
BLOBNBOX_LIST blobs; // blobs in row
|
||||
float y_min; // coords
|
||||
float y_max;
|
||||
float initial_y_min;
|
||||
float m, c; // line spec
|
||||
float error; // line error
|
||||
float para_c; // constrained fit
|
||||
float para_error;
|
||||
float y_origin; // rotated para_c;
|
||||
float credibility; // baseline believability
|
||||
int num_repeated_sets_; // number of sets of repeated blobs
|
||||
// set to -1 if we have not searched
|
||||
// for repeated blobs in this row yet
|
||||
};
|
||||
|
||||
// NOTE(review): presumably declares the TO_ROW_LIST / TO_ROW_IT types that
// TO_BLOCK uses below - confirm against the ELIST2IZEH macro definition.
ELIST2IZEH(TO_ROW)
|
||||
// Groups the blobs of one block by size class (medium, underline, noise,
// small, large) together with the temporary rows and the block-level
// size/pitch/spacing values.
class TESS_API TO_BLOCK : public ELIST_LINK {
public:
  // Creates an empty block; scalar members are reset by clear().
  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
    clear();
  }
  // Creates a TO_BLOCK for the real block src_block.
  TO_BLOCK(BLOCK *src_block);
  ~TO_BLOCK();

  // Clears all scalar members.
  void clear();

  // Returns the list of temporary rows.
  TO_ROW_LIST *get_rows() {
    return &row_list;
  }

  // Rotates all the blobnbox lists and the underlying block, then updates
  // the median size statistic from the blobs list.
  void rotate(const FCOORD &rotation) {
    BLOBNBOX_LIST *const all_lists[] = {&blobs, &underlines, &noise_blobs, &small_blobs,
                                        &large_blobs};
    for (BLOBNBOX_LIST *current : all_lists) {
      BLOBNBOX_IT it(current);
      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
        it.data()->rotate(rotation);
      }
    }

    // Rotate the block itself; it must have a polygonal outline.
    ASSERT_HOST(block->pdblk.poly_block() != nullptr);
    block->rotate(rotation);

    // Rebuild the median width/height from the medium-size blobs only.
    STATS widths(0, block->pdblk.bounding_box().width());
    STATS heights(0, block->pdblk.bounding_box().height());
    BLOBNBOX_IT blob_it(&blobs);
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      auto *blob = blob_it.data();
      widths.add(blob->bounding_box().width(), 1);
      heights.add(blob->bounding_box().height(), 1);
    }
    // Adding 0.5 before the cast rounds the medians to the nearest integer.
    const int median_width = static_cast<int>(widths.median() + 0.5);
    const int median_height = static_cast<int>(heights.median() + 0.5);
    block->set_median_size(median_width, median_height);
  }

  // Debug output: prints range, para_c and blob count of every row.
  void print_rows() {
    TO_ROW_IT row_it(&row_list);
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      TO_ROW *row = row_it.data();
      tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n", row->min_y(), row->max_y(),
              row->parallel_c(), row->blob_list()->length());
    }
  }

  // Reorganizes the blob lists with a different definition of small, medium
  // and large, compared to the original definition.
  // Height is still the primary filter key, but medium width blobs of small
  // height become medium, and very wide blobs of small height stay small.
  void ReSetAndReFilterBlobs();

  // Deletes noise blobs from all lists where not owned by a ColPartition.
  void DeleteUnownedNoise();

  // Computes and stores the edge offsets on each blob for use in feature
  // extraction, using greyscale if the supplied grey and thresholds pixes
  // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
  // edge step outlines.
  // Thresholds must either be the same size as grey or an integer down-scale
  // of grey.
  // See coutln.h for an explanation of edge offsets.
  void ComputeEdgeOffsets(Image thresholds, Image grey);

#ifndef GRAPHICS_DISABLED
  // Draw the noise blobs from all lists in red.
  void plot_noise_blobs(ScrollView *to_win);
  // Draw the blobs on the various lists in the block in different colors.
  void plot_graded_blobs(ScrollView *to_win);
#endif

  BLOBNBOX_LIST blobs;        // medium size
  BLOBNBOX_LIST underlines;   // underline blobs
  BLOBNBOX_LIST noise_blobs;  // very small
  BLOBNBOX_LIST small_blobs;  // fairly small
  BLOBNBOX_LIST large_blobs;  // big blobs
  BLOCK *block;               // real block
  PITCH_TYPE pitch_decision;  // how strong is decision
  float line_spacing;         // estimate
  // line_size is a lower-bound estimate of the font size in pixels of
  // the text in the block (with ascenders and descenders), being a small
  // (1.25) multiple of the median height of filtered blobs.
  // In most cases the font size will be bigger, but it will be closer
  // if the text is allcaps, or in a no-x-height script.
  float line_size;        // estimate
  float max_blob_size;    // line assignment limit
  float baseline_offset;  // phase shift
  float xheight;          // median blob size
  float fixed_pitch;      // pitch or 0
  float kern_size;        // average non-space
  float space_size;       // average space
  int32_t min_space;      // min definite space
  int32_t max_nonspace;   // max definite
  float fp_space;         // sp if fixed pitch
  float fp_nonsp;         // nonsp if fixed pitch
  float pr_space;         // sp if prop
  float pr_nonsp;         // non sp if prop
  TO_ROW *key_row;        // starting row

private:
  TO_ROW_LIST row_list;  // temporary rows
};
|
||||
|
||||
// NOTE(review): presumably declares the list and iterator types for
// TO_BLOCK - confirm against the ELISTIZEH macro definition.
ELISTIZEH(TO_BLOCK)
|
||||
// Declares the textord_error_weight parameter ("Weighting for error in
// believability"); the value 3 passed here matches TO_ROW::kErrorWeight.
// NOTE(review): the exact expansion depends on the double_VAR_H macro.
extern double_VAR_H(textord_error_weight, 3, "Weighting for error in believability");
|
||||
// Gets the y limits of a blob.
//   blob          - blob to search.
//   leftx, rightx - x limits.
//   rotation      - for landscape.
//   ymin, ymax    - output y limits.
void find_cblob_limits(C_BLOB *blob, float leftx, float rightx, FCOORD rotation, float &ymin,
                       float &ymax);
|
||||
// Gets the y limits of a blob (no rotation argument).
//   blob          - blob to search.
//   leftx, rightx - x limits.
//   ymin, ymax    - output y limits.
void find_cblob_vlimits(C_BLOB *blob, float leftx, float rightx, float &ymin, float &ymax);
|
||||
// Gets the x limits of a blob.
//   blob          - blob to search.
//   bottomy, topy - y limits.
//   xmin, xymax   - output x limits (the second output is spelled "xymax"
//                   in this declaration).
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xymax);
|
||||
// Rotates a blob.
//   blob     - blob to rotate.
//   rotation - for landscape.
// NOTE(review): ownership of the returned C_BLOB is not visible from this
// declaration; check the definition before deleting or sharing it.
C_BLOB *crotate_cblob(C_BLOB *blob, FCOORD rotation);
|
||||
// Gets a bounding box.
//   it - iterator to blobs.
TBOX box_next(BLOBNBOX_IT *it);
|
||||
// Gets a bounding box (pre-chopped variant).
//   it - iterator to blobs.
TBOX box_next_pre_chopped(BLOBNBOX_IT *it);
|
||||
// Projects the outlines of a blob.
//   blob  - blob to project.
//   stats - output.
void vertical_cblob_projection(C_BLOB *blob, STATS *stats);
|
||||
// Projects a single outline.
//   outline - outline to project.
//   stats   - output.
void vertical_coutline_projection(C_OUTLINE *outline, STATS *stats);
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot_blob_list(ScrollView *win, // window to draw in
|
||||
BLOBNBOX_LIST *list, // blob list
|
||||
ScrollView::Color body_colour, // colour to draw
|
||||
ScrollView::Color child_colour); // colour of child
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
1006
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.cpp
vendored
Normal file
1006
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
476
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.h
vendored
Normal file
476
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blobs.h
vendored
Normal file
|
@ -0,0 +1,476 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* File: blobs.h
|
||||
* Description: Blob definition
|
||||
* Author: Mark Seaman, OCR Technology
|
||||
*
|
||||
* (c) Copyright 1989, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef BLOBS_H
|
||||
#define BLOBS_H
|
||||
|
||||
#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
|
||||
#include "normalis.h" // for DENORM
|
||||
#include "points.h" // for FCOORD, ICOORD
|
||||
#include "rect.h" // for TBOX
|
||||
#include "scrollview.h" // for ScrollView, ScrollView::Color
|
||||
|
||||
#include <tesseract/publictypes.h> // for OcrEngineMode
|
||||
|
||||
#include <cstdint> // for int16_t
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOCK;
|
||||
class C_BLOB;
|
||||
class C_OUTLINE;
|
||||
class LLSQ;
|
||||
class ROW;
|
||||
class WERD;
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
T y p e s
|
||||
----------------------------------------------------------------------*/
|
||||
|
||||
struct TPOINT {
|
||||
TPOINT() : x(0), y(0) {}
|
||||
TPOINT(int16_t vx, int16_t vy) : x(vx), y(vy) {}
|
||||
TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}
|
||||
|
||||
void operator+=(const TPOINT &other) {
|
||||
x += other.x;
|
||||
y += other.y;
|
||||
}
|
||||
void operator/=(int divisor) {
|
||||
x /= divisor;
|
||||
y /= divisor;
|
||||
}
|
||||
bool operator==(const TPOINT &other) const {
|
||||
return x == other.x && y == other.y;
|
||||
}
|
||||
// Returns true when the two line segments cross each other.
|
||||
// (Moved from outlines.cpp).
|
||||
static bool IsCrossed(const TPOINT &a0, const TPOINT &a1, const TPOINT &b0, const TPOINT &b1);
|
||||
|
||||
// Assign the difference from point p1 to point p2.
|
||||
void diff(const TPOINT &p1, const TPOINT &p2) {
|
||||
x = p1.x - p2.x;
|
||||
y = p1.y - p2.y;
|
||||
}
|
||||
|
||||
// Return cross product.
|
||||
int cross(const TPOINT &other) const {
|
||||
return x * other.y - y * other.x;
|
||||
}
|
||||
|
||||
// Return scalar or dot product.
|
||||
int dot(const TPOINT &other) const {
|
||||
return x * other.x + y * other.y;
|
||||
}
|
||||
|
||||
// Calculate length of vector.
|
||||
int length() const {
|
||||
return x * x + y * y;
|
||||
}
|
||||
|
||||
int16_t x; // absolute x coord.
|
||||
int16_t y; // absolute y coord.
|
||||
};
|
||||
|
||||
// VECTOR is the same type as TPOINT; the separate name is used where the
// value is a displacement (e.g. EDGEPT::vec, the vector to the next point).
using VECTOR = TPOINT; // structure for coordinates.
|
||||
|
||||
struct EDGEPT {
|
||||
EDGEPT() = default;
|
||||
EDGEPT(const EDGEPT &src) : next(nullptr), prev(nullptr) {
|
||||
CopyFrom(src);
|
||||
}
|
||||
EDGEPT &operator=(const EDGEPT &src) {
|
||||
CopyFrom(src);
|
||||
return *this;
|
||||
}
|
||||
// Copies the data elements, but leaves the pointers untouched.
|
||||
void CopyFrom(const EDGEPT &src) {
|
||||
pos = src.pos;
|
||||
vec = src.vec;
|
||||
is_hidden = src.is_hidden;
|
||||
runlength = src.runlength;
|
||||
dir = src.dir;
|
||||
fixed = src.fixed;
|
||||
src_outline = src.src_outline;
|
||||
start_step = src.start_step;
|
||||
step_count = src.step_count;
|
||||
}
|
||||
// Returns the squared distance between the points, with the x-component
|
||||
// weighted by x_factor.
|
||||
int WeightedDistance(const EDGEPT &other, int x_factor) const {
|
||||
int x_dist = pos.x - other.pos.x;
|
||||
int y_dist = pos.y - other.pos.y;
|
||||
return x_dist * x_dist * x_factor + y_dist * y_dist;
|
||||
}
|
||||
// Returns true if the positions are equal.
|
||||
bool EqualPos(const EDGEPT &other) const {
|
||||
return pos == other.pos;
|
||||
}
|
||||
// Returns the bounding box of the outline segment from *this to *end.
|
||||
// Ignores hidden edge flags.
|
||||
TBOX SegmentBox(const EDGEPT *end) const {
|
||||
TBOX box(pos.x, pos.y, pos.x, pos.y);
|
||||
const EDGEPT *pt = this;
|
||||
do {
|
||||
pt = pt->next;
|
||||
if (pt->pos.x < box.left()) {
|
||||
box.set_left(pt->pos.x);
|
||||
}
|
||||
if (pt->pos.x > box.right()) {
|
||||
box.set_right(pt->pos.x);
|
||||
}
|
||||
if (pt->pos.y < box.bottom()) {
|
||||
box.set_bottom(pt->pos.y);
|
||||
}
|
||||
if (pt->pos.y > box.top()) {
|
||||
box.set_top(pt->pos.y);
|
||||
}
|
||||
} while (pt != end && pt != this);
|
||||
return box;
|
||||
}
|
||||
// Returns the area of the outline segment from *this to *end.
|
||||
// Ignores hidden edge flags.
|
||||
int SegmentArea(const EDGEPT *end) const {
|
||||
int area = 0;
|
||||
const EDGEPT *pt = this->next;
|
||||
do {
|
||||
TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y);
|
||||
area += origin_vec.cross(pt->vec);
|
||||
pt = pt->next;
|
||||
} while (pt != end && pt != this);
|
||||
return area;
|
||||
}
|
||||
// Returns true if the number of points in the outline segment from *this to
|
||||
// *end is less that min_points and false if we get back to *this first.
|
||||
// Ignores hidden edge flags.
|
||||
bool ShortNonCircularSegment(int min_points, const EDGEPT *end) const {
|
||||
int count = 0;
|
||||
const EDGEPT *pt = this;
|
||||
do {
|
||||
if (pt == end) {
|
||||
return true;
|
||||
}
|
||||
pt = pt->next;
|
||||
++count;
|
||||
} while (pt != this && count <= min_points);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Accessors to hide or reveal a cut edge from feature extractors.
|
||||
void Hide() {
|
||||
is_hidden = true;
|
||||
}
|
||||
void Reveal() {
|
||||
is_hidden = false;
|
||||
}
|
||||
bool IsHidden() const {
|
||||
return is_hidden;
|
||||
}
|
||||
void MarkChop() {
|
||||
dir = 1;
|
||||
}
|
||||
bool IsChopPt() const {
|
||||
return dir != 0;
|
||||
}
|
||||
|
||||
TPOINT pos; // position
|
||||
VECTOR vec; // vector to next point
|
||||
bool is_hidden = false;
|
||||
uint8_t runlength = 0;
|
||||
int8_t dir = 0;
|
||||
int8_t fixed = 0;
|
||||
EDGEPT *next = nullptr; // anticlockwise element
|
||||
EDGEPT *prev = nullptr; // clockwise element
|
||||
C_OUTLINE *src_outline = nullptr; // Outline it came from.
|
||||
// The following fields are not used if src_outline is nullptr.
|
||||
int start_step = 0; // Location of pos in src_outline.
|
||||
int step_count = 0; // Number of steps used (may wrap around).
|
||||
};
|
||||
|
||||
// For use in chop and findseam to keep a list of which EDGEPTs were inserted.
|
||||
// CLISTIZEH comes from clst.h (see the include list of this header).
// NOTE(review): presumably declares the CLIST container/iterator types for
// EDGEPT - confirm against the macro definition.
CLISTIZEH(EDGEPT)
|
||||
|
||||
struct TESSLINE {
|
||||
TESSLINE() : is_hole(false), loop(nullptr), next(nullptr) {}
|
||||
TESSLINE(const TESSLINE &src) : loop(nullptr), next(nullptr) {
|
||||
CopyFrom(src);
|
||||
}
|
||||
~TESSLINE() {
|
||||
Clear();
|
||||
}
|
||||
TESSLINE &operator=(const TESSLINE &src) {
|
||||
CopyFrom(src);
|
||||
return *this;
|
||||
}
|
||||
// Consume the circular list of EDGEPTs to make a TESSLINE.
|
||||
static TESSLINE *BuildFromOutlineList(EDGEPT *outline);
|
||||
// Copies the data and the outline, but leaves next untouched.
|
||||
void CopyFrom(const TESSLINE &src);
|
||||
// Deletes owned data.
|
||||
void Clear();
|
||||
// Normalize in-place using the DENORM.
|
||||
void Normalize(const DENORM &denorm);
|
||||
// Rotates by the given rotation in place.
|
||||
void Rotate(const FCOORD rotation);
|
||||
// Moves by the given vec in place.
|
||||
void Move(const ICOORD vec);
|
||||
// Scales by the given factor in place.
|
||||
void Scale(float factor);
|
||||
// Sets up the start and vec members of the loop from the pos members.
|
||||
void SetupFromPos();
|
||||
// Recomputes the bounding box from the points in the loop.
|
||||
void ComputeBoundingBox();
|
||||
// Computes the min and max cross product of the outline points with the
|
||||
// given vec and returns the results in min_xp and max_xp. Geometrically
|
||||
// this is the left and right edge of the outline perpendicular to the
|
||||
// given direction, but to get the distance units correct, you would
|
||||
// have to divide by the modulus of vec.
|
||||
void MinMaxCrossProduct(const TPOINT vec, int *min_xp, int *max_xp) const;
|
||||
|
||||
TBOX bounding_box() const;
|
||||
// Returns true if *this and other have equal bounding boxes.
|
||||
bool SameBox(const TESSLINE &other) const {
|
||||
return topleft == other.topleft && botright == other.botright;
|
||||
}
|
||||
// Returns true if the given line segment crosses any outline of this blob.
|
||||
bool SegmentCrosses(const TPOINT &pt1, const TPOINT &pt2) const {
|
||||
if (Contains(pt1) && Contains(pt2)) {
|
||||
EDGEPT *pt = loop;
|
||||
do {
|
||||
if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) {
|
||||
return true;
|
||||
}
|
||||
pt = pt->next;
|
||||
} while (pt != loop);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Returns true if the point is contained within the outline box.
|
||||
bool Contains(const TPOINT &pt) const {
|
||||
return topleft.x <= pt.x && pt.x <= botright.x && botright.y <= pt.y && pt.y <= topleft.y;
|
||||
}
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
// Returns the first outline point that has a different src_outline to its
|
||||
// predecessor, or, if all the same, the lowest indexed point.
|
||||
EDGEPT *FindBestStartPt() const;
|
||||
|
||||
int BBArea() const {
|
||||
return (botright.x - topleft.x) * (topleft.y - botright.y);
|
||||
}
|
||||
|
||||
TPOINT topleft; // Top left of loop.
|
||||
TPOINT botright; // Bottom right of loop.
|
||||
TPOINT start; // Start of loop.
|
||||
bool is_hole; // True if this is a hole/child outline.
|
||||
EDGEPT *loop; // Edgeloop.
|
||||
TESSLINE *next; // Next outline in blob.
|
||||
}; // Outline structure.
|
||||
|
||||
struct TBLOB {
|
||||
TBLOB() : outlines(nullptr) {}
|
||||
TBLOB(const TBLOB &src) : outlines(nullptr) {
|
||||
CopyFrom(src);
|
||||
}
|
||||
~TBLOB() {
|
||||
Clear();
|
||||
}
|
||||
TBLOB &operator=(const TBLOB &src) {
|
||||
CopyFrom(src);
|
||||
return *this;
|
||||
}
|
||||
// Factory to build a TBLOB from a C_BLOB with polygonal approximation along
|
||||
// the way. If allow_detailed_fx is true, the EDGEPTs in the returned TBLOB
|
||||
// contain pointers to the input C_OUTLINEs that enable higher-resolution
|
||||
// feature extraction that does not use the polygonal approximation.
|
||||
static TBLOB *PolygonalCopy(bool allow_detailed_fx, C_BLOB *src);
|
||||
// Factory builds a blob with no outlines, but copies the other member data.
|
||||
static TBLOB *ShallowCopy(const TBLOB &src);
|
||||
// Normalizes the blob for classification only if needed.
|
||||
// (Normally this means a non-zero classify rotation.)
|
||||
// If no Normalization is needed, then nullptr is returned, and the input blob
|
||||
// can be used directly. Otherwise a new TBLOB is returned which must be
|
||||
// deleted after use.
|
||||
TBLOB *ClassifyNormalizeIfNeeded() const;
|
||||
|
||||
// Copies the data and the outlines, but leaves next untouched.
|
||||
void CopyFrom(const TBLOB &src);
|
||||
// Deletes owned data.
|
||||
void Clear();
|
||||
// Sets up the built-in DENORM and normalizes the blob in-place.
|
||||
// For parameters see DENORM::SetupNormalization, plus the inverse flag for
|
||||
// this blob and the Pix for the full image.
|
||||
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor,
|
||||
float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift,
|
||||
float final_yshift, bool inverse, Image pix);
|
||||
// Rotates by the given rotation in place.
|
||||
void Rotate(const FCOORD rotation);
|
||||
// Moves by the given vec in place.
|
||||
void Move(const ICOORD vec);
|
||||
// Scales by the given factor in place.
|
||||
void Scale(float factor);
|
||||
// Recomputes the bounding boxes of the outlines.
|
||||
void ComputeBoundingBoxes();
|
||||
|
||||
// Returns the number of outlines.
|
||||
int NumOutlines() const;
|
||||
|
||||
TBOX bounding_box() const;
|
||||
|
||||
// Returns true if the given line segment crosses any outline of this blob.
|
||||
bool SegmentCrossesOutline(const TPOINT &pt1, const TPOINT &pt2) const {
|
||||
for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
|
||||
if (outline->SegmentCrosses(pt1, pt2)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// Returns true if the point is contained within any of the outline boxes.
|
||||
bool Contains(const TPOINT &pt) const {
|
||||
for (const TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
|
||||
if (outline->Contains(pt)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Finds and deletes any duplicate outlines in this blob, without deleting
|
||||
// their EDGEPTs.
|
||||
void EliminateDuplicateOutlines();
|
||||
|
||||
// Swaps the outlines of *this and next if needed to keep the centers in
|
||||
// increasing x.
|
||||
void CorrectBlobOrder(TBLOB *next);
|
||||
|
||||
const DENORM &denorm() const {
|
||||
return denorm_;
|
||||
}
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color);
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
int BBArea() const {
|
||||
int total_area = 0;
|
||||
for (TESSLINE *outline = outlines; outline != nullptr; outline = outline->next) {
|
||||
total_area += outline->BBArea();
|
||||
}
|
||||
return total_area;
|
||||
}
|
||||
|
||||
// Computes the center of mass and second moments for the old baseline and
|
||||
// 2nd moment normalizations. Returns the outline length.
|
||||
// The input denorm should be the normalizations that have been applied from
|
||||
// the image to the current state of this TBLOB.
|
||||
int ComputeMoments(FCOORD *center, FCOORD *second_moments) const;
|
||||
// Computes the precise bounding box of the coords that are generated by
|
||||
// GetEdgeCoords. This may be different from the bounding box of the polygon.
|
||||
void GetPreciseBoundingBox(TBOX *precise_box) const;
|
||||
// Adds edges to the given vectors.
|
||||
// For all the edge steps in all the outlines, or polygonal approximation
|
||||
// where there are no edge steps, collects the steps into x_coords/y_coords.
|
||||
// x_coords is a collection of the x-coords of vertical edges for each
|
||||
// y-coord starting at box.bottom().
|
||||
// y_coords is a collection of the y-coords of horizontal edges for each
|
||||
// x-coord starting at box.left().
|
||||
// Eg x_coords[0] is a collection of the x-coords of edges at y=bottom.
|
||||
// Eg x_coords[1] is a collection of the x-coords of edges at y=bottom + 1.
|
||||
void GetEdgeCoords(const TBOX &box, std::vector<std::vector<int>> &x_coords,
|
||||
std::vector<std::vector<int>> &y_coords) const;
|
||||
|
||||
TESSLINE *outlines; // List of outlines in blob.
|
||||
|
||||
private: // TODO(rays) Someday the data members will be private too.
|
||||
// For all the edge steps in all the outlines, or polygonal approximation
|
||||
// where there are no edge steps, collects the steps into the bounding_box,
|
||||
// llsq and/or the x_coords/y_coords. Both are used in different kinds of
|
||||
// normalization.
|
||||
// For a description of x_coords, y_coords, see GetEdgeCoords above.
|
||||
void CollectEdges(const TBOX &box, TBOX *bounding_box, LLSQ *llsq,
|
||||
std::vector<std::vector<int>> *x_coords,
|
||||
std::vector<std::vector<int>> *y_coords) const;
|
||||
|
||||
private:
|
||||
// DENORM indicating the transformations that this blob has undergone so far.
|
||||
DENORM denorm_;
|
||||
}; // Blob structure.
|
||||
|
||||
struct TWERD {
|
||||
TWERD() : latin_script(false) {}
|
||||
TWERD(const TWERD &src) {
|
||||
CopyFrom(src);
|
||||
}
|
||||
~TWERD() {
|
||||
Clear();
|
||||
}
|
||||
TWERD &operator=(const TWERD &src) {
|
||||
CopyFrom(src);
|
||||
return *this;
|
||||
}
|
||||
// Factory to build a TWERD from a (C_BLOB) WERD, with polygonal
|
||||
// approximation along the way.
|
||||
static TWERD *PolygonalCopy(bool allow_detailed_fx, WERD *src);
|
||||
// Baseline normalizes the blobs in-place, recording the normalization in the
|
||||
// DENORMs in the blobs.
|
||||
void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height,
|
||||
float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint,
|
||||
const TBOX *norm_box, DENORM *word_denorm);
|
||||
// Copies the data and the blobs, but leaves next untouched.
|
||||
void CopyFrom(const TWERD &src);
|
||||
// Deletes owned data.
|
||||
void Clear();
|
||||
// Recomputes the bounding boxes of the blobs.
|
||||
void ComputeBoundingBoxes();
|
||||
|
||||
// Returns the number of blobs in the word.
|
||||
int NumBlobs() const {
|
||||
return blobs.size();
|
||||
}
|
||||
TBOX bounding_box() const;
|
||||
|
||||
// Merges the blobs from start to end, not including end, and deletes
|
||||
// the blobs between start and end.
|
||||
void MergeBlobs(int start, int end);
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot(ScrollView *window);
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
std::vector<TBLOB *> blobs; // Blobs in word.
|
||||
bool latin_script; // This word is in a latin-based script.
|
||||
};
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
F u n c t i o n s
|
||||
----------------------------------------------------------------------*/
|
||||
// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.
|
||||
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location);
|
||||
|
||||
void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, const TPOINT &location);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
74
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.cpp
vendored
Normal file
74
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.cpp
vendored
Normal file
|
@ -0,0 +1,74 @@
|
|||
/**********************************************************************
|
||||
* File: blread.cpp (Formerly pdread.c)
|
||||
* Description: Friend function of BLOCK to read the uscan pd file.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "blread.h"
|
||||
|
||||
#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
|
||||
#include "scanutils.h" // for tfscanf
|
||||
|
||||
#include <cstdio> // for fclose, fopen, FILE
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
#define UNLV_EXT ".uzn" // unlv zone file
|
||||
|
||||
/**********************************************************************
|
||||
* read_unlv_file
|
||||
*
|
||||
* Read a whole unlv zone file to make a list of blocks.
|
||||
**********************************************************************/
|
||||
|
||||
bool read_unlv_file( // print list of sides
|
||||
std::string &name, // basename of file
|
||||
int32_t xsize, // image size
|
||||
int32_t ysize, // image size
|
||||
BLOCK_LIST *blocks // output list
|
||||
) {
|
||||
FILE *pdfp; // file pointer
|
||||
BLOCK *block; // current block
|
||||
int x; // current top-down coords
|
||||
int y;
|
||||
int width; // of current block
|
||||
int height;
|
||||
BLOCK_IT block_it = blocks; // block iterator
|
||||
|
||||
name += UNLV_EXT; // add extension
|
||||
if ((pdfp = fopen(name.c_str(), "rb")) == nullptr) {
|
||||
return false; // didn't read one
|
||||
} else {
|
||||
while (tfscanf(pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
|
||||
// make rect block
|
||||
block = new BLOCK(name.c_str(), true, 0, 0, static_cast<int16_t>(x),
|
||||
static_cast<int16_t>(ysize - y - height), static_cast<int16_t>(x + width),
|
||||
static_cast<int16_t>(ysize - y));
|
||||
// on end of list
|
||||
block_it.add_to_end(block);
|
||||
}
|
||||
fclose(pdfp);
|
||||
}
|
||||
tprintf("UZN file %s loaded.\n", name.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
void FullPageBlock(int width, int height, BLOCK_LIST *blocks) {
|
||||
BLOCK_IT block_it(blocks);
|
||||
auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
|
||||
block_it.add_to_end(block);
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
40
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.h
vendored
Normal file
40
3rdparty/tesseract_ocr/tesseract/src/ccstruct/blread.h
vendored
Normal file
|
@ -0,0 +1,40 @@
|
|||
/**********************************************************************
|
||||
* File: blread.h (Formerly pdread.h)
|
||||
* Description: Friend function of BLOCK to read the uscan pd file.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef BLREAD_H
|
||||
#define BLREAD_H
|
||||
|
||||
#include <cstdint> // for int32_t
|
||||
#include <string> // for std::string
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOCK_LIST;
|
||||
|
||||
bool read_unlv_file( // print list of sides
|
||||
std::string &name, // basename of file
|
||||
int32_t xsize, // image size
|
||||
int32_t ysize, // image size
|
||||
BLOCK_LIST *blocks // output list
|
||||
);
|
||||
|
||||
void FullPageBlock(int width, int height, BLOCK_LIST *blocks);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
282
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.cpp
vendored
Normal file
282
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.cpp
vendored
Normal file
|
@ -0,0 +1,282 @@
|
|||
/**********************************************************************
|
||||
* File: boxread.cpp
|
||||
* Description: Read data from a box file.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 2007, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "boxread.h"
|
||||
|
||||
#include "errcode.h" // for ERRCODE, TESSEXIT
|
||||
#include "fileerr.h" // for CANTOPENFILE
|
||||
#include "rect.h" // for TBOX
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#include <tesseract/unichar.h> // for UNICHAR
|
||||
#include "helpers.h" // for chomp_string
|
||||
|
||||
#include <climits> // for INT_MAX
|
||||
#include <cstring> // for strchr, strcmp
|
||||
#include <fstream> // for std::ifstream
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <sstream> // for std::stringstream
|
||||
#include <string> // for std::string
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Special char code used to identify multi-blob labels.
|
||||
// The pointer itself is const as well as the characters, so the label code
// cannot be re-pointed at run time.
static const char *const kMultiBlobLabelCode = "WordStr";
|
||||
|
||||
// Returns the box file name corresponding to the given image_filename.
|
||||
// Builds the box file name for image_filename:
//  - a trailing ".bin.png" or ".nrm.png" is removed as a whole;
//  - otherwise the extension (the last '.' and everything after it) is
//    removed, but only when that '.' lies in the final path component, so
//    that "./scan" or "dir.v2/img" keep their directory part intact;
//  - ".box" is then appended.
// Both '/' and '\\' are treated as path separators.
static std::string BoxFileName(const char *image_filename) {
  std::string box_filename = image_filename;
  const size_t length = box_filename.length();
  const std::string tail = (length > 8) ? box_filename.substr(length - 8) : "";
  if (tail == ".bin.png" || tail == ".nrm.png") {
    box_filename.resize(length - 8);
  } else {
    const size_t last_dot = box_filename.find_last_of('.');
    const size_t last_sep = box_filename.find_last_of("/\\");
    // A dot before the last separator belongs to a directory name, not to
    // the file's extension.
    const bool dot_in_basename =
        last_dot != std::string::npos && (last_sep == std::string::npos || last_dot > last_sep);
    if (dot_in_basename) {
      box_filename.resize(last_dot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
|
||||
|
||||
// Open the boxfile based on the given image filename.
|
||||
FILE *OpenBoxFile(const char *fname) {
|
||||
std::string filename = BoxFileName(fname);
|
||||
FILE *box_file = nullptr;
|
||||
if (!(box_file = fopen(filename.c_str(), "rb"))) {
|
||||
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s", filename.c_str());
|
||||
}
|
||||
return box_file;
|
||||
}
|
||||
|
||||
// Reads all boxes from the given filename.
|
||||
// Reads a specific target_page number if >= 0, or all pages otherwise.
|
||||
// Skips blanks if skip_blanks is true.
|
||||
// The UTF-8 label of the box is put in texts, and the full box definition as
|
||||
// a string is put in box_texts, with the corresponding page number in pages.
|
||||
// Each of the output vectors is optional (may be nullptr).
|
||||
// Returns false if no boxes are found.
|
||||
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
|
||||
std::vector<std::string> *texts, std::vector<std::string> *box_texts,
|
||||
std::vector<int> *pages) {
|
||||
std::ifstream input(BoxFileName(filename).c_str(), std::ios::in | std::ios::binary);
|
||||
std::vector<char> box_data(std::istreambuf_iterator<char>(input), {});
|
||||
if (box_data.empty()) {
|
||||
return false;
|
||||
}
|
||||
// Convert the array of bytes to a string, so it can be used by the parser.
|
||||
box_data.push_back('\0');
|
||||
return ReadMemBoxes(target_page, skip_blanks, &box_data[0],
|
||||
/*continue_on_failure*/ true, boxes, texts, box_texts, pages);
|
||||
}
|
||||
|
||||
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
|
||||
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
|
||||
std::vector<TBOX> *boxes, std::vector<std::string> *texts,
|
||||
std::vector<std::string> *box_texts, std::vector<int> *pages) {
|
||||
std::string box_str(box_data);
|
||||
std::vector<std::string> lines = split(box_str, '\n');
|
||||
if (lines.empty()) {
|
||||
return false;
|
||||
}
|
||||
int num_boxes = 0;
|
||||
for (auto &line : lines) {
|
||||
int page = 0;
|
||||
std::string utf8_str;
|
||||
TBOX box;
|
||||
if (!ParseBoxFileStr(line.c_str(), &page, utf8_str, &box)) {
|
||||
if (continue_on_failure) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) {
|
||||
continue;
|
||||
}
|
||||
if (target_page >= 0 && page != target_page) {
|
||||
continue;
|
||||
}
|
||||
if (boxes != nullptr) {
|
||||
boxes->push_back(box);
|
||||
}
|
||||
if (texts != nullptr) {
|
||||
texts->push_back(utf8_str);
|
||||
}
|
||||
if (box_texts != nullptr) {
|
||||
std::string full_text;
|
||||
MakeBoxFileStr(utf8_str.c_str(), box, target_page, full_text);
|
||||
box_texts->push_back(full_text);
|
||||
}
|
||||
if (pages != nullptr) {
|
||||
pages->push_back(page);
|
||||
}
|
||||
++num_boxes;
|
||||
}
|
||||
return num_boxes > 0;
|
||||
}
|
||||
|
||||
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
|
||||
// Box files are used ONLY DURING TRAINING, but by both processes of
|
||||
// creating tr files with tesseract, and unicharset_extractor.
|
||||
// ReadNextBox factors out the code to interpret a line of a box
|
||||
// file so that applybox and unicharset_extractor interpret the same way.
|
||||
// This function returns the next valid box file utf8 string and coords
|
||||
// and returns true, or false on eof (and closes the file).
|
||||
// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
|
||||
// for valid utf-8 and allows space or tab between fields.
|
||||
// utf8_str is set with the unichar string, and bounding box with the box.
|
||||
// If there are page numbers in the file, it reads them all.
|
||||
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) {
|
||||
return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
|
||||
}
|
||||
|
||||
// As ReadNextBox above, but get a specific page number. (0-based)
|
||||
// Use -1 to read any page number. Files without page number all
|
||||
// read as if they are page 0.
|
||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
|
||||
TBOX *bounding_box) {
|
||||
int page = 0;
|
||||
char buff[kBoxReadBufSize]; // boxfile read buffer
|
||||
char *buffptr = buff;
|
||||
|
||||
while (fgets(buff, sizeof(buff) - 1, box_file)) {
|
||||
(*line_number)++;
|
||||
|
||||
buffptr = buff;
|
||||
const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);
|
||||
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
|
||||
buffptr += 3; // Skip unicode file designation.
|
||||
}
|
||||
// Check for blank lines in box file
|
||||
if (*buffptr == '\n' || *buffptr == '\0') {
|
||||
continue;
|
||||
}
|
||||
// Skip blank boxes.
|
||||
if (*buffptr == ' ' || *buffptr == '\t') {
|
||||
continue;
|
||||
}
|
||||
if (*buffptr != '\0') {
|
||||
if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) {
|
||||
tprintf("Box file format error on line %i; ignored\n", *line_number);
|
||||
continue;
|
||||
}
|
||||
if (target_page >= 0 && target_page != page) {
|
||||
continue; // Not on the appropriate page.
|
||||
}
|
||||
return true; // Successfully read a box.
|
||||
}
|
||||
}
|
||||
fclose(box_file);
|
||||
return false; // EOF
|
||||
}
|
||||
|
||||
// Parses the given box file string into a page_number, utf8_str, and
|
||||
// bounding_box. Returns true on a successful parse.
|
||||
// The box file is assumed to contain box definitions, one per line, of the
|
||||
// following format for blob-level boxes:
|
||||
// <UTF8 str> <left> <bottom> <right> <top> <page id>
|
||||
// and for word/line-level boxes:
|
||||
// WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
||||
// See applybox.cpp for more information.
|
||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
|
||||
TBOX *bounding_box) {
|
||||
*bounding_box = TBOX(); // Initialize it to empty.
|
||||
utf8_str = "";
|
||||
char uch[kBoxReadBufSize];
|
||||
const char *buffptr = boxfile_str;
|
||||
// Read the unichar without messing up on Tibetan.
|
||||
// According to issue 253 the utf-8 surrogates 85 and A0 are treated
|
||||
// as whitespace by sscanf, so it is more reliable to just find
|
||||
// ascii space and tab.
|
||||
int uch_len = 0;
|
||||
// Skip unicode file designation, if present.
|
||||
const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr);
|
||||
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
|
||||
buffptr += 3;
|
||||
}
|
||||
// Allow a single blank as the UTF-8 string. Check for empty string and
|
||||
// then blindly eat the first character.
|
||||
if (*buffptr == '\0') {
|
||||
return false;
|
||||
}
|
||||
do {
|
||||
uch[uch_len++] = *buffptr++;
|
||||
} while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
|
||||
uch_len < kBoxReadBufSize - 1);
|
||||
uch[uch_len] = '\0';
|
||||
if (*buffptr != '\0') {
|
||||
++buffptr;
|
||||
}
|
||||
int x_min = INT_MAX;
|
||||
int y_min = INT_MAX;
|
||||
int x_max = INT_MIN;
|
||||
int y_max = INT_MIN;
|
||||
*page_number = 0;
|
||||
std::stringstream stream(buffptr);
|
||||
stream.imbue(std::locale::classic());
|
||||
stream >> x_min;
|
||||
stream >> y_min;
|
||||
stream >> x_max;
|
||||
stream >> y_max;
|
||||
stream >> *page_number;
|
||||
if (x_max < x_min || y_max < y_min) {
|
||||
tprintf("Bad box coordinates in boxfile string! %s\n", ubuf);
|
||||
return false;
|
||||
}
|
||||
// Test for long space-delimited string label.
|
||||
if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != nullptr) {
|
||||
strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
|
||||
uch[kBoxReadBufSize - 1] = '\0'; // Prevent buffer overrun.
|
||||
chomp_string(uch);
|
||||
uch_len = strlen(uch);
|
||||
}
|
||||
// Validate UTF8 by making unichars with it.
|
||||
int used = 0;
|
||||
while (used < uch_len) {
|
||||
tesseract::UNICHAR ch(uch + used, uch_len - used);
|
||||
int new_used = ch.utf8_len();
|
||||
if (new_used == 0) {
|
||||
tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used, uch[used], used + 1);
|
||||
return false;
|
||||
}
|
||||
used += new_used;
|
||||
}
|
||||
utf8_str = uch;
|
||||
if (x_min > x_max) {
|
||||
std::swap(x_min, x_max);
|
||||
}
|
||||
if (y_min > y_max) {
|
||||
std::swap(y_min, y_max);
|
||||
}
|
||||
bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
|
||||
return true; // Successfully read a box.
|
||||
}
|
||||
|
||||
// Creates a box file string from a unichar string, TBOX and page number.
|
||||
// Writes "<unichar_str> <left> <bottom> <right> <top> <page_num>" into
// box_str, replacing its previous contents.
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str) {
  box_str = unichar_str;
  const int fields[] = {box.left(), box.bottom(), box.right(), box.top(), page_num};
  for (const int field : fields) {
    box_str += " " + std::to_string(field);
  }
}
|
||||
|
||||
} // namespace tesseract
|
89
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.h
vendored
Normal file
89
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxread.h
vendored
Normal file
|
@ -0,0 +1,89 @@
|
|||
/**********************************************************************
|
||||
* File: boxread.h
|
||||
* Description: Read data from a box file.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 2007, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_BOXREAD_H_
|
||||
#define TESSERACT_CCUTIL_BOXREAD_H_
|
||||
|
||||
#include <cstdio> // for FILE
|
||||
#include <string> // for std::string
|
||||
#include <vector> // for std::vector
|
||||
|
||||
#include <tesseract/export.h> // for TESS_API
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class TBOX;
|
||||
|
||||
// Size of buffer used to read a line from a box file.
|
||||
const int kBoxReadBufSize = 1024;
|
||||
|
||||
// Open the boxfile based on the given image filename.
|
||||
// If the box file cannot be opened, a CANTOPENFILE error is raised with the
// TESSEXIT action; nullptr is returned only if that error call returns.
|
||||
TESS_API
|
||||
FILE *OpenBoxFile(const char *filename);
|
||||
|
||||
// Reads all boxes from the given filename.
|
||||
// Reads a specific target_page number if >= 0, or all pages otherwise.
|
||||
// Skips blanks if skip_blanks is true.
|
||||
// The UTF-8 label of the box is put in texts, and the full box definition as
|
||||
// a string is put in box_texts, with the corresponding page number in pages.
|
||||
// Each of the output vectors is optional (may be nullptr).
|
||||
// Returns false if no boxes are found.
|
||||
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
|
||||
std::vector<std::string> *texts, std::vector<std::string> *box_texts,
|
||||
std::vector<int> *pages);
|
||||
|
||||
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
|
||||
// continue_on_failure allows reading to continue even if an invalid box is
|
||||
// encountered and will return true if it succeeds in reading some boxes.
|
||||
// It otherwise gives up and returns false on encountering an invalid box.
|
||||
TESS_API
|
||||
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
|
||||
std::vector<TBOX> *boxes, std::vector<std::string> *texts,
|
||||
std::vector<std::string> *box_texts, std::vector<int> *pages);
|
||||
|
||||
// ReadNextBox factors out the code to interpret a line of a box
|
||||
// file so that applybox and unicharset_extractor interpret the same way.
|
||||
// This function returns the next valid box file utf8 string and coords
|
||||
// and returns true, or false on eof (and closes the file).
|
||||
// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
|
||||
// for valid utf-8 and allows space or tab between fields.
|
||||
// utf8_str is set with the unichar string, and bounding box with the box.
|
||||
// If there are page numbers in the file, it reads them all.
|
||||
TESS_API
|
||||
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);
|
||||
// As ReadNextBox above, but get a specific page number. (0-based)
|
||||
// Use -1 to read any page number. Files without page number all
|
||||
// read as if they are page 0.
|
||||
TESS_API
|
||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
|
||||
TBOX *bounding_box);
|
||||
|
||||
// Parses the given box file string into a page_number, utf8_str, and
|
||||
// bounding_box. Returns true on a successful parse.
|
||||
TESS_API
|
||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
|
||||
TBOX *bounding_box);
|
||||
|
||||
// Creates a box file string from a unichar string, TBOX and page number.
|
||||
TESS_API
|
||||
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_BOXREAD_H_
|
205
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.cpp
vendored
Normal file
205
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.cpp
vendored
Normal file
|
@ -0,0 +1,205 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: boxword.cpp
|
||||
// Description: Class to represent the bounding boxes of the output.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "boxword.h"
|
||||
#include "blobs.h"
|
||||
#include "host.h" // for NearlyEqual
|
||||
#include "normalis.h"
|
||||
#include "ocrblock.h"
|
||||
#include "pageres.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Clip output boxes to input blob boxes for bounds that are within this
|
||||
// tolerance. Otherwise, the blob may be chopped and we have to just use
|
||||
// the word bounding box.
|
||||
const int kBoxClipTolerance = 2;
|
||||
|
||||
// Constructs a BoxWord with length_ set to 0; the remaining members are
// default-initialized.
BoxWord::BoxWord() : length_(0) {}
|
||||
|
||||
// Copy constructor: all member copying is done by CopyFrom.
BoxWord::BoxWord(const BoxWord &src) { CopyFrom(src); }
|
||||
|
||||
// Copy assignment, delegated to CopyFrom.
// Self-assignment is skipped: CopyFrom empties boxes_ before reading the
// source's boxes, so copying an object onto itself would index a vector
// that has just been cleared.
BoxWord &BoxWord::operator=(const BoxWord &src) {
  if (this != &src) {
    CopyFrom(src);
  }
  return *this;
}
|
||||
|
||||
void BoxWord::CopyFrom(const BoxWord &src) {
|
||||
bbox_ = src.bbox_;
|
||||
length_ = src.length_;
|
||||
boxes_.clear();
|
||||
boxes_.reserve(length_);
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
boxes_.push_back(src.boxes_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
|
||||
// switch back to original image coordinates.
|
||||
// Builds a new heap-allocated BoxWord with one box per blob of tessword.
// Each blob's box is the union of its edge points after they have been
// passed through the blob's DenormTransform; an edge point is skipped only
// when both it and its predecessor are hidden.
// The caller receives the raw pointer returned by new.
BoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) {
  auto *result = new BoxWord();
  // Count the blobs and allocate memory for their boxes.
  result->length_ = tessword->NumBlobs();
  result->boxes_.reserve(result->length_);

  for (int blob_index = 0; blob_index < result->length_; ++blob_index) {
    TBLOB *tblob = tessword->blobs[blob_index];
    TBOX blob_box;
    for (TESSLINE *outline = tblob->outlines; outline != nullptr; outline = outline->next) {
      // Walk once around the closed loop of edge points.
      EDGEPT *edgept = outline->loop;
      do {
        if (!(edgept->IsHidden() && edgept->prev->IsHidden())) {
          TPOINT denormed;
          tblob->denorm().DenormTransform(nullptr, edgept->pos, &denormed);
          const ICOORD pos(denormed.x, denormed.y);
          blob_box += TBOX(pos, pos);
        }
        edgept = edgept->next;
      } while (edgept != outline->loop);
    }
    result->boxes_.push_back(blob_box);
  }
  result->ComputeBoundingBox();
  return result;
}
|
||||
|
||||
// Clean up the bounding boxes from the polygonal approximation by
|
||||
// expanding slightly, then clipping to the blobs from the original_word
|
||||
// that overlap. If not null, the block provides the inverse rotation.
|
||||
void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
TBOX box = boxes_[i];
|
||||
// Expand by a single pixel, as the poly approximation error is 1 pixel.
|
||||
box = TBOX(box.left() - 1, box.bottom() - 1, box.right() + 1, box.top() + 1);
|
||||
// Now find the original box that matches.
|
||||
TBOX original_box;
|
||||
C_BLOB_IT b_it(original_word->cblob_list());
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
TBOX blob_box = b_it.data()->bounding_box();
|
||||
if (block != nullptr) {
|
||||
blob_box.rotate(block->re_rotation());
|
||||
}
|
||||
if (blob_box.major_overlap(box)) {
|
||||
original_box += blob_box;
|
||||
}
|
||||
}
|
||||
if (!original_box.null_box()) {
|
||||
if (NearlyEqual<int>(original_box.left(), box.left(), kBoxClipTolerance)) {
|
||||
box.set_left(original_box.left());
|
||||
}
|
||||
if (NearlyEqual<int>(original_box.right(), box.right(), kBoxClipTolerance)) {
|
||||
box.set_right(original_box.right());
|
||||
}
|
||||
if (NearlyEqual<int>(original_box.top(), box.top(), kBoxClipTolerance)) {
|
||||
box.set_top(original_box.top());
|
||||
}
|
||||
if (NearlyEqual<int>(original_box.bottom(), box.bottom(), kBoxClipTolerance)) {
|
||||
box.set_bottom(original_box.bottom());
|
||||
}
|
||||
}
|
||||
original_box = original_word->bounding_box();
|
||||
if (block != nullptr) {
|
||||
original_box.rotate(block->re_rotation());
|
||||
}
|
||||
boxes_[i] = box.intersection(original_box);
|
||||
}
|
||||
ComputeBoundingBox();
|
||||
}
|
||||
|
||||
// Merges the boxes from start to end, not including end, and deletes
|
||||
// the boxes between start and end.
|
||||
void BoxWord::MergeBoxes(int start, int end) {
|
||||
start = ClipToRange(start, 0, length_);
|
||||
end = ClipToRange(end, 0, length_);
|
||||
if (end <= start + 1) {
|
||||
return;
|
||||
}
|
||||
for (int i = start + 1; i < end; ++i) {
|
||||
boxes_[start] += boxes_[i];
|
||||
}
|
||||
int shrinkage = end - 1 - start;
|
||||
length_ -= shrinkage;
|
||||
for (int i = start + 1; i < length_; ++i) {
|
||||
boxes_[i] = boxes_[i + shrinkage];
|
||||
}
|
||||
boxes_.resize(length_);
|
||||
}
|
||||
|
||||
// Inserts a new box before the given index.
|
||||
// Recomputes the bounding box.
|
||||
// Inserts box in front of position index, or at the end when index is not
// below the current length, then refreshes length_ and the word bounding box.
void BoxWord::InsertBox(int index, const TBOX &box) {
  const auto position = (index < length_) ? boxes_.begin() + index : boxes_.end();
  boxes_.insert(position, box);
  length_ = boxes_.size();
  ComputeBoundingBox();
}
|
||||
|
||||
// Changes the box at the given index to the new box.
|
||||
// Recomputes the bounding box.
|
||||
void BoxWord::ChangeBox(int index, const TBOX &box) {
|
||||
boxes_[index] = box;
|
||||
ComputeBoundingBox();
|
||||
}
|
||||
|
||||
// Deletes the box with the given index, and shuffles up the rest.
|
||||
// Recomputes the bounding box.
|
||||
// Removes the box at index (asserted to be within [0, length_)), moving the
// following boxes up by one, and recomputes the word bounding box.
void BoxWord::DeleteBox(int index) {
  ASSERT_HOST(0 <= index && index < length_);
  const auto victim = boxes_.begin() + index;
  boxes_.erase(victim);
  length_ -= 1;
  ComputeBoundingBox();
}
|
||||
|
||||
// Deletes all the boxes stored in BoxWord.
|
||||
// Resets the BoxWord to empty: no boxes, zero length and a
// default-constructed word bounding box.
void BoxWord::DeleteAllBoxes() {
  bbox_ = TBOX();
  boxes_.clear();
  length_ = 0;
}
|
||||
|
||||
// Computes the bounding box of the word.
|
||||
void BoxWord::ComputeBoundingBox() {
|
||||
bbox_ = TBOX();
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
bbox_ += boxes_[i];
|
||||
}
|
||||
}
|
||||
|
||||
// This and other putatively are the same, so call the (permanent) callback
|
||||
// for each blob index where the bounding boxes match.
|
||||
// The callback is a std::function passed by value; it is not deleted here.
|
||||
void BoxWord::ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const {
|
||||
for (int i = 0; i < length_ && i < other.NumBlobs(); ++i) {
|
||||
TBOX blob_box = other.blobs[i]->bounding_box();
|
||||
if (blob_box == boxes_[i]) {
|
||||
cb(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
97
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.h
vendored
Normal file
97
3rdparty/tesseract_ocr/tesseract/src/ccstruct/boxword.h
vendored
Normal file
|
@ -0,0 +1,97 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: boxword.h
|
||||
// Description: Class to represent the bounding boxes of the output.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CSTRUCT_BOXWORD_H_
|
||||
#define TESSERACT_CSTRUCT_BOXWORD_H_
|
||||
|
||||
#include "rect.h" // for TBOX
|
||||
|
||||
#include <functional> // for std::function
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BLOCK;
|
||||
class WERD;
|
||||
struct TWERD;
|
||||
|
||||
// Class to hold an array of bounding boxes for an output word and
|
||||
// the bounding box of the whole word.
|
||||
class BoxWord {
|
||||
public:
|
||||
BoxWord();
|
||||
explicit BoxWord(const BoxWord &src);
|
||||
~BoxWord() = default;
|
||||
|
||||
BoxWord &operator=(const BoxWord &src);
|
||||
|
||||
void CopyFrom(const BoxWord &src);
|
||||
|
||||
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
|
||||
// switch back to original image coordinates.
|
||||
static BoxWord *CopyFromNormalized(TWERD *tessword);
|
||||
|
||||
// Clean up the bounding boxes from the polygonal approximation by
|
||||
// expanding slightly, then clipping to the blobs from the original_word
|
||||
// that overlap. If not null, the block provides the inverse rotation.
|
||||
void ClipToOriginalWord(const BLOCK *block, WERD *original_word);
|
||||
|
||||
// Merges the boxes from start to end, not including end, and deletes
|
||||
// the boxes between start and end.
|
||||
void MergeBoxes(int start, int end);
|
||||
|
||||
// Inserts a new box before the given index.
|
||||
// Recomputes the bounding box.
|
||||
void InsertBox(int index, const TBOX &box);
|
||||
|
||||
// Changes the box at the given index to the new box.
|
||||
// Recomputes the bounding box.
|
||||
void ChangeBox(int index, const TBOX &box);
|
||||
|
||||
// Deletes the box with the given index, and shuffles up the rest.
|
||||
// Recomputes the bounding box.
|
||||
void DeleteBox(int index);
|
||||
|
||||
// Deletes all the boxes stored in BoxWord.
|
||||
void DeleteAllBoxes();
|
||||
|
||||
// This and other putatively are the same, so call the (permanent) callback
|
||||
// for each blob index where the bounding boxes match.
|
||||
// The callback is deleted on completion.
|
||||
void ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const;
|
||||
|
||||
const TBOX &bounding_box() const {
|
||||
return bbox_;
|
||||
}
|
||||
int length() const {
|
||||
return length_;
|
||||
}
|
||||
const TBOX &BlobBox(int index) const {
|
||||
return boxes_[index];
|
||||
}
|
||||
|
||||
private:
|
||||
void ComputeBoundingBox();
|
||||
|
||||
TBOX bbox_;
|
||||
int length_;
|
||||
std::vector<TBOX> boxes_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CSTRUCT_BOXWORD_H_
|
36
3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.cpp
vendored
Normal file
36
3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.cpp
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ccstruct.cpp
|
||||
// Description: ccstruct class.
|
||||
// Author: Samuel Charron
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "ccstruct.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// APPROXIMATIONS of the fractions of the character cell taken by
|
||||
// the descenders, ascenders, and x-height.
|
||||
const double CCStruct::kDescenderFraction = 0.25;
|
||||
const double CCStruct::kXHeightFraction = 0.5;
|
||||
const double CCStruct::kAscenderFraction = 0.25;
|
||||
const double CCStruct::kXHeightCapRatio =
|
||||
CCStruct::kXHeightFraction / (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction);
|
||||
|
||||
// Destructor.
|
||||
// It is defined here, so the compiler can create a single vtable
|
||||
// instead of weak vtables in every compilation unit.
|
||||
CCStruct::~CCStruct() = default;
|
||||
|
||||
} // namespace tesseract
|
41
3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.h
vendored
Normal file
41
3rdparty/tesseract_ocr/tesseract/src/ccstruct/ccstruct.h
vendored
Normal file
|
@ -0,0 +1,41 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ccstruct.h
|
||||
// Description: ccstruct class.
|
||||
// Author: Samuel Charron
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H_
|
||||
#define TESSERACT_CCSTRUCT_CCSTRUCT_H_
|
||||
|
||||
#include "ccutil.h" // for CCUtil
|
||||
|
||||
namespace tesseract {
|
||||
// Extends CCUtil with constants describing the approximate vertical layout
// of a character cell.
class TESS_API CCStruct : public CCUtil {
public:
  CCStruct() = default;
  // Only declared here; the definition is out of line.
  ~CCStruct() override;

  // Globally accessible constants.
  // APPROXIMATE fractions of the character cell taken by the descenders,
  // the ascenders and the x-height.
  static const double kDescenderFraction; // = 0.25;
  static const double kXHeightFraction;   // = 0.5;
  static const double kAscenderFraction;  // = 0.25;
  // Derived value giving the x-height as a fraction of cap-height.
  static const double kXHeightCapRatio; // = XHeight/(XHeight + Ascender).
};
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H_
|
1062
3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.cpp
vendored
Normal file
1062
3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
297
3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.h
vendored
Normal file
297
3rdparty/tesseract_ocr/tesseract/src/ccstruct/coutln.h
vendored
Normal file
|
@ -0,0 +1,297 @@
|
|||
/**********************************************************************
|
||||
* File: coutln.h
|
||||
* Description: Code for the C_OUTLINE class.
|
||||
* Author: Ray Smith
|
||||
*
|
||||
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef COUTLN_H
|
||||
#define COUTLN_H
|
||||
|
||||
#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
|
||||
#include "mod128.h" // for DIR128, DIRBITS
|
||||
#include "points.h" // for ICOORD, FCOORD
|
||||
#include "rect.h" // for TBOX
|
||||
#include "scrollview.h" // for ScrollView, ScrollView::Color
|
||||
|
||||
#include <tesseract/export.h> // for DLLSYM
|
||||
|
||||
#include <cstdint> // for int16_t, int32_t
|
||||
#include <bitset> // for std::bitset<16>
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class CRACKEDGE;
|
||||
class DENORM;
|
||||
|
||||
#define INTERSECTING INT16_MAX // no winding number
|
||||
|
||||
// mask to get step
|
||||
#define STEP_MASK 3
|
||||
|
||||
// Flag indices for an outline; C_OUTLINE uses them to index its flag bitset.
enum C_OUTLINE_FLAGS {
  COUT_INVERSE = 0 // White on black blob
};
|
||||
|
||||
// Simple struct to hold the 3 values needed to compute a more precise edge
|
||||
// position and direction. The offset_numerator is the difference between the
|
||||
// grey threshold and the mean pixel value. pixel_diff is the difference between
|
||||
// the pixels in the edge. Consider the following row of pixels: p1 p2 p3 p4 p5
|
||||
// Say the image was thresholded at threshold t, making p1, p2, p3 black
|
||||
// and p4, p5 white (p1, p2, p3 < t, and p4, p5 >= t), but suppose that
|
||||
// max(p[i+1] - p[i]) is p3 - p2. Then the extrapolated position of the edge,
|
||||
// based on the maximum gradient, is at the crack between p2 and p3 plus the
|
||||
// offset (t - (p2+p3)/2)/(p3 - p2). We store the pixel difference p3-p2
|
||||
// denominator in pixel_diff and the offset numerator, relative to the original
|
||||
// binary edge (t - (p2+p3)/2) - (p3 -p2) in offset_numerator.
|
||||
// The sign of offset_numerator and pixel_diff are manipulated to ensure
|
||||
// that the pixel_diff, which will be used as a weight, is always positive.
|
||||
// The direction stores the quantized feature direction for the given step
|
||||
// computed from the edge gradient. (Using binary_angle_plus_pi.)
|
||||
// If the pixel_diff is zero, it means that the direction of the gradient
|
||||
// is in conflict with the step direction, so this step is to be ignored.
|
||||
struct EdgeOffset {
  // Signed numerator of the sub-pixel offset; divided by pixel_diff on use.
  int8_t offset_numerator;
  // Denominator of the offset, also used as a weight. Zero marks a step
  // that is to be ignored.
  uint8_t pixel_diff;
  // Quantized feature direction for the step.
  uint8_t direction;
};
|
||||
|
||||
class C_OUTLINE; // forward declaration
|
||||
|
||||
ELISTIZEH(C_OUTLINE)
|
||||
class C_OUTLINE : public ELIST_LINK {
|
||||
public:
|
||||
C_OUTLINE() {
|
||||
stepcount = 0;
|
||||
offsets = nullptr;
|
||||
}
|
||||
C_OUTLINE( // constructor
|
||||
CRACKEDGE *startpt, // from edge detector
|
||||
ICOORD bot_left, // bounding box //length of loop
|
||||
ICOORD top_right, int16_t length);
|
||||
C_OUTLINE(ICOORD startpt, // start of loop
|
||||
DIR128 *new_steps, // steps in loop
|
||||
int16_t length); // length of loop
|
||||
// outline to copy
|
||||
C_OUTLINE(C_OUTLINE *srcline, FCOORD rotation); // and rotate
|
||||
|
||||
// Build a fake outline, given just a bounding box and append to the list.
|
||||
static void FakeOutline(const TBOX &box, C_OUTLINE_LIST *outlines);
|
||||
|
||||
~C_OUTLINE() { // destructor
|
||||
delete[] offsets;
|
||||
}
|
||||
|
||||
bool flag( // test flag
|
||||
C_OUTLINE_FLAGS mask) const { // flag to test
|
||||
return flags[mask];
|
||||
}
|
||||
void set_flag( // set flag value
|
||||
C_OUTLINE_FLAGS mask, // flag to test
|
||||
bool value) { // value to set
|
||||
flags.set(mask, value);
|
||||
}
|
||||
|
||||
C_OUTLINE_LIST *child() { // get child list
|
||||
return &children;
|
||||
}
|
||||
|
||||
// access function
|
||||
const TBOX &bounding_box() const {
|
||||
return box;
|
||||
}
|
||||
void set_step( // set a step
|
||||
int16_t stepindex, // index of step
|
||||
int8_t stepdir) { // chain code
|
||||
int shift = stepindex % 4 * 2;
|
||||
uint8_t mask = 3 << shift;
|
||||
steps[stepindex / 4] = ((stepdir << shift) & mask) | (steps[stepindex / 4] & ~mask);
|
||||
// squeeze 4 into byte
|
||||
}
|
||||
void set_step( // set a step
|
||||
int16_t stepindex, // index of step
|
||||
DIR128 stepdir) { // direction
|
||||
// clean it
|
||||
int8_t chaindir = stepdir.get_dir() >> (DIRBITS - 2);
|
||||
// difference
|
||||
set_step(stepindex, chaindir);
|
||||
// squeeze 4 into byte
|
||||
}
|
||||
|
||||
int32_t pathlength() const { // get path length
|
||||
return stepcount;
|
||||
}
|
||||
// Return step at a given index as a DIR128.
|
||||
DIR128 step_dir(int index) const {
|
||||
return DIR128(
|
||||
static_cast<int16_t>(((steps[index / 4] >> (index % 4 * 2)) & STEP_MASK) << (DIRBITS - 2)));
|
||||
}
|
||||
// Return the step vector for the given outline position.
|
||||
ICOORD step(int index) const { // index of step
|
||||
return step_coords[chain_code(index)];
|
||||
}
|
||||
// get start position
|
||||
const ICOORD &start_pos() const {
|
||||
return start;
|
||||
}
|
||||
// Returns the position at the given index on the outline.
|
||||
// NOT to be used lightly, as it has to iterate the outline to find out.
|
||||
ICOORD position_at_index(int index) const {
|
||||
ICOORD pos = start;
|
||||
for (int i = 0; i < index; ++i) {
|
||||
pos += step(i);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
// Returns the sub-pixel accurate position given the integer position pos
|
||||
// at the given index on the outline. pos may be a return value of
|
||||
// position_at_index, or computed by repeatedly adding step to the
|
||||
// start_pos() in the usual way.
|
||||
FCOORD sub_pixel_pos_at_index(const ICOORD &pos, int index) const {
|
||||
const ICOORD &step_to_next(step(index));
|
||||
FCOORD f_pos(pos.x() + step_to_next.x() / 2.0f, pos.y() + step_to_next.y() / 2.0f);
|
||||
if (offsets != nullptr && offsets[index].pixel_diff > 0) {
|
||||
float offset = offsets[index].offset_numerator;
|
||||
offset /= offsets[index].pixel_diff;
|
||||
if (step_to_next.x() != 0) {
|
||||
f_pos.set_y(f_pos.y() + offset);
|
||||
} else {
|
||||
f_pos.set_x(f_pos.x() + offset);
|
||||
}
|
||||
}
|
||||
return f_pos;
|
||||
}
|
||||
// Returns the step direction for the given index or -1 if there is none.
|
||||
int direction_at_index(int index) const {
|
||||
if (offsets != nullptr && offsets[index].pixel_diff > 0) {
|
||||
return offsets[index].direction;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
// Returns the edge strength for the given index.
|
||||
// If there are no recorded edge strengths, returns 1 (assuming the image
|
||||
// is binary). Returns 0 if the gradient direction conflicts with the
|
||||
// step direction, indicating that this position could be skipped.
|
||||
int edge_strength_at_index(int index) const {
|
||||
if (offsets != nullptr) {
|
||||
return offsets[index].pixel_diff;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
// Return the step as a chain code (0-3) related to the standard feature
|
||||
// direction of binary_angle_plus_pi by:
|
||||
// chain_code * 64 = feature direction.
|
||||
int chain_code(int index) const { // index of step
|
||||
return (steps[index / 4] >> (index % 4 * 2)) & STEP_MASK;
|
||||
}
|
||||
|
||||
int32_t area() const; // Returns area of self and 1st level children.
|
||||
int32_t perimeter() const; // Total perimeter of self and 1st level children.
|
||||
int32_t outer_area() const; // Returns area of self only.
|
||||
int32_t count_transitions( // count maxima
|
||||
int32_t threshold); // size threshold
|
||||
|
||||
bool operator<( // containment test
|
||||
const C_OUTLINE &other) const;
|
||||
bool operator>( // containment test
|
||||
C_OUTLINE &other) const {
|
||||
return other < *this; // use the < to do it
|
||||
}
|
||||
int16_t winding_number( // get winding number
|
||||
ICOORD testpt) const; // around this point
|
||||
// get direction
|
||||
int16_t turn_direction() const;
|
||||
void reverse(); // reverse direction
|
||||
|
||||
void move( // reposition outline
|
||||
const ICOORD vec); // by vector
|
||||
|
||||
// Returns true if *this and its children are legally nested.
|
||||
// The outer area of a child should have the opposite sign to the
|
||||
// parent. If not, it means we have discarded an outline in between
|
||||
// (probably due to excessive length).
|
||||
bool IsLegallyNested() const;
|
||||
|
||||
// If this outline is smaller than the given min_size, delete this and
|
||||
// remove from its list, via *it, after checking that *it points to this.
|
||||
// Otherwise, if any children of this are too small, delete them.
|
||||
// On entry, *it must be an iterator pointing to this. If this gets deleted
|
||||
// then this is extracted from *it, so an iteration can continue.
|
||||
void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it);
|
||||
|
||||
// Adds sub-pixel resolution EdgeOffsets for the outline if the supplied
|
||||
// pix is 8-bit. Does nothing otherwise.
|
||||
void ComputeEdgeOffsets(int threshold, Image pix);
|
||||
// Adds sub-pixel resolution EdgeOffsets for the outline using only
|
||||
// a binary image source.
|
||||
void ComputeBinaryOffsets();
|
||||
|
||||
// Renders the outline to the given pix, with left and top being
|
||||
// the coords of the upper-left corner of the pix.
|
||||
void render(int left, int top, Image pix) const;
|
||||
|
||||
// Renders just the outline to the given pix (no fill), with left and top
|
||||
// being the coords of the upper-left corner of the pix.
|
||||
void render_outline(int left, int top, Image pix) const;
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot( // draw one
|
||||
ScrollView *window, // window to draw in
|
||||
ScrollView::Color colour) const; // colour to draw it
|
||||
// Draws the outline in the given colour, normalized using the given denorm,
|
||||
// making use of sub-pixel accurate information if available.
|
||||
void plot_normed(const DENORM &denorm, ScrollView::Color colour, ScrollView *window) const;
|
||||
#endif // !GRAPHICS_DISABLED
|
||||
|
||||
C_OUTLINE &operator=(const C_OUTLINE &source);
|
||||
|
||||
static C_OUTLINE *deep_copy(const C_OUTLINE *src) {
|
||||
auto *outline = new C_OUTLINE;
|
||||
*outline = *src;
|
||||
return outline;
|
||||
}
|
||||
|
||||
static ICOORD chain_step(int chaindir);
|
||||
|
||||
// The maximum length of any outline. The stepcount is stored as 16 bits,
|
||||
// but it is probably not a good idea to increase this constant by much
|
||||
// and switch to 32 bits, as it plays an important role in keeping huge
|
||||
// outlines invisible, which prevents bad speed behavior.
|
||||
static const int kMaxOutlineLength = 16000;
|
||||
|
||||
private:
|
||||
// Helper for ComputeBinaryOffsets. Increments pos, dir_counts, pos_totals
|
||||
// by the step, increment, and vertical step ? x : y position * increment
|
||||
// at step s Mod stepcount respectively. Used to add or subtract the
|
||||
// direction and position to/from accumulators of a small neighbourhood.
|
||||
void increment_step(int s, int increment, ICOORD *pos, int *dir_counts, int *pos_totals) const;
|
||||
int step_mem() const {
|
||||
return (stepcount + 3) / 4;
|
||||
}
|
||||
|
||||
TBOX box; // bounding box
|
||||
ICOORD start; // start coord
|
||||
int16_t stepcount; // no of steps
|
||||
std::bitset<16> flags; // flags about outline
|
||||
std::vector<uint8_t> steps; // step array
|
||||
EdgeOffset *offsets; // Higher precision edge.
|
||||
C_OUTLINE_LIST children; // child elements
|
||||
static ICOORD step_coords[4];
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
42
3rdparty/tesseract_ocr/tesseract/src/ccstruct/crakedge.h
vendored
Normal file
42
3rdparty/tesseract_ocr/tesseract/src/ccstruct/crakedge.h
vendored
Normal file
|
@ -0,0 +1,42 @@
|
|||
/**********************************************************************
|
||||
* File: crakedge.h (Formerly: crkedge.h)
|
||||
* Description: Structures for the Crack following edge detector.
|
||||
* Author: Ray Smith
|
||||
* Created: Fri Mar 22 16:06:38 GMT 1991
|
||||
*
|
||||
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef CRAKEDGE_H
|
||||
#define CRAKEDGE_H
|
||||
|
||||
#include "mod128.h"
|
||||
#include "points.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class CRACKEDGE {
|
||||
public:
|
||||
CRACKEDGE() = default;
|
||||
|
||||
ICOORD pos; /*position of crack */
|
||||
int8_t stepx; // edge step
|
||||
int8_t stepy;
|
||||
int8_t stepdir; // chaincode
|
||||
CRACKEDGE *prev; /*previous point */
|
||||
CRACKEDGE *next; /*next point */
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif
|
58
3rdparty/tesseract_ocr/tesseract/src/ccstruct/debugpixa.h
vendored
Normal file
58
3rdparty/tesseract_ocr/tesseract/src/ccstruct/debugpixa.h
vendored
Normal file
|
@ -0,0 +1,58 @@
|
|||
#ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_
|
||||
#define TESSERACT_CCSTRUCT_DEBUGPIXA_H_
|
||||
|
||||
#include "image.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Class to hold a Pixa collection of debug images with captions and save them
|
||||
// to a PDF file.
|
||||
class DebugPixa {
|
||||
public:
|
||||
// TODO(rays) add another constructor with size control.
|
||||
DebugPixa() {
|
||||
pixa_ = pixaCreate(0);
|
||||
#ifdef TESSERACT_DISABLE_DEBUG_FONTS
|
||||
fonts_ = NULL;
|
||||
#else
|
||||
fonts_ = bmfCreate(nullptr, 14);
|
||||
#endif
|
||||
}
|
||||
// If the filename_ has been set and there are any debug images, they are
|
||||
// written to the set filename_.
|
||||
~DebugPixa() {
|
||||
pixaDestroy(&pixa_);
|
||||
bmfDestroy(&fonts_);
|
||||
}
|
||||
|
||||
// Adds the given pix to the set of pages in the PDF file, with the given
|
||||
// caption added to the top.
|
||||
void AddPix(const Image pix, const char *caption) {
|
||||
int depth = pixGetDepth(pix);
|
||||
int color = depth < 8 ? 1 : (depth > 8 ? 0x00ff0000 : 0x80);
|
||||
Image pix_debug =
|
||||
pixAddSingleTextblock(pix, fonts_, caption, color, L_ADD_BELOW, nullptr);
|
||||
pixaAddPix(pixa_, pix_debug, L_INSERT);
|
||||
}
|
||||
|
||||
// Sets the destination filename and enables images to be written to a PDF
|
||||
// on destruction.
|
||||
void WritePDF(const char *filename) {
|
||||
if (pixaGetCount(pixa_) > 0) {
|
||||
pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, "AllDebugImages", filename);
|
||||
pixaClear(pixa_);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// The collection of images to put in the PDF.
|
||||
Pixa *pixa_;
|
||||
// The fonts used to draw text captions.
|
||||
L_Bmf *fonts_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_
|
302
3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.cpp
vendored
Normal file
302
3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.cpp
vendored
Normal file
|
@ -0,0 +1,302 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: detlinefit.cpp
|
||||
// Description: Deterministic least median squares line fitting.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "detlinefit.h"
|
||||
#include "helpers.h" // for IntCastRounded
|
||||
#include "statistc.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cfloat> // for FLT_MAX
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// The number of points to consider at each end.
constexpr int kNumEndPoints = 3;
// The minimum number of points at which to switch to number of points
// for badly fitted lines.
// To ensure a sensible error metric, kMinPointsForErrorCount should be at
// least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
// ComputeUpperQuartileError.
constexpr int kMinPointsForErrorCount = 16;
// The maximum real distance to use before switching to number of
// mis-fitted points, which will get square-rooted for true distance.
// Kept as an integer constant; the value is exactly 2.
constexpr int kMaxRealDistance = 2;
|
||||
|
||||
// Constructs a fitter with square_length_ initialized to zero.
DetLineFit::DetLineFit() : square_length_(0.0) {}
|
||||
|
||||
// Delete all Added points.
|
||||
void DetLineFit::Clear() {
|
||||
pts_.clear();
|
||||
distances_.clear();
|
||||
}
|
||||
|
||||
// Add a new point. Takes a copy - the pt doesn't need to stay in scope.
|
||||
void DetLineFit::Add(const ICOORD &pt) {
|
||||
pts_.emplace_back(pt, 0);
|
||||
}
|
||||
// Associates a half-width with the given point if a point overlaps the
|
||||
// previous point by more than half the width, and its distance is further
|
||||
// than the previous point, then the more distant point is ignored in the
|
||||
// distance calculation. Useful for ignoring i dots and other diacritics.
|
||||
void DetLineFit::Add(const ICOORD &pt, int halfwidth) {
|
||||
pts_.emplace_back(pt, halfwidth);
|
||||
}
|
||||
|
||||
// Fits a line to the points, ignoring the skip_first initial points and the
|
||||
// skip_last final points, returning the fitted line as a pair of points,
|
||||
// and the upper quartile error.
|
||||
double DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) {
|
||||
// Do something sensible with no points.
|
||||
if (pts_.empty()) {
|
||||
pt1->set_x(0);
|
||||
pt1->set_y(0);
|
||||
*pt2 = *pt1;
|
||||
return 0.0;
|
||||
}
|
||||
// Count the points and find the first and last kNumEndPoints.
|
||||
int pt_count = pts_.size();
|
||||
ICOORD *starts[kNumEndPoints];
|
||||
if (skip_first >= pt_count) {
|
||||
skip_first = pt_count - 1;
|
||||
}
|
||||
int start_count = 0;
|
||||
int end_i = std::min(skip_first + kNumEndPoints, pt_count);
|
||||
for (int i = skip_first; i < end_i; ++i) {
|
||||
starts[start_count++] = &pts_[i].pt;
|
||||
}
|
||||
ICOORD *ends[kNumEndPoints];
|
||||
if (skip_last >= pt_count) {
|
||||
skip_last = pt_count - 1;
|
||||
}
|
||||
int end_count = 0;
|
||||
end_i = std::max(0, pt_count - kNumEndPoints - skip_last);
|
||||
for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
|
||||
ends[end_count++] = &pts_[i].pt;
|
||||
}
|
||||
// 1 or 2 points need special treatment.
|
||||
if (pt_count <= 2) {
|
||||
*pt1 = *starts[0];
|
||||
if (pt_count > 1) {
|
||||
*pt2 = *ends[0];
|
||||
} else {
|
||||
*pt2 = *pt1;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
// Although with between 2 and 2*kNumEndPoints-1 points, there will be
|
||||
// overlap in the starts, ends sets, this is OK and taken care of by the
|
||||
// if (*start != *end) test below, which also tests for equal input points.
|
||||
double best_uq = -1.0;
|
||||
// Iterate each pair of points and find the best fitting line.
|
||||
for (int i = 0; i < start_count; ++i) {
|
||||
ICOORD *start = starts[i];
|
||||
for (int j = 0; j < end_count; ++j) {
|
||||
ICOORD *end = ends[j];
|
||||
if (*start != *end) {
|
||||
ComputeDistances(*start, *end);
|
||||
// Compute the upper quartile error from the line.
|
||||
double dist = EvaluateLineFit();
|
||||
if (dist < best_uq || best_uq < 0.0) {
|
||||
best_uq = dist;
|
||||
*pt1 = *start;
|
||||
*pt2 = *end;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Finally compute the square root to return the true distance.
|
||||
return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
|
||||
}
|
||||
|
||||
// Constrained fit with a supplied direction vector. Finds the best line_pt,
|
||||
// that is one of the supplied points having the median cross product with
|
||||
// direction, ignoring points that have a cross product outside of the range
|
||||
// [min_dist, max_dist]. Returns the resulting error metric using the same
|
||||
// reduced set of points.
|
||||
// *Makes use of floating point arithmetic*
|
||||
double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist,
|
||||
bool debug, ICOORD *line_pt) {
|
||||
ComputeConstrainedDistances(direction, min_dist, max_dist);
|
||||
// Do something sensible with no points or computed distances.
|
||||
if (pts_.empty() || distances_.empty()) {
|
||||
line_pt->set_x(0);
|
||||
line_pt->set_y(0);
|
||||
return 0.0;
|
||||
}
|
||||
auto median_index = distances_.size() / 2;
|
||||
std::nth_element(distances_.begin(), distances_.begin() + median_index, distances_.end());
|
||||
*line_pt = distances_[median_index].data();
|
||||
if (debug) {
|
||||
tprintf("Constrained fit to dir %g, %g = %d, %d :%zu distances:\n", direction.x(), direction.y(),
|
||||
line_pt->x(), line_pt->y(), distances_.size());
|
||||
for (int i = 0; i < distances_.size(); ++i) {
|
||||
tprintf("%d: %d, %d -> %g\n", i, distances_[i].data().x(), distances_[i].data().y(),
|
||||
distances_[i].key());
|
||||
}
|
||||
tprintf("Result = %zu\n", median_index);
|
||||
}
|
||||
// Center distances on the fitted point.
|
||||
double dist_origin = direction * *line_pt;
|
||||
for (auto &distance : distances_) {
|
||||
distance.key() -= dist_origin;
|
||||
}
|
||||
return sqrt(EvaluateLineFit());
|
||||
}
|
||||
|
||||
// Returns true if there were enough points at the last call to Fit or
|
||||
// ConstrainedFit for the fitted points to be used on a badly fitted line.
|
||||
bool DetLineFit::SufficientPointsForIndependentFit() const {
|
||||
return distances_.size() >= kMinPointsForErrorCount;
|
||||
}
|
||||
|
||||
// Backwards compatible fit returning a gradient and constant.
|
||||
// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
|
||||
// function in preference to the LMS class.
|
||||
double DetLineFit::Fit(float *m, float *c) {
|
||||
ICOORD start, end;
|
||||
double error = Fit(&start, &end);
|
||||
if (end.x() != start.x()) {
|
||||
*m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
|
||||
*c = start.y() - *m * start.x();
|
||||
} else {
|
||||
*m = 0.0f;
|
||||
*c = 0.0f;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
// Backwards compatible constrained fit with a supplied gradient.
|
||||
// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
|
||||
// to avoid potential difficulties with infinite gradients.
|
||||
double DetLineFit::ConstrainedFit(double m, float *c) {
|
||||
// Do something sensible with no points.
|
||||
if (pts_.empty()) {
|
||||
*c = 0.0f;
|
||||
return 0.0;
|
||||
}
|
||||
double cos = 1.0 / sqrt(1.0 + m * m);
|
||||
FCOORD direction(cos, m * cos);
|
||||
ICOORD line_pt;
|
||||
double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt);
|
||||
*c = line_pt.y() - line_pt.x() * m;
|
||||
return error;
|
||||
}
|
||||
|
||||
// Computes and returns the squared evaluation metric for a line fit.
|
||||
double DetLineFit::EvaluateLineFit() {
|
||||
// Compute the upper quartile error from the line.
|
||||
double dist = ComputeUpperQuartileError();
|
||||
if (distances_.size() >= kMinPointsForErrorCount && dist > kMaxRealDistance * kMaxRealDistance) {
|
||||
// Use the number of mis-fitted points as the error metric, as this
|
||||
// gives a better measure of fit for badly fitted lines where more
|
||||
// than a quarter are badly fitted.
|
||||
double threshold = kMaxRealDistance * sqrt(square_length_);
|
||||
dist = NumberOfMisfittedPoints(threshold);
|
||||
}
|
||||
return dist;
|
||||
}
|
||||
|
||||
// Computes the absolute error distances of the points from the line,
|
||||
// and returns the squared upper-quartile error distance.
|
||||
double DetLineFit::ComputeUpperQuartileError() {
|
||||
int num_errors = distances_.size();
|
||||
if (num_errors == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
// Get the absolute values of the errors.
|
||||
for (int i = 0; i < num_errors; ++i) {
|
||||
if (distances_[i].key() < 0) {
|
||||
distances_[i].key() = -distances_[i].key();
|
||||
}
|
||||
}
|
||||
// Now get the upper quartile distance.
|
||||
auto index = 3 * num_errors / 4;
|
||||
std::nth_element(distances_.begin(), distances_.begin() + index, distances_.end());
|
||||
double dist = distances_[index].key();
|
||||
// The true distance is the square root of the dist squared / square_length.
|
||||
// Don't bother with the square root. Just return the square distance.
|
||||
return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;
|
||||
}
|
||||
|
||||
// Returns the number of sample points that have an error more than threshold.
|
||||
int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
|
||||
int num_misfits = 0;
|
||||
int num_dists = distances_.size();
|
||||
// Get the absolute values of the errors.
|
||||
for (int i = 0; i < num_dists; ++i) {
|
||||
if (distances_[i].key() > threshold) {
|
||||
++num_misfits;
|
||||
}
|
||||
}
|
||||
return num_misfits;
|
||||
}
|
||||
|
||||
// Computes all the cross product distances of the points from the line,
|
||||
// storing the actual (signed) cross products in distances.
|
||||
// Ignores distances of points that are further away than the previous point,
|
||||
// and overlaps the previous point by at least half.
|
||||
void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {
|
||||
distances_.clear();
|
||||
ICOORD line_vector = end;
|
||||
line_vector -= start;
|
||||
square_length_ = line_vector.sqlength();
|
||||
int line_length = IntCastRounded(sqrt(square_length_));
|
||||
// Compute the distance of each point from the line.
|
||||
int prev_abs_dist = 0;
|
||||
int prev_dot = 0;
|
||||
for (int i = 0; i < pts_.size(); ++i) {
|
||||
ICOORD pt_vector = pts_[i].pt;
|
||||
pt_vector -= start;
|
||||
int dot = line_vector % pt_vector;
|
||||
// Compute |line_vector||pt_vector|sin(angle between)
|
||||
int dist = line_vector * pt_vector;
|
||||
int abs_dist = dist < 0 ? -dist : dist;
|
||||
if (abs_dist > prev_abs_dist && i > 0) {
|
||||
// Ignore this point if it overlaps the previous one.
|
||||
int separation = abs(dot - prev_dot);
|
||||
if (separation < line_length * pts_[i].halfwidth ||
|
||||
separation < line_length * pts_[i - 1].halfwidth) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
distances_.emplace_back(dist, pts_[i].pt);
|
||||
prev_abs_dist = abs_dist;
|
||||
prev_dot = dot;
|
||||
}
|
||||
}
|
||||
|
||||
// Computes all the cross product distances of the points perpendicular to
|
||||
// the given direction, ignoring distances outside of the give distance range,
|
||||
// storing the actual (signed) cross products in distances_.
|
||||
void DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist,
|
||||
double max_dist) {
|
||||
distances_.clear();
|
||||
square_length_ = direction.sqlength();
|
||||
// Compute the distance of each point from the line.
|
||||
for (auto &pt : pts_) {
|
||||
FCOORD pt_vector = pt.pt;
|
||||
// Compute |line_vector||pt_vector|sin(angle between)
|
||||
double dist = direction * pt_vector;
|
||||
if (min_dist <= dist && dist <= max_dist) {
|
||||
distances_.emplace_back(dist, pt.pt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
157
3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.h
vendored
Normal file
157
3rdparty/tesseract_ocr/tesseract/src/ccstruct/detlinefit.h
vendored
Normal file
|
@ -0,0 +1,157 @@
|
|||
///////////////////////////////////////////////////////////////////////
|
||||
// File: detlinefit.h
|
||||
// Description: Deterministic least upper-quartile squares line fitting.
|
||||
// Author: Ray Smith
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_DETLINEFIT_H_
|
||||
#define TESSERACT_CCSTRUCT_DETLINEFIT_H_
|
||||
|
||||
#include "kdpair.h"
|
||||
#include "points.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// This class fits a line to a set of ICOORD points.
|
||||
// There is no restriction on the direction of the line, as it
|
||||
// uses a vector method, ie no concern over infinite gradients.
|
||||
// The fitted line has the least upper quartile of squares of perpendicular
|
||||
// distances of all source points from the line, subject to the constraint
|
||||
// that the line is made from one of the pairs of [{p1,p2,p3},{pn-2, pn-1, pn}]
|
||||
// i.e. the 9 combinations of one of the first 3 and last 3 points.
|
||||
// A fundamental assumption of this algorithm is that one of the first 3 and
|
||||
// one of the last 3 points are near the best line fit.
|
||||
// The points must be Added in line order for the algorithm to work properly.
|
||||
// No floating point calculations are needed* to make an accurate fit,
|
||||
// and no random numbers are needed** so the algorithm is deterministic,
|
||||
// architecture-stable, and compiler-stable as well as stable to minor
|
||||
// changes in the input.
|
||||
// *A single floating point division is used to compute each line's distance.
|
||||
// This is unlikely to result in choice of a different line, but if it does,
|
||||
// it would be easy to replace with a 64 bit integer calculation.
|
||||
// **Random numbers are used in the nth_item function, but the worst
|
||||
// non-determinism that can result is picking a different result among equals,
|
||||
// and that wouldn't make any difference to the end-result distance, so the
|
||||
// randomness does not affect the determinism of the algorithm. The random
|
||||
// numbers are only there to guarantee average linear time.
|
||||
// Fitting time is linear, but with a high constant, as it tries 9 different
|
||||
// lines and computes the distance of all points each time.
|
||||
// This class is aimed at replacing the LLSQ (linear least squares) and
|
||||
// LMS (least median of squares) classes that are currently used for most
|
||||
// of the line fitting in Tesseract.
|
||||
class DetLineFit {
|
||||
public:
|
||||
DetLineFit();
|
||||
~DetLineFit() = default;
|
||||
|
||||
// Delete all Added points.
|
||||
void Clear();
|
||||
|
||||
// Adds a new point. Takes a copy - the pt doesn't need to stay in scope.
|
||||
// Add must be called on points in sequence along the line.
|
||||
void Add(const ICOORD &pt);
|
||||
// Associates a half-width with the given point if a point overlaps the
|
||||
// previous point by more than half the width, and its distance is further
|
||||
// than the previous point, then the more distant point is ignored in the
|
||||
// distance calculation. Useful for ignoring i dots and other diacritics.
|
||||
void Add(const ICOORD &pt, int halfwidth);
|
||||
|
||||
// Fits a line to the points, returning the fitted line as a pair of
|
||||
// points, and the upper quartile error.
|
||||
double Fit(ICOORD *pt1, ICOORD *pt2) {
|
||||
return Fit(0, 0, pt1, pt2);
|
||||
}
|
||||
// Fits a line to the points, ignoring the skip_first initial points and the
|
||||
// skip_last final points, returning the fitted line as a pair of points,
|
||||
// and the upper quartile error.
|
||||
double Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2);
|
||||
|
||||
// Constrained fit with a supplied direction vector. Finds the best line_pt,
|
||||
// that is one of the supplied points having the median cross product with
|
||||
// direction, ignoring points that have a cross product outside of the range
|
||||
// [min_dist, max_dist]. Returns the resulting error metric using the same
|
||||
// reduced set of points.
|
||||
// *Makes use of floating point arithmetic*
|
||||
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug,
|
||||
ICOORD *line_pt);
|
||||
|
||||
// Returns true if there were enough points at the last call to Fit or
|
||||
// ConstrainedFit for the fitted points to be used on a badly fitted line.
|
||||
bool SufficientPointsForIndependentFit() const;
|
||||
|
||||
// Backwards compatible fit returning a gradient and constant.
|
||||
// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
|
||||
// function in preference to the LMS class.
|
||||
double Fit(float *m, float *c);
|
||||
|
||||
// Backwards compatible constrained fit with a supplied gradient.
|
||||
// Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
|
||||
// to avoid potential difficulties with infinite gradients.
|
||||
double ConstrainedFit(double m, float *c);
|
||||
|
||||
private:
|
||||
// Simple struct to hold an ICOORD point and a halfwidth representing half
|
||||
// the "width" (supposedly approximately parallel to the direction of the
|
||||
// line) of each point, such that distant points can be discarded when they
|
||||
// overlap nearer points. (Think i dot and other diacritics or noise.)
|
||||
struct PointWidth {
|
||||
PointWidth() : pt(ICOORD(0, 0)), halfwidth(0) {}
|
||||
PointWidth(const ICOORD &pt0, int halfwidth0) : pt(pt0), halfwidth(halfwidth0) {}
|
||||
|
||||
ICOORD pt;
|
||||
int halfwidth;
|
||||
};
|
||||
// Type holds the distance of each point from the fitted line and the point
|
||||
// itself. Use of double allows integer distances from ICOORDs to be stored
|
||||
// exactly, and also the floating point results from ConstrainedFit.
|
||||
using DistPointPair = KDPairInc<double, ICOORD>;
|
||||
|
||||
// Computes and returns the squared evaluation metric for a line fit.
|
||||
double EvaluateLineFit();
|
||||
|
||||
// Computes the absolute values of the precomputed distances_,
|
||||
// and returns the squared upper-quartile error distance.
|
||||
double ComputeUpperQuartileError();
|
||||
|
||||
// Returns the number of sample points that have an error more than threshold.
|
||||
int NumberOfMisfittedPoints(double threshold) const;
|
||||
|
||||
// Computes all the cross product distances of the points from the line,
|
||||
// storing the actual (signed) cross products in distances_.
|
||||
// Ignores distances of points that are further away than the previous point,
|
||||
// and overlaps the previous point by at least half.
|
||||
void ComputeDistances(const ICOORD &start, const ICOORD &end);
|
||||
|
||||
// Computes all the cross product distances of the points perpendicular to
|
||||
// the given direction, ignoring distances outside of the give distance range,
|
||||
// storing the actual (signed) cross products in distances_.
|
||||
void ComputeConstrainedDistances(const FCOORD &direction, double min_dist, double max_dist);
|
||||
|
||||
// Stores all the source points in the order they were given and their
|
||||
// halfwidths, if any.
|
||||
std::vector<PointWidth> pts_;
|
||||
// Stores the computed perpendicular distances of (some of) the pts_ from a
|
||||
// given vector (assuming it goes through the origin, making it a line).
|
||||
// Since the distances may be a subset of the input points, and get
|
||||
// re-ordered by the nth_item function, the original point is stored
|
||||
// along side the distance.
|
||||
std::vector<DistPointPair> distances_; // Distances of points.
|
||||
// The squared length of the vector used to compute distances_.
|
||||
double square_length_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_DETLINEFIT_H_
|
99
3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.cpp
vendored
Normal file
99
3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.cpp
vendored
Normal file
|
@ -0,0 +1,99 @@
|
|||
/**********************************************************************
|
||||
* File: dppoint.cpp
|
||||
* Description: Simple generic dynamic programming class.
|
||||
* Author: Ray Smith
|
||||
* Created: Wed Mar 25 19:08:01 PDT 2009
|
||||
*
|
||||
* (C) Copyright 2009, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "dppoint.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Solve the dynamic programming problem for the given array of points, with
|
||||
// the given size and cost function.
|
||||
// Steps backwards are limited to being between min_step and max_step
|
||||
// inclusive.
|
||||
// The return value is the tail of the best path.
|
||||
DPPoint *DPPoint::Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,
|
||||
DPPoint *points) {
|
||||
if (size <= 0 || max_step < min_step || min_step >= size) {
|
||||
return nullptr; // Degenerate, but not necessarily an error.
|
||||
}
|
||||
ASSERT_HOST(min_step > 0); // Infinite loop possible if this is not true.
|
||||
if (debug) {
|
||||
tprintf("min = %d, max=%d\n", min_step, max_step);
|
||||
}
|
||||
// Evaluate the total cost at each point.
|
||||
for (int i = 0; i < size; ++i) {
|
||||
for (int offset = min_step; offset <= max_step; ++offset) {
|
||||
DPPoint *prev = offset <= i ? points + i - offset : nullptr;
|
||||
int64_t new_cost = (points[i].*cost_func)(prev);
|
||||
if (points[i].best_prev_ != nullptr && offset > min_step * 2 &&
|
||||
new_cost > points[i].total_cost_) {
|
||||
break; // Find only the first minimum if going over twice the min.
|
||||
}
|
||||
}
|
||||
points[i].total_cost_ += points[i].local_cost_;
|
||||
if (debug) {
|
||||
tprintf("At point %d, local cost=%d, total_cost=%d, steps=%d\n", i, points[i].local_cost_,
|
||||
points[i].total_cost_, points[i].total_steps_);
|
||||
}
|
||||
}
|
||||
// Now find the end of the best path and return it.
|
||||
int best_cost = points[size - 1].total_cost_;
|
||||
int best_end = size - 1;
|
||||
for (int end = best_end - 1; end >= size - min_step; --end) {
|
||||
int cost = points[end].total_cost_;
|
||||
if (cost < best_cost) {
|
||||
best_cost = cost;
|
||||
best_end = end;
|
||||
}
|
||||
}
|
||||
return points + best_end;
|
||||
}
|
||||
|
||||
// A CostFunc that takes the variance of step into account in the cost.
|
||||
int64_t DPPoint::CostWithVariance(const DPPoint *prev) {
|
||||
if (prev == nullptr || prev == this) {
|
||||
UpdateIfBetter(0, 1, nullptr, 0, 0, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int delta = this - prev;
|
||||
int32_t n = prev->n_ + 1;
|
||||
int32_t sig_x = prev->sig_x_ + delta;
|
||||
int64_t sig_xsq = prev->sig_xsq_ + delta * delta;
|
||||
int64_t cost = (sig_xsq - sig_x * sig_x / n) / n;
|
||||
cost += prev->total_cost_;
|
||||
UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq);
|
||||
return cost;
|
||||
}
|
||||
|
||||
// Update the other members if the cost is lower.
|
||||
void DPPoint::UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n,
|
||||
int32_t sig_x, int64_t sig_xsq) {
|
||||
if (cost < total_cost_) {
|
||||
total_cost_ = cost;
|
||||
total_steps_ = steps;
|
||||
best_prev_ = prev;
|
||||
n_ = n;
|
||||
sig_x_ = sig_x;
|
||||
sig_xsq_ = sig_xsq;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
105
3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.h
vendored
Normal file
105
3rdparty/tesseract_ocr/tesseract/src/ccstruct/dppoint.h
vendored
Normal file
|
@ -0,0 +1,105 @@
|
|||
/**********************************************************************
|
||||
* File: dppoint.h
|
||||
* Description: Simple generic dynamic programming class.
|
||||
* Author: Ray Smith
|
||||
* Created: Wed Mar 25 18:57:01 PDT 2009
|
||||
*
|
||||
* (C) Copyright 2009, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_DPPOINT_H_
|
||||
#define TESSERACT_CCSTRUCT_DPPOINT_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A simple class to provide a dynamic programming solution to a class of
|
||||
// 1st-order problems in which the cost is dependent only on the current
|
||||
// step and the best cost to that step, with a possible special case
|
||||
// of using the variance of the steps, and only the top choice is required.
|
||||
// Useful for problems such as finding the optimal cut points in a fixed-pitch
|
||||
// (vertical or horizontal) situation.
|
||||
// Skeletal Example:
|
||||
// DPPoint* array = new DPPoint[width];
|
||||
// for (int i = 0; i < width; i++) {
|
||||
// array[i].AddLocalCost(cost_at_i)
|
||||
// }
|
||||
// DPPoint* best_end = DPPoint::Solve(..., array);
|
||||
// while (best_end != nullptr) {
|
||||
// int cut_index = best_end - array;
|
||||
// best_end = best_end->best_prev();
|
||||
// }
|
||||
// delete [] array;
|
||||
class DPPoint {
public:
  // Pointer-to-member type of a cost function. A cost function evaluates the
  // total cost at this point (excluding this point's local_cost) for the given
  // predecessor and, if that beats this point's total_cost, replaces the
  // appropriate values in this point.
  using CostFunc = int64_t (DPPoint::*)(const DPPoint *);

  // Starts with no local cost, the largest possible total cost (so that any
  // real cost is an improvement), a path length of one and no predecessor.
  DPPoint()
      : local_cost_(0),
        total_cost_(INT32_MAX),
        total_steps_(1),
        best_prev_(nullptr),
        n_(0),
        sig_x_(0),
        sig_xsq_(0) {}

  // Solves the dynamic programming problem for the array points of the given
  // size using cost_func. Steps backwards are limited to being between
  // min_step and max_step inclusive.
  // The return value is the tail of the best path.
  static DPPoint *Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size,
                        DPPoint *points);

  // A CostFunc that takes the variance of step into account in the cost.
  int64_t CostWithVariance(const DPPoint *prev);

  // Accessors.
  // Sum of all costs in the best path to this point.
  int total_cost() const {
    return total_cost_;
  }
  // Number of steps in the best path to this point.
  int Pathlength() const {
    return total_steps_;
  }
  // Previous point on the best path, or nullptr if there is none.
  const DPPoint *best_prev() const {
    return best_prev_;
  }
  // Accumulates new_cost into the cost of this point on its own.
  void AddLocalCost(int new_cost) {
    local_cost_ += new_cost;
  }

private:
  // Code common to different cost functions.

  // Replaces the path data of this point with the arguments if cost is lower
  // than the current total cost.
  void UpdateIfBetter(int64_t cost, int32_t steps, const DPPoint *prev, int32_t n, int32_t sig_x,
                      int64_t sig_xsq);

  int32_t local_cost_;       // Cost of this point on its own.
  int32_t total_cost_;       // Sum of all costs in best path to here.
                             // During cost calculations local_cost is excluded.
  int32_t total_steps_;      // Number of steps in best path to here.
  const DPPoint *best_prev_; // Pointer to prev point in best path from here.
  // Information for computing the variance part of the cost.
  int32_t n_;       // Number of steps in best path to here for variance.
  int32_t sig_x_;   // Sum of step sizes for computing variance.
  int64_t sig_xsq_; // Sum of squares of steps for computing variance.
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_DPPOINT_H_
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user