feat: 集成Tesseract源码到项目中
Description: 由于仓库中的Tesseract不是最新版本导致产生了一个bug,因此将Tesseract源码集成到项目中 Log: no Change-Id: I088de95d6c6ab670406daa8d47ed2ed46929c2c0
This commit is contained in:
245
3rdparty/tesseract_ocr/tesseract/src/api/altorenderer.cpp
vendored
Normal file
245
3rdparty/tesseract_ocr/tesseract/src/api/altorenderer.cpp
vendored
Normal file
@ -0,0 +1,245 @@
|
||||
// File: altorenderer.cpp
|
||||
// Description: ALTO rendering interface
|
||||
// Author: Jake Sebright
|
||||
|
||||
// (C) Copyright 2018
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/renderer.h>
|
||||
|
||||
#include <memory>
|
||||
#include <sstream> // for std::stringstream
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/// Add coordinates to specified TextBlock, TextLine or String bounding box.
|
||||
/// Add word confidence if adding to a String bounding box.
|
||||
///
|
||||
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
|
||||
std::stringstream &alto_str) {
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
|
||||
int hpos = left;
|
||||
int vpos = top;
|
||||
int height = bottom - top;
|
||||
int width = right - left;
|
||||
|
||||
alto_str << " HPOS=\"" << hpos << "\"";
|
||||
alto_str << " VPOS=\"" << vpos << "\"";
|
||||
alto_str << " WIDTH=\"" << width << "\"";
|
||||
alto_str << " HEIGHT=\"" << height << "\"";
|
||||
|
||||
if (level == RIL_WORD) {
|
||||
int wc = it->Confidence(RIL_WORD);
|
||||
alto_str << " WC=\"0." << wc << "\"";
|
||||
} else {
|
||||
alto_str << ">";
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the beginning of the document
|
||||
///
|
||||
bool TessAltoRenderer::BeginDocumentHandler() {
|
||||
AppendString(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
|
||||
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
|
||||
"xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
|
||||
"xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
|
||||
"http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
|
||||
"\t<Description>\n"
|
||||
"\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
|
||||
"\t\t<sourceImageInformation>\n"
|
||||
"\t\t\t<fileName>");
|
||||
|
||||
AppendString(title());
|
||||
|
||||
AppendString(
|
||||
"</fileName>\n"
|
||||
"\t\t</sourceImageInformation>\n"
|
||||
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
|
||||
"\t\t\t<ocrProcessingStep>\n"
|
||||
"\t\t\t\t<processingSoftware>\n"
|
||||
"\t\t\t\t\t<softwareName>tesseract ");
|
||||
AppendString(TessBaseAPI::Version());
|
||||
AppendString(
|
||||
"</softwareName>\n"
|
||||
"\t\t\t\t</processingSoftware>\n"
|
||||
"\t\t\t</ocrProcessingStep>\n"
|
||||
"\t\t</OCRProcessing>\n"
|
||||
"\t</Description>\n"
|
||||
"\t<Layout>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the layout of the image
|
||||
///
|
||||
bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
|
||||
const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
|
||||
if (text == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
AppendString(text.get());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the end of the document
|
||||
///
|
||||
bool TessAltoRenderer::EndDocumentHandler() {
|
||||
AppendString("\t</Layout>\n</alto>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Construct an ALTO renderer; passes outputbase and the "xml" extension on
/// to the TessResultRenderer base class.
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "xml") {
}
|
||||
|
||||
///
|
||||
/// Make an XML-formatted string with ALTO markup from the internal
|
||||
/// data structures.
|
||||
///
|
||||
char *TessBaseAPI::GetAltoText(int page_number) {
|
||||
return GetAltoText(nullptr, page_number);
|
||||
}
|
||||
|
||||
///
|
||||
/// Make an XML-formatted string with ALTO markup from the internal
|
||||
/// data structures.
|
||||
///
|
||||
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
|
||||
|
||||
if (input_file_.empty()) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
|
||||
int utf8_len =
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
std::stringstream alto_str;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
alto_str.imbue(std::locale::classic());
|
||||
alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
|
||||
<< "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
|
||||
<< " ID=\"page_" << page_number << "\">\n"
|
||||
<< "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
|
||||
<< " WIDTH=\"" << rect_width_ << "\""
|
||||
<< " HEIGHT=\"" << rect_height_ << "\">\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_PARA, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_WORD, alto_str);
|
||||
alto_str << " CONTENT=\"";
|
||||
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
|
||||
int left, top, right, bottom;
|
||||
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
|
||||
do {
|
||||
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
alto_str << HOcrEscape(grapheme.get()).c_str();
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
|
||||
alto_str << "\"/>";
|
||||
|
||||
wcnt++;
|
||||
|
||||
if (last_word_in_line) {
|
||||
alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
|
||||
lcnt++;
|
||||
} else {
|
||||
int hpos = right;
|
||||
int vpos = top;
|
||||
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
int width = left - hpos;
|
||||
alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
|
||||
<< "\"/>\n";
|
||||
}
|
||||
|
||||
if (last_word_in_tblock) {
|
||||
alto_str << "\t\t\t\t\t</TextBlock>\n";
|
||||
tcnt++;
|
||||
}
|
||||
|
||||
if (last_word_in_cblock) {
|
||||
alto_str << "\t\t\t\t</ComposedBlock>\n";
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
|
||||
alto_str << "\t\t\t</PrintSpace>\n"
|
||||
<< "\t\t</Page>\n";
|
||||
const std::string &text = alto_str.str();
|
||||
|
||||
char *result = new char[text.length() + 1];
|
||||
strcpy(result, text.c_str());
|
||||
delete res_it;
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
2419
3rdparty/tesseract_ocr/tesseract/src/api/baseapi.cpp
vendored
Normal file
2419
3rdparty/tesseract_ocr/tesseract/src/api/baseapi.cpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
689
3rdparty/tesseract_ocr/tesseract/src/api/capi.cpp
vendored
Normal file
689
3rdparty/tesseract_ocr/tesseract/src/api/capi.cpp
vendored
Normal file
@ -0,0 +1,689 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: capi.cpp
|
||||
// Description: C-API TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#include <cstring> // for strdup
|
||||
|
||||
const char *TessVersion() {
|
||||
return TessBaseAPI::Version();
|
||||
}
|
||||
|
||||
/// C API: releases a character buffer with delete[].
/// @param text  buffer to release
void TessDeleteText(const char *text) {
  delete[] text;
}
|
||||
|
||||
/// C API: releases a nullptr-terminated array of strings.
///
/// Each element and the array itself are released with delete[].
/// A null array is accepted and ignored.
///
/// @param arr  nullptr-terminated array to release, or nullptr
void TessDeleteTextArray(char **arr) {
  if (arr == nullptr) {
    // Nothing to free; dereferencing arr below would be invalid.
    return;
  }
  for (char **pos = arr; *pos != nullptr; ++pos) {
    delete[] *pos;
  }
  delete[] arr;
}
|
||||
|
||||
/// C API: releases an int array with delete[].
/// @param arr  array to release
void TessDeleteIntArray(const int *arr) {
  delete[] arr;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessTextRenderer for the given output base.
TessResultRenderer *TessTextRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessTextRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessHOcrRenderer for the given output base.
TessResultRenderer *TessHOcrRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessHOcrRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessHOcrRenderer; the BOOL font_info flag
/// is converted to bool (non-zero means true) before being passed on.
TessResultRenderer *TessHOcrRendererCreate2(const char *outputbase, BOOL font_info) {
  const bool with_font_info = (font_info != 0);
  TessResultRenderer *renderer = new tesseract::TessHOcrRenderer(outputbase, with_font_info);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessAltoRenderer for the given output base.
TessResultRenderer *TessAltoRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessAltoRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessTsvRenderer for the given output base.
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessTsvRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessPDFRenderer; the BOOL textonly flag is
/// converted to bool (non-zero means true) before being passed on.
TessResultRenderer *TessPDFRendererCreate(const char *outputbase, const char *datadir,
                                          BOOL textonly) {
  const bool text_only = (textonly != 0);
  TessResultRenderer *renderer = new tesseract::TessPDFRenderer(outputbase, datadir, text_only);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessUnlvRenderer for the given output base.
TessResultRenderer *TessUnlvRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessUnlvRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessBoxTextRenderer for the given output base.
TessResultRenderer *TessBoxTextRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessBoxTextRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessWordStrBoxRenderer for the given output base.
TessResultRenderer *TessWordStrBoxRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessWordStrBoxRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: allocates a tesseract::TessLSTMBoxRenderer for the given output base.
TessResultRenderer *TessLSTMBoxRendererCreate(const char *outputbase) {
  TessResultRenderer *renderer = new tesseract::TessLSTMBoxRenderer(outputbase);
  return renderer;
}
|
||||
|
||||
/// C API: destroys a renderer with delete.
void TessDeleteResultRenderer(TessResultRenderer *renderer) {
  delete renderer;
}
|
||||
|
||||
/// C API: forwards to renderer->insert(next).
void TessResultRendererInsert(TessResultRenderer *renderer, TessResultRenderer *next) {
  TessResultRenderer &head = *renderer;
  head.insert(next);
}
|
||||
|
||||
/// C API: returns the result of renderer->next().
TessResultRenderer *TessResultRendererNext(TessResultRenderer *renderer) {
  TessResultRenderer *following = renderer->next();
  return following;
}
|
||||
|
||||
/// C API: forwards to renderer->BeginDocument(title) and returns the result
/// converted to int.
BOOL TessResultRendererBeginDocument(TessResultRenderer *renderer, const char *title) {
  const auto started = renderer->BeginDocument(title);
  return static_cast<int>(started);
}
|
||||
|
||||
/// C API: forwards to renderer->AddImage(api) and returns the result
/// converted to int.
BOOL TessResultRendererAddImage(TessResultRenderer *renderer, TessBaseAPI *api) {
  const auto added = renderer->AddImage(api);
  return static_cast<int>(added);
}
|
||||
|
||||
/// C API: forwards to renderer->EndDocument() and returns the result
/// converted to int.
BOOL TessResultRendererEndDocument(TessResultRenderer *renderer) {
  const auto finished = renderer->EndDocument();
  return static_cast<int>(finished);
}
|
||||
|
||||
/// C API: returns the result of renderer->file_extension().
const char *TessResultRendererExtention(TessResultRenderer *renderer) {
  const char *extension = renderer->file_extension();
  return extension;
}
|
||||
|
||||
/// C API: returns the result of renderer->title().
const char *TessResultRendererTitle(TessResultRenderer *renderer) {
  const char *doc_title = renderer->title();
  return doc_title;
}
|
||||
|
||||
/// C API: returns the result of renderer->imagenum().
int TessResultRendererImageNum(TessResultRenderer *renderer) {
  const int image_index = renderer->imagenum();
  return image_index;
}
|
||||
|
||||
/// C API: allocates a default-constructed TessBaseAPI with new.
TessBaseAPI *TessBaseAPICreate() {
  TessBaseAPI *api = new TessBaseAPI;
  return api;
}
|
||||
|
||||
/// C API: destroys a TessBaseAPI instance with delete.
void TessBaseAPIDelete(TessBaseAPI *handle) {
  delete handle;
}
|
||||
|
||||
/// C API: forwards to the static TessBaseAPI::getOpenCLDevice(device).
/// The handle parameter is unused.
size_t TessBaseAPIGetOpenCLDevice(TessBaseAPI * /*handle*/, void **device) {
  const size_t result = TessBaseAPI::getOpenCLDevice(device);
  return result;
}
|
||||
|
||||
/// C API: forwards to handle->SetInputName(name).
void TessBaseAPISetInputName(TessBaseAPI *handle, const char *name) {
  TessBaseAPI &api = *handle;
  api.SetInputName(name);
}
|
||||
|
||||
/// C API: returns the result of handle->GetInputName().
const char *TessBaseAPIGetInputName(TessBaseAPI *handle) {
  const char *input_name = handle->GetInputName();
  return input_name;
}
|
||||
|
||||
/// C API: forwards to handle->SetInputImage(pix).
void TessBaseAPISetInputImage(TessBaseAPI *handle, Pix *pix) {
  TessBaseAPI &api = *handle;
  api.SetInputImage(pix);
}
|
||||
|
||||
/// C API: returns the result of handle->GetInputImage().
Pix *TessBaseAPIGetInputImage(TessBaseAPI *handle) {
  Pix *image = handle->GetInputImage();
  return image;
}
|
||||
|
||||
/// C API: returns the result of handle->GetSourceYResolution().
int TessBaseAPIGetSourceYResolution(TessBaseAPI *handle) {
  const int y_resolution = handle->GetSourceYResolution();
  return y_resolution;
}
|
||||
|
||||
/// C API: returns the result of handle->GetDatapath().
const char *TessBaseAPIGetDatapath(TessBaseAPI *handle) {
  const char *datapath = handle->GetDatapath();
  return datapath;
}
|
||||
|
||||
/// C API: forwards to handle->SetOutputName(name).
void TessBaseAPISetOutputName(TessBaseAPI *handle, const char *name) {
  TessBaseAPI &api = *handle;
  api.SetOutputName(name);
}
|
||||
|
||||
/// C API: forwards to handle->SetVariable(name, value) and returns the
/// result converted to int.
BOOL TessBaseAPISetVariable(TessBaseAPI *handle, const char *name, const char *value) {
  const auto accepted = handle->SetVariable(name, value);
  return static_cast<int>(accepted);
}
|
||||
|
||||
/// C API: forwards to handle->SetDebugVariable(name, value) and returns the
/// result converted to int.
BOOL TessBaseAPISetDebugVariable(TessBaseAPI *handle, const char *name, const char *value) {
  const auto accepted = handle->SetDebugVariable(name, value);
  return static_cast<int>(accepted);
}
|
||||
|
||||
/// C API: forwards to handle->GetIntVariable(name, value) and returns the
/// result converted to int.
BOOL TessBaseAPIGetIntVariable(const TessBaseAPI *handle, const char *name, int *value) {
  const auto found = handle->GetIntVariable(name, value);
  return static_cast<int>(found);
}
|
||||
|
||||
/// C API: reads a bool variable through handle->GetBoolVariable().
///
/// *value is written (as 0/1) only when the lookup reports success;
/// otherwise it is left untouched.
///
/// @return the lookup result converted to int
BOOL TessBaseAPIGetBoolVariable(const TessBaseAPI *handle, const char *name, BOOL *value) {
  bool fetched;
  const bool found = handle->GetBoolVariable(name, &fetched);
  if (!found) {
    return static_cast<int>(found);
  }
  *value = static_cast<int>(fetched);
  return static_cast<int>(found);
}
|
||||
|
||||
/// C API: forwards to handle->GetDoubleVariable(name, value) and returns the
/// result converted to int.
BOOL TessBaseAPIGetDoubleVariable(const TessBaseAPI *handle, const char *name, double *value) {
  const auto found = handle->GetDoubleVariable(name, value);
  return static_cast<int>(found);
}
|
||||
|
||||
const char *TessBaseAPIGetStringVariable(const TessBaseAPI *handle, const char *name) {
|
||||
return handle->GetStringVariable(name);
|
||||
}
|
||||
|
||||
void TessBaseAPIPrintVariables(const TessBaseAPI *handle, FILE *fp) {
|
||||
handle->PrintVariables(fp);
|
||||
}
|
||||
|
||||
/// C API: opens filename for writing ("w"), passes the stream to
/// handle->PrintVariables() and closes it again.
///
/// @return TRUE if the file could be opened, FALSE otherwise
BOOL TessBaseAPIPrintVariablesToFile(const TessBaseAPI *handle, const char *filename) {
  FILE *out = fopen(filename, "w");
  if (out == nullptr) {
    return FALSE;
  }
  handle->PrintVariables(out);
  fclose(out);
  return TRUE;
}
|
||||
|
||||
/// C API: initialises the engine through the full TessBaseAPI::Init overload.
///
/// The first vars_vec_size entries of vars_vec / vars_values are copied into
/// std::string vectors, but only when both arrays are non-null; otherwise
/// empty vectors are passed. set_only_non_debug_params is converted to bool
/// (non-zero means true).
///
/// @return the value returned by handle->Init()
int TessBaseAPIInit4(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode mode, char **configs, int configs_size, char **vars_vec,
                     char **vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params) {
  std::vector<std::string> names;
  std::vector<std::string> values;
  const bool have_vars = (vars_vec != nullptr) && (vars_values != nullptr);
  if (have_vars) {
    size_t idx = 0;
    while (idx < vars_vec_size) {
      names.emplace_back(vars_vec[idx]);
      values.emplace_back(vars_values[idx]);
      ++idx;
    }
  }

  const bool non_debug_only = (set_only_non_debug_params != 0);
  return handle->Init(datapath, language, mode, configs, configs_size, &names, &values,
                      non_debug_only);
}
|
||||
|
||||
/// C API: calls the full handle->Init() overload with the given config files,
/// no variable name/value vectors (nullptr) and set_only_non_debug_params
/// set to false.
int TessBaseAPIInit1(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode oem, char **configs, int configs_size) {
  const int status =
      handle->Init(datapath, language, oem, configs, configs_size, nullptr, nullptr, false);
  return status;
}
|
||||
|
||||
/// C API: returns the result of handle->Init(datapath, language, oem).
int TessBaseAPIInit2(TessBaseAPI *handle, const char *datapath, const char *language,
                     TessOcrEngineMode oem) {
  const int status = handle->Init(datapath, language, oem);
  return status;
}
|
||||
|
||||
/// C API: returns the result of handle->Init(datapath, language).
int TessBaseAPIInit3(TessBaseAPI *handle, const char *datapath, const char *language) {
  const int status = handle->Init(datapath, language);
  return status;
}
|
||||
|
||||
const char *TessBaseAPIGetInitLanguagesAsString(const TessBaseAPI *handle) {
|
||||
return handle->GetInitLanguagesAsString();
|
||||
}
|
||||
|
||||
/// C API: returns the loaded languages as a nullptr-terminated array of
/// NUL-terminated strings.
///
/// The array and every element are allocated with new[], so that
/// TessDeleteTextArray(), which releases them with delete[], is a matching
/// deallocator. (Elements were previously created with strdup(), i.e.
/// malloc(), which must not be paired with delete[].)
///
/// @return newly allocated array; release with TessDeleteTextArray()
char **TessBaseAPIGetLoadedLanguagesAsVector(const TessBaseAPI *handle) {
  std::vector<std::string> languages;
  handle->GetLoadedLanguagesAsVector(&languages);
  const size_t count = languages.size();
  char **arr = new char *[count + 1];
  for (size_t i = 0; i < count; ++i) {
    const std::string &language = languages[i];
    arr[i] = new char[language.size() + 1];
    language.copy(arr[i], language.size());
    arr[i][language.size()] = '\0';
  }
  arr[count] = nullptr;
  return arr;
}
|
||||
|
||||
/// C API: returns the available languages as a nullptr-terminated array of
/// NUL-terminated strings.
///
/// The array and every element are allocated with new[], so that
/// TessDeleteTextArray(), which releases them with delete[], is a matching
/// deallocator. (Elements were previously created with strdup(), i.e.
/// malloc(), which must not be paired with delete[].)
///
/// @return newly allocated array; release with TessDeleteTextArray()
char **TessBaseAPIGetAvailableLanguagesAsVector(const TessBaseAPI *handle) {
  std::vector<std::string> languages;
  handle->GetAvailableLanguagesAsVector(&languages);
  const size_t count = languages.size();
  char **arr = new char *[count + 1];
  for (size_t i = 0; i < count; ++i) {
    const std::string &language = languages[i];
    arr[i] = new char[language.size() + 1];
    language.copy(arr[i], language.size());
    arr[i][language.size()] = '\0';
  }
  arr[count] = nullptr;
  return arr;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
int TessBaseAPIInitLangMod(TessBaseAPI *handle, const char *datapath, const char *language) {
|
||||
return handle->InitLangMod(datapath, language);
|
||||
}
|
||||
#endif
|
||||
|
||||
/// C API: forwards to handle->InitForAnalysePage().
void TessBaseAPIInitForAnalysePage(TessBaseAPI *handle) {
  TessBaseAPI &api = *handle;
  api.InitForAnalysePage();
}
|
||||
|
||||
/// C API: forwards to handle->ReadConfigFile(filename).
void TessBaseAPIReadConfigFile(TessBaseAPI *handle, const char *filename) {
  TessBaseAPI &api = *handle;
  api.ReadConfigFile(filename);
}
|
||||
|
||||
/// C API: forwards to handle->ReadDebugConfigFile(filename).
void TessBaseAPIReadDebugConfigFile(TessBaseAPI *handle, const char *filename) {
  TessBaseAPI &api = *handle;
  api.ReadDebugConfigFile(filename);
}
|
||||
|
||||
/// C API: forwards to handle->SetPageSegMode(mode).
void TessBaseAPISetPageSegMode(TessBaseAPI *handle, TessPageSegMode mode) {
  TessBaseAPI &api = *handle;
  api.SetPageSegMode(mode);
}
|
||||
|
||||
/// C API: returns the result of handle->GetPageSegMode().
TessPageSegMode TessBaseAPIGetPageSegMode(const TessBaseAPI *handle) {
  const TessPageSegMode mode = handle->GetPageSegMode();
  return mode;
}
|
||||
|
||||
/// C API: returns the result of handle->TesseractRect() called with the
/// given image buffer description and rectangle.
char *TessBaseAPIRect(TessBaseAPI *handle, const unsigned char *imagedata, int bytes_per_pixel,
                      int bytes_per_line, int left, int top, int width, int height) {
  char *text = handle->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top,
                                     width, height);
  return text;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
void TessBaseAPIClearAdaptiveClassifier(TessBaseAPI *handle) {
|
||||
handle->ClearAdaptiveClassifier();
|
||||
}
|
||||
#endif
|
||||
|
||||
/// C API: forwards to handle->SetImage() with a raw image buffer and its
/// dimensions.
void TessBaseAPISetImage(TessBaseAPI *handle, const unsigned char *imagedata, int width, int height,
                         int bytes_per_pixel, int bytes_per_line) {
  TessBaseAPI &api = *handle;
  api.SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
}
|
||||
|
||||
/// C API: forwards to handle->SetImage(pix).
void TessBaseAPISetImage2(TessBaseAPI *handle, struct Pix *pix) {
  handle->SetImage(pix);
}
|
||||
|
||||
/// C API: forwards to handle->SetSourceResolution(ppi).
void TessBaseAPISetSourceResolution(TessBaseAPI *handle, int ppi) {
  TessBaseAPI &api = *handle;
  api.SetSourceResolution(ppi);
}
|
||||
|
||||
/// C API: forwards to handle->SetRectangle(left, top, width, height).
void TessBaseAPISetRectangle(TessBaseAPI *handle, int left, int top, int width, int height) {
  TessBaseAPI &api = *handle;
  api.SetRectangle(left, top, width, height);
}
|
||||
|
||||
/// C API: returns the result of handle->GetThresholdedImage().
struct Pix *TessBaseAPIGetThresholdedImage(TessBaseAPI *handle) {
  struct Pix *image = handle->GetThresholdedImage();
  return image;
}
|
||||
|
||||
/// C API: calls the static TessBaseAPI::ClearPersistentCache().
/// The handle parameter is unused.
void TessBaseAPIClearPersistentCache(TessBaseAPI * /*handle*/) {
  TessBaseAPI::ClearPersistentCache();
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
BOOL TessBaseAPIDetectOrientationScript(TessBaseAPI *handle, int *orient_deg, float *orient_conf,
|
||||
const char **script_name, float *script_conf) {
|
||||
auto success = handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf);
|
||||
return static_cast<BOOL>(success);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/// C API: returns the result of handle->GetRegions(pixa).
struct Boxa *TessBaseAPIGetRegions(TessBaseAPI *handle, struct Pixa **pixa) {
  struct Boxa *boxes = handle->GetRegions(pixa);
  return boxes;
}
|
||||
|
||||
/// C API: returns the result of handle->GetTextlines(pixa, blockids).
struct Boxa *TessBaseAPIGetTextlines(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
  struct Boxa *boxes = handle->GetTextlines(pixa, blockids);
  return boxes;
}
|
||||
|
||||
/// C API: returns the result of the extended handle->GetTextlines() overload;
/// the BOOL raw_image flag is converted to bool (non-zero means true).
struct Boxa *TessBaseAPIGetTextlines1(TessBaseAPI *handle, const BOOL raw_image,
                                      const int raw_padding, struct Pixa **pixa, int **blockids,
                                      int **paraids) {
  const bool use_raw_image = (raw_image != 0);
  return handle->GetTextlines(use_raw_image, raw_padding, pixa, blockids, paraids);
}
|
||||
|
||||
/// C API: returns the result of handle->GetStrips(pixa, blockids).
struct Boxa *TessBaseAPIGetStrips(TessBaseAPI *handle, struct Pixa **pixa, int **blockids) {
  struct Boxa *boxes = handle->GetStrips(pixa, blockids);
  return boxes;
}
|
||||
|
||||
/// C API: returns the result of handle->GetWords(pixa).
struct Boxa *TessBaseAPIGetWords(TessBaseAPI *handle, struct Pixa **pixa) {
  struct Boxa *boxes = handle->GetWords(pixa);
  return boxes;
}
|
||||
|
||||
/// C API: returns the result of handle->GetConnectedComponents(cc).
struct Boxa *TessBaseAPIGetConnectedComponents(TessBaseAPI *handle, struct Pixa **cc) {
  struct Boxa *boxes = handle->GetConnectedComponents(cc);
  return boxes;
}
|
||||
|
||||
/// C API: returns the result of handle->GetComponentImages(); the BOOL
/// text_only flag is converted to bool (non-zero means true).
struct Boxa *TessBaseAPIGetComponentImages(TessBaseAPI *handle, TessPageIteratorLevel level,
                                           BOOL text_only, struct Pixa **pixa, int **blockids) {
  const bool only_text = (text_only != 0);
  return handle->GetComponentImages(level, only_text, pixa, blockids);
}
|
||||
|
||||
/// C API: returns the result of the extended handle->GetComponentImages()
/// overload; the BOOL flags text_only and raw_image are converted to bool
/// (non-zero means true).
struct Boxa *TessBaseAPIGetComponentImages1(TessBaseAPI *handle, const TessPageIteratorLevel level,
                                            const BOOL text_only, const BOOL raw_image,
                                            const int raw_padding, struct Pixa **pixa,
                                            int **blockids, int **paraids) {
  const bool only_text = (text_only != 0);
  const bool use_raw_image = (raw_image != 0);
  return handle->GetComponentImages(level, only_text, use_raw_image, raw_padding, pixa, blockids,
                                    paraids);
}
|
||||
|
||||
int TessBaseAPIGetThresholdedImageScaleFactor(const TessBaseAPI *handle) {
|
||||
return handle->GetThresholdedImageScaleFactor();
|
||||
}
|
||||
|
||||
/// C API: returns the result of handle->AnalyseLayout().
TessPageIterator *TessBaseAPIAnalyseLayout(TessBaseAPI *handle) {
  TessPageIterator *layout = handle->AnalyseLayout();
  return layout;
}
|
||||
|
||||
/// C API: returns the result of handle->Recognize(monitor).
int TessBaseAPIRecognize(TessBaseAPI *handle, ETEXT_DESC *monitor) {
  const int status = handle->Recognize(monitor);
  return status;
}
|
||||
|
||||
/// C API: forwards to handle->ProcessPages() and returns the result
/// converted to int.
BOOL TessBaseAPIProcessPages(TessBaseAPI *handle, const char *filename, const char *retry_config,
                             int timeout_millisec, TessResultRenderer *renderer) {
  const auto processed = handle->ProcessPages(filename, retry_config, timeout_millisec, renderer);
  return static_cast<int>(processed);
}
|
||||
|
||||
/// C API: forwards to handle->ProcessPage() and returns the result
/// converted to int.
BOOL TessBaseAPIProcessPage(TessBaseAPI *handle, struct Pix *pix, int page_index,
                            const char *filename, const char *retry_config, int timeout_millisec,
                            TessResultRenderer *renderer) {
  const auto processed =
      handle->ProcessPage(pix, page_index, filename, retry_config, timeout_millisec, renderer);
  return static_cast<int>(processed);
}
|
||||
|
||||
/// C API: returns the result of handle->GetIterator().
TessResultIterator *TessBaseAPIGetIterator(TessBaseAPI *handle) {
  TessResultIterator *iterator = handle->GetIterator();
  return iterator;
}
|
||||
|
||||
/// C API: returns the result of handle->GetMutableIterator().
TessMutableIterator *TessBaseAPIGetMutableIterator(TessBaseAPI *handle) {
  TessMutableIterator *iterator = handle->GetMutableIterator();
  return iterator;
}
|
||||
|
||||
/// C API: returns the result of handle->GetUTF8Text().
char *TessBaseAPIGetUTF8Text(TessBaseAPI *handle) {
  char *text = handle->GetUTF8Text();
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetHOCRText() called with a null
/// monitor and the given page number.
char *TessBaseAPIGetHOCRText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetHOCRText(nullptr, page_number);
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetAltoText(page_number).
char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetAltoText(page_number);
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetTSVText(page_number).
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetTSVText(page_number);
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetBoxText(page_number).
char *TessBaseAPIGetBoxText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetBoxText(page_number);
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetWordStrBoxText(page_number).
char *TessBaseAPIGetWordStrBoxText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetWordStrBoxText(page_number);
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetLSTMBoxText(page_number).
char *TessBaseAPIGetLSTMBoxText(TessBaseAPI *handle, int page_number) {
  char *text = handle->GetLSTMBoxText(page_number);
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->GetUNLVText().
char *TessBaseAPIGetUNLVText(TessBaseAPI *handle) {
  char *text = handle->GetUNLVText();
  return text;
}
|
||||
|
||||
/// C API: returns the result of handle->MeanTextConf().
int TessBaseAPIMeanTextConf(TessBaseAPI *handle) {
  const int mean_confidence = handle->MeanTextConf();
  return mean_confidence;
}
|
||||
|
||||
/// C API: returns the result of handle->AllWordConfidences().
int *TessBaseAPIAllWordConfidences(TessBaseAPI *handle) {
  int *confidences = handle->AllWordConfidences();
  return confidences;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
BOOL TessBaseAPIAdaptToWordStr(TessBaseAPI *handle, TessPageSegMode mode, const char *wordstr) {
|
||||
return static_cast<int>(handle->AdaptToWordStr(mode, wordstr));
|
||||
}
|
||||
#endif
|
||||
|
||||
/// C API: forwards to handle->Clear().
void TessBaseAPIClear(TessBaseAPI *handle) {
  TessBaseAPI &api = *handle;
  api.Clear();
}
|
||||
|
||||
/// C API: forwards to handle->End().
void TessBaseAPIEnd(TessBaseAPI *handle) {
  TessBaseAPI &api = *handle;
  api.End();
}
|
||||
|
||||
/// C API: returns the result of handle->IsValidWord(word).
int TessBaseAPIIsValidWord(TessBaseAPI *handle, const char *word) {
  const int validity = handle->IsValidWord(word);
  return validity;
}
|
||||
|
||||
/// C API: forwards to handle->GetTextDirection(out_offset, out_slope) and
/// returns the result converted to int.
BOOL TessBaseAPIGetTextDirection(TessBaseAPI *handle, int *out_offset, float *out_slope) {
  const auto found = handle->GetTextDirection(out_offset, out_slope);
  return static_cast<int>(found);
}
|
||||
|
||||
/// C API: returns the result of handle->GetUnichar(unichar_id).
const char *TessBaseAPIGetUnichar(TessBaseAPI *handle, int unichar_id) {
  const char *unichar = handle->GetUnichar(unichar_id);
  return unichar;
}
|
||||
|
||||
/// C API: forwards to handle->set_min_orientation_margin(margin).
void TessBaseAPISetMinOrientationMargin(TessBaseAPI *handle, double margin) {
  TessBaseAPI &api = *handle;
  api.set_min_orientation_margin(margin);
}
|
||||
|
||||
int TessBaseAPINumDawgs(const TessBaseAPI *handle) {
|
||||
return handle->NumDawgs();
|
||||
}
|
||||
|
||||
/// C API: returns the result of handle->oem().
TessOcrEngineMode TessBaseAPIOem(const TessBaseAPI *handle) {
  const TessOcrEngineMode engine_mode = handle->oem();
  return engine_mode;
}
|
||||
|
||||
/// C API: forwards to
/// handle->GetBlockTextOrientations(block_orientation, vertical_writing).
void TessBaseGetBlockTextOrientations(TessBaseAPI *handle, int **block_orientation,
                                      bool **vertical_writing) {
  TessBaseAPI &api = *handle;
  api.GetBlockTextOrientations(block_orientation, vertical_writing);
}
|
||||
|
||||
/// C API: destroys a page iterator with delete.
void TessPageIteratorDelete(TessPageIterator *handle) {
  delete handle;
}
|
||||
|
||||
/// C API: allocates a copy-constructed TessPageIterator from *handle.
TessPageIterator *TessPageIteratorCopy(const TessPageIterator *handle) {
  TessPageIterator *duplicate = new TessPageIterator(*handle);
  return duplicate;
}
|
||||
|
||||
/// C API: forwards to handle->Begin().
void TessPageIteratorBegin(TessPageIterator *handle) {
  TessPageIterator &iterator = *handle;
  iterator.Begin();
}
|
||||
|
||||
/// C API: forwards to handle->Next(level) and returns the result converted
/// to int.
BOOL TessPageIteratorNext(TessPageIterator *handle, TessPageIteratorLevel level) {
  const auto advanced = handle->Next(level);
  return static_cast<int>(advanced);
}
|
||||
|
||||
/// C API: forwards to handle->IsAtBeginningOf(level) and returns the result
/// converted to int.
BOOL TessPageIteratorIsAtBeginningOf(const TessPageIterator *handle, TessPageIteratorLevel level) {
  const auto at_beginning = handle->IsAtBeginningOf(level);
  return static_cast<int>(at_beginning);
}
|
||||
|
||||
/// C API: forwards to handle->IsAtFinalElement(level, element) and returns
/// the result converted to int.
BOOL TessPageIteratorIsAtFinalElement(const TessPageIterator *handle, TessPageIteratorLevel level,
                                      TessPageIteratorLevel element) {
  const auto at_final = handle->IsAtFinalElement(level, element);
  return static_cast<int>(at_final);
}
|
||||
|
||||
/// C API: forwards to handle->BoundingBox(level, left, top, right, bottom)
/// and returns the result converted to int.
BOOL TessPageIteratorBoundingBox(const TessPageIterator *handle, TessPageIteratorLevel level,
                                 int *left, int *top, int *right, int *bottom) {
  const auto has_box = handle->BoundingBox(level, left, top, right, bottom);
  return static_cast<int>(has_box);
}
|
||||
|
||||
/// C API: returns the result of handle->BlockType().
TessPolyBlockType TessPageIteratorBlockType(const TessPageIterator *handle) {
  const TessPolyBlockType block_type = handle->BlockType();
  return block_type;
}
|
||||
|
||||
struct Pix *TessPageIteratorGetBinaryImage(const TessPageIterator *handle,
|
||||
TessPageIteratorLevel level) {
|
||||
return handle->GetBinaryImage(level);
|
||||
}
|
||||
|
||||
struct Pix *TessPageIteratorGetImage(const TessPageIterator *handle, TessPageIteratorLevel level,
|
||||
int padding, struct Pix *original_image, int *left, int *top) {
|
||||
return handle->GetImage(level, padding, original_image, left, top);
|
||||
}
|
||||
|
||||
/// C API: forwards to handle->Baseline(level, x1, y1, x2, y2) and returns
/// the result converted to int.
BOOL TessPageIteratorBaseline(const TessPageIterator *handle, TessPageIteratorLevel level, int *x1,
                              int *y1, int *x2, int *y2) {
  const auto has_baseline = handle->Baseline(level, x1, y1, x2, y2);
  return static_cast<int>(has_baseline);
}
|
||||
|
||||
/// C API: forwards to handle->Orientation() with the four output pointers.
void TessPageIteratorOrientation(TessPageIterator *handle, TessOrientation *orientation,
                                 TessWritingDirection *writing_direction,
                                 TessTextlineOrder *textline_order, float *deskew_angle) {
  TessPageIterator &iterator = *handle;
  iterator.Orientation(orientation, writing_direction, textline_order, deskew_angle);
}
|
||||
|
||||
/// C API: forwards to handle->ParagraphInfo() and converts the two bool
/// results to BOOL.
///
/// @param justification      passed through to ParagraphInfo()
/// @param is_list_item       optional output (may be nullptr)
/// @param is_crown           optional output (may be nullptr)
/// @param first_line_indent  passed through to ParagraphInfo()
void TessPageIteratorParagraphInfo(TessPageIterator *handle,
                                   TessParagraphJustification *justification, BOOL *is_list_item,
                                   BOOL *is_crown, int *first_line_indent) {
  // The flags are copied out unconditionally below, so they must hold a
  // defined value even if ParagraphInfo() returns without writing them.
  bool bool_is_list_item = false;
  bool bool_is_crown = false;
  handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);
  if (is_list_item != nullptr) {
    *is_list_item = static_cast<int>(bool_is_list_item);
  }
  if (is_crown != nullptr) {
    *is_crown = static_cast<int>(bool_is_crown);
  }
}
|
||||
|
||||
/// C API: destroys a result iterator with delete.
void TessResultIteratorDelete(TessResultIterator *handle) {
  delete handle;
}
|
||||
|
||||
/// C API: allocates a copy-constructed TessResultIterator from *handle.
TessResultIterator *TessResultIteratorCopy(const TessResultIterator *handle) {
  TessResultIterator *duplicate = new TessResultIterator(*handle);
  return duplicate;
}
|
||||
|
||||
/// C API: returns the same object converted to a TessPageIterator pointer;
/// no new object is created.
TessPageIterator *TessResultIteratorGetPageIterator(TessResultIterator *handle) {
  TessPageIterator *as_page_iterator = handle;
  return as_page_iterator;
}
|
||||
|
||||
/// C API: returns the same object converted to a const TessPageIterator
/// pointer; no new object is created.
const TessPageIterator *TessResultIteratorGetPageIteratorConst(const TessResultIterator *handle) {
  const TessPageIterator *as_page_iterator = handle;
  return as_page_iterator;
}
|
||||
|
||||
/// C API: allocates a TessChoiceIterator constructed from *handle.
TessChoiceIterator *TessResultIteratorGetChoiceIterator(const TessResultIterator *handle) {
  TessChoiceIterator *choices = new TessChoiceIterator(*handle);
  return choices;
}
|
||||
|
||||
/// C API: forwards to handle->Next(level) and returns the result converted
/// to int.
BOOL TessResultIteratorNext(TessResultIterator *handle, TessPageIteratorLevel level) {
  const auto advanced = handle->Next(level);
  return static_cast<int>(advanced);
}
|
||||
|
||||
char *TessResultIteratorGetUTF8Text(const TessResultIterator *handle, TessPageIteratorLevel level) {
|
||||
return handle->GetUTF8Text(level);
|
||||
}
|
||||
|
||||
float TessResultIteratorConfidence(const TessResultIterator *handle, TessPageIteratorLevel level) {
|
||||
return handle->Confidence(level);
|
||||
}
|
||||
|
||||
const char *TessResultIteratorWordRecognitionLanguage(const TessResultIterator *handle) {
|
||||
return handle->WordRecognitionLanguage();
|
||||
}
|
||||
|
||||
const char *TessResultIteratorWordFontAttributes(const TessResultIterator *handle, BOOL *is_bold,
|
||||
BOOL *is_italic, BOOL *is_underlined,
|
||||
BOOL *is_monospace, BOOL *is_serif,
|
||||
BOOL *is_smallcaps, int *pointsize, int *font_id) {
|
||||
bool bool_is_bold;
|
||||
bool bool_is_italic;
|
||||
bool bool_is_underlined;
|
||||
bool bool_is_monospace;
|
||||
bool bool_is_serif;
|
||||
bool bool_is_smallcaps;
|
||||
const char *ret = handle->WordFontAttributes(&bool_is_bold, &bool_is_italic, &bool_is_underlined,
|
||||
&bool_is_monospace, &bool_is_serif,
|
||||
&bool_is_smallcaps, pointsize, font_id);
|
||||
if (is_bold != nullptr) {
|
||||
*is_bold = static_cast<int>(bool_is_bold);
|
||||
}
|
||||
if (is_italic != nullptr) {
|
||||
*is_italic = static_cast<int>(bool_is_italic);
|
||||
}
|
||||
if (is_underlined != nullptr) {
|
||||
*is_underlined = static_cast<int>(bool_is_underlined);
|
||||
}
|
||||
if (is_monospace != nullptr) {
|
||||
*is_monospace = static_cast<int>(bool_is_monospace);
|
||||
}
|
||||
if (is_serif != nullptr) {
|
||||
*is_serif = static_cast<int>(bool_is_serif);
|
||||
}
|
||||
if (is_smallcaps != nullptr) {
|
||||
*is_smallcaps = static_cast<int>(bool_is_smallcaps);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Forwards to handle->WordIsFromDictionary() and returns the result as an int.
BOOL TessResultIteratorWordIsFromDictionary(const TessResultIterator *handle) {
  const bool from_dictionary = handle->WordIsFromDictionary();
  return static_cast<int>(from_dictionary);
}
|
||||
|
||||
// Forwards to handle->WordIsNumeric() and returns the result as an int.
BOOL TessResultIteratorWordIsNumeric(const TessResultIterator *handle) {
  const bool numeric = handle->WordIsNumeric();
  return static_cast<int>(numeric);
}
|
||||
|
||||
// Forwards to handle->SymbolIsSuperscript() and returns the result as an int.
BOOL TessResultIteratorSymbolIsSuperscript(const TessResultIterator *handle) {
  const bool superscript = handle->SymbolIsSuperscript();
  return static_cast<int>(superscript);
}
|
||||
|
||||
// Forwards to handle->SymbolIsSubscript() and returns the result as an int.
BOOL TessResultIteratorSymbolIsSubscript(const TessResultIterator *handle) {
  const bool subscript = handle->SymbolIsSubscript();
  return static_cast<int>(subscript);
}
|
||||
|
||||
// Forwards to handle->SymbolIsDropcap() and returns the result as an int.
BOOL TessResultIteratorSymbolIsDropcap(const TessResultIterator *handle) {
  const bool dropcap = handle->SymbolIsDropcap();
  return static_cast<int>(dropcap);
}
|
||||
|
||||
// Releases the choice iterator that `handle` points to using `delete`.
void TessChoiceIteratorDelete(TessChoiceIterator *handle) {
  delete handle;
}
|
||||
|
||||
// Forwards to handle->Next() and returns its boolean result as an int.
BOOL TessChoiceIteratorNext(TessChoiceIterator *handle) {
  const bool advanced = handle->Next();
  return static_cast<int>(advanced);
}
|
||||
|
||||
const char *TessChoiceIteratorGetUTF8Text(const TessChoiceIterator *handle) {
|
||||
return handle->GetUTF8Text();
|
||||
}
|
||||
|
||||
float TessChoiceIteratorConfidence(const TessChoiceIterator *handle) {
|
||||
return handle->Confidence();
|
||||
}
|
||||
|
||||
// Allocates a value-initialized ETEXT_DESC with `new` and returns it.
ETEXT_DESC *TessMonitorCreate() {
  ETEXT_DESC *monitor = new ETEXT_DESC();
  return monitor;
}
|
||||
|
||||
// Releases the monitor that `monitor` points to using `delete`.
void TessMonitorDelete(ETEXT_DESC *monitor) {
  delete monitor;
}
|
||||
|
||||
// Stores `cancelFunc` in the monitor's `cancel` field.
void TessMonitorSetCancelFunc(ETEXT_DESC *monitor, TessCancelFunc cancelFunc) {
  monitor->cancel = cancelFunc;
}
|
||||
|
||||
// Stores the opaque pointer `cancelThis` in the monitor's `cancel_this` field.
void TessMonitorSetCancelThis(ETEXT_DESC *monitor, void *cancelThis) {
  monitor->cancel_this = cancelThis;
}
|
||||
|
||||
// Returns the opaque pointer currently held in the monitor's `cancel_this`
// field.
void *TessMonitorGetCancelThis(ETEXT_DESC *monitor) {
  void *cancel_context = monitor->cancel_this;
  return cancel_context;
}
|
||||
|
||||
// Stores `progressFunc` in the monitor's `progress_callback2` field.
void TessMonitorSetProgressFunc(ETEXT_DESC *monitor, TessProgressFunc progressFunc) {
  monitor->progress_callback2 = progressFunc;
}
|
||||
|
||||
// Returns the current value of the monitor's `progress` field.
int TessMonitorGetProgress(ETEXT_DESC *monitor) {
  const int current_progress = monitor->progress;
  return current_progress;
}
|
||||
|
||||
// Forwards `deadline` to monitor->set_deadline_msecs().
void TessMonitorSetDeadlineMSecs(ETEXT_DESC *monitor, int deadline) {
  monitor->set_deadline_msecs(deadline);
}
|
489
3rdparty/tesseract_ocr/tesseract/src/api/hocrrenderer.cpp
vendored
Normal file
489
3rdparty/tesseract_ocr/tesseract/src/api/hocrrenderer.cpp
vendored
Normal file
@ -0,0 +1,489 @@
|
||||
/**********************************************************************
|
||||
* File: hocrrenderer.cpp
|
||||
* Description: Simple API for calling tesseract.
|
||||
* Author: Ray Smith (original code from baseapi.cpp)
|
||||
* Author: Stefan Weil (moved to separate file and cleaned code)
|
||||
*
|
||||
* (C) Copyright 2006, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <tesseract/baseapi.h> // for TessBaseAPI
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <memory> // for std::unique_ptr
|
||||
#include <sstream> // for std::stringstream
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
#include <tesseract/renderer.h>
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Gets the block orientation at the current iterator position.
|
||||
*/
|
||||
static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
|
||||
tesseract::Orientation orientation;
|
||||
tesseract::WritingDirection writing_direction;
|
||||
tesseract::TextlineOrder textline_order;
|
||||
float deskew_angle;
|
||||
it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
|
||||
return orientation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fits a line to the baseline at the given level, and appends its coefficients
|
||||
* to the hOCR string.
|
||||
* NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
|
||||
* rotated textlines. For this reason, on textlines that are not upright, this
|
||||
* method currently only inserts a 'textangle' property to indicate the rotation
|
||||
* direction and does not add any baseline information to the hocr string.
|
||||
*/
|
||||
static void AddBaselineCoordsTohOCR(const PageIterator *it, PageIteratorLevel level,
|
||||
std::stringstream &hocr_str) {
|
||||
tesseract::Orientation orientation = GetBlockTextOrientation(it);
|
||||
if (orientation != ORIENTATION_PAGE_UP) {
|
||||
hocr_str << "; textangle " << 360 - orientation * 90;
|
||||
return;
|
||||
}
|
||||
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
|
||||
// Try to get the baseline coordinates at this level.
|
||||
int x1, y1, x2, y2;
|
||||
if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
|
||||
return;
|
||||
}
|
||||
// Following the description of this field of the hOCR spec, we convert the
|
||||
// baseline coordinates so that "the bottom left of the bounding box is the
|
||||
// origin".
|
||||
x1 -= left;
|
||||
x2 -= left;
|
||||
y1 -= bottom;
|
||||
y2 -= bottom;
|
||||
|
||||
// Now fit a line through the points so we can extract coefficients for the
|
||||
// equation: y = p1 x + p0
|
||||
if (x1 == x2) {
|
||||
// Problem computing the polynomial coefficients.
|
||||
return;
|
||||
}
|
||||
double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
|
||||
double p0 = y1 - p1 * x1;
|
||||
|
||||
hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " << round(p0 * 1000.0) / 1000.0;
|
||||
}
|
||||
|
||||
static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
|
||||
std::stringstream &hocr_str) {
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
// This is the only place we use double quotes instead of single quotes,
|
||||
// but it may too late to change for consistency
|
||||
hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " << bottom;
|
||||
// Add baseline coordinates & heights for textlines only.
|
||||
if (level == RIL_TEXTLINE) {
|
||||
AddBaselineCoordsTohOCR(it, level, hocr_str);
|
||||
// add custom height measures
|
||||
float row_height, descenders, ascenders; // row attributes
|
||||
it->RowAttributes(&row_height, &descenders, &ascenders);
|
||||
// TODO(rays): Do we want to limit these to a single decimal place?
|
||||
hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders << "; x_ascenders "
|
||||
<< ascenders;
|
||||
}
|
||||
hocr_str << "\">";
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Image name/input_file_ can be set by SetInputName before calling
|
||||
* GetHOCRText
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *TessBaseAPI::GetHOCRText(int page_number) {
|
||||
return GetHOCRText(nullptr, page_number);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Image name/input_file_ can be set by SetInputName before calling
|
||||
* GetHOCRText
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
|
||||
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
||||
bool para_is_ltr = true; // Default direction is LTR
|
||||
const char *paragraph_lang = nullptr;
|
||||
bool font_info = false;
|
||||
bool hocr_boxes = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
|
||||
|
||||
if (input_file_.empty()) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
|
||||
int utf8_len =
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
std::stringstream hocr_str;
|
||||
// Use "C" locale (needed for double values x_size and x_descenders).
|
||||
hocr_str.imbue(std::locale::classic());
|
||||
// Use 8 digits for double values.
|
||||
hocr_str.precision(8);
|
||||
hocr_str << " <div class='ocr_page'"
|
||||
<< " id='"
|
||||
<< "page_" << page_id << "'"
|
||||
<< " title='image \"";
|
||||
if (!input_file_.empty()) {
|
||||
hocr_str << HOcrEscape(input_file_.c_str());
|
||||
} else {
|
||||
hocr_str << "unknown";
|
||||
}
|
||||
hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " << rect_width_ << " "
|
||||
<< rect_height_ << "; ppageno " << page_number << "'>\n";
|
||||
|
||||
std::unique_ptr<ResultIterator> res_it(GetIterator());
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Open any new block/paragraph/textline.
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
para_is_ltr = true; // reset to default direction
|
||||
hocr_str << " <div class='ocr_carea'"
|
||||
<< " id='"
|
||||
<< "block_" << page_id << "_" << bcnt << "'";
|
||||
AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
hocr_str << "\n <p class='ocr_par'";
|
||||
para_is_ltr = res_it->ParagraphIsLtr();
|
||||
if (!para_is_ltr) {
|
||||
hocr_str << " dir='rtl'";
|
||||
}
|
||||
hocr_str << " id='"
|
||||
<< "par_" << page_id << "_" << pcnt << "'";
|
||||
paragraph_lang = res_it->WordRecognitionLanguage();
|
||||
if (paragraph_lang) {
|
||||
hocr_str << " lang='" << paragraph_lang << "'";
|
||||
}
|
||||
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
hocr_str << "\n <span class='";
|
||||
switch (res_it->BlockType()) {
|
||||
case PT_HEADING_TEXT:
|
||||
hocr_str << "ocr_header";
|
||||
break;
|
||||
case PT_PULLOUT_TEXT:
|
||||
hocr_str << "ocr_textfloat";
|
||||
break;
|
||||
case PT_CAPTION_TEXT:
|
||||
hocr_str << "ocr_caption";
|
||||
break;
|
||||
default:
|
||||
hocr_str << "ocr_line";
|
||||
}
|
||||
hocr_str << "' id='"
|
||||
<< "line_" << page_id << "_" << lcnt << "'";
|
||||
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
|
||||
}
|
||||
|
||||
// Now, process the word...
|
||||
int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
|
||||
std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *rawTimestepMap = nullptr;
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
|
||||
if (lstm_choice_mode) {
|
||||
CTCMap = res_it->GetBestLSTMSymbolChoices();
|
||||
rawTimestepMap = res_it->GetRawLSTMTimesteps();
|
||||
}
|
||||
hocr_str << "\n <span class='ocrx_word'"
|
||||
<< " id='"
|
||||
<< "word_" << page_id << "_" << wcnt << "'";
|
||||
int left, top, right, bottom;
|
||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||
int pointsize, font_id;
|
||||
const char *font_name;
|
||||
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
||||
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
|
||||
&smallcaps, &pointsize, &font_id);
|
||||
hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom
|
||||
<< "; x_wconf " << static_cast<int>(res_it->Confidence(RIL_WORD));
|
||||
if (font_info) {
|
||||
if (font_name) {
|
||||
hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
|
||||
}
|
||||
hocr_str << "; x_fsize " << pointsize;
|
||||
}
|
||||
hocr_str << "'";
|
||||
const char *lang = res_it->WordRecognitionLanguage();
|
||||
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
|
||||
hocr_str << " lang='" << lang << "'";
|
||||
}
|
||||
switch (res_it->WordDirection()) {
|
||||
// Only emit direction if different from current paragraph direction
|
||||
case DIR_LEFT_TO_RIGHT:
|
||||
if (!para_is_ltr) {
|
||||
hocr_str << " dir='ltr'";
|
||||
}
|
||||
break;
|
||||
case DIR_RIGHT_TO_LEFT:
|
||||
if (para_is_ltr) {
|
||||
hocr_str << " dir='rtl'";
|
||||
}
|
||||
break;
|
||||
case DIR_MIX:
|
||||
case DIR_NEUTRAL:
|
||||
default: // Do nothing.
|
||||
break;
|
||||
}
|
||||
hocr_str << ">";
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
if (bold) {
|
||||
hocr_str << "<strong>";
|
||||
}
|
||||
if (italic) {
|
||||
hocr_str << "<em>";
|
||||
}
|
||||
do {
|
||||
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " << left << " " << top
|
||||
<< " " << right << " " << bottom << "; x_conf " << res_it->Confidence(RIL_SYMBOL)
|
||||
<< "'>";
|
||||
}
|
||||
hocr_str << HOcrEscape(grapheme.get()).c_str();
|
||||
if (hocr_boxes) {
|
||||
hocr_str << "</span>";
|
||||
tesseract::ChoiceIterator ci(*res_it);
|
||||
if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
|
||||
std::vector<std::vector<std::pair<const char *, float>>> *symbol = ci.Timesteps();
|
||||
hocr_str << "\n <span class='ocr_symbol'"
|
||||
<< " id='"
|
||||
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
|
||||
for (auto timestep : *symbol) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
for (auto conf : timestep) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << int(conf.second * 100) << "'>"
|
||||
<< HOcrEscape(conf.first).c_str() << "</span>";
|
||||
++ccnt;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
++tcnt;
|
||||
}
|
||||
hocr_str << "\n </span>";
|
||||
++scnt;
|
||||
} else if (lstm_choice_mode == 2) {
|
||||
tesseract::ChoiceIterator ci(*res_it);
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
do {
|
||||
const char *choice = ci.GetUTF8Text();
|
||||
float choiceconf = ci.Confidence();
|
||||
if (choice != nullptr) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << choiceconf << "'>" << HOcrEscape(choice).c_str()
|
||||
<< "</span>";
|
||||
ccnt++;
|
||||
}
|
||||
} while (ci.Next());
|
||||
hocr_str << "\n </span>";
|
||||
tcnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
if (italic) {
|
||||
hocr_str << "</em>";
|
||||
}
|
||||
if (bold) {
|
||||
hocr_str << "</strong>";
|
||||
}
|
||||
// If the lstm choice mode is required it is added here
|
||||
if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
|
||||
for (auto symbol : *rawTimestepMap) {
|
||||
hocr_str << "\n <span class='ocr_symbol'"
|
||||
<< " id='"
|
||||
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
|
||||
for (auto timestep : symbol) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "timestep" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
for (auto conf : timestep) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << int(conf.second * 100) << "'>"
|
||||
<< HOcrEscape(conf.first).c_str() << "</span>";
|
||||
++ccnt;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
++tcnt;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
++scnt;
|
||||
}
|
||||
} else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
|
||||
for (auto timestep : *CTCMap) {
|
||||
if (timestep.size() > 0) {
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
|
||||
for (auto &j : timestep) {
|
||||
float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
|
||||
if (conf < 0.0f) {
|
||||
conf = 0.0f;
|
||||
}
|
||||
if (conf > 100.0f) {
|
||||
conf = 100.0f;
|
||||
}
|
||||
hocr_str << "\n <span class='ocrx_cinfo'"
|
||||
<< " id='"
|
||||
<< "choice_" << page_id << "_" << wcnt << "_" << ccnt << "'"
|
||||
<< " title='x_confs " << conf << "'>" << HOcrEscape(j.first).c_str()
|
||||
<< "</span>";
|
||||
ccnt++;
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
tcnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Close ocrx_word.
|
||||
if (hocr_boxes || lstm_choice_mode > 0) {
|
||||
hocr_str << "\n ";
|
||||
}
|
||||
hocr_str << "</span>";
|
||||
tcnt = 1;
|
||||
ccnt = 1;
|
||||
wcnt++;
|
||||
// Close any ending block/paragraph/textline.
|
||||
if (last_word_in_line) {
|
||||
hocr_str << "\n </span>";
|
||||
lcnt++;
|
||||
}
|
||||
if (last_word_in_para) {
|
||||
hocr_str << "\n </p>\n";
|
||||
pcnt++;
|
||||
para_is_ltr = true; // back to default direction
|
||||
}
|
||||
if (last_word_in_block) {
|
||||
hocr_str << " </div>\n";
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
hocr_str << " </div>\n";
|
||||
|
||||
const std::string &text = hocr_str.str();
|
||||
char *result = new char[text.length() + 1];
|
||||
strcpy(result, text.c_str());
|
||||
return result;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* HOcr Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates an hOCR renderer with font information disabled; same as calling
// the two-argument constructor with font_info == false.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessHOcrRenderer(outputbase, false) {}
|
||||
|
||||
// Creates an hOCR renderer writing to `outputbase` with the "hocr" extension.
// `font_info` is remembered in font_info_ and later decides whether the
// extra capabilities are advertised in the document header.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}
|
||||
|
||||
bool TessHOcrRenderer::BeginDocumentHandler() {
|
||||
AppendString(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
|
||||
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
|
||||
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
|
||||
"lang=\"en\">\n <head>\n <title>");
|
||||
AppendString(title());
|
||||
AppendString(
|
||||
"</title>\n"
|
||||
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
|
||||
"charset=utf-8\"/>\n"
|
||||
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
|
||||
"' />\n"
|
||||
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
|
||||
" ocr_line ocrx_word ocrp_wconf");
|
||||
if (font_info_) {
|
||||
AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
|
||||
}
|
||||
AppendString(
|
||||
"'/>\n"
|
||||
" </head>\n"
|
||||
" <body>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Writes the closing </body> and </html> tags. Always returns true.
bool TessHOcrRenderer::EndDocumentHandler() {
  AppendString(" </body>\n</html>\n");
  return true;
}
|
||||
|
||||
// Renders the current image as hOCR and appends it to the output.
// Returns false when the API produced no text (nullptr), true otherwise.
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
  // The API returns a new[]-allocated string; unique_ptr<char[]> frees it.
  const std::unique_ptr<const char[]> page_markup(api->GetHOCRText(imagenum()));
  if (!page_markup) {
    return false;
  }
  AppendString(page_markup.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract
|
107
3rdparty/tesseract_ocr/tesseract/src/api/lstmboxrenderer.cpp
vendored
Normal file
107
3rdparty/tesseract_ocr/tesseract/src/api/lstmboxrenderer.cpp
vendored
Normal file
@ -0,0 +1,107 @@
|
||||
/**********************************************************************
|
||||
* File: lstmboxrenderer.cpp
|
||||
* Description: Renderer for creating box file for LSTM training.
|
||||
* based on the tsv renderer.
|
||||
*
|
||||
* (C) Copyright 2019, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <tesseract/baseapi.h> // for TessBaseAPI
|
||||
#include <tesseract/renderer.h>
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Create a UTF8 box file for LSTM training from the internal data structures.
|
||||
* page_number is a 0-base page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
// Appends the trailing fields of one box-file row to `text`, each preceded by
// a single space: image_height - bottom, right + 5, image_height - top and
// page_num. The leading field of the row is written by the caller.
static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
                         std::string &text) {
  const int fields[] = {image_height - bottom, right + 5, image_height - top, page_num};
  for (const int field : fields) {
    text += " " + std::to_string(field);
  }
}
|
||||
|
||||
char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::string lstm_box_str;
|
||||
bool first_word = true;
|
||||
int left = 0, top = 0, right = 0, bottom = 0;
|
||||
|
||||
LTRResultIterator *res_it = GetLTRIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_SYMBOL)) {
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
continue;
|
||||
}
|
||||
if (!first_word) {
|
||||
if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
|
||||
if (res_it->IsAtBeginningOf(RIL_WORD)) {
|
||||
lstm_box_str += " " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for word
|
||||
} // word
|
||||
} else {
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
lstm_box_str += "\t " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for line
|
||||
} // line
|
||||
}
|
||||
} // not first word
|
||||
first_word = false;
|
||||
// Use bounding box for whole line for everything
|
||||
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||
do {
|
||||
lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
|
||||
lstm_box_str += " " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for symbol
|
||||
}
|
||||
if (!first_word) { // if first_word is true => empty page
|
||||
lstm_box_str += "\t " + std::to_string(left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of PAGE
|
||||
}
|
||||
char *ret = new char[lstm_box_str.length() + 1];
|
||||
strcpy(ret, lstm_box_str.c_str());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* LSTMBox Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Creates a renderer writing to `outputbase` with the "box" extension.
TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {
}
|
||||
|
||||
// Renders the current image as LSTM box text and appends it to the output.
// Returns false when the API produced no text (nullptr), true otherwise.
bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  // The API returns a new[]-allocated string; unique_ptr<char[]> frees it.
  const std::unique_ptr<const char[]> box_text(api->GetLSTMBoxText(imagenum()));
  if (!box_text) {
    return false;
  }
  AppendString(box_text.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract.
|
63
3rdparty/tesseract_ocr/tesseract/src/api/pdf_ttf.h
vendored
Normal file
63
3rdparty/tesseract_ocr/tesseract/src/api/pdf_ttf.h
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pdf_ttf.h
|
||||
// Description: pdf.ttf (GlyphLessFont) replacement.
|
||||
// Generated with: "bin2cpp pdf.ttf pdf_ttf cpp17"
|
||||
// Author: Zdenko Podobny
|
||||
//
|
||||
// (C) Copyright 2020, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef pdf_ttf__H
|
||||
#define pdf_ttf__H
|
||||
|
||||
#include <cstdint> // uint8_t
|
||||
|
||||
static const uint8_t pdf_ttf[] = {
|
||||
0x0, 0x1, 0x0, 0x0, 0x0, 0xa, 0x0, 0x80, 0x0, 0x3, 0x0, 0x20, 0x4f, 0x53, 0x2f, 0x32,
|
||||
0x56, 0xde, 0xc8, 0x94, 0x0, 0x0, 0x1, 0x28, 0x0, 0x0, 0x0, 0x60, 0x63, 0x6d, 0x61, 0x70,
|
||||
0x0, 0xa, 0x0, 0x34, 0x0, 0x0, 0x1, 0x90, 0x0, 0x0, 0x0, 0x1e, 0x67, 0x6c, 0x79, 0x66,
|
||||
0x15, 0x22, 0x41, 0x24, 0x0, 0x0, 0x1, 0xb8, 0x0, 0x0, 0x0, 0x18, 0x68, 0x65, 0x61, 0x64,
|
||||
0xb, 0x78, 0xf1, 0x65, 0x0, 0x0, 0x0, 0xac, 0x0, 0x0, 0x0, 0x36, 0x68, 0x68, 0x65, 0x61,
|
||||
0xc, 0x2, 0x4, 0x2, 0x0, 0x0, 0x0, 0xe4, 0x0, 0x0, 0x0, 0x24, 0x68, 0x6d, 0x74, 0x78,
|
||||
0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x88, 0x0, 0x0, 0x0, 0x8, 0x6c, 0x6f, 0x63, 0x61,
|
||||
0x0, 0xc, 0x0, 0x0, 0x0, 0x0, 0x1, 0xb0, 0x0, 0x0, 0x0, 0x6, 0x6d, 0x61, 0x78, 0x70,
|
||||
0x0, 0x4, 0x0, 0x5, 0x0, 0x0, 0x1, 0x8, 0x0, 0x0, 0x0, 0x20, 0x6e, 0x61, 0x6d, 0x65,
|
||||
0xf2, 0xeb, 0x16, 0xda, 0x0, 0x0, 0x1, 0xd0, 0x0, 0x0, 0x0, 0x4b, 0x70, 0x6f, 0x73, 0x74,
|
||||
0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x2, 0x1c, 0x0, 0x0, 0x0, 0x20, 0x0, 0x1, 0x0, 0x0,
|
||||
0x0, 0x1, 0x0, 0x0, 0xb0, 0x94, 0x71, 0x10, 0x5f, 0xf, 0x3c, 0xf5, 0x4, 0x7, 0x8, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0xcf, 0x9a, 0xfc, 0x6e, 0x0, 0x0, 0x0, 0x0, 0xd4, 0xc3, 0xa7, 0xf2,
|
||||
0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, 0x2, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x8, 0x0, 0xff, 0xff, 0x0, 0x0, 0x4, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, 0x2, 0x0, 0x4,
|
||||
0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x1, 0x90, 0x0, 0x5,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x47, 0x4f, 0x4f, 0x47, 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0xff, 0xff,
|
||||
0x0, 0x0, 0x0, 0x1, 0x0, 0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x14, 0x0, 0x3, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x14, 0x0, 0x6, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0xc, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0,
|
||||
0x8, 0x0, 0x0, 0x3, 0x0, 0x0, 0x31, 0x21, 0x11, 0x21, 0x4, 0x0, 0xfc, 0x0, 0x8, 0x0,
|
||||
0x0, 0x0, 0x0, 0x3, 0x0, 0x2a, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x5, 0x0, 0x16,
|
||||
0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0xb, 0x0, 0x16, 0x0, 0x3,
|
||||
0x0, 0x1, 0x4, 0x9, 0x0, 0x5, 0x0, 0x16, 0x0, 0x0, 0x0, 0x56, 0x0, 0x65, 0x0, 0x72,
|
||||
0x0, 0x73, 0x0, 0x69, 0x0, 0x6f, 0x0, 0x6e, 0x0, 0x20, 0x0, 0x31, 0x0, 0x2e, 0x0, 0x30,
|
||||
0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x30, 0x0, 0x0, 0x1, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
|
||||
|
||||
#endif
|
969
3rdparty/tesseract_ocr/tesseract/src/api/pdfrenderer.cpp
vendored
Normal file
969
3rdparty/tesseract_ocr/tesseract/src/api/pdfrenderer.cpp
vendored
Normal file
@ -0,0 +1,969 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: pdfrenderer.cpp
|
||||
// Description: PDF rendering interface to inject into TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
|
||||
#include "pdf_ttf.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/renderer.h>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <fstream> // for std::ifstream
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <sstream> // for std::stringstream
|
||||
#include "helpers.h" // for Swap
|
||||
|
||||
/*
|
||||
|
||||
Design notes from Ken Sharp, with light editing.
|
||||
|
||||
We think one solution is a font with a single glyph (.notdef) and a
|
||||
CIDToGIDMap which maps all the CIDs to 0. That map would then be
|
||||
stored as a stream in the PDF file, and when flat compressed should
|
||||
be pretty small. The font, of course, will be approximately the same
|
||||
size as the one you currently use.
|
||||
|
||||
I'm working on such a font now, the CIDToGIDMap is trivial, you just
|
||||
create a stream object which contains 128k bytes (2 bytes per possible
|
||||
CID and your CIDs range from 0 to 65535) and where you currently have
|
||||
"/CIDToGIDMap /Identity" you would have "/CIDToGIDMap <object> 0 R".
|
||||
|
||||
Note that if, in future, you were to use a different (ie not 2 byte)
|
||||
CMap for character codes you could trivially extend the CIDToGIDMap.
|
||||
|
||||
The following is an explanation of how some of the font stuff works,
|
||||
this may be too simple for you in which case please accept my
|
||||
apologies, it's hard to know how much knowledge someone has. You can
|
||||
skip all this anyway, it's just for information.
|
||||
|
||||
The font embedded in a PDF file is usually intended just to be
|
||||
rendered, but extensions allow for at least some ability to locate (or
|
||||
copy) text from a document. This isn't something which was an original
|
||||
goal of the PDF format, but it's been retro-fitted, presumably due to
|
||||
popular demand.
|
||||
|
||||
To do this reliably the PDF file must contain a ToUnicode CMap, a
|
||||
device for mapping character codes to Unicode code points. If one of
|
||||
these is present, then this will be used to convert the character
|
||||
codes into Unicode values. If it's not present then the reader will
|
||||
fall back through a series of heuristics to try and guess the
|
||||
result. This is, as you would expect, prone to failure.
|
||||
|
||||
This doesn't concern you of course, since you always write a ToUnicode
|
||||
CMap, so because you are writing the text in text rendering mode 3 it
|
||||
would seem that you don't really need to worry about this, but in the
|
||||
PDF spec you cannot have an isolated ToUnicode CMap, it has to be
|
||||
attached to a font, so in order to get even copy/paste to work you
|
||||
need to define a font.
|
||||
|
||||
This is what leads to problems, tools like pdfwrite assume that they
|
||||
are going to be able to (or even have to) modify the font entries, so
|
||||
they require that the font being embedded be valid, and to be honest
|
||||
the font Tesseract embeds isn't valid (for this purpose).
|
||||
|
||||
|
||||
To see why, let's look at how text is specified in a PDF file:
|
||||
|
||||
(Test) Tj
|
||||
|
||||
Now that looks like text but actually it isn't. Each of those bytes is
|
||||
a 'character code'. When it comes to rendering the text a complex
|
||||
sequence of events takes place, which converts the character code into
|
||||
'something' which the font understands. It's entirely possible via
|
||||
character mappings to have that text render as 'Sftu'
|
||||
|
||||
For simple fonts (PostScript type 1), we use the character code as the
|
||||
index into an Encoding array (256 elements), each element of which is
|
||||
a glyph name, so this gives us a glyph name. We then consult the
|
||||
CharStrings dictionary in the font, that's a complex object which
|
||||
contains pairs of keys and values, you can use the key to retrieve a
|
||||
given value. So we have a glyph name, we then use that as the key to
|
||||
the dictionary and retrieve the associated value. For a type 1 font,
|
||||
the value is a glyph program that describes how to draw the glyph.
|
||||
|
||||
For CIDFonts, it's a little more complicated. Because CIDFonts can be
|
||||
large, using a glyph name as the key is unreasonable (it would also
|
||||
lead to unfeasibly large Encoding arrays), so instead we use a 'CID'
|
||||
as the key. CIDs are just numbers.
|
||||
|
||||
But.... We don't use the character code as the CID. What we do is use
|
||||
a CMap to convert the character code into a CID. We then use the CID
|
||||
to key the CharStrings dictionary and proceed as before. So the 'CMap'
|
||||
is the equivalent of the Encoding array, but it's a more compact and
|
||||
flexible representation.
|
||||
|
||||
Note that you have to use the CMap just to find out how many bytes
|
||||
constitute a character code, and it can be variable. For example you
|
||||
can say if the first byte is 0x00->0x7f then it's just one byte, if it's
|
||||
0x80->0xf0 then it's 2 bytes and if it's 0xf0->0xff then it's 3 bytes. I
|
||||
have seen CMaps defining character codes up to 5 bytes wide.
|
||||
|
||||
Now that's fine for 'PostScript' CIDFonts, but it's not sufficient for
|
||||
TrueType CIDFonts. The thing is that TrueType fonts are accessed using
|
||||
a Glyph ID (GID) (and the LOCA table) which may well not be anything
|
||||
like the CID. So for this case PDF includes a CIDToGIDMap. That maps
|
||||
the CIDs to GIDs, and we can then use the GID to get the glyph
|
||||
description from the GLYF table of the font.
|
||||
|
||||
So for a TrueType CIDFont, character-code->CID->GID->glyf-program.
|
||||
|
||||
Looking at the PDF file I was supplied with we see that it contains
|
||||
text like :
|
||||
|
||||
<0x0075> Tj
|
||||
|
||||
So we start by taking the character code (117) and look it up in the
|
||||
CMap. Well you don't supply a CMap, you just use the Identity-H one
|
||||
which is predefined. So character code 117 maps to CID 117. Then we
|
||||
use the CIDToGIDMap, again you don't supply one, you just use the
|
||||
predefined 'Identity' map. So CID 117 maps to GID 117. But the font we
|
||||
were supplied with only contains 116 glyphs.
|
||||
|
||||
Now for Latin that's not a huge problem, you can just supply a bigger
|
||||
font. But for more complex languages that *is* going to be more of a
|
||||
problem. Either you need to supply a font which contains glyphs for
|
||||
all the possible CID->GID mappings, or we need to think laterally.
|
||||
|
||||
Our solution using a TrueType CIDFont is to intervene at the
|
||||
CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a
|
||||
font with just one glyph, the .notdef glyph at GID 0. This is what I'm
|
||||
looking into now.
|
||||
|
||||
It would also be possible to have a 'PostScript' (ie type 1 outlines)
|
||||
CIDFont which contained 1 glyph, and a CMap which mapped all character
|
||||
codes to CID 0. The effect would be the same.
|
||||
|
||||
It's possible (I haven't checked) that the PostScript CIDFont and
|
||||
associated CMap would be smaller than the TrueType font and associated
|
||||
CIDToGIDMap.
|
||||
|
||||
--- in a followup ---
|
||||
|
||||
OK there is a small problem there, if I use GID 0 then Acrobat gets
|
||||
upset about it and complains it cannot extract the font. If I set the
|
||||
CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
|
||||
mad......
|
||||
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Ratio between the font size and the nominal glyph advance: a 10 pt font
// is treated as having characters that are 5 pt wide.
static constexpr int kCharWidth = 2;

// Size of the scratch buffer that receives one codepoint written the PDF
// way, e.g. "<0063>" for the letter 'c'. A single codepoint must never need
// more bytes than this.
static constexpr int kMaxBytesPerCodepoint = 20;
|
||||
|
||||
/**********************************************************************
|
||||
* PDF Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Constructs a PDF renderer.
//   outputbase - forwarded to the base renderer together with the "pdf"
//                extension.
//   datadir    - directory remembered in datadir_; BeginDocumentHandler()
//                looks for "pdf.ttf" inside it.
//   textonly   - when true the page image is left out of the output and only
//                the text layer is written.
TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
    : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
  // The offset table starts with a single zero entry; every appended object
  // pushes the running total behind it.
  offsets_.push_back(0);
  textonly_ = textonly;
  obj_ = 0;
}
|
||||
|
||||
// Book-keeping half of appending a PDF object, for callers that write the
// object's bytes themselves: records the running byte offset after an object
// of `objectsize` bytes and advances the object counter.
void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) {
  const auto next_offset = objectsize + offsets_.back();
  offsets_.push_back(next_offset);
  ++obj_;
}
|
||||
|
||||
// Appends one complete PDF object given as a NUL-terminated string: the
// object's length is recorded in the offset table, then the text is written.
void TessPDFRenderer::AppendPDFObject(const char *data) {
  const size_t data_length = strlen(data);
  AppendPDFObjectDIY(data_length);
  AppendString(data);
}
|
||||
|
||||
// Rounds `x` to three decimal places so that values streamed into an HOCR
// or PDF file stay short and do not end up in scientific notation.
// A result of negative zero is normalised to plain zero.
static double prec(double x) {
  const double scale = 1000.0;
  const double rounded = std::round(x * scale) / scale;
  // The comparison is true for both +0.0 and -0.0; returning a literal zero
  // strips the sign so "-0" is never printed.
  return (rounded == 0) ? 0 : rounded;
}
|
||||
|
||||
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
//
// The deltas are widened to `long` before they are squared. Squaring in
// `int` and widening only the final sum overflows (undefined behaviour) as
// soon as a delta exceeds roughly 46340.
static long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
|
||||
|
||||
// Viewers like evince can get really confused during copy-paste when
|
||||
// the baseline wanders around. So I've decided to project every word
|
||||
// onto the (straight) line baseline. All numbers are in the native
|
||||
// PDF coordinate system, which has the origin in the bottom left and
|
||||
// the unit is points, which is 1/72 inch. Tesseract reports baselines
|
||||
// left-to-right no matter what the reading order is. We need the
|
||||
// word baseline in reading order, so we do that conversion here. Returns
|
||||
// the word's baseline origin and length.
|
||||
static void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1,
|
||||
int word_x2, int word_y2, int line_x1, int line_y1, int line_x2,
|
||||
int line_y2, double *x0, double *y0, double *length) {
|
||||
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
|
||||
std::swap(word_x1, word_x2);
|
||||
std::swap(word_y1, word_y2);
|
||||
}
|
||||
double word_length;
|
||||
double x, y;
|
||||
{
|
||||
int px = word_x1;
|
||||
int py = word_y1;
|
||||
double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
|
||||
if (l2 == 0) {
|
||||
x = line_x1;
|
||||
y = line_y1;
|
||||
} else {
|
||||
double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2;
|
||||
x = line_x2 + t * (line_x2 - line_x1);
|
||||
y = line_y2 + t * (line_y2 - line_y1);
|
||||
}
|
||||
word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));
|
||||
word_length = word_length * 72.0 / ppi;
|
||||
x = x * 72 / ppi;
|
||||
y = height - (y * 72.0 / ppi);
|
||||
}
|
||||
*x0 = x;
|
||||
*y0 = y;
|
||||
*length = word_length;
|
||||
}
|
||||
|
||||
// Compute coefficients for an affine matrix describing the rotation
|
||||
// of the text. If the text is right-to-left such as Arabic or Hebrew,
|
||||
// we reflect over the Y-axis. This matrix will set the coordinate
|
||||
// system for placing text in the PDF file.
|
||||
//
|
||||
// RTL
|
||||
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
|
||||
// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ]
|
||||
static void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2,
|
||||
double *a, double *b, double *c, double *d) {
|
||||
double theta =
|
||||
atan2(static_cast<double>(line_y1 - line_y2), static_cast<double>(line_x2 - line_x1));
|
||||
*a = cos(theta);
|
||||
*b = sin(theta);
|
||||
*c = -sin(theta);
|
||||
*d = cos(theta);
|
||||
switch (writing_direction) {
|
||||
case WRITING_DIRECTION_RIGHT_TO_LEFT:
|
||||
*a = -*a;
|
||||
*b = -*b;
|
||||
break;
|
||||
case WRITING_DIRECTION_TOP_TO_BOTTOM:
|
||||
// TODO(jbreiden) Consider using the vertical PDF writing mode.
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Some PDF viewers (for example 'Preview' on the Mac) select and highlight
// text better when a baseline is perfectly flat rather than very slightly
// tilted. Small tilts are therefore clipped away. The threshold is large
// enough to absorb noise, yet small enough that lines should not cross each
// other when the whole page is tilted at almost exactly the threshold.
//
// Copies the baseline (x1, y1)-(x2, y2) to the line_* outputs. When the
// rise is below the threshold and the run is above it, both output y values
// are replaced by the midpoint of y1 and y2.
static void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1,
                         int *line_x2, int *line_y2) {
  const int rise = std::abs(y2 - y1) * 72;
  const int run = std::abs(x2 - x1) * 72;
  const int threshold = 2 * ppi;
  const bool nearly_flat = (rise < threshold) && (threshold < run);

  *line_x1 = x1;
  *line_x2 = x2;
  if (nearly_flat) {
    const int flat_y = (y1 + y2) / 2;
    *line_y1 = flat_y;
    *line_y2 = flat_y;
  } else {
    *line_y1 = y1;
    *line_y2 = y2;
  }
}
|
||||
|
||||
static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
|
||||
if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
|
||||
tprintf("Dropping invalid codepoint %d\n", code);
|
||||
return false;
|
||||
}
|
||||
if (code < 0x10000) {
|
||||
snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);
|
||||
} else {
|
||||
int a = code - 0x010000;
|
||||
int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
|
||||
int low_surrogate = (0x03FF & a) + 0xDC00;
|
||||
snprintf(utf16, kMaxBytesPerCodepoint, "%04X%04X", high_surrogate, low_surrogate);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds the (uncompressed) content stream for one page: an optional
// command that paints the page image, followed by one text object per
// block containing the recognized words written in invisible ink.
//
//   api    - source of the resolution and of the result iterator
//   width  - page width in points
//   height - page height in points
//
// Returns a NUL-terminated buffer allocated with new[]; the caller owns it
// and must release it with delete[].
//
// If the API hands back no result iterator, only the image command is
// emitted instead of dereferencing a null pointer.
char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
  double ppi = api->GetSourceYResolution();

  // State carried from word to word. The initial values are arbitrary: the
  // first word of every block overwrites them.
  double old_x = 0.0;
  double old_y = 0.0;
  int old_fontsize = 0;
  tesseract::WritingDirection old_writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
  bool new_block = true;
  int fontsize = 0;
  double a = 1;
  double b = 0;
  double c = 0;
  double d = 1;

  std::stringstream pdf_str;
  // Use "C" locale (needed for double values prec()).
  pdf_str.imbue(std::locale::classic());
  // Use 8 digits for double values.
  pdf_str.precision(8);

  // TODO(jbreiden) This marries the text and image together.
  // Slightly cleaner from an abstraction standpoint if this were to
  // live inside a separate text object.
  pdf_str << "q " << prec(width) << " 0 0 " << prec(height) << " 0 0 cm";
  if (!textonly_) {
    pdf_str << " /Im1 Do";
  }
  pdf_str << " Q\n";

  // Clipped baseline of the text line currently being processed.
  int line_x1 = 0;
  int line_y1 = 0;
  int line_x2 = 0;
  int line_y2 = 0;

  const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator());
  // Guard against a missing iterator (presumably returned when there are no
  // results to iterate over) before touching it.
  while (res_it != nullptr && !res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
      old_fontsize = 0;      // Every block will declare its fontsize
      new_block = true;      // Every block will declare its affine matrix
    }

    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      int x1, y1, x2, y2;
      res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
      ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
    }

    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }

    // Writing direction changes at a per-word granularity
    tesseract::WritingDirection writing_direction;
    {
      tesseract::Orientation orientation;
      tesseract::TextlineOrder textline_order;
      float deskew_angle;
      res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
        switch (res_it->WordDirection()) {
          case DIR_LEFT_TO_RIGHT:
            writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
            break;
          case DIR_RIGHT_TO_LEFT:
            writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
            break;
          default:
            // Any other direction inherits from the previous word.
            writing_direction = old_writing_direction;
        }
      }
    }

    // Where is word origin and how long is it?
    double x, y, word_length;
    {
      int word_x1, word_y1, word_x2, word_y2;
      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
      GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
                      line_y1, line_x2, line_y2, &x, &y, &word_length);
    }

    if (writing_direction != old_writing_direction || new_block) {
      AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
      pdf_str << " " << prec(a) // . This affine matrix
              << " " << prec(b) // . sets the coordinate
              << " " << prec(c) // . system for all
              << " " << prec(d) // . text that follows.
              << " " << prec(x) // .
              << " " << prec(y) // .
              << (" Tm ");      // Place cursor absolutely
      new_block = false;
    } else {
      // Same coordinate system as the previous word: express the move
      // relative to the previous origin, in text space.
      double dx = x - old_x;
      double dy = y - old_y;
      pdf_str << " " << prec(dx * a + dy * b) << " " << prec(dx * c + dy * d)
              << (" Td "); // Relative moveto
    }
    old_x = x;
    old_y = y;
    old_writing_direction = writing_direction;

    // Adjust font size on a per word granularity. Pay attention to
    // fontsize, old_fontsize, and pdf_str. For Arabic, Tesseract has been
    // seen to return a fontsize of zero, so a default is substituted.
    {
      bool bold, italic, underlined, monospace, serif, smallcaps;
      int font_id;
      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
                                 &fontsize, &font_id);
      const int kDefaultFontsize = 8;
      if (fontsize <= 0) {
        fontsize = kDefaultFontsize;
      }
      if (fontsize != old_fontsize) {
        pdf_str << "/f-0-0 " << fontsize << " Tf ";
        old_fontsize = fontsize;
      }
    }

    // These must be sampled before the symbol loop moves the iterator on.
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);

    // Collect the word as hexadecimal UTF-16BE; pdf_word_len counts the
    // codepoints that were kept.
    std::string pdf_word;
    int pdf_word_len = 0;
    do {
      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
      if (grapheme && grapheme[0] != '\0') {
        std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
        char utf16[kMaxBytesPerCodepoint];
        for (char32 code : unicodes) {
          if (CodepointToUtf16be(code, utf16)) {
            pdf_word += utf16;
            pdf_word_len++;
          }
        }
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));

    // Another word follows: append a space (U+0020) to this one.
    if (res_it->IsAtBeginningOf(RIL_WORD)) {
      pdf_word += "0020";
      pdf_word_len++;
    }
    if (word_length > 0 && pdf_word_len > 0) {
      double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
      pdf_str << h_stretch << " Tz" // horizontal stretch
              << " [ <" << pdf_word // UTF-16BE representation
              << "> ] TJ";          // show the text
    }
    if (last_word_in_line) {
      pdf_str << " \n";
    }
    if (last_word_in_block) {
      pdf_str << "ET\n"; // end the text object
    }
  }

  // Hand the text back as a heap buffer owned by the caller.
  const std::string text = pdf_str.str();
  char *result = new char[text.length() + 1];
  strcpy(result, text.c_str());
  return result;
}
|
||||
|
||||
bool TessPDFRenderer::BeginDocumentHandler() {
|
||||
AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
|
||||
|
||||
// CATALOG
|
||||
AppendPDFObject(
|
||||
"1 0 obj\n"
|
||||
"<<\n"
|
||||
" /Type /Catalog\n"
|
||||
" /Pages 2 0 R\n"
|
||||
">>\nendobj\n");
|
||||
|
||||
// We are reserving object #2 for the /Pages
|
||||
// object, which I am going to create and write
|
||||
// at the end of the PDF file.
|
||||
AppendPDFObject("");
|
||||
|
||||
// TYPE0 FONT
|
||||
AppendPDFObject(
|
||||
"3 0 obj\n"
|
||||
"<<\n"
|
||||
" /BaseFont /GlyphLessFont\n"
|
||||
" /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
|
||||
" /Encoding /Identity-H\n"
|
||||
" /Subtype /Type0\n"
|
||||
" /ToUnicode 6 0 R\n" // ToUnicode
|
||||
" /Type /Font\n"
|
||||
">>\n"
|
||||
"endobj\n");
|
||||
|
||||
// CIDFONTTYPE2
|
||||
std::stringstream stream;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
stream.imbue(std::locale::classic());
|
||||
stream << "4 0 obj\n"
|
||||
"<<\n"
|
||||
" /BaseFont /GlyphLessFont\n"
|
||||
" /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
|
||||
" /CIDSystemInfo\n"
|
||||
" <<\n"
|
||||
" /Ordering (Identity)\n"
|
||||
" /Registry (Adobe)\n"
|
||||
" /Supplement 0\n"
|
||||
" >>\n"
|
||||
" /FontDescriptor 7 0 R\n" // Font descriptor
|
||||
" /Subtype /CIDFontType2\n"
|
||||
" /Type /Font\n"
|
||||
" /DW "
|
||||
<< (1000 / kCharWidth)
|
||||
<< "\n"
|
||||
">>\n"
|
||||
"endobj\n";
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
|
||||
// CIDTOGIDMAP
|
||||
const int kCIDToGIDMapSize = 2 * (1 << 16);
|
||||
const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
|
||||
for (int i = 0; i < kCIDToGIDMapSize; i++) {
|
||||
cidtogidmap[i] = (i % 2) ? 1 : 0;
|
||||
}
|
||||
size_t len;
|
||||
unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
|
||||
stream.str("");
|
||||
stream << "5 0 obj\n"
|
||||
"<<\n"
|
||||
" /Length "
|
||||
<< len
|
||||
<< " /Filter /FlateDecode\n"
|
||||
">>\n"
|
||||
"stream\n";
|
||||
AppendString(stream.str().c_str());
|
||||
long objsize = stream.str().size();
|
||||
AppendData(reinterpret_cast<char *>(comp), len);
|
||||
objsize += len;
|
||||
lept_free(comp);
|
||||
const char *endstream_endobj =
|
||||
"endstream\n"
|
||||
"endobj\n";
|
||||
AppendString(endstream_endobj);
|
||||
objsize += strlen(endstream_endobj);
|
||||
AppendPDFObjectDIY(objsize);
|
||||
|
||||
const char stream2[] =
|
||||
"/CIDInit /ProcSet findresource begin\n"
|
||||
"12 dict begin\n"
|
||||
"begincmap\n"
|
||||
"/CIDSystemInfo\n"
|
||||
"<<\n"
|
||||
" /Registry (Adobe)\n"
|
||||
" /Ordering (UCS)\n"
|
||||
" /Supplement 0\n"
|
||||
">> def\n"
|
||||
"/CMapName /Adobe-Identify-UCS def\n"
|
||||
"/CMapType 2 def\n"
|
||||
"1 begincodespacerange\n"
|
||||
"<0000> <FFFF>\n"
|
||||
"endcodespacerange\n"
|
||||
"1 beginbfrange\n"
|
||||
"<0000> <FFFF> <0000>\n"
|
||||
"endbfrange\n"
|
||||
"endcmap\n"
|
||||
"CMapName currentdict /CMap defineresource pop\n"
|
||||
"end\n"
|
||||
"end\n";
|
||||
|
||||
// TOUNICODE
|
||||
stream.str("");
|
||||
stream << "6 0 obj\n"
|
||||
"<< /Length "
|
||||
<< (sizeof(stream2) - 1)
|
||||
<< " >>\n"
|
||||
"stream\n"
|
||||
<< stream2
|
||||
<< "endstream\n"
|
||||
"endobj\n";
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
|
||||
// FONT DESCRIPTOR
|
||||
stream.str("");
|
||||
stream << "7 0 obj\n"
|
||||
"<<\n"
|
||||
" /Ascent 1000\n"
|
||||
" /CapHeight 1000\n"
|
||||
" /Descent -1\n" // Spec says must be negative
|
||||
" /Flags 5\n" // FixedPitch + Symbolic
|
||||
" /FontBBox [ 0 0 "
|
||||
<< (1000 / kCharWidth)
|
||||
<< " 1000 ]\n"
|
||||
" /FontFile2 8 0 R\n"
|
||||
" /FontName /GlyphLessFont\n"
|
||||
" /ItalicAngle 0\n"
|
||||
" /StemV 80\n"
|
||||
" /Type /FontDescriptor\n"
|
||||
">>\n"
|
||||
"endobj\n";
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
|
||||
stream.str("");
|
||||
stream << datadir_.c_str() << "/pdf.ttf";
|
||||
const uint8_t *font;
|
||||
std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
|
||||
std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
|
||||
auto size = buffer.size();
|
||||
if (size) {
|
||||
font = buffer.data();
|
||||
} else {
|
||||
#if !defined(NDEBUG)
|
||||
tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
|
||||
#endif
|
||||
font = pdf_ttf;
|
||||
size = sizeof(pdf_ttf);
|
||||
}
|
||||
|
||||
// FONTFILE2
|
||||
stream.str("");
|
||||
stream << "8 0 obj\n"
|
||||
"<<\n"
|
||||
" /Length "
|
||||
<< size
|
||||
<< "\n"
|
||||
" /Length1 "
|
||||
<< size
|
||||
<< "\n"
|
||||
">>\n"
|
||||
"stream\n";
|
||||
AppendString(stream.str().c_str());
|
||||
objsize = stream.str().size();
|
||||
AppendData(reinterpret_cast<const char *>(font), size);
|
||||
objsize += size;
|
||||
AppendString(endstream_endobj);
|
||||
objsize += strlen(endstream_endobj);
|
||||
AppendPDFObjectDIY(objsize);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Serialises an image as a complete PDF image XObject.
//
//   pix             - the image (may be null if `filename` is given)
//   filename        - path of the image file (may be null if `pix` is given)
//   objnum          - object number written into the object header
//   pdf_object      - receives a buffer allocated with new[] holding the
//                     object bytes; the caller releases it with delete[]
//   pdf_object_size - receives the number of bytes in *pdf_object
//   jpg_quality     - passed through to l_generateCIDataForPdf()
//
// Returns false, with *pdf_object == nullptr and *pdf_object_size == 0,
// when an output pointer is missing, when neither image source is given,
// when the compressed data cannot be produced, or when the encoding or the
// number of samples per pixel is not one handled below.
bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int objnum,
                                    char **pdf_object, long int *pdf_object_size,
                                    const int jpg_quality) {
  if (pdf_object == nullptr || pdf_object_size == nullptr) {
    return false;
  }
  *pdf_object = nullptr;
  *pdf_object_size = 0;
  if (filename == nullptr && pix == nullptr) {
    return false;
  }

  // PNG input is flate-encoded from the pix; anything else, or a PNG for
  // which that produced nothing, goes through the generic PDF path.
  L_Compressed_Data *cid = nullptr;
  int status = 0;
  if (pixGetInputFormat(pix) == IFF_PNG) {
    status = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
  }
  if (cid == nullptr) {
    status = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
  }
  if (status != 0 || cid == nullptr) {
    l_CIDataDestroy(&cid);
    return false;
  }

  // Map the encoding onto the PDF stream filter; Group 4 additionally
  // needs a /K entry in the decode parameters.
  const char *filter;
  const char *group4 = "";
  switch (cid->type) {
    case L_FLATE_ENCODE:
      filter = "/FlateDecode";
      break;
    case L_JPEG_ENCODE:
      filter = "/DCTDecode";
      break;
    case L_G4_ENCODE:
      filter = "/CCITTFaxDecode";
      group4 = " /K -1\n";
      break;
    case L_JP2K_ENCODE:
      filter = "/JPXDecode";
      break;
    default:
      l_CIDataDestroy(&cid);
      return false;
  }

  // Maybe someday we will accept RGBA but today is not that day.
  // It requires creating an /SMask for the alpha channel.
  // http://stackoverflow.com/questions/14220221
  std::string colorspace;
  if (cid->ncolors > 0) {
    std::stringstream indexed;
    // Use "C" locale (needed for int values larger than 999).
    indexed.imbue(std::locale::classic());
    indexed << " /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) << " "
            << cid->cmapdatahex << " ]\n";
    colorspace = indexed.str();
  } else if (cid->spp == 1) {
    if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
      colorspace =
          " /ColorSpace /DeviceGray\n"
          " /Decode [1 0]\n";
    } else {
      colorspace = " /ColorSpace /DeviceGray\n";
    }
  } else if (cid->spp == 3) {
    colorspace = " /ColorSpace /DeviceRGB\n";
  } else {
    l_CIDataDestroy(&cid);
    return false;
  }

  const int predictor = (cid->predictor) ? 14 : 1;

  // IMAGE
  // The dictionary is written in two parts so that the colour space entry
  // can be placed between them.
  std::stringstream head;
  // Use "C" locale (needed for int values larger than 999).
  head.imbue(std::locale::classic());
  head << objnum
       << " 0 obj\n"
          "<<\n"
          " /Length "
       << cid->nbytescomp
       << "\n"
          " /Subtype /Image\n";

  std::stringstream dict;
  // Use "C" locale (needed for int values larger than 999).
  dict.imbue(std::locale::classic());
  dict << " /Width " << cid->w
       << "\n"
          " /Height "
       << cid->h
       << "\n"
          " /BitsPerComponent "
       << cid->bps
       << "\n"
          " /Filter "
       << filter
       << "\n"
          " /DecodeParms\n"
          " <<\n"
          " /Predictor "
       << predictor
       << "\n"
          " /Colors "
       << cid->spp << "\n"
       << group4 << " /Columns " << cid->w
       << "\n"
          " /BitsPerComponent "
       << cid->bps
       << "\n"
          " >>\n"
          ">>\n"
          "stream\n";

  const char *trailer =
      "endstream\n"
      "endobj\n";

  const std::string head_text = head.str();
  const std::string dict_text = dict.str();
  const size_t head_len = head_text.size();
  const size_t dict_len = dict_text.size();
  const size_t trailer_len = strlen(trailer);
  const size_t colorspace_len = colorspace.size();

  *pdf_object_size = head_len + colorspace_len + dict_len + cid->nbytescomp + trailer_len;
  *pdf_object = new char[*pdf_object_size];

  // Lay the pieces out back to back: header, colour space, rest of the
  // dictionary, compressed image data, trailer.
  char *cursor = *pdf_object;
  const auto append = [&cursor](const void *bytes, size_t count) {
    memcpy(cursor, bytes, count);
    cursor += count;
  };
  append(head_text.c_str(), head_len);
  append(colorspace.c_str(), colorspace_len);
  append(dict_text.c_str(), dict_len);
  append(cid->datacomp, cid->nbytescomp);
  append(trailer, trailer_len);

  l_CIDataDestroy(&cid);
  return true;
}
|
||||
|
||||
// Appends one page to the PDF: the /Page object, its compressed content
// stream (text layer plus the command that paints the image) and, unless
// the renderer is text-only, the image XObject itself.
//
// Returns false when the API has no input image, when the resolution is
// not positive, when the content stream cannot be compressed, or when the
// image cannot be converted into a PDF object.
bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
  Pix *pix = api->GetInputImage();
  const char *filename = api->GetInputName();
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0) {
    return false;
  }
  // Page size in points (1/72 inch).
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  // The image will be the second object after the page (page, contents,
  // image), hence obj_ + 2.
  std::stringstream xobject;
  // Use "C" locale (needed for int values larger than 999).
  xobject.imbue(std::locale::classic());
  if (!textonly_) {
    xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
  }

  // PAGE
  std::stringstream stream;
  // Use "C" locale (needed for double values width and height).
  stream.imbue(std::locale::classic());
  stream.precision(2);
  stream << std::fixed << obj_
         << " 0 obj\n"
            "<<\n"
            " /Type /Page\n"
            " /Parent 2 0 R\n" // Pages object
            " /MediaBox [0 0 "
         << width << " " << height
         << "]\n"
            " /Contents "
         << (obj_ + 1)
         << " 0 R\n" // Contents object
            " /Resources\n"
            " <<\n"
            " "
         << xobject.str() << // Image object
      " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
      " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
      " >>\n"
      ">>\n"
      "endobj\n";
  // Remember the page's object number before AppendPDFObject() advances
  // the counter.
  pages_.push_back(obj_);
  AppendPDFObject(stream.str().c_str());

  // CONTENTS
  const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
  const size_t pdftext_len = strlen(pdftext.get());
  size_t len = 0;
  unsigned char *comp_pdftext =
      zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
  if (comp_pdftext == nullptr) {
    // Neither the data nor its length is usable if compression failed.
    return false;
  }
  long comp_pdftext_len = len;
  stream.str("");
  stream << obj_
         << " 0 obj\n"
            "<<\n"
            " /Length "
         << comp_pdftext_len
         << " /Filter /FlateDecode\n"
            ">>\n"
            "stream\n";
  AppendString(stream.str().c_str());
  // The object is written in three pieces, so its size is summed by hand
  // and registered once at the end.
  long objsize = stream.str().size();
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);
  const char *b2 =
      "endstream\n"
      "endobj\n";
  AppendString(b2);
  objsize += strlen(b2);
  AppendPDFObjectDIY(objsize);

  if (!textonly_) {
    char *pdf_object = nullptr;
    // Fallback in case the lookup below does not set the value, so that an
    // indeterminate int is never passed on. NOTE(review): 85 is presumed to
    // match Tesseract's own default for jpg_quality - confirm.
    int jpg_quality = 85;
    api->GetIntVariable("jpg_quality", &jpg_quality);
    if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
      return false;
    }
    AppendData(pdf_object, objsize);
    AppendPDFObjectDIY(objsize);
    delete[] pdf_object;
  }
  return true;
}
|
||||
|
||||
bool TessPDFRenderer::EndDocumentHandler() {
|
||||
// We reserved the /Pages object number early, so that the /Page
|
||||
// objects could refer to their parent. We finally have enough
|
||||
// information to go fill it in. Using lower level calls to manipulate
|
||||
// the offset record in two spots, because we are placing objects
|
||||
// out of order in the file.
|
||||
|
||||
// PAGES
|
||||
const long int kPagesObjectNumber = 2;
|
||||
offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
|
||||
std::stringstream stream;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
stream.imbue(std::locale::classic());
|
||||
stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
|
||||
AppendString(stream.str().c_str());
|
||||
size_t pages_objsize = stream.str().size();
|
||||
for (const auto &page : pages_) {
|
||||
stream.str("");
|
||||
stream << page << " 0 R ";
|
||||
AppendString(stream.str().c_str());
|
||||
pages_objsize += stream.str().size();
|
||||
}
|
||||
stream.str("");
|
||||
stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
|
||||
AppendString(stream.str().c_str());
|
||||
pages_objsize += stream.str().size();
|
||||
offsets_.back() += pages_objsize; // manipulation #2
|
||||
|
||||
// INFO
|
||||
std::string utf16_title = "FEFF"; // byte_order_marker
|
||||
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
|
||||
char utf16[kMaxBytesPerCodepoint];
|
||||
for (char32 code : unicodes) {
|
||||
if (CodepointToUtf16be(code, utf16)) {
|
||||
utf16_title += utf16;
|
||||
}
|
||||
}
|
||||
|
||||
char *datestr = l_getFormattedDate();
|
||||
stream.str("");
|
||||
stream << obj_
|
||||
<< " 0 obj\n"
|
||||
"<<\n"
|
||||
" /Producer (Tesseract "
|
||||
<< tesseract::TessBaseAPI::Version()
|
||||
<< ")\n"
|
||||
" /CreationDate (D:"
|
||||
<< datestr
|
||||
<< ")\n"
|
||||
" /Title <"
|
||||
<< utf16_title.c_str()
|
||||
<< ">\n"
|
||||
">>\n"
|
||||
"endobj\n";
|
||||
lept_free(datestr);
|
||||
AppendPDFObject(stream.str().c_str());
|
||||
stream.str("");
|
||||
stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
|
||||
AppendString(stream.str().c_str());
|
||||
for (int i = 1; i < obj_; i++) {
|
||||
stream.str("");
|
||||
stream.width(10);
|
||||
stream.fill('0');
|
||||
stream << offsets_[i] << " 00000 n \n";
|
||||
AppendString(stream.str().c_str());
|
||||
}
|
||||
stream.str("");
|
||||
stream << "trailer\n<<\n /Size " << obj_
|
||||
<< "\n"
|
||||
" /Root 1 0 R\n" // catalog
|
||||
" /Info "
|
||||
<< (obj_ - 1)
|
||||
<< " 0 R\n" // info
|
||||
">>\nstartxref\n"
|
||||
<< offsets_.back() << "\n%%EOF\n";
|
||||
AppendString(stream.str().c_str());
|
||||
return true;
|
||||
}
|
||||
} // namespace tesseract
|
241
3rdparty/tesseract_ocr/tesseract/src/api/renderer.cpp
vendored
Normal file
241
3rdparty/tesseract_ocr/tesseract/src/api/renderer.cpp
vendored
Normal file
@ -0,0 +1,241 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: renderer.cpp
|
||||
// Description: Rendering interface to inject into TessBaseAPI
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include "config_auto.h"
|
||||
#endif
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/renderer.h>
|
||||
#include <cstring>
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <string> // std::string
|
||||
#include "serialis.h" // Serialize
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**********************************************************************
|
||||
* Base Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Set up a renderer that writes to "<outputbase>.<extension>", or to stdout
/// when outputbase is "-" or "stdout". If the file cannot be opened the
/// renderer is marked unhappy.
TessResultRenderer::TessResultRenderer(const char *outputbase, const char *extension)
    : file_extension_(extension)
    , title_("")
    , imagenum_(-1)
    , fout_(stdout)
    , next_(nullptr)
    , happy_(true) {
  const bool to_stdout = strcmp(outputbase, "-") == 0 || strcmp(outputbase, "stdout") == 0;
  if (to_stdout) {
    return;
  }
  const std::string path = std::string(outputbase) + "." + extension;
  fout_ = fopen(path.c_str(), "wb");
  happy_ = (fout_ != nullptr);
}
|
||||
|
||||
/// Close the output file (stdout is only reset with clearerr, never closed)
/// and delete the next renderer in the chain.
TessResultRenderer::~TessResultRenderer() {
  if (fout_ != nullptr && fout_ != stdout) {
    fclose(fout_);
  } else if (fout_ != nullptr) {
    clearerr(fout_);
  }
  delete next_;
}
|
||||
|
||||
/// Splice the chain starting at `next` in directly after this renderer; the
/// renderers that previously followed this one are re-attached to the end of
/// the inserted chain. A null `next` is ignored.
void TessResultRenderer::insert(TessResultRenderer *next) {
  if (!next) {
    return;
  }

  TessResultRenderer *const old_chain = next_;
  next_ = next;
  if (old_chain == nullptr) {
    return;
  }

  // Walk to the last renderer of the inserted chain.
  TessResultRenderer *tail = next;
  for (; tail->next_ != nullptr; tail = tail->next_) {
  }
  tail->next_ = old_chain;
}
|
||||
|
||||
bool TessResultRenderer::BeginDocument(const char *title) {
|
||||
if (!happy_) {
|
||||
return false;
|
||||
}
|
||||
title_ = title;
|
||||
imagenum_ = -1;
|
||||
bool ok = BeginDocumentHandler();
|
||||
if (next_) {
|
||||
ok = next_->BeginDocument(title) && ok;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
/// Advance the image counter and run AddImageHandler on this renderer, then
/// forward the same api to the next renderer in the chain. The chain is always
/// visited; the result is true only if every handler succeeded. Returns false
/// at once if this renderer is already unhappy.
bool TessResultRenderer::AddImage(TessBaseAPI *api) {
  if (!happy_) {
    return false;
  }
  ++imagenum_;
  const bool self_ok = AddImageHandler(api);
  if (next_ == nullptr) {
    return self_ok;
  }
  const bool chain_ok = next_->AddImage(api);
  return chain_ok && self_ok;
}
|
||||
|
||||
bool TessResultRenderer::EndDocument() {
|
||||
if (!happy_) {
|
||||
return false;
|
||||
}
|
||||
bool ok = EndDocumentHandler();
|
||||
if (next_) {
|
||||
ok = next_->EndDocument() && ok;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
/// Append a NUL-terminated string to the output (the terminator itself is not
/// written).
void TessResultRenderer::AppendString(const char *s) {
  const auto length = strlen(s);
  AppendData(s, length);
}
|
||||
|
||||
void TessResultRenderer::AppendData(const char *s, int len) {
|
||||
if (!tesseract::Serialize(fout_, s, len)) {
|
||||
happy_ = false;
|
||||
}
|
||||
fflush(fout_);
|
||||
}
|
||||
|
||||
bool TessResultRenderer::BeginDocumentHandler() {
|
||||
return happy_;
|
||||
}
|
||||
|
||||
bool TessResultRenderer::EndDocumentHandler() {
|
||||
return happy_;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* UTF8 Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Renderer for UTF-8 text; output goes to "<outputbase>.txt".
TessTextRenderer::TessTextRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "txt") {
}
|
||||
|
||||
/// Append the page's UTF-8 text, followed by the "page_separator" variable
/// when it is set and non-empty. Returns false if no text was produced.
bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_text(api->GetUTF8Text());
  if (!page_text) {
    return false;
  }
  AppendString(page_text.get());

  const char *separator = api->GetStringVariable("page_separator");
  const bool has_separator = separator != nullptr && separator[0] != '\0';
  if (has_separator) {
    AppendString(separator);
  }
  return true;
}
|
||||
|
||||
/**********************************************************************
|
||||
* TSV Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Renderer for TSV output ("<outputbase>.tsv") with font info disabled.
TessTsvRenderer::TessTsvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "tsv")
    , font_info_(false) {
}
|
||||
|
||||
/// Renderer for TSV output ("<outputbase>.tsv"); `font_info` is stored in the
/// font_info_ member.
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "tsv")
    , font_info_(font_info) {
}
|
||||
|
||||
/// Write the TSV column heading row. Always returns true.
bool TessTsvRenderer::BeginDocumentHandler() {
  static const char kColumnHeadings[] =
      "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num"
      "\tleft\ttop\twidth\theight"
      "\tconf\ttext\n";
  AppendString(kColumnHeadings);
  return true;
}
|
||||
|
||||
/// TSV output has no trailer, so there is nothing to write here.
bool TessTsvRenderer::EndDocumentHandler() {
  const bool ok = true;
  return ok;
}
|
||||
|
||||
/// Append the TSV text for the current image number. Returns false if the API
/// produced no text.
bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_tsv(api->GetTSVText(imagenum()));
  if (!page_tsv) {
    return false;
  }
  AppendString(page_tsv.get());
  return true;
}
|
||||
|
||||
/**********************************************************************
|
||||
* UNLV Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Renderer for UNLV text; output goes to "<outputbase>.unlv".
TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "unlv") {
}
|
||||
|
||||
/// Append the page's UNLV text. Returns false if the API produced no text.
bool TessUnlvRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_unlv(api->GetUNLVText());
  if (!page_unlv) {
    return false;
  }
  AppendString(page_unlv.get());
  return true;
}
|
||||
|
||||
/**********************************************************************
|
||||
* BoxText Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Renderer for box text; output goes to "<outputbase>.box".
TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {
}
|
||||
|
||||
/// Append the box text for the current image number. Returns false if the API
/// produced no text.
bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_boxes(api->GetBoxText(imagenum()));
  if (!page_boxes) {
    return false;
  }
  AppendString(page_boxes.get());
  return true;
}
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
/**********************************************************************
|
||||
* Osd Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Renderer for OSD text; output goes to "<outputbase>.osd".
TessOsdRenderer::TessOsdRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "osd") {
}
|
||||
|
||||
/// Append the OSD text for the current image number. Returns false if the API
/// produced no text.
bool TessOsdRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_osd(api->GetOsdText(imagenum()));
  if (!page_osd) {
    return false;
  }
  AppendString(page_osd.get());
  return true;
}
|
||||
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
|
||||
} // namespace tesseract
|
106
3rdparty/tesseract_ocr/tesseract/src/api/wordstrboxrenderer.cpp
vendored
Normal file
106
3rdparty/tesseract_ocr/tesseract/src/api/wordstrboxrenderer.cpp
vendored
Normal file
@ -0,0 +1,106 @@
|
||||
/**********************************************************************
|
||||
* File: wordstrboxrenderer.cpp
|
||||
* Description: Renderer for creating box file with WordStr strings.
|
||||
* based on the tsv renderer.
|
||||
*
|
||||
* (C) Copyright 2019, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <tesseract/baseapi.h> // for TessBaseAPI
|
||||
#include <tesseract/renderer.h>
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Create a UTF8 box file with WordStr strings from the internal data
|
||||
* structures. page_number is a 0-base page index that will appear in the box
|
||||
* file. Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
|
||||
char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::string wordstr_box_str;
|
||||
int left = 0, top = 0, right = 0, bottom = 0;
|
||||
|
||||
bool first_line = true;
|
||||
|
||||
LTRResultIterator *res_it = GetLTRIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
if (!first_line) {
|
||||
wordstr_box_str += "\n\t " + std::to_string(right + 1);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
|
||||
wordstr_box_str += " " + std::to_string(right + 5);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - top);
|
||||
wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
|
||||
wordstr_box_str += "\n";
|
||||
} else {
|
||||
first_line = false;
|
||||
}
|
||||
// Use bounding box for whole line for WordStr
|
||||
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||
wordstr_box_str += "WordStr " + std::to_string(left);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
|
||||
wordstr_box_str += " " + std::to_string(right);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - top);
|
||||
wordstr_box_str += " " + std::to_string(page_number); // word
|
||||
wordstr_box_str += " #";
|
||||
}
|
||||
do {
|
||||
wordstr_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
|
||||
wordstr_box_str += " ";
|
||||
res_it->Next(RIL_WORD);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
}
|
||||
|
||||
if (left != 0 && top != 0 && right != 0 && bottom != 0) {
|
||||
wordstr_box_str += "\n\t " + std::to_string(right + 1);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - bottom);
|
||||
wordstr_box_str += " " + std::to_string(right + 5);
|
||||
wordstr_box_str += " " + std::to_string(image_height_ - top);
|
||||
wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
|
||||
wordstr_box_str += "\n";
|
||||
}
|
||||
char *ret = new char[wordstr_box_str.length() + 1];
|
||||
strcpy(ret, wordstr_box_str.c_str());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* WordStrBox Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/// Renderer for WordStr box files; output goes to "<outputbase>.box".
TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {
}
|
||||
|
||||
/// Append the WordStr box text for the current image number. Returns false if
/// the API produced no text.
bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {
  const std::unique_ptr<const char[]> page_boxes(api->GetWordStrBoxText(imagenum()));
  if (!page_boxes) {
    return false;
  }
  AppendString(page_boxes.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract.
|
Reference in New Issue
Block a user