twain3.0/3rdparty/hgOCR/include/ccstruct/blamer.h

334 lines
14 KiB
C++

///////////////////////////////////////////////////////////////////////
// File: blamer.h
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
// Created: Mon Feb 04 14:37:01 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
#define TESSERACT_CCSTRUCT_BLAMER_H_
#include <stdio.h>
#include "boxword.h"
#include "genericvector.h"
#include "matrix.h"
#include "params_training_featdef.h"
#include "ratngs.h"
#include "strngs.h"
#include "tesscallback.h"
static const inT16 kBlamerBoxTolerance = 5;
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
// The text recorded in best choice == truth text
IRR_CORRECT,
// Either: Top choice is incorrect and is a dictionary word (language model
// is unlikely to help correct such errors, so blame the classifier).
// Or: the correct unichar was not included in shortlist produced by the
// classifier at all.
IRR_CLASSIFIER,
// Chopper have not found one or more splits that correspond to the correct
// character bounding boxes recorded in BlamerBundle::truth_word.
IRR_CHOPPER,
// Classifier did include correct unichars for each blob in the correct
// segmentation, however its rating could have been too bad to allow the
// language model to pull out the correct choice. On the other hand the
// strength of the language model might have been too weak to favor the
// correct answer, this we call this case a classifier-language model
// tradeoff error.
IRR_CLASS_LM_TRADEOFF,
// Page layout failed to produce the correct bounding box. Blame page layout
// if the truth was not found for the word, which implies that the bounding
// box of the word was incorrect (no truth word had a similar bounding box).
IRR_PAGE_LAYOUT,
// SegSearch heuristic prevented one or more blobs from the correct
// segmentation state to be classified (e.g. the blob was too wide).
IRR_SEGSEARCH_HEUR,
// The correct segmentaiton state was not explored because of poor SegSearch
// pain point prioritization. We blame SegSearch pain point prioritization
// if the best rating of a choice constructed from correct segmentation is
// better than that of the best choice (i.e. if we got to explore the correct
// segmentation state, language model would have picked the correct choice).
IRR_SEGSEARCH_PP,
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
// and thus use the old language model (permuters).
// TODO(antonova): integrate the new language mode with chopper
IRR_CLASS_OLD_LM_TRADEOFF,
// If there is an incorrect adaptive template match with a better score than
// a correct one (either pre-trained or adapted), mark this as adaption error.
IRR_ADAPTION,
// split_and_recog_word() failed to find a suitable split in truth.
IRR_NO_TRUTH_SPLIT,
// Truth is not available for this word (e.g. when words in corrected content
// file are turned into ~~~~ because an appropriate alignment was not found.
IRR_NO_TRUTH,
// The text recorded in best choice != truth text, but none of the above
// reasons are set.
IRR_UNKNOWN,
IRR_NUM_REASONS
};
// Blamer-related information to determine the source of errors.
struct BlamerBundle {
static const char *IncorrectReasonName(IncorrectResultReason irr);
BlamerBundle() : truth_has_char_boxes_(false),
incorrect_result_reason_(IRR_CORRECT),
lattice_data_(NULL) {
ClearResults();
}
BlamerBundle(const BlamerBundle &other) {
this->CopyTruth(other);
this->CopyResults(other);
}
~BlamerBundle() { delete[] lattice_data_; }
// Accessors.
STRING TruthString() const {
STRING truth_str;
for (int i = 0; i < truth_text_.length(); ++i)
truth_str += truth_text_[i];
return truth_str;
}
IncorrectResultReason incorrect_result_reason() const {
return incorrect_result_reason_;
}
bool NoTruth() const {
return incorrect_result_reason_ == IRR_NO_TRUTH ||
incorrect_result_reason_ == IRR_PAGE_LAYOUT;
}
bool HasDebugInfo() const {
return debug_.length() > 0 || misadaption_debug_.length() > 0;
}
const STRING& debug() const {
return debug_;
}
const STRING& misadaption_debug() const {
return misadaption_debug_;
}
void UpdateBestRating(float rating) {
if (rating < best_correctly_segmented_rating_)
best_correctly_segmented_rating_ = rating;
}
int correct_segmentation_length() const {
return correct_segmentation_cols_.length();
}
// Returns true if the given ratings matrix col,row position is included
// in the correct segmentation path at the given index.
bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
return correct_segmentation_cols_[index] == coord.col &&
correct_segmentation_rows_[index] == coord.row;
}
void set_best_choice_is_dict_and_top_choice(bool value) {
best_choice_is_dict_and_top_choice_ = value;
}
const char* lattice_data() const {
return lattice_data_;
}
int lattice_size() const {
return lattice_size_; // size of lattice_data in bytes
}
void set_lattice_data(const char* data, int size) {
lattice_size_ = size;
delete[] lattice_data_;
lattice_data_ = new char[lattice_size_];
memcpy(lattice_data_, data, lattice_size_);
}
const tesseract::ParamsTrainingBundle& params_training_bundle() const {
return params_training_bundle_;
}
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
params_training_bundle_.AddHypothesis(hypo);
}
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void SetWordTruth(const UNICHARSET& unicharset,
const char* truth_str, const TBOX& word_box);
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void SetSymbolTruth(const UNICHARSET& unicharset,
const char* char_str, const TBOX& char_box);
// Marks that there is something wrong with the truth text, like it contains
// reject characters.
void SetRejectedTruth();
// Returns true if the provided word_choice is correct.
bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
void ClearResults() {
norm_truth_word_.DeleteAllBoxes();
norm_box_tolerance_ = 0;
if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
debug_ = "";
segsearch_is_looking_for_blame_ = false;
best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
correct_segmentation_cols_.clear();
correct_segmentation_rows_.clear();
best_choice_is_dict_and_top_choice_ = false;
delete[] lattice_data_;
lattice_data_ = NULL;
lattice_size_ = 0;
}
void CopyTruth(const BlamerBundle &other) {
truth_has_char_boxes_ = other.truth_has_char_boxes_;
truth_word_ = other.truth_word_;
truth_text_ = other.truth_text_;
incorrect_result_reason_ =
(other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
}
void CopyResults(const BlamerBundle &other) {
norm_truth_word_ = other.norm_truth_word_;
norm_box_tolerance_ = other.norm_box_tolerance_;
incorrect_result_reason_ = other.incorrect_result_reason_;
segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
correct_segmentation_cols_ = other.correct_segmentation_cols_;
correct_segmentation_rows_ = other.correct_segmentation_rows_;
best_choice_is_dict_and_top_choice_ =
other.best_choice_is_dict_and_top_choice_;
if (other.lattice_data_ != NULL) {
lattice_data_ = new char[other.lattice_size_];
memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
lattice_size_ = other.lattice_size_;
}
else {
lattice_data_ = NULL;
}
}
const char *IncorrectReason() const;
// Appends choice and truth details to the given debug string.
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
STRING *debug);
// Sets up the norm_truth_word from truth_word using the given DENORM.
void SetupNormTruthWord(const DENORM& denorm);
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge/ of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void SplitBundle(int word1_right, int word2_left, bool debug,
BlamerBundle* bundle1, BlamerBundle* bundle2) const;
// "Joins" the blames from bundle1 and bundle2 into *this.
void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
bool debug);
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character
// blames character classifier for incorrect answer.
void BlameClassifier(const UNICHARSET& unicharset,
const TBOX& blob_box,
const BLOB_CHOICE_LIST& choices,
bool debug);
// Checks whether chops were made at all the character bounding box
// boundaries in word->truth_word. If not - blames the chopper for an
// incorrect answer.
void SetChopperBlame(const WERD_RES* word, bool debug);
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has been yet set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlameClassifierOrLangModel(
const WERD_RES* word,
const UNICHARSET& unicharset, bool valid_permuter, bool debug);
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
void SetupCorrectSegmentation(const TWERD* word, bool debug);
// Returns true if a guided segmentation search is needed.
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
// Setup ready to guide the segmentation search to the correct segmentation.
// The callback pp_cb is used to avoid a cyclic dependency.
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
// WERD_RES, and the LMPainPoints itself.
// pp_cb must be a permanent callback, and should be deleted by the caller.
void InitForSegSearch(const WERD_CHOICE *best_choice,
MATRIX* ratings, UNICHAR_ID wildcard_id,
bool debug, STRING *debug_str,
TessResultCallback2<bool, int, int>* pp_cb);
// Returns true if the guided segsearch is in progress.
bool GuidedSegsearchStillGoing() const;
// The segmentation search has ended. Sets the blame appropriately.
void FinishSegSearch(const WERD_CHOICE *best_choice,
bool debug, STRING *debug_str);
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
static void LastChanceBlame(bool debug, WERD_RES* word);
// Sets the misadaption debug if this word is incorrect, as this word is
// being adapted to.
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
private:
void SetBlame(IncorrectResultReason irr, const STRING &msg,
const WERD_CHOICE *choice, bool debug) {
incorrect_result_reason_ = irr;
debug_ = IncorrectReason();
debug_ += " to blame: ";
FillDebugString(msg, choice, &debug_);
if (debug) tprintf("SetBlame(): %s", debug_.string());
}
private:
// Set to true when bounding boxes for individual unichars are recorded.
bool truth_has_char_boxes_;
// The true_word (in the original image coordinate space) contains ground
// truth bounding boxes for this WERD_RES.
tesseract::BoxWord truth_word_;
// Same as above, but in normalized coordinates
// (filled in by WERD_RES::SetupForRecognition()).
tesseract::BoxWord norm_truth_word_;
// Tolerance for bounding box comparisons in normalized space.
int norm_box_tolerance_;
// Contains ground truth unichar for each of the bounding boxes in truth_word.
GenericVector<STRING> truth_text_;
// The reason for incorrect OCR result.
IncorrectResultReason incorrect_result_reason_;
// Debug text associated with the blame.
STRING debug_;
// Misadaption debug information (filled in if this word was misadapted to).
STRING misadaption_debug_;
// Variables used by the segmentation search when looking for the blame.
// Set to true while segmentation search is continued after the usual
// termination condition in order to look for the blame.
bool segsearch_is_looking_for_blame_;
// Best rating for correctly segmented path
// (set and used by SegSearch when looking for blame).
float best_correctly_segmented_rating_;
// Vectors populated by SegSearch to indicate column and row indices that
// correspond to blobs with correct bounding boxes.
GenericVector<int> correct_segmentation_cols_;
GenericVector<int> correct_segmentation_rows_;
// Set to true if best choice is a dictionary word and
// classifier's top choice.
bool best_choice_is_dict_and_top_choice_;
// Serialized segmentation search lattice.
char *lattice_data_;
int lattice_size_; // size of lattice_data in bytes
// Information about hypotheses (paths) explored by the segmentation search.
tesseract::ParamsTrainingBundle params_training_bundle_;
};
#endif // TESSERACT_CCSTRUCT_BLAMER_H_