twain3.0/3rdparty/hgOCR/include/ccstruct/blamer.h

///////////////////////////////////////////////////////////////////////
// File:        blamer.h
// Description: Module allowing precise error causes to be allocated.
// Author:      Rike Antonova
// Refactored:  Ray Smith
// Created:     Mon Feb 04 14:37:01 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
#define TESSERACT_CCSTRUCT_BLAMER_H_

#include <stdio.h>
#include "boxword.h"
#include "genericvector.h"
#include "matrix.h"
#include "params_training_featdef.h"
#include "ratngs.h"
#include "strngs.h"
#include "tesscallback.h"

// Box tolerance constant used by the blamer.
// NOTE(review): presumably the slack allowed when comparing a bounding box
// against a truth box; the uses and the units are not visible in this
// header - confirm against blamer.cpp.
static const inT16 kBlamerBoxTolerance = 5;

// Enum for expressing the source of error.
// Each value names one cause that can be blamed for an incorrect OCR result.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
	// The text recorded in best choice == truth text.
	IRR_CORRECT,
	// Either: Top choice is incorrect and is a dictionary word (language model
	// is unlikely to help correct such errors, so blame the classifier).
	// Or: the correct unichar was not included in the shortlist produced by
	// the classifier at all.
	IRR_CLASSIFIER,
	// Chopper has not found one or more splits that correspond to the correct
	// character bounding boxes recorded in BlamerBundle::truth_word.
	IRR_CHOPPER,
	// Classifier did include correct unichars for each blob in the correct
	// segmentation, however its rating could have been too bad to allow the
	// language model to pull out the correct choice. On the other hand the
	// strength of the language model might have been too weak to favor the
	// correct answer, thus we call this case a classifier-language model
	// tradeoff error.
	IRR_CLASS_LM_TRADEOFF,
	// Page layout failed to produce the correct bounding box. Blame page layout
	// if the truth was not found for the word, which implies that the bounding
	// box of the word was incorrect (no truth word had a similar bounding box).
	IRR_PAGE_LAYOUT,
	// SegSearch heuristic prevented one or more blobs from the correct
	// segmentation state to be classified (e.g. the blob was too wide).
	IRR_SEGSEARCH_HEUR,
	// The correct segmentation state was not explored because of poor SegSearch
	// pain point prioritization. We blame SegSearch pain point prioritization
	// if the best rating of a choice constructed from correct segmentation is
	// better than that of the best choice (i.e. if we got to explore the correct
	// segmentation state, language model would have picked the correct choice).
	IRR_SEGSEARCH_PP,
	// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
	// and thus use the old language model (permuters).
	// TODO(antonova): integrate the new language model with chopper
	IRR_CLASS_OLD_LM_TRADEOFF,
	// If there is an incorrect adaptive template match with a better score than
	// a correct one (either pre-trained or adapted), mark this as adaption error.
	IRR_ADAPTION,
	// split_and_recog_word() failed to find a suitable split in truth.
	IRR_NO_TRUTH_SPLIT,
	// Truth is not available for this word (e.g. when words in corrected content
	// file are turned into ~~~~ because an appropriate alignment was not found).
	IRR_NO_TRUTH,
	// The text recorded in best choice != truth text, but none of the above
	// reasons are set.
	IRR_UNKNOWN,

	// Not a reason: as the last enumerator, its value is the number of
	// reasons declared above. Keep it last.
	IRR_NUM_REASONS
};

// Blamer-related information to determine the source of errors.
struct BlamerBundle {
	static const char *IncorrectReasonName(IncorrectResultReason irr);
	BlamerBundle() : truth_has_char_boxes_(false),
		incorrect_result_reason_(IRR_CORRECT),
		lattice_data_(NULL) {
		ClearResults();
	}
	BlamerBundle(const BlamerBundle &other) {
		this->CopyTruth(other);
		this->CopyResults(other);
	}
	~BlamerBundle() { delete[] lattice_data_; }

	// Accessors.
	STRING TruthString() const {
		STRING truth_str;
		for (int i = 0; i < truth_text_.length(); ++i)
			truth_str += truth_text_[i];
		return truth_str;
	}
	IncorrectResultReason incorrect_result_reason() const {
		return incorrect_result_reason_;
	}
	bool NoTruth() const {
		return incorrect_result_reason_ == IRR_NO_TRUTH ||
			incorrect_result_reason_ == IRR_PAGE_LAYOUT;
	}
	bool HasDebugInfo() const {
		return debug_.length() > 0 || misadaption_debug_.length() > 0;
	}
	const STRING& debug() const {
		return debug_;
	}
	const STRING& misadaption_debug() const {
		return misadaption_debug_;
	}
	void UpdateBestRating(float rating) {
		if (rating < best_correctly_segmented_rating_)
			best_correctly_segmented_rating_ = rating;
	}
	int correct_segmentation_length() const {
		return correct_segmentation_cols_.length();
	}
	// Returns true if the given ratings matrix col,row position is included
	// in the correct segmentation path at the given index.
	bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
		return correct_segmentation_cols_[index] == coord.col &&
			correct_segmentation_rows_[index] == coord.row;
	}
	void set_best_choice_is_dict_and_top_choice(bool value) {
		best_choice_is_dict_and_top_choice_ = value;
	}
	const char* lattice_data() const {
		return lattice_data_;
	}
	int lattice_size() const {
		return lattice_size_;  // size of lattice_data in bytes
	}
	void set_lattice_data(const char* data, int size) {
		lattice_size_ = size;
		delete[] lattice_data_;
		lattice_data_ = new char[lattice_size_];
		memcpy(lattice_data_, data, lattice_size_);
	}
	const tesseract::ParamsTrainingBundle& params_training_bundle() const {
		return params_training_bundle_;
	}
	// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
	void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
		params_training_bundle_.AddHypothesis(hypo);
	}

	// Functions to setup the blamer.
	// Whole word string, whole word bounding box.
	void SetWordTruth(const UNICHARSET& unicharset,
		const char* truth_str, const TBOX& word_box);
	// Single "character" string, "character" bounding box.
	// May be called multiple times to indicate the characters in a word.
	void SetSymbolTruth(const UNICHARSET& unicharset,
		const char* char_str, const TBOX& char_box);
	// Marks that there is something wrong with the truth text, like it contains
	// reject characters.
	void SetRejectedTruth();

	// Returns true if the provided word_choice is correct.
	bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;

	void ClearResults() {
		norm_truth_word_.DeleteAllBoxes();
		norm_box_tolerance_ = 0;
		if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
		debug_ = "";
		segsearch_is_looking_for_blame_ = false;
		best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
		correct_segmentation_cols_.clear();
		correct_segmentation_rows_.clear();
		best_choice_is_dict_and_top_choice_ = false;
		delete[] lattice_data_;
		lattice_data_ = NULL;
		lattice_size_ = 0;
	}
	void CopyTruth(const BlamerBundle &other) {
		truth_has_char_boxes_ = other.truth_has_char_boxes_;
		truth_word_ = other.truth_word_;
		truth_text_ = other.truth_text_;
		incorrect_result_reason_ =
			(other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
	}
	void CopyResults(const BlamerBundle &other) {
		norm_truth_word_ = other.norm_truth_word_;
		norm_box_tolerance_ = other.norm_box_tolerance_;
		incorrect_result_reason_ = other.incorrect_result_reason_;
		segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
		best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
		correct_segmentation_cols_ = other.correct_segmentation_cols_;
		correct_segmentation_rows_ = other.correct_segmentation_rows_;
		best_choice_is_dict_and_top_choice_ =
			other.best_choice_is_dict_and_top_choice_;
		if (other.lattice_data_ != NULL) {
			lattice_data_ = new char[other.lattice_size_];
			memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
			lattice_size_ = other.lattice_size_;
		}
		else {
			lattice_data_ = NULL;
		}
	}
	const char *IncorrectReason() const;

	// Appends choice and truth details to the given debug string.
	void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
		STRING *debug);

	// Sets up the norm_truth_word from truth_word using the given DENORM.
	void SetupNormTruthWord(const DENORM& denorm);

	// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
	// bundles) where the right edge/ of the left-hand word is word1_right,
	// and the left edge of the right-hand word is word2_left.
	void SplitBundle(int word1_right, int word2_left, bool debug,
		BlamerBundle* bundle1, BlamerBundle* bundle2) const;
	// "Joins" the blames from bundle1 and bundle2 into *this.
	void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
		bool debug);

	// If a blob with the same bounding box as one of the truth character
	// bounding boxes is not classified as the corresponding truth character
	// blames character classifier for incorrect answer.
	void BlameClassifier(const UNICHARSET& unicharset,
		const TBOX& blob_box,
		const BLOB_CHOICE_LIST& choices,
		bool debug);


	// Checks whether chops were made at all the character bounding box
	// boundaries in word->truth_word. If not - blames the chopper for an
	// incorrect answer.
	void SetChopperBlame(const WERD_RES* word, bool debug);
	// Blames the classifier or the language model if, after running only the
	// chopper, best_choice is incorrect and no blame has been yet set.
	// Blames the classifier if best_choice is classifier's top choice and is a
	// dictionary word (i.e. language model could not have helped).
	// Otherwise, blames the language model (formerly permuter word adjustment).
	void BlameClassifierOrLangModel(
		const WERD_RES* word,
		const UNICHARSET& unicharset, bool valid_permuter, bool debug);
	// Sets up the correct_segmentation_* to mark the correct bounding boxes.
	void SetupCorrectSegmentation(const TWERD* word, bool debug);

	// Returns true if a guided segmentation search is needed.
	bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
	// Setup ready to guide the segmentation search to the correct segmentation.
	// The callback pp_cb is used to avoid a cyclic dependency.
	// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
	// WERD_RES, and the LMPainPoints itself.
	// pp_cb must be a permanent callback, and should be deleted by the caller.
	void InitForSegSearch(const WERD_CHOICE *best_choice,
		MATRIX* ratings, UNICHAR_ID wildcard_id,
		bool debug, STRING *debug_str,
		TessResultCallback2<bool, int, int>* pp_cb);
	// Returns true if the guided segsearch is in progress.
	bool GuidedSegsearchStillGoing() const;
	// The segmentation search has ended. Sets the blame appropriately.
	void FinishSegSearch(const WERD_CHOICE *best_choice,
		bool debug, STRING *debug_str);

	// If the bundle is null or still does not indicate the correct result,
	// fix it and use some backup reason for the blame.
	static void LastChanceBlame(bool debug, WERD_RES* word);

	// Sets the misadaption debug if this word is incorrect, as this word is
	// being adapted to.
	void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);

private:
	void SetBlame(IncorrectResultReason irr, const STRING &msg,
		const WERD_CHOICE *choice, bool debug) {
		incorrect_result_reason_ = irr;
		debug_ = IncorrectReason();
		debug_ += " to blame: ";
		FillDebugString(msg, choice, &debug_);
		if (debug) tprintf("SetBlame(): %s", debug_.string());
	}

private:
	// Set to true when bounding boxes for individual unichars are recorded.
	bool truth_has_char_boxes_;
	// The true_word (in the original image coordinate space) contains ground
	// truth bounding boxes for this WERD_RES.
	tesseract::BoxWord truth_word_;
	// Same as above, but in normalized coordinates
	// (filled in by WERD_RES::SetupForRecognition()).
	tesseract::BoxWord norm_truth_word_;
	// Tolerance for bounding box comparisons in normalized space.
	int norm_box_tolerance_;
	// Contains ground truth unichar for each of the bounding boxes in truth_word.
	GenericVector<STRING> truth_text_;
	// The reason for incorrect OCR result.
	IncorrectResultReason incorrect_result_reason_;
	// Debug text associated with the blame.
	STRING debug_;
	// Misadaption debug information (filled in if this word was misadapted to).
	STRING misadaption_debug_;
	// Variables used by the segmentation search when looking for the blame.
	// Set to true while segmentation search is continued after the usual
	// termination condition in order to look for the blame.
	bool segsearch_is_looking_for_blame_;
	// Best rating for correctly segmented path
	// (set and used by SegSearch when looking for blame).
	float best_correctly_segmented_rating_;
	// Vectors populated by SegSearch to indicate column and row indices that
	// correspond to blobs with correct bounding boxes.
	GenericVector<int> correct_segmentation_cols_;
	GenericVector<int> correct_segmentation_rows_;
	// Set to true if best choice is a dictionary word and
	// classifier's top choice.
	bool best_choice_is_dict_and_top_choice_;
	// Serialized segmentation search lattice.
	char *lattice_data_;
	int lattice_size_;  // size of lattice_data in bytes
	// Information about hypotheses (paths) explored by the segmentation search.
	tesseract::ParamsTrainingBundle params_training_bundle_;
};


#endif  // TESSERACT_CCSTRUCT_BLAMER_H_