2022-10-22 00:53:11 +00:00
|
|
|
#include "HGOCRTesseract.hpp"
|
|
|
|
#include "HGOCR.h"
|
|
|
|
#include "HGOCRRetImpl.hpp"
|
|
|
|
#include "HGOCRRetImpl.hpp"
|
2023-03-02 03:49:50 +00:00
|
|
|
#include "../base/HGTime.h"
|
2022-10-22 00:53:11 +00:00
|
|
|
#include "../base/HGUtility.h"
|
|
|
|
#include "../imgfmt/HGBmp.h"
|
|
|
|
#include "../imgfmt/HGJpeg.h"
|
2024-08-01 05:46:17 +00:00
|
|
|
#include "log/log.h"
|
2022-10-22 00:53:11 +00:00
|
|
|
#include "HGString.h"
|
|
|
|
|
2024-08-01 05:46:17 +00:00
|
|
|
extern HLOG g_hLog;
|
|
|
|
|
2022-10-22 00:53:11 +00:00
|
|
|
// Builds an idle OCR object: no Tesseract engine handle, no open TIFF
// writer and no temporary TIFF file name. The engine itself is created
// later by Init().
HGOCRTesseract::HGOCRTesseract()
    : m_baseApi(NULL)
    , m_tiffWriter(NULL)
{
    m_tiffFileName.clear();
}
|
|
|
|
|
|
|
|
// Releases anything still owned by the object.
//
// Callers are expected to call Deinit() first, in which case this is a
// no-op. If they did not (or Deinit() was never reached), the pending
// multi-page TIFF writer / temp file and the Tesseract handle would
// otherwise leak, so they are released here as a safety net.
HGOCRTesseract::~HGOCRTesseract()
{
    // Closes the TIFF writer and removes its temp file when one is open.
    ClearImageList();

    if (NULL != m_baseApi)
    {
        TessBaseAPIDelete(m_baseApi);
        m_baseApi = NULL;
    }
}
|
|
|
|
|
|
|
|
// Creates the Tesseract engine and initialises it with the "osd"
// language data found in the "tessdata" location derived from the
// directory of the module that contains HGImgProc_CreateOCRMgr.
//
// Returns:
//   HGBASE_ERR_OK           on success.
//   HGBASE_ERR_FAIL         if the engine has already been initialised.
//   HGIMGPROC_ERR_OCRINIT   if the engine cannot be created or initialised.
HGResult HGOCRTesseract::Init()
{
    if (NULL != m_baseApi)
    {
        return HGBASE_ERR_FAIL;
    }

    m_baseApi = TessBaseAPICreate();
    if (NULL == m_baseApi)
    {
        ErrorLog(g_hLog, "HGOCRTesseract::Init: TessBaseAPICreate fail");
        return HGIMGPROC_ERR_OCRINIT;
    }

    // Buffers are zero-initialised so that a failing helper leaves an empty
    // string behind instead of indeterminate stack contents.
    HGChar moduleName[256] = { 0 };
    HGBase_GetModuleName((void*)HGImgProc_CreateOCRMgr, moduleName, 256);
    moduleName[255] = 0;

    HGChar moduleDir[256] = { 0 };
    HGBase_GetFilePath(moduleName, moduleDir, 256);
    moduleDir[255] = 0;

    // Build the data path in a std::string: appending "tessdata" to the
    // fixed 256-byte buffer could write past its end for long module paths.
    // NOTE(review): this relies on HGBase_GetFilePath returning a directory
    // that already ends with a path separator - confirm.
    std::string dataPath(moduleDir);
    dataPath += "tessdata";

    int rc = TessBaseAPIInit3(m_baseApi, dataPath.c_str(), "osd");
    if (0 != rc)
    {
        ErrorLog(g_hLog, "HGOCRTesseract::Init: TessBaseAPIInit3 fail");
        TessBaseAPIDelete(m_baseApi);
        m_baseApi = NULL;
        return HGIMGPROC_ERR_OCRINIT;
    }

    TessBaseAPISetPageSegMode(m_baseApi, TessPageSegMode::PSM_AUTO_OSD);
    return HGBASE_ERR_OK;
}
|
|
|
|
|
|
|
|
// Tears down the engine created by Init().
//
// Drops any pending image list, deletes the Tesseract handle and resets
// it to NULL. Returns HGBASE_ERR_FAIL when there is no engine to tear
// down, HGBASE_ERR_OK otherwise.
HGResult HGOCRTesseract::Deinit()
{
    if (NULL != m_baseApi)
    {
        // Pending multi-page state goes first, then the engine itself.
        ClearImageList();

        TessBaseAPIDelete(m_baseApi);
        m_baseApi = NULL;
        return HGBASE_ERR_OK;
    }

    return HGBASE_ERR_FAIL;
}
|
|
|
|
|
|
|
|
// Runs text recognition on a single image and returns the recognised
// UTF-8 text wrapped in a newly allocated HGOCRRetImpl (one block).
//
// Parameters:
//   image   - source image; it is converted to a top-origin RGB copy when
//             it is not already in that layout.
//   ocrRet  - receives the result object on success; the caller owns it.
//
// Returns:
//   HGBASE_ERR_OK           on success.
//   HGBASE_ERR_INVALIDARG   if image or ocrRet is NULL.
//   HGBASE_ERR_FAIL         if the engine has not been initialised.
//   HGIMGPROC_ERR_OCR       if pixel data is unavailable or Tesseract
//                           yields no text buffer.
//   Any error reported by HGBase_CloneImage.
HGResult HGOCRTesseract::ImageOCR(HGImage image, class HGOCRRetImpl** ocrRet)
{
    if (NULL == image || NULL == ocrRet)
    {
        return HGBASE_ERR_INVALIDARG;
    }

    // Every Tesseract call below dereferences the handle.
    if (NULL == m_baseApi)
    {
        return HGBASE_ERR_FAIL;
    }

    HGImage image2 = NULL;

    HGImageInfo imgInfo;
    HGBase_GetImageInfo(image, &imgInfo);
    if (HGBASE_IMGTYPE_RGB != imgInfo.type || HGBASE_IMGORIGIN_TOP != imgInfo.origin)
    {
        HGResult ret = HGBase_CloneImage(image, HGBASE_IMGTYPE_RGB, HGBASE_IMGORIGIN_TOP, &image2);
        if (HGBASE_ERR_OK != ret)
            return ret;
    }
    else
    {
        image2 = image;
    }

    HGResult result = HGIMGPROC_ERR_OCR;

    HGBase_GetImageInfo(image2, &imgInfo);
    HGByte* imageData = NULL;
    HGBase_GetImageData(image2, &imageData);
    if (NULL != imageData)
    {
        // 3 bytes per pixel: the image is guaranteed to be RGB at this point.
        TessBaseAPISetImage(m_baseApi, imageData, imgInfo.width, imgInfo.height, 3, imgInfo.widthStep);

        // Zero-initialised so a failing DPI query cannot feed garbage to Tesseract.
        HGUInt xDpi = 0, yDpi = 0;
        HGBase_GetImageDpi(image2, &xDpi, &yDpi);
        TessBaseAPISetSourceResolution(m_baseApi, (xDpi + yDpi) / 2);

        char* text = TessBaseAPIGetUTF8Text(m_baseApi);
        if (NULL != text)
        {
            std::vector<std::string> blockInfo;
            blockInfo.push_back(Utf8ToStdString(text));
            TessDeleteText(text);

            *ocrRet = new HGOCRRetImpl(blockInfo);
            result = HGBASE_ERR_OK;
        }
    }

    // Only destroy the working copy, never the caller's image.
    if (image2 != image)
        HGBase_DestroyImage(image2);

    return result;
}
|
|
|
|
|
|
|
|
// Recognises a single image and writes the result to outFileName.
//
// The image is first saved to a temporary BMP file, which is then handed
// to OCRToFile() and removed afterwards. An outType of 0 means "derive
// the type from the output file name".
//
// Returns HGBASE_ERR_INVALIDARG for a NULL image / file name or an
// outType above HGIMGPROC_OCROUTTYPE_OFD, the BMP save error if the
// temporary file cannot be written, otherwise the result of OCRToFile().
HGResult HGOCRTesseract::ImageOCRToFile(HGImage image, HGUInt outType, const HGChar* outFileName)
{
    const bool badArgs = (NULL == image)
        || (outType > HGIMGPROC_OCROUTTYPE_OFD)
        || (NULL == outFileName);
    if (badArgs)
    {
        return HGBASE_ERR_INVALIDARG;
    }

    if (0 == outType)
    {
        outType = GetOutTypeByFileName(outFileName);
    }

    HGChar bmpPath[256];
    HGBase_GetTmpFileName("bmp", bmpPath, 256);

    HGResult result = HGImgFmt_SaveBmpImage(image, NULL, bmpPath);
    if (HGBASE_ERR_OK == result)
    {
        result = OCRToFile(bmpPath, outType, outFileName);
        HGBase_DeleteFile(bmpPath);
    }
    else
    {
        ErrorLog(g_hLog, "HGOCRTesseract::ImageOCRToFile: HGImgFmt_SaveBmpImage fail %s", bmpPath);
    }

    return result;
}
|
|
|
|
|
|
|
|
// Detects the text orientation of an image.
//
// Parameters:
//   image   - source image; it is converted to a top-origin RGB copy when
//             it is not already in that layout.
//   direct  - receives one of the HGIMGPROC_OCRTEXTDIRECT_* values.
//
// Returns:
//   HGBASE_ERR_OK           on success (*direct is always written).
//   HGBASE_ERR_INVALIDARG   if image or direct is NULL.
//   HGBASE_ERR_FAIL         if the engine has not been initialised or the
//                           image exposes no pixel data.
//   Any error reported by HGBase_CloneImage.
HGResult HGOCRTesseract::ImageTextDirectOCR(HGImage image, HGUInt* direct)
{
    if (NULL == image || NULL == direct)
    {
        return HGBASE_ERR_INVALIDARG;
    }

    // Every Tesseract call below dereferences the handle.
    if (NULL == m_baseApi)
    {
        return HGBASE_ERR_FAIL;
    }

    HGImage image2 = NULL;

    HGImageInfo imgInfo;
    HGBase_GetImageInfo(image, &imgInfo);
    if (HGBASE_IMGTYPE_RGB != imgInfo.type || HGBASE_IMGORIGIN_TOP != imgInfo.origin)
    {
        HGResult ret = HGBase_CloneImage(image, HGBASE_IMGTYPE_RGB, HGBASE_IMGORIGIN_TOP, &image2);
        if (HGBASE_ERR_OK != ret)
            return ret;
    }
    else
    {
        image2 = image;
    }

    HGBase_GetImageInfo(image2, &imgInfo);
    HGByte* imageData = NULL;
    HGBase_GetImageData(image2, &imageData);
    if (NULL == imageData)
    {
        if (image2 != image)
            HGBase_DestroyImage(image2);
        return HGBASE_ERR_FAIL;
    }

    // 3 bytes per pixel: the image is guaranteed to be RGB at this point.
    TessBaseAPISetImage(m_baseApi, imageData, imgInfo.width, imgInfo.height, 3, imgInfo.widthStep);

    // Zero-initialised so a failing DPI query cannot feed garbage to Tesseract.
    HGUInt xDpi = 0, yDpi = 0;
    HGBase_GetImageDpi(image2, &xDpi, &yDpi);
    TessBaseAPISetSourceResolution(m_baseApi, (xDpi + yDpi) / 2);

    // Time the orientation detection for the log.
    HGULonglong tickStart = 0;
    HGBase_GetTickCount(&tickStart);
    int orientation = MyOSD(m_baseApi);
    HGULonglong tickEnd = 0;
    HGBase_GetTickCount(&tickEnd);
    HGDouble seconds = 0.0;
    HGBase_GetIntervalSeconds(tickStart, tickEnd, &seconds);
    InfoLog(g_hLog, "HGOCRTesseract::ImageTextDirectOCR seconds:%fs", seconds);

    if (TessOrientation::ORIENTATION_PAGE_UP == orientation)
        *direct = HGIMGPROC_OCRTEXTDIRECT_ORI;
    else if (TessOrientation::ORIENTATION_PAGE_RIGHT == orientation)
        *direct = HGIMGPROC_OCRTEXTDIRECT_RIGHT;
    else if (TessOrientation::ORIENTATION_PAGE_DOWN == orientation)
        *direct = HGIMGPROC_OCRTEXTDIRECT_180;
    else if (TessOrientation::ORIENTATION_PAGE_LEFT == orientation)
        *direct = HGIMGPROC_OCRTEXTDIRECT_LEFT;
    else
    {
        // Any other value used to leave *direct unwritten while still
        // returning HGBASE_ERR_OK. Report "no rotation" so the caller never
        // reads an indeterminate value.
        // NOTE(review): MyOSD's failure value is not visible here - confirm
        // whether an error code would be more appropriate.
        *direct = HGIMGPROC_OCRTEXTDIRECT_ORI;
    }

    // Only destroy the working copy, never the caller's image.
    if (image2 != image)
        HGBase_DestroyImage(image2);

    return HGBASE_ERR_OK;
}
|
|
|
|
|
|
|
|
// Appends an image to the pending multi-page list.
//
// The list is backed by a temporary TIFF file: the first call opens a
// TIFF writer on a fresh temp file, and every call saves the image as a
// page through that writer.
//
// Returns HGBASE_ERR_INVALIDARG for a NULL image, the writer-open error
// if the temp TIFF cannot be created, otherwise the result of saving
// the page.
HGResult HGOCRTesseract::AddToImageList(HGImage image)
{
    if (NULL == image)
    {
        return HGBASE_ERR_INVALIDARG;
    }

    // Writer already open: just append the page.
    if (NULL != m_tiffWriter)
    {
        return HGImgFmt_SaveImageToTiffWriter(m_tiffWriter, image, NULL);
    }

    HGChar tiffPath[256] = { 0 };
    HGBase_GetTmpFileName("tif", tiffPath, 256);
    m_tiffFileName = tiffPath;

    HGResult openRet = HGImgFmt_OpenTiffWriter(m_tiffFileName.c_str(), &m_tiffWriter);
    if (HGBASE_ERR_OK != openRet)
    {
        ErrorLog(g_hLog, "HGOCRTesseract::AddToImageList: HGImgFmt_OpenTiffWriter fail %s", m_tiffFileName.c_str());
        m_tiffFileName.clear();
        return openRet;
    }

    return HGImgFmt_SaveImageToTiffWriter(m_tiffWriter, image, NULL);
}
|
|
|
|
|
|
|
|
// Discards the pending multi-page list, if any: closes the TIFF writer,
// deletes its temporary file and forgets the file name.
// Always returns HGBASE_ERR_OK.
HGResult HGOCRTesseract::ClearImageList()
{
    // Nothing queued, nothing to release.
    if (NULL == m_tiffWriter)
    {
        return HGBASE_ERR_OK;
    }

    HGImgFmt_CloseTiffWriter(m_tiffWriter);
    m_tiffWriter = NULL;

    HGBase_DeleteFile(m_tiffFileName.c_str());
    m_tiffFileName.clear();

    return HGBASE_ERR_OK;
}
|
|
|
|
|
|
|
|
// Recognises every image queued with AddToImageList() and writes the
// combined result to outFileName, then drops the queue.
//
// Parameters:
//   outType      - output type; 0 derives it from the file name. Only PDF
//                  and TXT are accepted here.
//   outFileName  - destination file.
//   func, param  - not used by this implementation.
//
// Returns HGBASE_ERR_FAIL when nothing has been queued,
// HGBASE_ERR_INVALIDARG for bad or unsupported arguments, otherwise the
// result of OCRToFile(). The queue is kept when argument validation fails.
HGResult HGOCRTesseract::ImageListOCRToFile(HGUInt outType, const HGChar* outFileName, HGImageListOcrFunc func, HGPointer param)
{
    if (NULL == m_tiffWriter)
    {
        return HGBASE_ERR_FAIL;
    }

    if (outType > HGIMGPROC_OCROUTTYPE_OFD || NULL == outFileName)
    {
        return HGBASE_ERR_INVALIDARG;
    }

    if (0 == outType)
    {
        outType = GetOutTypeByFileName(outFileName);
    }

    const bool supported = (HGIMGPROC_OCROUTTYPE_PDF == outType)
        || (HGIMGPROC_OCROUTTYPE_TXT == outType);
    if (!supported)
    {
        return HGBASE_ERR_INVALIDARG;
    }

    // Close the writer before the TIFF is read back for recognition.
    HGImgFmt_CloseTiffWriter(m_tiffWriter);
    m_tiffWriter = NULL;
    assert(!m_tiffFileName.empty());

    const HGResult result = OCRToFile(m_tiffFileName.c_str(), outType, outFileName);

    // The temp TIFF is consumed whether or not recognition succeeded.
    HGBase_DeleteFile(m_tiffFileName.c_str());
    m_tiffFileName.clear();

    return result;
}
|
|
|
|
|
|
|
|
HGResult HGOCRTesseract::OCRToFile(const HGChar* inFileName, HGUInt outType, const HGChar* outFileName)
|
|
|
|
{
|
|
|
|
assert(NULL != inFileName);
|
|
|
|
assert(NULL != outFileName);
|
|
|
|
|
|
|
|
HGResult ret = HGBASE_ERR_NOTSUPPORT;
|
|
|
|
if (HGIMGPROC_OCROUTTYPE_PDF == outType)
|
|
|
|
{
|
|
|
|
HGChar outputbase[256] = { 0 };
|
|
|
|
const char* p = strrchr(outFileName, '.');
|
|
|
|
if (NULL != p && 0 == strcmp(p, ".pdf"))
|
|
|
|
memcpy(outputbase, outFileName, p - outFileName);
|
|
|
|
else
|
|
|
|
strcpy(outputbase, outFileName);
|
|
|
|
|
2022-11-24 09:34:44 +00:00
|
|
|
ret = HGIMGPROC_ERR_OCR;
|
2022-10-22 00:53:11 +00:00
|
|
|
TessResultRenderer* pdfRender = TessPDFRendererCreate(outputbase, TessBaseAPIGetDatapath(m_baseApi), FALSE);
|
|
|
|
if (NULL != pdfRender)
|
|
|
|
{
|
|
|
|
if (TessBaseAPIProcessPages(m_baseApi, inFileName, NULL, 0, pdfRender))
|
|
|
|
ret = HGBASE_ERR_OK;
|
|
|
|
TessDeleteResultRenderer(pdfRender);
|
|
|
|
|
|
|
|
if (HGBASE_ERR_OK == ret)
|
|
|
|
{
|
|
|
|
HGChar destFileName[256];
|
|
|
|
sprintf(destFileName, "%s.pdf", outputbase);
|
|
|
|
#if defined(HG_CMP_MSC)
|
|
|
|
MoveFileA(destFileName, outFileName);
|
|
|
|
#else
|
|
|
|
rename(destFileName, outFileName);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (HGIMGPROC_OCROUTTYPE_TXT == outType)
|
|
|
|
{
|
|
|
|
HGChar outputbase[256] = { 0 };
|
|
|
|
const char* p = strrchr(outFileName, '.');
|
|
|
|
if (NULL != p && 0 == strcmp(p, ".txt"))
|
|
|
|
memcpy(outputbase, outFileName, p - outFileName);
|
|
|
|
else
|
|
|
|
strcpy(outputbase, outFileName);
|
|
|
|
|
2022-11-24 09:34:44 +00:00
|
|
|
ret = HGIMGPROC_ERR_OCR;
|
2022-10-22 00:53:11 +00:00
|
|
|
TessResultRenderer* txtRender = TessTextRendererCreate(outputbase);
|
|
|
|
if (NULL != txtRender)
|
|
|
|
{
|
|
|
|
if (TessBaseAPIProcessPages(m_baseApi, inFileName, NULL, 0, txtRender))
|
|
|
|
ret = HGBASE_ERR_OK;
|
|
|
|
TessDeleteResultRenderer(txtRender);
|
|
|
|
|
|
|
|
if (HGBASE_ERR_OK == ret)
|
|
|
|
{
|
|
|
|
HGChar destFileName[256];
|
|
|
|
sprintf(destFileName, "%s.txt", outputbase);
|
|
|
|
#if defined(HG_CMP_MSC)
|
|
|
|
MoveFileA(destFileName, outFileName);
|
|
|
|
#else
|
|
|
|
rename(destFileName, outFileName);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|