/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file pdfio2.c *
* * Lower-level operations for generating pdf. * * Intermediate function for single page, multi-image conversion * l_int32 pixConvertToPdfData() * * Intermediate function for generating multipage pdf output * l_int32 ptraConcatenatePdfToData() * * Convert tiff multipage to pdf file * l_int32 convertTiffMultipageToPdf() * * Low-level CID-based operations * * Without transcoding * l_int32 l_generateCIDataForPdf() * L_COMP_DATA *l_generateFlateDataPdf() * L_COMP_DATA *l_generateJpegData() * L_COMP_DATA *l_generateJpegDataMem() * static L_COMP_DATA *l_generateJp2kData() * * With transcoding * l_int32 l_generateCIData() * l_int32 pixGenerateCIData() * L_COMP_DATA *l_generateFlateData() * static L_COMP_DATA *pixGenerateFlateData() * static L_COMP_DATA *pixGenerateJpegData() * static L_COMP_DATA *pixGenerateJp2kData() * static L_COMP_DATA *pixGenerateG4Data() * L_COMP_DATA *l_generateG4Data() * * Other * l_int32 cidConvertToPdfData() * void l_CIDataDestroy() * * Helper functions for generating the output pdf string * static l_int32 l_generatePdf() * static void generateFixedStringsPdf() * static char *generateEscapeString() * static void generateMediaboxPdf() * static l_int32 generatePageStringPdf() * static l_int32 generateContentStringPdf() * static l_int32 generatePreXStringsPdf() * static l_int32 generateColormapStringsPdf() * static void generateTrailerPdf() * static l_int32 makeTrailerStringPdf() * static l_int32 generateOutputDataPdf() * * Helper functions for generating multipage pdf output * static l_int32 parseTrailerPdf() * static char *generatePagesObjStringPdf() * static L_BYTEA *substituteObjectNumbers() * * Create/destroy/access pdf data * static L_PDF_DATA *pdfdataCreate() * static void pdfdataDestroy() * static L_COMP_DATA *pdfdataGetCid() * * Set flags for special modes * void l_pdfSetG4ImageMask() * void l_pdfSetDateAndVersion() **/ #include
* Notes:
 *      (1) If %res == 0 and the input resolution field is 0,
 *          this will use DefaultInputRes.
 *      (2) This only writes %data if it is the last image to be
 *          written on the page.
 *      (3) See comments in convertToPdf().
 *      (4) If the pdf data container cannot be obtained (creation fails,
 *          or *%plpd is null for an image that is not the first), the
 *          compressed image data generated for %pix is destroyed
 *          before the error is returned.
 */
l_ok
pixConvertToPdfData(PIX          *pix,
                    l_int32       type,
                    l_int32       quality,
                    l_uint8     **pdata,
                    size_t       *pnbytes,
                    l_int32       x,
                    l_int32       y,
                    l_int32       res,
                    const char   *title,
                    L_PDF_DATA  **plpd,
                    l_int32       position)
{
l_int32       pixres, w, h, ret;
l_float32     xpt, ypt, wpt, hpt;
L_COMP_DATA  *cid = NULL;
L_PDF_DATA   *lpd = NULL;

    PROCNAME("pixConvertToPdfData");

    if (!pdata)
        return ERROR_INT("&data not defined", procName, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", procName, 1);
    *pnbytes = 0;
    if (!pix)
        return ERROR_INT("pix not defined", procName, 1);
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
        selectDefaultPdfEncoding(pix, &type);
    }
    if (plpd) {  /* part of multi-page invocation */
        if (position == L_FIRST_IMAGE)
            *plpd = NULL;
    }

        /* Generate the compressed image data.  It must NOT
         * be ascii85 encoded. */
    pixGenerateCIData(pix, type, quality, 0, &cid);
    if (!cid)
        return ERROR_INT("cid not made", procName, 1);

        /* Get media box in pts.  Guess the input image resolution
         * based on the input parameter %res, the resolution data in
         * the pix, and the size of the image. */
    pixres = cid->res;
    w = cid->w;
    h = cid->h;
    if (res <= 0)
        res = (pixres > 0) ? pixres : DefaultInputRes;
    xpt = x * 72. / res;
    ypt = y * 72. / res;
    wpt = w * 72. / res;
    hpt = h * 72. / res;

        /* Set up lpd.  From here on, cid must be destroyed on any
         * error that occurs before it has been added to the lpd. */
    if (!plpd || position == L_FIRST_IMAGE) {
            /* single image, or first of multiple images */
        if ((lpd = pdfdataCreate(title)) == NULL) {
            l_CIDataDestroy(&cid);
            return ERROR_INT("lpd not made", procName, 1);
        }
        if (plpd)
            *plpd = lpd;
    } else {  /* not the first of multiple images */
        if ((lpd = *plpd) == NULL) {
            l_CIDataDestroy(&cid);
            return ERROR_INT("lpd not defined", procName, 1);
        }
    }

        /* Add the data to the lpd */
    ptraAdd(lpd->cida, cid);
    lpd->n++;
    ptaAddPt(lpd->xy, xpt, ypt);
    ptaAddPt(lpd->wh, wpt, hpt);

        /* If a single image or the last of multiple images,
         * generate the pdf and destroy the lpd */
    if (!plpd || (position == L_LAST_IMAGE)) {
        ret = l_generatePdf(pdata, pnbytes, lpd);
        pdfdataDestroy(&lpd);
        if (plpd) *plpd = NULL;
        if (ret)
            return ERROR_INT("pdf output not made", procName, 1);
    }

    return 0;
}


/*---------------------------------------------------------------------*
 *      Intermediate function for generating multipage pdf output      *
 *---------------------------------------------------------------------*/
/*!
 * \brief   ptraConcatenatePdfToData()
 *
 * \param[in]    pa_data    ptra array of pdf strings, each for a
 *                          single-page pdf file
 * \param[in]    sa         [optional] string array of pathnames for
 *                          input pdf files; can be null
 * \param[out]   pdata      concatenated pdf data in memory
 * \param[out]   pnbytes    number of bytes in pdf data
 * \return  0 if OK, 1 on error
 *
 *
* Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 *          pdf files generated by other programs will have unpredictable
 *          (and usually bad) results.  The requirements for each pdf file:
 *            (a) The Catalog and Info objects are the first two.
 *            (b) Object 3 is Pages
 *            (c) Object 4 is Page
 *            (d) The remaining objects are Contents, XObjects, and ColorSpace
 *      (2) We remove trailers from each page, and append the full trailer
 *          for all pages at the end.
 *      (3) For all but the first file, remove the ID and the first 3
 *          objects (catalog, info, pages), so that each subsequent
 *          file has only objects of these classes:
 *            Page, Contents, XObject, ColorSpace (Indexed RGB).
 *          For those objects, we substitute these refs to objects
 *          in the local file:
 *            Page:  Parent(object 3), Contents, XObject(typically multiple)
 *            XObject:  [ColorSpace if indexed]
 *          The Pages object on the first page (object 3) has a Kids array
 *          of references to all the Page objects, with a Count equal
 *          to the number of pages.  Each Page object refers back to
 *          this parent.
 *      (4) Entries of %pa_data that cannot be parsed are removed from
 *          %pa_data and destroyed here, and %pa_data is then compacted,
 *          so the caller's array is modified by this call.
 */
l_ok
ptraConcatenatePdfToData(L_PTRA    *pa_data,
                         SARRAY    *sa,
                         l_uint8  **pdata,
                         size_t    *pnbytes)
{
char     *fname, *str_pages, *str_trailer;
l_uint8  *pdfdata, *data;
l_int32   i, j, index, nobj, npages;
l_int32  *sizes, *locs;
size_t    size;
L_BYTEA  *bas, *bad, *bat1, *bat2;
L_DNA    *da_locs, *da_sizes, *da_outlocs, *da;
L_DNAA   *daa_locs;  /* object locations on each page */
NUMA     *na_objs, *napage;
NUMAA    *naa_objs;  /* object mapping numbers to new values */

    PROCNAME("ptraConcatenatePdfToData");

    if (!pdata)
        return ERROR_INT("&data not defined", procName, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", procName, 1);
    *pnbytes = 0;
    if (!pa_data)
        return ERROR_INT("pa_data not defined", procName, 1);

        /* Parse the files and find the object locations.
         * Remove file data that cannot be parsed.  Removal is done
         * without compaction so that index i stays aligned with %sa
         * for the error message; compaction happens after the loop. */
    ptraGetActualCount(pa_data, &npages);
    daa_locs = l_dnaaCreate(npages);
    for (i = 0; i < npages; i++) {
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
        if (parseTrailerPdf(bas, &da_locs) != 0) {
            bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
            l_byteaDestroy(&bas);
            if (sa) {
                fname = sarrayGetString(sa, i, L_NOCOPY);
                L_ERROR("can't parse file %s; skipping\n", procName, fname);
            } else {
                L_ERROR("can't parse file %d; skipping\n", procName, i);
            }
        } else {
                /* da_locs is inserted into daa_locs, not copied */
            l_dnaaAddDna(daa_locs, da_locs, L_INSERT);
        }
    }

        /* Recompute npages in case some of the files were not pdf */
    ptraCompactArray(pa_data);
    ptraGetActualCount(pa_data, &npages);
    if (npages == 0) {
        l_dnaaDestroy(&daa_locs);
        return ERROR_INT("no parsable pdf files found", procName, 1);
    }

        /* Find the mapping from initial to final object numbers.
         * %index is the next output object number to be assigned; it
         * carries over from one page to the next. */
    naa_objs = numaaCreate(npages);  /* stores final object numbers */
    napage = numaCreate(npages);  /* stores "Page" object numbers */
    index = 0;
    for (i = 0; i < npages; i++) {
        da = l_dnaaGetDna(daa_locs, i, L_CLONE);
        nobj = l_dnaGetCount(da);
        if (i == 0) {
                /* First file: object numbers are kept unchanged */
            numaAddNumber(napage, 4);  /* object 4 on first page */
            na_objs = numaMakeSequence(0.0, 1.0, nobj - 1);
            index = nobj - 1;
        } else {  /* skip the first 3 objects in each file */
            numaAddNumber(napage, index);  /* Page object is first we add */
            na_objs = numaMakeConstant(0.0, nobj - 1);
            numaReplaceNumber(na_objs, 3, 3);  /* refers to parent of all */
            for (j = 4; j < nobj - 1; j++)
                numaSetValue(na_objs, j, index++);
        }
        numaaAddNuma(naa_objs, na_objs, L_INSERT);
        l_dnaDestroy(&da);
    }

        /* Make the Pages object (#3) */
    str_pages = generatePagesObjStringPdf(napage);

        /* Build the output */
    bad = l_byteaCreate(5000);
    da_outlocs = l_dnaCreate(0);  /* locations of all output objects */
    for (i = 0; i < npages; i++) {
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
        pdfdata = l_byteaGetData(bas, &size);
        da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE);  /* locs on this page */
        na_objs = numaaGetNuma(naa_objs, i, L_CLONE);  /* obj # on this page */
        nobj = l_dnaGetCount(da_locs) - 1;
        da_sizes = l_dnaDiffAdjValues(da_locs);  /* object sizes on this page */
        sizes = l_dnaGetIArray(da_sizes);
        locs = l_dnaGetIArray(da_locs);
        if (i == 0) {
                /* First file only: copy the leading bytes and objects
                 * 1 and 2, then write the newly generated Pages object
                 * in place of the original object 3. */
            l_byteaAppendData(bad, pdfdata, sizes[0]);
            l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]);
            l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]);
            l_byteaAppendString(bad, str_pages);
            for (j = 0; j < 4; j++)
                l_dnaAddNumber(da_outlocs, locs[j]);
        }
            /* Objects 4 and up: record the output offset, then append
             * the object with its object numbers substituted. */
        for (j = 4; j < nobj; j++) {
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
            bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]);
            bat2 = substituteObjectNumbers(bat1, na_objs);
            data = l_byteaGetData(bat2, &size);
            l_byteaAppendData(bad, data, size);
            l_byteaDestroy(&bat1);
            l_byteaDestroy(&bat2);
        }
        if (i == npages - 1)  /* last one */
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
        LEPT_FREE(sizes);
        LEPT_FREE(locs);
        l_dnaDestroy(&da_locs);
        numaDestroy(&na_objs);
        l_dnaDestroy(&da_sizes);
    }

        /* Add the trailer */
    str_trailer = makeTrailerStringPdf(da_outlocs);
    l_byteaAppendString(bad, str_trailer);

        /* Transfer the output data */
    *pdata = l_byteaCopyData(bad, pnbytes);
    l_byteaDestroy(&bad);

#if  DEBUG_MULTIPAGE
    fprintf(stderr, "******** object mapper **********");
    numaaWriteStream(stderr, naa_objs);

    fprintf(stderr, "******** Page object numbers ***********");
    numaWriteStream(stderr, napage);

    fprintf(stderr, "******** Pages object ***********\n");
    fprintf(stderr, "%s\n", str_pages);
#endif  /* DEBUG_MULTIPAGE */

    numaDestroy(&napage);
    numaaDestroy(&naa_objs);
    l_dnaDestroy(&da_outlocs);
    l_dnaaDestroy(&daa_locs);
    LEPT_FREE(str_pages);
    LEPT_FREE(str_trailer);
    return 0;
}


/*---------------------------------------------------------------------*
 *                  Convert tiff multipage to pdf file                 *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertTiffMultipageToPdf()
 *
 * \param[in]    filein    (tiff)
 * \param[in]    fileout   (pdf)
 * \return  0 if OK, 1 on error
 *
 *
* Notes:
 *      (1) A multipage tiff file can also be converted to PS, using
 *          convertTiffMultipageToPS()
 *      (2) An error is returned if the tiff pages cannot be read into
 *          a pixa, or if the pdf conversion itself reports failure.
 */
l_ok
convertTiffMultipageToPdf(const char  *filein,
                          const char  *fileout)
{
l_int32  istiff, ret;
PIXA    *pixa;
FILE    *fp;

    PROCNAME("convertTiffMultipageToPdf");

        /* Verify that the input exists and is tiff before reading it */
    if ((fp = fopenReadStream(filein)) == NULL)
        return ERROR_INT("file not found", procName, 1);
    istiff = fileFormatIsTiff(fp);
    fclose(fp);
    if (!istiff)
        return ERROR_INT("file not tiff format", procName, 1);

    if ((pixa = pixaReadMultipageTiff(filein)) == NULL)
        return ERROR_INT("pixa not made", procName, 1);

        /* Propagate a conversion failure instead of reporting success */
    ret = pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout);
    pixaDestroy(&pixa);
    if (ret)
        return ERROR_INT("pdf not made", procName, 1);
    return 0;
}


/*---------------------------------------------------------------------*
 *                     Low-level CID-based operations                  *
 *---------------------------------------------------------------------*/
/*!
 * \brief   l_generateCIDataForPdf()
 *
 * \param[in]    fname      [optional] can be null
 * \param[in]    pix        [optional] can be null
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
 * \param[out]   pcid       compressed data
 * \return  0 if OK, 1 on error
 *
 *
* Notes: * (1) You must set either filename or pix. * (2) Given an image file and optionally a pix raster of that data, * this provides a CID that is compatible with PDF, preferably * without transcoding. * (3) The pix is included for efficiency, in case transcoding * is required and the pix is available to the caller. * (4) We don't try to open files named "stdin" or "-" for Tesseract * compatibility reasons. We may remove this restriction * in the future. **/ l_ok l_generateCIDataForPdf(const char *fname, PIX *pix, l_int32 quality, L_COMP_DATA **pcid) { l_int32 format, type; L_COMP_DATA *cid; PIX *pixt; PROCNAME("l_generateCIDataForPdf"); //FILE* file = fopenWriteStream("aaa.bmp", "w"); //pixWriteStreamBmp(file, pix); //fclose(file); if (!pcid) return ERROR_INT("&cid not defined", procName, 1); *pcid = cid = NULL; if (!fname && !pix) return ERROR_INT("neither fname nor pix are defined", procName, 1); /* If a compressed file is given that is not 'stdin', see if we * can generate the pdf output without transcoding. */ if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) { findFileFormat(fname, &format); if (format == IFF_UNKNOWN) L_WARNING("file %s format is unknown\n", procName, fname); if (format == IFF_PS || format == IFF_LPDF) { L_ERROR("file %s is unsupported format %d\n", procName, fname, format); return 1; } if (format == IFF_JFIF_JPEG) { cid = l_generateJpegData(fname, 0); } else if (format == IFF_JP2) { cid = l_generateJp2kData(fname); } else if (format == IFF_PNG) { cid = l_generateFlateDataPdf(fname, pix); } } /* Otherwise, use the pix to generate the pdf output */ if (!cid) { if (!pix) pixt = pixRead(fname); else pixt = pixClone(pix); if (!pixt) return ERROR_INT("pixt not made", procName, 1); if (selectDefaultPdfEncoding(pixt, &type)) { pixDestroy(&pixt); return 1; } pixGenerateCIData(pixt, type, quality, 0, &cid); pixDestroy(&pixt); } if (!cid) { L_ERROR("totally kerflummoxed\n", procName); return 1; } *pcid = cid; return 0; } /*! 
* \brief l_generateFlateDataPdf() * * \param[in] fname preferably png * \param[in] pixs [optional] can be null * \return cid containing png data, or NULL on error * *
* Notes: * (1) If you hand this a png file, you are going to get * png predictors embedded in the flate data. So it has * come to this. http://xkcd.com/1022/ * (2) Exception: if the png is interlaced or if it is RGBA, * it will be transcoded. * (3) If transcoding is required, this will not have to read from * file if you also input a pix. **/ L_COMP_DATA * l_generateFlateDataPdf(const char *fname, PIX *pixs) { l_uint8 *pngcomp = NULL; /* entire PNG compressed file */ l_uint8 *datacomp = NULL; /* gzipped raster data */ l_uint8 *cmapdata = NULL; /* uncompressed colormap */ char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */ l_uint32 i, j, n; l_int32 format, interlaced; l_int32 ncolors; /* in colormap */ l_int32 bps; /* bits/sample: usually 8 */ l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */ l_int32 w, h, cmapflag; l_int32 xres, yres; size_t nbytescomp = 0, nbytespng = 0; FILE *fp; L_COMP_DATA *cid; PIX *pix; PIXCMAP *cmap = NULL; PROCNAME("l_generateFlateDataPdf"); if (!fname) return (L_COMP_DATA *)ERROR_PTR("fname not defined", procName, NULL); findFileFormat(fname, &format); spp = 0; /* init to spp != 4 if not png */ interlaced = 0; /* initialize to no interlacing */ bps = 0; /* initialize to a nonsense value */ if (format == IFF_PNG) { isPngInterlaced(fname, &interlaced); if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL)) return (L_COMP_DATA *)ERROR_PTR("bad png input", procName, NULL); } /* PDF is capable of inlining some types of PNG files, but not all of them. We need to transcode anything with interlacing, an alpha channel, or 1 bpp (which would otherwise be photo-inverted). Be careful with spp. Any PNG image file with an alpha channel is converted on reading to RGBA (spp == 4). This includes the (gray + alpha) format with spp == 2. 
You will get different results if you look at spp via readHeaderPng() versus pixGetSpp() */ if (format != IFF_PNG || interlaced || bps == 1 || spp == 4 || spp == 2) { if (!pixs) pix = pixRead(fname); else pix = pixClone(pixs); if (!pix) return (L_COMP_DATA *)ERROR_PTR("pix not made", procName, NULL); cid = pixGenerateFlateData(pix, 0); pixDestroy(&pix); return cid; } /* It's png. Generate the pdf data without transcoding. * Implementation by Jeff Breidenbach. * First, read the metadata */ if ((fp = fopenReadStream(fname)) == NULL) return (L_COMP_DATA *)ERROR_PTR("stream not opened", procName, NULL); freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag); fgetPngResolution(fp, &xres, &yres); fclose(fp); /* We get pdf corruption when inlining the data from 16 bpp png. */ if (bps == 16) return l_generateFlateData(fname, 0); /* Read the entire png file */ if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL) return (L_COMP_DATA *)ERROR_PTR("unable to read file", procName, NULL); /* Extract flate data, copying portions of it to memory, including * the predictor information in a byte at the beginning of each * raster line. The flate data makes up the vast majority of * the png file, so after extraction we expect datacomp to * be nearly full (i.e., nbytescomp will be only slightly less * than nbytespng). Also extract the colormap if present. */ if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) { LEPT_FREE(pngcomp); return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory", procName, NULL); } /* Parse the png file. Each chunk consists of: * length: 4 bytes * name: 4 bytes (e.g., "IDAT") * data: n bytes * CRC: 4 bytes * Start at the beginning of the data section of the first chunk, * byte 16, because the png file begins with 8 bytes of header, * followed by the first 8 bytes of the first chunk * (length and name). On each loop, increment by 12 bytes to * skip over the CRC, length and name of the next chunk. 
*/ for (i = 16; i < nbytespng; i += 12) { /* do each successive chunk */ /* Get the chunk length */ n = pngcomp[i - 8] << 24; n += pngcomp[i - 7] << 16; n += pngcomp[i - 6] << 8; n += pngcomp[i - 5] << 0; if (n >= nbytespng - i) { /* "n + i" can overflow */ LEPT_FREE(pngcomp); LEPT_FREE(datacomp); pixcmapDestroy(&cmap); L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", procName, i, n, nbytespng); return NULL; } /* Is it a data chunk? */ if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) { memcpy(datacomp + nbytescomp, pngcomp + i, n); nbytescomp += n; } /* Is it a palette chunk? */ if (cmapflag && !cmap && memcmp(pngcomp + i - 4, "PLTE", 4) == 0) { if ((n / 3) > (1 << bps)) { LEPT_FREE(pngcomp); LEPT_FREE(datacomp); pixcmapDestroy(&cmap); L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n", procName, i, n, (1 << bps)); return NULL; } cmap = pixcmapCreate(bps); for (j = i; j < i + n; j += 3) { pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1], pngcomp[j + 2]); } } i += n; /* move to the end of the data chunk */ } LEPT_FREE(pngcomp); if (nbytescomp == 0) { LEPT_FREE(datacomp); pixcmapDestroy(&cmap); return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", procName, NULL); } /* Extract and encode the colormap data as hexascii */ ncolors = 0; if (cmap) { pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata); pixcmapDestroy(&cmap); if (!cmapdata) { LEPT_FREE(datacomp); return (L_COMP_DATA *)ERROR_PTR("cmapdata not made", procName, NULL); } cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors); LEPT_FREE(cmapdata); } /* Note that this is the only situation where the predictor * field of the CID is set to 1. Adobe's predictor values on * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and * 10-14 for inline predictors, the specifics of which are * ignored by the pdf interpreter, which just needs to know that * the first byte on each compressed scanline is some predictor * whose type can be inferred from the byte itself. 
*/ cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); cid->datacomp = datacomp; cid->type = L_FLATE_ENCODE; cid->cmapdatahex = cmapdatahex; cid->nbytescomp = nbytescomp; cid->ncolors = ncolors; cid->predictor = TRUE; cid->w = w; cid->h = h; cid->bps = bps; cid->spp = spp; cid->res = xres; return cid; } /*! * \brief l_generateJpegData() * * \param[in] fname of jpeg file * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg * \return cid containing jpeg data, or NULL on error * *
* Notes: * (1) Set ascii85flag: * ~ 0 for binary data (not permitted in PostScript) * ~ 1 for ascii85 (5 for 4) encoded binary data * (not permitted in pdf) * (2) Do not free the data. l_generateJpegDataMem() will free * the data if the data is invalid, or if it does not use * ascii encoding. **/ L_COMP_DATA * l_generateJpegData(const char *fname, l_int32 ascii85flag) { l_uint8 *data = NULL; size_t nbytes; PROCNAME("l_generateJpegData"); if (!fname) return (L_COMP_DATA *)ERROR_PTR("fname not defined", procName, NULL); /* The returned jpeg data in memory is the entire jpeg file, * which starts with ffd8 and ends with ffd9 */ if ((data = l_binaryRead(fname, &nbytes)) == NULL) return (L_COMP_DATA *)ERROR_PTR("data not extracted", procName, NULL); return l_generateJpegDataMem(data, nbytes, ascii85flag); } /*! * \brief l_generateJpegDataMem() * * \param[in] data of jpeg file * \param[in] nbytes of jpeg file * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg * \return cid containing jpeg data, or NULL on error * *
* Notes:
 *      (1) See l_generateJpegData().
 *      (2) %data is freed here if the jpeg header cannot be read, after
 *          it has been ascii85 encoded (%ascii85flag == 1), or if the
 *          cid cannot be allocated.  With %ascii85flag == 0 it is
 *          stored in the returned cid.
 */
L_COMP_DATA *
l_generateJpegDataMem(l_uint8  *data,
                      size_t    nbytes,
                      l_int32   ascii85flag)
{
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
l_int32       w, h, xres, yres, bps, spp;
l_int32       nbytes85;
L_COMP_DATA  *cid;

    PROCNAME("l_generateJpegDataMem");

    if (!data)
        return (L_COMP_DATA *)ERROR_PTR("data not defined", procName, NULL);

        /* Read the metadata */
    if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) {
        LEPT_FREE(data);
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", procName, NULL);
    }
    bps = 8;
    readResolutionMemJpeg(data, nbytes, &xres, &yres);

        /* Optionally, encode the compressed data */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(data, nbytes, &nbytes85);
        LEPT_FREE(data);
        data = NULL;  /* no longer owned; prevents a double free below */
        if (!data85)
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", procName, NULL);
        else
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (!cid) {
            /* Release whichever buffer is still held */
        LEPT_FREE(data);
        LEPT_FREE(data85);
        return (L_COMP_DATA *)ERROR_PTR("cid not made", procName, NULL);
    }
    if (ascii85flag == 0) {
        cid->datacomp = data;
    } else {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    }
    cid->type = L_JPEG_ENCODE;
    cid->nbytescomp = nbytes;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   l_generateJp2kData()
 *
 * \param[in]    fname     of jp2k file
 * \return  cid containing jp2k data, or NULL on error
 *
 *
* Notes:
 *      (1) Callers are expected to have verified that the file is jp2k
 *          before calling this.
 */
static L_COMP_DATA *
l_generateJp2kData(const char  *fname)
{
l_int32       width, height, bitsps, sampspp, resx, resy;
size_t        filesize;
l_uint8      *filedata;
L_COMP_DATA  *result;
FILE         *stream;

    PROCNAME("l_generateJp2kData");

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", procName, NULL);

    if (readHeaderJp2k(fname, &width, &height, &bitsps, &sampspp) != 0)
        return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", procName, NULL);

    result = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (!result)
        return (L_COMP_DATA *)ERROR_PTR("cid not made", procName, NULL);

        /* The compressed data is simply the entire jp2k file */
    filedata = l_binaryRead(fname, &filesize);
    result->datacomp = filedata;
    if (!filedata) {
        l_CIDataDestroy(&result);
        return (L_COMP_DATA *)ERROR_PTR("data not extracted", procName, NULL);
    }

        /* Resolution defaults to 0 if the file cannot be reopened */
    resx = 0;
    resy = 0;
    stream = fopenReadStream(fname);
    if (stream != NULL) {
        fgetJp2kResolution(stream, &resx, &resy);
        fclose(stream);
    }

    result->type = L_JP2K_ENCODE;
    result->nbytescomp = filesize;
    result->w = width;
    result->h = height;
    result->bps = bitsps;
    result->spp = sampspp;
    result->res = resx;
    return result;
}


/*!
 * \brief   l_generateCIData()
 *
 * \param[in]    fname
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
 *                          L_JP2K_ENCODE
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
 * \param[out]   pcid       compressed data
 * \return  0 if OK, 1 on error
 *
 *
* Notes: * (1) This can be used for both PostScript and pdf. * (1) Set ascii85: * ~ 0 for binary data (not permitted in PostScript) * ~ 1 for ascii85 (5 for 4) encoded binary data * (2) This attempts to compress according to the requested type. * If this can't be done, it falls back to ordinary flate encoding. * (3) This differs from l_generateCIDataPdf(), which determines * the format and attempts to generate the CID without transcoding. **/ l_ok l_generateCIData(const char *fname, l_int32 type, l_int32 quality, l_int32 ascii85, L_COMP_DATA **pcid) { l_int32 format, d, bps, spp, iscmap; L_COMP_DATA *cid; PIX *pix; PROCNAME("l_generateCIData"); if (!pcid) return ERROR_INT("&cid not defined", procName, 1); *pcid = NULL; if (!fname) return ERROR_INT("fname not defined", procName, 1); if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) return ERROR_INT("invalid conversion type", procName, 1); if (ascii85 != 0 && ascii85 != 1) return ERROR_INT("invalid ascii85", procName, 1); /* Sanity check on requested encoding */ pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap); d = bps * spp; if (d == 24) d = 32; if (iscmap && type != L_FLATE_ENCODE) { L_WARNING("pixs has cmap; using flate encoding\n", procName); type = L_FLATE_ENCODE; } else if (d < 8 && type == L_JPEG_ENCODE) { L_WARNING("pixs has < 8 bpp; using flate encoding\n", procName); type = L_FLATE_ENCODE; } else if (d < 8 && type == L_JP2K_ENCODE) { L_WARNING("pixs has < 8 bpp; using flate encoding\n", procName); type = L_FLATE_ENCODE; } else if (d > 1 && type == L_G4_ENCODE) { L_WARNING("pixs has > 1 bpp; using flate encoding\n", procName); type = L_FLATE_ENCODE; } if (type == L_JPEG_ENCODE) { if (format == IFF_JFIF_JPEG) { /* do not transcode */ cid = l_generateJpegData(fname, ascii85); } else { if ((pix = pixRead(fname)) == NULL) return ERROR_INT("pix not returned", procName, 1); cid = pixGenerateJpegData(pix, ascii85, quality); pixDestroy(&pix); } if 
(!cid) return ERROR_INT("jpeg data not made", procName, 1); } else if (type == L_JP2K_ENCODE) { if (format == IFF_JP2) { /* do not transcode */ cid = l_generateJp2kData(fname); } else { if ((pix = pixRead(fname)) == NULL) return ERROR_INT("pix not returned", procName, 1); cid = pixGenerateJp2kData(pix, quality); pixDestroy(&pix); } if (!cid) return ERROR_INT("jp2k data not made", procName, 1); } else if (type == L_G4_ENCODE) { if ((cid = l_generateG4Data(fname, ascii85)) == NULL) return ERROR_INT("g4 data not made", procName, 1); } else if (type == L_FLATE_ENCODE) { if ((cid = l_generateFlateData(fname, ascii85)) == NULL) return ERROR_INT("flate data not made", procName, 1); } else { return ERROR_INT("invalid conversion type", procName, 1); } *pcid = cid; return 0; } /*! * \brief pixGenerateCIData() * * \param[in] pixs 8 or 32 bpp, no colormap * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or * L_JP2K_ENCODE * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) * for jp2k if transcoded: 27-45; 0 for default (34) * \param[in] ascii85 0 for binary; 1 for ascii85-encoded * \param[out] pcid compressed data * \return 0 if OK, 1 on error * *
* Notes:
 *      (1) Set ascii85:
 *           ~ 0 for binary data (not permitted in PostScript)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data
 *      (2) If %type is not one of the four encoding flags, a default
 *          encoding is selected for %pixs.  A requested encoding that
 *          does not fit the depth or colormap of %pixs is replaced by
 *          flate, with a warning.
 */
l_ok
pixGenerateCIData(PIX           *pixs,
                  l_int32        type,
                  l_int32        quality,
                  l_int32        ascii85,
                  L_COMP_DATA  **pcid)
{
l_int32       depth, known;
PIXCMAP      *colormap;
L_COMP_DATA  *result;

    PROCNAME("pixGenerateCIData");

    if (!pcid)
        return ERROR_INT("&cid not defined", procName, 1);
    *pcid = NULL;
    if (!pixs)
        return ERROR_INT("pixs not defined", procName, 1);
    known = (type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
             type == L_FLATE_ENCODE || type == L_JP2K_ENCODE);
    if (!known)
        selectDefaultPdfEncoding(pixs, &type);
    if (ascii85 != 0 && ascii85 != 1)
        return ERROR_INT("invalid ascii85", procName, 1);

        /* Sanity check on requested encoding */
    depth = pixGetDepth(pixs);
    colormap = pixGetColormap(pixs);
    if (colormap && type != L_FLATE_ENCODE) {
        L_WARNING("pixs has cmap; using flate encoding\n", procName);
        type = L_FLATE_ENCODE;
    } else if (depth < 8 &&
               (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) {
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", procName);
        type = L_FLATE_ENCODE;
    } else if (depth > 1 && type == L_G4_ENCODE) {
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", procName);
        type = L_FLATE_ENCODE;
    }

        /* Dispatch to the encoder; anything that is not jpeg,
         * jp2k or g4 is handled as flate */
    switch (type)
    {
    case L_JPEG_ENCODE:
        result = pixGenerateJpegData(pixs, ascii85, quality);
        if (!result)
            return ERROR_INT("jpeg data not made", procName, 1);
        break;
    case L_JP2K_ENCODE:
        result = pixGenerateJp2kData(pixs, quality);
        if (!result)
            return ERROR_INT("jp2k data not made", procName, 1);
        break;
    case L_G4_ENCODE:
        result = pixGenerateG4Data(pixs, ascii85);
        if (!result)
            return ERROR_INT("g4 data not made", procName, 1);
        break;
    default:  /* type == L_FLATE_ENCODE */
        result = pixGenerateFlateData(pixs, ascii85);
        if (!result)
            return ERROR_INT("flate data not made", procName, 1);
        break;
    }

    *pcid = result;
    return 0;
}


/*!
 * \brief   l_generateFlateData()
 *
 * \param[in]    fname
 * \param[in]    ascii85flag  0 for gzipped; 1 for ascii85-encoded gzipped
 * \return  cid flate compressed image data, or NULL on error
 *
 *
* Notes:
 *      (1) The input image is converted to one of these 4 types:
 *           ~ 1 bpp
 *           ~ 8 bpp, no colormap
 *           ~ 8 bpp, colormap
 *           ~ 32 bpp rgb
 *      (2) Set ascii85flag:
 *           ~ 0 for binary data (not permitted in PostScript)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data
 *      (3) The image is read from %fname and the encoding itself is
 *          done by pixGenerateFlateData().
 */
L_COMP_DATA *
l_generateFlateData(const char  *fname,
                    l_int32      ascii85flag)
{
L_COMP_DATA  *result;
PIX          *pix;

    PROCNAME("l_generateFlateData");

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", procName, NULL);

    pix = pixRead(fname);
    if (!pix)
        return (L_COMP_DATA *)ERROR_PTR("pixs not made", procName, NULL);

    result = pixGenerateFlateData(pix, ascii85flag);
    pixDestroy(&pix);
    return result;
}


/*!
 * \brief   pixGenerateFlateData()
 *
 * \param[in]    pixs
 * \param[in]    ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped
 * \return  cid flate compressed image data, or NULL on error
 *
 *     Notes:
 *     1) This should not be called with an RGBA pix (spp == 4); it
 *        will ignore the alpha channel.  Likewise, if called with a
 *        colormapped pix, the alpha component in the colormap will
 *        be ignored as it is for all leptonica operations
 *        on colormapped pix.
*/
/* Error handling: NULL is returned, with every intermediate buffer
 * released, if the depth conversion, the colormap serialization, the
 * compression, the ascii85 encoding or the allocation of the result
 * fails. */
static L_COMP_DATA *
pixGenerateFlateData(PIX     *pixs,
                     l_int32  ascii85flag)
{
l_uint8      *data = NULL;  /* uncompressed raster data in required format */
l_uint8      *datacomp = NULL;  /* gzipped raster data */
char         *data85 = NULL;  /* ascii85 encoded gzipped raster data */
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
char         *cmapdata85 = NULL;  /* ascii85 encoded uncompressed colormap */
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
l_int32       ncolors;  /* in colormap; not used if cmapdata85 is null */
l_int32       bps;  /* bits/sample: usually 8 */
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap; 3-rgb */
l_int32       w, h, d, cmapflag;
l_int32       ncmapbytes85 = 0;
l_int32       nbytes85 = 0;
size_t        nbytes, nbytescomp;
L_COMP_DATA  *cid;
PIX          *pixt;
PIXCMAP      *cmap;

    PROCNAME("pixGenerateFlateData");

    if (!pixs)
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", procName, NULL);

        /* Convert the image to one of these 4 types:
         *     1 bpp
         *     8 bpp, no colormap
         *     8 bpp, colormap
         *     32 bpp rgb    */
    pixGetDimensions(pixs, &w, &h, &d);
    cmap = pixGetColormap(pixs);
    cmapflag = (cmap) ? 1 : 0;
    if (d == 2 || d == 4 || d == 16) {
        pixt = pixConvertTo8(pixs, cmapflag);
        if (!pixt)
            return (L_COMP_DATA *)ERROR_PTR("pixt not made", procName, NULL);
        cmap = pixGetColormap(pixt);
        d = pixGetDepth(pixt);
    } else {
        pixt = pixClone(pixs);
        if (!pixt)
            return (L_COMP_DATA *)ERROR_PTR("pixt not made", procName, NULL);
    }
    spp = (d == 32) ? 3 : 1;  /* ignores alpha */
    bps = (d == 32) ? 8 : d;

        /* Extract and encode the colormap data as both ascii85 and hexascii  */
    ncolors = 0;
    if (cmap) {
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
        if (!cmapdata) {
            pixDestroy(&pixt);
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
                                            procName, NULL);
        }

        cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85);
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
        LEPT_FREE(cmapdata);
    }

        /* Extract and compress the raster data */
    pixGetRasterData(pixt, &data, &nbytes);
    pixDestroy(&pixt);
    datacomp = zlibCompress(data, nbytes, &nbytescomp);
    LEPT_FREE(data);
    if (!datacomp) {
        LEPT_FREE(cmapdata85);
        LEPT_FREE(cmapdatahex);
        return (L_COMP_DATA *)ERROR_PTR("datacomp not made", procName, NULL);
    }

        /* Optionally, encode the compressed data */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
        LEPT_FREE(datacomp);
        datacomp = NULL;  /* no longer owned; prevents a double free below */
        if (!data85) {
            LEPT_FREE(cmapdata85);
            LEPT_FREE(cmapdatahex);
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", procName, NULL);
        } else {
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
        }
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (!cid) {
            /* Release everything that would have gone into the cid */
        LEPT_FREE(datacomp);
        LEPT_FREE(data85);
        LEPT_FREE(cmapdata85);
        LEPT_FREE(cmapdatahex);
        return (L_COMP_DATA *)ERROR_PTR("cid not made", procName, NULL);
    }
    if (ascii85flag == 0) {
        cid->datacomp = datacomp;
    } else {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    }
    cid->type = L_FLATE_ENCODE;
    cid->cmapdatahex = cmapdatahex;
    cid->cmapdata85 = cmapdata85;
    cid->nbytescomp = nbytescomp;
    cid->ncolors = ncolors;
    cid->w = w;
    cid->h = h;
    cid->bps = bps;
    cid->spp = spp;
    cid->res = pixGetXRes(pixs);
    cid->nbytes = nbytes;  /* only for debugging */
    return cid;
}


/*!
 * \brief   pixGenerateJpegData()
 *
 * \param[in]    pixs           8 or 32 bpp, no colormap
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
 * \param[in]    quality        0 for default, which is 75
 * \return  cid jpeg compressed data, or NULL on error
 *
 *
* Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (not permitted in PostScript)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data
 **/
static L_COMP_DATA *
pixGenerateJpegData(PIX     *pixs,
                    l_int32  ascii85flag,
                    l_int32  quality)
{
char         *tmpname;
l_int32       depth;
L_COMP_DATA  *result;

    PROCNAME("pixGenerateJpegData");

    if (pixs == NULL) {
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", procName, NULL);
    }
    if (pixGetColormap(pixs) != NULL) {
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", procName, NULL);
    }
    depth = pixGetDepth(pixs);
    if (!(depth == 8 || depth == 32)) {
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", procName, NULL);
    }

        /* Round-trip through a temporary jpeg file */
    tmpname = l_makeTempFilename();
    if (pixWriteJpeg(tmpname, pixs, quality, 0) != 0) {
        LEPT_FREE(tmpname);
        return NULL;
    }

        /* Wrap the jpeg file contents, then discard the temporary file */
    result = l_generateJpegData(tmpname, ascii85flag);
    if (lept_rmfile(tmpname) != 0) {
        L_ERROR("temp file %s was not deleted\n", procName, tmpname);
    }
    LEPT_FREE(tmpname);
    return result;
}


/*!
 * \brief   pixGenerateJp2kData()
 *
 * \param[in]    pixs           8 or 32 bpp, no colormap
 * \param[in]    quality        0 for default, which is 34
 * \return  cid jp2k compressed data, or NULL on error
 *
 *
* Notes:
 *      (1) The quality can be set between 27 (very poor) and 45
 *          (nearly perfect).  Use 0 for default (34). Use 100 for lossless,
 *          but this is very expensive and not recommended.
 **/
static L_COMP_DATA *
pixGenerateJp2kData(PIX     *pixs,
                    l_int32  quality)
{
char         *tmpname;
l_int32       depth;
L_COMP_DATA  *result;

    PROCNAME("pixGenerateJp2kData");

    if (pixs == NULL) {
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", procName, NULL);
    }
    if (pixGetColormap(pixs) != NULL) {
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", procName, NULL);
    }
    depth = pixGetDepth(pixs);
    if (!(depth == 8 || depth == 32)) {
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", procName, NULL);
    }

        /* Round-trip through a temporary jp2k file */
    tmpname = l_makeTempFilename();
    if (pixWriteJp2k(tmpname, pixs, quality, 5, 0, 0) != 0) {
        LEPT_FREE(tmpname);
        return NULL;
    }

        /* Wrap the jp2k file contents, then discard the temporary file */
    result = l_generateJp2kData(tmpname);
    if (lept_rmfile(tmpname) != 0) {
        L_ERROR("temp file %s was not deleted\n", procName, tmpname);
    }
    LEPT_FREE(tmpname);
    return result;
}


/*!
 * \brief   pixGenerateG4Data()
 *
 * \param[in]    pixs           1 bpp
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
 * \return  cid g4 compressed image data, or NULL on error
 *
 *
* Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (not permitted in PostScript)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data
 **/
static L_COMP_DATA *
pixGenerateG4Data(PIX     *pixs,
                  l_int32  ascii85flag)
{
char         *tmpname;
L_COMP_DATA  *result;

    PROCNAME("pixGenerateG4Data");

    if (pixs == NULL) {
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", procName, NULL);
    }
    if (pixGetDepth(pixs) != 1) {
        return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", procName, NULL);
    }

        /* Round-trip through a temporary tiff g4 file */
    tmpname = l_makeTempFilename();
    if (pixWrite(tmpname, pixs, IFF_TIFF_G4) != 0) {
        LEPT_FREE(tmpname);
        return NULL;
    }

        /* Pull the g4 data out of the file, then discard the file */
    result = l_generateG4Data(tmpname, ascii85flag);
    if (lept_rmfile(tmpname) != 0) {
        L_ERROR("temp file %s was not deleted\n", procName, tmpname);
    }
    LEPT_FREE(tmpname);
    return result;
}


/*!
 * \brief   l_generateG4Data()
 *
 * \param[in]    fname          of g4 compressed file
 * \param[in]    ascii85flag    0 for g4 compressed; 1 for ascii85-encoded g4
 * \return  cid g4 compressed image data, or NULL on error
 *
 *
* Notes:
 *      (1) Set ascii85flag:
 *           ~ 0 for binary data (not permitted in PostScript)
 *           ~ 1 for ascii85 (5 for 4) encoded binary data
 *             (not permitted in pdf)
 *      (2) Any ascii85flag value other than 1 is treated as a request
 *          for binary data.
 **/
L_COMP_DATA *
l_generateG4Data(const char  *fname,
                 l_int32      ascii85flag)
{
l_uint8      *datacomp = NULL;  /* g4 compressed raster data */
char         *data85 = NULL;  /* ascii85 encoded g4 compressed data */
l_int32       w, h;
l_int32       xres = 0, yres = 0;  /* stay 0 if the resolution is not read */
l_int32       minisblack;  /* TRUE or FALSE */
l_int32       nbytes85 = 0;
size_t        nbytescomp = 0;
L_COMP_DATA  *cid;
FILE         *fp;

    PROCNAME("l_generateG4Data");

    if (!fname)
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", procName, NULL);

        /* Read the resolution */
    if ((fp = fopenReadStream(fname)) == NULL)
        return (L_COMP_DATA *)ERROR_PTR("stream not opened", procName, NULL);
    getTiffResolution(fp, &xres, &yres);
    fclose(fp);

        /* The returned ccitt g4 data in memory is the block of
         * bytes in the tiff file, starting after 8 bytes and
         * ending before the directory. */
    if (extractG4DataFromFile(fname, &datacomp, &nbytescomp,
                              &w, &h, &minisblack)) {
        return (L_COMP_DATA *)ERROR_PTR("datacomp not extracted",
                                        procName, NULL);
    }

        /* Optionally, encode the compressed data.  After encoding, the
         * binary buffer is released and its pointer cleared, so that
         * exactly one of (datacomp, data85) is live from here on. */
    if (ascii85flag == 1) {
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
        LEPT_FREE(datacomp);
        datacomp = NULL;
        if (!data85)
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", procName, NULL);
        else
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
    }

    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
    if (!cid) {
        LEPT_FREE(datacomp);
        LEPT_FREE(data85);
        return (L_COMP_DATA *)ERROR_PTR("cid not made", procName, NULL);
    }

        /* Hand over whichever buffer is live; this avoids leaking
         * datacomp and reading an unset nbytes85 when the flag is
         * neither 0 nor 1. */
    if (data85) {  /* ascii85 */
        cid->data85 = data85;
        cid->nbytes85 = nbytes85;
    } else {
        cid->datacomp = datacomp;
    }
    cid->type = L_G4_ENCODE;
    cid->nbytescomp = nbytescomp;
    cid->w = w;
    cid->h = h;
    cid->bps = 1;
    cid->spp = 1;
    cid->minisblack = minisblack;
    cid->res = xres;
    return cid;
}


/*!
 * \brief   cidConvertToPdfData()
 *
 * \param[in]    cid       compressed image data
 * \param[in]    title     [optional] pdf title; can be NULL
 * \param[out]   pdata     output pdf data for image
 * \param[out]   pnbytes   size of output pdf data
 * \return  0 if OK, 1 on error
 *
 *
* Notes:
 *      (1) Caller must not destroy the cid.  It is absorbed in the
 *          lpd and destroyed by this function.
 *      (2) Ownership of the cid is taken on every path: it is also
 *          destroyed when this function returns an error.
 **/
l_ok
cidConvertToPdfData(L_COMP_DATA  *cid,
                    const char   *title,
                    l_uint8     **pdata,
                    size_t       *pnbytes)
{
l_int32      res, ret;
l_float32    wpt, hpt;
L_PDF_DATA  *lpd = NULL;

    PROCNAME("cidConvertToPdfData");

    if (!pdata || !pnbytes) {
        l_CIDataDestroy(&cid);  /* honor the ownership contract */
        return ERROR_INT("&data and &nbytes not both defined", procName, 1);
    }
    *pdata = NULL;
    *pnbytes = 0;
    if (!cid)
        return ERROR_INT("cid not defined", procName, 1);

        /* Get media box parameters, in pts */
    res = cid->res;
    if (res <= 0)
        res = DefaultInputRes;
    wpt = cid->w * 72. / res;
    hpt = cid->h * 72. / res;

        /* Set up the pdf data struct (lpd) */
    if ((lpd = pdfdataCreate(title)) == NULL) {
        l_CIDataDestroy(&cid);  /* not yet absorbed by an lpd */
        return ERROR_INT("lpd not made", procName, 1);
    }
    ptraAdd(lpd->cida, cid);  /* from here on, the lpd owns the cid */
    lpd->n++;
    ptaAddPt(lpd->xy, 0, 0);   /* xpt = ypt = 0 */
    ptaAddPt(lpd->wh, wpt, hpt);

        /* Generate the pdf string and destroy the lpd */
    ret = l_generatePdf(pdata, pnbytes, lpd);
    pdfdataDestroy(&lpd);
    if (ret)
        return ERROR_INT("pdf output not made", procName, 1);
    return 0;
}


/*!
 * \brief   l_CIDataDestroy()
 *
 * \param[in,out]   pcid     will be set to null before returning
 * \return  void
 *
 * Frees the four data buffers held by the cid (datacomp, data85,
 * cmapdata85, cmapdatahex) and then the cid itself.  A null *pcid
 * is a no-op; a null pcid only emits a warning.
 */
void
l_CIDataDestroy(L_COMP_DATA  **pcid)
{
L_COMP_DATA  *cid;

    PROCNAME("l_CIDataDestroy");

    if (pcid == NULL) {
        L_WARNING("ptr address is null!\n", procName);
        return;
    }
    if ((cid = *pcid) == NULL)
        return;

    if (cid->datacomp) LEPT_FREE(cid->datacomp);
    if (cid->data85) LEPT_FREE(cid->data85);
    if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85);
    if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex);
    LEPT_FREE(cid);
    *pcid = NULL;
    return;
}


/*---------------------------------------------------------------------*
 *         Helper functions for generating the output pdf string       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   l_generatePdf()
 *
 * \param[out]   pdata     pdf array
 * \param[out]   pnbytes   number of bytes in pdf array
 * \param[in]    lpd       all the required input image data
 * \return  0 if OK, 1 on error
 *
 *
* Notes:
 *      (1) On error, no data is returned.
 *      (2) The objects are:
 *            1: Catalog
 *            2: Info
 *            3: Pages
 *            4: Page
 *            5: Contents  (rendering command)
 *            6 to 6+n-1: n XObjects
 *            6+n to 6+n+m-1: m colormaps
 **/
static l_int32
l_generatePdf(l_uint8    **pdata,
              size_t      *pnbytes,
              L_PDF_DATA  *lpd)
{
    PROCNAME("l_generatePdf");

    if (!pdata)
        return ERROR_INT("&data not defined", procName, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", procName, 1);
    *pnbytes = 0;
    if (!lpd)
        return ERROR_INT("lpd not defined", procName, 1);

        /* Build the pieces in object order, then assemble them */
    generateFixedStringsPdf(lpd);
    generateMediaboxPdf(lpd);
    generatePageStringPdf(lpd);
    generateContentStringPdf(lpd);
    generatePreXStringsPdf(lpd);
    generateColormapStringsPdf(lpd);
    generateTrailerPdf(lpd);
    return generateOutputDataPdf(pdata, pnbytes, lpd);
}


/*
 * generateFixedStringsPdf()
 *
 *     Input:  lpd  (pdf data being assembled)
 *     Return: void
 *
 *     Fills in the pdf header (lpd->id), objects 1-3 (Catalog, Info,
 *     Pages) and the fixed post-datastream string, and appends the
 *     byte size of the header and of each of the three objects to
 *     lpd->objsize.
 */
static void
generateFixedStringsPdf(L_PDF_DATA  *lpd)
{
char     buf[L_SMALLBUF];
char    *version, *datestr;
SARRAY  *sa;

    PROCNAME("generateFixedStringsPdf");

        /* Accumulate data for the header and objects 1-3 */
    lpd->id = stringNew("%PDF-1.5\n");
    l_dnaAddNumber(lpd->objsize, strlen(lpd->id));

    lpd->obj1 = stringNew("1 0 obj\n"
                          "<<\n"
                          "/Type /Catalog\n"
                          "/Pages 3 0 R\n"
                          ">>\n"
                          "endobj\n");
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1));

    sa = sarrayCreate(0);
    sarrayAddString(sa, "2 0 obj\n"
                        "<<\n", L_COPY);
    if (var_WRITE_DATE_AND_VERSION) {
        datestr = l_getFormattedDate();
        snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr);
        sarrayAddString(sa, buf, L_COPY);
        LEPT_FREE(datestr);
        version = getLeptonicaVersion();
        snprintf(buf, sizeof(buf), "/Producer (leptonica: %s)\n", version);
        LEPT_FREE(version);
    } else {
        snprintf(buf, sizeof(buf), "/Producer (leptonica)\n");
    }
    sarrayAddString(sa, buf, L_COPY);
    if (lpd->title) {
        char *hexstr;
        if ((hexstr = generateEscapeString(lpd->title)) != NULL) {
                /* The escaped title grows with the title length, so it
                 * is added piecewise rather than formatted through the
                 * fixed-size buf, which would truncate a long title.
                 * The pieces are joined without separators below. */
            sarrayAddString(sa, "/Title ", L_COPY);
            sarrayAddString(sa, hexstr, L_COPY);
            sarrayAddString(sa, "\n", L_COPY);
        } else {
            L_ERROR("title string is not ascii\n", procName);
        }
        LEPT_FREE(hexstr);
    }
    sarrayAddString(sa, ">>\n"
                        "endobj\n", L_COPY);
    lpd->obj2 = sarrayToString(sa, 0);
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2));
    sarrayDestroy(&sa);

        /* An indirect object must be closed with "endobj".  The size
         * recorded just below is taken from the string itself, so it
         * stays consistent. */
    lpd->obj3 = stringNew("3 0 obj\n"
                          "<<\n"
                          "/Type /Pages\n"
                          "/Kids [ 4 0 R ]\n"
                          "/Count 1\n"
                          ">>\n"
                          "endobj\n");
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3));

        /* Do the post-datastream string */
    lpd->poststream = stringNew("\n"
                                "endstream\n"
                                "endobj\n");
    return;
}


/*!
 * \brief   generateEscapeString()
 *
 * \param[in]   str     input string
 * \return   hex escape string, or null on error
 *
 *
* Notes: * (1) If the input string is not ascii, returns null. * (2) This takes an input ascii string and generates a hex * ascii output string with 4 bytes out for each byte in. * The feff code at the beginning tells the pdf interpreter * that the data is to be interpreted as big-endian, 4 bytes * at a time. For ascii, the first two bytes are 0 and the * last two bytes are less than 0x80. **/ static char * generateEscapeString(const char *str) { char smallbuf[8]; char *buffer; l_int32 i, nchar, buflen; PROCNAME("generateEscapeString"); if (!str) return (char *)ERROR_PTR("str not defined", procName, NULL); nchar = strlen(str); for (i = 0; i < nchar; i++) { if (str[i] < 0) return (char *)ERROR_PTR("str not all ascii", procName, NULL); } buflen = 4 * nchar + 10; buffer = (char *)LEPT_CALLOC(buflen, sizeof(char)); stringCat(buffer, buflen, "
* Notes:
 *      (1) Only called from l_generatePdf().  On error, no data is returned.
 *      (2) The output pointer and size are set only after the whole
 *          array has been assembled, so an error return never leaves
 *          a partially filled array (or a nonzero size) in the outputs.
 **/
static l_int32
generateOutputDataPdf(l_uint8    **pdata,
                      size_t      *pnbytes,
                      L_PDF_DATA  *lpd)
{
char         *str;
l_uint8      *data;
l_int32       nimages, i, len;
l_int32      *sizes, *locs;
size_t        nbytes;
L_COMP_DATA  *cid;

    PROCNAME("generateOutputDataPdf");

    if (!pdata)
        return ERROR_INT("&data not defined", procName, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", procName, 1);
    *pnbytes = 0;
    if (!lpd)
        return ERROR_INT("lpd not defined", procName, 1);

        /* The trailer is the last piece; it starts at xrefloc */
    nbytes = lpd->xrefloc + strlen(lpd->trailer);
    if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL)
        return ERROR_INT("calloc fail for data", procName, 1);

    sizes = l_dnaGetIArray(lpd->objsize);
    locs = l_dnaGetIArray(lpd->objloc);
    if (!sizes || !locs) {
        LEPT_FREE(sizes);
        LEPT_FREE(locs);
        LEPT_FREE(data);
        return ERROR_INT("object size or location array not made",
                         procName, 1);
    }

        /* Header and the five fixed objects */
    memcpy(data, lpd->id, sizes[0]);
    memcpy(data + locs[1], lpd->obj1, sizes[1]);
    memcpy(data + locs[2], lpd->obj2, sizes[2]);
    memcpy(data + locs[3], lpd->obj3, sizes[3]);
    memcpy(data + locs[4], lpd->obj4, sizes[4]);
    memcpy(data + locs[5], lpd->obj5, sizes[5]);

        /* Each image has 3 parts: variable preamble, the compressed
         * data stream, and the fixed poststream. */
    nimages = lpd->n;
    for (i = 0; i < nimages; i++) {
        if ((cid = pdfdataGetCid(lpd, i)) == NULL) {  /* should not happen */
            LEPT_FREE(sizes);
            LEPT_FREE(locs);
            LEPT_FREE(data);
            return ERROR_INT("cid not found", procName, 1);
        }
        str = sarrayGetString(lpd->saprex, i, L_NOCOPY);
        len = strlen(str);
        memcpy(data + locs[6 + i], str, len);
        memcpy(data + locs[6 + i] + len,
               cid->datacomp, cid->nbytescomp);
        memcpy(data + locs[6 + i] + len + cid->nbytescomp,
               lpd->poststream, strlen(lpd->poststream));
    }

        /* Each colormap is simply a stored string */
    for (i = 0; i < lpd->ncmap; i++) {
        str = sarrayGetString(lpd->sacmap, i, L_NOCOPY);
        memcpy(data + locs[6 + nimages + i], str, strlen(str));
    }

        /* And finally the trailer */
    memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer));
    LEPT_FREE(sizes);
    LEPT_FREE(locs);

        /* Success: only now publish the result */
    *pdata = data;
    *pnbytes = nbytes;
    return 0;
}


/*---------------------------------------------------------------------*
 *          Helper functions for generating multipage pdf output       *
 *---------------------------------------------------------------------*/
/*!
* \brief parseTrailerPdf() * * \param[in] bas lba of a pdf file * \param[out] pda byte locations of the beginning of each object * \return 0 if OK, 1 on error */ static l_int32 parseTrailerPdf(L_BYTEA *bas, L_DNA **pda) { char *str; l_uint8 nl = '\n'; l_uint8 *data; l_int32 i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok; size_t size; L_DNA *da, *daobj, *daxref; SARRAY *sa; PROCNAME("parseTrailerPdf"); if (!pda) return ERROR_INT("&da not defined", procName, 1); *pda = NULL; if (!bas) return ERROR_INT("bas not defined", procName, 1); data = l_byteaGetData(bas, &size); if (memcmp(data, "%PDF-1.", 7) != 0) return ERROR_INT("PDF header signature not found", procName, 1); /* Search for "startxref" starting 50 bytes from the EOF */ start = 0; if (size > 50) start = size - 50; arrayFindSequence(data + start, size - start, (l_uint8 *)"startxref\n", 10, &loc, &found); if (!found) return ERROR_INT("startxref not found!", procName, 1); if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1) return ERROR_INT("xrefloc not found!", procName, 1); if (xrefloc < 0 || xrefloc >= size) return ERROR_INT("invalid xrefloc!", procName, 1); sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0); str = sarrayGetString(sa, 1, L_NOCOPY); if ((sscanf(str, "0 %d", &nobj)) != 1) { sarrayDestroy(&sa); return ERROR_INT("nobj not found", procName, 1); } /* Get starting locations. The numa index is the * object number. loc[0] is the ID; loc[nobj + 1] is xrefloc. 
*/ da = l_dnaCreate(nobj + 1); *pda = da; for (i = 0; i < nobj; i++) { str = sarrayGetString(sa, i + 2, L_NOCOPY); sscanf(str, "%d", &startloc); l_dnaAddNumber(da, startloc); } l_dnaAddNumber(da, xrefloc); #if DEBUG_MULTIPAGE fprintf(stderr, "************** Trailer string ************\n"); fprintf(stderr, "xrefloc = %d", xrefloc); sarrayWriteStream(stderr, sa); fprintf(stderr, "************** Object locations ************"); l_dnaWriteStream(stderr, da); #endif /* DEBUG_MULTIPAGE */ sarrayDestroy(&sa); /* Verify correct parsing */ trailer_ok = TRUE; for (i = 1; i < nobj; i++) { l_dnaGetIValue(da, i, &startloc); if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) { L_ERROR("bad trailer for object %d\n", procName, i); trailer_ok = FALSE; break; } } /* If the trailer is broken, reconstruct the correct obj locations */ if (!trailer_ok) { L_INFO("rebuilding pdf trailer\n", procName); l_dnaEmpty(da); l_dnaAddNumber(da, 0); l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj); nobj = l_dnaGetCount(daobj); for (i = 0; i < nobj; i++) { l_dnaGetIValue(daobj, i, &loc); for (j = loc - 1; j > 0; j--) { if (data[j] == nl) break; } l_dnaAddNumber(da, j + 1); } l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref); l_dnaGetIValue(daxref, 0, &loc); l_dnaAddNumber(da, loc); l_dnaDestroy(&daobj); l_dnaDestroy(&daxref); } return 0; } static char * generatePagesObjStringPdf(NUMA *napage) { char *str; char *buf; l_int32 i, n, index, bufsize; SARRAY *sa; PROCNAME("generatePagesObjStringPdf"); if (!napage) return (char *)ERROR_PTR("napage not defined", procName, NULL); n = numaGetCount(napage); bufsize = 100 + 16 * n; /* large enough to hold the output string */ buf = (char *)LEPT_CALLOC(bufsize, sizeof(char)); sa = sarrayCreate(n); for (i = 0; i < n; i++) { numaGetIValue(napage, i, &index); snprintf(buf, bufsize, " %d 0 R ", index); sarrayAddString(sa, buf, L_COPY); } str = sarrayToString(sa, 0); snprintf(buf, bufsize - 1, "3 0 obj\n" "<<\n" "/Type /Pages\n" 
"/Kids [%s]\n" "/Count %d\n" ">>\n", str, n); sarrayDestroy(&sa); LEPT_FREE(str); return buf; } /*! * \brief substituteObjectNumbers() * * \param[in] bas lba of a pdf object * \param[in] na_objs object number mapping array * \return bad lba of rewritten pdf for the object * *
* Notes:
 *      (1) Interpret the first set of bytes as the object number,
 *          map to the new number, and write it out.
 *      (2) Find all occurrences of this 4-byte sequence: " 0 R"
 *      (3) Find the location and value of the integer preceding this,
 *          and map it to the new value.
 *      (4) Rewrite the object with new object numbers.
 *      (5) Every object number read from the input is checked against
 *          the size of the mapping array before it is used as an index.
 *          If one is missing or out of range, an error is logged and
 *          the bytes rewritten so far are returned.
 **/
static L_BYTEA *
substituteObjectNumbers(L_BYTEA  *bas,
                        NUMA     *na_objs)
{
l_uint8   space = ' ';
l_uint8  *datas;
l_uint8   buf[32];  /* only needs to hold one integer in ascii format */
l_int32   start = 0;
l_int32   nrepl, i, j, nobjs, objin, objout, found;
l_int32  *objs, *matches;
size_t    size;
L_BYTEA  *bad;
L_DNA    *da_match;

    PROCNAME("substituteObjectNumbers");

    if (!bas)
        return (L_BYTEA *)ERROR_PTR("bas not defined", procName, NULL);
    if (!na_objs)
        return (L_BYTEA *)ERROR_PTR("na_objs not defined", procName, NULL);

    datas = l_byteaGetData(bas, &size);
    bad = l_byteaCreate(100);
    objs = numaGetIArray(na_objs);  /* object number mapper */
    nobjs = numaGetCount(na_objs);
    if (!datas || !objs) {
        L_ERROR("input data or mapping array not available\n", procName);
        LEPT_FREE(objs);
        return bad;
    }

        /* Substitute the object number on the first line */
    objin = -1;
    if (sscanf((char *)datas, "%d", &objin) != 1 ||
        objin < 0 || objin >= nobjs) {
        L_ERROR("object number %d not in mapping array of size %d\n",
                procName, objin, nobjs);
        LEPT_FREE(objs);
        return bad;
    }
    objout = objs[objin];
    snprintf((char *)buf, 32, "%d", objout);
    l_byteaAppendString(bad, (char *)buf);

        /* Find the set of matching locations for object references */
    arrayFindSequence(datas, size, &space, 1, &start, &found);
    da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4);
    if (!da_match) {
        l_byteaAppendData(bad, datas + start, size - start);
        LEPT_FREE(objs);
        return bad;
    }

        /* Substitute all the object reference numbers */
    nrepl = l_dnaGetCount(da_match);
    matches = l_dnaGetIArray(da_match);
    for (i = 0; i < nrepl; i++) {
            /* Find the first space before the object number */
        for (j = matches[i] - 1; j > 0; j--) {
            if (datas[j] == space)
                break;
        }
            /* Copy bytes from 'start' up to the object number */
        l_byteaAppendData(bad, datas + start, j - start + 1);
        objin = -1;
        if (sscanf((char *)(datas + j + 1), "%d", &objin) != 1 ||
            objin < 0 || objin >= nobjs) {
            L_ERROR("object number %d not in mapping array of size %d\n",
                    procName, objin, nobjs);
            LEPT_FREE(objs);
            LEPT_FREE(matches);
            l_dnaDestroy(&da_match);
            return bad;
        }
        objout = objs[objin];
        snprintf((char *)buf, 32, "%d", objout);
        l_byteaAppendString(bad, (char *)buf);
        start = matches[i];
    }
    l_byteaAppendData(bad, datas + start, size - start);

    LEPT_FREE(objs);
    LEPT_FREE(matches);
    l_dnaDestroy(&da_match);
    return bad;
}


/*---------------------------------------------------------------------*
 *
Create/destroy/access pdf data * *---------------------------------------------------------------------*/ static L_PDF_DATA * pdfdataCreate(const char *title) { L_PDF_DATA *lpd; lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA)); if (title) lpd->title = stringNew(title); lpd->cida = ptraCreate(10); lpd->xy = ptaCreate(10); lpd->wh = ptaCreate(10); lpd->saprex = sarrayCreate(10); lpd->sacmap = sarrayCreate(10); lpd->objsize = l_dnaCreate(20); lpd->objloc = l_dnaCreate(20); return lpd; } static void pdfdataDestroy(L_PDF_DATA **plpd) { l_int32 i; L_COMP_DATA *cid; L_PDF_DATA *lpd; PROCNAME("pdfdataDestroy"); if (plpd== NULL) { L_WARNING("ptr address is null!\n", procName); return; } if ((lpd = *plpd) == NULL) return; if (lpd->title) LEPT_FREE(lpd->title); for (i = 0; i < lpd->n; i++) { cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION); l_CIDataDestroy(&cid); } ptraDestroy(&lpd->cida, 0, 0); if (lpd->id) LEPT_FREE(lpd->id); if (lpd->obj1) LEPT_FREE(lpd->obj1); if (lpd->obj2) LEPT_FREE(lpd->obj2); if (lpd->obj3) LEPT_FREE(lpd->obj3); if (lpd->obj4) LEPT_FREE(lpd->obj4); if (lpd->obj5) LEPT_FREE(lpd->obj5); if (lpd->poststream) LEPT_FREE(lpd->poststream); if (lpd->trailer) LEPT_FREE(lpd->trailer); if (lpd->xy) ptaDestroy(&lpd->xy); if (lpd->wh) ptaDestroy(&lpd->wh); if (lpd->mediabox) boxDestroy(&lpd->mediabox); if (lpd->saprex) sarrayDestroy(&lpd->saprex); if (lpd->sacmap) sarrayDestroy(&lpd->sacmap); if (lpd->objsize) l_dnaDestroy(&lpd->objsize); if (lpd->objloc) l_dnaDestroy(&lpd->objloc); LEPT_FREE(lpd); *plpd = NULL; return; } static L_COMP_DATA * pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index) { PROCNAME("pdfdataGetCid"); if (!lpd) return (L_COMP_DATA *)ERROR_PTR("lpd not defined", procName, NULL); if (index < 0 || index >= lpd->n) return (L_COMP_DATA *)ERROR_PTR("invalid image index", procName, NULL); return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index); } /*---------------------------------------------------------------------* * Set flags for 
special modes * *---------------------------------------------------------------------*/ /*! * \brief l_pdfSetG4ImageMask() * * \param[in] flag 1 for writing g4 data as fg only through a mask; * 0 for writing fg and bg * \return void * *
* Notes:
 *      (1) The default is for writing only the fg (through the mask).
 *          That way when you write a 1 bpp image, the bg is transparent,
 *          so any previously written image remains visible behind it.
 **/
void
l_pdfSetG4ImageMask(l_int32  flag)
{
        /* Stored as given; the value is not normalized to 0 or 1 */
    var_WRITE_G4_IMAGE_MASK = flag;
}


/*!
 * \brief   l_pdfSetDateAndVersion()
 *
 * \param[in]    flag    1 for writing date/time and leptonica version;
 *                       0 for omitting this from the metadata
 * \return  void
 *
 *
* Notes: * (1) The default is for writing this data. For regression tests * that compare output against golden files, it is useful to omit. **/ void l_pdfSetDateAndVersion(l_int32 flag) { var_WRITE_DATE_AND_VERSION = flag; } /* --------------------------------------------*/ #endif /* USE_PDFIO */ /* --------------------------------------------*/