/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file ptafunc2.c *

 *
 *      --------------------------------------
 *      This file has these Pta utilities:
 *         - sorting
 *         - ordered set operations
 *         - hash map operations
 *      --------------------------------------
 *
 *      Sorting
 *           PTA        *ptaSort()
 *           l_int32     ptaGetSortIndex()
 *           PTA        *ptaSortByIndex()
 *           PTAA       *ptaaSortByIndex()
 *           l_int32     ptaGetRankValue()
 *
 *      Set operations using aset (rbtree)
 *           PTA        *ptaUnionByAset()
 *           PTA        *ptaRemoveDupsByAset()
 *           PTA        *ptaIntersectionByAset()
 *           L_ASET     *l_asetCreateFromPta()
 *
 *      Set operations using hashing (dnahash)
 *           PTA        *ptaUnionByHash()
 *           l_int32     ptaRemoveDupsByHash()
 *           PTA        *ptaIntersectionByHash();
 *           l_int32     ptaFindPtByHash()
 *           L_DNAHASH  *l_dnaHashCreateFromPta()
 *
 *
 * We have two implementations of set operations on an array of points:
 *
 *   (1) Using an underlying tree (rbtree)
 *       This uses a good 64 bit hashing function for the key,
 *       that is not expected to have hash collisions (and we do
 *       not test for them).  The tree is built up of the hash
 *       values, and if the hash is found in the tree, it is
 *       assumed that the point has already been found.
 *
 *   (2) Using an underlying hashing of the keys (dnahash)
 *       This uses a fast 64 bit hashing function for the key,
 *       which is then hashed into a bucket (a dna in a dnaHash).
 *       Because hash collisions can occur, the index into the
 *       pta for the point that gave rise to that key is stored,
 *       and the dna (bucket) is traversed, using the stored indices
 *       to determine if that point had already been seen.
 *
 *

*/ #include "allheaders.h" /*---------------------------------------------------------------------* * Sorting * *---------------------------------------------------------------------*/ /*! * \brief ptaSort() * * \param[in] ptas * \param[in] sorttype L_SORT_BY_X, L_SORT_BY_Y * \param[in] sortorder L_SORT_INCREASING, L_SORT_DECREASING * \param[out] pnaindex [optional] index of sorted order into * original array * \return ptad sorted version of ptas, or NULL on error */ PTA * ptaSort(PTA *ptas, l_int32 sorttype, l_int32 sortorder, NUMA **pnaindex) { PTA *ptad; NUMA *naindex; PROCNAME("ptaSort"); if (pnaindex) *pnaindex = NULL; if (!ptas) return (PTA *)ERROR_PTR("ptas not defined", procName, NULL); if (sorttype != L_SORT_BY_X && sorttype != L_SORT_BY_Y) return (PTA *)ERROR_PTR("invalid sort type", procName, NULL); if (sortorder != L_SORT_INCREASING && sortorder != L_SORT_DECREASING) return (PTA *)ERROR_PTR("invalid sort order", procName, NULL); if (ptaGetSortIndex(ptas, sorttype, sortorder, &naindex) != 0) return (PTA *)ERROR_PTR("naindex not made", procName, NULL); ptad = ptaSortByIndex(ptas, naindex); if (pnaindex) *pnaindex = naindex; else numaDestroy(&naindex); if (!ptad) return (PTA *)ERROR_PTR("ptad not made", procName, NULL); return ptad; } /*! * \brief ptaGetSortIndex() * * \param[in] ptas * \param[in] sorttype L_SORT_BY_X, L_SORT_BY_Y * \param[in] sortorder L_SORT_INCREASING, L_SORT_DECREASING * \param[out] pnaindex index of sorted order into original array * \return 0 if OK, 1 on error */ l_ok ptaGetSortIndex(PTA *ptas, l_int32 sorttype, l_int32 sortorder, NUMA **pnaindex) { l_int32 i, n; l_float32 x, y; NUMA *na; PROCNAME("ptaGetSortIndex"); if (!pnaindex) return ERROR_INT("&naindex not defined", procName, 1); *pnaindex = NULL; if (!ptas) return ERROR_INT("ptas not defined", procName, 1); if (sorttype != L_SORT_BY_X && sorttype != L_SORT_BY_Y) return ERROR_INT("invalid sort type", procName, 1); if (sortorder != L_SORT_INCREASING && sortorder != L_SORT_DECREASING) return ERROR_INT("invalid sort order", procName, 1); /* Build up numa of specific data */ n = ptaGetCount(ptas); if ((na = numaCreate(n)) == NULL) return ERROR_INT("na not made", procName, 1); for (i = 0; i < n; i++) { ptaGetPt(ptas, i, &x, &y); if (sorttype == L_SORT_BY_X) numaAddNumber(na, x); else numaAddNumber(na, y); } /* Get the sort index for data array */ *pnaindex = numaGetSortIndex(na, sortorder); numaDestroy(&na); if (!*pnaindex) return ERROR_INT("naindex not made", procName, 1); return 0; } /*! * \brief ptaSortByIndex() * * \param[in] ptas * \param[in] naindex na that maps from the new pta to the input pta * \return ptad sorted, or NULL on error */ PTA * ptaSortByIndex(PTA *ptas, NUMA *naindex) { l_int32 i, index, n; l_float32 x, y; PTA *ptad; PROCNAME("ptaSortByIndex"); if (!ptas) return (PTA *)ERROR_PTR("ptas not defined", procName, NULL); if (!naindex) return (PTA *)ERROR_PTR("naindex not defined", procName, NULL); /* Build up sorted pta using sort index */ n = numaGetCount(naindex); if ((ptad = ptaCreate(n)) == NULL) return (PTA *)ERROR_PTR("ptad not made", procName, NULL); for (i = 0; i < n; i++) { numaGetIValue(naindex, i, &index); ptaGetPt(ptas, index, &x, &y); ptaAddPt(ptad, x, y); } return ptad; } /*! * \brief ptaaSortByIndex() * * \param[in] ptaas * \param[in] naindex na that maps from the new ptaa to the input ptaa * \return ptaad sorted, or NULL on error */ PTAA * ptaaSortByIndex(PTAA *ptaas, NUMA *naindex) { l_int32 i, n, index; PTA *pta; PTAA *ptaad; PROCNAME("ptaaSortByIndex"); if (!ptaas) return (PTAA *)ERROR_PTR("ptaas not defined", procName, NULL); if (!naindex) return (PTAA *)ERROR_PTR("naindex not defined", procName, NULL); n = ptaaGetCount(ptaas); if (numaGetCount(naindex) != n) return (PTAA *)ERROR_PTR("numa and ptaa sizes differ", procName, NULL); ptaad = ptaaCreate(n); for (i = 0; i < n; i++) { numaGetIValue(naindex, i, &index); pta = ptaaGetPta(ptaas, index, L_COPY); ptaaAddPta(ptaad, pta, L_INSERT); } return ptaad; } /*! * \brief ptaGetRankValue() * * \param[in] pta * \param[in] fract use 0.0 for smallest, 1.0 for largest * \param[in] ptasort [optional] version of %pta sorted by %sorttype * \param[in] sorttype L_SORT_BY_X, L_SORT_BY_Y * \param[out] pval rankval: the x or y value at %fract * \return 0 if OK, 1 on error */ l_ok ptaGetRankValue(PTA *pta, l_float32 fract, PTA *ptasort, l_int32 sorttype, l_float32 *pval) { l_int32 index, n; PTA *ptas; PROCNAME("ptaGetRankValue"); if (!pval) return ERROR_INT("&val not defined", procName, 1); *pval = 0.0; if (!pta) return ERROR_INT("pta not defined", procName, 1); if (sorttype != L_SORT_BY_X && sorttype != L_SORT_BY_Y) return ERROR_INT("invalid sort type", procName, 1); if (fract < 0.0 || fract > 1.0) return ERROR_INT("fract not in [0.0 ... 1.0]", procName, 1); if ((n = ptaGetCount(pta)) == 0) return ERROR_INT("pta empty", procName, 1); if (ptasort) ptas = ptasort; else ptas = ptaSort(pta, sorttype, L_SORT_INCREASING, NULL); index = (l_int32)(fract * (l_float32)(n - 1) + 0.5); if (sorttype == L_SORT_BY_X) ptaGetPt(ptas, index, pval, NULL); else /* sort by y */ ptaGetPt(ptas, index, NULL, pval); if (!ptasort) ptaDestroy(&ptas); return 0; } /*---------------------------------------------------------------------* * Set operations using aset (rbtree) * *---------------------------------------------------------------------*/ /*! * \brief ptaUnionByAset() * * \param[in] pta1, pta2 * \return ptad with the union of the set of points, or NULL on error * *

 * Notes:
 *      (1) See sarrayRemoveDupsByAset() for the approach.
 *      (2) The key is a 64-bit hash from the (x,y) pair.
 *      (3) This is slower than ptaUnionByHash(), mostly because of the
 *          nlogn sort to build up the rbtree.  Do not use for large
 *          numbers of points (say, > 1M).
 *      (4) The *Aset() functions use the sorted l_Aset, which is just
 *          an rbtree in disguise.
 *

*/ PTA * ptaUnionByAset(PTA *pta1, PTA *pta2) { PTA *pta3, *ptad; PROCNAME("ptaUnionByAset"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Join */ pta3 = ptaCopy(pta1); ptaJoin(pta3, pta2, 0, -1); /* Eliminate duplicates */ ptad = ptaRemoveDupsByAset(pta3); ptaDestroy(&pta3); return ptad; } /*! * \brief ptaRemoveDupsByAset() * * \param[in] ptas assumed to be integer values * \return ptad with duplicates removed, or NULL on error * *

 * Notes:
 *      (1) This is slower than ptaRemoveDupsByHash(), mostly because
 *          of the nlogn sort to build up the rbtree.  Do not use for
 *          large numbers of points (say, > 1M).
 *

*/ PTA * ptaRemoveDupsByAset(PTA *ptas) { l_int32 i, n, x, y; PTA *ptad; l_uint64 hash; L_ASET *set; RB_TYPE key; PROCNAME("ptaRemoveDupsByAset"); if (!ptas) return (PTA *)ERROR_PTR("ptas not defined", procName, NULL); set = l_asetCreate(L_UINT_TYPE); n = ptaGetCount(ptas); ptad = ptaCreate(n); for (i = 0; i < n; i++) { ptaGetIPt(ptas, i, &x, &y); l_hashPtToUint64(x, y, &hash); key.utype = hash; if (!l_asetFind(set, key)) { ptaAddPt(ptad, x, y); l_asetInsert(set, key); } } l_asetDestroy(&set); return ptad; } /*! * \brief ptaIntersectionByAset() * * \param[in] pta1, pta2 * \return ptad intersection of the point sets, or NULL on error * *

 * Notes:
 *      (1) See sarrayIntersectionByAset() for the approach.
 *      (2) The key is a 64-bit hash from the (x,y) pair.
 *      (3) This is slower than ptaIntersectionByHash(), mostly because
 *          of the nlogn sort to build up the rbtree.  Do not use for
 *          large numbers of points (say, > 1M).
 *

*/ PTA * ptaIntersectionByAset(PTA *pta1, PTA *pta2) { l_int32 n1, n2, i, n, x, y; l_uint64 hash; L_ASET *set1, *set2; RB_TYPE key; PTA *pta_small, *pta_big, *ptad; PROCNAME("ptaIntersectionByAset"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Put the elements of the biggest array into a set */ n1 = ptaGetCount(pta1); n2 = ptaGetCount(pta2); pta_small = (n1 < n2) ? pta1 : pta2; /* do not destroy pta_small */ pta_big = (n1 < n2) ? pta2 : pta1; /* do not destroy pta_big */ set1 = l_asetCreateFromPta(pta_big); /* Build up the intersection of points */ ptad = ptaCreate(0); n = ptaGetCount(pta_small); set2 = l_asetCreate(L_UINT_TYPE); for (i = 0; i < n; i++) { ptaGetIPt(pta_small, i, &x, &y); l_hashPtToUint64(x, y, &hash); key.utype = hash; if (l_asetFind(set1, key) && !l_asetFind(set2, key)) { ptaAddPt(ptad, x, y); l_asetInsert(set2, key); } } l_asetDestroy(&set1); l_asetDestroy(&set2); return ptad; } /*! * \brief l_asetCreateFromPta() * * \param[in] pta * \return set using a 64-bit hash of (x,y) as the key */ L_ASET * l_asetCreateFromPta(PTA *pta) { l_int32 i, n, x, y; l_uint64 hash; L_ASET *set; RB_TYPE key; PROCNAME("l_asetCreateFromPta"); if (!pta) return (L_ASET *)ERROR_PTR("pta not defined", procName, NULL); set = l_asetCreate(L_UINT_TYPE); n = ptaGetCount(pta); for (i = 0; i < n; i++) { ptaGetIPt(pta, i, &x, &y); l_hashPtToUint64(x, y, &hash); key.utype = hash; l_asetInsert(set, key); } return set; } /*---------------------------------------------------------------------* * Set operations using hashing (rbtree) * *---------------------------------------------------------------------*/ /*! * \brief ptaUnionByHash() * * \param[in] pta1, pta2 * \return ptad with the union of the set of points, or NULL on error * *

 * Notes:
 *      (1) This is faster than ptaUnionByAset(), because the
 *          bucket lookup is O(n).  It should be used if the pts are
 *          integers (e.g., representing pixel positions).
 *

*/ PTA * ptaUnionByHash(PTA *pta1, PTA *pta2) { PTA *pta3, *ptad; PROCNAME("ptaUnionByHash"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Join */ pta3 = ptaCopy(pta1); ptaJoin(pta3, pta2, 0, -1); /* Eliminate duplicates */ ptaRemoveDupsByHash(pta3, &ptad, NULL); ptaDestroy(&pta3); return ptad; } /*! * \brief ptaRemoveDupsByHash() * * \param[in] ptas assumed to be integer values * \param[out] pptad unique set of pts; duplicates removed * \param[out] pdahash [optional] dnahash used for lookup * \return 0 if OK, 1 on error * *

 * Notes:
 *      (1) Generates a pta with unique values.
 *      (2) The dnahash is built up with ptad to assure uniqueness.
 *          It can be used to find if a point is in the set:
 *              ptaFindPtByHash(ptad, dahash, x, y, &index)
 *      (3) The hash of the (x,y) location is simple and fast.  It scales
 *          up with the number of buckets to insure a fairly random
 *          bucket selection for adjacent points.
 *      (4) A Dna is used rather than a Numa because we need accurate
 *          representation of 32-bit integers that are indices into ptas.
 *          Integer --> float --> integer conversion makes errors for
 *          integers larger than 10M.
 *      (5) This is faster than ptaRemoveDupsByAset(), because the
 *          bucket lookup is O(n), although there is a double-loop
 *          lookup within the dna in each bucket.
 *

*/ l_ok ptaRemoveDupsByHash(PTA *ptas, PTA **pptad, L_DNAHASH **pdahash) { l_int32 i, n, index, items, x, y; l_uint32 nsize; l_uint64 key; PTA *ptad; L_DNAHASH *dahash; PROCNAME("ptaRemoveDupsByHash"); if (pdahash) *pdahash = NULL; if (!pptad) return ERROR_INT("&ptad not defined", procName, 1); *pptad = NULL; if (!ptas) return ERROR_INT("ptas not defined", procName, 1); n = ptaGetCount(ptas); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ dahash = l_dnaHashCreate(nsize, 8); ptad = ptaCreate(n); *pptad = ptad; for (i = 0, items = 0; i < n; i++) { ptaGetIPt(ptas, i, &x, &y); ptaFindPtByHash(ptad, dahash, x, y, &index); if (index < 0) { /* not found */ l_hashPtToUint64(x, y, &key); l_dnaHashAdd(dahash, key, (l_float64)items); ptaAddPt(ptad, x, y); items++; } } if (pdahash) *pdahash = dahash; else l_dnaHashDestroy(&dahash); return 0; } /*! * \brief ptaIntersectionByHash() * * \param[in] pta1, pta2 * \return ptad intersection of the point sets, or NULL on error * *

 * Notes:
 *      (1) This is faster than ptaIntersectionByAset(), because the
 *          bucket lookup is O(n).  It should be used if the pts are
 *          integers (e.g., representing pixel positions).
 *

*/ PTA * ptaIntersectionByHash(PTA *pta1, PTA *pta2) { l_int32 n1, n2, nsmall, i, x, y, index1, index2; l_uint32 nsize2; l_uint64 key; L_DNAHASH *dahash1, *dahash2; PTA *pta_small, *pta_big, *ptad; PROCNAME("ptaIntersectionByHash"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Put the elements of the biggest pta into a dnahash */ n1 = ptaGetCount(pta1); n2 = ptaGetCount(pta2); pta_small = (n1 < n2) ? pta1 : pta2; /* do not destroy pta_small */ pta_big = (n1 < n2) ? pta2 : pta1; /* do not destroy pta_big */ dahash1 = l_dnaHashCreateFromPta(pta_big); /* Build up the intersection of points. Add to ptad * if the point is in pta_big (using dahash1) but hasn't * yet been seen in the traversal of pta_small (using dahash2). */ ptad = ptaCreate(0); nsmall = ptaGetCount(pta_small); findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */ dahash2 = l_dnaHashCreate(nsize2, 0); for (i = 0; i < nsmall; i++) { ptaGetIPt(pta_small, i, &x, &y); ptaFindPtByHash(pta_big, dahash1, x, y, &index1); if (index1 >= 0) { /* found */ ptaFindPtByHash(pta_small, dahash2, x, y, &index2); if (index2 == -1) { /* not found */ ptaAddPt(ptad, x, y); l_hashPtToUint64(x, y, &key); l_dnaHashAdd(dahash2, key, (l_float64)i); } } } l_dnaHashDestroy(&dahash1); l_dnaHashDestroy(&dahash2); return ptad; } /*! * \brief ptaFindPtByHash() * * \param[in] pta * \param[in] dahash built from pta * \param[in] x, y arbitrary points * \param[out] pindex index into pta if (x,y) is in pta; -1 otherwise * \return 0 if OK, 1 on error * *

 * Notes:
 *      (1) Fast lookup in dnaHash associated with a pta, to see if a
 *          random point (x,y) is already stored in the hash table.
 *      (2) We use a strong hash function to minimize the chance that
 *          two different points hash to the same key value.
 *      (3) We select the number of buckets to be about 5% of the size
 *          of the input %pta, so that when fully populated, each
 *          bucket (dna) will have about 20 entries, each being an index
 *          into %pta.  In lookup, after hashing to the key, and then
 *          again to the bucket, we traverse the bucket (dna), using the
 *          index into %pta to check if the point (x,y) has been found before.
 *

*/ l_ok ptaFindPtByHash(PTA *pta, L_DNAHASH *dahash, l_int32 x, l_int32 y, l_int32 *pindex) { l_int32 i, nvals, index, xi, yi; l_uint64 key; L_DNA *da; PROCNAME("ptaFindPtByHash"); if (!pindex) return ERROR_INT("&index not defined", procName, 1); *pindex = -1; if (!pta) return ERROR_INT("pta not defined", procName, 1); if (!dahash) return ERROR_INT("dahash not defined", procName, 1); l_hashPtToUint64(x, y, &key); da = l_dnaHashGetDna(dahash, key, L_NOCOPY); if (!da) return 0; /* Run through the da, looking for this point */ nvals = l_dnaGetCount(da); for (i = 0; i < nvals; i++) { l_dnaGetIValue(da, i, &index); ptaGetIPt(pta, index, &xi, &yi); if (x == xi && y == yi) { *pindex = index; return 0; } } return 0; } /*! * \brief l_dnaHashCreateFromPta() * * \param[in] pta * \return dahash, or NULL on error */ L_DNAHASH * l_dnaHashCreateFromPta(PTA *pta) { l_int32 i, n, x, y; l_uint32 nsize; l_uint64 key; L_DNAHASH *dahash; PROCNAME("l_dnaHashCreateFromPta"); if (!pta) return (L_DNAHASH *)ERROR_PTR("pta not defined", procName, NULL); /* Build up dnaHash of indices, hashed by a key that is * a large linear combination of x and y values designed to * randomize the key. Having about 20 pts in each bucket is * roughly optimal for speed for large sets. */ n = ptaGetCount(pta); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ /* fprintf(stderr, "Prime used: %d\n", nsize); */ /* Add each point, using the hash as key and the index into * %ptas as the value. Storing the index enables operations * that check for duplicates. */ dahash = l_dnaHashCreate(nsize, 8); for (i = 0; i < n; i++) { ptaGetIPt(pta, i, &x, &y); l_hashPtToUint64(x, y, &key); l_dnaHashAdd(dahash, key, (l_float64)i); } return dahash; }