pranavjha/text-detector

View on GitHub
third-party/leptonica/src/recog.h

Summary

Maintainability
Test Coverage
/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

#ifndef  LEPTONICA_RECOG_H
#define  LEPTONICA_RECOG_H

/*
 *  recog.h
 *
 *     A simple utility for training and recognizing individual
 *     machine-printed text characters.  In an application, one can
 *     envision using a number of these, one for each trained set.
 *
 *     In training mode, a set of labelled bitmaps is presented, either
 *     one at a time, or in a directory, or in a pixa.  If in a directory,
 *     or a pixa, the labelling text string must be embedded in the
 *     text field of the image file.
 *
 *     Any number of recognizers (L_Recog) can be trained and then used
 *     together in an array (L_Recoga).  All these trained structures
 *     can be serialized to file and read back.  The serialized version
 *     holds all the bitmaps used for training, plus, for arbitrary
 *     character sets, the UTF8 representation and the lookup table
 *     mapping from the character representation to index.
 *
 *     There are three levels of "sets" here:
 *
 *       (1) Example set: the examples representing a character that
 *           were printed in the same way, so that they can be combined
 *           without scaling to form an "average" template for the character.
 *           In the recognition phase, we use either this aligned average,
 *           or the individual bitmaps.  All examples in the set are given
 *           the same character label.   Example: the letter 'a' in the
 *           predominant font in a book.
 *
 *       (2) Character set (represented by L_Recog, a single recognizer):
 *           The set of different characters, each of which is described
 *           by (1).  Each element of the set has a different character
 *           label.  Example: the digits '0' through '9' that are used for
 *           page numbering in a book.
 *
 *       (3) Recognizer set (represented by L_Recoga, an array of recogs):
 *           A set of recognizers, each of which is described by (2).
 *           In general, we do not want to combine the character sets
 *           with the same labels within different recognizer sets,
 *           because the bitmaps can differ in font type, style or size.
 *           Example 1: the letter 'a' can be printed in two very different
 *           ways (either with a large loop or with a smaller loop in
 *           the lower half); both share the same label but need to be
 *           distinguished so that they are not mixed when averaging.
 *           Example 2: a recognizer trained for a book may be missing
 *           some characters, so we need to supplement it with another
 *           "generic" or "bootstrap" recognizer that has the additional
 *           characters from a variety of sources.  Bootstrap recognizers
 *           must be run in a mode where all characters are scaled.
 *
 *     In the recognition process, for each component in an input image,
 *     each recognizer (L_Recog) records the best match (highest
 *     correlation score).  If there is more than one recognizer, these
 *     results are aggregated to find the best match for each character
 *     across all the recognizers, and this is stored in L_Recoga.
 */

/* Version number for the recog structures; presumably the value written
 * to and checked in serialized recog files -- TODO confirm against the
 * serialization code, which is not visible from this header. */
#define  RECOG_VERSION_NUMBER      1

/*
 *  Array of recognizers (a "recognizer set").  Holds pointers to the
 *  individual L_Recog structs, along with the array of best-matching
 *  characters.
 */
struct L_Recoga {
    l_int32              n;      /* number of recogs                         */
    l_int32              nalloc; /* number of recog ptrs allocated           */
    struct L_Recog     **recog;  /* recog ptr array                          */
    struct L_Rcha       *rcha;   /* stores the array of best chars           */
};
typedef struct L_Recoga L_RECOGA;


/*
 *  A single recognizer (a "character set").  Holds the scaling and
 *  matching parameters, the bootstrap locations, the labelled example
 *  bitmaps in both unscaled (*_u) and scaled form together with their
 *  averages, centroids and areas, debug images, and the temporary data
 *  used while identifying characters.  A recog may belong to a parent
 *  L_Recoga, in which case 'index' gives its position in that array.
 */
struct L_Recog {
    l_int32        scalew;       /* scale all examples to this width;        */
                                 /* use 0 to prevent horizontal scaling      */
    l_int32        scaleh;       /* scale all examples to this height;       */
                                 /* use 0 to prevent vertical scaling        */
    l_int32        templ_type;   /* template type: either an average of      */
                                 /* examples (L_USE_AVERAGE) or the set      */
                                 /* of all examples (L_USE_ALL)              */
    l_int32        maxarraysize; /* initialize container arrays to this      */
    l_int32        setsize;      /* size of character set                    */
    l_int32        threshold;    /* for binarizing if depth > 1              */
    l_int32        maxyshift;    /* vertical jiggle on nominal centroid      */
                                 /* alignment; typically 0 or 1              */
    l_float32      asperity_fr;  /* +- allowed fractional asperity ratio     */
    l_int32        charset_type; /* one of L_ARABIC_NUMERALS, etc.           */
    l_int32        charset_size; /* expected number of classes in charset    */
    char          *bootdir;      /* dir with bootstrap pixa charsets         */
    char          *bootpattern;  /* file pattern for bootstrap pixa charsets */
    char          *bootpath;     /* path for single bootstrap pixa charset   */
    l_int32        min_nopad;    /* min number of samples without padding    */
    l_int32        max_afterpad; /* max number of samples after padding      */
    l_int32        samplenum;    /* keep track of number of training samples */
    l_int32        minwidth_u;   /* min width of averaged unscaled templates */
    l_int32        maxwidth_u;   /* max width of averaged unscaled templates */
    l_int32        minheight_u;  /* min height of averaged unscaled templates */
    l_int32        maxheight_u;  /* max height of averaged unscaled templates */
    l_int32        minwidth;     /* min width of averaged scaled templates   */
    l_int32        maxwidth;     /* max width of averaged scaled templates   */
    l_int32        ave_done;     /* set to 1 when averaged bitmaps are made  */
    l_int32        train_done;   /* set to 1 when training is complete or    */
                                 /* identification has started               */
    l_int32        min_splitw;   /* min component width kept in splitting    */
    l_int32        min_splith;   /* min component height kept in splitting   */
    l_int32        max_splith;   /* max component height kept in splitting   */
    struct Sarray *sa_text;      /* text array for arbitrary char set        */
    struct L_Dna  *dna_tochar;   /* index-to-char lut for arbitrary char set */
    l_int32       *centtab;      /* table for finding centroids              */
    l_int32       *sumtab;       /* table for finding pixel sums             */
    char          *fname;        /* serialized filename (if read)            */
    struct Pixaa  *pixaa_u;      /* all unscaled bitmaps for each class      */
    struct Pixa   *pixa_u;       /* averaged unscaled bitmaps for each class */
    struct Ptaa   *ptaa_u;       /* centroids of all unscaled bitmaps        */
    struct Pta    *pta_u;        /* centroids of unscaled averaged bitmaps   */
    struct Numaa  *naasum_u;     /* area of all unscaled bitmap examples     */
    struct Numa   *nasum_u;      /* area of unscaled averaged bitmaps        */
    struct Pixaa  *pixaa;        /* all bitmap examples for each class       */
    struct Pixa   *pixa;         /* averaged bitmaps for each class          */
    struct Ptaa   *ptaa;         /* centroids of all bitmap examples         */
    struct Pta    *pta;          /* centroids of averaged bitmaps            */
    struct Numaa  *naasum;       /* area of all bitmap examples              */
    struct Numa   *nasum;        /* area of averaged bitmaps                 */
    struct Pixa   *pixa_tr;      /* input training images                    */
    struct Pixa   *pixadb_ave;   /* unscaled and scaled averaged bitmaps     */
    struct Pixa   *pixa_id;      /* input images for identifying             */
    struct Pix    *pixdb_ave;    /* debug: best match of input against ave.  */
    struct Pix    *pixdb_range;  /* debug: best matches within range         */
    struct Pixa   *pixadb_boot;  /* debug: bootstrap training results        */
    struct Pixa   *pixadb_split; /* debug: splitting results                 */
    struct L_Bmf  *bmf;          /* bmf fonts                                */
    l_int32        bmf_size;     /* font size of bmf; default is 6 pt        */
    struct L_Rdid *did;          /* temp data used for image decoding        */
    struct L_Rch  *rch;          /* temp data used for holding best char     */
    struct L_Rcha *rcha;         /* temp data used for array of best chars   */
    l_int32        bootrecog;    /* 1 if using bootstrap samples; else 0     */
    l_int32        index;        /* recog index in recoga; -1 if no parent   */
    struct L_Recoga  *parent;    /* ptr to parent array; can be null         */

};
typedef struct L_Recog L_RECOG;

/*
 *  Data returned from correlation matching on a single character.
 *  Describes the best-matching template: which one it is, how well it
 *  scored, its text label, and where it was located.  An L_Recog keeps
 *  one of these in its 'rch' field as temporary data.
 */
struct L_Rch {
    l_int32        index;        /* index of best template                   */
    l_float32      score;        /* correlation score of best template       */
    char          *text;         /* character string of best template        */
    l_int32        sample;       /* index of best sample (within the best    */
                                 /* template class, if all samples are used) */
    l_int32        xloc;         /* x-location of template (delx + shiftx)   */
    l_int32        yloc;         /* y-location of template (dely + shifty)   */
    l_int32        width;        /* width of best template                   */
};
typedef struct L_Rch L_RCH;

/*
 *  Data returned from correlation matching on an array of characters.
 *  The fields parallel those of L_Rch, with an array (Numa or Sarray)
 *  in place of each scalar value.
 */
struct L_Rcha {
    struct Numa   *naindex;      /* indices of best templates                */
    struct Numa   *nascore;      /* correlation scores of best templates     */
    struct Sarray *satext;       /* character strings of best templates      */
    struct Numa   *nasample;     /* indices of best samples                  */
    struct Numa   *naxloc;       /* x-locations of templates (delx + shiftx) */
    struct Numa   *nayloc;       /* y-locations of templates (dely + shifty) */
    struct Numa   *nawidth;      /* widths of best templates                 */
};
typedef struct L_Rcha L_RCHA;

/*
 *  Data used for decoding a line of characters.  An L_Recog keeps one
 *  of these in its 'did' field as temporary data.  It holds the image
 *  being decoded, per-template count and y-shift arrays, the trellis
 *  used for backtracking, and two sets of result arrays: those for the
 *  best path, and the corresponding rescored results (suffix "_r").
 */
struct L_Rdid {
    struct Pix    *pixs;         /* clone of pix to be decoded               */
    l_int32      **counta;       /* count array for each averaged template   */
    l_int32      **delya;        /* best y-shift array per averaged template */
    l_int32        narray;       /* number of averaged templates             */
    l_int32        size;         /* size of count array (width of pixs)      */
    l_int32       *setwidth;     /* setwidths for each template              */
    struct Numa   *nasum;        /* pixel count in pixs by column            */
    struct Numa   *namoment;     /* first moment of pixels in pixs by column */
    l_int32        fullarrays;   /* 1 if full arrays are made; 0 otherwise   */
    l_float32     *beta;         /* channel coeffs for template fg term      */
    l_float32     *gamma;        /* channel coeffs for bit-and term          */
    l_float32     *trellisscore; /* score on trellis                         */
    l_int32       *trellistempl; /* template on trellis (for backtrack)      */
    struct Numa   *natempl;      /* indices of best path templates           */
    struct Numa   *naxloc;       /* x locations of best path templates       */
    struct Numa   *nadely;       /* y locations of best path templates       */
    struct Numa   *nawidth;      /* widths of best path templates            */
    struct Numa   *nascore;      /* correlation scores: best path templates  */
    struct Numa   *natempl_r;    /* indices of best rescored templates       */
    struct Numa   *naxloc_r;     /* x locations of best rescored templates   */
    struct Numa   *nadely_r;     /* y locations of best rescored templates   */
    struct Numa   *nawidth_r;    /* widths of best rescored templates        */
    struct Numa   *nascore_r;    /* correlation scores: rescored templates   */
};
typedef struct L_Rdid L_RDID;


/*-------------------------------------------------------------------------*
 *                    Flags for selecting processing                       *
 *-------------------------------------------------------------------------*/
enum {
    /* choose the unscaled bitmaps */
    L_SELECT_UNSCALED = 0,

    /* choose the scaled bitmaps */
    L_SELECT_SCALED = 1,

    /* choose both the unscaled and the scaled bitmaps */
    L_SELECT_BOTH = 2
};

/*-------------------------------------------------------------------------*
 *                Flags for determining what to test against               *
 *-------------------------------------------------------------------------*/
enum {
    /* the template is formed from the average of each class */
    L_USE_AVERAGE = 0,

    /* matching is done against every element of each class */
    L_USE_ALL = 1
};

/*-------------------------------------------------------------------------*
 *             Flags for describing limited character sets                 *
 *-------------------------------------------------------------------------*/
enum {
    /* the character set type has not been specified */
    L_UNKNOWN = 0,

    /* the 10 digits */
    L_ARABIC_NUMERALS = 1,

    /* the 7 lower-case letters i, v, x, l, c, d, m */
    L_LC_ROMAN_NUMERALS = 2,

    /* the 7 upper-case letters I, V, X, L, C, D, M */
    L_UC_ROMAN_NUMERALS = 3,

    /* the 26 lower-case letters */
    L_LC_ALPHA = 4,

    /* the 26 upper-case letters */
    L_UC_ALPHA = 5
};

#endif  /* LEPTONICA_RECOG_H */