pranavjha/text-detector

View on GitHub
third-party/leptonica/src/encoding.c

Summary

Maintainability
Test Coverage
/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -  This software is distributed in the hope that it will be
 -  useful, but with NO WARRANTY OF ANY KIND.
 -  No author or distributor accepts responsibility to anyone for the
 -  consequences of using this software, or for whether it serves any
 -  particular purpose or works at all, unless he or she says so in
 -  writing.  Everyone is granted permission to copy, modify and
 -  redistribute this source code, for commercial or non-commercial
 -  purposes, with the following restrictions: (1) the origin of this
 -  source code must not be misrepresented; (2) modified versions must
 -  be plainly marked as such; and (3) this notice may not be removed
 -  or altered from any source or modified source distribution.
 *====================================================================*/

/*
 *  encodings.c
 *
 *    Base64
 *        char           *encodeBase64()
 *        l_uint8        *decodeBase64()
 *        static l_int32  isBase64()
 *        static l_int32 *genReverseTab64()
 *        static void     byteConvert3to4()
 *        static void     byteConvert4to3()
 *
 *    Ascii85
 *        char           *encodeAscii85()
 *        l_uint8        *decodeAscii85()
 *        static l_int32  convertChunkToAscii85()
 *
 *    String reformatting for base 64 encoded data
 *        char           *reformatPacked64()
 *
 *  Base64 encoding is useful for encding binary data in a restricted set of
 *  64 printable ascii symbols, that includes the 62 alphanumerics and '+'
 *  and '/'.  Notably it does not include quotes, so that base64 encoded
 *  strings can be used in situations where quotes are used for formatting.
 *  64 symbols was chosen because it is the smallest number that can be used
 *  in 4-for-3 byte encoding of binary data:
 *         log2(64) / log2(256) = 0.75 = 3/4
 *
 *  Ascii85 encoding is used in PostScript and some pdf files for
 *  representing binary data (for example, a compressed image) in printable
 *  ascii symbols.  It has a dictionary of 85 symbols; 85 was chosen because
 *  it is the smallest number that can be used in 5-for-4 byte encoding
 *  of binary data (256 possible input values).  This can be seen from
 *  the max information content in such a sequence:
 *         log2(84) / log2(256) = 0.799 < 4/5
 *         log2(85) / log2(256) = 0.801 > 4/5
 */

#include <ctype.h>
#include "allheaders.h"

    /* Base64 encoding table in string representation */
static const l_int32  MAX_BASE64_LINE   = 72;  /* max line length base64 */
static const char *tablechar64 =
             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
             "abcdefghijklmnopqrstuvwxyz"
             "0123456789+/";

static l_int32 isBase64(char);
static l_int32 *genReverseTab64(void);
static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4);
static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3);

    /* Ascii85 encoding */
static const l_int32  MAX_ASCII85_LINE   = 64;  /* max line length ascii85 */
static const l_uint32  power85[5] = {1,
                                     85,
                                     85 * 85,
                                     85 * 85 * 85,
                                     85 * 85 * 85 * 85};

static l_int32 convertChunkToAscii85(l_uint8 *inarray, l_int32 insize,
                                     l_int32 *pindex, char *outbuf,
                                     l_int32 *pnbout);


/*-------------------------------------------------------------*
 *      Utility for encoding and decoding data with base64     *
 *-------------------------------------------------------------*/
/*!
 *  encodeBase64()
 *
 *      Input:  inarray (input binary data)
 *              insize (number of bytes in input array)
 *              &outsize (<return> number of bytes in output char array)
 *      Return: chara (with MAX_BASE64_LINE characters + \n in each line)
 *
 *  Notes:
 *      (1) The input character data is unrestricted binary.
 *          The ouput encoded data consists of the 64 characters
 *          in the base64 set, plus newlines and the pad character '='.
 */
char *
encodeBase64(l_uint8  *inarray,
             l_int32   insize,
             l_int32  *poutsize)
{
char     *chara;
l_uint8  *bytea;
l_uint8   array3[3], array4[4];
l_int32   outsize, i, j, index, linecount;

    PROCNAME("encodeBase64");

    if (!poutsize)
        return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
    *poutsize = 0;
    if (!inarray)
        return (char *)ERROR_PTR("inarray not defined", procName, NULL);
    if (insize <= 0)
        return (char *)ERROR_PTR("insize not > 0", procName, NULL);

        /* The output array is padded to a multiple of 4 bytes, not
         * counting the newlines.  We just need to allocate a large
         * enough array, and add 4 bytes to make sure it is big enough. */
    outsize = 4 * ((insize + 2) / 3);  /* without newlines */
    outsize += outsize / MAX_BASE64_LINE + 4;  /* with the newlines */
    if ((chara = (char *)CALLOC(outsize, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("chara not made", procName, NULL);

        /* Read all the input data, and convert in sets of 3 input
         * bytes --> 4 output bytes. */
    i = index = linecount = 0;
    bytea = inarray;
    while (insize--) {
        if (linecount == MAX_BASE64_LINE) {
            chara[index++] = '\n';
            linecount = 0;
        }
        array3[i++] = *bytea++;
        if (i == 3) {  /* convert 3 to 4 and save */
            byteConvert3to4(array3, array4);
            for (j = 0; j < 4; j++)
                chara[index++] = tablechar64[array4[j]];
            i = 0;
            linecount += 4;
        }
    }

        /* Suppose 1 or 2 bytes has been read but not yet processed.
         * If 1 byte has been read, this will generate 2 bytes of
         * output, with 6 bits to the first byte and 2 bits to the second.
         * We will add two bytes of '=' for padding.
         * If 2 bytes has been read, this will generate 3 bytes of output,
         * with 6 bits to the first 2 bytes and 4 bits to the third, and
         * we add a fourth padding byte ('='). */
    if (i > 0) {  /* left-over 1 or 2 input bytes */
        for (j = i; j < 3; j++)
            array3[j] = '\0';  /* zero the remaining input bytes */
        byteConvert3to4(array3, array4);
        for (j = 0; j <= i; j++)
            chara[index++] = tablechar64[array4[j]];
        for (j = i + 1; j < 4; j++)
            chara[index++] = '=';
    }
    *poutsize = index;

    return chara;
}


/*!
 *  decodeBase64()
 *
 *      Input:  inarray (input encoded char data, with 72 chars/line))
 *              insize (number of bytes in input array)
 *              &outsize (<return> number of bytes in output byte array)
 *      Return: bytea (decoded byte data), or null on error
 *
 *  Notes:
 *      (1) The input character data should have only 66 different characters:
 *          The 64 character set for base64 encoding, plus the pad
 *          character '=' and newlines for formatting with fixed line
 *          lengths.  If there are any other characters, the decoder
 *          will declare the input data to be invalid and return NULL.
 *      (2) The decoder ignores newlines and, for a valid input string,
 *          stops reading input when a pad byte is found.
 */
l_uint8 *
decodeBase64(const char  *inarray,
             l_int32      insize,
             l_int32     *poutsize)
{
char      inchar;
l_uint8  *bytea;
l_uint8   array3[3], array4[4];
l_int32  *rtable64;
l_int32   i, j, outsize, in_index, out_index;

    PROCNAME("decodeBase64");

    if (!poutsize)
        return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
    *poutsize = 0;
    if (!inarray)
        return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
    if (insize <= 0)
        return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);

        /* Validate the input data */
    for (i = 0; i < insize; i++) {
        inchar = inarray[i];
        if (inchar == '\n') continue;
        if (isBase64(inchar) == 0 && inchar != '=')
            return (l_uint8 *)ERROR_PTR("invalid char in inarray",
                                        procName, NULL);
    }

        /* The input array typically is made with a newline every
         * MAX_BASE64_LINE input bytes.  However, as a printed string, the
         * newlines would be stripped.  So when we allocate the output
         * array, assume the input array is all data, but strip
         * out the newlines during decoding.  This guarantees that
         * the allocated array is large enough. */
    outsize = 3 * ((insize + 3) / 4) + 4;
    if ((bytea = (l_uint8 *)CALLOC(outsize, sizeof(l_uint8))) == NULL)
        return (l_uint8 *)ERROR_PTR("bytea not made", procName, NULL);

        /* The number of encoded input data bytes is always a multiple of 4.
         * Read all the data, until you reach either the end or
         * the first pad character '='.  The data is processed in
         * units of 4 input bytes, generating 3 output decoded bytes
         * of binary data.  Newlines are ignored.  If there are no
         * pad bytes, i == 0 at the end of this section. */
    rtable64 = genReverseTab64();
    i = in_index = out_index = 0;
    for (in_index = 0; in_index < insize; in_index++) {
        inchar = inarray[in_index];
        if (inchar == '\n') continue;
        if (inchar == '=') break;
        array4[i++] = rtable64[(unsigned char)inchar];
        if (i < 4) {
            continue;
        } else {  /* i == 4; convert 4 to 3 and save */
            byteConvert4to3(array4, array3);
            for (j = 0; j < 3; j++)
                bytea[out_index++] = array3[j];
            i = 0;
        }
    }

        /* If i > 0, we ran into pad bytes ('=').  If i == 2, there are
         * two input pad bytes and one output data byte.  If i == 3,
         * there is one input pad byte and two output data bytes. */
    if (i > 0) {
        for (j = i; j < 4; j++)
            array4[j] = '\0';  /* zero the remaining input bytes */
        byteConvert4to3(array4, array3);
        for (j = 0; j < i - 1; j++)
            bytea[out_index++] = array3[j];
    }
    *poutsize = out_index;

    FREE(rtable64);
    return bytea;
}


/*!
 *  isBase64()
 */
static l_int32
isBase64(char  c)
{
    return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
}

/*!
 *  genReverseTab64()
 */
static l_int32 *
genReverseTab64()
{
l_int32   i;
l_int32  *rtable64;

    rtable64 = (l_int32 *)CALLOC(128, sizeof(l_int32));
    for (i = 0; i < 64; i++) {
        rtable64[(unsigned char)tablechar64[i]] = i;
    }
    return rtable64;
}

/*!
 *  byteConvert3to4()
 */
static void
byteConvert3to4(l_uint8  *in3,
                l_uint8  *out4)
{
    out4[0] = in3[0] >> 2;
    out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
    out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
    out4[3] = in3[2] & 0x3f;
    return;
}

/*!
 *  byteConvert4to3()
 */
static void
byteConvert4to3(l_uint8  *in4,
                l_uint8  *out3)
{
    out3[0] = (in4[0] << 2) | (in4[1] >> 4);
    out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
    out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
    return;
}


/*-------------------------------------------------------------*
 *      Utility for encoding and decoding data with ascii85    *
 *-------------------------------------------------------------*/
/*!
 *  encodeAscii85()
 *
 *      Input:  inarray (input data)
 *              insize (number of bytes in input array)
 *              &outsize (<return> number of bytes in output char array)
 *      Return: chara (with 64 characters + \n in each line)
 *
 *  Notes:
 *      (1) Ghostscript has a stack break if the last line of
 *          data only has a '>', so we avoid the problem by
 *          always putting '~>' on the last line.
 */
char *
encodeAscii85(l_uint8  *inarray,
              l_int32   insize,
              l_int32  *poutsize)
{
char    *chara;
char    *outbuf;
l_int32  maxsize, i, index, outindex, linecount, nbout, eof;

    PROCNAME("encodeAscii85");

    if (!poutsize)
        return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
    *poutsize = 0;
    if (!inarray)
        return (char *)ERROR_PTR("inarray not defined", procName, NULL);
    if (insize <= 0)
        return (char *)ERROR_PTR("insize not > 0", procName, NULL);

        /* Accumulate results in char array */
    maxsize = (l_int32)(80. + (insize * 5. / 4.) *
                        (1. + 2. / MAX_ASCII85_LINE));
    if ((chara = (char *)CALLOC(maxsize, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("chara not made", procName, NULL);
    if ((outbuf = (char *)CALLOC(8, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("outbuf not made", procName, NULL);

    linecount = 0;
    index = 0;
    outindex = 0;
    while (1) {
        eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
        for (i = 0; i < nbout; i++) {
            chara[outindex++] = outbuf[i];
            linecount++;
            if (linecount >= MAX_ASCII85_LINE) {
                chara[outindex++] = '\n';
                linecount = 0;
            }
        }
        if (eof == TRUE) {
            if (linecount != 0)
                chara[outindex++] = '\n';
            chara[outindex++] = '~';
            chara[outindex++] = '>';
            chara[outindex++] = '\n';
            break;
        }
    }

    FREE(outbuf);
    *poutsize = outindex;
    return chara;
}


/*!
 *  convertChunkToAscii85()
 *
 *      Input:  inarray (input data)
 *              insize  (number of bytes in input array)
 *              &index (use and <return> -- ptr)
 *              outbuf (holds 8 ascii chars; we use no more than 7)
 *              &nbsout (<return> number of bytes written to outbuf)
 *      Return: boolean for eof (0 if more data, 1 if end of file)
 *
 *  Notes:
 *      (1) Attempts to read 4 bytes and write 5.
 *      (2) Writes 1 byte if the value is 0.
 */
static l_int32
convertChunkToAscii85(l_uint8  *inarray,
                      l_int32   insize,
                      l_int32  *pindex,
                      char     *outbuf,
                      l_int32  *pnbout)
{
l_uint8   inbyte;
l_uint32  inword, val;
l_int32   eof, index, nread, nbout, i;

    eof = FALSE;
    index = *pindex;
    nread = L_MIN(4, (insize - index));
    if (insize == index + nread)
        eof = TRUE;
    *pindex += nread;  /* save new index */

        /* Read input data and save in l_uint32 */
    inword = 0;
    for (i = 0; i < nread; i++) {
        inbyte = inarray[index + i];
        inword += inbyte << (8 * (3 - i));
    }

#if 0
    fprintf(stderr, "index = %d, nread = %d\n", index, nread);
    fprintf(stderr, "inword = %x\n", inword);
    fprintf(stderr, "eof = %d\n", eof);
#endif

        /* Special case: output 1 byte only */
    if (inword == 0) {
        outbuf[0] = 'z';
        nbout = 1;
    } else { /* output nread + 1 bytes */
        for (i = 4; i >= 4 - nread; i--) {
            val = inword / power85[i];
            outbuf[4 - i] = (l_uint8)(val + '!');
            inword -= val * power85[i];
        }
        nbout = nread + 1;
    }
    *pnbout = nbout;

    return eof;
}


/*!
 *  decodeAscii85()
 *
 *      Input:  inarray (ascii85 input data)
 *              insize (number of bytes in input array)
 *              &outsize (<return> number of bytes in output l_uint8 array)
 *      Return: outarray (binary)
 *
 *  Notes:
 *      (1) We assume the data is properly encoded, so we do not check
 *          for invalid characters or the final '>' character.
 *      (2) We permit whitespace to be added to the encoding in an
 *          arbitrary way.
 */
l_uint8 *
decodeAscii85(char     *inarray,
              l_int32   insize,
              l_int32  *poutsize)
{
char      inc;
char     *pin;
l_uint8   val;
l_uint8  *outa;
l_int32   maxsize, ocount, bytecount, index;
l_uint32  oword;

    PROCNAME("decodeAscii85");

    if (!poutsize)
        return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
    *poutsize = 0;
    if (!inarray)
        return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
    if (insize <= 0)
        return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);

        /* Accumulate results in outa */
    maxsize = (l_int32)(80. + (insize * 4. / 5.));  /* plenty big */
    if ((outa = (l_uint8 *)CALLOC(maxsize, sizeof(l_uint8))) == NULL)
        return (l_uint8 *)ERROR_PTR("outa not made", procName, NULL);

    pin = inarray;
    ocount = 0;  /* byte index into outa */
    oword = 0;
    for (index = 0, bytecount = 0; index < insize; index++, pin++) {
        inc = *pin;

        if (inc == ' ' || inc == '\t' || inc == '\n' ||
            inc == '\f' || inc == '\r' || inc == '\v')  /* ignore white space */
            continue;

        val = inc - '!';
        if (val < 85) {
            oword = oword * 85 + val;
            if (bytecount < 4) {
                bytecount++;
            } else {  /* we have all 5 input chars for the oword */
                outa[ocount] = (oword >> 24) & 0xff;
                outa[ocount + 1] = (oword >> 16) & 0xff;
                outa[ocount + 2] = (oword >> 8) & 0xff;
                outa[ocount + 3] = oword & 0xff;
                ocount += 4;
                bytecount = 0;
                oword = 0;
            }
        } else if (inc == 'z' && bytecount == 0) {
            outa[ocount] = 0;
            outa[ocount + 1] = 0;
            outa[ocount + 2] = 0;
            outa[ocount + 3] = 0;
            ocount += 4;
        } else if (inc == '~') {  /* end of data */
            L_INFO(" %d extra bytes output\n", procName, bytecount - 1);
            switch (bytecount) {
            case 0:   /* normal eof */
            case 1:   /* error */
                break;
            case 2:   /* 1 extra byte */
                oword = oword * power85[3] + 0xffffff;
                outa[ocount] = (oword >> 24) & 0xff;
                break;
            case 3:   /* 2 extra bytes */
                oword = oword * power85[2] + 0xffff;
                outa[ocount] = (oword >> 24) & 0xff;
                outa[ocount + 1] = (oword >> 16) & 0xff;
                break;
            case 4:   /* 3 extra bytes */
                oword = oword * 85 + 0xff;
                outa[ocount] = (oword >> 24) & 0xff;
                outa[ocount + 1] = (oword >> 16) & 0xff;
                outa[ocount + 2] = (oword >> 8) & 0xff;
                break;
            }
            if (bytecount > 1)
                ocount += (bytecount - 1);
            break;
        }
    }
    *poutsize = ocount;

    return outa;
}


/*-------------------------------------------------------------*
 *       String reformatting for base 64 encoded data          *
 *-------------------------------------------------------------*/
/*!
 *  reformatPacked64()
 *
 *      Input:  inarray (base64 encoded string with newlines)
 *              insize (number of bytes in input array)
 *              leadspace (number of spaces in each line before the data)
 *              linechars (number of bytes of data in each line; multiple of 4)
 *              addquotes (1 to add quotes to each line of data; 0 to skip)
 *              &outsize (<return> number of bytes in output char array)
 *      Return: outarray (ascii)
 *
 *  Notes:
 *      (1) Each line in the output array has @leadspace space characters,
 *          followed optionally by a double-quote, followed by @linechars
 *          bytes of base64 data, followed optionally by a double-quote,
 *          followed by a newline.
 *      (2) This can be used to convert a base64 encoded string to a
 *          string formatted for inclusion in a C source file.
 */
char *
reformatPacked64(char     *inarray,
                 l_int32   insize,
                 l_int32   leadspace,
                 l_int32   linechars,
                 l_int32   addquotes,
                 l_int32  *poutsize)
{
char    *flata, *outa;
l_int32  i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;

    PROCNAME("reformatPacked64");

    if (!poutsize)
        return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
    *poutsize = 0;
    if (!inarray)
        return (char *)ERROR_PTR("inarray not defined", procName, NULL);
    if (insize <= 0)
        return (char *)ERROR_PTR("insize not > 0", procName, NULL);
    if (leadspace < 0)
        return (char *)ERROR_PTR("leadspace must be >= 0", procName, NULL);
    if (linechars % 4)
        return (char *)ERROR_PTR("linechars % 4 must be 0", procName, NULL);

        /* Remove all white space */
    if ((flata = (char *)CALLOC(insize, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("flata not made", procName, NULL);
    for (i = 0, flatindex = 0; i < insize; i++) {
        if (isBase64(inarray[i]) || inarray[i] == '=')
            flata[flatindex++] = inarray[i];
    }

        /* Generate output string */
    flatsize = flatindex;
    nlines = (flatsize + linechars - 1) / linechars;
    linewithpad = leadspace + linechars + 1;  /* including newline */
    if (addquotes) linewithpad += 2;
    if ((outa = (char *)CALLOC(nlines * linewithpad, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("outa not made", procName, NULL);
    for (j = 0, outindex = 0; j < leadspace; j++)
        outa[outindex++] = ' ';
    if (addquotes) outa[outindex++] = '"';
    for (i = 0, linecount = 0; i < flatsize; i++) {
        if (linecount == linechars) {
            if (addquotes) outa[outindex++] = '"';
            outa[outindex++] = '\n';
            for (j = 0; j < leadspace; j++)
                outa[outindex++] = ' ';
            if (addquotes) outa[outindex++] = '"';
            linecount = 0;
        }
        outa[outindex++] = flata[i];
        linecount++;
    }
    if (addquotes) outa[outindex++] = '"';
    *poutsize = outindex;

    FREE(flata);
    return outa;
}