/*
 *  Repository: pranavjha/text-detector (GitHub)
 *  Path:       third-party/leptonica/prog/pdfiotest.c
 */

/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

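/*
 *  pdfiotest.c
 *
 *    Exercises pdf generation: single images, multiple images on one
 *    page, segmented (mixed raster) pages, concatenation of single-page
 *    pdfs, recovery from corrupted input, and multi-page pdfs built
 *    from a directory of images.
 *
 *    Note: pdftk (pdftk.exe for Windows) is required for the two steps
 *          that shell out to it: the corrupted-directory comparison
 *          and the pdftk concatenation of /tmp/pdf/file*.pdf.  All
 *          other parts of the test run without it.
 */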

#include <string.h>
#include "allheaders.h"

    /* File-local helpers, defined after main():
     *   GetImageMask()           - generates boxes for image regions
     *   QuantizeNonImageRegion() - quantizes pixs and recombines it with
     *                              the original through a mask  */
static void GetImageMask(PIX *pixs, l_int32 res, BOXA **pboxa,
                         const char *debugfile);
static PIX * QuantizeNonImageRegion(PIX *pixs, PIX *pixm, l_int32 levels);


/*
 *  main()
 *
 *      Usage:  pdfiotest        (no arguments are accepted)
 *      Return: 0 if the run completes; the ERROR_INT value (1 is passed
 *              as the error code) on a syntax error or if the input
 *              for the corruption test cannot be read
 *
 *  Notes:
 *      (1) The sections run in order; each is wrapped in "#if 1" so it
 *          can be switched off individually.  Later sections consume
 *          files written by earlier ones (e.g., the concatenation
 *          sections read the /tmp/pdf/file*.pdf outputs).
 *      (2) Input images and testfile1.pdf / testfile2.pdf are read
 *          from the current directory; results are written under
 *          /tmp/pdf, /tmp/good, /tmp/bad and /tmp/image.
 *      (3) The corruption test overwrites one byte at a fixed offset
 *          in testfile2.pdf, so it is tied to that exact file.  The
 *          file size is checked before the data is touched.
 */
int main(int    argc,
         char **argv)
{
char         buffer[512];
char        *tempfile1, *tempfile2;
l_uint8     *data;
l_int32      i, j, seq, ret, same;
size_t       nbytes;
const char  *title;
BOX         *box;
BOXA        *boxa1, *boxa2;
L_BYTEA     *ba;
L_PDF_DATA  *lpd;
PIX         *pix1, *pix2, *pix3, *pix4, *pix5, *pix6;
PIX         *pixs, *pixt, *pixg, *pixgc, *pixc;
static char  mainName[] = "pdfiotest";
    /* Leading bytes dropped so the file is not recognized as a pdf */
static const size_t  PdfIdSkip = 10;
    /* Offset in testfile2.pdf of the trailer digit that gets corrupted */
static const size_t  TrailerOffset = 2297;

    if (argc != 1)
        return ERROR_INT("syntax: pdfiotest", mainName, 1);
    l_pdfSetDateAndVersion(0);

    lept_mkdir("pdf");

#if 1
    /* ---------------  Single image tests  ------------------- */
    fprintf(stderr, "\n*** Writing single images as pdf files\n");

    convertToPdf("weasel2.4c.png", L_FLATE_ENCODE, 0, "/tmp/pdf/file01.pdf",
                 0, 0, 72, "weasel2.4c.png", NULL, 0);
    convertToPdf("test24.jpg", L_JPEG_ENCODE, 0, "/tmp/pdf/file02.pdf",
                 0, 0, 72, "test24.jpg", NULL, 0);
    convertToPdf("feyn.tif", L_G4_ENCODE, 0, "/tmp/pdf/file03.pdf",
                 0, 0, 300, "feyn.tif", NULL, 0);

    pixs = pixRead("feyn.tif");
    pixConvertToPdf(pixs, L_G4_ENCODE, 0, "/tmp/pdf/file04.pdf", 0, 0, 300,
                    "feyn.tif", NULL, 0);
    pixDestroy(&pixs);

    pixs = pixRead("test24.jpg");
    pixConvertToPdf(pixs, L_JPEG_ENCODE, 5, "/tmp/pdf/file05.pdf", 0, 0, 72,
                    "test24.jpg", NULL, 0);
    pixDestroy(&pixs);

    pixs = pixRead("feyn.tif");
    pixt = pixScaleToGray2(pixs);
    pixWrite("/tmp/pdf/feyn8.png", pixt, IFF_PNG);
    convertToPdf("/tmp/pdf/feyn8.png", L_JPEG_ENCODE, 0, "/tmp/pdf/file06.pdf",
                 0, 0, 150, "feyn8.png", NULL, 0);
    pixDestroy(&pixs);
    pixDestroy(&pixt);

    convertToPdf("weasel4.16g.png", L_FLATE_ENCODE, 0, "/tmp/pdf/file07.pdf",
                 0, 0, 30, "weasel4.16g.png", NULL, 0);

        /* Clip the same 100 x 100 box from the rgb and gray versions */
    pixs = pixRead("test24.jpg");
    pixg = pixConvertTo8(pixs, 0);
    box = boxCreate(100, 100, 100, 100);
    pixc = pixClipRectangle(pixs, box, NULL);
    pixgc = pixClipRectangle(pixg, box, NULL);
    pixWrite("/tmp/pdf/pix32.jpg", pixc, IFF_JFIF_JPEG);
    pixWrite("/tmp/pdf/pix8.jpg", pixgc, IFF_JFIF_JPEG);
    convertToPdf("/tmp/pdf/pix32.jpg", L_FLATE_ENCODE, 0, "/tmp/pdf/file08.pdf",
                 0, 0, 72, "pix32.jpg", NULL, 0);
    convertToPdf("/tmp/pdf/pix8.jpg", L_FLATE_ENCODE, 0, "/tmp/pdf/file09.pdf",
                 0, 0, 72, "pix8.jpg", NULL, 0);
    pixDestroy(&pixs);
    pixDestroy(&pixg);
    pixDestroy(&pixc);
    pixDestroy(&pixgc);
    boxDestroy(&box);
#endif


#if 1
    /* ---------------  Multiple image tests  ------------------- */
    fprintf(stderr, "\n*** Writing multiple images as single page pdf files\n");

    pix1 = pixRead("feyn-fract.tif");
    pix2 = pixRead("weasel8.240c.png");

        /* First, write the 1 bpp image through the mask onto the weasels.
         * The weasel is tiled 5 rows x 10 columns at 100 unit spacing;
         * only the first tile carries the title and L_FIRST_IMAGE. */
    for (i = 0; i < 5; i++) {
        for (j = 0; j < 10; j++) {
            seq = (i == 0 && j == 0) ? L_FIRST_IMAGE : L_NEXT_IMAGE;
            title = (i == 0 && j == 0) ? "feyn-fract.tif" : NULL;
            pixConvertToPdf(pix2, L_FLATE_ENCODE, 0, NULL, 100 * j,
                            100 * i, 70, title, &lpd, seq);
        }
    }
    pixConvertToPdf(pix1, L_G4_ENCODE, 0, "/tmp/pdf/file10.pdf", 0, 0, 80,
                    NULL, &lpd, L_LAST_IMAGE);

        /* Now, write the 1 bpp image over the weasels */
    l_pdfSetG4ImageMask(0);
    for (i = 0; i < 5; i++) {
        for (j = 0; j < 10; j++) {
            seq = (i == 0 && j == 0) ? L_FIRST_IMAGE : L_NEXT_IMAGE;
            title = (i == 0 && j == 0) ? "feyn-fract.tif" : NULL;
            pixConvertToPdf(pix2, L_FLATE_ENCODE, 0, NULL, 100 * j,
                            100 * i, 70, title, &lpd, seq);
        }
    }
    pixConvertToPdf(pix1, L_G4_ENCODE, 0, "/tmp/pdf/file11.pdf", 0, 0, 80,
                    NULL, &lpd, L_LAST_IMAGE);
    l_pdfSetG4ImageMask(1);  /* restore the setting changed above */
    pixDestroy(&pix1);
    pixDestroy(&pix2);
#endif

#if 1
    /* -------- pdf convert segmented with no image regions -------- */
    fprintf(stderr, "\n*** Writing segmented images without image regions\n");

        /* Make 8 bpp and 4 bpp versions of rabi.png for use as input
         * here and in the next section */
    pix1 = pixRead("rabi.png");
    pix2 = pixScaleToGray2(pix1);
    pixWrite("/tmp/pdf/rabi8.jpg", pix2, IFF_JFIF_JPEG);
    pix3 = pixThresholdTo4bpp(pix2, 16, 1);
    pixWrite("/tmp/pdf/rabi4.png", pix3, IFF_PNG);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
    pixDestroy(&pix3);

        /* 1 bpp input */
    convertToPdfSegmented("rabi.png", 300, L_G4_ENCODE, 128, NULL, 0, 0,
                          NULL, "/tmp/pdf/file12.pdf");
    convertToPdfSegmented("rabi.png", 300, L_JPEG_ENCODE, 128, NULL, 0, 0,
                          NULL, "/tmp/pdf/file13.pdf");
    convertToPdfSegmented("rabi.png", 300, L_FLATE_ENCODE, 128, NULL, 0, 0,
                          NULL, "/tmp/pdf/file14.pdf");

        /* 8 bpp input, no cmap */
    convertToPdfSegmented("/tmp/pdf/rabi8.jpg", 150, L_G4_ENCODE, 128,
                          NULL, 0, 0, NULL, "/tmp/pdf/file15.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi8.jpg", 150, L_JPEG_ENCODE, 128,
                          NULL, 0, 0, NULL, "/tmp/pdf/file16.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi8.jpg", 150, L_FLATE_ENCODE, 128,
                          NULL, 0, 0, NULL, "/tmp/pdf/file17.pdf");

        /* 4 bpp input, cmap */
    convertToPdfSegmented("/tmp/pdf/rabi4.png", 150, L_G4_ENCODE, 128,
                          NULL, 0, 0, NULL, "/tmp/pdf/file18.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi4.png", 150, L_JPEG_ENCODE, 128,
                          NULL, 0, 0, NULL, "/tmp/pdf/file19.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi4.png", 150, L_FLATE_ENCODE, 128,
                          NULL, 0, 0, NULL, "/tmp/pdf/file20.pdf");

#endif

#if 1
    /* ---------- pdf convert segmented with image regions ---------- */
    fprintf(stderr, "\n*** Writing segmented images with image regions\n");

        /* Get the image region(s) for rabi.png.  There are two
         * small bogus regions at the top, but we'll keep them for
         * the demonstration.  boxa2 is boxa1 scaled by 0.5, for use
         * with the 2x reduced inputs written in the previous section. */
    pix1 = pixRead("rabi.png");
    pixSetResolution(pix1, 300, 300);
    pix2 = pixGenHalftoneMask(pix1, NULL, NULL, 0);
    pix3 = pixMorphSequence(pix2, "c20.1 + c1.20", 0);
    boxa1 = pixConnComp(pix3, NULL, 8);
    boxa2 = boxaTransform(boxa1, 0, 0, 0.5, 0.5);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
    pixDestroy(&pix3);

        /* 1 bpp input */
    convertToPdfSegmented("rabi.png", 300, L_G4_ENCODE, 128, boxa1,
                          0, 0.25, NULL, "/tmp/pdf/file21.pdf");
    convertToPdfSegmented("rabi.png", 300, L_JPEG_ENCODE, 128, boxa1,
                          0, 0.25, NULL, "/tmp/pdf/file22.pdf");
    convertToPdfSegmented("rabi.png", 300, L_FLATE_ENCODE, 128, boxa1,
                          0, 0.25, NULL, "/tmp/pdf/file23.pdf");

        /* 8 bpp input, no cmap */
    convertToPdfSegmented("/tmp/pdf/rabi8.jpg", 150, L_G4_ENCODE, 128, boxa2,
                          0, 0.5, NULL, "/tmp/pdf/file24.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi8.jpg", 150, L_JPEG_ENCODE, 128, boxa2,
                          0, 0.5, NULL, "/tmp/pdf/file25.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi8.jpg", 150, L_FLATE_ENCODE, 128, boxa2,
                          0, 0.5, NULL, "/tmp/pdf/file26.pdf");

        /* 4 bpp input, cmap */
    convertToPdfSegmented("/tmp/pdf/rabi4.png", 150, L_G4_ENCODE, 128, boxa2,
                          0, 0.5, NULL, "/tmp/pdf/file27.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi4.png", 150, L_JPEG_ENCODE, 128, boxa2,
                          0, 0.5, NULL, "/tmp/pdf/file28.pdf");
    convertToPdfSegmented("/tmp/pdf/rabi4.png", 150, L_FLATE_ENCODE, 128, boxa2,
                          0, 0.5, NULL, "/tmp/pdf/file29.pdf");

        /* 4 bpp input, cmap, data output.  The output pointer and size
         * are reset before every call so that a call that fails without
         * storing a result cannot leave a freed pointer to be written
         * and freed a second time. */
    data = NULL;
    nbytes = 0;
    convertToPdfDataSegmented("/tmp/pdf/rabi4.png", 150, L_G4_ENCODE,
                              128, boxa2, 0, 0.5, NULL, &data, &nbytes);
    l_binaryWrite("/tmp/pdf/file30.pdf", "w", data, nbytes);
    lept_free(data);
    data = NULL;
    nbytes = 0;
    convertToPdfDataSegmented("/tmp/pdf/rabi4.png", 150, L_JPEG_ENCODE,
                              128, boxa2, 0, 0.5, NULL, &data, &nbytes);
    l_binaryWrite("/tmp/pdf/file31.pdf", "w", data, nbytes);
    lept_free(data);
    data = NULL;
    nbytes = 0;
    convertToPdfDataSegmented("/tmp/pdf/rabi4.png", 150, L_FLATE_ENCODE,
                              128, boxa2, 0, 0.5, NULL, &data, &nbytes);
    l_binaryWrite("/tmp/pdf/file32.pdf", "w", data, nbytes);
    lept_free(data);
    data = NULL;

    boxaDestroy(&boxa1);
    boxaDestroy(&boxa2);
#endif


#if 1
    /* -------- pdf convert segmented from color image -------- */
    fprintf(stderr, "\n*** Writing color segmented images\n");

    pix1 = pixRead("candelabrum-11.jpg");
    pix2 = pixScale(pix1, 3.0, 3.0);
    pixWrite("/tmp/pdf/candelabrum3.jpg", pix2, IFF_JFIF_JPEG);
    GetImageMask(pix2, 200, &boxa1, "/tmp/pdf/seg1.jpg");
    convertToPdfSegmented("/tmp/pdf/candelabrum3.jpg", 200, L_G4_ENCODE,
                          100, boxa1, 0, 0.25, NULL, "/tmp/pdf/file33.pdf");
    convertToPdfSegmented("/tmp/pdf/candelabrum3.jpg", 200, L_JPEG_ENCODE,
                          100, boxa1, 0, 0.25, NULL, "/tmp/pdf/file34.pdf");
    convertToPdfSegmented("/tmp/pdf/candelabrum3.jpg", 200, L_FLATE_ENCODE,
                          100, boxa1, 0, 0.25, NULL, "/tmp/pdf/file35.pdf");

    pixDestroy(&pix1);
    pixDestroy(&pix2);
    boxaDestroy(&boxa1);

        /* The mask is at the original scale; its boxes are scaled by 3
         * to match the 3x upscaled page image. */
    pix1 = pixRead("lion-page.00016.jpg");
    pix2 = pixScale(pix1, 3.0, 3.0);
    pixWrite("/tmp/pdf/lion16.jpg", pix2, IFF_JFIF_JPEG);
    pix3 = pixRead("lion-mask.00016.tif");
    boxa1 = pixConnComp(pix3, NULL, 8);
    boxa2 = boxaTransform(boxa1, 0, 0, 3.0, 3.0);
    convertToPdfSegmented("/tmp/pdf/lion16.jpg", 200, L_G4_ENCODE,
                          190, boxa2, 0, 0.5, NULL, "/tmp/pdf/file36.pdf");
    convertToPdfSegmented("/tmp/pdf/lion16.jpg", 200, L_JPEG_ENCODE,
                          190, boxa2, 0, 0.5, NULL, "/tmp/pdf/file37.pdf");
    convertToPdfSegmented("/tmp/pdf/lion16.jpg", 200, L_FLATE_ENCODE,
                          190, boxa2, 0, 0.5, NULL, "/tmp/pdf/file38.pdf");

        /* Quantize the non-image part and flate encode.
         * This is useful because it results in a smaller file than
         * when you flate-encode the un-quantized non-image regions. */
    pix4 = pixScale(pix3, 3.0, 3.0);  /* higher res mask, for combining */
    pix5 = QuantizeNonImageRegion(pix2, pix4, 12);
    pixWrite("/tmp/pdf/lion16-quant.png", pix5, IFF_PNG);
    convertToPdfSegmented("/tmp/pdf/lion16-quant.png", 200, L_FLATE_ENCODE,
                          190, boxa2, 0, 0.5, NULL, "/tmp/pdf/file39.pdf");

    pixDestroy(&pix1);
    pixDestroy(&pix2);
    pixDestroy(&pix3);
    pixDestroy(&pix4);
    pixDestroy(&pix5);
    boxaDestroy(&boxa1);
    boxaDestroy(&boxa2);
#endif

#if 1
    /* ------------------ Test multipage pdf generation ----------------- */
    fprintf(stderr, "\n*** Writing multipage pdfs from single page pdfs\n");

        /* Generate a multi-page pdf from all these files */
    startTimer();
    concatenatePdf("/tmp/pdf", "file", "/tmp/pdf/cat_lept.pdf");
    fprintf(stderr, "All files have been concatenated: /tmp/pdf/cat_lept.pdf\n"
                    "Concatenation time: %7.3f\n", stopTimer());
#endif

#if 1
    /* -------------------- Test corruption recovery ------------------- */
        /* Put two good pdf files in a directory */
    lept_mkdir("good");
    lept_cp("testfile1.pdf", "good", NULL, NULL);
    lept_cp("testfile2.pdf", "good", NULL, NULL);
    concatenatePdf("/tmp/good", "file", "/tmp/pdf/good.pdf");

        /* Read the file that will be damaged.  Both damaged versions
         * index into the data at fixed offsets, so require that the
         * file was read and is large enough before touching it. */
    nbytes = 0;
    ba = l_byteaInitFromFile("testfile2.pdf");
    data = l_byteaGetData(ba, &nbytes);
    if (!data || nbytes <= TrailerOffset) {
        l_byteaDestroy(&ba);
        return ERROR_INT("testfile2.pdf not read or too small",
                         mainName, 1);
    }

        /* Make a version with the pdf id removed, so that it is not
         * recognized as a pdf */
    l_binaryWrite("testfile0.notpdf.pdf", "w", data + PdfIdSkip,
                  nbytes - PdfIdSkip);

        /* Make a version with a corrupted trailer */
    data[TrailerOffset] = '2';  /* munge trailer object 6: 458 --> 428 */
    l_binaryWrite("testfile2.bad.pdf", "w", data, nbytes);

        /* Put these two bad files, along with a good file, in a directory */
    lept_mkdir("bad");
    lept_mv("testfile0.notpdf.pdf", "bad", NULL, NULL);
    lept_cp("testfile1.pdf", "bad", NULL, NULL);
    lept_mv("testfile2.bad.pdf", "bad", NULL, NULL);
    l_byteaDestroy(&ba);  /* data points into ba; it is not used below */
    data = NULL;

        /* Run concat on the bad files.   In the /tmp/bad/ directory,
         * the "not pdf" file should be ignored, and the corrupted pdf
         * file should be properly parsed, so the resulting
         * concatenated files should be identical.  */
    fprintf(stderr, "\nWe attempt to build from the bad directory\n");
    concatenatePdf("/tmp/bad", "file", "/tmp/pdf/bad.pdf");
    same = 0;  /* report "different" if the comparison itself fails */
    filesAreIdentical("/tmp/pdf/good.pdf", "/tmp/pdf/bad.pdf", &same);
    if (same)
        fprintf(stderr, "Fixed: files are the same\n"
                        "Attempt succeeded\n\n");
    else
        fprintf(stderr, "Busted: files are different\n");

        /* pdftk is expected to fail because the first file is not a pdf.
         * Report what actually happened instead of assuming failure. */
    fprintf(stderr, "pdftk attempts to build from the bad directory\n");
    tempfile1 = genPathname("/tmp/bad", "*.pdf");
    tempfile2 = genPathname("/tmp", "pdftk.bad.pdf");
    snprintf(buffer, sizeof(buffer), "pdftk %s output %s",
             tempfile1, tempfile2);
    ret = system(buffer);  /* pdftk */
    lept_free(tempfile1);
    lept_free(tempfile2);
    if (ret != 0)
        fprintf(stderr, "Attempt failed\n\n");
    else
        fprintf(stderr, "Attempt unexpectedly succeeded\n\n");
#endif

#if 1
    fprintf(stderr, "\n*** pdftk writes multipage pdfs from images\n");
    tempfile1 = genPathname("/tmp/pdf", "file*.pdf");
    tempfile2 = genPathname("/tmp/pdf", "cat_pdftk.pdf");
    snprintf(buffer, sizeof(buffer), "pdftk %s output %s",
             tempfile1, tempfile2);
    ret = system(buffer);  /* pdftk */
    lept_free(tempfile1);
    lept_free(tempfile2);
    if (ret != 0)
        fprintf(stderr, "pdftk returned status %d; is it installed?\n", ret);
#endif

#if 1
    /* -- Test simple interface for generating multi-page pdf from images -- */
    fprintf(stderr, "\n*** Writing multipage pdfs from images\n");

        /* Put four image files in a directory.  They will be encoded thus:
         *     file1.png:  flate (8 bpp, only 10 colors)
         *     file2.jpg:  dct (8 bpp, 256 colors because of the jpeg encoding)
         *     file3.tif:  g4 (1 bpp)
         *     file4.jpg:  dct (32 bpp)    */
    lept_mkdir("image");
    pix1 = pixRead("feyn.tif");
    pix2 = pixRead("rabi.png");
    pix3 = pixScaleToGray3(pix1);
    pix4 = pixScaleToGray3(pix2);
    pix5 = pixScale(pix1, 0.33, 0.33);
    pix6 = pixRead("test24.jpg");
    pixWrite("/tmp/image/file1.png", pix3, IFF_PNG);  /* 10 colors */
    pixWrite("/tmp/image/file2.jpg", pix4, IFF_JFIF_JPEG);  /* 256 colors */
    pixWrite("/tmp/image/file3.tif", pix5, IFF_TIFF_G4);
    pixWrite("/tmp/image/file4.jpg", pix6, IFF_JFIF_JPEG);

    startTimer();
    convertFilesToPdf("/tmp/image", "file", 100, 0.8, 0, 75, "4 file test",
                      "/tmp/pdf/fourimages.pdf");
    fprintf(stderr, "4-page pdf generated: /tmp/pdf/fourimages.pdf\n"
                    "Time: %7.3f\n", stopTimer());
    pixDestroy(&pix1);
    pixDestroy(&pix2);
    pixDestroy(&pix3);
    pixDestroy(&pix4);
    pixDestroy(&pix5);
    pixDestroy(&pix6);
#endif

    return 0;
}


/*
 *  GetImageMask()
 *
 *      Input:  pixs       (source image; its x and y resolution fields
 *                          are set to res as a side effect)
 *              res        (resolution value stored on pixs)
 *              &boxa      (<return> bounding boxes of the 8-connected
 *                          components of the closed halftone mask;
 *                          NULL on failure)
 *              debugfile  (<optional> path for a tiled jpeg showing the
 *                          intermediate images; use NULL to skip)
 *      Return: void
 *
 *  Notes:
 *      (1) pixs is binarized with a fixed threshold of 100, a halftone
 *          mask is generated from it, and the mask is closed with
 *          "c20.1 + c1.20" before the components are extracted.
 *      (2) When debugfile is given, the tiled image is written to that
 *          path and is also passed to pixDisplay().
 *      (3) The caller owns the returned boxa.
 */
static void
GetImageMask(PIX         *pixs,
             l_int32      res,
             BOXA       **pboxa,
             const char  *debugfile)
{
PIX   *pix1, *pix2, *pix3, *pix4;
PIXA  *pixa;

    if (!pboxa)
        return;
    *pboxa = NULL;  /* defined result on every path */
    if (!pixs)
        return;

        /* Use the caller's resolution; this was previously fixed at 200 */
    pixSetResolution(pixs, res, res);
    pix1 = pixConvertTo1(pixs, 100);
    pix2 = pixGenHalftoneMask(pix1, NULL, NULL, 0);
    pix3 = pixMorphSequence(pix2, "c20.1 + c1.20", 0);
    *pboxa = pixConnComp(pix3, NULL, 8);
    if (debugfile) {
            /* The intermediates are inserted into pixa, so they are
             * released by pixaDestroy(); pixs is added as a copy and
             * remains owned by the caller. */
        pixa = pixaCreate(0);
        pixaAddPix(pixa, pixs, L_COPY);
        pixaAddPix(pixa, pix1, L_INSERT);
        pixaAddPix(pixa, pix2, L_INSERT);
        pixaAddPix(pixa, pix3, L_INSERT);
        pix4 = pixaDisplayTiledInRows(pixa, 32, 1800, 0.25, 0, 25, 2);
        pixWrite(debugfile, pix4, IFF_JFIF_JPEG);
        pixDisplay(pix4, 100, 100);
        pixDestroy(&pix4);
        pixaDestroy(&pixa);
    } else {
        pixDestroy(&pix1);
        pixDestroy(&pix2);
        pixDestroy(&pix3);
    }

    return;
}

/*
 *  QuantizeNonImageRegion()
 *
 *      Input:  pixs    (source image)
 *              pixm    (mask passed to pixCombineMasked(); it selects
 *                       where pixs is combined into the result)
 *              levels  (number of levels given to pixThresholdOn8bpp())
 *      Return: pixd (32 bpp result; the caller destroys it)
 *
 *  Notes:
 *      (1) pixs is converted to 8 bpp and quantized to 'levels' levels,
 *          and the quantized image is expanded to 32 bpp.
 *      (2) pixs is then combined into that image through pixm, so the
 *          result mixes the quantized version with the original.
 */
static PIX *
QuantizeNonImageRegion(PIX     *pixs,
                       PIX     *pixm,
                       l_int32  levels)
{
PIX  *pixgray, *pixquant, *pixrgb;

        /* 8 bpp version, then quantized; the gray source is not
         * needed once the quantized image exists */
    pixgray = pixConvertTo8(pixs, 0);
    pixquant = pixThresholdOn8bpp(pixgray, levels, 1);
    pixDestroy(&pixgray);

        /* Hold the quantized result as rgb so it can take pixels
         * from pixs */
    pixrgb = pixConvertTo32(pixquant);
    pixDestroy(&pixquant);

        /* Combine the original into the rgb result through the mask */
    pixCombineMasked(pixrgb, pixs, pixm);
    return pixrgb;
}