third-party/leptonica/prog/cleanpdf.c
/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*====================================================================*/
/*
* cleanpdf.c
*
* This program takes as input pdf files that have been constructed
* from poorly compressed images -- typically images that have been
* scanned in grayscale or color but should be rendered in black
* and white (1 bpp). It cleans and compresses them, and generates
* a pdf composed of tiff-g4 compressed images.
*
* Syntax: cleanpdf basedir threshold resolution
*
* The basedir is a directory where the input pdf files are located.
* The program will operate on every file in this directory with
* the ".pdf" extension.
*
* The input threshold should be somewhere in the range [130 - 190].
* The result is typically not very sensitive to the value, because
* internally we use a pixel mapping that is adapted to the local
* background before thresholding to binarize the image.
*
* The resolution should be the scanned resolution. This is typically
* 300 ppi, which for an 8.5 x 11 page would be 2550 x 3300 pixels.
*
* Whenever possible, the images have been deskewed.
*
* N.B. This requires pdfimages. For non-unix systems, this requires
* installation of the cygwin Poppler package:
* https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
* poppler-0.26.5-1
*/
#ifdef _WIN32
#ifdef _MSC_VER
#include <direct.h>
#else
#include <io.h>
#endif /* _MSC_VER */
#endif /* _WIN32 */
#include "string.h"
#include <sys/stat.h>
#include <sys/types.h>
#include "allheaders.h"
l_int32 main(int argc,
char **argv)
{
char buf[256];
char *basedir, *fname, *tail, *basename, *imagedir;
l_int32 thresh, res, i, n, ret;
PIX *pixs, *pix1, *pix2, *pix3, *pix4;
SARRAY *sa;
static char mainName[] = "cleanpdf";
if (argc != 4)
return ERROR_INT(
"Syntax: cleanpdf basedir threshold resolution", mainName, 1);
basedir = argv[1];
thresh = atoi(argv[2]);
res = atoi(argv[3]);
#if 1
/* Get the names of the pdf files */
if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL)
return ERROR_INT("files not found", mainName, 1);
sarrayWriteStream(stderr, sa);
n = sarrayGetCount(sa);
#endif
/* Rasterize: pdfimages -f fname root */
imagedir = stringJoin(basedir, "/image");
#if 1
#ifndef _WIN32
mkdir(imagedir, 0777);
#else
_mkdir(imagedir);
#endif /* _WIN32 */
for (i = 0; i < n; i++) {
fname = sarrayGetString(sa, i, L_NOCOPY);
splitPathAtDirectory(fname, NULL, &tail);
splitPathAtExtension(tail, &basename, NULL);
snprintf(buf, sizeof(buf), "pdfimages -j %s %s/%s",
fname, imagedir, basename);
FREE(tail);
FREE(basename);
fprintf(stderr, "%s\n", buf);
ret = system(buf); /* pdfimages -j */
}
sarrayDestroy(&sa);
#endif
#if 1
/* Clean, deskew and compress */
sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
sarrayWriteStream(stderr, sa);
n = sarrayGetCount(sa);
for (i = 0; i < n; i++) {
fname = sarrayGetString(sa, i, L_NOCOPY);
pixs = pixRead(fname);
pix1 = pixConvertTo8(pixs, FALSE);
pix2 = pixFindSkewAndDeskew(pix1, 2, NULL, NULL);
pix3 = pixBackgroundNormSimple(pix2, NULL, NULL);
pixGammaTRC(pix3, pix3, 2.0, 50, 250);
pix4 = pixThresholdToBinary(pix3, thresh);
if (0) pixRotate180(pix4, pix4); /* remove this usually!! */
splitPathAtDirectory(fname, NULL, &tail);
splitPathAtExtension(tail, &basename, NULL);
snprintf(buf, sizeof(buf), "%s/%s.tif", imagedir, basename);
fprintf(stderr, "%s\n", buf);
pixWrite(buf, pix4, IFF_TIFF_G4);
pixDestroy(&pixs);
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pix3);
pixDestroy(&pix4);
FREE(tail);
FREE(basename);
}
sarrayDestroy(&sa);
#endif
#if 1
/* Generate the pdf */
convertFilesToPdf(imagedir, "tif", res, 1.0, L_G4_ENCODE, 0, NULL,
"/tmp/output.pdf");
#endif
return 0;
}