third-party/leptonica/src/sarray.c
/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*====================================================================*/
/*
* sarray.c
*
* Create/Destroy/Copy
* SARRAY *sarrayCreate()
* SARRAY *sarrayCreateInitialized()
* SARRAY *sarrayCreateWordsFromString()
* SARRAY *sarrayCreateLinesFromString()
* void *sarrayDestroy()
* SARRAY *sarrayCopy()
* SARRAY *sarrayClone()
*
* Add/Remove string
* l_int32 sarrayAddString()
* static l_int32 sarrayExtendArray()
* char *sarrayRemoveString()
* l_int32 sarrayReplaceString()
* l_int32 sarrayClear()
*
* Accessors
* l_int32 sarrayGetCount()
* char **sarrayGetArray()
* char *sarrayGetString()
* l_int32 sarrayGetRefcount()
* l_int32 sarrayChangeRefcount()
*
* Conversion back to string
* char *sarrayToString()
* char *sarrayToStringRange()
*
* Concatenate 2 sarrays
* l_int32 sarrayConcatenate()
* l_int32 sarrayAppendRange()
*
* Pad an sarray to be the same size as another sarray
* l_int32 sarrayPadToSameSize()
*
* Convert word sarray to (formatted) line sarray
* SARRAY *sarrayConvertWordsToLines()
*
* Split string on separator list
* SARRAY *sarraySplitString()
*
* Filter sarray
* SARRAY *sarraySelectBySubstring()
* SARRAY *sarraySelectByRange()
* l_int32 sarrayParseRange()
*
* Sort
* SARRAY *sarraySort()
* SARRAY *sarraySortByIndex()
* l_int32 stringCompareLexical()
*
* Serialize for I/O
* SARRAY *sarrayRead()
* SARRAY *sarrayReadStream()
* l_int32 sarrayWrite()
* l_int32 sarrayWriteStream()
* l_int32 sarrayAppend()
*
* Directory filenames
* SARRAY *getNumberedPathnamesInDirectory()
* SARRAY *getSortedPathnamesInDirectory()
* SARRAY *convertSortedToNumberedPathnames()
* SARRAY *getFilenamesInDirectory()
*
* These functions are important for efficient manipulation
* of string data, and they have found widespread use in
* leptonica. For example:
* (1) to generate text files: e.g., PostScript and PDF
* wrappers around sets of images
* (2) to parse text files: e.g., extracting prototypes
* from the source to generate allheaders.h
* (3) to generate code for compilation: e.g., the fast
* dwa code for arbitrary structuring elements.
*
* Comments on usage:
*
* The user is responsible for correctly disposing of strings
* that have been extracted from sarrays:
* - When you want a string from an Sarray to inspect it, or
* plan to make a copy of it later, use sarrayGetString()
* with copyflag = 0. In this case, you must neither free
* the string nor put it directly in another array.
* We provide the copyflag constant L_NOCOPY, which is 0,
* for this purpose:
* str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
* To extract a copy of a string, use:
* str-owned = sarrayGetString(sa, index, L_COPY);
*
* - When you want to insert a string that is in one
* array into another array (always leaving the first
* array intact), you have two options:
* (1) use copyflag = L_COPY to make an immediate copy,
* which you must then add to the second array
* by insertion; namely,
* str-owned = sarrayGetString(sa, index, L_COPY);
* sarrayAddString(sa, str-owned, L_INSERT);
* (2) use copyflag = L_NOCOPY to get another handle to
* the string, in which case you must add
* a copy of it to the second string array:
* str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
* sarrayAddString(sa, str-not-owned, L_COPY).
*
* In all cases, when you use copyflag = L_COPY to extract
* a string from an array, you must either free it
* or insert it in an array that will be freed later.
*/
#include <string.h>
#ifndef _WIN32
#include <dirent.h> /* unix only */
#endif /* ! _WIN32 */
#include "allheaders.h"
static const l_int32 INITIAL_PTR_ARRAYSIZE = 50; /* n'importe quoi */
static const l_int32 L_BUF_SIZE = 512;
/* Static function */
static l_int32 sarrayExtendArray(SARRAY *sa);
/*--------------------------------------------------------------------------*
* String array create/destroy/copy/extend *
*--------------------------------------------------------------------------*/
/*!
* sarrayCreate()
*
* Input: size of string ptr array to be alloc'd
* (use 0 for default)
* Return: sarray, or null on error
*/
SARRAY *
sarrayCreate(l_int32 n)
{
SARRAY *sa;
PROCNAME("sarrayCreate");
if (n <= 0)
n = INITIAL_PTR_ARRAYSIZE;
if ((sa = (SARRAY *)CALLOC(1, sizeof(SARRAY))) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
if ((sa->array = (char **)CALLOC(n, sizeof(char *))) == NULL)
return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
sa->nalloc = n;
sa->n = 0;
sa->refcount = 1;
return sa;
}
/*!
* sarrayCreateInitialized()
*
* Input: n (size of string ptr array to be alloc'd)
* initstr (string to be initialized on the full array)
* Return: sarray, or null on error
*/
SARRAY *
sarrayCreateInitialized(l_int32 n,
char *initstr)
{
l_int32 i;
SARRAY *sa;
PROCNAME("sarrayCreateInitialized");
if (n <= 0)
return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL);
if (!initstr)
return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL);
sa = sarrayCreate(n);
for (i = 0; i < n; i++)
sarrayAddString(sa, initstr, L_COPY);
return sa;
}
/*!
* sarrayCreateWordsFromString()
*
* Input: string
* Return: sarray, or null on error
*
* Notes:
* (1) This finds the number of word substrings, creates an sarray
* of this size, and puts copies of each substring into the sarray.
*/
SARRAY *
sarrayCreateWordsFromString(const char *string)
{
char separators[] = " \n\t";
l_int32 i, nsub, size, inword;
SARRAY *sa;
PROCNAME("sarrayCreateWordsFromString");
if (!string)
return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
/* Find the number of words */
size = strlen(string);
nsub = 0;
inword = FALSE;
for (i = 0; i < size; i++) {
if (inword == FALSE &&
(string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
inword = TRUE;
nsub++;
} else if (inword == TRUE &&
(string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
inword = FALSE;
}
}
if ((sa = sarrayCreate(nsub)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
sarraySplitString(sa, string, separators);
return sa;
}
/*!
* sarrayCreateLinesFromString()
*
* Input: string
* blankflag (0 to exclude blank lines; 1 to include)
* Return: sarray, or null on error
*
* Notes:
* (1) This finds the number of line substrings, each of which
* ends with a newline, and puts a copy of each substring
* in a new sarray.
* (2) The newline characters are removed from each substring.
*/
SARRAY *
sarrayCreateLinesFromString(char *string,
l_int32 blankflag)
{
l_int32 i, nsub, size, startptr;
char *cstring, *substring;
SARRAY *sa;
PROCNAME("sarrayCreateLinesFromString");
if (!string)
return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
/* find the number of lines */
size = strlen(string);
nsub = 0;
for (i = 0; i < size; i++) {
if (string[i] == '\n')
nsub++;
}
if ((sa = sarrayCreate(nsub)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
if (blankflag) { /* keep blank lines as null strings */
/* Make a copy for munging */
if ((cstring = stringNew(string)) == NULL)
return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
/* We'll insert nulls like strtok */
startptr = 0;
for (i = 0; i < size; i++) {
if (cstring[i] == '\n') {
cstring[i] = '\0';
if (i > 0 && cstring[i - 1] == '\r')
cstring[i - 1] = '\0'; /* also remove Windows CR */
if ((substring = stringNew(cstring + startptr)) == NULL)
return (SARRAY *)ERROR_PTR("substring not made",
procName, NULL);
sarrayAddString(sa, substring, L_INSERT);
/* fprintf(stderr, "substring = %s\n", substring); */
startptr = i + 1;
}
}
if (startptr < size) { /* no newline at end of last line */
if ((substring = stringNew(cstring + startptr)) == NULL)
return (SARRAY *)ERROR_PTR("substring not made",
procName, NULL);
sarrayAddString(sa, substring, L_INSERT);
/* fprintf(stderr, "substring = %s\n", substring); */
}
FREE(cstring);
} else { /* remove blank lines; use strtok */
sarraySplitString(sa, string, "\r\n");
}
return sa;
}
/*!
* sarrayDestroy()
*
* Input: &sarray <to be nulled>
* Return: void
*
* Notes:
* (1) Decrements the ref count and, if 0, destroys the sarray.
* (2) Always nulls the input ptr.
*/
void
sarrayDestroy(SARRAY **psa)
{
l_int32 i;
SARRAY *sa;
PROCNAME("sarrayDestroy");
if (psa == NULL) {
L_WARNING("ptr address is NULL!\n", procName);
return;
}
if ((sa = *psa) == NULL)
return;
sarrayChangeRefcount(sa, -1);
if (sarrayGetRefcount(sa) <= 0) {
if (sa->array) {
for (i = 0; i < sa->n; i++) {
if (sa->array[i])
FREE(sa->array[i]);
}
FREE(sa->array);
}
FREE(sa);
}
*psa = NULL;
return;
}
/*!
* sarrayCopy()
*
* Input: sarray
* Return: copy of sarray, or null on error
*/
SARRAY *
sarrayCopy(SARRAY *sa)
{
l_int32 i;
SARRAY *csa;
PROCNAME("sarrayCopy");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
if ((csa = sarrayCreate(sa->nalloc)) == NULL)
return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
for (i = 0; i < sa->n; i++)
sarrayAddString(csa, sa->array[i], L_COPY);
return csa;
}
/*!
* sarrayClone()
*
* Input: sarray
* Return: ptr to same sarray, or null on error
*/
SARRAY *
sarrayClone(SARRAY *sa)
{
PROCNAME("sarrayClone");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
sarrayChangeRefcount(sa, 1);
return sa;
}
/*!
* sarrayAddString()
*
* Input: sarray
* string (string to be added)
* copyflag (L_INSERT, L_COPY)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) Legacy usage decrees that we always use 0 to insert a string
* directly and 1 to insert a copy of the string. The
* enums for L_INSERT and L_COPY agree with this convention,
* and will not change in the future.
* (2) See usage comments at the top of this file.
*/
l_int32
sarrayAddString(SARRAY *sa,
char *string,
l_int32 copyflag)
{
l_int32 n;
PROCNAME("sarrayAddString");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if (!string)
return ERROR_INT("string not defined", procName, 1);
if (copyflag != L_INSERT && copyflag != L_COPY)
return ERROR_INT("invalid copyflag", procName, 1);
n = sarrayGetCount(sa);
if (n >= sa->nalloc)
sarrayExtendArray(sa);
if (copyflag == L_INSERT)
sa->array[n] = string;
else /* L_COPY */
sa->array[n] = stringNew(string);
sa->n++;
return 0;
}
/*!
* sarrayExtendArray()
*
* Input: sarray
* Return: 0 if OK, 1 on error
*/
static l_int32
sarrayExtendArray(SARRAY *sa)
{
PROCNAME("sarrayExtendArray");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if ((sa->array = (char **)reallocNew((void **)&sa->array,
sizeof(char *) * sa->nalloc,
2 * sizeof(char *) * sa->nalloc)) == NULL)
return ERROR_INT("new ptr array not returned", procName, 1);
sa->nalloc *= 2;
return 0;
}
/*!
* sarrayRemoveString()
*
* Input: sarray
* index (of string within sarray)
* Return: removed string, or null on error
*/
char *
sarrayRemoveString(SARRAY *sa,
l_int32 index)
{
char *string;
char **array;
l_int32 i, n, nalloc;
PROCNAME("sarrayRemoveString");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
return (char *)ERROR_PTR("array not returned", procName, NULL);
if (index < 0 || index >= n)
return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
string = array[index];
/* If removed string is not at end of array, shift
* to fill in, maintaining original ordering.
* Note: if we didn't care about the order, we could
* put the last string array[n - 1] directly into the hole. */
for (i = index; i < n - 1; i++)
array[i] = array[i + 1];
sa->n--;
return string;
}
/*!
* sarrayReplaceString()
*
* Input: sarray
* index (of string within sarray to be replaced)
* newstr (string to replace existing one)
* copyflag (L_INSERT, L_COPY)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) This destroys an existing string and replaces it with
* the new string or a copy of it.
* (2) By design, an sarray is always compacted, so there are
* never any holes (null ptrs) in the ptr array up to the
* current count.
*/
l_int32
sarrayReplaceString(SARRAY *sa,
l_int32 index,
char *newstr,
l_int32 copyflag)
{
char *str;
l_int32 n;
PROCNAME("sarrayReplaceString");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
n = sarrayGetCount(sa);
if (index < 0 || index >= n)
return ERROR_INT("array index out of bounds", procName, 1);
if (!newstr)
return ERROR_INT("newstr not defined", procName, 1);
if (copyflag != L_INSERT && copyflag != L_COPY)
return ERROR_INT("invalid copyflag", procName, 1);
FREE(sa->array[index]);
if (copyflag == L_INSERT)
str = newstr;
else /* L_COPY */
str = stringNew(newstr);
sa->array[index] = str;
return 0;
}
/*!
* sarrayClear()
*
* Input: sarray
* Return: 0 if OK; 1 on error
*/
l_int32
sarrayClear(SARRAY *sa)
{
l_int32 i;
PROCNAME("sarrayClear");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */
FREE(sa->array[i]);
sa->array[i] = NULL;
}
sa->n = 0;
return 0;
}
/*----------------------------------------------------------------------*
* Accessors *
*----------------------------------------------------------------------*/
/*!
* sarrayGetCount()
*
* Input: sarray
* Return: count, or 0 if no strings or on error
*/
l_int32
sarrayGetCount(SARRAY *sa)
{
PROCNAME("sarrayGetCount");
if (!sa)
return ERROR_INT("sa not defined", procName, 0);
return sa->n;
}
/*!
* sarrayGetArray()
*
* Input: sarray
* &nalloc (<optional return> number allocated string ptrs)
* &n (<optional return> number allocated strings)
* Return: ptr to string array, or null on error
*
* Notes:
* (1) Caution: the returned array is not a copy, so caller
* must not destroy it!
*/
char **
sarrayGetArray(SARRAY *sa,
l_int32 *pnalloc,
l_int32 *pn)
{
char **array;
PROCNAME("sarrayGetArray");
if (!sa)
return (char **)ERROR_PTR("sa not defined", procName, NULL);
array = sa->array;
if (pnalloc) *pnalloc = sa->nalloc;
if (pn) *pn = sa->n;
return array;
}
/*!
* sarrayGetString()
*
* Input: sarray
* index (to the index-th string)
* copyflag (L_NOCOPY or L_COPY)
* Return: string, or null on error
*
* Notes:
* (1) Legacy usage decrees that we always use 0 to get the
* pointer to the string itself, and 1 to get a copy of
* the string.
* (2) See usage comments at the top of this file.
* (3) To get a pointer to the string itself, use for copyflag:
* L_NOCOPY or 0 or FALSE
* To get a copy of the string, use for copyflag:
* L_COPY or 1 or TRUE
* The const values of L_NOCOPY and L_COPY are guaranteed not
* to change.
*/
char *
sarrayGetString(SARRAY *sa,
l_int32 index,
l_int32 copyflag)
{
PROCNAME("sarrayGetString");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
if (index < 0 || index >= sa->n)
return (char *)ERROR_PTR("index not valid", procName, NULL);
if (copyflag != L_NOCOPY && copyflag != L_COPY)
return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
if (copyflag == L_NOCOPY)
return sa->array[index];
else /* L_COPY */
return stringNew(sa->array[index]);
}
/*!
* sarrayGetRefCount()
*
* Input: sarray
* Return: refcount, or UNDEF on error
*/
l_int32
sarrayGetRefcount(SARRAY *sa)
{
PROCNAME("sarrayGetRefcount");
if (!sa)
return ERROR_INT("sa not defined", procName, UNDEF);
return sa->refcount;
}
/*!
* sarrayChangeRefCount()
*
* Input: sarray
* delta (change to be applied)
* Return: 0 if OK, 1 on error
*/
l_int32
sarrayChangeRefcount(SARRAY *sa,
l_int32 delta)
{
PROCNAME("sarrayChangeRefcount");
if (!sa)
return ERROR_INT("sa not defined", procName, UNDEF);
sa->refcount += delta;
return 0;
}
/*----------------------------------------------------------------------*
* Conversion to string *
*----------------------------------------------------------------------*/
/*!
* sarrayToString()
*
* Input: sarray
* addnlflag (flag: 0 adds nothing to each substring
* 1 adds '\n' to each substring
* 2 adds ' ' to each substring)
* Return: dest string, or null on error
*
* Notes:
* (1) Concatenates all the strings in the sarray, preserving
* all white space.
* (2) If addnlflag != 0, adds either a '\n' or a ' ' after
* each substring.
* (3) This function was NOT implemented as:
* for (i = 0; i < n; i++)
* strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
* Do you see why?
*/
char *
sarrayToString(SARRAY *sa,
l_int32 addnlflag)
{
PROCNAME("sarrayToString");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
return sarrayToStringRange(sa, 0, 0, addnlflag);
}
/*!
* sarrayToStringRange()
*
* Input: sarray
* first (index of first string to use; starts with 0)
* nstrings (number of strings to append into the result; use
* 0 to append to the end of the sarray)
* addnlflag (flag: 0 adds nothing to each substring
* 1 adds '\n' to each substring
* 2 adds ' ' to each substring)
* Return: dest string, or null on error
*
* Notes:
* (1) Concatenates the specified strings inthe sarray, preserving
* all white space.
* (2) If addnlflag != 0, adds either a '\n' or a ' ' after
* each substring.
* (3) If the sarray is empty, this returns a string with just
* the character corresponding to @addnlflag.
*/
char *
sarrayToStringRange(SARRAY *sa,
l_int32 first,
l_int32 nstrings,
l_int32 addnlflag)
{
char *dest, *src, *str;
l_int32 n, i, last, size, index, len;
PROCNAME("sarrayToStringRange");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
n = sarrayGetCount(sa);
/* Empty sa; return char corresponding to addnlflag only */
if (n == 0) {
if (first == 0) {
if (addnlflag == 0)
return stringNew("");
if (addnlflag == 1)
return stringNew("\n");
else /* addnlflag == 2) */
return stringNew(" ");
} else {
return (char *)ERROR_PTR("first not valid", procName, NULL);
}
}
if (first < 0 || first >= n)
return (char *)ERROR_PTR("first not valid", procName, NULL);
if (nstrings == 0 || (nstrings > n - first))
nstrings = n - first; /* no overflow */
last = first + nstrings - 1;
size = 0;
for (i = first; i <= last; i++) {
if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
return (char *)ERROR_PTR("str not found", procName, NULL);
size += strlen(str) + 2;
}
if ((dest = (char *)CALLOC(size + 1, sizeof(char))) == NULL)
return (char *)ERROR_PTR("dest not made", procName, NULL);
index = 0;
for (i = first; i <= last; i++) {
src = sarrayGetString(sa, i, L_NOCOPY);
len = strlen(src);
memcpy(dest + index, src, len);
index += len;
if (addnlflag == 1) {
dest[index] = '\n';
index++;
} else if (addnlflag == 2) {
dest[index] = ' ';
index++;
}
}
return dest;
}
/*----------------------------------------------------------------------*
* Concatenate 2 sarrays *
*----------------------------------------------------------------------*/
/*!
* sarrayConcatenate()
*
* Input: sa1 (to be added to)
* sa2 (append to sa1)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) Copies of the strings in sarray2 are added to sarray1.
*/
l_int32
sarrayConcatenate(SARRAY *sa1,
SARRAY *sa2)
{
char *str;
l_int32 n, i;
PROCNAME("sarrayConcatenate");
if (!sa1)
return ERROR_INT("sa1 not defined", procName, 1);
if (!sa2)
return ERROR_INT("sa2 not defined", procName, 1);
n = sarrayGetCount(sa2);
for (i = 0; i < n; i++) {
str = sarrayGetString(sa2, i, L_NOCOPY);
sarrayAddString(sa1, str, L_COPY);
}
return 0;
}
/*!
* sarrayAppendRange()
*
* Input: sa1 (to be added to)
* sa2 (append specified range of strings in sa2 to sa1)
* start (index of first string of sa2 to append)
* end (index of last string of sa2 to append; -1 to end of array)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) Copies of the strings in sarray2 are added to sarray1.
* (2) The [start ... end] range is truncated if necessary.
* (3) Use end == -1 to append to the end of sa2.
*/
l_int32
sarrayAppendRange(SARRAY *sa1,
SARRAY *sa2,
l_int32 start,
l_int32 end)
{
char *str;
l_int32 n, i;
PROCNAME("sarrayAppendRange");
if (!sa1)
return ERROR_INT("sa1 not defined", procName, 1);
if (!sa2)
return ERROR_INT("sa2 not defined", procName, 1);
if (start < 0)
start = 0;
n = sarrayGetCount(sa2);
if (end < 0 || end >= n)
end = n - 1;
if (start > end)
return ERROR_INT("start > end", procName, 1);
for (i = start; i <= end; i++) {
str = sarrayGetString(sa2, i, L_NOCOPY);
sarrayAddString(sa1, str, L_COPY);
}
return 0;
}
/*----------------------------------------------------------------------*
* Pad an sarray to be the same size as another sarray *
*----------------------------------------------------------------------*/
/*!
* sarrayPadToSameSize()
*
* Input: sa1, sa2
* padstring
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) If two sarrays have different size, this adds enough
* instances of @padstring to the smaller so that they are
* the same size. It is useful when two or more sarrays
* are being sequenced in parallel, and it is necessary to
* find a valid string at each index.
*/
l_int32
sarrayPadToSameSize(SARRAY *sa1,
SARRAY *sa2,
char *padstring)
{
l_int32 i, n1, n2;
PROCNAME("sarrayPadToSameSize");
if (!sa1 || !sa2)
return ERROR_INT("both sa1 and sa2 not defined", procName, 1);
n1 = sarrayGetCount(sa1);
n2 = sarrayGetCount(sa2);
if (n1 < n2) {
for (i = n1; i < n2; i++)
sarrayAddString(sa1, padstring, L_COPY);
} else if (n1 > n2) {
for (i = n2; i < n1; i++)
sarrayAddString(sa2, padstring, L_COPY);
}
return 0;
}
/*----------------------------------------------------------------------*
* Convert word sarray to line sarray *
*----------------------------------------------------------------------*/
/*!
* sarrayConvertWordsToLines()
*
* Input: sa (sa of individual words)
* linesize (max num of chars in each line)
* Return: saout (sa of formatted lines), or null on error
*
* This is useful for re-typesetting text to a specific maximum
* line length. The individual words in the input sarray
* are concatenated into textlines. An input word string of zero
* length is taken to be a paragraph separator. Each time
* such a string is found, the current line is ended and
* a new line is also produced that contains just the
* string of zero length (""). When the output sarray
* of lines is eventually converted to a string with newlines
* (typically) appended to each line string, the empty
* strings are just converted to newlines, producing the visible
* paragraph separation.
*
* What happens when a word is larger than linesize?
* We write it out as a single line anyway! Words preceding
* or following this long word are placed on lines preceding
* or following the line with the long word. Why this choice?
* Long "words" found in text documents are typically URLs, and
* it's often desirable not to put newlines in the middle of a URL.
* The text display program (e.g., text editor) will typically
* wrap the long "word" to fit in the window.
*/
SARRAY *
sarrayConvertWordsToLines(SARRAY *sa,
l_int32 linesize)
{
char *wd, *strl;
char emptystring[] = "";
l_int32 n, i, len, totlen;
SARRAY *sal, *saout;
PROCNAME("sarrayConvertWordsToLines");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
if ((saout = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("saout not defined", procName, NULL);
n = sarrayGetCount(sa);
totlen = 0;
sal = NULL;
for (i = 0; i < n; i++) {
if (!sal) {
if ((sal = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
}
wd = sarrayGetString(sa, i, L_NOCOPY);
len = strlen(wd);
if (len == 0) { /* end of paragraph: end line & insert blank line */
if (totlen > 0) {
strl = sarrayToString(sal, 2);
sarrayAddString(saout, strl, L_INSERT);
}
sarrayAddString(saout, emptystring, L_COPY);
sarrayDestroy(&sal);
totlen = 0;
} else if (totlen == 0 && len + 1 > linesize) { /* long word! */
sarrayAddString(saout, wd, L_COPY); /* copy to one line */
} else if (totlen + len + 1 > linesize) { /* end line & start new */
strl = sarrayToString(sal, 2);
sarrayAddString(saout, strl, L_INSERT);
sarrayDestroy(&sal);
if ((sal = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
sarrayAddString(sal, wd, L_COPY);
totlen = len + 1;
} else { /* add to current line */
sarrayAddString(sal, wd, L_COPY);
totlen += len + 1;
}
}
if (totlen > 0) { /* didn't end with blank line; output last line */
strl = sarrayToString(sal, 2);
sarrayAddString(saout, strl, L_INSERT);
sarrayDestroy(&sal);
}
return saout;
}
/*----------------------------------------------------------------------*
* Split string on separator list *
*----------------------------------------------------------------------*/
/*
* sarraySplitString()
*
* Input: sa (to append to; typically empty initially)
* str (string to split; not changed)
* separators (characters that split input string)
* Return: 0 if OK, 1 on error.
*
* Notes:
* (1) This uses strtokSafe(). See the notes there in utils.c.
*/
l_int32
sarraySplitString(SARRAY *sa,
const char *str,
const char *separators)
{
char *cstr, *substr, *saveptr;
PROCNAME("sarraySplitString");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if (!str)
return ERROR_INT("str not defined", procName, 1);
if (!separators)
return ERROR_INT("separators not defined", procName, 1);
cstr = stringNew(str); /* preserves const-ness of input str */
substr = strtokSafe(cstr, separators, &saveptr);
if (substr)
sarrayAddString(sa, substr, L_INSERT);
while ((substr = strtokSafe(NULL, separators, &saveptr)))
sarrayAddString(sa, substr, L_INSERT);
FREE(cstr);
return 0;
}
/*----------------------------------------------------------------------*
* Filter sarray *
*----------------------------------------------------------------------*/
/*!
* sarraySelectBySubstring()
*
* Input: sain (input sarray)
* substr (<optional> substring for matching; can be NULL)
* Return: saout (output sarray, filtered with substring) or null on error
*
* Notes:
* (1) This selects all strings in sain that have substr as a substring.
* Note that we can't use strncmp() because we're looking for
* a match to the substring anywhere within each filename.
* (2) If substr == NULL, returns a copy of the sarray.
*/
SARRAY *
sarraySelectBySubstring(SARRAY *sain,
const char *substr)
{
char *str;
l_int32 n, i, offset, found;
SARRAY *saout;
PROCNAME("sarraySelectBySubstring");
if (!sain)
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
n = sarrayGetCount(sain);
if (!substr || n == 0)
return sarrayCopy(sain);
saout = sarrayCreate(n);
for (i = 0; i < n; i++) {
str = sarrayGetString(sain, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (found)
sarrayAddString(saout, str, L_COPY);
}
return saout;
}
/*!
* sarraySelectByRange()
*
* Input: sain (input sarray)
* first (index of first string to be selected)
* last (index of last string to be selected; use 0 to go to the
* end of the sarray)
* Return: saout (output sarray), or null on error
*
* Notes:
* (1) This makes @saout consisting of copies of all strings in @sain
* in the index set [first ... last]. Use @last == 0 to get all
* strings from @first to the last string in the sarray.
*/
SARRAY *
sarraySelectByRange(SARRAY *sain,
l_int32 first,
l_int32 last)
{
char *str;
l_int32 n, i;
SARRAY *saout;
PROCNAME("sarraySelectByRange");
if (!sain)
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
if (first < 0) first = 0;
n = sarrayGetCount(sain);
if (last <= 0) last = n - 1;
if (last >= n) {
L_WARNING("@last > n - 1; setting to n - 1\n", procName);
last = n - 1;
}
if (first > last)
return (SARRAY *)ERROR_PTR("first must be >= last", procName, NULL);
saout = sarrayCreate(0);
for (i = first; i <= last; i++) {
str = sarrayGetString(sain, i, L_COPY);
sarrayAddString(saout, str, L_INSERT);
}
return saout;
}
/*!
* sarrayParseRange()
*
* Input: sa (input sarray)
* start (index to start range search)
* &actualstart (<return> index of actual start; may be > 'start')
* &end (<return> index of end)
* &newstart (<return> index of start of next range)
* substr (substring for matching at beginning of string)
* loc (byte offset within the string for the pattern; use
* -1 if the location does not matter);
* Return: 0 if valid range found; 1 otherwise
*
* Notes:
* (1) This finds the range of the next set of strings in SA,
* beginning the search at 'start', that does NOT have
* the substring 'substr' either at the indicated location
* in the string or anywhere in the string. The input
* variable 'loc' is the specified offset within the string;
* use -1 to indicate 'anywhere in the string'.
* (2) Always check the return value to verify that a valid range
* was found.
* (3) If a valid range is not found, the values of actstart,
* end and newstart are all set to the size of sa.
* (4) If this is the last valid range, newstart returns the value n.
* In use, this should be tested before calling the function.
* (5) Usage example. To find all the valid ranges in a file
* where the invalid lines begin with two dashes, copy each
* line in the file to a string in an sarray, and do:
* start = 0;
* while (!sarrayParseRange(sa, start, &actstart, &end, &start,
* "--", 0))
* fprintf(stderr, "start = %d, end = %d\n", actstart, end);
*/
l_int32
sarrayParseRange(SARRAY *sa,
l_int32 start,
l_int32 *pactualstart,
l_int32 *pend,
l_int32 *pnewstart,
const char *substr,
l_int32 loc)
{
char *str;
l_int32 n, i, offset, found;
PROCNAME("sarrayParseRange");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if (!pactualstart || !pend || !pnewstart)
return ERROR_INT("not all range addresses defined", procName, 1);
n = sarrayGetCount(sa);
*pactualstart = *pend = *pnewstart = n;
if (!substr)
return ERROR_INT("substr not defined", procName, 1);
/* Look for the first string without the marker */
if (start < 0 || start >= n)
return 1;
for (i = start; i < n; i++) {
str = sarrayGetString(sa, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (loc < 0) {
if (!found) break;
} else {
if (!found || offset != loc) break;
}
}
start = i;
if (i == n) /* couldn't get started */
return 1;
/* Look for the last string without the marker */
*pactualstart = start;
for (i = start + 1; i < n; i++) {
str = sarrayGetString(sa, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (loc < 0) {
if (found) break;
} else {
if (found && offset == loc) break;
}
}
*pend = i - 1;
start = i;
if (i == n) /* no further range */
return 0;
/* Look for the first string after *pend without the marker.
* This will start the next run of strings, if it exists. */
for (i = start; i < n; i++) {
str = sarrayGetString(sa, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (loc < 0) {
if (!found) break;
} else {
if (!found || offset != loc) break;
}
}
if (i < n)
*pnewstart = i;
return 0;
}
/*----------------------------------------------------------------------*
* Sort *
*----------------------------------------------------------------------*/
/*!
* sarraySort()
*
* Input: saout (output sarray; can be NULL or equal to sain)
* sain (input sarray)
* sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
* Return: saout (output sarray, sorted by ascii value), or null on error
*
* Notes:
* (1) Set saout = sain for in-place; otherwise, set naout = NULL.
* (2) Shell sort, modified from K&R, 2nd edition, p.62.
* Slow but simple O(n logn) sort.
*/
SARRAY *
sarraySort(SARRAY *saout,
SARRAY *sain,
l_int32 sortorder)
{
char **array;
char *tmp;
l_int32 n, i, j, gap;
PROCNAME("sarraySort");
if (!sain)
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
/* Make saout if necessary; otherwise do in-place */
if (!saout)
saout = sarrayCopy(sain);
else if (sain != saout)
return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
array = saout->array; /* operate directly on the array */
n = sarrayGetCount(saout);
/* Shell sort */
for (gap = n/2; gap > 0; gap = gap / 2) {
for (i = gap; i < n; i++) {
for (j = i - gap; j >= 0; j -= gap) {
if ((sortorder == L_SORT_INCREASING &&
stringCompareLexical(array[j], array[j + gap])) ||
(sortorder == L_SORT_DECREASING &&
stringCompareLexical(array[j + gap], array[j])))
{
tmp = array[j];
array[j] = array[j + gap];
array[j + gap] = tmp;
}
}
}
}
return saout;
}
/*!
* sarraySortByIndex()
*
* Input: sain
* naindex (na that maps from the new sarray to the input sarray)
* Return: saout (sorted), or null on error
*/
SARRAY *
sarraySortByIndex(SARRAY *sain,
NUMA *naindex)
{
char *str;
l_int32 i, n, index;
SARRAY *saout;
PROCNAME("sarraySortByIndex");
if (!sain)
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
if (!naindex)
return (SARRAY *)ERROR_PTR("naindex not defined", procName, NULL);
n = sarrayGetCount(sain);
saout = sarrayCreate(n);
for (i = 0; i < n; i++) {
numaGetIValue(naindex, i, &index);
str = sarrayGetString(sain, index, L_COPY);
sarrayAddString(saout, str, L_INSERT);
}
return saout;
}
/*!
* stringCompareLexical()
*
* Input: str1
* str2
* Return: 1 if str1 > str2 (lexically); 0 otherwise
*
* Notes:
* (1) If the lexical values are identical, return a 0, to
* indicate that no swapping is required to sort the strings.
*/
l_int32
stringCompareLexical(const char *str1,
const char *str2)
{
l_int32 i, len1, len2, len;
PROCNAME("sarrayCompareLexical");
if (!str1)
return ERROR_INT("str1 not defined", procName, 1);
if (!str2)
return ERROR_INT("str2 not defined", procName, 1);
len1 = strlen(str1);
len2 = strlen(str2);
len = L_MIN(len1, len2);
for (i = 0; i < len; i++) {
if (str1[i] == str2[i])
continue;
if (str1[i] > str2[i])
return 1;
else
return 0;
}
if (len1 > len2)
return 1;
else
return 0;
}
/*----------------------------------------------------------------------*
* Serialize for I/O *
*----------------------------------------------------------------------*/
/*!
* sarrayRead()
*
* Input: filename
* Return: sarray, or null on error
*/
SARRAY *
sarrayRead(const char *filename)
{
FILE *fp;
SARRAY *sa;
PROCNAME("sarrayRead");
if (!filename)
return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
if ((fp = fopenReadStream(filename)) == NULL)
return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
if ((sa = sarrayReadStream(fp)) == NULL) {
fclose(fp);
return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
}
fclose(fp);
return sa;
}
/*!
* sarrayReadStream()
*
* Input: stream
* Return: sarray, or null on error
*
* Notes:
* (1) We store the size of each string along with the string.
* (2) This allows a string to have embedded newlines. By reading
* the entire string, as determined by its size, we are
* not affected by any number of embedded newlines.
*/
SARRAY *
sarrayReadStream(FILE *fp)
{
char *stringbuf;
l_int32 i, n, size, index, bufsize, version, ignore;
SARRAY *sa;
PROCNAME("sarrayReadStream");
if (!fp)
return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
if (version != SARRAY_VERSION_NUMBER)
return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL);
if ((sa = sarrayCreate(n)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
bufsize = L_BUF_SIZE + 1;
if ((stringbuf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
for (i = 0; i < n; i++) {
/* Get the size of the stored string */
if (fscanf(fp, "%d[%d]:", &index, &size) != 2)
return (SARRAY *)ERROR_PTR("error on string size", procName, NULL);
/* Expand the string buffer if necessary */
if (size > bufsize - 5) {
FREE(stringbuf);
bufsize = (l_int32)(1.5 * size);
stringbuf = (char *)CALLOC(bufsize, sizeof(char));
}
/* Read the stored string, plus leading spaces and trailing \n */
if (fread(stringbuf, 1, size + 3, fp) != size + 3)
return (SARRAY *)ERROR_PTR("error reading string", procName, NULL);
/* Remove the \n that was added by sarrayWriteStream() */
stringbuf[size + 2] = '\0';
/* Copy it in, skipping the 2 leading spaces */
sarrayAddString(sa, stringbuf + 2, L_COPY);
}
ignore = fscanf(fp, "\n");
FREE(stringbuf);
return sa;
}
/*!
* sarrayWrite()
*
* Input: filename
* sarray
* Return: 0 if OK; 1 on error
*/
l_int32
sarrayWrite(const char *filename,
SARRAY *sa)
{
FILE *fp;
PROCNAME("sarrayWrite");
if (!filename)
return ERROR_INT("filename not defined", procName, 1);
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if ((fp = fopenWriteStream(filename, "w")) == NULL)
return ERROR_INT("stream not opened", procName, 1);
if (sarrayWriteStream(fp, sa))
return ERROR_INT("sa not written to stream", procName, 1);
fclose(fp);
return 0;
}
/*!
* sarrayWriteStream()
*
* Input: stream
* sarray
* Returns 0 if OK; 1 on error
*
* Notes:
* (1) This appends a '\n' to each string, which is stripped
* off by sarrayReadStream().
*/
l_int32
sarrayWriteStream(FILE *fp,
SARRAY *sa)
{
l_int32 i, n, len;
PROCNAME("sarrayWriteStream");
if (!fp)
return ERROR_INT("stream not defined", procName, 1);
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
n = sarrayGetCount(sa);
fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
fprintf(fp, "Number of strings = %d\n", n);
for (i = 0; i < n; i++) {
len = strlen(sa->array[i]);
fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]);
}
fprintf(fp, "\n");
return 0;
}
/*!
* sarrayAppend()
*
* Input: filename
* sarray
* Return: 0 if OK; 1 on error
*/
l_int32
sarrayAppend(const char *filename,
SARRAY *sa)
{
FILE *fp;
PROCNAME("sarrayAppend");
if (!filename)
return ERROR_INT("filename not defined", procName, 1);
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if ((fp = fopenWriteStream(filename, "a")) == NULL)
return ERROR_INT("stream not opened", procName, 1);
if (sarrayWriteStream(fp, sa))
return ERROR_INT("sa not appended to stream", procName, 1);
fclose(fp);
return 0;
}
/*---------------------------------------------------------------------*
* Directory filenames *
*---------------------------------------------------------------------*/
/*!
* getNumberedPathnamesInDirectory()
*
* Input: directory name
* substr (<optional> substring filter on filenames; can be NULL)
* numpre (number of characters in name before number)
* numpost (number of characters in name after the number,
* up to a dot before an extension)
* maxnum (only consider page numbers up to this value)
* Return: sarray of numbered pathnames, or NULL on error
*
* Notes:
* (1) Returns the full pathnames of the numbered filenames in
* the directory. The number in the filename is the index
* into the sarray. For indices for which there are no filenames,
* an empty string ("") is placed into the sarray.
* This makes reading numbered files very simple. For example,
* the image whose filename includes number N can be retrieved using
* pixReadIndexed(sa, N);
* (2) If @substr is not NULL, only filenames that contain
* the substring can be included. If @substr is NULL,
* all matching filenames are used.
* (3) If no numbered files are found, it returns an empty sarray,
* with no initialized strings.
* (4) It is assumed that the page number is contained within
* the basename (the filename without directory or extension).
* @numpre is the number of characters in the basename
* preceeding the actual page number; @numpost is the number
* following the page number, up to either the end of the
* basename or a ".", whichever comes first.
* (5) This is useful when all filenames contain numbers that are
* not necessarily consecutive. 0-padding is not required.
* (6) To use a O(n) matching algorithm, the largest page number
* is found and two internal arrays of this size are created.
* This maximum is constrained not to exceed @maxsum,
* to make sure that an unrealistically large number is not
* accidentally used to determine the array sizes.
*/
SARRAY *
getNumberedPathnamesInDirectory(const char *dirname,
const char *substr,
l_int32 numpre,
l_int32 numpost,
l_int32 maxnum)
{
l_int32 nfiles;
SARRAY *sa, *saout;
PROCNAME("getNumberedPathnamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
if ((nfiles = sarrayGetCount(sa)) == 0)
return sarrayCreate(1);
saout = convertSortedToNumberedPathnames(sa, numpre, numpost, maxnum);
sarrayDestroy(&sa);
return saout;
}
/*!
* getSortedPathnamesInDirectory()
*
* Input: directory name
* substr (<optional> substring filter on filenames; can be NULL)
* first (0-based)
* nfiles (use 0 for all to the end)
* Return: sarray of sorted pathnames, or NULL on error
*
* Notes:
* (1) Use @substr to filter filenames in the directory. If
* @substr == NULL, this takes all files.
* (2) The files in the directory, after optional filtering by
* the substring, are lexically sorted in increasing order.
* Use @first and @nfiles to select a contiguous set of files.
* (3) The full pathnames are returned for the requested sequence.
* If no files are found after filtering, returns an empty sarray.
*/
SARRAY *
getSortedPathnamesInDirectory(const char *dirname,
const char *substr,
l_int32 first,
l_int32 nfiles)
{
char *fname, *fullname;
l_int32 i, n, last;
SARRAY *sa, *safiles, *saout;
PROCNAME("getSortedPathnamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
if ((sa = getFilenamesInDirectory(dirname)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
safiles = sarraySelectBySubstring(sa, substr);
sarrayDestroy(&sa);
n = sarrayGetCount(safiles);
if (n == 0) {
L_WARNING("no files found\n", procName);
return safiles;
}
sarraySort(safiles, safiles, L_SORT_INCREASING);
first = L_MIN(L_MAX(first, 0), n - 1);
if (nfiles == 0)
nfiles = n - first;
last = L_MIN(first + nfiles - 1, n - 1);
saout = sarrayCreate(last - first + 1);
for (i = first; i <= last; i++) {
fname = sarrayGetString(safiles, i, L_NOCOPY);
fullname = genPathname(dirname, fname);
sarrayAddString(saout, fullname, L_INSERT);
}
sarrayDestroy(&safiles);
return saout;
}
/*!
* convertSortedToNumberedPathnames()
*
* Input: sorted pathnames (including zero-padded integers)
* numpre (number of characters in name before number)
* numpost (number of characters in name after the number,
* up to a dot before an extension)
* maxnum (only consider page numbers up to this value)
* Return: sarray of numbered pathnames, or NULL on error
*
* Notes:
* (1) Typically, numpre = numpost = 0; e.g., when the filename
* just has a number followed by an optional extension.
*/
SARRAY *
convertSortedToNumberedPathnames(SARRAY *sa,
l_int32 numpre,
l_int32 numpost,
l_int32 maxnum)
{
char *fname, *str;
l_int32 i, nfiles, num, index;
SARRAY *saout;
PROCNAME("convertSortedToNumberedPathnames");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
if ((nfiles = sarrayGetCount(sa)) == 0)
return sarrayCreate(1);
/* Find the last file in the sorted array that has a number
* that (a) matches the count pattern and (b) does not
* exceed @maxnum. @maxnum sets an upper limit on the size
* of the sarray. */
num = 0;
for (i = nfiles - 1; i >= 0; i--) {
fname = sarrayGetString(sa, i, L_NOCOPY);
num = extractNumberFromFilename(fname, numpre, numpost);
if (num < 0) continue;
num = L_MIN(num + 1, maxnum);
break;
}
if (num <= 0) /* none found */
return sarrayCreate(1);
/* Insert pathnames into the output sarray.
* Ignore numbers that are out of the range of sarray. */
saout = sarrayCreateInitialized(num, (char *)"");
for (i = 0; i < nfiles; i++) {
fname = sarrayGetString(sa, i, L_NOCOPY);
index = extractNumberFromFilename(fname, numpre, numpost);
if (index < 0 || index >= num) continue;
str = sarrayGetString(saout, index, L_NOCOPY);
if (str[0] != '\0')
L_WARNING("\n Multiple files with same number: %d\n",
procName, index);
sarrayReplaceString(saout, index, fname, L_COPY);
}
return saout;
}
/*!
* getFilenamesInDirectory()
*
* Input: directory name
* Return: sarray of file names, or NULL on error
*
* Notes:
* (1) The versions compiled under unix and cygwin use the POSIX C
* library commands for handling directories. For windows,
* there is a separate implementation.
* (2) It returns an array of filename tails; i.e., only the part of
* the path after the last slash.
* (3) Use of the d_type field of dirent is not portable:
* "According to POSIX, the dirent structure contains a field
* char d_name[] of unspecified size, with at most NAME_MAX
* characters preceding the terminating null character. Use
* of other fields will harm the portability of your programs."
* (4) As a consequence of (3), we note several things:
* - MINGW doesn't have a d_type member.
* - Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
* for d_type from all files.
* On these systems, this function will return directories
* (except for '.' and '..', which are eliminated using
* the d_name field).
*/
#ifndef _WIN32
SARRAY *
getFilenamesInDirectory(const char *dirname)
{
char *realdir, *name;
l_int32 len;
SARRAY *safiles;
DIR *pdir;
struct dirent *pdirentry;
PROCNAME("getFilenamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
realdir = genPathname(dirname, NULL);
pdir = opendir(realdir);
FREE(realdir);
if (!pdir)
return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
if ((safiles = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
while ((pdirentry = readdir(pdir))) {
/* It's nice to ignore directories. For this it is necessary to
* define _BSD_SOURCE in the CC command, because the DT_DIR
* flag is non-standard. */
#if !defined(__SOLARIS__)
if (pdirentry->d_type == DT_DIR)
continue;
#endif
/* Filter out "." and ".." if they're passed through */
name = pdirentry->d_name;
len = strlen(name);
if (len == 1 && name[len - 1] == '.') continue;
if (len == 2 && name[len - 1] == '.' && name[len - 2] == '.') continue;
sarrayAddString(safiles, name, L_COPY);
}
closedir(pdir);
return safiles;
}
#else /* _WIN32 */
/* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
#include <windows.h>
SARRAY *
getFilenamesInDirectory(const char *dirname)
{
char *pszDir;
char *realdir;
HANDLE hFind = INVALID_HANDLE_VALUE;
SARRAY *safiles;
WIN32_FIND_DATAA ffd;
PROCNAME("getFilenamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
realdir = genPathname(dirname, NULL);
pszDir = stringJoin(realdir, "\\*");
FREE(realdir);
if (strlen(pszDir) + 1 > MAX_PATH) {
FREE(pszDir);
return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
}
if ((safiles = sarrayCreate(0)) == NULL) {
FREE(pszDir);
return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
}
hFind = FindFirstFileA(pszDir, &ffd);
if (INVALID_HANDLE_VALUE == hFind) {
sarrayDestroy(&safiles);
FREE(pszDir);
return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
}
while (FindNextFileA(hFind, &ffd) != 0) {
if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */
continue;
convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
sarrayAddString(safiles, ffd.cFileName, L_COPY);
}
FindClose(hFind);
FREE(pszDir);
return safiles;
}
#endif /* _WIN32 */