src/mime/rfc2047.c from jens-maus/yam

src/mime/rfc2047.c
Summary

Maintainability

Test Coverage

Issues
/***************************************************************************

 YAM - Yet Another Mailer
 Copyright (C) 1995-2000 Marcel Beck
 Copyright (C) 2000-2022 YAM Open Source Team

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 YAM Official Support Site :  http://www.yam.ch
 YAM OpenSource project    :  http://sourceforge.net/projects/yamos/

 $Id$

***************************************************************************/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <proto/exec.h>
#include <proto/codesets.h>

#include "SDI_compiler.h"

#include "YAM.h"

#include "mime/base64.h"
#include "mime/qprintable.h"

#include "Config.h"

#include "Debug.h"

// RFC 2047 routines
// The following RFC2047 encoding/decoding routines are highly inspired and
// partly adopted from the Courier IMAP project available at:
//
//  http://www.sourceforge.net/projects/courier
//
// whereas the RFC2047 related code is available at:
//
//  http://courier.cvs.sourceforge.net/courier/libs/rfc822/rfc2047.c
//

// ISSPACE(i): true for the four whitespace characters tab, CR, LF and blank.
#define ISSPACE(i) ((i)=='\t' || (i)=='\r' || (i)=='\n' || (i)==' ')
// DOENCODE(i): true if character 'i' forces the word it belongs to to be
// encoded: it has the high bit set, is '"' or '=', is a control character
// below 0x20 other than the ISSPACE() ones, or is rejected (return value 0)
// by the predicate 'qp_allow'.
// NOTE: the macro expands to a call through an identifier named 'qp_allow',
// so it can only be used where such a function pointer is in scope. The
// argument is evaluated several times.
#define DOENCODE(i) (((i) & 0x80) || (i)=='"' || (i)=='=' || \
      ((unsigned char)(i) < 0x20 && !ISSPACE(i)) || \
      !(*qp_allow)(i))
// DOENCODEWORD(c): true if character 'c' cannot appear literally inside a
// "Q" encoded-word: high bit set, '"', anything up to and including the
// blank (0x20), '_', '=', '?', or rejected by 'qp_allow'. The same scope and
// multiple-evaluation caveats as for DOENCODE() apply.
#define DOENCODEWORD(c) \
  (((c) & 0x80) || (c) == '"' || (unsigned char)(c) <= 0x20 || \
   (c) == '_' || (c) == '=' || (c) == '?' || !(*qp_allow)(c))

#define ENCWORD_LEN    75                  // max. encoded-word length (rfc2047 2.0)

// character classes, each given as a plain C string of member characters
#define CHARS_LWSP     " \t\n\r"           // linear whitespace chars
#define CHARS_TSPECIAL "()<>@,;:\\\"/[]?="
#define CHARS_SPECIAL  "()<>@,;:\\\".[]"
#define CHARS_CSPECIAL "()\\\r"            // not in comments
#define CHARS_DSPECIAL "[]\\\r \t"         // not in domains
#define CHARS_ESPECIAL "()<>@,;:\"/[]?.=_" // encoded word specials (rfc2047 5.1)
#define CHARS_PSPECIAL "!*+-/=_"           // encoded phrase specials (rfc2047 5.3)

// local functions (forward declarations, the definitions follow below)

// decoder core: walks 'text' and hands every decoded or literal fragment
// to 'func' together with the user pointer 'arg'
static int rfc2047_decode_int(const char *text,
                              int (*func)(const char *, unsigned int, const char *, const char *, void *),
                              void *arg);
// fragment sink used by rfc2047_decode(); 'arg' is a struct rfc2047_decode_info
static int rfc2047_decode_callback(const char *txt, unsigned int len, const char *chset,
                                   const char *lang, void *arg);
// returns a malloc()ed copy of the text up to the next '?' (or end of string)
// and advances *ptr to that position
INLINE char *rfc2047_search_quote(const char **ptr);

/*** RFC 2047 MIME encoding/decoding routines ***/
/// rfc2047_encode_qp()
// Emits the 'len' bytes starting at 'str' as one or more RFC2047 "Q"
// (quoted-printable like) encoded-words of the form "=?charset?Q?...?=".
//
//  str      - bytes to encode (does not have to be NUL terminated)
//  len      - number of bytes at 'str'
//  charset  - NUL terminated charset name put into every encoded-word
//  qp_allow - predicate consulted through DOENCODEWORD(); returning 0
//             forces the character to be written as "=XX"
//  func     - output sink, called with (data, length, arg) for every piece
//             of output; a non-zero result aborts the encoding
//  arg      - opaque pointer handed through to 'func'
//  foldlen  - folding budget from which the per-word limit is derived
//
// A blank is written as '_', every other character matched by
// DOENCODEWORD() as '=' followed by two digits taken from basis_hex[].
// Whenever the running length of the current word reaches the computed
// limit the word is closed with "?= " (including the separating blank) and
// a new one is started. The very last word is closed with "?=" only.
//
// Returns 0 on success or the first non-zero value returned by 'func'.
//
// NOTE: words are cut on byte boundaries, so a multi-byte character of
// 'charset' can end up split over two encoded-words.
static int rfc2047_encode_qp(const char *str, size_t len, const char *charset,
                             int (*qp_allow)(const char),
                             int (*func)(const char *, size_t, void *), void *arg,
                             size_t foldlen)
{
  int rc = 0;
  size_t maxlen;
  size_t i = len;   // bytes of 'str' still to be processed
  size_t c = 0;     // length of the currently open encoded-word, 0 = none open
  size_t clen;

  ENTER();

  // derive the limit at which an encoded-word is closed. The subtraction
  // is only performed if it cannot wrap around, otherwise we fall back
  // to the same minimum that is used for overlong charset names.
  clen = strlen(charset);
  maxlen = clen+10;

  if(maxlen < 65 && foldlen + 4 > maxlen)
    maxlen = foldlen + 4 - maxlen;
  else
    maxlen = 10;

  while(i > 0)
  {
    if(c == 0)
    {
      // begin a new encoded word
      if((rc = (*func)("=?", 2, arg)) != 0 ||
         (rc = (*func)(charset, clen, arg)) != 0 ||
         (rc = (*func)("?Q?", 3, arg)) != 0)
      {
        break;
      }

      c += clen+3+2;
    }

    if(DOENCODEWORD(*str))
    {
      if(*str == ' ')
      {
        // a blank has its own one-char representation
        if((rc = (*func)("_", 1, arg)) != 0)
          break;

        c += 1;
      }
      else
      {
        char buf[3];

        buf[0] = '=';
        buf[1] = basis_hex[ ( *str >> 4) & 0x0F ];
        buf[2] = basis_hex[ *str & 0x0F ];

        if((rc = (*func)(buf, 3, arg)) != 0)
          break;

        c += 3;
      }

      ++str;
      --i;
    }
    else
    {
      size_t j;

      // collect the run of characters that can be copied literally,
      // but do not exceed the limit of the current encoded-word
      for(j=0; j < i && DOENCODEWORD(str[j]) == FALSE; j++)
      {
        if((j + c) >= maxlen)
          break;
      }

      // With a long charset name the word header alone can already reach
      // the limit. The run would then be empty, the word would be closed
      // and reopened without consuming any input and we would loop forever.
      // str[0] is known to be a literal character here, so always take at
      // least this one to guarantee progress.
      if(j == 0)
        j = 1;

      if((rc = (*func)(str, j, arg)) != 0)
        break;

      c += j;
      str += j;
      i -= j;
    }

    if(i == 0 || c >= maxlen)
    {
      // close the word; the trailing blank is only emitted if another
      // encoded-word is going to follow
      if((rc = (*func)("?= ", (i != 0) ? 3 : 2, arg)) != 0)
        break;

      c=0;
    }
  }

  RETURN(rc);
  return rc;
}

///
/// rfc2047_encode_base64()
// Writes the 'len' bytes at 'ptr' as a sequence of RFC2047 "B" (base64)
// encoded-words "=?charset?B?...?=" to the output sink 'func'.
//
//  ptr      - bytes to encode (no NUL termination required)
//  len      - number of bytes at 'ptr'
//  charset  - NUL terminated charset name placed in each encoded-word
//  func     - output sink called with (data, length, arg); a non-zero
//             result stops the encoding
//  arg      - opaque pointer passed through to 'func'
//  foldlen  - folding budget; 0 disables the length check so that
//             everything ends up in a single encoded-word
//
// Input is consumed in groups of up to three bytes, each producing four
// output characters out of basis_64[] with '=' padding for a short final
// group. Words are separated by a single blank, the last one is closed
// without it.
//
// Returns 0 on success or the first non-zero value returned by 'func'.
static int rfc2047_encode_base64(const char *ptr, size_t len, const char *charset,
                                 int (*func)(const char *, size_t, void *), void *arg,
                                 size_t foldlen)
{
  int rc = 0;
  size_t clen;

  ENTER();

  clen = strlen(charset);

  while(len > 0 && rc == 0)
  {
    size_t used;

    // open a new encoded-word
    if((rc = (*func)("=?", 2, arg)) != 0)
      break;
    if((rc = (*func)(charset, clen, arg)) != 0)
      break;
    if((rc = (*func)("?B?", 3, arg)) != 0)
      break;

    used = 2 + clen + 3;

    do
    {
      size_t take = (len > 3) ? 3 : len;
      unsigned char in0 = ptr[0];
      unsigned char in1 = (take > 1) ? ptr[1] : 0;
      unsigned char in2 = (take > 2) ? ptr[2] : 0;
      char quad[4];

      ptr += take;
      len -= take;

      // 3 x 8 bit -> 4 x 6 bit, missing input bytes become '=' padding
      quad[0] = basis_64[in0 >> 2];
      quad[1] = basis_64[((in0 & 0x03) << 4) | (in1 >> 4)];
      quad[2] = (take < 2) ? '=' : basis_64[((in1 & 0x0F) << 2) | (in2 >> 6)];
      quad[3] = (take < 3) ? '=' : basis_64[in2 & 0x3F];

      if((rc = (*func)(quad, 4, arg)) != 0)
        break;

      used += 4;
    }
    // go on until the input is exhausted or the encoded-word
    // (including the closing "?=") would become too large
    while(len > 0 && !(foldlen && used + 2 > foldlen - 1 + 4));

    if(rc != 0)
      break;

    // Encoded-words must be separated by linear-white-space, thus the
    // blank is part of the trailer whenever more input is left.
    rc = (*func)("?= ", (len > 0) ? 3 : 2, arg);
  }

  RETURN(rc);
  return rc;
}

///
/// rfc2047_encode_callback()
// The core of the RFC2047 encoder. It walks over 'str', passes everything
// that needs no encoding unchanged to 'func' and converts the remaining
// stretches into encoded-words using either the base64 or the
// quoted-printable variant.
//
//  str      - NUL terminated source string (NULL/empty yields no output)
//  charset  - charset name to put into the encoded-words
//  qp_allow - predicate used by the DOENCODE()/DOENCODEWORD() macros; a
//             return value of 0 marks the character as "must be encoded"
//  func     - output sink, called with (data, length, arg) for every piece
//             of output (not per character); non-zero aborts
//  arg      - opaque pointer handed through to 'func'
//
// Returns 0 on success or the first non-zero value returned by 'func' or
// by one of the two encoders.
static int rfc2047_encode_callback(const char *str, const char *charset,
                                   int (*qp_allow)(const char),
                                   int (*func)(const char *, size_t, void *),
                                   void *arg)
{
  int rc = 0;

  ENTER();

  if(IsStrEmpty(str) == FALSE)
  {
    // each iteration outputs one literal stretch followed by
    // one encoded stretch
    while(*str != '\0')
    {
      size_t i;
      size_t j;
      size_t n;
      size_t c;

      // search the first character that requires encoding
      for(i=0; str[i] != '\0'; i++)
      {
        if(!ISSPACE((int)(unsigned char)str[i]) && DOENCODE(str[i]))
          break;
      }

      // nothing (more) to encode, output the rest literally and stop
      if(str[i] == '\0')
      {
        rc = (i ? (*func)(str, i, arg) : 0);
        break;
      }

      // Find start of word, i.e. walk back to the character right after
      // the previous whitespace (or to the start of the string)
      while(i)
      {
        --i;

        if(ISSPACE((int)(unsigned char)str[i]))
        {
          ++i;
          break;
        }
      }

      // everything in front of that word goes out unencoded
      if(i > 0)
      {
        if((rc = (*func)(str, i, arg)) != 0)
          break;

        str += i;
      }

      // Figure out when to stop MIME encoding. Consecutive
      // words that need encoding are MIME-encoded together,
      // including the whitespace between them.
      for(i=0;;)
      {
        // advance to the end of the current word
        for(; str[i] != '\0'; i++)
        {
          if(ISSPACE((int)(unsigned char)str[i]))
            break;
        }

        if(str[i] == '\0')
          break;

        // skip the whitespace following the word
        for(c=i; str[c] != '\0' && ISSPACE((int)(unsigned char)str[c]); ++c)
          ; // nothing

        // scan the next word for a character that requires encoding
        for(; str[c] != '\0'; c++)
        {
          if(ISSPACE((int)(unsigned char)str[c]) || DOENCODE(str[c]))
            break;
        }

        // the next word is clean, so the encoded stretch ends at i
        if(str[c] == '\0' || ISSPACE((int)(unsigned char)str[c]))
          break;

        // the next word needs encoding as well, so include it
        i=c;
      }

      // Figure out whether base64 is a better choice by counting the
      // characters which could not be kept literally in a "Q" word.
      for(n=0, j=0; j < i; j++)
      {
        if(DOENCODEWORD(str[j]))
          ++n;
      }

      // check the ratio and decide which encoding to take: more than
      // a tenth of the stretch needing escapes selects base64
      if(n > i/10)
        rc = rfc2047_encode_base64(str, i, charset, func, arg, 70);
      else
        rc = rfc2047_encode_qp(str, i, charset, qp_allow, func, arg, 70);

      if(rc != 0)
        break;

      str += i;
    }
  }

  RETURN(rc);
  return rc;
}

///
/// rfc2047_count_char()
// Output sink for rfc2047_encode_callback() that discards the data and
// only sums up the lengths. 'p' has to point to the size_t accumulator.
// Always returns 0.
static int rfc2047_count_char(const char *c UNUSED, size_t l, void *p)
{
  size_t *total = p;

  *total = *total + l;

  return 0;
}

///
/// rfc2047_save_char()
// Output sink for rfc2047_encode_callback() that appends the 'l' bytes
// at 'c' to a destination buffer. 'p' has to point to a char pointer
// holding the current write position; it is advanced by 'l' afterwards.
// No bounds checking and no NUL termination is performed here.
// Always returns 0.
static int rfc2047_save_char(const char *c, size_t l, void *p)
{
  char **cursor = p;
  char *target = *cursor;

  memcpy(target, c, l);
  *cursor = target + l;

  return 0;
}

///
/// rfc2047_encode_str()
// Returns a freshly malloc()ed, NUL terminated RFC2047 encoded copy of
// 'str' or NULL if the allocation failed. The caller has to free() it.
// The work is done in two passes of rfc2047_encode_callback(): the first
// one only measures the output, the second one writes it into the buffer.
// Only those passages which require encoding are turned into
// encoded-words, and the encoders insert blanks between the encoded-words
// they produce.
static char *rfc2047_encode_str(const char *str, const char *charset,
                                int (*qp_allow)(const char c))
{
  size_t needed = 1; // start with the room for the terminating NUL
  char *result;

  ENTER();

  // first pass: find out how many chars we are going to produce
  rfc2047_encode_callback(str, charset, qp_allow, &rfc2047_count_char, &needed);

  result = malloc(needed);
  if(result != NULL)
  {
    char *cursor = result;

    // second pass: this time really store the output
    rfc2047_encode_callback(str, charset, qp_allow, &rfc2047_save_char, &cursor);
    *cursor = '\0';
  }

  RETURN(result);
  return result;
}

///
/// rfc2047_qp_allow_any()
// Permissive qp_allow() predicate for rfc2047_encode_callback(): it
// returns 1 for every character, so the decision whether something has to
// be encoded is left entirely to the other conditions of the
// DOENCODE()/DOENCODEWORD() macros.
static int rfc2047_qp_allow_any(const char c UNUSED)
{
  return 1;
}

///
/// rfc2047_qp_allow_comment()
/*
static int rfc2047_qp_allow_comment(const char c)
{
  if(c == '(' || c == ')' || c == '"')
    return 0;
  else
    return 1;
}
*/
///
/// rfc2047_qp_allow_word()
/*
static int rfc2047_qp_allow_word(const char c)
{
  return strchr(basis_64, c) != NULL ||
         strchr("*-=_", c) != NULL;
}
*/

///
/// rfc2047_encode_file()
// Encodes 'src' according to RFC2047 and writes the result to 'fh',
// folding it into continuation lines where necessary.
//
//  fh     - stream the encoded text is written to
//  src    - NUL terminated string to encode
//  offset - number of characters already present on the current output
//           line (e.g. the header name); it counts for the first line only
//
// The string is first encoded as a whole via rfc2047_encode_str() using
// the charset name of G->writeCodeset. If 'offset' plus the encoded length
// reaches 75 characters, the text is split at blanks: such a blank is
// replaced by a newline followed by a single blank. Text without any blank
// cannot be folded and is written as is.
//
// The results of fwrite() are not evaluated and the function always
// returns 0, also if the encoding failed due to lack of memory.
int rfc2047_encode_file(FILE *fh, const char *src, const size_t offset)
{
  char *dst;

  ENTER();

  // call encode_str() to encode the source string into a valid
  // RFC2047 encoded string with enough spaces so that we can
  // split it later into separate lines not longer than 75 chars.
  if((dst = rfc2047_encode_str(src, strippedCharsetName(G->writeCodeset), &rfc2047_qp_allow_any)))
  {
    size_t len = strlen(dst);

    D(DBF_MIME, "encoded rfc2047 string: '%s'", dst);

    // now we split the dst string into different lines.
    // The check is done as a sum, because "75-offset" would wrap
    // around for an offset larger than 75 and disable the folding.
    if(offset + len >= 75)
    {
      const char *end = dst + len; // one past the last encoded char
      char *s = dst;               // start of the not yet written part
      char *e = dst;               // current scan position
      char *last_space = NULL;     // last blank seen since 's'
      size_t c = offset;           // chars on the current output line

      // start our search
      while(len > 0)
      {
        if(*e == ' ')
          last_space = e;

        // check if we need a newline and
        // if so we go and write out the last
        // stuff including a newline.
        if(c >= 75 && last_space != NULL)
        {
          fwrite(s, last_space-s, 1, fh);

          // The blank is replaced by the fold. This has to happen whenever
          // there is text following the blank, no matter how far the scan
          // position already is, otherwise the blank would be lost and the
          // two neighbouring words would be glued together. Only a blank
          // at the very end of the string is dropped without a fold.
          if(last_space+1 < end)
            fwrite("\n ", 2, 1, fh);

          s = last_space+1;
          // if the blank is the current char then 's' is one ahead of 'e'
          // and the difference wraps around; the increment below brings
          // the counter back to zero in that case.
          c = e-s;
          last_space = NULL;
        }

        c++;
        e++;
        len--;
      }

      // write out what is left after the last fold
      if(c > 0)
        fwrite(s, e-s, 1, fh);
    }
    else
      fwrite(dst, len, 1, fh);

    // free the dst string
    free(dst);
  }

  RETURN(0);
  return 0;
}

///
/// rfc2047_decode()
// decodes a rfc2047 encoded string and eventually translates
// each character of the decoded string according to the provided
// translation table.
// State shared between rfc2047_decode() and rfc2047_decode_callback().
struct rfc2047_decode_info
{
  // current write position in the destination buffer, advanced by the
  // callback after each copied fragment
  char *dst;
  // number of bytes which may still be written at 'dst', decreased by
  // the callback accordingly
  unsigned int maxlen;
};

// Decodes the RFC2047 encoded string 'src' into 'dst'.
//
//  dst    - destination buffer
//  src    - NUL terminated source string
//  maxlen - number of bytes that may be stored at 'dst'. The terminating
//           NUL byte is written in addition to that, so the buffer has to
//           provide room for maxlen+1 bytes.
//
// The decoding itself is done by rfc2047_decode_int(), which calls
// rfc2047_decode_callback() for every decoded token so that the token can
// be converted to the local charset and appended to 'dst'.
//
// Returns -1 if a memory allocation failed or the destination is too
// small, -2 if an unknown encoding was specified in the text and -3 if a
// base64 decoding failed. A positive result of rfc2047_decode_int() is
// turned into the number of bytes stored at 'dst'.
// NOTE(review): rfc2047_decode_int() as found in this file reports success
// with 0 and never returns a positive value, so a successful decoding
// yields 0 here instead of the length - confirm what the callers expect
// before changing the comparison.
int rfc2047_decode(char *dst, const char *src, unsigned int maxlen)
{
  struct rfc2047_decode_info info;
  int result;

  // the callback keeps track of the write position and of
  // the remaining space via this structure
  info.dst    = dst;
  info.maxlen = maxlen;

  result = rfc2047_decode_int(src, &rfc2047_decode_callback, &info);

  // terminate whatever has been stored, also in case of an error
  *info.dst = '\0';

  if(result <= 0)
    return result;

  // the consumed space equals the decoded string len
  return (int)(maxlen-info.maxlen);
}

///
/// rfc2047_decode_callback()
// Fragment sink for rfc2047_decode_int(). It is called for every piece
// of decoded (or literally taken over) text and appends it to the
// destination buffer described by the struct rfc2047_decode_info in 'arg'.
//
//  txt   - the fragment (not necessarily NUL terminated)
//  len   - length of the fragment in bytes
//  chset - charset name of the fragment or NULL/empty if unknown
//  lang  - language tag of the encoded-word (unused)
//  arg   - pointer to the struct rfc2047_decode_info
//
// If a charset is given which differs from the local one and which is
// known to codesets.library, the fragment is converted to the local
// charset first. If that is not possible the raw bytes are copied.
//
// Returns 0 on success and -1 if the destination buffer is too small.
static int rfc2047_decode_callback(const char *txt, unsigned int len, const char *chset,
                                   UNUSED const char *lang, void *arg)
{
  struct rfc2047_decode_info *info = (struct rfc2047_decode_info *)arg;

  // in case the user wants us to detect the correct cyrillic codeset
  // we do it now, except for text which is flagged as utf-8
  if(C->DetectCyrillic == TRUE)
  {
    if(chset == NULL || stricmp(chset, "utf-8") != 0)
    {
      struct codeset *cs = CodesetsFindBest(CSA_Source,         txt,
                                            CSA_SourceLen,      len,
                                            CSA_CodesetFamily,  CSV_CodesetFamily_Cyrillic,
                                            TAG_DONE);

      if(cs != NULL)
        chset = cs->name;
    }
  }

  // now we try to get the src codeset from codesets.library
  // and convert the string into our local charset if required
  if(IsStrEmpty(chset) == FALSE)
  {
    // check if the src codeset of the string isn't the same
    // like our local one.
    if(stricmp(chset, strippedCharsetName(G->localCodeset)) != 0)
    {
      struct codeset *srcCodeset;

      if((srcCodeset = CodesetsFind((char *)chset,
                                    CSA_CodesetList,       G->codesetsList,
                                    CSA_FallbackToDefault, FALSE,
                                    TAG_DONE)) != NULL)
      {
        ULONG dstLen = 0;

        // now we convert the text from the source codeset to our
        // local codeset
        STRPTR str = CodesetsConvertStr(CSA_SourceCodeset,   srcCodeset,
                                        CSA_DestCodeset,     G->localCodeset,
                                        CSA_Source,          txt,
                                        CSA_SourceLen,       len,
                                        CSA_DestLenPtr,      &dstLen,
                                        CSA_MapForeignChars, C->MapForeignChars,
                                        TAG_DONE);

        // now that we have our converted string we can go and
        // copy it over our destination buffer
        if(str != NULL && dstLen > 0)
        {
          // before we go on we check whether we have enough space
          // to put the txt in our destination
          if(info->maxlen < dstLen)
          {
            W(DBF_MIME, "not enough space to put converted string with len %ld into info->dst!", dstLen);

            CodesetsFreeA(str, NULL);
            return -1;
          }

          // so we can safely decrease it.
          info->maxlen -= dstLen;

          // lets copy all data from our converted string to
          // our own destination buffer
          memmove(info->dst, str, dstLen);

          // free our codesets buffer.
          CodesetsFreeA(str, NULL);

          // increase the destination pointer so that the
          // next iteration can profit out of it.
          info->dst += dstLen;

          // return without an error
          return 0;
        }
        else
        {
          W(DBF_MIME, "couldn't convert src str via CodesetsConvertStr(), codesets '%s' -> '%s'!", SafeStr(srcCodeset->name), SafeStr(G->localCodeset->name));

          // a buffer may have been returned although the reported length
          // is zero. It has to be released as well, otherwise it is lost
          // when we fall back to copying the raw text below.
          if(str != NULL)
            CodesetsFreeA(str, NULL);
        }
      }
      #if defined(DEBUG)
      else if(stricmp(chset, "us-ascii") != 0)
        W(DBF_MIME, "couldn't find charset '%s' in codesets.library!", chset);
      #endif
    }
  }

  // no conversion was required or possible, so the text is
  // taken over as is
  if(info->maxlen < len)
  {
    W(DBF_MIME, "not enough space to put string with len %ld into info->dst!", len);
    return -1;
  }

  // so we can safely decrease it.
  info->maxlen -= len;

  // lets copy all data from the source text to our own destination
  // buffer. memmove() is used as the regions may overlap.
  memmove(info->dst, txt, len);

  // increase the destination pointer so that the
  // next iteration can profit out of it.
  info->dst += len;

  return 0;
}
///
/// rfc2047_search_quote()
// Extracts one '?' delimited field of an encoded-word. Starting at *ptr
// the text up to (but not including) the next '?' or the end of the string
// is duplicated into a malloc()ed, NUL terminated buffer. On return *ptr
// points to that '?' respectively to the terminating NUL byte, also if the
// allocation failed.
// Returns the copy, which has to be free()d by the caller, or NULL if
// no memory was available.
INLINE char *rfc2047_search_quote(const char **ptr)
{
  const char *start = *ptr;
  const char *scan = start;
  size_t span;
  char *copy;

  while(*scan != '\0' && *scan != '?')
    scan++;

  *ptr = scan;

  span = (size_t)(scan - start);
  copy = malloc(span + 1);
  if(copy != NULL)
  {
    memcpy(copy, start, span);
    copy[span] = '\0';
  }

  return copy;
}

///
/// rfc2047_decode_int()
// This is the main rfc2047 decoding function. It receives rfc2047-encoded
// text, and a callback function.  The callback function is repeatedly
// called, each time receiving a piece of decoded text.  The decoded
// info includes the text fragment, text length, the input charset, the
// language (if existent) and a void pointer to user specific data for
// the processing. Text outside of encoded-words and encoded-words which
// could not be decoded are passed on unchanged with charset and language
// set to 0.
//
// If the callback function returns non-zero, rfc2047 decoding terminates,
// returning that result code. -1 is returned immediately if a malloc
// failed. Otherwise the whole text is processed and the function returns
// 0 after a successful decoding, -2 if an encoded-word with an unknown
// encoding was found or -3 if a base64 decoding failed.
static int rfc2047_decode_int(const char *text,
                              int (*func)(const char *, unsigned int, const char *, const char *, void *),
                              void *arg)
{
  int rc, result=0;
  int unknown_enc=0;    // set if the current encoded-word could not be decoded
  int had_last_word=0;  // set while only whitespace was seen since the last encoded-word
  const char *p;
  char *chset, *lang;
  char *encoding;
  char *enctext;

  while(IsStrEmpty(text) == FALSE)
  {
    // remember where this token starts
    p=text;

    // plain text: collect everything up to the next "=?" tag
    if(*text != '=' || *(text+1) != '?')
    {
      while(*text)
      {
        // even if RFC2047 exactly defines which chars are allowed right before
        // an "=?" tag, we do accept all chars right before an "=?" tag for compatibility
        // reasons (even Thunderbird accepts something like 'Titel/Abs=?ISO-8859-1?Q?tract_f=FCr_Pap?=er Contribution')
        if(*text == '=' && *(text+1) == '?')
          break;

        if(!isspace((int)(unsigned char)*text))
          had_last_word=0;

        ++text;
      }

      // whitespace-only text between two encoded-words is dropped,
      // everything else is passed on as is
      if(text > p && !had_last_word)
      {
        rc = (*func)(p, (unsigned int)(text-p), 0, 0, arg);
        if(rc != 0)
          return rc;
      }

      continue;
    }

    // we are at an "=?" tag, so split "charset?encoding?text" into
    // three separately allocated strings
    text += 2;
    if((chset = rfc2047_search_quote(&text)) == 0)
      return -1;

    if(*text) ++text;
    if((encoding = rfc2047_search_quote(&text)) == 0)
    {
      free(chset);
      return -1;
    }

    if(*text) ++text;
    if((enctext = rfc2047_search_quote(&text)) == 0)
    {
      free(encoding);
      free(chset);
      return -1;
    }

    // skip the closing "?=" if there is one
    if(*text == '?' && text[1] == '=')
      text += 2;

    // now we check the encoding string.
    // q/Q defines to quoted-printable decoding
    // b/B defines to base64 encoding
    // and to sort out possible errors we check for the finalizing NUL-byte
    if(encoding[0] != '\0' && encoding[1] == '\0') // have to be 1 char long
    {
      switch(encoding[0])
      {
        // we found a quoted-printable encoded rfc2047 compliant string, so
        // lets decode it. This is done in place as the result can never
        // be longer than the source.
        case 'q':
        case 'Q':
        {
          char *q, *r;

          for(q=r=enctext; *q; )
          {
            int c;

            if(*q == '=' && q[1] && q[2])
            {
              // NOTE(review): hexchar() is defined elsewhere and apparently
              // signals a character which is no hex digit by returning 255
              unsigned char c1 = hexchar(q[1]);
              unsigned char c2 = hexchar(q[2]);

              if(c1 != 255 && c2 != 255)
              {
                *r++ = c1<<4 | c2;
              }
              else
              {
                // no valid "=XX" sequence, keep the three chars as they are
                *r++ = *q;
                *r++ = q[1];
                *r++ = q[2];
              }

              q += 3;

              continue;
            }

            // an underscore represents a blank
            c = *q++;
            if(c == '_')
              c = ' ';
            *r++ = c ;
          }
          *r=0;
        }
        break;

        // we found a base64 encoded rfc2047 compliant string, so
        // lets decode it.
        case 'b':
        case 'B':
        {
          char *etext = Trim(enctext);
          char *dectext = NULL;
          int res;

          if((res = base64decode(&dectext, etext, strlen(etext))) > 0)
          {
            // continue with the decoded buffer instead of the encoded text
            free(enctext);
            enctext = dectext;
          }
          else
          {
            W(DBF_MIME, "base64 decoding returned: %d '%s'", res, enctext);
            result = -3; // signal a base64 decoding error to caller
            unknown_enc = 1;
          }
        }
        break;

        default:
        {
          // if we end up here the probability that we have a rfc2047 compliant
          // string with an unknown encoding is somehow high.
          result = -2; // signal an unknown encoding to caller
          unknown_enc = 1;
        }
        break;
      }
    }
    else
      unknown_enc = 1;

    // now call the callback function: with the untouched source text of
    // the whole encoded-word if it could not be decoded, otherwise with
    // the decoded text
    if(unknown_enc == 1)
    {
      rc = (*func)(p, (unsigned int)(text-p), 0, 0, arg);
      unknown_enc = 0; // clear it immediately
    }
    else
    {
      // rfc2231 updates rfc2047 encoded words...
      // The ABNF given in RFC 2047 for encoded-words is:
      //   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
      // This specification changes this ABNF to:
      //   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
      if((lang = strrchr(chset, '*')))
        *lang++ = '\0';

      // the length is determined via strlen(), so decoded text
      // ends at the first NUL byte it contains
      rc = (*func)(enctext, strlen(enctext), chset, lang, arg);
    }

    // free everything and check if a returncode was given.
    free(enctext);
    free(chset);
    free(encoding);
    if(rc != 0)
      return rc;

    // Ignore blanks between enc words
    had_last_word=1;
  }

  return result;
}

///