FundraiserStatisticsGen/fundstatgen.py from wikimedia/wikimedia-fundraising-tools

FundraiserStatisticsGen/fundstatgen.py
Summary

Maintainability

5 hrs
Test Coverage

Issues
#!/usr/bin/python

"""
Fundraiser Statistics Generation

Queries the CiviCRM database to
"""

import pymysql as db
import csv
import json
from optparse import OptionParser
from configparser import ConfigParser
import logging
from logging.handlers import SysLogHandler

CONFIG_DEFAULT_PATH = "/etc/fundraising/fundstatgen.cfg"


def main():
    # Extract any command line options
    parser = OptionParser(usage="usage: %prog [options] <working directory>")
    parser.add_option("-c", "--config", dest='configFile', default=None, help='Path to configuration file')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.print_help()
        exit(1)
    workingDir = args[0]

    # Load the configuration from the file
    config = ConfigParser()
    fileList = [CONFIG_DEFAULT_PATH]
    if options.configFile:
        fileList.append(options.configFile)
    config.read(fileList)

    # === BEGIN PROCESSING ===
    hostname = config.get('MySQL', 'hostname')
    port = config.getint('MySQL', 'port')
    username = config.get('MySQL', 'username')
    password = config.get('MySQL', 'password')
    database = config.get('MySQL', 'schema')

    logging.debug("Will connect to MySQL using: %s@%s:%d", username, hostname, port)

    logging.info("Running per year query...")
    stats = getPerYearData(hostname, port, username, password, database)

    logging.info("Pivoting data into year/day form...")
    (years, pivot) = pivotDataByYear(stats)

    logging.info("Writing year data output files...")
    createSingleOutFile(stats, 'date', workingDir + '/donationdata-vs-day')
    createOutputFiles(pivot, 'date', workingDir + '/yeardata-day-vs-', years)

    logging.info("Running per campaign query...")
    pcStats = getPerCampaignData(hostname, port, username, password, database)

    logging.info("Writing campaign data output files...")
    createSingleOutFile(pcStats, ('medium', 'campaign'), workingDir + '/campaign-vs-amount')


def getPerYearData(host, port, username, password, database):
    """
    Obtain basic statistics (USD sum, number donations, USD avg amount, USD max amount,
    USD YTD sum) per day from the MySQL server.

    Returns a dict like: {date => {report type => {value}} where report types are:
    - sum, refund_sum, donations, refunds, avg, max, ytdsum, ytdloss
    """
    con = db.connect(host=host, port=port, user=username, passwd=password, db=database)
    cur = con.cursor()
    cur.execute("""
        SELECT
          DATE_FORMAT(con.receive_date, "%Y-%m-%d") as receive_date,
          SUM(IF(con.total_amount >= 0, total_amount, 0)) as credit,
          SUM(IF(con.total_amount >= 0, 1, 0)) as credit_count,
          SUM(IF(con.total_amount < 0, total_amount, 0)) as refund,
          SUM(IF(con.total_amount < 0, 1, 0)) as refund_count,
          AVG(IF(con.total_amount >= 0, total_amount, 0)) as `avg`,
          MAX(con.total_amount)
        FROM civicrm_contribution con, civicrm_contribution_tracking ct
        WHERE
          con.id=ct.contribution_id AND
          receive_date >= '2006-01-01'
        GROUP BY DATE_FORMAT(con.receive_date, "%Y-%m-%d") ASC;
        """)

    data = {}
    ytdCreditSum = 0
    ytdRefundSum = 0
    cyear = 0
    for row in cur:
        (date, credit_sum, credit_count, refund_sum, refund_count, avg, max) = row
        year = int(date[0:4])
        credit_sum = float(credit_sum)
        credit_count = int(credit_count)
        refund_sum = float(refund_sum)
        refund_count = int(refund_count)
        avg = float(avg)
        max = float(max)

        if cyear != year:
            cyear = year
            ytdCreditSum = 0
            ytdRefundSum = 0
        ytdCreditSum += credit_sum
        ytdRefundSum += refund_sum

        data[date] = {
            'sum': credit_sum,
            'refund_sum': refund_sum,
            'donations': credit_count,
            'refunds': refund_count,
            'avg': avg,
            'max': max,
            'ytdsum': ytdCreditSum,
            'ytdloss': ytdRefundSum
        }

    del cur
    con.close()
    return data


def getPerCampaignData(host, port, username, password, database):
    """
    Obtain basic statistics (USD sum, number donations, USD avg amount, USD max amount,
    USD YTD sum) per medium, campaign

    Returns a dict like: {(medium, campaign) => {value => value} where value types are:
    - start_date, stop_date, count, sum, avg, std, max
    """
    con = db.connect(host=host, port=port, user=username, passwd=password, db=database)
    cur = con.cursor()
    cur.execute("""
        SELECT
          COALESCE(ct.utm_medium, ''),
          COALESCE(ct.utm_campaign, ''),
          min(c.receive_date),
          max(c.receive_date),
          count(*),
          sum(c.total_amount),
          avg(c.total_amount),
          std(c.total_amount),
          max(c.total_amount)
        FROM civicrm_contribution_tracking ct, civicrm.civicrm_contribution c
        WHERE
          ct.contribution_id=c.id AND
          c.total_amount >= 0
        GROUP BY utm_medium, utm_campaign;
        """)

    data = {}
    for row in cur:
        (medium, campaign, start, stop, count, sum, usdavg, usdstd, usdmax) = row
        count = int(count)
        sum = float(sum)
        usdavg = float(usdavg)
        usdstd = float(usdstd)
        usdmax = float(usdmax)

        data[(medium, campaign)] = {
            'start_date': start.isoformat(),
            'stop_date': stop.isoformat(),
            'count': count,
            'sum': sum,
            'avg': usdavg,
            'usdstd': usdstd,
            'usdmax': usdmax
        }

    del cur
    con.close()
    return data


def pivotDataByYear(stats):
    """
    Transformation of the statistical data -- grouping reports by date

    Returns ((list of years), {report: {date: [year data]}})
    """
    years = []
    pivot = {}

    reports = list(stats.values())[0].keys()
    for report in reports:
        pivot[report] = {}

    # Do the initial pivot
    for date in stats:
        (year, month, day) = date.split('-')
        if year not in years:
            years.append(year)

        for report in reports:
            if ('2000/%s/%s 23:59:59' % (month, day)) not in pivot[report]:
                pivot[report]['2000/%s/%s 23:59:59' % (month, day)] = {}
            pivot[report]['2000/%s/%s 23:59:59' % (month, day)][year] = stats[date][report]

    # Now listify the data
    years.sort()
    for report in reports:
        for linedate in pivot[report]:
            newline = []
            linedata = pivot[report][linedate]
            for year in years:
                if year in linedata:
                    newline.append(linedata[year])
                else:
                    newline.append(None)
            pivot[report][linedate] = newline

    return years, pivot


def createOutputFiles(stats, firstcol, basename, colnames=None):
    """
    Creates a CSV file for each report in stats
    """
    reports = list(stats.keys())
    for report in reports:
        createSingleOutFile(stats[report], firstcol, basename + report, colnames)


def createSingleOutFile(stats, firstcols, basefilename, colnames=None):
    """
    Creates a single report file from a keyed dict

    stats       must be a dictionary of something list like; if internally it is a dictionary
                then the column names will be taken from the dict; otherwise they come colnames

    firstcols   can be a string or a list depending on how the data is done but it should
                reflect the primary key of stats
    """
    if colnames is None:
        colnames = list(next(iter(stats.values())).keys())
        colindices = colnames
    else:
        colindices = list(range(0, len(colnames)))

    if isinstance(firstcols, str):
        firstcols = [firstcols]
    else:
        firstcols = list(firstcols)

    csvfilename = basefilename + ".csv"
    f = open(csvfilename, 'w')
    csvf = csv.writer(f)
    csvf.writerow(firstcols + colnames)

    alldata = []
    for linekey in sorted(stats.keys()):
        if isinstance(linekey, str):
            linekeyl = [linekey]
        else:
            linekeyl = list(linekey)

        rowdata = [stats[linekey][col] for col in colindices]
        alldata.append(linekeyl + rowdata)
        csvf.writerow(linekeyl + rowdata)
    f.close()

    jsonfilename = basefilename + ".json"
    f = open(jsonfilename, 'w')
    mapstats = [dict(list(zip(firstcols + colnames, line))) for line in alldata]
    json.dump(mapstats, f)
    f.close()


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.DEBUG)

    try:
        logging.captureWarnings(True)
    except Exception:
        # Damn; python 2.6; just ignore it
        pass

    # Log to console
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter('%(levelname)s:%(filename)s:%(lineno)d -- %(message)s'))
    logging.getLogger().addHandler(console)

    # Log to syslog
    syslog = SysLogHandler(address='/dev/log')
    syslog.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)s:%(filename)s:%(lineno)d -- %(message)s'))
    logging.getLogger().addHandler(syslog)

    # Run the program
    try:
        main()
    except Exception:
        logging.exception('Unexpected exception! Death is me.')
        exit(1)