openva/rs-video-processor

View on GitHub
bin/resolve_chyrons.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

require_once __DIR__ . '/../includes/settings.inc.php';
require_once __DIR__ . '/../includes/functions.inc.php';

$video_dir = (__DIR__ . '/../video');

# Connect to the database.
$database = new Database();
$database->connect_old();

# Insert a matched chyron and ID.
function insert_match($linked_id, $chyron_id)
{
    if (empty($chyron_id) || empty($linked_id)) {
        return false;
    }
    $sql = 'UPDATE video_index
            SET linked_id=' . $linked_id . '
            WHERE id=' . $chyron_id;
    $result = mysql_query($sql);
    if (!$result) {
        return false;
    }
}

# Get the video ID from the command line.
$video_id = trim($_SERVER['argv'][1]);
if (empty($video_id)) {
    return false;
}


###
# MATCH UNRESOLVED BILLS
###

# If we've passed an ID, use that.
$sql = 'SELECT video_index.file_id, files.date, files.chamber
        FROM video_index
        LEFT JOIN files
            ON video_index.file_id=files.id
        WHERE video_index.type="bill" AND video_index.linked_id IS NULL
        AND files.id=' . $video_id . '
        GROUP BY file_id';

$result = mysql_query($sql);
if (mysql_num_rows($result) > 0) {
    $video = mysql_fetch_array($result);

    # Get a list of all bills that were addressed on that date (in bills_status).
    $sql = 'SELECT DISTINCT bills_status.bill_id AS id, bills.number
            FROM bills_status
            LEFT JOIN bills
                ON bills_status.bill_id = bills.id
            WHERE bills_status.date = "' . $video['date'] . '"';
    $result = mysql_query($sql);
    if (mysql_num_rows($result) == 0) {
        # If we can't get the bills heard on this date (generally because we're
        # parsing the video on the same day that it was recorded), then use all
        # bill numbers from this session, instead.
        $sql = 'SELECT id, number
                FROM bills
                WHERE session_id = (
                    SELECT id
                    FROM sessions
                    WHERE date_started <= ' . $video['date'] . '
                    AND (date_ended >= ' . $video['date'] . '
                        OR
                        date_ended IS NULL)
                )';
        $result = mysql_query($sql);
    }

    # Build up an array of bills, using the ID as the key and the number as the content.
    while ($bill = mysql_fetch_array($result)) {
        $bills[$bill{'id'}] = $bill['number'];
    }

    # Get a list of all bills that are in the legislature, period.
    $sql = 'SELECT DISTINCT id, number
            FROM bills
            WHERE session_id=
                (SELECT id
                FROM sessions
                WHERE "' . $video['date'] . '" > date_started
                AND ("' . $video['date'] . '" < date_ended OR date_ended IS NULL)
                ORDER BY date_started DESC
                LIMIT 1)';
    $result = mysql_query($sql);
    if (mysql_num_rows($result) > 0) {
        # Build up an array of bills, using the ID as the key and the number as the content.
        while ($bill = mysql_fetch_array($result)) {
            $all_bills[$bill{'id'}] = $bill['number'];
        }
    }

    # Step through each bill chyron.
    $sql = 'SELECT id, raw_text
            FROM video_index
            WHERE file_id=' . $video['file_id'] . ' AND type="bill" AND linked_id IS NULL
            AND ignored = "n"
            ORDER BY time ASC';

    $result = mysql_query($sql);
    while ($chyron = mysql_fetch_array($result)) {
        # Strip out any spaces in the bill number -- just compare the bills straight up. Although
        # bill numbers in the chyrons have spaces between the prefix ("HB") and the number ("1"),
        # the OCR software doesn't always catch that. Better to just ignore the spaces entirely.
        $chyron['raw_text'] = str_replace(' ', '', $chyron['raw_text']);

        # Also, we're dealing with this in lower case.
        $chyron['raw_text'] = strtolower($chyron['raw_text']);

        # Make any obvious corrections that tend to occur with OCR software.
        if (
            (substr($chyron['raw_text'], 0, 2) == 's8')
            ||
            (substr($chyron['raw_text'], 0, 2) == '58')
            ||
            (substr($chyron['raw_text'], 0, 2) == 'ss')
            ||
            (substr($chyron['raw_text'], 0, 2) == '$8')
        ) {
            $chyron['raw_text'] = 'SB' . substr($chyron['raw_text'], 2);
        } elseif (
            (substr($chyron['raw_text'], 0, 3) == 'sir')
            ||
            (substr($chyron['raw_text'], 0, 3) == 'sjr')
        ) {
            $chyron['raw_text'] = 'SJ' . substr($chyron['raw_text'], 3);
        } elseif (
            (substr($chyron['raw_text'], 0, 3) == 'hjr')
            ||
            (substr($chyron['raw_text'], 0, 3) == 'pur')
            ||
            (substr($chyron['raw_text'], 0, 3) == 'fur')
            ||
            (substr($chyron['raw_text'], 0, 3) == 'i-ur')
        ) {
            $chyron['raw_text'] = 'hj' . substr($chyron['raw_text'], 3);
        } elseif (
            (substr($chyron['raw_text'], 0, 2) == 'I ')
            ||
            (substr($chyron['raw_text'], 0, 2) == '| ')
            ||
            (substr($chyron['raw_text'], 0, 2) == '! ')
            ||
            (substr($chyron['raw_text'], 0, 2) == '; ')
        ) {
            $chyron['raw_text'] = substr($chyron['raw_text'], 2);
        }


        # If there is a direct match with a bill dealt with on that day, insert it.
        $bill_id = array_search(strtolower($chyron['raw_text']), $bills);
        if (($bill_id !== false) && !empty($bill_id)) {
            echo $chyron['raw_text'] . ' matched to ' . $bills[$bill_id] . ' (' . $bill_id . ")\n";
            insert_match($bill_id, $chyron['id']);
        }

        # If we couldn't match it with a bill dealt with on that day, see if we can match it with
        # any bill introduced that year. This helps to allow bills to be recognized in spite of
        # legislative recordkeeping errors.
        else {
            $bill_id = array_search(strtolower($chyron['raw_text']), $all_bills);
            if (($bill_id !== false) && !empty($bill_id)) {
                echo $chyron['raw_text'] . ' matched to ' . $bills[$bill_id] . ' (' . $bill_id . ")\n";
                insert_match($bill_id, $chyron['id']);
            }
        }
    }

    # If any single unresolved bill chyrons are found that are surrounded by resolved chyrons that
    # are resolved on both sides, then we just fill in that gap with the obvious chyron, which is
    # the bill number on either side of it.
    $sql = 'SELECT id, time
            FROM video_index
            WHERE file_id = ' . $video['file_id'] . '
            AND TYPE = "bill"
            AND linked_id IS NULL';
    $result = mysql_query($sql);
    if (mysql_num_rows($result) > 0) {
        while ($unresolved = mysql_fetch_array($result)) {
            # Retrieve a list of linked IDs present for fifteen seconds on either side of this
            # unknown chyron.
            $sql = 'SELECT DISTINCT linked_id
                    FROM video_index
                    WHERE file_id = ' . $video['file_id'] . ' AND type="bill" AND linked_id IS NOT NULL
                    AND
                    (
                        (TIMEDIFF("' . $unresolved['time'] . '", time)<=15)
                        AND
                        (TIMEDIFF("' . $unresolved['time'] . '", time)>=-15)
                    )';
            $result2 = mysql_query($sql);
            # If we've got just one row—which is to say that there's only one bill discussed in this
            # thirty-second window—then we'll take it.
            if (mysql_num_rows($result2) === 1) {
                $resolved = mysql_fetch_array($result2);
                insert_match($resolved['linked_id'], $unresolved['id']);
            }
        }
    }

    echo 'Finished matching bill chyrons';

    # Store the new bill number chyrons for this video.
    if (isset($video['file_id'])) {
        # Create a new instance of the Video class.
        $vid = new Video();
        $vid->id = $video['file_id'];
        $vid->store_clips();

        echo '(Re)indexed ' . $vid->clip_count . ' clips, cued by updating bill number chyrons, and
            stored those clips.';
    }
}


###
# MATCH UNRESOLVED LEGISLATORS
###

# Select a listing of all legislators, parties, and placenames.
$sql = 'SELECT representatives.id, representatives.chamber, representatives.name,
        representatives.party, representatives.place, districts.number AS district
        FROM representatives
        LEFT JOIN districts
            ON representatives.district_id = districts.id';
// LIMIT THIS TO THOSE LEGISLATORS WHO ARE IN OFFICE ON THE DATE OF THIS VIDEO
$result = mysql_query($sql);
if (mysql_num_rows($result) > 0) {
    # Initalize the arrays.
    $legislators = array();

    # Iterate through the MySQL results and store them in an array.
    while ($legislator = mysql_fetch_array($result)) {
        $legislator = array_map('stripslashes', $legislator);

        # Depending on the chamber, assign the legislator's prefix.
        if ($legislator['chamber'] == 'house') {
            $legislator['prefix'] = 'Del.';
        } else {
            $legislator['prefix'] = 'Sen.';
        }

        # Assemble the array of legislator data into the same format as the chyron text, so that
        # we can do a direct comparison later.
        $legislator['complete'] = $legislator['prefix'] . ' ' . pivot($legislator['name']) . "\r"
            . $legislator['place'] . ' (' . $legislator['party'] . '-' . $legislator['district'] . ')';

        # Append this legislator to the array storing all of them.
        $legislators[] = $legislator;
    }
}

# Select a list of legislators by last name, party, and placename. This is for those times when
# legislative video producers decided to identify legislators only by last name, replacing the
# abbreviated title ("Sen.") with the full title ("Senator"), and failing to specify the district #.
$sql = 'SELECT representatives.id, representatives.chamber, representatives.name,
        representatives.party, representatives.place, districts.number AS district
        FROM representatives
        LEFT JOIN districts
            ON representatives.district_id = districts.id';
// LIMIT THIS TO THOSE LEGISLATORS WHO ARE IN OFFICE ON THE DATE OF THIS VIDEO
$result = mysql_query($sql);
if (mysql_num_rows($result) > 0) {
    # Iterate through the MySQL results and store them in an array.
    while ($legislator = mysql_fetch_array($result)) {
        $legislator = array_map('stripslashes', $legislator);

        # Depending on the chamber, assign the legislator's prefix.
        if ($legislator['chamber'] == 'house') {
            $legislator['prefix'] = 'Delegate';
        } else {
            $legislator['prefix'] = 'Senator';
        }

        # Extract the last name of the legislator.
        $tmp = explode(',', $legislator['name']);
        $legislator['last_name'] = $tmp[0];

        # Assemble the array of legislator data into the same format as the chyron text, so that
        # we can do a direct comparison later.
        $legislator['complete'] = $legislator['prefix'] . ' ' . $legislator['last_name'] . "\r"
            . ' (' . $legislator['party'] . ') ' . $legislator['place'];
    }
}


# Select the raw text for the top 500 IDd legislators and append that to our array.
$sql = 'SELECT raw_text, linked_id, COUNT(*) AS number
        FROM video_index
        WHERE type = "legislator"
        AND linked_id IS NOT NULL
        GROUP BY raw_text
        ORDER BY number DESC
        LIMIT 500';
$result = mysql_query($sql);
if (mysql_num_rows($result) > 0) {
    $priors = array();
    while ($tmp = mysql_fetch_array($result)) {
        $tmp['raw_text'] = stripslashes($tmp['raw_text']);
        # We can't use a newline in an array key.
        $tmp['raw_text'] = str_replace("\n", ' ', $tmp['raw_text']);
        $priors[$tmp{'raw_text'}] = $tmp['linked_id'];
    }
}

# Select the raw text for the past 2,000 successfully IDd legislators and append that to our array.
$sql = 'SELECT DISTINCT raw_text, linked_id
        FROM video_index
        WHERE TYPE = "legislator" AND linked_id IS NOT NULL
        ORDER BY date_created DESC
        LIMIT 2000';
$result = mysql_query($sql);
if (mysql_num_rows($result) > 0) {
    while ($tmp = mysql_fetch_array($result)) {
        $tmp['raw_text'] = stripslashes($tmp['raw_text']);
        # We can't use a newline in an array key.
        $tmp['raw_text'] = str_replace("\n", ' ', $tmp['raw_text']);
        $priors[$tmp{'raw_text'}] = $tmp['linked_id'];
    }
}

# Select the last 5,000 unresolved legislator chyrons that don't contain known noisewords.
$sql = 'SELECT id, raw_text
        FROM video_index
        WHERE linked_id IS NULL AND type="legislator"
        AND ignored = "n"
        AND raw_text NOT LIKE "%Virginia Senate%" AND raw_text NOT LIKE "%Schaar%"
        AND raw_text NOT LIKE "%Delegates%" AND raw_text NOT LIKE "%At Ease%"
        AND raw_text NOT LIKE "%Reverend%" AND raw_text NOT LIKE "%Rabbi%"
        AND raw_text NOT LIKE "%in Recess%" AND raw_text NOT LIKE "%at Ease%"
        ORDER BY date_created DESC
        LIMIT 5000';
$result = mysql_query($sql);
# If there are no chyrons in need of resolution, then we can stop right now. (This is vanishingly
# unlikely.)
if (mysql_num_rows($result) == 0) {
    exit;
}
while ($chyron = mysql_fetch_array($result)) {
    $chyron['raw_text'] = stripslashes($chyron['raw_text']);

    # Break up the chyron text into the first and second lines, the first dealing with who the
    # legislator is (e.g. "Del. John Q. Smith"), the second dealing (mostly) with where the
    # legislator represents (e.g. "Springfield (I-1)").
    $tmp = explode("\n", $chyron['raw_text']);
    $chyron['name'] = $tmp[0];
    $chyron['place'] = $tmp[1];
    unset($tmp);

    # First, attempt a straight match of the text.
    foreach ($legislators as $legislator) {
        # If we can find a legislator text string that's identical to the chyron, then we can stop
        # right here.
        if ($legislator['complete'] == $chyron['raw_text']) {
            echo '<li>Made a straight match. ' . $legislator['complete'] . ' = ' . $chyron['raw_text'] . "\n";
            insert_match($legislator['id'], $chyron['id']);
            next;
        }
    }

    # Second, attempt to use our lookup table of prior conversions, assuming that we have one.
    if (isset($priors)) {
        # We've stored the raw text of the prior as the array key, but minus the newline, since we
        # can't use newlines in an array key. So we need to compare with that in mind.
        $tmp = str_replace("\n", ' ', $chyron['raw_text']);
        if (array_key_exists($tmp, $priors) === true) {
            echo 'Match found among priors. (' . $tmp . ' = ' . $chyron['raw_text'] . "\n";
            insert_match($priors[$tmp], $chyron['id']);
            next;
        }
    }

    # Third, get a listing of all direct matches within 20%.
    if (isset($legislators)) {
        $matches = array();
        foreach ($legislators as $index => $legislator) {
            # If the Levenshtein distance is within 20% of the string length.
            if (levenshtein($legislator['complete'], $chyron['raw_text']) <= (strlen($chyron['raw_text']) * .2)) {
                echo 'Match found within 80% confidence. (' . $tmp . ' = ' . $chyron['raw_text'] . "\n";
                # Store this match.
                $matches[$index] = levenshtein($legislator['complete'], $chyron);
            }
        }

        # If we've made any matches.
        if (count($matches) > 0) {
            # Sort by the strength of the match.
            asort($matches);

            reset($matches);
            $index = key($matches);
            $match = current($matches);
            $match = $legislators[$index]['id'];

            insert_match($match, $chyron['id']);
            next;
        }
    }

    # Fourth, get a listing of all prior matches within 15%.
    if (isset($priors)) {
        # We've stored the raw text of the prior as the array key, but minus the newline, since we
        # can't use newlines in an array key. So we need to compare with that in mind.
        $tmp = str_replace("\n", ' ', $chyron['raw_text']);

        $matches = array();
        foreach ($priors as $text => $id) {
            # If the Levenshtein distance is within 15% of the string length. You might be tempted
            # to be more liberal in what you'll accept. Don't. "Senator Saslaw (D) Fairfax County"
            # is 81.8% identical to "Senator Howell (D) Fairfax County."
            if (levenshtein($text, $tmp) <= (strlen($tmp) * .15)) {
                echo 'Match found within 85% confidence. (' . $tmp . ' == ' . $text . "\n";
                # Store this match.
                $matches[$id] = levenshtein($text, $tmp);
            }
        }

        # If we've made any matches.
        if (count($matches) > 0) {
            # Sort by the strength of the match.
            asort($matches);

            reset($matches);
            $legislator_id = key($matches);

            insert_match($legislator_id, $chyron['id']);
            continue;
        }
    }
}

echo 'Finished matching legislator chyrons' . "\n";


# Create a new instance of the Video class.
$video = new Video();

# Get a list of every file that is not currently indexed in the video_clips table.
$sql = 'SELECT DISTINCT video_index.file_id AS id
        FROM video_index
        LEFT JOIN video_clips
            ON video_index.file_id = video_clips.file_id
        WHERE video_clips.file_id IS NULL';
$result = mysql_query($sql);
while ($file = mysql_fetch_array($result)) {
    $video->id = $file['id'];
    $video->store_clips();
    echo 'Indexed video clips for file ' . $file['id'] . "\n";
}

if (isset($video_id)) {
    $video->id = $video_id;
    $video->store_clips();
    echo '<p>Indexed video clips for file ' . $file['id'] . "\n";
}