maicroft/words/most_used_words.py from thundergolfer/mAIcroft

maicroft/words/most_used_words.py
Summary

Maintainability

2 days
Test Coverage

Issues
"""
    Credit to the Reddit Analysis project by Randal S. Olson.
"""

import os
import praw
import re
import sys
from bs4 import BeautifulSoup
from collections import defaultdict
from markdown import markdown
from optparse import OptionParser
from requests.exceptions import HTTPError
from update_checker import update_check

__version__ = "1.0.2"

PACKAGE_DIR = os.path.dirname(__file__)

all_words = defaultdict(int)
popular_words = defaultdict(int)
COMMON_WORDS = set()

# load a list of common words to ignore
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "common-words.txt"), "r") as in_file:
    for line in in_file:
        COMMON_WORDS.add(line.strip().lower())

# Tokens that match this regular expression are immediately discared
# This should be used pretty much to just discard links
URL_RE = re.compile(
    "|".join(["^(http(s)?://|www\.)",  # begins with
              "\.(com|it|net|org)($|/)"  # ends with tld or is followed by /
             ]))

# A valid token regular expression
TOKEN_RE = re.compile(r"[\w]+(?:\'(?:d|ll|m|re|s|t|ve))?", flags=re.UNICODE)


def parse_cmd_line():
    """Command-line argument parsing."""

    usage = ("usage: %prog [options] USERNAME TARGET\n\n"
             "USERNAME sets your Reddit username for the bot\n"
             "TARGET sets the subreddit or user to count word frequencies for."
             "\nenter /r/TARGET for subreddits or /u/TARGET for users.")
    parser = OptionParser(usage=usage)

    parser.add_option("-p", "--period",
                      action="store",
                      type="string",
                      dest="period",
                      default="month",
                      help=("period to count words over:"
                            " day/week/month/year/all"
                            " [default: month]"))

    parser.add_option("-l", "--limit",
                      action="store",
                      type="int",
                      dest="limit",
                      help=("maximum number of submissions/comments to count"
                            " word frequencies for"
                            " [default: no limit]"))

    parser.add_option("-m", "--maxthresh",
                      action="store",
                      type="float",
                      dest="max_threshold",
                      default=0.34,
                      help=("maximum relative frequency in the text a word can"
                            " appear to be considered in word counts (prevents"
                            " word spamming in a single submission)"
                            " [default: 0.34]"))

    parser.add_option("-o", "--only_one",
                      action="store_false",
                      dest="count_word_freqs",
                      default=True,
                      help=("only count a word once per text block (title,"
                            " selftext, comment body) rather than incrementing"
                            " the total for each instance"
                            " [default: false]"))

    parser.add_option("-u", "--multiprocess",
                      action="store_true",
                      default=False,
                      help=("enable PRAW multiprocess support"
                            " [default: false]"))

    parser.add_option("-i", "--include-dictionary",
                      action="store_true",
                      default=False,
                      help=("exclude words found in the dictionary from the"
                            " word cloud"
                            " [default: false]"))

    parser.add_option("-r", "--no-raw-data",
                      action="store_true",
                      default=False,
                      help=("disable raw word count output file"
                            " [default: false]"))

    parser.add_option("-v", "--verbose",
                      action="store_true",
                      default=False,
                      help=("print all program output to the terminal"
                            " [default: false]"))

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.error("Invalid number of arguments provided: {}".format(args))
    user, target = args

    if target.startswith("/r/"):
        options.is_subreddit = True
    elif target.startswith("/u/"):
        options.is_subreddit = False
    else:
        parser.error("Invalid target.")

    if options.period not in ["day", "week", "month", "year", "all"]:
        parser.error("Invalid period.")

    if options.include_dictionary:
        with open(os.path.join(PACKAGE_DIR, "words", "dict-words.txt"), "r") as in_file:
            for line in in_file:
                COMMON_WORDS.add(line.strip().lower())

    return user, target, options


def tokenize(text):
    """Return individual tokens from a block of text."""
    def normalized_tokens(token):
        """Yield lower-case tokens from the given token."""
        for sub in TOKEN_RE.findall(token):
            if sub:
                yield sub.lower()

    for token in text.split():  # first split on whitespace
        if URL_RE.search(token):  # Ignore invalid tokens
            continue
        for sub_token in normalized_tokens(token):
            if sub_token.endswith("'s"):  # Fix possessive form
                sub_token = sub_token[:-2]
            yield sub_token


def parse_text(text, count_word_freqs, max_threshold, is_markdown=True):
    """Parse the passed in text and add words that are not common.

    :param count_word_freqs: if False, only count a word once per text block
        (title, selftext, comment body) rather than incrementing the total for
        each instance.

    :param max_threshold: maximum relative frequency in the text a word can
        appear to be considered in word counts. prevents word spamming in a
        single submission.

    :param is_markdown: When True, parse as markdown and extract the text.

    """

    if is_markdown:
        soup = BeautifulSoup(markdown(text), "lxml")
        text = "".join(soup.findAll(text=True))

    total = 0.0  # intentionally a float
    text_words = defaultdict(int)
    for token in tokenize(text):
        total += 1
        # add to the raw word list
        all_words[token] += 1
        if token not in COMMON_WORDS:
            text_words[token] += 1

    # Count the popular words
    for word, count in text_words.items():
        if count / total <= max_threshold:
            if count_word_freqs:
                popular_words[word] += count
            else:
                popular_words[word] += 1


def with_status(iterable):
    """Wrap an iterable outputting '.' for each item (up to 100 per line)."""
    for i, item in enumerate(iterable):
        sys.stderr.write(".")
        sys.stderr.flush()
        if i % 100 == 99:
            sys.stderr.write("\n")
        yield item

    sys.stderr.write("\n")


def process_redditor(redditor, limit, count_word_freqs, max_threshold):
    """Parse submissions and comments for the given Redditor.

    :param limit: the maximum number of submissions to scrape from the
        subreddit

    :param count_word_freqs: if False, only count a word once per text block
        (title, selftext, comment body) rather than incrementing the total for
        for each instance.

    :param max_threshold: maximum relative frequency in the text a word can
        appear to be considered in word counts. prevents word spamming in a
        single submission.

    """
    for entry in with_status(iterable=redditor.get_overview(limit=limit)):
        if isinstance(entry, praw.objects.Comment):  # Parse comment
            parse_text(text=entry.body, count_word_freqs=count_word_freqs,
                       max_threshold=max_threshold)
        else:  # Parse submission
            process_submission(submission=entry,
                               count_word_freqs=count_word_freqs,
                               max_threshold=max_threshold,
                               include_comments=False)


def process_submission(submission, count_word_freqs, max_threshold, include_comments=True):
    """Parse a submission's text and body (if applicable).

    :param count_word_freqs: if False, only count a word once per text block
        (title, selftext, comment body) rather than incrementing the total for
        for each instance.

    :param max_threshold: maximum relative frequency in the text a word can
        appear to be considered in word counts. prevents word spamming in a
        single submission.

    :param include_comments: include the submission's comments when True

    """
    if include_comments:  # parse all the comments for the submission
        submission.replace_more_comments()
        for comment in praw.helpers.flatten_tree(submission.comments):
            parse_text(text=comment.body, count_word_freqs=count_word_freqs,
                       max_threshold=max_threshold)

    # parse the title of the submission
    parse_text(text=submission.title, count_word_freqs=count_word_freqs,
               max_threshold=max_threshold, is_markdown=False)

    # parse the selftext of the submission (if applicable)
    if submission.is_self:
        parse_text(text=submission.selftext, count_word_freqs=count_word_freqs,
                   max_threshold=max_threshold)


def process_subreddit(subreddit, period, limit, count_word_freqs, max_threshold):
    """Parse comments, title text, and selftext in a given subreddit.

    :param period: the time period to scrape the subreddit over (day, week,
    month, etc.)

    :param limit: the maximum number of submissions to scrape from the
    subreddit

    :param count_word_freqs: if False, only count a word once per text block
        (title, selftext, comment body) rather than incrementing the total for
        for each instance.

    :param max_threshold: maximum relative frequency in the text a word can
        appear to be considered in word counts. prevents word spamming in a
        single submission.

    """

    # determine period to count the words over
    params = {"t": period}
    for submission in with_status(iterable=subreddit.get_top(limit=limit, params=params)):
        try:
            process_submission(submission=submission,
                               count_word_freqs=count_word_freqs,
                               max_threshold=max_threshold)
        except HTTPError as exc:
            sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}"
                             " error. Continuing...\n"
                             .format(submission.permalink.encode("UTF-8"),
                                     exc.response.status_code))
        except ValueError:  # Occurs occasionally with empty responses
            sys.stderr.write("\nSkipping submission {0} due to ValueError.\n"
                             .format(submission.permalink.encode("UTF-8")))


def main():
    # parse the command-line options and arguments
    user, target, options = parse_cmd_line()

    # Check for package updates
    update_check(__name__, __version__)

    # open connection to Reddit
    handler = None

    if options.multiprocess:
        from praw.handlers import MultiprocessHandler
        handler = MultiprocessHandler()

    reddit = praw.Reddit(
        user_agent="/u/{0} reddit analyzer".format(user), handler=handler)

    reddit.config.decode_html_entities = True

    # run analysis
    sys.stderr.write("Analyzing {0}\n".format(target))
    sys.stderr.flush()

    target = target[3:]

    if options.is_subreddit:
        process_subreddit(subreddit=reddit.get_subreddit(target),
                          period=options.period, limit=options.limit,
                          count_word_freqs=options.count_word_freqs,
                          max_threshold=options.max_threshold)
    else:
        process_redditor(redditor=reddit.get_redditor(target),
                         limit=options.limit,
                         count_word_freqs=options.count_word_freqs,
                         max_threshold=options.max_threshold)

    # build a string containing all the words for the word cloud software
    output = ""

    # open output file to store the output string
    out_file_name = "{0}.csv".format(target)

    if options.is_subreddit:
        out_file_name = "subreddit-{0}".format(out_file_name)
    else:
        out_file_name = "user-{0}".format(out_file_name)

    out_file = open(out_file_name, "w")

    # combine singular and plural forms of words into single count
    for word in list(popular_words.keys()):
        count = popular_words[word]

        # e.g.: "picture" and "pictures"
        if word.endswith("s"):
            # if the singular form of the word was used
            singular = word[:-1]
            if popular_words[singular] > 0:

                # combine the count into the most-used form of the word
                if popular_words[singular] > count:
                    popular_words[singular] += popular_words[word]
                    del popular_words[word]
                else:
                    popular_words[word] += popular_words[singular]
                    del popular_words[singular]

        # e.g.: "furry" and "furries"
        if word.endswith("ies"):
            # if the singular form of the word was used
            singular = "{0}y".format(word[:-3])
            if popular_words[singular] > 0:
                # combine the count into the most-used form of the word
                if popular_words[singular] > count:
                    popular_words[singular] += popular_words[word]
                    del popular_words[word]
                else:
                    popular_words[word] += popular_words[singular]
                    del popular_words[singular]

    for word in sorted(popular_words, key=popular_words.get, reverse=True):
        # tweak this number depending on the subreddit
        # some subreddits end up having TONS of words and it seems to overflow
        # the Python string buffer
        if popular_words[word] > 5:
            pri = True

            # don't print the word if it's just a number
            if word.isdigit():
                pri = False

            # add as many copies of the word as it was mentioned in the
            # subreddit
            if pri:
                out_text = str("{0}:{1}\n".format(word, popular_words[word]))
                output += out_text
                out_file.write(out_text)

    out_file.close()

    # print the series of words for the word cloud software
    # place this text into wordle.net
    if options.verbose:
        print(output)

    # save the raw word counts to a file
    if not options.no_raw_data:
        out_file = open("raw-{0}".format(out_file_name), "w")
        for word in sorted(all_words, key=all_words.get, reverse=True):
            out_text = str("{0}:{1}\n".format(word, all_words[word]))
            out_file.write(out_text)
        out_file.close()


if __name__ == "__main__":
    sys.exit(main())