PuzaTech/Fugue
src/main/python/tm.py

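"""Text preprocessing and topic-model utilities for PuzaTech/Fugue.

Three tasks are selected with --task:
  dictionary  build a TF/DF-filtered term dictionary from raw JSON review lines
  parse       convert raw JSON review lines into feature records using that dictionary
  topics      average word-topic weights across trained model files and write the
              top terms for each topic
"""
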
import argparse
import sys
import json
import codecs
import string

def cleaned(raw_word):
    """Keep only ASCII letters; return '' for words shorter than two characters."""
    clean_word = []
    for i in raw_word:
        if i in string.ascii_letters:
            clean_word.append(i)
    if len(clean_word) < 2:
        return ''
    return ''.join(clean_word)

def tokenize(raw_string):
    """Lowercase, split on spaces, and clean each word; return the list of tokens."""
    raw_str = raw_string.strip().lower()
    words = []
    for w in raw_str.split(' '):
        c_w = cleaned(w)
        if c_w != '':
            words.append(c_w)
    return words
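# Example (illustrative): tokenize('Great movie!!') -> ['great', 'movie']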

def parse_docs(dictionary, args):
    """Convert raw JSON review lines into feature records, keeping only dictionary terms."""
    input_f = open(args.input_file, 'r')
    output_f = open(args.output_file, 'w')
    doc_id = 0
    for line in input_f:
        if line.strip() != '':
            try:
                json_obj = json.loads(line)
                title_raw = json_obj['title']
                review_raw = json_obj['review']
                tokens = []
                tokens.extend(tokenize(title_raw))
                tokens.extend(tokenize(review_raw))
                output_buffer = []
                for t in tokens:
                    if t in dictionary:
                        output_buffer.append({'feature_type': 'TOKEN', 'feature_name': t, 'feature_value': 1.0})
                # Only keep documents with more than five in-dictionary tokens.
                if len(output_buffer) > 5:
                    output_obj = {}
                    output_obj['doc_id'] = str(doc_id)
                    output_obj['features'] = output_buffer
                    output_f.write(json.dumps(output_obj) + '\n')
                    output_f.flush()
                    doc_id += 1
            except (ValueError, KeyError):
                # Skip lines that are not valid JSON or lack the expected fields.
                continue
    input_f.close()
    output_f.close()

def load_raw(args):
    """Read raw JSON review lines and return a list of token lists, one per document."""
    input_f = open(args.input_file, 'r')
    docs = []
    for line in input_f:
        if line.strip() != '':
            try:
                json_obj = json.loads(line)
                title_raw = json_obj['title']
                review_raw = json_obj['review']
                tokens = []
                tokens.extend(tokenize(title_raw))
                tokens.extend(tokenize(review_raw))
                docs.append(tokens)
            except (ValueError, KeyError):
                # Skip lines that are not valid JSON or lack the expected fields.
                continue
    input_f.close()
    return docs

def compute_term_stats(docs, args = None):
    """
    This function is to compute TF and DF stats
    :param docs:
    :return: a return obj with tf and df
    """
    df = {}
    tf = {}
    for doc in docs:
        seen_in_doc = set()
        for token in doc:
            if token not in seen_in_doc:
                seen_in_doc.add(token)
                if token not in df:
                    df[token] = 0
                df[token] = df[token] + 1
            if token not in tf:
                tf[token] = 0
            tf[token] = tf[token] + 1
    result_obj = {}
    result_obj['tf'] = tf
    result_obj['df'] = df
    return result_obj

def compute_dictionary(results, args = None):
    """
    This function is to construct the dictionary
    :param results:
    :return:
    """
    tf = results['tf']
    df = results['df']
    tf_list = [(tf_value, tf_term) for (tf_term, tf_value) in tf.iteritems()]
    tf_list = sorted(tf_list, key = lambda s: s[0], reverse = True)
    df_list = [(df_value, df_term) for (df_term, df_value) in df.iteritems()]
    df_list = sorted(df_list, key = lambda s: s[0], reverse = True)
    total = len(df_list)
    upper_bound = int( float(args.term_upper) * total )
    lower_bound = int( float(args.term_lower) * total )
    dictionary = []
    for i in range(upper_bound, lower_bound):
        df_value, df_term = df_list[i]
        if df_value > int(args.term_min):
            dictionary.append((df_term, tf[df_term], df_value))
    return_obj = {}
    return_obj['dictionary'] = dictionary
    return return_obj
    

def save_dictionary(results, args = None):
    """
    This function is to save term results
    :param results: a return obj with term and stats
    :param args: the output filename
    :return:
    """
    output_f = open(args.output_file, 'w')
    dictionary = results['dictionary']
    for t in dictionary:
        term = t[0]
        tf = t[1]
        df = t[2]
        output_f.write(term + '\t' + str(tf) + '\t' + str(df) + '\n')
    output_f.close()

def load_dictionary(args):
    """Load dictionary terms (first column of the tab-separated dictionary file) into a set."""
    local_dic = set()
    input_f = open(args.dic_file, 'r')
    for line in input_f:
        w = line.strip().split('\t')
        local_dic.add(w[0])
    input_f.close()
    return local_dic

def load_model(model_filename):
    """Load one trained model JSON file and return, per topic, a dict of smoothed term weights."""
    json_file_content = ''
    input_f = open(model_filename, 'r')
    for line in input_f:
        json_file_content += line.strip()
    input_f.close()
    model_obj = json.loads(json_file_content)
    alpha = model_obj['alpha']
    topK = len(alpha)
    alphaSum = 0.0
    for a in alpha:
        alphaSum += a
    beta = model_obj['beta']
    betaSum = 0.0
    for b, b_value in beta.items():
        betaSum += b_value
    wordTopicCounts = model_obj['wordTopicCounts']
    topicCounts = model_obj['topicCounts']
    topicsOrder = [{} for i in range(topK)]
    for v in wordTopicCounts.keys():
        local_counts = wordTopicCounts[v]
        for i in range(len(local_counts)):
            # Smoothed estimate of the weight of term v in topic i.
            value = (local_counts[i] + beta[v]) / (topicCounts[i] + betaSum)
            topicsOrder[i][v] = value
    return topicsOrder
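# Model JSON fields used by load_model: 'alpha' (one entry per topic; its length gives the
# number of topics), 'beta' (term -> smoothing value), 'wordTopicCounts' (term -> per-topic
# counts), and 'topicCounts' (per-topic totals).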

def load_models(args):
    """Average term weights across one or more model files and write the top terms per topic."""
    files = args.model_file.split(',')
    topicsOrders = {}
    N = 0
    for model_filename in files:
        print(model_filename)
        topicsOrder = load_model(model_filename)
        for k in range(len(topicsOrder)):
            if k not in topicsOrders:
                topicsOrders[k] = {}
            for term, value in topicsOrder[k].items():
                if term not in topicsOrders[k]:
                    topicsOrders[k][term] = 0.0
                topicsOrders[k][term] = topicsOrders[k][term] + value
        N += 1
    # averaging across the N model files
    topicsK = len(topicsOrders)
    finalOrder = [[] for i in range(topicsK)]
    for k in topicsOrders.keys():
        for term, value in topicsOrders[k].iteritems():
            finalOrder[k].append((value / float(N), term))
    # output: the 40 highest-weighted terms per topic, one tab-separated line per topic
    output_f = open(args.output_file, 'w')
    for k in range(topicsK):
        local_order = sorted(finalOrder[k], key = lambda s: s[0], reverse = True)
        output_buffer = [w for (v, w) in local_order[:40]]
        output_f.write(str(k) + '\t' + ' '.join(output_buffer) + '\n')
    output_f.close()
    return

if __name__ == '__main__':
    if sys.version_info[0] < 3:
        # Force UTF-8 stdin/stdout under Python 2; Python 3 streams handle this natively.
        sys.stdin = codecs.getreader('utf-8')(sys.stdin)
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    parser = argparse.ArgumentParser(description='Text preprocessing and topic-model utilities.')
    parser.add_argument('--task', help = 'the task to be performed', default = '')
    parser.add_argument('--input_file', help = 'the input filename', default = 'input.db')
    parser.add_argument('--output_file', help = 'the output filename', default = 'output.db')
    parser.add_argument('--dic_file', help = 'the dictionary filename', default = 'dic.db')
    parser.add_argument('--model_file', help = 'the model filename', default = 'model.db')
    parser.add_argument('--term_upper', help = 'the upper bound of term dist.', default = '0.01')
    parser.add_argument('--term_lower', help = 'the lower bound of term dist.', default = '0.80')
    parser.add_argument('--term_min', help = 'the minimum DF value for a term', default = '5')
    args = parser.parse_args()
    if args.task is not None and args.task != '':
        if args.task == 'dictionary':
            docs = load_raw(args)
            results = compute_term_stats(docs, args)
            dictionary = compute_dictionary(results, args)
            save_dictionary(dictionary, args)
        elif args.task == 'parse':
            dictionary = load_dictionary(args)
            parse_docs(dictionary, args)
        elif args.task == 'topics':
            load_models(args)
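# Example invocations (illustrative; file names are placeholders):
#   python tm.py --task dictionary --input_file reviews.json --output_file dic.db
#   python tm.py --task parse --input_file reviews.json --dic_file dic.db --output_file parsed.db
#   python tm.py --task topics --model_file model_a.json,model_b.json --output_file topics.txt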