implementation/primaries/ExtractMetadata/classes/MetaParser.py from Godley/Music-Library

implementation/primaries/ExtractMetadata/classes/MetaParser.py
Summary

Maintainability

7 hrs
Test Coverage

Issues
import xml.sax
import copy
from xml.sax import handler, make_parser
from MuseParse import helpers

from implementation.primaries.ExtractMetadata.classes.hashdict import hashdict
from implementation.primaries.ExtractMetadata.classes.DataLayer.helpers import get_if_exists
from implementation.primaries.ExtractMetadata.classes.helpers import combine_dictionaries, init_kv


class Extractor(xml.sax.ContentHandler):
    """
    SAX content handler which forwards every parsing event to a parent object.

    The parent has to provide startTag(name, attrs), newData(text) and
    endTag(name); this class only adapts the SAX callbacks to those methods.
    """

    def __init__(self, parent):
        """
        :param parent: object which receives the forwarded parsing events
        """
        # Run the base initialiser so the state it sets up (the document
        # locator slot) exists even before a parser is attached.
        xml.sax.ContentHandler.__init__(self)
        self.parent = parent

    def startElement(self, name, attrs):
        """Forward an opening tag, with its attributes copied into a plain dict."""
        attribs = {
            attrname: attrs.getValue(attrname)
            for attrname in attrs.getNames()}
        self.parent.startTag(name, attribs)

    def characters(self, text):
        """Forward a chunk of character data."""
        self.parent.newData(text)

    def endElement(self, name):
        """Forward a closing tag."""
        self.parent.endTag(name)


class MetaParser(object):
    """
    Collects metadata from an XML score while it is read with a SAX parser.

    An Extractor forwards the SAX events to startTag/newData/endTag. Those
    keep track of the open tags and call the handler function registered for
    the tag currently in effect; the handlers fill self.parts and self.data.
    """

    def __init__(self):
        # stack of the names of the tags which are currently open
        self.tags = []
        # attributes of the open tags, keyed by tag name
        self.attribs = {}
        # most recent character data of the open tags, keyed by tag name
        self.chars = {}
        # tag name -> handler function used once that tag has been opened
        self.handlers = {
            "part-name": make_new_part,
            "key": handle_clef_or_key,
            "clef": handle_clef_or_key,
            "transpose": handleTransposition,
            "time": handleMeter,
            "metronome": handle_tempo,
            "movement-title": handleBibliography,
            "work-title": handleBibliography,
            "creator": handleBibliography,
            "credit-words": handleBibliography}
        # tags whose handler is called when the tag closes instead of when
        # character data arrives
        self.close_tags = ["beat-unit-dot"]
        self.current_handler = None
        # per-part information, filled in by the handlers
        self.parts = {}
        # the collected metadata, returned by parse()
        self.data = {}

    def startTag(self, name, attrs):
        """
        Record a newly opened tag and switch to its handler if it has one.

        :param name: name of the tag
        :param attrs: dict of the tag's attributes
        """
        self.tags.append(name)
        self.attribs[name] = attrs
        if name in self.handlers:
            self.current_handler = self.handlers[name]

    def validateData(self, data):
        """Placeholder: currently performs no validation."""

        pass

    def newData(self, data):
        """
        Store character data against the innermost open tag and run the
        current handler, unless that tag is handled on close.

        :param data: the character data reported by the parser
        """
        if len(self.tags) > 0:
            self.chars[self.tags[-1]] = data
            if self.current_handler is not None and self.tags[
                    -1] not in self.close_tags:
                self.current_handler(
                    self.tags,
                    self.attribs,
                    self.chars,
                    self.parts,
                    self.data)

    def endTag(self, name):
        """
        Run the current handler for tags listed in close_tags, forget the
        closed tag, and go back to the enclosing tag's handler if it has one.

        :param name: name of the tag being closed
        """
        if name in self.close_tags:
            if self.current_handler is not None:
                self.current_handler(
                    self.tags,
                    self.attribs,
                    self.chars,
                    self.parts,
                    self.data)
        self.pop_name(name)

        if len(self.tags) > 0 and self.tags[-1] in self.handlers:
            self.current_handler = self.handlers[self.tags[-1]]

    def pop_name(self, name):
        """
        Forget a closed tag: its stack entry, its attributes and its text.

        :param name: name of the tag to forget; unknown names are ignored
        """
        # Remove the most recently opened occurrence. list.remove() would
        # drop the outermost one and scramble the stack whenever an element
        # is nested inside another element of the same name.
        for index in range(len(self.tags) - 1, -1, -1):
            if self.tags[index] == name:
                del self.tags[index]
                break
        self.attribs.pop(name, None)
        self.chars.pop(name, None)

    def parse(self, file):
        """
        Parse a file and return the collected metadata.

        :param file: path of the XML file to read
        :return: the data dict filled in by the handlers and collate_parts
        """
        parser = make_parser()

        parser.setContentHandler(Extractor(self))
        # OFFLINE MODE
        parser.setFeature(handler.feature_external_ges, False)
        # Binary mode lets the XML parser pick the encoding from the document
        # itself rather than from the platform's default text codec, and the
        # with-block closes the file even when parsing raises.
        with open(file, 'rb') as fob:
            parser.parse(fob)
        self.collate_parts()
        return self.data

    def collate_parts(self):
        """
        Fold the per-part information into self.data.

        Writes one dict per part (its name plus any transposition values) to
        data["instruments"], replacing whatever was stored there, and stores
        each part's de-duplicated keys and clefs under data["keys"] and
        data["clefs"], keyed by the part's name.
        """
        instrument_list = []
        for part in self.parts.values():
            # "hello, world" is the fallback value handed to get_if_exists
            entry = {"name": get_if_exists(part, "name", "hello, world")}

            if "transposition" in part:
                entry.update(part["transposition"])

            for key in ("key", "clef"):
                if key in part:
                    plural = "{}s".format(key)
                    init_kv(self.data, plural, init_value={})
                    self.data[plural][entry["name"]] = remove_duplicates(
                        part[key])

            instrument_list.append(entry)
        self.data["instruments"] = instrument_list


# HANDLER METHODS


def make_new_part(tags, attrs, chars, parts, data):
    """
    handler which works with anything inside the score-part tag: it makes sure
    the part has an entry in the parts dict and, when called for a part-name
    tag, records the part's name

    :param tags: current list of xml tags
    :param attrs: current dict of attribs
    :param chars: current dict of content of each tag
    :param parts: dict of parts keyed by score-part id, updated in place
    :param data: dict of collected metadata; the name is also appended to
                 data["instruments"]
    :return: None
    """
    part_id = helpers.GetID(attrs, "score-part", "id")
    if part_id is None:
        # Without a score-part id there is no part entry to attach the name
        # to; indexing parts with the missing id would raise a KeyError.
        return

    init_kv(parts, part_id, init_value={})
    if tags[-1] == "part-name" and "part-name" in chars:
        name = chars["part-name"]
        parts[part_id]["name"] = name
        data.setdefault("instruments", []).append(name)


def handle_clef_or_key(tags, attrs, chars, parts, data):
    """
    Shared handler for the children of clef and key tags, which are stored
    the same way: as a list of dicts under parts[part id]["clef"] or
    parts[part id]["key"], each dict keyed by the child tag's name.

    :param tags: current list of xml tags
    :param attrs: current dict of attribs
    :param chars: current dict of content of each tag
    :param parts: dict of parts keyed by part id, updated in place
    :param data: dict of collected metadata (not used here)
    """
    part_id = helpers.GetID(attrs, "part", "id")
    if part_id is None:
        return
    init_kv(parts, part_id, init_value={})

    # the two child tags whose content is looked up for each parent tag
    fields_by_parent = {"clef": ("line", "sign"), "key": ("fifths", "mode")}
    parent_tag = tags[-2]
    if parent_tag not in fields_by_parent:
        return

    first_field, second_field = fields_by_parent[parent_tag]
    init_kv(parts[part_id], parent_tag, init_value=[])
    value = create_elem(chars, first_field, second_field)
    update_or_append_entry(parts[part_id][parent_tag], tags[-1], value)


def update_or_append_entry(dictionary, tag, entry):
    """
    Store entry under tag in the last dict of a list of dicts.

    A new single-item dict is appended when the list is empty or when its last
    dict already holds the tag; otherwise the tag is added to that last dict.
    Used by the clef, key, time and tempo handlers.

    :param dictionary: the list of dicts to update in place
    :param tag: key to store the entry under
    :param entry: value to store
    """
    needs_new_entry = not dictionary or tag in dictionary[-1]
    if needs_new_entry:
        dictionary.append({tag: entry})
        return
    dictionary[-1][tag] = entry


def create_elem(chars, key1, key2, cast1=int, cast2=str):
    """
    Pick a value out of the chars dictionary and cast it.

    Both keys are looked at in turn, so the content of key2 wins when both
    are present; 0 is returned when neither is. Each value goes through the
    cast belonging to its key, and key1's cast still runs when key2 ends up
    supplying the result.

    :param chars: current dict of content of each tag
    :param key1: first tag name to look for
    :param key2: second tag name to look for, takes priority
    :param cast1: callable applied to the content of key1
    :param cast2: callable applied to the content of key2
    :return: the cast content, or 0 if neither key is in chars
    """
    result = 0
    for key, cast in ((key1, cast1), (key2, cast2)):
        if key in chars:
            result = cast(chars[key])
    return result


def handleTransposition(tags, attrs, chars, parts, data):
    """
    Handler for the transpose tag: stores the diatonic and chromatic values
    as ints under parts[part id]["transposition"].

    :param tags: current list of xml tags
    :param attrs: current dict of attribs
    :param chars: current dict of content of each tag
    :param parts: dict of parts keyed by part id, updated in place
    :param data: dict of collected metadata (not used here)
    """
    part_id = helpers.GetID(attrs, "part", "id")
    if part_id is None:
        return

    init_kv(parts, part_id, init_value={})
    init_kv(parts[part_id], "transposition", init_value={})

    current = tags[-1]
    if current in ("diatonic", "chromatic"):
        # a tag without recorded content counts as 0
        amount = int(chars[current]) if current in chars else 0
        parts[part_id]["transposition"][current] = amount


def handleMeter(tags, attrs, chars, parts, data):
    """
    Handler for the time tag: collects time signatures as a list of dicts
    with the keys "beat" and "beat_type" in data["time_signatures"].

    Only the beats and beat-type children are recorded. Any other child of
    the time tag is skipped, so it can no longer end up in the list under an
    empty-string key.

    :param tags: current list of xml tags
    :param attrs: current dict of attribs (not used here)
    :param chars: current dict of content of each tag
    :param parts: dict of parts (not used here)
    :param data: dict of collected metadata, updated in place
    """
    data.setdefault("time_signatures", [])

    # child tag of time -> key used in the time signature dict
    key_for_tag = {"beats": "beat", "beat-type": "beat_type"}
    current = tags[-1]
    if "time" not in tags or current not in key_for_tag:
        return

    elem = create_elem(chars, "beats", "beat-type", cast2=int)
    update_or_append_entry(data["time_signatures"], key_for_tag[current], elem)


def handle_tempo(tags, attrs, chars, parts, data):
    """
    Handler for the metronome tag: builds the list of tempo dicts held in
    data["tempos"] out of the beat-unit, beat-unit-dot and per-minute tags.

    :param tags: current list of xml tags
    :param attrs: current dict of attribs (not used here)
    :param chars: current dict of content of each tag
    :param parts: dict of parts (not used here)
    :param data: dict of collected metadata, updated in place
    """
    data.setdefault("tempos", [])

    current = tags[-1]
    if current == "metronome":
        return

    value = create_elem(
        chars, "beat-unit", "per-minute", cast1=str, cast2=int)
    # each helper hands the list back untouched unless the tag is its own
    data["tempos"] = handle_beat_unit(value, data["tempos"], current)
    data["tempos"] = handle_beat_unit_dot(data["tempos"], current)

    if current == "per-minute":
        update_or_append_entry(data["tempos"], "minute", value)


def handle_beat_unit_dot(list_of_dicts, tag):
    """
    Add a dot to the most recent beat when the tag is beat-unit-dot.

    The dot goes onto "beat_2" of the last tempo if it has one, otherwise
    onto its "beat". The input list is never modified: a deep copy is
    returned for a beat-unit-dot tag, the list itself for any other tag.

    :param list_of_dicts: list of tempo dicts
    :param tag: name of the tag being handled
    :return: the list of tempo dicts to use from now on
    """
    if tag != "beat-unit-dot":
        return list_of_dicts

    dotted = copy.deepcopy(list_of_dicts)
    if dotted:
        latest = dotted[-1]
        for key in ("beat_2", "beat"):
            if key in latest:
                latest[key] += "."
                break
    return dotted


def handle_beat_unit(elem, list_of_dicts, tag):
    """
    Place the content of a beat-unit tag into the list of tempo dicts.

    The input list is never modified: a deep copy is returned for a
    beat-unit tag, the list itself for any other tag.

    :param elem: the beat unit to store
    :param list_of_dicts: list of tempo dicts
    :param tag: name of the tag being handled
    :return: the list of tempo dicts to use from now on
    """
    if tag != 'beat-unit':
        return list_of_dicts

    tempos = copy.deepcopy(list_of_dicts)

    # no tempo yet, or the last one is already complete: start a new tempo
    if beat1_and_minute_or_beat2_exists(tempos):
        tempos.append({"beat": elem})
        return tempos

    latest = tempos[-1]
    if "beat" not in latest:
        # the last tempo is still missing its first beat
        latest["beat"] = elem
    elif not minute_or_beat2_exists(tempos):
        # a first beat without a minute or second beat: this is the second
        # beat of the same tempo
        latest["beat_2"] = elem
    return tempos


def beat1_and_minute_or_beat2_exists(list_of_dicts):
    """
    Tell whether the next beat unit has to start a new tempo.

    :param list_of_dicts: list of tempo dicts
    :return: True when the list is empty, or when its last dict holds a
             "beat" together with a "minute" or a "beat_2"; False otherwise
    """
    if not list_of_dicts:
        return True
    return "beat" in list_of_dicts[-1] and minute_or_beat2_exists(
        list_of_dicts)


def minute_or_beat2_exists(list_of_dicts):
    """
    Tell whether the last tempo already has a "minute" or a "beat_2".

    :param list_of_dicts: list of tempo dicts
    :return: True when the list is empty or its last dict holds either key,
             False otherwise
    """
    if not list_of_dicts:
        return True
    latest = list_of_dicts[-1]
    return any(key in latest for key in ("minute", "beat_2"))


def handleBibliography(tags, attrs, chars, parts, data):
    """
    Handler for the bibliographic tags: stores a creator's lower-cased name
    under the creator's type in data, and passes every tag on to handle_title.

    :param tags: current list of xml tags
    :param attrs: current dict of attribs
    :param chars: current dict of content of each tag
    :param parts: dict of parts (not used here)
    :param data: dict of collected metadata, updated in place
    """
    current = tags[-1]

    if current == "creator":
        role = helpers.GetID(attrs, "creator", "type")
        if role is not None:
            init_kv(data, role, init_value="")
            if "creator" in chars:
                # appended, not assigned: earlier text for the same type stays
                data[role] += chars["creator"].lower()

    handle_title(current, chars, data)


def handle_title(tag, chars, data):
    """
    Append the lower-cased content of a movement-title or work-title tag to
    data["title"]; any other tag is ignored.

    :param tag: name of the tag being handled
    :param chars: current dict of content of each tag
    :param data: dict of collected metadata, updated in place
    """
    if tag not in ("movement-title", "work-title"):
        return

    title = create_elem(
        chars, "movement-title", "work-title", cast1=str, cast2=str)
    init_kv(data, "title", init_value="")
    data["title"] += title.lower()


def remove_duplicates(list_of_dicts):
    """
    Return copies of the dicts in a list with repeated dicts left out.

    The first occurrence of every dict is kept and the result follows the
    order of the input, so the output is the same on every run. (Collecting
    the dicts in a set, as was done before, returned them in arbitrary
    order.) The values of the dicts have to be hashable.

    :param list_of_dicts: list of dicts, left unmodified
    :return: new list of new dicts without duplicates
    """
    seen = set()
    unique = []
    for item in list_of_dicts:
        # two dicts are duplicates when they hold the same key/value pairs
        fingerprint = frozenset(item.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique.append(dict(item))
    return unique