failmap/admin

View on GitHub
websecmap/organizations/datasources/dutch_government.py

Summary

Maintainability
A
55 mins
Test Coverage
"""
Importer for Dutch governmental organizations, using open data.

Example:
failmap import_organizations dutch_government

Warning: this is XML, set aside your intuition about programming.

https://almanak-redactie.overheid.nl/archive/
"""

import logging
import xml.etree.ElementTree as ET

from websecmap.celery import app
from websecmap.organizations.datasources import download_http_get_no_credentials, generic_dataset_import, read_data

log = logging.getLogger(__package__)


def parse_data(dataset, filename):
    """Parse a Dutch government XML export into a list of organization dicts.

    Args:
        dataset: Mapping describing the import. The keys read here are
            ``xml_plural`` (tag of the element containing all organizations),
            ``xml_single`` (tag of each organization element), ``country`` and
            ``layer``. The mapping itself is also stored under ``"dataset"`` in
            every returned dict.
        filename: Handed to ``read_data`` to obtain the XML document.

    Returns:
        A list with one dict per organization element, with the keys ``name``,
        ``address``, ``geocoding_hint``, ``websites``, ``country``, ``layer``,
        ``lat``, ``lng`` and ``dataset``.

    Raises:
        KeyError: If the root element has no ``xsi:schemaLocation`` attribute.
        IndexError: If the ``xsi:schemaLocation`` attribute is empty.
        AttributeError: If the container element named by ``xml_plural`` is not
            present in the document.
    """
    data = read_data(filename)
    found_organizations = []

    root = ET.fromstring(data)

    # Each document may use another namespace version, so the namespace is read from
    # the document itself: the first token of xsi:schemaLocation is the namespace URI.
    # split() without arguments tolerates leading or repeated whitespace in the attribute.
    schema_location = root.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"]
    ns = schema_location.split()[0]
    log.debug("Using namespace: %s", ns)

    # register_namespace only influences serialization; it is kept so the module's
    # side effects stay the same. Lookups below use the explicit prefix mapping.
    ET.register_namespace("p", ns)
    namespaces = {"p": ns}

    organizations = root.find("p:%s" % dataset["xml_plural"], namespaces)

    for organization in organizations.iterfind("p:%s" % dataset["xml_single"], namespaces):
        name = emulate_get(organization, "p:naam", namespaces)
        if not name:
            # gemeenschappelijke regelingen has a title, not a name.
            name = emulate_get(organization, "p:titel", namespaces)

        abbreviation = emulate_get(organization, "p:afkorting", namespaces)

        # Not every organization has contact details or a visiting address. Walk down
        # one level at a time and stop at the first missing element instead of crashing.
        # Elements are compared against None because an Element without children is falsy.
        contact = organization.find("p:contact", namespaces)
        bezoek_adres = contact.find("p:bezoekAdres", namespaces) if contact is not None else None
        adres = bezoek_adres.find("p:adres", namespaces) if bezoek_adres is not None else None

        # emulate_get returns "" when the parent it is given is None.
        straat = emulate_get(adres, "p:straat", namespaces)
        huisnummer = emulate_get(adres, "p:huisnummer", namespaces)
        postcode = emulate_get(adres, "p:postcode", namespaces)
        plaats = emulate_get(adres, "p:plaats", namespaces)

        site = emulate_get(contact, "p:internet", namespaces)

        if not postcode and not plaats:
            # try to find something by name... might not have an address...
            geocoding_hint = "%s, Nederland" % name
        else:
            geocoding_hint = "Nederland"

        found_organizations.append(
            {
                "name": "%s (%s)" % (name, abbreviation) if abbreviation else name,
                "address": "%s %s, %s, %s" % (straat, huisnummer, postcode, plaats),
                # make sure that the geocoder is looking at the Netherlands.
                "geocoding_hint": geocoding_hint,
                "websites": [site],
                "country": dataset["country"],
                "layer": dataset["layer"],
                "lat": None,
                "lng": None,
                "dataset": dataset,
            }
        )

    return found_organizations


def emulate_get(xml, element, namespaces):
    """Return the text of the first child matching ``element``, or ``""``.

    Behaves like ``dict.get`` with an empty-string default for XML lookups.

    Args:
        xml: Element to search in. May be ``None`` (or any object without a
            ``find`` method), in which case ``""`` is returned.
        element: Path of the child to look up, e.g. ``"p:naam"``.
        namespaces: Prefix-to-namespace mapping passed on to ``find``.

    Returns:
        The text of the matching element. ``""`` when the element is missing
        or present without text (for example ``<naam/>``).
    """
    # The result of find() must not be tested for truthiness: an Element without
    # children is falsy even when it exists. Attempting the attribute access and
    # catching AttributeError covers both a missing element and a missing parent.
    try:
        text = xml.find(element, namespaces).text
    except AttributeError:
        return ""

    # An existing but empty element has text None; callers expect a string.
    return text if text is not None else ""


@app.task(queue="storage")
def import_datasets(**dataset):
    """Import one dataset description through the generic import routine.

    Registered as a task on the ``storage`` queue. All keyword arguments are
    collected into ``dataset`` and handed to ``generic_dataset_import`` together
    with this module's XML parser and the credential-less HTTP GET downloader.
    """
    import_options = {
        "dataset": dataset,
        "parser_function": parse_data,
        "download_function": download_http_get_no_credentials,
    }
    generic_dataset_import(**import_options)