webservices/legal_docs/index_management.py

Summary

Maintainability
C
1 day
Test Coverage
import logging

import elasticsearch

from webservices import utils

logger = logging.getLogger(__name__)

MAPPINGS = {
    "_default_": {
        "properties": {
            "sort1": {
                "type": "integer",
                "include_in_all": False
            },
            "sort2": {
                "type": "integer",
                "include_in_all": False
            },
        }
    },
    "citations": {
        "properties": {
            "citation_type": {
                "type": "string",
                "index": "not_analyzed"
            },
            "citation_text": {
                "type": "string",
                "index": "not_analyzed"
            }
        }
    },
    "murs": {
        "properties": {
            "no": {
                "type": "string",
                "index": "not_analyzed"
            },
            "doc_id": {
                "type": "string",
                "index": "no"
            },
            "mur_type": {
                "type": "string"
            },
            "name": {
                "type": "string",
                "analyzer": "english"
            },
            "election_cycles": {
                "type": "long"
            },
            "open_date": {
                "type": "date",
                "format": "dateOptionalTime"
            },
            "close_date": {
                "type": "date",
                "format": "dateOptionalTime"
            },
            "url": {
                "type": "string",
                "index": "no"
            },
            "subjects": {
                "type": "string"
            },
            "commission_votes": {
                "properties": {
                    "text": {
                        "type": "string"
                    },
                    "vote_date": {
                        "type": "date",
                        "format": "dateOptionalTime"
                    }
                }
            },
            "dispositions": {
                "properties": {
                    "citations": {
                        "properties": {
                            "text": {
                                "type": "string"
                            },
                            "title": {
                                "type": "string"
                            },
                            "type": {
                                "type": "string"
                            },
                            "url": {
                                "type": "string"
                            }
                        }
                    },
                    "disposition": {
                        "type": "string",
                        "index": "not_analyzed"
                    },
                    "penalty": {
                        "type": "double"
                    },
                    "respondent": {
                        "type": "string"
                    }
                }
            },
            "documents": {
                "type": "nested",
                "properties": {
                    "category": {
                        "type": "string",
                        "index": "not_analyzed"
                    },
                    "description": {
                        "type": "string"
                    },
                    "document_date": {
                        "type": "date",
                        "format": "dateOptionalTime"
                    },
                    "document_id": {
                        "type": "long",
                        "index": "no"
                    },
                    "length": {
                        "type": "long",
                        "index": "no"
                    },
                    "text": {
                        "type": "string"
                    },
                    "url": {
                        "type": "string",
                        "index": "no"
                    }
                }
            },
            "participants": {
                "properties": {
                    "citations": {
                        "type": "object"
                    },
                    "name": {
                        "type": "string"
                    },
                    "role": {
                        "type": "string"
                    }
                }
            },
            "respondents": {
                "type": "string"
            }
        }
    },
    "statutes": {
        "properties": {
            "doc_id": {
                "type": "string",
                "index": "no"
            },
            "name": {
                "type": "string",
                "analyzer": "english"
            },
            "text": {
                "type": "string",
                "analyzer": "english"
            },
            "no": {
                "type": "string",
                "index": "not_analyzed"
            },
            "title": {
                "type": "string"
            },
            "chapter": {
                "type": "string"
            },
            "subchapter": {
                "type": "string"
            },
            "url": {
                "type": "string",
                "index": "no"
            }
        }
    },
    "regulations": {
        "properties": {
            "doc_id": {
                "type": "string",
                "index": "no"
            },
            "name": {
                "type": "string",
                "analyzer": "english"
            },
            "text": {
                "type": "string",
                "analyzer": "english"
            },
            "no": {
                "type": "string",
                "index": "not_analyzed"
            },
            "url": {
                "type": "string",
                "index": "no"
            }
        }
    },
    "advisory_opinions": {
        "properties": {
            "no": {
                "type": "string",
                "index": "not_analyzed"
            },
            "name": {
                "type": "string",
                "analyzer": "english"
            },
            "summary": {
                "type": "string",
                "analyzer": "english"
            },
            "issue_date": {
                "type": "date",
                "format": "dateOptionalTime"
            },
            "is_pending": {
                "type": "boolean"
            },
            "status": {
                "type": "string"
            },
            "ao_citations": {
                "properties": {
                    "name": {
                        "type": "string"
                    },
                    "no": {
                        "type": "string"
                    }
                }
            },
            "aos_cited_by": {
                "properties": {
                    "name": {
                        "type": "string"
                    },
                    "no": {
                        "type": "string"
                    }
                }
            },
            "statutory_citations": {
                "type": "nested",
                "properties": {
                    "section": {
                        "type": "long"
                    },
                    "title": {
                        "type": "long"
                    }
                }
            },
            "regulatory_citations": {
                "type": "nested",
                "properties": {
                    "part": {
                        "type": "long"
                    },
                    "section": {
                        "type": "long"
                    },
                    "title": {
                        "type": "long"
                    }
                }
            },
            "requestor_names": {
                "type": "string"
            },
            "requestor_types": {
                "type": "string",
                "index": "not_analyzed"
            },
            "documents": {
                "type": "nested",
                "properties": {
                    "document_id": {
                        "type": "long",
                        "index": "no"
                    },
                    "category": {
                        "type": "string",
                        "index": "not_analyzed"
                    },
                    "description": {
                        "type": "string"
                    },
                    "date": {
                        "type": "date",
                        "format": "dateOptionalTime"
                    },
                    "text": {
                        "type": "string"
                    },
                    "url": {
                        "type": "string",
                        "index": "no"
                    }
                }
            }
        }
    }
}

ANALYZER_SETTINGS = {
    "analysis": {"analyzer": {"default": {"type": "english"}}}
}


def create_docs_index():
    """
    Initialize Elasticsearch for storing legal documents.
    Create the `docs` index, and set up the aliases `docs_index` and `docs_search`
    to point to the `docs` index. If the `doc` index already exists, delete it.
    """

    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    try:
        logger.info("Delete index 'docs_index'")
        es.indices.delete('docs_index')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'docs'")
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
        "aliases": {
            'docs_index': {},
            'docs_search': {}
        }
    })


def create_archived_murs_index():
    """
    Initialize Elasticsearch for storing archived MURs.
    If the `archived_murs` index already exists, delete it.
    Create the `archived_murs` index.
    Set up the alias `archived_murs_index` to point to the `archived_murs` index.
    Set up the alias `docs_search` to point `archived_murs` index, allowing the
    legal search to work across current and archived MURs
    """

    es = utils.get_elasticsearch_connection()

    try:
        logger.info("Delete index 'archived_murs'")
        es.indices.delete('archived_murs')
    except elasticsearch.exceptions.NotFoundError:
        pass

    logger.info("Create index 'archived_murs' with aliases 'docs_search' and 'archived_murs_index'")
    es.indices.create('archived_murs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
        "aliases": {
            'archived_murs_index': {},
            'docs_search': {}
        }
    })


def delete_docs_index():
    """
    Delete index `docs`.
    This is usually done in preparation for restoring indexes from a snapshot backup.
    """

    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs'")
        es.indices.delete('docs')
    except elasticsearch.exceptions.NotFoundError:
        pass


def create_staging_index():
    """
    Create the index `docs_staging`.
    Move the alias docs_index to point to `docs_staging` instead of `docs`.
    """
    es = utils.get_elasticsearch_connection()
    try:
        logger.info("Delete index 'docs_staging'")
        es.indices.delete('docs_staging')
    except:
        pass

    logger.info("Create index 'docs_staging'")
    es.indices.create('docs_staging', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS,
    })

    logger.info("Move alias 'docs_index' to point to 'docs_staging'")
    es.indices.update_aliases(body={"actions": [
        {"remove": {"index": 'docs', "alias": 'docs_index'}},
        {"add": {"index": 'docs_staging', "alias": 'docs_index'}}
    ]})


def restore_from_staging_index():
    """
    A 4-step process:
    1. Move the alias docs_search to point to `docs_staging` instead of `docs`.
    2. Reinitialize the index `docs`.
    3. Reindex `doc_staging` to `docs`
    4. Move `docs_index` and `docs_search` aliases to point to the `docs` index.
       Delete index `docs_staging`.
    """
    es = utils.get_elasticsearch_connection()

    logger.info("Move alias 'docs_search' to point to 'docs_staging'")
    es.indices.update_aliases(body={"actions": [
        {"remove": {"index": 'docs', "alias": 'docs_search'}},
        {"add": {"index": 'docs_staging', "alias": 'docs_search'}}
    ]})

    logger.info("Delete and re-create index 'docs'")
    es.indices.delete('docs')
    es.indices.create('docs', {
        "mappings": MAPPINGS,
        "settings": ANALYZER_SETTINGS
    })

    logger.info("Reindex all documents from index 'docs_staging' to index 'docs'")

    body = {
      "source": {
        "index": "docs_staging",
      },
      "dest": {
        "index": "docs"
      }
    }
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)

    logger.info("Move aliases 'docs_index' and 'docs_search' to point to 'docs'")
    es.indices.update_aliases(body={"actions": [
        {"remove": {"index": 'docs_staging', "alias": 'docs_index'}},
        {"remove": {"index": 'docs_staging', "alias": 'docs_search'}},
        {"add": {"index": 'docs', "alias": 'docs_index'}},
        {"add": {"index": 'docs', "alias": 'docs_search'}}
    ]})
    logger.info("Delete index 'docs_staging'")
    es.indices.delete('docs_staging')


def move_archived_murs():
    '''
    Move archived MURs from `docs` index to `archived_murs_index`
    This should only need to be run once.
    Once archived MURs are on their own index, we will be able to
    re-index current legal docs after a schema change much more quickly.
    '''
    es = utils.get_elasticsearch_connection()

    body = {
          "source": {
            "index": "docs",
            "type": "murs",
            "query": {
              "match": {
                "mur_type": "archived"
              }
            }
          },
          "dest": {
            "index": "archived_murs"
          }
        }

    logger.info("Copy archived MURs from 'docs' index to 'archived_murs' index")
    es.reindex(body=body, wait_for_completion=True, request_timeout=1500)