Futsch1/form-analyzer

View on GitHub
form_analyzer/form_parser.py

Summary

Maintainability
A
25 mins
Test Coverage
import glob
import json
import logging
import os
import typing
from dataclasses import dataclass

import trp


@dataclass
class FieldWithPage:
    """A form field paired with the page it was found on."""
    # Page index of the field; this module fills it from enumerate() over the
    # document's pages, so it is zero-based.
    page: int
    # The field object taken from the page's form.
    field: trp.Field


# Alias for a flat list of fields, each tagged with its page index.
FieldList = typing.List[FieldWithPage]


@dataclass
class ParsedForm:
    """The result of parsing the result files that belong to one form."""
    # Names of the input files the form was read from; this module stores them
    # without directory and without file extension.
    page_files: typing.List[str]
    # All fields found on the pages of the form.
    fields: FieldList


@dataclass
class FormPages:
    """Describes how many pages make up one form and how to validate them."""
    # Number of pages (one input file per page) per form. parse() treats 0 as
    # "all input files together are a single form" and overwrites the value.
    pages: int
    # One list of words per page; a page is accepted when at least one of its
    # words occurs in a text block of that page. An empty list accepts any page.
    words_on_page: typing.List[typing.List[str]]


def __get_field_list_from_document(document: trp.Document,
                                   pages_per_form: int = 4) -> typing.Iterator[FieldList]:
    """Yield the fields of a document grouped into forms of a fixed page count.

    The pages of ``document`` are walked in order and their fields collected;
    one list of fields is yielded for every ``pages_per_form`` consecutive
    pages. The page index stored with each field counts from the start of the
    document, not from the start of the form.

    If the number of pages is not a multiple of ``pages_per_form``, the fields
    of the remaining pages are yielded as a final, shorter group instead of
    being silently discarded.

    :param document: parsed document whose pages are grouped
    :param pages_per_form: number of consecutive pages that make up one form
    :return: iterator over one field list per form
    :raises ValueError: if ``pages_per_form`` is smaller than 1 (raised on the
        first iteration, because this is a generator)
    """
    if pages_per_form < 1:
        raise ValueError(f'pages_per_form must be at least 1, got {pages_per_form}')

    fields: FieldList = []
    pages_in_group = 0
    for page_num, page in enumerate(document.pages):
        fields.extend(FieldWithPage(page_num, field) for field in page.form.fields)
        pages_in_group += 1
        if pages_in_group == pages_per_form:
            yield fields
            fields = []
            pages_in_group = 0

    # Flush an incomplete trailing group so that no fields get lost.
    if pages_in_group:
        yield fields


def __is_any_word_in_blocks(blocks, words: typing.List[str]) -> bool:
    """Tell whether at least one of the given words occurs in any text block.

    A word counts as found when it is contained in the ``'Text'`` entry of a
    block; blocks without a ``'Text'`` entry are ignored.

    :param blocks: iterable of blocks supporting ``in`` and ``['Text']`` lookup
    :param words: words to look for; an empty list means "nothing to check"
    :return: ``True`` if ``words`` is empty or any word was found, else ``False``
    """
    if not words:
        # No expected words configured, so every page is acceptable.
        return True

    # The generator lets any() stop at the first hit instead of scanning everything.
    return any(word in block['Text']
               for block in blocks if 'Text' in block
               for word in words)


def __get_parsed_form(file_names: typing.List[str], form_pages: FormPages) -> ParsedForm:
    """Load the textract JSON result files of one form and collect its fields.

    Every file is read as JSON, all responses are combined into a single
    ``trp.Document``, and each page is checked against the words expected for
    it in ``form_pages.words_on_page``. Pages and word lists are paired
    positionally; surplus pages or word lists are not checked.

    :param file_names: paths of the JSON result files that form one form
    :param form_pages: form description providing the expected words per page
    :return: the base names of the files (no directory, no extension) together
        with all fields of the document, each tagged with its page index
    :raises AssertionError: if none of the expected words of a page is found
    """
    base_file_names = []
    responses = []

    for file_name in file_names:
        base_file_names.append(os.path.splitext(os.path.basename(file_name))[0])
        # JSON is UTF-8 encoded; do not depend on the platform's default encoding.
        with open(file_name, encoding='utf-8') as f:
            responses.append(json.load(f))

    doc = trp.Document(responses)
    for page, words in zip(doc.pages, form_pages.words_on_page):
        if not __is_any_word_in_blocks(page.blocks, words):
            # Raised explicitly instead of using ``assert`` so that the check is
            # still performed when Python runs with -O; the exception type stays the same.
            raise AssertionError(f'Words {words} not found in files {file_names}\n{page}')

    fields: FieldList = [FieldWithPage(page_num, field)
                         for page_num, page in enumerate(doc.pages)
                         for field in page.form.fields]
    return ParsedForm(base_file_names, fields)


def parse(path_or_file: str, form_pages: FormPages) -> typing.Iterator[ParsedForm]:
    """Parse textract JSON result files into forms.

    If ``path_or_file`` is a directory, all ``*.json`` files directly inside it
    are used in sorted order; otherwise ``path_or_file`` itself is taken as the
    only input file. The files are split into consecutive groups of
    ``form_pages.pages`` files and every group is parsed as one form.

    When ``form_pages.pages`` is 0, all files are treated as one single form.
    In that case ``form_pages`` is modified in place: ``pages`` is set to the
    number of files and ``words_on_page`` to one empty word list per file, so
    no page content check is performed.

    This is a generator: nothing is read, and no exception is raised, before
    the first item is requested.

    :param path_or_file: directory containing the JSON files, or a single file
    :param form_pages: pages per form and expected words per page
    :return: iterator over the parsed forms
    :raises FileNotFoundError: if no input files were found
    """
    if os.path.isdir(path_or_file):
        file_names = sorted(glob.glob(os.path.join(path_or_file, '*.json')))
    else:
        file_names = [path_or_file]

    # Imported locally as in the original code - presumably to avoid a circular
    # import with the package's __init__; TODO confirm.
    from form_analyzer import form_analyzer_logger

    form_analyzer_logger.log(logging.INFO, f'Loading textract data for {len(file_names)} pages')

    # Checked independently of form_pages.pages: with pages == 0 and no files the
    # chunking below would otherwise fail with "range() arg 3 must not be zero".
    if not file_names:
        raise FileNotFoundError(f'No textract JSON result files found in {path_or_file}')

    if form_pages.pages == 0:
        form_pages.pages = len(file_names)
        # One (empty) word list per page; "[] * n" would always be just [].
        form_pages.words_on_page = [[] for _ in file_names]

    for i in range(0, len(file_names), form_pages.pages):
        yield __get_parsed_form(file_names[i:i + form_pages.pages], form_pages)