WtfJoke/kult-lunchbot

View on GitHub
menu/kult_textractor.py

Summary

Maintainability
A
1 hr
Test Coverage
import os
import re

from menu.dailymenu import KultDailyMenu
from menu.lunchmenu import WEEK_DAYS
from menu.menuitem import MenuItem
from menu.weeklymenu import KultWeeklyMenu
from pdf_textractor import convert_pdf_to_txt_lines


class KultTexTractor:
    """Build a ``KultWeeklyMenu`` from the text lines extracted from a menu PDF.

    All methods are static; the class only serves as a namespace. The text
    line objects handled here must expose a mutable ``text`` attribute (and
    ``y_max`` for :meth:`concat_menu_texts`).
    """

    # Matches e.g. 'Montag 06.11.2017' -> groups ('Montag', '06.11.2017').
    DAY_DATE_PATTERN = re.compile(r"(\w+)\s(\d\d\.\d\d\.\d\d\d\d)")
    # Matches a 'Menü <digit>' prefix at the start of a line; group 1 is the digit.
    _MENU_PATTERN = re.compile(r"Menü\s(\d)")
    # Captures a 'd,dd' price; the greedy prefix makes it pick the last one in the line.
    _PRICE_PATTERN = re.compile(r".*(\d,\d\d).*")

    @staticmethod
    def get_menu_text(menu_filename):
        """Return the menu parsed from the PDF ``menu_filename`` as a string."""
        # Previously called a non-existent ``get_menu`` (AttributeError on
        # every call); ``get_menu_from_pdf`` is the only loader in this class.
        return str(KultTexTractor.get_menu_from_pdf(menu_filename))

    @staticmethod
    def get_menu_from_pdf(pdf):
        """Convert the PDF at path ``pdf`` to text lines and parse them.

        The weekly menu is created with the PDF's base file name.
        """
        return KultTexTractor.analyze_menu_text(convert_pdf_to_txt_lines(pdf), os.path.basename(pdf))

    @staticmethod
    def analyze_menu_text(text_lines, menu_filename):
        """Parse ``text_lines`` into a ``KultWeeklyMenu`` and return it.

        :param text_lines: sequence of text line objects with a ``text`` attribute
        :param menu_filename: name passed to the ``KultWeeklyMenu`` constructor
        """
        menu = KultWeeklyMenu(menu_filename)

        # Day header seen while the current daily menu already has a date;
        # it is applied to the next daily menu.
        next_weekday = next_date = ''
        daily_menu = KultDailyMenu()

        for positioned_text in text_lines:
            line = positioned_text.text

            if 'KW' in line:
                menu.set_title(line)
            # line contains one of the WEEK_DAYS names
            elif KultTexTractor.is_week_day(line):
                match_day = KultTexTractor.DAY_DATE_PATTERN.match(line)
                # match 'Montag 06.11.2017'
                if match_day:
                    if not daily_menu.get_date():
                        daily_menu.set_weekday(match_day.group(1))
                        daily_menu.set_date(match_day.group(2))
                    else:
                        next_weekday = match_day.group(1)
                        next_date = match_day.group(2)
            elif 'eschlossen' in line:
                # presumably 'Geschlossen'/'geschlossen' (closed): the current
                # daily menu takes over the pending day header
                daily_menu.set_weekday(next_weekday)
                daily_menu.set_date(next_date)
            elif 'Menü' in line:
                menu_number = KultTexTractor.extract_menu_number(line)
                # strip the 'Menü N' prefix in place so fill_menu_item does
                # not stop immediately on this very line
                positioned_text.text = KultTexTractor.remove_menu_prefix(line)

                item = MenuItem(daily_menu, '', menu_number)
                KultTexTractor.fill_menu_item(positioned_text, text_lines, item)

                daily_menu.add_menu_item(item)

                if daily_menu.is_complete():
                    menu.add_daily_menu(daily_menu)
                    daily_menu = KultDailyMenu()

                    if next_weekday or next_date:
                        daily_menu.set_weekday(next_weekday)
                        daily_menu.set_date(next_date)
                        next_weekday = ''
                        next_date = ''

        return menu

    @staticmethod
    def is_week_day(line):
        """Return True if ``line`` contains any entry of ``WEEK_DAYS``."""
        return any(item in line for item in WEEK_DAYS)

    @staticmethod
    def next_line_is_day(positioned_text, text_lines):
        """Return True if the line following ``positioned_text`` names a week day.

        Returns False when there is no following line.
        """
        next_positioned_text = KultTexTractor.get_next_line(positioned_text, text_lines)
        if next_positioned_text is None:
            return False
        return KultTexTractor.is_week_day(next_positioned_text.text)

    @staticmethod
    def remove_menu_prefix(line):
        """Return ``line`` without its leading 'Menü N' prefix, stripped.

        Returns an empty string when ``line`` does not start with such a prefix.
        """
        replace_string = KultTexTractor.match_menu_line(line)
        menu_text = ''
        if replace_string:  # menu X can be replaced, replace it
            menu_text = line.replace(replace_string.group(), '').strip()
        return menu_text

    @staticmethod
    def fill_menu_item(positioned_text, text_lines, menu_item):
        """Fill ``menu_item`` from ``positioned_text`` and the lines following it.

        Reading stops at the first line containing 'Menü' or a week day name,
        at the end of ``text_lines``, or after one line for menu number 3 on
        ``WEEK_DAYS[4]``. The ``text`` of every consumed line is modified in
        place (salad/dessert markers and the price are removed).
        """
        current = positioned_text
        # ``current`` becomes None once the end of text_lines is reached
        while (current is not None
               and "Menü" not in current.text
               and not KultTexTractor.is_week_day(current.text)):

            if "Kl. Salat" in current.text:
                menu_item.set_salad(True)
                current.text = current.text \
                    .replace("Kl. Salat", "") \
                    .replace("|", "") \
                    .replace(" I ", " ").strip()

            if "Dessert" in current.text:
                menu_item.set_dessert(True)
                current.text = current.text \
                    .replace("IDessert", "") \
                    .replace("Dessert", "") \
                    .replace("|", "") \
                    .replace(" I ", " ").strip()

            is_price_match = KultTexTractor._PRICE_PATTERN.match(current.text)
            if is_price_match:
                price = is_price_match.group(1)
                menu_item.set_price(price)
                current.text = current.text.replace(price, "")

            menu_item.add_menu_text(current.text.strip())

            is_friday = menu_item.daily_menu.get_weekday() == WEEK_DAYS[4]
            # extract_menu_number yields the digit as a str, so compare
            # type-agnostically (works whether or not MenuItem converts it)
            is_friday_and_last_menu = is_friday and str(menu_item.menu_number) == "3"
            if is_friday_and_last_menu:
                # stop reading bottom text like dailysoup
                break

            current = KultTexTractor.get_next_line(current, text_lines)

    @staticmethod
    def concat_menu_texts(menu_text, positioned_text, next_positioned_text, text_lines):
        """Append the text of following lines that share the same ``y_max``.

        Starting with ``next_positioned_text``, each line whose ``y_max``
        equals that of its predecessor is stripped and appended to
        ``menu_text`` separated by a space. Stops at the first differing
        ``y_max`` or at the end of ``text_lines``.
        """
        while next_positioned_text is not None \
                and next_positioned_text.y_max == positioned_text.y_max:
            menu_text += " " + next_positioned_text.text.strip()
            positioned_text = next_positioned_text
            next_positioned_text = KultTexTractor.get_next_line(positioned_text, text_lines)
        return menu_text

    @staticmethod
    def extract_menu_text(line, next_line):
        """Return the prefix-free text of ``line`` followed by ``next_line``.

        Falls back to returning ``line`` unchanged when the prefix-free text
        or ``next_line`` is empty.
        """
        menu_text = KultTexTractor.remove_menu_prefix(line)
        # if line is uncompleted and has menu on next line, take also next line
        if menu_text and next_line:
            menu_text += next_line
        else:  # fallback
            menu_text = line
        return menu_text

    @staticmethod
    def match_menu_line(menu_text_line):
        """Return the match of a leading 'Menü <digit>' in the line, or None."""
        return KultTexTractor._MENU_PATTERN.match(menu_text_line)

    @staticmethod
    def extract_menu_number(menu_text_line):
        """Return the menu digit of a 'Menü <digit>' line as a string.

        Returns the integer 0 when the line does not start with that prefix.
        """
        menu_number = 0
        matcher = KultTexTractor.match_menu_line(menu_text_line)
        if matcher:
            menu_number = matcher.group(1)
        return menu_number

    @staticmethod
    def get_next_line(line, text_lines):
        """Return the first truthy entry following ``line`` in ``text_lines``.

        ``line`` is located with ``list.index`` (first equal element). When
        no truthy entry follows, the last inspected (falsy) entry is
        returned, or None if ``line`` is the final element.

        :raises ValueError: if ``line`` is not contained in ``text_lines``
        """
        # look the position up once instead of on every iteration
        start = text_lines.index(line) + 1
        next_line = None
        for next_line in text_lines[start:]:
            if next_line:
                break
        return next_line


# Script entry point: download the current menu PDF, parse it and print it.
if __name__ == "__main__":
    import scraper

    print(KultTexTractor.get_menu_from_pdf(scraper.get_pdf()))