weblyzard/inscriptis

View on GitHub
src/inscriptis/cli/inscript.py

Summary

Maintainability
A
1 hr
Test Coverage
#!/usr/bin/env python3
# coding:utf-8
"""Inscriptis command line client."""

import argparse
import sys
from json import load, dumps
from pathlib import Path
from typing import Optional

import requests

from inscriptis import get_text, get_annotated_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.model.config import ParserConfig

# Encoding used for reading local input files when no encoding is specified
# on the command line, and for writing the output file.
DEFAULT_ENCODING = "utf8"
DEFAULT_TIMEOUT = 5  # default timeout in seconds


def get_postprocessor(name):
    """Instantiate the annotation postprocessor registered under ``name``.

    The postprocessor is located by naming convention: the module
    ``inscriptis.annotation.output.<name>`` is imported and the class
    ``<Name>Extractor`` (``name`` capitalized plus the ``Extractor`` suffix)
    is looked up in it.

    Args:
        name: the name of the postprocessor

    Returns:
        A new instance of the matching postprocessor class.
    """
    class_name = f"{name.capitalize()}Extractor"
    module_path = f"inscriptis.annotation.output.{name}"
    # A non-empty ``fromlist`` makes __import__ return the leaf module rather
    # than the top-level package.
    module = __import__(module_path, fromlist=[class_name])
    extractor_cls = getattr(module, class_name)
    return extractor_cls()


def parse_command_line() -> argparse.Namespace:
    """Parse the command line arguments.

    If ``-v``/``--version`` is given, version, copyright and license
    information is printed and the process exits with status 0.

    Returns:
        The parsed command line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Convert the given HTML document to text."
    )
    parser.add_argument(
        "input",
        nargs="?",
        default=None,
        help="Html input either from a file or a URL (default:stdin).",
    )
    parser.add_argument(
        "-o", "--output", type=str, help="Output file (default:stdout)."
    )
    parser.add_argument(
        "-e",
        "--encoding",
        type=str,
        help="Input encoding to use (default:utf-8 for "
        "files; detected server encoding for Web URLs).",
    )
    parser.add_argument(
        "-i",
        "--display-image-captions",
        action="store_true",
        default=False,
        help="Display image captions (default:false).",
    )
    parser.add_argument(
        "-d",
        "--deduplicate-image-captions",
        action="store_true",
        default=False,
        help="Deduplicate image captions (default:false).",
    )
    parser.add_argument(
        "-l",
        "--display-link-targets",
        action="store_true",
        default=False,
        help="Display link targets (default:false).",
    )
    parser.add_argument(
        "-a",
        "--display-anchor-urls",
        action="store_true",
        default=False,
        help="Display anchor URLs (default:false).",
    )
    parser.add_argument(
        "-r",
        "--annotation-rules",
        default=None,
        help="Path to an optional JSON file containing rules "
        "for annotating the retrieved text.",
    )
    # The default is a callable rather than a string, so argparse uses it
    # as-is (identity function) and only calls get_postprocessor for values
    # passed on the command line.
    parser.add_argument(
        "-p",
        "--postprocessor",
        type=get_postprocessor,
        default=lambda x: x,
        help="Optional component for postprocessing the "
        "result (html, surface, xml). ",
    )
    parser.add_argument(
        "--indentation",
        default="extended",
        help="How to handle indentation (extended or strict; default: extended).",
    )
    # NOTE(review): the help text says "three spaces" while the default below
    # holds two - confirm which of the two is intended.
    parser.add_argument(
        "--table-cell-separator",
        default="  ",
        help="Separator to use between table cells (default: three spaces).",
    )
    # Without an explicit type, a timeout given on the command line would stay
    # a string, which the HTTP client rejects; only the numeric default worked.
    parser.add_argument(
        "--timeout",
        type=float,
        default=DEFAULT_TIMEOUT,
        help="Request timeout in seconds (default: " f"{DEFAULT_TIMEOUT}).",
    )
    parser.add_argument(
        "-v",
        "--version",
        action="store_true",
        default=False,
        help="display version information",
    )

    # parse command line arguments
    args = parser.parse_args()
    if args.version:
        print(
            "Inscript HTML to text conversion (based on the inscriptis "
            "library version {0})".format(__version__)
        )
        print("Copyright (C)", __copyright__)
        print("\nInscript comes with ABSOLUTELY NO WARRANTY.")
        print(
            "This is free software and you are welcome to redistribute it "
            "under the terms of the {0}.".format(__license__)
        )
        sys.exit(0)
    return args


def get_html_content(
    url: Optional[str], timeout: int, encoding: Optional[str] = None
) -> Optional[str]:
    """Return the HTML content to convert.

    The source is selected in this order: stdin (if ``url`` is empty or
    None), a local file (if ``url`` names an existing file), or a Web
    resource (if ``url`` starts with ``http://`` or ``https://``).

    Args:
        url: path or URL to the HTML content, or None if the content is
            obtained from stdin.
        timeout: timeout in seconds for retrieving the URL.
        encoding: encoding used for decoding the content. If omitted, local
            files are read with ``DEFAULT_ENCODING`` and Web content is
            decoded with the encoding reported for the response.

    Returns:
        The html_content or None, if no content could be extracted.

    """
    if not url:
        return sys.stdin.read()

    path = Path(url)
    if path.is_file():
        with path.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
            return f.read()

    if url.startswith(("http://", "https://")):
        req = requests.get(url, timeout=timeout)
        # ``req.encoding`` is None when the response headers do not allow the
        # charset to be determined; bytes.decode(None) would raise a TypeError,
        # so fall back to the detected encoding and finally to the default.
        return req.content.decode(
            encoding or req.encoding or req.apparent_encoding or DEFAULT_ENCODING
        )

    # neither an existing file nor a supported URL scheme
    return None


def cli() -> None:
    """Run the inscript command line client.

    Reads the HTML input selected on the command line, converts it to plain
    text (or to annotated text, if annotation rules are provided) and writes
    the result to the output file or to stdout.

    Exits with status -1 if no input content could be obtained or if the
    annotation rule file cannot be opened.
    """
    args = parse_command_line()
    if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):
        print("ERROR: Cannot open input file '{0}'.".format(args.input))
        sys.exit(-1)

    annotation_rules = None
    if args.annotation_rules:
        try:
            # Read the JSON rules with an explicit encoding rather than the
            # platform's locale default.
            with Path(args.annotation_rules).open(encoding=DEFAULT_ENCODING) as f:
                annotation_rules = load(f)
        except IOError:
            print(
                "ERROR: Cannot open annotation rule file '{0}'.".format(
                    args.annotation_rules
                )
            )
            sys.exit(-1)

    # every indentation value other than "extended" selects the strict profile
    css_profile = (
        CSS_PROFILES["relaxed"]
        if args.indentation == "extended"
        else CSS_PROFILES["strict"]
    )
    config = ParserConfig(
        css=css_profile,
        display_images=args.display_image_captions,
        deduplicate_captions=args.deduplicate_image_captions,
        display_links=args.display_link_targets,
        display_anchors=args.display_anchor_urls,
        annotation_rules=annotation_rules,
        table_cell_separator=args.table_cell_separator,
    )
    if not annotation_rules:
        output = get_text(html_content, config)
    else:
        output = args.postprocessor(get_annotated_text(html_content, config))
        # Postprocessors that declare ``verbatim = False`` return data that is
        # serialized to JSON; the default identity postprocessor has no such
        # attribute, so its result is passed on unchanged.
        if hasattr(args.postprocessor, "verbatim") and not args.postprocessor.verbatim:
            output = dumps(output)

    if args.output:
        with Path(args.output).open("w", encoding=DEFAULT_ENCODING) as f:
            # ``output`` is not necessarily a string at this point (see the
            # identity postprocessor above); write the same text print() would
            # emit instead of failing with a TypeError.
            f.write(str(output))
    else:
        print(output)