modules/feeds/internetstiftelsen.py
"""
For fetching and scanning URLs from Internetstiftelsen
"""
from collections.abc import AsyncIterator

from more_itertools import chunked

from modules.utils.feeds import (
    generate_hostname_expressions,
    hostname_expression_batch_size,
)
from modules.utils.http_requests import get_async
from modules.utils.log import init_logger

logger = init_logger()


async def get_internetstiftelsen_domains() -> AsyncIterator[set[str]]:
"""Download and extract domains from Internetstiftelsen
and yield all listed URLs in batches.
Yields:
Iterator[AsyncIterator[set[str]]]: Batch of URLs as a set
"""
sources: list[str] = [
"https://raw.githubusercontent.com/elliotwutingfeng/InternetstiftelsenDomains/main/se_domains.txt",
"https://raw.githubusercontent.com/elliotwutingfeng/InternetstiftelsenDomains/main/nu_domains.txt",
]
txt_data: dict[str, bytes] = await get_async(
sources, max_concurrent_requests=1, max_retries=2
)
for source in txt_data.keys():
if txt_data[source] != b"{}":
# Extract URLs from TXT file
raw_urls = [url.strip() for url in txt_data[source].decode().splitlines()]
for batch in chunked(raw_urls, hostname_expression_batch_size):
yield generate_hostname_expressions(batch)
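
# Note: each yielded batch is a set of hostname expressions;
# generate_hostname_expressions is assumed to expand raw hostnames into the
# expression format expected by the URL scanner, with
# hostname_expression_batch_size bounding the size of each batch.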


class Internetstiftelsen:
    """
    For fetching and scanning URLs from Internetstiftelsen
    """

    def __init__(self, parser_args: dict, update_time: int):
        self.db_filenames: list[str] = []
        # Each job: (fetch function, update_time, database filename)
        self.jobs: list[tuple] = []
        if "internetstiftelsen" in parser_args["sources"]:
            self.db_filenames = ["internetstiftelsen"]
            if parser_args["fetch"]:
                # Download and add Internetstiftelsen domains to database
                self.jobs = [
                    (get_internetstiftelsen_domains, update_time, self.db_filenames[0])
                ]
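

if __name__ == "__main__":
    # Minimal usage sketch, not part of the feed pipeline (which is assumed to
    # consume `jobs` elsewhere): drain the async generator and log batch sizes.
    # Run from the repository root so the `modules.utils` imports resolve.
    import asyncio

    async def _demo() -> None:
        batch_count: int = 0
        async for batch in get_internetstiftelsen_domains():
            batch_count += 1
            logger.info(f"Batch {batch_count}: {len(batch)} hostname expressions")

    asyncio.run(_demo())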