cityssm/get-site-urls

index.ts

import { cleanUrl, getLinks, sleep } from "./utils.js";
import * as config from "./config.js";

import { Semaphore } from "async-mutex";
import axios from "axios";

import type { SearchPages, SearchSettings, Pages } from "./types";
import type { AxiosRequestConfig } from "axios";

import Debug from "debug";
const debug = Debug("get-site-urls");

/**
 * Recursively crawl the queued pages, following links until the maximum depth is reached or the queue is empty
 */
const searchSite = async (
    settings: SearchSettings,
    pages: SearchPages,
    depth: number
): Promise<Pages> => {
    const { siteUrl, maxDepth, username, password } = settings;

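    // Limit how many requests are in flight at once.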
    const lock = new Semaphore(config.maxRequestCount);

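    // Stagger the requests; the delay adapts to measured response times below.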
    let sleepMillis = 0;
    let throttleSleepMillis = config.throttleSleepMillis;

    // Fetch the page data for each URL in the queue
    const links = [...pages.queue].map(async (url) => {
        // Remove the URL from the queue so it is not fetched again
        pages.queue.delete(url);

        sleepMillis += throttleSleepMillis;

        await sleep(sleepMillis);

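        // Wait for a free semaphore slot before making any requests for this URL.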
        /* eslint-disable-next-line @typescript-eslint/no-unused-vars */
        const [_lockValue, lockRelease] = await lock.acquire();

        try {
            const axiosOptions: AxiosRequestConfig = {
                timeout: config.axios_requestTimeoutMillis,
                httpAgent: config.axios_httpAgent
            };

            if (username && password) {
                axiosOptions.auth = {
                    username,
                    password
                };
            }

            // Get the response headers so we can check that the content type is text/html

            const headerStartMillis = Date.now();

            debug("Getting headers: " + url);
            const { headers } = await axios.head(url, axiosOptions);

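            // Blend the previous delay with this response time to keep a running average for throttling.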
            throttleSleepMillis = (throttleSleepMillis + (Date.now() - headerStartMillis)) / 2;

            // If it is an HTML page, fetch the body and search it for links
            if (headers["content-type"]?.includes("text/html")) {
                await sleep(throttleSleepMillis);

                debug("Preparing to get the body: " + url);

                const response = await axios(url, axiosOptions);

                const body = response.data as string;

                // Add to found as it is an HTML page
                pages.found.add(url);

                // Add the unique links to the queue
                const pageLinks = getLinks(body, url, cleanUrl(siteUrl));
                for (const link of pageLinks) {
                    // If the link has not already been found or failed, add it to the queue
                    if (!pages.found.has(link) && !pages.errors.has(link)) {
                        pages.queue.add(link);
                    }
                }
            }
        } catch (error) {
            debug("Error loading URL: " + url);
            debug(error);
            pages.errors.add(url);
        } finally {
            lockRelease();
        }
    });

    await Promise.all(links);

    // Stop when the maximum depth has been reached or the queue is empty.
    // Depth starts at 0, so the initial page does not count towards maxDepth.
    if (depth === maxDepth || pages.queue.size === 0) {
        return {
            pages: [...pages.found],
            errors: [...pages.errors]
        };
    }

    // The queue still has URLs to crawl, so search the next depth level
    return await searchSite(settings, pages, depth + 1);
};

/**
 * Fetch all of the URLs from a website.
 * @param url - The URL to start crawling from.
 * @param maxDepth - How many levels of links to follow beyond the first page.
 * @returns The HTML page URLs that were found, and the URLs that failed to load.
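 * @example
 * // Crawl two levels deep (illustrative URL):
 * const { pages, errors } = await getSiteUrls("https://example.com/", 2);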
 */
export const getSiteUrls = async (url: string, maxDepth = 1): Promise<Pages> => {
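    // A query string that does not directly follow a slash suggests the URL points at a
    // page rather than a directory, so go up one level when deriving the site root.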
    const goUpOneLevel = url.includes("?") && !url.includes("/?");

    const siteUrl = cleanUrl(url, goUpOneLevel);

    const pages: SearchPages = {
        queue: new Set<string>([url]),
        found: new Set<string>([]),
        errors: new Set<string>([])
    };

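    // Use any credentials embedded in the URL (e.g. https://user:pass@host/) for basic auth.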
    const { username, password } = new URL(siteUrl);

    const settings: SearchSettings = {
        siteUrl,
        maxDepth,
        username,
        password
    };

    return await searchSite(settings, pages, 0);
};