cityssm/get-site-urls

index.js

Summary: Maintainability A (45 mins) · Test Coverage A (93%)
// Crawl a website and collect the URLs it links to, up to a maximum depth.
import { cleanUrl, getLinks, sleep } from "./utils.js";
import * as config from "./config.js";
import { Semaphore } from "async-mutex";
import axios from "axios";
import Debug from "debug";
const debug = Debug("get-site-urls");
const searchSite = async (settings, pages, depth) => {
    const { siteUrl, maxDepth, username, password } = settings;
    // Cap the number of concurrent requests.
    const lock = new Semaphore(config.maxRequestCount);
    let sleepMillis = 0;
    let throttleSleepMillis = config.throttleSleepMillis;
    const links = [...pages.queue].map(async (url) => {
        pages.queue.delete(url);
        // Stagger request start times so the queue is not hit all at once.
        sleepMillis += throttleSleepMillis;
        await sleep(sleepMillis);
        const [_lockValue, lockRelease] = await lock.acquire();
        try {
            const axiosOptions = {
                timeout: config.axios_requestTimeoutMillis,
                httpAgent: config.axios_httpAgent
            };
            if (username && password) {
                axiosOptions.auth = {
                    username,
                    password
                };
            }
            // Check the content type with a HEAD request before downloading the body.
            const headerStartMillis = Date.now();
            debug("Getting headers: " + url);
            const { headers } = await axios.head(url, axiosOptions);
            // Nudge the throttle delay toward the server's observed response time.
            throttleSleepMillis = (throttleSleepMillis + (Date.now() - headerStartMillis)) / 2;
            // Optional chaining guards against responses without a Content-Type header.
            if (headers["content-type"]?.includes("text/html")) {
                await sleep(throttleSleepMillis);
                debug("Preparing to get the body: " + url);
                const response = await axios(url, axiosOptions);
                const body = response.data;
                pages.found.add(url);
                // Queue any same-site links that have not been seen yet.
                const pageLinks = getLinks(body, url, cleanUrl(siteUrl));
                for (const link of pageLinks) {
                    if (!pages.found.has(link) && !pages.errors.has(link)) {
                        pages.queue.add(link);
                    }
                }
            }
        }
        catch (error) {
            debug("Error loading URL: " + url);
            debug(error);
            pages.errors.add(url);
        }
        finally {
            lockRelease();
        }
    });
    await Promise.all(links);
    // Stop when the maximum depth is reached or there is nothing left to crawl.
    if (depth === maxDepth || pages.queue.size === 0) {
        return {
            pages: [...pages.found],
            errors: [...pages.errors]
        };
    }
    return await searchSite(settings, pages, depth + 1);
};
export const getSiteUrls = async (url, maxDepth = 1) => {
    // If the starting URL carries a query string below the site root,
    // crawl from one level up so sibling pages are found too.
    const goUpOneLevel = url.includes("?") && !url.includes("/?");
    const siteUrl = cleanUrl(url, goUpOneLevel);
    const pages = {
        queue: new Set([url]),
        found: new Set([]),
        errors: new Set([])
    };
    // Pick up any basic-auth credentials embedded in the URL.
    const { username, password } = new URL(siteUrl);
    const settings = {
        siteUrl,
        maxDepth,
        username,
        password
    };
    return await searchSite(settings, pages, 0);
};
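
The crawl throttles itself in two ways: a Semaphore from async-mutex caps how many requests are in flight at once, while an accumulating sleep staggers when each request starts. Below is a stripped-down sketch of that same pattern; the task list, the concurrency limit of 2, and the delay values are hypothetical, not taken from this repository.

// Stand-alone sketch of the throttling pattern used in searchSite.
// The task names, limit, and delays here are illustrative only.
import { Semaphore } from "async-mutex";

const sleep = (millis) => new Promise((resolve) => setTimeout(resolve, millis));

const lock = new Semaphore(2); // at most two tasks run at once
let startDelayMillis = 0;

const tasks = ["a", "b", "c", "d"].map(async (name) => {
    // Each task starts 100 ms later than the previous one.
    startDelayMillis += 100;
    await sleep(startDelayMillis);
    const [, release] = await lock.acquire();
    try {
        console.log(`running ${name}`);
        await sleep(250); // simulated work
    }
    finally {
        release();
    }
});

await Promise.all(tasks);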
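
A minimal usage sketch of the exported function, assuming this file is imported directly as ./index.js; the starting URL and depth are placeholders, not values from the source. The return shape matches what searchSite produces: arrays of found pages and of URLs that failed to load.

// Hypothetical usage: crawl a site two levels deep and report the results.
import { getSiteUrls } from "./index.js";

const { pages, errors } = await getSiteUrls("https://example.com", 2);

console.log(`Found ${pages.length} page(s).`);
for (const page of pages) {
    console.log(`  ${page}`);
}
if (errors.length > 0) {
    console.warn(`Failed to load ${errors.length} URL(s).`);
    for (const errorUrl of errors) {
        console.warn(`  ${errorUrl}`);
    }
}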