src/core/actions/accessibility/crawl.ts
import { sourceBuild } from "@a11ywatch/website-source-builder";
import { emailMessager } from "../../messagers";
import { pubsub } from "../../../database/pubsub";
import { ISSUE_ADDED } from "../../static";
import { collectionUpsert } from "../../utils";
import { getWebsite } from "../../controllers/websites";
import { IssuesController } from "../../controllers/issues";
import { AnalyticsController } from "../../controllers/analytics";
import { PagesController } from "../../../core/controllers/pages";
import { UsersController } from "../../controllers/users";
import { extractPageData } from "../../utils/shapes/extract-page-data";
import { filterRunnerDuplicates } from "../../utils/filters/runners";
import { fetchPageIssues } from "./fetch-issues";
import { ResponseModel } from "../../models/response/types";
import { crawlEmitter, crawlTrackingEmitter } from "../../../event";
import { SUPER_MODE } from "../../../config/config";
import { findPageActionsByPath } from "../../controllers/page-actions/find";
import { validateScanEnabled } from "../../controllers/users/update/scan-attempt";
import { RATE_EXCEEDED_ERROR } from "../../strings";
import { collectionIncrement } from "../../utils/collection-upsert";
import { SCAN_TIMEOUT } from "../../strings/errors";
import { StatusCode } from "../../../web/messages/message";
import { watcherCrawl } from "./watcher_crawl";
import { shapeResponse } from "../../models/response/shape-response";
import { crawlingSet, getKey } from "../../../event/crawl-tracking";
import { getActiveCrawlKey } from "../../../event/names";
import type { User, Website } from "../../../types/types";
export type CrawlConfig = {
userId: number; // user id
url: string; // the target url to crawl
pageInsights?: boolean; // use page insights to get info
  sendSub?: boolean; // publish results via pub-sub
user?: User; // optional pass user
html?: string; // raw html to validate
standard?: string; // accessibility standard
ignore?: string[]; // ignore list of rules
rules?: string[]; // list of rules
  runners?: string[]; // list of runners: axe, htmlcs, and a11y
};
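// Example config (a sketch with hypothetical values; not used elsewhere in this file):
// const config: CrawlConfig = {
//   userId: 122,
//   url: "https://a11ywatch.com",
//   pageInsights: true,
//   standard: "WCAG2AA",
//   runners: ["axe", "htmlcs"],
// };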
// track crawl events between scans and forward results to listeners
const trackerProcess = (
  data: { data: Website } | undefined,
  {
    domain,
    urlMap,
    userId,
    shutdown = false,
  }: { domain: string; urlMap: string; userId: number; shutdown?: boolean },
  blockEvent?: boolean
) => {
crawlTrackingEmitter.emit("crawl-processed", {
user_id: userId,
domain,
pages: [urlMap],
shutdown,
});
  // send data back to the rpc or http stream emitter after `crawl-processed` is emitted
if (!blockEvent && data) {
crawlEmitter.emit(getActiveCrawlKey(domain, userId), data);
}
};
// determine whether page insights (lighthouse) should run for this scan
const getInsightsEnabled = ({
  pageInsights,
  insightsLocked,
  pageSpeedApiKey,
  rootPage,
}: {
  pageInsights: boolean;
  insightsLocked: boolean;
  pageSpeedApiKey: boolean;
  rootPage: boolean;
}) =>
  insightsLocked && !pageSpeedApiKey ? pageInsights && rootPage : pageInsights;
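// e.g. a locked account (free, non-SUPER_MODE) without a PageSpeed key only keeps insights on the root page:
//   getInsightsEnabled({ pageInsights: true, insightsLocked: true, pageSpeedApiKey: false, rootPage: false }) // -> false
//   getInsightsEnabled({ pageInsights: true, insightsLocked: false, pageSpeedApiKey: false, rootPage: false }) // -> true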
/**
* Send to gRPC pagemind request. Stores data upon return into database.
*
* Examples:
*
* await crawlPage({ url: "https://a11ywatch.com" });
* await crawlPage({ url: "https://a11ywatch.com", sendSub: true }); // send pub sub to front-end client
* await crawlPage({ url: "https://a11ywatch.com", userId: 122, pageInsights: true }); // run request with lighthouse
* await crawlPage({ url: "https://a11ywatch.com", userId: 122, pageInsights: true }, true); // send email with config
*/
export const crawlPage = async (
crawlConfig: CrawlConfig,
  sendEmail?: boolean, // determine if an email should be sent based on the results
  blockEvent?: boolean // block event emission to prevent interfering with an active crawl
): Promise<ResponseModel> => {
const {
url: urlMap,
pageInsights = false,
user: usr,
sendSub: sub = true,
html,
standard,
ignore,
rules,
runners,
} = crawlConfig ?? {};
const userId = usr?.id ?? crawlConfig?.userId;
// todo: use prior user params if found
const [userData, userCollection] = await UsersController().getUser({
id: userId,
});
const { domain, pathname } = sourceBuild(urlMap);
  // block the scan from running when the user's scan allowance is exceeded
if (!sendEmail && validateScanEnabled({ user: userData }) === false) {
    trackerProcess(
undefined,
{ domain, urlMap, userId, shutdown: true },
blockEvent
);
return shapeResponse({
data: null,
code: 300,
success: false,
message: RATE_EXCEEDED_ERROR,
});
}
// WEBSITE COLLECTION
const [website, websiteCollection] = await getWebsite({
domain,
userId,
});
const { role: urole, pageSpeedApiKey, scanInfo } = userData ?? {};
const {
standard: websiteStandard,
pageHeaders,
pageInsights: websitePageInsights,
mobile,
ua,
ignore: websiteIgnore,
rules: websiteRules,
runners: websiteRunners,
actionsEnabled,
} = website ?? {};
  const freeAccount = !urole; // users without a role are on the free plan
const rootPage = pathname === "/"; // the url is the base domain index.
const actions =
actionsEnabled &&
(await findPageActionsByPath({
userId,
path: pathname,
domain: website.domain,
}));
const dataSource = await fetchPageIssues({
pageHeaders,
url: urlMap,
userId,
pageInsights: getInsightsEnabled({
pageInsights: pageInsights || websitePageInsights,
insightsLocked: !SUPER_MODE && freeAccount,
pageSpeedApiKey: !!pageSpeedApiKey,
rootPage,
}),
mobile,
ua,
standard: standard || websiteStandard,
actions,
cv: SUPER_MODE || !!urole,
    pageSpeedApiKey,
html,
ignore:
ignore && Array.isArray(ignore) && ignore.length ? ignore : websiteIgnore,
rules: rules && Array.isArray(rules) && rules.length ? rules : websiteRules,
runners: filterRunnerDuplicates(
runners && Array.isArray(runners) && runners.length
? runners
: websiteRunners || []
),
});
let shutdown = false;
  // account for this scan's usage against the user's plan
  if (!SUPER_MODE) {
    const ttime = dataSource?.usage || 0; // uptime consumed by this scan
shutdown =
validateScanEnabled({
user: {
role: urole,
scanInfo: {
usageLimit: scanInfo?.usageLimit,
            totalUptime: ttime + (scanInfo?.totalUptime ?? 0),
},
},
}) === false;
// todo: negate usage from uptime outside plan. One generic method to handle uptime.
setImmediate(async () => {
await collectionIncrement(
{
"scanInfo.totalUptime": ttime, // add new uptime to collection
},
userCollection,
{ id: userId }
); // User
});
}
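  // worked example (assuming validateScanEnabled fails once totalUptime exceeds usageLimit):
  // usageLimit 300, prior totalUptime 290, scan usage (ttime) 15 -> 290 + 15 = 305 > 300, so shutdown = true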
// TODO: SET PAGE OFFLINE DB
if (!dataSource || !dataSource?.webPage?.issuesInfo || shutdown) {
    trackerProcess(
undefined,
{ domain, urlMap, userId, shutdown },
blockEvent
);
return shapeResponse({
data: null,
code: StatusCode.BadRequest,
success: false,
message: SCAN_TIMEOUT,
});
}
const {
issues: pageIssues,
webPage,
issuesInfo,
} = extractPageData(dataSource);
const pageUrl = webPage.url;
  // number of issues found on the page
  const issueCount = pageIssues.issues.length;
  // if the website record exists, keep the related collections in sync.
if (website) {
const analyticsCollection = AnalyticsController().getCollection;
const pagesCollection = PagesController().getCollection;
setImmediate(async () => {
      // if the scan hit the ROOT page, update the Website collection.
if (rootPage) {
await collectionUpsert(
{
domain: webPage.domain,
url: webPage.url,
pageLoadTime: webPage.pageLoadTime,
lastScanDate: webPage.lastScanDate,
online: true,
userId,
},
[websiteCollection, !!webPage],
{
searchProps: { url: pageUrl, userId },
}
);
}
const [issueExist, issuesCollection] = await IssuesController().getIssue(
{ pageUrl, userId, noRetries: true },
true
);
      // track the hostname when the site is configured to scan beyond the root website (tld or subdomains)
      const hostname =
        website.tld || website.subdomains ? webPage.domain : undefined;
await Promise.all([
// analytics
collectionUpsert(
{
pageUrl,
domain: website.domain,
hostname,
userId,
possibleIssuesFixedByCdn: issuesInfo.possibleIssuesFixedByCdn,
totalIssues: issuesInfo.totalIssues,
issuesFixedByCdn: issuesInfo.issuesFixedByCdn,
errorCount: issuesInfo.errorCount,
warningCount: issuesInfo.warningCount,
noticeCount: issuesInfo.noticeCount,
accessScore: issuesInfo.accessScore,
},
[analyticsCollection, "upsert"]
),
// issues
collectionUpsert(
{
issues: pageIssues.issues,
documentTitle: pageIssues.documentTitle,
pageUrl: pageIssues.pageUrl,
domain: website.domain,
hostname,
webdomain: website.domain,
userId,
},
[issuesCollection, issueExist, !issueCount],
{
searchProps: { pageUrl, userId },
}
),
// pages
collectionUpsert(
{
url: webPage.url,
domain: website.domain,
pageLoadTime: webPage.pageLoadTime,
lastScanDate: webPage.lastScanDate,
hostname,
userId,
online: true,
},
[pagesCollection, "upsert"],
{
searchProps: { url: pageUrl, userId },
}
),
]);
      // send an email when error-level issues exist for the page. TODO: remove from this layer.
if (sendEmail && issuesInfo?.errorCount && userData?.emailConfirmed) {
await emailMessager.sendMail({
userId,
data: {
issues: pageIssues.issues,
documentTitle: pageIssues.documentTitle,
pageUrl: pageIssues.pageUrl,
domain: pageIssues.domain,
userId,
issuesInfo,
},
confirmedOnly: true,
sendEmail: true,
});
}
});
}
if (issueCount && sub) {
setImmediate(async () => {
try {
await pubsub.publish(ISSUE_ADDED, {
issueAdded: {
domain: webPage.domain,
url: webPage.url,
pageLoadTime: webPage.pageLoadTime,
lastScanDate: webPage.lastScanDate,
issue: pageIssues.issues,
issuesInfo,
userId,
online: true,
},
});
} catch (_) {
        // silently ignore pub-sub publish errors
}
});
}
const responseData = {
data: {
domain: webPage.domain,
url: webPage.url,
pageLoadTime: webPage.pageLoadTime,
lastScanDate: webPage.lastScanDate,
issues: pageIssues.issues,
issuesInfo,
userId,
online: true,
},
};
  trackerProcess(
responseData,
{
domain,
urlMap,
userId,
shutdown,
},
blockEvent
);
return shapeResponse(responseData);
};
// async generator for large jobs: crawl each page sequentially
async function* entriesFromWebsite(
pages: string[],
userId: number
): AsyncGenerator<[ResponseModel, string]> {
for (const url of pages) {
yield [await crawlPage({ url, userId }, false), url];
}
}
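// usage sketch (hypothetical page list and user id):
//   for await (const [res, url] of entriesFromWebsite(
//     ["https://a11ywatch.com", "https://a11ywatch.com/blog"],
//     122
//   )) {
//     console.log(url, res.success);
//   }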
// async generator for full site-wide scans (only starts crawls that are not already active)
export async function* entriesFromWebsiteSync(
  pages: Website[]
): AsyncGenerator<[boolean | void, string]> {
for (const { url, userId, subdomains, tld, ua, proxy } of pages) {
yield [
!crawlingSet.has(getKey(url, [], userId)) &&
(await watcherCrawl({
url,
subdomains,
tld,
userId,
scan: true,
agent: ua,
proxy,
})),
url,
];
}
}
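// usage sketch: start site-wide crawls for a batch of websites, skipping
// any entry that already has an active crawl (`websites` is a hypothetical Website[]):
//   for await (const [, url] of entriesFromWebsiteSync(websites)) {
//     console.log("crawl started for", url);
//   }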
/*
 * Send requests for the crawl queue and collect the crawl result for each page. TODO: remove from file.
 * @return Promise<Array<ResponseModel["data"] | { url: string; online: boolean }>>
 */
export const crawlMultiSite = async (data: {
  pages?: string[];
  userId?: number;
  user_id?: number;
}) => {
  const { pages = [], userId: uid, user_id } = data;
  const userId = uid ?? user_id;
const responseData = [];
for await (const [scanResult, url] of entriesFromWebsite(pages, userId)) {
responseData.push(scanResult.data ?? { url, online: scanResult.success });
}
return responseData;
};
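// usage sketch (hypothetical payload):
//   const results = await crawlMultiSite({
//     pages: ["https://a11ywatch.com"],
//     userId: 122,
//   });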