// lib/mechanize/agent.js
import { writeFile } from 'fs/promises'
import { URL } from 'url'
import { Cookie, CookieJar } from 'tough-cookie'
import { newHistory } from './history.js'
import { newPage } from './page.js'
import { USER_AGENTS } from './constants.js'
import { nodeAttr } from './utils.js'
import { encode, decode, labels } from 'windows-1252'
import * as mime from 'mime'
import * as path from 'path'
import fetch from 'node-fetch'
import * as querystring from 'querystring'
export function newAgent() {
// Directory that fetched page bodies are written to. It stays undefined until
// setLogDir() is called, and fetchPage only logs when it holds a truthy value.
let logDir
// Value sent as the User-Agent request header; starts as the Mechanize entry
// of USER_AGENTS and is replaced by setUserAgent().
let userAgent = USER_AGENTS.Mechanize
// The object handed back to the caller. It starts empty; the public methods
// are copied onto it with Object.assign just before newAgent returns.
const agent = {}
// Pages fetched so far: fetchPage pushes each new page onto it and
// requestHeaders reads currentPage() to pick a default referer.
const history = newHistory()
// Cookie store shared by every request made through this agent.
// NOTE(review): rejectPublicSuffixes: false presumably lets cookies be stored
// for public-suffix domains — confirm against the tough-cookie documentation.
const cookieJar = new CookieJar(null, {
rejectPublicSuffixes: false,
})
/**
 * Parses one cookie string and stores the resulting cookie in the jar.
 *
 * @param {object} args
 * @param {string} args.cookieString - raw cookie text (a Set-Cookie header
 *   value or the content of a Set-Cookie meta element).
 * @param {URL|string} args.uri - address of the response that carried the
 *   cookie; it is stringified before being handed to the jar.
 * @returns {Promise<void>} settles once the cookie is stored, or immediately
 *   when the string could not be parsed.
 */
const addResponseCookie = async ({ cookieString, uri }) => {
  const cookie = Cookie.parse(cookieString)
  // Cookie.parse yields nothing for text it cannot parse. Skip such entries so
  // one malformed cookie does not reject the whole page fetch.
  if (!cookie) {
    return
  }
  await setCookie(cookie, uri.toString())
}
/**
 * Stores every cookie a response carries: first those declared through
 * <meta http-equiv="Set-Cookie"> elements in the page head, then those from
 * the response's set-cookie headers. Cookies are stored one after another.
 *
 * @param {object} args
 * @param {object} args.response - fetch response; only headers.raw() is read.
 * @param {URL|string} args.uri - address the response came from.
 * @param {object} args.page - parsed page; only search() is called on it.
 * @returns {Promise<void>}
 */
const addResponseCookies = async ({ response, uri, page }) => {
  const metaCookies = page
    .search('//head/meta[@http-equiv="Set-Cookie"]')
    .map((meta) => nodeAttr(meta, 'content'))
  // concat() also copes with a missing header: the undefined entry it adds is
  // dropped by the emptiness check in the loop.
  const candidates = metaCookies.concat(response.headers.raw()['set-cookie'])
  for (const cookieString of candidates) {
    if (!cookieString) {
      continue
    }
    await addResponseCookie({ cookieString, uri })
  }
}
/**
 * Builds the text of a Cookie request header for the given address.
 *
 * @param {string} uri - address whose cookies are wanted.
 * @returns {Promise<string|undefined>} the jar's cookies for uri joined with
 *   '; ' (an empty string when there are none). If the jar yields no list at
 *   all, that falsy value is returned unchanged.
 */
const getCookieString = async (uri) => {
  const cookies = await cookieJar.getCookies(uri)
  if (!cookies) {
    return cookies
  }
  const pairs = []
  for (const cookie of cookies) {
    pairs.push(cookie.cookieString())
  }
  return pairs.join('; ')
}
/**
 * Assembles the request headers for one fetch.
 *
 * Starts from the agent defaults, overlays options.headers, then adds Referer
 * and Origin (when a referer address is known) and the cookie header (when
 * the jar has cookies for the resolved target address).
 *
 * @param {object} options
 * @param {string|URL} options.uri - target, resolved against the referer.
 * @param {string|object} [options.referer] - referer address, or a page
 *   object whose uri property is used. Defaults to the current history page,
 *   and failing that to a freshly created placeholder page.
 * @param {object} [options.headers] - extra headers; they override the
 *   defaults but not the Referer/Origin/cookie values computed here.
 * @returns {Promise<object>} the header map.
 */
const requestHeaders = async (options) => {
  const headers = {
    'User-Agent': userAgent,
    Accept: '*/*',
    connection: 'keep-alive',
  }

  let referer = options.referer
  if (!referer) {
    referer = history.currentPage()
  }
  if (!referer) {
    referer = newPage({
      response: { 'content-type': 'text/html' },
      agent,
    })
  }

  Object.assign(headers, options.headers)

  // A referer is either a plain address or a page carrying one in `uri`.
  const refererURI =
    typeof referer === 'string' ? referer : referer ? referer.uri : undefined

  if (refererURI) {
    const { protocol, host } = new URL(refererURI)
    headers.Referer = refererURI
    headers.Origin = `${protocol}//${host}`
  }

  const target = new URL(options.uri, refererURI)
  const cookieHeader = await getCookieString(target.toString())
  if (cookieHeader) {
    headers.cookie = cookieHeader
  }
  return headers
}
// Picks the request body out of the caller's options; the value is passed
// through untouched.
const encodeBody = ({ body }) => body
/**
 * Builds the option bag that fetchPage hands to fetch() for one request.
 *
 * @param {object} options
 * @param {string|URL} options.uri - target; resolved against the Referer
 *   header when one was computed, otherwise stringified as given.
 * @param {string} [options.verb] - HTTP method in any case; defaults to GET.
 * @param {*} [options.params] - query parameters appended to the target
 *   address. NOTE(review): the shape produced by form.buildQuery() is not
 *   visible in this file; URLSearchParams accepts a query string, a plain
 *   object, or an iterable of [name, value] pairs — confirm it is one of those.
 * @param {*} [options.body] - request body, passed through encodeBody.
 * @param {string|null} [options.encoding] - body encoding; when omitted it
 *   becomes null, which fetchPage treats as "not specified".
 * @param {boolean} [options.followAllRedirects] - copied through as given.
 * @returns {Promise<object>} headers, body, followAllRedirects, uri, method,
 *   encoding and resolveWithFullResponse.
 */
const requestOptions = async (options) => {
  // NOTE(review): followAllRedirects, encoding and resolveWithFullResponse
  // look like option names of the `request` library; confirm whether
  // node-fetch does anything with them.
  const reqOptions = {
    headers: await requestHeaders(options),
    body: encodeBody(options),
    followAllRedirects: options.followAllRedirects,
  }

  let target
  if (reqOptions.headers.Referer) {
    target = new URL(options.uri, reqOptions.headers.Referer).toString()
  } else {
    target = options.uri.toString()
  }

  // submit() strips the query from a GET form's action and passes the form
  // fields as `params`; without this step they would never reach the server.
  if (options.params != null) {
    const extra = new URLSearchParams(options.params)
    if (extra.toString() !== '') {
      const withQuery = new URL(target)
      for (const [name, value] of extra) {
        withQuery.searchParams.append(name, value)
      }
      target = withQuery.toString()
    }
  }
  reqOptions.uri = target

  reqOptions.method = (options.verb && options.verb.toUpperCase()) || 'GET'
  // Compare against the value undefined, not the string 'undefined'.
  reqOptions.encoding = options.encoding === undefined ? null : options.encoding
  reqOptions.resolveWithFullResponse = true
  return reqOptions
}
/**
 * Writes a fetched body to a timestamped file inside logDir.
 *
 * The file name is "<timestamp>_<base name of uri>", plus an extension
 * derived from the response's Content-Type when mime knows one.
 *
 * @param {object} args
 * @param {*} args.body - text or bytes are written as they are; anything else
 *   (fetchPage passes parsed JSON for JSON responses) is serialised with
 *   JSON.stringify first.
 * @param {string} args.uri - address the body was fetched from.
 * @param {object} args.response - fetch response; only its headers are read.
 * @returns {Promise<string>} path of the file that was written.
 */
const logPage = async ({ body, uri, response }) => {
  // headers.get() yields the header as one string (or null when absent);
  // headers.raw() yields arrays, which have no split().
  const contentType = response.headers.get('content-type')
  const mimeType = contentType
    ? contentType.split(/[ \t]*;[ \t]*/)[0]
    : undefined
  const ext = mimeType ? mime.extension(mimeType) : undefined
  // ISO timestamp with its punctuation removed, e.g. 20240102030405678.
  const timestamp = new Date().toISOString().replaceAll(/[-T:.Z]/g, '')
  const filename =
    path.join(
      logDir,
      timestamp + '_' + path.basename(uri, path.extname(uri))
    ) + (ext ? '.' + ext : '')
  // writeFile only accepts text or binary views.
  const data =
    typeof body === 'string' || ArrayBuffer.isView(body)
      ? body
      : JSON.stringify(body)
  await writeFile(filename, data, {
    encoding: 'utf8',
  })
  return filename
}
/**
 * Performs one request and turns the response into a page.
 *
 * Steps: build the request options, fetch, decode the body, optionally apply
 * the charset fix-ups, parse JSON bodies, create the page, store the
 * response's cookies, push the page onto the history and, when a log
 * directory is set, write the body to disk.
 *
 * @param {object} options - see requestOptions; additionally
 * @param {boolean} [options.fixCharset] - apply the charset fix-ups below.
 * @returns {Promise<object>} the page created by newPage.
 */
const fetchPage = async (options) => {
  const reqOptions = await requestOptions(options)
  const uri = reqOptions.uri
  const response = await fetch(uri, reqOptions)

  let responseBody
  if (reqOptions.encoding === null) {
    // No encoding requested: look at the raw bytes so a byte-order mark can
    // select the decoder. (response.text() has already decoded the body, so
    // byte comparisons against its result can never match.)
    const bytes = Buffer.from(await response.arrayBuffer())
    if (bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
      // UTF-8 with BOM: decode everything after the 3-byte mark.
      responseBody = bytes.toString('utf8', 3)
    } else if (bytes[0] === 0xfe && bytes[1] === 0xff) {
      // UTF-16 big-endian: Buffer only decodes little-endian, so swap a copy.
      // swap16() requires an even byte count; a dangling last byte is dropped.
      const payload = Buffer.from(
        bytes.subarray(2, bytes.length - (bytes.length % 2))
      )
      payload.swap16()
      responseBody = payload.toString('utf16le')
    } else if (bytes[0] === 0xff && bytes[1] === 0xfe) {
      // UTF-16 little-endian: decode everything after the 2-byte mark.
      responseBody = bytes.toString('utf16le', 2)
    } else {
      // No BOM: decode as UTF-8, the same text response.text() produces.
      responseBody = bytes.toString('utf8')
    }
  } else {
    responseBody = await response.text()
  }

  if (options.fixCharset) {
    // NOTE(review): the replacement text drops the "charset=" prefix —
    // confirm that is intended.
    responseBody = responseBody.replace('charset=utf-16le', 'utf-8')
    if (responseBody.match(/charset=windows-1252/)) {
      responseBody = decode(responseBody)
    }
  }

  // Responses are not required to carry a Content-Type header.
  const contentType = response.headers.get('content-type') || ''
  if (contentType.startsWith('application/json')) {
    try {
      responseBody = JSON.parse(responseBody)
    } catch {
      // Deliberate best effort: a body that is not valid JSON stays as text.
    }
  }

  const page = newPage({
    uri,
    response,
    body: responseBody,
    agent,
  })
  await addResponseCookies({
    response,
    uri: new URL(uri),
    page,
  })
  history.push(page)
  if (logDir) {
    await logPage({
      body: responseBody,
      uri,
      response,
    })
  }
  return page
}
// Remembers the directory page bodies are logged to. fetchPage writes a log
// file only while this holds a truthy value.
const setLogDir = (directory) => {
  logDir = directory
}
/**
 * Submits a form, optionally through one of its buttons.
 *
 * A button's action, enctype and method take precedence over the form's.
 * POST forms send form.requestData(enctype) as the body together with
 * Content-Type and Content-Length headers. Every other method is sent as GET,
 * with any query removed from the target address and form.buildQuery()
 * passed along as `params`.
 *
 * @param {object} args
 * @param {object} args.form - form to submit.
 * @param {object} [args.button] - button used to submit; it is added to the
 *   form's query first.
 * @param {object} [args.headers] - extra headers; they override the ones
 *   computed here.
 * @param {boolean} [args.followAllRedirects] - passed through to fetchPage.
 * @returns {Promise<object>} whatever fetchPage returns for the request.
 */
const submit = ({ form, button, headers, followAllRedirects }) => {
  const action = (button && button.action) || form.action || ''
  const enctype = (button && button.enctype) || form.enctype
  const method = (button && button.method) || form.method
  let verb
  let params
  let body
  let uri =
    (action && querystring.unescape(action)) || (form.page && form.page.uri)
  let contentType = enctype
  // Named apart from the requestHeaders() closure of this module.
  let submitHeaders = {}
  if (button) {
    form.addButtonToQuery(button)
  }
  if (method && method.toLowerCase() === 'post') {
    if (contentType === 'multipart/form-data') {
      contentType += '; boundary=' + form.boundary
    }
    verb = 'post'
    body = form.requestData(enctype)
    // Content-Length counts bytes; a string's length counts UTF-16 units and
    // comes out too small as soon as the body holds non-ASCII characters.
    const byteLength =
      typeof body === 'string' || ArrayBuffer.isView(body)
        ? Buffer.byteLength(body)
        : body.length
    submitHeaders = {
      'Content-Type': contentType,
      'Content-Length': byteLength.toString(),
    }
  } else {
    verb = 'get'
    // Drop a query already present on the action; the form fields travel
    // separately as `params`.
    uri = uri.replace(/\?[!#$&-;=?-[\]_a-z~]*$/, '')
    params = form.buildQuery()
  }
  return fetchPage({
    verb,
    uri,
    headers: Object.assign(submitHeaders, headers),
    referer: form.page,
    followAllRedirects,
    params,
    body,
  })
}
/**
 * Switches the User-Agent header to the entry of USER_AGENTS named by
 * agentAlias.
 *
 * @param {string} agentAlias - key of USER_AGENTS.
 * @throws {Error} when USER_AGENTS has no entry of its own under that key;
 *   the current user agent is left untouched in that case. Previously the
 *   user agent silently became undefined.
 */
const setUserAgent = (agentAlias) => {
  const known =
    Object.prototype.hasOwnProperty.call(USER_AGENTS, agentAlias) &&
    USER_AGENTS[agentAlias] != null
  if (!known) {
    throw new Error(`Unknown user agent alias: ${String(agentAlias)}`)
  }
  userAgent = USER_AGENTS[agentAlias]
}
/**
 * Looks up the cookies the jar would send to a given location.
 *
 * @param {object} args
 * @param {string} args.domain - host part of the address to look up.
 * @param {string} [args.path='/'] - path part of the address.
 * @param {boolean} [args.secure=true] - look up an https address when truthy,
 *   an http address otherwise.
 * @returns {Promise<*>} whatever cookieJar.getCookies yields for that address.
 */
const getCookies = async ({ domain, path = '/', secure = true }) => {
  const scheme = secure ? 'https' : 'http'
  const cookies = await cookieJar.getCookies(scheme + '://' + domain + path)
  return cookies
}
// Bound to the jar so it can be called as a plain function — by
// addResponseCookie and through agent.setCookie — while still running with
// cookieJar as `this`.
const setCookie = cookieJar.setCookie.bind(cookieJar)
// Public API of the agent. `get` is fetchPage under another name, and
// `userAgent` is a getter function that reports the current User-Agent value.
Object.assign(agent, {
get: fetchPage,
getCookies,
getCookieString,
setCookie,
setLogDir,
setUserAgent,
submit,
userAgent: () => userAgent,
})
// Frozen so the methods attached above cannot be added to, replaced, or
// removed through the returned object.
return Object.freeze(agent)
}