server/controller/archiveController.js
import winston from 'winston';
import scrape from 'website-scraper';
import path from 'path';
import urlLib from 'url';
import archiver from 'archiver';
import fs from 'fs-extra';
import unzip from 'unzipper';
import { Promise } from 'bluebird';
import express from 'express';
import ResponseUtil from '../../src/util/ResponseUtil';
import BaseProcessor from './BaseProcessor';
import linkDao from '../dao/linkDao';
import archiveDao from '../dao/archiveDao';
import { updateTagHierarchy, createObject } from '../logic/Link';
import { getArchiveDomain, ensureArchiveDomain } from '../logic/Archive';
import { ALL, ARCHIVE } from '../../src/util/TagRegistry';
import { hashSha256Hex } from '../util/HashUtil';
import JwtUtil from '../util/JwtUtil';
import properties from '../util/linkyproperties';
/* eslint-disable no-underscore-dangle */
const zip = (pathToZip, archiveRec) => new Promise((resolve, reject) => {
const output = archiveDao.attachmentInsert(archiveRec._id, 'archive', null, 'application/zip', { rev: archiveRec._rev });
const archive = archiver('zip', {
zlib: { level: 5 },
});
output.on('close', () => {
resolve();
});
archive.on('warning', (err) => {
if (err.code === 'ENOENT') {
winston.loggers.get('application').error(err);
} else {
reject(err);
}
});
archive.on('error', (err) => {
reject(err);
});
archive.pipe(output);
archive.directory(pathToZip, false);
archive.finalize();
});
class CreateArchiveProcessor extends BaseProcessor {
constructor(req, res, next) {
super(req, res, next, true);
}
collectBodyParameters() {
const { linkid } = this.req.params;
this.data = { linkid };
}
async initArchiveRec(userHash, url) {
const archiveRec = {
userid: this.data.userid,
userHash,
createdDate: new Date(),
originalLinkid: this.data.linkid,
url,
type: 'archive',
};
const { id, rev } = await archiveDao.insert(archiveRec);
archiveRec._id = id;
archiveRec._rev = rev;
return archiveRec;
}
static async updateArchiveRec(archiveRec, archiveLinkRecId) {
/* eslint-disable no-param-reassign */
archiveRec.archiveLinkid = archiveLinkRecId;
const { rev } = await archiveDao.insert(archiveRec);
archiveRec._rev = rev;
/* eslint-enable no-param-reassign */
}
async createLinkRec(userHash, archiveRecId, linkRec) {
const filename = CreateArchiveProcessor.getFilename(linkRec.linkUrl, '');
const newRecord = createObject({
linkUrl: `${getArchiveDomain()}/archive/${userHash}/${archiveRecId}/${filename}`,
userid: this.data.userid,
notes: `Archived ${linkRec.linkUrl} on ${new Date()}`,
tags: [ALL, ARCHIVE],
pageTitle: `[ARCHIVE] ${linkRec.pageTitle}`,
faviconUrl: linkRec.faviconUrl,
});
const { id } = await linkDao.insert(newRecord);
newRecord.id = id;
return newRecord;
}
static async scrape(cachePath, url) {
const filename = CreateArchiveProcessor.getFilename(url);
// we need to store all content-types into the file `SCRAPED_MIME_TYPE_MAP`.
// see server/httpRoutes/archive.js at "FILE `SCRAPED_MIME_TYPE_MAP`"
const urlToContentTypeMap = new Map();
const fileNameToContentTypeMap = new Map();
await scrape({
urls: [url],
directory: cachePath,
defaultFilename: filename,
subdirectories: null,
httpResponseHandler: (response) => {
if (response.statusCode === 404) {
return Promise.reject(new Error('status is 404'));
}
const contentType = response.headers['content-type'];
urlToContentTypeMap.set(response.request.uri.href, contentType);
return Promise.resolve(response.body);
},
onResourceSaved: (resource) => {
const mimeType = urlToContentTypeMap.get(resource.url);
fileNameToContentTypeMap.set(resource.filename, mimeType);
},
request: {
headers: {
'User-Agent': properties.server.http.userAgent,
},
},
});
return fs.writeFile(path.join(cachePath, 'SCRAPED_MIME_TYPE_MAP'), JSON.stringify(fileNameToContentTypeMap));
}
static getFilename(url, defaultName = 'index.html') {
const filename = path.basename(urlLib.parse(url).pathname);
if (filename && filename.indexOf('.') !== -1) {
return filename;
}
return defaultName;
}
async process() {
try {
const originalLinkRec = await linkDao.getById(this.data.linkid);
if (originalLinkRec.userid !== this.data.userid) {
throw new Error('Forbidden');
}
const userHash = hashSha256Hex(this.data.userid);
const archiveRec = await this.initArchiveRec(userHash, originalLinkRec.linkUrl);
const cachePath = path.join(properties.server.archive.cachePath, userHash, archiveRec._id);
await CreateArchiveProcessor.scrape(cachePath, originalLinkRec.linkUrl);
const archiveLinkRec = await this.createLinkRec(userHash, archiveRec._id, originalLinkRec);
await CreateArchiveProcessor.updateArchiveRec(archiveRec, archiveLinkRec.id);
updateTagHierarchy(this.data.userid, archiveLinkRec.tags);
zip(cachePath, archiveRec);
this.res.send({ primary: archiveLinkRec });
winston.loggers.get('application').debug('Created archive db: %j', archiveLinkRec);
} catch (err) {
winston.loggers.get('application').error(err);
ResponseUtil.sendErrorResponse500(err, this.res);
}
this.res.end();
}
}
class ReadArchiveController {
constructor(req, res, next) {
this.req = req;
this.res = res;
this.next = next;
}
createParameterFromUrl() {
let extId = this.req.url.substr(1); // remove starting /
this.userhash = extId.substr(0, extId.indexOf('/')); // char seq between two slashes
extId = extId.substr(extId.indexOf('/') + 1); // cut userid
if (extId.indexOf('?') > -1) {
extId = extId.substr(0, extId.indexOf('?'));
}
let endPos = extId.indexOf('/');
if (endPos === -1) {
this.filename = 'index.html';
endPos = extId.length;
} else {
this.filename = extId.substr(endPos + 1);
}
this.archiveid = extId.substr(0, endPos);
this.archivePath = path.join(
properties.server.archive.cachePath, this.userhash, this.archiveid);
}
handleContentTypeForSpecialUrls() {
if (this.req.url.endsWith('.php')) {
const mimeFile = path.join(properties.server.archive.cachePath, this.userhash, this.archiveid, 'SCRAPED_MIME_TYPE_MAP');
const map = new Map(JSON.parse(fs.readFileSync(mimeFile, { encoding: 'utf-8' })));
const contentType = map.get(this.filename);
if (contentType) {
this.req.SAVED_CONTENTTYPE = contentType;
}
}
}
async restoreArchiveFromDB() {
try {
await Promise.all([
archiveDao.getById(this.archiveid),
fs.ensureDir(this.archivePath),
]);
winston.loggers.get('application').debug('unzipping %s ...', this.archiveid);
const targetStream = unzip.Extract({ path: this.archivePath });
targetStream.on('close', () => {
this.handleContentTypeForSpecialUrls();
this.next();
});
archiveDao.attachmentGet(this.archiveid, 'archive').pipe(targetStream);
} catch (err) {
winston.loggers.get('application').warn('Unable to find %s - %s', this.archiveid, err);
this.next();
}
}
async serveFiles() {
try {
await fs.stat(this.archivePath);
this.handleContentTypeForSpecialUrls();
this.next();
} catch (err) {
this.restoreArchiveFromDB();
}
}
async ensureFilesOnCacheAndSecurity() {
if (ensureArchiveDomain(this.req.headers.host)) {
this.res.status(403).send(`Forbidden. Must use ${properties.server.archive.domain}`);
return;
}
if (this.req.query.tmpAuthToken) {
this.res.cookie('tmpAuthToken', this.req.query.tmpAuthToken);
// security: don't keep the token in the url
this.res.redirect(this.req.originalUrl.substr(0, this.req.originalUrl.indexOf('?')));
} else {
try {
const claim = await JwtUtil.verify(this.req.cookies.tmpAuthToken);
this.createParameterFromUrl();
if (this.userhash !== claim.archiveUserHash) {
throw new Error();
}
this.serveFiles();
} catch (err) {
this.res.status(403).send('403 - Forbidden');
}
}
}
}
export default {
createArchive: (req, res, next) => {
const glp = new CreateArchiveProcessor(req, res, next);
glp.doProcess();
},
ensureFilesOnCacheAndSecurity: (req, res, next) => {
const rac = new ReadArchiveController(req, res, next);
rac.ensureFilesOnCacheAndSecurity();
},
// FILE `SCRAPED_MIME_TYPE_MAP`
// .php files contain virtually anything (like html, js or css). so the content-type
// cannot be derived from the file extension. therefore we use the file
// `SCRAPED_MIME_TYPE_MAP` which was saved during web scrape time
serveStatic: express.static(path.join(properties.server.archive.cachePath), {
setHeaders: (res) => {
if (res.req.SAVED_CONTENTTYPE) {
res.setHeader('content-type', res.req.SAVED_CONTENTTYPE);
}
},
}),
};