digitalfabrik/integreat-app

View on GitHub
native/src/utils/ResourceURLFinder.ts

Summary

Maintainability
A
0 mins
Test Coverage
A
94%
import { Parser } from 'htmlparser2'
import { reduce } from 'lodash'
import md5 from 'md5'
import Url from 'url-parse'

import { getExtension } from './helpers'
import { FetchMapType } from './loadResourceCache'

const RESOURCE_EXTENSIONS = ['png', 'jpg', 'jpeg', 'pdf', 'svg']

type InputEntryType = {
  path: string
  content: string
  thumbnail: string | null
}
/**
 * A ResourceURLFinder allows to find resource urls in html source code.
 * It only searches for urls ending on png,jpg,jpeg,pdf in src and href attribute tags of any html element.
 * It also allows to build a fetch map using an array of inputs of the form { path, content, thumbnail }
 * Before calling findResourceUrls or buildFetchMap you need to initialize the finder by calling init.
 * After finishing your work with the finder, you need to call finalize, to clear the  resources
 */

export default class ResourceURLFinder {
  _parser: Parser | null = null
  _foundUrls: Set<string> = new Set<string>()
  _allowedHostNames: Array<string>

  constructor(allowedHostNames: Array<string>) {
    this._allowedHostNames = allowedHostNames
  }

  _onAttributeTagFound = (name: string, value: string): void => {
    if (name === 'href' || name === 'src') {
      try {
        const extension = getExtension(value)

        if (RESOURCE_EXTENSIONS.includes(extension) && this._allowedHostNames.includes(new Url(value).host)) {
          this._foundUrls.add(value)
        }
      } catch (ignored) {
        // invalid urls get ignored
      }
    }
  }

  init(): void {
    this._parser = new Parser(
      {
        onattribute: this._onAttributeTagFound,
      },
      {
        decodeEntities: true,
      },
    )
  }

  finalize(): void {
    if (this._parser === null) {
      throw new Error('Did you forget to call the init method?')
    }
    this._parser.end()
  }

  findResourceUrls = (html: string): Set<string> => {
    if (this._parser === null) {
      throw new Error('Did you forget to call the init method?')
    }
    this._foundUrls.clear()

    this._parser.write(html)

    return this._foundUrls
  }

  buildFetchMap(
    inputs: InputEntryType[],
    buildFilePath: (url: string, urlHash: string) => string,
    currentlyCachedFiles: string[],
  ): FetchMapType {
    return reduce<InputEntryType, FetchMapType>(
      inputs,
      (fetchMap, input: InputEntryType) => {
        const { path } = input
        this.findResourceUrls(input.content)
        const urlSet = this._foundUrls

        if (input.thumbnail) {
          urlSet.add(input.thumbnail)
        }

        const newFetchMap = fetchMap
        newFetchMap[path] = Array.from(urlSet)
          .filter(url => !currentlyCachedFiles.includes(url))
          .map(url => {
            const urlHash = md5(url)
            const filePath = buildFilePath(url, urlHash)
            return {
              url,
              urlHash,
              filePath,
            }
          })
        return newFetchMap
      },
      {},
    )
  }
}