mrgodhani/raven-reader

View on GitHub
src/main/article.js

Summary

Maintainability
A
3 hrs
Test Coverage
import Mercury from '@postlight/mercury-parser'
import cheerio from 'cheerio'
import fetch from 'node-fetch'

async function decodeFetchResponse (response, isHTML = false) {
  const CHARSET_RE = /charset=([^()<>@,;:\\"/[\]?.=\s]*)/i
  const buffer = await response.arrayBuffer()
  let ctype = response.headers.has('content-type') && response.headers.get('content-type')
  let charset = (ctype && CHARSET_RE.test(ctype)) ? CHARSET_RE.exec(ctype)[1] : undefined
  let content = Buffer.from(buffer, charset);
  const document = cheerio.load(content)
  if (typeof charset === typeof undefined) {
    charset = document.querySelector('meta[charset]') !== null ? document.querySelector('meta[charset]').getAttribute('charset').toLowerCase() : null
    if (!charset) {
      ctype = document.querySelector("meta[http-equiv='Content-Type']") !== null ? document.querySelector("meta[http-equiv='Content-Type']").getAttribute('content') : null
      charset = ctype && CHARSET_RE.test(ctype) && CHARSET_RE.exec(ctype)[1].toLowerCase()
    }
    if (charset && charset !== 'utf-8' && charset !== 'utf8') {
      content = Buffer.from(buffer, charset);
    }
  }
  return content
}

export async function parseArticle (url) {
  const res = await fetch(url)
  const content = await decodeFetchResponse(res)
  const result = await Mercury.parse(url, {
    html: Buffer.from(content, 'utf-8')
  })
  return result
}