search/SearchVisitor.ts from RR0/rr0.org

search/SearchVisitor.ts
Summary

Maintainability

2 hrs
Test Coverage

Issues
import { HtmlFileContents } from "ssg-api"
import { HtmlRR0SsgContext } from "../RR0SsgContext"
import fs from "fs"
import { TimeTextBuilder } from "../time/TimeTextBuilder"
import { FileVisitor } from "../RR0ContentStep"

export type PageInfo = {
  title: string
  url: string
  time: string
  html?: string
}

type WordCount = {
  pageIndex: number;
  count: number;
}

export type SearchIndex = {
  pages: PageInfo[],
  words: {
    [key: string]: WordCount[]
  }
}

export type SearchCommandConfig = {
  notIndexedUrls: string[]
  indexWords: boolean
  indexContent?: string
}

/**
 * Builds an index of pages.
 */
export class SearchVisitor implements FileVisitor {

  readonly index: SearchIndex = {
    pages: [],
    words: {}
  }

  protected readonly contentStream: fs.WriteStream | undefined

  constructor(protected config: SearchCommandConfig, protected timeTextBuilder: TimeTextBuilder) {
    const indexContent = this.config.indexContent
    if (indexContent) {
      this.contentStream = fs.createWriteStream(indexContent)
    }
  }

  async contentStepEnd() {
    const contentStream = this.contentStream
    if (contentStream) {
      contentStream.write("\n]")
      contentStream.end()
    }
  }

  async visit(context: HtmlRR0SsgContext): Promise<void> {
    const file = context.file
    const title = file.title
    const outDir = "out/"
    const url = file.name.startsWith(outDir) ? file.name.substring(outDir.length) : file.name
    if (title && !this.config.notIndexedUrls.includes(url)) {
      const indexedPages = this.index.pages
      const titleIndexed = indexedPages.find(page => page.title === title && page.url !== url)
      if (titleIndexed) {
        context.warn(`Title "${title}" with URL ${url} is already indexed with URL ${titleIndexed.url}`)
      }
      const indexContext = context.clone()
      const time = this.timeTextBuilder.build(indexContext, true,
        {year: "numeric", month: "short", day: "numeric"}).toLowerCase()
      indexedPages.push({title, url, time})
    }
    if (this.config.indexWords) {
      this.indexWords(context, file)
    }
    if (this.config.indexContent) {
      this.indexContent(context, file)
    }
  }

  protected getContents(doc: Document) {
    const div = doc.createElement("div")
    div.append(doc.body)
    this.removeTags(div, "script")
    this.removeTags(div, "nav")
    this.removeTags(div, "footer")
    return div.textContent
  }

  protected indexContent(context: HtmlRR0SsgContext, outputFile: HtmlFileContents) {
    const contents = this.getContents(outputFile.document)
    const contentsRecord: PageInfo = {
      title: outputFile.title,
      url: context.file.name,
      time: context.time.toString(),
      html: contents
    }
    const prefix = this.contentStream.bytesWritten === 0 ? "[\n" : ",\n"
    let str = prefix + JSON.stringify(contentsRecord)
    this.contentStream.write(str)
  }

  protected indexWords(context: HtmlRR0SsgContext, outputFile: HtmlFileContents) {
    const pageIndex = this.index.pages.length
    const nonSignificant = context.messages.nonSignificantWords
    const contents = this.getContents(outputFile.document)
    const pageText = contents.toLowerCase()
    const pageWords = pageText.split(/[ \t,.…'’\-" :!?;()\[\]\n]/g)
      .filter(w => w.length > 1)
      .filter(w => !nonSignificant.includes(w))
      .filter(w => {
        const num = parseInt(w, 10)
        return Number.isNaN(num) || num > 1000
      })
    const pageWordsCount = new Map<string, number>()
    for (const pageWord of pageWords) {
      let pageWordCount = pageWordsCount.get(pageWord)
      if (!pageWordCount) {
        pageWordCount = 0
      }
      pageWordsCount.set(pageWord, pageWordCount + 1)
    }
    for (const word of new Set(pageWords)) {
      let existingWordCounts = this.index.words[word]
      if (!existingWordCounts) {
        existingWordCounts = this.index.words[word] = []
      }
      const pageWordCount = pageWordsCount.get(word)
      existingWordCounts.push({pageIndex, count: pageWordCount})
    }
  }

  protected removeTags(div: HTMLDivElement, selector: string) {
    const found = div.querySelectorAll(selector)
    let i = found.length
    while (i--) {
      found[i].parentNode.removeChild(found[i])
    }
  }
}