rnegron/cc-api

View on GitHub
src/scripts/add-movie-runs.ts

Summary

Maintainability
A
30 mins
Test Coverage
import axios, { AxiosInstance } from 'axios';
import { capitalize, chunk, first, includes, last } from 'lodash';
import * as cheerio from 'cheerio';
import * as signale from 'signale';
import * as sanitize from 'sanitize-html';

import parseDates from './utils/parse-dates';
import { CC_URL, API_GIT_URL } from '../constants';
import dbConnect from '../database';
import Theatre from '../models/theatre';
// import MovieRun from '../models/movie-run';

/**
 * One scraped movie listing, as assembled by `scrapeTheatreMovieRuns`.
 */
interface IMovieDetail {
  /** Movie title, as returned by `getTitle`. */
  title: string;
  /** First word of the format button text, as returned by `trimFormatButtonText`. */
  language: string;
  /** Subtitle language, or `null` when the format text does not mention subtitles. */
  subtitles: string | null;
  /** Showtimes, as returned by `parseDates` (via `getTimes`). */
  times: Date[];
}

/**
 * Removes line-break markup from a fragment of HTML text and trims the result.
 *
 * Matches `<br>`, `</br>` and the self-closing forms `<br/>` / `<br />`,
 * in any letter case. Bare bold tags (`<b>`, `</b>`) are removed as well,
 * exactly as the previous pattern did.
 *
 * @param elem - A line of raw HTML text.
 * @returns `elem` without the matched tags and without leading or trailing whitespace.
 */
function stripTags(elem: string) {
  // Trim AFTER stripping so whitespace that sat between the text and a
  // removed tag (e.g. "7:00pm <br>") does not leak into the result.
  return elem.replace(/<\/?br?\s*\/?>/gi, '').trim();
}

/**
 * Cleans a scraped movie title by passing it through `sanitize-html`
 * (default options) and trimming surrounding whitespace.
 *
 * NOTE(review): the name suggests the goal is dropping embedded `<img>`
 * markup; this relies on the sanitizer's default allow-list — confirm.
 *
 * @param movieTitle - The raw title text.
 * @returns The sanitized, trimmed title.
 */
export function removeImageFromTitle(movieTitle: string) {
  const sanitizedTitle = sanitize(movieTitle);

  return sanitizedTitle.trim();
}

/**
 * Extracts the language name that leads a subtitle description.
 *
 * @param subtitleText - Text fragment expected to start with the subtitle language.
 * @returns The first space-delimited word of the trimmed text, capitalized
 *   (ex: Spanish | English), or an empty string when the text is blank.
 */
export function getSubtitleLanguage(subtitleText: string) {
  const words = subtitleText.trim().split(' ');
  const leadingWord = first(words);

  // A blank input yields an empty first word; there is nothing to capitalize.
  if (!leadingWord) {
    return '';
  }

  return capitalize(leadingWord.trim());
}

/**
 * Drops excluded marker words (currently only "CXC") from a button label.
 *
 * @param buttonText - The raw button label.
 * @returns The remaining single-space-delimited words, re-joined with single
 *   spaces and trimmed.
 */
function parseButtonText(buttonText: string) {
  const excludedWords = ['CXC'];
  const keptWords: string[] = [];

  for (const word of buttonText.split(' ')) {
    if (includes(excludedWords, word)) {
      continue;
    }
    keptWords.push(word);
  }

  return keptWords.join(' ').trim();
}

/**
 * Derives the audio language and optional subtitle language from the text of
 * a movie-format button.
 *
 * Only the part of the text before the first double space is considered.
 * It is run through lodash `capitalize`, so the 'subtitle' and 'with'
 * look-ups below operate on that normalized text.
 *
 * @param buttonText - The raw button label.
 * @returns `language` (first word of the normalized text) and `subtitles`
 *   (the language following the last 'with', or `null` when the text does
 *   not contain 'subtitle').
 */
function trimFormatButtonText(buttonText: string) {
  // Strip words such as "CXC" before interpreting the label.
  const withoutExcludedWords = parseButtonText(buttonText);
  const formatText = capitalize(first(withoutExcludedWords.split('  ')));

  const language = (first(formatText.split(' ')) || '').trim();

  // Only look for a subtitle language when the label mentions subtitles.
  const subtitleText = includes(formatText, 'subtitle')
    ? last(formatText.split('with'))
    : undefined;
  const subtitles = subtitleText ? getSubtitleLanguage(subtitleText) : null;

  return { language, subtitles };
}

/**
 * Splits a block of showtime HTML into cleaned lines and groups them in pairs.
 *
 * @param movieTimeHtml - Raw inner HTML of the showtime container.
 * @returns The lines, each passed through `stripTags`, chunked two at a time;
 *   the final chunk holds a single line when the line count is odd.
 */
export function getMovieTimeData(movieTimeHtml: string) {
  const cleanedLines: string[] = [];

  for (const line of movieTimeHtml.trim().split('\n')) {
    cleanedLines.push(stripTags(line));
  }

  return chunk(cleanedLines, 2);
}

/**
 * Reads the movie title from a listing node.
 *
 * @param $ - The cheerio instance for the loaded page.
 * @param movieData - The listing node to search within.
 * @returns The trimmed text of the `h5 > b` descendants, passed through
 *   `removeImageFromTitle`.
 */
function getTitle($: CheerioStatic, movieData: IMovieDetail) {
  const titleNodes = $(movieData).find('h5 > b');

  return removeImageFromTitle(titleNodes.text().trim());
}

/**
 * Reads the movie format (language and optional subtitles) from a listing node.
 *
 * @param $ - The cheerio instance for the loaded page.
 * @param movieData - The listing node to search within.
 * @returns The `language` and `subtitles` parsed from the text of the node's
 *   `a` descendants.
 */
function getLanguageAndSubtitles($: CheerioStatic, movieData: IMovieDetail) {
  const formatLinks = $(movieData).find('a');
  const format = trimFormatButtonText(formatLinks.text().trim());

  return { language: format.language, subtitles: format.subtitles };
}

/**
 * Reads the showtimes from a listing node.
 *
 * The inner HTML of the node's last `div` is split into (date line, times
 * line) pairs by `getMovieTimeData`; each times line is split on commas.
 * The resulting date -> times map is handed to `parseDates`.
 *
 * @param $ - The cheerio instance for the loaded page.
 * @param movieData - The listing node to search within.
 * @returns Whatever `parseDates` produces for the collected date -> times map.
 */
function getTimes($: CheerioStatic, movieData: IMovieDetail) {
  // `.html()` may yield nothing (no matching div); fall back to empty text.
  const movieTimeHtml = $(movieData).find('div').last().html() || '';

  const movieTimes = getMovieTimeData(movieTimeHtml);

  const movieTimeObj: {
    [date: string]: string[];
  } = {};

  // First element is the date line, second element is the times line.
  for (const movieTime of movieTimes) {
    const date = movieTime[0];
    const times = movieTime[1];

    // An empty or odd-length listing produces a trailing pair without a
    // times line; skip it instead of calling `.split` on `undefined`.
    if (date === undefined || times === undefined) {
      continue;
    }

    movieTimeObj[date] = times.split(',').map((time) => time.trim());
  }

  return parseDates(movieTimeObj);
}

/**
 * Fetches a theatre page and extracts every movie listing found on it.
 *
 * @param theatreSlug - Slug appended to the `theater/` path.
 * @param instance - Axios instance used to perform the request.
 * @returns One entry (title, language, subtitles, times) per element matching
 *   `.column_column .three-fourth`.
 */
export async function scrapeTheatreMovieRuns(
  theatreSlug: string,
  instance: AxiosInstance
) {
  const { data: html } = await instance.get(`theater/${theatreSlug}`);
  const $ = cheerio.load(html);

  // Each matched element holds one movie's title, format and times.
  const movieElements = $('.column_column .three-fourth').get();

  const results: IMovieDetail[] = [];

  for (const movieElement of movieElements) {
    const title = getTitle($, movieElement);
    const { language, subtitles } = getLanguageAndSubtitles($, movieElement);
    const times = getTimes($, movieElement);

    results.push({ title, language, subtitles, times });
  }

  return results;
}

/**
 * Script entry point: connects to the database, loads every stored theatre
 * and scrapes the current movie runs for each one in turn, logging the result.
 */
export default async function run() {
  await dbConnect();

  const headers = {
    'X-CC-API': API_GIT_URL,
    'User-Agent': `CC-API (${API_GIT_URL})`,
  };
  const instance = axios.create({ baseURL: CC_URL, headers });

  // Load all saved theatres.
  const theatres = await Theatre.find({}).exec();

  // Theatres are processed one after another, not in parallel.
  for (const theatre of theatres) {
    // Blank line before each interactive logger, see
    // https://github.com/klaussinani/signale/issues/44#issuecomment-499476792
    console.log();

    const loggerOptions = {
      interactive: true,
      scope: `Theatre Slug: ${theatre.slug}`,
    };
    const theatreLog = new signale.Signale(loggerOptions);

    theatreLog.await('Scraping movie runs...');
    const movieRuns = await scrapeTheatreMovieRuns(theatre.slug, instance);
    theatreLog.success('Obtained runs:', movieRuns);

    // TODO: create the movie run when it does not exist yet,
    // otherwise update the existing one.
  }
}