ahbeng/NUSMods

View on GitHub
scrapers/cpex-scraper/src/index.ts

Summary

Maintainability
C
1 day
Test Coverage
import axios from 'axios';
import fs from 'fs';
import path from 'path';

import env from '../env.json';

// Term code sent as `term` with every module request.
// NOTE(review): presumably encodes the academic year and semester - confirm against the API docs.
const TERM = '2420';

// Safety net: cpexModules.json is only overwritten when at least this many modules were collated.
// For reference, the last complete run produced 3418 modules.
const threshold = 1500;

// Base URL from env.json with one trailing slash removed, so request paths can be appended as `/...`.
const baseUrl = env.baseUrl.replace(/\/$/, '');

// Value of the response `code` field that is treated as a successful fetch.
const FETCH_OK = '00000';

// Attach the API keys from env.json to every request made through axios.
axios.defaults.headers.common = {
  'X-STUDENT-API': env.studentKey,
  'X-APP-API': env.appKey,
};

/**
 * Formats the current local date and time as a compact, filename-safe stamp.
 *
 * @returns A 14-digit string laid out as YYYYMMDDHHmmss (for four-digit years),
 *          with every component after the year zero-padded to two digits.
 */
function getTimestampForFilename(): string {
  // Zero-pads single-digit values to a width of two.
  const twoDigits = (value: number): string => (value >= 10 ? String(value) : '0' + value);

  const now = new Date();
  const paddedParts = [
    now.getMonth() + 1, // getMonth() is zero-based
    now.getDate(),
    now.getHours(),
    now.getMinutes(),
    now.getSeconds(),
  ].map(twoDigits);

  return [now.getFullYear().toString(), ...paddedParts].join('');
}

/**
 * Envelope type used for the body of both API responses requested by the scraper.
 *
 * `T` is the type of the payload carried in `data`.
 */
type ApiResponse<T> = {
  // Not read by the scraper. NOTE(review): presumably a human-readable status message - confirm.
  msg: string;
  // Status code; the scraper compares it against FETCH_OK for module requests.
  code: string;
  // Not read by the scraper. NOTE(review): presumably a server timestamp - confirm.
  ts: string;
  // The actual payload of the response.
  data: T;
};

/** One entry of the list returned by the `config/get-acadorg` request. */
type GetDepartmentsResponseData = {
  // Identifier sent back as `acadorg` when requesting this department's modules.
  AcademicOrganisation: string;
  // Display name; only used in the scraper's log messages.
  Description: string;
};

// One module record as returned by the `module` request.
// Set everything to optional because we cannot trust the API to be consistent
type Module = {
  // Becomes `title` in the output.
  CourseTitle?: string;
  // Becomes `moduleCredit` in the output.
  ModularCredit?: string;
  // Concatenated with CatalogNumber to form the module code.
  Subject?: string;
  CatalogNumber?: string;
  // Modules whose value is anything other than 'Y' are treated as hidden and skipped.
  PrintCatalog?: string;
  // The scraper looks for the entry whose CourseAttribute is 'MPE' and reads its
  // CourseAttributeValue ('S1', 'S2' or 'S1&S2') to decide the semester flags.
  ModuleAttributes?: {
    CourseAttribute: string;
    CourseAttributeValue: string;
  }[];
};

/** Shape of one entry in the JSON files written by the scraper. */
export type CPExModule = {
  title: string;
  // Subject and catalog number joined together, with no separator.
  moduleCode: string;
  moduleCredit: string;
  // Set to true when the module's 'MPE' attribute value is 'S1' or 'S1&S2'; otherwise left unset.
  inS1CPEx?: boolean;
  // Set to true when the module's 'MPE' attribute value is 'S2' or 'S1&S2'; otherwise left unset.
  inS2CPEx?: boolean;
};

/**
 * Collects every visible module that carries the CPEx ('MPE') course attribute and
 * writes the result to disk.
 *
 * Steps:
 *  1. Fetch the list of departments.
 *  2. For each department, fetch its modules for TERM and keep the ones that have all
 *     required fields, are printed in the catalog, and carry an 'MPE' attribute.
 *  3. Always write a timestamped archive copy under `data/old`; overwrite
 *     `data/cpexModules.json` only when at least `threshold` modules were collated.
 *
 * @throws Error when the departments response does not contain a list of departments.
 *         Request failures raised by axios and file-system errors also propagate.
 */
async function scraper(): Promise<void> {
  const getDepartmentsResponse = await axios.post<ApiResponse<GetDepartmentsResponseData[]>>(
    `${baseUrl}/config/get-acadorg`,
    {
      eff_status: 'A',
      acad_org: '%',
    },
  );

  // The API is not guaranteed to return the declared shape, so verify the payload
  // before using it instead of failing later with an opaque TypeError.
  const departmentsBody = getDepartmentsResponse.data;
  const departmentsData = departmentsBody ? departmentsBody.data : undefined;
  if (!Array.isArray(departmentsData)) {
    const responseCode = departmentsBody ? departmentsBody.code : undefined;
    throw new Error(
      `Departments response did not contain a list of departments (code: ${String(
        responseCode,
      )})`,
    );
  }
  console.log(`Total departments: ${departmentsData.length}`);

  // Keyed by module code so a module returned under several departments is kept once.
  const collatedCPExModulesMap = new Map<string, CPExModule>();

  for (let i = 0; i < departmentsData.length; i++) {
    const department = departmentsData[i];

    console.log(
      `[${i + 1}/${departmentsData.length}] Fetching modules for ${
        department.Description
      } with acadorg: ${department.AcademicOrganisation}...`,
    );

    const getModulesResponse = await axios.post<ApiResponse<Module[]>>(`${baseUrl}/module`, {
      acadorg: department.AcademicOrganisation,
      term: TERM,
    });

    if (getModulesResponse.data.code !== FETCH_OK) {
      console.log(
        `Error fetching modules for ${department.Description} with acadorg: ${department.AcademicOrganisation}`,
      );
      continue;
    }

    const modulesData = getModulesResponse.data.data;

    // A success code with a missing or malformed payload should only cost us this
    // department, not abort the whole run.
    if (!Array.isArray(modulesData)) {
      console.log(
        `Unexpected module data for ${department.Description} with acadorg: ${department.AcademicOrganisation}`,
      );
      continue;
    }

    for (const module of modulesData) {
      // Skip records that are missing any field we rely on.
      if (
        !module.CourseTitle ||
        !module.ModularCredit ||
        !module.Subject ||
        !module.CatalogNumber ||
        !module.ModuleAttributes ||
        !module.PrintCatalog
      ) {
        continue;
      }

      // Filter out hidden modules
      if (module.PrintCatalog !== 'Y') {
        continue;
      }

      const moduleTitle = module.CourseTitle;
      const moduleCode = `${module.Subject}${module.CatalogNumber}`;

      // Filter duplicate modules: the first occurrence of a module code wins.
      if (collatedCPExModulesMap.has(moduleCode)) {
        continue;
      }

      const moduleCredit = module.ModularCredit;
      const cpexAttribute = module.ModuleAttributes.find(
        (attribute) => attribute.CourseAttribute === 'MPE', // this still isn't changed to CPEx
      );

      // Modules without the attribute are not part of CPEx.
      if (!cpexAttribute) {
        continue;
      }

      const cpexModuleToAdd: CPExModule = {
        title: moduleTitle,
        moduleCode,
        moduleCredit,
      };

      switch (cpexAttribute.CourseAttributeValue) {
        case 'S1':
          cpexModuleToAdd.inS1CPEx = true;
          break;
        case 'S2':
          cpexModuleToAdd.inS2CPEx = true;
          break;
        case 'S1&S2':
          cpexModuleToAdd.inS1CPEx = true;
          cpexModuleToAdd.inS2CPEx = true;
          break;
        default:
          // The module is still recorded, just without any semester flag.
          console.log(
            `Unknown CPEx attribute value: ${cpexAttribute.CourseAttributeValue} for ${moduleCode} ${moduleTitle}`,
          );
          break;
      }
      collatedCPExModulesMap.set(moduleCode, cpexModuleToAdd);
    }
  }

  const collatedCPExModules = Array.from(collatedCPExModulesMap.values());
  console.log(`Collated ${collatedCPExModules.length} modules.`);

  const DATA_DIR = path.join(__dirname, '../../data');
  const OLD_DATA_DIR = path.join(DATA_DIR, '/old');
  // Creates DATA_DIR as well when it is missing, and is a no-op when both already exist.
  fs.mkdirSync(OLD_DATA_DIR, { recursive: true });

  // Only replace the live file when the result looks complete enough.
  if (collatedCPExModules.length >= threshold) {
    fs.writeFileSync(path.join(DATA_DIR, 'cpexModules.json'), JSON.stringify(collatedCPExModules));
    console.log(`Wrote ${collatedCPExModules.length} modules to cpexModules.json.`);
  } else {
    console.log(
      `Not writing to cpexModules.json because the number of modules ${collatedCPExModules.length} is less than the threshold of ${threshold}.`,
    );
  }

  // The archive copy is written regardless of the threshold check.
  const archiveFilename = `cpexModules-${getTimestampForFilename()}.json`;
  fs.writeFileSync(path.join(OLD_DATA_DIR, archiveFilename), JSON.stringify(collatedCPExModules));
  console.log(`Wrote ${collatedCPExModules.length} modules to archive ${archiveFilename}.`);
  console.log('Done!');
}

// Script entry point: run the scraper and report any failure.
scraper().catch((error: unknown) => {
  console.error(`Failed to scrape: ${error}`);
  // Signal the failure to the caller (shell, cron, CI) without cutting off pending output.
  process.exitCode = 1;
});