// scrapers/nus-v2/src/services/io/elastic.ts
import { Client } from '@elastic/elasticsearch';
import { Persist } from '../../types/persist';
import { ModuleCode, ModuleInformation } from '../../types/modules';
import config from '../../config';
import logger from '../logger';
/* eslint-disable camelcase, no-underscore-dangle */
// One entry of the `hits.hits` array in the all-modules search response.
// Only the fields listed here are typed.
type ModuleSearchHit = {
  _id: string;
  _score: number;
  _source: {
    moduleCode: ModuleCode;
  };
};

// Partial typing of the body returned by the all-modules search. Anything
// else the server sends back is deliberately left untyped.
// NOTE(review): `total` is declared as a plain number - confirm this matches
// what the targeted Elasticsearch version actually returns.
type ModuleSearchBody = {
  hits: {
    total: number;
    hits: ModuleSearchHit[];
  };
};
// Name of the Elasticsearch index that stores the module documents
const INDEX_NAME = 'modules_v2';

// Custom analysis components used to build the module "level" analyzer. The
// snake_case identifiers are intentional: they are registered in the index
// settings through shorthand properties, so each identifier is also the
// component name Elasticsearch sees.

// Emits every non-zero digit (1-9) found in the input as its own token
const first_digit_tokenizer = { type: 'simple_pattern', pattern: '[1-9]{1}' };

// Keeps only the first token of the stream and discards the rest
const first_token_limit_filter = { type: 'limit', max_token_count: 1 };

// Appends "000" to a numeric token, e.g. "2" => "2000"
const thousandizer_filter = {
  type: 'pattern_replace',
  pattern: '(\\d+)',
  replacement: '$1000',
};
/**
 * Tells whether an error thrown by `client.indices.create` is the
 * "index already exists" response from Elasticsearch.
 *
 * Every property access is guarded, so an unexpected error shape (a
 * non-object value, or a response whose body carries no `error` object)
 * yields `false` instead of raising a TypeError that would hide the
 * original error.
 *
 * @param e - whatever value was caught
 * @returns true only for a `ResponseError` whose error type is
 *   `resource_already_exists_exception`
 */
function isIndexAlreadyExistsError(e: unknown): boolean {
  if (typeof e !== 'object' || e === null) {
    return false;
  }

  const err = e as {
    name?: unknown;
    meta?: { body?: { error?: { type?: unknown } } };
  };
  if (err.name !== 'ResponseError') {
    return false;
  }

  const body = err.meta != null ? err.meta.body : undefined;
  const errorInfo = body != null ? body.error : undefined;
  return errorInfo != null && errorInfo.type === 'resource_already_exists_exception';
}

/**
 * Makes sure the modules index exists and carries the expected mapping.
 *
 * The index is created with the custom `level_analyzer` and a raised
 * `max_result_window`. If the index already exists the creation error is
 * ignored. In both cases the field mapping is (re)applied afterwards.
 *
 * @param client - Elasticsearch client used for both requests
 * @returns the same client, once the index and mapping are in place
 * @throws any error from index creation other than "already exists", and any
 *   error from the mapping update
 */
async function createIndex(client: Client): Promise<Client> {
  try {
    await client.indices.create({
      index: INDEX_NAME,
      body: {
        settings: {
          analysis: {
            analyzer: {
              // An analyzer that produces a level string from a modcode, i.e.
              // "CNS1010SX" => "1000", "CS2030" => "2000", etc.
              level_analyzer: {
                type: 'custom',
                tokenizer: 'first_digit_tokenizer',
                filter: ['first_token_limit_filter', 'thousandizer_filter'],
              },
            },
            tokenizer: { first_digit_tokenizer },
            filter: { first_token_limit_filter, thousandizer_filter },
          },
          index: {
            max_result_window: 20_000, // Default limit is 10k, but we have >11k mods
          },
        },
      },
    });
  } catch (e) {
    // An already existing index is fine - the mapping is applied below either
    // way. Anything else is a real failure and is passed on unchanged.
    if (!isIndexAlreadyExistsError(e)) {
      throw e;
    }
  }

  await client.indices.putMapping({
    index: INDEX_NAME,
    body: {
      properties: {
        workload: { type: 'text' },
        moduleCredit: { type: 'short' },
        moduleCode: {
          type: 'text',
          fields: {
            keyword: {
              type: 'keyword',
              ignore_above: 10,
            },
            level: {
              type: 'text',
              analyzer: 'level_analyzer',
              fielddata: true, // To allow usage in MultiList on the frontend
            },
          },
        },
        semesterData: {
          type: 'nested',
        },
      },
    },
  });

  return client;
}
/* eslint-disable class-methods-use-this */
/**
 * Persist implementation that stores module data in Elasticsearch.
 *
 * Only `deleteModule`, `moduleInfo` and `getModuleCodes` talk to the cluster.
 * Every other Persist method is a no-op that resolves immediately.
 */
export default class ElasticPersist implements Persist {
  // Resolves to the client once createIndex has finished. Every operation
  // awaits it first, so no request is sent before the index is set up.
  private readonly client: Promise<Client>;

  /**
   * Creates the client from `config.elasticConfig` and starts index setup.
   *
   * @throws Error when `config.elasticConfig` is not set
   */
  constructor() {
    if (!config.elasticConfig) {
      throw new Error('elasticConfig in config.json is not set');
    }

    const client = new Client(config.elasticConfig);
    this.client = createIndex(client);
  }

  /**
   * Deletes the document whose id is the given module code.
   *
   * @param moduleCode - id of the document to delete
   */
  deleteModule = async (moduleCode: ModuleCode) => {
    const client = await this.client;
    await client.delete({
      id: moduleCode,
      index: INDEX_NAME,
    });
  };

  /**
   * Indexes the given modules with a single bulk request, using each
   * module code as the document id. Per-item failures are logged rather
   * than thrown.
   *
   * @param moduleInfo - modules to index; an empty array sends no request
   */
  moduleInfo = async (moduleInfo: ModuleInformation[]) => {
    // The bulk body alternates an action line and the document it applies to
    const bulkBody: any[] = []; // eslint-disable-line @typescript-eslint/no-explicit-any

    for (const entry of moduleInfo) {
      bulkBody.push({
        index: { _id: entry.moduleCode },
      });

      if (entry.attributes) {
        // Additionally store the attribute names as a flat list
        bulkBody.push({
          ...entry,
          moduleAttributes: Object.keys(entry.attributes),
        });
      } else {
        bulkBody.push(entry);
      }
    }

    // Awaited before the empty check so index setup failures still surface
    const client = await this.client;

    // Nothing to index: skip the request, because a bulk request without any
    // action is rejected by Elasticsearch
    if (bulkBody.length === 0) {
      return;
    }

    const res = await client.bulk({
      index: INDEX_NAME,
      body: bulkBody,
    });

    const { items } = res.body;

    // Log errors
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const erroredItems = items.filter((i: any) => {
      const { status } = i.index;
      // Filter out status code 2xx
      return status < 200 || status >= 300;
    });

    if (erroredItems.length) {
      logger.error(`Insertion errors encountered`, {
        erroredLength: erroredItems.length,
        totalLength: res.body.items.length,
      });

      for (const item of erroredItems) {
        logger.error('Error importing item', item.index.error);
      }
    }
  };

  facultyDepartments() {
    return Promise.resolve();
  }

  /**
   * Fetches the module code of every document in the index.
   *
   * @returns the `moduleCode` of each search hit
   */
  async getModuleCodes() {
    const client = await this.client;
    const { body } = await client.search({
      index: INDEX_NAME,
      body: {
        query: {
          match_all: {},
        },
        _source: 'moduleCode',
        size: 20_000, // Arbitrarily large number to force ES to return all results. Must be <= index.max_result_window
      },
    });

    return (body as ModuleSearchBody).hits.hits.map((hit) => hit._source.moduleCode);
  }

  // The remaining Persist methods do nothing in this implementation

  module() {
    return Promise.resolve();
  }

  moduleAliases() {
    return Promise.resolve();
  }

  moduleInformation() {
    return Promise.resolve();
  }

  moduleList() {
    return Promise.resolve();
  }

  mpeModules() {
    return Promise.resolve();
  }

  semesterData() {
    return Promise.resolve();
  }

  timetable() {
    return Promise.resolve();
  }

  venueInformation() {
    return Promise.resolve();
  }

  venueList() {
    return Promise.resolve();
  }
}