BookBrainz/bookbrainz-site

View on GitHub
src/common/helpers/search.ts

Summary

Maintainability
D
1 day
Test Coverage
/*
 * Copyright (C) 2016  Sean Burke
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
/* eslint-disable camelcase */

import * as commonUtils from './utils';
import {camelCase, isString, snakeCase, upperFirst} from 'lodash';

import ElasticSearch from '@elastic/elasticsearch';
import type {EntityTypeString} from 'bookbrainz-data/lib/types/entity';
import httpStatus from 'http-status';
import log from 'log';


const _index = 'bookbrainz';
const _bulkIndexSize = 10000;

// In milliseconds
const _retryDelay = 10;
const _maxJitter = 75;

let _client = null;

function sanitizeEntityType(type) {
    if (!type) {
        return null;
    }
    if (Array.isArray(type)) {
        return type.map(snakeCase);
    }
    if (snakeCase(type) === 'all_entities') {
        return ['author', 'edition', 'edition_group', 'series', 'work', 'publisher'];
    }

    return snakeCase(type);
}

type IndexableEntities = EntityTypeString | 'Editor' | 'Collection' | 'Area';
const commonProperties = ['bbid', 'id', 'name', 'type', 'disambiguation'];

/* We don't currently want to index the entire Model in ElasticSearch,
   which contains a lot of fields we don't use as well as some internal props (_pivot props)
   This utility function prepares the Model into a minimal object that will be indexed
*/
export function getDocumentToIndex(entity:any, entityType: IndexableEntities) {
    const additionalProperties = [];
    switch (entityType) {
        case 'Work':
            additionalProperties.push('authors');
            break;
        default:
            break;
    }
    let aliases = entity.related('aliasSet')?.related('aliases')?.toJSON({ignorePivot: true, visible: 'name'});
    if (!aliases) {
        // Some models don't have the same aliasSet structure, i.e. Collection, Editor, Area, …
        const name = entity.get('name');
        aliases = {name};
    }
    const identifiers = entity.related('identifierSet')?.related('identifiers')?.toJSON({ignorePivot: true, visible: 'value'});

    return {
        ...entity.toJSON({ignorePivot: true, visible: commonProperties.concat(additionalProperties)}),
        aliases,
        identifiers: identifiers ?? null
    };
}

async function _fetchEntityModelsForESResults(orm, results) {
    const {Area, Editor, UserCollection} = orm;

    if (!results?.hits) {
        return null;
    }

    const processedResults = await Promise.all(results.hits.map(async (hit) => {
        const entityStub = hit._source;

        // Special cases first
        if (entityStub.type === 'Area') {
            const area = await Area.forge({gid: entityStub.id})
                .fetch({withRelated: ['areaType']});

            const areaJSON = area.toJSON({omitPivot: true});
            const areaParents = await area.parents();
            areaJSON.defaultAlias = {
                name: areaJSON.name
            };
            areaJSON.type = 'Area';
            areaJSON.disambiguation = {
                comment: `${areaJSON.areaType?.name}${areaParents?.length ? ' - ' : ''}${areaParents?.map(parent => parent.name).join(', ')}`
            };
            return areaJSON;
        }
        if (entityStub.type === 'Editor') {
            const editor = await Editor.forge({id: entityStub.id})
                .fetch();

            const editorJSON = editor.toJSON({omitPivot: true});
            editorJSON.defaultAlias = {
                name: editorJSON.name
            };
            editorJSON.type = 'Editor';
            editorJSON.id = entityStub.id;
            return editorJSON;
        }
        if (entityStub.type === 'Collection') {
            const collection = await UserCollection.forge({id: entityStub.id})
                .fetch();

            const collectionJSON = collection.toJSON({omitPivot: true});
            collectionJSON.defaultAlias = {
                name: collectionJSON.name
            };
            collectionJSON.type = 'Collection';
            collectionJSON.id = entityStub.id;
            return collectionJSON;
        }
        // Regular entity
        const model = commonUtils.getEntityModelByType(orm, entityStub.type);
        const entity = await model.forge({bbid: entityStub.bbid})
            .fetch({require: false, withRelated: ['defaultAlias.language', 'disambiguation', 'aliasSet.aliases', 'identifierSet.identifiers',
                'relationshipSet.relationships.source', 'relationshipSet.relationships.target', 'relationshipSet.relationships.type', 'annotation']});
        const entityJSON = entity?.toJSON({omitPivot: true});
        if (entityJSON && entityJSON.relationshipSet) {
            entityJSON.relationshipSet.relationships = await Promise.all(entityJSON.relationshipSet.relationships.map(async (rel) => {
                rel.source = await commonUtils.getEntity(orm, rel.source.bbid, rel.source.type);
                rel.target = await commonUtils.getEntity(orm, rel.target.bbid, rel.target.type);
                return rel;
            }));
        }
        if (entityStub.authors) {
            entityJSON.authors = entityStub.authors;
        }
        return entityJSON;
    })).catch(err => log.error(err));
    return processedResults;
}

// Returns the results of a search translated to entity objects
async function _searchForEntities(orm, dslQuery) {
    try {
        const searchResponse = await _client.search(dslQuery);
        const results = await _fetchEntityModelsForESResults(orm, searchResponse.body.hits);
        return {results, total: searchResponse.body.hits.total};
    }
    catch (error) {
        log.error(error);
    }
    return {results: [], total: 0};
}

export async function _bulkIndexEntities(entities) {
    if (!entities.length) {
        return;
    }

    // Proxy the list of entities to index in case we need to retry
    let entitiesToIndex = entities;

    let operationSucceeded = false;
    while (!operationSucceeded) {
        const bulkOperations = entitiesToIndex.reduce((accumulator, entity) => {
            accumulator.push({
                index: {
                    _id: entity.bbid ?? entity.id,
                    _index,
                    _type: snakeCase(entity.type)
                }
            });
            accumulator.push(entity);

            return accumulator;
        }, []);

        operationSucceeded = true;

        // eslint-disable-next-line no-await-in-loop
        const response = await _client.bulk({
            body: bulkOperations
        }).catch(error => { log.error('error bulk indexing entities for search:', error); });

        /*
         * In case of failed index operations, the promise won't be rejected;
         * instead, we have to inspect the response and respond to any failures
         * individually.
         */
        if (response?.errors === true) {
            entitiesToIndex = response.items.reduce((accumulator, item) => {
                // We currently only handle queue overrun
                if (item.index.status === httpStatus.TOO_MANY_REQUESTS) {
                    const failedEntity = entities.find(
                        (element) => (element.bbid ?? element.id) === item.index._id
                    );

                    accumulator.push(failedEntity);
                }

                return accumulator;
            }, []);


            if (entitiesToIndex.length) {
                operationSucceeded = false;

                const jitter = Math.random() * _maxJitter;
                // eslint-disable-next-line no-await-in-loop
                await new Promise(resolve => setTimeout(resolve, _retryDelay + jitter));
            }
        }
    }
}

async function _processEntityListForBulk(entityList) {
    const indexOperations = [];

    let bulkQueue = [];
    for (const entity of entityList) {
        bulkQueue.push(entity);

        if (bulkQueue.length >= _bulkIndexSize) {
            indexOperations.push(_bulkIndexEntities(bulkQueue));
            bulkQueue = [];
        }
    }
    indexOperations.push(_bulkIndexEntities(bulkQueue));

    await Promise.all(indexOperations);
}

export async function autocomplete(orm, query, type, size = 42) {
    let queryBody = null;

    if (commonUtils.isValidBBID(query)) {
        queryBody = {
            ids: {
                values: [query]
            }
        };
    }
    else {
        queryBody = {
            match: {
                'aliases.name.autocomplete': {
                    minimum_should_match: '80%',
                    query
                }
            }
        };
    }

    const dslQuery = {
        body: {
            query: queryBody,
            size
        },
        index: _index,
        type: sanitizeEntityType(type)
    };


    const searchResponse = await _searchForEntities(orm, dslQuery);
    // Only return the results array, we're not interested in the total number of hits for this endpoint
    return searchResponse.results;
}

// eslint-disable-next-line consistent-return
export function indexEntity(entity) {
    const entityType = entity.get('type');
    const document = getDocumentToIndex(entity, entityType);
    if (entity) {
        return _client.index({
            body: document,
            id: entity.get('bbid') || entity.get('id'),
            index: _index,
            type: snakeCase(entityType)
        }).catch(error => { log.error('error indexing entity for search:', error); });
    }
}

export function deleteEntity(entity) {
    return _client.delete({
        id: entity.bbid ?? entity.id,
        index: _index,
        type: snakeCase(entity.type)
    }).catch(error => { log.error('error deleting entity from index:', error); });
}

export function refreshIndex() {
    return _client.indices.refresh({index: _index}).catch(error => { log.error('error refreshing search index:', error); });
}

export async function generateIndex(orm) {
    const {Area, Author, Edition, EditionGroup, Editor, Publisher, Series, UserCollection, Work} = orm;
    const indexMappings = {
        mappings: {
            _default_: {
                properties: {
                    aliases: {
                        properties: {
                            name: {
                                fields: {
                                    autocomplete: {
                                        analyzer: 'edge',
                                        type: 'text'
                                    },
                                    search: {
                                        analyzer: 'trigrams',
                                        type: 'text'
                                    }
                                },
                                type: 'text'
                            }
                        }
                    },
                    authors: {
                        analyzer: 'trigrams',
                        type: 'text'
                    },
                    disambiguation: {
                        analyzer: 'trigrams',
                        type: 'text'
                    }
                }
            }
        },
        settings: {
            analysis: {
                analyzer: {
                    edge: {
                        filter: [
                            'asciifolding',
                            'lowercase'
                        ],
                        tokenizer: 'edge_ngram_tokenizer',
                        type: 'custom'
                    },
                    trigrams: {
                        filter: [
                            'asciifolding',
                            'lowercase'
                        ],
                        tokenizer: 'trigrams',
                        type: 'custom'
                    }
                },
                tokenizer: {
                    edge_ngram_tokenizer: {
                        max_gram: 10,
                        min_gram: 2,
                        token_chars: [
                            'letter',
                            'digit'
                        ],
                        type: 'edge_ngram'
                    },
                    trigrams: {
                        max_gram: 3,
                        min_gram: 1,
                        type: 'ngram'
                    }
                }
            },
            'index.mapping.ignore_malformed': true
        }
    };

    // First, drop index and recreate
    const mainIndexExistsRequest = await _client.indices.exists({index: _index});
    const mainIndexExists = mainIndexExistsRequest?.body;

    if (mainIndexExists) {
        await _client.indices.delete({index: _index});
    }

    await _client.indices.create(
        {body: indexMappings, index: _index}
    );

    const baseRelations = [
        'annotation',
        'defaultAlias',
        'aliasSet.aliases',
        'identifierSet.identifiers'
    ];

    const entityBehaviors = [
        {
            model: Author,
            relations: [
                'gender',
                'beginArea',
                'endArea'
            ]
        },
        {
            model: Edition,
            relations: [
                'editionGroup',
                'editionFormat',
                'editionStatus'
            ]
        },
        {model: EditionGroup, relations: []},
        {model: Publisher, relations: ['area']},
        {model: Series, relations: ['seriesOrderingType']},
        {model: Work, relations: ['relationshipSet.relationships.type']}
    ];

    // Update the indexed entries for each entity type
    const behaviorPromise = entityBehaviors.map(
        (behavior) => behavior.model.forge()
            .query((qb) => {
                qb.where('master', true);
                qb.whereNotNull('data_id');
            })
            .fetchAll({
                withRelated: baseRelations.concat(behavior.relations)
            })
    );
    const entityLists = await Promise.all(behaviorPromise);
    /* eslint-disable @typescript-eslint/no-unused-vars */
    const entityFetchOrder:EntityTypeString[] = ['Author', 'Edition', 'EditionGroup', 'Publisher', 'Series', 'Work'];
    const [authorsCollection,
        editionCollection,
        editionGroupCollection,
        publisherCollection,
        seriesCollection,
        workCollection] = entityLists;
    /* eslint-enable @typescript-eslint/no-unused-vars */
    const listIndexes = [];

    workCollection.forEach(workEntity => {
        const relationshipSet = workEntity.related('relationshipSet');
        if (relationshipSet) {
            const authorWroteWorkRels = relationshipSet.related('relationships')?.filter(relationshipModel => relationshipModel.get('typeId') === 8);
            const authorNames = [];
            authorWroteWorkRels.forEach(relationshipModel => {
                // Search for the Author in the already fetched BookshelfJS Collection
                const source = authorsCollection.get(relationshipModel.get('sourceBbid'));
                const name = source?.related('defaultAlias')?.get('name');
                if (name) {
                    authorNames.push(name);
                }
            });
            workEntity.set('authors', authorNames);
        }
    });
    // Index all the entities
    entityLists.forEach((entityList, idx) => {
        const entityType:EntityTypeString = entityFetchOrder[idx];
        const listArray = entityList.map(entity => getDocumentToIndex(entity, entityType));
        listIndexes.push(_processEntityListForBulk(listArray));
    });

    await Promise.all(listIndexes);

    const areaCollection = await Area.forge()
        .fetchAll();

    const areas = areaCollection.toJSON({omitPivot: true});

    /** To index names, we use aliases.name and type, which Areas don't have.
   * We massage the area to return a similar format as BB entities
   */
    const processedAreas = areas.map((area) => ({
        aliases: [
            {name: area.name}
        ],
        id: area.gid,
        type: 'Area'
    }));
    await _processEntityListForBulk(processedAreas);

    const editorCollection = await Editor.forge()
        // no bots
        .where('type_id', 1)
        .fetchAll();
    const editors = editorCollection.toJSON({omitPivot: true});

    /** To index names, we use aliases.name and type, which Editors don't have.
   * We massage the editor to return a similar format as BB entities
   */
    const processedEditors = editors.map((editor) => ({
        aliases: [
            {name: editor.name}
        ],
        id: editor.id,
        type: 'Editor'
    }));
    await _processEntityListForBulk(processedEditors);

    const userCollections = await UserCollection.forge().where({public: true})
        .fetchAll();
    const userCollectionsJSON = userCollections.toJSON({omitPivot: true});

    /** To index names, we use aliases.name and type, which UserCollections don't have.
   * We massage the editor to return a similar format as BB entities
   */
    const processedCollections = userCollectionsJSON.map((collection) => ({
        aliases: [
            {name: collection.name}
        ],
        id: collection.id,
        type: 'Collection'
    }));
    await _processEntityListForBulk(processedCollections);

    await refreshIndex();
}

export async function checkIfExists(orm, name, type) {
    const {bookshelf} = orm;
    const bbids:string[] = await new Promise((resolve, reject) => {
        bookshelf.transaction(async (transacting) => {
            try {
                const result = await orm.func.alias.getBBIDsWithMatchingAlias(
                    transacting, snakeCase(type), name
                );
                resolve(result);
            }
            catch (error) {
                reject(error);
            }
        });
    });

    // Follow-up: Fetch all entities in a single transaction from the postgres server
    const baseRelations = [
        'aliasSet.aliases.language',
        'defaultAlias',
        'disambiguation',
        'identifierSet.identifiers.type',
        'relationshipSet.relationships.type',
        'revision.revision'
    ];
    const processedResults = await Promise.all(
        bbids.map(
            bbid => orm.func.entity.getEntity(orm, upperFirst(camelCase(type)), bbid, baseRelations)
        )
    );

    return processedResults;
}

export function searchByName(orm, name, type, size, from) {
    const sanitizedEntityType = sanitizeEntityType(type);
    const dslQuery = {
        body: {
            from,
            query: {
                multi_match: {
                    fields: [
                        'aliases.name^3',
                        'aliases.name.search',
                        'disambiguation',
                        'identifiers.value'
                    ],
                    minimum_should_match: '80%',
                    query: name,
                    type: 'cross_fields'
                }
            },
            size
        },
        index: _index,
        type: sanitizedEntityType
    };
    if (sanitizedEntityType === 'work' || (Array.isArray(sanitizedEntityType) && sanitizedEntityType.includes('work'))) {
        dslQuery.body.query.multi_match.fields.push('authors');
    }

    return _searchForEntities(orm, dslQuery);
}

export async function init(orm, options) {
    if (!isString(options.host)) {
        options.host = 'localhost:9200';
    }

    _client = new ElasticSearch.Client(options);

    // Automatically index on app startup if we haven't already
    try {
        const mainIndexExists = await _client.indices.exists({index: _index});
        if (mainIndexExists) {
            return null;
        }
        return generateIndex(orm);
    }
    catch (error) {
        return null;
    }
}