huridocs/uwazi

View on GitHub
app/api/services/informationextraction/getFiles.ts

Summary

Maintainability
A
1 hr
Test Coverage
A
96%
/* eslint-disable camelcase */
import moment from 'moment';

import {
  ExtractedMetadataSchema,
  ObjectIdSchema,
  PropertyTypeSchema,
} from 'shared/types/commonTypes';
import { filesModel } from 'api/files/filesModel';
import { SegmentationType } from 'shared/types/segmentationType';
import entitiesModel from 'api/entities/entitiesModel';
import { SegmentationModel } from 'api/services/pdfsegmentation/segmentationModel';
import { IXSuggestionsModel } from 'api/suggestions/IXSuggestionsModel';
import ixmodels from 'api/services/informationextraction/ixmodels';
import { FileType } from 'shared/types/fileType';
import { objectIndex } from 'shared/data_utils/objectIndex';
import settings from 'api/settings/settings';
import templatesModel from 'api/templates/templates';
import { propertyTypes } from 'shared/propertyTypes';
import languages from 'shared/languages';
import { ensure } from 'shared/tsUtils';

// Maximum number of suggestions fetched per call by getFilesForSuggestions.
const BATCH_SIZE = 50;
// Upper bound on the number of files fetched by getFilesForTraining.
const MAX_TRAINING_FILES_NUMBER = 2000;

// Value of the property being extracted: a plain string, or a list of value/label pairs
// (getFilesForTraining builds the list form for select, multiselect and relationship properties).
type PropertyValue = string | Array<{ value: string; label: string }>;

// Per-file record; mirrors the fields assembled by getFilesWithAggregations.
interface FileWithAggregation {
  _id: ObjectIdSchema;
  segmentation: SegmentationType;
  entity: string;
  language: string;
  extractedMetadata: ExtractedMetadataSchema[];
  propertyType: PropertyTypeSchema;
  propertyValue?: PropertyValue;
}

// Intersected with FileType where file query results are cast, so these fields are typed as
// present (only propertyValue stays optional).
type FileEnforcedNotUndefined = {
  _id: ObjectIdSchema;
  filename: string;
  language: string;
  entity: string;
  propertyType: PropertyTypeSchema;
  propertyValue?: PropertyValue;
};

// Select and multiselect property types.
const selectProperties = new Set<string>([propertyTypes.select, propertyTypes.multiselect]);

// Property types handled without extracted metadata: the select types plus relationship.
const propertiesWithoutExtractedMetadata = new Set<string>(selectProperties).add(
  propertyTypes.relationship
);

// Property types that can hold more than one value.
const multiValuedProperties = new Set<string>([
  propertyTypes.multiselect,
  propertyTypes.relationship,
]);

/** Tells whether `type` is the select or the multiselect property type. */
function propertyTypeIsSelectOrMultiSelect(type: string): boolean {
  return selectProperties.has(type);
}

/** Tells whether `type` is select, multiselect or relationship. */
function propertyTypeIsWithoutExtractedMetadata(type: string): boolean {
  return propertiesWithoutExtractedMetadata.has(type);
}

/** Tells whether `type` is multiselect or relationship. */
function propertyTypeIsMultiValued(type: string): boolean {
  return multiValuedProperties.has(type);
}

/**
 * Pairs every file with the segmentation stored under its filename and keeps only the fields
 * used by the information extraction flow.
 *
 * @param files - File records, possibly already carrying `propertyType` / `propertyValue`.
 * @returns One record per input file, in input order. At runtime `segmentation` is undefined
 * when no segmentation matches the file's filename.
 */
async function getFilesWithAggregations(files: (FileType & FileEnforcedNotUndefined)[]) {
  const filenames = files.map(({ filename }) => filename).filter(filename => filename);

  const segmentations = (await SegmentationModel.get(
    { filename: { $in: filenames } },
    'filename segmentation xmlname'
  )) as (SegmentationType & { filename: string })[];

  // When several segmentations share a filename, the last one wins.
  const segmentationByFilename: Record<string, SegmentationType & { filename: string }> = {};
  segmentations.forEach(segmentation => {
    segmentationByFilename[segmentation.filename] = segmentation;
  });

  return files.map(
    ({ _id, language, extractedMetadata, entity, filename, propertyType, propertyValue }) => ({
      _id,
      language,
      extractedMetadata: extractedMetadata || [],
      entity,
      // Files without a filename are looked up under a placeholder key.
      segmentation: segmentationByFilename[filename || 'no value'],
      propertyType,
      propertyValue,
    })
  );
}

/**
 * Lists the file ids of every segmentation whose status is 'ready'.
 *
 * @returns The `fileID` of each ready segmentation; entries without a `fileID` are dropped.
 */
async function getSegmentedFilesIds() {
  const readySegmentations = await SegmentationModel.get({ status: 'ready' }, 'fileID');
  const fileIds = readySegmentations.map(({ fileID }) => fileID).filter(fileID => fileID);
  return fileIds as ObjectIdSchema[];
}

/**
 * Resolves the type of `property`.
 *
 * 'title' is always reported as 'text'. Any other property is looked up by name in the first
 * template of `templates` (the remaining templates are not consulted).
 *
 * @param templates - Template ids; only the first one is used for the lookup.
 * @param property - Property name, or 'title'.
 * @returns The property type.
 * @throws Error when the property is not found in the first template.
 */
async function getPropertyType(
  templates: ObjectIdSchema[],
  property: string
): Promise<PropertyTypeSchema> {
  // The title needs no template lookup, so skip the database round-trip entirely.
  if (property === 'title') {
    return 'text';
  }

  const template = await templatesModel.getById(templates[0]);
  const type = template?.properties?.find(p => p.name === property)?.type;

  if (!type) {
    throw new Error(`Property "${property}" does not exists`);
  }

  return type;
}

/**
 * Builds the query that selects the candidate files for training.
 *
 * The query matches files of type 'document' that have a filename and a language, whose id is
 * among the ready segmentations (see getSegmentedFilesIds) and whose entity is one of
 * `entitiesFromTrainingTemplatesIds`. Unless the property type is select, multiselect or
 * relationship, files must also have an extracted metadata entry named `property`.
 *
 * @param property - Name of the property being trained.
 * @param propertyType - Type of that property.
 * @param entitiesFromTrainingTemplatesIds - Shared ids of the entities the files may belong to.
 * @returns The query object.
 */
async function fileQuery(
  property: string,
  propertyType: string,
  entitiesFromTrainingTemplatesIds: string[]
) {
  const needsExtractedMetadata = !propertyTypeIsWithoutExtractedMetadata(propertyType);
  const segmentedFilesIds = await getSegmentedFilesIds();

  // Primitive `boolean` is used here; the boxed `Boolean` wrapper type is not meant for values.
  const query: {
    type: string;
    filename: { $exists: boolean };
    language: { $exists: boolean };
    _id: { $in: ObjectIdSchema[] };
    'extractedMetadata.name'?: string;
    entity: { $in: string[] };
  } = {
    type: 'document',
    filename: { $exists: true },
    language: { $exists: true },
    _id: { $in: segmentedFilesIds },
    entity: { $in: entitiesFromTrainingTemplatesIds },
  };

  if (needsExtractedMetadata) {
    query['extractedMetadata.name'] = property;
  }

  return query;
}

/**
 * Builds the query that selects the entities usable for training.
 *
 * Entities must belong to one of `templates`. For select, multiselect and relationship
 * properties the entity must also have a non-empty `metadata.<property>`.
 *
 * @param templates - Template ids the entities may belong to.
 * @param property - Name of the property being trained.
 * @param propertyType - Type of that property.
 * @returns The query object.
 */
function entityForTrainingQuery(
  templates: ObjectIdSchema[],
  property: string,
  propertyType: PropertyTypeSchema
) {
  // Primitive `boolean` is used here; the boxed `Boolean` wrapper type is not meant for values.
  const query: {
    [key: string]: { $in?: ObjectIdSchema[]; $exists?: boolean; $ne?: any[] };
  } = { template: { $in: templates } };

  if (propertyTypeIsWithoutExtractedMetadata(propertyType)) {
    query[`metadata.${property}`] = { $exists: true, $ne: [] };
  }

  return query;
}

/**
 * Collects the files used to train the extraction of `property`, each one paired with the
 * value that property currently has in the file's entity.
 *
 * The entity is matched by the file's entity id plus the file language (falling back to the
 * default language key when the file language cannot be mapped). Files whose entity has no
 * value for the property are returned without `propertyValue`.
 *
 * @param templates - Template ids whose entities provide the training data.
 * @param property - Name of the property being trained.
 * @returns The result of getFilesWithAggregations for the selected files (at most
 * MAX_TRAINING_FILES_NUMBER).
 */
async function getFilesForTraining(templates: ObjectIdSchema[], property: string) {
  const propertyType = await getPropertyType(templates, property);

  const trainingEntities = await entitiesModel.getUnrestricted(
    entityForTrainingQuery(templates, property, propertyType),
    `sharedId metadata.${property} language`
  );
  const entitiesFromTrainingTemplatesIds = trainingEntities
    .filter(({ sharedId }) => sharedId)
    .map(({ sharedId }) => sharedId) as string[];

  const trainingFilesQuery = await fileQuery(
    property,
    propertyType,
    entitiesFromTrainingTemplatesIds
  );
  const trainingFiles = (await filesModel.get(
    trainingFilesQuery,
    'extractedMetadata entity language filename',
    { limit: MAX_TRAINING_FILES_NUMBER }
  )) as (FileType & FileEnforcedNotUndefined)[];

  const entitiesBySharedIdAndLanguage = objectIndex(
    trainingEntities,
    entity => entity.sharedId! + entity.language!,
    objectIndex.NoTransform
  );

  const defaultLanguage = await settings.getDefaultLanguage();
  const defaultLang = defaultLanguage?.key;

  // The property type is the same for every file, so decide the value shape once.
  const usesLabelledValues = propertyTypeIsWithoutExtractedMetadata(propertyType);

  const filesWithEntityValue = trainingFiles.map(file => {
    const fileLang = languages.get(file.language, 'ISO639_1') || defaultLang;
    const values = entitiesBySharedIdAndLanguage[file.entity + fileLang]?.metadata?.[property];

    // No entity, no metadata or an empty value list: the file carries no property value.
    if (!values?.length) {
      return { ...file, propertyType };
    }

    if (usesLabelledValues) {
      const propertyValue = values.map(({ value, label }) => ({
        value: ensure<string>(value),
        label: ensure<string>(label),
      }));
      return { ...file, propertyValue, propertyType };
    }

    // Only the first value is used for the remaining property types.
    const [{ value }] = values;
    let propertyValue: string;
    switch (propertyType) {
      case propertyTypes.date:
        // The value is multiplied by 1000 before being handed to moment for formatting.
        propertyValue = moment((value as number) * 1000).format('YYYY-MM-DD');
        break;
      case propertyTypes.numeric:
        propertyValue = value?.toString() || '';
        break;
      default:
        propertyValue = value as string;
    }

    return { ...file, propertyValue, propertyType };
  });

  return getFilesWithAggregations(filesWithEntityValue);
}

/**
 * Collects the files whose suggestions for `extractorId` are older than the extractor's
 * current model.
 *
 * Up to BATCH_SIZE suggestions dated before the model's creation date, and not flagged with
 * `state.error`, are considered per call.
 *
 * @param extractorId - Id of the extractor.
 * @returns The result of getFilesWithAggregations for the matching document files.
 */
async function getFilesForSuggestions(extractorId: ObjectIdSchema) {
  // NOTE(review): assumes a model exists for this extractor; with none, reading
  // `creationDate` below throws. Confirm callers guarantee it.
  const models = await ixmodels.get({ extractorId });
  const currentModel = models[0];

  const outdatedSuggestions = await IXSuggestionsModel.get(
    {
      extractorId,
      date: { $lt: currentModel.creationDate },
      'state.error': { $ne: true },
    },
    'fileId',
    { limit: BATCH_SIZE }
  );

  const suggestedFileIds = outdatedSuggestions
    .map(({ fileId }) => fileId)
    .filter(fileId => fileId);

  const documents = (await filesModel.get(
    {
      $and: [
        {
          type: 'document',
          filename: { $exists: true },
          language: { $exists: true },
        },
        { _id: { $in: suggestedFileIds } },
      ],
    },
    'extractedMetadata entity language filename'
  )) as (FileType & FileEnforcedNotUndefined)[];

  return getFilesWithAggregations(documents);
}

// Public surface of this module: the two file collectors, the segmented-file id lookup and
// the property-type predicates, plus the FileWithAggregation type.
export {
  getFilesForTraining,
  getFilesForSuggestions,
  getSegmentedFilesIds,
  propertyTypeIsSelectOrMultiSelect,
  propertyTypeIsWithoutExtractedMetadata,
  propertyTypeIsMultiValued,
};
export type { FileWithAggregation };