/*
 * huridocs/uwazi (view on GitHub)
 * app/api/services/ocr/specs/OcrManager.spec.ts
 *
 * Summary
 *   Maintainability: A (0 mins)
 *   Test Coverage: (value not captured)
 */
/* eslint-disable max-statements */
/* eslint-disable max-lines */
import fetchMock from 'fetch-mock';
import { files, storage } from 'api/files';
import * as filesApi from 'api/files/filesystem';
import * as processDocumentApi from 'api/files/processDocument';
import { tenants } from 'api/tenants/tenantContext';
import settings from 'api/settings/settings';
import { testingEnvironment } from 'api/utils/testingEnvironment';
import { Readable } from 'stream';
import request from 'shared/JSONRequest';
import * as sockets from 'api/socketio/setupSockets';
import * as handleError from 'api/utils/handleError';
import { getOcrStatus, OcrManager } from '../OcrManager';
import { OcrModel, OcrStatus } from '../ocrModel';
import { ResultsMessage, TaskManager } from '../../tasksmanager/TaskManager';
import { mockTaskManagerImpl } from '../../tasksmanager/specs/TaskManagerImplementationMocker';
import { fixtures, fixturesFactory } from './fixtures/fixtures';
import { cleanupRecordsOfFiles } from '../ocrRecords';

// Replace the TaskManager module with Jest's automatic mock. The `Mocks` class
// below hands the mocked `TaskManager` constructor to `mockTaskManagerImpl`.
jest.mock('api/services/tasksmanager/TaskManager.ts');

/**
 * Groups every Jest spy and `fetch-mock` route used by the OcrManager specs so
 * they can be installed, cleared and restored together.
 */
class Mocks {
  /** Installed spies, keyed by a readable `module.member` label. */
  jestMocks: Record<string, jest.SpyInstance>;

  /**
   * Value returned by `mockTaskManagerImpl`; the specs call `trigger` with a
   * `ResultsMessage` to simulate a results message arriving.
   */
  taskManagerMock: {
    mock: Partial<TaskManager>;
    trigger: (m: ResultsMessage) => Promise<void>;
  };

  /** Installs all spies, the TaskManager mock and the HTTP routes. */
  constructor() {
    this.jestMocks = {
      // Storage access.
      'storage.fileContents': jest
        .spyOn(storage, 'fileContents')
        .mockResolvedValue(Buffer.from('file_content')),
      'storage.storeFile': jest.spyOn(storage, 'storeFile').mockResolvedValue(),

      // Deterministic name for the stored result file.
      'filesApi.generateFileName': jest
        .spyOn(filesApi, 'generateFileName')
        .mockReturnValue('generatedUwaziFilename'),

      // Upload of the source file.
      'request.uploadFile': jest.spyOn(request, 'uploadFile').mockReturnValue(Promise.resolve()),

      // Document processing resolves to a fixed file record.
      'processDocumentApi.processDocument': jest
        .spyOn(processDocumentApi, 'processDocument')
        .mockResolvedValue({
          _id: fixturesFactory.id('resultFile'),
          originalname: 'ocr_sourceFileName.pdf',
          entity: 'parentEntity',
          filename: 'generatedfilename.pdf',
          mimetype: 'pdf',
          size: 42,
          language: 'en',
          type: 'document',
        }),

      // Fixed clock so `lastUpdated` values can be asserted exactly.
      'date.now': jest.spyOn(Date, 'now').mockReturnValue(1000),

      // Silenced side effects that the specs only inspect.
      'sockets.emitToTenant': jest.spyOn(sockets, 'emitToTenant').mockImplementation(() => {}),
      'handleError.handleError': jest
        .spyOn(handleError, 'handleError')
        .mockImplementation(() => {}),
    };

    this.taskManagerMock = mockTaskManagerImpl(TaskManager as jest.Mock<TaskManager>);

    // Any URL ending in `/info` answers with the supported languages.
    fetchMock.mock('end:/info', '{ "supported_languages": ["en", "es"] }');

    // The result-file URL answers with a streamed body.
    fetchMock.mock(
      'protocol://link/to/result/file',
      //@ts-ignore
      new Response(Readable.from(Buffer.from('resultFileContent')), {
        headers: { 'Content-Type': 'some/mimetype' },
        size: 17,
      })
    );
  }

  /** Restores every spy to its original implementation and resets fetch-mock. */
  release() {
    for (const spy of Object.values(this.jestMocks)) {
      spy.mockRestore();
    }
    fetchMock.restore();
  }

  /** Clears recorded calls on every spy while keeping the mocked behaviour. */
  clearJestMocks() {
    for (const spy of Object.values(this.jestMocks)) {
      spy.mockClear();
    }
  }
}

describe('OcrManager', () => {
  // Suite-wide state; everything is initialised once in `beforeAll`.
  let mocks: Mocks;
  let ocrManager: OcrManager;
  let tenantName: string;
  let mockedMessageFromRedis: ResultsMessage;

  beforeAll(async () => {
    await testingEnvironment.setUp(fixtures);
    tenantName = tenants.current().name;

    // Mocks are installed before the manager is constructed.
    mocks = new Mocks();

    // Successful results message reused (and overridden) by the specs.
    const successMessage: ResultsMessage = {
      tenant: tenantName,
      task: 'ocr_results',
      file_url: 'protocol://link/to/result/file',
      params: { filename: 'sourceFileName.pdf', language: 'en' },
      success: true,
    };
    mockedMessageFromRedis = successMessage;

    ocrManager = new OcrManager();
    ocrManager.start();
  });

  beforeEach(() => {
    // Some specs restore this spy to read real contents, so re-install it each time.
    const fileContentsSpy = jest.spyOn(storage, 'fileContents');
    fileContentsSpy.mockResolvedValue(Buffer.from('file_content'));
    mocks.jestMocks['storage.fileContents'] = fileContentsSpy;
  });

  afterAll(async () => {
    mocks.release();
    await testingEnvironment.tearDown();
  });

  // Happy path: a source file is queued once in `beforeAll`, then the nested
  // specs inspect the side effects. The specs in this block share state and
  // depend on running in order.
  describe('on success', () => {
    beforeAll(async () => {
      const [sourceFile] = await files.get({ _id: fixturesFactory.id('sourceFile') });
      await ocrManager.addToQueue(sourceFile);
      // Remove any stored document that already uses the mocked generated name.
      await storage.removeFile('generatedUwaziFilename', 'document');
    });

    describe('when creating a new task', () => {
      it('should upload the material', () => {
        expect(request.uploadFile).toHaveBeenCalledWith(
          `serviceUrl/upload/${tenantName}`,
          'sourceFileName.pdf',
          Buffer.from('file_content')
        );
      });

      it('should dispatch a job to the TaskManager', () => {
        const mock = jest.spyOn(ocrManager.ocrTaskManager, 'startTask');
        expect(mock).toHaveBeenCalledWith(
          expect.objectContaining({
            tenant: tenantName,
            params: { filename: 'sourceFileName.pdf', language: 'en' },
            task: 'ocr',
          })
        );
      });

      it('should add a record to the DB', async () => {
        const records = await OcrModel.get({});
        expect(records).toHaveLength(4);
        const lastRecord = records[records.length - 1];
        expect(lastRecord).toMatchObject({
          status: OcrStatus.PROCESSING,
          sourceFile: fixturesFactory.id('sourceFile'),
          language: 'eng',
          lastUpdated: 1000,
        });
        expect(lastRecord).not.toHaveProperty('resultFile');
      });
    });

    describe('when there are results', () => {
      beforeAll(async () => {
        // Advance the mocked clock so the update is distinguishable from the creation.
        mocks.jestMocks['date.now'].mockReturnValue(1001);
        // Use the real `storeFile` so the downloaded result can be read back below.
        mocks.jestMocks['storage.storeFile'].mockRestore();
        await filesApi.deleteFile('/tmp/generatedUwaziFilename');
        await mocks.taskManagerMock.trigger(mockedMessageFromRedis);
      });

      it('should download the results', () => {
        expect(fetchMock.lastUrl()).toBe('protocol://link/to/result/file');
      });

      it('should save the file with a generated filename', async () => {
        expect(filesApi.generateFileName).toHaveBeenCalled();
        // Read the real stored contents instead of the mocked 'file_content'.
        mocks.jestMocks['storage.fileContents'].mockRestore();
        expect((await storage.fileContents('generatedUwaziFilename', 'document')).toString()).toBe(
          'resultFileContent'
        );
      });

      it('should run the file processing and tmp file should exist', async () => {
        // A bare `expect(value)` without a matcher asserts nothing; check the value explicitly.
        expect(await filesApi.fileExistsOnPath('/tmp/generatedUwaziFilename')).toBe(true);
        expect(processDocumentApi.processDocument).toHaveBeenCalledWith(
          'parentEntity',
          expect.objectContaining({
            destination: '/tmp',
            filename: 'generatedUwaziFilename',
            language: 'eng',
            mimetype: 'some/mimetype',
            originalname: 'ocr_sourceFileOriginalName.pdf',
            type: 'document',
          }),
          false
        );
      });

      it('should move the original file to the attachments', async () => {
        const [file] = await files.get({ _id: fixturesFactory.id('sourceFile') });
        expect(file.type).toBe('attachment');
      });

      it('should update the job status', async () => {
        const matchingRecords = await OcrModel.get({
          sourceFile: fixturesFactory.id('sourceFile'),
        });
        expect(matchingRecords).toHaveLength(1);
        const [record] = matchingRecords;
        expect(record).toMatchObject({
          status: 'withOCR',
          sourceFile: fixturesFactory.id('sourceFile'),
          language: 'eng',
          resultFile: fixturesFactory.id('resultFile'),
          lastUpdated: 1001,
        });
      });

      it('should emit through the sockets', async () => {
        const [file] = await files.get({ _id: fixturesFactory.id('sourceFile') });
        expect(sockets.emitToTenant).toHaveBeenCalledWith(
          tenantName,
          'ocr:ready',
          file._id.toHexString()
        );
      });
    });

    it('should find the record in the database when requesting the status of a file', async () => {
      const [existingSourceFile] = await files.get({
        _id: fixturesFactory.id('sourceForExistingRecord'),
      });
      const status = await getOcrStatus(existingSourceFile);
      expect(status).toEqual({ status: OcrStatus.READY, lastUpdated: 1000 });
    });
  });

  // Rejections and no-op paths of `addToQueue`, `getOcrStatus` and result handling.
  describe('on validation', () => {
    it('should throw an error when enqueueing if the file is not a document', async () => {
      const [attachmentFile] = await files.get({ _id: fixturesFactory.id('unrelatedAttachment') });

      // `.rejects` fails the test by itself if the promise resolves, so no manual `fail()` is needed.
      await expect(ocrManager.addToQueue(attachmentFile)).rejects.toMatchObject({
        message: 'The file is not a document.',
        code: 400,
      });
    });

    it('should throw an error when file is not a document and does not have ocr record when getting status', async () => {
      const [attachmentFile] = await files.get({ _id: fixturesFactory.id('unrelatedAttachment') });

      await expect(getOcrStatus(attachmentFile)).rejects.toMatchObject({
        message: 'The file is not a document.',
        code: 400,
      });
    });

    it('should throw an error when an ocr model is already in queue', async () => {
      // Reset the source file to a fresh, un-queued document before enqueueing twice.
      await OcrModel.delete({ sourceFile: fixturesFactory.id('sourceFile') });

      await files.save({ _id: fixturesFactory.id('sourceFile'), type: 'document' });

      const [sourceFile] = await files.get({ _id: fixturesFactory.id('sourceFile') });

      await ocrManager.addToQueue(sourceFile);
      await expect(ocrManager.addToQueue(sourceFile)).rejects.toThrow('already in the queue');
    });

    it('should throw an error when settings are missing from the database', async () => {
      const oldSettings = await settings.get();
      await settings.save({ features: {} });

      try {
        const [sourceFile] = await files.get({ _id: fixturesFactory.id('erroringSourceFile') });

        await expect(ocrManager.addToQueue(sourceFile)).rejects.toThrow(
          'Ocr settings are missing from the database'
        );
      } finally {
        // Always put the settings back so a failure here cannot break later specs.
        await settings.save(oldSettings);
      }
    });

    it('should throw an error when language is not supported', async () => {
      const [sourceFile] = await files.get({ _id: fixturesFactory.id('erroringSourceFile') });
      await expect(ocrManager.addToQueue(sourceFile)).rejects.toThrow('Language not supported');
    });

    it('should do nothing when record is missing', async () => {
      await OcrModel.delete({ sourceFile: fixturesFactory.id('sourceFile') });
      mocks.clearJestMocks();
      // `storeFile` was restored by an earlier block; mock it again to observe calls.
      mocks.jestMocks['storage.storeFile'] = jest.spyOn(storage, 'storeFile').mockResolvedValue();

      await mocks.taskManagerMock.trigger(mockedMessageFromRedis);

      const records = await OcrModel.get({});
      expect(records).toHaveLength(3);
      expect(storage.storeFile).not.toHaveBeenCalled();
      expect(processDocumentApi.processDocument).not.toHaveBeenCalled();
    });
  });

  // Failure paths: an unsuccessful service response and an unexpected exception.
  describe('on error', () => {
    it('should record error in db if service response is not a success', async () => {
      mocks.jestMocks['date.now'].mockReturnValue(1002);
      await OcrModel.delete({ sourceFile: fixturesFactory.id('sourceFile') });

      const [sourceFile] = await files.get({ _id: fixturesFactory.id('sourceFile') });
      await ocrManager.addToQueue(sourceFile);
      await mocks.taskManagerMock.trigger({
        ...mockedMessageFromRedis,
        success: false,
        error_message: 'some error message',
      });

      const matchingRecords = await OcrModel.get({
        sourceFile: fixturesFactory.id('sourceFile'),
      });
      expect(matchingRecords).toHaveLength(1);
      const [record] = matchingRecords;
      expect(record).toMatchObject({
        status: 'cannotProcess',
        sourceFile: fixturesFactory.id('sourceFile'),
        language: 'eng',
        lastUpdated: 1002,
      });
      expect(sockets.emitToTenant).toHaveBeenCalledWith(
        tenantName,
        'ocr:error',
        sourceFile._id.toHexString()
      );
    });

    it('should catch an unexpected error while processing the response and log it', async () => {
      const error = new Error('some error');
      // Build the rejection lazily, on the first call, so no rejected promise
      // exists unless `files.get` is actually invoked.
      const filesGetSpy = jest
        .spyOn(files, 'get')
        .mockImplementationOnce(() => Promise.reject(error));

      try {
        await mocks.taskManagerMock.trigger(mockedMessageFromRedis);
        expect(handleError.handleError).toHaveBeenCalledWith(error);
      } finally {
        // Remove the spy (and any unconsumed one-off implementation) so that
        // later specs use the real `files.get`.
        filesGetSpy.mockRestore();
      }
    });
  });

  // Behaviour of `cleanupRecordsOfFiles` for deleted source files and deleted result files.
  describe('on cleanup', () => {
    it('should modify record source to null when source is deleted', async () => {
      const filesToCleanup = await files.get({
        $or: [
          { _id: fixturesFactory.id('sourceToDelete') },
          { _id: fixturesFactory.id('sourceToDelete2') },
        ],
      });
      const fileIds = filesToCleanup.map(f => f._id);
      // Remember which records pointed at the files, then re-read them by _id after cleanup.
      let records = await OcrModel.get({ sourceFile: { $in: fileIds } });
      await cleanupRecordsOfFiles(fileIds);
      records = await OcrModel.get({ _id: { $in: records.map(r => r._id) } });
      expect(records[0].sourceFile).toBeNull();
      expect(records[1].sourceFile).toBeNull();
    });

    it(' should delete record when result is deleted', async () => {
      const filesToCleanup = await files.get({
        $or: [
          { _id: fixturesFactory.id('resultToDelete') },
          { _id: fixturesFactory.id('resultToDelete2') },
        ],
      });
      const fileIds = filesToCleanup.map(f => f._id);
      // These are result files, so the affected records must be looked up through
      // `resultFile`; matching them against `sourceFile` selects nothing and makes
      // the final assertion pass vacuously.
      let records = await OcrModel.get({ resultFile: { $in: fileIds } });
      // NOTE(review): assumes the fixtures link at least one record to these result files — confirm.
      expect(records.length).toBeGreaterThan(0);
      await cleanupRecordsOfFiles(fileIds);
      records = await OcrModel.get({ _id: { $in: records.map(r => r._id) } });
      expect(records).toHaveLength(0);
    });
  });
});