ezpaarse-project/ezpaarse

View on GitHub
lib/pkbvalidator.js

Summary

Maintainability
C
1 day
Test Coverage
'use strict';

var util         = require('util');
var fs           = require('fs');
var csv          = require('csv');
var jschardet    = require('jschardet');
var EventEmitter = require('events').EventEmitter;

var ridchecker   = require('./rid-syntax-checker.js');
var unauthorizedFields = require('./proxyformats/pkb-unauthorized-fields.json');

// Loose ISSN shape: four digits, a hyphen, three digits and an optional
// trailing check character (digit or x/X). The pattern is not anchored, so a
// match anywhere inside the tested value is enough.
var ISSNPattern  = /[0-9]{4}-[0-9]{3}([0-9Xx])?/;
// Loose ISBN shape, two alternatives (neither is anchored):
//  - separated form: optional "978" + separator (hyphen, en dash or space),
//    one digit, ten characters that are digits or separators, a separator,
//    then a final digit or x/X
//  - compact form: optional "978" followed by ten digits
var ISBNPattern  = /((978[-– ])?[0-9][0-9-– ]{10}[-– ][0-9xX])|((978)?[0-9]{9}[0-9])/;

/**
 * Read a file, parse it and look for errors
 * @param  {String}   file     path to the file
 * @param  {Function} callback (err, nbErrors, nbWarnings)
 *
 * @emit error       program error or bad CSV syntax
 *       pkbError    blocking pkb errors
 *       pkbWarning
 */
exports.validate = function (file, callback) {
  var titleIDs      = {}; // List of title IDs
  var titleIdColumn = false;
  var syntaxError   = false;
  var currentLine   = 1;
  var nbErrors      = 0;
  var nbWarnings    = 0;

  var fileStream    = fs.createReadStream(file);
  var emitter       = new EventEmitter();
  var detector      = new jschardet.UniversalDetector();
  var data;
  detector.reset();

  // Store errors if a callback is provided
  if (typeof callback === 'function') {
    var pkbErrors    = [];
    var pkbWarnings  = [];

    emitter
      .on('error', function (error)       { callback(error); })
      .on('syntaxError', function (error) { callback(error); })
      .on('pkbError', function (msg, line)     { pkbErrors.push({ message: msg, line: line }); })
      .on('pkbWarning', function (msg, line)   { pkbWarnings.push({ message: msg, line: line }); })
      .on('end', function () { callback(null, pkbErrors, pkbWarnings); });
  }

  var parser = csv.parse({
    delimiter: '\t',
    relax_column_count : true,
    columns: function (headers) {

      // check that title_id is present
      if (headers.indexOf('title_id') !== -1) {
        titleIdColumn = true;
      } else {
        nbErrors++;

        emitter.emit('pkbError', 'title_id header is missing', 1);
      }

      // look for unauthorized headers
      var erroredHeaders = [];
      headers.forEach(function (header) {
        if (unauthorizedFields.indexOf(header) !== -1) {
          erroredHeaders.push(header);
        }
      });

      if (erroredHeaders.length) {
        nbErrors++;

        emitter.emit('pkbError',
          util.format('unauthorized headers : %s', erroredHeaders.join(',')), 1);
      }

      return headers;
    }});

  parser.on('error', function (err) {
    syntaxError = true;
    fileStream.close(function () {
      emitter.emit('syntaxError', err);
    });
  }).on('readable', function () {
    var record = parser.read();
    if (!record) { return; }

    currentLine++;

    if (!record.title_id) {
      if (titleIdColumn) {
        nbErrors++;

        // If the record has no title_id although the column exists
        emitter.emit('pkbError', 'title_id is missing', currentLine);
      }
    } else {
      var titleID = record.title_id;

      if (titleIDs[titleID]) {
        nbWarnings++;

        emitter.emit('pkbWarning', util.format('duplicated title_id with line %d (%s)',
          titleIDs[titleID], titleID), currentLine);
      } else {
        titleIDs[titleID] = currentLine;
      }
    }

    for (var field in record) {
      var value = record[field];

      if (value === '') { continue; }                  // no control if empty
      if (value.substr(0, 1) === '#') { continue; } // no control when starting with #

      if (field === 'print_identifier' || field === 'online_identifier') {
        /* check each issn is valid */
        var ridValue = value;

        if (ISSNPattern.test(ridValue)) { // check for ISSN
          var ISSN = ridchecker.getISSN(value);

          if (ISSN.checkable && !ISSN.isValid) {
            nbWarnings++;

            emitter.emit('pkbWarning',
              util.format('invalid value (%s = %s), controled value is %s',
                field, value, ISSN.calculatedKey), currentLine);
          }
        } else if (ISBNPattern.test(ridValue)) { // check for ISBN
          var ISBN = ridchecker.getISBN(value);

          if (ISBN.checkable &&  !ISBN.isValid) {
            nbWarnings++;

            emitter.emit('pkbWarning',
              util.format('invalid value (%s = %s), controled value is %s',
                field, value, ISBN.calculatedKey), currentLine);
          }
        } else if (field === 'online_identifier' && value == 'N/A') {
          // pass controls with this value
          value = '';
        } else {
          nbWarnings++;

          emitter.emit('pkbWarning',
            util.format('invalid value (%s = %s), correct it or use # to remove controls',
              field, value), currentLine);
        }
      }
    }
  }).on('finish', function () {
    if (!syntaxError) { emitter.emit('end', nbErrors, nbWarnings); }
  });

  fileStream.on('error', function (err) {
    emitter.emit('error', err);
  });

  fileStream.on('readable', function () {
    data = fileStream.read();
    if (data === null) { return; }

    for (var i = 0, l = data.length; i < l; ++i) {
      detector.feed(String.fromCharCode(data[i]));
    }
    parser.write(data);
  });

  fileStream.on('end', function () {
    // Check charset result
    detector.close();

    if (!detector.result.encoding) {
      emitter.emit('pkbWarning', 'could not determine file encoding');
    } else {
      var encoding   = detector.result.encoding.toLowerCase();
      var confidence = detector.result.confidence;

      if (encoding === 'windows-1252' && confidence > 0.93) {
        nbWarnings++;

        var err = 'charset detected as %s with a confidence of %d%. '
          + 'Check that the file does not contain special characters';
        emitter.emit('pkbWarning', util.format(err, encoding, Math.round(confidence * 100)));
      } else if (encoding !== 'utf-8' && encoding !== 'ascii') {
        nbErrors++;

        emitter.emit('pkbError', util.format('charset detected as %s (should be UTF-8)', encoding));
      }
    }

    parser.end();
  });

  return emitter;
};