NatLibFi/marc-record-validators-melinda

View on GitHub
src/removeDuplicateDataFields.js

Summary

Maintainability
D
1 day
Test Coverage
import createDebugLogger from 'debug';
import {fieldsToString, fieldToString, nvdebug} from './utils';
import {fieldHasValidSubfield6, fieldsGetOccurrenceNumbers, fieldsToNormalizedString, fieldToNormalizedString, get6s} from './subfield6Utils';
import {add8s, fieldHasLinkingNumber, fieldHasValidSubfield8, fieldsGetAllSubfield8LinkingNumbers, getSubfield8LinkingNumber, recordGetAllSubfield8LinkingNumbers, recordGetFieldsWithSubfield8LinkingNumber} from './subfield8Utils';

const LINK_ROOT = 4;
const LINKED_AND_PROCESSED = 2;
const LINKED_NOT_PROCESSED = 1;


// Relocated from melinda-marc-record-merge-reducers (and renamed)

const debug = createDebugLogger('@natlibfi/marc-record-validators-melinda:removeDuplicateDataFields');

export default function () {
  return {
    description: 'Remove duplicate data fields. Certain exceptions apply, mainly too complited chained fields',
    validate, fix
  };

  function fix(record) {
    nvdebug('Remove duplicate data fields');
    const res = {message: [], fix: [], valid: true};
    removeDuplicateDatafields(record, true);
    // This can not really fail...
    return res;
  }

  function validate(record) {
    // Check max, and check number of different indexes
    nvdebug('Validate record: duplicate data fields cause (t)error', debug);

    const duplicates = removeDuplicateDatafields(record, false);

    //const orphanedFields = getOrphanedFields(fieldsContainingSubfield6);

    const res = {message: duplicates};

    /*
    if (orphanedFields.length > 0) { // eslint-disable-line functional/no-conditional-statements
      res.message = [`${orphanedFields.length} orphaned occurrence number field(s) detected`]; // eslint-disable-line functional/immutable-data
    }
    */
    res.valid = res.message.length < 1; // eslint-disable-line functional/immutable-data
    return res;
  }
}


/*
function numberOfLinkageSubfields(field) {
  nvdebug(`N of Linkage Subs(${fieldToString(field)})`);
  const subfields = field.subfields.filter(sf => sf.code === '6' || sf.code === '8');
  return subfields.length;
}
*/

function removeLinkNotes(record) {
  record.fields.forEach(f => delete f.linkNote); // eslint-disable-line functional/immutable-data
}


function newGetAllLinkedFields(field, record, useSixes = true, useEights = true) {
  removeLinkNotes(record); // should be clear, but let's play safe

  /* eslint-disable */
  field.linkNote = LINK_ROOT;

  let currField = field;

  // Loop until all linked fields have been processed:
  while (currField !== undefined) {
    if (useSixes) {
      const related6s = get6s(currField, record.fields)
      related6s.forEach(f => linkField(f));
    }
    if (useEights) {
      const related8s = add8s([currField], record);
      related8s.forEach(f => linkField(f));
    }
    if (currField.linkNote !== LINK_ROOT) {
      currField.linkNote = LINKED_AND_PROCESSED;
    }
    currField = record.fields.find(f => f.linkNote === LINKED_NOT_PROCESSED);
  }

  // Collect relevant fields:
  const linkedFields = record.fields.filter(f => f.linkNote);

  removeLinkNotes(record);

  return linkedFields;

  function linkField(f) {
    if (!f.linkNote) {
      f.linkNote = LINKED_NOT_PROCESSED;
    }
  }


  /* eslint-enable */
}


function recordRemoveFieldOrSubfield8(record, field, currLinkingNumber) {
  const eights = field.subfields.filter(sf => sf.code === '8');
  if (eights.length < 2) {
    record.removeField(field);
    return;
  }
  const subfields = field.subfields.filter(sf => getSubfield8LinkingNumber(sf) === currLinkingNumber);
  subfields.forEach(sf => record.removeSubfield(sf, field));
}

function newRecordRemoveFieldOrSubfield8(record, field, currLinkingNumber, fix) {
  const eights = field.subfields.filter(sf => sf.code === '8');
  if (eights.length < 2) {
    field.deleted = 1; // eslint-disable-line functional/immutable-data
    return;
  }
  const subfields = field.subfields.filter(sf => getSubfield8LinkingNumber(sf) === currLinkingNumber);
  subfields.forEach(sf => {
    field.modified = 1; // eslint-disable-line functional/immutable-data
    if (fix) { // eslint-disable-line functional/no-conditional-statements
      record.removeSubfield(sf, field);
    }
  });
}


export function removeDuplicateSubfield8Chains(record, fix = true) {

  // Seen $8 subsfields in various fields:
  // 161 700
  // 17 710
  // 11 110
  // 8 730
  // 1 100
  // Given these stats, there's no need to check for 1XX-vs-7XX removals

  /* eslint-disable */
  let seen = {};

  let removables = []; // for validation

  nvdebug("CHAIN-8");
  const seenLinkingNumbers = recordGetAllSubfield8LinkingNumbers(record);
  if (seenLinkingNumbers.length === 0) {
    return removables;
  }

  nvdebug(`seen linking numbers ($8): ${seenLinkingNumbers.join(', ')}`, debug);

  seenLinkingNumbers.forEach(currLinkingNumber => {
    const linkedFields = recordGetFieldsWithSubfield8LinkingNumber(record, currLinkingNumber) //getFieldsWithSubfield8Index(base, baseIndex);
    // As/If there's just one occurrence number it should be fine to use normalizeOccurrenceNumber = true
    const normalizeOccurrenceNumber = true;
    const linkedFieldsAsString = fieldsToNormalizedString(linkedFields, currLinkingNumber, normalizeOccurrenceNumber, true);
    nvdebug(`Results for LINKING NUMBER ${currLinkingNumber}:`, debug);
    nvdebug(`${linkedFieldsAsString}`, debug);

    if (linkedFieldsAsString in seen)  {
      if (!removables.includes(linkedFieldsAsString)) {
        removables.push(linkedFieldsAsString);
      }

      if (fix) {
        nvdebug(`$8 CHAIN FIX: REMOVE $8 GROUP: ${fieldsToString(linkedFields)}`, debug);
        linkedFields.forEach(field => recordRemoveFieldOrSubfield8(record, field, currLinkingNumber));
        return;
      }

      nvdebug(`$8 VALIDATION: DUPLICATE DETECTED ${linkedFieldsAsString}`, debug);
      return;
    }
    nvdebug(`$8 DOUBLE REMOVAL OR VALIDATION: ADD2SEEN ${linkedFieldsAsString}`, debug);
    seen[linkedFieldsAsString] = 1;
    return;
  });

  /* eslint-enable */
  return removables;
}


export function handleDuplicateSubfield8Chains(record, fix) {

  // Seen $8 subsfields in various fields:
  // 161 700
  // 17 710
  // 11 110
  // 8 730
  // 1 100
  // Given these stats, there's no need to check for 1XX-vs-7XX removals

  /* eslint-disable */
  let seen = {};

  nvdebug("CHAIN-8");
  const seenLinkingNumbers = recordGetAllSubfield8LinkingNumbers(record);
  if (seenLinkingNumbers.length === 0) {
    return;
  }

  nvdebug(`seen linking numbers ($8): ${seenLinkingNumbers.join(', ')}`, debug);

  seenLinkingNumbers.forEach(currLinkingNumber => {
    const linkedFields = recordGetFieldsWithSubfield8LinkingNumber(record, currLinkingNumber) //getFieldsWithSubfield8Index(base, baseIndex);
    // As/If there's just one occurrence number it should be fine to use normalizeOccurrenceNumber = true
    const normalizeOccurrenceNumber = false; //true;
    const linkedFieldsAsString = fieldsToNormalizedString(linkedFields, currLinkingNumber, normalizeOccurrenceNumber, true);
    nvdebug(`Results for LINKING NUMBER ${currLinkingNumber}:`, debug);
    nvdebug(`${linkedFieldsAsString}`, debug);

    if (linkedFieldsAsString in seen)  {
      nvdebug(`$8 CHAIN FIX: REMOVE $8 GROUP: ${fieldsToString(linkedFields)}`, debug);
      linkedFields.forEach(field => newRecordRemoveFieldOrSubfield8(record, field, currLinkingNumber, fix));
      return;
    }
    nvdebug(`$8 DOUBLE REMOVAL OR VALIDATION: ADD2SEEN ${linkedFieldsAsString}`, debug);
    seen[linkedFieldsAsString] = 1;
    return;
  });

  /* eslint-enable */

}

function markIdenticalSubfield6Chains(chain, record) {
  const normalizeOccurrenceNumber = true;
  const normalizeTag = chain.some(field => field.tag.substring(0, 1) === '1'); // 1XX can delete 7XX as well!
  const chainAsString = fieldsToNormalizedString(chain, 0, normalizeOccurrenceNumber, normalizeTag);

  nvdebug(`markIdenticalSubfield6Chains: ${chainAsString}`);
  record.fields.forEach(f => compareWithChain(f));


  function compareWithChain(f) {
    nvdebug(`FIELD2CHAIN ${fieldToString(f)}`);
    const otherChain = fieldToChain(f, record);
    // Not a lone field or chain (head) or ... or is-same-chain
    if (otherChain.length === 0 || sameField(chain[0], otherChain[0])) {
      return;
    }
    const otherChainAsString = fieldsToNormalizedString(otherChain, 0, normalizeOccurrenceNumber, normalizeTag);

    // Mark other chain as deleted:
    if (chainAsString === otherChainAsString) {
      otherChain.forEach(f => {
        nvdebug(` mark ${fieldToString(f)} as deleted ($6-chain)...`);
        f.deleted = 1; // eslint-disable-line functional/immutable-data
      });
      return;
    }
  }

}

function markIdenticalLoneFieldsAsDeletable(field, record) {
  if (field.deleted) {
    return;
  }
  // targetLinkingNumber = 0, normalizedOccurenceNumber = false, normalizeTag = true)
  const normalizeTag = field.tag.substring(0, 1) === '1'; // 1XX can delete 7XX as well!
  const fieldAsString = fieldToNormalizedString(field, 0, false, normalizeTag);

  const identicalLoneFields = record.fields.filter(f => !sameField(f, field) && fieldToNormalizedString(f, 0, false, normalizeTag) === fieldAsString);

  // Mark fields as deleted:
  identicalLoneFields.forEach(f => {
    nvdebug(` mark ${fieldToString(f)} as deleted (lone field)...`);
    f.deleted = 1; // eslint-disable-line functional/immutable-data
  });

}

function acceptFieldsWithSubfield6(fieldsWithSubfield6) {
  // There can be only one non-880 field:
  const non880 = fieldsWithSubfield6.filter(f => f.tag !== '880');
  if (non880.length > 1) {
    return false;
  }

  const occurrenceNumbers = fieldsGetOccurrenceNumbers(fieldsWithSubfield6);
  // Chain can contain only single occurrence number:
  if (occurrenceNumbers.length > 1) {
    return false;
  }

  return true;
}


function isSingleTagLinkingNumber(linkingNumber, fields, tag) {
  const relevantFields = fields.filter(f => fieldHasLinkingNumber(f, linkingNumber));
  if (relevantFields.some(f => f.tag !== tag)) {
    return false;
  }
  return true;
}

function acceptFieldsWithSubfield8(fieldsWithSubfield8, requireSingleTag = false) {
  const linkingNumbers = fieldsGetAllSubfield8LinkingNumbers(fieldsWithSubfield8);
  if (linkingNumbers.some(linkingNumber => anomaly8(linkingNumber))) {
    return false;
  }
  return true;

  // If linking number
  function anomaly8(linkingNumber) {
    nvdebug(`  Looking for anomalies in linkin number ${linkingNumber}`);
    const relevantFields = fieldsWithSubfield8.filter(f => fieldHasLinkingNumber(f, linkingNumber));
    if (requireSingleTag) {
      return !isSingleTagLinkingNumber(linkingNumber, relevantFields, relevantFields[0].tag);
    }

    const f880 = relevantFields.filter(f => f.tag === '880');
    if (f880.length === 0 || f880.length === relevantFields.length) {
      return false;
    }
    return true;
  }
}


export function sameField(field1, field2) {
  /* eslint-disable */
    field1.tmpMyId = 666;
    const result = field2.tmpMyId === 666 ? true : false;
    delete field1.tmpMyId;
    /* eslint-enable */
  return result;
}

export function isChainHead(field, chain) {
  return sameField(field, chain[0]);
}

export function fieldToChain(field, record) {
  if (field.deleted || !field.subfields) {
    return [];
  }
  const chain = newGetAllLinkedFields(field, record, true, true);

  nvdebug(` Chain contains ${chain.length} field(s)`);
  if (!isChainHead(field, chain)) { // newGetAllLinkedFields() marks relevant record.fields!
    return [];
  }

  const fieldsWithSubfield6 = chain.filter(f => fieldHasValidSubfield6(f));
  // Hack: multiple $6 fields, but either all are non-880 or all are 880: treat field as a single entry
  if (fieldsWithSubfield6.length > 0) {
    const non880 = fieldsWithSubfield6.filter(f => f.tag !== '880');
    if (non880.length === 0 || non880.length === fieldsWithSubfield6.length) {
      return [field];
    }
    if (non880.length !== 1) {
      return [field];
    }
  }

  if (!acceptFieldsWithSubfield6(fieldsWithSubfield6)) { // Check tag subfield $6s are legal(ish)
    return [];
  }
  const fieldsWithSubfield8 = chain.filter(f => fieldHasValidSubfield8(f));
  if (!acceptFieldsWithSubfield8(fieldsWithSubfield8, false)) {
    return [];
  }

  //nvdebug(`Proceed with ${fieldsToString(chain)}`);


  return chain;
}


function fieldHandleDuplicateDatafields(field, record) {
  const chain = fieldToChain(field, record);
  nvdebug(` TRY TO HANDLE DUPLICATES OF '${fieldsToString(chain)}'`);

  if (chain.length === 0) {
    return;
  }

  const fieldsWithSubfield6 = chain.filter(f => fieldHasValidSubfield6(f));
  const fieldsWithSubfield8 = chain.filter(f => fieldHasValidSubfield8(f));

  // Lone fields:
  if (chain.length === 1) {
    markIdenticalLoneFieldsAsDeletable(chain[0], record);
    return;
  }
  if (fieldsWithSubfield6.length === 0) {

    if (fieldsWithSubfield8.length === 0) { // chain.length === 1?
      nvdebug(` Trying to find duplicates of single field '${fieldToString(chain[0])}'`);
      markIdenticalLoneFieldsAsDeletable(chain[0], record);
      return;
    }
    const linkingNumbers = fieldsGetAllSubfield8LinkingNumbers(fieldsWithSubfield8);
    if (linkingNumbers.length < 2) {
      markIdenticalSubfield6Chains(chain, record);
      return;
    }
  }

  if (fieldsWithSubfield6.length > 0 && acceptFieldsWithSubfield8(fieldsWithSubfield8, true)) { // Checks that non-880 tags are all same
    // Chain is removable
    markIdenticalSubfield6Chains(chain, record);
    return;
  }


  nvdebug(` NO HANDLER FOUND FOR '${fieldsToString(chain)}'`);
  nvdebug(`  N8s: ${fieldsWithSubfield6.length}`);

}


export function removeDuplicateDatafields(record, fix = true) {
  // Sometimes only $8 subfield (vs the whole field) is removed. Thus they are handled separately:
  handleDuplicateSubfield8Chains(record, fix);

  const dataFields = record.fields.filter(f => f.subfields !== undefined);

  dataFields.forEach(f => fieldHandleDuplicateDatafields(f, record));

  const deletableFields = dataFields.filter(f => f.deleted);
  const modifiedFields = dataFields.filter(f => f.modified && !f.deleted);

  const result = deletableFields.map(f => `DEL: ${fieldToString(f)}`);
  if (modifiedFields.length) { // eslint-disable-line functional/no-conditional-statements
    modifiedFields.forEach(f => delete f.modified); // eslint-disable-line functional/immutable-data
    result.push(modifiedFields.map(f => `MOD: ${fieldToString(f)}`)); // eslint-disable-line functional/immutable-data
  }

  if (fix) {
    deletableFields.forEach(f => record.removeField(f));
    return result;
  }

  deletableFields.forEach(f => delete f.deleted); // eslint-disable-line functional/immutable-data
  deletableFields.forEach(f => delete f.modified); // eslint-disable-line functional/immutable-data


  return result;
}