NatLibFi/marc-record-validators-melinda

View on GitHub
src/cyrillux-usemarcon-replacement.js

Summary

Maintainability
D
2 days
Test Coverage
/*
*  cyrillux-usemarcon-replacement.js -- implement and improve https://github.com/NatLibFi/USEMARCON-Cyrillux/tree/master
*
* Author(s): Nicholas Volk <nicholas.volk@helsinki.fi>
*
*/

import clone from 'clone';
import {MarcRecord} from '@natlibfi/marc-record';
import {default as fix33X} from './fix-33X';
import {default as add041} from './addMissingField041';
import {default as add336} from './addMissingField336';
import {default as add337} from './addMissingField337';
import {default as add338} from './addMissingField338';
import {default as fixCountryCodes} from './fix-country-codes';
import {default as fixLanguageCodes} from './fix-language-codes';
import {default as fixRelatorTerms} from './fixRelatorTerms';
import {default as fixIndicators} from './indicator-fixes';
import {default as fixPunctuation} from './punctuation2';
import {default as fixQualifyingInformation} from './normalize-qualifying-information';
import {sortAdjacentSubfields} from './sortSubfields';


// import createDebugLogger from 'debug';
import {nvdebug, recordToString} from './utils';

// const debug = createDebugLogger('@natlibfi/marc-record-validators-melinda/punctuation2');

// Human-readable name of this fixer. It is exposed in the object returned by the default export
// and used as the prefix of the debug messages written by fix() and validate().
const description = 'Replacement for Cyrillux usemarcon rules';

// Tags of the fields that realFix() removes from the record outright (matched against field.tag).
// Extended original list with 541, 561, 562, 583, 584
const dropTags = ['001', '003', '010', '012', '014', '015', '016', '019', '025', '029', '032', '035', '036', '037', '038', '042', '049', '051', '061', '068', '071', '074', '079', '090', '091', '092', '094', '095', '096', '097', '099', '249', '261', '262', '350', '400', '411', '541', '561', '562', '574', '575', '577', '578', '583', '584', '589', '590', '591', '592', '593', '594', '595', '596', '597', '598', '599', '653', '698', '741', '742', '744', '761', '790', '841', '842', '843', '844', '845', '850', '852', '853', '854', '855', '858', '859', '863', '864', '865', '866', '867', '868', '876', '877', '878', '882', '886', '887', '888', '890', '899'];

/**
 * Create the "Cyrillux usemarcon replacement" validator/fixer.
 *
 * @returns {{description: string, fix: Function, validate: Function}} object exposing the
 *   description text and the fix() and validate() entry points
 */
export default function () {
  return {
    description, fix, validate
  };

  /**
   * Apply all conversions to the given record in place.
   *
   * @param {object} record - record with `leader`, `fields` and `insertField()`
   * @returns {{message: Array, fix: Array, valid: boolean}} always an empty, valid result
   */
  function fix(record) {
    nvdebug(`${description}: fix`);
    realFix(record);
    const res = {message: [], fix: [], valid: true};
    return res;
  }

  /**
   * The actual conversion pipeline, shared by fix() and validate(). Mutates the record.
   * The order of the steps matters (see the inline comments).
   *
   * @param {object} record - record to modify in place
   * @returns {{message: Array, fix: Array, valid: boolean}} always an empty, valid result
   */
  function realFix(record) {
    // Fix leader: standard fixes + update LDR/17 to '4'
    fixLeader(record);
    record.leader = `${record.leader.substring(0, 17)}4${record.leader.substring(18, 24)}`; // eslint-disable-line functional/immutable-data

    // Remove unwanted fields:
    record.fields = record.fields.filter(f => !dropTags.includes(f.tag)); // eslint-disable-line functional/immutable-data

    record.fields.forEach(f => fieldSpecificStuff(f));

    function fieldSpecificStuff(field) {
      removeOwnershipSubfield5(field);
      removeFromOldCatalog(field); // Remove LoC phrase "[from old catalog]" from strings
      translateFieldToFinnish(field);
    }

    fixCountryCodes().fix(record); // 008/15-17
    fixLanguageCodes().fix(record); // 008/35-37 AND 041 (note that all relevant subfield codes are fixed, not just $a)

    fixQualifyingInformation().fix(record); // 015, 020, 024 and 028

    // Field 028: use $b$a, not $a$b:
    const f028 = record.fields.filter(f => f.tag === '028');
    f028.forEach(f => sortAdjacentSubfields(f));

    fixField040(record); // All $b values are changed to 'mul'. As a side effect 33X$b=>$a mappings are in Finnish! Ok in this domain!
    add041().fix(record);

    fixRelatorTerms().fix(record);

    fix33X().fix(record); // 33X$a => 33X$a$b$2
    add336().fix(record);
    add337().fix(record);
    add338().fix(record);

    // NB! fieldSpecificStuff2() may call record.insertField() (new 264, 810 and 830 fields).
    // Array.prototype.forEach() fixes its iteration range before the first callback, so iterating
    // record.fields directly would silently skip as many trailing fields as were inserted.
    // Iterate over a snapshot instead: every original field is processed exactly once.
    const fieldsBeforeInsertions = [...record.fields];
    fieldsBeforeInsertions.forEach(f => fieldSpecificStuff2(f));

    function fieldSpecificStuff2(field) {
      removeSubfieldH(field); // only after 33X creation, as 245$h might be useful

      field260To264s(field, record);

      // NB! 300 (before or after 33X creation?)
      field410To490And810(field, record);
      field440To490And830(field, record);
      // handle505(field); // not applying them usemarcon-cyrillux rules for field 505 as I can't understand their motivation.

    }

    // The fixer below implement Cyrillux rules such as 245I1 | 245I1  | If (Exists(@100) Or Exists(@110) Or Exists(@111) Or Exists(@130)) Then '1' Else '0' and plenty of other good stuff:
    fixIndicators().fix(record);

    fixPunctuation().fix(record);

    const res = {message: [], fix: [], valid: true};
    return res;
  }

  /**
   * Check whether running the fixes would change the record. The fixes are run on a separate
   * MarcRecord built from the input, and the string forms of the two records are compared.
   * Validation is currently done in subparts.
   *
   * @param {object} record - record to check
   * @returns {{message: string[], valid: boolean}} valid is true when nothing would change
   */
  function validate(record) {
    nvdebug(`${description}: validate`);
    const originalString = recordToString(record);
    const clonedRecord = new MarcRecord(record, {subfieldValues: false});
    realFix(clonedRecord);
    const modifiedString = recordToString(clonedRecord);

    if (originalString === modifiedString) {
      return {message: [], valid: true};
    }

    return {message: ['Record changed'], valid: false}; // Less than descriptive but will do...

  }
}

/**
 * Make sure the record's 040 field(s) carry exactly $b 'mul' and $e 'rda'.
 * If the record has no 040 field, one containing just those two subfields is inserted.
 * Otherwise all existing $b and $e subfields are dropped from each 040, the two standard
 * subfields are appended and sortAdjacentSubfields() moves them to their proper places.
 *
 * @param {object} record - record with `fields` and `insertField()`; modified in place
 */
function fixField040(record) {
  const languageSubfield = {code: 'b', value: 'mul'};
  const conventionsSubfield = {code: 'e', value: 'rda'};

  const existing040s = record.fields.filter(({tag}) => tag === '040');

  // Add 040 if there isn't one:
  if (existing040s.length === 0) {
    record.insertField({tag: '040', ind1: ' ', ind2: ' ', subfields: [languageSubfield, conventionsSubfield]});
    return;
  }

  existing040s.forEach(field => {
    const others = field.subfields.filter(({code}) => code !== 'b' && code !== 'e');
    field.subfields = [...others, languageSubfield, conventionsSubfield]; // eslint-disable-line functional/immutable-data
    sortAdjacentSubfields(field); // put $b and $e to their proper places
  });
}

/**
 * Strip the Library of Congress phrase "[from old catalog]" (and any spaces right before it)
 * from every subfield value of the field. Matching is case-insensitive.
 * See https://catalog.loc.gov/vwebv/ui/en_US/htdocs/help/faqs.html for motivation
 *
 * @param {object} field - field to modify in place; fields without `subfields` are ignored
 */
export function removeFromOldCatalog(field) {
  field.subfields?.forEach(sf => removeFromOldCatalogFromSubfield(sf));

  function removeFromOldCatalogFromSubfield(subfield) {
    if (typeof subfield.value !== 'string') { // nothing to clean up
      return;
    }
    // No separate includes() pre-check: it used to be case-sensitive, which stopped the
    // case-insensitive regexp below from ever seeing variants such as "[From old catalog]".
    // replace() is a no-op when the phrase is absent.
    subfield.value = subfield.value.replace(/ *\[from old catalog\]/gui, ''); // eslint-disable-line functional/immutable-data
  }
}

/**
 * Drop all $h subfields from the field, provided its tag is one of those listed below.
 * If removing $h would leave the field without any subfields, the field is left untouched.
 *
 * @param {object} field - field to modify in place
 */
function removeSubfieldH(field) {
  const applicableTags = ['245', '246', '247', '740', '760', '762', '765', '767', '770', '772', '773', '774', '775', '776', '777', '780', '785', '786', '787', '788'];

  const isApplicable = Boolean(field.subfields) && applicableTags.includes(field.tag);
  if (isApplicable) {
    const withoutH = field.subfields.filter(({code}) => code !== 'h');
    if (withoutH.length !== 0) { // never empty the field completely
      field.subfields = withoutH; // eslint-disable-line functional/immutable-data
    }
  }
}

/**
 * Remove every $5 subfield from the field. As a sanity check the field is left as it is
 * when it has no subfields or when $5 is the only kind of subfield it contains.
 *
 * @param {object} field - field to modify in place
 */
export function removeOwnershipSubfield5(field) {
  const keepers = (field.subfields || []).filter(({code}) => code !== '5');
  if (keepers.length > 0) { // sanity check/robustness: don't leave an empty field behind
    field.subfields = keepers; // eslint-disable-line functional/immutable-data
  }
}

/**
 * Rebuild the record leader: positions 0-8, 12-17 and 19 are kept from the old leader,
 * positions 9-11 become 'a22', position 18 becomes 'i' and positions 20-23 become '4500'.
 *
 * @param {object} record - record whose `leader` string is replaced in place
 */
export function fixLeader(record) {
  const {leader} = record;
  const parts = [
    leader.slice(0, 9),
    'a22',
    leader.slice(12, 18),
    'i',
    leader.slice(19, 20),
    '4500'
  ];
  record.leader = parts.join(''); // eslint-disable-line functional/immutable-data
}

/**
 * Turn a 410 field into a 490 (ind1 '1', ind2 ' ') and insert a copy of the original
 * content into the record as an 810 field (ind2 ' '). Fields with other tags are ignored.
 *
 * @param {object} field - candidate field; modified in place when its tag is 410
 * @param {object} record - record that receives the new 810 field via insertField()
 */
function field410To490And810(field, record) { // might be generic... if so, move to utils...
  const is410 = field.tag === '410';
  if (is410) {
    // Copy the field before the original is touched:
    const added810 = clone(field);
    added810.tag = '810'; // eslint-disable-line functional/immutable-data
    added810.ind2 = ' '; // eslint-disable-line functional/immutable-data
    // 810: Fix punctuation elsewhere. (Note that the current support is lagging...)

    Object.assign(field, {tag: '490', ind1: '1', ind2: ' '}); // eslint-disable-line functional/immutable-data
    sortAdjacentSubfields(field);
    // 490: Fix punctuation elsewhere. (Note that the current support is lagging...)

    record.insertField(added810);
  }
}

/**
 * Turn a 440 field into a 490 (ind1 '1', ind2 ' ') and insert a copy of the original
 * field into the record with tag 830 (indicators as in the original 440).
 * Fields with other tags are ignored.
 *
 * @param {object} field - candidate field; modified in place when its tag is 440
 * @param {object} record - record that receives the new 830 field via insertField()
 */
function field440To490And830(field, record) { // might be generic... if so, move to utils...
  const is440 = field.tag === '440';
  if (is440) {
    // Copy the field before the original is touched:
    const added830 = clone(field);
    added830.tag = '830'; // eslint-disable-line functional/immutable-data
    // 830: Fix punctuation elsewhere. (Note that the current support is lagging...)

    Object.assign(field, {tag: '490', ind1: '1', ind2: ' '}); // eslint-disable-line functional/immutable-data
    // 490: Fix punctuation elsewhere. (Note that the current support is lagging...)

    record.insertField(added830);
  }
}


/**
 * Convert a 260 field into a 264 field (ind1 ' ', ind2 '1'). Fields with other tags are ignored.
 *
 * If the first $c looks like a copyright/phonogram date, the date is also inserted into the
 * record as a separate 264 field with ind2 '4' (value '©YYYY' or '℗YYYY', in square brackets
 * if the source had any), and that $c is reduced to '[YYYY]'.
 * Finally the Latin abbreviations S.l. / s.n. / s.a. in $a / $b / $c are replaced with
 * Finnish phrases.
 *
 * @param {object} field - candidate field; modified in place when its tag is 260
 * @param {object} record - record that may receive the copyright 264 via insertField()
 */
function field260To264s(field, record) { // might be generic... if so, move to utils...
  // As per my quick reading of usemarcon-cyrillux
  if (field.tag !== '260') {
    return;
  }

  // Returns the year digits when the text is a copyright-style date, otherwise false
  const getCopyrightYear = text => (text.match(/^(?:\[?[Ccp]|[^0-9]*(?:cop|©|℗))[^0-9]*(?:1[789][0-9][0-9]|20[0-2][0-9])[^0-9]*$/u) ? text.replace(/[^0-9]/ug, '') : false);

  // Extract/split copyright year to a separate field (only the first $c is considered):
  const firstC = field.subfields.find(sf => sf.code === 'c');
  const copyrightYear = firstC ? getCopyrightYear(firstC.value) : false;
  if (copyrightYear) {
    const symbol = firstC.value.match(/(?:^\[?p|℗)/u) ? '℗' : '©';
    const copyrightDate = firstC.value.includes('[') ? `[${symbol}${copyrightYear}]` : `${symbol}${copyrightYear}`;
    // Modify the original value:
    firstC.value = `[${copyrightYear}]`; // eslint-disable-line functional/immutable-data
    record.insertField({'tag': '264', 'ind1': ' ', 'ind2': '4', 'subfields': [{'code': 'c', 'value': copyrightDate}]});
  }

  field.tag = '264'; // eslint-disable-line functional/immutable-data
  field.ind1 = ' '; // eslint-disable-line functional/immutable-data
  field.ind2 = '1'; // eslint-disable-line functional/immutable-data

  // NB! Usemarcon does not handle 260$e$f$g => 264$a$b$c, so I'm not bothering with it either... (However, we could check our merge reducer code...)

  // Computes the normalized value of a single subfield
  const normalizedValue = ({code, value}) => {
    switch (code) {
    case 'a':
      return value.replace(/\b[Ss]\. ?l\./u, 'Kustannuspaikka tuntematon');
    case 'b':
      return value.replace(/\b[Ss]\. ?n\./u, 'kustantaja tuntematon');
    case 'c': {
      const year = getCopyrightYear(value);
      if (!year) {
        return value.replace(/\b[Ss]\. ?a\./u, 'julkaisuaika tuntematon');
      }
      // A $c that still looks like a copyright date gets a compact 'cYYYY' / 'pYYYY' form
      const prefix = value.match(/(?:^p|℗)/u) ? 'p' : 'c';
      return value.includes('[') ? `${prefix}[${year}]` : `${prefix}${year}`;
    }
    default:
      return value;
    }
  };

  field.subfields?.forEach(subfield => {
    subfield.value = normalizedValue(subfield); // eslint-disable-line functional/immutable-data
  });
}

/*
function handle505(field) {
  if (field.tag !== '505') {
    return;
  }
  // Don't know how/why usemarcon-cyrillux is so sure about ind1...
  field.ind1 = '0'; // eslint-disable-line functional/immutable-data
  // usemarcon-cyrillux drops irrelevant subfields, so we do the same. However, we have included some control subfields in the kept side:
  const keptSubfields = field.subfields.filter(sf => ['a', 'g', 'r', 't', 'u', '6', '8', '9'].includes(sf.code));

  if (keptSubfields.some(sf => ['a', 'g', 'r', 't', 'u'].includes(sf.code))) {
    field.subfields = keptSubfields; // eslint-disable-line functional/immutable-data
    return;
  }
}
*/

/**
 * Expand abbreviations and translate terms into Finnish in field 020 (subfields $a, $q and $z
 * only) and field 300 (all subfields). Other fields are left alone.
 *
 * @param {object} field - field whose subfield values are rewritten in place
 */
function translateFieldToFinnish(field) {
  const {tag} = field;
  if (tag !== '020' && tag !== '300') {
    return;
  }

  const codesTranslatedIn020 = ['a', 'q', 'z'];

  // English, Swedish and Finnish abbreviation expansion first, Finnish mapping last
  const toFinnish = text => [
    expandEnglishAbbreviations,
    expandSwedishAbbreviations,
    expandFinnishAbbreviations,
    finnishTranslationsAndMappings
  ].reduce((current, step) => step(current), text);

  field.subfields?.forEach(subfield => {
    const isTranslatable = tag === '300' || codesTranslatedIn020.includes(subfield.code);
    if (isTranslatable) {
      subfield.value = toFinnish(subfield.value); // eslint-disable-line functional/immutable-data
    }
  });
}

/**
 * Expand English cataloguing abbreviations (p., ill., pbk., hr. etc.) into full English words
 * and drop parenthesised colour notes such as "(some col.)".
 *
 * @param {string} value - subfield value
 * @returns {string} value with the abbreviations expanded
 */
function expandEnglishAbbreviations(value) {
  // Ordered [pattern, replacement] rules; each rule sees the output of the previous one.
  // Patterns without the g flag (and the plain string) replace the first occurrence only.
  const rules = [
    [/\bbk\.\b/gui, 'book'],
    [/chiefly col\./ui, 'chiefly color'],
    [/col\. ill\./ui, 'color illustrations'],
    [/diagrs\./ui, 'diagrams'],
    // hbk: expand to MTS-compliant form
    [/\bhbk\.\b/gui, 'hardcover'],
    [/\bhbk\b/gui, 'hardcover'],
    [/\b1 hr\./gui, '1 hour'],
    [/\bhr\./gui, 'hours'],
    [/\bill\./gui, 'illustrated'], // or illustrations (or Swedish "illustrerad" or...)
    [/\billus\./gui, 'illustrated'], // or illustrations
    [/incl\./gui, 'includes'],
    [/fold\.? maps/gui, 'folded maps'],
    // NB! "min." / "min" are deliberately not expanded
    [/\bp\.\b/gui, 'pages'],
    [/\bp\b/gu, 'pages'],
    // pbk: expand to MTS-compliant form
    [/\bpbk\.\b/gui, 'paperback'],
    [/\bpbk\b/gui, 'paperback'],
    [/\bpdf\b/gui, 'PDF'],
    [/\bports\./gui, 'portraits'],
    ['sd., col.', 'sound, color'],
    [/ *\((?:chiefly col\.|chiefly color|some col[s.])\)/gui, ''],
    [/\b1 hr\./gui, '1 hour']
  ];

  return rules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), value);
}

/**
 * Expand Finnish cataloguing abbreviations (cn., nid., sid., verkkojulk.) and strip the
 * parentheses when the whole value is a single parenthesised expression.
 *
 * @param {string} value - subfield value
 * @returns {string} value with the abbreviations expanded
 */
function expandFinnishAbbreviations(value) {
  // Ordered [pattern, replacement] rules; each rule sees the output of the previous one.
  // NB! "min." / "min" are deliberately not expanded.
  const rules = [
    [/\bcn\. /gu, 'noin '],
    [/\bnid\./gu, 'nidottu'],
    [/\bnid\b/gui, 'nidottu'],
    [/\bsid\./gu, 'sidottu'],
    [/\bsid\b/gui, 'sidottu'],
    [/\bverkkojulk\.\b/gu, 'verkkojulkaisu'],
    [/\bverkkojulk\b/gu, 'verkkojulkaisu'],
    // Removal of the surrounding brackets; this could use a better location
    [/^\(([^)]+)\)$/u, '$1'] // eslint-disable-line prefer-named-capture-group
  ];

  return rules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), value);
}

/**
 * Expand Swedish cataloguing abbreviations: "ca. " becomes "circa " and "hft." / "hft"
 * becomes "häftad".
 *
 * @param {string} value - subfield value
 * @returns {string} value with the abbreviations expanded
 */
function expandSwedishAbbreviations(value) {
  // NB! "min." / "min" are deliberately not expanded.
  const withCirca = value.replace(/\bca\. /gu, 'circa ');
  const withDottedHft = withCirca.replace(/\bhft\./gui, 'häftad');
  return withDottedHft.replace(/\bhft\b/gui, 'häftad');
}

/**
 * Translate English, Swedish and German physical-description and binding terms into Finnish.
 * The rules are applied in the order listed, each one on the output of the previous one.
 * Plain-string rules and regexps without the g flag replace the first occurrence only.
 *
 * @param {string} value - subfield value (abbreviations are expected to be expanded already)
 * @returns {string} translated value
 */
function finnishTranslationsAndMappings(value) {
  // NB! 'analog' is matched as a whole word only. As a plain substring rule it also hit the
  // already translated 'analoginen' (producing 'analogineninen'), which made the fix
  // non-idempotent, and it mangled words such as 'analogue'.
  return value.replace(/\banalog\b/u, 'analoginen').
    replace('approximately', 'noin').
    replace('audio discs', 'äänilevyä').
    replace('black and white', 'mustavalkoinen').
    replace(/\bbilaga\b/gui, 'liite').
    replace(/\bbilagor\b/gui, 'liitettä').
    replace(/\bcharts\b/gui, 'kaavioita').
    replace('chiefly color illustrations', 'pääosin värikuvitettu').
    replace('chiefly', 'pääosin').
    replace(/\bcirca\b/gui, 'noin').
    replace(/coil[- ]?bound/gui, 'kierreselkä').
    replace('color illustrations', 'värikuvitus').
    replace(/comb[- ]?bound/gui, 'kierreselkä').
    replace(/\bdigital\b/gui, 'digitaalinen').
    replace(/\belectronic book\b/gui, 'verkkoaineisto').
    replace('(flera nummersviter)', '(useita numerointijaksoja)').
    replace(/\bfolded sheet\b/gui, 'taitelehti').
    replace(/\bfärgillustratione[nr]\b/gui, 'värikuvitus').
    replace(/\bhard(?:back|cover)\b/gui, 'kovakantinen').
    replace(/\bhours\b/gui, 'tuntia').
    replace(/\bi flera nummersviter/gui, 'useina numerointijaksoina').
    replace('illustrated', 'kuvitettu').
    replace(/illustrations?\b/gui, 'kuvitettu'). // Based on usemarcon-bookwhere (NB! usemarcon-cyrillux had kuvitus/kuvitettu)
    replace(/\binbunden\b/gui, 'kovakantinen'). // swe
    replace(/\binsert\b/gui, 'liite').
    replace(/\binserts\b/gui, 'liitteitä').
    replace(/\bin various pagings/gui, 'useina numerointijaksoina').
    replace('leaves of plates', 'kuvalehteä').
    replace(/\bljudskiva\b/gui, 'äänilevy').
    replace(/\bljudskivor\b/gui, 'äänilevyä').
    replace(/\bmap\b/gui, 'kartta').
    replace(/\bmaps\b/gui, 'karttoja'). // or karttaa?
    replace('minutes', 'minuuttia').
    replace('mjuka pärmar', 'pehmeäkantinen').
    replace('online resource', 'verkkoaineisto').
    replace('onlineresurs', 'verkkoaineisto').
    replace('onumrerade', 'numeroimatonta').
    replace('pages of plates', 'kuvalehteä').
    replace(/\bpages\b/gui, 'sivua').
    replace(/\bpaperback\b/gui, 'pehmeäkantinen'). // MTS alt
    replace(/\bSeiten\b/gu, 'sivua').
    replace(/\bsoftcover\b/gui, 'pehmeäkantinen'). // MTS pref
    replace('sound, color', 'äänellinen, värillinen').
    replace('sound cassettes', 'äänikasettia').replace('sound cassette', 'äänikasetti').
    replace('sound discs', 'äänilevyä').replace(/sound disc\b/gui, 'äänilevy').
    replace(/(?:spiral[- ]?bound|spiralrygg)/gui, 'kierreselkä').
    replace('svartvit', 'mustavalkoinen').
    replace('unnumbered', 'numeroimatonta').
    replace('(various pagings)', '(useita numerointijaksoja)').
    replace(/\bverkkojulkaisu\b/gui, 'verkkoaineisto').
    replace('videodiscs', 'videolevyä').
    replace('videodisc', 'videolevy').
    replace(/\b1 hour\b/gui, '1 tunti');

}