digitalheir/bibliography-js

View on GitHub
depr/bibtex/field_value/StringValue.js

Summary

Maintainability
D
2 days
Test Coverage
import {diacritics, specialChars} from './specialCharsHandlers'
import Immutable, {Set} from 'immutable'

/**
 * Recursively unwraps a parsed string value into a flat token structure.
 *
 * Handled shapes, checked in this order:
 * - 'quotedstring', 'quotedstringwrapper' and 'bracedstringwrapper' nodes are replaced by
 *   their flattened `data`;
 * - 'ws' nodes are returned untouched;
 * - anything with a truthy `_raw` property is flattened through that property;
 * - objects with a truthy `string` property collapse to that property;
 * - numbers are converted to strings, strings are returned as they are;
 * - arrays are flattened element by element, nested array results being spliced inline;
 * - 'braced' nodes keep their shape but get flattened `data`.
 *
 * @param {*} obj node, token, or array of nodes/tokens
 * @returns {string|Object|Array} flattened representation of obj
 * @throws {Error} when obj matches none of the shapes listed above
 */
function flatten(obj) {
  const isObject = typeof obj === 'object';

  if (isObject) {
    const kind = obj.type;
    // Pure wrapper nodes: only their payload matters.
    if (kind == 'quotedstring' || kind == 'quotedstringwrapper' || kind == 'bracedstringwrapper') {
      return flatten(obj.data);
    }
    if (kind == 'ws') return obj;
  }

  if (obj._raw) return flatten(obj._raw);
  if (isObject && obj.string) return obj.string;
  if (obj.constructor == Number) return obj + "";
  if (typeof obj === 'string') return obj;

  if (obj.constructor == Array) {
    return obj.reduce((tokens, element) => {
      const flat = flatten(element);
      // An array result is spliced into the token list; anything else is one token.
      if (flat.constructor == Array) return tokens.concat(flat);
      tokens.push(flat);
      return tokens;
    }, []);
  }

  if (obj.type == 'braced') {
    return {
      type: 'braced',
      data: flatten(obj.data)
    };
  }

  throw new Error("Could not handle string value to normalize: " + JSON.stringify(obj));
}

/**
 * Groups a flat token list into words.
 *
 * The result is a list whose entries are either word arrays (strings, braced tokens and,
 * when retained, whitespace tokens) or stand-alone `{type: ','}` separator tokens.
 * The list always starts with one (possibly empty) word.
 *
 * @param {Array} wordParts tokens: strings, or objects whose `type` is 'ws', 'braced' or ','
 * @param {boolean} [retainWhitespace] when truthy, whitespace tokens are kept inside the
 *   current word instead of starting a new word; braced content is always processed this way
 * @returns {Array} list of word arrays and ',' tokens
 * @throws {Error} when a token is neither a string nor one of the known token types
 */
function toWords(wordParts, retainWhitespace) {
  // Returns the word being built. A ',' token is stored directly in the word list, so the
  // last entry may not be a word array; in that case a new word is opened instead of
  // calling push on the separator object (which used to throw a TypeError).
  const currentWord = (words) => {
    const last = words[words.length - 1];
    if (Array.isArray(last)) return last;
    const fresh = [];
    words.push(fresh);
    return fresh;
  };

  return wordParts.reduce((words, part) => {
    if (part.type === 'ws') {
      if (retainWhitespace) currentWord(words).push(part);
      else words.push([]); // whitespace ends the current word
    } else if (part.type === 'braced') {
      currentWord(words).push({
        type: part.type,
        // Whitespace inside braces belongs to the braced group, so it is retained.
        data: toWords(part.data, true)
      });
    } else if (part.type === ',') {
      words.push(part);
    } else if (typeof part === 'string') {
      currentWord(words).push(part);
    } else {
      throw new Error("! toWords error: " + JSON.stringify(part));
    }
    return words;
  }, [[]]);
}
/**
 * Merges runs of adjacent string tokens into single strings.
 *
 * A token equal to ',' becomes a `{type: ','}` separator token and therefore interrupts a
 * run; 'ws' tokens are copied through unchanged; 'braced' tokens are rebuilt with their
 * `data` processed the same way.
 *
 * @param {Array} array flat token list
 * @returns {Array} new token list with adjacent strings concatenated
 * @throws {Error} when a token is not a string, 'ws' token or 'braced' token
 */
function concatStrings(array) {
  const merged = [];

  for (let position = 0; position < array.length; position += 1) {
    const token = array[position];

    if (token == ',') {
      merged.push({type: ','});
      continue;
    }

    if (typeof token === 'string') {
      const tail = merged.length - 1;
      // Extend the previous string if there is one, otherwise start a new run.
      if (typeof merged[tail] == 'string') merged[tail] = merged[tail] + token;
      else merged.push(token);
      continue;
    }

    if (token.type == 'ws') {
      merged.push(token);
      continue;
    }

    if (token.type == 'braced') {
      merged.push({
        type: token.type,
        data: concatStrings(token.data)
      });
      continue;
    }

    throw new Error("Could not handle string value to concat: " + JSON.stringify(token));
  }

  return merged;
}

/**
 * Replaces braced word segments that form a BibTeX "special character" by
 * `{type: 'specialChar', data, unicode}` tokens.
 *
 * A special character is a part of a field starting with a left brace being at brace depth 0
 * immediately followed with a backslash, and ending with the corresponding right brace.
 * Anything in a special character is considered as being at brace depth 0, even if it is
 * placed between another pair of braces.
 *
 * Here a segment qualifies when it is a 'braced' token whose first inner word consists of
 * exactly one string starting with a backslash. Entries of `words` that are not arrays, and
 * segments that do not qualify, are returned unchanged.
 *
 * @param {Array} words list of word arrays and other tokens
 * @returns {Array} new list in which qualifying segments are replaced
 * @throws {Error} when an escape string matches neither `specialChars` nor `diacritics`
 */
function processSpecialChars(words) {
  // Returns the escape string held by a qualifying braced segment, or null.
  const escapeStringOf = (segment) => {
    if (segment.type != 'braced') return null;
    const firstWord = segment.data[0];
    if (!firstWord || firstWord.constructor != Array || firstWord.length != 1) return null;
    const candidate = firstWord[0];
    if (typeof candidate != 'string' || candidate.charAt(0) != '\\') return null;
    return candidate;
  };

  const convertSegment = (segment) => {
    const escapeString = escapeStringOf(segment);
    if (escapeString === null) return segment;

    // Named commands are looked up by everything after the backslash.
    const specialChar = specialChars[escapeString.substring(1)];
    if (specialChar) {
      return {
        type: 'specialChar',
        data: segment,
        unicode: specialChar(specialChar)
      };
    }

    // Diacritics are keyed by the single character after the backslash; the rest is the operand.
    const diacriticHandler = diacritics[escapeString.charAt(1)];
    if (diacriticHandler) {
      return {
        type: 'specialChar',
        data: segment,
        unicode: diacriticHandler(escapeString.substring(2))
      };
    }

    throw new Error("Unexpected escape string: " + escapeString);
  };

  return words.map((word) => {
    if (word.constructor != Array) return word;
    return word.map((segment) => convertSegment(segment));
  });
}

/**
 * String wrapper that is meant to deal with the subtleties of BibTeX / LaTeX styling.
 * The class is incomplete (see the `lowercase$` and `purify$` stubs); a higher-level class
 * may be wanted later to store formatting information.
 *
 * Instances expose:
 * - `_raw`: the parsed wrapper node they were built from,
 * - `_normalizedRaw`: the node flattened, string-merged, split into words and with special
 *   characters marked,
 * - `_unicode`: the Unicode rendering, also available through `toUnicode()`.
 */
export default class StringValue {
  /**
   * @param {Object} strRaw parsed node whose `type` is 'quotedstringwrapper' or
   *   'bracedstringwrapper'
   * @throws {Error} when strRaw is falsy or has another type
   */
  constructor(strRaw) {
    if (!strRaw) throw new Error("Please specify string contents");
    if (strRaw.type != 'quotedstringwrapper' && strRaw.type != 'bracedstringwrapper')
      throw new Error("Did not expect object to instantiate StringValue: " + JSON.stringify(strRaw));

    this._raw = strRaw;
    this._normalizedRaw = processSpecialChars(toWords(concatStrings(flatten(this._raw))));

    this._unicode = StringValue.computeUnicodeString(0, this._raw);
  }

  /**
   * @returns {string} the Unicode rendering computed at construction time
   */
  toUnicode() {
    return this._unicode;
  }

  // Not implemented yet. Intended behaviour:
  //
  // Will turn to lowercase:
  // {\'{E}}cole
  // {\'E}cole
  //
  // Will not turn to lowercase:
  // {{\'E}}cole
  lowercase$() {

  }

  // Not implemented yet: only the exception table is declared, nothing is returned.
  static purify$() {
//     There are thirteen LaTeX commands that won't follow the above rules: \i, \j, \oe, \OE,
//       \ae, \AE, \aa, \AA, \o, \O, \l, \L, \ss. Those commands correspond to
//       ı, ȷ, œ, Œ, æ, Æ, å, Å, ø, Ø, ł, Ł, ß, and purify$ transforms them (if they are in a
//       special character) into i, j, oe, OE, ae, AE, aa, AA, o, O, l, L, ss, respectively.
    const purifyEscapeExceptions = {
      'i': 'i',
      'j': 'j',
      'oe': 'oe',
      'OE': 'OE',
      'ae': 'ae',
      'AE': 'AE',
      'aa': 'aa',
      'AA': 'AA',
      'o': 'o',
      'O': 'O',
      'ss': 'ss',
      'l': 'l',
      'L': 'L'
    }
  }

  // TODO
  // The following ten characters have special meanings in (La)TeX:
  // & % $ # _ { } ~ ^ \
  // Outside \verb, the first seven of them can be typeset by prepending a backslash; for the other three, use the macros \textasciitilde, \textasciicircum, and \textbackslash.
  /**
   * Renders a parsed string node (or token, or array of them) as a Unicode string.
   *
   * @param {number} braceDepth number of enclosing braces around obj
   * @param {*} obj string, number, array, object with a `toUnicode` method, or parsed node
   * @returns {string} Unicode rendering of obj
   * @throws {Error} when an array contains null/undefined, when an escape string is unknown,
   *   or when obj has an unsupported shape
   */
  static computeUnicodeString(braceDepth, obj) {
    // Only null/undefined are invalid elements: '' (e.g. produced by an empty brace group)
    // and 0 are legitimate values and must not be rejected.
    const renderAll = (elements, depth) => elements.map(o => {
      if (o === null || o === undefined)
        throw new Error("Expected non-null elements in " + JSON.stringify(elements));
      return StringValue.computeUnicodeString(depth, o);
    }).join('');

    if (typeof obj === 'string') return obj;
    else if (obj.constructor == Array) return renderAll(obj, braceDepth);
    // Already-resolved values (e.g. StringValue instances) render themselves.
    else if (obj.toUnicode) return obj.toUnicode();
    else if (typeof obj === 'object' && (obj.type == 'number' || obj.type == 'id' || obj.type == 'ws')) return StringValue.computeUnicodeString(0, obj.string);
    else if (typeof obj === 'object' && obj.unicode) return obj.unicode;
    else if (obj.constructor === Number) return obj + "";
    else if (typeof obj === 'object' && obj.type == 'quotedstring') return StringValue.computeUnicodeString(braceDepth, obj.data);
    else if (typeof obj === 'object' && (obj.type == 'quotedstringwrapper' || obj.type == 'bracedstringwrapper')) return StringValue.computeUnicodeString(0, obj.data);
    /**
     * A special character is a
     * part of a field starting with a left brace being at brace depth 0 immediately followed with a backslash,
     * and ending with the corresponding right brace. For instance, in {{\LaTeX}} there is no special
     * character, since \LaTeX is at depth 2. It should be noticed that anything in a special character is
     * considered as being at brace depth 0, even if it is placed between another pair of braces.
     */
    else if (braceDepth == 0 && obj.type == 'braced' && obj.data[0] == '\\') {
      // Found special character
      const escapeString = StringValue.joinSimpleString(obj.data);

      const specialChar = specialChars[escapeString.substring(1)];
      // NOTE(review): the handler is invoked with itself as argument; the handlers live in
      // './specialCharsHandlers' — confirm they ignore their argument.
      if (specialChar) return specialChar(specialChar);

      const diacriticHandler = diacritics[escapeString.charAt(1)];
      if (diacriticHandler) return diacriticHandler(escapeString.substring(2));

      throw new Error("Unexpected escape string: " + escapeString);
    } else if (obj.type == 'braced') {
      // Ordinary brace group: render the content one level deeper and concatenate.
      return renderAll(obj.data, braceDepth + 1);
    }
    else throw new Error("Could not handle string value to turn into Unicode: " + JSON.stringify(obj));
  }

  /**
   * Concatenates the textual content of a node without any interpretation.
   *
   * @param {*} data string, array, object with a string `string` property, 'braced' node,
   *   or object with a string `data` property
   * @returns {string} the joined text
   * @throws {Error} when data has none of these shapes
   */
  static joinSimpleString(data) {
    if (typeof data === 'string') return data;
    else if (data.constructor == Array) return data.map(part => StringValue.joinSimpleString(part)).join('');
    else if (typeof data.string == 'string') return data.string;
    else if (data.type == 'braced') return StringValue.joinSimpleString(data.data);
    // The text is stored in `data` here; `string` is known not to be a string at this point.
    else if (typeof data.data == 'string') return data.data;
    else throw new Error("Could not read escaped string value " + JSON.stringify(data));
  }


  /**
   * Resolves every string declaration in keyvals into a StringValue.
   *
   * @param {Object} keyvals map from string name to parsed wrapper node
   * @returns {Object} map from string name to StringValue
   * @throws {Error} on cyclic or unresolved references, or unexpected nodes
   */
  static resolveStrings(keyvals) {
    const refs = {};
    for (let key in keyvals)
      // Entries may already have been compiled as a dependency of an earlier key.
      if (Object.prototype.hasOwnProperty.call(keyvals, key) && !refs[key])
        refs[key] = StringValue.resolveStringDeclarations(Set.of(), keyvals[key], refs, keyvals);
    return refs;
  }

  /**
   * Builds the StringValue for one declaration, resolving `stringref` parts recursively.
   *
   * @param {Set} referenceStack immutable set of the reference names currently being resolved
   * @param {Object} wrapper 'quotedstringwrapper' or 'bracedstringwrapper' node
   * @param {Object} compiledSoFar cache of resolved StringValues by name; filled in as
   *   references get resolved
   * @param {Object} rawStrings map from string name to parsed wrapper node
   * @returns {StringValue} the resolved value
   * @throws {Error} on a reference cycle, an unknown reference or an unexpected wrapper type
   */
  static resolveStringDeclarations(referenceStack, wrapper, compiledSoFar, rawStrings) {
    if (wrapper.type == 'quotedstringwrapper') {
      return new StringValue({
        type: wrapper.type,
        data: wrapper.data.map((strObj) => {
          // Everything that is not a reference is kept as it is.
          if (!(typeof strObj === 'object' && strObj.stringref)) return strObj;

          const refName = strObj.stringref;
          if (referenceStack.has(refName)) throw new Error("Cycle detected: " + refName);
          if (compiledSoFar[refName]) return compiledSoFar[refName];
          if (!rawStrings[refName]) throw new Error("Unresolved reference: " + JSON.stringify(strObj));
          compiledSoFar[refName] = StringValue.resolveStringDeclarations(referenceStack.add(refName), rawStrings[refName], compiledSoFar, rawStrings);
          return compiledSoFar[refName];
        })
      });
    }
    else if (wrapper.type == 'bracedstringwrapper') return new StringValue(wrapper);
    else throw new Error("Unexpected object to resolve: " + JSON.stringify(wrapper));
  }

}