digitalheir/bibliography-js

View on GitHub
depr/bibtex/parser/parser.ne

Summary

Maintainability
Test Coverage
# For more information, see:
#     http://ftp.math.purdue.edu/mirrors/ctan.org/info/bibtex/tamethebeast/ttb_en.pdf
#     http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html
#     http://www.bibtex.org/Format/


#
# See http://ftp.math.purdue.edu/mirrors/ctan.org/info/bibtex/tamethebeast/ttb_en.pdf:
#
#   There is a special entry type named @comment. The main use of such an entry type is to comment a large part
#    of the bibliography easily, since anything outside an entry is already a comment, and commenting out one
#    entry may be achieved by just removing its initial @
#
# ^ this suggests that opening braces within the comment MUST be closed, and the comment ends on the first ending brace
#   that balances with this opening brace
#
#
# Case-independent sequence of non-whitespace, non-brace, non-commas
#

#
# • Values (i.e. right hand sides of each assignment) can be either between curly braces or between
#   double quotes. The main difference is that you can write double quotes in the first case, and not
#   in the second case.
# • For numerical values, curly braces and double quotes can be omitted.
#
# Text that is enclosed in braces is marked not to be touched by any formatting instructions. For instance, when a style defines the title to become depicted using only lowercase, italic letters, the enclosed part will be left untouched. "An Introduction To {BibTeX}" would become "an introduction to the BibTeX" when such a style is applied. Nested braces are ignored.

@{%
// Builds a token matcher ({test: fn}) that accepts lexer token objects whose
// `type` equals `kind`. Loose equality is kept so matching is unchanged.
function tokenOfType(kind) {
  return {
    test: function (tok) {
      return typeof tok === 'object' && tok.type == kind;
    }
  };
}

// Builds a matcher ({literal: text}) for a single literal token.
function literalToken(text) {
  return {literal: text};
}

// True for values whose constructor is Number, and for token objects typed 'number'.
var isNumber = function (x) {
  if (x.constructor === Number) return true;
  return typeof x == 'object' && x.type == 'number';
};

// Matchers for typed tokens.
var tok_id = tokenOfType('id');
var entry_type_bib = tokenOfType('@bib');
var entry_type_string = tokenOfType('@string');
var entry_type_preamble = tokenOfType('@preamble');
var entry_type_comment = tokenOfType('@comment');
var ws = tokenOfType('ws');
var num = {test: isNumber};

// Matchers for single-character literal tokens.
var pound = literalToken('#');
var eq = literalToken('=');
var esc = literalToken('\\');
var paren_l = literalToken('(');
var paren_r = literalToken(')');
var brace_l = literalToken('{');
var brace_r = literalToken('}');
var quote_dbl = literalToken('"');
var comma = literalToken(',');


/**
 * Stores one parsed key/value pair on a bib entry under its lower-cased key.
 *
 * @param obj    entry object with an `_id` and a `fields` dictionary
 * @param keyval object of the form {type: 'keyval', key: string, value: any}
 * @returns `obj` when the field was added; `undefined` when the key was already
 *          present (the first value wins and a warning is logged)
 * @throws Error when `keyval` is not a keyval object
 */
function addToObj(obj, keyval){
  // `!keyval.type == 'keyval'` compared a boolean with a string and was always
  // false, so this guard never fired; compare the type itself.
  if(!keyval || keyval.type !== 'keyval') throw new Error("Expected a keyval object");
  var key = keyval.key.toLowerCase();
  // Own-property test: a plain lookup also sees inherited members such as
  // `constructor` or `toString`, which made fields with those names look like
  // duplicates, and it missed falsy values that had already been stored.
  if(Object.prototype.hasOwnProperty.call(obj.fields, key)) {
    console.log("WARNING: field "+key+ " was already defined on object "+obj._id+". Ignoring this value.");
    return;
  }
  obj.fields[key]=keyval.value;
  return obj;
}

/**
 * Concatenates the textual form of every token in `arr` into one string.
 *
 * Object tokens contribute their `string` property; plain strings and numbers
 * are used as they are.
 *
 * @param arr array of tokens (objects with a `string` field, strings or numbers)
 * @returns the joined text
 * @throws Error when an object token has no truthy `string` field, or when an
 *         element is neither an object, a string nor a number
 */
function joinTokens(arr){
    var pieces = [];
    var idx = 0;
    while(idx < arr.length){
      var tok = arr[idx];
      switch(typeof tok){
        case 'object':
          if(!tok.string) throw new Error("Expected token to have a string field called 'string' in object "+JSON.stringify(tok));
          pieces.push(tok.string);
          break;
        case 'string':
        case 'number':
          pieces.push(tok);
          break;
        default:
          throw new Error("Could not handle token "+JSON.stringify(tok) +" in array "+JSON.stringify(arr));
      }
      idx += 1;
    }
    return pieces.join('');
}

%}

#####################
# A bibfile is a sequence of entries, with comments interspersed
# Note that % "is not a comment character in the database files"
# (ftp://ftp.ctan.org/tex-archive/biblio/bibtex/contrib/doc/btxdoc.pdf)
#####################
# Start rule. Collects the free text found between entries under `comments`
# and the parsed entries, in order, under `entries`.
main  -> non_entry:? (entry non_entry:?):*  {%
  function (data, location, reject) {
    var leading = data[0];
    var comments = leading ? [leading] : [];
    var entries = [];
    // Every repetition is [entry, optional trailing non-entry text].
    data[1].forEach(function (pair) {
      entries.push(pair[0]);
      if (pair[1]) comments.push(pair[1]);
    });
    return {comments: comments, entries: entries};
  }
%}
# Optional whitespace: zero or more `ws` tokens. No postprocessor is attached.
_ -> %ws:*
# Bracket macros. Each matches X between a pair of parentheses or braces and returns
# only the match for X, dropping the delimiters. The *Padded variants also allow
# optional whitespace on both sides of X and drop it as well (hence data[2] rather
# than data[1]).
# NOTE(review): the rules using these macros index one level further into the result
# (data[0][0][0]), which suggests the macro argument arrives wrapped in an array.
parenthesized[X]        -> %paren_l $X %paren_r {% function (data, location, reject) { return data[1]; } %}
braced[X]               -> %brace_l $X %brace_r {% function (data, location, reject) { return data[1]; } %}
parenthesizedPadded[X]  -> %paren_l _ $X _ %paren_r  {% function (data, location, reject) { return data[2]; } %}
bracedPadded[X]         -> %brace_l _ $X _ %brace_r  {% function (data, location, reject) { return data[2]; } %}

#####################
# ENTRY
#####################
# Any one of the four entry-type tokens (@bib, @string, @preamble, @comment).
# Returns the matched token itself. Used by escaped_entry.
entry_decl         -> (%entry_type_bib |
                         %entry_type_string |
                         %entry_type_preamble |
                         %entry_type_comment)            {% function (data, location, reject) { return data[0][0]; } %}

# One complete entry of any of the four kinds. Returns whatever the matched
# sub-rule (bib_entry, string_entry, preamble_entry or comment_entry) produced.
entry              -> (bib_entry | string_entry | preamble_entry | comment_entry)
                      {% function (data, location, reject) { return data[0][0]; }%}

#
# See http://ftp.math.purdue.edu/mirrors/ctan.org/info/bibtex/tamethebeast/ttb_en.pdf:
#
#   There is a special entry type named @comment. The main use of such an entry type is to comment a large part
#    of the bibliography easily, since anything outside an entry is already a comment, and commenting out one
#    entry may be achieved by just removing its initial @
#
# ^ this suggests that opening and closing brackets within the comment MUST be closed, and the comment ends on the first ending bracket
#   that balances with the opening bracket
#
# Contents of a bracketed comment/preamble body: any mix of nested bracketed groups
# (entry_body_comment) and non-bracket tokens.
# The repetition is returned unchanged, so each element is still the one-element
# array produced by the `( ... | ... )` group.
# NOTE(review): elements are not unwrapped here, unlike non_entry, which returns
# data[0][i][0]. Confirm whether consumers expect a flat token list before changing
# the returned shape.
comment                    -> (entry_body_comment|non_bracket):*         {% function (data, location, reject) {
                                                                              return data[0];
                                                                            } %}
# A comment body delimited by either (...) or {...}. Returns the result of the inner
# `comment` rule; the three index steps peel off this rule's group, the alternative,
# and the wrapper around the macro argument.
entry_body_comment         -> (parenthesized[comment] | braced[comment]) {% function (data, location, reject) { return data[0][0][0]; } %}
# Body of a @string entry: exactly one `keyval`, optionally padded with whitespace,
# inside (...) or {...}. Returns the keyval object.
entry_body_string          -> (parenthesizedPadded[keyval] | bracedPadded[keyval]) {% function (data, location, reject) { return data[0][0][0]; } %}
# Body of a bibliography entry: `bib_content`, optionally padded with whitespace,
# inside (...) or {...}. Returns the object built by bib_content.
entry_body_bib             -> (parenthesizedPadded[bib_content] | bracedPadded[bib_content]) {%
  function (data, location, reject) {
    // Peel off this rule's group, the alternative, and the macro-argument wrapper.
    return data[0][0][0];
  }
%}
# Citation key followed by one or more comma-separated key/value pairs; a trailing
# comma after the last pair is allowed.
# Returns {_id: <citation key>, fields: [keyval, ...]} with the pairs in source order.
bib_content                -> key_string _ %comma _ (keyval _ %comma _):* keyval (_ %comma):?
  {% function (data, location, reject) {
       // Each repetition is [keyval, _, comma, _]; only the keyval is kept.
       var fields = data[4].map(function (group) { return group[0]; });
       // The last pair is matched outside the repetition.
       fields.push(data[5]);
       return {_id: data[0], fields: fields};
     } %}

# A bibliography entry such as @article{...}.
# Returns {_id, '@type', fields}, where '@type' is the `string` of the entry-type
# token and `fields` is a dictionary filled through addToObj.
bib_entry          -> %entry_type_bib      _ entry_body_bib       {%
  function (data, location, reject) {
    var body = data[2];
    var entry = {_id: body._id};
    entry['@type'] = data[0].string;
    entry.fields = {};
    body.fields.forEach(function (pair) {
      addToObj(entry, pair);
    });
    return entry;
  }
%}
# The three non-bibliography entry kinds. Each returns {type, data}, where `data` is
# the parsed body: a keyval for @string, and the result of entry_body_comment for
# both @preamble and @comment.
string_entry       -> %entry_type_string   _ entry_body_string    {% function (data, location, reject) { return {type: 'string', data: data[2]}; } %}
preamble_entry     -> %entry_type_preamble _ entry_body_comment   {% function (data, location, reject) { return {type: 'preamble', data: data[2]}; } %}
comment_entry      -> %entry_type_comment  _ entry_body_comment   {% function (data, location, reject) { return {type: 'comment', data: data[2]}; } %}

# A single `key = value` assignment, with optional whitespace around `=`.
# Returns {type: 'keyval', key: <key_string result>, value: <value_string result>}.
keyval             -> key_string _ %eq _ value_string
                      {% function (data, location, reject) {return {type: 'keyval', key: data[0], value: data[4]};}%}

# A brace-delimited value. Nested brace groups are allowed and show up as nested
# {type: 'braced'} nodes inside `data`.
# Returns {type: 'braced', data: [token | braced node, ...]}.
braced_string         ->  %brace_l (non_brace|braced_string):* %brace_r {% function (data, location, reject) {
    var parts = data[1];
    var tkz = [];
    // Index loop rather than for...in, which would also visit any enumerable
    // property added to Array.prototype.
    for (var i = 0; i < parts.length; i++) tkz.push(parts[i][0]);
    return {type:'braced', data:tkz};
  }
%}
# A double-quoted value. It may contain escaped quotes, nested brace groups and any
# token that is neither a quote nor a brace.
# Returns {type: 'quotedstring', data: [...]}. escaped_quote and non_quote_non_brace
# define no postprocessor, so their entries are the arrays of matched parts that
# nearley yields by default; braced_string entries are {type: 'braced'} nodes.
quoted_string        -> %quote_dbl (escaped_quote|non_quote_non_brace|braced_string):* %quote_dbl
                        {% function (data, location, reject) {
                             var parts = data[1];
                             var tks = [];
                             // Index loop rather than for...in, which would also visit any
                             // enumerable property added to Array.prototype.
                             for (var i = 0; i < parts.length; i++) tks.push(parts[i][0]);
                             return {type:'quotedstring', data:tks};
                           }
                        %}
# A backslash followed by a double quote. No postprocessor is attached, so the
# result is the array of the two matched tokens.
escaped_quote        -> %esc %quote_dbl
# Any single token other than a double quote or a brace. No postprocessor is
# attached, so the matched token is not unwrapped here.
non_quote_non_brace  -> (%tok_id |
                        %entry_type_bib |
                        %entry_type_string |
                        %entry_type_preamble |
                        %entry_type_comment |
                        %ws |
                        %num |
                        %pound |
                        %eq |
                        %esc |
                        %paren_l |
                        %paren_r |
                        %comma)

#
# Case-independent sequence of non-whitespace, non-brace, non-commas
#
# One or more stringreftoken pieces, joined with joinTokens and lower-cased.
key_string         -> stringreftoken:+ {% function (data, location, reject) { return joinTokens(data[0]).toLowerCase(); } %}

#
# • Values (i.e. right hand sides of each assignment) can be either between curly braces or between
#   double quotes. The main difference is that you can write double quotes in the first case, and not
#   in the second case.
# • For numerical values, curly braces and double quotes can be omitted.
#
# Right-hand side of an assignment. Either a `#`-concatenated chain of quoted
# strings, string references and numbers, or a single braced string.
# Returns {type: 'quotedstringwrapper', data: [part, ...]} for a chain and
# {type: 'bracedstringwrapper', data: <braced data>} for a braced string.
value_string       ->  (quoted_string_or_ref (_ %pound _ quoted_string_or_ref):* | braced_string)
                      {% function (data, location, reject) {
                             var alt = data[0];
                             if (alt.length != 2) {
                               // Single-element match: the braced_string alternative.
                               var only = alt[0];
                               if (only.type == 'braced') {
                                 return {type: 'bracedstringwrapper', data: only.data};
                               }
                               throw new Error("Don't know how to handle value "+JSON.stringify(only));
                             }
                             // Chain alternative: first part, then [_, #, _, part] groups.
                             var rest = alt[1].map(function (group) { return group[3]; });
                             return {type: 'quotedstringwrapper', data: [alt[0]].concat(rest)};
                         }
                      %}

# One operand of a `#` concatenation: a quoted string, a reference to a string
# name, or a number token. Returns the matched alternative's result as is.
quoted_string_or_ref -> (quoted_string | string_ref | %num) {% function (data, location, reject) {
                                                          // Both branches of the former if/else returned this
                                                          // same value, so the type test was redundant.
                                                          return data[0][0];
                                                        }
                                                     %}

# A reference to a string name: one stringreftoken_n_num piece followed by any
# number of stringreftoken pieces, so the first piece is never a %num token.
# Returns {stringref: <concatenated text>}.
string_ref         -> (stringreftoken_n_num stringreftoken:*)
                      {% function (data, location, reject) { var str = data[0][0]+joinTokens(data[0][1]); return {stringref: str}; } %}

# Non-white non-brace, non-comma
# One piece of a key or string reference. Returns the token's text: the `string`
# field of an object token, or the token itself when it is a string or number.
stringreftoken       -> (%esc | %paren_l | %paren_r | %tok_id | %num  | %entry_type_bib | %entry_type_string | %entry_type_preamble | %entry_type_comment) {%
  function (data, location, reject) {
    var tok = data[0][0];
    if (typeof tok == 'object') {
      // Report the token itself; the old message concatenated the wrapping array
      // and lacked a space ("Expected [object Object]to have ...").
      if (!tok.string) throw new Error("Expected " + JSON.stringify(tok) + " to have a 'string' field");
      return tok.string;
    }
    if (typeof tok != 'string' && typeof tok != 'number') throw new Error("Expected " + tok + " to be a string");
    return tok;
  }
%}
# Same as stringreftoken but without the %num alternative. Returns the token's
# text: the `string` field of an object token, or the token itself when it is a
# string or number.
stringreftoken_n_num -> (%esc | %paren_l | %paren_r | %tok_id |         %entry_type_bib | %entry_type_string | %entry_type_preamble | %entry_type_comment) {%
  function (data, location, reject) {
    var tok = data[0][0];
    if (typeof tok == 'object') {
      // Report the token itself; the old message concatenated the wrapping array
      // and lacked a space ("Expected [object Object]to have ...").
      if (!tok.string) throw new Error("Expected " + JSON.stringify(tok) + " to have a 'string' field");
      return tok.string;
    }
    if (typeof tok != 'string' && typeof tok != 'number') throw new Error("Expected " + tok + " to be a string");
    return tok;
  }
%}
# Any single token other than an opening or closing brace. Returns the matched token.
non_brace            -> (%esc | %paren_l | %paren_r | %tok_id | %quote_dbl | %ws | %num | %comma | %entry_type_bib | %entry_type_string | %entry_type_preamble | %entry_type_comment | %pound | %eq)
{% function (data, location, reject) {
  return data[0][0];
}
%}
# Any single token other than a brace or a parenthesis (the same list as non_brace
# minus %paren_l and %paren_r). Returns the matched token.
non_bracket          -> (%esc |                       %tok_id | %quote_dbl | %ws | %num | %comma | %entry_type_bib | %entry_type_string | %entry_type_preamble | %entry_type_comment | %pound | %eq)
{% function (data, location, reject) {
  return data[0][0];
}
%}

#####################
# NON ENTRY
#####################
# Free text outside any entry: one or more escaped entry declarations, escaped
# backslashes, escaped other tokens, or plain non-escape tokens.
# Returns the flat list of what each matched alternative produced.
non_entry           -> (escaped_entry | escaped_escape | escaped_non_esc_outside_entry | non_esc_outside_entry):+
{% function (data, location, reject) {
  // Each repetition is a one-element group; unwrap it.
  return data[0].map(function (group) { return group[0]; });
  }
%}
# Two consecutive backslash tokens outside an entry; returns a single '\' character.
escaped_escape        -> %esc %esc {% function (data, location, reject) { return '\\'; } %}
# A backslash followed by an entry-type token outside an entry. Returns the
# entry-type token (the entry_decl result); the backslash is dropped.
escaped_entry         -> %esc entry_decl
                         {% function (data, location, reject) { return data[1]; } %}
# A backslash followed by any other non-escape token outside an entry. Returns a
# string made of '\' plus the following token.
# NOTE(review): data[1] is whatever non_esc_outside_entry returned. If that is a
# token object rather than a string or number, the `+` below stringifies it through
# its toString; confirm that the lexer's token objects stringify to their text.
escaped_non_esc_outside_entry -> %esc non_esc_outside_entry
                         {% function (data, location, reject) { return '\\' + data[1]; } %}
# Any single token that may appear outside an entry, excluding the backslash and
# the entry-type tokens (those are covered by the escaped_* rules and by `entry`).
# Returns the matched token.
non_esc_outside_entry -> (%tok_id |
                         %ws |
                         %num |
                         %pound |
                         %eq |
                         %paren_l |
                         %paren_r |
                         %brace_l |
                         %brace_r |
                         %quote_dbl |
                         %comma)
                         {% function (data, location, reject) {
                            //console.log("ooutside_entry",data[0][0]);
                            return data[0][0];
                          } %}