# lib/mathml2asciimath/m2a.rb
require "nokogiri"
require "htmlentities"
require "pp"
module MathML2AsciiMath
# Converts a MathML document into an AsciiMath string.
#
# @param xml [String] MathML markup
# @return [String] the AsciiMath produced by +parse+ for the document root,
#   with every blank character turned into a plain space, the text
#   Unicode-normalized, and runs of spaces collapsed to one
def self.m2a(xml)
  # The noblanks parse option drops non-significant whitespace in MathML.
  document = Nokogiri::XML.parse(xml) { |config| config.noblanks }
  asciimath = parse(document.root)

  # Tidies up whitespace left behind by content such as <mtext> </mtext>.
  asciimath.gsub(/[[:blank:]]/, " ").unicode_normalize.squeeze(" ")
end
# Maps literal input text to its AsciiMath spelling. Values are inserted
# verbatim (they are looked up through String#gsub's Hash form, so
# backslashes in a value are NOT treated as replacement escapes).
ASCIIMATH_SYMBOLS = {
  # Greek letters
  "\u03b1" => "alpha",
  "\u03b2" => "beta",
  "\u03b3" => "gamma",
  "\u0393" => "Gamma",
  "\u03b4" => "delta",
  "\u0394" => "Delta",
  "\u2206" => "Delta",
  "\u03b5" => "epsilon",
  "\u025b" => "varepsilon",
  "\u03b6" => "zeta",
  "\u03b7" => "eta",
  "\u03b8" => "theta",
  "\u0398" => "Theta",
  "\u03d1" => "vartheta",
  "\u03b9" => "iota",
  "\u03ba" => "kappa",
  "\u03bb" => "lambda",
  "\u039b" => "Lambda",
  "\u03bc" => "mu",
  "\u03bd" => "nu",
  "\u03be" => "xi",
  "\u039e" => "Xi",
  "\u03c0" => "pi",
  "\u03a0" => "Pi",
  "\u03c1" => "rho",
  "\u03c2" => "sigma", # final sigma; was mistakenly mapped to "beta"
  "\u03c3" => "sigma",
  "\u03a3" => "Sigma",
  "\u03c4" => "tau",
  "\u03c5" => "upsilon",
  "\u03c6" => "phi",
  "\u03a6" => "Phi",
  "\u03d5" => "varphi",
  "\u03c7" => "chi",
  "\u03c8" => "psi",
  "\u03a8" => "Psi",
  "\u03c9" => "omega",
  "\u03a9" => "Omega", # capital omega; was mistakenly mapped to "omega"

  # Operation symbols
  "\u22c5" => "*",
  "\u2219" => "*",
  "\u00b7" => "*",
  "\u2217" => "**",
  "\u22c6" => "***",
  "/" => "//",
  "\\" => "\\\\", # one backslash becomes two
  "\u00d7" => "xx",
  "\u22c9" => "|><",
  "\u22ca" => "><|",
  "\u22c8" => "|><|",
  "\u00f7" => "-:",
  "\u2218" => "@",
  "\u2295" => "o+",
  "\u2a01" => "o+",
  "\u2297" => "ox",
  "\u2299" => "o.",
  "\u2211" => "sum",
  "\u220f" => "prod",
  "\u2227" => "^^",
  "\u22c0" => "^^^",
  "\u2228" => "vv",
  "\u22c1" => "vvv",
  "\u2229" => "nn",
  "\u22c2" => "nnn",
  "\u222a" => "uu",
  "\u22c3" => "uuu",

  # Relation symbols
  "\u2260" => "!=",
  "\u2264" => "<=",
  "\u2265" => ">=",
  "\u227a" => "-<",
  "\u227b" => ">-",
  "\u2aaf" => "-<=",
  "\u2ab0" => ">-=",
  "\u2208" => "in",
  "\u2209" => "!in",
  "\u2282" => "sub",
  "\u2283" => "sup",
  "\u2286" => "sube",
  "\u2287" => "supe",
  "\u2261" => "-=",
  "\u2245" => "~=",
  "\u2248" => "~~",
  "\u221d" => "prop",

  # Logical symbols
  "\u00ac" => "not",
  "\u21d2" => "=>",
  "\u21d4" => "<=>",
  "\u2200" => "AA",
  "\u2203" => "EE",
  "\u22a5" => "_|_",
  "\u22a4" => "TT",
  "\u22a2" => "|--",
  "\u22a8" => "|==",

  # Angle brackets
  "\u2329" => "(:",
  "\u232a" => ":)",
  "\u27e8" => "<<",
  "\u27e9" => ">>",

  # Miscellaneous symbols
  "\u222b" => "int",
  "\u222e" => "oint",
  "\u2202" => "del",
  "\u2207" => "grad",
  "\u00b1" => "+-",
  "\u2205" => "O/",
  "\u221e" => "oo",
  "\u2135" => "aleph",
  "\u2234" => ":.",
  "\u2235" => ":'",
  "\u2220" => "/_",
  "\u25b3" => "/_\\",
  "\u2032" => "'",
  "~" => "tilde",
  "\u00a0\u00a0\u00a0\u00a0" => "qquad",
  "\u00a0\u00a0" => "quad",
  "\u00a0" => "\\ ",
  "\u2322" => "frown",
  "\u22ef" => "cdots",
  "\u22ee" => "vdots",
  "\u22f1" => "ddots",
  "\u22c4" => "diamond",
  "\u25a1" => "square",
  "\u2751" => "square",
  "\u230a" => "|__",
  "\u230b" => "__|",
  "\u2308" => "|~",
  "\u2309" => "~|",
  "\u2102" => "CC",
  "\u2115" => "NN",
  "\u211a" => "QQ",
  "\u211d" => "RR",
  "\u2124" => "ZZ",

  # Arrows
  "\u2191" => "uarr",
  "\u2193" => "darr",
  "\u2190" => "larr",
  "\u2194" => "harr",
  "\u21d0" => "lArr",
  "\u2192" => "->",
  "\u21a3" => ">->",
  "\u21a0" => "->>",
  "\u2916" => ">->>",
  "\u21a6" => "|->",

  # Punctuation and invisible characters
  "\u2026" => "...",
  "\u2212" => "-",
  "\u2061" => "", # function application
  "\u2028" => " ", # line separator
  "\u2029" => " ", # paragraph separator
}.freeze

# One alternation over every key. Longer keys come first so that a run of
# no-break spaces is matched as "qquad"/"quad" before the single-space rule.
ASCIIMATH_SYMBOL_PATTERN = Regexp.union(
  ASCIIMATH_SYMBOLS.keys.sort_by { |key| -key.length },
).freeze

# Rewrites Unicode math characters (and the ASCII characters "/", "\" and
# "~") in a piece of text as their AsciiMath spellings.
#
# The substitution is done in a single pass, so text emitted by one rule is
# never re-processed by another (e.g. the "~~" produced for U+2248 is not
# turned into "tildetilde" by the "~" rule).
#
# @param xml [String] decoded text content of a MathML node
# @return [String] a new string with every known symbol replaced; characters
#   without an entry in ASCIIMATH_SYMBOLS are left unchanged
def self.encodechars(xml)
  xml.gsub(ASCIIMATH_SYMBOL_PATTERN, ASCIIMATH_SYMBOLS)
end
# Converts each node with +parse+, strips surrounding whitespace from every
# result, and joins the results into one string.
#
# @param children [Enumerable] nodes to convert
# @param delimiter [String] text placed between converted nodes
# @return [String] the joined AsciiMath fragments
def self.join_parsed_children(children, delimiter = " ")
  fragments = children.map { |child| parse(child).strip }
  fragments.join(delimiter)
end
# Marks that, when found as the second child of <munder>, select a
# dedicated AsciiMath accent instead of the generic "underset".
MUNDER_ACCENTS = { "\u0332" => "ul", "\u23df" => "ubrace" }.freeze

# Marks that, when found as the second child of <mover>, select a dedicated
# AsciiMath accent instead of the generic "overset". The keys are compared
# against text that has already been through encodechars, which is why the
# right arrow appears as "->" rather than U+2192.
MOVER_ACCENTS = {
  "\u005e" => "hat",
  "\u00af" => "bar",
  "->" => "vec",
  "." => "dot",
  ".." => "ddot",
  "\u23de" => "obrace",
}.freeze

# Recursively converts one MathML node into an AsciiMath fragment.
#
# Text nodes are entity-decoded and passed through encodechars. Element
# nodes are dispatched on their name with any "prefix:" removed. Elements
# that are not handled are returned as their own XML wrapped in a <math>
# element, so nothing is silently dropped.
#
# @param node [Nokogiri::XML::Node] text or element node to convert
# @return [String] the AsciiMath fragment for +node+
def self.parse(node)
  return encodechars(HTMLEntities.new.decode(node.text)) if node.text?

  # Converts a script/limit operand, parenthesising it unless it is exactly
  # one character long.
  script = lambda do |child|
    text = parse(child)
    text.length == 1 ? text : "(#{text})"
  end

  case node.name.sub(/^[^:]*:/, "")
  when "math", "semantics"
    join_parsed_children(node.elements)
  when "annotation"
    ""
  when "mrow"
    row = join_parsed_children(node.elements)
    parent_name = node.parent.name.sub(/^[^:]*:/, "")
    # Group the row when it is an operand of one of these constructs.
    %w[mfrac msub munder munderover].include?(parent_name) ? "(#{row})" : row
  when "mfenced"
    opening = node["open"] || "("
    closing = node["close"] || ")"
    # TODO: the "separators" attribute is currently ignored.
    "#{opening}#{join_parsed_children(node.elements, ',')}#{closing}"
  when "msqrt"
    "sqrt(#{join_parsed_children(node.elements)})"
  when "mfrac"
    "(#{parse(node.elements[0])})/(#{parse(node.elements[1])})"
  when "msup"
    sup = script.call(node.elements[1])
    base = parse(node.elements[0]).gsub(/ $/, "")
    "#{base}^#{sup}"
  when "msub"
    sub = script.call(node.elements[1])
    base = parse(node.elements[0]).gsub(/ $/, "")
    "#{base}_#{sub}"
  when "munderover", "msubsup"
    sub = script.call(node.elements[1])
    sup = script.call(node.elements[2])
    base = parse(node.elements[0]).gsub(/ $/, "")
    "#{base}_#{sub}^#{sup}"
  when "munder"
    mark = parse(node.elements[1]).strip
    accent = MUNDER_ACCENTS[mark]
    base = parse(node.elements[0])
    accent ? "#{accent} #{base}" : "underset(#{mark})(#{base})"
  when "mover"
    mark = parse(node.elements[1]).strip
    accent = MOVER_ACCENTS[mark]
    base = parse(node.elements[0])
    accent ? "#{accent} #{base}" : "overset(#{mark})(#{base})"
  when "mtable", "mtr"
    "[#{join_parsed_children(node.elements, ',')}]"
  when "mtd"
    join_parsed_children(node.elements, ",")
  when "mn", "mtext"
    join_parsed_children(node.children, "")
  when "mi", "mstyle"
    # FIXME: an earlier revision padded some <mi> content with spaces
    # (when it matched /[^a-zA-Z0-9',]|[a-z][a-z]/) because "Word is
    # conflating operators and operands"; that padding is disabled and it
    # is unclear whether it is still needed.
    join_parsed_children(node.children)
  when "mo"
    operator = join_parsed_children(node.children)
    # Only fence="true" marks a fence. The attribute value is a string, so
    # a bare truthiness check would also treat fence="false" as a fence.
    node["fence"] == "true" ? operator : " #{operator} "
  else
    "<math xmlns=\"http://www.w3.org/1998/Math/MathML\">#{node.to_xml}</math>"
  end
end
end