lib/regexptree.rb
=begin
= RegexpTree
RegexpTree represents regular expression.
It can be converted to Regexp.
== class methods
--- RegexpTree.str(string)
returns an instance of RegexpTree which only matches ((|string|))
--- RegexpTree.alt(*regexp_trees)
returns an instance of RegexpTree which is alternation of ((|regexp_trees|)).
--- RegexpTree.seq(*regexp_trees)
returns an instance of RegexpTree which is concatination of ((|regexp_trees|)).
--- RegexpTree.rep(regexp_tree, min=0, max=nil, greedy=true)
returns an instance of RegexpTree which is reptation of ((|regexp_tree|)).
--- RegexpTree.charclass(natset)
returns an instance of RegexpTree which matches characters in ((|natset|)).
#--- RegexpTree.linebeg
#--- RegexpTree.lineend
#--- RegexpTree.strbeg
#--- RegexpTree.strend
#--- RegexpTree.strlineend
#--- RegexpTree.word_boundary
#--- RegexpTree.non_word_boundary
#--- RegexpTree.previous_match
#--- RegexpTree.backref(n)
== methods
--- regexp(anchored=false)
convert to Regexp.
If ((|anchored|)) is true, the Regexp is anchored by (({\A})) and (({\z})).
--- to_s
convert to String.
--- empty_set?
returns true iff self never matches.
--- empty_sequence?
returns true iff self only matches empty string.
--- self | other
returns alternation of ((|self|)) and ((|other|)).
--- self + other
returns concatination of ((|self|)) and ((|other|)).
--- self * n
returns ((|n|)) times repetation of ((|self|)).
--- rep(min=0, max=nil, greedy=true)
returns ((|min|)) to ((|max|)) times repetation of ((|self|)).
#--- closure(greedy=true)
#--- positive_closure(greedy=true)
#--- optional(greedy=true)
#--- ntimes(min, max=min, greedy=true)
#--- nongreedy_rep(min=0, max=nil)
#--- nongreedy_closure
#--- nongreedy_positive_closure
#--- nongreedy_optional
#--- nongreedy_ntimes(min, max=min)
=end
require 'prettyprint'
require 'natset'
class RegexpTree
@curr_prec = 1
def RegexpTree.inherited(c)
return if c.superclass != RegexpTree
c.const_set(:Prec, @curr_prec)
@curr_prec += 1
end
def parenthesize(target)
if target::Prec <= self.class::Prec
self
else
Paren.new(self)
end
end
def pretty_print(pp)
case_insensitive = case_insensitive?
pp.group(3, '%r{', '}x') {
(case_insensitive ? self.downcase : self).pretty_format(pp)
}
pp.text 'i' if case_insensitive
end
def inspect
case_insensitive = case_insensitive? ? "i" : ""
r = PrettyPrint.singleline_format('') {|out|
(case_insensitive ? self.downcase : self).pretty_format(out)
}
if %r{/} =~ r
"%r{#{r}}#{case_insensitive}"
else
"%r/#{r}/#{case_insensitive}"
end
end
def regexp(anchored=false)
if case_insensitive?
r = downcase
opt = Regexp::IGNORECASE
else
r = self
opt = 0
end
r = RegexpTree.seq(RegexpTree.strbeg, r, RegexpTree.strend) if anchored
Regexp.compile(
PrettyPrint.singleline_format('') {|out|
r.pretty_format(out)
},
opt)
end
def to_s
PrettyPrint.singleline_format('') {|out|
# x flag is not required because all whitespaces are escaped.
if case_insensitive?
out.text '(?i-m:'
downcase.pretty_format(out)
out.text ')'
else
out.text '(?-im:'
pretty_format(out)
out.text ')'
end
}
end
def empty_set?
false
end
def empty_sequence?
false
end
def |(other)
RegexpTree.alt(self, other)
end
def RegexpTree.alt(*rs)
rs2 = []
rs.each {|r|
if r.empty_set?
next
elsif Alt === r
rs2.concat r.rs
elsif CharClass === r
if CharClass === rs2.last
rs2[-1] = CharClass.new(rs2.last.natset + r.natset)
else
rs2 << r
end
else
rs2 << r
end
}
case rs2.length
when 0; EmptySet
when 1; rs2.first
else; Alt.new(rs2)
end
end
class Alt < RegexpTree
def initialize(rs)
@rs = rs
end
attr_reader :rs
def empty_set?
@rs.empty?
end
def case_insensitive?
@rs.all? {|r| r.case_insensitive?}
end
def multiline_insensitive?
@rs.all? {|r| r.multiline_insensitive?}
end
def downcase
Alt.new(@rs.map {|r| r.downcase})
end
def pretty_format(out)
if @rs.empty?
out.text '(?!)'
else
out.group {
@rs.each_with_index {|r, i|
unless i == 0
out.text '|'
out.breakable ''
end
r.parenthesize(Alt).pretty_format(out)
}
}
end
end
end
EmptySet = Alt.new([])
def +(other)
RegexpTree.seq(self, other)
end
def RegexpTree.seq(*rs)
rs2 = []
rs.each {|r|
if r.empty_sequence?
next
elsif Seq === r
rs2.concat r.rs
elsif r.empty_set?
return EmptySet
else
rs2 << r
end
}
case rs2.length
when 0; EmptySequence
when 1; rs2.first
else; Seq.new(rs2)
end
end
class Seq < RegexpTree
def initialize(rs)
@rs = rs
end
attr_reader :rs
def empty_sequence?
@rs.empty?
end
def case_insensitive?
@rs.all? {|r| r.case_insensitive?}
end
def multiline_insensitive?
@rs.all? {|r| r.multiline_insensitive?}
end
def downcase
Seq.new(@rs.map {|r| r.downcase})
end
def pretty_format(out)
out.group {
@rs.each_with_index {|r, i|
unless i == 0
out.group {out.breakable ''}
end
r.parenthesize(Seq).pretty_format(out)
}
}
end
end
EmptySequence = Seq.new([])
def *(n)
case n
when Integer
RegexpTree.rep(self, n, n)
when Range
RegexpTree.rep(self, n.first, n.last - (n.exclude_end? ? 1 : 0))
else
raise TypeError.new("Integer or Range expected: #{n}")
end
end
def nongreedy_closure() RegexpTree.rep(self, 0, nil, false) end
def nongreedy_positive_closure() RegexpTree.rep(self, 1, nil, false) end
def nongreedy_optional() RegexpTree.rep(self, 0, 1, false) end
def nongreedy_ntimes(m, n=m) RegexpTree.rep(self, m, n, false) end
def nongreedy_rep(m=0, n=nil) RegexpTree.rep(self, m, n, false) end
def closure(greedy=true) RegexpTree.rep(self, 0, nil, greedy) end
def positive_closure(greedy=true) RegexpTree.rep(self, 1, nil, greedy) end
def optional(greedy=true) RegexpTree.rep(self, 0, 1, greedy) end
def ntimes(m, n=m, greedy=true) RegexpTree.rep(self, m, n, greedy) end
def rep(m=0, n=nil, greedy=true) RegexpTree.rep(self, m, n, greedy) end
def RegexpTree.rep(r, m=0, n=nil, greedy=true)
return EmptySequence if m == 0 && n == 0
return r if m == 1 && n == 1
return EmptySequence if r.empty_sequence?
if r.empty_set?
return m == 0 ? EmptySequence : EmptySet
end
Rep.new(r, m, n, greedy)
end
class Rep < RegexpTree
def initialize(r, m=0, n=nil, greedy=true)
@r = r
@m = m
@n = n
@greedy = greedy
end
def case_insensitive?
@r.case_insensitive?
end
def multiline_insensitive?
@r.multiline_insensitive?
end
def downcase
Rep.new(@r.downcase, @m, @n, @greedy)
end
def pretty_format(out)
@r.parenthesize(Elt).pretty_format(out)
case @m
when 0
case @n
when 0
out.text '{0}'
when 1
out.text '?'
when nil
out.text '*'
else
out.text "{#{@m},#{@n}}"
end
when 1
case @n
when 1
when nil
out.text '+'
else
out.text "{#{@m},#{@n}}"
end
else
if @m == @n
out.text "{#{@m}}"
else
out.text "{#{@m},#{@n}}"
end
end
out.text '?' unless @greedy
end
end
class Elt < RegexpTree
end
def RegexpTree.charclass(natset)
if natset.empty?
EmptySet
else
CharClass.new(natset)
end
end
class CharClass < Elt
None = NatSet.empty
Any = NatSet.universal
NL = NatSet.new(?\n)
NonNL = ~NL
Word = NatSet.new(?0..?9, ?A..?Z, ?_, ?a..?z)
NonWord = ~Word
Space = NatSet.new(?t, ?\n, ?\f, ?\r, ?\s)
NonSpace = ~Space
Digit = NatSet.new(?0..?9)
NonDigit = ~Digit
UpAlpha = NatSet.new(?A..?Z)
LowAlpha = NatSet.new(?a..?z)
def initialize(natset)
@natset = natset
end
attr_reader :natset
def empty_set?
@natset.empty?
end
def case_insensitive?
up = @natset & UpAlpha
low = @natset & LowAlpha
return false if up.es.length != low.es.length
up.es.map! {|ch|
ch - 0x41 + 0x61 # ?A + ?a
}
up == low
end
def multiline_insensitive?
@natset != NonNL
end
def downcase
up = @natset & UpAlpha
up.es.map! {|ch|
ch - 0x41 + 0x61 # ?A + ?a
}
CharClass.new((@natset - UpAlpha) | up)
end
def pretty_format(out)
case @natset
when None; out.text '(?!)'
when Any; out.text '[\s\S]'
when NL; out.text '\n'
when NonNL; out.text '.'
when Word; out.text '\w'
when NonWord; out.text '\W'
when Space; out.text '\s'
when NonSpace; out.text '\S'
when Digit; out.text '\d'
when NonDigit; out.text '\D'
else
if val = @natset.singleton?
out.text encode_elt(val)
else
if @natset.open?
neg_mark = '^'
es = (~@natset).es
else
neg_mark = ''
es = @natset.es.dup
end
r = ''
until es.empty?
if es[0] + 1 == es[1]
r << encode_elt(es[0])
elsif es[0] + 2 == es[1]
r << encode_elt(es[0]) << encode_elt(es[1] - 1)
else
r << encode_elt(es[0]) << '-' << encode_elt(es[1] - 1)
end
es.shift
es.shift
end
out.text "[#{neg_mark}#{r}]"
end
end
end
def encode_elt(e)
case e
when 0x09; '\t'
when 0x0a; '\n'
when 0x0d; '\r'
when 0x0c; '\f'
when 0x0b; '\v'
when 0x07; '\a'
when 0x1b; '\e'
#when ?!, ?", ?%, ?&, ?', ?,, ?:, ?;, ?<, ?=, ?>, ?/, ?0..?9, ?@, ?A..?Z, ?_, ?`, ?a..?z, ?~
when 0x21, 0x22, 0x25, 0x26, 0x27, 0x2c, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x2f, 0x30..0x39, 0x40, 0x41..0x5a, 0x5f, 0x60, 0x61..0x7a, 0x7e
sprintf("%c", e)
else
sprintf("\\x%02x", e)
end
end
end
def RegexpTree.linebeg() Special.new('^') end
def RegexpTree.lineend() Special.new('$') end
def RegexpTree.strbeg() Special.new('\A') end
def RegexpTree.strend() Special.new('\z') end
def RegexpTree.strlineend() Special.new('\Z') end
def RegexpTree.word_boundary() Special.new('\b') end
def RegexpTree.non_word_boundary() Special.new('\B') end
def RegexpTree.previous_match() Special.new('\G') end
def RegexpTree.backref(n) Special.new("\\#{n}") end
class Special < Elt
def initialize(str)
@str = str
end
def case_insensitive?
true
end
def multiline_insensitive?
true
end
def downcase
self
end
def pretty_format(out)
out.text @str
end
end
def group() Paren.new(self, '') end
def paren() Paren.new(self) end
def lookahead() Paren.new(self, '?=') end
def negative_lookahead() Paren.new(self, '?!') end
# (?ixm-ixm:...)
# (?>...)
class Paren < Elt
def initialize(r, mark='?:')
@mark = mark
@r = r
end
def case_insensitive?
# xxx: if @mark contains "i"...
@r.case_insensitive?
end
def multiline_insensitive?
# xxx: if @mark contains "m"...
@r.multiline_insensitive?
end
def downcase
Paren.new(@r.downcase, @mark)
end
def pretty_format(out)
out.group(1 + @mark.length, "(#@mark", ')') {
@r.pretty_format(out)
}
end
end
# def RegexpTree.comment(str) ... end # (?#...)
def RegexpTree.str(str)
ccs = []
str.each_byte {|ch|
ccs << CharClass.new(NatSet.new(ch))
}
seq(*ccs)
end
end