tom-lord/regexp-examples

View on GitHub
lib/regexp-examples/unicode_char_ranges.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'pstore'
require 'singleton'

module RegexpExamples
  # Interface to retrieve the character sets that match a regex named property.
  # E.g. `/\p{Alpha}/`
  # These matching values are stored, compressed, in a PStore. The store file is
  # selected by the Unicode version reported by the running ruby.
  # Singleton that looks up, by key, the compressed codepoint list kept in a
  # PStore file and expands it into an array of single-character strings.
  class UnicodeCharRanges
    include Singleton

    # These values were generated by: scripts/unicode_lister.rb
    # Note: Only the first 128 results are listed, for performance.
    # Also, some groups seem to have no matches (weird!)
    STORE_FILENAME = "unicode_ranges_#{RbConfig::CONFIG['UNICODE_VERSION']}.pstore".freeze

    attr_reader :range_store

    def initialize
      @range_store = PStore.new(unicode_ranges_file)
    end

    # Reads the entry stored under +key+ (inside a read-only transaction) and
    # expands it into characters.
    #
    # @param key [Object] the key the ranges were stored under
    # @return [Array<String>] one single-character string per codepoint;
    #   empty when the store has no entry for +key+
    def get(key)
      stored = range_store.transaction(true) { range_store[key] }
      ranges_to_unicode(stored)
    end

    alias [] get

    private

    # The method is written like this to future-proof it a little,
    # i.e. the gem won't completely break for a new ruby version release:
    # when there is no store for the exact Unicode version, the newest store
    # that is not newer than that version is used instead.
    #
    # @return [String, nil] path of the chosen store, or nil if none qualifies
    def unicode_ranges_file
      db_path = File.join(__dir__, '../../db')
      latest_compatible_store(Dir["#{db_path}/*.pstore"], RbConfig::CONFIG['UNICODE_VERSION'])
    end

    # Picks, from +files+, the store with the highest version that does not
    # exceed +unicode_version+. Versions are compared numerically, segment by
    # segment, so "9.0.0" correctly sorts before "10.0.0" (a plain string
    # comparison of the paths gets this wrong).
    #
    # @param files [Array<String>] candidate store paths
    # @param unicode_version [String, nil] e.g. "12.1.0"
    # @return [String, nil] the chosen path, or nil when nothing qualifies
    def latest_compatible_store(files, unicode_version)
      return nil unless unicode_version && Gem::Version.correct?(unicode_version)

      newest_allowed = Gem::Version.new(unicode_version)
      files
        .map { |file| [store_version(file), file] }
        .select { |version, _file| version && version <= newest_allowed }
        .max_by(&:first)
        &.last
    end

    # Extracts the version embedded in a store filename.
    #
    # @param file [String] e.g. "/some/dir/unicode_ranges_12.1.0.pstore"
    # @return [Gem::Version, nil] nil when the name does not follow the
    #   "unicode_ranges_<version>.pstore" pattern
    def store_version(file)
      version = File.basename(file)[/\Aunicode_ranges_(.+)\.pstore\z/, 1]
      Gem::Version.new(version) if version && Gem::Version.correct?(version)
    end

    # Expands the compressed list into one string per codepoint. Each entry is
    # either a bare Integer (a single codepoint, stored that way to improve
    # data compression) or a collection of Integers such as a Range.
    #
    # Example:
    #   ranges_to_unicode([65, 97..99]) # => ["A", "a", "b", "c"]
    #   ranges_to_unicode(nil)          # => []
    #
    # @param ranges [Array<Integer, Range>, nil]
    # @return [Array<String>]
    def ranges_to_unicode(ranges)
      (ranges || [])
        .flat_map { |entry| entry.is_a?(Integer) ? [entry] : entry.to_a }
        .map { |codepoint| [codepoint].pack('U') }
    end
  end
end