CartoDB/cartodb20

View on GitHub
services/importer/lib/importer/string_sanitizer.rb

Summary

Maintainability
B
4 hrs
Test Coverage
module CartoDB
  module Importer2
    module StringSanitizer
      module_function

      def normalize(string, transliterate_cyrillic: false)
        return '' if string.nil? || string.empty?

        n = string.force_encoding("UTF-8")
        n.gsub!(/[àáâãäåāă]/,    'a')
        n.gsub!(/æ/,            'ae')
        n.gsub!(/[ďđ]/,          'd')
        n.gsub!(/[çćčĉċ]/,       'c')
        n.gsub!(/[èéêëēęěĕė]/,   'e')
        n.gsub!(/ƒ/,             'f')
        n.gsub!(/[ĝğġģ]/,        'g')
        n.gsub!(/[ĥħ]/,          'h')
        n.gsub!(/[ììíîïīĩĭ]/,    'i')
        n.gsub!(/[įıijĵ]/,        'j')
        n.gsub!(/[ķĸ]/,          'k')
        n.gsub!(/[łľĺļŀ]/,       'l')
        n.gsub!(/[ñńňņʼnŋ]/,      'n')
        n.gsub!(/[òóôõöøōőŏŏ]/,  'o')
        n.gsub!(/œ/,            'oe')
        n.gsub!(/ą/,             'q')
        n.gsub!(/[ŕřŗ]/,         'r')
        n.gsub!(/[śšşŝș]/,       's')
        n.gsub!(/[ťţŧț]/,        't')
        n.gsub!(/[ùúûüūůűŭũų]/,  'u')
        n.gsub!(/ŵ/,             'w')
        n.gsub!(/[ýÿŷ]/,         'y')
        n.gsub!(/[žżź]/,         'z')
        n.gsub!(/[ÀÁÂÃÄÅĀĂ]/,    'A')
        n.gsub!(/Æ/,            'AE')
        n.gsub!(/[ĎĐ]/,          'D')
        n.gsub!(/[ÇĆČĈĊ]/,       'C')
        n.gsub!(/[ÈÉÊËĒĘĚĔĖ]/,   'E')
        n.gsub!(/Ƒ/i,            'F')
        n.gsub!(/[ĜĞĠĢ]/,        'G')
        n.gsub!(/[ĤĦ]/i,         'H')
        n.gsub!(/[ÌÌÍÎÏĪĨĬ]/,    'I')
        n.gsub!(/[IJĴ]/,          'J')
        n.gsub!(/[Ķĸ]/,          'K')
        n.gsub!(/[ŁĽĹĻĿ]/,       'L')
        n.gsub!(/[ÑŃŇŅʼnŊ]/,      'N')
        n.gsub!(/[ÒÓÔÕÖØŌŐŎŎ]/,  'O')
        n.gsub!(/Œ/,            'OE')
        n.gsub!(/Ą/,             'Q')
        n.gsub!(/[ŔŘŖ]/,         'R')
        n.gsub!(/[ŚŠŞŜȘ]/,       'S')
        n.gsub!(/[ŤŢŦȚ]/,        'T')
        n.gsub!(/[ÙÚÛÜŪŮŰŬŨŲ]/,  'U')
        n.gsub!(/Ŵ/,             'W')
        n.gsub!(/[ÝŸŶ]/,         'Y')
        n.gsub!(/[ŽŻŹ]/,         'Z')
        if transliterate_cyrillic
          n.gsub!(/Б/, 'B')
          n.gsub!(/б/, 'b')
          n.gsub!(/В/, 'V')
          n.gsub!(/в/, 'v')
          n.gsub!(/Г/, 'G')
          n.gsub!(/г/, 'g')
          n.gsub!(/Д/, 'D')
          n.gsub!(/д/, 'd')
          n.gsub!(/Е/, 'E')
          n.gsub!(/е/, 'e')
          n.gsub!(/Ё/, 'Yo')
          n.gsub!(/ё/, 'yo')
          n.gsub!(/Ж/, 'Zh')
          n.gsub!(/ж/, 'zh')
          n.gsub!(/З/, 'Z')
          n.gsub!(/з/, 'z')
          n.gsub!(/И/, 'I')
          n.gsub!(/и/, 'i')
          n.gsub!(/Й/, 'J')
          n.gsub!(/й/, 'j')
          n.gsub!(/К/, 'K')
          n.gsub!(/к/, 'k')
          n.gsub!(/Л/, 'L')
          n.gsub!(/л/, 'l')
          n.gsub!(/М/, 'M')
          n.gsub!(/м/, 'm')
          n.gsub!(/Н/, 'N')
          n.gsub!(/н/, 'n')
          n.gsub!(/О/, 'O')
          n.gsub!(/о/, 'o')
          n.gsub!(/П/, 'P')
          n.gsub!(/п/, 'p')
          n.gsub!(/Р/, 'R')
          n.gsub!(/р/, 'r')
          n.gsub!(/С/, 'S')
          n.gsub!(/с/, 's')
          n.gsub!(/Т/, 'T')
          n.gsub!(/т/, 't')
          n.gsub!(/У/, 'U')
          n.gsub!(/у/, 'u')
          n.gsub!(/Ф/, 'F')
          n.gsub!(/ф/, 'f')
          n.gsub!(/Х/, 'X')
          n.gsub!(/х/, 'x')
          n.gsub!(/Ц/, 'Cz')
          n.gsub!(/ц/, 'cz')
          n.gsub!(/Ч/, 'Ch')
          n.gsub!(/ч/, 'ch')
          n.gsub!(/Ш/, 'Sh')
          n.gsub!(/ш/, 'sh')
          n.gsub!(/Щ/, 'Shh')
          n.gsub!(/щ/, 'shh')
          n.gsub!(/Ъ/, '')
          n.gsub!(/ъ/, '')
          n.gsub!(/Ы/, 'Y')
          n.gsub!(/ы/, 'y')
          n.gsub!(/Ь/, '')
          n.gsub!(/ь/, '')
          n.gsub!(/Э/, 'E')
          n.gsub!(/э/, 'e')
          n.gsub!(/Ю/, 'Yu')
          n.gsub!(/ю/, 'yu')
          n.gsub!(/Я/, 'Ya')
          n.gsub!(/я/, 'ya')
        end
        n
      end #normalize

      def legacy_sanitize(string)
        return '' if string.nil? || string.empty?
        normalize(string.downcase.gsub(/<[^>]+>/m,''), transliterate_cyrillic: false)
         .gsub(/&.+?;/,'-')
         .gsub(/[^a-z0-9 _-]/,'-').strip
         .gsub(/\s+/,'-')
         .gsub(/-+/,'-')
         .gsub(/-/,' ').strip
         .gsub(/ /,'-')
         .gsub(/-/,'_')
       end

      def sanitize(string, transliterate_cyrillic: false)
       return '' if string.nil? || string.empty?
       normalize(string.gsub(/<[^>]+>/m,''), transliterate_cyrillic: transliterate_cyrillic)
        .downcase
        .gsub(/&.+?;/,'-')
        .gsub(/[^a-z0-9 _-]/,'-').strip
        .gsub(/\s+/,'-')
        .gsub(/-+/,'-')
        .gsub(/-/,' ').strip
        .gsub(/ /,'-')
        .gsub(/-/,'_')
      end #sanitize
    end # StringSanitizer
  end # Importer2
end # CartoDB