services/importer/lib/importer/string_sanitizer.rb
module CartoDB
  module Importer2
    # Stateless helpers that transliterate accented Latin (and optionally
    # Cyrillic) characters to ASCII and reduce free text to a lowercase,
    # underscore-separated token made only of [a-z0-9_].
    module StringSanitizer
      # Accented Latin characters grouped by their ASCII replacement, expanded
      # into a flat { character => replacement } lookup table.
      #
      # U+0133/U+0132 (ij ligatures), U+0149 (n preceded by apostrophe) and
      # U+0140/U+013F (l with middle dot) are written as escapes on purpose:
      # their compatibility decompositions contain plain ASCII letters, and a
      # decomposed literal in this table would rewrite every "i", "j" or "n".
      #
      # NOTE(review): "ą"/"Ą" map to "q"/"Q" and "į"/"ı" map to "j". These look
      # like transliteration mistakes ("a"/"A" and "i" would be expected) but
      # are kept unchanged because existing callers may rely on the current
      # output -- confirm before correcting.
      LATIN_MAP = {
        'a'  => 'àáâãäåāă',
        'ae' => 'æ',
        'd'  => 'ďđ',
        'c'  => 'çćčĉċ',
        'e'  => 'èéêëēęěĕė',
        'f'  => 'ƒ',
        'g'  => 'ĝğġģ',
        'h'  => 'ĥħ',
        'i'  => 'ìíîïīĩĭ',
        'j'  => "įı\u0133ĵ",
        'k'  => 'ķĸ',
        'l'  => "łľĺļ\u0140",
        'n'  => "ñńňņ\u0149ŋ",
        'o'  => 'òóôõöøōőŏ',
        'oe' => 'œ',
        'q'  => 'ą',
        'r'  => 'ŕřŗ',
        's'  => 'śšşŝș',
        't'  => 'ťţŧț',
        'u'  => 'ùúûüūůűŭũų',
        'w'  => 'ŵ',
        'y'  => 'ýÿŷ',
        'z'  => 'žżź',
        'A'  => 'ÀÁÂÃÄÅĀĂ',
        'AE' => 'Æ',
        'D'  => 'ĎĐ',
        'C'  => 'ÇĆČĈĊ',
        'E'  => 'ÈÉÊËĒĘĚĔĖ',
        'F'  => 'Ƒ',
        'G'  => 'ĜĞĠĢ',
        'H'  => 'ĤĦ',
        'I'  => 'ÌÍÎÏĪĨĬ',
        'J'  => "\u0132Ĵ",
        'K'  => 'Ķ',
        'L'  => "ŁĽĹĻ\u013F",
        'N'  => 'ÑŃŇŅŊ',
        'O'  => 'ÒÓÔÕÖØŌŐŎ',
        'OE' => 'Œ',
        'Q'  => 'Ą',
        'R'  => 'ŔŘŖ',
        'S'  => 'ŚŠŞŜȘ',
        'T'  => 'ŤŢŦȚ',
        'U'  => 'ÙÚÛÜŪŮŰŬŨŲ',
        'W'  => 'Ŵ',
        'Y'  => 'ÝŸŶ',
        'Z'  => 'ŽŻŹ'
      }.each_with_object({}) do |(replacement, chars), map|
        chars.each_char { |char| map[char] = replacement }
      end.freeze

      # Russian alphabet => Latin transliteration. Hard and soft signs are
      # dropped (mapped to the empty string).
      CYRILLIC_MAP = {
        # Written as escapes because Cyrillic A/a (U+0410/U+0430) are visually
        # identical to Latin A/a.
        "\u0410" => 'A', "\u0430" => 'a',
        'Б' => 'B',   'б' => 'b',
        'В' => 'V',   'в' => 'v',
        'Г' => 'G',   'г' => 'g',
        'Д' => 'D',   'д' => 'd',
        'Е' => 'E',   'е' => 'e',
        'Ё' => 'Yo',  'ё' => 'yo',
        'Ж' => 'Zh',  'ж' => 'zh',
        'З' => 'Z',   'з' => 'z',
        'И' => 'I',   'и' => 'i',
        'Й' => 'J',   'й' => 'j',
        'К' => 'K',   'к' => 'k',
        'Л' => 'L',   'л' => 'l',
        'М' => 'M',   'м' => 'm',
        'Н' => 'N',   'н' => 'n',
        'О' => 'O',   'о' => 'o',
        'П' => 'P',   'п' => 'p',
        'Р' => 'R',   'р' => 'r',
        'С' => 'S',   'с' => 's',
        'Т' => 'T',   'т' => 't',
        'У' => 'U',   'у' => 'u',
        'Ф' => 'F',   'ф' => 'f',
        'Х' => 'X',   'х' => 'x',
        'Ц' => 'Cz',  'ц' => 'cz',
        'Ч' => 'Ch',  'ч' => 'ch',
        'Ш' => 'Sh',  'ш' => 'sh',
        'Щ' => 'Shh', 'щ' => 'shh',
        'Ъ' => '',    'ъ' => '',
        'Ы' => 'Y',   'ы' => 'y',
        'Ь' => '',    'ь' => '',
        'Э' => 'E',   'э' => 'e',
        'Ю' => 'Yu',  'ю' => 'yu',
        'Я' => 'Ya',  'я' => 'ya'
      }.freeze

      # Precompiled once so each call is a single scan per table.
      LATIN_PATTERN    = Regexp.union(LATIN_MAP.keys)
      CYRILLIC_PATTERN = Regexp.union(CYRILLIC_MAP.keys)

      private_constant :LATIN_MAP, :CYRILLIC_MAP, :LATIN_PATTERN, :CYRILLIC_PATTERN

      module_function

      # Replaces accented Latin characters (and, on request, Russian Cyrillic
      # letters) with ASCII equivalents. All other characters are left as is.
      #
      # The argument is never modified: work happens on a copy that is
      # re-tagged as UTF-8, so frozen strings are accepted.
      #
      # @param string [String, nil] text to transliterate
      # @param transliterate_cyrillic [Boolean] also transliterate Cyrillic
      # @return [String] a new string; '' when +string+ is nil or empty
      def normalize(string, transliterate_cyrillic: false)
        return '' if string.nil? || string.empty?

        text = string.dup.force_encoding('UTF-8').gsub(LATIN_PATTERN, LATIN_MAP)
        text = text.gsub(CYRILLIC_PATTERN, CYRILLIC_MAP) if transliterate_cyrillic
        text
      end

      # Older variant of {sanitize}: lowercases *before* normalizing and never
      # transliterates Cyrillic.
      #
      # @param string [String, nil] text to sanitize
      # @return [String] token made of [a-z0-9_]; '' for nil or empty input
      def legacy_sanitize(string)
        return '' if string.nil? || string.empty?

        without_tags = string.downcase.gsub(/<[^>]+>/m, '')
        underscore_token(normalize(without_tags, transliterate_cyrillic: false))
      end

      # Strips <...> tags, transliterates to ASCII, lowercases, and joins the
      # remaining [a-z0-9_] runs with single underscores.
      #
      # @param string [String, nil] text to sanitize
      # @param transliterate_cyrillic [Boolean] forwarded to {normalize}
      # @return [String] token made of [a-z0-9_]; '' for nil or empty input
      def sanitize(string, transliterate_cyrillic: false)
        return '' if string.nil? || string.empty?

        without_tags = string.gsub(/<[^>]+>/m, '')
        ascii = normalize(without_tags, transliterate_cyrillic: transliterate_cyrillic)
        underscore_token(ascii.downcase)
      end

      # Shared tail of both sanitizers: anything from "&" up to the next ";"
      # becomes a separator, every run of characters outside [a-z0-9_] is
      # collapsed to one separator, separators at either end are trimmed, and
      # the remaining separators become underscores. Existing underscores are
      # kept verbatim (they are neither collapsed nor trimmed).
      def underscore_token(text)
        text.gsub(/&.+?;/, '-')
            .gsub(/[^a-z0-9_]+/, ' ')
            .strip
            .tr(' ', '_')
      end
      private_class_method :underscore_token
    end # StringSanitizer
  end # Importer2
end # CartoDB