app/models/smart_encoding.rb
class SmartEncoding
CHARACTERS = {
"\u00ab" => '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"\u00bb" => '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
"\u201c" => '"', # LEFT DOUBLE QUOTATION MARK
"\u201d" => '"', # RIGHT DOUBLE QUOTATION MARK
"\u02ba" => '"', # MODIFIER LETTER DOUBLE PRIME
"\u02ee" => '"', # MODIFIER LETTER DOUBLE APOSTROPHE
"\u201f" => '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
"\u275d" => '"', # HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT
"\u275e" => '"', # HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
"\u301d" => '"', # REVERSED DOUBLE PRIME QUOTATION MARK
"\u301e" => '"', # DOUBLE PRIME QUOTATION MARK
"\uff02" => '"', # FULLWIDTH QUOTATION MARK
"\u2018" => "'", # LEFT SINGLE QUOTATION MARK
"\u2019" => "'", # RIGHT SINGLE QUOTATION MARK
"\u02BB" => "'", # MODIFIER LETTER TURNED COMMA
"\u02c8" => "'", # MODIFIER LETTER VERTICAL LINE
"\u02bc" => "'", # MODIFIER LETTER APOSTROPHE
"\u02bd" => "'", # MODIFIER LETTER REVERSED COMMA
"\u02b9" => "'", # MODIFIER LETTER PRIME
"\u201b" => "'", # SINGLE HIGH-REVERSED-9 QUOTATION MARK
"\uff07" => "'", # FULLWIDTH APOSTROPHE
"\u00b4" => "'", # ACUTE ACCENT
"\u02ca" => "'", # MODIFIER LETTER ACUTE ACCENT
"\u0060" => "'", # GRAVE ACCENT
"\u02cb" => "'", # MODIFIER LETTER GRAVE ACCENT
"\u275b" => "'", # HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT
"\u275c" => "'", # HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT
"\u0313" => "'", # COMBINING COMMA ABOVE
"\u0314" => "'", # COMBINING REVERSED COMMA ABOVE
"\ufe10" => "'", # PRESENTATION FORM FOR VERTICAL COMMA
"\ufe11" => "'", # PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
"\u00F7" => "/", # DIVISION SIGN
"\u00bc" => "1/4", # VULGAR FRACTION ONE QUARTER
"\u00bd" => "1/2", # VULGAR FRACTION ONE HALF
"\u00be" => "3/4", # VULGAR FRACTION THREE QUARTERS
"\u29f8" => "/", # BIG SOLIDUS
"\u0337" => "/", # COMBINING SHORT SOLIDUS OVERLAY
"\u0338" => "/", # COMBINING LONG SOLIDUS OVERLAY
"\u2044" => "/", # FRACTION SLASH
"\u2215" => "/", # DIVISION SLASH
"\uff0f" => "/", # FULLWIDTH SOLIDUS
"\u29f9" => "\\", # BIG REVERSE SOLIDUS
"\u29f5" => "\\", # REVERSE SOLIDUS OPERATOR
"\u20e5" => "\\", # COMBINING REVERSE SOLIDUS OVERLAY
"\ufe68" => "\\", # SMALL REVERSE SOLIDUS
"\uff3c" => "\\", # FULLWIDTH REVERSE SOLIDUS
"\u0332" => "_", # COMBINING LOW LINE
"\uff3f" => "_", # FULLWIDTH LOW LINE
"\u20d2" => "|", # COMBINING LONG VERTICAL LINE OVERLAY
"\u20d3" => "|", # COMBINING SHORT VERTICAL LINE OVERLAY
"\u2223" => "|", # DIVIDES
"\uff5c" => "|", # FULLWIDTH VERTICAL LINE
"\u23b8" => "|", # LEFT VERTICAL BOX LINE
"\u23b9" => "|", # RIGHT VERTICAL BOX LINE
"\u23d0" => "|", # VERTICAL LINE EXTENSION
"\u239c" => "|", # LEFT PARENTHESIS EXTENSION
"\u239f" => "|", # RIGHT PARENTHESIS EXTENSION
"\u23bc" => "-", # HORIZONTAL SCAN LINE-7
"\u23bd" => "-", # HORIZONTAL SCAN LINE-9
"\u2015" => "-", # HORIZONTAL BAR
"\ufe63" => "-", # SMALL HYPHEN-MINUS
"\uff0d" => "-", # FULLWIDTH HYPHEN-MINUS
"\u2010" => "-", # HYPHEN
"\u2043" => "-", # HYPHEN BULLET
"\ufe6b" => "@", # SMALL COMMERCIAL AT
"\uff20" => "@", # FULLWIDTH COMMERCIAL AT
"\ufe69" => "$", # SMALL DOLLAR SIGN
"\uff04" => "$", # FULLWIDTH DOLLAR SIGN
"\u01c3" => "!", # LATIN LETTER RETROFLEX CLICK
"\ufe15" => "!", # PRESENTATION FORM FOR VERTICAL EXLAMATION MARK
"\ufe57" => "!", # SMALL EXCLAMATION MARK
"\uff01" => "!", # FULLWIDTH EXCLAMATION MARK
"\ufe5f" => "#", # SMALL NUMBER SIGN
"\uff03" => "#", # FULLWIDTH NUMBER SIGN
"\ufe6a" => "%", # SMALL PERCENT SIGN
"\uff05" => "%", # FULLWIDTH PERCENT SIGN
"\ufe60" => "&", # SMALL AMPERSAND
"\uff06" => "&", # FULLWIDTH AMPERSAND
"\u201a" => ",", # SINGLE LOW-9 QUOTATION MARK
"\u0326" => ",", # COMBINING COMMA BELOW
"\ufe50" => ",", # SMALL COMMA
"\ufe51" => ",", # SMALL IDEOGRAPHIC COMMA
"\uff0c" => ",", # FULLWIDTH COMMA
"\uff64" => ",", # HALFWIDTH IDEOGRAPHIC COMMA
"\u2768" => "(", # MEDIUM LEFT PARENTHESIS ORNAMENT
"\u276a" => "(", # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
"\ufe59" => "(", # SMALL LEFT PARENTHESIS
"\uff08" => "(", # FULLWIDTH LEFT PARENTHESIS
"\u27ee" => "(", # MATHEMATICAL LEFT FLATTENED PARENTHESIS
"\u2985" => "(", # LEFT WHITE PARENTHESIS
"\u2769" => ")", # MEDIUM RIGHT PARENTHESIS ORNAMENT
"\u276b" => ")", # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
"\ufe5a" => ")", # SMALL RIGHT PARENTHESIS
"\uff09" => ")", # FULLWIDTH RIGHT PARENTHESIS
"\u27ef" => ")", # MATHEMATICAL RIGHT FLATTENED PARENTHESIS
"\u2986" => ")", # RIGHT WHITE PARENTHESIS
"\u204e" => "*", # LOW ASTERISK
"\u2217" => "*", # ASTERISK OPERATOR
"\u229B" => "*", # CIRCLED ASTERISK OPERATOR
"\u2722" => "*", # FOUR TEARDROP-SPOKED ASTERISK
"\u2723" => "*", # FOUR BALLOON-SPOKED ASTERISK
"\u2724" => "*", # HEAVY FOUR BALLOON-SPOKED ASTERISK
"\u2725" => "*", # FOUR CLUB-SPOKED ASTERISK
"\u2731" => "*", # HEAVY ASTERISK
"\u2732" => "*", # OPEN CENTRE ASTERISK
"\u2733" => "*", # EIGHT SPOKED ASTERISK
"\u273a" => "*", # SIXTEEN POINTED ASTERISK
"\u273b" => "*", # TEARDROP-SPOKED ASTERISK
"\u273c" => "*", # OPEN CENTRE TEARDROP-SPOKED ASTERISK
"\u273d" => "*", # HEAVY TEARDROP-SPOKED ASTERISK
"\u2743" => "*", # HEAVY TEARDROP-SPOKED PINWHEEL ASTERISK
"\u2749" => "*", # BALLOON-SPOKED ASTERISK
"\u274a" => "*", # EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
"\u274b" => "*", # HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
"\u29c6" => "*", # SQUARED ASTERISK
"\ufe61" => "*", # SMALL ASTERISK
"\uff0a" => "*", # FULLWIDTH ASTERISK
"\u02d6" => "+", # MODIFIER LETTER PLUS SIGN
"\ufe62" => "+", # SMALL PLUS SIGN
"\uff0b" => "+", # FULLWIDTH PLUS SIGN
"\u3002" => ".", # IDEOGRAPHIC FULL STOP
"\ufe52" => ".", # SMALL FULL STOP
"\uff0e" => ".", # FULLWIDTH FULL STOP
"\uff61" => ".", # HALFWIDTH IDEOGRAPHIC FULL STOP
"\uff10" => "0", # FULLWIDTH DIGIT ZERO
"\uff11" => "1", # FULLWIDTH DIGIT ONE
"\uff12" => "2", # FULLWIDTH DIGIT TWO
"\uff13" => "3", # FULLWIDTH DIGIT THREE
"\uff14" => "4", # FULLWIDTH DIGIT FOUR
"\uff15" => "5", # FULLWIDTH DIGIT FIVE
"\uff16" => "6", # FULLWIDTH DIGIT SIX
"\uff17" => "7", # FULLWIDTH DIGIT SEVEN
"\uff18" => "8", # FULLWIDTH DIGIT EIGHT
"\uff19" => "9", # FULLWIDTH DIGIT NINE
"\u02d0" => ":", # MODIFIER LETTER TRIANGULAR COLON
"\u02f8" => ":", # MODIFIER LETTER RAISED COLON
"\u2982" => ":", # Z NOTATION TYPE COLON
"\ua789" => ":", # MODIFIER LETTER COLON
"\ufe13" => ":", # PRESENTATION FORM FOR VERTICAL COLON
"\uff1a" => ":", # FULLWIDTH COLON
"\u204f" => ";", # REVERSED SEMICOLON
"\ufe14" => ";", # PRESENTATION FORM FOR VERTICAL SEMICOLON
"\ufe54" => ";", # SMALL SEMICOLON
"\uff1b" => ";", # FULLWIDTH SEMICOLON
"\ufe64" => "<", # SMALL LESS-THAN SIGN
"\uff1c" => "<", # FULLWIDTH LESS-THAN SIGN
"\u0347" => "=", # COMBINING EQUALS SIGN BELOW
"\ua78a" => "=", # MODIFIER LETTER SHORT EQUALS SIGN
"\ufe66" => "=", # SMALL EQUALS SIGN
"\uff1d" => "=", # FULLWIDTH EQUALS SIGN
"\ufe65" => ">", # SMALL GREATER-THAN SIGN
"\uff1e" => ">", # FULLWIDTH GREATER-THAN SIGN
"\ufe16" => "?", # PRESENTATION FORM FOR VERTICAL QUESTION MARK
"\ufe56" => "?", # SMALL QUESTION MARK
"\uff1f" => "?", # FULLWIDTH QUESTION MARK
"\uff21" => "A", # FULLWIDTH LATIN CAPITAL LETTER A
"\u1d00" => "A", # LATIN LETTER SMALL CAPITAL A
"\uff22" => "B", # FULLWIDTH LATIN CAPITAL LETTER B
"\u0299" => "B", # LATIN LETTER SMALL CAPITAL B
"\uff23" => "C", # FULLWIDTH LATIN CAPITAL LETTER C
"\u1d04" => "C", # LATIN LETTER SMALL CAPITAL C
"\uff24" => "D", # FULLWIDTH LATIN CAPITAL LETTER D
"\u1d05" => "D", # LATIN LETTER SMALL CAPITAL D
"\uff25" => "E", # FULLWIDTH LATIN CAPITAL LETTER E
"\u1d07" => "E", # LATIN LETTER SMALL CAPITAL E
"\uff26" => "F", # FULLWIDTH LATIN CAPITAL LETTER F
"\ua730" => "F", # LATIN LETTER SMALL CAPITAL F
"\uff27" => "G", # FULLWIDTH LATIN CAPITAL LETTER G
"\u0262" => "G", # LATIN LETTER SMALL CAPITAL G
"\uff28" => "H", # FULLWIDTH LATIN CAPITAL LETTER H
"\u029c" => "H", # LATIN LETTER SMALL CAPITAL H
"\uff29" => "I", # FULLWIDTH LATIN CAPITAL LETTER I
"\u026a" => "I", # LATIN LETTER SMALL CAPITAL I
"\uff2a" => "J", # FULLWIDTH LATIN CAPITAL LETTER J
"\u1d0a" => "J", # LATIN LETTER SMALL CAPITAL J
"\uff2b" => "K", # FULLWIDTH LATIN CAPITAL LETTER K
"\u1d0b" => "K", # LATIN LETTER SMALL CAPITAL K
"\uff2c" => "L", # FULLWIDTH LATIN CAPITAL LETTER L
"\u029f" => "L", # LATIN LETTER SMALL CAPITAL L
"\uff2d" => "M", # FULLWIDTH LATIN CAPITAL LETTER M
"\u1d0d" => "M", # LATIN LETTER SMALL CAPITAL M
"\uff2e" => "N", # FULLWIDTH LATIN CAPITAL LETTER N
"\u0274" => "N", # LATIN LETTER SMALL CAPITAL N
"\uff2f" => "O", # FULLWIDTH LATIN CAPITAL LETTER O
"\u1d0f" => "O", # LATIN LETTER SMALL CAPITAL O
"\uff30" => "P", # FULLWIDTH LATIN CAPITAL LETTER P
"\u1d18" => "P", # LATIN LETTER SMALL CAPITAL P
"\uff31" => "Q", # FULLWIDTH LATIN CAPITAL LETTER Q
"\uff32" => "R", # FULLWIDTH LATIN CAPITAL LETTER R
"\u0280" => "R", # LATIN LETTER SMALL CAPITAL R
"\uff33" => "S", # FULLWIDTH LATIN CAPITAL LETTER S
"\ua731" => "S", # LATIN LETTER SMALL CAPITAL S
"\uff34" => "T", # FULLWIDTH LATIN CAPITAL LETTER T
"\u1d1b" => "T", # LATIN LETTER SMALL CAPITAL T
"\uff35" => "U", # FULLWIDTH LATIN CAPITAL LETTER U
"\u1d1c" => "U", # LATIN LETTER SMALL CAPITAL U
"\uff36" => "V", # FULLWIDTH LATIN CAPITAL LETTER V
"\u1d20" => "V", # LATIN LETTER SMALL CAPITAL V
"\uff37" => "W", # FULLWIDTH LATIN CAPITAL LETTER W
"\u1d21" => "W", # LATIN LETTER SMALL CAPITAL W
"\uff38" => "X", # FULLWIDTH LATIN CAPITAL LETTER X
"\uff39" => "Y", # FULLWIDTH LATIN CAPITAL LETTER Y
"\u028f" => "Y", # LATIN LETTER SMALL CAPITAL Y
"\uff3a" => "Z", # FULLWIDTH LATIN CAPITAL LETTER Z
"\u1d22" => "Z", # LATIN LETTER SMALL CAPITAL Z
"\u02c6" => "^", # MODIFIER LETTER CIRCUMFLEX ACCENT
"\u0302" => "^", # COMBINING CIRCUMFLEX ACCENT
"\uff3e" => "^", # FULLWIDTH CIRCUMFLEX ACCENT
"\u1dcd" => "^", # COMBINING DOUBLE CIRCUMFLEX ABOVE
"\u2774" => "{", # MEDIUM LEFT CURLY BRACKET ORNAMENT
"\ufe5b" => "{", # SMALL LEFT CURLY BRACKET
"\uff5b" => "{", # FULLWIDTH LEFT CURLY BRACKET
"\u2775" => "}", # MEDIUM RIGHT CURLY BRACKET ORNAMENT
"\ufe5c" => "}", # SMALL RIGHT CURLY BRACKET
"\uff5d" => "}", # FULLWIDTH RIGHT CURLY BRACKET
"\uff3b" => "[", # FULLWIDTH LEFT SQUARE BRACKET
"\uff3d" => "]", # FULLWIDTH RIGHT SQUARE BRACKET
"\u02dc" => "~", # SMALL TILDE
"\u02f7" => "~", # MODIFIER LETTER LOW TILDE
"\u0303" => "~", # COMBINING TILDE
"\u0330" => "~", # COMBINING TILDE BELOW
"\u0334" => "~", # COMBINING TILDE OVERLAY
"\u223c" => "~", # TILDE OPERATOR
"\uff5e" => "~", # FULLWIDTH TILDE
"\u00a0" => " ", # NO-BREAK SPACE
"\u2000" => " ", # EN QUAD
"\u2002" => " ", # EN SPACE
"\u2003" => " ", # EM SPACE
"\u2004" => " ", # THREE-PER-EM SPACE
"\u2005" => " ", # FOUR-PER-EM SPACE
"\u2006" => " ", # SIX-PER-EM SPACE
"\u2007" => " ", # FIGURE SPACE
"\u2008" => " ", # PUNCTUATION SPACE
"\u2009" => " ", # THIN SPACE
"\u200a" => " ", # HAIR SPACE
"\u202f" => " ", # NARROW NO-BREAK SPACE
"\u205f" => " ", # MEDIUM MATHEMATICAL SPACE
"\u3000" => " ", # IDEOGRAHPIC SPACE
"\u008d" => " ", # REVERSE LINE FEED (standard LF looks like \n, this looks like a space)
"\u009f" => " ", # <control>
"\u0080" => " ", # C1 CONTROL CODES
"\u0090" => " ", # DEVICE CONTROL STRING
"\u009b" => " ", # CONTROL SEQUENCE INTRODUCER
"\u0010" => "", # ESCAPE, DATA LINK (not visible)
"\u0009" => " ", # TAB (7 spaces based on print statement in Python interpreter)
"\u0000" => "", # NULL
"\u0003" => "", # END OF TEXT
"\u0004" => "", # END OF TRANSMISSION
"\u0017" => "", # END OF TRANSMISSION BLOCK
"\u0019" => "", # END OF MEDIUM
"\u0011" => "", # DEVICE CONTROL ONE
"\u0012" => "", # DEVICE CONTROL TWO
"\u0013" => "", # DEVICE CONTROL THREE
"\u0014" => "", # DEVICE CONTROL FOUR
"\u2060" => "", # WORD JOINER
"\u2017" => "'", # Horizontal ellipsis
"\u2014" => "-", # Single low-9 quotation mark
"\u2013" => "-", # Single high-reversed-9 quotation mark
"\u2039" => ">", # Single left-pointing angle quotation mark
"\u203A" => "<", # Single right-pointing angle quotation mark
"\u203C" => "!!", # Double exclamation mark
"\u201E" => '"', # Double low line
"\u2028" => " ", # Whitespace: Line Separator
"\u2029" => " ", # Whitespace: Paragraph Separator
"\u2026" => "...", # Whitespace: Narrow No-Break Space
"\u2001" => " ", # Whitespace: Medium Mathematical Space
"\u200b" => "", # ZERO WIDTH SPACE
"\u3001" => ",", # IDEOGRAPHIC COMMA
"\uFEFF" => "", # ZERO WIDTH NO-BREAK SPACE
"\u2022" => "-" # Bullet
}.freeze
Result = Struct.new(:value, :smart_encoded?, keyword_init: true) do
def to_s
value
end
end
def encode(value)
encoded_value = CHARACTERS.each_with_object(value.dup) do |(character, replacement), result|
result.gsub!(character, replacement)
end
Result.new(value: encoded_value, smart_encoded?: encoded_value != value)
end
end