app/templates/documents.rb
# frozen_string_literal: true
class Documents
include Templatable
LIGHT_STEMMERS = {
de: 'german',
es: 'spanish',
fr: 'french',
it: 'italian',
pt: 'portuguese'
}.freeze
STANDARD_STEMMERS = {
bn: 'bengali',
en: 'english',
fi: 'finnish',
hi: 'hindi',
hu: 'hungarian',
ru: 'russian',
sv: 'swedish'
}.freeze
def initialize
@synonym_filter_locales = Set.new
@protected_filter_locales = Set.new
end
def body
Jbuilder.encode do |json|
json.index_patterns("*-#{I14y::APP_NAME}-documents-*")
json.settings do
json.analysis do
char_filter(json)
filter(json)
analyzer(json)
tokenizer(json)
end
end
json.mappings do
dynamic_templates(json)
properties(json)
end
end
end
def char_filter(json)
json.char_filter do
json.quotes do
json.type('mapping')
json.mappings(['\\u0091=>\\u0027', '\\u0092=>\\u0027', '\\u2018=>\\u0027', '\\u2019=>\\u0027', '\\u201B=>\\u0027'])
end
end
end
def filter(json)
json.filter do
json.bigrams_filter do
json.type('shingle')
end
language_synonyms(json)
language_protwords(json)
language_stemmers(json)
end
end
def analyzer(json)
json.analyzer do
generic_analyzers(json)
french_analyzer(json)
japanese_analyzer(json)
korean_analyzer(json)
chinese_analyzer(json)
bigrams_analyzer(json)
url_path_analyzer(json)
domain_name_analyzer(json)
default_analyzer(json)
end
end
def default_analyzer(json)
json.default do
json.type('custom')
json.filter(%w[icu_normalizer icu_folding])
json.tokenizer('icu_tokenizer')
json.char_filter(%w[html_strip quotes])
end
end
def domain_name_analyzer(json)
json.domain_name_analyzer do
json.type('custom')
json.filter('lowercase')
json.tokenizer('domain_name_tokenizer')
end
end
def url_path_analyzer(json)
json.url_path_analyzer do
json.type('custom')
json.filter('lowercase')
json.tokenizer('url_path_tokenizer')
end
end
def bigrams_analyzer(json)
json.bigrams_analyzer do
json.type('custom')
json.filter(%w[icu_normalizer icu_folding bigrams_filter])
json.tokenizer('icu_tokenizer')
json.char_filter(%w[html_strip quotes])
end
end
def generic_analyzers(json)
GENERIC_ANALYZER_LOCALES.each do |locale|
generic_analyzer(json, locale)
end
end
def chinese_analyzer(json)
json.zh_analyzer do
json.type('custom')
json.filter(%w[smartcn_word icu_normalizer icu_folding])
json.tokenizer('smartcn_sentence')
json.char_filter(['html_strip'])
end
end
def korean_analyzer(json)
json.ko_analyzer do
json.type('cjk')
json.filter([])
end
end
def japanese_analyzer(json)
json.ja_analyzer do
json.type('custom')
json.filter(%w[kuromoji_baseform ja_pos_filter icu_normalizer icu_folding cjk_width])
json.tokenizer('kuromoji_tokenizer')
json.char_filter(['html_strip'])
end
end
def french_analyzer(json)
json.fr_analyzer do
json.type('custom')
json.filter(%w[icu_normalizer elision fr_stem_filter icu_folding])
json.tokenizer('icu_tokenizer')
json.char_filter(%w[html_strip quotes])
end
end
def tokenizer(json)
json.tokenizer do
json.kuromoji do
json.type('kuromoji_tokenizer')
json.mode('search')
json.char_filter(['html_strip'])
end
json.url_path_tokenizer do
json.type('PathHierarchy')
end
json.domain_name_tokenizer do
json.type('PathHierarchy')
json.delimiter('.')
json.reverse(true)
end
end
end
def filter_array(locale)
array = ['icu_normalizer']
array << "#{locale}_protected_filter" if @protected_filter_locales.include?(locale)
array << "#{locale}_stem_filter"
array << "#{locale}_synonym" if @synonym_filter_locales.include?(locale)
array << 'icu_folding'
array
end
def properties(json)
json.properties do
%w[updated created changed].each { |field| date(json, field) }
%w[audience content_type document_id extension thumbnail_url language mime_type path
searchgov_custom1 searchgov_custom2 searchgov_custom3 tags].each { |field| keyword(json, field) }
basename(json)
url_path(json)
domain_name(json)
promote(json)
bigrams(json)
click_count(json)
end
end
def basename(json)
json.basename do
json.type('text')
end
end
def bigrams(json)
json.bigrams do
json.analyzer('bigrams_analyzer')
json.type('text')
end
end
def promote(json)
json.promote do
json.type('boolean')
end
end
def domain_name(json)
json.domain_name do
json.type('text')
json.analyzer('domain_name_analyzer')
end
end
def url_path(json)
json.url_path do
json.type('text')
json.analyzer('url_path_analyzer')
end
end
def click_count(json)
json.click_count do
json.type('integer')
end
end
def dynamic_templates(json)
json.dynamic_templates do
language_templates(json)
string_fields_template(json, 'text')
end
end
def language_stemmers(json)
light_stemmers(json)
standard_stemmers(json)
japanese_position_filter(json)
end
def japanese_position_filter(json)
json.ja_pos_filter do
json.type('kuromoji_part_of_speech')
json.stoptags(['\\u52a9\\u8a5e-\\u683c\\u52a9\\u8a5e-\\u4e00\\u822c', '\\u52a9\\u8a5e-\\u7d42\\u52a9\\u8a5e'])
end
end
def light_stemmers(json)
LIGHT_STEMMERS.each do |locale, language|
generic_stemmer(json, locale, language, 'light')
end
end
def standard_stemmers(json)
STANDARD_STEMMERS.each do |locale, language|
generic_stemmer(json, locale, language, 'standard')
end
end
def language_templates(json)
LANGUAGE_ANALYZER_LOCALES.each do |locale|
json.child! do
json.set!(locale) do
json.match("*_#{locale}")
json.match_mapping_type('string')
json.mapping do
json.analyzer("#{locale}_analyzer")
json.type('text')
json.term_vector('with_positions_offsets')
json.copy_to('bigrams')
end
end
end
end
end
def language_synonyms(json)
parse_configuration_file(json, 'synonyms')
end
def language_protwords(json)
parse_configuration_file(json, 'protwords')
end
def synonyms_filter(json, locale, lines)
@synonym_filter_locales.add(locale)
linguistic_filter(json, locale, lines, 'synonym', 'synonyms', 'synonym')
end
def protwords_filter(json, locale, lines)
@protected_filter_locales.add(locale)
linguistic_filter(json, locale, lines, 'protected_filter', 'keywords', 'keyword_marker')
end
end