gdbots/pbj-php

View on GitHub
src/Marshaler/Elastica/MappingBuilder.php

Summary

Maintainability
C
1 day
Test Coverage
<?php
declare(strict_types=1);

namespace Gdbots\Pbj\Marshaler\Elastica;

use Elastica\Mapping;
use Gdbots\Pbj\Enum\Format;
use Gdbots\Pbj\Field;
use Gdbots\Pbj\Message;
use Gdbots\Pbj\MessageResolver;
use Gdbots\Pbj\Schema;
use Gdbots\Pbj\Util\ClassUtil;
use Gdbots\Pbj\Util\SlugUtil;
use Gdbots\Pbj\Util\StringUtil;

/**
 * Builds an Elastica Mapping from one or more pbj schemas.
 *
 * Schemas are added with addSchema() and their field mappings are merged into
 * a single set of properties. Map fields do not get a property of their own;
 * a dynamic template matching "<path>.*" is registered for them instead.
 * build() assembles the final Mapping.
 */
class MappingBuilder
{
    /**
     * Generally we use "__" to indicate a derived field but kibana won't recognize it.
     * So for now, we'll use "d__" to indicate a derived field for ES.
     *
     * ALL_FIELD is the "copy_to" target of the text mappings in TYPES and is mapped
     * by build() as text with the configured analyzer. TYPE_FIELD is mapped by
     * build() as a keyword.
     *
     * @link  https://github.com/elastic/kibana/issues/2551
     * @link  https://github.com/elastic/kibana/issues/4762
     */
    public const ALL_FIELD = 'd__all';
    public const TYPE_FIELD = 'd__type';

    /**
     * Map of pbj type to elasticsearch data types.
     *
     * Used as the fallback mapping when no "build<TypeName>" method exists
     * for a field's type (see buildSchema).
     */
    public const TYPES = [
        'big-int'           => ['type' => 'long'],
        'binary'            => ['type' => 'binary'],
        'blob'              => ['type' => 'binary'],
        'boolean'           => ['type' => 'boolean'],
        'date'              => ['type' => 'date'],
        'date-time'         => ['type' => 'date'],
        'decimal'           => ['type' => 'double'],
        'dynamic-field'     => [
            'type'       => 'object',
            'properties' => [
                'name'       => ['type' => 'keyword', 'normalizer' => 'pbj_keyword'],
                'bool_val'   => ['type' => 'boolean'],
                'date_val'   => ['type' => 'date'],
                'float_val'  => ['type' => 'float'],
                'int_val'    => ['type' => 'long'],
                'string_val' => [
                    'type'    => 'text',
                    'copy_to' => self::ALL_FIELD,
                    'fields'  => ['raw' => ['type' => 'keyword', 'normalizer' => 'pbj_keyword']],
                ],
                'text_val'   => ['type' => 'text', 'copy_to' => self::ALL_FIELD],
            ],
        ],
        'float'             => ['type' => 'float'],
        'geo-point'         => ['type' => 'geo_point'],
        'identifier'        => ['type' => 'keyword'],
        'int'               => ['type' => 'long'],
        'int-enum'          => ['type' => 'integer'],
        'medium-blob'       => ['type' => 'binary'],
        'medium-int'        => ['type' => 'integer'],
        'medium-text'       => ['type' => 'text', 'copy_to' => self::ALL_FIELD],
        'message'           => ['type' => 'object'],
        'message-ref'       => [
            'type'       => 'object',
            'properties' => [
                'curie' => ['type' => 'keyword'],
                'id'    => ['type' => 'keyword'],
                'tag'   => ['type' => 'keyword'],
            ],
        ],
        'microtime'         => ['type' => 'long'],
        'node-ref'          => ['type' => 'keyword'],
        'signed-big-int'    => ['type' => 'long'],
        'signed-int'        => ['type' => 'integer'],
        'signed-medium-int' => ['type' => 'integer'],
        'signed-small-int'  => ['type' => 'short'],
        'signed-tiny-int'   => ['type' => 'byte'],
        'small-int'         => ['type' => 'integer'],
        'string'            => ['type' => 'text', 'copy_to' => self::ALL_FIELD],
        'string-enum'       => ['type' => 'keyword'],
        'text'              => ['type' => 'text', 'copy_to' => self::ALL_FIELD],
        'time-uuid'         => ['type' => 'keyword'],
        'timestamp'         => ['type' => 'date'],
        'tiny-int'          => ['type' => 'short'],
        'trinary'           => ['type' => 'byte'],
        'uuid'              => ['type' => 'keyword'],
    ];

    /**
     * buildSchema() returns no properties once the current path holds more
     * than this many field names, which bounds recursion into nested messages.
     */
    public const MAX_PATH_DEPTH = 4;

    /**
     * During the creation of a mapping any string types that are indexed will
     * use the "english" analyzer unless something else is specified.
     * @link https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
     */
    protected string $analyzer = 'english';

    /**
     * Field mappings accumulated by addSchema(), keyed by field name.
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html
     */
    protected array $properties = [];

    /**
     * Dynamic templates keyed by template name; each value is a single-entry
     * array of [name => template] (see addDynamicTemplate).
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/dynamic-templates.html
     */
    protected array $dynamicTemplates = [];

    /**
     * When mappings are created with nested messages the path is tracked
     * so the dynamic templates are correctly associated with the path.
     * Holds the field names from the outermost field to the current one.
     */
    protected array $path = [];

    /**
     * Returns the custom analyzers that an index will need when indexing some
     * pbj fields/types when certain options are used (urls, hashtag format, etc.)
     *
     * @link http://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html
     *
     * @return array analyzer definitions keyed by analyzer name
     */
    public static function getCustomAnalyzers(): array
    {
        return [
            'pbj_keyword' => [
                'tokenizer' => 'keyword',
                'filter'    => 'lowercase',
            ],
        ];
    }

    /**
     * Returns the custom normalizers that an index will need when indexing some
     * pbj fields/types when certain options are used (urls, hashtag format, etc.)
     *
     * The "pbj_keyword" normalizer is referenced by the keyword mappings
     * produced in TYPES and withFormat().
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-normalizers.html
     *
     * @return array normalizer definitions keyed by normalizer name
     */
    public static function getCustomNormalizers(): array
    {
        return [
            'pbj_keyword' => [
                'type'        => 'custom',
                'char_filter' => [],
                'filter'      => ['lowercase', 'asciifolding'],
            ],
        ];
    }

    /**
     * Creates the Mapping from everything added so far.
     *
     * The derived TYPE_FIELD and ALL_FIELD properties are always set here and
     * replace any schema field that happens to use the same name. The
     * "dynamic_templates" param is only set when at least one template exists.
     *
     * @return Mapping
     */
    public function build(): Mapping
    {
        $properties = $this->properties;
        $properties[self::TYPE_FIELD] = ['type' => 'keyword'];
        $properties[self::ALL_FIELD] = ['type' => 'text', 'analyzer' => $this->analyzer];

        $mapping = new Mapping($properties);

        $dynamicTemplates = $this->getDynamicTemplates();
        if (!empty($dynamicTemplates)) {
            $mapping->setParam('dynamic_templates', $dynamicTemplates);
        }

        return $mapping;
    }

    /**
     * Sets the analyzer applied to text mappings that don't declare one.
     *
     * Only affects mappings generated after this call; properties already
     * produced by addSchema() keep the analyzer they were built with.
     *
     * @param string $analyzer
     *
     * @return self
     */
    public function setAnalyzer(string $analyzer): self
    {
        $this->analyzer = $analyzer;
        return $this;
    }

    /**
     * Registers a dynamic template, replacing any template with the same name.
     *
     * @param string $name
     * @param array  $template
     *
     * @return self
     */
    public function addDynamicTemplate(string $name, array $template): self
    {
        // stored as [name => template] so getDynamicTemplates() can return the
        // list of single-entry arrays without any further reshaping.
        $this->dynamicTemplates[$name] = [$name => $template];
        return $this;
    }

    /**
     * Returns the registered templates as a list of [name => template] arrays,
     * in the order the names were first registered.
     *
     * @return array
     */
    public function getDynamicTemplates(): array
    {
        return array_values($this->dynamicTemplates);
    }

    /**
     * Builds the mapping for a schema and merges it (recursively) into the
     * properties collected so far; later schemas win on conflicting keys.
     *
     * @param Schema $schema
     *
     * @return self
     */
    public function addSchema(Schema $schema): self
    {
        $this->properties = array_replace_recursive($this->properties, $this->buildSchema($schema));
        return $this;
    }

    /**
     * Creates the properties for every field of the schema.
     *
     * For each field the mapping comes from a "build<TypeName>" method when one
     * is callable on this instance, otherwise from TYPES (run through
     * withAnalyzer). Map fields are registered as dynamic templates matching
     * "<path>.*" rather than being returned as properties.
     *
     * @param Schema $schema
     *
     * @return array properties keyed by field name; empty when the current
     *               path is deeper than MAX_PATH_DEPTH
     */
    protected function buildSchema(Schema $schema): array
    {
        $properties = [];

        if ($this->getPathDepth() > static::MAX_PATH_DEPTH) {
            return $properties;
        }

        foreach ($schema->getFields() as $field) {
            $fieldName = $field->getName();
            $this->enterField($fieldName);

            // try/finally guarantees the path is popped for every field, including
            // the early "continue" cases and when a builder or an overridden hook
            // throws, so a reused builder never carries a stale path.
            try {
                $path = $this->getPath();

                if ($fieldName === Schema::PBJ_FIELD_NAME) {
                    $properties[$fieldName] = $this->filterProperties($schema, $field, $path, ['type' => 'keyword']);
                    continue;
                }

                if ($this->shouldIgnoreField($field, $path)) {
                    // ignored fields are still declared, just not indexed.
                    $properties[$fieldName] = ['type' => 'text', 'index' => false];
                    continue;
                }

                $typeValue = $field->getType()->getTypeValue();
                $method = 'build' . ucfirst(StringUtil::toCamelFromSlug($typeValue));

                // Resolve the mapping before registering anything for this field so
                // templates created while recursing into nested messages are added
                // ahead of the template for the enclosing field.
                $mapping = is_callable([$this, $method])
                    ? $this->$method($field)
                    : $this->withAnalyzer(self::TYPES[$typeValue], $field);
                $mapping = $this->filterProperties($schema, $field, $path, $mapping);

                if ($field->isAMap()) {
                    $templateName = str_replace('-', '_', SlugUtil::create($path . '-template'));
                    $this->addDynamicTemplate($templateName, [
                        'path_match' => $path . '.*',
                        'mapping'    => $mapping,
                    ]);
                } else {
                    $properties[$fieldName] = $mapping;
                }
            } finally {
                $this->leaveField();
            }
        }

        return $properties;
    }

    /**
     * Creates the mapping for a message field by merging the properties of
     * every message returned by getSupportedMessages(). List fields are mapped
     * as "nested", everything else as "object".
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
     *
     * @param Field $field
     *
     * @return array
     */
    protected function buildMessage(Field $field): array
    {
        $properties = [Schema::PBJ_FIELD_NAME => ['type' => 'keyword']];

        foreach ($this->getSupportedMessages($field) as $message) {
            $properties = array_replace_recursive($properties, $this->buildSchema($message::schema()));
        }

        return [
            'type'       => $field->isAList() ? 'nested' : 'object',
            'properties' => $properties,
        ];
    }

    /**
     * Creates the mapping for a dynamic field: the TYPES definition with the
     * analyzer applied to its "string_val" and "text_val" properties, switched
     * to "nested" when the field is a list.
     *
     * @link https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html
     *
     * @param Field $field
     *
     * @return array
     */
    protected function buildDynamicField(Field $field): array
    {
        $properties = self::TYPES[$field->getType()->getTypeValue()];

        if ($field->isAList()) {
            $properties['type'] = 'nested';
        }

        foreach (['string_val', 'text_val'] as $key) {
            $properties['properties'][$key] = $this->withAnalyzer($properties['properties'][$key], $field);
        }

        return $properties;
    }

    /**
     * Creates the mapping for a "string" field; see withFormat().
     *
     * @param Field $field
     *
     * @return array
     */
    protected function buildString(Field $field): array
    {
        return $this->withFormat($field);
    }

    /**
     * Creates the mapping for a "text" field; see withFormat().
     *
     * @param Field $field
     *
     * @return array
     */
    protected function buildText(Field $field): array
    {
        return $this->withFormat($field);
    }

    /**
     * Chooses the mapping for a string/text field based on its format:
     * date formats map to "date", ip formats to "ip", identifier-like formats
     * (and any field with a pattern) to a normalized keyword, everything else
     * to analyzed text.
     *
     * @param Field $field
     *
     * @return array
     */
    protected function withFormat(Field $field): array
    {
        $format = $field->hasFormat() ? $field->getFormat() : null;

        // NOTE(review): switch (loose comparison) is kept on purpose; how Format
        // values compare is defined outside this file, so a strict "match" could
        // change which case is selected.
        switch ($format) {
            case Format::DATE:
            case Format::DATE_TIME:
                return self::TYPES['date-time'];

            /**
             * String fields with these formats should use "pbj_keyword" (or something similar)
             * so searches on these fields are not case sensitive.
             *
             * @link http://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-custom-analyzer.html
             * @link http://stackoverflow.com/questions/15079064/how-to-setup-a-tokenizer-in-elasticsearch
             */
            case Format::EMAIL:
            case Format::HASHTAG:
            case Format::HOSTNAME:
            case Format::SLUG:
            case Format::URI:
            case Format::URL:
            case Format::UUID:
                return ['type' => 'keyword', 'normalizer' => 'pbj_keyword'];

            case Format::IPV4:
            case Format::IPV6:
                return ['type' => 'ip'];

            default:
                if ($field->getPattern()) {
                    return ['type' => 'keyword', 'normalizer' => 'pbj_keyword'];
                }

                return $this->withAnalyzer(self::TYPES['text'], $field);
        }
    }

    /**
     * Modify the analyzer for a property prior to adding it to the document mapping.
     * This is only applied to "text" types that are indexed and that don't
     * already declare an analyzer.
     *
     * @param array $properties
     * @param Field $field      not used here; available to overriding classes
     *
     * @return array
     */
    protected function withAnalyzer(array $properties, Field $field): array
    {
        if (!isset($properties['type']) || 'text' !== $properties['type']) {
            return $properties;
        }

        if (isset($properties['index']) && false === $properties['index']) {
            return $properties;
        }

        if (isset($properties['analyzer'])) {
            return $properties;
        }

        $properties['analyzer'] = $this->analyzer;
        return $properties;
    }

    /**
     * Override to customize the properties for a given field.
     * The default implementation returns the properties unchanged.
     *
     * @param Schema $schema
     * @param Field  $field
     * @param string $path       dot separated path of the field being mapped
     * @param array  $properties
     *
     * @return array
     */
    protected function filterProperties(Schema $schema, Field $field, string $path, array $properties): array
    {
        return $properties;
    }

    /**
     * Override to exclude fields from indexing. Ignored fields are mapped by
     * buildSchema() as text with "index" disabled. Nothing is ignored by default.
     *
     * @param Field  $field
     * @param string $path  dot separated path of the field being mapped
     *
     * @return bool
     */
    protected function shouldIgnoreField(Field $field, string $path): bool
    {
        return false;
    }

    /**
     * Appends a field name to the current path.
     *
     * @param string $fieldName
     */
    protected function enterField(string $fieldName): void
    {
        $this->path[] = $fieldName;
    }

    /**
     * Removes the most recently entered field name from the current path.
     */
    protected function leaveField(): void
    {
        array_pop($this->path);
    }

    /**
     * Returns the current path as a dot separated string, e.g. "parent.child".
     *
     * @return string
     */
    protected function getPath(): string
    {
        return implode('.', $this->path);
    }

    /**
     * Returns how many field names are in the current path.
     *
     * @return int
     */
    protected function getPathDepth(): int
    {
        return count($this->path);
    }

    /**
     * Returns the messages that can be stored in the field: everything from
     * MessageResolver::all() when the field has no "any of" curies, otherwise
     * only the messages whose curie, curie major or mixins intersect with them.
     *
     * @param Field $field
     *
     * @return Message[]|string[] NOTE(review): elements are only ever used
     *                            statically ($message::schema()), so these are
     *                            presumably class names - confirm against
     *                            MessageResolver::all().
     */
    protected function getSupportedMessages(Field $field): array
    {
        if (!$field->hasAnyOfCuries()) {
            return MessageResolver::all();
        }

        $supported = [];
        $anyOfCuries = $field->getAnyOfCuries();

        /** @var Message|string $message */
        foreach (MessageResolver::all() as $message) {
            $constants = ClassUtil::getConstants($message);

            if (isset($constants['SCHEMA_CURIE'])) {
                // read from class constants when available so the schema
                // doesn't need to be requested just to compare curies.
                $curies = [$constants['SCHEMA_CURIE']];
                if (isset($constants['SCHEMA_CURIE_MAJOR'])) {
                    $curies[] = $constants['SCHEMA_CURIE_MAJOR'];
                }
                $curies = array_merge($curies, $constants['MIXINS'] ?? []);
            } else {
                $schema = $message::schema();
                $curies = [
                    $schema->getCurie()->toString(),
                    $schema->getCurieMajor(),
                ];
                $curies = array_merge($curies, $schema->getMixins());
            }

            if (array_intersect($anyOfCuries, $curies)) {
                $supported[] = $message;
            }
        }

        return $supported;
    }
}