MatthewMi11er/mi11er-library

View on GitHub
src/mi11er-library/Text/Formatters/TitleCaseFormatter.php

Summary

Maintainability
A
1 hr
Test Coverage
<?php
/**
 * The Mi11er Library
 *
 * @package    Mi11er\Library
 * @copyright 2016 Matthew Miller
 * @license MIT
 *
 * The MIT License (MIT)
 * Copyright (c) 2016 Matthew Miller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

namespace Mi11er\Library\Text\Formatters;

/**
 * PHP port of to-title-case ({@link https://github.com/gouch/to-title-case }) javascript function.
 * See also:{@link http://camendesign.com/code/title-case}
 */
class TitleCaseFormatter
{
    /**
     * These are words that generally should not be capitalized in the title.
     */
    private $smallWords = '/^(a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|the|to|vs?\.?|via)$/iu';
    
    /**
     * Regex to remove basic HTML from the $title string
     */
    private $elements   = '/<(code|var)[^>]*>.*?<\/\1>|<[^>]+>|&\S+;/';
    
    /**
     * Holds the working title string
     */
    private $title;
    
    /**
     * Holds the html we stripped from the $title
     */
    private $foundElements;
    
    /**
     * Track where we are in the $title string
     */
    private $offset;

    /**
     * Convert to title-case.
     *
     * @param string $title The string title to convert.
     *
     * @return string
     */
    public function format($title)
    {
        $this->reset();

        $this->title = $title;

        $this->removeHtml();

        $this->title = preg_replace_callback('/[A-Za-z0-9\xC0-\xFF]+[^\s-]*/u', [$this,'callback'], $this->title);

        $this->restoreHtml();

        return $this->title;
    }

    /**
     * Private Helper Functions
     */

    /**
     * Clears the working vars
     */
    private function reset()
    {
        $this->foundElements = [];
        $this->offset        = 0;
    }
    
    /**
     * Remove HTML like tags and entities
     */
    private function removeHtml()
    {
        preg_match_all($this->elements, $this->title, $this->foundElements, PREG_OFFSET_CAPTURE);
        $this->title = preg_replace($this->elements, '', $this->title);
    }
    
    /**
     * Try to put the HTML tags and entities back where they belong
     */
    private function restoreHtml()
    {
        foreach ($this->foundElements[0] as $element) {
            $this->title = substr_replace($this->title, $element[0], $element[1], 0);
        }
    }

    /**
     * Find and return the current offset then advance the
     * offset for next itteration.
     */
    private function getNextOffset($match)
    {
        // Find where the match starts in our $title.
        $offset = mb_strpos($this->title, $match, $this->offset, 'UTF-8');

        // Move the pointer for the next match.
        $this->offset = $offset + mb_strlen($match, 'UTF-8');
        return $offset;
    }

    /**
     * Get the char from the $title string
     * at the specified index. Empty if $charIndex
     * less than zero.
     *
     * @param int $charIndex Which char to return.
     * @return string
     */
    private function getChar($charIndex)
    {
        if ($charIndex<0) {
            return '';
        }
        return mb_substr($this->title, $charIndex, 1, 'UTF-8');
    }
    
    /**
     * Handler for the preg_replace_callback.
     *
     * Determines if each word should be capitalized
     * or not and returns the word with the correct
     * Capitalization.
     *
     * @suppress PhanTypeMismatchArgumentInternal
     * @SuppressWarnings(PHPMD.UnusedPrivateMethod)
     * @param array $matches The matches from preg_replace_callback.
     * @return string
     */
    private function callback($matches)
    {
        $offset      = $this->getNextOffset($matches[0]);
        $matchEnd    = $offset + mb_strlen($matches[0], 'UTF-8');

        /**
         * Determine Lower Case Words
         */
        if ($offset > 0                                         && // We're not at the start.
            $matchEnd !== mb_strlen($this->title, 'UTF-8')      && // A single word title is not lower case.
            preg_match($this->smallWords, $matches[0])          && // Small words are lower case.
            $this->getChar($offset - 2) !== ':'                 && // Words after a colon are not lower case.
          ! preg_match('/[^\s-]/', $this->getChar($offset - 1)) &&
            (
               $this->getChar($matchEnd)   !== '-' ||
               $this->getChar($offset - 1) === '-'
            )
        ) {
            return mb_strtolower($matches[0], 'UTF-8');
        }

        /**
         * It's already capitalized.
         */
        if (preg_match('/[A-Z]|\../', mb_substr($matches[0], 1, null, 'UTF-8'))) {
            return $matches[0];
        }
        
        /**
         * Capitalize it
         */
        return mb_strtoupper(mb_substr($matches[0], 0, 1, 'UTF-8'), 'UTF-8') . mb_substr($matches[0], 1, null, 'UTF-8');
    }
}