YetiForceCompany/YetiForcePDF

View on GitHub
lib/Html/Parser.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

declare(strict_types=1);

/**
 * Parser class.
 *
 * @package   YetiForcePDF\Html
 *
 * @copyright YetiForce Sp. z o.o
 * @license   MIT
 * @author    Rafal Pospiech <r.pospiech@yetiforce.com>
 */

namespace YetiForcePDF\Html;

use YetiForcePDF\Base;
use YetiForcePDF\Layout\PageGroupBox;

/**
 * Class Parser.
 */
class Parser extends Base
{
    /**
     * @var \DOMDocument not assigned anywhere inside this class (parse() works on local documents)
     */
    protected $domDocument;

    /**
     * @var string html prepared by loadHtml(); parse() additionally strips comments from it
     */
    protected $html = '';

    /**
     * @var array page groups with html content divided
     */
    protected $htmlPageGroups = [];

    /**
     * @var array not written or read inside this class
     */
    protected $pageGroups = [];

    /**
     * Normalise line endings: every "\r\n" pair and every remaining "\n" becomes a single "\r".
     *
     * @param string $html
     *
     * @return string
     */
    protected function cleanUpHtml(string $html)
    {
        // Search strings are applied in order, so "\r\n" is collapsed before lone "\n" characters
        // are handled; the other way round a "\r\n" pair would end up as "\r\r".
        return str_replace(["\r\n", "\n"], "\r", $html);
    }

    /**
     * Load html string.
     *
     * Line endings are normalised and every character from U+0080 upwards is replaced with
     * a numeric character reference, so the stored html consists of ASCII characters only.
     *
     * @param string $html
     * @param string $fromEncoding encoding of $html, passed to mb_encode_numericentity()
     *
     * @return Parser
     */
    public function loadHtml(string $html, string $fromEncoding): self
    {
        // Entities such as &lt; or &amp; are left exactly as supplied. An earlier
        // htmlspecialchars_decode($html, ENT_HTML5) call at this point had its result overwritten
        // before it was ever read, so it had no effect and has been dropped.
        // NOTE(review): confirm that decoding is really unwanted - it would turn escaped text into markup.
        $this->html = $this->cleanUpHtml($html);

        // Conversion map [start, end, offset, mask]:
        // 0x80     - first code point that gets converted
        // 0x10FFFF - last code point that gets converted (end of the unicode range)
        // 0        - offset: do not shift the code point
        // ~0       - mask with all bits set: keep the whole code point
        $this->html = mb_encode_numericentity($this->html, [0x80, 0x10FFFF, 0, ~0], $fromEncoding);

        return $this;
    }

    /**
     * Get html.
     *
     * @return string
     */
    public function getHtml(): string
    {
        return $this->html;
    }

    /**
     * Remove comment blocks.
     *
     * Each match runs from "<!--" to the nearest following "-->" and must contain at least
     * one character; an opener immediately followed by "-->" is not matched and stays in place.
     *
     * @param string $html
     *
     * @return string html without the matched comments, or $html unchanged when the regex engine fails
     */
    public function removeComments(string $html)
    {
        // preg_replace() returns null on a PCRE error (for example malformed UTF-8 combined with
        // the "u" modifier). Fall back to the input so callers always receive a string.
        return preg_replace('/<!--((?!-->))[\w\W]+-->/uUi', '', $html) ?? $html;
    }

    /**
     * Divide html into page groups.
     *
     * A group starts at every "<div data-page-group" marker and runs up to the next marker
     * (or to the end of the string). Anything located before the first marker is not part of
     * any returned group. When there is no marker at all the whole html is the only group.
     *
     * @param string $html
     *
     * @return array list of html fragments, never empty
     */
    public function getHtmlPageGroups(string $html)
    {
        $matches = [];
        $found = preg_match_all('/\<div\s+data-page-group\s?/ui', $html, $matches, PREG_OFFSET_CAPTURE);
        if (!$found) {
            // Either no marker (0) or a PCRE failure (false): treat the input as a single group.
            return [$html];
        }
        // PREG_OFFSET_CAPTURE yields [matchedText, byteOffset] pairs; only the offsets are needed.
        // They are byte offsets, which is what substr() expects.
        $offsets = array_column($matches[0], 1);
        $pageGroups = [];
        foreach ($offsets as $index => $start) {
            if (isset($offsets[$index + 1])) {
                $pageGroups[] = substr($html, $start, $offsets[$index + 1] - $start);
            } else {
                // last group runs to the end of the html
                $pageGroups[] = substr($html, $start);
            }
        }
        return $pageGroups;
    }

    /**
     * Set page group options.
     *
     * Options are read from data-* attributes of the first child of the document element.
     * An attribute that is absent leaves the corresponding property of $root untouched.
     * An attribute that is present but evaluates to false (empty string, "0", or 0.0 after the
     * float cast) is replaced with the hard-coded fallback listed below.
     *
     * @param PageGroupBox $root
     * @param \DOMDocument $domDocument
     *
     * @return $this
     */
    public function setGroupOptions(PageGroupBox $root, \DOMDocument $domDocument)
    {
        $documentElement = $domDocument->documentElement;
        if (null === $documentElement) {
            // empty document - nothing to read the options from
            return $this;
        }
        $groupElement = $documentElement->firstChild;
        if (!$groupElement instanceof \DOMElement) {
            return $this;
        }
        // attribute name => target property, fallback for a falsy value, whether to cast to float
        $options = [
            'data-format' => ['property' => 'format', 'fallback' => 'A4', 'numeric' => false],
            'data-orientation' => ['property' => 'orientation', 'fallback' => 'P', 'numeric' => false],
            'data-margin-left' => ['property' => 'marginLeft', 'fallback' => 30, 'numeric' => true],
            'data-margin-right' => ['property' => 'marginRight', 'fallback' => 30, 'numeric' => true],
            'data-margin-top' => ['property' => 'marginTop', 'fallback' => 40, 'numeric' => true],
            'data-margin-bottom' => ['property' => 'marginBottom', 'fallback' => 40, 'numeric' => true],
            'data-header-top' => ['property' => 'headerTop', 'fallback' => 10, 'numeric' => true],
            'data-footer-bottom' => ['property' => 'footerBottom', 'fallback' => 10, 'numeric' => true],
        ];
        foreach ($options as $attribute => $option) {
            if (!$groupElement->hasAttribute($attribute)) {
                continue;
            }
            $value = $groupElement->getAttribute($attribute);
            if ($option['numeric']) {
                $value = (float) $value;
            }
            // Truthiness test on purpose: '', '0' and 0.0 all select the fallback.
            $root->{$option['property']} = $value ?: $option['fallback'];
        }
        return $this;
    }

    /**
     * Convert loaded html to pdf objects.
     *
     * Does nothing when no html is loaded. Otherwise comments are stripped, the html is divided
     * into page groups and every group is parsed, laid out, split into pages and finally written
     * to the content streams of its pages.
     *
     * @return null nothing meaningful is returned; the results are stored in the document
     */
    public function parse()
    {
        if ('' === $this->html) {
            return null;
        }
        $this->html = $this->removeComments($this->html);
        $this->htmlPageGroups = $this->getHtmlPageGroups($this->html);
        foreach ($this->htmlPageGroups as $groupIndex => $htmlPageGroup) {
            // Every group gets its own DOM, wrapped in a synthetic root div.
            $domDocument = new \DOMDocument();
            $domDocument->encoding = 'UTF-8';
            $domDocument->strictErrorChecking = false;
            $domDocument->substituteEntities = false;
            $domDocument->recover = false;
            // LIBXML_HTML_NOIMPLIED keeps libxml from adding implied html/body elements, so the
            // wrapper div becomes the document element; warnings and errors are suppressed.
            $domDocument->loadHTML('<div id="yetiforcepdf">' . $htmlPageGroup . '</div>', LIBXML_HTML_NOIMPLIED | LIBXML_NOWARNING | LIBXML_NOBLANKS | LIBXML_NOERROR);
            $pageGroup = (new PageGroupBox())
                ->setDocument($this->document)
                ->setRoot(true)
                ->init();
            // Start from the document defaults, then let the data-* attributes of the group override them.
            $pageGroup->format = $this->document->getDefaultFormat();
            $margins = $this->document->getDefaultMargins();
            $pageGroup->marginLeft = $margins['left'];
            $pageGroup->marginTop = $margins['top'];
            $pageGroup->marginRight = $margins['right'];
            $pageGroup->marginBottom = $margins['bottom'];
            $pageGroup->orientation = $this->document->getDefaultOrientation();
            $this->setGroupOptions($pageGroup, $domDocument);
            // First page of the group; page numbering starts at 1 for this page.
            $page = $this->document->addPage($pageGroup->format, $pageGroup->orientation);
            $page->setPageNumber(1);
            $page->setGroup($groupIndex);
            $page->setMargins($pageGroup->marginLeft, $pageGroup->marginTop, $pageGroup->marginRight, $pageGroup->marginBottom);
            $rootElement = (new \YetiForcePDF\Html\Element())
                ->setDocument($this->document)
                ->setDOMElement($domDocument->documentElement);
            // root element must be defined before initialisation
            $rootElement->init();
            $pageGroup->setElement($rootElement);
            $pageGroup->setStyle($rootElement->parseStyle());

            $pageGroup->buildTree();
            $pageGroup->fixTables();
            $pageGroup->getStyle()->fixDomTree();
            $pageGroup->layout();
            $page->setBox($pageGroup);

            // The page list of the group is fetched again for every pass
            // (presumably because a pass may add pages to the group - verify against Document/Page).
            foreach ($this->document->getPages($groupIndex) as $page) {
                $page->getBox()->breakPageAfter();
            }
            foreach ($this->document->getPages($groupIndex) as $page) {
                $page->getBox()->spanAllRows();
            }
            foreach ($this->document->getPages($groupIndex) as $page) {
                $page->breakOverflow();
            }
            // $page still holds the value left by the loop above, i.e. the last page iterated
            // (or the page added above if the loops did not iterate at all).
            $page->getBox()->getStyle()->fixDomTree();
            $this->document->fixPageNumbers();
            // Final pass: emit the drawing instructions of every renderable box except line boxes.
            foreach ($this->document->getPages($groupIndex) as $page) {
                $this->document->setCurrentPage($page);
                $children = [];
                $page->setUpAbsoluteBoxes();
                $page->getBox()->replacePageNumbers();
                // $children is filled by getAllChildren()
                $page->getBox()->getAllChildren($children);
                foreach ($children as $box) {
                    if (!$box instanceof \YetiForcePDF\Layout\LineBox && $box->isRenderable()) {
                        $page->getContentStream()->addRawContent($box->getInstructions());
                    }
                }
            }
        }
    }
}