
View on GitHub


1 day
Test Coverage


/**
 * Parser class.
 *
 * @package   YetiForcePDF\Html
 * @copyright YetiForce Sp. z o.o
 * @license   MIT
 * @author    Rafal Pospiech
 */

namespace YetiForcePDF\Html;

use YetiForcePDF\Base;
use YetiForcePDF\Layout\PageGroupBox;

/**
 * Class Parser.
 *
 * Holds an html string, strips comments from it and divides it into page groups.
 */
class Parser extends Base
    /**
     * @var \DOMDocument
     */
    protected $domDocument;

    /**
     * @var string loaded html
     */
    protected $html = '';

    /**
     * @var array html content divided into page groups
     */
    protected $htmlPageGroups = [];

    /**
     * @var array
     */
    protected $pageGroups = [];

    /**
     * Cleanup html: convert every line ending (CRLF or LF) to a single carriage return.
     *
     * @param string $html
     *
     * @return string
     */
    protected function cleanUpHtml(string $html)
    {
        // The patterns are applied in order: CRLF pairs are collapsed first so
        // each yields exactly one "\r", then any remaining bare LF is converted.
        return preg_replace(['/\r\n/', '/\n/'], "\r", $html);
    }

    /**
     * Load html string.
     *
     * @param string $html
     * @param string $fromEncoding encoding of the given html
     *
     * @return Parser
     */
    public function loadHtml(string $html, string $fromEncoding): self
    {
        // NOTE(review): an earlier revision called htmlspecialchars_decode() here,
        // but its result was immediately overwritten by the line below, so it
        // never had any effect. The dead call is removed; entities in the input
        // are intentionally left untouched, exactly as before.
        $this->html = $this->cleanUpHtml($html);

        // Conversion map is [start, end, offset, mask]:
        // 0x80     - first code point outside of ASCII
        // 0x10FFFF - last valid unicode code point
        // 0        - offset: do not shift the code point
        // ~0       - mask with all bits set: do not omit any bit of the code point
        // Every non-ASCII character therefore becomes a numeric entity (&#NNN;).
        $this->html = mb_encode_numericentity($this->html, [0x80, 0x10FFFF, 0, ~0], $fromEncoding);

        return $this;
    }

    /**
     * Get html.
     *
     * @return string
     */
    public function getHtml(): string
    {
        // Hands back the markup exactly as it is currently stored on the parser.
        return $this->html;
    }

    /**
     * Remove comment blocks.
     *
     * @param string $html
     *
     * @return string
     */
    public function removeComments(string $html)
    {
        // The `s` modifier lets `.` span newlines. The lazy `.*?` stops at the
        // first `-->` and, being allowed to match nothing, also removes the
        // empty comment `<!---->`, which the former `+` quantifier skipped.
        $stripped = preg_replace('/<!--.*?-->/s', '', $html);

        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit hit);
        // fall back to the untouched input so callers always receive a string.
        return $stripped ?? $html;
    }

    /**
     * Divide html into page groups.
     *
     * @param string $html
     *
     * @return array
     */
    public function getHtmlPageGroups(string $html)
    {
        // Locate the byte offset of every opening `<div data-page-group` tag.
        $found = [];
        preg_match_all('/\<div\s+data-page-group\s?/ui', $html, $found, PREG_OFFSET_CAPTURE);
        $offsets = array_column($found[0], 1);

        // Each group runs from its own opening tag up to the next one;
        // the last group takes everything that remains.
        $groups = [];
        foreach ($offsets as $index => $offset) {
            $groups[] = isset($offsets[$index + 1])
                ? substr($html, $offset, $offsets[$index + 1] - $offset)
                : substr($html, $offset);
        }

        // Without any page group marker the whole html is a single group.
        return empty($groups) ? [$html] : $groups;
    }

    /**
     * Set page group options.
     *
     * @param PageGroupBox $root
     * @param \DOMDocument $domDocument
     *
     * @return $this
     */
    public function setGroupOptions(PageGroupBox $root, \DOMDocument $domDocument)
    {
        // Options are read from the first child of the document element;
        // anything other than an element there leaves $root untouched.
        $groupElement = $domDocument->documentElement->firstChild;
        if (!$groupElement instanceof \DOMElement) {
            return $this;
        }

        // An attribute only overrides the current value when it is present.
        // A present but falsy value (empty, "0", or 0.0 after the float cast)
        // is replaced by the hard-coded fallback on the right of `?:`.
        if ($groupElement->hasAttribute('data-format')) {
            $root->format = $groupElement->getAttribute('data-format') ?: 'A4';
        }
        if ($groupElement->hasAttribute('data-orientation')) {
            $root->orientation = $groupElement->getAttribute('data-orientation') ?: 'P';
        }
        if ($groupElement->hasAttribute('data-margin-left')) {
            $root->marginLeft = ((float) $groupElement->getAttribute('data-margin-left')) ?: 30;
        }
        if ($groupElement->hasAttribute('data-margin-right')) {
            $root->marginRight = ((float) $groupElement->getAttribute('data-margin-right')) ?: 30;
        }
        if ($groupElement->hasAttribute('data-margin-top')) {
            $root->marginTop = ((float) $groupElement->getAttribute('data-margin-top')) ?: 40;
        }
        if ($groupElement->hasAttribute('data-margin-bottom')) {
            $root->marginBottom = ((float) $groupElement->getAttribute('data-margin-bottom')) ?: 40;
        }
        if ($groupElement->hasAttribute('data-header-top')) {
            $root->headerTop = ((float) $groupElement->getAttribute('data-header-top')) ?: 10;
        }
        if ($groupElement->hasAttribute('data-footer-bottom')) {
            $root->footerBottom = ((float) $groupElement->getAttribute('data-footer-bottom')) ?: 10;
        }

        return $this;
    }

    /**
     * Convert loaded html to pdf objects.
     */
    public function parse()
        if ('' === $this->html) {
            return null;
        $this->html = $this->removeComments($this->html);
        $this->htmlPageGroups = $this->getHtmlPageGroups($this->html);
        foreach ($this->htmlPageGroups as $groupIndex => $htmlPageGroup) {
            $domDocument = new \DOMDocument();
            $domDocument->encoding = 'UTF-8';
            $domDocument->strictErrorChecking = false;
            $domDocument->substituteEntities = false;
            $domDocument->recover = false;
            $domDocument->loadHTML('<div id="yetiforcepdf">' . $htmlPageGroup . '</div>', LIBXML_HTML_NOIMPLIED | LIBXML_NOWARNING | LIBXML_NOBLANKS | LIBXML_NOERROR);
            $pageGroup = (new PageGroupBox())
            $pageGroup->format = $this->document->getDefaultFormat();
            $margins = $this->document->getDefaultMargins();
            $pageGroup->marginLeft = $margins['left'];
            $pageGroup->marginTop = $margins['top'];
            $pageGroup->marginRight = $margins['right'];
            $pageGroup->marginBottom = $margins['bottom'];
            $pageGroup->orientation = $this->document->getDefaultOrientation();
            $this->setGroupOptions($pageGroup, $domDocument);
            $page = $this->document->addPage($pageGroup->format, $pageGroup->orientation);
            $page->setMargins($pageGroup->marginLeft, $pageGroup->marginTop, $pageGroup->marginRight, $pageGroup->marginBottom);
            $rootElement = (new \YetiForcePDF\Html\Element())
            // root element must be defined before initialisation


            foreach ($this->document->getPages($groupIndex) as $page) {
            foreach ($this->document->getPages($groupIndex) as $page) {
            foreach ($this->document->getPages($groupIndex) as $page) {
            foreach ($this->document->getPages($groupIndex) as $page) {
                $children = [];
                foreach ($children as $box) {
                    if (!$box instanceof \YetiForcePDF\Layout\LineBox && $box->isRenderable()) {