Includes: Developed to get new system working with inline includes

Adds logic for locating and splitting text nodes. Adds specific classes to offload tag/content specific logic.
2023-11-23 14:29:07 +00:00 · 2023-11-23 14:29:07 +00:00 · 75936454cc
commit 75936454cc
parent 04d21c8a97
5 changed files with 192 additions and 8 deletions
--- a/app/Entities/Tools/PageIncludeContent.php
+++ b/app/Entities/Tools/PageIncludeContent.php
@ -0,0 +1,68 @@
 <?php
 namespace BookStack\Entities\Tools;
 use BookStack\Util\HtmlDocument;
 use DOMNode;
 class PageIncludeContent
 {
    /**
     * Lower-case element names which, when directly targeted by a section ID,
     * are used as a whole (the element itself) rather than via their children.
     */
    protected static array $topLevelTags = ['table', 'ul', 'ol', 'pre'];
    /**
     * The nodes resolved for the include. Stays empty when the source HTML is
     * empty or the requested section could not be found.
     * @var DOMNode[]
     */
    protected array $contents = [];
    /**
     * Whether the resolved content must be treated as top-level (block) content.
     * Defaults to false so the property is always initialised: parseHtml() returns
     * early, without assigning it, for empty HTML and for missing sections, and
     * reading an uninitialised typed property would otherwise throw an Error.
     */
    protected bool $isTopLevel = false;
    /**
     * @param string         $html The HTML of the page being included.
     * @param PageIncludeTag $tag  The include tag, used for its optional section ID.
     */
    public function __construct(
        string $html,
        PageIncludeTag $tag,
    ) {
        $this->parseHtml($html, $tag);
    }
    /**
     * Resolve the nodes to include from the given HTML.
     * - No section ID on the tag: all body children are used, as top-level content.
     * - Section ID on the tag: the matching element is used as a whole if it is
     *   one of the $topLevelTags, otherwise its child nodes are used as inline content.
     * Leaves the content empty (and non-top-level) when there is nothing to include.
     */
    protected function parseHtml(string $html, PageIncludeTag $tag): void
    {
        if (empty($html)) {
            return;
        }
        $doc = new HtmlDocument($html);
        $sectionId = $tag->getSectionId();
        if (!$sectionId) {
            $this->contents = [...$doc->getBodyChildren()];
            $this->isTopLevel = true;
            return;
        }
        $section = $doc->getElementById($sectionId);
        if (!$section) {
            // Unknown section: nothing to include.
            return;
        }
        // Node names are compared case-insensitively against the known block-level tags.
        $isTopLevel = in_array(strtolower($section->nodeName), static::$topLevelTags, true);
        $this->isTopLevel = $isTopLevel;
        $this->contents = $isTopLevel ? [$section] : [...$section->childNodes];
    }
    /**
     * Check if the content can be placed inline, in place of the include tag.
     * Also true when no content was resolved, since nothing top-level was found.
     */
    public function isInline(): bool
    {
        return !$this->isTopLevel;
    }
    /**
     * Check if no nodes were resolved for this include.
     */
    public function isEmpty(): bool
    {
        return empty($this->contents);
    }
    /**
     * Get the resolved content nodes. These belong to the document they were
     * parsed in, not to the document hosting the include tag.
     * @return DOMNode[]
     */
    public function toDomNodes(): array
    {
        return $this->contents;
    }
 }
--- a/app/Entities/Tools/PageIncludeParser.php
+++ b/app/Entities/Tools/PageIncludeParser.php
@ -4,6 +4,8 @@ namespace BookStack\Entities\Tools;
 use BookStack\Util\HtmlDocument;
 use Closure;
 use DOMNode;
 use DOMText;
 class PageIncludeParser
 {
@ -17,14 +19,25 @@ class PageIncludeParser
    public function parse(): string
    {
-        $html = new HtmlDocument($this->pageHtml);
+        $doc = new HtmlDocument($this->pageHtml);
-        $includeHosts = $html->queryXPath("//body//*[contains(text(), '{{@')]");
+        $tags = $this->locateAndIsolateIncludeTags($doc);
        $node = $includeHosts->item(0);
-        // One of the direct child textnodes of the "$includeHosts" should be
+        foreach ($tags as $tag) {
-        // the one with the include tag within.
+            $htmlContent = $this->pageContentForId->call($this, $tag->getPageId());
-        $textNode = $node->childNodes->item(0);
+            $content = new PageIncludeContent($htmlContent, $tag);
            if ($content->isInline()) {
                $adopted = $doc->adoptNodes($content->toDomNodes());
                foreach ($adopted as $adoptedContentNode) {
                    $tag->domNode->parentNode->insertBefore($adoptedContentNode, $tag->domNode);
                }
                $tag->domNode->parentNode->removeChild($tag->domNode);
                continue;
            }
            // TODO - Non-inline
        }
        // TODO:
        // Hunt down the specific text nodes with matches
@ -52,6 +65,64 @@ class PageIncludeParser
        // in changes affecting the next tag, where tags may be in the same/adjacent nodes.
-        return $html->getBodyInnerHtml();
+        return $doc->getBodyInnerHtml();
    }
    /**
     * Locate include tags within the given document, isolating them to their
     * own nodes in the DOM for future targeted manipulation.
     * Every direct text child of a matching element is inspected, so tags are
     * found even when they follow other inline elements within the same parent.
     * @return PageIncludeTag[]
     */
    protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array
    {
        // The predicate is applied to each text() child individually. The simpler
        // "contains(text(), ...)" form converts the node-set to the string value of
        // its FIRST node only, missing tags held in any later text child.
        $includeHosts = $doc->queryXPath("//body//*[text()[contains(., '{{@')]]");
        $includeTags = [];
        /** @var DOMNode $node */
        foreach ($includeHosts as $node) {
            // Iterate a snapshot of the children: splitting inserts new sibling
            // text nodes into the live childNodes list while we are walking it.
            /** @var DOMNode $childNode */
            foreach (iterator_to_array($node->childNodes) as $childNode) {
                if ($childNode->nodeName === '#text') {
                    array_push($includeTags, ...$this->splitTextNodesAtTags($childNode));
                }
            }
        }
        return $includeTags;
    }
    /**
     * Break the given text node apart around each include tag found in its text.
     * Leading text and each tag are inserted as new text nodes ahead of the
     * original node, which is left holding whatever text follows the last tag.
     * A node without any include tag is left untouched.
     * @return PageIncludeTag[] One entry per tag found, each referencing its own new text node.
     */
    protected function splitTextNodesAtTags(DOMNode $textNode): array
    {
        $source = $textNode->textContent;
        $parent = $textNode->parentNode;
        $found = [];
        // Offsets captured here are byte offsets, matching substr()/strlen() below.
        preg_match_all(static::$includeTagRegex, $source, $matches, PREG_OFFSET_CAPTURE);
        $cursor = 0;
        foreach ($matches[0] as $matchIndex => [$outer, $start]) {
            $inner = $matches[1][$matchIndex][0];
            // Preserve any plain text sitting between the previous tag and this one.
            if ($start > $cursor) {
                $leading = new DOMText(substr($source, $cursor, $start - $cursor));
                $parent->insertBefore($leading, $textNode);
            }
            $tagNode = $parent->insertBefore(new DOMText($outer), $textNode);
            $found[] = new PageIncludeTag($inner, $tagNode);
            $cursor = $start + strlen($outer);
        }
        if ($cursor === 0) {
            // Nothing was split off, so the original text remains as-is.
            return $found;
        }
        $textNode->textContent = substr($source, $cursor);
        return $found;
    }
 }
--- a/app/Entities/Tools/PageIncludeTag.php
+++ b/app/Entities/Tools/PageIncludeTag.php
@ -0,0 +1,30 @@
 <?php
 namespace BookStack\Entities\Tools;
 use DOMNode;
 class PageIncludeTag
 {
    /**
     * @param string  $tagContent The inner content of the tag, in the form "pageId" or "pageId#sectionId".
     * @param DOMNode $domNode    The DOM node which holds this tag.
     */
    public function __construct(
        public string $tagContent,
        public DOMNode $domNode,
    ) {
    }
    /**
     * Read the referenced page ID: the part before the first "#",
     * trimmed and converted to an integer.
     */
    public function getPageId(): int
    {
        [$pageReference] = explode('#', $this->tagContent, 2);
        return (int) trim($pageReference);
    }
    /**
     * Read the referenced section ID: everything after the first "#", trimmed.
     * Gives an empty string when the tag has no section part.
     */
    public function getSectionId(): string
    {
        $parts = explode('#', $this->tagContent, 2);
        if (!isset($parts[1])) {
            return '';
        }
        return trim($parts[1]);
    }
 }
--- a/app/Util/HtmlDocument.php
+++ b/app/Util/HtmlDocument.php
@ -149,4 +149,19 @@ class HtmlDocument
    {
        return $this->document->saveHTML($node);
    }
    /**
     * Import deep copies of the given nodes into this document.
     * The copies are returned in the same order as the input, as a list;
     * they are not attached anywhere in the tree by this method.
     * @param DOMNode[] $nodes
     * @return DOMNode[]
     */
    public function adoptNodes(array $nodes): array
    {
        $imported = array_map(
            fn ($foreignNode) => $this->document->importNode($foreignNode, true),
            $nodes
        );
        // array_map keeps the input keys; the result is always a plain list.
        return array_values($imported);
    }
 }
--- a/tests/Unit/PageIncludeParserTest.php
+++ b/tests/Unit/PageIncludeParserTest.php
@ -37,7 +37,7 @@ class PageIncludeParserTest extends TestCase
    protected function runParserTest(string $html, array $contentById, string $expected)
    {
        $parser = new PageIncludeParser($html, function (int $id) use ($contentById) {
-            return $contentById[strval($id)] ?? null;
+            return $contentById[strval($id)] ?? '';
        });
        $result = $parser->parse();