From 71c93c88787fe3522d1f0f49d445eb24e3a2789e Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Mon, 27 Nov 2023 19:54:47 +0000 Subject: [PATCH] Includes: Switched page to new system - Added mulit-level depth parsing. - Updating usage of HTML doc in page content to be efficient. - Removed now redundant PageContentTest cases. - Made some include system fixes based upon testing. --- app/Entities/Tools/PageContent.php | 113 ++++-------------- app/Entities/Tools/PageIncludeContent.php | 2 +- app/Entities/Tools/PageIncludeParser.php | 24 ++-- app/Theming/CustomHtmlHeadContentProvider.php | 2 +- app/Util/HtmlContentFilter.php | 23 ++-- tests/Entity/PageContentTest.php | 34 +----- tests/Unit/PageIncludeParserTest.php | 10 +- 7 files changed, 60 insertions(+), 148 deletions(-) diff --git a/app/Entities/Tools/PageContent.php b/app/Entities/Tools/PageContent.php index 3e75bd5bb..7f4d695fe 100644 --- a/app/Entities/Tools/PageContent.php +++ b/app/Entities/Tools/PageContent.php @@ -275,21 +275,33 @@ class PageContent */ public function render(bool $blankIncludes = false): string { - $content = $this->page->html ?? ''; + $html = $this->page->html ?? ''; + + if (empty($html)) { + return $html; + } + + $doc = new HtmlDocument($html); + + $contentProvider = function (int $id) use ($blankIncludes) { + if ($blankIncludes) { + return ''; + } + return Page::visible()->find($id)->html ?? ''; + }; + + $parser = new PageIncludeParser($doc, $contentProvider); + $nodesAdded = 1; + + for ($includeDepth = 0; $includeDepth < 3 && $nodesAdded !== 0; $includeDepth++) { + $nodesAdded = $parser->parse(); + } if (!config('app.allow_content_scripts')) { - $content = HtmlContentFilter::removeScripts($content); + HtmlContentFilter::removeScriptsFromDocument($doc); } - if ($blankIncludes) { - $content = $this->blankPageIncludes($content); - } else { - for ($includeDepth = 0; $includeDepth < 3; $includeDepth++) { - $content = $this->parsePageIncludes($content); - } - } - - return $content; + return $doc->getBodyInnerHtml(); } /** @@ -337,83 +349,4 @@ class PageContent return $tree->toArray(); } - - /** - * Remove any page include tags within the given HTML. - */ - protected function blankPageIncludes(string $html): string - { - return preg_replace("/{{@\s?([0-9].*?)}}/", '', $html); - } - - /** - * Parse any include tags "{{@#section}}" to be part of the page. - */ - protected function parsePageIncludes(string $html): string - { - $matches = []; - preg_match_all("/{{@\s?([0-9].*?)}}/", $html, $matches); - - foreach ($matches[1] as $index => $includeId) { - $fullMatch = $matches[0][$index]; - $splitInclude = explode('#', $includeId, 2); - - // Get page id from reference - $pageId = intval($splitInclude[0]); - if (is_nan($pageId)) { - continue; - } - - // Find page to use, and default replacement to empty string for non-matches. - /** @var ?Page $matchedPage */ - $matchedPage = Page::visible()->find($pageId); - $replacement = ''; - - if ($matchedPage && count($splitInclude) === 1) { - // If we only have page id, just insert all page html and continue. - $replacement = $matchedPage->html; - } elseif ($matchedPage && count($splitInclude) > 1) { - // Otherwise, if our include tag defines a section, load that specific content - $innerContent = $this->fetchSectionOfPage($matchedPage, $splitInclude[1]); - $replacement = trim($innerContent); - } - - $themeReplacement = Theme::dispatch( - ThemeEvents::PAGE_INCLUDE_PARSE, - $includeId, - $replacement, - clone $this->page, - $matchedPage ? (clone $matchedPage) : null, - ); - - // Perform the content replacement - $html = str_replace($fullMatch, $themeReplacement ?? $replacement, $html); - } - - return $html; - } - - /** - * Fetch the content from a specific section of the given page. - */ - protected function fetchSectionOfPage(Page $page, string $sectionId): string - { - $topLevelTags = ['table', 'ul', 'ol', 'pre']; - $doc = new HtmlDocument($page->html); - - // Search included content for the id given and blank out if not exists. - $matchingElem = $doc->getElementById($sectionId); - if ($matchingElem === null) { - return ''; - } - - // Otherwise replace the content with the found content - // Checks if the top-level wrapper should be included by matching on tag types - $isTopLevel = in_array(strtolower($matchingElem->nodeName), $topLevelTags); - if ($isTopLevel) { - return $doc->getNodeOuterHtml($matchingElem); - } - - return $doc->getNodeInnerHtml($matchingElem); - } } diff --git a/app/Entities/Tools/PageIncludeContent.php b/app/Entities/Tools/PageIncludeContent.php index 97c470c68..2e173859d 100644 --- a/app/Entities/Tools/PageIncludeContent.php +++ b/app/Entities/Tools/PageIncludeContent.php @@ -14,7 +14,7 @@ class PageIncludeContent */ protected array $contents = []; - protected bool $isTopLevel; + protected bool $isTopLevel = false; public function __construct( string $html, diff --git a/app/Entities/Tools/PageIncludeParser.php b/app/Entities/Tools/PageIncludeParser.php index e55fc22c7..660c60372 100644 --- a/app/Entities/Tools/PageIncludeParser.php +++ b/app/Entities/Tools/PageIncludeParser.php @@ -20,19 +20,19 @@ class PageIncludeParser protected array $toCleanup = []; public function __construct( - protected string $pageHtml, + protected HtmlDocument $doc, protected Closure $pageContentForId, ) { } /** * Parse out the include tags. + * Returns the count of new content DOM nodes added to the document. */ - public function parse(): string + public function parse(): int { - $doc = new HtmlDocument($this->pageHtml); - - $tags = $this->locateAndIsolateIncludeTags($doc); + $nodesAdded = 0; + $tags = $this->locateAndIsolateIncludeTags(); foreach ($tags as $tag) { $htmlContent = $this->pageContentForId->call($this, $tag->getPageId()); @@ -48,12 +48,14 @@ class PageIncludeParser } } - $this->replaceNodeWithNodes($tag->domNode, $content->toDomNodes()); + $replacementNodes = $content->toDomNodes(); + $nodesAdded += count($replacementNodes); + $this->replaceNodeWithNodes($tag->domNode, $replacementNodes); } $this->cleanup(); - return $doc->getBodyInnerHtml(); + return $nodesAdded; } /** @@ -61,9 +63,9 @@ class PageIncludeParser * own nodes in the DOM for future targeted manipulation. * @return PageIncludeTag[] */ - protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array + protected function locateAndIsolateIncludeTags(): array { - $includeHosts = $doc->queryXPath("//body//*[text()[contains(., '{{@')]]"); + $includeHosts = $this->doc->queryXPath("//*[text()[contains(., '{{@')]]"); $includeTags = []; /** @var DOMNode $node */ @@ -125,7 +127,7 @@ class PageIncludeParser foreach ($replacements as $replacement) { if ($replacement->ownerDocument !== $targetDoc) { - $replacement = $targetDoc->adoptNode($replacement); + $replacement = $targetDoc->importNode($replacement, true); } $toReplace->parentNode->insertBefore($replacement, $toReplace); @@ -190,7 +192,7 @@ class PageIncludeParser return $parent; } - $parent = $parent->parentElement; + $parent = $parent->parentNode; } while ($parent !== null); return null; diff --git a/app/Theming/CustomHtmlHeadContentProvider.php b/app/Theming/CustomHtmlHeadContentProvider.php index 041e5d025..95d9ff5ad 100644 --- a/app/Theming/CustomHtmlHeadContentProvider.php +++ b/app/Theming/CustomHtmlHeadContentProvider.php @@ -50,7 +50,7 @@ class CustomHtmlHeadContentProvider $hash = md5($content); return $this->cache->remember('custom-head-export:' . $hash, 86400, function () use ($content) { - return HtmlContentFilter::removeScripts($content); + return HtmlContentFilter::removeScriptsFromHtmlString($content); }); } diff --git a/app/Util/HtmlContentFilter.php b/app/Util/HtmlContentFilter.php index 2dbb34086..758591729 100644 --- a/app/Util/HtmlContentFilter.php +++ b/app/Util/HtmlContentFilter.php @@ -9,16 +9,10 @@ use DOMNodeList; class HtmlContentFilter { /** - * Remove all the script elements from the given HTML. + * Remove all the script elements from the given HTML document. */ - public static function removeScripts(string $html): string + public static function removeScriptsFromDocument(HtmlDocument $doc) { - if (empty($html)) { - return $html; - } - - $doc = new HtmlDocument($html); - // Remove standard script tags $scriptElems = $doc->queryXPath('//script'); static::removeNodes($scriptElems); @@ -53,6 +47,19 @@ class HtmlContentFilter // Remove 'on*' attributes $onAttributes = $doc->queryXPath('//@*[starts-with(name(), \'on\')]'); static::removeAttributes($onAttributes); + } + + /** + * Remove scripts from the given HTML string. + */ + public static function removeScriptsFromHtmlString(string $html): string + { + if (empty($html)) { + return $html; + } + + $doc = new HtmlDocument($html); + static::removeScriptsFromDocument($doc); return $doc->getBodyInnerHtml(); } diff --git a/tests/Entity/PageContentTest.php b/tests/Entity/PageContentTest.php index d8845fe12..5b46c08a3 100644 --- a/tests/Entity/PageContentTest.php +++ b/tests/Entity/PageContentTest.php @@ -8,7 +8,7 @@ use Tests\TestCase; class PageContentTest extends TestCase { - protected $base64Jpeg = '/9j/2wBDAAMCAgICAgMCAgIDAwMDBAYEBAQEBAgGBgUGCQgKCgkICQkKDA8MCgsOCwkJDRENDg8QEBEQCgwSExIQEw8QEBD/yQALCAABAAEBAREA/8wABgAQEAX/2gAIAQEAAD8A0s8g/9k='; + protected string $base64Jpeg = '/9j/2wBDAAMCAgICAgMCAgIDAwMDBAYEBAQEBAgGBgUGCQgKCgkICQkKDA8MCgsOCwkJDRENDg8QEBEQCgwSExIQEw8QEBD/yQALCAABAAEBAREA/8wABgAQEAX/2gAIAQEAAD8A0s8g/9k='; public function test_page_includes() { @@ -57,38 +57,6 @@ class PageContentTest extends TestCase $this->assertEquals('', $page->text); } - public function test_page_includes_do_not_break_tables() - { - $page = $this->entities->page(); - $secondPage = $this->entities->page(); - - $content = '
test
'; - $secondPage->html = $content; - $secondPage->save(); - - $page->html = "{{@{$secondPage->id}#table}}"; - $page->save(); - - $pageResp = $this->asEditor()->get($page->getUrl()); - $pageResp->assertSee($content, false); - } - - public function test_page_includes_do_not_break_code() - { - $page = $this->entities->page(); - $secondPage = $this->entities->page(); - - $content = '
var cat = null;
'; - $secondPage->html = $content; - $secondPage->save(); - - $page->html = "{{@{$secondPage->id}#bkmrk-code}}"; - $page->save(); - - $pageResp = $this->asEditor()->get($page->getUrl()); - $pageResp->assertSee($content, false); - } - public function test_page_includes_rendered_on_book_export() { $page = $this->entities->page(); diff --git a/tests/Unit/PageIncludeParserTest.php b/tests/Unit/PageIncludeParserTest.php index ee7a64344..c4c127626 100644 --- a/tests/Unit/PageIncludeParserTest.php +++ b/tests/Unit/PageIncludeParserTest.php @@ -3,6 +3,7 @@ namespace Tests\Unit; use BookStack\Entities\Tools\PageIncludeParser; +use BookStack\Util\HtmlDocument; use Tests\TestCase; class PageIncludeParserTest extends TestCase @@ -214,13 +215,14 @@ class PageIncludeParserTest extends TestCase ); } - protected function runParserTest(string $html, array $contentById, string $expected) + protected function runParserTest(string $html, array $contentById, string $expected): void { - $parser = new PageIncludeParser($html, function (int $id) use ($contentById) { + $doc = new HtmlDocument($html); + $parser = new PageIncludeParser($doc, function (int $id) use ($contentById) { return $contentById[strval($id)] ?? ''; }); - $result = $parser->parse(); - $this->assertEquals($expected, $result); + $parser->parse(); + $this->assertEquals($expected, $doc->getBodyInnerHtml()); } }