From 4874dc1304ee26737884d787893743ae323e5cf2 Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Sat, 25 Nov 2023 17:32:00 +0000 Subject: [PATCH] Includes: Updated logic regarding parent block els, added tests Expanded tests with many more cases, and added fixes for failed scenarios. Updated logic to specifically handling parent

tags, and now assume compatibility with parent block types elswhere to allow use in a variety of scenarios (td, details, blockquote etc...). --- app/Entities/Tools/PageIncludeParser.php | 99 ++++++++++++++++-------- tests/Unit/PageIncludeParserTest.php | 90 +++++++++++++++++++++ 2 files changed, 158 insertions(+), 31 deletions(-) diff --git a/app/Entities/Tools/PageIncludeParser.php b/app/Entities/Tools/PageIncludeParser.php index 5ce847d6c..e55fc22c7 100644 --- a/app/Entities/Tools/PageIncludeParser.php +++ b/app/Entities/Tools/PageIncludeParser.php @@ -13,37 +13,45 @@ class PageIncludeParser { protected static string $includeTagRegex = "/{{@\s?([0-9].*?)}}/"; + /** + * Elements to clean up and remove if left empty after a parsing operation. + * @var DOMElement[] + */ + protected array $toCleanup = []; + public function __construct( protected string $pageHtml, protected Closure $pageContentForId, ) { } + /** + * Parse out the include tags. + */ public function parse(): string { $doc = new HtmlDocument($this->pageHtml); $tags = $this->locateAndIsolateIncludeTags($doc); - $topLevel = [...$doc->getBodyChildren()]; foreach ($tags as $tag) { $htmlContent = $this->pageContentForId->call($this, $tag->getPageId()); $content = new PageIncludeContent($htmlContent, $tag); if (!$content->isInline()) { - $isParentTopLevel = in_array($tag->domNode->parentNode, $topLevel, true); - if ($isParentTopLevel) { + $parentP = $this->getParentParagraph($tag->domNode); + $isWithinParentP = $parentP === $tag->domNode->parentNode; + if ($parentP && $isWithinParentP) { $this->splitNodeAtChildNode($tag->domNode->parentNode, $tag->domNode); - } else { - $this->promoteTagNodeToBody($tag, $doc->getBody()); + } else if ($parentP) { + $this->moveTagNodeToBesideParent($tag, $parentP); } } $this->replaceNodeWithNodes($tag->domNode, $content->toDomNodes()); } - // TODO Notes: May want to eventually parse through backwards, which should avoid issues - // in changes affecting the next tag, where tags may be in the same/adjacent nodes. + $this->cleanup(); return $doc->getBodyInnerHtml(); } @@ -55,7 +63,7 @@ class PageIncludeParser */ protected function locateAndIsolateIncludeTags(HtmlDocument $doc): array { - $includeHosts = $doc->queryXPath("//body//*[contains(text(), '{{@')]"); + $includeHosts = $doc->queryXPath("//body//*[text()[contains(., '{{@')]]"); $includeTags = []; /** @var DOMNode $node */ @@ -107,6 +115,7 @@ class PageIncludeParser } /** + * Replace the given node with all those in $replacements * @param DOMNode[] $replacements */ protected function replaceNodeWithNodes(DOMNode $toReplace, array $replacements): void @@ -125,51 +134,79 @@ class PageIncludeParser $toReplace->parentNode->removeChild($toReplace); } - protected function promoteTagNodeToBody(PageIncludeTag $tag, DOMNode $body): void + /** + * Move a tag node to become a sibling of the given parent. + * Will attempt to guess a position based upon the tag content within the parent. + */ + protected function moveTagNodeToBesideParent(PageIncludeTag $tag, DOMNode $parent): void { - /** @var DOMNode $topParent */ - $topParent = $tag->domNode->parentNode; - while ($topParent->parentNode !== $body) { - $topParent = $topParent->parentNode; - } - - $parentText = $topParent->textContent; + $parentText = $parent->textContent; $tagPos = strpos($parentText, $tag->tagContent); $before = $tagPos < (strlen($parentText) / 2); if ($before) { - $body->insertBefore($tag->domNode, $topParent); + $parent->parentNode->insertBefore($tag->domNode, $parent); } else { - $body->insertBefore($tag->domNode, $topParent->nextSibling); + $parent->parentNode->insertBefore($tag->domNode, $parent->nextSibling); } } + /** + * Splits the given $parentNode at the location of the $domNode within it. + * Attempts replicate the original $parentNode, moving some of their parent + * children in where needed, before adding the $domNode between. + */ protected function splitNodeAtChildNode(DOMElement $parentNode, DOMNode $domNode): void { $children = [...$parentNode->childNodes]; - $splitPos = array_search($domNode, $children, true) ?: count($children); + $splitPos = array_search($domNode, $children, true); + if ($splitPos === false) { + $splitPos = count($children) - 1; + } + $parentClone = $parentNode->cloneNode(); + $parentNode->parentNode->insertBefore($parentClone, $parentNode); $parentClone->removeAttribute('id'); /** @var DOMNode $child */ for ($i = 0; $i < $splitPos; $i++) { - $child = $children[0]; + $child = $children[$i]; $parentClone->appendChild($child); } - if ($parentClone->hasChildNodes()) { - $parentNode->parentNode->insertBefore($parentClone, $parentNode); - } - $parentNode->parentNode->insertBefore($domNode, $parentNode); - $parentClone->normalize(); - $parentNode->normalize(); - if (!$parentNode->hasChildNodes()) { - $parentNode->remove(); - } - if (!$parentClone->hasChildNodes()) { - $parentClone->remove(); + $this->toCleanup[] = $parentNode; + $this->toCleanup[] = $parentClone; + } + + /** + * Get the parent paragraph of the given node, if existing. + */ + protected function getParentParagraph(DOMNode $parent): ?DOMNode + { + do { + if (strtolower($parent->nodeName) === 'p') { + return $parent; + } + + $parent = $parent->parentElement; + } while ($parent !== null); + + return null; + } + + /** + * Cleanup after a parse operation. + * Removes stranded elements we may have left during the parse. + */ + protected function cleanup(): void + { + foreach ($this->toCleanup as $element) { + $element->normalize(); + if ($element->parentNode && !$element->hasChildNodes()) { + $element->parentNode->removeChild($element); + } } } } diff --git a/tests/Unit/PageIncludeParserTest.php b/tests/Unit/PageIncludeParserTest.php index e0bd69e93..ee7a64344 100644 --- a/tests/Unit/PageIncludeParserTest.php +++ b/tests/Unit/PageIncludeParserTest.php @@ -34,6 +34,15 @@ class PageIncludeParserTest extends TestCase ); } + public function test_complex_inline_text_within_other_text() + { + $this->runParserTest( + '

Hello {{@45#content}}there!

', + ['45' => '

Testing withsomeextratags

'], + '

Hello Testing withsomeextratagsthere!

', + ); + } + public function test_block_content_types() { $inputs = [ @@ -97,6 +106,51 @@ class PageIncludeParserTest extends TestCase ); } + public function test_block_content_in_allowable_parent_element() + { + $this->runParserTest( + '
{{@45#content}}
', + ['45' => '
doggos
'], + '
doggos
', + ); + } + + public function test_block_content_in_paragraph_origin_with_allowable_grandparent() + { + $this->runParserTest( + '

{{@45#content}}

', + ['45' => '
doggos
'], + '
doggos
', + ); + } + + public function test_block_content_in_paragraph_origin_with_allowable_grandparent_with_adjacent_content() + { + $this->runParserTest( + '

Cute {{@45#content}} over there!

', + ['45' => '
doggos
'], + '

Cute

doggos

over there!

', + ); + } + + public function test_block_content_in_child_within_paragraph_origin_with_allowable_grandparent_with_adjacent_content() + { + $this->runParserTest( + '

Cute {{@45#content}} over there!

', + ['45' => '
doggos
'], + '
doggos

Cute over there!

', + ); + } + + public function test_block_content_in_paragraph_origin_within_details() + { + $this->runParserTest( + '

{{@45#content}}

', + ['45' => '
doggos
'], + '
doggos
', + ); + } + public function test_simple_whole_document() { $this->runParserTest( @@ -124,6 +178,42 @@ class PageIncludeParserTest extends TestCase ); } + public function test_multiple_tags_in_same_origin_with_inline_content() + { + $this->runParserTest( + '

This {{@45#content}}{{@45#content}} content is {{@45#content}}

', + ['45' => '

inline

'], + '

This inlineinline content is inline

', + ); + } + + public function test_multiple_tags_in_same_origin_with_block_content() + { + $this->runParserTest( + '

This {{@45#content}}{{@45#content}} content is {{@45#content}}

', + ['45' => '
block
'], + '

This

block
block

content is

block
', + ); + } + + public function test_multiple_tags_in_differing_origin_levels_with_block_content() + { + $this->runParserTest( + '

This {{@45#content}} content is {{@45#content}}

{{@45#content}}
', + ['45' => '
block
'], + '
block

This content is

block
block
', + ); + } + + public function test_multiple_tags_in_shallow_origin_with_multi_block_content() + { + $this->runParserTest( + '

{{@45}}C{{@45}}

{{@45}}{{@45}}
', + ['45' => '

A

B

'], + '

A

B

C

A

B

A

B

A

B

', + ); + } + protected function runParserTest(string $html, array $contentById, string $expected) { $parser = new PageIncludeParser($html, function (int $id) use ($contentById) {