2021-06-26 17:23:15 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace BookStack\Util;
|
2021-05-04 00:59:52 +02:00
|
|
|
|
2021-09-03 23:34:49 +02:00
|
|
|
use DOMAttr;
|
2021-05-04 00:59:52 +02:00
|
|
|
use DOMDocument;
|
2021-11-06 01:32:01 +01:00
|
|
|
use DOMElement;
|
2021-05-04 00:59:52 +02:00
|
|
|
use DOMNodeList;
|
|
|
|
use DOMXPath;
|
|
|
|
|
|
|
|
class HtmlContentFilter
{
    /**
     * Remove script elements, and other content able to run scripts, from the given HTML.
     *
     * The fragment is parsed into a DOM document and the following are stripped:
     *  - script elements;
     *  - elements whose href, action or formaction attribute contains "javascript:";
     *  - meta elements whose content attribute contains "url";
     *  - elements whose src attribute contains "data:" or "javascript:", or which have a srcdoc attribute;
     *  - attributes found under svg elements whose value contains "data:" or "javascript:";
     *  - attributes whose name contains "xlink:href";
     *  - attributes whose name starts with "on".
     * All value matching above is case-insensitive (see xpathContains()).
     *
     * @param string $html The HTML fragment to filter.
     *
     * @return string The filtered fragment as re-serialised by DOMDocument. An input that is
     *                empty() is returned untouched. An empty string is returned if filtering
     *                removed the container holding the whole fragment.
     */
    public static function removeScripts(string $html): string
    {
        if (empty($html)) {
            return $html;
        }

        // The leading declaration states the encoding to the parser, and the body wrapper
        // provides a known container from which the filtered children are read back below.
        $html = '<?xml encoding="utf-8" ?><body>' . $html . '</body>';

        // Buffer parse errors inside libxml instead of having them raised as PHP warnings.
        // NOTE(review): the flag is left enabled afterwards, exactly as before this change;
        // confirm whether other code relies on that before restoring the previous value.
        libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        // Nothing reads the buffered errors, so drop them; otherwise every call adds to a
        // process-wide buffer that is never emptied.
        libxml_clear_errors();
        $xPath = new DOMXPath($doc);

        // Remove standard script tags
        $scriptElems = $xPath->query('//script');
        static::removeNodes($scriptElems);

        // Remove clickable links to JavaScript URI
        $badLinks = $xPath->query('//*[' . static::xpathContains('@href', 'javascript:') . ']');
        static::removeNodes($badLinks);

        // Remove forms with calls to JavaScript URI
        $badForms = $xPath->query('//*[' . static::xpathContains('@action', 'javascript:') . '] | //*[' . static::xpathContains('@formaction', 'javascript:') . ']');
        static::removeNodes($badForms);

        // Remove meta tag to prevent external redirects
        $metaTags = $xPath->query('//meta[' . static::xpathContains('@content', 'url') . ']');
        static::removeNodes($metaTags);

        // Remove data or JavaScript iFrames.
        // The query is not limited to iframes: any element with such a src, or with a srcdoc, is removed.
        $badIframes = $xPath->query('//*[' . static::xpathContains('@src', 'data:') . '] | //*[' . static::xpathContains('@src', 'javascript:') . '] | //*[@srcdoc]');
        static::removeNodes($badIframes);

        // Remove attributes, within svg children, hiding JavaScript or data uris.
        // A bunch of svg element and attribute combinations expose xss possibilities.
        // For example, SVG animate tag can exploit javascript in values.
        $badValuesAttrs = $xPath->query('//svg//@*[' . static::xpathContains('.', 'data:') . '] | //svg//@*[' . static::xpathContains('.', 'javascript:') . ']');
        static::removeAttributes($badValuesAttrs);

        // Remove xlink:href attributes (the attribute only; the owning element is kept).
        // Used in SVG but deprecated anyway, so we'll be a bit more heavy-handed here.
        $xlinkHrefAttributes = $xPath->query('//@*[contains(name(), \'xlink:href\')]');
        static::removeAttributes($xlinkHrefAttributes);

        // Remove 'on*' attributes
        $onAttributes = $xPath->query('//@*[starts-with(name(), \'on\')]');
        static::removeAttributes($onAttributes);

        // The first child of the root element is expected to be the body wrapper added above.
        // A removal pass may have detached the root or that wrapper; the whole fragment lived
        // beneath it, so there is nothing left to output in that case.
        $root = $doc->documentElement;
        $container = ($root !== null) ? $root->childNodes->item(0) : null;
        if ($container === null) {
            return '';
        }

        $html = '';
        foreach ($container->childNodes as $child) {
            $html .= $doc->saveHTML($child);
        }

        return $html;
    }

    /**
     * Create an xpath contains statement with a translation automatically built within
     * to effectively search in a case-insensitive manner.
     *
     * The characters of the upper-cased value are mapped onto their lower-cased form via
     * translate() before the contains() check runs, so only characters that appear in
     * $value are case-folded in the searched property.
     *
     * @param string $property XPath expression to search within, such as "@href" or ".".
     * @param string $value    Text to look for. It is embedded between single quotes without
     *                         escaping, so it must not contain a single quote.
     *
     * @return string The XPath predicate expression, without surrounding brackets.
     */
    protected static function xpathContains(string $property, string $value): string
    {
        $value = strtolower($value);
        $upperVal = strtoupper($value);

        return 'contains(translate(' . $property . ', \'' . $upperVal . '\', \'' . $value . '\'), \'' . $value . '\')';
    }

    /**
     * Remove all the given DOMNodes.
     * Each node is detached from its parent, which takes its descendants along with it.
     */
    protected static function removeNodes(DOMNodeList $nodes): void
    {
        foreach ($nodes as $node) {
            $node->parentNode->removeChild($node);
        }
    }

    /**
     * Remove all the given attribute nodes.
     * Each attribute is removed, by name, from the element that carries it.
     */
    protected static function removeAttributes(DOMNodeList $attrs): void
    {
        /** @var DOMAttr $attr */
        foreach ($attrs as $attr) {
            $attrName = $attr->nodeName;
            /** @var DOMElement $parentNode */
            $parentNode = $attr->parentNode;
            $parentNode->removeAttribute($attrName);
        }
    }
}
|