Added page content parsing to up-rank header text in search

This adds parsing of page content so that headers apply a boost to
scores in the search term index.
Additionally, this merges title and content terms to slightly reduce the
number of stored terms.
Includes tests covering these changes.
This commit is contained in:
Dan Brown 2021-11-12 13:47:23 +00:00
parent 820be162f5
commit f28daa01d9
No known key found for this signature in database
GPG key ID: 46D9F943C24A2EF9
8 changed files with 158 additions and 38 deletions

View file

@ -24,7 +24,7 @@ class Book extends Entity implements HasCoverImage
{
use HasFactory;
public $searchFactor = 2;
public $searchFactor = 1.5;
protected $fillable = ['name', 'description'];
protected $hidden = ['restricted', 'pivot', 'image_id', 'deleted_at'];

View file

@ -13,7 +13,7 @@ class Bookshelf extends Entity implements HasCoverImage
protected $table = 'bookshelves';
public $searchFactor = 3;
public $searchFactor = 1.5;
protected $fillable = ['name', 'description', 'image_id'];

View file

@ -16,7 +16,7 @@ class Chapter extends BookChild
{
use HasFactory;
public $searchFactor = 1.3;
public $searchFactor = 1.5;
protected $fillable = ['name', 'description', 'priority', 'book_id'];
protected $hidden = ['restricted', 'pivot', 'deleted_at'];

View file

@ -238,20 +238,12 @@ abstract class Entity extends Model implements Sluggable, Favouritable, Viewable
return mb_substr($this->name, 0, $length - 3) . '...';
}
/**
* Get the body text of this entity.
*/
public function getText(): string
{
return $this->{$this->textField} ?? '';
}
/**
* Get an excerpt of this entity's descriptive content to the specified length.
*/
public function getExcerpt(int $length = 100): string
{
$text = $this->getText();
$text = $this->{$this->textField} ?? '';
if (mb_strlen($text) > $length) {
$text = mb_substr($text, 0, $length - 3) . '...';

View file

@ -3,13 +3,13 @@
namespace BookStack\Entities\Models;
use BookStack\Entities\Tools\PageContent;
use BookStack\Facades\Permissions;
use BookStack\Uploads\Attachment;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Collection;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;
use Permissions;
/**
* Class Page.
@ -64,10 +64,8 @@ class Page extends BookChild
/**
 * Determine whether this page sits within a chapter.
 * True when the chapter() relation query counts at least one result.
 */
public function hasChapter(): bool
{
    $chapterCount = $this->chapter()->count();

    return $chapterCount > 0;
}

View file

@ -157,8 +157,8 @@ class PageRepo
*/
public function publishDraft(Page $draft, array $input): Page
{
$this->baseRepo->update($draft, $input);
$this->updateTemplateStatusAndContentFromInput($draft, $input);
$this->baseRepo->update($draft, $input);
$draft->draft = false;
$draft->revision_count = 1;

View file

@ -4,7 +4,10 @@ namespace BookStack\Entities\Tools;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Entities\Models\SearchTerm;
use DOMDocument;
use DOMNode;
use Illuminate\Support\Collection;
class SearchIndex
@ -64,7 +67,8 @@ class SearchIndex
SearchTerm::query()->truncate();
foreach ($this->entityProvider->all() as $entityModel) {
$selectFields = ['id', 'name', $entityModel->textField];
$indexContentField = $entityModel instanceof Page ? 'html' : 'description';
$selectFields = ['id', 'name', $indexContentField];
$total = $entityModel->newQuery()->withTrashed()->count();
$chunkSize = 250;
$processed = 0;
@ -93,11 +97,70 @@ class SearchIndex
}
/**
 * Create a scored term map from the given text, where the keys are the terms
 * and the values are their scores.
 * Each term's score is its occurrence count multiplied by the given adjustment.
 *
 * @param string    $text            The plain text to split into terms.
 * @param int|float $scoreAdjustment Multiplier applied to every term's occurrence count.
 *                                   Accepts fractional values, since entity search factors
 *                                   (such as 1.5) are passed in here and would otherwise
 *                                   be truncated when coerced to an integer.
 *
 * @return array<string, int|float>
 */
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
    $termMap = $this->textToTermCountMap($text);

    foreach ($termMap as $term => $count) {
        $termMap[$term] = $count * $scoreAdjustment;
    }

    return $termMap;
}
/**
 * Create a scored term map from the given HTML, where the keys are the terms
 * and the values are their scores.
 *
 * Only the top-level nodes of the content are inspected. All text within a
 * top-level h1-h6 element (nested children included) has its term counts
 * multiplied by that header level's weighting; text in any other top-level
 * node is counted with a weighting of 1.
 *
 * @param string $html The HTML content to index.
 *
 * @return array<string, int|float>
 */
protected function generateTermScoreMapFromHtml(string $html): array
{
    // Strict comparison, since empty() would also discard content that is exactly "0".
    if ($html === '') {
        return [];
    }

    $elementScoreAdjustmentMap = [
        'h1' => 10,
        'h2' => 5,
        'h3' => 4,
        'h4' => 3,
        'h5' => 2,
        'h6' => 1.5,
    ];

    // Wrap in a body element so the content's top-level nodes share a single known parent.
    $html = '<body>' . $html . '</body>';
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
    // Parse errors are buffered while internal error handling is enabled;
    // drop them so the buffer does not keep growing when many pages are indexed in one run.
    libxml_clear_errors();

    // The first child of the document element is expected to be the wrapping body.
    $root = $doc->documentElement;
    $body = $root !== null ? $root->childNodes->item(0) : null;
    if ($body === null) {
        return [];
    }

    $scoresByTerm = [];

    /** @var DOMNode $child */
    foreach ($body->childNodes as $child) {
        // Nodes other than h1-h6 (including bare text nodes) fall back to a weighting of 1.
        $adjustment = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
        $termCounts = $this->textToTermCountMap(trim($child->textContent));

        foreach ($termCounts as $term => $count) {
            $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $adjustment);
        }
    }

    return $scoresByTerm;
}
/**
* For the given text, return an array where the keys are the unique term words
* and the values are the frequency of that term.
*
* @returns array<string, int>
*/
protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
$splitChars = " \n\t.,!?:;()[]{}<>`'\"";
@ -111,34 +174,61 @@ class SearchIndex
$token = strtok($splitChars);
}
$terms = [];
foreach ($tokenMap as $token => $count) {
$terms[] = [
'term' => $token,
'score' => $count * $scoreAdjustment,
];
}
return $terms;
return $tokenMap;
}
/**
 * For the given entity, generate an array of term data details.
 * Is the raw term data, not instances of SearchTerm models.
 *
 * Name terms are weighted at 40 times the entity's search factor. Body terms
 * come from the parsed HTML for pages, or from the description (weighted by the
 * entity's search factor) for all other entity types. Name and body scores for
 * the same term are combined into a single entry.
 *
 * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
 */
protected function entityToTermDataArray(Entity $entity): array
{
    $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);

    // Body fields are null-coalesced since the term generators require strings
    // and an unset/null attribute would otherwise raise a TypeError.
    if ($entity instanceof Page) {
        $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html ?? '');
    } else {
        $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description ?? '', $entity->searchFactor);
    }

    $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);

    $dataArray = [];
    $entityId = $entity->id;
    $entityType = $entity->getMorphClass();

    foreach ($mergedScoreMap as $term => $score) {
        $dataArray[] = [
            'term'        => $term,
            'score'       => $score,
            'entity_type' => $entityType,
            'entity_id'   => $entityId,
        ];
    }

    return $dataArray;
}
/**
 * Combine any number of term score maps into one map.
 * Where a term appears in more than one map, its scores are summed.
 *
 * @param array<string, int>[] ...$scoreMaps
 *
 * @return array<string, int>
 */
protected function mergeTermScoreMaps(...$scoreMaps): array
{
    $combined = [];

    foreach ($scoreMaps as $currentMap) {
        foreach ($currentMap as $term => $score) {
            // Start terms not yet seen at zero before accumulating.
            if (!isset($combined[$term])) {
                $combined[$term] = 0;
            }

            $combined[$term] += $score;
        }
    }

    return $combined;
}
}

View file

@ -7,6 +7,7 @@ use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Bookshelf;
use BookStack\Entities\Models\Chapter;
use BookStack\Entities\Models\Page;
use BookStack\Entities\Models\SearchTerm;
use Tests\TestCase;
class EntitySearchTest extends TestCase
@ -320,4 +321,43 @@ class EntitySearchTest extends TestCase
$search->assertElementContains('.entity-list > .page', 'Test page B', 1);
$search->assertElementContains('.entity-list > .page', 'Test page A', 2);
}
/**
 * Terms found within header elements should be indexed with a score
 * reflecting that header level, with nested header text scored the same.
 */
public function test_terms_in_headers_have_an_adjusted_index_score()
{
    $pageHtml = '
<p>TermA</p>
<h1>TermB <strong>TermNested</strong></h1>
<h2>TermC</h2>
<h3>TermD</h3>
<h4>TermE</h4>
<h5>TermF</h5>
<h6>TermG</h6>
';
    $page = $this->newPage(['name' => 'Test page A', 'html' => $pageHtml]);

    $scoreByTerm = SearchTerm::query()
        ->where(['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'])
        ->pluck('score', 'term');

    $expectedScores = [
        'TermA'      => 1,
        'TermB'      => 10,
        'TermNested' => 10,
        'TermC'      => 5,
        'TermD'      => 4,
        'TermE'      => 3,
        'TermF'      => 2,
        // Is 1.5 but stored as integer, rounding up
        'TermG'      => 2,
    ];

    foreach ($expectedScores as $term => $expectedScore) {
        $this->assertEquals($expectedScore, $scoreByTerm->get($term));
    }
}
/**
 * A term present in both the page name and the page content should be
 * stored as a single term whose score is the sum of both sources.
 */
public function test_name_and_content_terms_are_merged_to_single_score()
{
    $pageDetails = ['name' => 'TermA', 'html' => '
<p>TermA</p>
'];
    $page = $this->newPage($pageDetails);

    $termScores = SearchTerm::query()
        ->where(['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'])
        ->pluck('score', 'term');

    // Scores 40 for being in the name then 1 for being in the content
    $mergedScore = $termScores->get('TermA');
    $this->assertEquals(41, $mergedScore);
}
}