Added page content parsing to up-rank header text in search
This adds parsing of page content so that terms found in headers receive a boost to their scores in the search term index. Additionally, this merges title and content terms to slightly reduce the number of stored terms. Includes tests to cover these changes.
This commit is contained in:
parent
820be162f5
commit
f28daa01d9
8 changed files with 158 additions and 38 deletions
|
@ -24,7 +24,7 @@ class Book extends Entity implements HasCoverImage
|
|||
{
|
||||
use HasFactory;
|
||||
|
||||
public $searchFactor = 2;
|
||||
public $searchFactor = 1.5;
|
||||
|
||||
protected $fillable = ['name', 'description'];
|
||||
protected $hidden = ['restricted', 'pivot', 'image_id', 'deleted_at'];
|
||||
|
|
|
@ -13,7 +13,7 @@ class Bookshelf extends Entity implements HasCoverImage
|
|||
|
||||
protected $table = 'bookshelves';
|
||||
|
||||
public $searchFactor = 3;
|
||||
public $searchFactor = 1.5;
|
||||
|
||||
protected $fillable = ['name', 'description', 'image_id'];
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@ class Chapter extends BookChild
|
|||
{
|
||||
use HasFactory;
|
||||
|
||||
public $searchFactor = 1.3;
|
||||
public $searchFactor = 1.5;
|
||||
|
||||
protected $fillable = ['name', 'description', 'priority', 'book_id'];
|
||||
protected $hidden = ['restricted', 'pivot', 'deleted_at'];
|
||||
|
|
|
@ -238,20 +238,12 @@ abstract class Entity extends Model implements Sluggable, Favouritable, Viewable
|
|||
return mb_substr($this->name, 0, $length - 3) . '...';
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the body text of this entity.
|
||||
*/
|
||||
public function getText(): string
|
||||
{
|
||||
return $this->{$this->textField} ?? '';
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an excerpt of this entity's descriptive content to the specified length.
|
||||
*/
|
||||
public function getExcerpt(int $length = 100): string
|
||||
{
|
||||
$text = $this->getText();
|
||||
$text = $this->{$this->textField} ?? '';
|
||||
|
||||
if (mb_strlen($text) > $length) {
|
||||
$text = mb_substr($text, 0, $length - 3) . '...';
|
||||
|
|
|
@ -3,13 +3,13 @@
|
|||
namespace BookStack\Entities\Models;
|
||||
|
||||
use BookStack\Entities\Tools\PageContent;
|
||||
use BookStack\Facades\Permissions;
|
||||
use BookStack\Uploads\Attachment;
|
||||
use Illuminate\Database\Eloquent\Builder;
|
||||
use Illuminate\Database\Eloquent\Collection;
|
||||
use Illuminate\Database\Eloquent\Factories\HasFactory;
|
||||
use Illuminate\Database\Eloquent\Relations\BelongsTo;
|
||||
use Illuminate\Database\Eloquent\Relations\HasMany;
|
||||
use Permissions;
|
||||
|
||||
/**
|
||||
* Class Page.
|
||||
|
@ -64,10 +64,8 @@ class Page extends BookChild
|
|||
|
||||
/**
 * Check if this page has a chapter.
 *
 * Runs an existence check against the chapter relation query rather than
 * counting all matching rows, since only presence matters here.
 */
public function hasChapter(): bool
{
    return $this->chapter()->exists();
}
|
||||
|
|
|
@ -157,8 +157,8 @@ class PageRepo
|
|||
*/
|
||||
public function publishDraft(Page $draft, array $input): Page
|
||||
{
|
||||
$this->baseRepo->update($draft, $input);
|
||||
$this->updateTemplateStatusAndContentFromInput($draft, $input);
|
||||
$this->baseRepo->update($draft, $input);
|
||||
|
||||
$draft->draft = false;
|
||||
$draft->revision_count = 1;
|
||||
|
|
|
@ -4,7 +4,10 @@ namespace BookStack\Entities\Tools;
|
|||
|
||||
use BookStack\Entities\EntityProvider;
|
||||
use BookStack\Entities\Models\Entity;
|
||||
use BookStack\Entities\Models\Page;
|
||||
use BookStack\Entities\Models\SearchTerm;
|
||||
use DOMDocument;
|
||||
use DOMNode;
|
||||
use Illuminate\Support\Collection;
|
||||
|
||||
class SearchIndex
|
||||
|
@ -64,7 +67,8 @@ class SearchIndex
|
|||
SearchTerm::query()->truncate();
|
||||
|
||||
foreach ($this->entityProvider->all() as $entityModel) {
|
||||
$selectFields = ['id', 'name', $entityModel->textField];
|
||||
$indexContentField = $entityModel instanceof Page ? 'html' : 'description';
|
||||
$selectFields = ['id', 'name', $indexContentField];
|
||||
$total = $entityModel->newQuery()->withTrashed()->count();
|
||||
$chunkSize = 250;
|
||||
$processed = 0;
|
||||
|
@ -93,11 +97,70 @@ class SearchIndex
|
|||
}
|
||||
|
||||
/**
 * Create a scored term map from the given text, where the keys are the terms
 * and the values are their scores.
 *
 * Each term's score is its occurrence count within the text multiplied by
 * the given score adjustment. The adjustment is accepted as a float because
 * callers pass fractional entity search factors (for example 1.5); an int
 * parameter would truncate those to a whole number, or raise a TypeError
 * under strict typing.
 *
 * @param string $text            Plain text to split into terms.
 * @param float  $scoreAdjustment Multiplier applied to each term's occurrence count.
 *
 * @return array<string, int|float>
 */
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
    $termMap = $this->textToTermCountMap($text);

    foreach ($termMap as $term => $count) {
        $termMap[$term] = $count * $scoreAdjustment;
    }

    return $termMap;
}
|
||||
|
||||
/**
 * Create a scored term map from the given HTML, where the keys are the terms
 * and the values are their scores.
 *
 * Only the top-level elements of the given HTML are inspected. The full text
 * content of each top-level element (including nested elements) is split into
 * terms, and each term's occurrence count is multiplied by the adjustment for
 * that element's tag name. Header tags are boosted; any other node scores 1
 * per occurrence. Scores for the same term across elements are summed.
 *
 * @param string $html HTML content to index.
 *
 * @return array<string, int|float> Scores may be fractional (h6 uses 1.5).
 */
protected function generateTermScoreMapFromHtml(string $html): array
{
    // Compare explicitly rather than using empty(), which would also
    // discard valid content consisting of just "0".
    if (trim($html) === '') {
        return [];
    }

    $scoresByTerm = [];
    $elementScoreAdjustmentMap = [
        'h1' => 10,
        'h2' => 5,
        'h3' => 4,
        'h4' => 3,
        'h5' => 2,
        'h6' => 1.5,
    ];

    // Wrap in a body element so the content's top-level nodes can be
    // found as that body's direct children after parsing.
    $html = '<body>' . $html . '</body>';
    libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    // Encode non-ASCII characters as numeric entities so the parser reads
    // UTF-8 content correctly. This replaces the 'HTML-ENTITIES' target of
    // mb_convert_encoding(), which is deprecated as of PHP 8.2.
    $doc->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
    // Discard collected parse errors so the libxml error buffer does not
    // keep growing when many documents are parsed in one process.
    libxml_clear_errors();

    $root = $doc->documentElement;
    $body = $root !== null ? $root->childNodes->item(0) : null;
    if ($body === null) {
        // Nothing parseable was produced, so there are no terms to score.
        return [];
    }

    /** @var DOMNode $child */
    foreach ($body->childNodes as $child) {
        $adjustment = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
        $termCounts = $this->textToTermCountMap(trim($child->textContent));
        foreach ($termCounts as $term => $count) {
            $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $adjustment);
        }
    }

    return $scoresByTerm;
}
|
||||
|
||||
/**
|
||||
* For the given text, return an array where the keys are the unique term words
|
||||
* and the values are the frequency of that term.
|
||||
*
|
||||
* @returns array<string, int>
|
||||
*/
|
||||
protected function textToTermCountMap(string $text): array
|
||||
{
|
||||
$tokenMap = []; // {TextToken => OccurrenceCount}
|
||||
$splitChars = " \n\t.,!?:;()[]{}<>`'\"";
|
||||
|
@ -111,34 +174,61 @@ class SearchIndex
|
|||
$token = strtok($splitChars);
|
||||
}
|
||||
|
||||
$terms = [];
|
||||
foreach ($tokenMap as $token => $count) {
|
||||
$terms[] = [
|
||||
'term' => $token,
|
||||
'score' => $count * $scoreAdjustment,
|
||||
];
|
||||
}
|
||||
|
||||
return $terms;
|
||||
return $tokenMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* For the given entity, Generate an array of term data details.
|
||||
* Is the raw term data, not instances of SearchTerm models.
|
||||
*
|
||||
* @returns array{term: string, score: float}[]
|
||||
* @returns array{term: string, score: float, entity_id: int, entity_type: string}[]
|
||||
*/
|
||||
protected function entityToTermDataArray(Entity $entity): array
|
||||
{
|
||||
$nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor);
|
||||
$bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
|
||||
$termData = array_merge($nameTerms, $bodyTerms);
|
||||
$nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
|
||||
|
||||
foreach ($termData as $index => $term) {
|
||||
$termData[$index]['entity_type'] = $entity->getMorphClass();
|
||||
$termData[$index]['entity_id'] = $entity->id;
|
||||
if ($entity instanceof Page) {
|
||||
$bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
|
||||
} else {
|
||||
$bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor);
|
||||
}
|
||||
|
||||
return $termData;
|
||||
$mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap);
|
||||
|
||||
$dataArray = [];
|
||||
$entityId = $entity->id;
|
||||
$entityType = $entity->getMorphClass();
|
||||
foreach ($mergedScoreMap as $term => $score) {
|
||||
$dataArray[] = [
|
||||
'term' => $term,
|
||||
'score' => $score,
|
||||
'entity_type' => $entityType,
|
||||
'entity_id' => $entityId,
|
||||
];
|
||||
}
|
||||
|
||||
return $dataArray;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Combine any number of term score maps into a single map.
 * Where a term appears in more than one map, its scores are added together.
 * Terms keep the order in which they are first encountered.
 *
 * @param array<string, int>[] ...$scoreMaps
 *
 * @return array<string, int>
 */
protected function mergeTermScoreMaps(...$scoreMaps): array
{
    $totals = [];

    foreach ($scoreMaps as $map) {
        foreach ($map as $word => $points) {
            // Start unseen terms from zero before adding this map's score.
            if (!isset($totals[$word])) {
                $totals[$word] = 0;
            }

            $totals[$word] += $points;
        }
    }

    return $totals;
}
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ use BookStack\Entities\Models\Book;
|
|||
use BookStack\Entities\Models\Bookshelf;
|
||||
use BookStack\Entities\Models\Chapter;
|
||||
use BookStack\Entities\Models\Page;
|
||||
use BookStack\Entities\Models\SearchTerm;
|
||||
use Tests\TestCase;
|
||||
|
||||
class EntitySearchTest extends TestCase
|
||||
|
@ -320,4 +321,43 @@ class EntitySearchTest extends TestCase
|
|||
$search->assertElementContains('.entity-list > .page', 'Test page B', 1);
|
||||
$search->assertElementContains('.entity-list > .page', 'Test page A', 2);
|
||||
}
|
||||
|
||||
public function test_terms_in_headers_have_an_adjusted_index_score()
{
    $html = '
<p>TermA</p>
<h1>TermB <strong>TermNested</strong></h1>
<h2>TermC</h2>
<h3>TermD</h3>
<h4>TermE</h4>
<h5>TermF</h5>
<h6>TermG</h6>
';
    $page = $this->newPage(['name' => 'Test page A', 'html' => $html]);

    // Fetch the indexed scores for this page, keyed by term.
    $indexedScores = SearchTerm::query()
        ->where(['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'])
        ->pluck('score', 'term');

    // Expected stored score per term, listed in document order.
    $expectedScores = [
        'TermA'      => 1,
        'TermB'      => 10,
        'TermNested' => 10,
        'TermC'      => 5,
        'TermD'      => 4,
        'TermE'      => 3,
        'TermF'      => 2,
        // Is 1.5 but stored as integer, rounding up
        'TermG'      => 2,
    ];

    foreach ($expectedScores as $term => $expected) {
        $this->assertEquals($expected, $indexedScores->get($term));
    }
}
|
||||
|
||||
public function test_name_and_content_terms_are_merged_to_single_score()
{
    $html = '
<p>TermA</p>
';
    $page = $this->newPage(['name' => 'TermA', 'html' => $html]);

    // Fetch the indexed scores for this page, keyed by term.
    $indexedScores = SearchTerm::query()
        ->where(['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'])
        ->pluck('score', 'term');

    // The name contributes 40 and the content contributes 1, stored as one combined row.
    $expectedScore = 41;
    $this->assertEquals($expectedScore, $indexedScores->get('TermA'));
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue