From c0a21045fb9197927f18c2fdf181af1a6cc5f4d9 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Mon, 30 Aug 2021 11:26:57 +0200 Subject: [PATCH] People: Improve search query parser #22 #882 --- internal/query/geo.go | 20 ++++++++++----- internal/query/geo_test.go | 10 -------- internal/query/like.go | 45 ++++++++++++++++++++++++++------- internal/query/like_test.go | 42 +++++++++++++++++++++++------- internal/query/photo_search.go | 18 ++++++++++--- internal/query/subjects.go | 43 +++++++++++++++++++++++++++++++ internal/query/subjects_test.go | 11 ++++++++ pkg/txt/clip.go | 1 + pkg/txt/resources/stopwords.txt | 1 - pkg/txt/stopwords.go | 1 - 10 files changed, 152 insertions(+), 40 deletions(-) diff --git a/internal/query/geo.go b/internal/query/geo.go index daac0955d..925cb3286 100644 --- a/internal/query/geo.go +++ b/internal/query/geo.go @@ -39,7 +39,17 @@ func Geo(f form.GeoSearch) (results GeoResults, err error) { Where("photos.deleted_at IS NULL"). Where("photos.photo_lat <> 0") - f.Query = txt.Clip(f.Query, txt.ClipKeyword) + // Clip query to reasonable size if needed. + f.Query = txt.Clip(f.Query, txt.ClipQuery) + + // Modify query if it contains subject names. + if f.Query != "" && f.Subject == "" { + if subj, remaining := SubjectUIDs(f.Query); len(subj) > 0 { + log.Debugf("search: subjects %#v", subj) + f.Subject = strings.Join(subj, Or) + f.Query = remaining + } + } if f.Query != "" { // Filter by label, label category and keywords. @@ -47,12 +57,8 @@ func Geo(f form.GeoSearch) (results GeoResults, err error) { var labels []entity.Label var labelIds []uint - if len(f.Query) < 2 { - return results, fmt.Errorf("query too short") - } - if err := Db().Where(AnySlug("custom_slug", f.Query, " ")).Find(&labels).Error; len(labels) == 0 || err != nil { - log.Infof("search: label %s not found, using fuzzy search", txt.Quote(f.Query)) + log.Debugf("search: label %s not found, using fuzzy search", txt.Quote(f.Query)) for _, where := range LikeAnyKeyword("k.keyword", f.Query) { s = s.Where("photos.id IN (SELECT pk.photo_id FROM keywords k JOIN photos_keywords pk ON k.id = pk.keyword_id WHERE (?))", gorm.Expr(where)) @@ -63,7 +69,7 @@ func Geo(f form.GeoSearch) (results GeoResults, err error) { Db().Where("category_id = ?", l.ID).Find(&categories) - log.Infof("search: label %s includes %d categories", txt.Quote(l.LabelName), len(categories)) + log.Debugf("search: label %s includes %d categories", txt.Quote(l.LabelName), len(categories)) for _, category := range categories { labelIds = append(labelIds, category.LabelID) diff --git a/internal/query/geo_test.go b/internal/query/geo_test.go index 79a67ee24..699b3ed3a 100644 --- a/internal/query/geo_test.go +++ b/internal/query/geo_test.go @@ -158,16 +158,6 @@ func TestGeo(t *testing.T) { } assert.IsType(t, GeoResults{}, result) }) - t.Run("query too short", func(t *testing.T) { - f := form.GeoSearch{ - Query: "a", - } - - result, err := Geo(f) - - assert.Error(t, err) - assert.IsType(t, GeoResults{}, result) - }) t.Run("query for label flower", func(t *testing.T) { f := form.GeoSearch{ Query: "flower", diff --git a/internal/query/like.go b/internal/query/like.go index 0e9ade865..55873a57f 100644 --- a/internal/query/like.go +++ b/internal/query/like.go @@ -11,7 +11,7 @@ import ( ) // LikeAny returns a single where condition matching the search words. -func LikeAny(col, s string, keywords bool) (wheres []string) { +func LikeAny(col, s string, keywords, exact bool) (wheres []string) { if s == "" { return wheres } @@ -22,7 +22,9 @@ func LikeAny(col, s string, keywords bool) (wheres []string) { var wildcardThreshold int - if keywords { + if exact { + wildcardThreshold = -1 + } else if keywords { wildcardThreshold = 4 } else { wildcardThreshold = 2 @@ -43,7 +45,7 @@ func LikeAny(col, s string, keywords bool) (wheres []string) { } for _, w := range words { - if len(w) >= wildcardThreshold { + if wildcardThreshold > 0 && len(w) >= wildcardThreshold { orWheres = append(orWheres, fmt.Sprintf("%s LIKE '%s%%'", col, w)) } else { orWheres = append(orWheres, fmt.Sprintf("%s LIKE '%s'", col, w)) @@ -70,16 +72,16 @@ func LikeAny(col, s string, keywords bool) (wheres []string) { // LikeAnyKeyword returns a single where condition matching the search keywords. func LikeAnyKeyword(col, s string) (wheres []string) { - return LikeAny(col, s, true) + return LikeAny(col, s, true, false) } // LikeAnyWord returns a single where condition matching the search word. func LikeAnyWord(col, s string) (wheres []string) { - return LikeAny(col, s, false) + return LikeAny(col, s, false, false) } // LikeAll returns a list of where conditions matching all search words. -func LikeAll(col, s string, keywords bool) (wheres []string) { +func LikeAll(col, s string, keywords, exact bool) (wheres []string) { if s == "" { return wheres } @@ -97,10 +99,12 @@ func LikeAll(col, s string, keywords bool) (wheres []string) { if len(words) == 0 { return wheres + } else if exact { + wildcardThreshold = -1 } for _, w := range words { - if len(w) >= wildcardThreshold { + if wildcardThreshold > 0 && len(w) >= wildcardThreshold { wheres = append(wheres, fmt.Sprintf("%s LIKE '%s%%'", col, w)) } else { wheres = append(wheres, fmt.Sprintf("%s LIKE '%s'", col, w)) @@ -112,12 +116,35 @@ func LikeAll(col, s string, keywords bool) (wheres []string) { // LikeAllKeywords returns a list of where conditions matching all search keywords. func LikeAllKeywords(col, s string) (wheres []string) { - return LikeAll(col, s, true) + return LikeAll(col, s, true, false) } // LikeAllWords returns a list of where conditions matching all search words. func LikeAllWords(col, s string) (wheres []string) { - return LikeAll(col, s, false) + return LikeAll(col, s, false, false) +} + +// LikeAllNames returns a list of where conditions matching all names. +func LikeAllNames(col, s string) (wheres []string) { + if s == "" { + return wheres + } + + words := txt.UniqueWords(txt.Words(s)) + + if len(words) == 0 { + return wheres + } + + for _, w := range words { + wheres = append(wheres, fmt.Sprintf("%s LIKE '%s'", col, w)) + + if len(w) >= 2 { + wheres = append(wheres, fmt.Sprintf("%s LIKE '%s %%'", col, w)) + } + } + + return wheres } // AnySlug returns a where condition that matches any slug in search. diff --git a/internal/query/like_test.go b/internal/query/like_test.go index d9041a5b2..9b4aa1f5a 100644 --- a/internal/query/like_test.go +++ b/internal/query/like_test.go @@ -8,7 +8,7 @@ import ( func TestLikeAny(t *testing.T) { t.Run("and_or_search", func(t *testing.T) { - if w := LikeAny("k.keyword", "table spoon & usa | img json", true); len(w) != 2 { + if w := LikeAny("k.keyword", "table spoon & usa | img json", true, false); len(w) != 2 { t.Fatal("two where conditions expected") } else { assert.Equal(t, "k.keyword LIKE 'spoon%' OR k.keyword LIKE 'table%'", w[0]) @@ -16,7 +16,7 @@ func TestLikeAny(t *testing.T) { } }) t.Run("and_or_search_en", func(t *testing.T) { - if w := LikeAny("k.keyword", "table spoon and usa or img json", true); len(w) != 2 { + if w := LikeAny("k.keyword", "table spoon and usa or img json", true, false); len(w) != 2 { t.Fatal("two where conditions expected") } else { assert.Equal(t, "k.keyword LIKE 'spoon%' OR k.keyword LIKE 'table%'", w[0]) @@ -24,7 +24,7 @@ func TestLikeAny(t *testing.T) { } }) t.Run("table spoon usa img json", func(t *testing.T) { - if w := LikeAny("k.keyword", "table spoon usa img json", true); len(w) != 1 { + if w := LikeAny("k.keyword", "table spoon usa img json", true, false); len(w) != 1 { t.Fatal("one where condition expected") } else { assert.Equal(t, "k.keyword LIKE 'json%' OR k.keyword LIKE 'spoon%' OR k.keyword LIKE 'table%' OR k.keyword LIKE 'usa'", w[0]) @@ -32,7 +32,7 @@ func TestLikeAny(t *testing.T) { }) t.Run("cat dog", func(t *testing.T) { - if w := LikeAny("k.keyword", "cat dog", true); len(w) != 1 { + if w := LikeAny("k.keyword", "cat dog", true, false); len(w) != 1 { t.Fatal("one where condition expected") } else { assert.Equal(t, "k.keyword LIKE 'cat' OR k.keyword LIKE 'dog'", w[0]) @@ -40,7 +40,7 @@ func TestLikeAny(t *testing.T) { }) t.Run("cats dogs", func(t *testing.T) { - if w := LikeAny("k.keyword", "cats dogs", true); len(w) != 1 { + if w := LikeAny("k.keyword", "cats dogs", true, false); len(w) != 1 { t.Fatal("one where condition expected") } else { assert.Equal(t, "k.keyword LIKE 'cats%' OR k.keyword LIKE 'cat' OR k.keyword LIKE 'dogs%' OR k.keyword LIKE 'dog'", w[0]) @@ -48,7 +48,7 @@ func TestLikeAny(t *testing.T) { }) t.Run("spoon", func(t *testing.T) { - if w := LikeAny("k.keyword", "spoon", true); len(w) != 1 { + if w := LikeAny("k.keyword", "spoon", true, false); len(w) != 1 { t.Fatal("one where condition expected") } else { assert.Equal(t, "k.keyword LIKE 'spoon%'", w[0]) @@ -56,13 +56,13 @@ func TestLikeAny(t *testing.T) { }) t.Run("img", func(t *testing.T) { - if w := LikeAny("k.keyword", "img", true); len(w) > 0 { + if w := LikeAny("k.keyword", "img", true, false); len(w) > 0 { t.Fatal("no where condition expected") } }) t.Run("empty", func(t *testing.T) { - if w := LikeAny("k.keyword", "", true); len(w) > 0 { + if w := LikeAny("k.keyword", "", true, false); len(w) > 0 { t.Fatal("no where condition expected") } }) @@ -108,7 +108,7 @@ func TestLikeAnyWord(t *testing.T) { func TestLikeAll(t *testing.T) { t.Run("keywords", func(t *testing.T) { - if w := LikeAll("k.keyword", "Jo Mander 李", true); len(w) == 2 { + if w := LikeAll("k.keyword", "Jo Mander 李", true, false); len(w) == 2 { assert.Equal(t, "k.keyword LIKE 'mander%'", w[0]) assert.Equal(t, "k.keyword LIKE '李'", w[1]) } else { @@ -116,6 +116,15 @@ func TestLikeAll(t *testing.T) { t.Fatal("two where conditions expected") } }) + t.Run("exact", func(t *testing.T) { + if w := LikeAll("k.keyword", "Jo Mander 李", true, true); len(w) == 2 { + assert.Equal(t, "k.keyword LIKE 'mander'", w[0]) + assert.Equal(t, "k.keyword LIKE '李'", w[1]) + } else { + t.Logf("wheres: %#v", w) + t.Fatal("two where conditions expected") + } + }) } func TestLikeAllKeywords(t *testing.T) { @@ -142,6 +151,21 @@ func TestLikeAllWords(t *testing.T) { } }) } + +func TestLikeAllNames(t *testing.T) { + t.Run("keywords", func(t *testing.T) { + if w := LikeAllNames("k.name", "j Mander 王"); len(w) == 4 { + assert.Equal(t, "k.name LIKE 'mander'", w[0]) + assert.Equal(t, "k.name LIKE 'mander %'", w[1]) + assert.Equal(t, "k.name LIKE '王'", w[2]) + assert.Equal(t, "k.name LIKE '王 %'", w[3]) + } else { + t.Logf("wheres: %#v", w) + t.Fatal("4 where conditions expected") + } + }) +} + func TestAnySlug(t *testing.T) { t.Run("table spoon usa img json", func(t *testing.T) { where := AnySlug("custom_slug", "table spoon usa img json", " ") diff --git a/internal/query/photo_search.go b/internal/query/photo_search.go index eb840a12f..abfca8549 100644 --- a/internal/query/photo_search.go +++ b/internal/query/photo_search.go @@ -132,7 +132,19 @@ func PhotoSearch(f form.PhotoSearch) (results PhotoResults, count int, err error } } - // Filter by location. + // Clip query to reasonable size if needed. + f.Query = txt.Clip(f.Query, txt.ClipQuery) + + // Modify query if it contains subject names. + if f.Query != "" && f.Subject == "" { + if subj, remaining := SubjectUIDs(f.Query); len(subj) > 0 { + log.Debugf("search: subjects %#v", subj) + f.Subject = strings.Join(subj, Or) + f.Query = remaining + } + } + + // Filter by location? if f.Geo == true { s = s.Where("photos.cell_id <> 'zz'") @@ -141,7 +153,7 @@ func PhotoSearch(f form.PhotoSearch) (results PhotoResults, count int, err error } } else if f.Query != "" { if err := Db().Where(AnySlug("custom_slug", f.Query, " ")).Find(&labels).Error; len(labels) == 0 || err != nil { - log.Infof("search: label %s not found, using fuzzy search", txt.Quote(f.Query)) + log.Debugf("search: label %s not found, using fuzzy search", txt.Quote(f.Query)) for _, where := range LikeAnyKeyword("k.keyword", f.Query) { s = s.Where("photos.id IN (SELECT pk.photo_id FROM keywords k JOIN photos_keywords pk ON k.id = pk.keyword_id WHERE (?))", gorm.Expr(where)) @@ -152,7 +164,7 @@ func PhotoSearch(f form.PhotoSearch) (results PhotoResults, count int, err error Db().Where("category_id = ?", l.ID).Find(&categories) - log.Infof("search: label %s includes %d categories", txt.Quote(l.LabelName), len(categories)) + log.Debugf("search: label %s includes %d categories", txt.Quote(l.LabelName), len(categories)) for _, category := range categories { labelIds = append(labelIds, category.LabelID) diff --git a/internal/query/subjects.go b/internal/query/subjects.go index 9ad12dbe6..1b7b3326f 100644 --- a/internal/query/subjects.go +++ b/internal/query/subjects.go @@ -2,7 +2,9 @@ package query import ( "fmt" + "strings" + "github.com/jinzhu/gorm" "github.com/photoprism/photoprism/pkg/txt" "github.com/photoprism/photoprism/internal/entity" @@ -93,3 +95,44 @@ func CreateMarkerSubjects() (affected int64, err error) { return affected, err } + +// SubjectUIDs finds subject UIDs matching the search string. +func SubjectUIDs(s string) (result []string, remaining string) { + if s == "" { + return result, s + } + + type Matches struct { + SubjectUID string + SubjectName string + } + + var matches []Matches + + stmt := Db().Model(entity.Subject{}) + stmt = stmt.Where("subject_src <> ?", entity.SrcDefault) + + if where := LikeAllNames("subject_name", s); len(where) == 0 { + return result, s + } else { + stmt = stmt.Where("?", gorm.Expr(strings.Join(where, " OR "))) + } + + if err := stmt.Scan(&matches).Error; err != nil { + log.Errorf("search: %s while finding subjects", err) + } else if len(matches) == 0 { + return result, s + } + + for _, m := range matches { + result = append(result, m.SubjectUID) + + for _, n := range strings.Split(strings.ToLower(m.SubjectName), " ") { + s = strings.ReplaceAll(s, n, "") + } + } + + s = strings.Trim(s, "&| ") + + return result, s +} diff --git a/internal/query/subjects_test.go b/internal/query/subjects_test.go index 5e2372e04..845f77abb 100644 --- a/internal/query/subjects_test.go +++ b/internal/query/subjects_test.go @@ -52,3 +52,14 @@ func TestCreateMarkerSubjects(t *testing.T) { assert.NoError(t, err) assert.GreaterOrEqual(t, affected, int64(2)) } + +func TestSubjectUIDs(t *testing.T) { + result, remaining := SubjectUIDs("john & his | cats") + + if len(result) != 1 { + t.Fatal("expected one result") + } else { + assert.Equal(t, "jqu0xs11qekk9jx8", result[0]) + assert.Equal(t, "his | cats", remaining) + } +} diff --git a/pkg/txt/clip.go b/pkg/txt/clip.go index b7089002e..79a696400 100644 --- a/pkg/txt/clip.go +++ b/pkg/txt/clip.go @@ -7,6 +7,7 @@ const ( ClipSlug = 80 ClipKeyword = 40 ClipVarchar = 255 + ClipQuery = 1000 ClipDescription = 16000 ) diff --git a/pkg/txt/resources/stopwords.txt b/pkg/txt/resources/stopwords.txt index 500a34365..b2517659d 100644 --- a/pkg/txt/resources/stopwords.txt +++ b/pkg/txt/resources/stopwords.txt @@ -13,7 +13,6 @@ handy tumblr bilder bild -film films filme foto diff --git a/pkg/txt/stopwords.go b/pkg/txt/stopwords.go index 10670eecb..db59fa9c0 100644 --- a/pkg/txt/stopwords.go +++ b/pkg/txt/stopwords.go @@ -18,7 +18,6 @@ var StopWords = map[string]bool{ "tumblr": true, "bilder": true, "bild": true, - "film": true, "films": true, "filme": true, "foto": true,