From 91acaaa57372e2c8d312246d6be8dde3e7cb4276 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Sun, 27 Dec 2020 16:37:28 +0100 Subject: [PATCH] Index shorter keywords in languages like Chinese #746 Signed-off-by: Michael Mayer --- internal/entity/photo_test.go | 5 ++ internal/query/photo_search.go | 4 -- internal/query/photo_search_test.go | 16 ++---- internal/query/query.go | 8 +++ pkg/txt/contains.go | 54 ++++++++++++++++++ pkg/txt/contains_test.go | 88 +++++++++++++++++++++++++++++ pkg/txt/file_title.go | 8 +-- pkg/txt/file_title_test.go | 6 ++ pkg/txt/is.go | 43 ++++++++++++++ pkg/txt/is_test.go | 87 ++++++++++++++++++++++++++++ pkg/txt/strings.go | 19 ------- pkg/txt/strings_test.go | 9 --- pkg/txt/words.go | 32 ++++++++--- pkg/txt/words_test.go | 8 +++ 14 files changed, 331 insertions(+), 56 deletions(-) create mode 100644 pkg/txt/contains.go create mode 100644 pkg/txt/contains_test.go create mode 100644 pkg/txt/is.go create mode 100644 pkg/txt/is_test.go diff --git a/internal/entity/photo_test.go b/internal/entity/photo_test.go index 48e6042b5..88379ec35 100644 --- a/internal/entity/photo_test.go +++ b/internal/entity/photo_test.go @@ -324,6 +324,11 @@ func TestPhoto_GetDetails(t *testing.T) { } func TestPhoto_FileTitle(t *testing.T) { + t.Run("non-latin", func(t *testing.T) { + photo := Photo{PhotoName: "桥", PhotoPath: "", OriginalName: ""} + result := photo.FileTitle() + assert.Equal(t, "桥", result) + }) t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) { photo := Photo{PhotoName: "20200102_194030_9EFA9E5E", PhotoPath: "2000/05", OriginalName: "flickr import/changing-of-the-guard--buckingham-palace_7925318070_o.jpg"} result := photo.FileTitle() diff --git a/internal/query/photo_search.go b/internal/query/photo_search.go index 77d52815f..9acb62611 100644 --- a/internal/query/photo_search.go +++ b/internal/query/photo_search.go @@ -109,10 +109,6 @@ func PhotoSearch(f form.PhotoSearch) (results PhotoResults, count int, 
err error s = s.Where("photos.id IN (SELECT pk.photo_id FROM keywords k JOIN photos_keywords pk ON k.id = pk.keyword_id WHERE (?))", gorm.Expr(likeAny)) } } else if f.Query != "" { - if len(f.Query) < 2 { - return results, 0, fmt.Errorf("query too short") - } - if err := Db().Where(AnySlug("custom_slug", f.Query, " ")).Find(&labels).Error; len(labels) == 0 || err != nil { log.Infof("search: label %s not found, using fuzzy search", txt.Quote(f.Query)) diff --git a/internal/query/photo_search_test.go b/internal/query/photo_search_test.go index 693e6c6c0..30b99ce19 100644 --- a/internal/query/photo_search_test.go +++ b/internal/query/photo_search_test.go @@ -137,18 +137,6 @@ func TestPhotoSearch(t *testing.T) { assert.LessOrEqual(t, 1, len(photos)) }) - t.Run("query too short", func(t *testing.T) { - var f form.PhotoSearch - f.Query = "a" - f.Count = 5000 - f.Offset = 0 - f.Geo = false - - photos, _, err := PhotoSearch(f) - - assert.Equal(t, "query too short", err.Error()) - assert.Empty(t, photos) - }) t.Run("search for keyword", func(t *testing.T) { var f form.PhotoSearch f.Query = "bridge" @@ -156,9 +144,11 @@ func TestPhotoSearch(t *testing.T) { f.Offset = 0 photos, _, err := PhotoSearch(f) + if err != nil { t.Fatal(err) } + assert.LessOrEqual(t, 2, len(photos)) }) t.Run("search for label in query", func(t *testing.T) { @@ -168,9 +158,11 @@ func TestPhotoSearch(t *testing.T) { f.Offset = 0 photos, _, err := PhotoSearch(f) + if err != nil { t.Fatal(err) } + assert.LessOrEqual(t, 1, len(photos)) }) t.Run("search for archived", func(t *testing.T) { diff --git a/internal/query/query.go b/internal/query/query.go index baf8c1d3a..40103f3d7 100644 --- a/internal/query/query.go +++ b/internal/query/query.go @@ -109,6 +109,10 @@ func LikeAny(col, search string) (where string) { wheres = append(wheres, fmt.Sprintf("%s = '%s'", col, w)) } + if !txt.ContainsASCIILetters(w) { + continue + } + singular := inflection.Singular(w) if singular != w { @@ -137,6 +141,10 @@ func 
AnySlug(col, search, sep string) (where string) { words = append(words, slug.Make(w)) + if !txt.ContainsASCIILetters(w) { + continue + } + singular := inflection.Singular(w) if singular != w { diff --git a/pkg/txt/contains.go b/pkg/txt/contains.go new file mode 100644 index 000000000..19bdf69b5 --- /dev/null +++ b/pkg/txt/contains.go @@ -0,0 +1,54 @@ +package txt + +import ( + "regexp" + "unicode" +) + +var ContainsNumberRegexp = regexp.MustCompile("\\d+") + +// ContainsNumber returns true if string contains a number. +func ContainsNumber(s string) bool { + return ContainsNumberRegexp.MatchString(s) +} + +// ContainsLetters reports whether the string only contains letters. +func ContainsLetters(s string) bool { + if s == "" { + return false + } + + for _, r := range s { + if !unicode.IsLetter(r) { + return false + } + } + + return true +} + +// ContainsASCIILetters reports if the string only contains ascii chars without whitespace, numbers, and punctuation marks. +func ContainsASCIILetters(s string) bool { + for _, r := range s { + if (r < 65 || r > 90) && (r < 97 || r > 122) { + return false + } + } + + return true +} + +// ContainsSymbols reports whether the string only contains symbolic characters. 
+func ContainsSymbols(s string) bool { + if s == "" { + return false + } + + for _, r := range s { + if !unicode.IsSymbol(r) { + return false + } + } + + return true +} diff --git a/pkg/txt/contains_test.go b/pkg/txt/contains_test.go new file mode 100644 index 000000000..4ac3d18bd --- /dev/null +++ b/pkg/txt/contains_test.go @@ -0,0 +1,88 @@ +package txt + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestContainsNumber(t *testing.T) { + t.Run("True", func(t *testing.T) { + assert.Equal(t, true, ContainsNumber("f3abcde")) + }) + t.Run("False", func(t *testing.T) { + assert.Equal(t, false, ContainsNumber("abcd")) + }) +} + +func TestContainsSymbols(t *testing.T) { + t.Run("123", func(t *testing.T) { + assert.False(t, ContainsSymbols("123")) + }) + t.Run("The quick brown fox.", func(t *testing.T) { + assert.False(t, ContainsSymbols("The quick brown fox.")) + }) + t.Run("bridge", func(t *testing.T) { + assert.False(t, ContainsSymbols("bridge")) + }) + t.Run("桥", func(t *testing.T) { + assert.False(t, ContainsSymbols("桥")) + }) + t.Run("桥船", func(t *testing.T) { + assert.False(t, ContainsSymbols("桥船")) + }) + t.Run("स्थान", func(t *testing.T) { + assert.False(t, ContainsSymbols("स्थान")) + }) + t.Run("réseau", func(t *testing.T) { + assert.False(t, ContainsSymbols("réseau")) + }) +} + +func TestContainsLetters(t *testing.T) { + t.Run("123", func(t *testing.T) { + assert.False(t, ContainsLetters("123")) + }) + t.Run("The quick brown fox.", func(t *testing.T) { + assert.False(t, ContainsLetters("The quick brown fox.")) + }) + t.Run("bridge", func(t *testing.T) { + assert.True(t, ContainsLetters("bridge")) + }) + t.Run("桥", func(t *testing.T) { + assert.True(t, ContainsLetters("桥")) + }) + t.Run("桥船", func(t *testing.T) { + assert.True(t, ContainsLetters("桥船")) + }) + t.Run("स्थान", func(t *testing.T) { + assert.False(t, ContainsLetters("स्थान")) + }) + t.Run("réseau", func(t *testing.T) { + assert.True(t, ContainsLetters("réseau")) + }) +} + 
+func TestContainsASCIILetters(t *testing.T) { + t.Run("123", func(t *testing.T) { + assert.False(t, ContainsASCIILetters("123")) + }) + t.Run("The quick brown fox.", func(t *testing.T) { + assert.False(t, ContainsASCIILetters("The quick brown fox.")) + }) + t.Run("bridge", func(t *testing.T) { + assert.True(t, ContainsASCIILetters("bridge")) + }) + t.Run("桥", func(t *testing.T) { + assert.False(t, ContainsASCIILetters("桥")) + }) + t.Run("桥船", func(t *testing.T) { + assert.False(t, ContainsASCIILetters("桥船")) + }) + t.Run("स्थान", func(t *testing.T) { + assert.False(t, ContainsASCIILetters("स्थान")) + }) + t.Run("réseau", func(t *testing.T) { + assert.False(t, ContainsASCIILetters("réseau")) + }) +} diff --git a/pkg/txt/file_title.go b/pkg/txt/file_title.go index d578e3ee3..5b5a386ee 100644 --- a/pkg/txt/file_title.go +++ b/pkg/txt/file_title.go @@ -7,13 +7,13 @@ import ( "github.com/photoprism/photoprism/pkg/fs" ) -var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':&+]{2,}|( [&+] )?") +var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':&+!?]{1,}|( [&+] )?") // FileTitle returns the string with the first characters of each word converted to uppercase. 
func FileTitle(s string) string { s = fs.BasePrefix(s, true) - if len(s) < 3 { + if len(s) < 3 && IsASCII(s) { return "" } @@ -25,7 +25,7 @@ func FileTitle(s string) string { for _, w := range words { w = strings.ToLower(w) - if len(w) < 3 && found == 0 { + if IsASCII(w) && (len(w) < 3 && found == 0 || len(w) == 1) { continue } @@ -56,7 +56,7 @@ func FileTitle(s string) string { title = strings.ReplaceAll(title, "-", " ") title = strings.ReplaceAll(title, " ", " ") - if len(title) <= 4 { + if len(title) <= 4 && IsASCII(title) { return "" } diff --git a/pkg/txt/file_title_test.go b/pkg/txt/file_title_test.go index 8b1c822e3..66ca9acf2 100644 --- a/pkg/txt/file_title_test.go +++ b/pkg/txt/file_title_test.go @@ -7,6 +7,12 @@ import ( ) func TestFileTitle(t *testing.T) { + t.Run("桥", func(t *testing.T) { + assert.Equal(t, "桥", FileTitle("桥")) + }) + t.Run("i_love_you!", func(t *testing.T) { + assert.Equal(t, "Love You!", FileTitle("i_love_you!")) + }) t.Run("photoprism", func(t *testing.T) { assert.Equal(t, "PhotoPrism: Browse Your Life in Pictures", FileTitle("photoprism: Browse your life in pictures")) }) diff --git a/pkg/txt/is.go b/pkg/txt/is.go new file mode 100644 index 000000000..5df3752bc --- /dev/null +++ b/pkg/txt/is.go @@ -0,0 +1,43 @@ +package txt + +import "unicode" + +// Is reports whether all runes in the string are in the specified range. +func Is(rangeTab *unicode.RangeTable, s string) bool { + if s == "" { + return false + } + + for _, r := range s { + if !unicode.Is(rangeTab, r) { + return false + } + } + + return true +} + +// IsASCII tests if the string only contains ascii runes. +func IsASCII(s string) bool { + for i := 0; i < len(s); i++ { + if s[i] > unicode.MaxASCII { + return false + } + } + return true +} + +// IsLatin reports whether the string only contains latin letters. 
+func IsLatin(s string) bool { + if s == "" { + return false + } + + for _, r := range s { + if !unicode.Is(unicode.Latin, r) { + return false + } + } + + return true +} diff --git a/pkg/txt/is_test.go b/pkg/txt/is_test.go new file mode 100644 index 000000000..3578b345a --- /dev/null +++ b/pkg/txt/is_test.go @@ -0,0 +1,87 @@ +package txt + +import ( + "testing" + "unicode" + + "github.com/stretchr/testify/assert" +) + +func TestIs(t *testing.T) { + t.Run("The quick brown fox.", func(t *testing.T) { + assert.False(t, Is(unicode.Latin, "The quick brown fox.")) + assert.False(t, Is(unicode.L, "The quick brown fox.")) + assert.False(t, Is(unicode.Letter, "The quick brown fox.")) + }) + t.Run("bridge", func(t *testing.T) { + assert.True(t, Is(unicode.Latin, "bridge")) + assert.True(t, Is(unicode.L, "bridge")) + assert.True(t, Is(unicode.Letter, "bridge")) + }) + t.Run("桥", func(t *testing.T) { + assert.False(t, Is(unicode.Latin, "桥")) + assert.True(t, Is(unicode.L, "桥")) + assert.True(t, Is(unicode.Letter, "桥")) + }) + t.Run("桥船", func(t *testing.T) { + assert.False(t, Is(unicode.Latin, "桥船")) + assert.True(t, Is(unicode.L, "桥船")) + assert.True(t, Is(unicode.Letter, "桥船")) + }) + t.Run("स्थान", func(t *testing.T) { + assert.False(t, Is(unicode.Latin, "स्थान")) + assert.False(t, Is(unicode.L, "स्थान")) + assert.False(t, Is(unicode.Letter, "स्थान")) + assert.False(t, Is(unicode.Tamil, "स्थान")) + }) + t.Run("réseau", func(t *testing.T) { + assert.True(t, Is(unicode.Latin, "réseau")) + assert.True(t, Is(unicode.L, "réseau")) + assert.True(t, Is(unicode.Letter, "réseau")) + }) +} + +func TestIsASCII(t *testing.T) { + t.Run("123", func(t *testing.T) { + assert.True(t, IsASCII("123")) + }) + t.Run("The quick brown fox.", func(t *testing.T) { + assert.True(t, IsASCII("The quick brown fox.")) + }) + t.Run("bridge", func(t *testing.T) { + assert.True(t, IsASCII("bridge")) + }) + t.Run("桥", func(t *testing.T) { + assert.False(t, IsASCII("桥")) + }) + t.Run("桥船", func(t *testing.T) 
{ + assert.False(t, IsASCII("桥船")) + }) + t.Run("स्थान", func(t *testing.T) { + assert.False(t, IsASCII("स्थान")) + }) + t.Run("réseau", func(t *testing.T) { + assert.False(t, IsASCII("réseau")) + }) +} + +func TestIsLatin(t *testing.T) { + t.Run("The quick brown fox.", func(t *testing.T) { + assert.False(t, IsLatin("The quick brown fox.")) + }) + t.Run("bridge", func(t *testing.T) { + assert.True(t, IsLatin("bridge")) + }) + t.Run("桥", func(t *testing.T) { + assert.False(t, IsLatin("桥")) + }) + t.Run("桥船", func(t *testing.T) { + assert.False(t, IsLatin("桥船")) + }) + t.Run("स्थान", func(t *testing.T) { + assert.False(t, IsLatin("स्थान")) + }) + t.Run("réseau", func(t *testing.T) { + assert.True(t, IsLatin("réseau")) + }) +} diff --git a/pkg/txt/strings.go b/pkg/txt/strings.go index a927fa43b..97cef3074 100644 --- a/pkg/txt/strings.go +++ b/pkg/txt/strings.go @@ -1,17 +1,9 @@ package txt import ( - "regexp" "strings" ) -var ContainsNumberRegexp = regexp.MustCompile("\\d+") - -// ContainsNumber returns true if string contains a number. -func ContainsNumber(s string) bool { - return ContainsNumberRegexp.MatchString(s) -} - // Bool casts a string to bool. func Bool(s string) bool { s = strings.TrimSpace(s) @@ -22,14 +14,3 @@ func Bool(s string) bool { return true } - -// ASCII returns true if the string only contains ascii chars without whitespace, numbers, and punctuation marks. 
-func ASCII(s string) bool { - for _, r := range s { - if (r < 65 || r > 90) && (r < 97 || r > 122) { - return false - } - } - - return true -} diff --git a/pkg/txt/strings_test.go b/pkg/txt/strings_test.go index a416abd59..10c654f07 100644 --- a/pkg/txt/strings_test.go +++ b/pkg/txt/strings_test.go @@ -6,15 +6,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestContainsNumber(t *testing.T) { - t.Run("True", func(t *testing.T) { - assert.Equal(t, true, ContainsNumber("f3abcde")) - }) - t.Run("False", func(t *testing.T) { - assert.Equal(t, false, ContainsNumber("abcd")) - }) -} - func TestBool(t *testing.T) { t.Run("not empty", func(t *testing.T) { assert.Equal(t, true, Bool("Browse your life in pictures")) diff --git a/pkg/txt/words.go b/pkg/txt/words.go index b26f92495..564b84701 100644 --- a/pkg/txt/words.go +++ b/pkg/txt/words.go @@ -6,11 +6,11 @@ import ( "strings" ) -var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{3,}") +var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{1,}") // UnknownWord returns true if the string does not seem to be a real word. func UnknownWord(s string) bool { - if len(s) > 3 || !ASCII(s) { + if len(s) > 3 || !ContainsASCIILetters(s) { return false } @@ -29,7 +29,15 @@ func UnknownWord(s string) bool { // Words returns a slice of words with at least 3 characters from a string, dashes count as character ("ile-de-france"). func Words(s string) (results []string) { - return KeywordsRegexp.FindAllString(s, -1) + for _, s := range KeywordsRegexp.FindAllString(s, -1) { + if len(s) < 3 && IsLatin(s) { + continue + } + + results = append(results, s) + } + + return results } // ReplaceSpaces replaces all spaces with another string. 
@@ -37,11 +45,19 @@ func ReplaceSpaces(s string, char string) string { return strings.Replace(s, " ", char, -1) } -var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{3,}") +var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{1,}") // FilenameWords returns a slice of words with at least 3 characters from a string ("ile", "france"). func FilenameWords(s string) (results []string) { - return FilenameKeywordsRegexp.FindAllString(s, -1) + for _, s := range FilenameKeywordsRegexp.FindAllString(s, -1) { + if len(s) < 3 && IsLatin(s) { + continue + } + + results = append(results, s) + } + + return results } // FilenameKeywords returns a slice of keywords without stopwords. @@ -87,7 +103,7 @@ func UniqueWords(words []string) (results []string) { for _, w := range words { w = strings.ToLower(w) - if len(w) < 3 || w == last { + if len(w) < 3 && IsLatin(w) || w == last { continue } @@ -109,7 +125,7 @@ func RemoveFromWords(words []string, remove string) (results []string) { for _, w := range words { w = strings.ToLower(w) - if len(w) < 3 || w == last || strings.Contains(remove, w) { + if len(w) < 3 && IsLatin(w) || w == last || strings.Contains(remove, w) { continue } @@ -132,7 +148,7 @@ func UniqueKeywords(s string) (results []string) { for _, w := range words { w = strings.ToLower(w) - if len(w) < 3 || w == last { + if len(w) < 3 && IsLatin(w) || w == last { continue } diff --git a/pkg/txt/words_test.go b/pkg/txt/words_test.go index 64d4b6e64..f516c86fd 100644 --- a/pkg/txt/words_test.go +++ b/pkg/txt/words_test.go @@ -7,6 +7,10 @@ import ( ) func TestWords(t *testing.T) { + t.Run("桥", func(t *testing.T) { + result := Words("桥") + assert.Equal(t, []string{"桥"}, result) + }) t.Run("I'm a lazy-brown fox!", func(t *testing.T) { result := Words("I'm a lazy-BRoWN fox!") assert.Equal(t, []string{"lazy-BRoWN", "fox"}, result) @@ -60,6 +64,10 @@ func TestFilenameWords(t *testing.T) { } func TestFilenameKeywords(t *testing.T) { + t.Run("桥.jpg", func(t *testing.T) { + 
result := FilenameKeywords("桥.jpg") + assert.Equal(t, []string{"桥"}, result) + }) t.Run("I'm a lazy-brown var fox.jpg!", func(t *testing.T) { result := FilenameKeywords("I'm a lazy-brown var fox.jpg!") assert.Equal(t, []string{"lazy", "brown", "fox"}, result)