Index shorter keywords in languages like Chinese #746

Signed-off-by: Michael Mayer <michael@liquidbytes.net>
This commit is contained in:
Michael Mayer 2020-12-27 16:37:28 +01:00
parent 00a768173f
commit 91acaaa573
14 changed files with 331 additions and 56 deletions

View File

@ -324,6 +324,11 @@ func TestPhoto_GetDetails(t *testing.T) {
}
func TestPhoto_FileTitle(t *testing.T) {
t.Run("non-latin", func(t *testing.T) {
photo := Photo{PhotoName: "桥", PhotoPath: "", OriginalName: ""}
result := photo.FileTitle()
assert.Equal(t, "桥", result)
})
t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) {
photo := Photo{PhotoName: "20200102_194030_9EFA9E5E", PhotoPath: "2000/05", OriginalName: "flickr import/changing-of-the-guard--buckingham-palace_7925318070_o.jpg"}
result := photo.FileTitle()

View File

@ -109,10 +109,6 @@ func PhotoSearch(f form.PhotoSearch) (results PhotoResults, count int, err error
s = s.Where("photos.id IN (SELECT pk.photo_id FROM keywords k JOIN photos_keywords pk ON k.id = pk.keyword_id WHERE (?))", gorm.Expr(likeAny))
}
} else if f.Query != "" {
if len(f.Query) < 2 {
return results, 0, fmt.Errorf("query too short")
}
if err := Db().Where(AnySlug("custom_slug", f.Query, " ")).Find(&labels).Error; len(labels) == 0 || err != nil {
log.Infof("search: label %s not found, using fuzzy search", txt.Quote(f.Query))

View File

@ -137,18 +137,6 @@ func TestPhotoSearch(t *testing.T) {
assert.LessOrEqual(t, 1, len(photos))
})
t.Run("query too short", func(t *testing.T) {
var f form.PhotoSearch
f.Query = "a"
f.Count = 5000
f.Offset = 0
f.Geo = false
photos, _, err := PhotoSearch(f)
assert.Equal(t, "query too short", err.Error())
assert.Empty(t, photos)
})
t.Run("search for keyword", func(t *testing.T) {
var f form.PhotoSearch
f.Query = "bridge"
@ -156,9 +144,11 @@ func TestPhotoSearch(t *testing.T) {
f.Offset = 0
photos, _, err := PhotoSearch(f)
if err != nil {
t.Fatal(err)
}
assert.LessOrEqual(t, 2, len(photos))
})
t.Run("search for label in query", func(t *testing.T) {
@ -168,9 +158,11 @@ func TestPhotoSearch(t *testing.T) {
f.Offset = 0
photos, _, err := PhotoSearch(f)
if err != nil {
t.Fatal(err)
}
assert.LessOrEqual(t, 1, len(photos))
})
t.Run("search for archived", func(t *testing.T) {

View File

@ -109,6 +109,10 @@ func LikeAny(col, search string) (where string) {
wheres = append(wheres, fmt.Sprintf("%s = '%s'", col, w))
}
if !txt.ContainsASCIILetters(w) {
continue
}
singular := inflection.Singular(w)
if singular != w {
@ -137,6 +141,10 @@ func AnySlug(col, search, sep string) (where string) {
words = append(words, slug.Make(w))
if !txt.ContainsASCIILetters(w) {
continue
}
singular := inflection.Singular(w)
if singular != w {

54
pkg/txt/contains.go Normal file
View File

@ -0,0 +1,54 @@
package txt
import (
"regexp"
"unicode"
)
// ContainsNumberRegexp matches a run of one or more digits.
var ContainsNumberRegexp = regexp.MustCompile("\\d+")

// ContainsNumber reports whether s includes at least one digit at any position.
func ContainsNumber(s string) bool {
	// A nil location means the pattern did not match anywhere in s.
	return ContainsNumberRegexp.FindStringIndex(s) != nil
}
// ContainsLetters reports whether s is non-empty and consists solely of
// Unicode letters. Despite the name, a single rune that is not a letter
// (digit, whitespace, punctuation, combining mark) yields false.
func ContainsLetters(s string) bool {
	for _, char := range s {
		if unicode.IsLetter(char) {
			continue
		}

		return false
	}

	// The loop body never runs for an empty string, which is rejected here.
	return s != ""
}
// ContainsASCIILetters reports whether every rune in s is an ASCII letter
// (A-Z or a-z); whitespace, digits, punctuation, and non-ASCII runes yield
// false. An empty string has no offending rune and therefore yields true.
func ContainsASCIILetters(s string) bool {
	for _, char := range s {
		switch {
		case 'A' <= char && char <= 'Z':
			// Uppercase ASCII letter, keep scanning.
		case 'a' <= char && char <= 'z':
			// Lowercase ASCII letter, keep scanning.
		default:
			return false
		}
	}

	return true
}
// ContainsSymbols reports whether s is non-empty and every rune in it is a
// symbolic character as classified by unicode.IsSymbol.
func ContainsSymbols(s string) bool {
	if len(s) == 0 {
		return false
	}

	onlySymbols := true

	for _, char := range s {
		if !unicode.IsSymbol(char) {
			// One non-symbol rune is enough to decide the result.
			onlySymbols = false
			break
		}
	}

	return onlySymbols
}

88
pkg/txt/contains_test.go Normal file
View File

@ -0,0 +1,88 @@
package txt
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestContainsNumber checks ContainsNumber with one input that has a digit
// and one that has none.
func TestContainsNumber(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  bool
	}{
		{name: "True", input: "f3abcde", want: true},
		{name: "False", input: "abcd", want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, ContainsNumber(tc.input))
		})
	}
}
// TestContainsSymbols verifies that digits, sentences, and words in several
// scripts are not reported as consisting of symbols.
func TestContainsSymbols(t *testing.T) {
	// Each input doubles as the subtest name.
	inputs := []string{
		"123",
		"The quick brown fox.",
		"bridge",
		"桥",
		"桥船",
		"स्थान",
		"réseau",
	}

	for _, input := range inputs {
		t.Run(input, func(t *testing.T) {
			assert.False(t, ContainsSymbols(input))
		})
	}
}
// TestContainsLetters verifies which inputs ContainsLetters accepts as
// consisting of letters only.
func TestContainsLetters(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		input string
		want  bool
	}{
		{input: "123", want: false},
		{input: "The quick brown fox.", want: false},
		{input: "bridge", want: true},
		{input: "桥", want: true},
		{input: "桥船", want: true},
		{input: "स्थान", want: false},
		{input: "réseau", want: true},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			assert.Equal(t, tc.want, ContainsLetters(tc.input))
		})
	}
}
// TestContainsASCIILetters verifies that only plain A-Z/a-z words are
// accepted by ContainsASCIILetters.
func TestContainsASCIILetters(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		input string
		want  bool
	}{
		{input: "123", want: false},
		{input: "The quick brown fox.", want: false},
		{input: "bridge", want: true},
		{input: "桥", want: false},
		{input: "桥船", want: false},
		{input: "स्थान", want: false},
		{input: "réseau", want: false},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			assert.Equal(t, tc.want, ContainsASCIILetters(tc.input))
		})
	}
}

View File

@ -7,13 +7,13 @@ import (
"github.com/photoprism/photoprism/pkg/fs"
)
var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':&+]{2,}|( [&+] )?")
var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-,':&+!?]{1,}|( [&+] )?")
// FileTitle returns the string with the first characters of each word converted to uppercase.
func FileTitle(s string) string {
s = fs.BasePrefix(s, true)
if len(s) < 3 {
if len(s) < 3 && IsASCII(s) {
return ""
}
@ -25,7 +25,7 @@ func FileTitle(s string) string {
for _, w := range words {
w = strings.ToLower(w)
if len(w) < 3 && found == 0 {
if IsASCII(w) && (len(w) < 3 && found == 0 || len(w) == 1) {
continue
}
@ -56,7 +56,7 @@ func FileTitle(s string) string {
title = strings.ReplaceAll(title, "-", " ")
title = strings.ReplaceAll(title, " ", " ")
if len(title) <= 4 {
if len(title) <= 4 && IsASCII(title) {
return ""
}

View File

@ -7,6 +7,12 @@ import (
)
func TestFileTitle(t *testing.T) {
t.Run("桥", func(t *testing.T) {
assert.Equal(t, "桥", FileTitle("桥"))
})
t.Run("i_love_you!", func(t *testing.T) {
assert.Equal(t, "Love You!", FileTitle("i_love_you!"))
})
t.Run("photoprism", func(t *testing.T) {
assert.Equal(t, "PhotoPrism: Browse Your Life in Pictures", FileTitle("photoprism: Browse your life in pictures"))
})

43
pkg/txt/is.go Normal file
View File

@ -0,0 +1,43 @@
package txt
import "unicode"
// Is reports whether the all string runes are in the specified range.
func Is(rangeTab *unicode.RangeTable, s string) bool {
if s == "" {
return false
}
for _, r := range s {
if !unicode.Is(rangeTab, r) {
return false
}
}
return true
}
// IsASCII reports whether every byte of s is within the ASCII range.
// An empty string has no byte outside that range and therefore yields true.
func IsASCII(s string) bool {
	for _, b := range []byte(s) {
		// Any byte above 0x7F belongs to a non-ASCII (multi-byte) sequence.
		if b > unicode.MaxASCII {
			return false
		}
	}

	return true
}
// IsLatin reports whether s is non-empty and every rune in it belongs to the
// Latin script, which includes accented letters such as "é".
func IsLatin(s string) bool {
	if len(s) == 0 {
		return false
	}

	for _, char := range s {
		if latin := unicode.Is(unicode.Latin, char); !latin {
			return false
		}
	}

	return true
}

87
pkg/txt/is_test.go Normal file
View File

@ -0,0 +1,87 @@
package txt
import (
"testing"
"unicode"
"github.com/stretchr/testify/assert"
)
// TestIs runs Is against several range tables for inputs in different
// scripts; every input forms one subtest named after the input itself.
func TestIs(t *testing.T) {
	type check struct {
		table *unicode.RangeTable
		want  bool
	}

	cases := []struct {
		input  string
		checks []check
	}{
		{
			input:  "The quick brown fox.",
			checks: []check{{unicode.Latin, false}, {unicode.L, false}, {unicode.Letter, false}},
		},
		{
			input:  "bridge",
			checks: []check{{unicode.Latin, true}, {unicode.L, true}, {unicode.Letter, true}},
		},
		{
			input:  "桥",
			checks: []check{{unicode.Latin, false}, {unicode.L, true}, {unicode.Letter, true}},
		},
		{
			input:  "桥船",
			checks: []check{{unicode.Latin, false}, {unicode.L, true}, {unicode.Letter, true}},
		},
		{
			input:  "स्थान",
			checks: []check{{unicode.Latin, false}, {unicode.L, false}, {unicode.Letter, false}, {unicode.Tamil, false}},
		},
		{
			input:  "réseau",
			checks: []check{{unicode.Latin, true}, {unicode.L, true}, {unicode.Letter, true}},
		},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			for _, c := range tc.checks {
				assert.Equal(t, c.want, Is(c.table, tc.input))
			}
		})
	}
}
// TestIsASCII verifies that IsASCII accepts plain ASCII text and rejects
// inputs that include non-ASCII characters.
func TestIsASCII(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		input string
		want  bool
	}{
		{input: "123", want: true},
		{input: "The quick brown fox.", want: true},
		{input: "bridge", want: true},
		{input: "桥", want: false},
		{input: "桥船", want: false},
		{input: "स्थान", want: false},
		{input: "réseau", want: false},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			assert.Equal(t, tc.want, IsASCII(tc.input))
		})
	}
}
// TestIsLatin verifies that IsLatin accepts words made of Latin letters only
// and rejects sentences and words in other scripts.
func TestIsLatin(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		input string
		want  bool
	}{
		{input: "The quick brown fox.", want: false},
		{input: "bridge", want: true},
		{input: "桥", want: false},
		{input: "桥船", want: false},
		{input: "स्थान", want: false},
		{input: "réseau", want: true},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			assert.Equal(t, tc.want, IsLatin(tc.input))
		})
	}
}

View File

@ -1,17 +1,9 @@
package txt
import (
"regexp"
"strings"
)
// ContainsNumberRegexp matches a run of one or more digits.
var ContainsNumberRegexp = regexp.MustCompile("\\d+")

// ContainsNumber reports whether s has a digit at any position.
func ContainsNumber(s string) bool {
	// FindStringIndex yields nil when the pattern matches nowhere in s.
	match := ContainsNumberRegexp.FindStringIndex(s)

	return match != nil
}
// Bool casts a string to bool.
func Bool(s string) bool {
s = strings.TrimSpace(s)
@ -22,14 +14,3 @@ func Bool(s string) bool {
return true
}
// ASCII reports whether every rune in s is an ASCII letter (A-Z or a-z);
// whitespace, digits, punctuation, and non-ASCII runes yield false.
// An empty string has no offending rune and therefore yields true.
func ASCII(s string) bool {
	for _, char := range s {
		upper := char >= 65 && char <= 90
		lower := char >= 97 && char <= 122

		if !upper && !lower {
			return false
		}
	}

	return true
}

View File

@ -6,15 +6,6 @@ import (
"github.com/stretchr/testify/assert"
)
// TestContainsNumber checks ContainsNumber with one input that has a digit
// and one that has none.
func TestContainsNumber(t *testing.T) {
	cases := []struct {
		name  string
		input string
		want  bool
	}{
		{name: "True", input: "f3abcde", want: true},
		{name: "False", input: "abcd", want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, ContainsNumber(tc.input))
		})
	}
}
func TestBool(t *testing.T) {
t.Run("not empty", func(t *testing.T) {
assert.Equal(t, true, Bool("Browse your life in pictures"))

View File

@ -6,11 +6,11 @@ import (
"strings"
)
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{3,}")
var KeywordsRegexp = regexp.MustCompile("[\\p{L}\\-]{1,}")
// UnknownWord returns true if the string does not seem to be a real word.
func UnknownWord(s string) bool {
if len(s) > 3 || !ASCII(s) {
if len(s) > 3 || !ContainsASCIILetters(s) {
return false
}
@ -29,7 +29,15 @@ func UnknownWord(s string) bool {
// Words returns a slice of words from a string; words made up only of Latin letters must be at least 3 bytes long, dashes count as characters ("ile-de-france").
func Words(s string) (results []string) {
return KeywordsRegexp.FindAllString(s, -1)
for _, s := range KeywordsRegexp.FindAllString(s, -1) {
if len(s) < 3 && IsLatin(s) {
continue
}
results = append(results, s)
}
return results
}
// ReplaceSpaces replaces all spaces with another string.
@ -37,11 +45,19 @@ func ReplaceSpaces(s string, char string) string {
return strings.Replace(s, " ", char, -1)
}
var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{3,}")
var FilenameKeywordsRegexp = regexp.MustCompile("[\\p{L}]{1,}")
// FilenameWords returns a slice of words from a string; words made up only of Latin letters must be at least 3 bytes long ("ile", "france").
func FilenameWords(s string) (results []string) {
return FilenameKeywordsRegexp.FindAllString(s, -1)
for _, s := range FilenameKeywordsRegexp.FindAllString(s, -1) {
if len(s) < 3 && IsLatin(s) {
continue
}
results = append(results, s)
}
return results
}
// FilenameKeywords returns a slice of keywords without stopwords.
@ -87,7 +103,7 @@ func UniqueWords(words []string) (results []string) {
for _, w := range words {
w = strings.ToLower(w)
if len(w) < 3 || w == last {
if len(w) < 3 && IsLatin(w) || w == last {
continue
}
@ -109,7 +125,7 @@ func RemoveFromWords(words []string, remove string) (results []string) {
for _, w := range words {
w = strings.ToLower(w)
if len(w) < 3 || w == last || strings.Contains(remove, w) {
if len(w) < 3 && IsLatin(w) || w == last || strings.Contains(remove, w) {
continue
}
@ -132,7 +148,7 @@ func UniqueKeywords(s string) (results []string) {
for _, w := range words {
w = strings.ToLower(w)
if len(w) < 3 || w == last {
if len(w) < 3 && IsLatin(w) || w == last {
continue
}

View File

@ -7,6 +7,10 @@ import (
)
func TestWords(t *testing.T) {
t.Run("桥", func(t *testing.T) {
result := Words("桥")
assert.Equal(t, []string{"桥"}, result)
})
t.Run("I'm a lazy-brown fox!", func(t *testing.T) {
result := Words("I'm a lazy-BRoWN fox!")
assert.Equal(t, []string{"lazy-BRoWN", "fox"}, result)
@ -60,6 +64,10 @@ func TestFilenameWords(t *testing.T) {
}
func TestFilenameKeywords(t *testing.T) {
t.Run("桥.jpg", func(t *testing.T) {
result := FilenameKeywords("桥.jpg")
assert.Equal(t, []string{"桥"}, result)
})
t.Run("I'm a lazy-brown var fox.jpg!", func(t *testing.T) {
result := FilenameKeywords("I'm a lazy-brown var fox.jpg!")
assert.Equal(t, []string{"lazy", "brown", "fox"}, result)