Backend: Refactor txt package #260
Signed-off-by: Michael Mayer <michael@liquidbytes.net>
This commit is contained in:
parent
97cab01e6d
commit
5c2ae9e698
5 changed files with 252 additions and 235 deletions
167
pkg/txt/capitalization.go
Normal file
167
pkg/txt/capitalization.go
Normal file
|
@ -0,0 +1,167 @@
|
|||
package txt
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/photoprism/photoprism/pkg/fs"
|
||||
)
|
||||
|
||||
// Package-level tables used when deriving and capitalizing titles.
var (
	// FileTitleRegexp matches runs of two or more Unicode letters and/or
	// hyphens; TitleFromFileName uses it to pick candidate words out of a
	// file name.
	FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-]{2,}")

	// TitleReplacements maps substrings produced by naive title-casing to
	// their preferred spelling. Title applies every entry with
	// strings.ReplaceAll, so keys are matched as plain substrings; keys that
	// carry leading and/or trailing spaces only match next to those spaces.
	TitleReplacements = map[string]string{
		// Abbreviations written in upper case.
		"Nyc":  "NYC",
		"Ny ":  "NY ",
		"Uae":  "UAE",
		"Usa":  "USA",
		"Amd ": "AMD ",
		"Tiff": "TIFF",
		"Ibm":  "IBM",
		"Usd":  "USD",
		"Gbp":  "GBP",
		"Chf":  "CHF",
		"Ceo":  "CEO",
		"Cto":  "CTO",
		"Cfo":  "CFO",
		"Cia ": "CIA ",
		"Fbi":  "FBI",
		"Bnd":  "BND",
		"Fsb":  "FSB",
		"Nsa":  "NSA",
		"Lax ": "LAX ",
		"Sfx":  "SFX",
		"Ber ": "BER ",
		"Sfo":  "SFO",
		"Lh ":  "LH ",
		"Lhr":  "LHR",
		"Afl ": "AFL ",
		"Nrl":  "NRL",
		"Nsw":  "NSW",
		"Qld":  "QLD",
		"Vic ": "VIC ",

		// Names with mixed-case spelling.
		"Iphone":  "iPhone",
		"Imac":    "iMac",
		"Ipad":    "iPad",
		"Macbook": "MacBook",

		// Short words that stay lower case inside a title.
		" And ":  " and ",
		" Or ":   " or ",
		" A ":    " a ",
		" An ":   " an ",
		" To ":   " to ",
		" At ":   " at ",
		" By ":   " by ",
		" But ":  " but ",
		" For ":  " for ",
		" Of ":   " of ",
		" The ":  " the ",
		" On ":   " on ",
		" From ": " from ",
		" With ": " with ",
	}
)
|
||||
|
||||
// isSeparator reports whether r may mark a word boundary.
//
// Within ASCII, everything except letters, digits, the underscore and the
// apostrophe counts as a separator. Outside ASCII, only runes that
// unicode.IsSpace accepts are separators; letters, digits and all remaining
// symbols are not.
func isSeparator(r rune) bool {
	if r > unicode.MaxASCII {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return false
		}

		// For other non-ASCII runes only white space is treated as a boundary.
		return unicode.IsSpace(r)
	}

	digit := r >= '0' && r <= '9'
	lower := r >= 'a' && r <= 'z'
	upper := r >= 'A' && r <= 'Z'
	joiner := r == '_' || r == '\''

	return !(digit || lower || upper || joiner)
}
|
||||
|
||||
// UcFirst returns str with its first character converted to upper case.
//
// The first character is treated as a whole rune, so a multi-byte UTF-8
// leading character such as "é" is upper-cased without leaving stray bytes
// behind. An empty string is returned unchanged.
func UcFirst(str string) string {
	if str == "" {
		return ""
	}

	var first rune
	rest := ""

	for i, r := range str {
		if i == 0 {
			first = r
			continue
		}

		// i is the byte offset of the second rune, which equals the encoded
		// width of the first one; the remainder is copied verbatim.
		rest = str[i:]
		break
	}

	return string(unicode.ToUpper(first)) + rest
}
|
||||
|
||||
// Title returns the string with the first characters of each word converted to uppercase.
|
||||
func Title(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
s = strings.ReplaceAll(s, "_", " ")
|
||||
|
||||
prev := ' '
|
||||
result := strings.Map(
|
||||
func(r rune) rune {
|
||||
if isSeparator(prev) {
|
||||
prev = r
|
||||
return unicode.ToTitle(r)
|
||||
}
|
||||
prev = r
|
||||
return r
|
||||
},
|
||||
s)
|
||||
|
||||
for match, abbr := range TitleReplacements {
|
||||
result = strings.ReplaceAll(result, match, abbr)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// TitleFromFileName returns the string with the first characters of each word converted to uppercase.
|
||||
func TitleFromFileName(s string) string {
|
||||
s = fs.Base(s, true)
|
||||
|
||||
if len(s) < 3 {
|
||||
return ""
|
||||
}
|
||||
|
||||
words := FileTitleRegexp.FindAllString(s, -1)
|
||||
var result []string
|
||||
|
||||
found := 0
|
||||
|
||||
for _, w := range words {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if len(w) < 3 && found == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := Stopwords[w]; ok && found == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
result = append(result, w)
|
||||
|
||||
found++
|
||||
|
||||
if found >= 10 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if found == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
title := strings.Join(result, " ")
|
||||
|
||||
title = strings.ReplaceAll(title, "--", " / ")
|
||||
title = strings.ReplaceAll(title, "-", " ")
|
||||
title = strings.ReplaceAll(title, " ", " ")
|
||||
|
||||
if len(title) < 3 {
|
||||
return ""
|
||||
}
|
||||
|
||||
return Title(title)
|
||||
}
|
85
pkg/txt/capitalization_test.go
Normal file
85
pkg/txt/capitalization_test.go
Normal file
|
@ -0,0 +1,85 @@
|
|||
package txt
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestIsSeparator(t *testing.T) {
|
||||
t.Run("rune A", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('A'))
|
||||
})
|
||||
t.Run("rune 99", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('9'))
|
||||
})
|
||||
t.Run("rune /", func(t *testing.T) {
|
||||
assert.Equal(t, true, isSeparator('/'))
|
||||
})
|
||||
t.Run("rune \\", func(t *testing.T) {
|
||||
assert.Equal(t, true, isSeparator('\\'))
|
||||
})
|
||||
t.Run("rune ♥ ", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('♥'))
|
||||
})
|
||||
t.Run("rune space", func(t *testing.T) {
|
||||
assert.Equal(t, true, isSeparator(' '))
|
||||
})
|
||||
t.Run("rune '", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('\''))
|
||||
})
|
||||
t.Run("rune ý", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('ý'))
|
||||
})
|
||||
}
|
||||
|
||||
func TestUcFirst(t *testing.T) {
|
||||
t.Run("photo-lover", func(t *testing.T) {
|
||||
assert.Equal(t, "Photo-lover", UcFirst("photo-lover"))
|
||||
})
|
||||
t.Run("cat", func(t *testing.T) {
|
||||
assert.Equal(t, "Cat", UcFirst("Cat"))
|
||||
})
|
||||
t.Run("empty string", func(t *testing.T) {
|
||||
assert.Equal(t, "", UcFirst(""))
|
||||
})
|
||||
}
|
||||
|
||||
func TestTitle(t *testing.T) {
|
||||
t.Run("Browse your life in pictures", func(t *testing.T) {
|
||||
assert.Equal(t, "Browse Your Life In Pictures", Title("Browse your life in pictures"))
|
||||
})
|
||||
t.Run("photo-lover", func(t *testing.T) {
|
||||
assert.Equal(t, "Photo-Lover", Title("photo-lover"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestTitleFromFileName(t *testing.T) {
|
||||
t.Run("Browse your life in pictures", func(t *testing.T) {
|
||||
assert.Equal(t, "Browse Your Life In Pictures", TitleFromFileName("Browse your life in pictures"))
|
||||
})
|
||||
t.Run("photo-lover", func(t *testing.T) {
|
||||
assert.Equal(t, "Photo Lover", TitleFromFileName("photo-lover"))
|
||||
})
|
||||
t.Run("BRIDGE in nyc", func(t *testing.T) {
|
||||
assert.Equal(t, "Bridge In NYC", TitleFromFileName("BRIDGE in nyc"))
|
||||
})
|
||||
t.Run("phil unveils iphone, ipad, imac or macbook 11 pro and max", func(t *testing.T) {
|
||||
assert.Equal(t, "Phil Unveils iPhone iPad iMac or MacBook Pro and Max", TitleFromFileName("phil unveils iphone, ipad, imac or macbook 11 pro and max"))
|
||||
})
|
||||
t.Run("IMG_4568", func(t *testing.T) {
|
||||
assert.Equal(t, "", TitleFromFileName("IMG_4568"))
|
||||
})
|
||||
t.Run("queen-city-yacht-club--toronto-island_7999432607_o.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Queen City Yacht Club / Toronto Island", TitleFromFileName("queen-city-yacht-club--toronto-island_7999432607_o.jpg"))
|
||||
})
|
||||
t.Run("tim-robbins--tiff-2012_7999233420_o.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Tim Robbins / TIFF", TitleFromFileName("tim-robbins--tiff-2012_7999233420_o.jpg"))
|
||||
})
|
||||
t.Run("20200102-204030-Berlin-Germany-2020-3h4.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Berlin Germany", TitleFromFileName("20200102-204030-Berlin-Germany-2020-3h4.jpg"))
|
||||
})
|
||||
t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Changing of the Guard / Buckingham Palace", TitleFromFileName("changing-of-the-guard--buckingham-palace_7925318070_o.jpg"))
|
||||
})
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
package txt
|
||||
|
||||
// TitleReplacements maps substrings produced by naive title-casing to their
// preferred spelling. Keys are matched as plain substrings; keys that carry
// leading and/or trailing spaces only match next to those spaces.
var TitleReplacements = map[string]string{
	// Abbreviations written in upper case.
	"Nyc":  "NYC",
	"Ny ":  "NY ",
	"Uae":  "UAE",
	"Usa":  "USA",
	"Amd ": "AMD ",
	"Tiff": "TIFF",
	"Ibm":  "IBM",
	"Usd":  "USD",
	"Gbp":  "GBP",
	"Chf":  "CHF",
	"Ceo":  "CEO",
	"Cto":  "CTO",
	"Cfo":  "CFO",
	"Cia ": "CIA ",
	"Fbi":  "FBI",
	"Bnd":  "BND",
	"Fsb":  "FSB",
	"Nsa":  "NSA",
	"Lax ": "LAX ",
	"Sfx":  "SFX",
	"Ber ": "BER ",
	"Sfo":  "SFO",
	"Lh ":  "LH ",
	"Lhr":  "LHR",
	"Afl ": "AFL ",
	"Nrl":  "NRL",
	"Nsw":  "NSW",
	"Qld":  "QLD",
	"Vic ": "VIC ",

	// Names with mixed-case spelling.
	"Iphone":  "iPhone",
	"Imac":    "iMac",
	"Ipad":    "iPad",
	"Macbook": "MacBook",

	// Short words that stay lower case inside a title.
	" And ": " and ",
	" Or ":  " or ",
	" A ":   " a ",
	" An ":  " an ",
	" To ":  " to ",
	" At ":  " at ",
	" By ":  " by ",
	" But ": " but ",
	" For ": " for ",
	" Of ":  " of ",
	" The ": " the ",
}
|
|
@ -3,124 +3,15 @@ package txt
|
|||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/photoprism/photoprism/pkg/fs"
|
||||
)
|
||||
|
||||
// Regular expressions shared by the helpers in this package.
var (
	// ContainsNumberRegexp matches one or more consecutive digits.
	ContainsNumberRegexp = regexp.MustCompile("\\d+")

	// FileTitleRegexp matches runs of two or more Unicode letters and/or
	// hyphens.
	FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-]{2,}")
)

// ContainsNumber reports whether s contains at least one digit, as matched by
// ContainsNumberRegexp.
func ContainsNumber(s string) bool {
	loc := ContainsNumberRegexp.FindStringIndex(s)

	return loc != nil
}
|
||||
|
||||
// isSeparator reports whether r may mark a word boundary.
//
// Within ASCII, everything except letters, digits, the underscore and the
// apostrophe counts as a separator. Outside ASCII, only runes that
// unicode.IsSpace accepts are separators; letters, digits and all remaining
// symbols are not.
func isSeparator(r rune) bool {
	if r > unicode.MaxASCII {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			return false
		}

		// For other non-ASCII runes only white space is treated as a boundary.
		return unicode.IsSpace(r)
	}

	digit := r >= '0' && r <= '9'
	lower := r >= 'a' && r <= 'z'
	upper := r >= 'A' && r <= 'Z'
	joiner := r == '_' || r == '\''

	return !(digit || lower || upper || joiner)
}
|
||||
|
||||
// UcFirst returns str with its first character converted to upper case.
//
// The first character is treated as a whole rune, so a multi-byte UTF-8
// leading character such as "é" is upper-cased without leaving stray bytes
// behind. An empty string is returned unchanged.
func UcFirst(str string) string {
	if str == "" {
		return ""
	}

	var first rune
	rest := ""

	for i, r := range str {
		if i == 0 {
			first = r
			continue
		}

		// i is the byte offset of the second rune, which equals the encoded
		// width of the first one; the remainder is copied verbatim.
		rest = str[i:]
		break
	}

	return string(unicode.ToUpper(first)) + rest
}
|
||||
|
||||
// Title returns the string with the first characters of each word converted to uppercase.
|
||||
func Title(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
s = strings.ReplaceAll(s, "_", " ")
|
||||
|
||||
prev := ' '
|
||||
result := strings.Map(
|
||||
func(r rune) rune {
|
||||
if isSeparator(prev) {
|
||||
prev = r
|
||||
return unicode.ToTitle(r)
|
||||
}
|
||||
prev = r
|
||||
return r
|
||||
},
|
||||
s)
|
||||
|
||||
for match, abbr := range TitleReplacements {
|
||||
result = strings.ReplaceAll(result, match, abbr)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// TitleFromFileName returns the string with the first characters of each word converted to uppercase.
|
||||
func TitleFromFileName(s string) string {
|
||||
s = fs.Base(s, true)
|
||||
|
||||
if len(s) < 3 {
|
||||
return ""
|
||||
}
|
||||
|
||||
words := FileTitleRegexp.FindAllString(s, -1)
|
||||
var result []string
|
||||
|
||||
found := 0
|
||||
|
||||
for _, w := range words {
|
||||
w = strings.ToLower(w)
|
||||
|
||||
if len(w) < 3 && found == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := Stopwords[w]; ok && found == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
result = append(result, w)
|
||||
|
||||
found++
|
||||
|
||||
if found >= 10 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if found == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
title := strings.Join(result, " ")
|
||||
|
||||
title = strings.ReplaceAll(title, "--", " / ")
|
||||
title = strings.ReplaceAll(title, "-", " ")
|
||||
title = strings.ReplaceAll(title, " ", " ")
|
||||
|
||||
if len(title) < 3 {
|
||||
return ""
|
||||
}
|
||||
|
||||
return Title(title)
|
||||
}
|
||||
|
||||
// Bool casts a string to bool.
|
||||
func Bool(s string) bool {
|
||||
|
|
|
@ -15,84 +15,6 @@ func TestContainsNumber(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
func TestIsSeparator(t *testing.T) {
|
||||
t.Run("rune A", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('A'))
|
||||
})
|
||||
t.Run("rune 99", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('9'))
|
||||
})
|
||||
t.Run("rune /", func(t *testing.T) {
|
||||
assert.Equal(t, true, isSeparator('/'))
|
||||
})
|
||||
t.Run("rune \\", func(t *testing.T) {
|
||||
assert.Equal(t, true, isSeparator('\\'))
|
||||
})
|
||||
t.Run("rune ♥ ", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('♥'))
|
||||
})
|
||||
t.Run("rune space", func(t *testing.T) {
|
||||
assert.Equal(t, true, isSeparator(' '))
|
||||
})
|
||||
t.Run("rune '", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('\''))
|
||||
})
|
||||
t.Run("rune ý", func(t *testing.T) {
|
||||
assert.Equal(t, false, isSeparator('ý'))
|
||||
})
|
||||
}
|
||||
|
||||
func TestUcFirst(t *testing.T) {
|
||||
t.Run("photo-lover", func(t *testing.T) {
|
||||
assert.Equal(t, "Photo-lover", UcFirst("photo-lover"))
|
||||
})
|
||||
t.Run("cat", func(t *testing.T) {
|
||||
assert.Equal(t, "Cat", UcFirst("Cat"))
|
||||
})
|
||||
t.Run("empty string", func(t *testing.T) {
|
||||
assert.Equal(t, "", UcFirst(""))
|
||||
})
|
||||
}
|
||||
|
||||
func TestTitle(t *testing.T) {
|
||||
t.Run("Browse your life in pictures", func(t *testing.T) {
|
||||
assert.Equal(t, "Browse Your Life In Pictures", Title("Browse your life in pictures"))
|
||||
})
|
||||
t.Run("photo-lover", func(t *testing.T) {
|
||||
assert.Equal(t, "Photo-Lover", Title("photo-lover"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestTitleFromFileName(t *testing.T) {
|
||||
t.Run("Browse your life in pictures", func(t *testing.T) {
|
||||
assert.Equal(t, "Browse Your Life In Pictures", TitleFromFileName("Browse your life in pictures"))
|
||||
})
|
||||
t.Run("photo-lover", func(t *testing.T) {
|
||||
assert.Equal(t, "Photo Lover", TitleFromFileName("photo-lover"))
|
||||
})
|
||||
t.Run("BRIDGE in nyc", func(t *testing.T) {
|
||||
assert.Equal(t, "Bridge In NYC", TitleFromFileName("BRIDGE in nyc"))
|
||||
})
|
||||
t.Run("phil unveils iphone, ipad, imac or macbook 11 pro and max", func(t *testing.T) {
|
||||
assert.Equal(t, "Phil Unveils iPhone iPad iMac or MacBook Pro and Max", TitleFromFileName("phil unveils iphone, ipad, imac or macbook 11 pro and max"))
|
||||
})
|
||||
t.Run("IMG_4568", func(t *testing.T) {
|
||||
assert.Equal(t, "", TitleFromFileName("IMG_4568"))
|
||||
})
|
||||
t.Run("queen-city-yacht-club--toronto-island_7999432607_o.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Queen City Yacht Club / Toronto Island", TitleFromFileName("queen-city-yacht-club--toronto-island_7999432607_o.jpg"))
|
||||
})
|
||||
t.Run("tim-robbins--tiff-2012_7999233420_o.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Tim Robbins / TIFF", TitleFromFileName("tim-robbins--tiff-2012_7999233420_o.jpg"))
|
||||
})
|
||||
t.Run("20200102-204030-Berlin-Germany-2020-3h4.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Berlin Germany", TitleFromFileName("20200102-204030-Berlin-Germany-2020-3h4.jpg"))
|
||||
})
|
||||
t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) {
|
||||
assert.Equal(t, "Changing of the Guard / Buckingham Palace", TitleFromFileName("changing-of-the-guard--buckingham-palace_7925318070_o.jpg"))
|
||||
})
|
||||
}
|
||||
|
||||
func TestBool(t *testing.T) {
|
||||
t.Run("not empty", func(t *testing.T) {
|
||||
assert.Equal(t, true, Bool("Browse your life in pictures"))
|
||||
|
|
Loading…
Reference in a new issue