Backend: Refactor txt package #260

Signed-off-by: Michael Mayer <michael@liquidbytes.net>
This commit is contained in:
Michael Mayer 2020-05-31 15:17:01 +02:00
parent 97cab01e6d
commit 5c2ae9e698
5 changed files with 252 additions and 235 deletions

167
pkg/txt/capitalization.go Normal file
View file

@ -0,0 +1,167 @@
package txt
import (
"regexp"
"strings"
"unicode"
"github.com/photoprism/photoprism/pkg/fs"
)
// FileTitleRegexp matches runs of two or more Unicode letters and/or hyphens.
// TitleFromFileName uses it to pick word candidates out of a file name.
var FileTitleRegexp = regexp.MustCompile(`[\p{L}\-]{2,}`)
// TitleReplacements maps title-cased substrings to their preferred spelling.
// Title applies every entry with strings.ReplaceAll, so keys are matched as
// plain substrings; a key with leading or trailing spaces only matches when
// those spaces are present in the text.
var TitleReplacements = map[string]string{
	// Acronyms and abbreviations written in upper case.
	"Nyc":  "NYC",
	"Ny ":  "NY ",
	"Uae":  "UAE",
	"Usa":  "USA",
	"Amd ": "AMD ",
	"Tiff": "TIFF",
	"Ibm":  "IBM",
	"Usd":  "USD",
	"Gbp":  "GBP",
	"Chf":  "CHF",
	"Ceo":  "CEO",
	"Cto":  "CTO",
	"Cfo":  "CFO",
	"Cia ": "CIA ",
	"Fbi":  "FBI",
	"Bnd":  "BND",
	"Fsb":  "FSB",
	"Nsa":  "NSA",
	"Lax ": "LAX ",
	"Sfx":  "SFX",
	"Ber ": "BER ",
	"Sfo":  "SFO",
	"Lh ":  "LH ",
	"Lhr":  "LHR",
	"Afl ": "AFL ",
	"Nrl":  "NRL",
	"Nsw":  "NSW",
	"Qld":  "QLD",
	"Vic ": "VIC ",

	// Names with mixed capitalization.
	"Iphone":  "iPhone",
	"Imac":    "iMac",
	"Ipad":    "iPad",
	"Macbook": "MacBook",

	// Short words that are written in lower case between other words.
	" And ":  " and ",
	" Or ":   " or ",
	" A ":    " a ",
	" An ":   " an ",
	" To ":   " to ",
	" At ":   " at ",
	" By ":   " by ",
	" But ":  " but ",
	" For ":  " for ",
	" Of ":   " of ",
	" The ":  " the ",
	" On ":   " on ",
	" From ": " from ",
	" With ": " with ",
}
// isSeparator reports whether the rune could mark a word boundary.
//
// Within ASCII, everything except letters, digits, the underscore and the
// apostrophe counts as a separator. Outside ASCII, only white space does.
func isSeparator(r rune) bool {
	if r > unicode.MaxASCII {
		// Letters and digits never separate words; for all other non-ASCII
		// runes, all we can do for now is treat spaces as separators.
		return !unicode.IsLetter(r) && !unicode.IsDigit(r) && unicode.IsSpace(r)
	}

	switch {
	case r >= '0' && r <= '9', r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z':
		// ASCII alphanumerics are part of a word.
		return false
	case r == '_' || r == '\'':
		// Underscore and apostrophe do not end a word either.
		return false
	default:
		return true
	}
}
// UcFirst returns the string with the first character converted to uppercase.
// The rest of the string is returned unchanged; an empty string stays empty.
func UcFirst(str string) string {
	if str == "" {
		return ""
	}

	var first rune

	rest := ""

	for i, r := range str {
		if i == 0 {
			first = r
			continue
		}

		// i is the byte offset of the second rune, so the remainder is cut
		// at a rune boundary even when the first rune spans several bytes.
		rest = str[i:]

		break
	}

	return string(unicode.ToUpper(first)) + rest
}
// Title returns the string with the first character of each word converted
// to title case.
//
// Surrounding white space is trimmed and underscores are turned into spaces
// before capitalizing. Afterwards every TitleReplacements entry is applied as
// a plain substring replacement, in map iteration order.
func Title(s string) string {
	s = strings.ReplaceAll(strings.TrimSpace(s), "_", " ")

	// The very first rune always starts a word.
	wordStart := true

	capitalize := func(r rune) rune {
		mapped := r

		if wordStart {
			mapped = unicode.ToTitle(r)
		}

		// The next rune starts a word if this one is a separator.
		wordStart = isSeparator(r)

		return mapped
	}

	titled := strings.Map(capitalize, s)

	for from, to := range TitleReplacements {
		titled = strings.ReplaceAll(titled, from, to)
	}

	return titled
}
// TitleFromFileName derives a title from a file name, or returns an empty
// string if no usable title can be built.
//
// The name is first reduced with fs.Base(s, true) (NOTE(review): presumably
// this strips directory and extension — confirm against pkg/fs). Up to ten
// words matched by FileTitleRegexp are then collected in lower case; leading
// words shorter than three bytes or listed in Stopwords are skipped until the
// first word has been accepted. In the joined result "--" becomes " / ",
// remaining hyphens become spaces, white space runs are collapsed, and the
// text is capitalized with Title.
func TitleFromFileName(s string) string {
	s = fs.Base(s, true)

	if len(s) < 3 {
		return ""
	}

	words := FileTitleRegexp.FindAllString(s, -1)

	var result []string

	found := 0

	for _, w := range words {
		w = strings.ToLower(w)

		// Short words and stopwords are only dropped at the beginning.
		if found == 0 {
			if len(w) < 3 {
				continue
			}

			if _, ok := Stopwords[w]; ok {
				continue
			}
		}

		result = append(result, w)
		found++

		if found >= 10 {
			break
		}
	}

	if found == 0 {
		return ""
	}

	title := strings.Join(result, " ")
	title = strings.ReplaceAll(title, "--", " / ")
	title = strings.ReplaceAll(title, "-", " ")

	// Hyphen-only words and leading/trailing hyphens leave runs of spaces
	// behind; collapse them so words are separated by exactly one space.
	title = strings.Join(strings.Fields(title), " ")

	if len(title) < 3 {
		return ""
	}

	return Title(title)
}

View file

@ -0,0 +1,85 @@
package txt
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestIsSeparator checks isSeparator for ASCII and non-ASCII runes.
func TestIsSeparator(t *testing.T) {
	cases := []struct {
		name string
		r    rune
		want bool
	}{
		{"rune A", 'A', false},
		{"rune 99", '9', false},
		{"rune /", '/', true},
		{"rune \\", '\\', true},
		{"rune ♥ ", '♥', false},
		{"rune space", ' ', true},
		{"rune '", '\'', false},
		{"rune ý", 'ý', false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, isSeparator(tc.r))
		})
	}
}
// TestUcFirst checks that UcFirst upper-cases only the first character.
func TestUcFirst(t *testing.T) {
	cases := []struct {
		name string
		in   string
		want string
	}{
		{"photo-lover", "photo-lover", "Photo-lover"},
		{"cat", "Cat", "Cat"},
		{"empty string", "", ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, UcFirst(tc.in))
		})
	}
}
// TestTitle checks word capitalization performed by Title.
func TestTitle(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		in   string
		want string
	}{
		{"Browse your life in pictures", "Browse Your Life In Pictures"},
		{"photo-lover", "Photo-Lover"},
	}

	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			assert.Equal(t, tc.want, Title(tc.in))
		})
	}
}
// TestTitleFromFileName checks the titles derived from various file names.
func TestTitleFromFileName(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		in   string
		want string
	}{
		{"Browse your life in pictures", "Browse Your Life In Pictures"},
		{"photo-lover", "Photo Lover"},
		{"BRIDGE in nyc", "Bridge In NYC"},
		{"phil unveils iphone, ipad, imac or macbook 11 pro and max", "Phil Unveils iPhone iPad iMac or MacBook Pro and Max"},
		{"IMG_4568", ""},
		{"queen-city-yacht-club--toronto-island_7999432607_o.jpg", "Queen City Yacht Club / Toronto Island"},
		{"tim-robbins--tiff-2012_7999233420_o.jpg", "Tim Robbins / TIFF"},
		{"20200102-204030-Berlin-Germany-2020-3h4.jpg", "Berlin Germany"},
		{"changing-of-the-guard--buckingham-palace_7925318070_o.jpg", "Changing of the Guard / Buckingham Palace"},
	}

	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			assert.Equal(t, tc.want, TitleFromFileName(tc.in))
		})
	}
}

View file

@ -1,48 +0,0 @@
package txt
// TitleReplacements maps title-cased substrings to their preferred spelling.
// Keys are meant to be matched as plain substrings; a key with leading or
// trailing spaces only matches when those spaces are present in the text.
var TitleReplacements = map[string]string{
	// Acronyms and abbreviations written in upper case.
	"Nyc":  "NYC",
	"Ny ":  "NY ",
	"Uae":  "UAE",
	"Usa":  "USA",
	"Amd ": "AMD ",
	"Tiff": "TIFF",
	"Ibm":  "IBM",
	"Usd":  "USD",
	"Gbp":  "GBP",
	"Chf":  "CHF",
	"Ceo":  "CEO",
	"Cto":  "CTO",
	"Cfo":  "CFO",
	"Cia ": "CIA ",
	"Fbi":  "FBI",
	"Bnd":  "BND",
	"Fsb":  "FSB",
	"Nsa":  "NSA",
	"Lax ": "LAX ",
	"Sfx":  "SFX",
	"Ber ": "BER ",
	"Sfo":  "SFO",
	"Lh ":  "LH ",
	"Lhr":  "LHR",
	"Afl ": "AFL ",
	"Nrl":  "NRL",
	"Nsw":  "NSW",
	"Qld":  "QLD",
	"Vic ": "VIC ",

	// Names with mixed capitalization.
	"Iphone":  "iPhone",
	"Imac":    "iMac",
	"Ipad":    "iPad",
	"Macbook": "MacBook",

	// Short words that are written in lower case between other words.
	" And ": " and ",
	" Or ":  " or ",
	" A ":   " a ",
	" An ":  " an ",
	" To ":  " to ",
	" At ":  " at ",
	" By ":  " by ",
	" But ": " but ",
	" For ": " for ",
	" Of ":  " of ",
	" The ": " the ",
}

View file

@ -3,124 +3,15 @@ package txt
import (
"regexp"
"strings"
"unicode"
"github.com/photoprism/photoprism/pkg/fs"
)
var (
	// ContainsNumberRegexp matches a run of one or more digits.
	ContainsNumberRegexp = regexp.MustCompile(`\d+`)

	// FileTitleRegexp matches runs of two or more Unicode letters and/or hyphens.
	FileTitleRegexp = regexp.MustCompile(`[\p{L}\-]{2,}`)
)

// ContainsNumber returns true if string contains a number.
func ContainsNumber(s string) bool {
	return ContainsNumberRegexp.FindStringIndex(s) != nil
}
// isSeparator reports whether the rune could mark a word boundary.
//
// Within ASCII, everything except letters, digits, the underscore and the
// apostrophe counts as a separator. Outside ASCII, only white space does.
func isSeparator(r rune) bool {
	if r > unicode.MaxASCII {
		// Letters and digits never separate words; for all other non-ASCII
		// runes, all we can do for now is treat spaces as separators.
		return !unicode.IsLetter(r) && !unicode.IsDigit(r) && unicode.IsSpace(r)
	}

	switch {
	case r >= '0' && r <= '9', r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z':
		// ASCII alphanumerics are part of a word.
		return false
	case r == '_' || r == '\'':
		// Underscore and apostrophe do not end a word either.
		return false
	default:
		return true
	}
}
// UcFirst returns the string with the first character converted to uppercase.
// The rest of the string is returned unchanged; an empty string stays empty.
func UcFirst(str string) string {
	if str == "" {
		return ""
	}

	var first rune

	rest := ""

	for i, r := range str {
		if i == 0 {
			first = r
			continue
		}

		// i is the byte offset of the second rune, so the remainder is cut
		// at a rune boundary even when the first rune spans several bytes.
		rest = str[i:]

		break
	}

	return string(unicode.ToUpper(first)) + rest
}
// Title returns the string with the first character of each word converted
// to title case.
//
// Surrounding white space is trimmed and underscores are turned into spaces
// before capitalizing. Afterwards every TitleReplacements entry is applied as
// a plain substring replacement, in map iteration order.
func Title(s string) string {
	s = strings.ReplaceAll(strings.TrimSpace(s), "_", " ")

	// The very first rune always starts a word.
	wordStart := true

	capitalize := func(r rune) rune {
		mapped := r

		if wordStart {
			mapped = unicode.ToTitle(r)
		}

		// The next rune starts a word if this one is a separator.
		wordStart = isSeparator(r)

		return mapped
	}

	titled := strings.Map(capitalize, s)

	for from, to := range TitleReplacements {
		titled = strings.ReplaceAll(titled, from, to)
	}

	return titled
}
// TitleFromFileName derives a title from a file name, or returns an empty
// string if no usable title can be built.
//
// The name is first reduced with fs.Base(s, true) (NOTE(review): presumably
// this strips directory and extension — confirm against pkg/fs). Up to ten
// words matched by FileTitleRegexp are then collected in lower case; leading
// words shorter than three bytes or listed in Stopwords are skipped until the
// first word has been accepted. In the joined result "--" becomes " / ",
// remaining hyphens become spaces, white space runs are collapsed, and the
// text is capitalized with Title.
func TitleFromFileName(s string) string {
	s = fs.Base(s, true)

	if len(s) < 3 {
		return ""
	}

	words := FileTitleRegexp.FindAllString(s, -1)

	var result []string

	found := 0

	for _, w := range words {
		w = strings.ToLower(w)

		// Short words and stopwords are only dropped at the beginning.
		if found == 0 {
			if len(w) < 3 {
				continue
			}

			if _, ok := Stopwords[w]; ok {
				continue
			}
		}

		result = append(result, w)
		found++

		if found >= 10 {
			break
		}
	}

	if found == 0 {
		return ""
	}

	title := strings.Join(result, " ")
	title = strings.ReplaceAll(title, "--", " / ")
	title = strings.ReplaceAll(title, "-", " ")

	// Hyphen-only words and leading/trailing hyphens leave runs of spaces
	// behind; collapse them so words are separated by exactly one space.
	title = strings.Join(strings.Fields(title), " ")

	if len(title) < 3 {
		return ""
	}

	return Title(title)
}
// Bool casts a string to bool.
func Bool(s string) bool {

View file

@ -15,84 +15,6 @@ func TestContainsNumber(t *testing.T) {
})
}
// TestIsSeparator checks isSeparator for ASCII and non-ASCII runes.
func TestIsSeparator(t *testing.T) {
	cases := []struct {
		name string
		r    rune
		want bool
	}{
		{"rune A", 'A', false},
		{"rune 99", '9', false},
		{"rune /", '/', true},
		{"rune \\", '\\', true},
		{"rune ♥ ", '♥', false},
		{"rune space", ' ', true},
		{"rune '", '\'', false},
		{"rune ý", 'ý', false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, isSeparator(tc.r))
		})
	}
}
// TestUcFirst checks that UcFirst upper-cases only the first character.
func TestUcFirst(t *testing.T) {
	cases := []struct {
		name string
		in   string
		want string
	}{
		{"photo-lover", "photo-lover", "Photo-lover"},
		{"cat", "Cat", "Cat"},
		{"empty string", "", ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, UcFirst(tc.in))
		})
	}
}
// TestTitle checks word capitalization performed by Title.
func TestTitle(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		in   string
		want string
	}{
		{"Browse your life in pictures", "Browse Your Life In Pictures"},
		{"photo-lover", "Photo-Lover"},
	}

	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			assert.Equal(t, tc.want, Title(tc.in))
		})
	}
}
// TestTitleFromFileName checks the titles derived from various file names.
func TestTitleFromFileName(t *testing.T) {
	// Each input doubles as the subtest name.
	cases := []struct {
		in   string
		want string
	}{
		{"Browse your life in pictures", "Browse Your Life In Pictures"},
		{"photo-lover", "Photo Lover"},
		{"BRIDGE in nyc", "Bridge In NYC"},
		{"phil unveils iphone, ipad, imac or macbook 11 pro and max", "Phil Unveils iPhone iPad iMac or MacBook Pro and Max"},
		{"IMG_4568", ""},
		{"queen-city-yacht-club--toronto-island_7999432607_o.jpg", "Queen City Yacht Club / Toronto Island"},
		{"tim-robbins--tiff-2012_7999233420_o.jpg", "Tim Robbins / TIFF"},
		{"20200102-204030-Berlin-Germany-2020-3h4.jpg", "Berlin Germany"},
		{"changing-of-the-guard--buckingham-palace_7925318070_o.jpg", "Changing of the Guard / Buckingham Palace"},
	}

	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			assert.Equal(t, tc.want, TitleFromFileName(tc.in))
		})
	}
}
func TestBool(t *testing.T) {
t.Run("not empty", func(t *testing.T) {
assert.Equal(t, true, Bool("Browse your life in pictures"))