diff --git a/pkg/txt/capitalization.go b/pkg/txt/capitalization.go new file mode 100644 index 000000000..836daac4d --- /dev/null +++ b/pkg/txt/capitalization.go @@ -0,0 +1,167 @@ +package txt + +import ( + "regexp" + "strings" + "unicode" + + "github.com/photoprism/photoprism/pkg/fs" +) + +var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-]{2,}") + +var TitleReplacements = map[string]string{ + "Nyc": "NYC", + "Ny ": "NY ", + "Uae": "UAE", + "Usa": "USA", + "Amd ": "AMD ", + "Tiff": "TIFF", + "Ibm": "IBM", + "Usd": "USD", + "Gbp": "GBP", + "Chf": "CHF", + "Ceo": "CEO", + "Cto": "CTO", + "Cfo": "CFO", + "Cia ": "CIA ", + "Fbi": "FBI", + "Bnd": "BND", + "Fsb": "FSB", + "Nsa": "NSA", + "Lax ": "LAX ", + "Sfx": "SFX", + "Ber ": "BER ", + "Sfo": "SFO", + "Lh ": "LH ", + "Lhr": "LHR", + "Afl ": "AFL ", + "Nrl": "NRL", + "Nsw": "NSW", + "Qld": "QLD", + "Vic ": "VIC ", + "Iphone": "iPhone", + "Imac": "iMac", + "Ipad": "iPad", + "Macbook": "MacBook", + " And ": " and ", + " Or ": " or ", + " A ": " a ", + " An ": " an ", + " To ": " to ", + " At ": " at ", + " By ": " by ", + " But ": " but ", + " For ": " for ", + " Of ": " of ", + " The ": " the ", + " On ": " on ", + " From ": " from ", + " With ": " with ", +} + +// isSeparator reports whether the rune could mark a word boundary. +func isSeparator(r rune) bool { + // ASCII alphanumerics and underscore are not separators + if r <= 0x7F { + switch { + case '0' <= r && r <= '9': + return false + case 'a' <= r && r <= 'z': + return false + case 'A' <= r && r <= 'Z': + return false + case r == '_', r == '\'': + return false + } + return true + } + // Letters and digits are not separators + if unicode.IsLetter(r) || unicode.IsDigit(r) { + return false + } + // Otherwise, all we can do for now is treat spaces as separators. + return unicode.IsSpace(r) +} + +// UcFirst returns the string with the first character converted to uppercase. +func UcFirst(str string) string { + for i, v := range str { + return string(unicode.ToUpper(v)) + str[i+1:] + } + return "" +} + +// Title returns the string with the first characters of each word converted to uppercase. +func Title(s string) string { + s = strings.TrimSpace(s) + s = strings.ReplaceAll(s, "_", " ") + + prev := ' ' + result := strings.Map( + func(r rune) rune { + if isSeparator(prev) { + prev = r + return unicode.ToTitle(r) + } + prev = r + return r + }, + s) + + for match, abbr := range TitleReplacements { + result = strings.ReplaceAll(result, match, abbr) + } + + return result +} + +// TitleFromFileName returns the string with the first characters of each word converted to uppercase. +func TitleFromFileName(s string) string { + s = fs.Base(s, true) + + if len(s) < 3 { + return "" + } + + words := FileTitleRegexp.FindAllString(s, -1) + var result []string + + found := 0 + + for _, w := range words { + w = strings.ToLower(w) + + if len(w) < 3 && found == 0 { + continue + } + + if _, ok := Stopwords[w]; ok && found == 0 { + continue + } + + result = append(result, w) + + found++ + + if found >= 10 { + break + } + } + + if found == 0 { + return "" + } + + title := strings.Join(result, " ") + + title = strings.ReplaceAll(title, "--", " / ") + title = strings.ReplaceAll(title, "-", " ") + title = strings.ReplaceAll(title, " ", " ") + + if len(title) < 3 { + return "" + } + + return Title(title) +} diff --git a/pkg/txt/capitalization_test.go b/pkg/txt/capitalization_test.go new file mode 100644 index 000000000..cf85a0ba4 --- /dev/null +++ b/pkg/txt/capitalization_test.go @@ -0,0 +1,85 @@ +package txt + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsSeparator(t *testing.T) { + t.Run("rune A", func(t *testing.T) { + assert.Equal(t, false, isSeparator('A')) + }) + t.Run("rune 99", func(t *testing.T) { + assert.Equal(t, false, isSeparator('9')) + }) + t.Run("rune /", func(t *testing.T) { + assert.Equal(t, true, isSeparator('/')) + }) + t.Run("rune \\", func(t *testing.T) { + assert.Equal(t, true, isSeparator('\\')) + }) + t.Run("rune ♥ ", func(t *testing.T) { + assert.Equal(t, false, isSeparator('♥')) + }) + t.Run("rune space", func(t *testing.T) { + assert.Equal(t, true, isSeparator(' ')) + }) + t.Run("rune '", func(t *testing.T) { + assert.Equal(t, false, isSeparator('\'')) + }) + t.Run("rune ý", func(t *testing.T) { + assert.Equal(t, false, isSeparator('ý')) + }) +} + +func TestUcFirst(t *testing.T) { + t.Run("photo-lover", func(t *testing.T) { + assert.Equal(t, "Photo-lover", UcFirst("photo-lover")) + }) + t.Run("cat", func(t *testing.T) { + assert.Equal(t, "Cat", UcFirst("Cat")) + }) + t.Run("empty string", func(t *testing.T) { + assert.Equal(t, "", UcFirst("")) + }) +} + +func TestTitle(t *testing.T) { + t.Run("Browse your life in pictures", func(t *testing.T) { + assert.Equal(t, "Browse Your Life In Pictures", Title("Browse your life in pictures")) + }) + t.Run("photo-lover", func(t *testing.T) { + assert.Equal(t, "Photo-Lover", Title("photo-lover")) + }) +} + +func TestTitleFromFileName(t *testing.T) { + t.Run("Browse your life in pictures", func(t *testing.T) { + assert.Equal(t, "Browse Your Life In Pictures", TitleFromFileName("Browse your life in pictures")) + }) + t.Run("photo-lover", func(t *testing.T) { + assert.Equal(t, "Photo Lover", TitleFromFileName("photo-lover")) + }) + t.Run("BRIDGE in nyc", func(t *testing.T) { + assert.Equal(t, "Bridge In NYC", TitleFromFileName("BRIDGE in nyc")) + }) + t.Run("phil unveils iphone, ipad, imac or macbook 11 pro and max", func(t *testing.T) { + assert.Equal(t, "Phil Unveils iPhone iPad iMac or MacBook Pro and Max", TitleFromFileName("phil unveils iphone, ipad, imac or macbook 11 pro and max")) + }) + t.Run("IMG_4568", func(t *testing.T) { + assert.Equal(t, "", TitleFromFileName("IMG_4568")) + }) + t.Run("queen-city-yacht-club--toronto-island_7999432607_o.jpg", func(t *testing.T) { + assert.Equal(t, "Queen City Yacht Club / Toronto Island", TitleFromFileName("queen-city-yacht-club--toronto-island_7999432607_o.jpg")) + }) + t.Run("tim-robbins--tiff-2012_7999233420_o.jpg", func(t *testing.T) { + assert.Equal(t, "Tim Robbins / TIFF", TitleFromFileName("tim-robbins--tiff-2012_7999233420_o.jpg")) + }) + t.Run("20200102-204030-Berlin-Germany-2020-3h4.jpg", func(t *testing.T) { + assert.Equal(t, "Berlin Germany", TitleFromFileName("20200102-204030-Berlin-Germany-2020-3h4.jpg")) + }) + t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) { + assert.Equal(t, "Changing of the Guard / Buckingham Palace", TitleFromFileName("changing-of-the-guard--buckingham-palace_7925318070_o.jpg")) + }) +} diff --git a/pkg/txt/replacements.go b/pkg/txt/replacements.go deleted file mode 100644 index 54fcab462..000000000 --- a/pkg/txt/replacements.go +++ /dev/null @@ -1,48 +0,0 @@ -package txt - -var TitleReplacements = map[string]string{ - "Nyc": "NYC", - "Ny ": "NY ", - "Uae": "UAE", - "Usa": "USA", - "Amd ": "AMD ", - "Tiff": "TIFF", - "Ibm": "IBM", - "Usd": "USD", - "Gbp": "GBP", - "Chf": "CHF", - "Ceo": "CEO", - "Cto": "CTO", - "Cfo": "CFO", - "Cia ": "CIA ", - "Fbi": "FBI", - "Bnd": "BND", - "Fsb": "FSB", - "Nsa": "NSA", - "Lax ": "LAX ", - "Sfx": "SFX", - "Ber ": "BER ", - "Sfo": "SFO", - "Lh ": "LH ", - "Lhr": "LHR", - "Afl ": "AFL ", - "Nrl": "NRL", - "Nsw": "NSW", - "Qld": "QLD", - "Vic ": "VIC ", - "Iphone": "iPhone", - "Imac": "iMac", - "Ipad": "iPad", - "Macbook": "MacBook", - " And ": " and ", - " Or ": " or ", - " A ": " a ", - " An ": " an ", - " To ": " to ", - " At ": " at ", - " By ": " by ", - " But ": " but ", - " For ": " for ", - " Of ": " of ", - " The ": " the ", -} diff --git a/pkg/txt/strings.go b/pkg/txt/strings.go index bc1c780c9..fda462c03 100644 --- a/pkg/txt/strings.go +++ b/pkg/txt/strings.go @@ -3,124 +3,15 @@ package txt import ( "regexp" "strings" - "unicode" - - "github.com/photoprism/photoprism/pkg/fs" ) var ContainsNumberRegexp = regexp.MustCompile("\\d+") -var FileTitleRegexp = regexp.MustCompile("[\\p{L}\\-]{2,}") // ContainsNumber returns true if string contains a number. func ContainsNumber(s string) bool { return ContainsNumberRegexp.MatchString(s) } -// isSeparator reports whether the rune could mark a word boundary. -func isSeparator(r rune) bool { - // ASCII alphanumerics and underscore are not separators - if r <= 0x7F { - switch { - case '0' <= r && r <= '9': - return false - case 'a' <= r && r <= 'z': - return false - case 'A' <= r && r <= 'Z': - return false - case r == '_', r == '\'': - return false - } - return true - } - // Letters and digits are not separators - if unicode.IsLetter(r) || unicode.IsDigit(r) { - return false - } - // Otherwise, all we can do for now is treat spaces as separators. - return unicode.IsSpace(r) -} - -// UcFirst returns the string with the first character converted to uppercase. -func UcFirst(str string) string { - for i, v := range str { - return string(unicode.ToUpper(v)) + str[i+1:] - } - return "" -} - -// Title returns the string with the first characters of each word converted to uppercase. -func Title(s string) string { - s = strings.TrimSpace(s) - s = strings.ReplaceAll(s, "_", " ") - - prev := ' ' - result := strings.Map( - func(r rune) rune { - if isSeparator(prev) { - prev = r - return unicode.ToTitle(r) - } - prev = r - return r - }, - s) - - for match, abbr := range TitleReplacements { - result = strings.ReplaceAll(result, match, abbr) - } - - return result -} - -// TitleFromFileName returns the string with the first characters of each word converted to uppercase. -func TitleFromFileName(s string) string { - s = fs.Base(s, true) - - if len(s) < 3 { - return "" - } - - words := FileTitleRegexp.FindAllString(s, -1) - var result []string - - found := 0 - - for _, w := range words { - w = strings.ToLower(w) - - if len(w) < 3 && found == 0 { - continue - } - - if _, ok := Stopwords[w]; ok && found == 0 { - continue - } - - result = append(result, w) - - found++ - - if found >= 10 { - break - } - } - - if found == 0 { - return "" - } - - title := strings.Join(result, " ") - - title = strings.ReplaceAll(title, "--", " / ") - title = strings.ReplaceAll(title, "-", " ") - title = strings.ReplaceAll(title, " ", " ") - - if len(title) < 3 { - return "" - } - - return Title(title) -} // Bool casts a string to bool. func Bool(s string) bool { diff --git a/pkg/txt/strings_test.go b/pkg/txt/strings_test.go index 46b3d31c8..a416abd59 100644 --- a/pkg/txt/strings_test.go +++ b/pkg/txt/strings_test.go @@ -15,84 +15,6 @@ func TestContainsNumber(t *testing.T) { }) } -func TestIsSeparator(t *testing.T) { - t.Run("rune A", func(t *testing.T) { - assert.Equal(t, false, isSeparator('A')) - }) - t.Run("rune 99", func(t *testing.T) { - assert.Equal(t, false, isSeparator('9')) - }) - t.Run("rune /", func(t *testing.T) { - assert.Equal(t, true, isSeparator('/')) - }) - t.Run("rune \\", func(t *testing.T) { - assert.Equal(t, true, isSeparator('\\')) - }) - t.Run("rune ♥ ", func(t *testing.T) { - assert.Equal(t, false, isSeparator('♥')) - }) - t.Run("rune space", func(t *testing.T) { - assert.Equal(t, true, isSeparator(' ')) - }) - t.Run("rune '", func(t *testing.T) { - assert.Equal(t, false, isSeparator('\'')) - }) - t.Run("rune ý", func(t *testing.T) { - assert.Equal(t, false, isSeparator('ý')) - }) -} - -func TestUcFirst(t *testing.T) { - t.Run("photo-lover", func(t *testing.T) { - assert.Equal(t, "Photo-lover", UcFirst("photo-lover")) - }) - t.Run("cat", func(t *testing.T) { - assert.Equal(t, "Cat", UcFirst("Cat")) - }) - t.Run("empty string", func(t *testing.T) { - assert.Equal(t, "", UcFirst("")) - }) -} - -func TestTitle(t *testing.T) { - t.Run("Browse your life in pictures", func(t *testing.T) { - assert.Equal(t, "Browse Your Life In Pictures", Title("Browse your life in pictures")) - }) - t.Run("photo-lover", func(t *testing.T) { - assert.Equal(t, "Photo-Lover", Title("photo-lover")) - }) -} - -func TestTitleFromFileName(t *testing.T) { - t.Run("Browse your life in pictures", func(t *testing.T) { - assert.Equal(t, "Browse Your Life In Pictures", TitleFromFileName("Browse your life in pictures")) - }) - t.Run("photo-lover", func(t *testing.T) { - assert.Equal(t, "Photo Lover", TitleFromFileName("photo-lover")) - }) - t.Run("BRIDGE in nyc", func(t *testing.T) { - assert.Equal(t, "Bridge In NYC", TitleFromFileName("BRIDGE in nyc")) - }) - t.Run("phil unveils iphone, ipad, imac or macbook 11 pro and max", func(t *testing.T) { - assert.Equal(t, "Phil Unveils iPhone iPad iMac or MacBook Pro and Max", TitleFromFileName("phil unveils iphone, ipad, imac or macbook 11 pro and max")) - }) - t.Run("IMG_4568", func(t *testing.T) { - assert.Equal(t, "", TitleFromFileName("IMG_4568")) - }) - t.Run("queen-city-yacht-club--toronto-island_7999432607_o.jpg", func(t *testing.T) { - assert.Equal(t, "Queen City Yacht Club / Toronto Island", TitleFromFileName("queen-city-yacht-club--toronto-island_7999432607_o.jpg")) - }) - t.Run("tim-robbins--tiff-2012_7999233420_o.jpg", func(t *testing.T) { - assert.Equal(t, "Tim Robbins / TIFF", TitleFromFileName("tim-robbins--tiff-2012_7999233420_o.jpg")) - }) - t.Run("20200102-204030-Berlin-Germany-2020-3h4.jpg", func(t *testing.T) { - assert.Equal(t, "Berlin Germany", TitleFromFileName("20200102-204030-Berlin-Germany-2020-3h4.jpg")) - }) - t.Run("changing-of-the-guard--buckingham-palace_7925318070_o.jpg", func(t *testing.T) { - assert.Equal(t, "Changing of the Guard / Buckingham Palace", TitleFromFileName("changing-of-the-guard--buckingham-palace_7925318070_o.jpg")) - }) -} - func TestBool(t *testing.T) { t.Run("not empty", func(t *testing.T) { assert.Equal(t, true, Bool("Browse your life in pictures"))