From ae130dc5007ba782dd7e2437c229322ae0b9d770 Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Tue, 15 Nov 2022 14:45:21 +0100 Subject: [PATCH] Metadata: Sanitize bad Unicode strings #2897 Signed-off-by: Michael Mayer --- internal/meta/json_exiftool.go | 23 +++++++++++------------ internal/meta/sanitize.go | 3 ++- pkg/clean/unicode.go | 21 +++++++++++++++++++++ pkg/clean/unicode_test.go | 16 ++++++++++++++++ 4 files changed, 50 insertions(+), 13 deletions(-) create mode 100644 pkg/clean/unicode.go create mode 100644 pkg/clean/unicode_test.go diff --git a/internal/meta/json_exiftool.go b/internal/meta/json_exiftool.go index a7ce1da55..23c56e31d 100644 --- a/internal/meta/json_exiftool.go +++ b/internal/meta/json_exiftool.go @@ -9,15 +9,14 @@ import ( "strings" "time" - "github.com/photoprism/photoprism/pkg/video" - - "github.com/photoprism/photoprism/pkg/projection" - - "github.com/photoprism/photoprism/pkg/clean" - "github.com/photoprism/photoprism/pkg/rnd" - "github.com/photoprism/photoprism/pkg/txt" "github.com/tidwall/gjson" "gopkg.in/photoprism/go-tz.v2/tz" + + "github.com/photoprism/photoprism/pkg/clean" + "github.com/photoprism/photoprism/pkg/projection" + "github.com/photoprism/photoprism/pkg/rnd" + "github.com/photoprism/photoprism/pkg/txt" + "github.com/photoprism/photoprism/pkg/video" ) const MimeVideoMP4 = "video/mp4" @@ -47,7 +46,7 @@ func (data *Data) Exiftool(jsonData []byte, originalName string) (err error) { jsonValues := j.Map() for key, val := range jsonValues { - data.json[key] = val.String() + data.json[key] = SanitizeString(val.String()) } if fileName, ok := data.json["FileName"]; ok && fileName != "" && originalName != "" && fileName != originalName { @@ -134,22 +133,22 @@ func (data *Data) Exiftool(jsonData []byte, originalName string) (err error) { } case []string: existing := fieldValue.Interface().([]string) - fieldValue.Set(reflect.ValueOf(txt.AddToWords(existing, strings.TrimSpace(jsonValue.String())))) + 
fieldValue.Set(reflect.ValueOf(txt.AddToWords(existing, SanitizeString(jsonValue.String())))) case Keywords: existing := fieldValue.Interface().(Keywords) - fieldValue.Set(reflect.ValueOf(txt.AddToWords(existing, strings.TrimSpace(jsonValue.String())))) + fieldValue.Set(reflect.ValueOf(txt.AddToWords(existing, SanitizeString(jsonValue.String())))) case projection.Type: if !fieldValue.IsZero() { continue } - fieldValue.Set(reflect.ValueOf(projection.Type(strings.TrimSpace(jsonValue.String())))) + fieldValue.Set(reflect.ValueOf(projection.Type(SanitizeString(jsonValue.String())))) case string: if !fieldValue.IsZero() { continue } - fieldValue.SetString(strings.TrimSpace(jsonValue.String())) + fieldValue.SetString(SanitizeString(jsonValue.String())) case bool: if !fieldValue.IsZero() { continue diff --git a/internal/meta/sanitize.go b/internal/meta/sanitize.go index 6dd72c01b..4f4247062 100644 --- a/internal/meta/sanitize.go +++ b/internal/meta/sanitize.go @@ -5,6 +5,7 @@ import ( "regexp" "strings" + "github.com/photoprism/photoprism/pkg/clean" "github.com/photoprism/photoprism/pkg/fs" "github.com/photoprism/photoprism/pkg/txt" ) @@ -64,7 +65,7 @@ func SanitizeString(s string) string { s = strings.TrimSpace(s) - return strings.Replace(s, "\"", "", -1) + return clean.Unicode(strings.Replace(s, "\"", "", -1)) } // SanitizeUID normalizes unique IDs found in XMP or Exif metadata. diff --git a/pkg/clean/unicode.go b/pkg/clean/unicode.go new file mode 100644 index 000000000..490a4ed05 --- /dev/null +++ b/pkg/clean/unicode.go @@ -0,0 +1,21 @@ +package clean + +import "strings" + +// Unicode returns s as valid Unicode, dropping invalid UTF-8 bytes and U+FFFD replacement characters. 
+func Unicode(s string) string { + if s == "" { + return "" + } + + var b strings.Builder + + for _, c := range s { + if c == '\uFFFD' { + continue + } + b.WriteRune(c) + } + + return b.String() +} diff --git a/pkg/clean/unicode_test.go b/pkg/clean/unicode_test.go new file mode 100644 index 000000000..62cad0f4b --- /dev/null +++ b/pkg/clean/unicode_test.go @@ -0,0 +1,16 @@ +package clean + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestUnicode(t *testing.T) { + t.Run("Valid", func(t *testing.T) { + assert.Equal(t, "Naïve bonds and futures surge as inflation eases 🚀🚀🚀", Unicode("Naïve bonds and futures surge as inflation eases 🚀🚀🚀")) + }) + t.Run("Empty", func(t *testing.T) { + assert.Equal(t, "", Unicode("")) + }) +}