Indexer: Improve duplicate detection #568

Only non-sidecar files in the originals folder will be added to the
duplicates table to avoid side effects.
In addition, the duplicates table is cleaned before and after indexing.
This commit is contained in:
Michael Mayer 2020-11-15 10:31:46 +01:00
parent ee9baa37d2
commit e22e6c6d37
6 changed files with 80 additions and 5 deletions

View file

@ -46,9 +46,24 @@ func AddDuplicate(fileName, fileRoot, fileHash string, fileSize, modTime int64)
return nil
}
// RemoveDuplicate deletes the duplicates entry that matches the given file name and root.
// It returns an error if either argument is empty or if the delete query fails;
// query failures are additionally logged before being returned.
func RemoveDuplicate(fileName, fileRoot string) error {
	// Both values are part of the delete condition, so reject empty input early.
	if fileName == "" {
		return fmt.Errorf("duplicate: file name must not be empty (remove)")
	}

	if fileRoot == "" {
		return fmt.Errorf("duplicate: file root must not be empty (remove)")
	}

	if err := UnscopedDb().Delete(Duplicate{}, "file_name = ? AND file_root = ?", fileName, fileRoot).Error; err != nil {
		log.Errorf("duplicate: %s (remove %s)", err, txt.Quote(fileName))
		return err
	}

	return nil
}
// Find loads the first duplicate matching the receiver's FileName and FileRoot
// into m and returns the error of the underlying query, if any.
func (m *Duplicate) Find() error {
	// Match on both name and root so that an entry with the same name
	// under a different root is not returned.
	return UnscopedDb().First(m, "file_name = ? AND file_root = ?", m.FileName, m.FileRoot).Error
}
// Create inserts a new row into the database.

View file

@ -34,6 +34,10 @@ func (m *Files) Init() error {
return nil
}
if err := query.CleanDuplicates(); err != nil {
return fmt.Errorf("%s (clean duplicates)", err.Error())
}
files, err := query.IndexedFiles()
if err != nil {

View file

@ -115,10 +115,12 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
"baseName": filepath.Base(fileName),
})
// Try to find existing file by path and name.
fileQuery = entity.UnscopedDb().First(&file, "file_name = ? AND (file_root = ? OR file_root = '')", fileName, fileRoot)
fileExists = fileQuery.Error == nil
if !fileExists && !m.IsSidecar() {
// Try to find existing file by hash. Skip this for sidecar files, and files outside the originals folder.
if !fileExists && !m.IsSidecar() && m.Root() == entity.RootOriginals {
fileHash = m.Hash()
fileQuery = entity.UnscopedDb().First(&file, "file_hash = ?", fileHash)
fileExists = fileQuery.Error == nil
@ -133,9 +135,11 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
}
}
// Try to find existing photo by file path and name.
if !fileExists {
photoQuery = entity.UnscopedDb().First(&photo, "photo_path = ? AND photo_name = ?", filePath, fileBase)
// Try to find existing photo by exact time and location.
if photoQuery.Error != nil && m.MetaData().HasTimeAndPlace() {
metaData = m.MetaData()
photoQuery = entity.UnscopedDb().First(&photo, "photo_lat = ? AND photo_lng = ? AND taken_at = ? AND camera_serial = ?", metaData.Lat, metaData.Lng, metaData.TakenAt, metaData.CameraSerial)
@ -145,6 +149,7 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
}
}
// Try to find existing photo by unique image id.
if photoQuery.Error != nil && m.MetaData().HasDocumentID() {
photoQuery = entity.UnscopedDb().First(&photo, "uuid = ?", m.MetaData().DocumentID)
@ -169,8 +174,14 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
return result
}
// Remove file from duplicates table if exists.
if err := entity.RemoveDuplicate(m.RootRelName(), m.Root()); err != nil {
log.Error(err)
}
details := photo.GetDetails()
// Try to recover photo metadata from backup if not exists.
if !photoExists {
photo.PhotoQuality = -1
@ -186,6 +197,7 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
}
}
// Calculate SHA1 file hash if not exists.
if fileHash == "" {
fileHash = m.Hash()
}
@ -194,6 +206,7 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
photo.PhotoName = fileBase
file.FileError = ""
// Flag first JPEG as primary file for this photo.
if !file.FilePrimary {
if photoExists {
if q := entity.UnscopedDb().Where("file_type = 'jpg' AND file_primary = 1 AND photo_id = ?", photo.ID).First(&primaryFile); q.Error != nil {

View file

@ -176,7 +176,11 @@ func (prg *Purge) Start(opt PurgeOptions) (purgedFiles map[string]bool, purgedPh
}
if err := entity.UpdatePhotoCounts(); err != nil {
log.Errorf("purge: %s", err)
log.Errorf("purge: %s (update photo counts)", err)
}
if err := query.CleanDuplicates(); err != nil {
log.Errorf("purge: %s (clean duplicates)", err)
}
return purgedFiles, purgedPhotos, nil

View file

@ -130,7 +130,7 @@ func IndexedFiles() (result FileMap, err error) {
// Query indexed files.
var files []File
if err := UnscopedDb().Raw("SELECT file_root, file_name, mod_time FROM files").Scan(&files).Error; err != nil {
if err := UnscopedDb().Raw("SELECT file_root, file_name, mod_time FROM files WHERE file_missing = 0").Scan(&files).Error; err != nil {
return result, err
}
@ -140,3 +140,12 @@ func IndexedFiles() (result FileMap, err error) {
return result, err
}
// CleanDuplicates deletes every entry from the duplicates table whose file hash
// has no matching row in the files table that is neither missing nor deleted.
// It returns the error reported by the delete query, or nil on success.
func CleanDuplicates() error {
	return UnscopedDb().Delete(entity.Duplicate{}, "file_hash IN (SELECT d.file_hash FROM duplicates d LEFT JOIN files f ON d.file_hash = f.file_hash AND f.file_missing = 0 AND f.deleted_at IS NULL WHERE f.file_hash IS NULL)").Error
}

View file

@ -204,3 +204,33 @@ func TestIndexedFiles(t *testing.T) {
t.Logf("INDEXED FILES: %#v", result)
}
// TestCleanDuplicates adds a duplicate entry, confirms it can be found, runs
// CleanDuplicates, and then expects the lookup for the same entry to fail.
func TestCleanDuplicates(t *testing.T) {
	const (
		name = "hd89e5yhb8p9h.jpg"
		hash = "2cad9168fa6acc5c5c2965ddf6ec465ca42fd811"
		size = 661858
	)

	modTime := time.Date(2019, 3, 6, 2, 6, 51, 0, time.UTC).Unix()

	// Register the duplicate that is expected to be cleaned up.
	if err := entity.AddDuplicate(name, entity.RootOriginals, hash, size, modTime); err != nil {
		t.Fatal(err)
	}

	// The entry must be retrievable before cleaning.
	before := &entity.Duplicate{FileName: name, FileRoot: entity.RootOriginals}

	if err := before.Find(); err != nil {
		t.Fatal(err)
	}

	assert.NoError(t, CleanDuplicates())

	// After cleaning, the same lookup must no longer succeed.
	after := &entity.Duplicate{FileName: name, FileRoot: entity.RootOriginals}

	if err := after.Find(); err == nil {
		t.Fatalf("duplicate should be removed: %+v", after)
	}
}