Indexer: Improve duplicate detection #568
Only non-sidecar files in the originals folder will be added to the duplicates table to avoid side effects. In addition, the duplicates table is cleaned before and after indexing.
This commit is contained in:
parent
ee9baa37d2
commit
e22e6c6d37
6 changed files with 80 additions and 5 deletions
|
@ -46,9 +46,24 @@ func AddDuplicate(fileName, fileRoot, fileHash string, fileSize, modTime int64)
|
|||
return nil
|
||||
}
|
||||
|
||||
// Find returns a photo from the database.
|
||||
func RemoveDuplicate(fileName, fileRoot string) error {
|
||||
if fileName == "" {
|
||||
return fmt.Errorf("duplicate: file name must not be empty (remove)")
|
||||
} else if fileRoot == "" {
|
||||
return fmt.Errorf("duplicate: file root must not be empty (remove)")
|
||||
}
|
||||
|
||||
if err := UnscopedDb().Delete(Duplicate{}, "file_name = ? AND file_root = ?", fileName, fileRoot).Error; err != nil {
|
||||
log.Errorf("duplicate: %s (remove %s)", err, txt.Quote(fileName))
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Find returns a duplicate from the database.
|
||||
func (m *Duplicate) Find() error {
|
||||
return UnscopedDb().First(m, "file_name = ?", m.FileName).Error
|
||||
return UnscopedDb().First(m, "file_name = ? AND file_root = ?", m.FileName, m.FileRoot).Error
|
||||
}
|
||||
|
||||
// Create inserts a new row to the database.
|
||||
|
|
|
@ -34,6 +34,10 @@ func (m *Files) Init() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
if err := query.CleanDuplicates(); err != nil {
|
||||
return fmt.Errorf("%s (clean duplicates)", err.Error())
|
||||
}
|
||||
|
||||
files, err := query.IndexedFiles()
|
||||
|
||||
if err != nil {
|
||||
|
|
|
@ -115,10 +115,12 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
|
|||
"baseName": filepath.Base(fileName),
|
||||
})
|
||||
|
||||
// Try to find existing file by path and name.
|
||||
fileQuery = entity.UnscopedDb().First(&file, "file_name = ? AND (file_root = ? OR file_root = '')", fileName, fileRoot)
|
||||
fileExists = fileQuery.Error == nil
|
||||
|
||||
if !fileExists && !m.IsSidecar() {
|
||||
// Try to find existing file by hash. Skip this for sidecar files, and files outside the originals folder.
|
||||
if !fileExists && !m.IsSidecar() && m.Root() == entity.RootOriginals {
|
||||
fileHash = m.Hash()
|
||||
fileQuery = entity.UnscopedDb().First(&file, "file_hash = ?", fileHash)
|
||||
fileExists = fileQuery.Error == nil
|
||||
|
@ -133,9 +135,11 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
|
|||
}
|
||||
}
|
||||
|
||||
// Try to find existing photo by file path and name.
|
||||
if !fileExists {
|
||||
photoQuery = entity.UnscopedDb().First(&photo, "photo_path = ? AND photo_name = ?", filePath, fileBase)
|
||||
|
||||
// Try to find existing photo by exact time and location.
|
||||
if photoQuery.Error != nil && m.MetaData().HasTimeAndPlace() {
|
||||
metaData = m.MetaData()
|
||||
photoQuery = entity.UnscopedDb().First(&photo, "photo_lat = ? AND photo_lng = ? AND taken_at = ? AND camera_serial = ?", metaData.Lat, metaData.Lng, metaData.TakenAt, metaData.CameraSerial)
|
||||
|
@ -145,6 +149,7 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
|
|||
}
|
||||
}
|
||||
|
||||
// Try to find existing photo by unique image id.
|
||||
if photoQuery.Error != nil && m.MetaData().HasDocumentID() {
|
||||
photoQuery = entity.UnscopedDb().First(&photo, "uuid = ?", m.MetaData().DocumentID)
|
||||
|
||||
|
@ -169,8 +174,14 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
|
|||
return result
|
||||
}
|
||||
|
||||
// Remove file from duplicates table if exists.
|
||||
if err := entity.RemoveDuplicate(m.RootRelName(), m.Root()); err != nil {
|
||||
log.Error(err)
|
||||
}
|
||||
|
||||
details := photo.GetDetails()
|
||||
|
||||
// Try to recover photo metadata from backup if not exists.
|
||||
if !photoExists {
|
||||
photo.PhotoQuality = -1
|
||||
|
||||
|
@ -186,6 +197,7 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
|
|||
}
|
||||
}
|
||||
|
||||
// Calculate SHA1 file hash if not exists.
|
||||
if fileHash == "" {
|
||||
fileHash = m.Hash()
|
||||
}
|
||||
|
@ -194,6 +206,7 @@ func (ind *Index) MediaFile(m *MediaFile, o IndexOptions, originalName string) (
|
|||
photo.PhotoName = fileBase
|
||||
file.FileError = ""
|
||||
|
||||
// Flag first JPEG as primary file for this photo.
|
||||
if !file.FilePrimary {
|
||||
if photoExists {
|
||||
if q := entity.UnscopedDb().Where("file_type = 'jpg' AND file_primary = 1 AND photo_id = ?", photo.ID).First(&primaryFile); q.Error != nil {
|
||||
|
|
|
@ -176,7 +176,11 @@ func (prg *Purge) Start(opt PurgeOptions) (purgedFiles map[string]bool, purgedPh
|
|||
}
|
||||
|
||||
if err := entity.UpdatePhotoCounts(); err != nil {
|
||||
log.Errorf("purge: %s", err)
|
||||
log.Errorf("purge: %s (update photo counts)", err)
|
||||
}
|
||||
|
||||
if err := query.CleanDuplicates(); err != nil {
|
||||
log.Errorf("purge: %s (clean duplicates)", err)
|
||||
}
|
||||
|
||||
return purgedFiles, purgedPhotos, nil
|
||||
|
|
|
@ -130,7 +130,7 @@ func IndexedFiles() (result FileMap, err error) {
|
|||
// Query indexed files.
|
||||
var files []File
|
||||
|
||||
if err := UnscopedDb().Raw("SELECT file_root, file_name, mod_time FROM files").Scan(&files).Error; err != nil {
|
||||
if err := UnscopedDb().Raw("SELECT file_root, file_name, mod_time FROM files WHERE file_missing = 0").Scan(&files).Error; err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
|
@ -140,3 +140,12 @@ func IndexedFiles() (result FileMap, err error) {
|
|||
|
||||
return result, err
|
||||
}
|
||||
|
||||
// CleanDuplicates removes all files from the duplicates table that don't exist in the files table.
|
||||
func CleanDuplicates() error {
|
||||
if res := UnscopedDb().Delete(entity.Duplicate{}, "file_hash IN (SELECT d.file_hash FROM duplicates d LEFT JOIN files f ON d.file_hash = f.file_hash AND f.file_missing = 0 AND f.deleted_at IS NULL WHERE f.file_hash IS NULL)"); res.Error != nil {
|
||||
return res.Error
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -204,3 +204,33 @@ func TestIndexedFiles(t *testing.T) {
|
|||
|
||||
t.Logf("INDEXED FILES: %#v", result)
|
||||
}
|
||||
|
||||
func TestCleanDuplicates(t *testing.T) {
|
||||
fileName := "hd89e5yhb8p9h.jpg"
|
||||
|
||||
if err := entity.AddDuplicate(
|
||||
fileName,
|
||||
entity.RootOriginals,
|
||||
"2cad9168fa6acc5c5c2965ddf6ec465ca42fd811",
|
||||
661858,
|
||||
time.Date(2019, 3, 6, 2, 6, 51, 0, time.UTC).Unix(),
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
d := &entity.Duplicate{FileName: fileName, FileRoot: entity.RootOriginals}
|
||||
|
||||
if err := d.Find(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
err := CleanDuplicates()
|
||||
|
||||
assert.NoError(t, err)
|
||||
|
||||
dp := &entity.Duplicate{FileName: fileName, FileRoot: entity.RootOriginals}
|
||||
|
||||
if err := dp.Find(); err == nil {
|
||||
t.Fatalf("duplicate should be removed: %+v", dp)
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue