badger/internal/library/duplicates.go
2023-03-14 09:53:33 +01:00

145 lines
4.4 KiB
Go

package library
import (
	"database/sql"
	"path"
	"strconv"
	"sync"

	"velvettear/badger/internal/config"
	"velvettear/badger/internal/database"
	"velvettear/badger/internal/database/models"
	"velvettear/badger/internal/log"
	"velvettear/badger/internal/metadata"
	"velvettear/badger/internal/tools"
)
// exported function(s)
func FindDuplicates() {
timestamp := tools.LogTimestamp()
var duplicates []Duplicate
rows, error := database.Connection().Model(&models.Track{}).Select("id", "path", "fingerprint", "bitrate").Rows()
if error != nil {
log.Error("encountered an error selecting all tracks as rows", error.Error())
return
}
defer rows.Close()
formatMismatch := config.DuplicatesFormatMismatch()
var comparisonObjects []comparisonObject
for rows.Next() {
comparisonObject, error := toComparisonObject(rows)
if error != nil {
continue
}
comparisonObjects = append(comparisonObjects, comparisonObject)
}
waitChannel := make(chan struct{}, config.Concurrency())
var objectFormat string
var duplicateIndices []int
done := 0
objectCount := len(comparisonObjects)
log.Info("comparing "+strconv.Itoa(objectCount)+" audio fingerprints for duplicates...", "concurrency: "+strconv.Itoa(config.Concurrency()))
for objectCount > 0 {
object := comparisonObjects[0]
comparisonObjects = comparisonObjects[1:]
if formatMismatch {
objectFormat = path.Ext(object.path)
}
waitChannel <- struct{}{}
go func(object comparisonObject) {
tmpTimestamp := tools.LogTimestamp()
for index, comparisonObject := range comparisonObjects {
if formatMismatch && objectFormat == path.Ext(comparisonObject.path) {
continue
}
duplicate := getDuplicate(&object, &comparisonObject)
if !duplicate.isValid() {
continue
}
log.Debug("duplicate track detected", "id '"+strconv.Itoa(duplicate.id)+"', good file: "+duplicate.good+", bad file: "+duplicate.bad+", score: "+strconv.FormatFloat(duplicate.score, 'f', 2, 64))
duplicates = append(duplicates, duplicate)
if duplicate.id == object.id {
break
}
duplicateIndices = append(duplicateIndices, index)
}
done++
objectCount := len(comparisonObjects)
log.DebugTimed("finished comparison of the audio fingerprint for track (id: '"+strconv.Itoa(object.id)+"')", tmpTimestamp, strconv.Itoa(done)+"/"+strconv.Itoa(objectCount))
<-waitChannel
}(object)
comparisonObjects = filterDuplicates(comparisonObjects, duplicateIndices)
duplicateIndices = nil
}
log.InfoTimed("found "+strconv.Itoa(len(duplicates))+" duplicates", timestamp)
}
// filterDuplicates returns the given objects without the elements located at
// the positions listed in duplicateIndices. If either argument is empty the
// objects are returned unchanged; otherwise a new slice is built and the
// number of dropped elements is logged.
func filterDuplicates(objects []comparisonObject, duplicateIndices []int) []comparisonObject {
	if len(objects) == 0 || len(duplicateIndices) == 0 {
		return objects
	}
	timestamp := tools.LogTimestamp()
	var kept []comparisonObject
nextObject:
	for position := range objects {
		for _, duplicateIndex := range duplicateIndices {
			if duplicateIndex == position {
				// listed as duplicate, do not carry it over
				continue nextObject
			}
		}
		kept = append(kept, objects[position])
	}
	// every object is either kept or removed, so the difference is the removed count
	removed := len(objects) - len(kept)
	log.DebugTimed("filtered "+strconv.Itoa(removed)+" duplicate track(s) from list", timestamp)
	return kept
}
// toComparisonObject converts the current row of the given result set into a
// comparisonObject. The row is expected to provide the columns id, path,
// fingerprint (as string) and bitrate in exactly this order.
//
// An error is logged and returned if the row cannot be scanned or if the
// fingerprint cannot be parsed; the returned object is the zero value then.
func toComparisonObject(row *sql.Rows) (comparisonObject, error) {
	var object comparisonObject
	var rawFingerprint string
	if err := row.Scan(&object.id, &object.path, &rawFingerprint, &object.bitrate); err != nil {
		log.Error("encountered an error scanning a track row", err.Error())
		return comparisonObject{}, err
	}
	fingerprint, err := metadata.FingerprintFromString(rawFingerprint)
	if err != nil {
		log.Error("encountered an error parsing the audio fingerprint for file '" + object.path + "' from 'string' to '[]int32'")
		return comparisonObject{}, err
	}
	object.fingerprint = fingerprint.Value
	return object, nil
}
// getDuplicate compares the fingerprints of both tracks and returns a
// Duplicate describing the pair if the score reaches
// config.DuplicatesFingerprintThreshold(). Otherwise the zero value is
// returned.
//
// The track with the strictly higher bitrate becomes the "good" file; on equal
// bitrates comparisonObject is treated as the good one. The id of the
// Duplicate is always the id of the "bad" track.
func getDuplicate(object *comparisonObject, comparisonObject *comparisonObject) Duplicate {
	score := metadata.CompareWith(object.fingerprint, comparisonObject.fingerprint)
	if score < config.DuplicatesFingerprintThreshold() {
		return Duplicate{}
	}
	// default: the second track wins, unless the first one has the higher bitrate
	good, bad := comparisonObject, object
	if object.bitrate > comparisonObject.bitrate {
		good, bad = object, comparisonObject
	}
	return Duplicate{id: bad.id, good: good.path, bad: bad.path, score: score}
}
// isValid reports whether the Duplicate carries a positive track id. The zero
// value of Duplicate is therefore not valid.
func (d *Duplicate) isValid() bool {
	return d.id > 0
}
// struct(s)
// Duplicate describes one detected pair of duplicate tracks.
// The zero value (id == 0) is what getDuplicate returns for tracks that do not match;
// isValid treats it as "no duplicate".
type Duplicate struct {
// id of the track whose path is stored in bad
id int
// path of the track preferred by the bitrate comparison in getDuplicate
good string
// path of the other track of the pair
bad string
// fingerprint comparison score as returned by metadata.CompareWith
score float64
}
// comparisonObject holds the values of a single track row that are needed
// to compare it against other tracks (filled by toComparisonObject).
type comparisonObject struct {
// id column of the track row
id int
// path column of the track row; its extension is used for the format mismatch check
path string
// audio fingerprint parsed from the fingerprint column via metadata.FingerprintFromString
fingerprint []int32
// bitrate column of the track row; decides which track of a duplicate pair is the good one
bitrate int
}