photoprism/pkg/clusters/kmeans_estimator.go

251 lines
4.4 KiB
Go

package clusters
import (
"math"
"math/rand"
"time"
"gonum.org/v1/gonum/floats"
)
type kmeansEstimator struct {
iterations, number, max int
// variables keeping count of changes of points' membership every iteration. User as a stopping condition.
changes, oldchanges, counter, threshold int
distance DistanceFunc
a, b []int
// slices holding values of centroids of each clusters
m, n [][]float64
// dataset
d [][]float64
}
// Implementation of cluster number estimator using gap statistic
// ("Estimating the number of clusters in a data set via the gap statistic", Tibshirani et al.) with k-means++ as
// clustering algorithm
func KMeansEstimator(iterations, clusters int, distance DistanceFunc) (Estimator, error) {
if iterations < 1 {
return nil, errZeroIterations
}
if clusters < 2 {
return nil, errOneCluster
}
var d DistanceFunc
{
if distance != nil {
d = distance
} else {
d = EuclideanDistance
}
}
return &kmeansEstimator{
iterations: iterations,
max: clusters,
distance: d,
}, nil
}
func (c *kmeansEstimator) Estimate(data [][]float64) (int, error) {
if len(data) == 0 {
return 0, errEmptySet
}
var (
estimated = 0
size = len(data)
bounds = bounds(data)
wks = make([]float64, c.max)
wkbs = make([]float64, c.max)
sk = make([]float64, c.max)
one = make([]float64, c.max)
bwkbs = make([]float64, c.max)
)
for i := 0; i < c.max; i++ {
c.number = i + 1
c.learn(data)
wks[i] = math.Log(c.wk(c.d, c.m, c.a))
for j := 0; j < c.max; j++ {
c.learn(c.buildRandomizedSet(size, bounds))
bwkbs[j] = math.Log(c.wk(c.d, c.m, c.a))
one[j] = 1
}
wkbs[i] = floats.Sum(bwkbs) / float64(c.max)
floats.Scale(wkbs[i], one)
floats.Sub(bwkbs, one)
floats.Mul(bwkbs, bwkbs)
sk[i] = math.Sqrt(floats.Sum(bwkbs) / float64(c.max))
}
floats.Scale(math.Sqrt(1+(1/float64(c.max))), sk)
for i := 0; i < c.max-1; i++ {
if wkbs[i] >= wkbs[i+1]-sk[i+1] {
estimated = i + 1
break
}
}
return estimated, nil
}
// private
func (c *kmeansEstimator) learn(data [][]float64) {
c.d = data
c.a = make([]int, len(data))
c.b = make([]int, c.number)
c.counter = 0
c.threshold = changesThreshold
c.changes = 0
c.oldchanges = 0
c.initializeMeansWithData()
for i := 0; i < c.iterations && c.counter != c.threshold; i++ {
c.run()
c.check()
}
}
func (c *kmeansEstimator) initializeMeansWithData() {
c.m = make([][]float64, c.number)
c.n = make([][]float64, c.number)
rand.Seed(time.Now().UTC().Unix())
var (
k int
s, t, l, f float64
d []float64 = make([]float64, len(c.d))
)
c.m[0] = c.d[rand.Intn(len(c.d)-1)]
for i := 1; i < c.number; i++ {
s = 0
t = 0
for j := 0; j < len(c.d); j++ {
l = c.distance(c.m[0], c.d[j])
for g := 1; g < i; g++ {
if f = c.distance(c.m[g], c.d[j]); f < l {
l = f
}
}
d[j] = math.Pow(l, 2)
s += d[j]
}
t = rand.Float64() * s
k = 0
for s = d[0]; s < t; s += d[k] {
k++
}
c.m[i] = c.d[k]
}
for i := 0; i < c.number; i++ {
c.n[i] = make([]float64, len(c.m[0]))
}
}
func (c *kmeansEstimator) run() {
var (
l, k, n int = len(c.m[0]), 0, 0
m, d float64
)
for i := 0; i < c.number; i++ {
c.b[i] = 0
}
for i := 0; i < len(c.d); i++ {
m = c.distance(c.d[i], c.m[0])
n = 0
for j := 1; j < c.number; j++ {
if d = c.distance(c.d[i], c.m[j]); d < m {
m = d
n = j
}
}
k = n + 1
if c.a[i] != k {
c.changes++
}
c.a[i] = k
c.b[n]++
floats.Add(c.n[n], c.d[i])
}
for i := 0; i < c.number; i++ {
floats.Scale(1/float64(c.b[i]), c.n[i])
for j := 0; j < l; j++ {
c.m[i][j] = c.n[i][j]
c.n[i][j] = 0
}
}
}
func (c *kmeansEstimator) check() {
if c.changes == c.oldchanges {
c.counter++
}
c.oldchanges = c.changes
}
func (c *kmeansEstimator) wk(data [][]float64, centroids [][]float64, mapping []int) float64 {
var (
l = float64(2 * len(data[0]))
wk = make([]float64, len(centroids))
)
for i := 0; i < len(mapping); i++ {
wk[mapping[i]-1] += EuclideanDistanceSquared(centroids[mapping[i]-1], data[i]) / l
}
return floats.Sum(wk)
}
func (c *kmeansEstimator) buildRandomizedSet(size int, bounds []*[2]float64) [][]float64 {
var (
l = len(bounds)
r = make([][]float64, size)
)
for i := 0; i < size; i++ {
r[i] = make([]float64, l)
for j := 0; j < l; j++ {
r[i][j] = uniform(bounds[j])
}
}
return r
}