2021-08-19 14:24:30 +02:00
|
|
|
package clusters
|
|
|
|
|
|
|
|
import (
|
|
|
|
"math"
|
|
|
|
"math/rand"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"gonum.org/v1/gonum/floats"
|
|
|
|
)
|
|
|
|
|
|
|
|
type kmeansEstimator struct {
|
|
|
|
iterations, number, max int
|
|
|
|
|
|
|
|
// variables keeping count of changes of points' membership every iteration. User as a stopping condition.
|
|
|
|
changes, oldchanges, counter, threshold int
|
|
|
|
|
2022-04-03 17:25:37 +02:00
|
|
|
distance DistFunc
|
2021-08-19 14:24:30 +02:00
|
|
|
|
|
|
|
a, b []int
|
|
|
|
|
|
|
|
// slices holding values of centroids of each clusters
|
|
|
|
m, n [][]float64
|
|
|
|
|
|
|
|
// dataset
|
|
|
|
d [][]float64
|
|
|
|
}
|
|
|
|
|
|
|
|
// Implementation of cluster number estimator using gap statistic
|
|
|
|
// ("Estimating the number of clusters in a data set via the gap statistic", Tibshirani et al.) with k-means++ as
|
|
|
|
// clustering algorithm
|
2022-04-03 17:25:37 +02:00
|
|
|
func KMeansEstimator(iterations, clusters int, distance DistFunc) (Estimator, error) {
|
2021-08-19 14:24:30 +02:00
|
|
|
if iterations < 1 {
|
|
|
|
return nil, errZeroIterations
|
|
|
|
}
|
|
|
|
|
|
|
|
if clusters < 2 {
|
|
|
|
return nil, errOneCluster
|
|
|
|
}
|
|
|
|
|
2022-04-03 17:25:37 +02:00
|
|
|
var d DistFunc
|
2021-08-19 14:24:30 +02:00
|
|
|
{
|
|
|
|
if distance != nil {
|
|
|
|
d = distance
|
|
|
|
} else {
|
2022-04-03 17:25:37 +02:00
|
|
|
d = EuclideanDist
|
2021-08-19 14:24:30 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return &kmeansEstimator{
|
|
|
|
iterations: iterations,
|
|
|
|
max: clusters,
|
|
|
|
distance: d,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *kmeansEstimator) Estimate(data [][]float64) (int, error) {
|
|
|
|
if len(data) == 0 {
|
|
|
|
return 0, errEmptySet
|
|
|
|
}
|
|
|
|
|
|
|
|
var (
|
|
|
|
estimated = 0
|
|
|
|
size = len(data)
|
|
|
|
bounds = bounds(data)
|
|
|
|
wks = make([]float64, c.max)
|
|
|
|
wkbs = make([]float64, c.max)
|
|
|
|
sk = make([]float64, c.max)
|
|
|
|
one = make([]float64, c.max)
|
|
|
|
bwkbs = make([]float64, c.max)
|
|
|
|
)
|
|
|
|
|
|
|
|
for i := 0; i < c.max; i++ {
|
|
|
|
c.number = i + 1
|
|
|
|
|
|
|
|
c.learn(data)
|
|
|
|
|
|
|
|
wks[i] = math.Log(c.wk(c.d, c.m, c.a))
|
|
|
|
|
|
|
|
for j := 0; j < c.max; j++ {
|
|
|
|
c.learn(c.buildRandomizedSet(size, bounds))
|
|
|
|
|
|
|
|
bwkbs[j] = math.Log(c.wk(c.d, c.m, c.a))
|
|
|
|
one[j] = 1
|
|
|
|
}
|
|
|
|
|
|
|
|
wkbs[i] = floats.Sum(bwkbs) / float64(c.max)
|
|
|
|
|
|
|
|
floats.Scale(wkbs[i], one)
|
|
|
|
floats.Sub(bwkbs, one)
|
|
|
|
floats.Mul(bwkbs, bwkbs)
|
|
|
|
|
|
|
|
sk[i] = math.Sqrt(floats.Sum(bwkbs) / float64(c.max))
|
|
|
|
}
|
|
|
|
|
|
|
|
floats.Scale(math.Sqrt(1+(1/float64(c.max))), sk)
|
|
|
|
|
|
|
|
for i := 0; i < c.max-1; i++ {
|
|
|
|
if wkbs[i] >= wkbs[i+1]-sk[i+1] {
|
|
|
|
estimated = i + 1
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return estimated, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// private
|
|
|
|
func (c *kmeansEstimator) learn(data [][]float64) {
|
|
|
|
c.d = data
|
|
|
|
|
|
|
|
c.a = make([]int, len(data))
|
|
|
|
c.b = make([]int, c.number)
|
|
|
|
|
|
|
|
c.counter = 0
|
|
|
|
c.threshold = changesThreshold
|
|
|
|
c.changes = 0
|
|
|
|
c.oldchanges = 0
|
|
|
|
|
|
|
|
c.initializeMeansWithData()
|
|
|
|
|
|
|
|
for i := 0; i < c.iterations && c.counter != c.threshold; i++ {
|
|
|
|
c.run()
|
|
|
|
c.check()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *kmeansEstimator) initializeMeansWithData() {
|
|
|
|
c.m = make([][]float64, c.number)
|
|
|
|
c.n = make([][]float64, c.number)
|
|
|
|
|
|
|
|
rand.Seed(time.Now().UTC().Unix())
|
|
|
|
|
|
|
|
var (
|
|
|
|
k int
|
|
|
|
s, t, l, f float64
|
|
|
|
d []float64 = make([]float64, len(c.d))
|
|
|
|
)
|
|
|
|
|
|
|
|
c.m[0] = c.d[rand.Intn(len(c.d)-1)]
|
|
|
|
|
|
|
|
for i := 1; i < c.number; i++ {
|
|
|
|
s = 0
|
|
|
|
t = 0
|
|
|
|
for j := 0; j < len(c.d); j++ {
|
|
|
|
|
|
|
|
l = c.distance(c.m[0], c.d[j])
|
|
|
|
for g := 1; g < i; g++ {
|
|
|
|
if f = c.distance(c.m[g], c.d[j]); f < l {
|
|
|
|
l = f
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
d[j] = math.Pow(l, 2)
|
|
|
|
s += d[j]
|
|
|
|
}
|
|
|
|
|
|
|
|
t = rand.Float64() * s
|
|
|
|
k = 0
|
|
|
|
for s = d[0]; s < t; s += d[k] {
|
|
|
|
k++
|
|
|
|
}
|
|
|
|
|
|
|
|
c.m[i] = c.d[k]
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := 0; i < c.number; i++ {
|
|
|
|
c.n[i] = make([]float64, len(c.m[0]))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *kmeansEstimator) run() {
|
|
|
|
var (
|
|
|
|
l, k, n int = len(c.m[0]), 0, 0
|
|
|
|
m, d float64
|
|
|
|
)
|
|
|
|
|
|
|
|
for i := 0; i < c.number; i++ {
|
|
|
|
c.b[i] = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := 0; i < len(c.d); i++ {
|
|
|
|
m = c.distance(c.d[i], c.m[0])
|
|
|
|
n = 0
|
|
|
|
|
|
|
|
for j := 1; j < c.number; j++ {
|
|
|
|
if d = c.distance(c.d[i], c.m[j]); d < m {
|
|
|
|
m = d
|
|
|
|
n = j
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
k = n + 1
|
|
|
|
|
|
|
|
if c.a[i] != k {
|
|
|
|
c.changes++
|
|
|
|
}
|
|
|
|
|
|
|
|
c.a[i] = k
|
|
|
|
c.b[n]++
|
|
|
|
|
|
|
|
floats.Add(c.n[n], c.d[i])
|
|
|
|
}
|
|
|
|
|
|
|
|
for i := 0; i < c.number; i++ {
|
|
|
|
floats.Scale(1/float64(c.b[i]), c.n[i])
|
|
|
|
|
|
|
|
for j := 0; j < l; j++ {
|
|
|
|
c.m[i][j] = c.n[i][j]
|
|
|
|
c.n[i][j] = 0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *kmeansEstimator) check() {
|
|
|
|
if c.changes == c.oldchanges {
|
|
|
|
c.counter++
|
|
|
|
}
|
|
|
|
|
|
|
|
c.oldchanges = c.changes
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *kmeansEstimator) wk(data [][]float64, centroids [][]float64, mapping []int) float64 {
|
|
|
|
var (
|
|
|
|
l = float64(2 * len(data[0]))
|
|
|
|
wk = make([]float64, len(centroids))
|
|
|
|
)
|
|
|
|
|
|
|
|
for i := 0; i < len(mapping); i++ {
|
2022-04-03 17:25:37 +02:00
|
|
|
wk[mapping[i]-1] += EuclideanDistSquared(centroids[mapping[i]-1], data[i]) / l
|
2021-08-19 14:24:30 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return floats.Sum(wk)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *kmeansEstimator) buildRandomizedSet(size int, bounds []*[2]float64) [][]float64 {
|
|
|
|
var (
|
|
|
|
l = len(bounds)
|
|
|
|
r = make([][]float64, size)
|
|
|
|
)
|
|
|
|
|
|
|
|
for i := 0; i < size; i++ {
|
|
|
|
r[i] = make([]float64, l)
|
|
|
|
|
|
|
|
for j := 0; j < l; j++ {
|
|
|
|
r[i][j] = uniform(bounds[j])
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return r
|
|
|
|
}
|