github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/aggexec/algos/kmeans/elkans/initializer_test.go (about)

     1  // Copyright 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package elkans
    16  
    17  import (
    18  	"github.com/matrixorigin/matrixone/pkg/vectorize/moarray"
    19  	"reflect"
    20  	"testing"
    21  )
    22  
    23  func TestRandom_InitCentroids(t *testing.T) {
    24  	type args struct {
    25  		vectors [][]float64
    26  		k       int
    27  	}
    28  	tests := []struct {
    29  		name          string
    30  		args          args
    31  		wantCentroids [][]float64
    32  	}{
    33  		{
    34  			name: "TestRandom_InitCentroids",
    35  			args: args{
    36  				vectors: [][]float64{
    37  					{1, 2, 3, 4},
    38  					{1, 2, 4, 5},
    39  					{1, 2, 4, 5},
    40  					{1, 2, 3, 4},
    41  					{1, 2, 4, 5},
    42  					{1, 2, 4, 5},
    43  					{10, 2, 4, 5},
    44  					{10, 3, 4, 5},
    45  					{10, 5, 4, 5},
    46  					{10, 2, 4, 5},
    47  					{10, 3, 4, 5},
    48  					{10, 5, 4, 5},
    49  				},
    50  				k: 2,
    51  			},
    52  			wantCentroids: [][]float64{
    53  				// NOTE: values of random initialization need not be farther apart, it is random.
    54  				// NOTE: we get the same random values in the test case because we are using a constant seed value.
    55  				{1, 2, 4, 5},
    56  				{1, 2, 3, 4},
    57  			},
    58  		},
    59  	}
    60  	for _, tt := range tests {
    61  		t.Run(tt.name, func(t *testing.T) {
    62  			r := NewRandomInitializer()
    63  			gonumVectors, _ := moarray.ToGonumVectors[float64](tt.args.vectors...)
    64  
    65  			gotCentroids := r.InitCentroids(gonumVectors, tt.args.k)
    66  			if arrays, _ := moarray.ToMoArrays[float64](gotCentroids); !reflect.DeepEqual(arrays, tt.wantCentroids) {
    67  				t.Errorf("InitCentroids() = %v, want %v", arrays, tt.wantCentroids)
    68  			}
    69  
    70  		})
    71  	}
    72  }
    73  
    74  func TestKMeansPlusPlus_InitCentroids(t *testing.T) {
    75  	type args struct {
    76  		vectors [][]float64
    77  		k       int
    78  	}
    79  	tests := []struct {
    80  		name          string
    81  		args          args
    82  		wantCentroids [][]float64
    83  	}{
    84  		{
    85  			name: "TestKMeansPlusPlus_InitCentroids",
    86  			args: args{
    87  				vectors: [][]float64{
    88  					{1, 2, 3, 4},
    89  					{1, 2, 4, 5},
    90  					{1, 2, 4, 5},
    91  					{1, 2, 3, 4},
    92  					{1, 2, 4, 5},
    93  					{1, 2, 4, 5},
    94  					{10, 2, 4, 5},
    95  					{10, 3, 4, 5},
    96  					{10, 5, 4, 5},
    97  					{10, 2, 4, 5},
    98  					{10, 3, 4, 5},
    99  					{10, 5, 4, 5},
   100  				},
   101  				k: 2,
   102  			},
   103  			// Kmeans++ picked the relatively farthest points as the initial centroids
   104  			wantCentroids: [][]float64{
   105  				{1, 2, 4, 5},
   106  				{10, 5, 4, 5},
   107  			},
   108  		},
   109  	}
   110  	for _, tt := range tests {
   111  		t.Run(tt.name, func(t *testing.T) {
   112  			r := NewKMeansPlusPlusInitializer(L2Distance)
   113  			gonumVectors, _ := moarray.ToGonumVectors[float64](tt.args.vectors...)
   114  
   115  			gotCentroids := r.InitCentroids(gonumVectors, tt.args.k)
   116  			if arrays, _ := moarray.ToMoArrays[float64](gotCentroids); !reflect.DeepEqual(arrays, tt.wantCentroids) {
   117  				t.Errorf("InitCentroids() = %v, want %v", arrays, tt.wantCentroids)
   118  			}
   119  		})
   120  	}
   121  }
   122  
   123  /*
   124  date : 2023-11-20
   125  goos: darwin
   126  goarch: arm64
   127  cpu: Apple M2 Pro
   128  rows: 10_000
   129  dims: 1024
   130  k : 10
   131  Benchmark_InitCentroids/RANDOM-10         		108	        10574740 ns/op
   132  Benchmark_InitCentroids/KMEANS++-10       	      1		  1081363458 ns/op
   133  */
   134  func Benchmark_InitCentroids(b *testing.B) {
   135  	rowCnt := 10_000
   136  	dims := 1024
   137  	k := 10
   138  
   139  	data := make([][]float64, rowCnt)
   140  	populateRandData(rowCnt, dims, data)
   141  
   142  	random := NewRandomInitializer()
   143  	kmeanspp := NewKMeansPlusPlusInitializer(L2Distance)
   144  
   145  	b.Run("RANDOM", func(b *testing.B) {
   146  		b.ResetTimer()
   147  		for i := 0; i < b.N; i++ {
   148  			gonumVectors, _ := moarray.ToGonumVectors[float64](data...)
   149  			_ = random.InitCentroids(gonumVectors, k)
   150  		}
   151  	})
   152  
   153  	b.Run("KMEANS++", func(b *testing.B) {
   154  		b.ResetTimer()
   155  		for i := 0; i < b.N; i++ {
   156  			gonumVectors, _ := moarray.ToGonumVectors[float64](data...)
   157  			_ = kmeanspp.InitCentroids(gonumVectors, k)
   158  		}
   159  	})
   160  }