github.com/fluhus/gostuff@v0.4.1-0.20240331134726-be71864f2b5d/clustering/rand.go (about) 1 package clustering 2 3 import ( 4 "fmt" 5 ) 6 7 // AdjustedRandIndex compares 2 taggings of the data for similarity. A score of 8 // 1 means identical, a score of 0 means as good as random, and a negative 9 // score means worse than random. 10 func AdjustedRandIndex(tags1, tags2 []int) float64 { 11 // Check input. 12 if len(tags1) != len(tags2) { 13 panic(fmt.Sprintf("Mismatching lengths: %d, %d", 14 len(tags1), len(tags2))) 15 } 16 17 sets1 := tagsToSets(tags1) 18 sets2 := tagsToSets(tags2) 19 20 r := randIndex(sets1, sets2) 21 e := expectedRandIndex(sets1, sets2) 22 m := maxRandIndex(sets1, sets2) 23 return (r - e) / (m - e) 24 } 25 26 // randIndex returns the RI part of the adjusted index. 27 func randIndex(tags1, tags2 []intSet) float64 { 28 r := 0 29 for _, t1 := range tags1 { 30 for _, t2 := range tags2 { 31 r += choose2(t1.intersect(t2)) 32 } 33 } 34 return float64(r) 35 } 36 37 // expectedRandIndex returns the expected index according to hypergeometrical 38 // distribution. 39 func expectedRandIndex(tags1, tags2 []intSet) float64 { 40 p1 := 0 41 n := 0 42 for _, tags := range tags1 { 43 n += len(tags) 44 p1 += choose2(len(tags)) 45 } 46 p2 := 0 47 for _, tags := range tags2 { 48 p2 += choose2(len(tags)) 49 } 50 p := float64(choose2(n)) 51 return float64(p1) * float64(p2) / p 52 } 53 54 // maxRandIndex returns the maximal possible index. 55 func maxRandIndex(tags1, tags2 []intSet) float64 { 56 p := 0 57 for _, tags := range tags1 { 58 p += choose2(len(tags)) 59 } 60 for _, tags := range tags2 { 61 p += choose2(len(tags)) 62 } 63 return float64(p) / 2 64 } 65 66 func choose2(n int) int { 67 return n * (n - 1) / 2 68 } 69 70 // ----- INT SET -------------------------------------------------------------- 71 72 // intSet is a set of integers. 
type intSet map[int]struct{}

// tagsToSets converts a list of tags to a list of sets of indexes, one set
// for each distinct tag. Each set holds the positions in tags that carry that
// tag.
//
// The sets are returned in order of first appearance of their tag, so the
// result is deterministic for a given input. An empty input yields an empty,
// non-nil slice.
func tagsToSets(tags []int) []intSet {
	// position maps a tag to the index of its set in result.
	position := map[int]int{}
	result := []intSet{}

	for i, tag := range tags {
		pos, seen := position[tag]
		if !seen {
			pos = len(result)
			position[tag] = pos
			result = append(result, intSet{})
		}
		result[pos].add(i)
	}

	return result
}

// add adds a number to the set. The set must be non-nil.
func (is intSet) add(i int) {
	is[i] = struct{}{}
}

// contains checks if a set contains the given element.
func (is intSet) contains(i int) bool {
	_, ok := is[i]
	return ok
}

// intersect returns the size of the intersection of the 2 sets.
func (is intSet) intersect(other intSet) int {
	// Scan the smaller set and probe the larger one, so the cost is
	// proportional to the smaller of the two sizes.
	small, large := is, other
	if len(large) < len(small) {
		small, large = large, small
	}

	result := 0
	for i := range small {
		if large.contains(i) {
			result++
		}
	}
	return result
}