github.com/pbberlin/tools@v0.0.0-20160910141205-7aa5421c2169/text/levenshtein/levenshtein.go (about) 1 // Package levenshtein core computes the edit distance of two slices of tokens, 2 // of slim interface type Equaler; subpackages provide various granularity. 3 // Tokens must be of interface type <Equaler> - implementing (tok) Equal(tok) bool. 4 // An edit script for converting slice1 to slice2 can also be derived. 5 // Preference for substitution over insertion/deletion is configurable. 6 package levenshtein 7 8 import ( 9 "fmt" 10 "strings" 11 12 "github.com/pbberlin/tools/stringspb" 13 "github.com/pbberlin/tools/util" 14 ) 15 16 const cl = 11 // column length for Print funcs 17 18 // Equaler is the neccessary interface to compute the levenshtein distance. 19 type Equaler interface { 20 Equal(compare2 interface{}) bool 21 } 22 23 // works also for idx == -1 24 func insertAfter(s []Equaler, idx int, newVal Equaler) []Equaler { 25 if idx > len(s)-1 { 26 panic("Cannot insert beyond existing length") 27 } 28 s = append(s, nil) 29 copy(s[idx+2:], s[idx+1:]) 30 s[idx+1] = newVal 31 return s 32 } 33 34 func deleteAt(s []Equaler, idx int) []Equaler { 35 if idx > len(s)-1 { 36 panic("Cannot delete beyond existing length") 37 } 38 copy(s[idx:], s[idx+1:]) 39 s = s[:len(s)-1] 40 return s 41 } 42 43 // The internal levenshtein matrix is only exported, 44 // because calling packages need to declare its type. 45 type Matrix struct { 46 mx [][]int 47 rows, cols []Equaler 48 opt Options 49 } 50 51 // New generates a 2-D array, 52 // representing the dynamic programming table 53 // used by the Levenshtein algorithm. 54 // Compare http://www.let.rug.nl/kleiweg/lev/. 55 // Matrix can be used for retrieval of edit distance 56 // and for backtrace scripts 57 func New(argRows, argCols []Equaler, opt Options) Matrix { 58 59 // Make a 2-D matrix. Rows correspond to prefixes of source, columns to 60 // prefixes of target. Cells will contain edit distances. 61 // Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html 62 m := Matrix{} 63 m.opt = opt 64 m.rows = argRows 65 m.cols = argCols 66 h := len(m.rows) + 1 67 w := len(m.cols) + 1 68 m.mx = make([][]int, h) 69 70 // Initialize trivial distances (from/to empty string): 71 // Filling the left column and the top row with row/column indices. 72 for i := 0; i < h; i++ { 73 m.mx[i] = make([]int, w) 74 m.mx[i][0] = i 75 } 76 for j := 1; j < w; j++ { 77 m.mx[0][j] = j 78 } 79 80 // Filling the remaining cells: 81 // For each prefix pair: 82 // Choose couple {edit history, operation} with lowest cost. 83 for i := 1; i < h; i++ { 84 for j := 1; j < w; j++ { 85 delCost := m.mx[i-1][j] + opt.DelCost 86 matchSubCost := m.mx[i-1][j-1] 87 if !(m.rows[i-1]).Equal(m.cols[j-1]) { 88 matchSubCost += opt.SubCost 89 } 90 insCost := m.mx[i][j-1] + opt.InsCost 91 m.mx[i][j] = min(delCost, min(matchSubCost, insCost)) 92 } 93 } 94 95 return m 96 } 97 98 func min(a int, b int) int { 99 if b < a { 100 return b 101 } 102 return a 103 } 104 105 // Distance returns levenshtein edit distance for the two slices of tokens of m. 106 func (m *Matrix) Distance() (int, float64) { 107 108 dist := m.mx[len(m.mx)-1][len(m.mx[0])-1] 109 110 relDist := 0.0 111 112 ls1, ls2 := len(m.mx), len(m.mx[0]) 113 114 // First relDist computation: 115 // 1.) compensated for the size 116 // 2.) related to the smaller slice 117 // Can lead to Zero, when diff == dist 118 diff := util.Abs(ls1 - ls2) 119 if ls1 >= ls2 { // row > col 120 relDist = float64(dist-diff) / float64(ls2) 121 } else { 122 relDist = float64(dist-diff) / float64(ls1) 123 } 124 125 // Second relDist: Simply related to the larger slice. 126 // Also account for ls1 and ls2 being one larger than the practical number of tokens. 127 divisor := float64(ls1) 128 if ls2 > ls1 { // row > col 129 divisor = float64(ls2) 130 } 131 divisor-- 132 if divisor == 0.0 { 133 divisor = 1.0 134 } 135 relDist = float64(dist) / divisor 136 if relDist == 0.25 { 137 fmt.Printf("dist %v - ls1 %v - relDist %5.2v\n", dist, divisor, relDist) 138 } 139 140 return dist, relDist 141 } 142 143 // EditScript returns an optimal edit script for an existing matrix. 144 func (m *Matrix) EditScript() TEditScrpt { 145 return m.backtrace(len(m.mx)-1, len(m.mx[0])-1) 146 } 147 148 // backtrace is recursive. 149 // It starts bottom right and steps left/top/lefttop 150 func (m *Matrix) backtrace(i, j int) TEditScrpt { 151 152 pf := func(str string) {} 153 // pf := fmt.Printf 154 155 mx := m.mx 156 opt := m.opt 157 eo := EditOpExt{} 158 159 if i > 0 && mx[i-1][j]+opt.DelCost == mx[i][j] { 160 pf("c1") 161 eo.op = Del 162 eo.src = i - 1 163 eo.dst = j 164 return append(m.backtrace(i-1, j), eo) 165 } 166 if j > 0 && mx[i][j-1]+opt.InsCost == mx[i][j] { 167 pf("c2") 168 eo.op = Ins 169 eo.src = i 170 eo.dst = j - 1 171 return append(m.backtrace(i, j-1), eo) 172 } 173 if i > 0 && j > 0 && mx[i-1][j-1]+opt.SubCost == mx[i][j] { 174 pf("c3") 175 eo.op = Sub 176 eo.src = i - 1 177 eo.dst = j - 1 178 return append(m.backtrace(i-1, j-1), eo) 179 } 180 if i > 0 && j > 0 && mx[i-1][j-1] == mx[i][j] { 181 pf("c4") 182 eo.op = Match 183 eo.src = i - 1 184 eo.dst = j - 1 185 return append(m.backtrace(i-1, j-1), eo) 186 } 187 pf("c5") 188 return []EditOpExt{} 189 } 190 191 // Print prints a visual representation 192 // of the slices of tokens and their distance matrix 193 func (m *Matrix) Print() { 194 195 rows, cols := m.rows, m.cols 196 mx := m.mx 197 198 fp := fmt.Printf 199 200 fmt2 := fmt.Sprintf("%s-%vd", "%", cl) 201 202 fp(strings.Repeat(" ", 2*cl)) 203 for _, col := range cols { 204 scol := fmt.Sprintf("%v", col) 205 fp("%v ", stringspb.ToLen(scol, cl-1)) // at least one space right 206 } 207 fp("\n") 208 209 fp(strings.Repeat(" ", cl)) 210 fp(fmt2, mx[0][0]) 211 for j, _ := range cols { 212 fp(fmt2, mx[0][j+1]) 213 } 214 fp("\n") 215 216 // 217 for i, row := range rows { 218 srow := fmt.Sprintf("%v", row) 219 fp("%v ", stringspb.ToLen(srow, cl-1)) // at least one space right 220 fp(fmt2, mx[i+1][0]) 221 for j, _ := range cols { 222 fp(fmt2, mx[i+1][j+1]) 223 } 224 fp("\n") 225 } 226 // fp("\n") 227 } 228 229 // ApplyEditScript applies the given Editscript 230 // to the first slice of tokens of m. 231 // The returned slice should be equal 232 // to the second slice of tokens of m. 233 func (m *Matrix) ApplyEditScript(es TEditScrpt) []Equaler { 234 235 sumIns := 0 236 sumDel := 0 237 fmt2 := fmt.Sprintf("%s-%vv", "%", cl) 238 239 rows2 := make([]Equaler, 0, len(m.rows)) 240 for _, v := range m.rows { 241 rows2 = append(rows2, v) 242 } 243 244 const offs = 1 245 fmt.Printf("%v", strings.Repeat(" ", 2*cl)) 246 for _, v := range es { 247 248 s := fmt.Sprintf("%v-%v-%v", v.op, offs+v.src+sumIns-sumDel, offs+v.dst) 249 fmt.Printf(fmt2, s) 250 251 pos := v.src + sumIns - sumDel 252 253 if v.op == Ins { 254 // rows2 = insertAfter(rows2, util.Min(pos, len(rows2)-1), m.cols[v.dst]) 255 rows2 = insertAfter(rows2, pos-1, m.cols[v.dst]) 256 sumIns++ 257 } 258 259 if v.op == Del { 260 rows2 = deleteAt(rows2, pos) 261 sumDel++ 262 } 263 264 if v.op == Sub { 265 rows2[pos] = m.cols[v.dst] 266 } 267 } 268 fmt.Printf("\n") 269 270 fmt.Printf("%v", strings.Repeat(" ", 2*cl)) 271 for _, row := range rows2 { 272 fmt.Printf(fmt2, row) 273 } 274 return rows2 275 276 } 277 278 // CompareToCol takes a slice of Equaler-Tokens 279 // and compares them against the second matrix slice. 280 func (m *Matrix) CompareToCol(col2 []Equaler) bool { 281 equal := true 282 for idx, v := range m.cols { 283 if v != col2[idx] { 284 equal = false 285 break 286 } 287 } 288 return equal 289 }