github.com/pbberlin/tools@v0.0.0-20160910141205-7aa5421c2169/text/levenshtein/levenshtein.go (about)

     1  // Package levenshtein core computes the edit distance of two slices of tokens,
     2  // of slim interface type Equaler; subpackages provide various granularity.
     3  // Tokens must be of interface type <Equaler> - implementing (tok) Equal(tok) bool.
     4  // An edit script for converting slice1 to slice2 can also be derived.
     5  // Preference for substitution over insertion/deletion is configurable.
     6  package levenshtein
     7  
     8  import (
     9  	"fmt"
    10  	"strings"
    11  
    12  	"github.com/pbberlin/tools/stringspb"
    13  	"github.com/pbberlin/tools/util"
    14  )
    15  
    16  const cl = 11 // column length for Print funcs
    17  
    18  // Equaler is the neccessary interface to compute the levenshtein distance.
    19  type Equaler interface {
    20  	Equal(compare2 interface{}) bool
    21  }
    22  
    23  // works also for idx == -1
    24  func insertAfter(s []Equaler, idx int, newVal Equaler) []Equaler {
    25  	if idx > len(s)-1 {
    26  		panic("Cannot insert beyond existing length")
    27  	}
    28  	s = append(s, nil)
    29  	copy(s[idx+2:], s[idx+1:])
    30  	s[idx+1] = newVal
    31  	return s
    32  }
    33  
    34  func deleteAt(s []Equaler, idx int) []Equaler {
    35  	if idx > len(s)-1 {
    36  		panic("Cannot delete beyond existing length")
    37  	}
    38  	copy(s[idx:], s[idx+1:])
    39  	s = s[:len(s)-1]
    40  	return s
    41  }
    42  
    43  // The internal levenshtein matrix is only exported,
    44  // because calling packages need to declare its type.
    45  type Matrix struct {
    46  	mx         [][]int
    47  	rows, cols []Equaler
    48  	opt        Options
    49  }
    50  
    51  // New generates a 2-D array,
    52  // representing the dynamic programming table
    53  // used by the Levenshtein algorithm.
    54  // Compare http://www.let.rug.nl/kleiweg/lev/.
    55  // Matrix can be used for retrieval of edit distance
    56  // and for backtrace scripts
    57  func New(argRows, argCols []Equaler, opt Options) Matrix {
    58  
    59  	// Make a 2-D matrix. Rows correspond to prefixes of source, columns to
    60  	// prefixes of target. Cells will contain edit distances.
    61  	// Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html
    62  	m := Matrix{}
    63  	m.opt = opt
    64  	m.rows = argRows
    65  	m.cols = argCols
    66  	h := len(m.rows) + 1
    67  	w := len(m.cols) + 1
    68  	m.mx = make([][]int, h)
    69  
    70  	// Initialize trivial distances (from/to empty string):
    71  	// Filling the left column and the top row with row/column indices.
    72  	for i := 0; i < h; i++ {
    73  		m.mx[i] = make([]int, w)
    74  		m.mx[i][0] = i
    75  	}
    76  	for j := 1; j < w; j++ {
    77  		m.mx[0][j] = j
    78  	}
    79  
    80  	// Filling the remaining cells:
    81  	// 	For each prefix pair:
    82  	// 		Choose couple {edit history, operation} with lowest cost.
    83  	for i := 1; i < h; i++ {
    84  		for j := 1; j < w; j++ {
    85  			delCost := m.mx[i-1][j] + opt.DelCost
    86  			matchSubCost := m.mx[i-1][j-1]
    87  			if !(m.rows[i-1]).Equal(m.cols[j-1]) {
    88  				matchSubCost += opt.SubCost
    89  			}
    90  			insCost := m.mx[i][j-1] + opt.InsCost
    91  			m.mx[i][j] = min(delCost, min(matchSubCost, insCost))
    92  		}
    93  	}
    94  
    95  	return m
    96  }
    97  
    98  func min(a int, b int) int {
    99  	if b < a {
   100  		return b
   101  	}
   102  	return a
   103  }
   104  
   105  // Distance returns levenshtein edit distance for the two slices of tokens of m.
   106  func (m *Matrix) Distance() (int, float64) {
   107  
   108  	dist := m.mx[len(m.mx)-1][len(m.mx[0])-1]
   109  
   110  	relDist := 0.0
   111  
   112  	ls1, ls2 := len(m.mx), len(m.mx[0])
   113  
   114  	// First relDist computation:
   115  	// 		1.) compensated for the size
   116  	// 		2.) related to the smaller slice
   117  	// Can lead to Zero, when diff == dist
   118  	diff := util.Abs(ls1 - ls2)
   119  	if ls1 >= ls2 { // row > col
   120  		relDist = float64(dist-diff) / float64(ls2)
   121  	} else {
   122  		relDist = float64(dist-diff) / float64(ls1)
   123  	}
   124  
   125  	// Second relDist: Simply related to the larger slice.
   126  	// Also account for ls1 and ls2 being one larger than the practical number of tokens.
   127  	divisor := float64(ls1)
   128  	if ls2 > ls1 { // row > col
   129  		divisor = float64(ls2)
   130  	}
   131  	divisor--
   132  	if divisor == 0.0 {
   133  		divisor = 1.0
   134  	}
   135  	relDist = float64(dist) / divisor
   136  	if relDist == 0.25 {
   137  		fmt.Printf("dist %v - ls1 %v - relDist %5.2v\n", dist, divisor, relDist)
   138  	}
   139  
   140  	return dist, relDist
   141  }
   142  
   143  // EditScript returns an optimal edit script for an existing matrix.
   144  func (m *Matrix) EditScript() TEditScrpt {
   145  	return m.backtrace(len(m.mx)-1, len(m.mx[0])-1)
   146  }
   147  
   148  // backtrace is recursive.
   149  // It starts bottom right and steps left/top/lefttop
   150  func (m *Matrix) backtrace(i, j int) TEditScrpt {
   151  
   152  	pf := func(str string) {}
   153  	// pf := fmt.Printf
   154  
   155  	mx := m.mx
   156  	opt := m.opt
   157  	eo := EditOpExt{}
   158  
   159  	if i > 0 && mx[i-1][j]+opt.DelCost == mx[i][j] {
   160  		pf("c1")
   161  		eo.op = Del
   162  		eo.src = i - 1
   163  		eo.dst = j
   164  		return append(m.backtrace(i-1, j), eo)
   165  	}
   166  	if j > 0 && mx[i][j-1]+opt.InsCost == mx[i][j] {
   167  		pf("c2")
   168  		eo.op = Ins
   169  		eo.src = i
   170  		eo.dst = j - 1
   171  		return append(m.backtrace(i, j-1), eo)
   172  	}
   173  	if i > 0 && j > 0 && mx[i-1][j-1]+opt.SubCost == mx[i][j] {
   174  		pf("c3")
   175  		eo.op = Sub
   176  		eo.src = i - 1
   177  		eo.dst = j - 1
   178  		return append(m.backtrace(i-1, j-1), eo)
   179  	}
   180  	if i > 0 && j > 0 && mx[i-1][j-1] == mx[i][j] {
   181  		pf("c4")
   182  		eo.op = Match
   183  		eo.src = i - 1
   184  		eo.dst = j - 1
   185  		return append(m.backtrace(i-1, j-1), eo)
   186  	}
   187  	pf("c5")
   188  	return []EditOpExt{}
   189  }
   190  
   191  // Print prints a visual representation
   192  // of the slices of tokens and their distance matrix
   193  func (m *Matrix) Print() {
   194  
   195  	rows, cols := m.rows, m.cols
   196  	mx := m.mx
   197  
   198  	fp := fmt.Printf
   199  
   200  	fmt2 := fmt.Sprintf("%s-%vd", "%", cl)
   201  
   202  	fp(strings.Repeat(" ", 2*cl))
   203  	for _, col := range cols {
   204  		scol := fmt.Sprintf("%v", col)
   205  		fp("%v ", stringspb.ToLen(scol, cl-1)) // at least one space right
   206  	}
   207  	fp("\n")
   208  
   209  	fp(strings.Repeat(" ", cl))
   210  	fp(fmt2, mx[0][0])
   211  	for j, _ := range cols {
   212  		fp(fmt2, mx[0][j+1])
   213  	}
   214  	fp("\n")
   215  
   216  	//
   217  	for i, row := range rows {
   218  		srow := fmt.Sprintf("%v", row)
   219  		fp("%v ", stringspb.ToLen(srow, cl-1)) // at least one space right
   220  		fp(fmt2, mx[i+1][0])
   221  		for j, _ := range cols {
   222  			fp(fmt2, mx[i+1][j+1])
   223  		}
   224  		fp("\n")
   225  	}
   226  	// fp("\n")
   227  }
   228  
   229  // ApplyEditScript applies the given Editscript
   230  // to the first slice of tokens of m.
   231  // The returned slice should be equal
   232  // to the second slice of tokens of m.
   233  func (m *Matrix) ApplyEditScript(es TEditScrpt) []Equaler {
   234  
   235  	sumIns := 0
   236  	sumDel := 0
   237  	fmt2 := fmt.Sprintf("%s-%vv", "%", cl)
   238  
   239  	rows2 := make([]Equaler, 0, len(m.rows))
   240  	for _, v := range m.rows {
   241  		rows2 = append(rows2, v)
   242  	}
   243  
   244  	const offs = 1
   245  	fmt.Printf("%v", strings.Repeat(" ", 2*cl))
   246  	for _, v := range es {
   247  
   248  		s := fmt.Sprintf("%v-%v-%v", v.op, offs+v.src+sumIns-sumDel, offs+v.dst)
   249  		fmt.Printf(fmt2, s)
   250  
   251  		pos := v.src + sumIns - sumDel
   252  
   253  		if v.op == Ins {
   254  			// rows2 = insertAfter(rows2, util.Min(pos, len(rows2)-1), m.cols[v.dst])
   255  			rows2 = insertAfter(rows2, pos-1, m.cols[v.dst])
   256  			sumIns++
   257  		}
   258  
   259  		if v.op == Del {
   260  			rows2 = deleteAt(rows2, pos)
   261  			sumDel++
   262  		}
   263  
   264  		if v.op == Sub {
   265  			rows2[pos] = m.cols[v.dst]
   266  		}
   267  	}
   268  	fmt.Printf("\n")
   269  
   270  	fmt.Printf("%v", strings.Repeat(" ", 2*cl))
   271  	for _, row := range rows2 {
   272  		fmt.Printf(fmt2, row)
   273  	}
   274  	return rows2
   275  
   276  }
   277  
   278  // CompareToCol takes a slice of Equaler-Tokens
   279  // and compares them against the second matrix slice.
   280  func (m *Matrix) CompareToCol(col2 []Equaler) bool {
   281  	equal := true
   282  	for idx, v := range m.cols {
   283  		if v != col2[idx] {
   284  			equal = false
   285  			break
   286  		}
   287  	}
   288  	return equal
   289  }