github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/5_similar_texts.go (about)

     1  package dedup
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"strings"
     7  
     8  	"github.com/pbberlin/tools/os/fsi"
     9  	"github.com/pbberlin/tools/os/fsi/common"
    10  	"github.com/pbberlin/tools/stringspb"
    11  	"github.com/pbberlin/tools/text/levenshtein"
    12  	"github.com/pbberlin/tools/text/levenshtein/wordb"
    13  	"github.com/pbberlin/tools/util"
    14  )
    15  
    16  var opt = levenshtein.Options{1, 1, 1} // cheap substitution
    17  
    18  var levelsToProcess = map[int]bool{1: true}
    19  
    20  var levelsTolerance = 0
    21  
    22  const excerptLen = 20
    23  
    24  var appliedLevenshtein = 0
    25  var appliedCompare = 0
    26  var breakMapsTooDistinct = 0
    27  
    28  func similarTextifiedTrees(mp map[string][]*TextifiedTree, skipPrefix map[string]bool, onlyKeys map[string]bool) []TextifiedTree {
    29  
    30  	pf = pfDevNull
    31  	defer func() { pf = pfRestore }()
    32  
    33  	frags := []TextifiedTree{}
    34  
    35  	for fnKey, tts := range mp {
    36  		if !onlyKeys[fnKey] {
    37  			continue
    38  		}
    39  		pf("%v\n", fnKey)
    40  
    41  	MarkX:
    42  		for _, tt := range tts {
    43  
    44  			if !levelsToProcess[tt.Lvl] {
    45  				continue
    46  			}
    47  
    48  			outls := strings.Split(tt.Outline, ".")
    49  			for i := 0; i < len(outls)-1; i++ {
    50  				jn := strings.Join(outls[0:i+1], ".") + "."
    51  				if skipPrefix[jn] {
    52  					// log.Printf("  %-8v contains %-6v => skip\n", tt.Outline, jn)
    53  					continue MarkX
    54  				} else {
    55  					// log.Printf("  %-8v proccessing ...\n", tt.Outline)
    56  				}
    57  
    58  			}
    59  
    60  			similarTextifiedTrees2(tt, mp, skipPrefix)
    61  			if len(tt.Similars) > 0 {
    62  				frags = append(frags, *tt)
    63  			}
    64  		}
    65  	}
    66  
    67  	return frags
    68  }
    69  
    70  func similarTextifiedTrees2(src *TextifiedTree, mp map[string][]*TextifiedTree, skipPrefix map[string]bool) {
    71  
    72  	// srcE := word.WrapAsEqualer(string(src.Text), true) // ssrc as Equaler
    73  	srcE := wordb.WrapAsEqualer(src.Text, true)
    74  	srcLen := float64(len(src.Text))
    75  
    76  	for fnKey, tts := range mp {
    77  
    78  		if fnKey == src.SourceID {
    79  			pf("    to %v SKIP self\n", fnKey)
    80  			continue
    81  		}
    82  
    83  		pf("    to %v\n", fnKey)
    84  
    85  		cntr, br := 0, true
    86  		for _, tt := range tts {
    87  			// outl, text := tt.Outl, tt.Text
    88  
    89  			if tt.Lvl > src.Lvl+levelsTolerance {
    90  				break // since we are now sorted by lvl, we can this is safe
    91  			}
    92  
    93  			if tt.Lvl == src.Lvl ||
    94  				(tt.Lvl > src.Lvl && tt.Lvl <= src.Lvl+levelsTolerance) {
    95  				// proceed
    96  			} else {
    97  				continue
    98  			}
    99  
   100  			if src.NumTokens < 1 {
   101  				continue
   102  			}
   103  
   104  			if src.NumTokens < 5 && tt.NumTokens > 7 {
   105  				continue
   106  			}
   107  
   108  			if HistoBasedDistance(src, tt) > 0.51 {
   109  				breakMapsTooDistinct++
   110  				continue
   111  			}
   112  
   113  			relSize := srcLen / float64(util.Max(1, len(tt.Text)))
   114  			if relSize < 0.33 || relSize > 3 {
   115  				continue
   116  			}
   117  
   118  			absDist, relDist := 0, 0.0
   119  
   120  			if tt.NumTokens == src.NumTokens &&
   121  				len(tt.Text) == len(src.Text) &&
   122  				bytes.Equal(tt.Text, src.Text) {
   123  				absDist, relDist = 0, 0.0
   124  				appliedCompare++
   125  			} else {
   126  				dstE := wordb.WrapAsEqualer(tt.Text, true) // destinations as Equaler
   127  				m := levenshtein.New(srcE, dstE, opt)
   128  				absDist, relDist = m.Distance()
   129  				appliedLevenshtein++
   130  			}
   131  
   132  			//
   133  			if relDist < 0.26 && absDist < 10 {
   134  				if br {
   135  					pf("\t")
   136  				}
   137  
   138  				sd := ""
   139  				sd = string(tt.Text[:util.Min(2*excerptLen, len(tt.Text)-1)])
   140  				sd = stringspb.ToLen(sd, 2*excerptLen+1)
   141  				pf("%12v %v %4v %5.2v   ", tt.Outline, sd, absDist, relDist)
   142  
   143  				cntr++
   144  				br = false
   145  
   146  				sim := Similar{}
   147  				sim.SourceID = fnKey
   148  				sim.Lvl = tt.Lvl
   149  				sim.Outline = tt.Outline
   150  				sim.AbsLevenshtein = absDist
   151  				sim.RelLevenshtein = relDist
   152  				sim.Text = tt.Text
   153  				src.Similars = append(src.Similars, sim)
   154  				src.SumAbsLevenshtein += absDist
   155  				src.SumRelLevenshtein += relDist
   156  
   157  				if cntr%2 == 0 || cntr > 20 {
   158  					pf("\n")
   159  					br = true
   160  				}
   161  				if cntr > 20 {
   162  					break
   163  				}
   164  			}
   165  
   166  		}
   167  		if !br {
   168  			pf("\n")
   169  		}
   170  	}
   171  
   172  }
   173  
   174  func similaritiesToFile(fs fsi.FileSystem, logdir string, frags []TextifiedTree, stage int) {
   175  
   176  	// bfrags := stringspb.IndentedDumpBytes(frags)
   177  	b := new(bytes.Buffer)
   178  	for _, v := range frags {
   179  		b.WriteString(fmt.Sprintf("%v %2v ", v.SourceID, v.Lvl))
   180  		b.WriteString(fmt.Sprintf("%-8v             ", v.Outline))
   181  		b.Write(v.Text)
   182  		b.WriteString("\n")
   183  		for _, v1 := range v.Similars {
   184  			b.WriteString(fmt.Sprintf("%v %2v ", v1.SourceID, v1.Lvl))
   185  			b.WriteString(fmt.Sprintf("%-8v    ", string(v1.Outline)))
   186  			b.WriteString(spf("%2v ", v1.AbsLevenshtein))
   187  			b.WriteString(spf("%-5.2v ", v1.RelLevenshtein))
   188  			b.Write(v1.Text)
   189  			b.WriteByte(10)
   190  		}
   191  		b.WriteByte(10)
   192  	}
   193  	common.WriteFile(fs, spf("%v/outp_fragments_st%v.txt", logdir, stage), b.Bytes())
   194  
   195  }
   196  
   197  // HistoBasedDistance isa cheap alternative to Levenshtein.distance.
   198  // Particularly, since we compute the histogram anyway.
   199  // Tests show, that LevenshteinDistance > HistoBasedDistance
   200  // i.e.             0.36                  0.31
   201  // i.e.             0.6                   0.4
   202  // Thus we can break early if i.e. HistoBasedDistance > 0.5
   203  // implying that LevenshteinDistance is at least >= 0.5
   204  func HistoBasedDistance(src, dst *TextifiedTree) float64 {
   205  
   206  	largerOuter := src
   207  	inner := dst
   208  	if src.NumTokens < dst.NumTokens {
   209  		largerOuter = dst
   210  		inner = src
   211  	}
   212  
   213  	// Handle division by zero
   214  	if largerOuter.NumTokens == 0 {
   215  		return 0.0
   216  	}
   217  
   218  	// inner overlap
   219  	same := 0
   220  	for k, _ := range largerOuter.Histo {
   221  		if _, ok := inner.Histo[k]; ok {
   222  			same++
   223  		}
   224  	}
   225  
   226  	distinctBySheerSize := largerOuter.NumTokens - inner.NumTokens
   227  	distinctInner := inner.NumTokens - same
   228  
   229  	ret := float64(distinctBySheerSize+distinctInner) / float64(largerOuter.NumTokens)
   230  
   231  	// crit1 := inner.NumTokens > 5 && distinctBySheerSize < 5 && distinctInner < 5
   232  	// _ = crit1
   233  	// if ret > 0 && ret < 0.51 {
   234  	// 	fmt.Printf("%3v %3v ; sizediff %3v worddiff %3v =>  %4.2v\n", largerOuter.NumTokens, inner.NumTokens,
   235  	// 		distinctBySheerSize, distinctInner, ret)
   236  	// }
   237  
   238  	return ret
   239  }
   240  
   241  // slow - only for debug
   242  func mpKeys(mp map[string]int) string {
   243  	ret := ""
   244  	for k, _ := range mp {
   245  		ret += k + " "
   246  	}
   247  	return ret
   248  }