github.com/aarzilli/tools@v0.0.0-20151123112009-0d27094f75e0/net/http/dedup/5_similar_texts.go (about) 1 package dedup 2 3 import ( 4 "bytes" 5 "fmt" 6 "strings" 7 8 "github.com/pbberlin/tools/os/fsi" 9 "github.com/pbberlin/tools/os/fsi/common" 10 "github.com/pbberlin/tools/stringspb" 11 "github.com/pbberlin/tools/text/levenshtein" 12 "github.com/pbberlin/tools/text/levenshtein/wordb" 13 "github.com/pbberlin/tools/util" 14 ) 15 16 var opt = levenshtein.Options{1, 1, 1} // cheap substitution 17 18 var levelsToProcess = map[int]bool{1: true} 19 20 var levelsTolerance = 0 21 22 const excerptLen = 20 23 24 var appliedLevenshtein = 0 25 var appliedCompare = 0 26 var breakMapsTooDistinct = 0 27 28 func similarTextifiedTrees(mp map[string][]*TextifiedTree, skipPrefix map[string]bool, onlyKeys map[string]bool) []TextifiedTree { 29 30 pf = pfDevNull 31 defer func() { pf = pfRestore }() 32 33 frags := []TextifiedTree{} 34 35 for fnKey, tts := range mp { 36 if !onlyKeys[fnKey] { 37 continue 38 } 39 pf("%v\n", fnKey) 40 41 MarkX: 42 for _, tt := range tts { 43 44 if !levelsToProcess[tt.Lvl] { 45 continue 46 } 47 48 outls := strings.Split(tt.Outline, ".") 49 for i := 0; i < len(outls)-1; i++ { 50 jn := strings.Join(outls[0:i+1], ".") + "." 51 if skipPrefix[jn] { 52 // log.Printf(" %-8v contains %-6v => skip\n", tt.Outline, jn) 53 continue MarkX 54 } else { 55 // log.Printf(" %-8v proccessing ...\n", tt.Outline) 56 } 57 58 } 59 60 similarTextifiedTrees2(tt, mp, skipPrefix) 61 if len(tt.Similars) > 0 { 62 frags = append(frags, *tt) 63 } 64 } 65 } 66 67 return frags 68 } 69 70 func similarTextifiedTrees2(src *TextifiedTree, mp map[string][]*TextifiedTree, skipPrefix map[string]bool) { 71 72 // srcE := word.WrapAsEqualer(string(src.Text), true) // ssrc as Equaler 73 srcE := wordb.WrapAsEqualer(src.Text, true) 74 srcLen := float64(len(src.Text)) 75 76 for fnKey, tts := range mp { 77 78 if fnKey == src.SourceID { 79 pf(" to %v SKIP self\n", fnKey) 80 continue 81 } 82 83 pf(" to %v\n", fnKey) 84 85 cntr, br := 0, true 86 for _, tt := range tts { 87 // outl, text := tt.Outl, tt.Text 88 89 if tt.Lvl > src.Lvl+levelsTolerance { 90 break // since we are now sorted by lvl, we can this is safe 91 } 92 93 if tt.Lvl == src.Lvl || 94 (tt.Lvl > src.Lvl && tt.Lvl <= src.Lvl+levelsTolerance) { 95 // proceed 96 } else { 97 continue 98 } 99 100 if src.NumTokens < 1 { 101 continue 102 } 103 104 if src.NumTokens < 5 && tt.NumTokens > 7 { 105 continue 106 } 107 108 if HistoBasedDistance(src, tt) > 0.51 { 109 breakMapsTooDistinct++ 110 continue 111 } 112 113 relSize := srcLen / float64(util.Max(1, len(tt.Text))) 114 if relSize < 0.33 || relSize > 3 { 115 continue 116 } 117 118 absDist, relDist := 0, 0.0 119 120 if tt.NumTokens == src.NumTokens && 121 len(tt.Text) == len(src.Text) && 122 bytes.Equal(tt.Text, src.Text) { 123 absDist, relDist = 0, 0.0 124 appliedCompare++ 125 } else { 126 dstE := wordb.WrapAsEqualer(tt.Text, true) // destinations as Equaler 127 m := levenshtein.New(srcE, dstE, opt) 128 absDist, relDist = m.Distance() 129 appliedLevenshtein++ 130 } 131 132 // 133 if relDist < 0.26 && absDist < 10 { 134 if br { 135 pf("\t") 136 } 137 138 sd := "" 139 sd = string(tt.Text[:util.Min(2*excerptLen, len(tt.Text)-1)]) 140 sd = stringspb.ToLen(sd, 2*excerptLen+1) 141 pf("%12v %v %4v %5.2v ", tt.Outline, sd, absDist, relDist) 142 143 cntr++ 144 br = false 145 146 sim := Similar{} 147 sim.SourceID = fnKey 148 sim.Lvl = tt.Lvl 149 sim.Outline = tt.Outline 150 sim.AbsLevenshtein = absDist 151 sim.RelLevenshtein = relDist 152 sim.Text = tt.Text 153 src.Similars = append(src.Similars, sim) 154 src.SumAbsLevenshtein += absDist 155 src.SumRelLevenshtein += relDist 156 157 if cntr%2 == 0 || cntr > 20 { 158 pf("\n") 159 br = true 160 } 161 if cntr > 20 { 162 break 163 } 164 } 165 166 } 167 if !br { 168 pf("\n") 169 } 170 } 171 172 } 173 174 func similaritiesToFile(fs fsi.FileSystem, logdir string, frags []TextifiedTree, stage int) { 175 176 // bfrags := stringspb.IndentedDumpBytes(frags) 177 b := new(bytes.Buffer) 178 for _, v := range frags { 179 b.WriteString(fmt.Sprintf("%v %2v ", v.SourceID, v.Lvl)) 180 b.WriteString(fmt.Sprintf("%-8v ", v.Outline)) 181 b.Write(v.Text) 182 b.WriteString("\n") 183 for _, v1 := range v.Similars { 184 b.WriteString(fmt.Sprintf("%v %2v ", v1.SourceID, v1.Lvl)) 185 b.WriteString(fmt.Sprintf("%-8v ", string(v1.Outline))) 186 b.WriteString(spf("%2v ", v1.AbsLevenshtein)) 187 b.WriteString(spf("%-5.2v ", v1.RelLevenshtein)) 188 b.Write(v1.Text) 189 b.WriteByte(10) 190 } 191 b.WriteByte(10) 192 } 193 common.WriteFile(fs, spf("%v/outp_fragments_st%v.txt", logdir, stage), b.Bytes()) 194 195 } 196 197 // HistoBasedDistance isa cheap alternative to Levenshtein.distance. 198 // Particularly, since we compute the histogram anyway. 199 // Tests show, that LevenshteinDistance > HistoBasedDistance 200 // i.e. 0.36 0.31 201 // i.e. 0.6 0.4 202 // Thus we can break early if i.e. HistoBasedDistance > 0.5 203 // implying that LevenshteinDistance is at least >= 0.5 204 func HistoBasedDistance(src, dst *TextifiedTree) float64 { 205 206 largerOuter := src 207 inner := dst 208 if src.NumTokens < dst.NumTokens { 209 largerOuter = dst 210 inner = src 211 } 212 213 // Handle division by zero 214 if largerOuter.NumTokens == 0 { 215 return 0.0 216 } 217 218 // inner overlap 219 same := 0 220 for k, _ := range largerOuter.Histo { 221 if _, ok := inner.Histo[k]; ok { 222 same++ 223 } 224 } 225 226 distinctBySheerSize := largerOuter.NumTokens - inner.NumTokens 227 distinctInner := inner.NumTokens - same 228 229 ret := float64(distinctBySheerSize+distinctInner) / float64(largerOuter.NumTokens) 230 231 // crit1 := inner.NumTokens > 5 && distinctBySheerSize < 5 && distinctInner < 5 232 // _ = crit1 233 // if ret > 0 && ret < 0.51 { 234 // fmt.Printf("%3v %3v ; sizediff %3v worddiff %3v => %4.2v\n", largerOuter.NumTokens, inner.NumTokens, 235 // distinctBySheerSize, distinctInner, ret) 236 // } 237 238 return ret 239 } 240 241 // slow - only for debug 242 func mpKeys(mp map[string]int) string { 243 ret := "" 244 for k, _ := range mp { 245 ret += k + " " 246 } 247 return ret 248 }