#!/usr/bin/env python2

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Ported from Java com.google.gwt.dev.util.editdistance, which is:
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# pylint: disable=missing-docstring,invalid-name

import random
import unittest

import berghelroach


# A very large string for testing.
MAGNA = (
    "We have granted to God, and by this our present Charter have "
    "confirmed, for Us and our Heirs for ever, that the Church of "
    "England shall be free, and shall have all her whole Rights and "
    "Liberties inviolable. We have granted also, and given to all "
    "the Freemen of our Realm, for Us and our Heirs for ever, these "
    "Liberties under-written, to have and to hold to them and their "
    "Heirs, of Us and our Heirs for ever."
)

# A small set of words for testing, including at least some of
# each of these: empty, very short, longer than 32/64 characters,
# punctuation, non-ASCII characters.
words = [
    "", "a", "b", "c", "ab", "ace",
    "fortressing", "inadequately", "prank", "authored",
    "fortresing", "inadeqautely", "prang", "awthered",
    "cruller's", "fanatic", "Laplace", "recollections",
    "Kevlar", "underpays", u"jalape\u00f1o", u"ch\u00e2telaine",
    "kevlar", "overpaid", "jalapeno", "chatelaine",
    "A survey of algorithms for running text search by Navarro appeared",
    "in ACM Computing Surveys 33#1: http://portal.acm.org/citation.cfm?...",
    "Another algorithm (Four Russians) that Navarro",
    "long patterns and high limits was not evaluated for inclusion here.",
    "long patterns and low limits were evaluated for inclusion here.",
    "Filtering algorithms also improve running search",
    "for pure edit distance."
]

# Reference edit distances, keyed by (wordA, wordB) pairs drawn from `words`.
wordDistances = {}

# Computes Levenshtein edit distance using the far-from-optimal
# dynamic programming technique.  This is here purely to verify
# the results of better algorithms.
def dynamicProgrammingLevenshtein(s1, s2):
    # Row-by-row DP that keeps only the previous row.  lastRow[i] is the
    # distance between s1[:i] and the part of s2 consumed so far; before any
    # of s2 is consumed that is simply i.
    lastRow = list(range(len(s1) + 1))
    for j in range(0, len(s2)):
        thisRow = [0] * len(lastRow)
        # Empty prefix of s1 versus s2[:j + 1].
        thisRow[0] = j + 1
        for i in range(1, len(thisRow)):
            thisRow[i] = min(lastRow[i] + 1,
                             thisRow[i - 1] + 1,
                             lastRow[i - 1] + int(s2[j] != s1[i-1]))
        lastRow = thisRow
    return lastRow[-1]


# Precompute the reference distance for every ordered pair of test words.
for wordA in words:
    for wordB in words:
        wordDistances[wordA, wordB] = dynamicProgrammingLevenshtein(wordA, wordB)


# Engine-independent checks.  This class calls self.getInstance(pattern) and
# the unittest assert* methods, so it must be combined with unittest.TestCase
# and a getInstance implementation (hence the no-member suppression).
class AbstractLevenshteinTestCase(object):
    # pylint: disable=no-member

    # Tests a Levenshtein engine against the DP-based computation
    # for a bunch of string pairs.
    def testLevenshteinOnWords(self):
        for a in words:
            for b in words:
                ed = self.getInstance(a)
                self.specificAlgorithmVerify(ed, a, b, wordDistances[a, b])

    # Tests Levenshtein edit distance on a longer pattern
    def testLongerPattern(self):
        self.genericLevenshteinVerify("abcdefghijklmnopqrstuvwxyz",
                                      "abcefghijklMnopqrStuvwxyz..",
                                      5)  # dMS..

    # Tests Levenshtein edit distance on a very short pattern
    def testShortPattern(self):
        self.genericLevenshteinVerify("short", "shirt", 1)

    # Verifies zero-length behavior
    def testZeroLengthPattern(self):
        nonEmpty = "target"
        self.genericLevenshteinVerify("", nonEmpty, len(nonEmpty))
        self.genericLevenshteinVerify(nonEmpty, "", len(nonEmpty))

    # Tests the default Levenshtein engine on a pair of strings
    def genericLevenshteinVerify(self, s1, s2, expectedResult):
        self.specificAlgorithmVerify(self.getInstance(s1), s1, s2, expectedResult)

    # Performs some pseudo-random (fixed seed, hence repeatable) edits
    # on a copy of a string.
    # @param b string to be modified; must be non-empty
    # @param alphabet some characters guaranteed not to be in the original
    # @param replaces how many single-character replacements to try
    # @param inserts how many characters to insert
    # @return the number of edits actually performed, the new string
    @staticmethod
    def performSomeEdits(b, alphabet, replaces, inserts):
        # No Python-2-only "L" suffix: the plain literal has the same value
        # and also parses on Python 3.
        r = random.Random(768614336404564651)
        edits = 0
        b = list(b)

        for _ in range(inserts):
            b.insert(r.randint(0, len(b) - 1), r.choice(alphabet))
            edits += 1
        for _ in range(replaces):
            where = r.randint(0, len(b) - 1)
            # Skip positions that already hold an alphabet character
            # (inserted or replaced earlier); such a try is not counted.
            if b[where] not in alphabet:
                b[where] = r.choice(alphabet)
                edits += 1
        return edits, ''.join(b)

    # Generates a long random alphabetic string,
    # suitable for use with verifySomeEdits (using digits for the alphabet).
    # @param size desired string length
    # @param seed random number generator seed
    # @return random alphabetic string of the requested length
    @staticmethod
    def generateRandomString(size, seed):
        alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

        # Create a (repeatable) random string from the alphabet
        rand = random.Random(seed)
        return ''.join(rand.choice(alphabet) for _ in range(size))

    # Exercises an edit distance engine across a wide range of limit values
    # @param ed engine under test, built for s1; must offer getDistance(s, k)
    # @param s1 string the engine was built for
    # @param s2 string to compare against
    # @param expectedResult correct distance from s1 to s2
    def genericVerification(self, ed, s1, s2, expectedResult):
        if len(s1) < 500:
            # For small strings, try every limit
            maxDiff = max(len(s1), len(s2)) + 2
            for k in range(maxDiff):
                self.verifyResult(s1, s2, expectedResult, k, ed.getDistance(s2, k))
        else:
            # For big strings, try a sampling of limits:
            #   0 to 3,
            #   another 4 on either side of the expected result
            #   s2 length
            for k in range(4):
                self.verifyResult(s1, s2, expectedResult, k, ed.getDistance(s2, k))
            for k in range(max(4, expectedResult - 4), expectedResult + 4):
                self.verifyResult(s1, s2, expectedResult, k, ed.getDistance(s2, k))
            self.verifyResult(s1, s2, expectedResult, len(s2),
                              ed.getDistance(s2, len(s2)))

        # Always try near MAX_VALUE
        # (assertEqual: the assertEquals alias is deprecated and gone in 3.12)
        self.assertEqual(ed.getDistance(s2, 2**63 - 1), expectedResult)
        self.assertEqual(ed.getDistance(s2, 2**63), expectedResult)

    # Tests a specific engine on a pair of strings
    def specificAlgorithmVerify(self, ed, s1, s2, expectedResult):
        self.genericVerification(ed, s1, s2, expectedResult)

        # Try again with the same instance
        self.genericVerification(ed, s1, s2, expectedResult)

    # Verifies the distance between an original string and some
    # number of simple edits on it.  The distance is assumed to
    # be unit-cost Levenshtein distance.
    def verifySomeEdits(self, original, replaces, inserts):
        edits, modified = self.performSomeEdits(original, "0123456789", replaces, inserts)

        self.specificAlgorithmVerify(self.getInstance(original), original, modified, edits)

        self.specificAlgorithmVerify(self.getInstance(modified), modified, original, edits)

        # we don't have duplicate() in Python, so...
        # self.specificAlgorithmVerify(self.getInstance(modified).duplicate(),
        #                              modified, original, edits)

    # Verifies a single edit distance result.
    # If the expected distance is within limit, result must
    # be correct; otherwise, result must be over limit.
    #
    # @param s1 one string compared
    # @param s2 other string compared
    # @param expectedResult correct distance from s1 to s2
    # @param k limit applied to computation
    # @param d distance computed
    def verifyResult(self, s1, s2, expectedResult, k, d):
        if k >= expectedResult:
            self.assertEqual(
                expectedResult, d,
                'Distance from %r to %r should be %d (within limit=%d) but was %d' %
                (s1, s2, expectedResult, k, d))
        else:
            self.assertTrue(
                d > k,
                'Distance from %r to %r should be %d (exceeding limit=%d) but was %d' %
                (s1, s2, expectedResult, k, d))


# Test cases for the ModifiedBerghelRoachEditDistance class.
#
# The bulk of the test is provided by the superclass, for
# which we provide GeneralEditDistance instances.
#
# Since Berghel-Roach is superior for longer strings with moderately
# low edit distances, we try a few of those specifically.
# This Modified form uses less space, and can handle yet larger ones.

class BerghelRoachTest(unittest.TestCase, AbstractLevenshteinTestCase):
    @staticmethod
    def getInstance(s):
        return berghelroach.BerghelRoach(s)

    def testHugeEdit(self):
        SIZE = 10000
        SEED = 1

        # Floor division keeps the edit counts integral on Python 3 as well;
        # range() rejects the float that "/" would produce there.
        self.verifySomeEdits(self.generateRandomString(SIZE, SEED),
                             SIZE // 50, SIZE // 50)

    def testHugeString(self):
        # An even larger size is feasible, but the test would no longer
        # qualify as "small".
        SIZE = 20000
        SEED = 1

        self.verifySomeEdits(self.generateRandomString(SIZE, SEED), 30, 25)

    def testLongString(self):
        self.verifySomeEdits(MAGNA, 8, 10)

    def testLongStringMoreEdits(self):
        self.verifySomeEdits(MAGNA, 40, 30)


if __name__ == '__main__':
    unittest.main()