github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/triage/berghelroach_test.py

github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/triage/berghelroach_test.py (about)

     1  #!/usr/bin/env python2
     2  
     3  # Copyright 2017 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # Ported from Java com.google.gwt.dev.util.editdistance, which is:
    18  # Copyright 2010 Google Inc.
    19  #
    20  # Licensed under the Apache License, Version 2.0 (the "License"); you may not
    21  # use this file except in compliance with the License. You may obtain a copy of
    22  # the License at
    23  #
    24  # http://www.apache.org/licenses/LICENSE-2.0
    25  #
    26  # Unless required by applicable law or agreed to in writing, software
    27  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
    28  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
    29  # License for the specific language governing permissions and limitations under
    30  # the License.
    31  
    32  # pylint: disable=missing-docstring,invalid-name
    33  
    34  import random
    35  import unittest
    36  
    37  import berghelroach
    38  
    39  
    40  # A very large string for testing.
    41  MAGNA = (
    42      "We have granted to God, and by this our present Charter have "
    43      "confirmed, for Us and our Heirs for ever, that the Church of "
    44      "England shall be free, and shall have all her whole Rights and "
    45      "Liberties inviolable.  We have granted also, and given to all "
    46      "the Freemen of our Realm, for Us and our Heirs for ever, these "
    47      "Liberties under-written, to have and to hold to them and their "
    48      "Heirs, of Us and our Heirs for ever."
    49  )
    50  
    51  # A small set of words for testing, including at least some of
    52  # each of these: empty, very short, more than 32/64 character,
    53  # punctuation, non-ASCII characters
    54  words = [
    55      "", "a", "b", "c", "ab", "ace",
    56      "fortressing", "inadequately", "prank", "authored",
    57      "fortresing", "inadeqautely", "prang", "awthered",
    58      "cruller's", "fanatic", "Laplace", "recollections",
    59      "Kevlar", "underpays", u"jalape\u00f1o", u"ch\u00e2telaine",
    60      "kevlar", "overpaid", "jalapeno", "chatelaine",
    61      "A survey of algorithms for running text search by Navarro appeared",
    62      "in ACM Computing Surveys 33#1: http://portal.acm.org/citation.cfm?...",
    63      "Another algorithm (Four Russians) that Navarro",
    64      "long patterns and high limits was not evaluated for inclusion here.",
    65      "long patterns and low limits were evaluated for inclusion here.",
    66      "Filtering algorithms also improve running search",
    67      "for pure edit distance."
    68  ]
    69  
    70  wordDistances = {}
    71  
    72  # Computes Levenshtein edit distance using the far-from-optimal
    73  # dynamic programming technique.  This is here purely to verify
    74  # the results of better algorithms.
    75  def dynamicProgrammingLevenshtein(s1, s2):
    76      lastRow = range(len(s1) + 1)
    77      for j in range(0, len(s2)):
    78          thisRow = [0] * len(lastRow)
    79          thisRow[0] = j + 1
    80          for i in range(1, len(thisRow)):
    81              thisRow[i] = min(lastRow[i] + 1,
    82                               thisRow[i - 1] + 1,
    83                               lastRow[i - 1] + int(s2[j] != s1[i-1]))
    84          lastRow = thisRow
    85      return lastRow[-1]
    86  
    87  for wordA in words:
    88      for wordB in words:
    89          wordDistances[wordA, wordB] = dynamicProgrammingLevenshtein(wordA, wordB)
    90  
    91  
    92  class AbstractLevenshteinTestCase(object):
    93      # pylint: disable=no-member
    94  
    95      # Tests a Levenshtein engine against the DP-based computation
    96      # for a bunch of string pairs.
    97      def testLevenshteinOnWords(self):
    98          for a in words:
    99              for b in words:
   100                  ed = self.getInstance(a)
   101                  self.specificAlgorithmVerify(ed, a, b, wordDistances[a, b])
   102  
   103      # Tests Levenshtein edit distance on a longer pattern
   104      def testLongerPattern(self):
   105          self.genericLevenshteinVerify("abcdefghijklmnopqrstuvwxyz",
   106                                        "abcefghijklMnopqrStuvwxyz..",
   107                                        5)  # dMS..
   108  
   109      # Tests Levenshtein edit distance on a very short pattern
   110      def testShortPattern(self):
   111          self.genericLevenshteinVerify("short", "shirt", 1)
   112  
   113      # Verifies zero-length behavior
   114      def testZeroLengthPattern(self):
   115          nonEmpty = "target"
   116          self.genericLevenshteinVerify("", nonEmpty, len(nonEmpty))
   117          self.genericLevenshteinVerify(nonEmpty, "", len(nonEmpty))
   118  
   119      # Tests the default Levenshtein engine on a pair of strings
   120      def genericLevenshteinVerify(self, s1, s2, expectedResult):
   121          self.specificAlgorithmVerify(self.getInstance(s1), s1, s2, expectedResult)
   122  
   123      # Performs some edits on a string in a StringBuilder.
   124      # @param b string to be modified
   125      # @param alphabet some characters guaranteed not to be in the original
   126      # @param replaces how many single-character replacements to try
   127      # @param inserts how many characters to insert
   128      # @return the number of edits actually performed, the new string
   129      @staticmethod
   130      def performSomeEdits(b, alphabet, replaces, inserts):
   131          r = random.Random(768614336404564651L)
   132          edits = 0
   133          b = list(b)
   134  
   135          for _ in range(inserts):
   136              b.insert(r.randint(0, len(b) - 1), r.choice(alphabet))
   137              edits += 1
   138          for _ in range(replaces):
   139              where = r.randint(0, len(b) - 1)
   140              if b[where] not in alphabet:
   141                  b[where] = r.choice(alphabet)
   142                  edits += 1
   143          return edits, ''.join(b)
   144  
   145      # Generates a long random alphabetic string,
   146      # suitable for use with verifySomeEdits (using digits for the alphabet).
   147      # @param size desired string length
   148      # @param seed random number generator seed
   149      # @return random alphabetic string of the requested length
   150      @staticmethod
   151      def generateRandomString(size, seed):
   152          alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
   153  
   154          # Create a (repeatable) random string from the alphabet
   155          rand = random.Random(seed)
   156          return ''.join(rand.choice(alphabet) for _ in range(size))
   157  
   158      # Exercises an edit distance engine across a wide range of limit values
   159      def genericVerification(self, ed, s1, s2, expectedResult):
   160          if len(s1) < 500:
   161               # For small strings, try every limit
   162              maxDiff = max(len(s1), len(s2)) + 2
   163              for k in range(maxDiff):
   164                  self.verifyResult(s1, s2, expectedResult, k, ed.getDistance(s2, k))
   165          else:
   166              # For big strings, try a sampling of limits:
   167              #   0 to 3,
   168              #   another 4 on either side of the expected result
   169              #   s2 length
   170              for k in range(4):
   171                  self.verifyResult(s1, s2, expectedResult, k, ed.getDistance(s2, k))
   172              for k in range(max(4, expectedResult - 4), expectedResult + 4):
   173                  self.verifyResult(s1, s2, expectedResult, k, ed.getDistance(s2, k))
   174              self.verifyResult(s1, s2, expectedResult, len(s2),
   175                                ed.getDistance(s2, len(s2)))
   176  
   177          # Always try near MAX_VALUE
   178          self.assertEquals(ed.getDistance(s2, 2**63 - 1), expectedResult)
   179          self.assertEquals(ed.getDistance(s2, 2**63), expectedResult)
   180  
   181      # Tests a specific engine on a pair of strings
   182      def specificAlgorithmVerify(self, ed, s1, s2, expectedResult):
   183          self.genericVerification(ed, s1, s2, expectedResult)
   184  
   185          # Try again with the same instance
   186          self.genericVerification(ed, s1, s2, expectedResult)
   187  
   188      # Verifies the distance between an original string and some
   189      # number of simple edits on it.  The distance is assumed to
   190      # be unit-cost Levenshtein distance.
   191      def verifySomeEdits(self, original, replaces, inserts):
   192          edits, modified = self.performSomeEdits(original, "0123456789", replaces, inserts)
   193  
   194          self.specificAlgorithmVerify(self.getInstance(original), original, modified, edits)
   195  
   196          self.specificAlgorithmVerify(self.getInstance(modified), modified, original, edits)
   197  
   198          # we don't have duplicate() in Python, so...
   199          # self.specificAlgorithmVerify(self.getInstance(modified).duplicate(),
   200          #                              modified, original, edits)
   201  
   202      # Verifies a single edit distance result.
   203      # If the expected distance is within limit, result must b
   204      # be correct; otherwise, result must be over limit.
   205      #
   206      # @param s1 one string compared
   207      # @param s2 other string compared
   208      # @param expectedResult correct distance from s1 to s2
   209      # @param k limit applied to computation
   210      # @param d distance computed
   211      def verifyResult(self, s1, s2, expectedResult, k, d):
   212          if k >= expectedResult:
   213              self.assertEquals(
   214                  expectedResult, d,
   215                  'Distance from %r to %r should be %d (within limit=%d) but was %d' %
   216                  (s1, s2, expectedResult, k, d))
   217          else:
   218              self.assertTrue(
   219                  d > k,
   220                  'Distance from %r to %r should be %d (exceeding limit=%d) but was %d' %
   221                  (s1, s2, expectedResult, k, d))
   222  
   223  
   224  # Test cases for the ModifiedBerghelRoachEditDistance class.
   225  #
   226  # The bulk of the test is provided by the superclass, for
   227  # which we provide GeneralEditDistance instances.
   228  #
   229  # Since Berghel-Roach is superior for longer strings with moderately
   230  # low edit distances, we try a few of those specifically.
   231  # This Modified form uses less space, and can handle yet larger ones.
   232  
   233  class BerghelRoachTest(unittest.TestCase, AbstractLevenshteinTestCase):
   234      @staticmethod
   235      def getInstance(s):
   236          return berghelroach.BerghelRoach(s)
   237  
   238      def testHugeEdit(self):
   239          SIZE = 10000
   240          SEED = 1
   241  
   242          self.verifySomeEdits(self.generateRandomString(SIZE, SEED), (SIZE / 50), (SIZE / 50))
   243  
   244      def testHugeString(self):
   245           # An even larger size is feasible, but the test would no longer
   246           # qualify as "small".
   247          SIZE = 20000
   248          SEED = 1
   249  
   250          self.verifySomeEdits(self.generateRandomString(SIZE, SEED), 30, 25)
   251  
   252      def testLongString(self):
   253          self.verifySomeEdits(MAGNA, 8, 10)
   254  
   255      def testLongStringMoreEdits(self):
   256          self.verifySomeEdits(MAGNA, 40, 30)
   257  
   258  
   259  if __name__ == '__main__':
   260      unittest.main()