BookReaderIA/datanode/diff_match_patch.py

   1 #!/usr/bin/python2.4
   2
   3 """Diff Match and Patch
   4
   5 Copyright 2006 Google Inc.
   6 http://code.google.com/p/google-diff-match-patch/
   7
   8 Licensed under the Apache License, Version 2.0 (the "License");
   9 you may not use this file except in compliance with the License.
  10 You may obtain a copy of the License at
  11
  12   http://www.apache.org/licenses/LICENSE-2.0
  13
  14 Unless required by applicable law or agreed to in writing, software
  15 distributed under the License is distributed on an "AS IS" BASIS,
  16 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17 See the License for the specific language governing permissions and
  18 limitations under the License.
  19 """
  20
  21 """Functions for diff, match and patch.
  22
  23 Computes the difference between two texts to create a patch.
  24 Applies the patch onto another text, allowing for errors.
  25 """
  26
  27 __author__ = 'fraser@google.com (Neil Fraser)'
  28
  29 import math
  30 import time
  31 import urllib
  32 import re
  33
  34 class diff_match_patch:
  35   """Class containing the diff, match and patch methods.
  36
  37   Also contains the behaviour settings.
  38   """
  39
  40   def __init__(self):
  41     """Inits a diff_match_patch object with default settings.
  42     Redefine these in your program to override the defaults.
  43     """
  44
  45     # Number of seconds to map a diff before giving up (0 for infinity).
  46     self.Diff_Timeout = 1.0
  47     # Cost of an empty edit operation in terms of edit characters.
  48     self.Diff_EditCost = 4
  49     # The size beyond which the double-ended diff activates.
  50     # Double-ending is twice as fast, but less accurate.
  51     self.Diff_DualThreshold = 32
  52     # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
  53     self.Match_Threshold = 0.5
  54     # How far to search for a match (0 = exact location, 1000+ = broad match).
  55     # A match this many characters away from the expected location will add
  56     # 1.0 to the score (0.0 is a perfect match).
  57     self.Match_Distance = 1000
  58     # When deleting a large block of text (over ~64 characters), how close does
  59     # the contents have to match the expected contents. (0.0 = perfection,
  60     # 1.0 = very loose).  Note that Match_Threshold controls how closely the
  61     # end points of a delete need to match.
  62     self.Patch_DeleteThreshold = 0.5
  63     # Chunk size for context length.
  64     self.Patch_Margin = 4
  65
  66     # How many bits in a number?
  67     # Python has no maximum, thus to disable patch splitting set to 0.
  68     # However to avoid long patches in certain pathological cases, use 32.
  69     # Multiple short patches (using native ints) are much faster than long ones.
  70     self.Match_MaxBits = 32
  71
  72   #  DIFF FUNCTIONS
  73
  74   # The data structure representing a diff is an array of tuples:
  75   # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
  76   # which means: delete "Hello", add "Goodbye" and keep " world."
  77   DIFF_DELETE = -1
  78   DIFF_INSERT = 1
  79   DIFF_EQUAL = 0
  80
  81   def diff_main(self, text1, text2, checklines=True):
  82     """Find the differences between two texts.  Simplifies the problem by
  83       stripping any common prefix or suffix off the texts before diffing.
  84
  85     Args:
  86       text1: Old string to be diffed.
  87       text2: New string to be diffed.
  88       checklines: Optional speedup flag.  If present and false, then don't run
  89         a line-level diff first to identify the changed areas.
  90         Defaults to true, which does a faster, slightly less optimal diff.
  91
  92     Returns:
  93       Array of changes.
  94     """
  95
  96     # Check for null inputs.
  97     if text1 == None or text2 == None:
  98       raise ValueError("Null inputs. (diff_main)")
  99
 100     # Check for equality (speedup).
 101     if text1 == text2:
 102       return [(self.DIFF_EQUAL, text1)]
 103
 104     # Trim off common prefix (speedup).
 105     commonlength = self.diff_commonPrefix(text1, text2)
 106     commonprefix = text1[:commonlength]
 107     text1 = text1[commonlength:]
 108     text2 = text2[commonlength:]
 109
 110     # Trim off common suffix (speedup).
 111     commonlength = self.diff_commonSuffix(text1, text2)
 112     if commonlength == 0:
 113       commonsuffix = ''
 114     else:
 115       commonsuffix = text1[-commonlength:]
 116       text1 = text1[:-commonlength]
 117       text2 = text2[:-commonlength]
 118
 119     # Compute the diff on the middle block.
 120     diffs = self.diff_compute(text1, text2, checklines)
 121
 122     # Restore the prefix and suffix.
 123     if commonprefix:
 124       diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
 125     if commonsuffix:
 126       diffs.append((self.DIFF_EQUAL, commonsuffix))
 127     self.diff_cleanupMerge(diffs)
 128     return diffs
 129
 130   def diff_compute(self, text1, text2, checklines):
 131     """Find the differences between two texts.  Assumes that the texts do not
 132       have any common prefix or suffix.
 133
 134     Args:
 135       text1: Old string to be diffed.
 136       text2: New string to be diffed.
 137       checklines: Speedup flag.  If false, then don't run a line-level diff
 138         first to identify the changed areas.
 139         If true, then run a faster, slightly less optimal diff.
 140
 141     Returns:
 142       Array of changes.
 143     """
 144     if not text1:
 145       # Just add some text (speedup).
 146       return [(self.DIFF_INSERT, text2)]
 147
 148     if not text2:
 149       # Just delete some text (speedup).
 150       return [(self.DIFF_DELETE, text1)]
 151
 152     if len(text1) > len(text2):
 153       (longtext, shorttext) = (text1, text2)
 154     else:
 155       (shorttext, longtext) = (text1, text2)
 156     i = longtext.find(shorttext)
 157     if i != -1:
 158       # Shorter text is inside the longer text (speedup).
 159       diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
 160                (self.DIFF_INSERT, longtext[i + len(shorttext):])]
 161       # Swap insertions for deletions if diff is reversed.
 162       if len(text1) > len(text2):
 163         diffs[0] = (self.DIFF_DELETE, diffs[0][1])
 164         diffs[2] = (self.DIFF_DELETE, diffs[2][1])
 165       return diffs
 166     longtext = shorttext = None  # Garbage collect.
 167
 168     # Check to see if the problem can be split in two.
 169     hm = self.diff_halfMatch(text1, text2)
 170     if hm:
 171       # A half-match was found, sort out the return data.
 172       (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
 173       # Send both pairs off for separate processing.
 174       diffs_a = self.diff_main(text1_a, text2_a, checklines)
 175       diffs_b = self.diff_main(text1_b, text2_b, checklines)
 176       # Merge the results.
 177       return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
 178
 179     # Perform a real diff.
 180     if checklines and (len(text1) < 100 or len(text2) < 100):
 181       checklines = False  # Too trivial for the overhead.
 182     if checklines:
 183       # Scan the text on a line-by-line basis first.
 184       (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
 185
 186     diffs = self.diff_map(text1, text2)
 187     if not diffs:  # No acceptable result.
 188       diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
 189     if checklines:
 190       # Convert the diff back to original text.
 191       self.diff_charsToLines(diffs, linearray)
 192       # Eliminate freak matches (e.g. blank lines)
 193       self.diff_cleanupSemantic(diffs)
 194
 195       # Rediff any replacement blocks, this time character-by-character.
 196       # Add a dummy entry at the end.
 197       diffs.append((self.DIFF_EQUAL, ''))
 198       pointer = 0
 199       count_delete = 0
 200       count_insert = 0
 201       text_delete = ''
 202       text_insert = ''
 203       while pointer < len(diffs):
 204         if diffs[pointer][0] == self.DIFF_INSERT:
 205           count_insert += 1
 206           text_insert += diffs[pointer][1]
 207         elif diffs[pointer][0] == self.DIFF_DELETE:
 208           count_delete += 1
 209           text_delete += diffs[pointer][1]
 210         elif diffs[pointer][0] == self.DIFF_EQUAL:
 211           # Upon reaching an equality, check for prior redundancies.
 212           if count_delete >= 1 and count_insert >= 1:
 213             # Delete the offending records and add the merged ones.
 214             a = self.diff_main(text_delete, text_insert, False)
 215             diffs[pointer - count_delete - count_insert : pointer] = a
 216             pointer = pointer - count_delete - count_insert + len(a)
 217           count_insert = 0
 218           count_delete = 0
 219           text_delete = ''
 220           text_insert = ''
 221
 222         pointer += 1
 223
 224       diffs.pop()  # Remove the dummy entry at the end.
 225     return diffs
 226
 227   def diff_linesToChars(self, text1, text2):
 228     """Split two texts into an array of strings.  Reduce the texts to a string
 229     of hashes where each Unicode character represents one line.
 230
 231     Args:
 232       text1: First string.
 233       text2: Second string.
 234
 235     Returns:
 236       Three element tuple, containing the encoded text1, the encoded text2 and
 237       the array of unique strings.  The zeroth element of the array of unique
 238       strings is intentionally blank.
 239     """
 240     lineArray = []  # e.g. lineArray[4] == "Hello\n"
 241     lineHash = {}   # e.g. lineHash["Hello\n"] == 4
 242
 243     # "\x00" is a valid character, but various debuggers don't like it.
 244     # So we'll insert a junk entry to avoid generating a null character.
 245     lineArray.append('')
 246
 247     def diff_linesToCharsMunge(text):
 248       """Split a text into an array of strings.  Reduce the texts to a string
 249       of hashes where each Unicode character represents one line.
 250       Modifies linearray and linehash through being a closure.
 251
 252       Args:
 253         text: String to encode.
 254
 255       Returns:
 256         Encoded string.
 257       """
 258       chars = []
 259       # Walk the text, pulling out a substring for each line.
 260       # text.split('\n') would would temporarily double our memory footprint.
 261       # Modifying text would create many large strings to garbage collect.
 262       lineStart = 0
 263       lineEnd = -1
 264       while lineEnd < len(text) - 1:
 265         lineEnd = text.find('\n', lineStart)
 266         if lineEnd == -1:
 267           lineEnd = len(text) - 1
 268         line = text[lineStart:lineEnd + 1]
 269         lineStart = lineEnd + 1
 270
 271         if line in lineHash:
 272           chars.append(unichr(lineHash[line]))
 273         else:
 274           lineArray.append(line)
 275           lineHash[line] = len(lineArray) - 1
 276           chars.append(unichr(len(lineArray) - 1))
 277       return "".join(chars)
 278
 279     chars1 = diff_linesToCharsMunge(text1)
 280     chars2 = diff_linesToCharsMunge(text2)
 281     return (chars1, chars2, lineArray)
 282
 283   def diff_charsToLines(self, diffs, lineArray):
 284     """Rehydrate the text in a diff from a string of line hashes to real lines
 285     of text.
 286
 287     Args:
 288       diffs: Array of diff tuples.
 289       lineArray: Array of unique strings.
 290     """
 291     for x in xrange(len(diffs)):
 292       text = []
 293       for char in diffs[x][1]:
 294         text.append(lineArray[ord(char)])
 295       diffs[x] = (diffs[x][0], "".join(text))
 296
 297   def diff_map(self, text1, text2):
 298     """Explore the intersection points between the two texts.
 299
 300     Args:
 301       text1: Old string to be diffed.
 302       text2: New string to be diffed.
 303
 304     Returns:
 305       Array of diff tuples or None if no diff available.
 306     """
 307
 308     # Unlike in most languages, Python counts time in seconds.
 309     s_end = time.time() + self.Diff_Timeout  # Don't run for too long.
 310     # Cache the text lengths to prevent multiple calls.
 311     text1_length = len(text1)
 312     text2_length = len(text2)
 313     max_d = text1_length + text2_length - 1
 314     doubleEnd = self.Diff_DualThreshold * 2 < max_d
 315     # Python efficiency note: (x << 32) + y is the fastest way to combine
 316     # x and y into a single hashable value.  Tested in Python 2.5.
 317     # It is unclear why it is faster for v_map[d] to be indexed with an
 318     # integer whereas footsteps is indexed with a string.
 319     v_map1 = []
 320     v_map2 = []
 321     v1 = {}
 322     v2 = {}
 323     v1[1] = 0
 324     v2[1] = 0
 325     footsteps = {}
 326     done = False
 327     # If the total number of characters is odd, then the front path will
 328     # collide with the reverse path.
 329     front = (text1_length + text2_length) % 2
 330     for d in xrange(max_d):
 331       # Bail out if timeout reached.
 332       if self.Diff_Timeout > 0 and time.time() > s_end:
 333         return None
 334
 335       # Walk the front path one step.
 336       v_map1.append({})
 337       for k in xrange(-d, d + 1, 2):
 338         if k == -d or k != d and v1[k - 1] < v1[k + 1]:
 339           x = v1[k + 1]
 340         else:
 341           x = v1[k - 1] + 1
 342         y = x - k
 343         if doubleEnd:
 344           footstep = str((x << 32) + y)
 345           if front and footstep in footsteps:
 346             done = True
 347           if not front:
 348             footsteps[footstep] = d
 349
 350         while (not done and x < text1_length and y < text2_length and
 351                text1[x] == text2[y]):
 352           x += 1
 353           y += 1
 354           if doubleEnd:
 355             footstep = str((x << 32) + y)
 356             if front and footstep in footsteps:
 357               done = True
 358             if not front:
 359               footsteps[footstep] = d
 360
 361         v1[k] = x
 362         v_map1[d][(x << 32) + y] = True
 363         if x == text1_length and y == text2_length:
 364           # Reached the end in single-path mode.
 365           return self.diff_path1(v_map1, text1, text2)
 366         elif done:
 367           # Front path ran over reverse path.
 368           v_map2 = v_map2[:footsteps[footstep] + 1]
 369           a = self.diff_path1(v_map1, text1[:x], text2[:y])
 370           b = self.diff_path2(v_map2, text1[x:], text2[y:])
 371           return a + b
 372
 373       if doubleEnd:
 374         # Walk the reverse path one step.
 375         v_map2.append({})
 376         for k in xrange(-d, d + 1, 2):
 377           if k == -d or k != d and v2[k - 1] < v2[k + 1]:
 378             x = v2[k + 1]
 379           else:
 380             x = v2[k - 1] + 1
 381           y = x - k
 382           footstep = str((text1_length - x << 32) + text2_length - y)
 383           if not front and footstep in footsteps:
 384             done = True
 385           if front:
 386             footsteps[footstep] = d
 387           while (not done and x < text1_length and y < text2_length and
 388                  text1[-x - 1] == text2[-y - 1]):
 389             x += 1
 390             y += 1
 391             footstep = str((text1_length - x << 32) + text2_length - y)
 392             if not front and footstep in footsteps:
 393               done = True
 394             if front:
 395               footsteps[footstep] = d
 396
 397           v2[k] = x
 398           v_map2[d][(x << 32) + y] = True
 399           if done:
 400             # Reverse path ran over front path.
 401             v_map1 = v_map1[:footsteps[footstep] + 1]
 402             a = self.diff_path1(v_map1, text1[:text1_length - x],
 403                                 text2[:text2_length - y])
 404             b = self.diff_path2(v_map2, text1[text1_length - x:],
 405                                 text2[text2_length - y:])
 406             return a + b
 407
 408     # Number of diffs equals number of characters, no commonality at all.
 409     return None
 410
 411   def diff_path1(self, v_map, text1, text2):
 412     """Work from the middle back to the start to determine the path.
 413
 414     Args:
 415       v_map: Array of paths.
 416       text1: Old string fragment to be diffed.
 417       text2: New string fragment to be diffed.
 418
 419     Returns:
 420       Array of diff tuples.
 421     """
 422     path = []
 423     x = len(text1)
 424     y = len(text2)
 425     last_op = None
 426     for d in xrange(len(v_map) - 2, -1, -1):
 427       while True:
 428         if (x - 1 << 32) + y in v_map[d]:
 429           x -= 1
 430           if last_op == self.DIFF_DELETE:
 431             path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
 432           else:
 433             path[:0] = [(self.DIFF_DELETE, text1[x])]
 434           last_op = self.DIFF_DELETE
 435           break
 436         elif (x << 32) + y - 1 in v_map[d]:
 437           y -= 1
 438           if last_op == self.DIFF_INSERT:
 439             path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
 440           else:
 441             path[:0] = [(self.DIFF_INSERT, text2[y])]
 442           last_op = self.DIFF_INSERT
 443           break
 444         else:
 445           x -= 1
 446           y -= 1
 447           assert text1[x] == text2[y], ("No diagonal.  " +
 448               "Can't happen. (diff_path1)")
 449           if last_op == self.DIFF_EQUAL:
 450             path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
 451           else:
 452             path[:0] = [(self.DIFF_EQUAL, text1[x])]
 453           last_op = self.DIFF_EQUAL
 454     return path
 455
 456   def diff_path2(self, v_map, text1, text2):
 457     """Work from the middle back to the end to determine the path.
 458
 459     Args:
 460       v_map: Array of paths.
 461       text1: Old string fragment to be diffed.
 462       text2: New string fragment to be diffed.
 463
 464     Returns:
 465       Array of diff tuples.
 466     """
 467     path = []
 468     x = len(text1)
 469     y = len(text2)
 470     last_op = None
 471     for d in xrange(len(v_map) - 2, -1, -1):
 472       while True:
 473         if (x - 1 << 32) + y in v_map[d]:
 474           x -= 1
 475           if last_op == self.DIFF_DELETE:
 476             path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
 477           else:
 478             path.append((self.DIFF_DELETE, text1[-x - 1]))
 479           last_op = self.DIFF_DELETE
 480           break
 481         elif (x << 32) + y - 1 in v_map[d]:
 482           y -= 1
 483           if last_op == self.DIFF_INSERT:
 484             path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
 485           else:
 486             path.append((self.DIFF_INSERT, text2[-y - 1]))
 487           last_op = self.DIFF_INSERT
 488           break
 489         else:
 490           x -= 1
 491           y -= 1
 492           assert text1[-x - 1] == text2[-y - 1], ("No diagonal.  " +
 493               "Can't happen. (diff_path2)")
 494           if last_op == self.DIFF_EQUAL:
 495             path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
 496           else:
 497             path.append((self.DIFF_EQUAL, text1[-x - 1]))
 498           last_op = self.DIFF_EQUAL
 499     return path
 500
 501   def diff_commonPrefix(self, text1, text2):
 502     """Determine the common prefix of two strings.
 503
 504     Args:
 505       text1: First string.
 506       text2: Second string.
 507
 508     Returns:
 509       The number of characters common to the start of each string.
 510     """
 511     # Quick check for common null cases.
 512     if not text1 or not text2 or text1[0] != text2[0]:
 513       return 0
 514     # Binary search.
 515     # Performance analysis: http://neil.fraser.name/news/2007/10/09/
 516     pointermin = 0
 517     pointermax = min(len(text1), len(text2))
 518     pointermid = pointermax
 519     pointerstart = 0
 520     while pointermin < pointermid:
 521       if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
 522         pointermin = pointermid
 523         pointerstart = pointermin
 524       else:
 525         pointermax = pointermid
 526       pointermid = int((pointermax - pointermin) / 2 + pointermin)
 527     return pointermid
 528
 529   def diff_commonSuffix(self, text1, text2):
 530     """Determine the common suffix of two strings.
 531
 532     Args:
 533       text1: First string.
 534       text2: Second string.
 535
 536     Returns:
 537       The number of characters common to the end of each string.
 538     """
 539     # Quick check for common null cases.
 540     if not text1 or not text2 or text1[-1] != text2[-1]:
 541       return 0
 542     # Binary search.
 543     # Performance analysis: http://neil.fraser.name/news/2007/10/09/
 544     pointermin = 0
 545     pointermax = min(len(text1), len(text2))
 546     pointermid = pointermax
 547     pointerend = 0
 548     while pointermin < pointermid:
 549       if (text1[-pointermid:len(text1) - pointerend] ==
 550           text2[-pointermid:len(text2) - pointerend]):
 551         pointermin = pointermid
 552         pointerend = pointermin
 553       else:
 554         pointermax = pointermid
 555       pointermid = int((pointermax - pointermin) / 2 + pointermin)
 556     return pointermid
 557
 558   def diff_halfMatch(self, text1, text2):
 559     """Do the two texts share a substring which is at least half the length of
 560     the longer text?
 561
 562     Args:
 563       text1: First string.
 564       text2: Second string.
 565
 566     Returns:
 567       Five element Array, containing the prefix of text1, the suffix of text1,
 568       the prefix of text2, the suffix of text2 and the common middle.  Or None
 569       if there was no match.
 570     """
 571     if len(text1) > len(text2):
 572       (longtext, shorttext) = (text1, text2)
 573     else:
 574       (shorttext, longtext) = (text1, text2)
 575     if len(longtext) < 10 or len(shorttext) < 1:
 576       return None  # Pointless.
 577
 578     def diff_halfMatchI(longtext, shorttext, i):
 579       """Does a substring of shorttext exist within longtext such that the
 580       substring is at least half the length of longtext?
 581       Closure, but does not reference any external variables.
 582
 583       Args:
 584         longtext: Longer string.
 585         shorttext: Shorter string.
 586         i: Start index of quarter length substring within longtext.
 587
 588       Returns:
 589         Five element Array, containing the prefix of longtext, the suffix of
 590         longtext, the prefix of shorttext, the suffix of shorttext and the
 591         common middle.  Or None if there was no match.
 592       """
 593       seed = longtext[i:i + len(longtext) / 4]
 594       best_common = ''
 595       j = shorttext.find(seed)
 596       while j != -1:
 597         prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
 598         suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
 599         if len(best_common) < suffixLength + prefixLength:
 600           best_common = (shorttext[j - suffixLength:j] +
 601               shorttext[j:j + prefixLength])
 602           best_longtext_a = longtext[:i - suffixLength]
 603           best_longtext_b = longtext[i + prefixLength:]
 604           best_shorttext_a = shorttext[:j - suffixLength]
 605           best_shorttext_b = shorttext[j + prefixLength:]
 606         j = shorttext.find(seed, j + 1)
 607
 608       if len(best_common) >= len(longtext) / 2:
 609         return (best_longtext_a, best_longtext_b,
 610                 best_shorttext_a, best_shorttext_b, best_common)
 611       else:
 612         return None
 613
 614     # First check if the second quarter is the seed for a half-match.
 615     hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
 616     # Check again based on the third quarter.
 617     hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
 618     if not hm1 and not hm2:
 619       return None
 620     elif not hm2:
 621       hm = hm1
 622     elif not hm1:
 623       hm = hm2
 624     else:
 625       # Both matched.  Select the longest.
 626       if len(hm1[4]) > len(hm2[4]):
 627         hm = hm1
 628       else:
 629         hm = hm2
 630
 631     # A half-match was found, sort out the return data.
 632     if len(text1) > len(text2):
 633       (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
 634     else:
 635       (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
 636     return (text1_a, text1_b, text2_a, text2_b, mid_common)
 637
 638   def diff_cleanupSemantic(self, diffs):
 639     """Reduce the number of edits by eliminating semantically trivial
 640     equalities.
 641
 642     Args:
 643       diffs: Array of diff tuples.
 644     """
 645     changes = False
 646     equalities = []  # Stack of indices where equalities are found.
 647     lastequality = None  # Always equal to equalities[-1][1]
 648     pointer = 0  # Index of current position.
 649     length_changes1 = 0  # Number of chars that changed prior to the equality.
 650     length_changes2 = 0  # Number of chars that changed after the equality.
 651     while pointer < len(diffs):
 652       if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
 653         equalities.append(pointer)
 654         length_changes1 = length_changes2
 655         length_changes2 = 0
 656         lastequality = diffs[pointer][1]
 657       else:  # an insertion or deletion
 658         length_changes2 += len(diffs[pointer][1])
 659         if (lastequality != None and (len(lastequality) <= length_changes1) and
 660             (len(lastequality) <= length_changes2)):
 661           # Duplicate record
 662           diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
 663           # Change second copy to insert.
 664           diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
 665               diffs[equalities[-1] + 1][1])
 666           # Throw away the equality we just deleted.
 667           equalities.pop()
 668           # Throw away the previous equality (it needs to be reevaluated).
 669           if len(equalities) != 0:
 670             equalities.pop()
 671           if len(equalities):
 672             pointer = equalities[-1]
 673           else:
 674             pointer = -1
 675           length_changes1 = 0  # Reset the counters.
 676           length_changes2 = 0
 677           lastequality = None
 678           changes = True
 679       pointer += 1
 680
 681     if changes:
 682       self.diff_cleanupMerge(diffs)
 683
 684     self.diff_cleanupSemanticLossless(diffs)
 685
 686   def diff_cleanupSemanticLossless(self, diffs):
 687     """Look for single edits surrounded on both sides by equalities
 688     which can be shifted sideways to align the edit to a word boundary.
 689     e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
 690
 691     Args:
 692       diffs: Array of diff tuples.
 693     """
 694
 695     def diff_cleanupSemanticScore(one, two):
 696       """Given two strings, compute a score representing whether the
 697       internal boundary falls on logical boundaries.
 698       Scores range from 5 (best) to 0 (worst).
 699       Closure, but does not reference any external variables.
 700
 701       Args:
 702         one: First string.
 703         two: Second string.
 704
 705       Returns:
 706         The score.
 707       """
 708       if not one or not two:
 709         # Edges are the best.
 710         return 5
 711
 712       # Each port of this function behaves slightly differently due to
 713       # subtle differences in each language's definition of things like
 714       # 'whitespace'.  Since this function's purpose is largely cosmetic,
 715       # the choice has been made to use each language's native features
 716       # rather than force total conformity.
 717       score = 0
 718       # One point for non-alphanumeric.
 719       if not one[-1].isalnum() or not two[0].isalnum():
 720         score += 1
 721         # Two points for whitespace.
 722         if one[-1].isspace() or two[0].isspace():
 723           score += 1
 724           # Three points for line breaks.
 725           if (one[-1] == "\r" or one[-1] == "\n" or
 726               two[0] == "\r" or two[0] == "\n"):
 727             score += 1
 728             # Four points for blank lines.
 729             if (re.search("\\n\\r?\\n$", one) or
 730                 re.match("^\\r?\\n\\r?\\n", two)):
 731               score += 1
 732       return score
 733
 734     pointer = 1
 735     # Intentionally ignore the first and last element (don't need checking).
 736     while pointer < len(diffs) - 1:
 737       if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
 738           diffs[pointer + 1][0] == self.DIFF_EQUAL):
 739         # This is a single edit surrounded by equalities.
 740         equality1 = diffs[pointer - 1][1]
 741         edit = diffs[pointer][1]
 742         equality2 = diffs[pointer + 1][1]
 743
 744         # First, shift the edit as far left as possible.
 745         commonOffset = self.diff_commonSuffix(equality1, edit)
 746         if commonOffset:
 747           commonString = edit[-commonOffset:]
 748           equality1 = equality1[:-commonOffset]
 749           edit = commonString + edit[:-commonOffset]
 750           equality2 = commonString + equality2
 751
 752         # Second, step character by character right, looking for the best fit.
 753         bestEquality1 = equality1
 754         bestEdit = edit
 755         bestEquality2 = equality2
 756         bestScore = (diff_cleanupSemanticScore(equality1, edit) +
 757             diff_cleanupSemanticScore(edit, equality2))
 758         while edit and equality2 and edit[0] == equality2[0]:
 759           equality1 += edit[0]
 760           edit = edit[1:] + equality2[0]
 761           equality2 = equality2[1:]
 762           score = (diff_cleanupSemanticScore(equality1, edit) +
 763               diff_cleanupSemanticScore(edit, equality2))
 764           # The >= encourages trailing rather than leading whitespace on edits.
 765           if score >= bestScore:
 766             bestScore = score
 767             bestEquality1 = equality1
 768             bestEdit = edit
 769             bestEquality2 = equality2
 770
 771         if diffs[pointer - 1][1] != bestEquality1:
 772           # We have an improvement, save it back to the diff.
 773           if bestEquality1:
 774             diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
 775           else:
 776             del diffs[pointer - 1]
 777             pointer -= 1
 778           diffs[pointer] = (diffs[pointer][0], bestEdit)
 779           if bestEquality2:
 780             diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
 781           else:
 782             del diffs[pointer + 1]
 783             pointer -= 1
 784       pointer += 1
 785
 786   def diff_cleanupEfficiency(self, diffs):
 787     """Reduce the number of edits by eliminating operationally trivial
 788     equalities.
 789
 790     Args:
 791       diffs: Array of diff tuples.
 792     """
 793     changes = False
 794     equalities = []  # Stack of indices where equalities are found.
 795     lastequality = ''  # Always equal to equalities[-1][1]
 796     pointer = 0  # Index of current position.
 797     pre_ins = False  # Is there an insertion operation before the last equality.
 798     pre_del = False  # Is there a deletion operation before the last equality.
 799     post_ins = False  # Is there an insertion operation after the last equality.
 800     post_del = False  # Is there a deletion operation after the last equality.
 801     while pointer < len(diffs):
 802       if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
 803         if (len(diffs[pointer][1]) < self.Diff_EditCost and
 804             (post_ins or post_del)):
 805           # Candidate found.
 806           equalities.append(pointer)
 807           pre_ins = post_ins
 808           pre_del = post_del
 809           lastequality = diffs[pointer][1]
 810         else:
 811           # Not a candidate, and can never become one.
 812           equalities = []
 813           lastequality = ''
 814
 815         post_ins = post_del = False
 816       else:  # an insertion or deletion
 817         if diffs[pointer][0] == self.DIFF_DELETE:
 818           post_del = True
 819         else:
 820           post_ins = True
 821
 822         # Five types to be split:
 823         # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
 824         # <ins>A</ins>X<ins>C</ins><del>D</del>
 825         # <ins>A</ins><del>B</del>X<ins>C</ins>
 826         # <ins>A</del>X<ins>C</ins><del>D</del>
 827         # <ins>A</ins><del>B</del>X<del>C</del>
 828
 829         if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
 830                              ((len(lastequality) < self.Diff_EditCost / 2) and
 831                               (pre_ins + pre_del + post_ins + post_del) == 3)):
 832           # Duplicate record
 833           diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
 834           # Change second copy to insert.
 835           diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
 836               diffs[equalities[-1] + 1][1])
 837           equalities.pop()  # Throw away the equality we just deleted
 838           lastequality = ''
 839           if pre_ins and pre_del:
 840             # No changes made which could affect previous entry, keep going.
 841             post_ins = post_del = True
 842             equalities = []
 843           else:
 844             if len(equalities):
 845               equalities.pop()  # Throw away the previous equality
 846             if len(equalities):
 847               pointer = equalities[-1]
 848             else:
 849               pointer = -1
 850             post_ins = post_del = False
 851           changes = True
 852       pointer += 1
 853
 854     if changes:
 855       self.diff_cleanupMerge(diffs)
 856
 857   def diff_cleanupMerge(self, diffs):
 858     """Reorder and merge like edit sections.  Merge equalities.
 859     Any edit section can move as long as it doesn't cross an equality.
 860
 861     Args:
 862       diffs: Array of diff tuples.
 863     """
 864     diffs.append((self.DIFF_EQUAL, ''))  # Add a dummy entry at the end.
 865     pointer = 0
 866     count_delete = 0
 867     count_insert = 0
 868     text_delete = ''
 869     text_insert = ''
 870     while pointer < len(diffs):
 871       if diffs[pointer][0] == self.DIFF_INSERT:
 872         count_insert += 1
 873         text_insert += diffs[pointer][1]
 874         pointer += 1
 875       elif diffs[pointer][0] == self.DIFF_DELETE:
 876         count_delete += 1
 877         text_delete += diffs[pointer][1]
 878         pointer += 1
 879       elif diffs[pointer][0] == self.DIFF_EQUAL:
 880         # Upon reaching an equality, check for prior redundancies.
 881         if count_delete != 0 or count_insert != 0:
 882           if count_delete != 0 and count_insert != 0:
 883             # Factor out any common prefixies.
 884             commonlength = self.diff_commonPrefix(text_insert, text_delete)
 885             if commonlength != 0:
 886               x = pointer - count_delete - count_insert - 1
 887               if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
 888                 diffs[x] = (diffs[x][0], diffs[x][1] +
 889                             text_insert[:commonlength])
 890               else:
 891                 diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
 892                 pointer += 1
 893               text_insert = text_insert[commonlength:]
 894               text_delete = text_delete[commonlength:]
 895             # Factor out any common suffixies.
 896             commonlength = self.diff_commonSuffix(text_insert, text_delete)
 897             if commonlength != 0:
 898               diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
 899                   diffs[pointer][1])
 900               text_insert = text_insert[:-commonlength]
 901               text_delete = text_delete[:-commonlength]
 902           # Delete the offending records and add the merged ones.
 903           if count_delete == 0:
 904             diffs[pointer - count_insert : pointer] = [
 905                 (self.DIFF_INSERT, text_insert)]
 906           elif count_insert == 0:
 907             diffs[pointer - count_delete : pointer] = [
 908                 (self.DIFF_DELETE, text_delete)]
 909           else:
 910             diffs[pointer - count_delete - count_insert : pointer] = [
 911                 (self.DIFF_DELETE, text_delete),
 912                 (self.DIFF_INSERT, text_insert)]
 913           pointer = pointer - count_delete - count_insert + 1
 914           if count_delete != 0:
 915             pointer += 1
 916           if count_insert != 0:
 917             pointer += 1
 918         elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
 919           # Merge this equality with the previous one.
 920           diffs[pointer - 1] = (diffs[pointer - 1][0],
 921                                 diffs[pointer - 1][1] + diffs[pointer][1])
 922           del diffs[pointer]
 923         else:
 924           pointer += 1
 925
 926         count_insert = 0
 927         count_delete = 0
 928         text_delete = ''
 929         text_insert = ''
 930
 931     if diffs[-1][1] == '':
 932       diffs.pop()  # Remove the dummy entry at the end.
 933
 934     # Second pass: look for single edits surrounded on both sides by equalities
 935     # which can be shifted sideways to eliminate an equality.
 936     # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
 937     changes = False
 938     pointer = 1
 939     # Intentionally ignore the first and last element (don't need checking).
 940     while pointer < len(diffs) - 1:
 941       if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
 942           diffs[pointer + 1][0] == self.DIFF_EQUAL):
 943         # This is a single edit surrounded by equalities.
 944         if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
 945           # Shift the edit over the previous equality.
 946           diffs[pointer] = (diffs[pointer][0],
 947               diffs[pointer - 1][1] +
 948               diffs[pointer][1][:-len(diffs[pointer - 1][1])])
 949           diffs[pointer + 1] = (diffs[pointer + 1][0],
 950                                 diffs[pointer - 1][1] + diffs[pointer + 1][1])
 951           del diffs[pointer - 1]
 952           changes = True
 953         elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
 954           # Shift the edit over the next equality.
 955           diffs[pointer - 1] = (diffs[pointer - 1][0],
 956                                 diffs[pointer - 1][1] + diffs[pointer + 1][1])
 957           diffs[pointer] = (diffs[pointer][0],
 958               diffs[pointer][1][len(diffs[pointer + 1][1]):] +
 959               diffs[pointer + 1][1])
 960           del diffs[pointer + 1]
 961           changes = True
 962       pointer += 1
 963
 964     # If shifts were made, the diff needs reordering and another shift sweep.
 965     if changes:
 966       self.diff_cleanupMerge(diffs)
 967
 968   def diff_xIndex(self, diffs, loc):
 969     """loc is a location in text1, compute and return the equivalent location
 970     in text2.  e.g. "The cat" vs "The big cat", 1->1, 5->8
 971
 972     Args:
 973       diffs: Array of diff tuples.
 974       loc: Location within text1.
 975
 976     Returns:
 977       Location within text2.
 978     """
 979     chars1 = 0
 980     chars2 = 0
 981     last_chars1 = 0
 982     last_chars2 = 0
 983     for x in xrange(len(diffs)):
 984       (op, text) = diffs[x]
 985       if op != self.DIFF_INSERT:  # Equality or deletion.
 986         chars1 += len(text)
 987       if op != self.DIFF_DELETE:  # Equality or insertion.
 988         chars2 += len(text)
 989       if chars1 > loc:  # Overshot the location.
 990         break
 991       last_chars1 = chars1
 992       last_chars2 = chars2
 993
 994     if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
 995       # The location was deleted.
 996       return last_chars2
 997     # Add the remaining len(character).
 998     return last_chars2 + (loc - last_chars1)
 999
1000   def diff_prettyHtml(self, diffs):
1001     """Convert a diff array into a pretty HTML report.
1002
1003     Args:
1004       diffs: Array of diff tuples.
1005
1006     Returns:
1007       HTML representation.
1008     """
1009     html = []
1010     i = 0
1011     for (op, data) in diffs:
1012       text = (data.replace("&", "&amp;").replace("<", "&lt;")
1013                  .replace(">", "&gt;").replace("\n", "&para;<BR>"))
1014       if op == self.DIFF_INSERT:
1015         html.append("<INS STYLE=\"background:#E6FFE6;\" TITLE=\"i=%i\">%s</INS>"
1016             % (i, text))
1017       elif op == self.DIFF_DELETE:
1018         html.append("<DEL STYLE=\"background:#FFE6E6;\" TITLE=\"i=%i\">%s</DEL>"
1019             % (i, text))
1020       elif op == self.DIFF_EQUAL:
1021         html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
1022       if op != self.DIFF_DELETE:
1023         i += len(data)
1024     return "".join(html)
1025
1026   def diff_text1(self, diffs):
1027     """Compute and return the source text (all equalities and deletions).
1028
1029     Args:
1030       diffs: Array of diff tuples.
1031
1032     Returns:
1033       Source text.
1034     """
1035     text = []
1036     for (op, data) in diffs:
1037       if op != self.DIFF_INSERT:
1038         text.append(data)
1039     return "".join(text)
1040
1041   def diff_text2(self, diffs):
1042     """Compute and return the destination text (all equalities and insertions).
1043
1044     Args:
1045       diffs: Array of diff tuples.
1046
1047     Returns:
1048       Destination text.
1049     """
1050     text = []
1051     for (op, data) in diffs:
1052       if op != self.DIFF_DELETE:
1053         text.append(data)
1054     return "".join(text)
1055
1056   def diff_levenshtein(self, diffs):
1057     """Compute the Levenshtein distance; the number of inserted, deleted or
1058     substituted characters.
1059
1060     Args:
1061       diffs: Array of diff tuples.
1062
1063     Returns:
1064       Number of changes.
1065     """
1066     levenshtein = 0
1067     insertions = 0
1068     deletions = 0
1069     for (op, data) in diffs:
1070       if op == self.DIFF_INSERT:
1071         insertions += len(data)
1072       elif op == self.DIFF_DELETE:
1073         deletions += len(data)
1074       elif op == self.DIFF_EQUAL:
1075         # A deletion and an insertion is one substitution.
1076         levenshtein += max(insertions, deletions)
1077         insertions = 0
1078         deletions = 0
1079     levenshtein += max(insertions, deletions)
1080     return levenshtein
1081
1082   def diff_toDelta(self, diffs):
1083     """Crush the diff into an encoded string which describes the operations
1084     required to transform text1 into text2.
1085     E.g. =3\t-2\t+ing  -> Keep 3 chars, delete 2 chars, insert 'ing'.
1086     Operations are tab-separated.  Inserted text is escaped using %xx notation.
1087
1088     Args:
1089       diffs: Array of diff tuples.
1090
1091     Returns:
1092       Delta text.
1093     """
1094     text = []
1095     for (op, data) in diffs:
1096       if op == self.DIFF_INSERT:
1097         # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
1098         data = data.encode("utf-8")
1099         text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1100       elif op == self.DIFF_DELETE:
1101         text.append("-%d" % len(data))
1102       elif op == self.DIFF_EQUAL:
1103         text.append("=%d" % len(data))
1104     return "\t".join(text)
1105
1106   def diff_fromDelta(self, text1, delta):
1107     """Given the original text1, and an encoded string which describes the
1108     operations required to transform text1 into text2, compute the full diff.
1109
1110     Args:
1111       text1: Source string for the diff.
1112       delta: Delta text.
1113
1114     Returns:
1115       Array of diff tuples.
1116
1117     Raises:
1118       ValueError: If invalid input.
1119     """
1120     if type(delta) == unicode:
1121       # Deltas should be composed of a subset of ascii chars, Unicode not
1122       # required.  If this encode raises UnicodeEncodeError, delta is invalid.
1123       delta = delta.encode("ascii")
1124     diffs = []
1125     pointer = 0  # Cursor in text1
1126     tokens = delta.split("\t")
1127     for token in tokens:
1128       if token == "":
1129         # Blank tokens are ok (from a trailing \t).
1130         continue
1131       # Each token begins with a one character parameter which specifies the
1132       # operation of this token (delete, insert, equality).
1133       param = token[1:]
1134       if token[0] == "+":
1135         param = urllib.unquote(param).decode("utf-8")
1136         diffs.append((self.DIFF_INSERT, param))
1137       elif token[0] == "-" or token[0] == "=":
1138         try:
1139           n = int(param)
1140         except ValueError:
1141           raise ValueError("Invalid number in diff_fromDelta: " + param)
1142         if n < 0:
1143           raise ValueError("Negative number in diff_fromDelta: " + param)
1144         text = text1[pointer : pointer + n]
1145         pointer += n
1146         if token[0] == "=":
1147           diffs.append((self.DIFF_EQUAL, text))
1148         else:
1149           diffs.append((self.DIFF_DELETE, text))
1150       else:
1151         # Anything else is an error.
1152         raise ValueError("Invalid diff operation in diff_fromDelta: " +
1153             token[0])
1154     if pointer != len(text1):
1155       raise ValueError(
1156           "Delta length (%d) does not equal source text length (%d)." %
1157          (pointer, len(text1)))
1158     return diffs
1159
1160   #  MATCH FUNCTIONS
1161
1162   def match_main(self, text, pattern, loc):
1163     """Locate the best instance of 'pattern' in 'text' near 'loc'.
1164
1165     Args:
1166       text: The text to search.
1167       pattern: The pattern to search for.
1168       loc: The location to search around.
1169
1170     Returns:
1171       Best match index or -1.
1172     """
1173     # Check for null inputs.
1174     if text == None or pattern == None:
1175       raise ValueError("Null inputs. (match_main)")
1176
1177     loc = max(0, min(loc, len(text)))
1178     if text == pattern:
1179       # Shortcut (potentially not guaranteed by the algorithm)
1180       return 0
1181     elif not text:
1182       # Nothing to match.
1183       return -1
1184     elif text[loc:loc + len(pattern)] == pattern:
1185       # Perfect match at the perfect spot!  (Includes case of null pattern)
1186       return loc
1187     else:
1188       # Do a fuzzy compare.
1189       match = self.match_bitap(text, pattern, loc)
1190       return match
1191
1192   def match_bitap(self, text, pattern, loc):
1193     """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1194     Bitap algorithm.
1195
1196     Args:
1197       text: The text to search.
1198       pattern: The pattern to search for.
1199       loc: The location to search around.
1200
1201     Returns:
1202       Best match index or -1.
1203     """
1204     # Python doesn't have a maxint limit, so ignore this check.
1205     #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1206     #  raise ValueError("Pattern too long for this application.")
1207
1208     # Initialise the alphabet.
1209     s = self.match_alphabet(pattern)
1210
1211     def match_bitapScore(e, x):
1212       """Compute and return the score for a match with e errors and x location.
1213       Accesses loc and pattern through being a closure.
1214
1215       Args:
1216         e: Number of errors in match.
1217         x: Location of match.
1218
1219       Returns:
1220         Overall score for match (0.0 = good, 1.0 = bad).
1221       """
1222       accuracy = float(e) / len(pattern)
1223       proximity = abs(loc - x)
1224       if not self.Match_Distance:
1225         # Dodge divide by zero error.
1226         return proximity and 1.0 or accuracy
1227       return accuracy + (proximity / float(self.Match_Distance))
1228
1229     # Highest score beyond which we give up.
1230     score_threshold = self.Match_Threshold
1231     # Is there a nearby exact match? (speedup)
1232     best_loc = text.find(pattern, loc)
1233     if best_loc != -1:
1234       score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1235       # What about in the other direction? (speedup)
1236       best_loc = text.rfind(pattern, loc + len(pattern))
1237       if best_loc != -1:
1238         score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1239
1240     # Initialise the bit arrays.
1241     matchmask = 1 << (len(pattern) - 1)
1242     best_loc = -1
1243
1244     bin_max = len(pattern) + len(text)
1245     # Empty initialization added to appease pychecker.
1246     last_rd = None
1247     for d in xrange(len(pattern)):
1248       # Scan for the best match each iteration allows for one more error.
1249       # Run a binary search to determine how far from 'loc' we can stray at
1250       # this error level.
1251       bin_min = 0
1252       bin_mid = bin_max
1253       while bin_min < bin_mid:
1254         if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1255           bin_min = bin_mid
1256         else:
1257           bin_max = bin_mid
1258         bin_mid = (bin_max - bin_min) / 2 + bin_min
1259
1260       # Use the result from this iteration as the maximum for the next.
1261       bin_max = bin_mid
1262       start = max(1, loc - bin_mid + 1)
1263       finish = min(loc + bin_mid, len(text)) + len(pattern)
1264
1265       rd = range(finish + 1)
1266       rd.append((1 << d) - 1)
1267       for j in xrange(finish, start - 1, -1):
1268         if len(text) <= j - 1:
1269           # Out of range.
1270           charMatch = 0
1271         else:
1272           charMatch = s.get(text[j - 1], 0)
1273         if d == 0:  # First pass: exact match.
1274           rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1275         else:  # Subsequent passes: fuzzy match.
1276           rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
1277               ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1278         if rd[j] & matchmask:
1279           score = match_bitapScore(d, j - 1)
1280           # This match will almost certainly be better than any existing match.
1281           # But check anyway.
1282           if score <= score_threshold:
1283             # Told you so.
1284             score_threshold = score
1285             best_loc = j - 1
1286             if best_loc > loc:
1287               # When passing loc, don't exceed our current distance from loc.
1288               start = max(1, 2 * loc - best_loc)
1289             else:
1290               # Already passed loc, downhill from here on in.
1291               break
1292       # No hope for a (better) match at greater error levels.
1293       if match_bitapScore(d + 1, loc) > score_threshold:
1294         break
1295       last_rd = rd
1296     return best_loc
1297
1298   def match_alphabet(self, pattern):
1299     """Initialise the alphabet for the Bitap algorithm.
1300
1301     Args:
1302       pattern: The text to encode.
1303
1304     Returns:
1305       Hash of character locations.
1306     """
1307     s = {}
1308     for char in pattern:
1309       s[char] = 0
1310     for i in xrange(len(pattern)):
1311       s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1312     return s
1313
1314   #  PATCH FUNCTIONS
1315
1316   def patch_addContext(self, patch, text):
1317     """Increase the context until it is unique,
1318     but don't let the pattern expand beyond Match_MaxBits.
1319
1320     Args:
1321       patch: The patch to grow.
1322       text: Source text.
1323     """
1324     if len(text) == 0:
1325       return
1326     pattern = text[patch.start2 : patch.start2 + patch.length1]
1327     padding = 0
1328
1329     # Look for the first and last matches of pattern in text.  If two different
1330     # matches are found, increase the pattern length.
1331     while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1332         0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1333         self.Patch_Margin)):
1334       padding += self.Patch_Margin
1335       pattern = text[max(0, patch.start2 - padding) :
1336                      patch.start2 + patch.length1 + padding]
1337     # Add one chunk for good luck.
1338     padding += self.Patch_Margin
1339
1340     # Add the prefix.
1341     prefix = text[max(0, patch.start2 - padding) : patch.start2]
1342     if prefix:
1343       patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1344     # Add the suffix.
1345     suffix = text[patch.start2 + patch.length1 :
1346                   patch.start2 + patch.length1 + padding]
1347     if suffix:
1348       patch.diffs.append((self.DIFF_EQUAL, suffix))
1349
1350     # Roll back the start points.
1351     patch.start1 -= len(prefix)
1352     patch.start2 -= len(prefix)
1353     # Extend lengths.
1354     patch.length1 += len(prefix) + len(suffix)
1355     patch.length2 += len(prefix) + len(suffix)
1356
1357   def patch_make(self, a, b=None, c=None):
1358     """Compute a list of patches to turn text1 into text2.
1359     Use diffs if provided, otherwise compute it ourselves.
1360     There are four ways to call this function, depending on what data is
1361     available to the caller:
1362     Method 1:
1363     a = text1, b = text2
1364     Method 2:
1365     a = diffs
1366     Method 3 (optimal):
1367     a = text1, b = diffs
1368     Method 4 (deprecated, use method 3):
1369     a = text1, b = text2, c = diffs
1370
1371     Args:
1372       a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1373           text2 (method 2).
1374       b: text2 (methods 1,4) or Array of diff tuples for text1 to
1375           text2 (method 3) or undefined (method 2).
1376       c: Array of diff tuples for text1 to text2 (method 4) or
1377           undefined (methods 1,2,3).
1378
1379     Returns:
1380       Array of patch objects.
1381     """
1382     text1 = None
1383     diffs = None
1384     # Note that texts may arrive as 'str' or 'unicode'.
1385     if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
1386       # Method 1: text1, text2
1387       # Compute diffs from text1 and text2.
1388       text1 = a
1389       diffs = self.diff_main(text1, b, True)
1390       if len(diffs) > 2:
1391         self.diff_cleanupSemantic(diffs)
1392         self.diff_cleanupEfficiency(diffs)
1393     elif isinstance(a, list) and b is None and c is None:
1394       # Method 2: diffs
1395       # Compute text1 from diffs.
1396       diffs = a
1397       text1 = self.diff_text1(diffs)
1398     elif isinstance(a, basestring) and isinstance(b, list) and c is None:
1399       # Method 3: text1, diffs
1400       text1 = a
1401       diffs = b
1402     elif (isinstance(a, basestring) and isinstance(b, basestring) and
1403           isinstance(c, list)):
1404       # Method 4: text1, text2, diffs
1405       # text2 is not used.
1406       text1 = a
1407       diffs = c
1408     else:
1409       raise ValueError("Unknown call format to patch_make.")
1410
1411     if not diffs:
1412       return []  # Get rid of the None case.
1413     patches = []
1414     patch = patch_obj()
1415     char_count1 = 0  # Number of characters into the text1 string.
1416     char_count2 = 0  # Number of characters into the text2 string.
1417     prepatch_text = text1  # Recreate the patches to determine context info.
1418     postpatch_text = text1
1419     for x in xrange(len(diffs)):
1420       (diff_type, diff_text) = diffs[x]
1421       if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1422         # A new patch starts here.
1423         patch.start1 = char_count1
1424         patch.start2 = char_count2
1425       if diff_type == self.DIFF_INSERT:
1426         # Insertion
1427         patch.diffs.append(diffs[x])
1428         patch.length2 += len(diff_text)
1429         postpatch_text = (postpatch_text[:char_count2] + diff_text +
1430                           postpatch_text[char_count2:])
1431       elif diff_type == self.DIFF_DELETE:
1432         # Deletion.
1433         patch.length1 += len(diff_text)
1434         patch.diffs.append(diffs[x])
1435         postpatch_text = (postpatch_text[:char_count2] +
1436                           postpatch_text[char_count2 + len(diff_text):])
1437       elif (diff_type == self.DIFF_EQUAL and
1438             len(diff_text) <= 2 * self.Patch_Margin and
1439             len(patch.diffs) != 0 and len(diffs) != x + 1):
1440         # Small equality inside a patch.
1441         patch.diffs.append(diffs[x])
1442         patch.length1 += len(diff_text)
1443         patch.length2 += len(diff_text)
1444
1445       if (diff_type == self.DIFF_EQUAL and
1446           len(diff_text) >= 2 * self.Patch_Margin):
1447         # Time for a new patch.
1448         if len(patch.diffs) != 0:
1449           self.patch_addContext(patch, prepatch_text)
1450           patches.append(patch)
1451           patch = patch_obj()
1452           # Unlike Unidiff, our patch lists have a rolling context.
1453           # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1454           # Update prepatch text & pos to reflect the application of the
1455           # just completed patch.
1456           prepatch_text = postpatch_text
1457           char_count1 = char_count2
1458
1459       # Update the current character count.
1460       if diff_type != self.DIFF_INSERT:
1461         char_count1 += len(diff_text)
1462       if diff_type != self.DIFF_DELETE:
1463         char_count2 += len(diff_text)
1464
1465     # Pick up the leftover patch if not empty.
1466     if len(patch.diffs) != 0:
1467       self.patch_addContext(patch, prepatch_text)
1468       patches.append(patch)
1469     return patches
1470
1471   def patch_deepCopy(self, patches):
1472     """Given an array of patches, return another array that is identical.
1473
1474     Args:
1475       patches: Array of patch objects.
1476
1477     Returns:
1478       Array of patch objects.
1479     """
1480     patchesCopy = []
1481     for patch in patches:
1482       patchCopy = patch_obj()
1483       # No need to deep copy the tuples since they are immutable.
1484       patchCopy.diffs = patch.diffs[:]
1485       patchCopy.start1 = patch.start1
1486       patchCopy.start2 = patch.start2
1487       patchCopy.length1 = patch.length1
1488       patchCopy.length2 = patch.length2
1489       patchesCopy.append(patchCopy)
1490     return patchesCopy
1491
1492   def patch_apply(self, patches, text):
1493     """Merge a set of patches onto the text.  Return a patched text, as well
1494     as a list of true/false values indicating which patches were applied.
1495
1496     Args:
1497       patches: Array of patch objects.
1498       text: Old text.
1499
1500     Returns:
1501       Two element Array, containing the new text and an array of boolean values.
1502     """
1503     if not patches:
1504       return (text, [])
1505
1506     # Deep copy the patches so that no changes are made to originals.
1507     patches = self.patch_deepCopy(patches)
1508
1509     nullPadding = self.patch_addPadding(patches)
1510     text = nullPadding + text + nullPadding
1511     self.patch_splitMax(patches)
1512
1513     # delta keeps track of the offset between the expected and actual location
1514     # of the previous patch.  If there are patches expected at positions 10 and
1515     # 20, but the first patch was found at 12, delta is 2 and the second patch
1516     # has an effective expected position of 22.
1517     delta = 0
1518     results = []
1519     for patch in patches:
1520       expected_loc = patch.start2 + delta
1521       text1 = self.diff_text1(patch.diffs)
1522       end_loc = -1
1523       if len(text1) > self.Match_MaxBits:
1524         # patch_splitMax will only provide an oversized pattern in the case of
1525         # a monster delete.
1526         start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1527                                     expected_loc)
1528         if start_loc != -1:
1529           end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1530               expected_loc + len(text1) - self.Match_MaxBits)
1531           if end_loc == -1 or start_loc >= end_loc:
1532             # Can't find valid trailing context.  Drop this patch.
1533             start_loc = -1
1534       else:
1535         start_loc = self.match_main(text, text1, expected_loc)
1536       if start_loc == -1:
1537         # No match found.  :(
1538         results.append(False)
1539         # Subtract the delta for this failed patch from subsequent patches.
1540         delta -= patch.length2 - patch.length1
1541       else:
1542         # Found a match.  :)
1543         results.append(True)
1544         delta = start_loc - expected_loc
1545         if end_loc == -1:
1546           text2 = text[start_loc : start_loc + len(text1)]
1547         else:
1548           text2 = text[start_loc : end_loc + self.Match_MaxBits]
1549         if text1 == text2:
1550           # Perfect match, just shove the replacement text in.
1551           text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1552                       text[start_loc + len(text1):])
1553         else:
1554           # Imperfect match.
1555           # Run a diff to get a framework of equivalent indices.
1556           diffs = self.diff_main(text1, text2, False)
1557           if (len(text1) > self.Match_MaxBits and
1558               self.diff_levenshtein(diffs) / float(len(text1)) >
1559               self.Patch_DeleteThreshold):
1560             # The end points match, but the content is unacceptably bad.
1561             results[-1] = False
1562           else:
1563             self.diff_cleanupSemanticLossless(diffs)
1564             index1 = 0
1565             for (op, data) in patch.diffs:
1566               if op != self.DIFF_EQUAL:
1567                 index2 = self.diff_xIndex(diffs, index1)
1568               if op == self.DIFF_INSERT:  # Insertion
1569                 text = text[:start_loc + index2] + data + text[start_loc +
1570                                                                index2:]
1571               elif op == self.DIFF_DELETE:  # Deletion
1572                 text = text[:start_loc + index2] + text[start_loc +
1573                     self.diff_xIndex(diffs, index1 + len(data)):]
1574               if op != self.DIFF_DELETE:
1575                 index1 += len(data)
1576     # Strip the padding off.
1577     text = text[len(nullPadding):-len(nullPadding)]
1578     return (text, results)
1579
1580   def patch_addPadding(self, patches):
1581     """Add some padding on text start and end so that edges can match
1582     something.  Intended to be called only from within patch_apply.
1583
1584     Args:
1585       patches: Array of patch objects.
1586
1587     Returns:
1588       The padding string added to each side.
1589     """
1590     paddingLength = self.Patch_Margin
1591     nullPadding = ""
1592     for x in xrange(1, paddingLength + 1):
1593       nullPadding += chr(x)
1594
1595     # Bump all the patches forward.
1596     for patch in patches:
1597       patch.start1 += paddingLength
1598       patch.start2 += paddingLength
1599
1600     # Add some padding on start of first diff.
1601     patch = patches[0]
1602     diffs = patch.diffs
1603     if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1604       # Add nullPadding equality.
1605       diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1606       patch.start1 -= paddingLength  # Should be 0.
1607       patch.start2 -= paddingLength  # Should be 0.
1608       patch.length1 += paddingLength
1609       patch.length2 += paddingLength
1610     elif paddingLength > len(diffs[0][1]):
1611       # Grow first equality.
1612       extraLength = paddingLength - len(diffs[0][1])
1613       newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1614       diffs[0] = (diffs[0][0], newText)
1615       patch.start1 -= extraLength
1616       patch.start2 -= extraLength
1617       patch.length1 += extraLength
1618       patch.length2 += extraLength
1619
1620     # Add some padding on end of last diff.
1621     patch = patches[-1]
1622     diffs = patch.diffs
1623     if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1624       # Add nullPadding equality.
1625       diffs.append((self.DIFF_EQUAL, nullPadding))
1626       patch.length1 += paddingLength
1627       patch.length2 += paddingLength
1628     elif paddingLength > len(diffs[-1][1]):
1629       # Grow last equality.
1630       extraLength = paddingLength - len(diffs[-1][1])
1631       newText = diffs[-1][1] + nullPadding[:extraLength]
1632       diffs[-1] = (diffs[-1][0], newText)
1633       patch.length1 += extraLength
1634       patch.length2 += extraLength
1635
1636     return nullPadding
1637
1638   def patch_splitMax(self, patches):
1639     """Look through the patches and break up any which are longer than the
1640     maximum limit of the match algorithm.
1641
1642     Args:
1643       patches: Array of patch objects.
1644     """
1645     if self.Match_MaxBits == 0:
1646       return
1647     for x in xrange(len(patches)):
1648       if patches[x].length1 > self.Match_MaxBits:
1649         bigpatch = patches[x]
1650         # Remove the big old patch.
1651         del patches[x]
1652         x -= 1
1653         patch_size = self.Match_MaxBits
1654         start1 = bigpatch.start1
1655         start2 = bigpatch.start2
1656         precontext = ''
1657         while len(bigpatch.diffs) != 0:
1658           # Create one of several smaller patches.
1659           patch = patch_obj()
1660           empty = True
1661           patch.start1 = start1 - len(precontext)
1662           patch.start2 = start2 - len(precontext)
1663           if precontext:
1664             patch.length1 = patch.length2 = len(precontext)
1665             patch.diffs.append((self.DIFF_EQUAL, precontext))
1666
1667           while (len(bigpatch.diffs) != 0 and
1668                  patch.length1 < patch_size - self.Patch_Margin):
1669             (diff_type, diff_text) = bigpatch.diffs[0]
1670             if diff_type == self.DIFF_INSERT:
1671               # Insertions are harmless.
1672               patch.length2 += len(diff_text)
1673               start2 += len(diff_text)
1674               patch.diffs.append(bigpatch.diffs.pop(0))
1675               empty = False
1676             elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1677                 patch.diffs[0][0] == self.DIFF_EQUAL and
1678                 len(diff_text) > 2 * patch_size):
1679               # This is a large deletion.  Let it pass in one chunk.
1680               patch.length1 += len(diff_text)
1681               start1 += len(diff_text)
1682               empty = False
1683               patch.diffs.append((diff_type, diff_text))
1684               del bigpatch.diffs[0]
1685             else:
1686               # Deletion or equality.  Only take as much as we can stomach.
1687               diff_text = diff_text[:patch_size - patch.length1 -
1688                                     self.Patch_Margin]
1689               patch.length1 += len(diff_text)
1690               start1 += len(diff_text)
1691               if diff_type == self.DIFF_EQUAL:
1692                 patch.length2 += len(diff_text)
1693                 start2 += len(diff_text)
1694               else:
1695                 empty = False
1696
1697               patch.diffs.append((diff_type, diff_text))
1698               if diff_text == bigpatch.diffs[0][1]:
1699                 del bigpatch.diffs[0]
1700               else:
1701                 bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1702                                      bigpatch.diffs[0][1][len(diff_text):])
1703
1704           # Compute the head context for the next patch.
1705           precontext = self.diff_text2(patch.diffs)
1706           precontext = precontext[-self.Patch_Margin:]
1707           # Append the end context for this patch.
1708           postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
1709           if postcontext:
1710             patch.length1 += len(postcontext)
1711             patch.length2 += len(postcontext)
1712             if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1713               patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1714                                  postcontext)
1715             else:
1716               patch.diffs.append((self.DIFF_EQUAL, postcontext))
1717
1718           if not empty:
1719             x += 1
1720             patches.insert(x, patch)
1721
1722   def patch_toText(self, patches):
1723     """Take a list of patches and return a textual representation.
1724
1725     Args:
1726       patches: Array of patch objects.
1727
1728     Returns:
1729       Text representation of patches.
1730     """
1731     text = []
1732     for patch in patches:
1733       text.append(str(patch))
1734     return "".join(text)
1735
1736   def patch_fromText(self, textline):
1737     """Parse a textual representation of patches and return a list of patch
1738     objects.
1739
1740     Args:
1741       textline: Text representation of patches.
1742
1743     Returns:
1744       Array of patch objects.
1745
1746     Raises:
1747       ValueError: If invalid input.
1748     """
1749     if type(textline) == unicode:
1750       # Patches should be composed of a subset of ascii chars, Unicode not
1751       # required.  If this encode raises UnicodeEncodeError, patch is invalid.
1752       textline = textline.encode("ascii")
1753     patches = []
1754     if not textline:
1755       return patches
1756     text = textline.split('\n')
1757     while len(text) != 0:
1758       m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1759       if not m:
1760         raise ValueError("Invalid patch string: " + text[0])
1761       patch = patch_obj()
1762       patches.append(patch)
1763       patch.start1 = int(m.group(1))
1764       if m.group(2) == '':
1765         patch.start1 -= 1
1766         patch.length1 = 1
1767       elif m.group(2) == '0':
1768         patch.length1 = 0
1769       else:
1770         patch.start1 -= 1
1771         patch.length1 = int(m.group(2))
1772
1773       patch.start2 = int(m.group(3))
1774       if m.group(4) == '':
1775         patch.start2 -= 1
1776         patch.length2 = 1
1777       elif m.group(4) == '0':
1778         patch.length2 = 0
1779       else:
1780         patch.start2 -= 1
1781         patch.length2 = int(m.group(4))
1782
1783       del text[0]
1784
1785       while len(text) != 0:
1786         if text[0]:
1787           sign = text[0][0]
1788         else:
1789           sign = ''
1790         line = urllib.unquote(text[0][1:])
1791         line = line.decode("utf-8")
1792         if sign == '+':
1793           # Insertion.
1794           patch.diffs.append((self.DIFF_INSERT, line))
1795         elif sign == '-':
1796           # Deletion.
1797           patch.diffs.append((self.DIFF_DELETE, line))
1798         elif sign == ' ':
1799           # Minor equality.
1800           patch.diffs.append((self.DIFF_EQUAL, line))
1801         elif sign == '@':
1802           # Start of next patch.
1803           break
1804         elif sign == '':
1805           # Blank line?  Whatever.
1806           pass
1807         else:
1808           # WTF?
1809           raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1810         del text[0]
1811     return patches
1812
1813
1814 class patch_obj:
1815   """Class representing one patch operation.
1816   """
1817
1818   def __init__(self):
1819     """Initializes with an empty list of diffs.
1820     """
1821     self.diffs = []
1822     self.start1 = None
1823     self.start2 = None
1824     self.length1 = 0
1825     self.length2 = 0
1826
1827   def __str__(self):
1828     """Emmulate GNU diff's format.
1829     Header: @@ -382,8 +481,9 @@
1830     Indicies are printed as 1-based, not 0-based.
1831
1832     Returns:
1833       The GNU diff string.
1834     """
1835     if self.length1 == 0:
1836       coords1 = str(self.start1) + ",0"
1837     elif self.length1 == 1:
1838       coords1 = str(self.start1 + 1)
1839     else:
1840       coords1 = str(self.start1 + 1) + "," + str(self.length1)
1841     if self.length2 == 0:
1842       coords2 = str(self.start2) + ",0"
1843     elif self.length2 == 1:
1844       coords2 = str(self.start2 + 1)
1845     else:
1846       coords2 = str(self.start2 + 1) + "," + str(self.length2)
1847     text = ["@@ -", coords1, " +", coords2, " @@\n"]
1848     # Escape the body of the patch with %xx notation.
1849     for (op, data) in self.diffs:
1850       if op == diff_match_patch.DIFF_INSERT:
1851         text.append("+")
1852       elif op == diff_match_patch.DIFF_DELETE:
1853         text.append("-")
1854       elif op == diff_match_patch.DIFF_EQUAL:
1855         text.append(" ")
1856       # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
1857       data = data.encode("utf-8")
1858       text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
1859     return "".join(text)