From 41ce422d54bb205f8e4f92694772191202cf79f3 Mon Sep 17 00:00:00 2001 From: Liu Zhanhong <275368990@qq.com> Date: Wed, 13 Feb 2019 11:25:05 +0800 Subject: [PATCH 1/5] patch_apply use StringBuilder instead of String. After using StringBuilder, it's more than 4 times faster than original version. Speed test using Speedtest.java --- .../neil/plaintext/diff_match_patch.java | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index 9d07867..373b8f7 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1925,16 +1925,19 @@ public LinkedList patch_deepCopy(LinkedList patches) { * @return Two element Object array, containing the new text and an array of * boolean values. */ - public Object[] patch_apply(LinkedList patches, String text) { + public Object[] patch_apply(LinkedList patches, String oritext) { if (patches.isEmpty()) { return new Object[]{text, new boolean[0]}; } + StringBuilder text = new StringBuilder(oritext); // Deep copy the patches so that no changes are made to originals. patches = patch_deepCopy(patches); String nullPadding = patch_addPadding(patches); - text = nullPadding + text + nullPadding; + text.insert(0, nullPadding); + text.append(nullPadding); + // text = nullPadding + text + nullPadding; patch_splitMax(patches); int x = 0; @@ -1952,10 +1955,10 @@ public Object[] patch_apply(LinkedList patches, String text) { if (text1.length() > this.Match_MaxBits) { // patch_splitMax will only provide an oversized pattern in the case of // a monster delete. - start_loc = match_main(text, + start_loc = match_main(text.toString(), text1.substring(0, this.Match_MaxBits), expected_loc); if (start_loc != -1) { - end_loc = match_main(text, + end_loc = match_main(text.toString(), text1.substring(text1.length() - this.Match_MaxBits), expected_loc + text1.length() - this.Match_MaxBits); if (end_loc == -1 || start_loc >= end_loc) { @@ -1964,7 +1967,7 @@ public Object[] patch_apply(LinkedList patches, String text) { } } } else { - start_loc = match_main(text, text1, expected_loc); + start_loc = match_main(text.toString(), text1, expected_loc); } if (start_loc == -1) { // No match found. :( @@ -1985,8 +1988,9 @@ public Object[] patch_apply(LinkedList patches, String text) { } if (text1.equals(text2)) { // Perfect match, just shove the replacement text in. - text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) - + text.substring(start_loc + text1.length()); + text.replace(start_loc, start_loc + text1.length(), diff_text2(aPatch.diffs)); + // text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) + // + text.substring(start_loc + text1.length()); } else { // Imperfect match. Run a diff to get a framework of equivalent // indices. @@ -2004,13 +2008,17 @@ && diff_levenshtein(diffs) / (float) text1.length() int index2 = diff_xIndex(diffs, index1); if (aDiff.operation == Operation.INSERT) { // Insertion - text = text.substring(0, start_loc + index2) + aDiff.text - + text.substring(start_loc + index2); + text.insert(start_loc + index2, aDiff.text); + // text = text.substring(0, start_loc + index2) + aDiff.text + // + text.substring(start_loc + index2); } else if (aDiff.operation == Operation.DELETE) { // Deletion - text = text.substring(0, start_loc + index2) - + text.substring(start_loc + diff_xIndex(diffs, - index1 + aDiff.text.length())); + text.delete( + start_loc + index2, + start_loc + diff_xIndex(diffs, index1 + aDiff.text.length())); + // text = text.substring(0, start_loc + index2) + // + text.substring(start_loc + diff_xIndex(diffs, + // index1 + aDiff.text.length())); } } if (aDiff.operation != Operation.DELETE) { @@ -2023,9 +2031,10 @@ && diff_levenshtein(diffs) / (float) text1.length() x++; } // Strip the padding off. - text = text.substring(nullPadding.length(), text.length() - - nullPadding.length()); - return new Object[]{text, results}; + String textRet = text.substring(nullPadding.length(), text.length() - nullPadding.length()); + // text = text.substring(nullPadding.length(), text.length() + // - nullPadding.length()); + return new Object[]{textRet, results}; } /** From 997f6a25aaca2d2ae3d5eee7a52456cc0c01d65d Mon Sep 17 00:00:00 2001 From: Liu Zhanhong <275368990@qq.com> Date: Wed, 13 Feb 2019 11:32:17 +0800 Subject: [PATCH 2/5] match_main/match_bitap use StringBuilder After using StringBuilder, it's more than 10 times faster than original version. Speed test using Speedtest.java --- .../neil/plaintext/diff_match_patch.java | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index 373b8f7..c624251 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -1570,6 +1570,29 @@ public int match_main(String text, String pattern, int loc) { } } + public int match_main(StringBuilder text, String pattern, int loc) { + // Check for null inputs. + if (text == null || pattern == null) { + throw new IllegalArgumentException("Null inputs. (match_main)"); + } + + loc = Math.max(0, Math.min(loc, text.length())); + if (text.equals(pattern)) { + // Shortcut (potentially not guaranteed by the algorithm) + return 0; + } else if (text.length() == 0) { + // Nothing to match. + return -1; + } else if (loc + pattern.length() <= text.length() + && text.substring(loc, loc + pattern.length()).equals(pattern)) { + // Perfect match at the perfect spot! (Includes case of null pattern) + return loc; + } else { + // Do a fuzzy compare. + return match_bitap(text, pattern, loc); + } + } + /** * Locate the best instance of 'pattern' in 'text' near 'loc' using the * Bitap algorithm. Returns -1 if no match found. @@ -1673,6 +1696,101 @@ protected int match_bitap(String text, String pattern, int loc) { return best_loc; } + protected int match_bitap(StringBuilder text, String pattern, int loc) { + assert (Match_MaxBits == 0 || pattern.length() <= Match_MaxBits) + : "Pattern too long for this application."; + + // Initialise the alphabet. + Map s = match_alphabet(pattern); + + // Highest score beyond which we give up. + double score_threshold = Match_Threshold; + // Is there a nearby exact match? (speedup) + int best_loc = text.indexOf(pattern, loc); + if (best_loc != -1) { + score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + // What about in the other direction? (speedup) + best_loc = text.lastIndexOf(pattern, loc + pattern.length()); + if (best_loc != -1) { + score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + } + } + + // Initialise the bit arrays. + int matchmask = 1 << (pattern.length() - 1); + best_loc = -1; + + int bin_min, bin_mid; + int bin_max = pattern.length() + text.length(); + // Empty initialization added to appease Java compiler. + int[] last_rd = new int[0]; + for (int d = 0; d < pattern.length(); d++) { + // Scan for the best match; each iteration allows for one more error. + // Run a binary search to determine how far from 'loc' we can stray at + // this error level. + bin_min = 0; + bin_mid = bin_max; + while (bin_min < bin_mid) { + if (match_bitapScore(d, loc + bin_mid, loc, pattern) + <= score_threshold) { + bin_min = bin_mid; + } else { + bin_max = bin_mid; + } + bin_mid = (bin_max - bin_min) / 2 + bin_min; + } + // Use the result from this iteration as the maximum for the next. + bin_max = bin_mid; + int start = Math.max(1, loc - bin_mid + 1); + int finish = Math.min(loc + bin_mid, text.length()) + pattern.length(); + + int[] rd = new int[finish + 2]; + rd[finish + 1] = (1 << d) - 1; + for (int j = finish; j >= start; j--) { + int charMatch; + if (text.length() <= j - 1 || !s.containsKey(text.charAt(j - 1))) { + // Out of range. + charMatch = 0; + } else { + charMatch = s.get(text.charAt(j - 1)); + } + if (d == 0) { + // First pass: exact match. + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; + } else { + // Subsequent passes: fuzzy match. + rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) + | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; + } + if ((rd[j] & matchmask) != 0) { + double score = match_bitapScore(d, j - 1, loc, pattern); + // This match will almost certainly be better than any existing + // match. But check anyway. + if (score <= score_threshold) { + // Told you so. + score_threshold = score; + best_loc = j - 1; + if (best_loc > loc) { + // When passing loc, don't exceed our current distance from loc. + start = Math.max(1, 2 * loc - best_loc); + } else { + // Already passed loc, downhill from here on in. + break; + } + } + } + } + if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) { + // No hope for a (better) match at greater error levels. + break; + } + last_rd = rd; + } + return best_loc; + } + /** * Compute and return the score for a match with e errors and x location. * @param e Number of errors in match. From e7a9281c774b40b472146f482d1b39eacd97b9f2 Mon Sep 17 00:00:00 2001 From: Liu Zhanhong <275368990@qq.com> Date: Wed, 13 Feb 2019 12:34:39 +0800 Subject: [PATCH 3/5] fix test --- .../tests/name/fraser/neil/plaintext/diff_match_patch_test.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java index 2f38793..50d7f3c 100644 --- a/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java +++ b/java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java @@ -652,7 +652,7 @@ public static void testMatchMain() { // Test null inputs. try { - dmp.match_main(null, null, 0); + dmp.match_main((String)null, null, 0); fail("match_main: Null inputs."); } catch (IllegalArgumentException ex) { // Error expected. From 7324f74d0e2568828891feb814121e695684487f Mon Sep 17 00:00:00 2001 From: Liu Zhanhong <275368990@qq.com> Date: Wed, 13 Feb 2019 13:56:51 +0800 Subject: [PATCH 4/5] Rename variable name --- .../neil/plaintext/diff_match_patch.java | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index c624251..b17b0af 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -2043,18 +2043,18 @@ public LinkedList patch_deepCopy(LinkedList patches) { * @return Two element Object array, containing the new text and an array of * boolean values. */ - public Object[] patch_apply(LinkedList patches, String oritext) { + public Object[] patch_apply(LinkedList patches, String text) { if (patches.isEmpty()) { return new Object[]{text, new boolean[0]}; } - StringBuilder text = new StringBuilder(oritext); + StringBuilder sb = new StringBuilder(text); // Deep copy the patches so that no changes are made to originals. patches = patch_deepCopy(patches); String nullPadding = patch_addPadding(patches); - text.insert(0, nullPadding); - text.append(nullPadding); + sb.insert(0, nullPadding); + sb.append(nullPadding); // text = nullPadding + text + nullPadding; patch_splitMax(patches); @@ -2073,10 +2073,10 @@ public Object[] patch_apply(LinkedList patches, String oritext) { if (text1.length() > this.Match_MaxBits) { // patch_splitMax will only provide an oversized pattern in the case of // a monster delete. - start_loc = match_main(text.toString(), + start_loc = match_main(sb.toString(), text1.substring(0, this.Match_MaxBits), expected_loc); if (start_loc != -1) { - end_loc = match_main(text.toString(), + end_loc = match_main(sb.toString(), text1.substring(text1.length() - this.Match_MaxBits), expected_loc + text1.length() - this.Match_MaxBits); if (end_loc == -1 || start_loc >= end_loc) { @@ -2085,7 +2085,7 @@ public Object[] patch_apply(LinkedList patches, String oritext) { } } } else { - start_loc = match_main(text.toString(), text1, expected_loc); + start_loc = match_main(sb.toString(), text1, expected_loc); } if (start_loc == -1) { // No match found. :( @@ -2098,15 +2098,15 @@ public Object[] patch_apply(LinkedList patches, String oritext) { delta = start_loc - expected_loc; String text2; if (end_loc == -1) { - text2 = text.substring(start_loc, - Math.min(start_loc + text1.length(), text.length())); + text2 = sb.substring(start_loc, + Math.min(start_loc + text1.length(), sb.length())); } else { - text2 = text.substring(start_loc, - Math.min(end_loc + this.Match_MaxBits, text.length())); + text2 = sb.substring(start_loc, + Math.min(end_loc + this.Match_MaxBits, sb.length())); } if (text1.equals(text2)) { // Perfect match, just shove the replacement text in. - text.replace(start_loc, start_loc + text1.length(), diff_text2(aPatch.diffs)); + sb.replace(start_loc, start_loc + text1.length(), diff_text2(aPatch.diffs)); // text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) // + text.substring(start_loc + text1.length()); } else { @@ -2126,12 +2126,12 @@ && diff_levenshtein(diffs) / (float) text1.length() int index2 = diff_xIndex(diffs, index1); if (aDiff.operation == Operation.INSERT) { // Insertion - text.insert(start_loc + index2, aDiff.text); + sb.insert(start_loc + index2, aDiff.text); // text = text.substring(0, start_loc + index2) + aDiff.text // + text.substring(start_loc + index2); } else if (aDiff.operation == Operation.DELETE) { // Deletion - text.delete( + sb.delete( start_loc + index2, start_loc + diff_xIndex(diffs, index1 + aDiff.text.length())); // text = text.substring(0, start_loc + index2) @@ -2149,7 +2149,7 @@ && diff_levenshtein(diffs) / (float) text1.length() x++; } // Strip the padding off. - String textRet = text.substring(nullPadding.length(), text.length() - nullPadding.length()); + String textRet = sb.substring(nullPadding.length(), sb.length() - nullPadding.length()); // text = text.substring(nullPadding.length(), text.length() // - nullPadding.length()); return new Object[]{textRet, results}; From e679bd7fe7051c1f7e46cc0c7e4f00f4a52e2a81 Mon Sep 17 00:00:00 2001 From: Liu Zhanhong <275368990@qq.com> Date: Wed, 13 Feb 2019 14:03:33 +0800 Subject: [PATCH 5/5] use StringBuilder version of match_main --- java/src/name/fraser/neil/plaintext/diff_match_patch.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/src/name/fraser/neil/plaintext/diff_match_patch.java b/java/src/name/fraser/neil/plaintext/diff_match_patch.java index b17b0af..04864b6 100644 --- a/java/src/name/fraser/neil/plaintext/diff_match_patch.java +++ b/java/src/name/fraser/neil/plaintext/diff_match_patch.java @@ -2073,10 +2073,10 @@ public Object[] patch_apply(LinkedList patches, String text) { if (text1.length() > this.Match_MaxBits) { // patch_splitMax will only provide an oversized pattern in the case of // a monster delete. - start_loc = match_main(sb.toString(), + start_loc = match_main(sb, text1.substring(0, this.Match_MaxBits), expected_loc); if (start_loc != -1) { - end_loc = match_main(sb.toString(), + end_loc = match_main(sb, text1.substring(text1.length() - this.Match_MaxBits), expected_loc + text1.length() - this.Match_MaxBits); if (end_loc == -1 || start_loc >= end_loc) { @@ -2085,7 +2085,7 @@ public Object[] patch_apply(LinkedList patches, String text) { } } } else { - start_loc = match_main(sb.toString(), text1, expected_loc); + start_loc = match_main(sb, text1, expected_loc); } if (start_loc == -1) { // No match found. :(