001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 The ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package org.conqat.lib.commons.algo; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.List; 022 023import org.conqat.lib.commons.assertion.CCSMAssert; 024import org.conqat.lib.commons.equals.DefaultEquator; 025import org.conqat.lib.commons.equals.IEquator; 026import org.conqat.lib.commons.string.StringUtils; 027 028/** 029 * Implementation of the diff algorithm described in: E.W. Myers: "An O(ND) 030 * Difference Algorithm and Its Variations". 031 * <p> 032 * Let N be the sum of the concatenated input strings and D the size of the 033 * delta (i.e. the number of changes required to transform one string into the 034 * other). Then the time complexity is O(ND) and the space complexity is O(D^2). 035 * 036 * @param <T> 037 * The type of objects for which the diff is constructed. 038 */ 039public class Diff<T> { 040 041 /** The first list of objects. */ 042 private final List<T> a; 043 044 /** The second list of objects. */ 045 private final List<T> b; 046 047 /** Equator used for comparing elements. */ 048 private final IEquator<? super T> equator; 049 050 /** Length of {@link #a}. */ 051 private int n; 052 053 /** Length of {@link #b}. */ 054 private int m; 055 056 /** The maximal possible difference between {@link #a} and {@link #b}. */ 057 private final int max; 058 059 /** 060 * Maximal size of the delta produced. If the "real" delta would be larger, 061 * a truncated delta will be created. 062 */ 063 private final int maxDeltaSize; 064 065 /** 066 * The array for storing the positions on each diagonal. This is an 067 * "unrolled" version compared to the original paper, i.e. we create a new 068 * array for each iteration of the main loop. 069 */ 070 private final int[][] v; 071 072 /** 073 * This array stores from where we came during the 074 * {@link #calculateDeltaSize()} method. Its structure is the same as 075 * {@link #v}. 076 */ 077 private final boolean[][] from; 078 079 /** 080 * Hidden constructor. Use one of the {@link #computeDelta(List, List)} or 081 * {@link #computeDelta(Object[], Object[])} methods instead. 082 */ 083 private Diff(List<T> a, List<T> b, int maxDeltaSize, IEquator<? super T> equator) { 084 this.a = a; 085 this.b = b; 086 this.maxDeltaSize = maxDeltaSize; 087 this.equator = equator; 088 089 n = a.size(); 090 m = b.size(); 091 max = n + m; 092 v = new int[max + 1][]; 093 from = new boolean[max + 1][]; 094 } 095 096 /** Performs the actual computations. */ 097 private Delta<T> computeDelta() { 098 return constructDelta(calculateDeltaSize()); 099 } 100 101 /** Constructs the actual delta. */ 102 private Delta<T> constructDelta(int size) { 103 int d = size; 104 int k = -size; 105 while (v[size][size + k] < n || v[size][d + k] - k < m) { 106 ++k; 107 } 108 109 Delta<T> delta = new Delta<T>(size, n, m); 110 111 int difference = n - m; 112 while (d > 0) { 113 if (from[d][d + k]) { 114 ++k; 115 } else { 116 --k; 117 } 118 --d; 119 120 int x = v[d][d + k]; 121 int y = x - k; 122 123 int newDifference = x - y; 124 if (newDifference > difference || x >= n) { 125 delta.position[d] = y + 1; 126 delta.t[d] = b.get(y); 127 } else { 128 delta.position[d] = -x - 1; 129 delta.t[d] = a.get(x); 130 } 131 difference = newDifference; 132 } 133 return delta; 134 } 135 136 /** 137 * Calculates the size of the delta (i.e. the number of additions and 138 * deletions. Additionally the {@link #v} and {@link #from} arrays are 139 * filled. 140 */ 141 private int calculateDeltaSize() { 142 int size = -1; 143 for (int d = 0; size < 0 && d <= max; ++d) { 144 v[d] = new int[2 * d + 1]; 145 from[d] = new boolean[2 * d + 1]; 146 147 int bestSum = -1; 148 for (int k = -d; k <= d; k += 2) { 149 int x = 0; 150 if (d > 0) { 151 if (k == -d || k != d && v[d - 1][d - 1 + k - 1] < v[d - 1][d - 1 + k + 1]) { 152 x = v[d - 1][d - 1 + k + 1]; 153 from[d][d + k] = true; 154 } else { 155 x = v[d - 1][d - 1 + k - 1] + 1; 156 from[d][d + k] = false; 157 } 158 } 159 int y = x - k; 160 while (x < n && y < m && equator.equals(a.get(x), b.get(y))) { 161 ++x; 162 ++y; 163 } 164 v[d][d + k] = x; 165 if (d >= maxDeltaSize && x <= n && y <= m && x + y > bestSum) { 166 bestSum = x + y; 167 168 // truncate strings 169 n = Math.min(x, n); 170 m = Math.min(y, m); 171 } 172 if (x >= n && y >= m) { 173 size = d; 174 } 175 } 176 } 177 return size; 178 } 179 180 /** 181 * Applies the diff algorithm on the supplied arrays and returns the delta 182 * between them. 183 * 184 * @param a 185 * the first "word", i.e., array of objects to produce a delta 186 * for. 187 * @param b 188 * the second "word", i.e., array of objects to produce a delta 189 * for. 190 * @return a delta containing the differences between a and b. 191 */ 192 public static <T> Delta<T> computeDelta(T[] a, T[] b) { 193 return computeDelta(Arrays.asList(a), Arrays.asList(b)); 194 } 195 196 /** 197 * Applies the diff algorithm on the supplied arrays and returns the delta 198 * between them. 199 * 200 * @param a 201 * the first "word", i.e., array of objects to produce a delta 202 * for. 203 * @param b 204 * the second "word", i.e., array of objects to produce a delta 205 * for. 206 * @param maxDeltaSize 207 * the maximal size of the delta produced. As the running size 208 * depends linearly on this size and the space required depends 209 * quadratically on it, limiting this value can reduce 210 * calculation overhead at the risk of receiving 211 * partial/incomplete deltas. 212 * @return a delta containing the differences between a and b. 213 */ 214 public static <T> Delta<T> computeDelta(T[] a, T[] b, int maxDeltaSize) { 215 return computeDelta(Arrays.asList(a), Arrays.asList(b), maxDeltaSize); 216 } 217 218 /** 219 * Applies the diff algorithm on the supplied arrays and returns the delta 220 * between them. 221 * 222 * @param a 223 * the first "word", i.e., array of objects to produce a delta 224 * for. 225 * @param b 226 * the second "word", i.e., array of objects to produce a delta 227 * for. 228 * @param equator 229 * an object that can check whether two elements are equal. 230 * 231 * @return a delta containing the differences between a and b. 232 */ 233 public static <T> Delta<T> computeDelta(T[] a, T[] b, IEquator<? super T> equator) { 234 return computeDelta(Arrays.asList(a), Arrays.asList(b), equator); 235 } 236 237 /** 238 * Applies the diff algorithm on the supplied lists and returns the delta 239 * between them. 240 * 241 * @param a 242 * the first "word", i.e., list of objects to produce a delta 243 * for. 244 * @param b 245 * the second "word", i.e., list of objects to produce a delta 246 * for. 247 * @return a delta containing the differences between a and b. 248 */ 249 public static <T> Delta<T> computeDelta(List<T> a, List<T> b) { 250 return computeDelta(a, b, DefaultEquator.INSTANCE); 251 } 252 253 /** 254 * Applies the diff algorithm on the supplied lists and returns the delta 255 * between them. 256 * 257 * @param a 258 * the first "word", i.e., list of objects to produce a delta 259 * for. 260 * @param b 261 * the second "word", i.e., list of objects to produce a delta 262 * for. 263 * @param equator 264 * an object that can check whether two elements are equal. 265 * 266 * @return a delta containing the differences between a and b. 267 */ 268 public static <T> Delta<T> computeDelta(List<T> a, List<T> b, IEquator<? super T> equator) { 269 return computeDelta(a, b, Integer.MAX_VALUE, equator); 270 } 271 272 /** 273 * Applies the diff algorithm on the supplied lists and returns the delta 274 * between them. 275 * 276 * @param a 277 * the first "word", i.e., list of objects to produce a delta 278 * for. 279 * @param b 280 * the second "word", i.e., list of objects to produce a delta 281 * for. 282 * @param maxDeltaSize 283 * the maximal size of the delta produced. As the running size 284 * depends linearly on this size and the space required depends 285 * quadratically on it, limiting this value can reduce 286 * calculation overhead at the risk of receiving 287 * partial/incomplete deltas. 288 * 289 * @return a delta containing the differences between a and b. 290 */ 291 public static <T> Delta<T> computeDelta(List<T> a, List<T> b, int maxDeltaSize) { 292 return computeDelta(a, b, maxDeltaSize, DefaultEquator.INSTANCE); 293 } 294 295 /** 296 * Applies the diff algorithm on the supplied lists and returns the delta 297 * between them. 298 * 299 * @param a 300 * the first "word", i.e., list of objects to produce a delta 301 * for. 302 * @param b 303 * the second "word", i.e., list of objects to produce a delta 304 * for. 305 * @param maxDeltaSize 306 * the maximal size of the delta produced. As the running size 307 * depends linearly on this size and the space required depends 308 * quadratically on it, limiting this value can reduce 309 * calculation overhead at the risk of receiving 310 * partial/incomplete deltas. 311 * @param equator 312 * an object that can check whether two elements are equal. 313 * 314 * @return a delta containing the differences between a and b. 315 */ 316 public static <T> Delta<T> computeDelta(List<T> a, List<T> b, int maxDeltaSize, IEquator<? super T> equator) { 317 return new Diff<T>(a, b, maxDeltaSize, equator).computeDelta(); 318 } 319 320 /** 321 * Objects of this class describe the additions and deletions used to 322 * transform between two words. 323 */ 324 public static class Delta<T> { 325 326 /** The size of the first word. */ 327 private final int n; 328 329 /** The size of the second word. */ 330 private final int m; 331 332 /** 333 * This array stores the position at which a string is changed. If it is 334 * positive, it indicates an addition (i.e. the position is for the 335 * second string). Otherwise it is a deletion (i.e. the (negated) 336 * position is for the first string). To allow storing a sign for 337 * position 0, all positions are incremented before (so this has to be 338 * compensated for). 339 */ 340 private final int[] position; 341 342 /** 343 * This array stores the characters which are added or deleted 344 * (interpretation depends on {@link #position}). 345 */ 346 private final T[] t; 347 348 /** Create new delta of given size. */ 349 @SuppressWarnings("unchecked") 350 private Delta(int size, int n, int m) { 351 this.n = n; 352 this.m = m; 353 position = new int[size]; 354 t = (T[]) new Object[size]; 355 } 356 357 /** 358 * Returns the size of the delta, i.e. the number of additions and 359 * deletions. 360 */ 361 public int getSize() { 362 return position.length; 363 } 364 365 /** Returns the size of the first word the delta was created for. */ 366 public int getN() { 367 return n; 368 } 369 370 /** Returns the size of the second word the delta was created for. */ 371 public int getM() { 372 return m; 373 } 374 375 /** Returns the i-th element stored as addition or deletion. */ 376 public T getT(int i) { 377 return t[i]; 378 } 379 380 /** 381 * Returns the i-th element of the change positions. If it is positive, 382 * it indicates an addition (i.e. the position is for the second 383 * string). Otherwise it is a deletion (i.e. the (negated) position is 384 * for the first string). To allow storing a sign for position 0, all 385 * positions are incremented before (so this has to be compensated for). 386 */ 387 public int getPosition(int i) { 388 return position[i]; 389 } 390 391 /** 392 * Applies the forward patch, i.e. if the first string is inserted, then 393 * the second string is returned. The input word must be of length n, 394 * the output word will be of length m. 395 */ 396 public List<T> forwardPatch(List<T> a) { 397 CCSMAssert.isTrue(a.size() == n, "Input word must be of size " + n); 398 return doPatch(a, new ArrayList<T>(m), 1); 399 } 400 401 /** 402 * Applies the backward patch, i.e. if the second string is inserted, 403 * then the first string is returned. The input word must be of length 404 * m, the output word will be of length n. 405 */ 406 public List<T> backwardPatch(List<T> b) { 407 CCSMAssert.isTrue(b.size() == m, "Input word must be of size " + m); 408 return doPatch(b, new ArrayList<T>(n), -1); 409 } 410 411 /** 412 * Performs the patching from a to b put pre-multiplying the positions 413 * with the given factor. 414 */ 415 private List<T> doPatch(List<T> a, List<T> b, int positionFactor) { 416 int posA = 0; 417 int posB = 0; 418 419 for (int j = 0; j < position.length; ++j) { 420 int k = position[j] * positionFactor; 421 if (k > 0) { 422 // add character 423 k = k - 1; 424 while (posB < k) { 425 b.add(a.get(posA)); 426 ++posA; 427 ++posB; 428 } 429 b.add(t[j]); 430 ++posB; 431 } else { 432 // delete character 433 k = -k - 1; 434 while (posA < k) { 435 b.add(a.get(posA)); 436 ++posA; 437 ++posB; 438 } 439 ++posA; 440 } 441 } 442 while (posA < a.size()) { 443 b.add(a.get(posA)); 444 ++posA; 445 ++posB; 446 } 447 448 return b; 449 } 450 451 /** {@inheritDoc} */ 452 @Override 453 public String toString() { 454 StringBuilder sb = new StringBuilder(); 455 for (int i = 0; i < position.length; ++i) { 456 sb.append(Math.abs(position[i]) - 1); 457 if (position[i] > 0) { 458 sb.append("+ "); 459 } else { 460 sb.append("- "); 461 } 462 sb.append(t[i] + StringUtils.LINE_SEPARATOR); 463 } 464 return sb.toString(); 465 } 466 } 467}