001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 The ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.algo;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.List;
022
023import org.conqat.lib.commons.assertion.CCSMAssert;
024import org.conqat.lib.commons.equals.DefaultEquator;
025import org.conqat.lib.commons.equals.IEquator;
026import org.conqat.lib.commons.string.StringUtils;
027
028/**
029 * Implementation of the diff algorithm described in: E.W. Myers: "An O(ND)
030 * Difference Algorithm and Its Variations".
031 * <p>
032 * Let N be the sum of the concatenated input strings and D the size of the
033 * delta (i.e. the number of changes required to transform one string into the
034 * other). Then the time complexity is O(ND) and the space complexity is O(D^2).
035 * 
036 * @param <T>
037 *            The type of objects for which the diff is constructed.
038 */
039public class Diff<T> {
040
041        /** The first list of objects. */
042        private final List<T> a;
043
044        /** The second list of objects. */
045        private final List<T> b;
046
047        /** Equator used for comparing elements. */
048        private final IEquator<? super T> equator;
049
050        /** Length of {@link #a}. */
051        private int n;
052
053        /** Length of {@link #b}. */
054        private int m;
055
056        /** The maximal possible difference between {@link #a} and {@link #b}. */
057        private final int max;
058
059        /**
060         * Maximal size of the delta produced. If the "real" delta would be larger,
061         * a truncated delta will be created.
062         */
063        private final int maxDeltaSize;
064
065        /**
066         * The array for storing the positions on each diagonal. This is an
067         * "unrolled" version compared to the original paper, i.e. we create a new
068         * array for each iteration of the main loop.
069         */
070        private final int[][] v;
071
072        /**
073         * This array stores from where we came during the
074         * {@link #calculateDeltaSize()} method. Its structure is the same as
075         * {@link #v}.
076         */
077        private final boolean[][] from;
078
079        /**
080         * Hidden constructor. Use one of the {@link #computeDelta(List, List)} or
081         * {@link #computeDelta(Object[], Object[])} methods instead.
082         */
083        private Diff(List<T> a, List<T> b, int maxDeltaSize, IEquator<? super T> equator) {
084                this.a = a;
085                this.b = b;
086                this.maxDeltaSize = maxDeltaSize;
087                this.equator = equator;
088
089                n = a.size();
090                m = b.size();
091                max = n + m;
092                v = new int[max + 1][];
093                from = new boolean[max + 1][];
094        }
095
096        /** Performs the actual computations. */
097        private Delta<T> computeDelta() {
098                return constructDelta(calculateDeltaSize());
099        }
100
101        /** Constructs the actual delta. */
102        private Delta<T> constructDelta(int size) {
103                int d = size;
104                int k = -size;
105                while (v[size][size + k] < n || v[size][d + k] - k < m) {
106                        ++k;
107                }
108
109                Delta<T> delta = new Delta<T>(size, n, m);
110
111                int difference = n - m;
112                while (d > 0) {
113                        if (from[d][d + k]) {
114                                ++k;
115                        } else {
116                                --k;
117                        }
118                        --d;
119
120                        int x = v[d][d + k];
121                        int y = x - k;
122
123                        int newDifference = x - y;
124                        if (newDifference > difference || x >= n) {
125                                delta.position[d] = y + 1;
126                                delta.t[d] = b.get(y);
127                        } else {
128                                delta.position[d] = -x - 1;
129                                delta.t[d] = a.get(x);
130                        }
131                        difference = newDifference;
132                }
133                return delta;
134        }
135
136        /**
137         * Calculates the size of the delta (i.e. the number of additions and
138         * deletions. Additionally the {@link #v} and {@link #from} arrays are
139         * filled.
140         */
141        private int calculateDeltaSize() {
142                int size = -1;
143                for (int d = 0; size < 0 && d <= max; ++d) {
144                        v[d] = new int[2 * d + 1];
145                        from[d] = new boolean[2 * d + 1];
146
147                        int bestSum = -1;
148                        for (int k = -d; k <= d; k += 2) {
149                                int x = 0;
150                                if (d > 0) {
151                                        if (k == -d || k != d && v[d - 1][d - 1 + k - 1] < v[d - 1][d - 1 + k + 1]) {
152                                                x = v[d - 1][d - 1 + k + 1];
153                                                from[d][d + k] = true;
154                                        } else {
155                                                x = v[d - 1][d - 1 + k - 1] + 1;
156                                                from[d][d + k] = false;
157                                        }
158                                }
159                                int y = x - k;
160                                while (x < n && y < m && equator.equals(a.get(x), b.get(y))) {
161                                        ++x;
162                                        ++y;
163                                }
164                                v[d][d + k] = x;
165                                if (d >= maxDeltaSize && x <= n && y <= m && x + y > bestSum) {
166                                        bestSum = x + y;
167
168                                        // truncate strings
169                                        n = Math.min(x, n);
170                                        m = Math.min(y, m);
171                                }
172                                if (x >= n && y >= m) {
173                                        size = d;
174                                }
175                        }
176                }
177                return size;
178        }
179
180        /**
181         * Applies the diff algorithm on the supplied arrays and returns the delta
182         * between them.
183         * 
184         * @param a
185         *            the first "word", i.e., array of objects to produce a delta
186         *            for.
187         * @param b
188         *            the second "word", i.e., array of objects to produce a delta
189         *            for.
190         * @return a delta containing the differences between a and b.
191         */
192        public static <T> Delta<T> computeDelta(T[] a, T[] b) {
193                return computeDelta(Arrays.asList(a), Arrays.asList(b));
194        }
195
196        /**
197         * Applies the diff algorithm on the supplied arrays and returns the delta
198         * between them.
199         * 
200         * @param a
201         *            the first "word", i.e., array of objects to produce a delta
202         *            for.
203         * @param b
204         *            the second "word", i.e., array of objects to produce a delta
205         *            for.
206         * @param maxDeltaSize
207         *            the maximal size of the delta produced. As the running size
208         *            depends linearly on this size and the space required depends
209         *            quadratically on it, limiting this value can reduce
210         *            calculation overhead at the risk of receiving
211         *            partial/incomplete deltas.
212         * @return a delta containing the differences between a and b.
213         */
214        public static <T> Delta<T> computeDelta(T[] a, T[] b, int maxDeltaSize) {
215                return computeDelta(Arrays.asList(a), Arrays.asList(b), maxDeltaSize);
216        }
217
218        /**
219         * Applies the diff algorithm on the supplied arrays and returns the delta
220         * between them.
221         * 
222         * @param a
223         *            the first "word", i.e., array of objects to produce a delta
224         *            for.
225         * @param b
226         *            the second "word", i.e., array of objects to produce a delta
227         *            for.
228         * @param equator
229         *            an object that can check whether two elements are equal.
230         * 
231         * @return a delta containing the differences between a and b.
232         */
233        public static <T> Delta<T> computeDelta(T[] a, T[] b, IEquator<? super T> equator) {
234                return computeDelta(Arrays.asList(a), Arrays.asList(b), equator);
235        }
236
237        /**
238         * Applies the diff algorithm on the supplied lists and returns the delta
239         * between them.
240         * 
241         * @param a
242         *            the first "word", i.e., list of objects to produce a delta
243         *            for.
244         * @param b
245         *            the second "word", i.e., list of objects to produce a delta
246         *            for.
247         * @return a delta containing the differences between a and b.
248         */
249        public static <T> Delta<T> computeDelta(List<T> a, List<T> b) {
250                return computeDelta(a, b, DefaultEquator.INSTANCE);
251        }
252
253        /**
254         * Applies the diff algorithm on the supplied lists and returns the delta
255         * between them.
256         * 
257         * @param a
258         *            the first "word", i.e., list of objects to produce a delta
259         *            for.
260         * @param b
261         *            the second "word", i.e., list of objects to produce a delta
262         *            for.
263         * @param equator
264         *            an object that can check whether two elements are equal.
265         * 
266         * @return a delta containing the differences between a and b.
267         */
268        public static <T> Delta<T> computeDelta(List<T> a, List<T> b, IEquator<? super T> equator) {
269                return computeDelta(a, b, Integer.MAX_VALUE, equator);
270        }
271
272        /**
273         * Applies the diff algorithm on the supplied lists and returns the delta
274         * between them.
275         * 
276         * @param a
277         *            the first "word", i.e., list of objects to produce a delta
278         *            for.
279         * @param b
280         *            the second "word", i.e., list of objects to produce a delta
281         *            for.
282         * @param maxDeltaSize
283         *            the maximal size of the delta produced. As the running size
284         *            depends linearly on this size and the space required depends
285         *            quadratically on it, limiting this value can reduce
286         *            calculation overhead at the risk of receiving
287         *            partial/incomplete deltas.
288         * 
289         * @return a delta containing the differences between a and b.
290         */
291        public static <T> Delta<T> computeDelta(List<T> a, List<T> b, int maxDeltaSize) {
292                return computeDelta(a, b, maxDeltaSize, DefaultEquator.INSTANCE);
293        }
294
295        /**
296         * Applies the diff algorithm on the supplied lists and returns the delta
297         * between them.
298         * 
299         * @param a
300         *            the first "word", i.e., list of objects to produce a delta
301         *            for.
302         * @param b
303         *            the second "word", i.e., list of objects to produce a delta
304         *            for.
305         * @param maxDeltaSize
306         *            the maximal size of the delta produced. As the running size
307         *            depends linearly on this size and the space required depends
308         *            quadratically on it, limiting this value can reduce
309         *            calculation overhead at the risk of receiving
310         *            partial/incomplete deltas.
311         * @param equator
312         *            an object that can check whether two elements are equal.
313         * 
314         * @return a delta containing the differences between a and b.
315         */
316        public static <T> Delta<T> computeDelta(List<T> a, List<T> b, int maxDeltaSize, IEquator<? super T> equator) {
317                return new Diff<T>(a, b, maxDeltaSize, equator).computeDelta();
318        }
319
320        /**
321         * Objects of this class describe the additions and deletions used to
322         * transform between two words.
323         */
324        public static class Delta<T> {
325
326                /** The size of the first word. */
327                private final int n;
328
329                /** The size of the second word. */
330                private final int m;
331
332                /**
333                 * This array stores the position at which a string is changed. If it is
334                 * positive, it indicates an addition (i.e. the position is for the
335                 * second string). Otherwise it is a deletion (i.e. the (negated)
336                 * position is for the first string). To allow storing a sign for
337                 * position 0, all positions are incremented before (so this has to be
338                 * compensated for).
339                 */
340                private final int[] position;
341
342                /**
343                 * This array stores the characters which are added or deleted
344                 * (interpretation depends on {@link #position}).
345                 */
346                private final T[] t;
347
348                /** Create new delta of given size. */
349                @SuppressWarnings("unchecked")
350                private Delta(int size, int n, int m) {
351                        this.n = n;
352                        this.m = m;
353                        position = new int[size];
354                        t = (T[]) new Object[size];
355                }
356
357                /**
358                 * Returns the size of the delta, i.e. the number of additions and
359                 * deletions.
360                 */
361                public int getSize() {
362                        return position.length;
363                }
364
365                /** Returns the size of the first word the delta was created for. */
366                public int getN() {
367                        return n;
368                }
369
370                /** Returns the size of the second word the delta was created for. */
371                public int getM() {
372                        return m;
373                }
374
375                /** Returns the i-th element stored as addition or deletion. */
376                public T getT(int i) {
377                        return t[i];
378                }
379
380                /**
381                 * Returns the i-th element of the change positions. If it is positive,
382                 * it indicates an addition (i.e. the position is for the second
383                 * string). Otherwise it is a deletion (i.e. the (negated) position is
384                 * for the first string). To allow storing a sign for position 0, all
385                 * positions are incremented before (so this has to be compensated for).
386                 */
387                public int getPosition(int i) {
388                        return position[i];
389                }
390
391                /**
392                 * Applies the forward patch, i.e. if the first string is inserted, then
393                 * the second string is returned. The input word must be of length n,
394                 * the output word will be of length m.
395                 */
396                public List<T> forwardPatch(List<T> a) {
397                        CCSMAssert.isTrue(a.size() == n, "Input word must be of size " + n);
398                        return doPatch(a, new ArrayList<T>(m), 1);
399                }
400
401                /**
402                 * Applies the backward patch, i.e. if the second string is inserted,
403                 * then the first string is returned. The input word must be of length
404                 * m, the output word will be of length n.
405                 */
406                public List<T> backwardPatch(List<T> b) {
407                        CCSMAssert.isTrue(b.size() == m, "Input word must be of size " + m);
408                        return doPatch(b, new ArrayList<T>(n), -1);
409                }
410
411                /**
412                 * Performs the patching from a to b put pre-multiplying the positions
413                 * with the given factor.
414                 */
415                private List<T> doPatch(List<T> a, List<T> b, int positionFactor) {
416                        int posA = 0;
417                        int posB = 0;
418
419                        for (int j = 0; j < position.length; ++j) {
420                                int k = position[j] * positionFactor;
421                                if (k > 0) {
422                                        // add character
423                                        k = k - 1;
424                                        while (posB < k) {
425                                                b.add(a.get(posA));
426                                                ++posA;
427                                                ++posB;
428                                        }
429                                        b.add(t[j]);
430                                        ++posB;
431                                } else {
432                                        // delete character
433                                        k = -k - 1;
434                                        while (posA < k) {
435                                                b.add(a.get(posA));
436                                                ++posA;
437                                                ++posB;
438                                        }
439                                        ++posA;
440                                }
441                        }
442                        while (posA < a.size()) {
443                                b.add(a.get(posA));
444                                ++posA;
445                                ++posB;
446                        }
447
448                        return b;
449                }
450
451                /** {@inheritDoc} */
452                @Override
453                public String toString() {
454                        StringBuilder sb = new StringBuilder();
455                        for (int i = 0; i < position.length; ++i) {
456                                sb.append(Math.abs(position[i]) - 1);
457                                if (position[i] > 0) {
458                                        sb.append("+ ");
459                                } else {
460                                        sb.append("- ");
461                                }
462                                sb.append(t[i] + StringUtils.LINE_SEPARATOR);
463                        }
464                        return sb.toString();
465                }
466        }
467}