001/*-----------------------------------------------------------------------+
002 | com.teamscale.ui
003 |                                                                       |
004   $Id$            
005 |                                                                       |
006 | Copyright (c)  2009-2013 CQSE GmbH                                 |
007 +-----------------------------------------------------------------------*/
008package org.conqat.lib.commons.diff;
009
010import java.util.ArrayList;
011import java.util.List;
012import java.util.regex.Matcher;
013import java.util.regex.Pattern;
014
015import org.conqat.lib.commons.algo.Diff;
016import org.conqat.lib.commons.algo.Diff.Delta;
017import org.conqat.lib.commons.collections.CollectionUtils;
018import org.conqat.lib.commons.collections.Pair;
019
020/**
021 * Base class for the code that produces a {@link DiffDescription} for two
022 * elements.
023 */
024public abstract class DifferBase<T> {
025
026        /**
027         * Maximal size of the delta computed during diff calculation. If there are
028         * more than this number of changes, a partial diff will be displayed (i.e.
029         * only changes in the top region). The reason to limit this, is that the
030         * used algorithm uses O(D^2) memory for a diff of size D.
031         */
032        private static final int MAX_DIFF_SIZE = 10000;
033
034        /**
035         * Pattern for extracting fragments. This matches consecutive word
036         * characters, consecutive digits, consecutive whitespace, or a single
037         * character.
038         */
039        private static final Pattern FRAGMENT_MATCH_PATTERN = Pattern.compile("\\w+|\\d+|\\s+|.");
040
041        /** Text of the left element. */
042        private String leftText;
043
044        /** Text of the right element. */
045        private String rightText;
046
047        /** Chunks for the left element. */
048        private List<TextChunk> leftChunks;
049
050        /** Chunks for the right element. */
051        private List<TextChunk> rightChunks;
052
053        /** Performs the diff and returns a {@link DiffDescription}. */
054        public DiffDescription performDiff(T leftElement, T rightElement) {
055                leftText = getElementText(leftElement);
056                rightText = getElementText(rightElement);
057
058                leftChunks = getChunks(leftElement, true);
059                rightChunks = getChunks(rightElement, false);
060                Delta<TextChunk> delta = Diff.computeDelta(leftChunks, rightChunks, MAX_DIFF_SIZE);
061
062                DiffDescription diffDescription = new DiffDescription(getDiffName());
063                
064                convertDiffToDiffDescription(delta, diffDescription);
065
066                return postProcessDiffDescription(diffDescription);
067        }
068
069        /**
070         * Post processing of the diffDescription.
071         */
072        protected DiffDescription postProcessDiffDescription(DiffDescription diffDescription) {
073                // default does no post processing
074                return diffDescription;
075        }
076
077        /** Get the text of the element. */
078        protected abstract String getElementText(T element);
079
080        /**
081         * Processes the given diff and inserts the changed regions into the diff
082         * description.
083         */
084        private void convertDiffToDiffDescription(Delta<TextChunk> delta, DiffDescription diffDescription) {
085                boolean[] chunkRemoved = new boolean[leftChunks.size()];
086                boolean[] chunkAdded = new boolean[rightChunks.size()];
087
088                for (int i = 0; i < delta.getSize(); i++) {
089                        int position = delta.getPosition(i);
090                        if (position < 0) { // chunk was removed
091                                chunkRemoved[-position - 1] = true;
092                        } else { // chunk was added
093                                chunkAdded[position - 1] = true;
094                        }
095                }
096
097                int leftChunkPosition = 0;
098                int rightChunkPosition = 0;
099
100                while (leftChunkPosition < chunkRemoved.length || rightChunkPosition < chunkAdded.length) {
101                        int removedChunksRegionStart = leftChunkPosition;
102                        int addedChunksRegionStart = rightChunkPosition;
103
104                        // Moving leftChunkPosition to the end of the removed chunks
105                        while (leftChunkPosition < chunkRemoved.length && chunkRemoved[leftChunkPosition]) {
106                                leftChunkPosition++;
107                        }
108
109                        // Moving rightChunkPosition to the end of the added chunks
110                        while (rightChunkPosition < chunkAdded.length && chunkAdded[rightChunkPosition]) {
111                                rightChunkPosition++;
112                        }
113
114                        // Check if there where actual changes
115                        if (removedChunksRegionStart != leftChunkPosition || addedChunksRegionStart != rightChunkPosition) {
116                                insertDiffRegion(diffDescription, removedChunksRegionStart, leftChunkPosition, addedChunksRegionStart,
117                                                rightChunkPosition);
118                        } else {
119                                leftChunkPosition++;
120                                rightChunkPosition++;
121                        }
122                }
123        }
124
125        /**
126         * Inserts a diff region based on unit indexes. Start indexes are inclusive,
127         * end indexes are exclusive.
128         */
129        private void insertDiffRegion(DiffDescription diffDescription, int currentLeftStart, int currentLeftEnd,
130                        int currentRightStart, int currentRightEnd) {
131
132                Pair<Integer, Integer> leftLines = convertToFirstLastLine(currentLeftStart, currentLeftEnd, leftChunks);
133                Pair<Integer, Integer> rightLines = convertToFirstLastLine(currentRightStart, currentRightEnd, rightChunks);
134                diffDescription.addLineRegion(leftLines.getFirst(), leftLines.getSecond(), rightLines.getFirst(),
135                                rightLines.getSecond());
136
137                List<TextChunk> leftFragments = extractFragments(currentLeftStart, currentLeftEnd, leftText, leftChunks);
138                List<TextChunk> rightFragments = extractFragments(currentRightStart, currentRightEnd, rightText, rightChunks);
139                Delta<TextChunk> delta = Diff.computeDelta(leftFragments, rightFragments, MAX_DIFF_SIZE);
140                for (int i = 0; i < delta.getSize(); ++i) {
141                        int position = delta.getPosition(i);
142                        if (position > 0) {
143                                TextChunk fragment = rightFragments.get(position - 1);
144                                diffDescription.addRightChange(fragment.startOffset, fragment.endOffset);
145                        } else {
146                                TextChunk fragment = leftFragments.get(-position - 1);
147                                diffDescription.addLeftChange(fragment.startOffset, fragment.endOffset);
148                        }
149                }
150        }
151
152        /**
153         * Converts start/end indexes to start/end lines (start inclusive, end
154         * exclusive). Line indices are starting from 1, indices for units from 0. If
155         * units is empty or the endIndex is 0, the method returns new Pair(1,1).
156         */
157        private static Pair<Integer, Integer> convertToFirstLastLine(int startIndex, int endIndex,
158                        List<TextChunk> units) {
159                if (units.isEmpty() || endIndex == 0) {
160                        return new Pair<>(1, 1);
161                }
162
163                int firstLine;
164                if (startIndex >= units.size()) {
165                        firstLine = units.get(startIndex - 1).endLine;
166                } else {
167                        firstLine = units.get(startIndex).startLine;
168                }
169                int lastLine = Math.max(firstLine, units.get(endIndex - 1).endLine);
170                return new Pair<Integer, Integer>(firstLine, lastLine);
171        }
172
173        /**
174         * Extracts fragments for the diff region denoted by start/end indexes in
175         * the chunks. Fragments are used for "diff in diff" to highlight the
176         * actually changed parts.
177         * 
178         * @param startIndex
179         *            The zero-based index of the chunk in the given units which
180         *            marks the start of the region
181         * @param endIndex
182         *            The zero-based index of the chunk in the given units which
183         *            marks the end of the region (exclusive)
184         * @param text
185         *            The text to extract the region from
186         * @param units
187         *            The chunks of the text. Used to find out, the substring of the
188         *            region inside the given text
189         * 
190         * @return The region split into chunks of fragments denoted by the
191         *         {@link #FRAGMENT_MATCH_PATTERN}
192         */
193        private static List<TextChunk> extractFragments(int startIndex, int endIndex, String text, List<TextChunk> units) {
194                if (startIndex >= units.size() || startIndex >= endIndex) {
195                        return CollectionUtils.emptyList();
196                }
197
198                List<TextChunk> fragments = new ArrayList<TextChunk>();
199                int startOffset = units.get(startIndex).startOffset;
200                int endOffset = Math.min(units.get(endIndex - 1).endOffset, text.length());
201
202                Matcher matcher = FRAGMENT_MATCH_PATTERN.matcher(text.substring(startOffset, endOffset));
203                while (matcher.find()) {
204                        int fragmentStart = startOffset + matcher.start();
205                        String fragmentText = matcher.group();
206
207                        // ignore fragments with new lines, as they are only filler
208                        // white-space
209                        if (fragmentText.contains("\n")) {
210                                continue;
211                        }
212
213                        // lines are not used and thus initialized with dummy values
214                        fragments.add(new TextChunk(fragmentStart, fragmentStart + fragmentText.length(), 0, 0, fragmentText));
215                }
216
217                return fragments;
218        }
219
220        /**
221         * Returns the chunks for an element.
222         * 
223         * @param isLeft
224         *            indicates whether this method is called for the left or right
225         *            element. The implementer then may behave differently for both
226         *            elements (possibly depending on additional information).
227         */
228        protected abstract List<TextChunk> getChunks(T element, boolean isLeft);
229
230        /** Returns the name for this diff. */
231        protected abstract String getDiffName();
232
233}