001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 the ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.engine.commons.findings.location;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.Comparator;
022import java.util.List;
023import java.util.Set;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import org.conqat.lib.commons.algo.Diff;
028import org.conqat.lib.commons.algo.Diff.Delta;
029import org.conqat.lib.commons.region.LineBasedRegion;
030import org.conqat.lib.commons.region.Region;
031import org.conqat.lib.commons.string.LineOffsetConverter;
032
033/**
034 * This class is used for adjusting the offsets used in locations (i.e.
035 * subclasses of {@link ElementLocation} for text that is slightly modified. The
036 * main use-case is the update of locations where the local (adjusted) text has
037 * different line ending, different content due to keyword expansion, or minor
038 * local modifications compared to the text on which the analysis was executed
039 * (original text).
040 *
041 * Both the original and adjusted text may have arbitrary line endings.
042 *
043 * The implementation is based on a token diff, which can lead to minor
044 * deviations for offsets that are not aligned with token boundaries. A
045 * character diff would be more precise, but is too performance and memory
046 * intensive for large files.
047 */
048public class LocationAdjuster implements ILineAdjuster {
049
050        /**
051         * If the number of tokens in the adjusted region differs by the tokens in the
052         * original region by more than this factor, the mapping is counted as wrong.
053         */
054        private static final double LOSS_FACTOR = 2;
055
056        /** Maximal number of tokens in the diff we accept. */
057        private static final int MAX_DIFF_SIZE = 5000;
058
059        /**
060         * Pattern defining tokens for the diff. Matches either alphanumeric strings
061         * (typical identifiers), or single non-whitespace characters.
062         */
063        private static final Pattern TOKEN_PATTERN = Pattern.compile("[a-zA-Z0-9_]+|\\S");
064
065        /** The tokens of the original string. */
066        private final List<AdjusterToken> originalTokens;
067
068        /**
069         * Adjusted tokens corresponding to the {@link #originalTokens}. If there is no
070         * corresponding token, this list contains null at the index. If the content
071         * could not be matched/adjusted at all (too many differences), this field is
072         * null.
073         */
074        private final List<AdjusterToken> mappedAdjustedTokens;
075
076        /** Line offset converted for the original text. */
077        private final LineOffsetConverter originalLineOffsetConverter;
078
079        /** Line offset converted for the adjusted text. */
080        private final LineOffsetConverter adjustedLineOffsetConverter;
081
082        /**
083         * Optional uniform path to correct element locations of
084         * {@link #adjustLocation(ElementLocation)}. Null if no correction will be
085         * performed.
086         */
087        private final String adjustedUniformPath;
088
089        /**
090         * Constructor.
091         *
092         * WARNING: Creating a location adjuster is very expensive and should only be
093         * done once per file. In case originalText and adjustedText is identical take a
094         * look at {@link SimpleValidLinesFilter}.
095         *
096         * @param originalText
097         *            the text for which the input locations have been created, i.e. the
098         *            text from the analysis. May be <code>null</code>, in which case no
099         *            adjustment is performed.
100         * @param adjustedText
101         *            the text for which the locations should be adjusted, i.e. the
102         *            local text.
103         * @param adjustedUniformPath
104         *            the adjusted uniform path for adjusted findings.
105         */
106        public LocationAdjuster(String originalText, String adjustedText, String adjustedUniformPath) {
107                if (originalText == null) {
108                        originalText = adjustedText;
109                }
110                this.adjustedUniformPath = adjustedUniformPath;
111                originalLineOffsetConverter = new LineOffsetConverter(originalText);
112                adjustedLineOffsetConverter = new LineOffsetConverter(adjustedText);
113                originalTokens = toTokens(originalText);
114                mappedAdjustedTokens = calculateMappedAdjustedTokens(adjustedText, originalTokens);
115        }
116
117        /**
118         * Constructor.
119         *
120         * @param originalText
121         *            the text for which the input locations have been created, i.e. the
122         *            text from the analysis.
123         * @param adjustedText
124         *            the text for which the locations should be adjusted, i.e. the
125         *            local text.
126         */
127        public LocationAdjuster(String originalText, String adjustedText) {
128                this(originalText, adjustedText, null);
129        }
130
131        /**
132         * Calculates the #mappedAdjustedTokens based on original tokens and adjusted
133         * text. May return null if adjustment is not possible due too many changes.
134         */
135        private static List<AdjusterToken> calculateMappedAdjustedTokens(String adjustedText,
136                        List<AdjusterToken> originalTokens) {
137
138                List<AdjusterToken> adjustedTokens = toTokens(adjustedText);
139                Delta<AdjusterToken> delta = Diff.computeDelta(originalTokens, adjustedTokens, MAX_DIFF_SIZE);
140
141                if (delta.getSize() >= MAX_DIFF_SIZE) {
142                        return null;
143                }
144
145                return calculateMappedAdjustedTokensFromDelta(delta, originalTokens, adjustedTokens);
146        }
147
148        /**
149         * Calculates the #mappedAdjustedTokens based on original tokens, adjusted text,
150         * and delta.
151         */
152        private static List<AdjusterToken> calculateMappedAdjustedTokensFromDelta(Delta<AdjusterToken> delta,
153                        List<AdjusterToken> originalTokens, List<AdjusterToken> adjustedTokens) {
154                List<AdjusterToken> mappedAdjustedTokens = new ArrayList<>(Collections.nCopies(originalTokens.size(), null));
155                int originalIndex = 0;
156                int adjustedIndex = 0;
157                for (int i = 0; i < delta.getSize(); ++i) {
158                        int position = delta.getPosition(i);
159                        if (position > 0) {
160                                position -= 1;
161
162                                while (adjustedIndex < position) {
163                                        mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
164                                }
165                                adjustedIndex += 1;
166                        } else {
167                                position = -position - 1;
168
169                                while (originalIndex < position) {
170                                        mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
171                                }
172                                originalIndex += 1;
173                        }
174                }
175
176                while (originalIndex < originalTokens.size()) {
177                        mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++));
178                }
179
180                return mappedAdjustedTokens;
181        }
182
183        /** Splits a string into tokens. */
184        private static List<AdjusterToken> toTokens(String s) {
185                List<AdjusterToken> tokens = new ArrayList<>();
186                Matcher matcher = TOKEN_PATTERN.matcher(s);
187                while (matcher.find()) {
188                        tokens.add(new AdjusterToken(matcher.group(), matcher.start()));
189                }
190                return tokens;
191        }
192
193        /**
194         * Maps a zero-based offset range (both inclusive) to the adjusted string.
195         * Returns <code>null</code> if the region could not be approximately mapped.
196         */
197        public Region getAdjustedRegion(int originalStartOffset, int originalEndOffset) {
198
199                if (mappedAdjustedTokens == null) {
200                        return null;
201                }
202
203                Region originalIndexRegion = findOriginalIndexRegion(originalStartOffset, originalEndOffset);
204                if (originalIndexRegion.isEmpty()) {
205                        return null;
206                }
207
208                int numOriginalTokens = originalIndexRegion.getLength();
209                int numAdjustedTokens = 0;
210
211                AdjusterToken firstAdjustedToken = null;
212                AdjusterToken lastAdjustedToken = null;
213                for (int i = originalIndexRegion.getStart(); i <= originalIndexRegion.getEnd(); ++i) {
214                        AdjusterToken adjustedToken = mappedAdjustedTokens.get(i);
215                        if (adjustedToken != null) {
216                                numAdjustedTokens += 1;
217                                if (firstAdjustedToken == null) {
218                                        firstAdjustedToken = adjustedToken;
219                                }
220                                lastAdjustedToken = adjustedToken;
221                        }
222                }
223
224                if (firstAdjustedToken == null || lastAdjustedToken == null
225                                || LOSS_FACTOR * numAdjustedTokens < numOriginalTokens) {
226                        return null;
227                }
228
229                return new Region(firstAdjustedToken.startOffset, lastAdjustedToken.endOffset);
230        }
231
232        /**
233         * Returns the region of indexes in the {@link #originalTokens} contained in the
234         * given offsets.
235         */
236        private Region findOriginalIndexRegion(int originalStartOffset, int originalEndOffset) {
237                AdjusterToken searchToken = new AdjusterToken(null, originalStartOffset, originalEndOffset);
238                int originalStartTokenIndex = Collections.binarySearch(originalTokens, searchToken,
239                                AdjusterToken.COMPARE_BY_START_OFFSET);
240                if (originalStartTokenIndex < 0) {
241                        originalStartTokenIndex = -originalStartTokenIndex - 1;
242                }
243
244                int originalEndTokenIndex = Collections.binarySearch(originalTokens, searchToken,
245                                AdjusterToken.COMPARE_BY_END_OFFSET);
246                if (originalEndTokenIndex < 0) {
247                        // we want insertion point -1
248                        originalEndTokenIndex = -originalEndTokenIndex - 2;
249                }
250
251                return new Region(originalStartTokenIndex, originalEndTokenIndex);
252        }
253
254        /**
255         * Returns a new location with adjusted offsets (if necessary). Returns
256         * <code>null</code> if the location cannot be mapped to the adjusted text.
257         */
258        public ElementLocation adjustLocation(ElementLocation location) {
259                if (location instanceof TextRegionLocation) {
260                        return adjustLocation((TextRegionLocation) location);
261                }
262
263                // other locations do not have offsets, if the uniform path should not
264                // be adjusted simply return the original location.
265                if (adjustedUniformPath == null) {
266                        return location;
267                }
268
269                if (location instanceof QualifiedNameLocation) {
270                        return new QualifiedNameLocation(((QualifiedNameLocation) location).getQualifiedName(),
271                                        location.getLocation(), adjustedUniformPath);
272                }
273
274                return new ElementLocation(location.getLocation(), adjustedUniformPath);
275        }
276
277        /**
278         * Returns a new location with adjusted offsets (if necessary). Returns
279         * <code>null</code> if the cannot be mapped to the adjusted text.
280         */
281        public TextRegionLocation adjustLocation(TextRegionLocation location) {
282                int startOffset = location.getRawStartOffset();
283                int endOffset = location.getRawEndOffset();
284                if (startOffset < 0) {
285                        int startLine = location.getRawStartLine();
286                        if (!originalLineOffsetConverter.isValidLine(startLine)) {
287                                return null;
288                        }
289                        startOffset = originalLineOffsetConverter.getOffset(startLine);
290                }
291                if (endOffset < 0) {
292                        int endLine = location.getRawEndLine() + 1;
293                        if (!originalLineOffsetConverter.isValidLine(endLine)) {
294                                return null;
295                        }
296                        endOffset = originalLineOffsetConverter.getOffset(endLine) - 1;
297                }
298
299                Region adjustedOffsets = getAdjustedRegion(startOffset, endOffset);
300
301                if (adjustedOffsets == null || adjustedOffsets.isEmpty()) {
302                        return null;
303                }
304
305                String uniformPath = location.getUniformPath();
306                if (adjustedUniformPath != null) {
307                        uniformPath = adjustedUniformPath;
308                }
309
310                int newStartOffset = adjustedOffsets.getStart();
311                int newEndOffset = adjustedOffsets.getEnd();
312                return new TextRegionLocation(location.getLocation(), uniformPath, newStartOffset, newEndOffset,
313                                adjustedLineOffsetConverter.getLine(newStartOffset), adjustedLineOffsetConverter.getLine(newEndOffset));
314        }
315
316        /**
317         * Adjusts the location of a single line. This only respects the token part of a
318         * line, i.e. leading and trailing whitespace of a line will be ignored. This
319         * method is robust w.r.t lines numbers that are out of the range of the
320         * original text. In case of such an invalid line, the line is logged as error
321         * to the given logger and <code>null</code> is returned.
322         *
323         * @param line
324         *            the one-based line number of be adjusted.
325         * @param invalidLines
326         *            used for collecting invalid lines.
327         *
328         * @return the one-based lines encoded as a region, as a line may map to
329         *         multiple lines after changing. This may also return null, if no
330         *         non-empty lines could be found that correspond to the input line
331         *         after adjustment.
332         */
333        @Override
334        public LineBasedRegion adjustLine(int line, Set<Integer> invalidLines) {
335                if (!originalLineOffsetConverter.isValidLine(line) || !originalLineOffsetConverter.isValidLine(line + 1)) {
336                        invalidLines.add(line);
337                        return null;
338                }
339                int originalStartOffset = originalLineOffsetConverter.getOffset(line);
340                int originalEndOffset = originalLineOffsetConverter.getOffset(line + 1) - 1;
341                Region adjustedOffsets = getAdjustedRegion(originalStartOffset, originalEndOffset);
342                if (adjustedOffsets == null) {
343                        return null;
344                }
345
346                int adjustedStartLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getStart());
347                int adjustedEndLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getEnd());
348                return new LineBasedRegion(adjustedStartLine, adjustedEndLine);
349        }
350
351        /** Returns the line count of the original text */
352        @Override
353        public int getOriginalLineCount() {
354                return originalLineOffsetConverter.getLineCount();
355        }
356
357        /** Simple token representation used in location adjustment. */
358        private static class AdjusterToken {
359
360                /** Compares by start offset. */
361                private static final Comparator<AdjusterToken> COMPARE_BY_START_OFFSET = Comparator
362                                .comparingInt(token -> token.startOffset);
363
364                /** Compares by end offset. */
365                private static final Comparator<AdjusterToken> COMPARE_BY_END_OFFSET = Comparator
366                                .comparingInt(token -> token.endOffset);
367
368                /** The text content. */
369                private final String text;
370
371                /** The start offset in the text. */
372                private final int startOffset;
373
374                /** The inclusive end offset in the text. */
375                private final int endOffset;
376
377                /** Constructor. */
378                private AdjusterToken(String text, int startOffset) {
379                        this(text, startOffset, startOffset + text.length() - 1);
380                }
381
382                /** Constructor. */
383                private AdjusterToken(String text, int startOffset, int endOffset) {
384                        this.text = text;
385                        this.startOffset = startOffset;
386                        this.endOffset = endOffset;
387                }
388
389                /** {@inheritDoc} */
390                @Override
391                public boolean equals(Object obj) {
392                        return (obj instanceof AdjusterToken) && ((AdjusterToken) obj).text.equals(text);
393                }
394
395                /** {@inheritDoc} */
396                @Override
397                public int hashCode() {
398                        return text.hashCode();
399                }
400        }
401}