001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 the ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package org.conqat.engine.commons.findings.location; 018 019import java.util.ArrayList; 020import java.util.Collections; 021import java.util.Comparator; 022import java.util.List; 023import java.util.Set; 024import java.util.regex.Matcher; 025import java.util.regex.Pattern; 026 027import org.conqat.lib.commons.algo.Diff; 028import org.conqat.lib.commons.algo.Diff.Delta; 029import org.conqat.lib.commons.region.LineBasedRegion; 030import org.conqat.lib.commons.region.Region; 031import org.conqat.lib.commons.string.LineOffsetConverter; 032 033/** 034 * This class is used for adjusting the offsets used in locations (i.e. 035 * subclasses of {@link ElementLocation} for text that is slightly modified. The 036 * main use-case is the update of locations where the local (adjusted) text has 037 * different line ending, different content due to keyword expansion, or minor 038 * local modifications compared to the text on which the analysis was executed 039 * (original text). 040 * 041 * Both the original and adjusted text may have arbitrary line endings. 042 * 043 * The implementation is based on a token diff, which can lead to minor 044 * deviations for offsets that are not aligned with token boundaries. A 045 * character diff would be more precise, but is too performance and memory 046 * intensive for large files. 047 */ 048public class LocationAdjuster implements ILineAdjuster { 049 050 /** 051 * If the number of tokens in the adjusted region differs by the tokens in the 052 * original region by more than this factor, the mapping is counted as wrong. 053 */ 054 private static final double LOSS_FACTOR = 2; 055 056 /** Maximal number of tokens in the diff we accept. */ 057 private static final int MAX_DIFF_SIZE = 5000; 058 059 /** 060 * Pattern defining tokens for the diff. Matches either alphanumeric strings 061 * (typical identifiers), or single non-whitespace characters. 062 */ 063 private static final Pattern TOKEN_PATTERN = Pattern.compile("[a-zA-Z0-9_]+|\\S"); 064 065 /** The tokens of the original string. */ 066 private final List<AdjusterToken> originalTokens; 067 068 /** 069 * Adjusted tokens corresponding to the {@link #originalTokens}. If there is no 070 * corresponding token, this list contains null at the index. If the content 071 * could not be matched/adjusted at all (too many differences), this field is 072 * null. 073 */ 074 private final List<AdjusterToken> mappedAdjustedTokens; 075 076 /** Line offset converted for the original text. */ 077 private final LineOffsetConverter originalLineOffsetConverter; 078 079 /** Line offset converted for the adjusted text. */ 080 private final LineOffsetConverter adjustedLineOffsetConverter; 081 082 /** 083 * Optional uniform path to correct element locations of 084 * {@link #adjustLocation(ElementLocation)}. Null if no correction will be 085 * performed. 086 */ 087 private final String adjustedUniformPath; 088 089 /** 090 * Constructor. 091 * 092 * WARNING: Creating a location adjuster is very expensive and should only be 093 * done once per file. In case originalText and adjustedText is identical take a 094 * look at {@link SimpleValidLinesFilter}. 095 * 096 * @param originalText 097 * the text for which the input locations have been created, i.e. the 098 * text from the analysis. May be <code>null</code>, in which case no 099 * adjustment is performed. 100 * @param adjustedText 101 * the text for which the locations should be adjusted, i.e. the 102 * local text. 103 * @param adjustedUniformPath 104 * the adjusted uniform path for adjusted findings. 105 */ 106 public LocationAdjuster(String originalText, String adjustedText, String adjustedUniformPath) { 107 if (originalText == null) { 108 originalText = adjustedText; 109 } 110 this.adjustedUniformPath = adjustedUniformPath; 111 originalLineOffsetConverter = new LineOffsetConverter(originalText); 112 adjustedLineOffsetConverter = new LineOffsetConverter(adjustedText); 113 originalTokens = toTokens(originalText); 114 mappedAdjustedTokens = calculateMappedAdjustedTokens(adjustedText, originalTokens); 115 } 116 117 /** 118 * Constructor. 119 * 120 * @param originalText 121 * the text for which the input locations have been created, i.e. the 122 * text from the analysis. 123 * @param adjustedText 124 * the text for which the locations should be adjusted, i.e. the 125 * local text. 126 */ 127 public LocationAdjuster(String originalText, String adjustedText) { 128 this(originalText, adjustedText, null); 129 } 130 131 /** 132 * Calculates the #mappedAdjustedTokens based on original tokens and adjusted 133 * text. May return null if adjustment is not possible due too many changes. 134 */ 135 private static List<AdjusterToken> calculateMappedAdjustedTokens(String adjustedText, 136 List<AdjusterToken> originalTokens) { 137 138 List<AdjusterToken> adjustedTokens = toTokens(adjustedText); 139 Delta<AdjusterToken> delta = Diff.computeDelta(originalTokens, adjustedTokens, MAX_DIFF_SIZE); 140 141 if (delta.getSize() >= MAX_DIFF_SIZE) { 142 return null; 143 } 144 145 return calculateMappedAdjustedTokensFromDelta(delta, originalTokens, adjustedTokens); 146 } 147 148 /** 149 * Calculates the #mappedAdjustedTokens based on original tokens, adjusted text, 150 * and delta. 151 */ 152 private static List<AdjusterToken> calculateMappedAdjustedTokensFromDelta(Delta<AdjusterToken> delta, 153 List<AdjusterToken> originalTokens, List<AdjusterToken> adjustedTokens) { 154 List<AdjusterToken> mappedAdjustedTokens = new ArrayList<>(Collections.nCopies(originalTokens.size(), null)); 155 int originalIndex = 0; 156 int adjustedIndex = 0; 157 for (int i = 0; i < delta.getSize(); ++i) { 158 int position = delta.getPosition(i); 159 if (position > 0) { 160 position -= 1; 161 162 while (adjustedIndex < position) { 163 mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++)); 164 } 165 adjustedIndex += 1; 166 } else { 167 position = -position - 1; 168 169 while (originalIndex < position) { 170 mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++)); 171 } 172 originalIndex += 1; 173 } 174 } 175 176 while (originalIndex < originalTokens.size()) { 177 mappedAdjustedTokens.set(originalIndex++, adjustedTokens.get(adjustedIndex++)); 178 } 179 180 return mappedAdjustedTokens; 181 } 182 183 /** Splits a string into tokens. */ 184 private static List<AdjusterToken> toTokens(String s) { 185 List<AdjusterToken> tokens = new ArrayList<>(); 186 Matcher matcher = TOKEN_PATTERN.matcher(s); 187 while (matcher.find()) { 188 tokens.add(new AdjusterToken(matcher.group(), matcher.start())); 189 } 190 return tokens; 191 } 192 193 /** 194 * Maps a zero-based offset range (both inclusive) to the adjusted string. 195 * Returns <code>null</code> if the region could not be approximately mapped. 196 */ 197 public Region getAdjustedRegion(int originalStartOffset, int originalEndOffset) { 198 199 if (mappedAdjustedTokens == null) { 200 return null; 201 } 202 203 Region originalIndexRegion = findOriginalIndexRegion(originalStartOffset, originalEndOffset); 204 if (originalIndexRegion.isEmpty()) { 205 return null; 206 } 207 208 int numOriginalTokens = originalIndexRegion.getLength(); 209 int numAdjustedTokens = 0; 210 211 AdjusterToken firstAdjustedToken = null; 212 AdjusterToken lastAdjustedToken = null; 213 for (int i = originalIndexRegion.getStart(); i <= originalIndexRegion.getEnd(); ++i) { 214 AdjusterToken adjustedToken = mappedAdjustedTokens.get(i); 215 if (adjustedToken != null) { 216 numAdjustedTokens += 1; 217 if (firstAdjustedToken == null) { 218 firstAdjustedToken = adjustedToken; 219 } 220 lastAdjustedToken = adjustedToken; 221 } 222 } 223 224 if (firstAdjustedToken == null || lastAdjustedToken == null 225 || LOSS_FACTOR * numAdjustedTokens < numOriginalTokens) { 226 return null; 227 } 228 229 return new Region(firstAdjustedToken.startOffset, lastAdjustedToken.endOffset); 230 } 231 232 /** 233 * Returns the region of indexes in the {@link #originalTokens} contained in the 234 * given offsets. 235 */ 236 private Region findOriginalIndexRegion(int originalStartOffset, int originalEndOffset) { 237 AdjusterToken searchToken = new AdjusterToken(null, originalStartOffset, originalEndOffset); 238 int originalStartTokenIndex = Collections.binarySearch(originalTokens, searchToken, 239 AdjusterToken.COMPARE_BY_START_OFFSET); 240 if (originalStartTokenIndex < 0) { 241 originalStartTokenIndex = -originalStartTokenIndex - 1; 242 } 243 244 int originalEndTokenIndex = Collections.binarySearch(originalTokens, searchToken, 245 AdjusterToken.COMPARE_BY_END_OFFSET); 246 if (originalEndTokenIndex < 0) { 247 // we want insertion point -1 248 originalEndTokenIndex = -originalEndTokenIndex - 2; 249 } 250 251 return new Region(originalStartTokenIndex, originalEndTokenIndex); 252 } 253 254 /** 255 * Returns a new location with adjusted offsets (if necessary). Returns 256 * <code>null</code> if the location cannot be mapped to the adjusted text. 257 */ 258 public ElementLocation adjustLocation(ElementLocation location) { 259 if (location instanceof TextRegionLocation) { 260 return adjustLocation((TextRegionLocation) location); 261 } 262 263 // other locations do not have offsets, if the uniform path should not 264 // be adjusted simply return the original location. 265 if (adjustedUniformPath == null) { 266 return location; 267 } 268 269 if (location instanceof QualifiedNameLocation) { 270 return new QualifiedNameLocation(((QualifiedNameLocation) location).getQualifiedName(), 271 location.getLocation(), adjustedUniformPath); 272 } 273 274 return new ElementLocation(location.getLocation(), adjustedUniformPath); 275 } 276 277 /** 278 * Returns a new location with adjusted offsets (if necessary). Returns 279 * <code>null</code> if the cannot be mapped to the adjusted text. 280 */ 281 public TextRegionLocation adjustLocation(TextRegionLocation location) { 282 int startOffset = location.getRawStartOffset(); 283 int endOffset = location.getRawEndOffset(); 284 if (startOffset < 0) { 285 int startLine = location.getRawStartLine(); 286 if (!originalLineOffsetConverter.isValidLine(startLine)) { 287 return null; 288 } 289 startOffset = originalLineOffsetConverter.getOffset(startLine); 290 } 291 if (endOffset < 0) { 292 int endLine = location.getRawEndLine() + 1; 293 if (!originalLineOffsetConverter.isValidLine(endLine)) { 294 return null; 295 } 296 endOffset = originalLineOffsetConverter.getOffset(endLine) - 1; 297 } 298 299 Region adjustedOffsets = getAdjustedRegion(startOffset, endOffset); 300 301 if (adjustedOffsets == null || adjustedOffsets.isEmpty()) { 302 return null; 303 } 304 305 String uniformPath = location.getUniformPath(); 306 if (adjustedUniformPath != null) { 307 uniformPath = adjustedUniformPath; 308 } 309 310 int newStartOffset = adjustedOffsets.getStart(); 311 int newEndOffset = adjustedOffsets.getEnd(); 312 return new TextRegionLocation(location.getLocation(), uniformPath, newStartOffset, newEndOffset, 313 adjustedLineOffsetConverter.getLine(newStartOffset), adjustedLineOffsetConverter.getLine(newEndOffset)); 314 } 315 316 /** 317 * Adjusts the location of a single line. This only respects the token part of a 318 * line, i.e. leading and trailing whitespace of a line will be ignored. This 319 * method is robust w.r.t lines numbers that are out of the range of the 320 * original text. In case of such an invalid line, the line is logged as error 321 * to the given logger and <code>null</code> is returned. 322 * 323 * @param line 324 * the one-based line number of be adjusted. 325 * @param invalidLines 326 * used for collecting invalid lines. 327 * 328 * @return the one-based lines encoded as a region, as a line may map to 329 * multiple lines after changing. This may also return null, if no 330 * non-empty lines could be found that correspond to the input line 331 * after adjustment. 332 */ 333 @Override 334 public LineBasedRegion adjustLine(int line, Set<Integer> invalidLines) { 335 if (!originalLineOffsetConverter.isValidLine(line) || !originalLineOffsetConverter.isValidLine(line + 1)) { 336 invalidLines.add(line); 337 return null; 338 } 339 int originalStartOffset = originalLineOffsetConverter.getOffset(line); 340 int originalEndOffset = originalLineOffsetConverter.getOffset(line + 1) - 1; 341 Region adjustedOffsets = getAdjustedRegion(originalStartOffset, originalEndOffset); 342 if (adjustedOffsets == null) { 343 return null; 344 } 345 346 int adjustedStartLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getStart()); 347 int adjustedEndLine = adjustedLineOffsetConverter.getLine(adjustedOffsets.getEnd()); 348 return new LineBasedRegion(adjustedStartLine, adjustedEndLine); 349 } 350 351 /** Returns the line count of the original text */ 352 @Override 353 public int getOriginalLineCount() { 354 return originalLineOffsetConverter.getLineCount(); 355 } 356 357 /** Simple token representation used in location adjustment. */ 358 private static class AdjusterToken { 359 360 /** Compares by start offset. */ 361 private static final Comparator<AdjusterToken> COMPARE_BY_START_OFFSET = Comparator 362 .comparingInt(token -> token.startOffset); 363 364 /** Compares by end offset. */ 365 private static final Comparator<AdjusterToken> COMPARE_BY_END_OFFSET = Comparator 366 .comparingInt(token -> token.endOffset); 367 368 /** The text content. */ 369 private final String text; 370 371 /** The start offset in the text. */ 372 private final int startOffset; 373 374 /** The inclusive end offset in the text. */ 375 private final int endOffset; 376 377 /** Constructor. */ 378 private AdjusterToken(String text, int startOffset) { 379 this(text, startOffset, startOffset + text.length() - 1); 380 } 381 382 /** Constructor. */ 383 private AdjusterToken(String text, int startOffset, int endOffset) { 384 this.text = text; 385 this.startOffset = startOffset; 386 this.endOffset = endOffset; 387 } 388 389 /** {@inheritDoc} */ 390 @Override 391 public boolean equals(Object obj) { 392 return (obj instanceof AdjusterToken) && ((AdjusterToken) obj).text.equals(text); 393 } 394 395 /** {@inheritDoc} */ 396 @Override 397 public int hashCode() { 398 return text.hashCode(); 399 } 400 } 401}