001/*-----------------------------------------------------------------------+ 002 | org.conqat.engine.index.incubator 003 | | 004 $Id$ 005 | | 006 | Copyright (c) 2009-2013 CQSE GmbH | 007 +-----------------------------------------------------------------------*/ 008package eu.cqse.check.framework.util.tokens; 009 010import java.util.ArrayList; 011import java.util.List; 012import java.util.Set; 013 014import org.conqat.lib.commons.assertion.CCSMAssert; 015import org.conqat.lib.commons.collections.CollectionUtils; 016 017import eu.cqse.check.framework.scanner.ETokenType; 018import eu.cqse.check.framework.scanner.ETokenType.ETokenClass; 019import eu.cqse.check.framework.scanner.IToken; 020 021/** 022 * A pattern that can be applied like a regular expression to a token stream. 023 * 024 * This class is not fully redundant to TokenTypePattern for two reasons: 025 * <ul> 026 * <li>It allows matching nested structures, e.g. parentheses, with 027 * {@link #skipNested(Object, Object, boolean)} 028 * <li>It allows named capture groups, something which is not currently possible 029 * with Java 7: http://bugs.java.com/bugdatabase/view_bug.do?bug_id=8013252 030 * </ul> 031 */ 032public class TokenPattern { 033 /** 034 * The parts of the pattern that are matched in sequence. 035 */ 036 private final List<TokenPatternBase> submatchers = new ArrayList<>(); 037 038 /** {@inheritDoc} */ 039 @Override 040 public String toString() { 041 return "submatchers " + submatchers.toString(); 042 } 043 044 /** 045 * Matches the pattern at all positions in the given token stream and returns 046 * the found matches. This also finds overlapping matches. Given the token 047 * sequence <code>const int a</code> and pattern 048 * <code>(KEYWORD)* IDENTIFIFER</code>, this method will return 3 matches 049 * <code>const int a</code>, <code>int a</code> & <code>a</code>. 050 */ 051 public List<TokenPatternMatch> findAll(List<IToken> tokens) { 052 List<TokenPatternMatch> matches = new ArrayList<>(); 053 TokenStream stream = new TokenStream(tokens, 0); 054 for (int i = 0; i < tokens.size(); i++) { 055 stream.setPosition(i); 056 TokenPatternMatch match = matchesLocally(stream); 057 if (match != null) { 058 matches.add(match); 059 } 060 } 061 return matches; 062 } 063 064 /** 065 * Matches the pattern at all positions in the given token stream and returns 066 * the first found match or <code>null</code> if no match happened. 067 */ 068 public TokenPatternMatch findFirstMatch(List<IToken> tokens) { 069 for (int i = 0; i < tokens.size(); i++) { 070 TokenStream stream = new TokenStream(tokens, i); 071 TokenPatternMatch match = matchesLocally(stream); 072 if (match != null) { 073 return match; 074 } 075 } 076 return null; 077 } 078 079 /** 080 * Finds and returns non-overlapping matches in the tokens. Given the token 081 * sequence <code>const int a</code> and pattern 082 * <code>(KEYWORD)* IDENTIFIFER</code>, this method will return only one match 083 * contrary to {@link #findAll} that will return 3 matches. The search starts at 084 * beginning of token stream say point x. If a match is found say at position x 085 * + 5, the next search starts just after the position of the previous 086 * successful search (x + 6). If no match is found, the search continues from 087 * position x + 1. This ensures no overlapping matches. 088 */ 089 public List<TokenPatternMatch> findNonOverlappingMatches(List<IToken> tokens) { 090 List<TokenPatternMatch> matches = new ArrayList<>(); 091 TokenStream stream = new TokenStream(tokens, 0); 092 int i = 0; 093 while (i < tokens.size()) { 094 TokenPatternMatch match = matchesLocally(stream); 095 if (match == null) { 096 i += 1; 097 stream.setPosition(i); 098 continue; 099 } 100 101 // Stream position advances any time we have a match 102 matches.add(match); 103 i = stream.getPosition(); 104 if (stream.isExhausted()) { 105 break; 106 } 107 } 108 109 return matches; 110 } 111 112 /** 113 * Matches the pattern at the start position of the given tokens. Returns the 114 * found match or <code>null</code> if no match happened. 115 */ 116 public TokenPatternMatch matchAtStartOf(List<IToken> tokens) { 117 TokenStream stream = new TokenStream(tokens, 0); 118 TokenPatternMatch match = matchesLocally(stream); 119 return match; 120 } 121 122 /** 123 * Matches the pattern at the start position of the token stream. Returns the 124 * found match or <code>null</code> if no match happened. The stream's position 125 * is preserved. 126 */ 127 public TokenPatternMatch matchAtCurrentPosition(TokenStream stream) { 128 int position = stream.getPosition(); 129 TokenPatternMatch match = matchesLocally(stream); 130 stream.setPosition(position); 131 return match; 132 } 133 134 /** 135 * Returns <code>true</code> if the pattern matches at least once in the given 136 * token stream. 137 */ 138 public boolean matchesAnywhere(List<IToken> tokens) { 139 return findFirstMatch(tokens) != null; 140 } 141 142 /** 143 * Matches the pattern once at the current position of the given token stream 144 * and records the results in the given match. 145 * 146 * @return A {@link TokenPatternMatch} if the pattern matched or 147 * <code>null</code> if the pattern did not match. 148 */ 149 private TokenPatternMatch matchesLocally(TokenStream stream) { 150 return new SequencePattern(CollectionUtils.toArray(submatchers, TokenPatternBase.class)).matchesLocally(stream); 151 } 152 153 /** 154 * Specifies that the last added subpattern should be appended to the group with 155 * the given index. The group index may be any integer number. Group indices are 156 * not required to be ordered or continuous. 157 */ 158 public TokenPattern group(int groupIndex) { 159 CCSMAssert.isFalse(submatchers.isEmpty(), 160 "You must specify a pattern part before you can assign a group to it."); 161 submatchers.get(submatchers.size() - 1).setGroupIndex(groupIndex); 162 return this; 163 } 164 165 /** 166 * Adds a new pattern part that consumes anything until it encounters one of the 167 * given match terms. The stop token matching one of the given terms is also 168 * consumed. 169 * 170 * A group applied to this pattern will only contain the stop token. 171 * 172 * @param matchTerms 173 * a list of match terms which must match in order. These may be 174 * instances of {@link ETokenType}, {@link ETokenClass}, or sets of 175 * them, as well as {@link TokenPattern}s. 176 */ 177 public TokenPattern skipTo(Object... matchTerms) { 178 TokenPatternBase[] matchers = convertMatchTerms(matchTerms); 179 submatchers.add(new SkipPattern(matchers)); 180 submatchers.add(new AlternativePattern(matchers)); 181 return this; 182 } 183 184 /** 185 * Adds a new pattern part that consumes nested structures. 186 * 187 * @param openTerm 188 * a match term that identifies the opening braces. May be an 189 * instance of {@link ETokenType}, {@link ETokenClass}, or a set of 190 * them, as well as an {@link TokenPattern}. 191 * @param closeTerm 192 * a match term that identifies the closing braces. May be an 193 * instance of {@link ETokenType}, {@link ETokenClass}, or a set of 194 * them, as well as an {@link TokenPattern}. 195 * @param optional 196 * if this is <code>false</code>, the nested structure must be 197 * present or the pattern will not match. If this is 198 * <code>true</code>, this pattern may return an empty match. 199 */ 200 public TokenPattern skipNested(Object openTerm, Object closeTerm, boolean optional) { 201 TokenPatternBase openMatcher = convertMatchTerm(openTerm); 202 TokenPatternBase closeMatcher = convertMatchTerm(closeTerm); 203 submatchers.add(new SkipNestedPattern(openMatcher, closeMatcher, optional)); 204 return this; 205 } 206 207 /** 208 * Adds a new "repeated" pattern part that matches the given terms zero or more 209 * times in the given order. 210 * 211 * @param matchTerms 212 * a list of match terms which must match in order. These may be 213 * instances of {@link ETokenType}, {@link ETokenClass}, or sets of 214 * them, as well as {@link TokenPattern}s. 215 */ 216 public TokenPattern repeated(Object... matchTerms) { 217 TokenPatternBase[] matchers = convertMatchTerms(matchTerms); 218 submatchers.add(new RepeatedPattern(matchers)); 219 return this; 220 } 221 222 /** 223 * Adds a new pattern part that matches the given sequence of terms exactly once 224 * in the given order. 225 * 226 * @param matchTerms 227 * a list of match terms which must match in order. These may be 228 * instances of {@link ETokenType}, {@link ETokenClass}, or sets of 229 * them, as well as {@link TokenPattern}s. 230 */ 231 public TokenPattern sequence(Object... matchTerms) { 232 TokenPatternBase[] matchers = convertMatchTerms(matchTerms); 233 submatchers.add(new SequencePattern(matchers)); 234 return this; 235 } 236 237 /** 238 * Adds a new pattern part that matches the end of the token stream (i.e. there 239 * are no more tokens to match). 240 */ 241 public TokenPattern endOfStream() { 242 submatchers.add(new ZeroLengthTokenPatternBase() { 243 244 @Override 245 public boolean conditionApplies(TokenStream stream) { 246 return stream.isExhausted(); 247 } 248 249 /** {@inheritDoc} */ 250 @Override 251 public String toString() { 252 return "ZeroLength is exhausted"; 253 } 254 }); 255 return this; 256 } 257 258 /** 259 * Adds a new pattern part that matches the beginning of the token stream (i.e. 260 * no tokens have been consumed so far). 261 */ 262 public TokenPattern beginningOfStream() { 263 submatchers.add(new ZeroLengthTokenPatternBase() { 264 265 @Override 266 public boolean conditionApplies(TokenStream stream) { 267 return stream.isAtBeginning(); 268 } 269 270 /** {@inheritDoc} */ 271 @Override 272 public String toString() { 273 return "ZeroLength at beginning"; 274 } 275 }); 276 return this; 277 } 278 279 /** 280 * Adds a new pattern part that matches anything but the beginning of the token 281 * stream (i.e. at least one token has been consumed so far). 282 */ 283 public TokenPattern notAtBeginningOfStream() { 284 submatchers.add(new ZeroLengthTokenPatternBase() { 285 286 @Override 287 public boolean conditionApplies(TokenStream stream) { 288 return !stream.isAtBeginning(); 289 } 290 291 /** {@inheritDoc} */ 292 @Override 293 public String toString() { 294 return "ZeroLength not at beginning"; 295 } 296 }); 297 return this; 298 } 299 300 /** 301 * Adds a new pattern part that matches if the last matched token is not 302 * followed by a token that matches the given term. This pattern part does not 303 * consume any tokens. 304 * 305 * @param matchTerm 306 * May be an instance of {@link ETokenType}, {@link ETokenClass}, or 307 * a set of them, as well as a {@link TokenPattern}. 308 */ 309 public TokenPattern notFollowedBy(Object matchTerm) { 310 TokenPatternBase matcher = convertMatchTerm(matchTerm); 311 submatchers.add(new NotFollowedByPattern(matcher)); 312 return this; 313 } 314 315 /** 316 * Adds a new pattern part that matches if the last matched token is not 317 * preceded by a token that matches the given term. This pattern part does not 318 * consume any tokens. 319 * 320 * NOTE: passing a {@link TokenPattern} to this function will not result in an 321 * exception, but it will not yield the intuitive result. The given pattern will 322 * be matched against the token stream, starting with the token preceding the 323 * current token. It is therefore discouraged to pass a {@link TokenPattern} to 324 * this function as it makes understanding the pattern difficult. 325 * 326 * @param matchTerm 327 * May be an instance of {@link ETokenType}, {@link ETokenClass}, or 328 * a set of them, but usually not a {@link TokenPattern}. 329 */ 330 public TokenPattern notPrecededBy(Object matchTerm) { 331 TokenPatternBase matcher = convertMatchTerm(matchTerm); 332 submatchers.add(new NotPrecededByPattern(matcher)); 333 return this; 334 } 335 336 /** 337 * Adds a new alternative pattern part that tries the given match terms one 338 * after the other and stops as soon as one of them matches. 339 * 340 * @param matchTerms 341 * a list of match terms which must match in order. These may be 342 * instances of {@link ETokenType}, {@link ETokenClass}, or sets of 343 * them, as well as {@link TokenPattern}s. 344 */ 345 public TokenPattern alternative(Object... matchTerms) { 346 TokenPatternBase[] matchers = convertMatchTerms(matchTerms); 347 submatchers.add(new AlternativePattern(matchers)); 348 return this; 349 } 350 351 /** 352 * Adds a new optional pattern part that matches the given terms zero times or 353 * once in the given order. 354 * 355 * @param matchTerms 356 * a list of match terms which must match in order. These may be 357 * instances of {@link ETokenType}, {@link ETokenClass}, or sets of 358 * them, as well as {@link TokenPattern}s. 359 */ 360 public TokenPattern optional(Object... matchTerms) { 361 TokenPatternBase[] matchers = convertMatchTerms(matchTerms); 362 submatchers.add(new OptionalPattern(matchers)); 363 return this; 364 } 365 366 /** 367 * Adds a new pattern part that matches token text with specified regex. 368 * 369 * @param regex 370 * string to contain in token text. 371 */ 372 public TokenPattern regex(String regex) { 373 submatchers.add(new RegexPattern(regex)); 374 return this; 375 } 376 377 /** Converts the given match terms to {@link TokenPatternBase}s. */ 378 private static TokenPatternBase[] convertMatchTerms(Object[] matchTerms) { 379 CCSMAssert.isFalse(matchTerms.length == 0, "You must provide at least one match term."); 380 TokenPatternBase[] matchers = new TokenPatternBase[matchTerms.length]; 381 for (int i = 0; i < matchTerms.length; ++i) { 382 matchers[i] = convertMatchTerm(matchTerms[i]); 383 } 384 return matchers; 385 } 386 387 /** Converts a {@link TokenPattern} match term to a matcher. */ 388 private static TokenPatternBase convertMatchTerm(final TokenPattern matchTerm) { 389 return new TokenPatternBase() { 390 @Override 391 protected TokenPatternMatch matchesLocally(TokenStream stream) { 392 TokenPattern pattern = matchTerm; 393 int beforePosition = stream.getPosition(); 394 TokenPatternMatch match = pattern.matchesLocally(stream); 395 if (match == null) { 396 stream.setPosition(beforePosition); 397 return null; 398 } 399 return match; 400 } 401 402 /** {@inheritDoc} */ 403 @Override 404 public String toString() { 405 return matchTerm.toString(); 406 } 407 }; 408 } 409 410 /** Converts a {@link ETokenType} match term to a matcher. */ 411 private static TokenPatternBase convertMatchTerm(final ETokenType matchTerm) { 412 return new SingleTokenPatternBase() { 413 @Override 414 public boolean matchesToken(IToken token) { 415 return token.getType() == matchTerm; 416 } 417 418 /** {@inheritDoc} */ 419 @Override 420 public String toString() { 421 return matchTerm.toString(); 422 } 423 }; 424 } 425 426 /** Converts a ETokenClass match term to a matcher. */ 427 private static TokenPatternBase convertMatchTerm(final ETokenClass matchTerm) { 428 return new SingleTokenPatternBase() { 429 @Override 430 public boolean matchesToken(IToken token) { 431 return token.getType().getTokenClass() == matchTerm; 432 } 433 434 /** {@inheritDoc} */ 435 @Override 436 public String toString() { 437 return matchTerm.toString(); 438 } 439 }; 440 } 441 442 /** Converts a Set of match terms to a matcher. */ 443 private static TokenPatternBase convertMatchTerm(final Set<?> matchTerm) { 444 final Set<?> set = matchTerm; 445 return new SingleTokenPatternBase() { 446 @Override 447 public boolean matchesToken(IToken token) { 448 ETokenType type = token.getType(); 449 return set.contains(type) || set.contains(type.getTokenClass()); 450 } 451 452 /** {@inheritDoc} */ 453 @Override 454 public String toString() { 455 return matchTerm.toString(); 456 } 457 }; 458 } 459 460 /** Converts a match term to a matcher. */ 461 private static TokenPatternBase convertMatchTerm(final Object matchTerm) { 462 if (matchTerm instanceof TokenPatternBase) { 463 return (TokenPatternBase) matchTerm; 464 } 465 if (matchTerm instanceof TokenPattern) { 466 return convertMatchTerm((TokenPattern) matchTerm); 467 } 468 if (matchTerm instanceof ETokenType) { 469 return convertMatchTerm((ETokenType) matchTerm); 470 } 471 if (matchTerm instanceof ETokenClass) { 472 return convertMatchTerm((ETokenClass) matchTerm); 473 } 474 if (matchTerm instanceof Set<?>) { 475 return convertMatchTerm((Set<?>) matchTerm); 476 } 477 throw new AssertionError("Unsupported match term of type " + matchTerm.getClass()); 478 } 479 480 /** 481 * Matches the given terms one or more times. Shorthand for 482 * <code>.sequence(matchTerms).repeated(matchTerms)</code> 483 */ 484 public TokenPattern repeatedAtLeastOnce(Object... matchTerms) { 485 return sequence(matchTerms).repeated(matchTerms); 486 } 487 488 /** 489 * Returns a new Token pattern that matches on tokens that have exactly the 490 * given text. 491 */ 492 public static TokenPatternBase text(String tokenText) { 493 return new SingleTokenPatternBase() { 494 @Override 495 public boolean matchesToken(IToken token) { 496 return tokenText.equals(token.getText()); 497 } 498 499 /** {@inheritDoc} */ 500 @Override 501 public String toString() { 502 return tokenText; 503 } 504 }; 505 } 506}