Source code

001/*-----------------------------------------------------------------------+
002 | org.conqat.engine.index.incubator
003 |                                                                       |
004   $Id$
005 |                                                                       |
006 | Copyright (c)  2009-2013 CQSE GmbH                                 |
007 +-----------------------------------------------------------------------*/
008package eu.cqse.check.framework.util.tokens;
009
010import java.util.ArrayList;
011import java.util.List;
012import java.util.Set;
013
014import org.conqat.lib.commons.assertion.CCSMAssert;
015import org.conqat.lib.commons.collections.CollectionUtils;
016
017import eu.cqse.check.framework.scanner.ETokenType;
018import eu.cqse.check.framework.scanner.ETokenType.ETokenClass;
019import eu.cqse.check.framework.scanner.IToken;
020
021/**
022 * A pattern that can be applied like a regular expression to a token stream.
023 *
024 * This class is not fully redundant to TokenTypePattern for two reasons:
025 * <ul>
026 * <li>It allows matching nested structures, e.g. parentheses, with
027 * {@link #skipNested(Object, Object, boolean)}
028 * <li>It allows named capture groups, something which is not currently possible
029 * with Java 7: http://bugs.java.com/bugdatabase/view_bug.do?bug_id=8013252
030 * </ul>
031 */
032public class TokenPattern {
033        /**
034         * The parts of the pattern that are matched in sequence.
035         */
036        private final List<TokenPatternBase> submatchers = new ArrayList<>();
037
038        /** {@inheritDoc} */
039        @Override
040        public String toString() {
041                return "submatchers " + submatchers.toString();
042        }
043
044        /**
045         * Matches the pattern at all positions in the given token stream and returns
046         * the found matches. This also finds overlapping matches. Given the token
047         * sequence <code>const int a</code> and pattern
048         * <code>(KEYWORD)* IDENTIFIFER</code>, this method will return 3 matches
049         * <code>const int a</code>, <code>int a</code> & <code>a</code>.
050         */
051        public List<TokenPatternMatch> findAll(List<IToken> tokens) {
052                List<TokenPatternMatch> matches = new ArrayList<>();
053                TokenStream stream = new TokenStream(tokens, 0);
054                for (int i = 0; i < tokens.size(); i++) {
055                        stream.setPosition(i);
056                        TokenPatternMatch match = matchesLocally(stream);
057                        if (match != null) {
058                                matches.add(match);
059                        }
060                }
061                return matches;
062        }
063
064        /**
065         * Matches the pattern at all positions in the given token stream and returns
066         * the first found match or <code>null</code> if no match happened.
067         */
068        public TokenPatternMatch findFirstMatch(List<IToken> tokens) {
069                for (int i = 0; i < tokens.size(); i++) {
070                        TokenStream stream = new TokenStream(tokens, i);
071                        TokenPatternMatch match = matchesLocally(stream);
072                        if (match != null) {
073                                return match;
074                        }
075                }
076                return null;
077        }
078
079        /**
080         * Finds and returns non-overlapping matches in the tokens. Given the token
081         * sequence <code>const int a</code> and pattern
082         * <code>(KEYWORD)* IDENTIFIFER</code>, this method will return only one match
083         * contrary to {@link #findAll} that will return 3 matches. The search starts at
084         * beginning of token stream say point x. If a match is found say at position x
085         * + 5, the next search starts just after the position of the previous
086         * successful search (x + 6). If no match is found, the search continues from
087         * position x + 1. This ensures no overlapping matches.
088         */
089        public List<TokenPatternMatch> findNonOverlappingMatches(List<IToken> tokens) {
090                List<TokenPatternMatch> matches = new ArrayList<>();
091                TokenStream stream = new TokenStream(tokens, 0);
092                int i = 0;
093                while (i < tokens.size()) {
094                        TokenPatternMatch match = matchesLocally(stream);
095                        if (match == null) {
096                                i += 1;
097                                stream.setPosition(i);
098                                continue;
099                        }
100
101                        // Stream position advances any time we have a match
102                        matches.add(match);
103                        i = stream.getPosition();
104                        if (stream.isExhausted()) {
105                                break;
106                        }
107                }
108
109                return matches;
110        }
111
112        /**
113         * Matches the pattern at the start position of the given tokens. Returns the
114         * found match or <code>null</code> if no match happened.
115         */
116        public TokenPatternMatch matchAtStartOf(List<IToken> tokens) {
117                TokenStream stream = new TokenStream(tokens, 0);
118                TokenPatternMatch match = matchesLocally(stream);
119                return match;
120        }
121
122        /**
123         * Matches the pattern at the start position of the token stream. Returns the
124         * found match or <code>null</code> if no match happened. The stream's position
125         * is preserved.
126         */
127        public TokenPatternMatch matchAtCurrentPosition(TokenStream stream) {
128                int position = stream.getPosition();
129                TokenPatternMatch match = matchesLocally(stream);
130                stream.setPosition(position);
131                return match;
132        }
133
134        /**
135         * Returns <code>true</code> if the pattern matches at least once in the given
136         * token stream.
137         */
138        public boolean matchesAnywhere(List<IToken> tokens) {
139                return findFirstMatch(tokens) != null;
140        }
141
142        /**
143         * Matches the pattern once at the current position of the given token stream
144         * and records the results in the given match.
145         *
146         * @return A {@link TokenPatternMatch} if the pattern matched or
147         *         <code>null</code> if the pattern did not match.
148         */
149        private TokenPatternMatch matchesLocally(TokenStream stream) {
150                return new SequencePattern(CollectionUtils.toArray(submatchers, TokenPatternBase.class)).matchesLocally(stream);
151        }
152
153        /**
154         * Specifies that the last added subpattern should be appended to the group with
155         * the given index. The group index may be any integer number. Group indices are
156         * not required to be ordered or continuous.
157         */
158        public TokenPattern group(int groupIndex) {
159                CCSMAssert.isFalse(submatchers.isEmpty(),
160                                "You must specify a pattern part before you can assign a group to it.");
161                submatchers.get(submatchers.size() - 1).setGroupIndex(groupIndex);
162                return this;
163        }
164
165        /**
166         * Adds a new pattern part that consumes anything until it encounters one of the
167         * given match terms. The stop token matching one of the given terms is also
168         * consumed.
169         *
170         * A group applied to this pattern will only contain the stop token.
171         *
172         * @param matchTerms
173         *            a list of match terms which must match in order. These may be
174         *            instances of {@link ETokenType}, {@link ETokenClass}, or sets of
175         *            them, as well as {@link TokenPattern}s.
176         */
177        public TokenPattern skipTo(Object... matchTerms) {
178                TokenPatternBase[] matchers = convertMatchTerms(matchTerms);
179                submatchers.add(new SkipPattern(matchers));
180                submatchers.add(new AlternativePattern(matchers));
181                return this;
182        }
183
184        /**
185         * Adds a new pattern part that consumes nested structures.
186         *
187         * @param openTerm
188         *            a match term that identifies the opening braces. May be an
189         *            instance of {@link ETokenType}, {@link ETokenClass}, or a set of
190         *            them, as well as an {@link TokenPattern}.
191         * @param closeTerm
192         *            a match term that identifies the closing braces. May be an
193         *            instance of {@link ETokenType}, {@link ETokenClass}, or a set of
194         *            them, as well as an {@link TokenPattern}.
195         * @param optional
196         *            if this is <code>false</code>, the nested structure must be
197         *            present or the pattern will not match. If this is
198         *            <code>true</code>, this pattern may return an empty match.
199         */
200        public TokenPattern skipNested(Object openTerm, Object closeTerm, boolean optional) {
201                TokenPatternBase openMatcher = convertMatchTerm(openTerm);
202                TokenPatternBase closeMatcher = convertMatchTerm(closeTerm);
203                submatchers.add(new SkipNestedPattern(openMatcher, closeMatcher, optional));
204                return this;
205        }
206
207        /**
208         * Adds a new "repeated" pattern part that matches the given terms zero or more
209         * times in the given order.
210         *
211         * @param matchTerms
212         *            a list of match terms which must match in order. These may be
213         *            instances of {@link ETokenType}, {@link ETokenClass}, or sets of
214         *            them, as well as {@link TokenPattern}s.
215         */
216        public TokenPattern repeated(Object... matchTerms) {
217                TokenPatternBase[] matchers = convertMatchTerms(matchTerms);
218                submatchers.add(new RepeatedPattern(matchers));
219                return this;
220        }
221
222        /**
223         * Adds a new pattern part that matches the given sequence of terms exactly once
224         * in the given order.
225         *
226         * @param matchTerms
227         *            a list of match terms which must match in order. These may be
228         *            instances of {@link ETokenType}, {@link ETokenClass}, or sets of
229         *            them, as well as {@link TokenPattern}s.
230         */
231        public TokenPattern sequence(Object... matchTerms) {
232                TokenPatternBase[] matchers = convertMatchTerms(matchTerms);
233                submatchers.add(new SequencePattern(matchers));
234                return this;
235        }
236
237        /**
238         * Adds a new pattern part that matches the end of the token stream (i.e. there
239         * are no more tokens to match).
240         */
241        public TokenPattern endOfStream() {
242                submatchers.add(new ZeroLengthTokenPatternBase() {
243
244                        @Override
245                        public boolean conditionApplies(TokenStream stream) {
246                                return stream.isExhausted();
247                        }
248
249                        /** {@inheritDoc} */
250                        @Override
251                        public String toString() {
252                                return "ZeroLength is exhausted";
253                        }
254                });
255                return this;
256        }
257
258        /**
259         * Adds a new pattern part that matches the beginning of the token stream (i.e.
260         * no tokens have been consumed so far).
261         */
262        public TokenPattern beginningOfStream() {
263                submatchers.add(new ZeroLengthTokenPatternBase() {
264
265                        @Override
266                        public boolean conditionApplies(TokenStream stream) {
267                                return stream.isAtBeginning();
268                        }
269
270                        /** {@inheritDoc} */
271                        @Override
272                        public String toString() {
273                                return "ZeroLength at beginning";
274                        }
275                });
276                return this;
277        }
278
279        /**
280         * Adds a new pattern part that matches anything but the beginning of the token
281         * stream (i.e. at least one token has been consumed so far).
282         */
283        public TokenPattern notAtBeginningOfStream() {
284                submatchers.add(new ZeroLengthTokenPatternBase() {
285
286                        @Override
287                        public boolean conditionApplies(TokenStream stream) {
288                                return !stream.isAtBeginning();
289                        }
290
291                        /** {@inheritDoc} */
292                        @Override
293                        public String toString() {
294                                return "ZeroLength not at beginning";
295                        }
296                });
297                return this;
298        }
299
300        /**
301         * Adds a new pattern part that matches if the last matched token is not
302         * followed by a token that matches the given term. This pattern part does not
303         * consume any tokens.
304         *
305         * @param matchTerm
306         *            May be an instance of {@link ETokenType}, {@link ETokenClass}, or
307         *            a set of them, as well as a {@link TokenPattern}.
308         */
309        public TokenPattern notFollowedBy(Object matchTerm) {
310                TokenPatternBase matcher = convertMatchTerm(matchTerm);
311                submatchers.add(new NotFollowedByPattern(matcher));
312                return this;
313        }
314
315        /**
316         * Adds a new pattern part that matches if the last matched token is not
317         * preceded by a token that matches the given term. This pattern part does not
318         * consume any tokens.
319         *
320         * NOTE: passing a {@link TokenPattern} to this function will not result in an
321         * exception, but it will not yield the intuitive result. The given pattern will
322         * be matched against the token stream, starting with the token preceding the
323         * current token. It is therefore discouraged to pass a {@link TokenPattern} to
324         * this function as it makes understanding the pattern difficult.
325         *
326         * @param matchTerm
327         *            May be an instance of {@link ETokenType}, {@link ETokenClass}, or
328         *            a set of them, but usually not a {@link TokenPattern}.
329         */
330        public TokenPattern notPrecededBy(Object matchTerm) {
331                TokenPatternBase matcher = convertMatchTerm(matchTerm);
332                submatchers.add(new NotPrecededByPattern(matcher));
333                return this;
334        }
335
336        /**
337         * Adds a new alternative pattern part that tries the given match terms one
338         * after the other and stops as soon as one of them matches.
339         *
340         * @param matchTerms
341         *            a list of match terms which must match in order. These may be
342         *            instances of {@link ETokenType}, {@link ETokenClass}, or sets of
343         *            them, as well as {@link TokenPattern}s.
344         */
345        public TokenPattern alternative(Object... matchTerms) {
346                TokenPatternBase[] matchers = convertMatchTerms(matchTerms);
347                submatchers.add(new AlternativePattern(matchers));
348                return this;
349        }
350
351        /**
352         * Adds a new optional pattern part that matches the given terms zero times or
353         * once in the given order.
354         *
355         * @param matchTerms
356         *            a list of match terms which must match in order. These may be
357         *            instances of {@link ETokenType}, {@link ETokenClass}, or sets of
358         *            them, as well as {@link TokenPattern}s.
359         */
360        public TokenPattern optional(Object... matchTerms) {
361                TokenPatternBase[] matchers = convertMatchTerms(matchTerms);
362                submatchers.add(new OptionalPattern(matchers));
363                return this;
364        }
365
366        /**
367         * Adds a new pattern part that matches token text with specified regex.
368         *
369         * @param regex
370         *            string to contain in token text.
371         */
372        public TokenPattern regex(String regex) {
373                submatchers.add(new RegexPattern(regex));
374                return this;
375        }
376
377        /** Converts the given match terms to {@link TokenPatternBase}s. */
378        private static TokenPatternBase[] convertMatchTerms(Object[] matchTerms) {
379                CCSMAssert.isFalse(matchTerms.length == 0, "You must provide at least one match term.");
380                TokenPatternBase[] matchers = new TokenPatternBase[matchTerms.length];
381                for (int i = 0; i < matchTerms.length; ++i) {
382                        matchers[i] = convertMatchTerm(matchTerms[i]);
383                }
384                return matchers;
385        }
386
387        /** Converts a {@link TokenPattern} match term to a matcher. */
388        private static TokenPatternBase convertMatchTerm(final TokenPattern matchTerm) {
389                return new TokenPatternBase() {
390                        @Override
391                        protected TokenPatternMatch matchesLocally(TokenStream stream) {
392                                TokenPattern pattern = matchTerm;
393                                int beforePosition = stream.getPosition();
394                                TokenPatternMatch match = pattern.matchesLocally(stream);
395                                if (match == null) {
396                                        stream.setPosition(beforePosition);
397                                        return null;
398                                }
399                                return match;
400                        }
401
402                        /** {@inheritDoc} */
403                        @Override
404                        public String toString() {
405                                return matchTerm.toString();
406                        }
407                };
408        }
409
410        /** Converts a {@link ETokenType} match term to a matcher. */
411        private static TokenPatternBase convertMatchTerm(final ETokenType matchTerm) {
412                return new SingleTokenPatternBase() {
413                        @Override
414                        public boolean matchesToken(IToken token) {
415                                return token.getType() == matchTerm;
416                        }
417
418                        /** {@inheritDoc} */
419                        @Override
420                        public String toString() {
421                                return matchTerm.toString();
422                        }
423                };
424        }
425
426        /** Converts a ETokenClass match term to a matcher. */
427        private static TokenPatternBase convertMatchTerm(final ETokenClass matchTerm) {
428                return new SingleTokenPatternBase() {
429                        @Override
430                        public boolean matchesToken(IToken token) {
431                                return token.getType().getTokenClass() == matchTerm;
432                        }
433
434                        /** {@inheritDoc} */
435                        @Override
436                        public String toString() {
437                                return matchTerm.toString();
438                        }
439                };
440        }
441
442        /** Converts a Set of match terms to a matcher. */
443        private static TokenPatternBase convertMatchTerm(final Set<?> matchTerm) {
444                final Set<?> set = matchTerm;
445                return new SingleTokenPatternBase() {
446                        @Override
447                        public boolean matchesToken(IToken token) {
448                                ETokenType type = token.getType();
449                                return set.contains(type) || set.contains(type.getTokenClass());
450                        }
451
452                        /** {@inheritDoc} */
453                        @Override
454                        public String toString() {
455                                return matchTerm.toString();
456                        }
457                };
458        }
459
460        /** Converts a match term to a matcher. */
461        private static TokenPatternBase convertMatchTerm(final Object matchTerm) {
462                if (matchTerm instanceof TokenPatternBase) {
463                        return (TokenPatternBase) matchTerm;
464                }
465                if (matchTerm instanceof TokenPattern) {
466                        return convertMatchTerm((TokenPattern) matchTerm);
467                }
468                if (matchTerm instanceof ETokenType) {
469                        return convertMatchTerm((ETokenType) matchTerm);
470                }
471                if (matchTerm instanceof ETokenClass) {
472                        return convertMatchTerm((ETokenClass) matchTerm);
473                }
474                if (matchTerm instanceof Set<?>) {
475                        return convertMatchTerm((Set<?>) matchTerm);
476                }
477                throw new AssertionError("Unsupported match term of type " + matchTerm.getClass());
478        }
479
480        /**
481         * Matches the given terms one or more times. Shorthand for
482         * <code>.sequence(matchTerms).repeated(matchTerms)</code>
483         */
484        public TokenPattern repeatedAtLeastOnce(Object... matchTerms) {
485                return sequence(matchTerms).repeated(matchTerms);
486        }
487
488        /**
489         * Returns a new Token pattern that matches on tokens that have exactly the
490         * given text.
491         */
492        public static TokenPatternBase text(String tokenText) {
493                return new SingleTokenPatternBase() {
494                        @Override
495                        public boolean matchesToken(IToken token) {
496                                return tokenText.equals(token.getText());
497                        }
498
499                        /** {@inheritDoc} */
500                        @Override
501                        public String toString() {
502                                return tokenText;
503                        }
504                };
505        }
506}