001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 the ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package eu.cqse.check.framework.shallowparser.languages.base;
018
019import static eu.cqse.check.framework.scanner.ETokenType.AND;
020import static eu.cqse.check.framework.scanner.ETokenType.ANDAND;
021import static eu.cqse.check.framework.scanner.ETokenType.ANDEQ;
022import static eu.cqse.check.framework.scanner.ETokenType.CASE;
023import static eu.cqse.check.framework.scanner.ETokenType.CATCH;
024import static eu.cqse.check.framework.scanner.ETokenType.COLON;
025import static eu.cqse.check.framework.scanner.ETokenType.COMMA;
026import static eu.cqse.check.framework.scanner.ETokenType.COMP;
027import static eu.cqse.check.framework.scanner.ETokenType.CONST;
028import static eu.cqse.check.framework.scanner.ETokenType.DEFAULT;
029import static eu.cqse.check.framework.scanner.ETokenType.DO;
030import static eu.cqse.check.framework.scanner.ETokenType.ELSE;
031import static eu.cqse.check.framework.scanner.ETokenType.ENUM;
032import static eu.cqse.check.framework.scanner.ETokenType.EQ;
033import static eu.cqse.check.framework.scanner.ETokenType.FINALLY;
034import static eu.cqse.check.framework.scanner.ETokenType.IDENTIFIER;
035import static eu.cqse.check.framework.scanner.ETokenType.IF;
036import static eu.cqse.check.framework.scanner.ETokenType.LBRACE;
037import static eu.cqse.check.framework.scanner.ETokenType.LBRACK;
038import static eu.cqse.check.framework.scanner.ETokenType.LITERALS;
039import static eu.cqse.check.framework.scanner.ETokenType.LPAREN;
040import static eu.cqse.check.framework.scanner.ETokenType.MINUS;
041import static eu.cqse.check.framework.scanner.ETokenType.MULT;
042import static eu.cqse.check.framework.scanner.ETokenType.NOT;
043import static eu.cqse.check.framework.scanner.ETokenType.NOTEQ;
044import static eu.cqse.check.framework.scanner.ETokenType.OR;
045import static eu.cqse.check.framework.scanner.ETokenType.OREQ;
046import static eu.cqse.check.framework.scanner.ETokenType.OROR;
047import static eu.cqse.check.framework.scanner.ETokenType.PLUS;
048import static eu.cqse.check.framework.scanner.ETokenType.RBRACE;
049import static eu.cqse.check.framework.scanner.ETokenType.RBRACK;
050import static eu.cqse.check.framework.scanner.ETokenType.RPAREN;
051import static eu.cqse.check.framework.scanner.ETokenType.SEMICOLON;
052import static eu.cqse.check.framework.scanner.ETokenType.TRY;
053import static eu.cqse.check.framework.scanner.ETokenType.TYPEDEF;
054import static eu.cqse.check.framework.scanner.ETokenType.UNION;
055import static eu.cqse.check.framework.scanner.ETokenType.USING;
056import static eu.cqse.check.framework.scanner.ETokenType.VAR;
057import static eu.cqse.check.framework.scanner.ETokenType.WHILE;
058import static eu.cqse.check.framework.scanner.ETokenType.XOR;
059import static eu.cqse.check.framework.scanner.ETokenType.XOREQ;
060import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_EXPRESSION;
061import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_METHOD;
062import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_TYPE;
063import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.TOP_LEVEL;
064
065import java.util.Arrays;
066import java.util.EnumSet;
067import java.util.HashSet;
068import java.util.List;
069
070import org.conqat.lib.commons.region.Region;
071
072import eu.cqse.check.framework.scanner.ETokenType;
073import eu.cqse.check.framework.scanner.IToken;
074import eu.cqse.check.framework.shallowparser.SubTypeNames;
075import eu.cqse.check.framework.shallowparser.TokenStreamUtils;
076import eu.cqse.check.framework.shallowparser.framework.EShallowEntityType;
077import eu.cqse.check.framework.shallowparser.framework.RecognizerBase;
078import eu.cqse.check.framework.shallowparser.framework.ShallowParserBase;
079
080/**
081 * Base class for C-style languages (C++, Java, C#).
082 */
083public abstract class CStyleShallowParserBase extends ShallowParserBase<EGenericParserStates> {
084
        /** C++ operators that have an alternative (string) representation. */
086        protected static final EnumSet<ETokenType> OPERATORS_WITH_ALTERNATIVE_REPRESENTATION = EnumSet.of(NOT, COMP, NOTEQ,
087                        ANDAND, OROR, AND, OR, XOR, ANDEQ, OREQ, XOREQ);
088
089        /** Constructor. */
090        protected CStyleShallowParserBase() {
091                super(EGenericParserStates.class, EGenericParserStates.TOP_LEVEL);
092                createMetaRules();
093                createTypeRules();
094                createClassElementsRules();
095                createStatementRules();
096                createSubExpressionRules();
097        }
098
099        /** Creates rules for meta elements. */
100        protected void createMetaRules() {
101                // deal with dangling closing braces by inserting broken node (endNode
102                // intentionally omitted)
103                inAnyState().sequence(RBRACE).createNode(EShallowEntityType.META, "dangling closing brace");
104        }
105
106        /** Parser rules for module/namespace and type creation. */
107        protected void createTypeRules() {
108                // types; we have to ensure when skipping to the LBRACE, that there is
109                // no earlier SEMICOLON
110                inAnyState().repeated(getTypeModifier()).markStart().sequence(getTypeKeywords(), getValidIdentifiers())
111                                .skipBefore(EnumSet.of(SEMICOLON, LBRACE)).sequence(LBRACE).createNode(EShallowEntityType.TYPE, 0, 1)
112                                .parseUntil(IN_TYPE).sequence(RBRACE).endNode();
113        }
114
115        /** Creates rules for C style typedef */
116        protected void createTypedefRules() {
117                RecognizerBase<EGenericParserStates> typeInTypedefAlternative = inAnyState().sequence(TYPEDEF).optional(CONST);
118                typeInTypedefAlternative.sequenceBefore(getTypeKeywords(), IDENTIFIER, LBRACE)
119                                .createNode(EShallowEntityType.TYPE, 0).parseOnce(TOP_LEVEL).skipTo(IDENTIFIER, SEMICOLON)
120                                .endNodeWithName(-2);
121                typeInTypedefAlternative.sequenceBefore(getTypeKeywords(), LBRACE).createNode(EShallowEntityType.TYPE, 0)
122                                .parseOnce(TOP_LEVEL).skipTo(IDENTIFIER, SEMICOLON).endNodeWithName(-2);
123
124                // skips to the name identifier (skips anything enclosed in <...>)
125                RecognizerBase<EGenericParserStates> simpleTypedefAlternative = inAnyState().sequence(TYPEDEF)
126                                .createNode(EShallowEntityType.TYPE, 0)
127                                .skipBeforeWithNesting(Arrays.asList(IDENTIFIER, EnumSet.of(SEMICOLON, RPAREN, LBRACK)),
128                                                Arrays.asList(ETokenType.LT), Arrays.asList(ETokenType.GT), null);
129
130                // array types such as "typedef foo <COMPLEX STUFF> name [COMPLEX STUFF];"
131                simpleTypedefAlternative.markStart().sequenceBefore(IDENTIFIER, EnumSet.of(SEMICOLON, LBRACK))
132                                .skipToWithNesting(SEMICOLON, LBRACK, RBRACK).endNodeWithName(0);
133                simpleTypedefAlternative.markStart().sequence(IDENTIFIER, RPAREN).skipTo(SEMICOLON).endNodeWithName(0);
134        }
135
136        /**
137         * Returns the valid type modifiers for the language. Default implementation
138         * returns empty set. Override to use correct modifiers.
139         */
140        protected EnumSet<ETokenType> getTypeModifier() {
141                return EnumSet.noneOf(ETokenType.class);
142        }
143
        /**
         * Returns the set of keywords that start a type. The set is used by
         * {@link #createTypeRules()} and {@link #createTypedefRules()} to match the
         * keyword that introduces a type.
         */
        protected abstract EnumSet<ETokenType> getTypeKeywords();
146
        /**
         * Parser rules for both attributes and methods. Called by the constructor
         * after {@link #createTypeRules()} and before {@link #createStatementRules()}.
         */
        protected abstract void createClassElementsRules();
149
        /**
         * Creates parser rules for statements by invoking the individual rule
         * creation methods one after another.
         */
        protected void createStatementRules() {
                // NOTE(review): the registration sequence presumably determines rule
                // precedence, so keep it stable -- confirm against the framework
                createEmptyStatementRule();
                createLabelRule();
                createElseIfRule();
                createBasicBlockRules();
                createCaseRule();
                createDoWhileRule();
                createGenericBlockRule();
                createSimpleStatementRule();
        }
161
162        /** The empty statement. */
163        private void createEmptyStatementRule() {
164                inState(IN_METHOD).sequence(SEMICOLON).createNode(EShallowEntityType.STATEMENT, SubTypeNames.EMPTY_STATEMENT)
165                                .endNode();
166        }
167
168        /** Matches labels. */
169        private void createLabelRule() {
170                // filter out labels as meta as they do not increase statement count
171                inState(IN_METHOD).sequence(getValidIdentifiers(), COLON)
172                                .createNode(EShallowEntityType.META, SubTypeNames.LABEL, 0).endNode();
173        }
174
175        /** Special rule for else-if. */
176        protected void createElseIfRule() {
177                RecognizerBase<EGenericParserStates> elseIfAlternative = inState(IN_METHOD).sequence(ELSE, IF)
178                                .createNode(EShallowEntityType.STATEMENT, new int[] { 0, 1 })
179                                .skipNested(LPAREN, RPAREN, getSubExpressionRecognizer());
180                endWithPossibleContinuation(elseIfAlternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE),
181                                EnumSet.of(ELSE));
182                endWithPossibleContinuation(elseIfAlternative.parseOnce(IN_METHOD), EnumSet.of(ELSE));
183        }
184
185        /**
186         * Block constructs, such as if/else, while/for/switch, try/catch/finally,
187         * synchronized (only in some languages).
188         */
189        protected void createBasicBlockRules() {
190                createBlockRuleWithContinuation(getSimpleBlockKeywordsWithParentheses(), null, true, false);
191                createBlockRuleWithContinuation(getSimpleBlockKeywordsWithoutParentheses(), null, false, false);
192                createBlockRuleWithContinuation(EnumSet.of(IF), EnumSet.of(ELSE), true, false);
193                createBlockRuleWithContinuation(EnumSet.of(TRY, CATCH), EnumSet.of(CATCH, FINALLY), true, true);
194        }
195
196        /**
197         * Case statement is parsed as meta, as it is hardly a statement on its own.
198         */
199        protected void createCaseRule() {
200                HashSet<ETokenType> literalsAndIdentifiers = new HashSet<>(LITERALS);
201                literalsAndIdentifiers.addAll(getValidIdentifiers());
202                inState(IN_METHOD).sequence(CASE).optional(MINUS).sequence(literalsAndIdentifiers)
203                                .optional(PLUS, literalsAndIdentifiers).sequence(COLON).createNode(EShallowEntityType.META, 0, 1)
204                                .endNode();
205                inState(IN_METHOD).sequence(CASE, LPAREN).skipToWithNesting(RPAREN, LPAREN, RPAREN).sequence(COLON)
206                                .createNode(EShallowEntityType.META, 0, new Region(1, -2)).endNode();
207
208                inState(IN_METHOD).sequence(DEFAULT, COLON).createNode(EShallowEntityType.META, 0).endNode();
209        }
210
211        /** Creates the do-while rule. */
212        protected void createDoWhileRule() {
213                RecognizerBase<EGenericParserStates> doWhileAlternative = inState(IN_METHOD).sequence(DO)
214                                .createNode(EShallowEntityType.STATEMENT, 0);
215                doWhileAlternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE, WHILE)
216                                .skipNested(LPAREN, RPAREN, getSubExpressionRecognizer()).optional(ETokenType.SEMICOLON).endNode();
217                doWhileAlternative.parseOnce(IN_METHOD).sequence(WHILE).skipNested(LPAREN, RPAREN, getSubExpressionRecognizer())
218                                .optional(ETokenType.SEMICOLON).endNode();
219        }
220
221        /** Generic block. */
222        private void createGenericBlockRule() {
223                inState(IN_METHOD).sequence(LBRACE).createNode(EShallowEntityType.STATEMENT, SubTypeNames.ANONYMOUS_BLOCK)
224                                .parseUntil(IN_METHOD).sequence(RBRACE).endNode();
225        }
226
227        /**
228         * Contributes rules for detecting simple statements (local variable
229         * declarations, assignments, etc.) inside methods.
230         */
231        protected void contributeSimpleStatementRules(EnumSet<ETokenType> localVariableIdentifiers,
232                        EnumSet<ETokenType> statementStartTokens) {
233
234                // heuristic for detecting local variables
235                completeSimpleStatement(
236                                typePatternInState(IN_METHOD).skipNested(LBRACE, RBRACE).markStart()
237                                                .sequenceBefore(localVariableIdentifiers, EnumSet.of(COMMA, EQ, SEMICOLON, LBRACK)),
238                                SubTypeNames.LOCAL_VARIABLE);
239
240                // enum and union local variables
241                completeSimpleStatement(inState(IN_METHOD).sequence(EnumSet.of(ENUM, UNION)).skipNested(LBRACE, RBRACE)
242                                .markStart().sequenceBefore(localVariableIdentifiers, SEMICOLON), SubTypeNames.LOCAL_VARIABLE);
243
244                // function pointer local variables
245                completeSimpleStatement(typePatternInState(IN_METHOD).repeated(LPAREN).sequence(MULT).markStart()
246                                .sequenceBefore(localVariableIdentifiers, RPAREN), SubTypeNames.LOCAL_VARIABLE);
247
248                // C#8 using declaration
249                completeSimpleStatement(inState(IN_METHOD).sequence(USING).optional(VAR).markStart().sequence(IDENTIFIER));
250
251                completeSimpleStatement(inState(IN_METHOD).sequence(LPAREN).markStart().sequence(IDENTIFIER));
252                completeSimpleStatement(inState(IN_METHOD).sequence(statementStartTokens));
253                completeSimpleStatement(typePatternInState(IN_METHOD));
254                completeSimpleStatement(inState(IN_METHOD).sequence(LITERALS));
255        }
256
257        /** Simple statement. */
258        protected void createSimpleStatementRule() {
259                contributeSimpleStatementRules(getValidIdentifiers(), getStatementStartTokens());
260        }
261
        /**
         * Creates rules for dealing with constructs in subexpressions, such as
         * anonymous classes, lambdas, etc. Called last by the constructor. This
         * default implementation registers no rules.
         */
        protected void createSubExpressionRules() {
                // default implementation does nothing
        }
269
        /**
         * Returns a recognizer used for detecting sub expressions (anonymous classes,
         * lambdas, etc.) within expressions. This may return null (which is done by the
         * default implementation). Within this class the result is handed on to the
         * skipNested and skipBeforeWithNesting calls of the statement rules.
         */
        protected RecognizerBase<EGenericParserStates> getSubExpressionRecognizer() {
                return null;
        }
278
279        /**
280         * Returns the set of all valid identifiers, i.e. token types that can be used
281         * to name elements in the language.
282         */
283        protected EnumSet<ETokenType> getValidIdentifiers() {
284                return EnumSet.of(IDENTIFIER);
285        }
286
        /**
         * Returns the set of all keywords that start a simple block with optional
         * parentheses (see implementers for examples). Used by
         * {@link #createBasicBlockRules()} to register block rules without
         * continuation tokens.
         */
        protected abstract EnumSet<ETokenType> getSimpleBlockKeywordsWithParentheses();
292
        /**
         * Returns the set of all keywords that start a simple block but are never
         * followed by parentheses (see implementers for examples). Used by
         * {@link #createBasicBlockRules()} to register block rules without
         * continuation tokens.
         */
        protected abstract EnumSet<ETokenType> getSimpleBlockKeywordsWithoutParentheses();
298
        /**
         * Returns a set of all tokens that can start a statement, besides a type (see
         * {@link #typePatternInState(EGenericParserStates...)}) and a literal.
         */
        protected abstract EnumSet<ETokenType> getStatementStartTokens();
304
        /**
         * Creates a recognizer that matches all valid types.
         *
         * @param currentState
         *            the recognizer the type pattern starts from;
         *            {@link #typePatternInState(EGenericParserStates...)} passes the
         *            recognizer obtained for the given states.
         */
        protected abstract RecognizerBase<EGenericParserStates> typePattern(
                        RecognizerBase<EGenericParserStates> currentState);
308
309        /**
310         * Creates a recognizer that matches all valid types, starting from the given
311         * state.
312         */
313        protected RecognizerBase<EGenericParserStates> typePatternInState(EGenericParserStates... states) {
314                return typePattern(inState(states));
315        }
316
317        /**
318         * Creates a rule for recognizing a statement starting with a single keyword,
319         * optionally followed by an expression in parentheses, and followed by a block
320         * or a single statement.
321         * 
322         * @param continuationTokens
323         *            list of tokens that indicate a continued statement if encountered
324         *            after the block. May be null.
325         */
326        protected void createBlockRuleWithContinuation(EnumSet<ETokenType> startTokens,
327                        EnumSet<ETokenType> continuationTokens, boolean canBeFollowedByParentheses, boolean alwaysBraces) {
328                RecognizerBase<EGenericParserStates> alternative = inState(IN_METHOD).sequence(startTokens)
329                                // The `var` keyword can be only used for variable declarations and not for
330                                // statement declarations.
331                                .notPreCondition(createRecognizer(start -> start.sequence(ETokenType.VAR)))
332                                .createNode(EShallowEntityType.STATEMENT, 0);
333                if (canBeFollowedByParentheses) {
334                        alternative = alternative.skipNested(LPAREN, RPAREN, getSubExpressionRecognizer());
335                }
336
337                if (alwaysBraces) {
338                        alternative = alternative.skipBefore(LBRACE);
339                }
340
341                endWithPossibleContinuation(alternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE),
342                                continuationTokens);
343                endWithPossibleContinuation(alternative.parseOnce(IN_METHOD), continuationTokens);
344        }
345
        /**
         * Completes a recognizer for a simple statement, using
         * {@link SubTypeNames#SIMPLE_STATEMENT} as the subtype of the created node.
         */
        protected void completeSimpleStatement(RecognizerBase<EGenericParserStates> baseRecognizer) {
                completeSimpleStatement(baseRecognizer, SubTypeNames.SIMPLE_STATEMENT);
        }
350
351        /** Completes a recognizer for a simple statement. */
352        protected void completeSimpleStatement(RecognizerBase<EGenericParserStates> baseRecognizer, String subtype) {
353                RecognizerBase<EGenericParserStates> alternative = baseRecognizer
354                                .createNode(EShallowEntityType.STATEMENT, subtype, 0).skipBeforeWithNesting(
355                                                EnumSet.of(SEMICOLON, RBRACE), LBRACE, RBRACE, LPAREN, RPAREN, getSubExpressionRecognizer());
356
357                alternative.sequence(SEMICOLON).endNode();
358
359                // this (empty) alternative captures the case where a statement is not
360                // closed by a semicolon, so we deliberately leave it open. While in
361                // most languages this is an error (and then this rule helps us to
362                // continue parsing), in C++ you can construct valid statements without
363                // semicolon using macros (although it is discouraged).
364                alternative.sequence();
365        }
366
367        /** Creates rules for parsing lambdas with arrows like in Java or C#. */
368        protected void createLambdaWithArrowRules(ETokenType arrowType) {
369                // lambda expressions
370                completeLambda(inState(IN_EXPRESSION).sequence(getValidIdentifiers()), arrowType);
371                completeLambda(inState(IN_EXPRESSION).sequence(LPAREN).skipTo(RPAREN), arrowType);
372
373                // additional rule for parsing lambda expressions (without braces). see
374                // completeLambda() for details
375                // the node start is moved one token to the right so the shallow
376                // entities produced by this rule don't include the arrow (instead it
377                // will be included in the parent entity)
378                inState(IN_EXPRESSION).sequence(arrowType)
379                                .createNode(EShallowEntityType.STATEMENT, SubTypeNames.LAMBDA_EXPRESSION, null, 1)
380                                .skipBeforeWithNesting(EnumSet.of(RPAREN, SEMICOLON, RBRACE, COMMA), LPAREN, RPAREN, LBRACE, RBRACE)
381                                .endNode();
382        }
383
384        /** Completes a rule for parsing lambda expressions. */
385        private static void completeLambda(RecognizerBase<EGenericParserStates> ruleStart, ETokenType arrowType) {
386                RecognizerBase<EGenericParserStates> lambdaAlternative = ruleStart.createNode(EShallowEntityType.METHOD,
387                                SubTypeNames.LAMBDA);
388                lambdaAlternative.sequence(arrowType, LBRACE).parseUntil(IN_METHOD).sequence(RBRACE).endNode();
389
390                // we start parsing before the arrow as this allows our special
391                // statement rule to capture this case. This is required, as this kind
392                // of expression is not terminated by a semicolon.
393                lambdaAlternative.sequenceBefore(arrowType).parseOnce(IN_EXPRESSION).endNode();
394        }
395
396        /**
397         * Skips over the optional parameters and returns the new offset or
398         * {@link RecognizerBase#NO_MATCH} if it is malformed.
399         */
400        public static int skipOptionalParameters(List<IToken> tokens, int currentOffset) {
401                if (!TokenStreamUtils.hasTokenTypeSequence(tokens, currentOffset, LPAREN)) {
402                        return currentOffset;
403                }
404
405                int closingParenthesis = TokenStreamUtils.findMatchingClosingToken(tokens, currentOffset + 1, LPAREN, RPAREN);
406                if (closingParenthesis == TokenStreamUtils.NOT_FOUND) {
407                        return RecognizerBase.NO_MATCH;
408                }
409                return closingParenthesis + 1;
410        }
411}