001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 the ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package eu.cqse.check.framework.shallowparser.languages.base; 018 019import static eu.cqse.check.framework.scanner.ETokenType.AND; 020import static eu.cqse.check.framework.scanner.ETokenType.ANDAND; 021import static eu.cqse.check.framework.scanner.ETokenType.ANDEQ; 022import static eu.cqse.check.framework.scanner.ETokenType.CASE; 023import static eu.cqse.check.framework.scanner.ETokenType.CATCH; 024import static eu.cqse.check.framework.scanner.ETokenType.COLON; 025import static eu.cqse.check.framework.scanner.ETokenType.COMMA; 026import static eu.cqse.check.framework.scanner.ETokenType.COMP; 027import static eu.cqse.check.framework.scanner.ETokenType.CONST; 028import static eu.cqse.check.framework.scanner.ETokenType.DEFAULT; 029import static eu.cqse.check.framework.scanner.ETokenType.DO; 030import static eu.cqse.check.framework.scanner.ETokenType.ELSE; 031import static eu.cqse.check.framework.scanner.ETokenType.ENUM; 032import static eu.cqse.check.framework.scanner.ETokenType.EQ; 033import static eu.cqse.check.framework.scanner.ETokenType.FINALLY; 034import static eu.cqse.check.framework.scanner.ETokenType.IDENTIFIER; 035import static eu.cqse.check.framework.scanner.ETokenType.IF; 036import static eu.cqse.check.framework.scanner.ETokenType.LBRACE; 037import static eu.cqse.check.framework.scanner.ETokenType.LBRACK; 038import static eu.cqse.check.framework.scanner.ETokenType.LITERALS; 039import static eu.cqse.check.framework.scanner.ETokenType.LPAREN; 040import static eu.cqse.check.framework.scanner.ETokenType.MINUS; 041import static eu.cqse.check.framework.scanner.ETokenType.MULT; 042import static eu.cqse.check.framework.scanner.ETokenType.NOT; 043import static eu.cqse.check.framework.scanner.ETokenType.NOTEQ; 044import static eu.cqse.check.framework.scanner.ETokenType.OR; 045import static eu.cqse.check.framework.scanner.ETokenType.OREQ; 046import static eu.cqse.check.framework.scanner.ETokenType.OROR; 047import static eu.cqse.check.framework.scanner.ETokenType.PLUS; 048import static eu.cqse.check.framework.scanner.ETokenType.RBRACE; 049import static eu.cqse.check.framework.scanner.ETokenType.RBRACK; 050import static eu.cqse.check.framework.scanner.ETokenType.RPAREN; 051import static eu.cqse.check.framework.scanner.ETokenType.SEMICOLON; 052import static eu.cqse.check.framework.scanner.ETokenType.TRY; 053import static eu.cqse.check.framework.scanner.ETokenType.TYPEDEF; 054import static eu.cqse.check.framework.scanner.ETokenType.UNION; 055import static eu.cqse.check.framework.scanner.ETokenType.USING; 056import static eu.cqse.check.framework.scanner.ETokenType.VAR; 057import static eu.cqse.check.framework.scanner.ETokenType.WHILE; 058import static eu.cqse.check.framework.scanner.ETokenType.XOR; 059import static eu.cqse.check.framework.scanner.ETokenType.XOREQ; 060import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_EXPRESSION; 061import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_METHOD; 062import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_TYPE; 063import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.TOP_LEVEL; 064 065import java.util.Arrays; 066import java.util.EnumSet; 067import java.util.HashSet; 068import java.util.List; 069 070import org.conqat.lib.commons.region.Region; 071 072import eu.cqse.check.framework.scanner.ETokenType; 073import eu.cqse.check.framework.scanner.IToken; 074import eu.cqse.check.framework.shallowparser.SubTypeNames; 075import eu.cqse.check.framework.shallowparser.TokenStreamUtils; 076import eu.cqse.check.framework.shallowparser.framework.EShallowEntityType; 077import eu.cqse.check.framework.shallowparser.framework.RecognizerBase; 078import eu.cqse.check.framework.shallowparser.framework.ShallowParserBase; 079 080/** 081 * Base class for C-style languages (C++, Java, C#). 082 */ 083public abstract class CStyleShallowParserBase extends ShallowParserBase<EGenericParserStates> { 084 085 /** C++ operators that have an alternative (string) representations. */ 086 protected static final EnumSet<ETokenType> OPERATORS_WITH_ALTERNATIVE_REPRESENTATION = EnumSet.of(NOT, COMP, NOTEQ, 087 ANDAND, OROR, AND, OR, XOR, ANDEQ, OREQ, XOREQ); 088 089 /** Constructor. */ 090 protected CStyleShallowParserBase() { 091 super(EGenericParserStates.class, EGenericParserStates.TOP_LEVEL); 092 createMetaRules(); 093 createTypeRules(); 094 createClassElementsRules(); 095 createStatementRules(); 096 createSubExpressionRules(); 097 } 098 099 /** Creates rules for meta elements. */ 100 protected void createMetaRules() { 101 // deal with dangling closing braces by inserting broken node (endNode 102 // intentionally omitted) 103 inAnyState().sequence(RBRACE).createNode(EShallowEntityType.META, "dangling closing brace"); 104 } 105 106 /** Parser rules for module/namespace and type creation. */ 107 protected void createTypeRules() { 108 // types; we have to ensure when skipping to the LBRACE, that there is 109 // no earlier SEMICOLON 110 inAnyState().repeated(getTypeModifier()).markStart().sequence(getTypeKeywords(), getValidIdentifiers()) 111 .skipBefore(EnumSet.of(SEMICOLON, LBRACE)).sequence(LBRACE).createNode(EShallowEntityType.TYPE, 0, 1) 112 .parseUntil(IN_TYPE).sequence(RBRACE).endNode(); 113 } 114 115 /** Creates rules for C style typedef */ 116 protected void createTypedefRules() { 117 RecognizerBase<EGenericParserStates> typeInTypedefAlternative = inAnyState().sequence(TYPEDEF).optional(CONST); 118 typeInTypedefAlternative.sequenceBefore(getTypeKeywords(), IDENTIFIER, LBRACE) 119 .createNode(EShallowEntityType.TYPE, 0).parseOnce(TOP_LEVEL).skipTo(IDENTIFIER, SEMICOLON) 120 .endNodeWithName(-2); 121 typeInTypedefAlternative.sequenceBefore(getTypeKeywords(), LBRACE).createNode(EShallowEntityType.TYPE, 0) 122 .parseOnce(TOP_LEVEL).skipTo(IDENTIFIER, SEMICOLON).endNodeWithName(-2); 123 124 // skips to the name identifier (skips anything enclosed in <...>) 125 RecognizerBase<EGenericParserStates> simpleTypedefAlternative = inAnyState().sequence(TYPEDEF) 126 .createNode(EShallowEntityType.TYPE, 0) 127 .skipBeforeWithNesting(Arrays.asList(IDENTIFIER, EnumSet.of(SEMICOLON, RPAREN, LBRACK)), 128 Arrays.asList(ETokenType.LT), Arrays.asList(ETokenType.GT), null); 129 130 // array types such as "typedef foo <COMPLEX STUFF> name [COMPLEX STUFF];" 131 simpleTypedefAlternative.markStart().sequenceBefore(IDENTIFIER, EnumSet.of(SEMICOLON, LBRACK)) 132 .skipToWithNesting(SEMICOLON, LBRACK, RBRACK).endNodeWithName(0); 133 simpleTypedefAlternative.markStart().sequence(IDENTIFIER, RPAREN).skipTo(SEMICOLON).endNodeWithName(0); 134 } 135 136 /** 137 * Returns the valid type modifiers for the language. Default implementation 138 * returns empty set. Override to use correct modifiers. 139 */ 140 protected EnumSet<ETokenType> getTypeModifier() { 141 return EnumSet.noneOf(ETokenType.class); 142 } 143 144 /** Returns the set of keywords that start a type. */ 145 protected abstract EnumSet<ETokenType> getTypeKeywords(); 146 147 /** Parser rules for both attributes and methods. */ 148 protected abstract void createClassElementsRules(); 149 150 /** Creates parser rules for statements. */ 151 protected void createStatementRules() { 152 createEmptyStatementRule(); 153 createLabelRule(); 154 createElseIfRule(); 155 createBasicBlockRules(); 156 createCaseRule(); 157 createDoWhileRule(); 158 createGenericBlockRule(); 159 createSimpleStatementRule(); 160 } 161 162 /** The empty statement. */ 163 private void createEmptyStatementRule() { 164 inState(IN_METHOD).sequence(SEMICOLON).createNode(EShallowEntityType.STATEMENT, SubTypeNames.EMPTY_STATEMENT) 165 .endNode(); 166 } 167 168 /** Matches labels. */ 169 private void createLabelRule() { 170 // filter out labels as meta as they do not increase statement count 171 inState(IN_METHOD).sequence(getValidIdentifiers(), COLON) 172 .createNode(EShallowEntityType.META, SubTypeNames.LABEL, 0).endNode(); 173 } 174 175 /** Special rule for else-if. */ 176 protected void createElseIfRule() { 177 RecognizerBase<EGenericParserStates> elseIfAlternative = inState(IN_METHOD).sequence(ELSE, IF) 178 .createNode(EShallowEntityType.STATEMENT, new int[] { 0, 1 }) 179 .skipNested(LPAREN, RPAREN, getSubExpressionRecognizer()); 180 endWithPossibleContinuation(elseIfAlternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE), 181 EnumSet.of(ELSE)); 182 endWithPossibleContinuation(elseIfAlternative.parseOnce(IN_METHOD), EnumSet.of(ELSE)); 183 } 184 185 /** 186 * Block constructs, such as if/else, while/for/switch, try/catch/finally, 187 * synchronized (only in some languages). 188 */ 189 protected void createBasicBlockRules() { 190 createBlockRuleWithContinuation(getSimpleBlockKeywordsWithParentheses(), null, true, false); 191 createBlockRuleWithContinuation(getSimpleBlockKeywordsWithoutParentheses(), null, false, false); 192 createBlockRuleWithContinuation(EnumSet.of(IF), EnumSet.of(ELSE), true, false); 193 createBlockRuleWithContinuation(EnumSet.of(TRY, CATCH), EnumSet.of(CATCH, FINALLY), true, true); 194 } 195 196 /** 197 * Case statement is parsed as meta, as it is hardly a statement on its own. 198 */ 199 protected void createCaseRule() { 200 HashSet<ETokenType> literalsAndIdentifiers = new HashSet<>(LITERALS); 201 literalsAndIdentifiers.addAll(getValidIdentifiers()); 202 inState(IN_METHOD).sequence(CASE).optional(MINUS).sequence(literalsAndIdentifiers) 203 .optional(PLUS, literalsAndIdentifiers).sequence(COLON).createNode(EShallowEntityType.META, 0, 1) 204 .endNode(); 205 inState(IN_METHOD).sequence(CASE, LPAREN).skipToWithNesting(RPAREN, LPAREN, RPAREN).sequence(COLON) 206 .createNode(EShallowEntityType.META, 0, new Region(1, -2)).endNode(); 207 208 inState(IN_METHOD).sequence(DEFAULT, COLON).createNode(EShallowEntityType.META, 0).endNode(); 209 } 210 211 /** Creates the do-while rule. */ 212 protected void createDoWhileRule() { 213 RecognizerBase<EGenericParserStates> doWhileAlternative = inState(IN_METHOD).sequence(DO) 214 .createNode(EShallowEntityType.STATEMENT, 0); 215 doWhileAlternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE, WHILE) 216 .skipNested(LPAREN, RPAREN, getSubExpressionRecognizer()).optional(ETokenType.SEMICOLON).endNode(); 217 doWhileAlternative.parseOnce(IN_METHOD).sequence(WHILE).skipNested(LPAREN, RPAREN, getSubExpressionRecognizer()) 218 .optional(ETokenType.SEMICOLON).endNode(); 219 } 220 221 /** Generic block. */ 222 private void createGenericBlockRule() { 223 inState(IN_METHOD).sequence(LBRACE).createNode(EShallowEntityType.STATEMENT, SubTypeNames.ANONYMOUS_BLOCK) 224 .parseUntil(IN_METHOD).sequence(RBRACE).endNode(); 225 } 226 227 /** 228 * Contributes rules for detecting simple statements (local variable 229 * declarations, assignments, etc.) inside methods. 230 */ 231 protected void contributeSimpleStatementRules(EnumSet<ETokenType> localVariableIdentifiers, 232 EnumSet<ETokenType> statementStartTokens) { 233 234 // heuristic for detecting local variables 235 completeSimpleStatement( 236 typePatternInState(IN_METHOD).skipNested(LBRACE, RBRACE).markStart() 237 .sequenceBefore(localVariableIdentifiers, EnumSet.of(COMMA, EQ, SEMICOLON, LBRACK)), 238 SubTypeNames.LOCAL_VARIABLE); 239 240 // enum and union local variables 241 completeSimpleStatement(inState(IN_METHOD).sequence(EnumSet.of(ENUM, UNION)).skipNested(LBRACE, RBRACE) 242 .markStart().sequenceBefore(localVariableIdentifiers, SEMICOLON), SubTypeNames.LOCAL_VARIABLE); 243 244 // function pointer local variables 245 completeSimpleStatement(typePatternInState(IN_METHOD).repeated(LPAREN).sequence(MULT).markStart() 246 .sequenceBefore(localVariableIdentifiers, RPAREN), SubTypeNames.LOCAL_VARIABLE); 247 248 // C#8 using declaration 249 completeSimpleStatement(inState(IN_METHOD).sequence(USING).optional(VAR).markStart().sequence(IDENTIFIER)); 250 251 completeSimpleStatement(inState(IN_METHOD).sequence(LPAREN).markStart().sequence(IDENTIFIER)); 252 completeSimpleStatement(inState(IN_METHOD).sequence(statementStartTokens)); 253 completeSimpleStatement(typePatternInState(IN_METHOD)); 254 completeSimpleStatement(inState(IN_METHOD).sequence(LITERALS)); 255 } 256 257 /** Simple statement. */ 258 protected void createSimpleStatementRule() { 259 contributeSimpleStatementRules(getValidIdentifiers(), getStatementStartTokens()); 260 } 261 262 /** 263 * Creates rules for dealing with constructs in subexpressions, such as 264 * anonymous classes, lambdas, etc. 265 */ 266 protected void createSubExpressionRules() { 267 // default implementation does nothing 268 } 269 270 /** 271 * Returns a recognizer used for detecting sub expressions (anonymous classes, 272 * lambdas, etc.) within expressions. This may return null (which is done by the 273 * default implementation). 274 */ 275 protected RecognizerBase<EGenericParserStates> getSubExpressionRecognizer() { 276 return null; 277 } 278 279 /** 280 * Returns the set of all valid identifiers, i.e. token types that can be used 281 * to name elements in the language. 282 */ 283 protected EnumSet<ETokenType> getValidIdentifiers() { 284 return EnumSet.of(IDENTIFIER); 285 } 286 287 /** 288 * Returns the set of all keywords that start a simple block with optional 289 * parentheses (see implementers for examples). 290 */ 291 protected abstract EnumSet<ETokenType> getSimpleBlockKeywordsWithParentheses(); 292 293 /** 294 * Returns the set of all keywords that start a simple block but are never 295 * followed by parentheses (see implementers for examples). 296 */ 297 protected abstract EnumSet<ETokenType> getSimpleBlockKeywordsWithoutParentheses(); 298 299 /** 300 * Returns a set of all tokens that can start a statement, besides a type (see 301 * {@link #typePatternInState(EGenericParserStates...)} and a literal. 302 */ 303 protected abstract EnumSet<ETokenType> getStatementStartTokens(); 304 305 /** Creates a recognizer that matches all valid types. */ 306 protected abstract RecognizerBase<EGenericParserStates> typePattern( 307 RecognizerBase<EGenericParserStates> currentState); 308 309 /** 310 * Creates a recognizer that matches all valid types, starting from the given 311 * state. 312 */ 313 protected RecognizerBase<EGenericParserStates> typePatternInState(EGenericParserStates... states) { 314 return typePattern(inState(states)); 315 } 316 317 /** 318 * Creates a rule for recognizing a statement starting with a single keyword, 319 * optionally followed by an expression in parentheses, and followed by a block 320 * or a single statement. 321 * 322 * @param continuationTokens 323 * list of tokens that indicate a continued statement if encountered 324 * after the block. May be null. 325 */ 326 protected void createBlockRuleWithContinuation(EnumSet<ETokenType> startTokens, 327 EnumSet<ETokenType> continuationTokens, boolean canBeFollowedByParentheses, boolean alwaysBraces) { 328 RecognizerBase<EGenericParserStates> alternative = inState(IN_METHOD).sequence(startTokens) 329 // The `var` keyword can be only used for variable declarations and not for 330 // statement declarations. 331 .notPreCondition(createRecognizer(start -> start.sequence(ETokenType.VAR))) 332 .createNode(EShallowEntityType.STATEMENT, 0); 333 if (canBeFollowedByParentheses) { 334 alternative = alternative.skipNested(LPAREN, RPAREN, getSubExpressionRecognizer()); 335 } 336 337 if (alwaysBraces) { 338 alternative = alternative.skipBefore(LBRACE); 339 } 340 341 endWithPossibleContinuation(alternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE), 342 continuationTokens); 343 endWithPossibleContinuation(alternative.parseOnce(IN_METHOD), continuationTokens); 344 } 345 346 /** Completes a recognizer for a simple statement. */ 347 protected void completeSimpleStatement(RecognizerBase<EGenericParserStates> baseRecognizer) { 348 completeSimpleStatement(baseRecognizer, SubTypeNames.SIMPLE_STATEMENT); 349 } 350 351 /** Completes a recognizer for a simple statement. */ 352 protected void completeSimpleStatement(RecognizerBase<EGenericParserStates> baseRecognizer, String subtype) { 353 RecognizerBase<EGenericParserStates> alternative = baseRecognizer 354 .createNode(EShallowEntityType.STATEMENT, subtype, 0).skipBeforeWithNesting( 355 EnumSet.of(SEMICOLON, RBRACE), LBRACE, RBRACE, LPAREN, RPAREN, getSubExpressionRecognizer()); 356 357 alternative.sequence(SEMICOLON).endNode(); 358 359 // this (empty) alternative captures the case where a statement is not 360 // closed by a semicolon, so we deliberately leave it open. While in 361 // most languages this is an error (and then this rule helps us to 362 // continue parsing), in C++ you can construct valid statements without 363 // semicolon using macros (although it is discouraged). 364 alternative.sequence(); 365 } 366 367 /** Creates rules for parsing lambdas with arrows like in Java or C#. */ 368 protected void createLambdaWithArrowRules(ETokenType arrowType) { 369 // lambda expressions 370 completeLambda(inState(IN_EXPRESSION).sequence(getValidIdentifiers()), arrowType); 371 completeLambda(inState(IN_EXPRESSION).sequence(LPAREN).skipTo(RPAREN), arrowType); 372 373 // additional rule for parsing lambda expressions (without braces). see 374 // completeLambda() for details 375 // the node start is moved one token to the right so the shallow 376 // entities produced by this rule don't include the arrow (instead it 377 // will be included in the parent entity) 378 inState(IN_EXPRESSION).sequence(arrowType) 379 .createNode(EShallowEntityType.STATEMENT, SubTypeNames.LAMBDA_EXPRESSION, null, 1) 380 .skipBeforeWithNesting(EnumSet.of(RPAREN, SEMICOLON, RBRACE, COMMA), LPAREN, RPAREN, LBRACE, RBRACE) 381 .endNode(); 382 } 383 384 /** Completes a rule for parsing lambda expressions. */ 385 private static void completeLambda(RecognizerBase<EGenericParserStates> ruleStart, ETokenType arrowType) { 386 RecognizerBase<EGenericParserStates> lambdaAlternative = ruleStart.createNode(EShallowEntityType.METHOD, 387 SubTypeNames.LAMBDA); 388 lambdaAlternative.sequence(arrowType, LBRACE).parseUntil(IN_METHOD).sequence(RBRACE).endNode(); 389 390 // we start parsing before the arrow as this allows our special 391 // statement rule to capture this case. This is required, as this kind 392 // of expression is not terminated by a semicolon. 393 lambdaAlternative.sequenceBefore(arrowType).parseOnce(IN_EXPRESSION).endNode(); 394 } 395 396 /** 397 * Skips over the optional parameters and returns the new offset or 398 * {@link RecognizerBase#NO_MATCH} if it is malformed. 399 */ 400 public static int skipOptionalParameters(List<IToken> tokens, int currentOffset) { 401 if (!TokenStreamUtils.hasTokenTypeSequence(tokens, currentOffset, LPAREN)) { 402 return currentOffset; 403 } 404 405 int closingParenthesis = TokenStreamUtils.findMatchingClosingToken(tokens, currentOffset + 1, LPAREN, RPAREN); 406 if (closingParenthesis == TokenStreamUtils.NOT_FOUND) { 407 return RecognizerBase.NO_MATCH; 408 } 409 return closingParenthesis + 1; 410 } 411}