001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright (c) 2009-2017 CQSE GmbH                                        |
004|                                                                          |
005+-------------------------------------------------------------------------*/
006package eu.cqse.check.framework.util.python;
007
008import eu.cqse.check.framework.scanner.ETokenType;
009import eu.cqse.check.framework.scanner.IToken;
010import eu.cqse.check.framework.shallowparser.SubTypeNames;
011import eu.cqse.check.framework.shallowparser.TokenStreamUtils;
012import eu.cqse.check.framework.shallowparser.framework.ShallowEntity;
013import org.conqat.lib.commons.collections.CollectionUtils;
014
015import java.util.ArrayList;
016import java.util.Arrays;
017import java.util.Collections;
018import java.util.EnumSet;
019import java.util.List;
020
021import static eu.cqse.check.framework.scanner.ETokenType.AS;
022import static eu.cqse.check.framework.scanner.ETokenType.COLON;
023import static eu.cqse.check.framework.scanner.ETokenType.COMMA;
024import static eu.cqse.check.framework.scanner.ETokenType.DOT;
025import static eu.cqse.check.framework.scanner.ETokenType.EQ;
026import static eu.cqse.check.framework.scanner.ETokenType.FOR;
027import static eu.cqse.check.framework.scanner.ETokenType.IDENTIFIER;
028import static eu.cqse.check.framework.scanner.ETokenType.IN;
029import static eu.cqse.check.framework.scanner.ETokenType.LAMBDA;
030import static eu.cqse.check.framework.scanner.ETokenType.LBRACE;
031import static eu.cqse.check.framework.scanner.ETokenType.LBRACK;
032import static eu.cqse.check.framework.scanner.ETokenType.LPAREN;
033import static eu.cqse.check.framework.scanner.ETokenType.MULT;
034import static eu.cqse.check.framework.scanner.ETokenType.POWER;
035import static eu.cqse.check.framework.scanner.ETokenType.RBRACE;
036import static eu.cqse.check.framework.scanner.ETokenType.RBRACK;
037import static eu.cqse.check.framework.scanner.ETokenType.RPAREN;
038
039/**
040 * A class that extracts declared variable names from shallow entities in
041 * python. Currently extraction is implemented for.
042 *
043 * <ul>
044 * <li>expect statements</li>
045 * <li>for statements</li>
046 * <li>with statements</li>
047 * <li>method parameters</li>
048 * <li>lambda parameters</li>
049 * <li>variable and attribute declarations (with destructuring)</li>
050 * </ul>
051 * <p>
052 * For loops within list comprehensions are currently not supported.
053 */
054public class PythonVariableNameExtractor {
055
056        /**
057         * Marker tokens in method argument lists, as of <a href=
058         * "https://docs.python.org/dev/tutorial/controlflow.html#function-examples">Python
059         * Documentation</a>
060         */
061        private static final EnumSet<ETokenType> MARKER_TOKENS = EnumSet.of(MULT, ETokenType.SLASH);
062
063        /**
064         * Extracts declared variable names from the given entity.
065         */
066        public List<IToken> extractVariableNames(ShallowEntity entity) {
067                switch (entity.getType()) {
068                case ATTRIBUTE:
069                        return extractFromDeclarationStatement(entity.ownStartTokens());
070                case STATEMENT:
071                        return extractFromStatement(entity);
072                case METHOD:
073                        return extractFromMethodOrLambda(entity);
074                default:
075                        return CollectionUtils.emptyList();
076                }
077        }
078
079        /**
080         * Extracts declared variable names from the given statement.
081         */
082        private static List<IToken> extractFromStatement(ShallowEntity entity) {
083                switch (entity.getSubtype()) {
084                case SubTypeNames.SIMPLE_STATEMENT:
085                        return extractFromDeclarationStatement(entity.ownStartTokens());
086                case SubTypeNames.FOR:
087                        return extractFromForStatement(entity.ownStartTokens());
088                case SubTypeNames.WITH:
089                case SubTypeNames.EXCEPT:
090                        return extractFromStatementWithAs(entity.ownStartTokens());
091                default:
092                        return CollectionUtils.emptyList();
093                }
094        }
095
096        /**
097         * Extracts declared variable names from a declaration statement.
098         */
099        private static List<IToken> extractFromDeclarationStatement(List<IToken> tokens) {
100                int index = TokenStreamUtils.findFirstTopLevel(tokens,
101                                // EQ for regular assignment, COLON for variable annotation
102                                EnumSet.of(EQ, COLON),
103                                Collections.singletonList(LPAREN),
104                                Collections.singletonList(RPAREN));
105                if (index == TokenStreamUtils.NOT_FOUND) {
106                        // In this case there is no variable declaration
107                        return CollectionUtils.emptyList();
108                }
109
110                List<IToken> leftHandSideTokens = tokens.subList(0, index);
111
112                // If the left-hand-side contains any commas, it must be a tuple assignment
113                // (possible with or without parentheses/brackets)
114                // e.g. a,b = 1,2 ; [a, b] = [1, 2] ; (1,2) = (1,2)
115                if (TokenStreamUtils.containsAny(leftHandSideTokens, COMMA)) {
116                        return extractIdentifiersFromTupleAssignment(leftHandSideTokens);
117                }
118
119                return extractIdentifiers(leftHandSideTokens);
120        }
121
122        /**
123         * Extract the identifiers from a tuple assignment which are actually used to declare a new variable.
124         * @param tokens list of tokens from the left-hand-side of a tuple assignment
125         *               (possible nested with parentheses or brackets)
126         * @return list of tokens which are identifiers that are actually used as the declaration of a new variable
127         */
128        private static List<IToken> extractIdentifiersFromTupleAssignment(List<IToken> tokens) {
129                // Remove parentheses/brackets at beginning and end, then recursive call with remaining tokens
130                if (TokenStreamUtils.startsWith(tokens, LPAREN)) {
131                        tokens = TokenStreamUtils.removeAtFront(tokens, LPAREN);
132                        tokens = TokenStreamUtils.removeAtEnd(tokens, RPAREN);
133                        return extractIdentifiersFromTupleAssignment(tokens);
134                } else if (TokenStreamUtils.startsWith(tokens, LBRACK)) {
135                        tokens = TokenStreamUtils.removeAtFront(tokens, LBRACK);
136                        tokens = TokenStreamUtils.removeAtEnd(tokens, RBRACK);
137                        return extractIdentifiersFromTupleAssignment(tokens);
138                }
139
140                // Plain tuple assignment (possibly still nested), e.g.
141                // a, (b, c) = 1, (2, 3)
142                if (TokenStreamUtils.contains(tokens, COMMA)) {
143                        List<List<IToken>> splitTokenLists = TokenStreamUtils.split(tokens, COMMA);
144                        List<List<IToken>> plainTokenLists = new ArrayList<>();
145                        for (List<IToken> tokenList : splitTokenLists) {
146                                plainTokenLists.add(extractIdentifiersFromTupleAssignment(tokenList));
147                        }
148                        List<IToken> identifiers = new ArrayList<>();
149                        for (List<IToken> plainTokenList : plainTokenLists) {
150                                identifiers.addAll(extractIdentifiers(plainTokenList));
151                        }
152                        return identifiers;
153                }
154
155                // No more commas left
156                // -> we have reached the plain identifiers
157                //    or access of attributes or array/dictionary entries, i.e. a.str or a[int("1")]
158                return extractIdentifiers(tokens);
159        }
160
161        /**
162         * Extracts declared variable names from a for statement.
163         */
164        private static List<IToken> extractFromForStatement(List<IToken> tokens) {
165                return CollectionUtils.filter(TokenStreamUtils.tokensBetween(tokens, FOR, IN),
166                                token -> token.getType() == IDENTIFIER);
167        }
168
169        /**
170         * Extracts declared variables names from a statement that introduces variables
171         * with as. This includes with and except statements.
172         */
173        private static List<IToken> extractFromStatementWithAs(List<IToken> tokens) {
174                int index = TokenStreamUtils.firstTokenOfTypeSequence(tokens, 2, AS, IDENTIFIER);
175                if (index == TokenStreamUtils.NOT_FOUND) {
176                        return CollectionUtils.emptyList();
177                }
178                return tokens.subList(index + 1, index + 2);
179        }
180
181        /**
182         * Extracts parameter names from a method or lambda.
183         */
184        public List<IToken> extractFromMethodOrLambda(ShallowEntity methodOrLambda) {
185                if (SubTypeNames.LAMBDA.equals(methodOrLambda.getSubtype())) {
186                        return extractFromLambda(methodOrLambda);
187                }
188                return extractFromMethod(methodOrLambda);
189        }
190
191        /**
192         * Extracts parameter names from a lambda.
193         */
194        private static List<IToken> extractFromLambda(ShallowEntity lambda) {
195                return extractIdentifiers(TokenStreamUtils.tokensBetween(lambda.ownStartTokens(), LAMBDA, COLON));
196        }
197
198        /**
199         * Extracts parameter names from a method.
200         */
201        private static List<IToken> extractFromMethod(ShallowEntity method) {
202                List<List<IToken>> splitParameterTokens = getSplitParameterTokens(method);
203                return CollectionUtils.filterAndMap(splitParameterTokens,
204                                tokens -> !tokens.isEmpty() && !(tokens.size() == 1 && MARKER_TOKENS.contains(tokens.get(0).getType())),
205                                tokens -> {
206                                        IToken idToken = tokens.get(0);
207                                        if (idToken.getType() == MULT || idToken.getType() == POWER) {
208                                                return tokens.get(1);
209                                        }
210                                        return idToken;
211                                });
212        }
213
214        /**
215         * Returns the split parameter tokens from the given method. Returns a list of
216         * token list where each inner list represents all tokens of one parameter.
217         */
218        private static List<List<IToken>> getSplitParameterTokens(ShallowEntity method) {
219                List<IToken> parameterTokens = TokenStreamUtils.tokensBetweenWithNesting(method.includedTokens(), 2, LPAREN,
220                                RPAREN);
221                if (parameterTokens.isEmpty()) {
222                        return CollectionUtils.emptyList();
223                }
224
225                List<ETokenType> openingTypes = new ArrayList<>(Arrays.asList(LPAREN, LBRACK, LBRACE, LAMBDA));
226                List<ETokenType> closingTypes = new ArrayList<>(Arrays.asList(RPAREN, RBRACK, RBRACE, COLON));
227
228                // we need to handle nesting between lambdas and colon to handle lambda
229                // default parameter properly
230                return TokenStreamUtils.splitWithNesting(parameterTokens, COMMA, openingTypes, closingTypes);
231
232        }
233
234        /**
235         * Extracts all identifiers from the given tokens.
236         */
237        private static List<IToken> extractIdentifiers(List<IToken> tokens) {
238                // If the remaining token list contains
239                // - a bracket, it is an array or dictionary access
240                // - a parenthesis, it is a function call
241                // - a dot, it is an attribute access
242                // None of these declare a new variable.
243                if (TokenStreamUtils.containsAny(tokens, EnumSet.of(LBRACK, LPAREN, DOT))) {
244                        return CollectionUtils.emptyList();
245                }
246
247                return CollectionUtils.filter(tokens, token -> token.getType() == IDENTIFIER);
248        }
249
250}