001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 The ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package eu.cqse.check.framework.scanner; 018 019import static eu.cqse.check.framework.scanner.ELanguage.ABAP; 020import static eu.cqse.check.framework.scanner.ELanguage.ABAP_DDIC; 021import static eu.cqse.check.framework.scanner.ELanguage.ADA; 022import static eu.cqse.check.framework.scanner.ELanguage.COBOL; 023import static eu.cqse.check.framework.scanner.ELanguage.CPP; 024import static eu.cqse.check.framework.scanner.ELanguage.CS; 025import static eu.cqse.check.framework.scanner.ELanguage.DELPHI; 026import static eu.cqse.check.framework.scanner.ELanguage.FORTRAN; 027import static eu.cqse.check.framework.scanner.ELanguage.GO; 028import static eu.cqse.check.framework.scanner.ELanguage.GOSU; 029import static eu.cqse.check.framework.scanner.ELanguage.GROOVY; 030import static eu.cqse.check.framework.scanner.ELanguage.HANA_VIEW; 031import static eu.cqse.check.framework.scanner.ELanguage.IEC61131; 032import static eu.cqse.check.framework.scanner.ELanguage.JAVA; 033import static eu.cqse.check.framework.scanner.ELanguage.JAVADOC; 034import static eu.cqse.check.framework.scanner.ELanguage.JAVASCRIPT; 035import static eu.cqse.check.framework.scanner.ELanguage.JPL; 036import static eu.cqse.check.framework.scanner.ELanguage.KOTLIN; 037import static eu.cqse.check.framework.scanner.ELanguage.LINE; 038import static eu.cqse.check.framework.scanner.ELanguage.MATLAB; 039import static eu.cqse.check.framework.scanner.ELanguage.MTEXT; 040import static eu.cqse.check.framework.scanner.ELanguage.NL_REQUIREMENTS; 041import static eu.cqse.check.framework.scanner.ELanguage.NL_TESTS; 042import static eu.cqse.check.framework.scanner.ELanguage.OBJECTIVE_C; 043import static eu.cqse.check.framework.scanner.ELanguage.OCAML; 044import static eu.cqse.check.framework.scanner.ELanguage.OPEN_CL; 045import static eu.cqse.check.framework.scanner.ELanguage.OSCRIPT; 046import static eu.cqse.check.framework.scanner.ELanguage.PHP; 047import static eu.cqse.check.framework.scanner.ELanguage.PL1; 048import static eu.cqse.check.framework.scanner.ELanguage.PLSQL; 049import static eu.cqse.check.framework.scanner.ELanguage.PYTHON; 050import static eu.cqse.check.framework.scanner.ELanguage.RUBY; 051import static eu.cqse.check.framework.scanner.ELanguage.RUST; 052import static eu.cqse.check.framework.scanner.ELanguage.SIMULINK; 053import static eu.cqse.check.framework.scanner.ELanguage.SQLSCRIPT; 054import static eu.cqse.check.framework.scanner.ELanguage.SWIFT; 055import static eu.cqse.check.framework.scanner.ELanguage.TEXT; 056import static eu.cqse.check.framework.scanner.ELanguage.TSQL; 057import static eu.cqse.check.framework.scanner.ELanguage.VB; 058import static eu.cqse.check.framework.scanner.ELanguage.XML; 059import static eu.cqse.check.framework.scanner.ELanguage.XTEND; 060import static eu.cqse.check.framework.scanner.ELanguageConstants.CLIKE_COMMENT_REGEX; 061import static eu.cqse.check.framework.scanner.ETokenType.COLON; 062import static eu.cqse.check.framework.scanner.ETokenType.DOT; 063import static eu.cqse.check.framework.scanner.ETokenType.EOF; 064import static eu.cqse.check.framework.scanner.ETokenType.EOL; 065import static eu.cqse.check.framework.scanner.ETokenType.EXCLAMATION; 066import static eu.cqse.check.framework.scanner.ETokenType.LBRACE; 067import static eu.cqse.check.framework.scanner.ETokenType.LEFT_ANGLE_BRACKET; 068import static eu.cqse.check.framework.scanner.ETokenType.MULTIPLE_EOL; 069import static eu.cqse.check.framework.scanner.ETokenType.QUESTION; 070import static eu.cqse.check.framework.scanner.ETokenType.RBRACE; 071import static eu.cqse.check.framework.scanner.ETokenType.RIGHT_ANGLE_BRACKET; 072import static eu.cqse.check.framework.scanner.ETokenType.SEMICOLON; 073import static eu.cqse.check.framework.scanner.ETokenType.SLASH; 074import static eu.cqse.check.framework.scanner.ETokenType.THEN; 075import static org.conqat.lib.commons.collections.CollectionUtils.asHashSet; 076 077import java.util.HashMap; 078import java.util.Map; 079import java.util.Set; 080import java.util.regex.Pattern; 081 082import org.conqat.lib.commons.string.StringUtils; 083 084/** 085 * Class which provides parser relevant properties for the languages supported 086 * by the scanner framework. These are intentionally stored separately from 087 * {@link eu.cqse.check.framework.scanner.ELanguage} as ELanguage does not have 088 * access to the language's token classes. 089 */ 090public class LanguageProperties { 091 092 /** This maps from extensions to languages. */ 093 private static final Map<ELanguage, LanguageProperties> LANGUAGE_PROPERTIES_MAPPING = new HashMap<>(); 094 095 static { 096 register(JAVA, JavaToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 097 register(CPP, CPPToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 098 register(OPEN_CL, OpenCLToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 099 register(RUST, RustToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 100 register(VB, VBToken::new, asHashSet(COLON, EOL), "^ *'", false); 101 register(PL1, PL1Token::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, false); 102 register(COBOL, CobolToken::new, new CobolStatementOracle(), "^ *[*/]+", false); 103 register(CS, CSToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 104 register(ABAP, ABAPToken::new, asHashSet(DOT), "^ *[*\"]+", false); 105 register(ABAP_DDIC, TextToken::new, asHashSet(EOL), "", false); 106 register(ADA, AdaToken::new, asHashSet(SEMICOLON, THEN), "^ *--+", false); 107 register(TEXT, TextToken::new, asHashSet(DOT, QUESTION, EXCLAMATION, COLON, MULTIPLE_EOL), "", false); 108 register(XML, XMLToken::new, asHashSet(LEFT_ANGLE_BRACKET, SLASH, RIGHT_ANGLE_BRACKET), "(^ *<!--+)|(--+>$)", 109 false); 110 register(SQLSCRIPT, HanaSQLScriptToken::new, asHashSet(SEMICOLON), "(^ *(--+|/[*]+))|([*]+/ *$)", false); 111 register(HANA_VIEW, XMLToken::new, asHashSet(LEFT_ANGLE_BRACKET, SLASH, RIGHT_ANGLE_BRACKET), 112 "(^ *<!--+)|(--+>$)", false); 113 register(PLSQL, PLSQLToken::new, asHashSet(SEMICOLON), "(^ *(--+|/[*]+))|([*]+/ *$)", false); 114 register(PYTHON, PythonToken::new, asHashSet(EOL, EOF), "^ *#+", false); 115 register(TSQL, TSQLToken::new, asHashSet(EOL), "(^ *(--+|/[*]+))|([*]+/ *$)", false); 116 register(MATLAB, MatlabToken::new, asHashSet(EOL, SEMICOLON), "^ *%+", false); 117 register(PHP, PHPToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, false); 118 register(RUBY, RubyToken::new, asHashSet(EOL), "^ *#+", false); 119 register(JAVASCRIPT, JavaScriptToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 120 register(MTEXT, MTextToken::new, asHashSet(EOL), "(?i)^ *[.]DSC", false); 121 register(JPL, JPLToken::new, asHashSet(EOL), CLIKE_COMMENT_REGEX, false); 122 register(LINE, LineToken::new, asHashSet(EOL), "", false); 123 register(DELPHI, DelphiToken::new, asHashSet(SEMICOLON), "(^ *([(][*]+|[{]|//+)|([*]+[)]|[}])$)", false); 124 register(IEC61131, Iec61131Token::new, asHashSet(SEMICOLON, ETokenType.ELEMENT), 125 "(^ *([(][*]+|//+)|([*]+[)])$)", false); 126 register(FORTRAN, FortranToken::new, asHashSet(EOL), "^ *!+", false); 127 register(XTEND, XtendToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE, EOL), CLIKE_COMMENT_REGEX, true); 128 register(SWIFT, SwiftToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE, EOL), CLIKE_COMMENT_REGEX, true); 129 register(OCAML, OCamlToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), "", false); 130 register(OSCRIPT, OScriptToken::new, asHashSet(EOL, EOF), CLIKE_COMMENT_REGEX, false); 131 register(GROOVY, GroovyToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 132 register(NL_REQUIREMENTS, TextToken::new, asHashSet(DOT, QUESTION, EXCLAMATION, COLON, MULTIPLE_EOL), "", 133 false); 134 register(NL_TESTS, TextToken::new, asHashSet(DOT, QUESTION, EXCLAMATION, COLON, MULTIPLE_EOL), "", false); 135 register(SIMULINK, TextToken::new, asHashSet(EOL), "", false); 136 register(GOSU, GosuToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE), CLIKE_COMMENT_REGEX, true); 137 register(KOTLIN, KotlinToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE, EOL, EOF), CLIKE_COMMENT_REGEX, true); 138 register(OBJECTIVE_C, ObjectiveCToken::new, asHashSet(SEMICOLON, RBRACE, LBRACE, EOL, EOF), CLIKE_COMMENT_REGEX, 139 true); 140 register(JAVADOC, JavaDocToken::new, asHashSet(), "", false); 141 register(GO, GoToken::new, asHashSet(RBRACE, LBRACE, EOL, EOF), "", false); 142 } 143 144 private static void register(ELanguage language, ITokenFactory tokenFactory, Set<ETokenType> statementDelimiters, 145 String commentLineTrimRegex, boolean cLike) { 146 LANGUAGE_PROPERTIES_MAPPING.put(language, new LanguageProperties(tokenFactory, 147 new StatementOracle(statementDelimiters), commentLineTrimRegex, cLike)); 148 } 149 150 private static void register(ELanguage language, ITokenFactory tokenFactory, IStatementOracle oracle, 151 String commentLineTrimRegex, boolean cLike) { 152 LANGUAGE_PROPERTIES_MAPPING.put(language, 153 new LanguageProperties(tokenFactory, oracle, commentLineTrimRegex, cLike)); 154 } 155 156 /** 157 * @return The {@link LanguageProperties} associated with the given language. 158 */ 159 public static LanguageProperties of(ELanguage language) { 160 return LANGUAGE_PROPERTIES_MAPPING.get(language); 161 } 162 163 /** The statement oracle for this language. */ 164 private final IStatementOracle statementOracle; 165 166 /** 167 * Pattern describing the parts of a comment line that should be trimmed to 168 * reveal the text. 169 */ 170 private final Pattern commentLineTrimPattern; 171 172 /** Whether the language is C-like, i.e. based on curly braces, etc. */ 173 private final boolean cLike; 174 175 private final ITokenFactory tokenFactory; 176 177 /** Create language properties. */ 178 private LanguageProperties(ITokenFactory tokenFactory, IStatementOracle oracle, String commentLineTrimRegex, 179 boolean cLike) { 180 this.statementOracle = oracle; 181 this.commentLineTrimPattern = Pattern.compile(commentLineTrimRegex); 182 this.cLike = cLike; 183 this.tokenFactory = tokenFactory; 184 } 185 186 /** Get statement oracle for this language. */ 187 public IStatementOracle getStatementOracle() { 188 return statementOracle; 189 } 190 191 /** 192 * Get the content of a comment, i.e. with the comment delimiters removed. 193 */ 194 public String getCommentContent(String commentText) { 195 StringBuilder content = new StringBuilder(); 196 for (String line : StringUtils.splitLinesAsList(commentText)) { 197 if (content.length() > 0) { 198 content.append(StringUtils.LINE_SEPARATOR); 199 } 200 content.append(commentLineTrimPattern.matcher(line).replaceAll(StringUtils.EMPTY_STRING).trim()); 201 } 202 return content.toString(); 203 } 204 205 /** @see #cLike */ 206 public boolean isCLike() { 207 return cLike; 208 } 209 210 /** Creates a new token for the language. */ 211 public IToken createToken(ETokenType type, int offset, int lineNumber, String text, String originId) { 212 return tokenFactory.create(type, offset, lineNumber, text, originId); 213 } 214 215 /** Factory interface for token creation. */ 216 @FunctionalInterface 217 private interface ITokenFactory { 218 219 /** Creates a token. */ 220 IToken create(ETokenType type, int offset, int lineNumber, String text, String originId); 221 } 222}