001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 The ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package org.conqat.lib.commons.xml; 018 019import java.io.ByteArrayInputStream; 020import java.io.File; 021import java.io.FileInputStream; 022import java.io.FileNotFoundException; 023import java.io.IOException; 024import java.io.PrintStream; 025import java.io.StringReader; 026import java.io.StringWriter; 027import java.io.UnsupportedEncodingException; 028import java.net.MalformedURLException; 029import java.net.URL; 030import java.util.ArrayList; 031import java.util.List; 032 033import javax.xml.parsers.DocumentBuilder; 034import javax.xml.parsers.DocumentBuilderFactory; 035import javax.xml.parsers.ParserConfigurationException; 036import javax.xml.parsers.SAXParser; 037import javax.xml.parsers.SAXParserFactory; 038import javax.xml.stream.XMLEventReader; 039import javax.xml.stream.XMLInputFactory; 040import javax.xml.stream.XMLStreamException; 041import javax.xml.transform.Transformer; 042import javax.xml.transform.TransformerException; 043import javax.xml.transform.TransformerFactory; 044import javax.xml.transform.dom.DOMSource; 045import javax.xml.transform.stream.StreamResult; 046import javax.xml.transform.stream.StreamSource; 047 048import org.conqat.lib.commons.assertion.CCSMAssert; 049import org.conqat.lib.commons.collections.CollectionUtils; 050import org.conqat.lib.commons.filesystem.FileSystemUtils; 051import org.conqat.lib.commons.string.StringUtils; 052import org.w3c.dom.Document; 053import org.w3c.dom.Element; 054import org.w3c.dom.Node; 055import org.w3c.dom.NodeList; 056import org.xml.sax.ErrorHandler; 057import org.xml.sax.InputSource; 058import org.xml.sax.SAXException; 059import org.xml.sax.SAXParseException; 060import org.xml.sax.helpers.DefaultHandler; 061 062/** 063 * Collection of utility methods for XML. 064 */ 065public class XMLUtils { 066 067 /** Identifier for schema source. */ 068 private static final String ATTRIBUTE_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource"; 069 070 /** Schema URL */ 071 private static final String SCHEMA_URL = "http://www.w3.org/2001/XMLSchema"; 072 073 /** Identifier for schema language. */ 074 private static final String ATTRIBUTE_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage"; 075 076 /** Creates a new {@link XMLWriter} that writes to the given output file. */ 077 public static <ElementsEnum extends Enum<ElementsEnum>, AttributesEnum extends Enum<AttributesEnum>> XMLWriter<ElementsEnum, AttributesEnum> createUtf8Writer( 078 File outputFile, Class<AttributesEnum> attributesClass) 079 throws FileNotFoundException, UnsupportedEncodingException { 080 return new XMLWriter<>(new PrintStream(outputFile, FileSystemUtils.UTF8_ENCODING), 081 new XMLResolver<ElementsEnum, AttributesEnum>(attributesClass)); 082 } 083 084 /** 085 * Parse a file without validation. 086 * 087 * @param file 088 * the file to parse. 089 * @return the DOM document. 090 * 091 * @throws SAXException 092 * if a parsing exception occurs, i.e. if the file is not 093 * well-formed. 094 * @throws IOException 095 * if an IO exception occurs. 096 */ 097 public static Document parse(File file) throws SAXException, IOException { 098 return createSchemaUnawareParser().parse(file); 099 } 100 101 /** 102 * Parse an input source without validation. 103 * 104 * @param input 105 * the input source to parse 106 * @return the DOM document. 107 * 108 * @throws SAXException 109 * if a parsing exception occurs, i.e. if the file is not 110 * well-formed. 111 * @throws IOException 112 * if an IO exception occurs. 113 */ 114 public static Document parse(InputSource input) throws SAXException, IOException { 115 116 return createSchemaUnawareParser().parse(input); 117 } 118 119 /** 120 * Parse an input source using SAX without validation. 121 * 122 * @throws SAXException 123 * if a parsing exception occurs, i.e. if the file is not 124 * well-formed. 125 * @throws IOException 126 * if an IO exception occurs. 127 */ 128 public static void parseSAX(File file, DefaultHandler handler) throws SAXException, IOException { 129 createSchemaUnawareSAXParser().parse(file, handler); 130 } 131 132 /** 133 * Parse an input source using SAX without validation. 134 * 135 * @throws SAXException 136 * if a parsing exception occurs, i.e. if the file is not 137 * well-formed. 138 * @throws IOException 139 * if an IO exception occurs. 140 */ 141 public static void parseSAX(InputSource input, DefaultHandler handler) throws SAXException, IOException { 142 createSchemaUnawareSAXParser().parse(input, handler); 143 } 144 145 /** 146 * Parse a string that contains XML without validation. 147 * 148 * @throws SAXException 149 * if a parsing exception occurs, i.e. if the file is not 150 * well-formed. 151 * @throws IOException 152 * if an IO exception occurs. 153 */ 154 public static void parseSAX(String content, DefaultHandler handler) throws SAXException, IOException { 155 parseSAX(new InputSource(new ByteArrayInputStream(StringUtils.stringToBytes(content))), handler); 156 } 157 158 /** 159 * Parse and validate file using schema. This implements a custom error handler 160 * to avoid different behaviour between the JAXP implementations shipping with 161 * Java 1.5 and Java 1.6. 162 * 163 * @param file 164 * the file to parse. 165 * @param schemaURL 166 * URL point to schema, may not be null 167 * @return the DOM document. 168 * 169 * @throws SAXException 170 * if a parsing exception occurs, i.e. if the file is not 171 * well-formed or not valid 172 * @throws IOException 173 * if an IO exception occurs. 174 */ 175 public static Document parse(File file, URL schemaURL) throws SAXException, IOException { 176 177 FileInputStream stream = new FileInputStream(file); 178 try { 179 return parse(new InputSource(stream), schemaURL); 180 } finally { 181 stream.close(); 182 } 183 } 184 185 /** 186 * Parse and validate file using schema. This implements a custom error handler 187 * to avoid different behaviour between the JAXP implementations shipping with 188 * Java 1.5 and Java 1.6. 189 * 190 * @param input 191 * the input to parse. 192 * @param schemaURL 193 * URL point to schema, may not be null 194 * @return the DOM document. 195 * 196 * @throws SAXException 197 * if a parsing exception occurs, i.e. if the file is not 198 * well-formed or not valid 199 * @throws IOException 200 * if an IO exception occurs. 201 */ 202 public static Document parse(InputSource input, URL schemaURL) throws SAXException, IOException { 203 204 CCSMAssert.isTrue(schemaURL != null, "Schema URL may not be null!"); 205 206 DocumentBuilder parser = createSchemaAwareParser(schemaURL); 207 208 XMLErrorHandler errorHandler = new XMLErrorHandler(); 209 parser.setErrorHandler(errorHandler); 210 Document document = parser.parse(input); 211 212 if (errorHandler.exception != null) { 213 throw errorHandler.exception; 214 } 215 216 return document; 217 } 218 219 /** 220 * Parse and validate file using SAX and schema. 221 * 222 * @param file 223 * the file to parse. 224 * @param schemaURL 225 * URL point to schema, may not be null 226 * 227 * @throws SAXException 228 * if a parsing exception occurs, i.e. if the file is not 229 * well-formed or not valid 230 * @throws IOException 231 * if an IO exception occurs. 232 */ 233 public static void parseSAX(File file, URL schemaURL, DefaultHandler handler) throws SAXException, IOException { 234 try (FileInputStream stream = new FileInputStream(file)) { 235 parseSAX(new InputSource(stream), schemaURL, handler); 236 } 237 } 238 239 /** 240 * Parse and validate file using SAX and schema. 241 * 242 * @param input 243 * the input to parse. 244 * @param schemaURL 245 * URL point to schema, may not be null 246 * 247 * @throws SAXException 248 * if a parsing exception occurs, i.e. if the file is not 249 * well-formed or not valid 250 * @throws IOException 251 * if an IO exception occurs. 252 */ 253 public static void parseSAX(InputSource input, URL schemaURL, DefaultHandler handler) 254 throws SAXException, IOException { 255 256 CCSMAssert.isTrue(schemaURL != null, "Schema URL may not be null!"); 257 createSchemaAwareSAXParser(schemaURL).parse(input, handler); 258 } 259 260 /** 261 * Creates a StAX parser. The parser can be queried for new parsing events via 262 * hasNext()/next(). It continues parsing only on next() calls. 263 * 264 * This parser should be used if performance is relevant and only part of the 265 * XML document is needed. 266 */ 267 public static XMLEventReader createStAXParser(InputSource input) throws XMLStreamException { 268 XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory(); 269 xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, true); 270 XMLEventReader reader = xmlInputFactory.createXMLEventReader(input.getByteStream(), input.getEncoding()); 271 return reader; 272 } 273 274 /** Creates a schema-unaware XML parser */ 275 private static DocumentBuilder createSchemaUnawareParser() { 276 277 try { 278 return createNamespaceAwareDocumentBuilderFactory().newDocumentBuilder(); 279 } catch (ParserConfigurationException e) { 280 throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e); 281 } 282 } 283 284 /** Creates a schema-unaware SAX parser */ 285 private static SAXParser createSchemaUnawareSAXParser() throws SAXException { 286 try { 287 return createNamespaceAwareSAXParserFactory().newSAXParser(); 288 } catch (ParserConfigurationException e) { 289 throw new IllegalStateException("No SAX parser found, probably Java is misconfigured!", e); 290 } 291 } 292 293 /** Creates a schema-aware XML parser */ 294 private static DocumentBuilder createSchemaAwareParser(URL schemaURL) { 295 DocumentBuilderFactory dbf = createNamespaceAwareDocumentBuilderFactory(); 296 dbf.setValidating(true); 297 dbf.setAttribute(ATTRIBUTE_SCHEMA_LANGUAGE, SCHEMA_URL); 298 dbf.setAttribute(ATTRIBUTE_SCHEMA_SOURCE, schemaURL.toString()); 299 300 try { 301 return dbf.newDocumentBuilder(); 302 } catch (ParserConfigurationException e) { 303 throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e); 304 } 305 } 306 307 /** Creates a schema-aware SAX parser */ 308 private static SAXParser createSchemaAwareSAXParser(URL schemaURL) throws SAXException { 309 SAXParserFactory spf = createNamespaceAwareSAXParserFactory(); 310 spf.setValidating(true); 311 try { 312 SAXParser parser = spf.newSAXParser(); 313 parser.setProperty(ATTRIBUTE_SCHEMA_LANGUAGE, SCHEMA_URL); 314 parser.setProperty(ATTRIBUTE_SCHEMA_SOURCE, schemaURL.toString()); 315 return parser; 316 } catch (ParserConfigurationException e) { 317 throw new IllegalStateException("No SAX parser found, probably Java is misconfigured!", e); 318 } 319 } 320 321 /** Creates a namespace-aware {@link DocumentBuilderFactory} */ 322 private static DocumentBuilderFactory createNamespaceAwareDocumentBuilderFactory() { 323 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); 324 dbf.setNamespaceAware(true); 325 326 return dbf; 327 } 328 329 /** Creates a namespace-aware {@link SAXParserFactory} */ 330 private static SAXParserFactory createNamespaceAwareSAXParserFactory() { 331 SAXParserFactory spf = SAXParserFactory.newInstance(); 332 spf.setNamespaceAware(true); 333 return spf; 334 } 335 336 /** 337 * Same as {@link #parse(File, URL)} but with schema file. 338 * 339 * @throws IllegalArgumentException 340 * if the schema file could not be converted to an URL 341 */ 342 public static Document parse(File file, File schema) throws SAXException, IOException { 343 try { 344 return parse(file, schema.toURI().toURL()); 345 } catch (MalformedURLException e) { 346 throw new IllegalArgumentException("Schema file could not be converted to URL: " + e); 347 } 348 } 349 350 /** 351 * Returns a string representation of the given XML document, which is "pretty 352 * printed", i.e. the tags are indented. 353 */ 354 public static String prettyPrint(Document doc) throws TransformerException { 355 URL url = XMLUtils.class.getResource("pretty.xsl"); 356 StreamSource xslSource = new StreamSource(url.toExternalForm()); 357 Transformer transformer = TransformerFactory.newInstance().newTransformer(xslSource); 358 return StringUtils.normalizeLineSeparatorsPlatformSpecific(transformDocumentToString(doc, transformer)); 359 } 360 361 /** 362 * Transforms the document to an xml string (flat, with no line breaks). Use 363 * {@link #prettyPrint(Document)} for readable xml output. 364 */ 365 public static String print(Document document) throws TransformerException { 366 Transformer transformer = TransformerFactory.newInstance().newTransformer(); 367 return transformDocumentToString(document, transformer); 368 } 369 370 /** Transform an XML document to a string using the given transformer */ 371 private static String transformDocumentToString(Document document, Transformer transformer) 372 throws TransformerException { 373 DOMSource source = new DOMSource(document); 374 StringWriter stringWriter = new StringWriter(); 375 StreamResult resultStream = new StreamResult(stringWriter); 376 transformer.transform(source, resultStream); 377 return stringWriter.toString(); 378 } 379 380 /** 381 * Determines the index (starting at 0) of the given element relative to other 382 * element nodes for the same parent. 383 */ 384 public static int getElementPosition(Element element) { 385 int num = -1; 386 Node node = element; 387 while (node != null) { 388 if (node.getNodeType() == Node.ELEMENT_NODE) { 389 ++num; 390 } 391 node = node.getPreviousSibling(); 392 } 393 return num; 394 } 395 396 /** 397 * Returns all children of the given element which are element named as 398 * specified. 399 */ 400 public static List<Element> getNamedChildren(Element element, String elementNames) { 401 List<Element> result = new ArrayList<>(); 402 NodeList children = element.getChildNodes(); 403 for (int i = 0; i < children.getLength(); ++i) { 404 Node node = children.item(i); 405 if (node.getNodeType() == Node.ELEMENT_NODE && node.getNodeName().equals(elementNames)) { 406 result.add((Element) node); 407 } 408 } 409 return result; 410 } 411 412 /** 413 * Returns the first child of the given element which is an element named as 414 * specified. Returns null if none are found. 415 */ 416 public static Element getNamedChild(Element element, String name) { 417 List<Element> children = XMLUtils.getNamedChildren(element, name); 418 if (children.size() > 0) { 419 return children.get(0); 420 } 421 return null; 422 } 423 424 /** 425 * Returns the last child of the given element which is an element named as 426 * specified. Returns null if none are found. 427 */ 428 public static Element getLastNamedChild(Element element, String name) { 429 List<Element> children = XMLUtils.getNamedChildren(element, name); 430 if (children.size() > 0) { 431 return CollectionUtils.getLast(children); 432 } 433 return null; 434 } 435 436 /** 437 * Get the text content of the given element's first child that is an element 438 * named as specified. If none is found, the empty string is returned. 439 */ 440 public static String getNamedChildContent(Element parent, String name) { 441 Element element = XMLUtils.getNamedChild(parent, name); 442 if (element == null) { 443 return StringUtils.EMPTY_STRING; 444 } 445 return element.getTextContent(); 446 } 447 448 /** 449 * Returns the first element whose child with the given name has the given text 450 * content. 451 */ 452 public static Element getElementByChildContent(List<Element> elements, String childName, String childContent) { 453 for (Element element : elements) { 454 Element child = XMLUtils.getNamedChild(element, childName); 455 if (child != null && child.getTextContent().equals(childContent)) { 456 return element; 457 } 458 } 459 return null; 460 } 461 462 /** 463 * Returns the ancestor of the given element with the given distance. Distance 464 * zero means the element itself, one the parent and so on. 465 */ 466 public static Element getAncestor(Element element, int distance) { 467 for (int i = 0; i < distance; i++) { 468 if (element == null) { 469 return null; 470 } 471 element = CCSMAssert.checkedCast(element.getParentNode(), Element.class); 472 } 473 return element; 474 } 475 476 /** 477 * Extracts all ElementNodes from a NodeList and returns the result as a list. 478 * 479 * @param nodeList 480 * the NodeList to be searched for ElementNodes. 481 * @return an array containing all ElementNodes stored in the given node list or 482 * null if the input has been null. 483 */ 484 public static List<Element> elementNodes(NodeList nodeList) { 485 if (nodeList == null) { 486 return null; 487 } 488 List<Element> result = new ArrayList<>(); 489 int len = nodeList.getLength(); 490 for (int i = 0; i < len; ++i) { 491 Node node = nodeList.item(i); 492 if (node.getNodeType() == Node.ELEMENT_NODE) { 493 result.add((Element) node); 494 } 495 } 496 return result; 497 } 498 499 /** Removes the given element from its parent. */ 500 public static void removeElement(Element element) { 501 Node parent = element.getParentNode(); 502 if (parent != null) { 503 parent.removeChild(element); 504 } 505 } 506 507 /** 508 * Appends a child element with the given tag name to the given element and 509 * returns the new element. 510 */ 511 public static Element appendChild(Element element, String tagName) { 512 Element newElement = CCSMAssert.checkedCast(element.getOwnerDocument().createElement(tagName), Element.class); 513 element.appendChild(newElement); 514 return newElement; 515 } 516 517 /** 518 * Appends a child element with the given tag name and the given text content to 519 * the given element and returns the new element. 520 */ 521 public static Element appendChild(Element parent, String tagName, String textContent) { 522 Element newElement = appendChild(parent, tagName); 523 newElement.setTextContent(textContent); 524 return newElement; 525 } 526 527 /** 528 * Get all leaf elements of an XML tree rooted at an element 529 * 530 * @param root 531 * The root element 532 * @return List of all leaf elements 533 */ 534 public static List<Element> leafElementNodes(Element root) { 535 List<Element> leafElementNodes = new ArrayList<>(); 536 leafElementNodes(root, leafElementNodes); 537 return leafElementNodes; 538 } 539 540 /** 541 * Add all leaf element nodes of an XML tree rooted at an element to a list 542 */ 543 private static void leafElementNodes(Element root, List<Element> leafElementNodes) { 544 List<Element> children = XMLUtils.elementNodes(root.getChildNodes()); 545 if (children.isEmpty()) { 546 leafElementNodes.add(root); 547 } else { 548 for (Element child : children) { 549 leafElementNodes(child, leafElementNodes); 550 } 551 } 552 } 553 554 /** Creates an empty XML document. */ 555 public static Document createEmptyDocument() { 556 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 557 DocumentBuilder builder; 558 try { 559 builder = factory.newDocumentBuilder(); 560 } catch (ParserConfigurationException e) { 561 throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e); 562 } 563 return builder.newDocument(); 564 } 565 566 /** Converts the given {@link String} to a SAX {@link InputSource}. */ 567 public static InputSource toInputSource(String string) { 568 return new InputSource(new StringReader(string)); 569 } 570 571 /** 572 * Simple error handler for handling validation errors. This handler stores the 573 * first problem raised during parsing. 574 */ 575 private static class XMLErrorHandler implements ErrorHandler { 576 577 /** 578 * The stored exception. Value unequal <code>null</code> signals a validation 579 * problem. 580 */ 581 private SAXParseException exception; 582 583 /** {@inheritDoc} */ 584 @Override 585 public void error(SAXParseException exception) { 586 if (this.exception == null) { 587 this.exception = exception; 588 } 589 } 590 591 /** {@inheritDoc} */ 592 @Override 593 public void fatalError(SAXParseException exception) { 594 error(exception); 595 } 596 597 /** {@inheritDoc} */ 598 @Override 599 public void warning(SAXParseException exception) { 600 System.out.println(exception); 601 // ignore 602 } 603 } 604 605 /** 606 * Fixes chars which are not allowed in XML content. The following replacements 607 * are allowed: 608 * <ul> 609 * <li>All '&' which are not part of an XML escape char sequence are replaced by 610 * '&'. 611 * <li>All low ASCII control chars are removed, besides TAB, LF, CR 612 * <li>Escaped ASCII control chars are removed (e.g. � or �) with 613 * variable zero padding in hex and decimal format. 614 * </ul> 615 */ 616 public static String fixIllegalXmlChars(String content) { 617 String replacedContent = content.replaceAll("(?i)&(?!(lt|gt|amp|apos|quot|#x[0-9a-f]+|#\\d+);)", "&"); 618 replacedContent = replacedContent.replaceAll("([\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f])", 619 StringUtils.EMPTY_STRING); 620 replacedContent = replacedContent.replaceAll("(?i)�*([0-8bcef]|1[0-9a-f]|7f);", StringUtils.EMPTY_STRING); 621 replacedContent = replacedContent.replaceAll("(?i)�*([0-8]|1[124-9]|2[0-9]|3[01]|127);", 622 StringUtils.EMPTY_STRING); 623 return replacedContent; 624 } 625}