Source code

001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 The ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.xml;
018
019import java.io.ByteArrayInputStream;
020import java.io.File;
021import java.io.FileInputStream;
022import java.io.FileNotFoundException;
023import java.io.IOException;
024import java.io.PrintStream;
025import java.io.StringReader;
026import java.io.StringWriter;
027import java.io.UnsupportedEncodingException;
028import java.net.MalformedURLException;
029import java.net.URL;
030import java.util.ArrayList;
031import java.util.List;
032
033import javax.xml.parsers.DocumentBuilder;
034import javax.xml.parsers.DocumentBuilderFactory;
035import javax.xml.parsers.ParserConfigurationException;
036import javax.xml.parsers.SAXParser;
037import javax.xml.parsers.SAXParserFactory;
038import javax.xml.stream.XMLEventReader;
039import javax.xml.stream.XMLInputFactory;
040import javax.xml.stream.XMLStreamException;
041import javax.xml.transform.Transformer;
042import javax.xml.transform.TransformerException;
043import javax.xml.transform.TransformerFactory;
044import javax.xml.transform.dom.DOMSource;
045import javax.xml.transform.stream.StreamResult;
046import javax.xml.transform.stream.StreamSource;
047
048import org.conqat.lib.commons.assertion.CCSMAssert;
049import org.conqat.lib.commons.collections.CollectionUtils;
050import org.conqat.lib.commons.filesystem.FileSystemUtils;
051import org.conqat.lib.commons.string.StringUtils;
052import org.w3c.dom.Document;
053import org.w3c.dom.Element;
054import org.w3c.dom.Node;
055import org.w3c.dom.NodeList;
056import org.xml.sax.ErrorHandler;
057import org.xml.sax.InputSource;
058import org.xml.sax.SAXException;
059import org.xml.sax.SAXParseException;
060import org.xml.sax.helpers.DefaultHandler;
061
062/**
063 * Collection of utility methods for XML.
064 */
065public class XMLUtils {
066
        /**
         * JAXP property name under which the location of the schema to validate
         * against is handed to a parser or parser factory.
         */
        private static final String ATTRIBUTE_SCHEMA_SOURCE = "http://java.sun.com/xml/jaxp/properties/schemaSource";

        /**
         * W3C XML Schema namespace URI. In this class it is used as the value of
         * {@link #ATTRIBUTE_SCHEMA_LANGUAGE}.
         */
        private static final String SCHEMA_URL = "http://www.w3.org/2001/XMLSchema";

        /** JAXP property name that selects the schema language used for validation. */
        private static final String ATTRIBUTE_SCHEMA_LANGUAGE = "http://java.sun.com/xml/jaxp/properties/schemaLanguage";
075
076        /** Creates a new {@link XMLWriter} that writes to the given output file. */
077        public static <ElementsEnum extends Enum<ElementsEnum>, AttributesEnum extends Enum<AttributesEnum>> XMLWriter<ElementsEnum, AttributesEnum> createUtf8Writer(
078                        File outputFile, Class<AttributesEnum> attributesClass)
079                        throws FileNotFoundException, UnsupportedEncodingException {
080                return new XMLWriter<>(new PrintStream(outputFile, FileSystemUtils.UTF8_ENCODING),
081                                new XMLResolver<ElementsEnum, AttributesEnum>(attributesClass));
082        }
083
084        /**
085         * Parse a file without validation.
086         *
087         * @param file
088         *            the file to parse.
089         * @return the DOM document.
090         *
091         * @throws SAXException
092         *             if a parsing exception occurs, i.e. if the file is not
093         *             well-formed.
094         * @throws IOException
095         *             if an IO exception occurs.
096         */
097        public static Document parse(File file) throws SAXException, IOException {
098                return createSchemaUnawareParser().parse(file);
099        }
100
101        /**
102         * Parse an input source without validation.
103         *
104         * @param input
105         *            the input source to parse
106         * @return the DOM document.
107         *
108         * @throws SAXException
109         *             if a parsing exception occurs, i.e. if the file is not
110         *             well-formed.
111         * @throws IOException
112         *             if an IO exception occurs.
113         */
114        public static Document parse(InputSource input) throws SAXException, IOException {
115
116                return createSchemaUnawareParser().parse(input);
117        }
118
119        /**
120         * Parse an input source using SAX without validation.
121         *
122         * @throws SAXException
123         *             if a parsing exception occurs, i.e. if the file is not
124         *             well-formed.
125         * @throws IOException
126         *             if an IO exception occurs.
127         */
128        public static void parseSAX(File file, DefaultHandler handler) throws SAXException, IOException {
129                createSchemaUnawareSAXParser().parse(file, handler);
130        }
131
132        /**
133         * Parse an input source using SAX without validation.
134         *
135         * @throws SAXException
136         *             if a parsing exception occurs, i.e. if the file is not
137         *             well-formed.
138         * @throws IOException
139         *             if an IO exception occurs.
140         */
141        public static void parseSAX(InputSource input, DefaultHandler handler) throws SAXException, IOException {
142                createSchemaUnawareSAXParser().parse(input, handler);
143        }
144
145        /**
146         * Parse a string that contains XML without validation.
147         *
148         * @throws SAXException
149         *             if a parsing exception occurs, i.e. if the file is not
150         *             well-formed.
151         * @throws IOException
152         *             if an IO exception occurs.
153         */
154        public static void parseSAX(String content, DefaultHandler handler) throws SAXException, IOException {
155                parseSAX(new InputSource(new ByteArrayInputStream(StringUtils.stringToBytes(content))), handler);
156        }
157
158        /**
159         * Parse and validate file using schema. This implements a custom error handler
160         * to avoid different behaviour between the JAXP implementations shipping with
161         * Java 1.5 and Java 1.6.
162         *
163         * @param file
164         *            the file to parse.
165         * @param schemaURL
166         *            URL point to schema, may not be null
167         * @return the DOM document.
168         *
169         * @throws SAXException
170         *             if a parsing exception occurs, i.e. if the file is not
171         *             well-formed or not valid
172         * @throws IOException
173         *             if an IO exception occurs.
174         */
175        public static Document parse(File file, URL schemaURL) throws SAXException, IOException {
176
177                FileInputStream stream = new FileInputStream(file);
178                try {
179                        return parse(new InputSource(stream), schemaURL);
180                } finally {
181                        stream.close();
182                }
183        }
184
185        /**
186         * Parse and validate file using schema. This implements a custom error handler
187         * to avoid different behaviour between the JAXP implementations shipping with
188         * Java 1.5 and Java 1.6.
189         *
190         * @param input
191         *            the input to parse.
192         * @param schemaURL
193         *            URL point to schema, may not be null
194         * @return the DOM document.
195         *
196         * @throws SAXException
197         *             if a parsing exception occurs, i.e. if the file is not
198         *             well-formed or not valid
199         * @throws IOException
200         *             if an IO exception occurs.
201         */
202        public static Document parse(InputSource input, URL schemaURL) throws SAXException, IOException {
203
204                CCSMAssert.isTrue(schemaURL != null, "Schema URL may not be null!");
205
206                DocumentBuilder parser = createSchemaAwareParser(schemaURL);
207
208                XMLErrorHandler errorHandler = new XMLErrorHandler();
209                parser.setErrorHandler(errorHandler);
210                Document document = parser.parse(input);
211
212                if (errorHandler.exception != null) {
213                        throw errorHandler.exception;
214                }
215
216                return document;
217        }
218
219        /**
220         * Parse and validate file using SAX and schema.
221         *
222         * @param file
223         *            the file to parse.
224         * @param schemaURL
225         *            URL point to schema, may not be null
226         *
227         * @throws SAXException
228         *             if a parsing exception occurs, i.e. if the file is not
229         *             well-formed or not valid
230         * @throws IOException
231         *             if an IO exception occurs.
232         */
233        public static void parseSAX(File file, URL schemaURL, DefaultHandler handler) throws SAXException, IOException {
234                try (FileInputStream stream = new FileInputStream(file)) {
235                        parseSAX(new InputSource(stream), schemaURL, handler);
236                }
237        }
238
239        /**
240         * Parse and validate file using SAX and schema.
241         *
242         * @param input
243         *            the input to parse.
244         * @param schemaURL
245         *            URL point to schema, may not be null
246         *
247         * @throws SAXException
248         *             if a parsing exception occurs, i.e. if the file is not
249         *             well-formed or not valid
250         * @throws IOException
251         *             if an IO exception occurs.
252         */
253        public static void parseSAX(InputSource input, URL schemaURL, DefaultHandler handler)
254                        throws SAXException, IOException {
255
256                CCSMAssert.isTrue(schemaURL != null, "Schema URL may not be null!");
257                createSchemaAwareSAXParser(schemaURL).parse(input, handler);
258        }
259
260        /**
261         * Creates a StAX parser. The parser can be queried for new parsing events via
262         * hasNext()/next(). It continues parsing only on next() calls.
263         * 
264         * This parser should be used if performance is relevant and only part of the
265         * XML document is needed.
266         */
267        public static XMLEventReader createStAXParser(InputSource input) throws XMLStreamException {
268                XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
269                xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, true);
270                XMLEventReader reader = xmlInputFactory.createXMLEventReader(input.getByteStream(), input.getEncoding());
271                return reader;
272        }
273
274        /** Creates a schema-unaware XML parser */
275        private static DocumentBuilder createSchemaUnawareParser() {
276
277                try {
278                        return createNamespaceAwareDocumentBuilderFactory().newDocumentBuilder();
279                } catch (ParserConfigurationException e) {
280                        throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e);
281                }
282        }
283
284        /** Creates a schema-unaware SAX parser */
285        private static SAXParser createSchemaUnawareSAXParser() throws SAXException {
286                try {
287                        return createNamespaceAwareSAXParserFactory().newSAXParser();
288                } catch (ParserConfigurationException e) {
289                        throw new IllegalStateException("No SAX parser found, probably Java is misconfigured!", e);
290                }
291        }
292
293        /** Creates a schema-aware XML parser */
294        private static DocumentBuilder createSchemaAwareParser(URL schemaURL) {
295                DocumentBuilderFactory dbf = createNamespaceAwareDocumentBuilderFactory();
296                dbf.setValidating(true);
297                dbf.setAttribute(ATTRIBUTE_SCHEMA_LANGUAGE, SCHEMA_URL);
298                dbf.setAttribute(ATTRIBUTE_SCHEMA_SOURCE, schemaURL.toString());
299
300                try {
301                        return dbf.newDocumentBuilder();
302                } catch (ParserConfigurationException e) {
303                        throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e);
304                }
305        }
306
307        /** Creates a schema-aware SAX parser */
308        private static SAXParser createSchemaAwareSAXParser(URL schemaURL) throws SAXException {
309                SAXParserFactory spf = createNamespaceAwareSAXParserFactory();
310                spf.setValidating(true);
311                try {
312                        SAXParser parser = spf.newSAXParser();
313                        parser.setProperty(ATTRIBUTE_SCHEMA_LANGUAGE, SCHEMA_URL);
314                        parser.setProperty(ATTRIBUTE_SCHEMA_SOURCE, schemaURL.toString());
315                        return parser;
316                } catch (ParserConfigurationException e) {
317                        throw new IllegalStateException("No SAX parser found, probably Java is misconfigured!", e);
318                }
319        }
320
321        /** Creates a namespace-aware {@link DocumentBuilderFactory} */
322        private static DocumentBuilderFactory createNamespaceAwareDocumentBuilderFactory() {
323                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
324                dbf.setNamespaceAware(true);
325
326                return dbf;
327        }
328
329        /** Creates a namespace-aware {@link SAXParserFactory} */
330        private static SAXParserFactory createNamespaceAwareSAXParserFactory() {
331                SAXParserFactory spf = SAXParserFactory.newInstance();
332                spf.setNamespaceAware(true);
333                return spf;
334        }
335
336        /**
337         * Same as {@link #parse(File, URL)} but with schema file.
338         *
339         * @throws IllegalArgumentException
340         *             if the schema file could not be converted to an URL
341         */
342        public static Document parse(File file, File schema) throws SAXException, IOException {
343                try {
344                        return parse(file, schema.toURI().toURL());
345                } catch (MalformedURLException e) {
346                        throw new IllegalArgumentException("Schema file could not be converted to URL: " + e);
347                }
348        }
349
350        /**
351         * Returns a string representation of the given XML document, which is "pretty
352         * printed", i.e. the tags are indented.
353         */
354        public static String prettyPrint(Document doc) throws TransformerException {
355                URL url = XMLUtils.class.getResource("pretty.xsl");
356                StreamSource xslSource = new StreamSource(url.toExternalForm());
357                Transformer transformer = TransformerFactory.newInstance().newTransformer(xslSource);
358                return StringUtils.normalizeLineSeparatorsPlatformSpecific(transformDocumentToString(doc, transformer));
359        }
360
361        /**
362         * Transforms the document to an xml string (flat, with no line breaks). Use
363         * {@link #prettyPrint(Document)} for readable xml output.
364         */
365        public static String print(Document document) throws TransformerException {
366                Transformer transformer = TransformerFactory.newInstance().newTransformer();
367                return transformDocumentToString(document, transformer);
368        }
369
370        /** Transform an XML document to a string using the given transformer */
371        private static String transformDocumentToString(Document document, Transformer transformer)
372                        throws TransformerException {
373                DOMSource source = new DOMSource(document);
374                StringWriter stringWriter = new StringWriter();
375                StreamResult resultStream = new StreamResult(stringWriter);
376                transformer.transform(source, resultStream);
377                return stringWriter.toString();
378        }
379
380        /**
381         * Determines the index (starting at 0) of the given element relative to other
382         * element nodes for the same parent.
383         */
384        public static int getElementPosition(Element element) {
385                int num = -1;
386                Node node = element;
387                while (node != null) {
388                        if (node.getNodeType() == Node.ELEMENT_NODE) {
389                                ++num;
390                        }
391                        node = node.getPreviousSibling();
392                }
393                return num;
394        }
395
396        /**
397         * Returns all children of the given element which are element named as
398         * specified.
399         */
400        public static List<Element> getNamedChildren(Element element, String elementNames) {
401                List<Element> result = new ArrayList<>();
402                NodeList children = element.getChildNodes();
403                for (int i = 0; i < children.getLength(); ++i) {
404                        Node node = children.item(i);
405                        if (node.getNodeType() == Node.ELEMENT_NODE && node.getNodeName().equals(elementNames)) {
406                                result.add((Element) node);
407                        }
408                }
409                return result;
410        }
411
412        /**
413         * Returns the first child of the given element which is an element named as
414         * specified. Returns null if none are found.
415         */
416        public static Element getNamedChild(Element element, String name) {
417                List<Element> children = XMLUtils.getNamedChildren(element, name);
418                if (children.size() > 0) {
419                        return children.get(0);
420                }
421                return null;
422        }
423
424        /**
425         * Returns the last child of the given element which is an element named as
426         * specified. Returns null if none are found.
427         */
428        public static Element getLastNamedChild(Element element, String name) {
429                List<Element> children = XMLUtils.getNamedChildren(element, name);
430                if (children.size() > 0) {
431                        return CollectionUtils.getLast(children);
432                }
433                return null;
434        }
435
436        /**
437         * Get the text content of the given element's first child that is an element
438         * named as specified. If none is found, the empty string is returned.
439         */
440        public static String getNamedChildContent(Element parent, String name) {
441                Element element = XMLUtils.getNamedChild(parent, name);
442                if (element == null) {
443                        return StringUtils.EMPTY_STRING;
444                }
445                return element.getTextContent();
446        }
447
448        /**
449         * Returns the first element whose child with the given name has the given text
450         * content.
451         */
452        public static Element getElementByChildContent(List<Element> elements, String childName, String childContent) {
453                for (Element element : elements) {
454                        Element child = XMLUtils.getNamedChild(element, childName);
455                        if (child != null && child.getTextContent().equals(childContent)) {
456                                return element;
457                        }
458                }
459                return null;
460        }
461
462        /**
463         * Returns the ancestor of the given element with the given distance. Distance
464         * zero means the element itself, one the parent and so on.
465         */
466        public static Element getAncestor(Element element, int distance) {
467                for (int i = 0; i < distance; i++) {
468                        if (element == null) {
469                                return null;
470                        }
471                        element = CCSMAssert.checkedCast(element.getParentNode(), Element.class);
472                }
473                return element;
474        }
475
476        /**
477         * Extracts all ElementNodes from a NodeList and returns the result as a list.
478         *
479         * @param nodeList
480         *            the NodeList to be searched for ElementNodes.
481         * @return an array containing all ElementNodes stored in the given node list or
482         *         null if the input has been null.
483         */
484        public static List<Element> elementNodes(NodeList nodeList) {
485                if (nodeList == null) {
486                        return null;
487                }
488                List<Element> result = new ArrayList<>();
489                int len = nodeList.getLength();
490                for (int i = 0; i < len; ++i) {
491                        Node node = nodeList.item(i);
492                        if (node.getNodeType() == Node.ELEMENT_NODE) {
493                                result.add((Element) node);
494                        }
495                }
496                return result;
497        }
498
499        /** Removes the given element from its parent. */
500        public static void removeElement(Element element) {
501                Node parent = element.getParentNode();
502                if (parent != null) {
503                        parent.removeChild(element);
504                }
505        }
506
507        /**
508         * Appends a child element with the given tag name to the given element and
509         * returns the new element.
510         */
511        public static Element appendChild(Element element, String tagName) {
512                Element newElement = CCSMAssert.checkedCast(element.getOwnerDocument().createElement(tagName), Element.class);
513                element.appendChild(newElement);
514                return newElement;
515        }
516
517        /**
518         * Appends a child element with the given tag name and the given text content to
519         * the given element and returns the new element.
520         */
521        public static Element appendChild(Element parent, String tagName, String textContent) {
522                Element newElement = appendChild(parent, tagName);
523                newElement.setTextContent(textContent);
524                return newElement;
525        }
526
527        /**
528         * Get all leaf elements of an XML tree rooted at an element
529         *
530         * @param root
531         *            The root element
532         * @return List of all leaf elements
533         */
534        public static List<Element> leafElementNodes(Element root) {
535                List<Element> leafElementNodes = new ArrayList<>();
536                leafElementNodes(root, leafElementNodes);
537                return leafElementNodes;
538        }
539
540        /**
541         * Add all leaf element nodes of an XML tree rooted at an element to a list
542         */
543        private static void leafElementNodes(Element root, List<Element> leafElementNodes) {
544                List<Element> children = XMLUtils.elementNodes(root.getChildNodes());
545                if (children.isEmpty()) {
546                        leafElementNodes.add(root);
547                } else {
548                        for (Element child : children) {
549                                leafElementNodes(child, leafElementNodes);
550                        }
551                }
552        }
553
554        /** Creates an empty XML document. */
555        public static Document createEmptyDocument() {
556                DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
557                DocumentBuilder builder;
558                try {
559                        builder = factory.newDocumentBuilder();
560                } catch (ParserConfigurationException e) {
561                        throw new IllegalStateException("No document builder found, probably Java is misconfigured!", e);
562                }
563                return builder.newDocument();
564        }
565
566        /** Converts the given {@link String} to a SAX {@link InputSource}. */
567        public static InputSource toInputSource(String string) {
568                return new InputSource(new StringReader(string));
569        }
570
571        /**
572         * Simple error handler for handling validation errors. This handler stores the
573         * first problem raised during parsing.
574         */
575        private static class XMLErrorHandler implements ErrorHandler {
576
577                /**
578                 * The stored exception. Value unequal <code>null</code> signals a validation
579                 * problem.
580                 */
581                private SAXParseException exception;
582
583                /** {@inheritDoc} */
584                @Override
585                public void error(SAXParseException exception) {
586                        if (this.exception == null) {
587                                this.exception = exception;
588                        }
589                }
590
591                /** {@inheritDoc} */
592                @Override
593                public void fatalError(SAXParseException exception) {
594                        error(exception);
595                }
596
597                /** {@inheritDoc} */
598                @Override
599                public void warning(SAXParseException exception) {
600                        System.out.println(exception);
601                        // ignore
602                }
603        }
604
605        /**
606         * Fixes chars which are not allowed in XML content. The following replacements
607         * are allowed:
608         * <ul>
609         * <li>All '&' which are not part of an XML escape char sequence are replaced by
610         * '&amp;'.
611         * <li>All low ASCII control chars are removed, besides TAB, LF, CR
612         * <li>Escaped ASCII control chars are removed (e.g. &#x0; or &#0;) with
613         * variable zero padding in hex and decimal format.
614         * </ul>
615         */
616        public static String fixIllegalXmlChars(String content) {
617                String replacedContent = content.replaceAll("(?i)&(?!(lt|gt|amp|apos|quot|#x[0-9a-f]+|#\\d+);)", "&amp;");
618                replacedContent = replacedContent.replaceAll("([\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f])",
619                                StringUtils.EMPTY_STRING);
620                replacedContent = replacedContent.replaceAll("(?i)&#x0*([0-8bcef]|1[0-9a-f]|7f);", StringUtils.EMPTY_STRING);
621                replacedContent = replacedContent.replaceAll("(?i)&#0*([0-8]|1[124-9]|2[0-9]|3[01]|127);",
622                                StringUtils.EMPTY_STRING);
623                return replacedContent;
624        }
625}