001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 The ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.string;
018
019import java.util.Iterator;
020
/**
 * Splits a string into lines and hands them out one at a time through the
 * {@link Iterator} interface. Recognized line terminators are <code>\n</code>,
 * <code>\r</code>, <code>\r\n</code> and the Unicode character 'NEXT LINE
 * (NEL)' (<code>U+0085</code>). The terminators themselves are never part of
 * the returned lines.
 * <p>
 * The default setting is to not return trailing empty lines. Use
 * {@link #setIncludeTrailingEmptyLine(boolean)} to include them.
 * <p>
 * The instance is its own iterator ({@link #iterator()} returns
 * <code>this</code>), so the content can be traversed only once per call to
 * {@link #setContent(String)}.
 * <p>
 * <b>Note:</b> According to tests performed by the original author this is the
 * fastest method to split a string. It is about nine times faster than the
 * regex-based split with:
 * 
 * <pre>
 * Pattern pattern = Pattern.compile(&quot;\r\n|\r|\n&quot;);
 * pattern.split(content);
 * </pre>
 */
public class LineSplitter implements Iterator<String>, Iterable<String> {

	/** Unicode Character 'NEXT LINE (NEL)' */
	private static final char UNICODE_NEL = '\u0085';

	/**
	 * The string content to split. Set to <code>null</code> once all lines were
	 * handed out.
	 */
	private String content;

	/** Index of the first character of the line returned by the next call. */
	private int startIndex;

	/** Flag for returning the trailing empty line. */
	private boolean includeTrailingEmptyLine = false;

	/**
	 * Constructor for empty content.
	 */
	public LineSplitter() {
		// Does nothing as content is empty.
	}

	/**
	 * Constructor which calls {@link #setContent(String)}.
	 */
	public LineSplitter(String content) {
		setContent(content);
	}

	/**
	 * Set the string to split and reset the iterator.
	 * 
	 * @param content
	 *            The string to split. If string is <code>null</code> or the empty
	 *            string, {@link #next()} will return <code>null</code>.
	 */
	public void setContent(String content) {
		this.content = content;
		startIndex = 0;
	}

	/**
	 * Checks whether another line is available. As a side effect, the reference
	 * to the content is dropped as soon as the end of the content is reached.
	 * 
	 * @return <code>true</code> if {@link #next()} will return a line.
	 */
	@Override
	public boolean hasNext() {
		if (content == null) {
			return false;
		}

		if (includeTrailingEmptyLine && isTrailingEmptyLine()) {
			return true;
		}

		if (startIndex >= content.length()) {
			// delete reference to the string to allow garbage collection
			content = null;
			return false;
		}

		return true;
	}

	/**
	 * Obtain next identified line.
	 * <p>
	 * Unlike the general {@link Iterator} contract, this method does not throw
	 * an exception when no line is left but returns <code>null</code>.
	 * 
	 * @return The next line without its line terminator, or <code>null</code> if
	 *         all lines were returned. After the last line was returned, the
	 *         reference to the input string is deleted, so it is free for
	 *         garbage collection.
	 */
	@Override
	public String next() {
		if (!hasNext()) {
			return null;
		}

		if (includeTrailingEmptyLine && isTrailingEmptyLine()) {
			startIndex++; // shift index, so it is beyond the content length
			return "";
		}

		int length = content.length();

		// Find the end of the line, i.e. the first line terminator or the end
		// of the content.
		int lineEnd = startIndex;
		while (lineEnd < length && !isLineTerminator(content.charAt(lineEnd))) {
			lineEnd++;
		}

		// Consume the line terminator. Its length varies: \r\n counts as a
		// single terminator of two characters.
		int nextStart = lineEnd;
		if (lineEnd < length) {
			nextStart++;
			if (content.charAt(lineEnd) == '\r' && nextStart < length && content.charAt(nextStart) == '\n') {
				nextStart++;
			}
		}

		String result = content.substring(startIndex, lineEnd);
		startIndex = nextStart;
		return result;
	}

	/**
	 * @return <code>true</code> if the iterator is at the end of the string content
	 *         and the content contains an empty trailing line, i.e. the last
	 *         character of the content is a line terminator.
	 */
	private boolean isTrailingEmptyLine() {
		if (startIndex > 0 && startIndex == content.length()) {
			// Uses the same terminator set as next(), so content ending in
			// NEL is handled like content ending in \n or \r.
			return isLineTerminator(content.charAt(startIndex - 1));
		}
		return false;
	}

	/**
	 * @return <code>true</code> if the given character ends a line
	 *         (<code>\n</code>, <code>\r</code> or NEL).
	 */
	private static boolean isLineTerminator(char c) {
		return c == '\n' || c == '\r' || c == UNICODE_NEL;
	}

	/**
	 * Enables returning of trailing empty lines during the iteration. Default is
	 * <code>false</code>.
	 * <p>
	 * If <code>true</code> the string <code>Foo\nBar\n</code> will yield three
	 * items (Foo, Bar and the empty string), otherwise two items (Foo and Bar).
	 */
	public void setIncludeTrailingEmptyLine(boolean includeTrailingEmptyLine) {
		this.includeTrailingEmptyLine = includeTrailingEmptyLine;
	}

	/**
	 * Not supported.
	 * 
	 * @throws UnsupportedOperationException
	 *             always.
	 */
	@Override
	public void remove() {
		throw new UnsupportedOperationException();
	}

	/**
	 * Returns this instance, so the lines can be traversed only once per content.
	 */
	@Override
	public Iterator<String> iterator() {
		return this;
	}

}