001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 the ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.datamining;
018
019import java.util.Collection;
020import java.util.HashMap;
021import java.util.Map;
022
023import org.conqat.lib.commons.collections.CollectionUtils;
024
025/**
026 * A sparse vector in the n-dimensional space of a numerical type T. Basically a
027 * mapping from component (represented as an Integer) to a double value. Unset
028 * components correspond to 0, i.e. a newly created instance represents the
029 * 0-vector.
030 */
031public class SparseVector {
032
033        /** The data of this vector */
034        private Map<Integer, Double> data = new HashMap<Integer, Double>();
035
036        /**
037         * Sets the given value under the given key, value must not be <code>null</code>
038         */
039        public void set(int key, double value) {
040                data.put(key, value);
041        }
042
043        /**
044         * Computes the cosine distance between this and the given vector. See also
045         * http://en.wikipedia.org/wiki/Cosine_similarity
046         */
047        public double cosineSimilarity(SparseVector other) {
048
049                double dotProduct = 0.0d;
050
051                Collection<Integer> commonPositions = CollectionUtils.intersectionSet(data.keySet(), other.data.keySet());
052
053                for (Integer commonPosition : commonPositions) {
054                        dotProduct += data.get(commonPosition) * other.data.get(commonPosition);
055                }
056
057                double divisor = l2norm() * other.l2norm();
058
059                if (divisor == 0) {
060                        return 0;
061                }
062
063                return dotProduct / divisor;
064        }
065
066        /** Returns the L2 norm of this vector. */
067        private double l2norm() {
068                double tmp = 0.0d;
069                for (Integer position : data.keySet()) {
070                        tmp += Math.pow(data.get(position), 2);
071                }
072                return Math.sqrt(tmp);
073        }
074}