001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 the ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package org.conqat.lib.commons.datamining; 018 019import java.util.Collection; 020import java.util.HashMap; 021import java.util.Map; 022 023import org.conqat.lib.commons.collections.CollectionUtils; 024 025/** 026 * A sparse vector in the n-dimensional space of a numerical type T. Basically a 027 * mapping from component (represented as an Integer) to a double value. Unset 028 * components correspond to 0, i.e. a newly created instance represents the 029 * 0-vector. 030 */ 031public class SparseVector { 032 033 /** The data of this vector */ 034 private Map<Integer, Double> data = new HashMap<Integer, Double>(); 035 036 /** 037 * Sets the given value under the given key, value must not be <code>null</code> 038 */ 039 public void set(int key, double value) { 040 data.put(key, value); 041 } 042 043 /** 044 * Computes the cosine distance between this and the given vector. See also 045 * http://en.wikipedia.org/wiki/Cosine_similarity 046 */ 047 public double cosineSimilarity(SparseVector other) { 048 049 double dotProduct = 0.0d; 050 051 Collection<Integer> commonPositions = CollectionUtils.intersectionSet(data.keySet(), other.data.keySet()); 052 053 for (Integer commonPosition : commonPositions) { 054 dotProduct += data.get(commonPosition) * other.data.get(commonPosition); 055 } 056 057 double divisor = l2norm() * other.l2norm(); 058 059 if (divisor == 0) { 060 return 0; 061 } 062 063 return dotProduct / divisor; 064 } 065 066 /** Returns the L2 norm of this vector. */ 067 private double l2norm() { 068 double tmp = 0.0d; 069 for (Integer position : data.keySet()) { 070 tmp += Math.pow(data.get(position), 2); 071 } 072 return Math.sqrt(tmp); 073 } 074}