001/*-------------------------------------------------------------------------+ 002 | | 003 | Copyright 2005-2011 The ConQAT Project | 004 | | 005 | Licensed under the Apache License, Version 2.0 (the "License"); | 006 | you may not use this file except in compliance with the License. | 007 | You may obtain a copy of the License at | 008 | | 009 | http://www.apache.org/licenses/LICENSE-2.0 | 010 | | 011 | Unless required by applicable law or agreed to in writing, software | 012 | distributed under the License is distributed on an "AS IS" BASIS, | 013 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014 | See the License for the specific language governing permissions and | 015 | limitations under the License. | 016 +-------------------------------------------------------------------------*/ 017package org.conqat.lib.commons.datamining; 018 019import java.util.ArrayList; 020import java.util.Collections; 021import java.util.HashMap; 022import java.util.HashSet; 023import java.util.List; 024import java.util.Map; 025import java.util.Set; 026 027import org.conqat.lib.commons.assertion.CCSMAssert; 028import org.conqat.lib.commons.collections.CollectionUtils; 029 030/** 031 * A-priori algorithm for mining frequent item sets from shopping baskets. See 032 * http://en.wikipedia.org/wiki/Apriori_algorithm 033 */ 034public class FrequentItemSetMiner<T> { 035 036 /** Threshold for support */ 037 private final double supportThreshold; 038 039 /** 040 * Constructs a new {@link FrequentItemSetMiner}. 041 * 042 * @param supportThreshold 043 * [0..1] denotes in what fraction of baskets an item set must occur 044 * to be considered frequent. 045 */ 046 public FrequentItemSetMiner(double supportThreshold) { 047 CCSMAssert.isTrue(supportThreshold >= 0 && supportThreshold <= 1, "supportThreshold must be in [0,1]"); 048 this.supportThreshold = supportThreshold; 049 } 050 051 /** 052 * Mines frequent item sets from the given shopping baskets. The support 053 * threshold is the fraction of baskets from which a frequent item set is a 054 * subset. The commodity factor is used to ignore items that are purchased 055 * extremely often. If an item occurs in the fraction commodityFactor of the 056 * baskets, it is ignored for identifying frequent item sets. Elements in 057 * baskets must be hashable. 058 * 059 * @param baskets 060 * the baskets to be analyzed. 061 */ 062 public Set<FrequentItemSet<T>> mineFrequentItemSets(Set<Set<T>> baskets) { 063 064 Set<FrequentItemSet<T>> result = new HashSet<FrequentItemSet<T>>(); 065 066 // The choice of the names for the identifiers of the local variables 067 // are intended and originate from the Wikipedia page (see class 068 // comment). 069 Map<Integer, Set<Set<T>>> L_ks = new HashMap<Integer, Set<Set<T>>>(); 070 071 // compute all item sets of size 1 with support >= supportThreshold 072 HashSet<Set<T>> singletonItemSets = new HashSet<Set<T>>(); 073 L_ks.put(1, singletonItemSets); 074 for (T item : CollectionUtils.unionSetAll(baskets)) { 075 Set<T> singletonItemSet = Collections.singleton(item); 076 double support = support(singletonItemSet, baskets); 077 if (support >= supportThreshold) { 078 singletonItemSets.add(singletonItemSet); 079 result.add(new FrequentItemSet<T>(singletonItemSet, support)); 080 } 081 } 082 083 int k = 1; 084 085 while (true) { 086 // generate frequent item sets of size k+1 from frequent item sets 087 // of size k 088 089 Set<Set<T>> candidates = apriori_gen(L_ks.get(k), k + 1); 090 091 Set<Set<T>> L_k = new HashSet<Set<T>>(); 092 L_ks.put(k + 1, L_k); 093 094 for (Set<T> candidate : candidates) { 095 double support = support(candidate, baskets); 096 if (support >= supportThreshold) { 097 L_k.add(candidate); 098 result.add(new FrequentItemSet<T>(candidate, support)); 099 } 100 } 101 102 if (L_k.isEmpty()) { 103 // We're done. 104 break; 105 } 106 107 k++; 108 109 } 110 111 return result; 112 } 113 114 /** Generate candidate item sets */ 115 private Set<Set<T>> apriori_gen(Set<Set<T>> L_k_1, int k) { 116 Set<Set<T>> C_k = new HashSet<Set<T>>(); 117 118 List<Set<T>> asList = new ArrayList<Set<T>>(L_k_1); 119 for (int i = 0; i < asList.size(); i++) { 120 inner: for (int j = i + 1; j < asList.size(); j++) { 121 Set<T> union = CollectionUtils.unionSet(asList.get(i), asList.get(j)); 122 if (union.size() == k) { 123 for (T item : union) { 124 Set<T> check = new HashSet<T>(union); 125 check.remove(item); 126 if (!L_k_1.contains(check)) { 127 continue inner; 128 } 129 } 130 C_k.add(union); 131 } 132 } 133 } 134 135 return C_k; 136 } 137 138 /** Returns the support of itemSet within the baskets. */ 139 private static <T> double support(Set<T> itemSet, Set<Set<T>> baskets) { 140 int count = 0; 141 142 for (Set<T> basket : baskets) { 143 if (basket.containsAll(itemSet)) { 144 count++; 145 } 146 } 147 148 return (double) count / baskets.size(); 149 } 150 151}