001/*-------------------------------------------------------------------------+
002 |                                                                          |
003 | Copyright 2005-2011 The ConQAT Project                                   |
004 |                                                                          |
005 | Licensed under the Apache License, Version 2.0 (the "License");          |
006 | you may not use this file except in compliance with the License.         |
007 | You may obtain a copy of the License at                                  |
008 |                                                                          |
009 |    http://www.apache.org/licenses/LICENSE-2.0                            |
010 |                                                                          |
011 | Unless required by applicable law or agreed to in writing, software      |
012 | distributed under the License is distributed on an "AS IS" BASIS,        |
013 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014 | See the License for the specific language governing permissions and      |
015 | limitations under the License.                                           |
016 +-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.datamining;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.List;
024import java.util.Map;
025import java.util.Set;
026
027import org.conqat.lib.commons.assertion.CCSMAssert;
028import org.conqat.lib.commons.collections.CollectionUtils;
029
030/**
031 * A-priori algorithm for mining frequent item sets from shopping baskets. See
032 * http://en.wikipedia.org/wiki/Apriori_algorithm
033 */
034public class FrequentItemSetMiner<T> {
035
036        /** Threshold for support */
037        private final double supportThreshold;
038
039        /**
040         * Constructs a new {@link FrequentItemSetMiner}.
041         * 
042         * @param supportThreshold
043         *            [0..1] denotes in what fraction of baskets an item set must occur
044         *            to be considered frequent.
045         */
046        public FrequentItemSetMiner(double supportThreshold) {
047                CCSMAssert.isTrue(supportThreshold >= 0 && supportThreshold <= 1, "supportThreshold must be in [0,1]");
048                this.supportThreshold = supportThreshold;
049        }
050
051        /**
052         * Mines frequent item sets from the given shopping baskets. The support
053         * threshold is the fraction of baskets from which a frequent item set is a
054         * subset. The commodity factor is used to ignore items that are purchased
055         * extremely often. If an item occurs in the fraction commodityFactor of the
056         * baskets, it is ignored for identifying frequent item sets. Elements in
057         * baskets must be hashable.
058         * 
059         * @param baskets
060         *            the baskets to be analyzed.
061         */
062        public Set<FrequentItemSet<T>> mineFrequentItemSets(Set<Set<T>> baskets) {
063
064                Set<FrequentItemSet<T>> result = new HashSet<FrequentItemSet<T>>();
065
066                // The choice of the names for the identifiers of the local variables
067                // are intended and originate from the Wikipedia page (see class
068                // comment).
069                Map<Integer, Set<Set<T>>> L_ks = new HashMap<Integer, Set<Set<T>>>();
070
071                // compute all item sets of size 1 with support >= supportThreshold
072                HashSet<Set<T>> singletonItemSets = new HashSet<Set<T>>();
073                L_ks.put(1, singletonItemSets);
074                for (T item : CollectionUtils.unionSetAll(baskets)) {
075                        Set<T> singletonItemSet = Collections.singleton(item);
076                        double support = support(singletonItemSet, baskets);
077                        if (support >= supportThreshold) {
078                                singletonItemSets.add(singletonItemSet);
079                                result.add(new FrequentItemSet<T>(singletonItemSet, support));
080                        }
081                }
082
083                int k = 1;
084
085                while (true) {
086                        // generate frequent item sets of size k+1 from frequent item sets
087                        // of size k
088
089                        Set<Set<T>> candidates = apriori_gen(L_ks.get(k), k + 1);
090
091                        Set<Set<T>> L_k = new HashSet<Set<T>>();
092                        L_ks.put(k + 1, L_k);
093
094                        for (Set<T> candidate : candidates) {
095                                double support = support(candidate, baskets);
096                                if (support >= supportThreshold) {
097                                        L_k.add(candidate);
098                                        result.add(new FrequentItemSet<T>(candidate, support));
099                                }
100                        }
101
102                        if (L_k.isEmpty()) {
103                                // We're done.
104                                break;
105                        }
106
107                        k++;
108
109                }
110
111                return result;
112        }
113
114        /** Generate candidate item sets */
115        private Set<Set<T>> apriori_gen(Set<Set<T>> L_k_1, int k) {
116                Set<Set<T>> C_k = new HashSet<Set<T>>();
117
118                List<Set<T>> asList = new ArrayList<Set<T>>(L_k_1);
119                for (int i = 0; i < asList.size(); i++) {
120                        inner: for (int j = i + 1; j < asList.size(); j++) {
121                                Set<T> union = CollectionUtils.unionSet(asList.get(i), asList.get(j));
122                                if (union.size() == k) {
123                                        for (T item : union) {
124                                                Set<T> check = new HashSet<T>(union);
125                                                check.remove(item);
126                                                if (!L_k_1.contains(check)) {
127                                                        continue inner;
128                                                }
129                                        }
130                                        C_k.add(union);
131                                }
132                        }
133                }
134
135                return C_k;
136        }
137
138        /** Returns the support of itemSet within the baskets. */
139        private static <T> double support(Set<T> itemSet, Set<Set<T>> baskets) {
140                int count = 0;
141
142                for (Set<T> basket : baskets) {
143                        if (basket.containsAll(itemSet)) {
144                                count++;
145                        }
146                }
147
148                return (double) count / baskets.size();
149        }
150
151}