关联项的求解问题非常常见,最典型的例子如“牛奶面包”问题。然而常常需要通过对具体数据的分析来获悉一些不是很明显的关联对。例如我们熟知的“啤酒尿布”关联。如果不是事先听说过这两者之间微妙的关联,我想很多人是很难想像到其中的关联的。相信还有许多微妙的关联关系只有通过对具体数据的分析才能被发现。
求解关联pair或者更大的关联集合的最朴素的想法就是列举并统计所有可能的pair的出现频数,判断其是否足够大。但是组合结果通常过大,而使得计算量过大。可以发现#{K-子集合}>...>#{triple}>#{pair},其中K>N/2,N表示所有物品的个数。
“Mining of massive dataset”中提及的“单调原理”可以避免上述朴素想法中大量的不必要的组合:
If a set I of items is frequent, then so is every subset of I.
这个命题:子集I是频繁子集=>其中元素形成的的组合也是频繁的。其逆否命题:某组合x不是频繁的=>不会有任何频繁集中会包含x。
进而可以迭代地求解{pair} -> {triple} ...这个过程就是Apriori算法。如下图:
Construct和Filter过程是关键的两步。
1) Construct即从{i-itemset}构建{i+1 itemset}。伪代码如下:
for itemset i in {i-itemset} temp <- i for itemset j in {i-itemset}.{after i} if #{{i}-{j}}==2 # i and j has n-1 items the same, only one difference for_return.add(temp.append(diff)) # diff is in j but not i
2) Filter即统计1)中创建的每个新的itemset出现的频数是否足够多。
1)中可以进一步对上一步的itemset进行trim:书中提到,如果{i-itemset}中出现的item至少在i+1个set中出现才能作为求解{i+1-itemset}的candidate。
例如:假设L2={1,2},{2,3},{3,4},{4,5}。1和5只出现了一次,不可能出现在L3中,因为L3中出现的每个item再进行2元组合肯定都会出现在L2中,这决定了L3中每个元素在L2中都至少出现2次。
这个trim会减少Filter步骤的判断,但是一般而言#{pair}以及之后出现的candidate的数量都比较少,这个trim可以不做。
代码如下(测试数据见附件):
package com.apriori; import java.io.*; import java.util.*; /** * */ public class Apriori { public static void main(String[] args) throws Exception { new Apriori(args); } /** the list of current itemsets */ private List<int[]> itemsets; /** the name of the transcation file */ private String transaFile; /** number of different items in the dataset */ private int numItems; /** total number of transactions in transaFile */ private int numTransactions; /** minimum support for a frequent itemset in percentage, e.g. 0.8 */ private double minSup; /** by default, Apriori is used with the command line interface */ /** * * generates the apriori itemsets from a file * * * * @param args * * configuration parameters: args[0] is a filename, args[1] the * * min support (e.g. 0.8 for 80%) */ public Apriori(String[] args) throws Exception { configure(args); go(); } /** starts the algorithm after configuration */ private void go() throws Exception { // start timer long start = System.currentTimeMillis(); // first we generate the candidates of size 1 createItemsetsOfSize1(); int itemsetNumber = 1; // the current itemset being looked at int nbFrequentSets = 0; while (itemsets.size() > 0) { calculateFrequentItemsets(); if (itemsets.size() != 0) { nbFrequentSets += itemsets.size(); log("Found " + itemsets.size() + " frequent itemsets of size " + itemsetNumber + " (with support " + (minSup * 100) + "%)"); createNewItemsetsFromPreviousOnes(); } itemsetNumber++; } // display the execution time long end = System.currentTimeMillis(); log("Execution time is: " + ((double) (end - start) / 1000) + " seconds."); log("Found " + nbFrequentSets + " frequents sets for support " + (minSup * 100) + "% (absolute " + Math.round(numTransactions * minSup) + ")"); log("Done"); } /** triggers actions if a frequent item set has been found */ private void foundFrequentItemSet(int[] itemset, int support) { System.out.println(Arrays.toString(itemset) + " (" + ((support / (double) numTransactions)) + " " + support + ")"); } /** outputs a message in Sys.err if not used as library */ private void log(String message) { System.out.println("log\t" + message); } /** computes numItems, numTransactions, and sets minSup */ private void configure(String[] args) throws Exception { // setting transafile if (args.length != 0) transaFile = args[0]; else transaFile = "chess.dat"; // default // setting minsupport if (args.length >= 2) minSup = (Double.valueOf(args[1]).doubleValue()); else minSup = .8;// by default if (minSup > 1 || minSup < 0) throw new Exception("minSup: bad value"); // going thourgh the file to compute numItems and numTransactions numItems = 0; numTransactions = 0; BufferedReader data_in = new BufferedReader(new FileReader(transaFile)); while (data_in.ready()) { String line = data_in.readLine(); if (line.matches("\\s*")) continue; // be friendly with empty lines numTransactions++; StringTokenizer t = new StringTokenizer(line, " "); while (t.hasMoreTokens()) { int x = Integer.parseInt(t.nextToken()); // log(x); if (x + 1 > numItems) numItems = x + 1; } } outputConfig(); } /** * * outputs the current configuration */ private void outputConfig() { // output config info to the user log("Input configuration: " + numItems + " items, " + numTransactions + " transactions, "); log("minsup = " + minSup + "%"); } /** * * puts in itemsets all sets of size 1, i.e. all possibles items of the * * datasets */ private void createItemsetsOfSize1() { itemsets = new ArrayList<int[]>(); for (int i = 0; i < numItems; i++) { int[] cand = { i }; itemsets.add(cand); } } /** * if m is the size of the current itemsets, generate all possible itemsets * of size n+1 from pairs of current itemsets replaces the itemsets of * itemsets by the new ones */ private void createNewItemsetsFromPreviousOnes() { // by construction, all existing itemsets have the same size int currentSizeOfItemsets = itemsets.get(0).length; log("Creating itemsets of size " + (currentSizeOfItemsets + 1) + " based on " + itemsets.size() + " itemsets of size " + currentSizeOfItemsets); HashMap<String, int[]> tempCandidates = new HashMap<String, int[]>(); // temporary // candidates compare each pair of itemsets of size n-1 for (int i = 0; i < itemsets.size(); i++) { for (int j = i + 1; j < itemsets.size(); j++) { int[] X = itemsets.get(i); int[] Y = itemsets.get(j); assert (X.length == Y.length); // make a string of the first n-2 tokens of the strings int[] newCand = new int[currentSizeOfItemsets + 1]; for (int s = 0; s < newCand.length - 1; s++) { newCand[s] = X[s]; } int ndifferent = 0; // then we find the missing value for (int s1 = 0; s1 < Y.length; s1++) { boolean found = false; // is Y[s1] in X? for (int s2 = 0; s2 < X.length; s2++) { if (X[s2] == Y[s1]) { found = true; break; } } if (!found) { // Y[s1] is not in X ndifferent++; // we put the missing value at the end of newCand newCand[newCand.length - 1] = Y[s1]; } } // we have to find at least 1 different, otherwise it means that // we have two times the same set in the existing candidates assert (ndifferent > 0); if (ndifferent == 1) { // HashMap does not have the correct "equals" for int[] :-( // I have to create the hash myself using a String :-( // I use Arrays.toString to reuse equals and hashcode of // String Arrays.sort(newCand); tempCandidates.put(Arrays.toString(newCand), newCand); } } } // set the new itemsets itemsets = new ArrayList<int[]>(tempCandidates.values()); log("Created " + itemsets.size() + " unique itemsets of size " + (currentSizeOfItemsets + 1)); } /** put "true" in trans[i] if the integer i is in line */ private void line2booleanArray(String line, boolean[] trans) { Arrays.fill(trans, false); StringTokenizer stFile = new StringTokenizer(line, " "); // read a line while (stFile.hasMoreTokens()) { int parsedVal = Integer.parseInt(stFile.nextToken()); trans[parsedVal] = true; // if it is not a 0, assign the value to } } /** * * passes through the data to measure the frequency of sets in * * {@link itemsets}, then filters thoses who are under the minimum support * * (minSup) */ private void calculateFrequentItemsets() throws Exception { log("Passing through the data to compute the frequency of " + itemsets.size() + " itemsets of size " + "!!! "+itemsets.get(0).length); List<int[]> frequentCandidates = new ArrayList<int[]>(); // the frequent boolean match; // whether the transaction has all the items in an // itemset int count[] = new int[itemsets.size()]; // the number of successful // matches, initialized by zeros load the transaction file BufferedReader data_in = new BufferedReader(new InputStreamReader( new FileInputStream(transaFile))); boolean[] trans = new boolean[numItems]; // for each transaction for (int i = 0; i < numTransactions; i++) { // boolean[] trans = extractEncoding1(data_in.readLine()); String line = data_in.readLine(); line2booleanArray(line, trans); // check each candidate for (int c = 0; c < itemsets.size(); c++) { match = true; // reset match to false // tokenize the candidate so that we know what items need to be // present for a match int[] cand = itemsets.get(c); // check each item in the itemset to see if it is present in the // transaction for (int xx : cand) { if (trans[xx] == false) { match = false; break; } } if (match) { // if at this point it is a match, increase the // count count[c]++; } } } data_in.close(); for (int i = 0; i < itemsets.size(); i++) { // if the count% is larger than the minSup%, add to the candidate to // the frequent candidates if ((count[i] / (double) (numTransactions)) >= minSup) { foundFrequentItemSet(itemsets.get(i), count[i]); frequentCandidates.add(itemsets.get(i)); } } // new candidates are only the frequent candidates itemsets = frequentCandidates; } }