{"id":56,"date":"2013-12-12T02:59:52","date_gmt":"2013-12-11T18:59:52","guid":{"rendered":"http:\/\/www.lixiaodong.com\/?p=56"},"modified":"2014-02-03T07:29:31","modified_gmt":"2014-02-02T23:29:31","slug":"%e9%a2%91%e7%b9%81%e9%a1%b9%e9%9b%86%e6%8c%96%e6%8e%98frequent-itemsets-mining%e4%b9%8ba-priori%e7%ae%97%e6%b3%95java%e5%ae%9e%e7%8e%b0","status":"publish","type":"post","link":"https:\/\/www.lixiaodong.com\/?p=56","title":{"rendered":"\u9891\u7e41\u9879\u96c6\u6316\u6398(Frequent Itemsets mining)\u4e4bA-Priori\u7b97\u6cd5java\u5b9e\u73b0"},"content":{"rendered":"<p>Frequent Itemsets mining \u6700\u5e38\u89c1\u7684\u5f62\u5f0f\u5c31\u662f\u7ed9\u5b9amarket-basket\u5f62\u5f0f\u7684\u6570\u636e\uff08\u6bcf\u4e00\u884c\u76f8\u5f53\u4e8e\u4e00\u4e2a\u8d2d\u7269\u7bee\uff0c\u5305\u542b\u591a\u4e2a\u5546\u54c1item\uff09\uff0c\u7136\u540e\u6211\u4eec\u627e\u51fa\u5173\u8054\u5ea6\u5927\u4e8e\u67d0\u4e2a\u503c\u7684\u6240\u6709item\u96c6\u5408\u3002A-Priori\u7b97\u6cd5\u662fFrequent Itemsets mining\u91cc\u6700\u57fa\u672c\u7684\u7b97\u6cd5\u3002<\/p>\n<p>\u4e00\u3001A-Priori\u7684\u57fa\u672c\u601d\u8def<\/p>\n<p>\u7b2c\u4e00\u904d\uff08pass 1\uff09\uff0c\u626b\u63cf\u6587\u4ef6\uff0c\u7edf\u8ba1\u5355\u9879(single item)\u7684\u51fa\u73b0\u6b21\u6570\uff08\u4f7f\u75281\u4e2amap\u8fdb\u884c\u7edf\u8ba1\uff0ckey\u5c31\u662fitem\uff0cvalue\u5c31\u662f\u51fa\u73b0\u6b21\u6570\uff09\u3002\u6700\u540e\uff0c\u8fc7\u6ee4\u6389\u5c0f\u4e8e\u6700\u5c0f\u652f\u6301\u5ea6\u7684\uff0c\u5f97\u5230\u9891\u7e41\u5355\u9879\u96c6\u3002<\/p>\n<p>\u7b2c\u4e8c\u904d\uff08pass 2\uff09\uff0c\u626b\u63cf\u6587\u4ef6\uff0c\u5bf9\u4e8e\u6bcf\u4e00\u884c\uff0c\u5bf9\u4efb\u610f\u4e24\u4e2aitem\u7ec4\u5408\u5f97\u5230pair item\u3002\u5982\u679cpair item\u76842\u4e2a\u5355\u9879\u90fd\u5728\u9891\u7e41\u5355\u9879\u96c6\u91cc\uff0c\u5219\u7edf\u8ba1\u8fd9\u4e2apair item\u7684\u51fa\u73b0\u6b21\u6570\uff1b\u5426\u5219\u7565\u8fc7\u3002\u6700\u540e\u5f97\u5230\u6240\u6709pair item\u7684\u51fa\u73b0\u9891\u7387\uff0c\u8fc7\u6ee4\u6389\u5c0f\u4e8e\u6700\u5c0f\u652f\u6301\u5ea6\u7684\uff0c\u5f97\u5230\u9891\u7e412\u9879\u96c6\u3002<\/p>\n<p>\u7b2cN\u904d\uff0c\u5982\u679c\u53d1\u73b0\u9891\u7e41N-1\u9879\u96c6\u4e0d\u4e3a\u7a7a\uff0c\u5219\u8bf4\u660emining\u8fd8\u6ca1\u6709\u5b8c\u6210\uff0c\u9700\u8981\u8fdb\u884c\u7b2cn\u6b21\u626b\u63cf\u3002\u4ece\u6bcf\u884c\u4e2d\u53d6\u5f97\u4efb\u610fN\u4e2aitem\uff0c\u5982\u679c\u8fd9N\u9879\u7684\u6240\u6709N-1\u9879\u5b50\u96c6\u90fd\u5728\u9891\u7e41N-1\u9879\u96c6\u91cc\uff0c\u5219\u7edf\u8ba1\u5176\u51fa\u73b0\u6b21\u6570\uff0c\u5426\u5219\u7565\u8fc7\u3002\u6700\u540e\uff0c\u8fc7\u6ee4\u6389\u5c0f\u4e8e\u6700\u5c0f\u652f\u6301\u5ea6\u7684\uff0c\u5f97\u5230\u9891\u7e41N\u9879\u96c6\u3002<\/p>\n<p>\u76f4\u5230\u9891\u7e41\u9879\u96c6\u4e3a\u7a7a\u624d\u505c\u6b62\u5faa\u73af\u3002<\/p>\n<p>\u4e8c\u3001java\u5b9e\u73b0<\/p>\n<pre class=\"brush: java; title: ; notranslate\" title=\"\">\r\nimport java.io.BufferedReader;\r\nimport java.io.BufferedWriter;\r\nimport java.io.FileNotFoundException;\r\nimport java.io.FileOutputStream;\r\nimport java.io.FileReader;\r\nimport java.io.IOException;\r\nimport java.io.OutputStreamWriter;\r\nimport java.util.ArrayList;\r\nimport java.util.HashMap;\r\nimport java.util.HashSet;\r\nimport java.util.Iterator;\r\nimport java.util.LinkedHashMap;\r\nimport java.util.List;\r\nimport java.util.Map;\r\nimport java.util.Set;\r\nimport java.util.TreeSet;\r\n\r\npublic class Apriori {\r\n\r\n\tprivate String inputFile;\r\n\r\n\tprivate double minSupport;\r\n\r\n\tprivate BufferedWriter bw;\r\n\r\n\tprivate HashSet&lt;Set&lt;Integer&gt;&gt; frequentItems = new HashSet&lt;Set&lt;Integer&gt;&gt;(); \/\/\u9891\u7e41\u9879\u96c6\r\n\r\n\tprivate Set&lt;Integer&gt; frequentSingleItems = new HashSet&lt;Integer&gt;(); \/\/\u9891\u7e41\u5355\u9879\u96c6\r\n\r\n\tprivate int totalCount; \/\/\u9879\u76ee\u6570\u91cf\r\n\r\n\tpublic Apriori(String inputFile, double minSupport) {\r\n\t\tthis.inputFile = inputFile;\r\n\t\tthis.minSupport = minSupport;\r\n\t}\r\n\r\n\t\/**\r\n\t * \u627e\u51fa\u9891\u7e41\u4e00\u9879\u96c6\r\n\t * @return\r\n\t * @throws IOException\r\n\t *\/\r\n\tpublic Map&lt;Set&lt;Integer&gt;,Integer&gt; findF1Item() throws IOException {\r\n\t\tMap&lt;Set&lt;Integer&gt;,Integer&gt; result = new LinkedHashMap&lt;Set&lt;Integer&gt;, Integer&gt;();\r\n\t\tMap&lt;Integer, Integer&gt; map = new HashMap&lt;Integer, Integer&gt;();\r\n\t\tBufferedReader reader = new BufferedReader(new FileReader(inputFile));\r\n\t\tString line;\r\n\t\tint numberOfLine = 0;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tnumberOfLine++;\r\n\t\t\tString&#x5B;] items = line.split(&quot; &quot;);\r\n\t\t\tfor(String item : items){\r\n\t\t\t\tint intItem = Integer.valueOf(item);\r\n\t\t\t\tif (map.containsKey(intItem)) {\r\n\t\t\t\t\tmap.put(intItem, map.get(intItem) + 1);\r\n\t\t\t\t} else {\r\n\t\t\t\t\tmap.put(intItem, 1);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\ttotalCount = numberOfLine;\r\n\t\treader.close();\r\n\t\t\/\/\u4f7f\u7528TreeSet\u6309\u7167item\u7f16\u53f7\u4ece\u5c0f\u5230\u5927\u6392\u5e8f\r\n\t\tTreeSet&lt;Integer&gt; treeSet = new TreeSet&lt;Integer&gt;(map.keySet());\r\n\t\tfor (Integer item : treeSet) {\r\n\t\t\tint count = map.get(item);\r\n\t\t\t\/\/\u8fc7\u6ee4\u6389\u51fa\u73b0\u9891\u7387\u5c0f\u4e8e\u6700\u5c0f\u652f\u6301\u5ea6\u7684item\r\n\t\t\tif (count &gt;= minSupport*totalCount) {\r\n\t\t\t\tSet&lt;Integer&gt; f1Set = new TreeSet&lt;Integer&gt;();\r\n\t\t\t\tf1Set.add(item);\r\n\t\t\t\tresult.put(f1Set, count);\r\n\t\t\t\tfrequentItems.add(f1Set);\r\n\t\t\t\tfrequentSingleItems.add(item);\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn result;\r\n\t}\r\n\r\n\tpublic Map&lt;Set&lt;Integer&gt;,Integer&gt; generateNextPass(int k) throws Exception{\r\n\t\tMap&lt;Set&lt;Integer&gt;, Integer&gt; map = new HashMap&lt;Set&lt;Integer&gt;, Integer&gt;();\r\n\t\tBufferedReader reader = new BufferedReader(new FileReader(inputFile));\r\n\t\tString line;\r\n\t\twhile ((line = reader.readLine()) != null) {\r\n\t\t\tString&#x5B;] items = line.split(&quot; &quot;);\r\n\t\t\tList&lt;Set&lt;Integer&gt;&gt; list = generateSubset(items, k);\r\n\t\t\tfor(Set&lt;Integer&gt; set :list){\r\n\t\t\t\tif(map.containsKey(set)){\r\n\t\t\t\t\tmap.put(set, map.get(set)+1);\r\n\t\t\t\t}\r\n\t\t\t\telse{\r\n\t\t\t\t\tmap.put(set, 1);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treader.close();\r\n\t\tfrequentItems = new HashSet&lt;Set&lt;Integer&gt;&gt;();\r\n\t\tfrequentSingleItems = new HashSet&lt;Integer&gt;();\r\n\t\tIterator&lt;Set&lt;Integer&gt;&gt; ite = map.keySet().iterator();\r\n\t\twhile(ite.hasNext()){\r\n\t\t\tSet&lt;Integer&gt; key = ite.next();\r\n\t\t\tint value = map.get(key);\r\n\t\t\tif(value&lt;totalCount*minSupport){\r\n\t\t\t\tite.remove();\r\n\t\t\t}\r\n\t\t\telse{\r\n\t\t\t\tfrequentItems.add(key);\r\n\t\t\t\tfor(int item: key){\r\n\t\t\t\t\tfrequentSingleItems.add(item);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn map;\r\n\t}\r\n\r\n\tprivate List&lt;Set&lt;Integer&gt;&gt; generateSubset(String&#x5B;] array, int k) throws Exception{\r\n\t\tList&lt;Set&lt;Integer&gt;&gt; result = new ArrayList&lt;Set&lt;Integer&gt;&gt;();\r\n\t\tint&#x5B;] intArray = new int&#x5B;array.length];\r\n\t\tfor(int i=0; i&lt;array.length;i++){\r\n\t\t\tintArray&#x5B;i] = Integer.parseInt(array&#x5B;i]);\r\n\t\t}\r\n\t\tint&#x5B;] newArray = filterItems(intArray);\r\n\t\tList&lt;Set&lt;Integer&gt;&gt; list = generateSubSets(newArray,k);\r\n\t\tfor(Set&lt;Integer&gt; set: list){\r\n\t\t\t\/\/\u5c06set\u53d8\u6210\u6570\u7ec4\r\n\t\t\tint smallArray&#x5B;] = new int&#x5B;set.size()];\r\n\t\t\tint i=0;\r\n\t\t\tIterator&lt;Integer&gt; ite = set.iterator();\r\n\t\t\twhile(ite.hasNext()){\r\n\t\t\t\tsmallArray&#x5B;i]= ite.next();\r\n\t\t\t\ti++;\r\n\t\t\t}\r\n\t\t\t\/\/\u627e\u51faset\u7684\u6240\u6709k-1\u6b21subItemSet\r\n\t\t\tList&lt;Set&lt;Integer&gt;&gt; smallList = generateSubSets(smallArray, k-1);\r\n\t\t\t\/\/\u5982\u679c\u67091\u4e2asubItemSet\u4e0d\u662f\u9891\u7e41\u7684\uff0c\u5219\u5224\u65adset\u4e0d\u662f\u9891\u7e41\u7684\r\n\t\t\tboolean flag = true;\r\n\t\t\tfor(Set&lt;Integer&gt; item: smallList){\r\n\t\t\t\tif(!frequentItems.contains(item)){\r\n\t\t\t\t\tflag = false;\r\n\t\t\t\t\tbreak;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif(flag){\r\n\t\t\t\tresult.add(set);\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn result;\r\n\t}\r\n\r\n\tpublic void printFrequentItems(Map&lt;Set&lt;Integer&gt;,Integer&gt; itemSets, int i) throws FileNotFoundException, IOException {\r\n\t\tif(bw == null){\r\n\t\t\tbw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(&quot;output.txt&quot;)));\r\n\t\t}\r\n\t\tStringBuffer sb = new StringBuffer();\r\n\t\tfor (Set&lt;Integer&gt; set : itemSets.keySet()) {\r\n\t\t\tsb.append(&quot;&#x5B;&quot;);\r\n\t\t\tfor (Integer str : set) {\r\n\t\t\t\tsb.append(str + &quot; &quot;);\r\n\t\t\t}\r\n\t\t\tsb.append(&quot;], count:&quot;);\r\n\t\t\t\t\tsb.append(itemSets.get(set));\r\n\t\t\t\t\tsb.append(&quot;\\n&quot;);\r\n\t\t}\r\n\t\tbw.write(sb.toString());\r\n\t\tbw.flush();\r\n\t}\r\n\r\n\t\/**\r\n\t* \u8fc7\u6ee4\u6389\u4e0d\u5728\u524d\u4e00\u6b21\u9891\u7e41\u5355\u9879\u96c6\u91cc\u7684item\r\n\t* @param array\r\n\t* @return\r\n\t*\/\r\n\tprivate int&#x5B;] filterItems(int&#x5B;] array){\r\n\t\tList&lt;Integer&gt; list = new ArrayList&lt;Integer&gt;();\r\n\t\tfor(int i=0;i&lt;array.length;i++){\r\n\t\t\tif(frequentSingleItems.contains(array&#x5B;i])){\r\n\t\t\t\tlist.add(array&#x5B;i]);\r\n\t\t\t}\r\n\t\t}\r\n\t\tint&#x5B;] newArray = new int&#x5B;list.size()];\r\n\t\tfor(int i=0;i&lt;newArray.length;i++){\r\n\t\t\tnewArray&#x5B;i] = list.get(i);\r\n\t\t}\r\n\t\treturn newArray;\r\n\t}\r\n\r\n\tpublic void closeOutputWriter() throws IOException{\r\n\t\tif(bw == null){\r\n\t\t\tbw.close();\r\n\t\t}\r\n\t}\r\n\r\n\tprivate List&lt;Set&lt;Integer&gt;&gt; generateSubSets(int&#x5B;] array, int k){\r\n\t\tList&lt;Set&lt;Integer&gt;&gt; list = new ArrayList&lt;Set&lt;Integer&gt;&gt;();\r\n\t\tif(array.length&lt;k){\r\n\t\t\treturn list;\r\n\t\t}\r\n\t\t\/\/ \u521d\u59cb\u5316\u79fb\u4f4d\u6cd5\u9700\u8981\u7684\u6570\u7ec4\r\n\t\tbyte&#x5B;] bits = new byte&#x5B;array.length];\r\n\t\tfor (int i = 0; i &lt; bits.length; i++) {\r\n\t\t\tbits&#x5B;i] = i &lt; k ? (byte) 1 : (byte) 0;\r\n\t\t}\r\n\t\tboolean find = false;\r\n\t\tdo {\r\n\t\t\t\/\/ \u627e\u523010\uff0c\u6362\u621001\r\n\t\t\tSet&lt;Integer&gt; set = getCombination(array, bits);\r\n\t\t\tif(set!=null &amp;&amp; set.size()!=0){\r\n\t\t\t\tlist.add(set);\r\n\t\t\t}\r\n\t\t\tfind = false;\r\n\t\t\tfor (int i = 0; i &lt; array.length - 1; i++) {\r\n\t\t\t\tif (bits&#x5B;i] == 1 &amp;&amp; bits&#x5B;i+1] == 0) {\r\n\t\t\t\t\tfind = true;\r\n\t\t\t\t\tbits&#x5B;i] = 0;\r\n\t\t\t\t\tbits&#x5B;i+1] = 1;\r\n\t\t\t\t\tif(bits&#x5B;0] == 0){\r\n\t\t\t\t\t\tfor (int p=0, q=0; p &lt; i; p++){\r\n\t\t\t\t\t\t\tif(bits&#x5B;p] == 1){\r\n\t\t\t\t\t\t\t\tbyte temp = bits&#x5B;p];\r\n\t\t\t\t\t\t\t\tbits&#x5B;p] = bits&#x5B;q];\r\n\t\t\t\t\t\t\t\tbits&#x5B;q] = temp;\r\n\t\t\t\t\t\t\t\tq++;\r\n\t\t\t\t\t\t\t}\r\n\t\t\t\t\t\t}\r\n\t\t\t\t\t}\r\n\t\t\t\t\tbreak;\r\n\t\t\t\t}\r\n\t\t\t}\r\n\r\n\t\t} while (find);\r\n\t\treturn list;\r\n\t}\r\n\r\n\tprivate Set&lt;Integer&gt; getCombination(int&#x5B;] array, byte&#x5B;] bits) {\r\n\t\tSet&lt;Integer&gt; set = new TreeSet&lt;Integer&gt;();\r\n\t\tfor (int i = 0; i &lt; bits.length; i++) {\r\n\t\t\tif (bits&#x5B;i] == (byte) 1) {\r\n\t\t\t\tset.add(array&#x5B;i]);\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn set;\r\n\t}\r\n\r\n}\r\n\r\n\r\n<\/pre>\n<pre class=\"brush: java; title: ; notranslate\" title=\"\">\r\n\r\nimport java.util.Map;\r\nimport java.util.Set;\r\n\r\npublic class Main {\r\n\r\n\tpublic static void main(String&#x5B;] args) {\r\n\t\tSystem.out.println(&quot;program starts\u2026&quot;);\r\n\t\tlong startTime = System.currentTimeMillis();\r\n\t\tString inputFile = &quot;src\/test.txt&quot;;\r\n\t\tdouble minSupport = 0.02;\r\n\t\tApriori apriori = new Apriori(inputFile, minSupport);\r\n\t\ttry {\r\n\t\t\tSystem.out.println(&quot;pass 1&quot;);\r\n\t\t\tMap&lt;Set&lt;Integer&gt;, Integer&gt; f1Set = apriori.findF1Item();\r\n\t\t\tapriori.printFrequentItems(f1Set, 1);\r\n\t\t\tMap&lt;Set&lt;Integer&gt;, Integer&gt; result = f1Set;\r\n\t\t\tint i = 2;\r\n\t\t\tdo {\r\n\t\t\t\tSystem.out.println(&quot;pass &quot; + i);\r\n\t\t\t\tresult = apriori.generateNextPass(i);\r\n\t\t\t\tapriori.printFrequentItems(result, i);\r\n\t\t\t\ti++;\r\n\t\t\t} while (result.size() != 0);\r\n\t\t\tapriori.closeOutputWriter();\r\n\t\t} catch (Exception e) {\r\n\t\t\te.printStackTrace();\r\n\t\t}\r\n\t\tlong endTime = System.currentTimeMillis();\r\n\t\tSystem.out.println(&quot;execution time:&quot; + (endTime - startTime) + &quot;ms&quot;);\r\n\t}\r\n}\r\n\r\n<\/pre>\n<p>\u8f93\u5165\u6587\u4ef6\uff1atest.txt<\/p>\n<p>1 2 5<br \/>\n2 4<br \/>\n2 3<br \/>\n1 2 4<br \/>\n1 3<br \/>\n2 3<br \/>\n1 3<br \/>\n1 2 3 5<br \/>\n1 2 3<\/p>\n<p>\u8f93\u51fa\u6587\u4ef6\uff1aoutput.txt<\/p>\n<p>[1 ], count:6<br \/>\n[2 ], count:7<br \/>\n[3 ], count:6<br \/>\n[4 ], count:2<br \/>\n[5 ], count:2<br \/>\n[1 2 ], count:4<br \/>\n[1 3 ], count:4<br \/>\n[1 4 ], count:1<br \/>\n[2 3 ], count:4<br \/>\n[2 4 ], count:2<br \/>\n[1 5 ], count:2<br \/>\n[2 5 ], count:2<br \/>\n[3 5 ], count:1<br \/>\n[1 2 3 ], count:2<br \/>\n[1 2 4 ], count:1<br \/>\n[1 2 5 ], count:2<br \/>\n[1 3 5 ], count:1<br \/>\n[2 3 5 ], count:1<br \/>\n[1 2 3 5 ], count:1<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Frequent Itemsets mining \u6700\u5e38\u89c1\u7684\u5f62\u5f0f\u5c31\u662f\u7ed9\u5b9amarke &hellip; <a href=\"https:\/\/www.lixiaodong.com\/?p=56\">\u7ee7\u7eed\u9605\u8bfb <span class=\"meta-nav\">&rarr;<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[8],"tags":[20,19],"class_list":["post-56","post","type-post","status-publish","format-standard","hentry","category-8","tag-a-priori","tag-19"],"_links":{"self":[{"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=\/wp\/v2\/posts\/56","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=56"}],"version-history":[{"count":8,"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=\/wp\/v2\/posts\/56\/revisions"}],"predecessor-version":[{"id":133,"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=\/wp\/v2\/posts\/56\/revisions\/133"}],"wp:attachment":[{"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=56"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=56"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.lixiaodong.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=56"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}