实战演习(十二)——基于关联规则分析用户行为频繁项集的关键页面

笔者希望在平日的工作学习中，挖掘数据的价值，找寻数据的秘密。笔者认为，数据的价值不仅体现在企业中，个人也可以体会到数据的魅力：用技术力量探索行为密码，让大数据助跑每一个人。欢迎同学们关注我的公众号，大家一起讨论数据中的那些有趣的事情。

我的公众号为：livandata

啤酒尿布的案例是引发大数据思考的一个非常重要的案例，自从这个案例出现之后，对其进行深度研究的人员不计其数。本文基于网站页面对这一案例进行借鉴引用：将用户访问的页面看成是对应的产品，通过频繁项集关联规则来分析访问某一页面的客户还会访问哪些其他的页面，进而分析用户访问页面之间的关联规则。

我们在设计页面时需要了解用户主要访问哪些关键页面，同时根据某个关键页面可以确定用户经过哪几个重要页面进入到了该关键页面，进而确定重要页面，优化客户的访问路径。

在进行页面关联规则分析的过程中，比较大的问题即为数据量的问题：用户行为数据中可以获取到用户对页面的访问信息，但是由于这一信息量较大，在应用这一案例的过程中往往需要进行分布式处理。基于使用的工具限制，此处基于python分批读取文件来模拟分布式过程。

具体代码如下：

1、Apriori算法：

#coding=utf-8
# 个人公众号：livandata
import sys
def apriori(D, minSup):
    """Find all frequent itemsets of ``D`` with the Apriori algorithm.

    Args:
        D: Sized collection of transactions (``len(D)`` is used); each
            transaction is a container of hashable, mutually orderable items.
        minSup: Minimum support, expressed as a fraction of ``len(D)``.

    Returns:
        A pair ``(all_keys, all_C)`` of parallel lists: ``all_keys[i]`` is a
        frequent itemset (a list of items) and ``all_C[i]`` is the number of
        transactions that contain it.
    """
    n = len(D)

    # First pass: count the occurrences of every single item.
    C1 = {}
    for T in D:
        for I in T:
            C1[I] = C1.get(I, 0) + 1

    # Pre-filter the 1-itemsets. The loop below recounts them per transaction
    # with getC(), so this only shrinks the first candidate list.
    keys = sorted([item] for item, count in C1.items()
                  if float(count) / n >= minSup)

    all_keys = []
    all_C = []
    # Level-wise search: count candidates, prune, then join survivors into
    # the next level's candidates until none are left.
    while keys:
        C = getC(D, keys)
        cutKeys, curC = getCutKeys(keys, C, minSup, n)
        all_keys.extend(cutKeys)
        all_C.extend(curC)
        keys = aproiri_gen(cutKeys)
    return all_keys, all_C

def getC(D, keys):
    """Count, for each candidate itemset, the transactions that contain it.

    Args:
        D: Iterable of transactions (containers supporting ``in``).
        keys: Iterable of candidate itemsets (iterables of items).

    Returns:
        A list of counts aligned with ``keys``.
    """
    counts = []
    for candidate in keys:
        # A transaction supports the candidate when it holds every item.
        hits = sum(
            1 for transaction in D
            if all(item in transaction for item in candidate)
        )
        counts.append(hits)
    return counts

def getCutKeys(keys, C, minSup, length):
    """Pruning step: keep the itemsets whose support reaches ``minSup``.

    Args:
        keys: Candidate itemsets.
        C: Support counts, indexed in step with ``keys``.
        minSup: Minimum support as a fraction of ``length``.
        length: Total number of transactions.

    Returns:
        A pair ``(kept_keys, kept_counts)`` of parallel lists.
    """
    survivors = [
        (key, C[pos])
        for pos, key in enumerate(keys)
        if float(C[pos]) / length >= minSup
    ]
    kept_keys = [key for key, _ in survivors]
    kept_counts = [count for _, count in survivors]
    return kept_keys, kept_counts

def keyInT(key, T):
    """Return True when every item of itemset ``key`` occurs in transaction ``T``."""
    # all() stops at the first missing item, like the early return it replaces.
    return all(item in T for item in key)

def aproiri_gen(keys1):
    """Join step: build the (k+1)-item candidates from frequent k-itemsets.

    Two itemsets are joined only when they have the same size and their union
    is exactly one item larger, i.e. they share k-1 items. Joining arbitrary
    pairs would also emit larger unions (two disjoint 2-itemsets give a
    4-itemset); those get generated again at later levels, which duplicates
    results and wastes counting work.

    Args:
        keys1: List of itemsets (lists of mutually orderable items).

    Returns:
        A list of sorted candidate itemsets without duplicates, in order of
        first appearance.
    """
    keys2 = []
    for i, k1 in enumerate(keys1):
        # The union is symmetric, so each unordered pair is visited once.
        for k2 in keys1[i + 1:]:
            if len(k1) != len(k2):
                continue
            key = []
            for k in list(k1) + list(k2):
                if k not in key:
                    key.append(k)
            if len(key) != len(k1) + 1:
                continue
            key.sort()
            if key not in keys2:
                keys2.append(key)
    return keys2

2、FP_tree算法：

# encoding: utf-8
# 个人公众号：livandata
from collections import defaultdict, namedtuple

# Original author information; this version is updated by lina.
__license__ = 'MIT License'
def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth. This
    function returns a generator instead of an eagerly-populated list of items.

    The `transactions` parameter can be any iterable of iterables of items,
    including a one-shot iterator (it is buffered in that case, because the
    data has to be traversed twice).
    `minimum_support` should be an integer specifying the minimum number of
    occurrences of an itemset for it to be accepted.

    Each item must be hashable (i.e., it must be valid as a member of a
    dictionary or a set).

    If `include_support` is true, yield (itemset, support) pairs instead of
    just the itemsets.
    """
    # A one-shot iterator returns itself from iter(). It would be exhausted by
    # the counting pass below and leave nothing for building the tree.
    if iter(transactions) is transactions:
        transactions = [list(transaction) for transaction in transactions]

    items = defaultdict(int)  # mapping from items to their supports

    # Count the support that individual items have.
    for transaction in transactions:
        for item in transaction:
            items[item] += 1

    # Remove infrequent items from the item support dictionary.
    items = dict((item, support) for item, support in items.items()
                 if support >= minimum_support)

    # One global ordering of the frequent items, by decreasing support.
    # Sorting each transaction by support alone would leave equal-support
    # items in per-transaction order, so [a, b] and [b, a] would take
    # different tree paths and {a, b} would be under-counted or missed.
    ordering = sorted(items, key=items.get, reverse=True)
    rank = dict((item, position) for position, item in enumerate(ordering))

    # Before a transaction is added to the tree it is stripped of infrequent
    # items and its surviving items are put in the global order.
    def clean_transaction(transaction):
        kept = [v for v in transaction if v in rank]
        kept.sort(key=rank.__getitem__)
        return kept

    master = FPTree()
    for transaction in transactions:
        master.add(clean_transaction(transaction))

    def find_with_suffix(tree, suffix):
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            if support >= minimum_support and item not in suffix:
                # New winner!
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Build a conditional tree and recursively search for frequent
                # itemsets within it.
                cond_tree = conditional_tree_from_paths(tree.prefix_paths(item))
                for s in find_with_suffix(cond_tree, found_set):
                    yield s  # pass along the good news to our caller

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset

class FPTree(object):
    """
    An FP tree.

    This object may only store transaction items that are hashable
    (i.e., all items must be valid as dictionary keys or set members).
    """

    # Head and tail of the linked list of nodes that share one item.
    Route = namedtuple('Route', 'head tail')

    def __init__(self):
        # The root node of the tree.
        self._root = FPNode(self, None, None)

        # A dictionary mapping items to the head and tail of a path of
        # "neighbors" that will hit every node containing that item.
        self._routes = {}

    @property
    def root(self):
        """The root node of the tree."""
        return self._root

    def add(self, transaction):
        """Add a transaction (an iterable of items) to the tree."""
        point = self._root

        for item in transaction:
            next_point = point.search(item)
            if next_point:
                # There is already a node in this tree for the current
                # transaction item; reuse it.
                next_point.increment()
            else:
                # Create a new point and add it as a child of the point we're
                # currently looking at.
                next_point = FPNode(self, item)
                point.add(next_point)

                # Update the route of nodes that contain this item to include
                # our new node.
                self._update_route(next_point)

            point = next_point

    def _update_route(self, point):
        """Add the given node to the route through all nodes for its item."""
        assert self is point.tree

        try:
            route = self._routes[point.item]
        except KeyError:
            # First node for this item; start a new route.
            self._routes[point.item] = self.Route(point, point)
        else:
            # Link the old tail to the new node, which becomes the tail.
            route.tail.neighbor = point
            self._routes[point.item] = self.Route(route.head, point)

    def items(self):
        """
        Generate one 2-tuple for each item represented in the tree. The first
        element of the tuple is the item itself, and the second element is a
        generator that will yield the nodes in the tree that belong to the item.
        """
        for item in self._routes.keys():
            yield (item, self.nodes(item))

    def nodes(self, item):
        """
        Generate the sequence of nodes that contain the given item.
        Nothing is generated for an item that is not in the tree.
        """

        try:
            node = self._routes[item].head
        except KeyError:
            return

        while node:
            yield node
            node = node.neighbor

    def prefix_paths(self, item):
        """Generate the prefix paths that end with the given item.

        Each path is a list of nodes from just below the root down to a node
        holding `item`.
        """

        def collect_path(node):
            path = []
            # Walk up through the parents, stopping before the root.
            while node and not node.root:
                path.append(node)
                node = node.parent
            path.reverse()
            return path

        return (collect_path(node) for node in self.nodes(item))

    def inspect(self):
        """Print the tree structure and the per-item routes (debug aid)."""
        print('Tree:')
        self.root.inspect(1)

        # A bare `print` is a no-op expression on Python 3; print('') emits
        # the blank separator line on both Python 2 and 3.
        print('')
        print('Routes:')
        for item, nodes in self.items():
            print('  %r' % item)
            for node in nodes:
                print('    %r' % node)

def conditional_tree_from_paths(paths):
    """Build a conditional FP-tree from the given prefix paths.

    Args:
        paths: Non-empty iterable of prefix paths (lists of FPNode). The last
            node of the first path supplies the condition item.

    Returns:
        A new FPTree. Nodes for the condition item copy their counts from the
        source nodes; every other node's count is the sum of the
        condition-item counts on the paths running through it.

    Raises:
        AssertionError: If `paths` yields no path.
    """
    tree = FPTree()
    condition_item = None

    # Import the nodes in the paths into the new tree. Only the counts of the
    # leaf nodes matter; the remaining counts will be reconstructed from the
    # leaf counts.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item

        point = tree.root
        for node in path:
            next_point = point.search(node.item)
            if not next_point:
                # Add a new node to the tree.
                count = node.count if node.item == condition_item else 0
                next_point = FPNode(tree, node.item, count)
                point.add(next_point)
                tree._update_route(next_point)
            point = next_point

    assert condition_item is not None

    # Calculate the counts of the non-leaf nodes.
    for path in tree.prefix_paths(condition_item):
        count = path[-1].count
        for node in reversed(path[:-1]):
            node._count += count

    return tree

class FPNode(object):
    """A node in an FP tree."""

    def __init__(self, tree, item, count=1):
        """Create a detached node.

        The root node is created with `item=None` and `count=None`.
        """
        self._tree = tree
        self._item = item
        self._count = count
        self._parent = None
        self._children = {}  # child item -> child FPNode
        self._neighbor = None

    def add(self, child):
        """Add the given FPNode `child` as a child of this node.

        Nothing happens if a child for the same item already exists.
        Raises TypeError if `child` is not an FPNode.
        """

        if not isinstance(child, FPNode):
            raise TypeError("Can only add other FPNodes as children")

        if child.item not in self._children:
            self._children[child.item] = child
            child.parent = self

    def search(self, item):
        """
        Check whether this node contains a child node for the given item.
        If so, that node is returned; otherwise, `None` is returned.
        """
        return self._children.get(item)

    def __contains__(self, item):
        return item in self._children

    @property
    def tree(self):
        """The tree in which this node appears."""
        return self._tree

    @property
    def item(self):
        """The item contained in this node."""
        return self._item

    @property
    def count(self):
        """The count associated with this node's item."""
        return self._count

    def increment(self):
        """Increment the count associated with this node's item.

        Raises ValueError for a node without a count (the root).
        """
        if self._count is None:
            raise ValueError("Root nodes have no associated count.")
        self._count += 1

    @property
    def root(self):
        """True if this node is the root of a tree; false if otherwise."""
        return self._item is None and self._count is None

    @property
    def leaf(self):
        """True if this node is a leaf in the tree; false if otherwise."""
        return len(self._children) == 0

    @property
    def parent(self):
        """The node's parent"""
        return self._parent

    @parent.setter
    def parent(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a parent.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a parent from another tree.")
        self._parent = value

    @property
    def neighbor(self):
        """
        The node's neighbor; the one with the same value that is "to the right"
        of it in the tree.
        """
        return self._neighbor

    @neighbor.setter
    def neighbor(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a neighbor.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a neighbor from another tree.")
        self._neighbor = value

    @property
    def children(self):
        """The nodes that are children of this node."""
        # dict.values() exists on Python 2 and 3; itervalues() is Python 2
        # only and raises AttributeError on Python 3.
        return tuple(self._children.values())

    def inspect(self, depth=0):
        """Print this node and its subtree, indented by `depth` levels."""
        print(('  ' * depth) + repr(self))
        for child in self.children:
            child.inspect(depth + 1)

    def __repr__(self):
        if self.root:
            return "<%s (root)>" % type(self).__name__
        return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count)


if __name__ == '__main__':
    # Command-line driver: read transactions from a CSV file (one transaction
    # per row) and print every frequent itemset with its support.
    from optparse import OptionParser
    import csv

    p = OptionParser(usage='%prog data_file')
    p.add_option('-s', '--minimum-support', dest='minsup', type='int',
        help='Minimum itemset support (default: 2)')
    p.add_option('-n', '--numeric', dest='numeric', action='store_true',
        help='Convert the values in datasets to numerals (default: false)')
    p.set_defaults(minsup=2)
    p.set_defaults(numeric=False)

    options, args = p.parse_args()
    if len(args) < 1:
        p.error('must provide the path to a CSV file to read')

    transactions = []
    with open(args[0]) as database:
        for row in csv.reader(database):
            if options.numeric:
                # int() covers arbitrary precision on Python 3 and promotes
                # to long on Python 2; long() does not exist on Python 3.
                transactions.append([int(item) for item in row])
            else:
                transactions.append(row)

    result = sorted(
        find_frequent_itemsets(transactions, options.minsup, True),
        key=lambda pair: pair[0])
    for itemset, support in result:
        print(str(itemset) + ' ' + str(support))

以上两个算法是从网上找到的，可以作为我们这次数据挖掘的基础算法。

3、data_analysis文件，主要是对数据进行一些基本的分析，将一些分类不在一个级别上的数据进行规整，将一些不容易区分页面信息的数据进行转换。

#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号：livandata

import re
def open_big_data(path):
    """Lazily yield the lines of the text file at ``path``, one at a time.

    The file is opened when iteration starts and closed when the generator is
    exhausted or closed.
    """
    with open(path) as handle:
        yield from handle

# Page names that are folded into a broader category when they match exactly:
# (page names, category).
_EXACT_PAGE_RULES = (
    (('今日步数', '免费领月卡', '健康服务'), '我的健康'),
    (('购房贷', '买家私', '装修超预算'), '房屋贷款'),
)

# Rules applied in order after the exact rules: a page name containing any of
# the fragments is replaced by the category. 'nan' marks pages to discard.
_SUBSTRING_PAGE_RULES = (
    (('消息中心',), '消息中心'),
    (('信用卡', '信用额度', '临额调整', '我的额度', '额度评估', '还款'), '信用卡'),
    (('二维码', '支付记录'), '收付款'),
    (('通讯录',), '通讯录'),
    (('http:', '结束页', '首页', '购买', '申请记录', '交易详情页'), 'nan'),
)


def _load_keywords(keyword_path):
    """Read the keyword file: one keyword per line, optional trailing comma."""
    with open(keyword_path, 'r') as f:
        entries = (line.strip().rstrip(',，').strip() for line in f)
        # Empty entries must be dropped: an empty pattern matches every page
        # name and would rewrite all of them to ''.
        return [entry for entry in entries if entry]


def _normalize_page(name, keywords):
    """Apply the exact rules, the substring rules and the keywords, in order."""
    for names, category in _EXACT_PAGE_RULES:
        if name in names:
            name = category
    for fragments, category in _SUBSTRING_PAGE_RULES:
        if any(fragment in name for fragment in fragments):
            name = category
    for keyword in keywords:
        # Keywords are used as regular expressions, as before.
        if re.search(keyword, name) is not None:
            name = keyword
    return name


def data_check(sess_data, keyword_path='pingan_pro'):
    """Normalize the page names of every session in place.

    Each page name is mapped to a coarser category by the module rule tables,
    then by the keywords read from ``keyword_path``. Within each session
    duplicates are dropped (first occurrence wins, order kept) and pages
    mapped to ``'nan'`` are removed.

    Args:
        sess_data: List of sessions, each a list of page-name strings. The
            outer list is updated in place.
        keyword_path: Path of the keyword file (one keyword per line, an
            optional trailing comma is ignored).

    Returns:
        The same ``sess_data`` list, now holding the normalized sessions.
    """
    keywords = _load_keywords(keyword_path)
    for idx, session in enumerate(sess_data):
        pages = []
        seen = set()
        for page in session:
            page = _normalize_page(page, keywords)
            if page != 'nan' and page not in seen:
                seen.add(page)
                pages.append(page)
        sess_data[idx] = pages
    return sess_data

4、refresh_data文件，主要是对分析的文件进行存储，通过文件存取的方式实现分布式处理：

#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号：livandata

import os
def write_result(items):
    """Append every ``key:value`` pair of ``items`` to ``data_result.txt``.

    One pair is written per line; the file is created when missing.
    """
    with open('data_result.txt', 'a+') as out:
        out.writelines(
            str(key) + ':' + str(items[key]) + '\n' for key in items
        )

def read_result(items):
    """Merge ``items`` with the counts already stored in ``data_result.txt``.

    Every line of the file has the form ``key:count``. The key is everything
    before the LAST colon, so keys that contain ':' themselves still parse.

    Args:
        items: Dict mapping a key string to a count (int or numeric string).

    Returns:
        A pair ``(updated_lines, new_items)``:
        ``updated_lines`` maps each original file line whose key is in
        ``items`` to its replacement line carrying the summed count;
        ``new_items`` holds the entries of ``items`` whose key is not in the
        file yet.

    Raises:
        ValueError: If a stored or supplied count is not an integer.
    """
    updated_lines = {}
    matched = set()
    with open('data_result.txt', 'r') as f:
        for line in f:
            key, sep, value = line.rstrip('\n').rpartition(':')
            if not sep or key not in items:
                continue
            total = int(value) + int(items[key])
            updated_lines[line] = key + ':' + str(total) + '\n'
            matched.add(key)

    new_items = dict((key, value) for key, value in items.items()
                     if key not in matched)
    return updated_lines, new_items

def refresh_data(items):
    """Fold the counts in ``items`` into ``data_result.txt``.

    When the file does not exist yet it is created from ``items``. Otherwise
    the file is rewritten through ``data_result2.txt``: lines whose key is in
    ``items`` get the summed count, all other lines are copied unchanged, and
    keys not yet in the file are appended at the end.

    Args:
        items: Dict mapping a key string to a count (int or numeric string).
    """
    if not os.path.exists('data_result.txt'):
        write_result(items)
        return

    updated_lines, new_items = read_result(items)
    # The temporary file is opened once, in 'w' mode, so leftovers of an
    # interrupted earlier run cannot leak into the result.
    with open('data_result.txt', 'r') as src, \
            open('data_result2.txt', 'w') as dst:
        for line in src:
            dst.write(updated_lines.get(line, line))
        for key in new_items:
            dst.write(str(key) + ':' + str(new_items[key]) + '\n')

    # remove + rename rather than a plain rename: os.rename refuses to
    # overwrite an existing file on Windows.
    os.remove('data_result.txt')
    os.rename('data_result2.txt', 'data_result.txt')

5、pro文件：即将一些页面进行转换所需要的材料库：

96搜索，
借钱，
口袋社区，
领券中心，
猜金价，
种摇钱树，
车主贷，
宅易通，

6、run文件，主要是运行文件的过程：

#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号：livandata

import pandas as pd
import data_analysis as das
import Fp_growth as fpg
import refresh_data as rfd

# Raw string: '\d' and '\s' are invalid escape sequences in a normal literal.
path = r'..\data\sub_customer.csv'
chunkSize = 10   # rows read per batch
# NOTE(review): n is the fixed denominator of the support ratio below; it
# presumably stands for the number of sessions per batch -- confirm.
n = 5
minSup = 0.6

reader = pd.read_csv(path, iterator=True, dtype=str)
loop = True
while loop:
    # Only get_chunk() signals the end of the file, so only it is guarded.
    try:
        chunk = reader.get_chunk(chunkSize).fillna('nan')
    except StopIteration:
        loop = False
        print('Iteration is stopped')
        continue

    data = chunk[chunk['page_name'] != 'nan']['page_name'].reset_index()

    # Split every cell on double quotes and drop the separator tokens.
    page_names = []
    for raw in data['page_name']:
        names = raw.split('"')
        page_names.append([j for j in names if j not in ('[', ']', ',')])

    # Normalize ALL sessions of the batch (not only the last parsed row).
    page_names = das.data_check(page_names)
    page_names = [session for session in page_names if session != []]

    frequent_itemsets = fpg.find_frequent_itemsets(page_names, minimum_support=1,
                                                   include_support=True)

    # Keep the itemsets whose support ratio reaches minSup, keyed by the
    # string form of the itemset.
    items = {}
    for itemset, support in frequent_itemsets:
        if float(support) / n >= minSup:
            items[str(itemset)] = str(support)

    rfd.refresh_data(items)

以上是算法运行的全过程，融合了伪分布式处理，主要是参考了hadoop的处理方式。

由于关联规则算法本身较为耗时，为节省时间往往需要使用多进程的方法（下文使用 multiprocessing 的进程池），本文对应地调整了代码，同时省去了代码中较为费时的部分：

#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号：livandata

import os
import time
from multiprocessing import Pool

import pandas as pd

import data_analysis as das
import Fp_growth as fpg

def data_process(chunk, idx):
    """Mine one chunk and write its frequent itemsets to a per-chunk file.

    Args:
        chunk: DataFrame with a 'page_name' column of strings; cells equal to
            'nan' are skipped. Each remaining cell is split on double quotes
            and the '[', ']' and ',' tokens are dropped (presumably the cell
            is a quoted list of page names -- confirm against the data).
        idx: Chunk number, used in the output file name.

    Writes ``data_result/data_result_<idx>.txt`` with one ``itemset:support``
    line for every frequent itemset holding at least two items.
    """
    data = chunk[chunk['page_name'] != 'nan']['page_name'].reset_index()

    page_names = []
    for raw in data['page_name']:
        names = raw.split('"')
        page_names.append([j for j in names if j not in ('[', ']', ',')])

    # Normalize ALL sessions of the chunk (not only the last parsed row).
    page_names = das.data_check(page_names)
    page_names = [session for session in page_names if session != []]

    frequent_itemsets = fpg.find_frequent_itemsets(page_names, minimum_support=1,
                                                   include_support=True)
    with open('data_result/data_result_%s.txt' % idx, 'w+') as f:
        for itemset, support in frequent_itemsets:
            # Single-page itemsets carry no association information.
            if len(itemset) >= 2:
                f.write(str(itemset) + ':' + str(support) + '\n')

def reduce_data(path):
    """Sum the per-chunk result files in ``path`` into ``data_result.txt``.

    Every regular file directly inside ``path`` is read as ``key:count``
    lines (the key is everything before the LAST colon); lines without a
    colon are skipped. The totals are written to ``data_result.txt`` in the
    current directory as ``YYYYMMDD,key,total`` lines.

    Args:
        path: Directory holding the per-chunk result files.

    Raises:
        ValueError: If a count is not an integer.
    """
    # Zero-padded date stamp: joining the raw year, month and day numbers is
    # ambiguous (2019111 could be Jan 11 or Nov 1).
    dates = time.strftime('%Y%m%d')

    result_data = {}
    for name in os.listdir(path):
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r') as f:
            for line in f.read().split('\n'):
                key, sep, count = line.rpartition(':')
                if not sep:
                    continue
                result_data[key] = result_data.get(key, 0) + int(count)

    with open('data_result.txt', 'w') as f:
        for key in result_data:
            f.write(dates + ',' + str(key) + ',' + str(result_data[key]) + '\n')

def main():
    """Mine the CSV chunk by chunk in a process pool, then merge the results.

    Each chunk is handed to ``data_process`` in a worker process, which
    writes one file into ``data_result/``; ``reduce_data`` then sums those
    files into ``data_result.txt``.
    """
    # Raw string: '\d' and '\s' are invalid escape sequences otherwise.
    path = r'..\data\sub_customer.csv'
    chunkSize = 1000000
    result_dir = 'data_result'

    # The workers write into this directory, so it must exist up front.
    # NOTE(review): files left over from an earlier run are not removed and
    # would be summed in by reduce_data -- confirm whether that is intended.
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    reader = pd.read_csv(path, iterator=True, dtype=str)
    ps = Pool(8)
    pending = []
    idx = 0
    while True:
        try:
            chunk = reader.get_chunk(chunkSize).fillna('nan')
        except StopIteration:
            print('Iteration is stopped')
            break
        pending.append(ps.apply_async(data_process, args=(chunk, idx)))
        idx += 1
    ps.close()
    ps.join()

    # apply_async swallows worker exceptions until the result is read;
    # get() re-raises them here rather than merging partial results silently.
    for job in pending:
        job.get()

    reduce_data(result_dir)


if __name__ == '__main__':
    main()

对应的代码为：https://download.csdn.net/download/livan1234/11238216

livan1234

发布了137 篇原创文章 · 获赞 93 · 访问量 16万+

私信关注

实战演习(十二)——基于关联规则分析用户行为频繁项集的关键页面

猜你喜欢