机器学习部分:PV,UV,取topN【Python】

#coding:utf-8
import sys
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from com.bjsxt.spark.wordcount import showResult

# Show the interpreter's default string encoding before and after the tweak.
print(sys.getdefaultencoding())
# Python 2 only: `setdefaultencoding` is removed from `sys` at startup, so the
# module has to be reloaded to get it back. Python 3 has no builtin `reload`
# and already defaults to UTF-8, so the hack is skipped there instead of
# crashing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
print(sys.getdefaultencoding())

# Analysis functions
def pv(lines):
    """Count records per site and hand the ranked counts to ``showResult``.

    Args:
        lines: An RDD of tab-separated text records. The field at index 4 is
            used as the grouping key (the original code names it the site).

    Each ``(site, count)`` pair is passed to ``showResult`` after sorting by
    count in descending order. Nothing is returned.
    """
    # One (site, 1) pair per input record.
    site_hits = lines.map(lambda record: (record.split("\t")[4], 1))
    # Sum the ones per site to get the page-view count.
    hits_per_site = site_hits.reduceByKey(lambda left, right: left + right)
    # Highest count first.
    ranked = hits_per_site.sortBy(lambda pair: pair[1], ascending=False)
    ranked.foreach(lambda pair: showResult(pair))

def uv(lines):
    """Count distinct (visitor, site) combinations per site, largest first.

    Args:
        lines: An RDD of tab-separated text records. Field index 1 is used as
            the visitor key (presumably an IP or user id -- confirm against
            the data file) and field index 4 as the site.

    Each ``(site, count)`` pair is passed to ``showResult`` after sorting by
    count in descending order. Nothing is returned.
    """
    def visitor_and_site(line):
        # Split once per record instead of once per field.
        fields = line.split("\t")
        return fields[1], fields[4]

    # Keep the pair as a tuple rather than joining it with "_": a joined key
    # cannot be split back reliably when either field itself contains "_".
    distinctPairs = lines.map(visitor_and_site).distinct()
    reduceResult = distinctPairs.map(lambda pair: (pair[1], 1)).reduceByKey(lambda v1, v2: v1 + v2)
    result = reduceResult.sortBy(lambda tp: tp[1], ascending=False)
    result.foreach(lambda one: showResult(one))

def uvExceptBJ(lines):
    """Count distinct (visitor, site) combinations per site, skipping 'beijing'.

    Same as the unique-visitor count, but records whose field at index 3
    equals ``'beijing'`` are dropped first.

    Args:
        lines: An RDD of tab-separated text records. Field index 1 is used as
            the visitor key (presumably an IP or user id -- confirm against
            the data file), index 3 as the location and index 4 as the site.

    Each ``(site, count)`` pair is passed to ``showResult`` after sorting by
    count in descending order. Nothing is returned.
    """
    # Split each record once and reuse the fields in the later stages.
    fields = lines.map(lambda line: line.split("\t"))
    # `!=` replaces the Python-2-only `<>` operator.
    notBeijing = fields.filter(lambda cols: cols[3] != 'beijing')
    # Keep the pair as a tuple rather than joining it with "_": a joined key
    # cannot be split back reliably when either field itself contains "_".
    distinctPairs = notBeijing.map(lambda cols: (cols[1], cols[4])).distinct()
    reduceResult = distinctPairs.map(lambda pair: (pair[1], 1)).reduceByKey(lambda v1, v2: v1 + v2)
    result = reduceResult.sortBy(lambda tp: tp[1], ascending=False)
    result.foreach(lambda one: showResult(one))

def getCurrSiteTop2Location(one):
    """Return the two most frequent locations for a single site.

    Args:
        one: A ``(site, locations)`` pair, where ``locations`` is an iterable
            of hashable location values, one entry per record.

    Returns:
        A tuple ``(site, top)``. ``top`` is a list of at most two
        ``(location, count)`` tuples ordered by descending count; it is
        shorter when fewer than two distinct locations exist.
    """
    # Imported here so the function does not rely on module-level imports.
    from collections import Counter

    site = one[0]
    locations = one[1]
    # Counter does the tallying and most_common(2) the descending top-2 cut,
    # replacing the manual dict, sort and copy loop.
    return site, Counter(locations).most_common(2)
    
def getTop2Location(lines):
    """Print, for every site, its two most frequent locations.

    Args:
        lines: An RDD of tab-separated text records. Field index 4 is used as
            the site and field index 3 as the location.

    The per-site results are collected to the driver and printed one per
    line. Nothing is returned.
    """
    def site_and_location(record):
        columns = record.split("\t")
        return columns[4], columns[3]

    # Gather every location seen for a site under that site's key.
    grouped = lines.map(site_and_location).groupByKey()
    for entry in grouped.map(getCurrSiteTop2Location).collect():
        print(entry)
if __name__=='__main__':
    # Run on a local master under the application name "test".
    conf = SparkConf().setMaster("local").setAppName("test")
    sc = SparkContext(conf=conf)
    try:
        lines = sc.textFile("./pvuvdata")
        # Other analyses that can be enabled on the same input:
        #     pv(lines)
        #     uv(lines)
        #     uvExceptBJ(lines)
        getTop2Location(lines)
    finally:
        # Always shut the context down, even if the job fails.
        sc.stop()

猜你喜欢

转载自blog.csdn.net/wyqwilliam/article/details/81624075