机器学习部分:分区取topN(类的调用方法)

#encoding:utf-8
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from com.bjsxt.spark.wc import show, lines
# from babel.util import distinct
    
def getCurrSiteTop2Location(one):
    """Return the two most frequent locations recorded for one site.

    Args:
        one: A ``(site, locations)`` pair, where ``locations`` is an iterable
            holding one location value per record of that site.

    Returns:
        A ``(site, top)`` tuple. ``top`` is a list of at most two
        ``(location, count)`` pairs ordered by descending count; it is
        shorter when the site has fewer than two distinct locations.
    """
    # ``one`` is a pair, so it has to be indexed rather than called.
    site = one[0]
    locations = one[1]

    # Count how many times each location occurs for this site.
    locationDict = {}
    for location in locations:
        locationDict[location] = locationDict.get(location, 0) + 1

    sortedList = sorted(locationDict.items(), key=lambda kv: kv[1], reverse=True)
    # Slicing handles 0, 1 or many distinct locations without a special case.
    return site, sortedList[:2]
            
    
def getTop2Location(lines):
    """Print, for every site, the result of ``getCurrSiteTop2Location``.

    Each input line is split on tabs; field index 4 is used as the grouping
    key (the site) and field index 3 as the grouped value (the location).

    Args:
        lines: An RDD-like object of tab-separated text lines supporting
            ``map``, ``groupByKey`` and ``collect``.
    """
    def toSiteLocation(line):
        # Split once and pick the two columns that are needed.
        fields = line.split("\t")
        return fields[4], fields[3]

    groupedBySite = lines.map(toSiteLocation).groupByKey()
    # collect() brings every per-site result back to the driver for printing.
    for row in groupedBySite.map(getCurrSiteTop2Location).collect():
        print(row)
      
def getSiteInfo(one):
    """Turn one user's site records into per-site counts.

    Args:
        one: A ``(userid, sites)`` pair, where ``sites`` is an iterable with
            one site value per record of that user.

    Returns:
        A list of ``(site, (userid, count))`` tuples, one per distinct site,
        suitable for use as the result of a ``flatMap`` function.
    """
    userid = one[0]
    sites = one[1]

    # Count how many times this user appears for each site.
    dic = {}
    for site in sites:
        dic[site] = dic.get(site, 0) + 1

    # Re-key by site so records can later be grouped per site.
    return [(site, (userid, count)) for site, count in dic.items()]
        
def getCurSiteTop3User(one):
    """Return the three users with the highest counts for one site.

    Args:
        one: A ``(site, userid_counts)`` pair, where ``userid_counts`` is an
            iterable of ``(userid, count)`` tuples.

    Returns:
        A ``(site, top3)`` tuple. ``top3`` always has exactly three slots,
        ordered by descending count; unused slots hold the empty string
        ``""`` when fewer than three users are present. Entries with equal
        counts keep their input order.
    """
    site = one[0]
    userid_count_iterable = one[1]

    # A stable descending sort considers every candidate against every slot,
    # so a user that does not beat the current leader can still take 2nd/3rd.
    ranked = sorted(userid_count_iterable, key=lambda uc: uc[1], reverse=True)
    top3List = ranked[:3]

    # Keep the fixed three-slot shape, using "" as the empty-slot marker.
    top3List.extend([""] * (3 - len(top3List)))
    return site, top3List
                    
      
def getTop3User(lines):
    """Print, for every site, the result of ``getCurSiteTop3User``.

    Each input line is split on tabs; field index 2 is used as the first
    grouping key (the user id) and field index 4 as the grouped value (the
    site). ``getSiteInfo`` then expands each user's group, the records are
    grouped again by their new key, and ``getCurSiteTop3User`` reduces each
    group.

    Args:
        lines: An RDD-like object of tab-separated text lines supporting
            ``map``, ``groupByKey``, ``flatMap`` and ``collect``.
    """
    def toUserSite(line):
        # Split once and pick the two columns that are needed.
        fields = line.split("\t")
        return fields[2], fields[4]

    sitesPerUser = lines.map(toUserSite).groupByKey()
    siteUserCounts = sitesPerUser.flatMap(getSiteInfo)
    topUsersPerSite = siteUserCounts.groupByKey().map(getCurSiteTop3User)
    # collect() brings every per-site result back to the driver for printing.
    for row in topUsersPerSite.collect():
        print(row)

if __name__ == '__main__':
    # Run both reports against the text file(s) at ./spark on a local master.
    conf = SparkConf().setAppName("test").setMaster("local")
    sc = SparkContext(conf=conf)
    try:
        # Use a dedicated name so the calls below receive this RDD rather
        # than the unrelated ``lines`` object imported at the top of the file.
        log_lines = sc.textFile("./spark")
        getTop2Location(log_lines)
        getTop3User(log_lines)
    finally:
        # Always release the SparkContext, even if a job fails.
        sc.stop()
    

猜你喜欢

转载自blog.csdn.net/wyqwilliam/article/details/81660123