#encoding:utf-8
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from com.bjsxt.spark.wc import show, lines
# from babel.util import distinct
def getCurrSiteTop2Location(one):
    """Return the two most frequent locations recorded for a single site.

    Args:
        one: A ``(site, locations)`` pair, where ``locations`` is an iterable
            of hashable location values that may contain duplicates.

    Returns:
        A ``(site, top)`` tuple. ``top`` is a list of at most two
        ``(location, count)`` tuples ordered by descending count.
    """
    # Index into the pair; calling it (``one(0)``) raises TypeError on a tuple.
    site = one[0]
    locations = one[1]

    locationDict = {}
    for location in locations:
        locationDict[location] = locationDict.get(location, 0) + 1

    sortedList = sorted(locationDict.items(), key=lambda kv: kv[1], reverse=True)

    # Slicing also covers sites with fewer than two distinct locations, so
    # those entries are reported instead of being silently dropped.
    return site, sortedList[:2]
def getTop2Location(lines):
    """Print, for every site, the result of ``getCurrSiteTop2Location``.

    Args:
        lines: An RDD-like object of tab-separated text records. Field 4 is
            used as the grouping key and field 3 as the grouped value
            (presumably site and location, judging by the variable names --
            confirm against the input data).
    """
    def toSiteLocation(record):
        # Split once and pick the key (index 4) and the value (index 3).
        fields = record.split("\t")
        return fields[4], fields[3]

    grouped = lines.map(toSiteLocation).groupByKey()
    perSite = grouped.map(getCurrSiteTop2Location).collect()
    for entry in perSite:
        print(entry)
def getSiteInfo(one):
    """Count how often one user appears for each site.

    Args:
        one: A ``(userid, sites)`` pair, where ``sites`` is an iterable of
            hashable site values that may contain duplicates.

    Returns:
        A list of ``(site, (userid, count))`` tuples, one per distinct site,
        suitable as the output of a ``flatMap`` step.
    """
    userid = one[0]
    sites = one[1]

    dic = {}
    for site in sites:
        dic[site] = dic.get(site, 0) + 1

    # ``list.append`` accepts a single argument, so each record is built as
    # one tuple; the list is returned so the caller actually receives it.
    return [(site, (userid, count)) for site, count in dic.items()]
def getCurSiteTop3User(one):
    """Return the three entries with the highest count for a single site.

    Args:
        one: A ``(site, userid_counts)`` pair, where ``userid_counts`` is an
            iterable of ``(userid, count)`` tuples.

    Returns:
        A ``(site, top3List)`` tuple. ``top3List`` always has exactly three
        slots ordered by descending count; slots with no entry hold ``""``.
        Entries with equal counts keep their input order.
    """
    site = one[0]
    userid_count_iterable = one[1]

    # ``sorted`` is stable even with reverse=True, so ties stay in input
    # order, matching a strict ">" insertion into a ranked list.
    ranked = sorted(userid_count_iterable, key=lambda uc: uc[1], reverse=True)
    top3List = ranked[:3]

    # Keep the fixed three-slot shape by padding with empty-string placeholders.
    top3List.extend([""] * (3 - len(top3List)))
    return site, top3List
def getTop3User(lines):
    """Print, for every site, the result of ``getCurSiteTop3User``.

    Args:
        lines: An RDD-like object of tab-separated text records. Field 2 is
            used as the first grouping key and field 4 as its grouped value
            (presumably user id and site, judging by the variable names --
            confirm against the input data).
    """
    def toUserSite(record):
        # Split once and pick the key (index 2) and the value (index 4).
        fields = record.split("\t")
        return fields[2], fields[4]

    # Group by the first key, then re-key each group through getSiteInfo.
    perUser = lines.map(toUserSite).groupByKey()
    site_uid_count = perUser.flatMap(getSiteInfo)

    # Regroup by the new key and reduce each group with getCurSiteTop3User.
    ranked = site_uid_count.groupByKey().map(getCurSiteTop3User).collect()
    for entry in ranked:
        print(entry)
if __name__ == '__main__':
    # Run both reports against the text file at ./spark on a local master.
    conf = SparkConf().setAppName("test").setMaster("local")
    sc = SparkContext(conf=conf)
    try:
        # Bind the RDD to ``lines``, the name passed to both reports below.
        # Binding it to ``line`` would leave ``lines`` referring to the
        # unrelated object imported at the top of the file.
        lines = sc.textFile("./spark")
        getTop2Location(lines)
        getTop3User(lines)
    finally:
        # Release the Spark context even if one of the reports fails.
        sc.stop()
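# Machine learning section: per-partition top-N (how the class is invoked)
# You may also like
# Reposted from blog.csdn.net/wyqwilliam/article/details/81660123
# Today's recommendations
# Weekly ranking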