Python Spark ML

Windows setup
1.
Install Anaconda or Python
Set the Spark environment variables
2.
Configure SPARK_HOME
D:\Develop\spark-1.6.0-bin-hadoop2.6\spark-1.6.0-bin-hadoop2.6

3.
C:\Users\Administrator> pip install py4j

Python (CPython) interacts with Java through py4j

pip uninstall py4j

4.
Install pyspark for the matching Spark version (pip install is not recommended); copy it instead:
from D:\Develop\spark-1.6.0-bin-hadoop2.6\python\lib
copy py4j-0.9-src and pyspark to
D:\Develop\Python\Anaconda3\lib\site-packages

C:\Users\Administrator> python
>>> import py4j
>>> import pyspark  ## no error here means the installation succeeded


Download the matching version of the IntelliJ IDEA Python plugin

 

Install the PyDev plug-in in the Eclipse Scala IDE

 

 

Python Spark

Environment
Python 2.7.9
Spark spark-1.6.0-bin-hadoop2.6

Install pyspark for the matching Spark version (pip install is not recommended) by copying; note the extracted archive may have two nested folder layers — strip the outer layer so the package root is pyspark
from D:\Develop\spark-1.6.0-bin-hadoop2.6\python\lib
copy py4j-0.9-src and pyspark to
D:\Develop\Python\Anaconda3\lib\site-packages

 

 

Install PyDev
The PyCharm configuration works, but auto-completion does not.

The bundled Scala IDE version is too low; after downloading the latest from the official site, installing either the old or the new PyDev through the Eclipse Marketplace fails with errors.

Finally: per a Bing search for [how to install pydev on eclipse scala ide]
http://www.planetofbits.com/python/how-to-install-python-pydev-plugin-in-eclipse/
re-download Eclipse, copy the downloaded PyDev 5.2.0 into the Eclipse dropins folder, then install Scala from the Eclipse Marketplace. OK.

 

The Eclipse Python console shows garbled output (it only supports GBK)

 

 

 

 

# coding:utf-8
'''
Created on 2019年10月3日

@author: Administrator

python wordcount

python print
'''
from pyspark.conf import SparkConf
from pyspark.context import SparkContext


# Use print() calls so the lines run on both Python 2 and Python 3;
# the statement form `print "hello"` is a SyntaxError under Python 3.
print("hello")
print("world")

def showResult(one):
    """Print a single result element (e.g. a (word, count) tuple) to stdout."""
    print(one)
    
if __name__ == '__main__':
    # Local-mode Spark word count over the text file ./words.
    conf = SparkConf().setMaster("local").setAppName("test")
    sc = SparkContext(conf=conf)
    # Split each line into words, pair each word with 1, then sum per word.
    pairWords = sc.textFile("./words") \
                  .flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1))
    reduceResult = pairWords.reduceByKey(lambda v1, v2: v1 + v2)
    # Print every (word, count) pair on the executors.
    reduceResult.foreach(lambda one: showResult(one))
hello spark
hello hdfs
hello python
hello scala
hello hbase
hello storm
hello python
hello scala
hello hbase
hello storm

  

 

## Demo2.py
# coding:utf-8
'''
Created on 2019-10-03

@author: Administrator

Echo this script's path and its first command-line argument, then print
a random integer. (Reconstructed: the published snippet was garbled by
machine translation — `from os SYS Import`, `IF`, `Print`, etc.)
'''
import sys
import random

if __name__ == '__main__':
    file = sys.argv[0]        ## path of this script
    outputPath = sys.argv[1]  ## first command-line argument
    print("%s,%s" % (file, outputPath))  ## echo the actual arguments
    print(random.randint(0, 255))        ## inclusive of both 0 and 255
    



pvuvdata 

2019-10-01 192.168.112.101 uid123214 Beijing www.taobao.com Buy	  
2019-10-02 192.168.112.111 uid123223 Beijing www.jingdong.com Buy	  
2019-10-03 192.168.112.101 uid123214 Beijing www.tencent.com Login	  
2019-10-04 192.168.112.101 uid123214 shanghai www.taobao.com buy	 
2019-10-01	192.168.112.101	uid123214	guangdong	www.taobao.com	logout	 
2019-10-01	192.168.112.101	uid123214	shanghai	www.taobao.com	view	 
2019-10-02	192.168.112.111	uid123223	beijing	www.jingdong.com	comment	 
2019-10-03	192.168.112.101	uid123214	shanghai	www.tencent.com	login	 
2019-10-04	192.168.112.101	uid123214	beijing	www.xiaomi.com	buy	 
2019-10-01	192.168.112.101	uid123214	shanghai	www.huawei.com	buy	 
2019-10-03	192.168.112.101	uid123214	beijing	www.tencent.com	login	 
2019-10-04	192.168.112.101	uid123214	shanghai	www.taobao.com	buy	 
2019-10-01	192.168.112.101	uid123214	guangdong	www.taobao.com	logout	 
2019-10-01	192.168.112.101	uid123214	beijing	www.taobao.com	view	 
2019-10-02	192.168.112.111	uid123223	guangdong	www.jingdong.com	comment	 
2019-10-03	192.168.112.101	uid123214	beijing	www.tencent.com	login	 
2019-10-04	192.168.112.101	uid123214	guangdong	www.xiaomi.com	buy	 
2019-10-01	192.168.112.101	uid123214	beijing	www.huawei.com	buy	 


pvuv.py
# coding:utf-8
# import sys
# print(sys.getdefaultencoding())  ## ascii
# reload(sys)
# sys.setdefaultencoding("utf-8")  ## Python 2.x only
# print(sys.getdefaultencoding())
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from cProfile import label
from com.sxt.spark.wordcount import showResult

'''
Created on 2019-10-03

@author: Administrator
'''

'''
6. PySpark PV/UV statistics, partial code
1). count PV and UV
2). count UV excluding one region
3). top-2 most active regions for each site
4). most popular action on each site
5). top-3 most active users on each site
'''

## method 
def pv(lines):
    """Page views per site: one hit per log line, printed highest-count first.

    `lines` is an RDD of tab-separated log records; field 4 is the site.
    """
    site_hits = lines.map(lambda rec: (rec.split("\t")[4], 1))
    totals = site_hits.reduceByKey(lambda a, b: a + b)
    ordered = totals.sortBy(lambda kv: kv[1], ascending=False)
    ordered.foreach(lambda kv: showResult(kv))

def uv(lines):
    """Unique visitors per site: distinct (ip, site) pairs counted per site.

    `lines` is an RDD of tab-separated log records; field 1 is the client IP
    and field 4 is the site. Results are printed highest-count first.
    """
    ip_site = lines.map(lambda rec: rec.split("\t")[1] + '_' + rec.split("\t")[4]).distinct()
    totals = ip_site.map(lambda pair: (pair.split("_")[1], 1)) \
                    .reduceByKey(lambda a, b: a + b)
    ordered = totals.sortBy(lambda kv: kv[1], ascending=False)
    ordered.foreach(lambda kv: showResult(kv))

def uvExceptBJ(lines):
    """Unique visitors per site, excluding records whose region is 'beijing'.

    Fix: the original used the Python 2-only `<>` inequality operator, which
    is a SyntaxError on Python 3; replaced with `!=` (valid on both).
    `lines` is an RDD of tab-separated log records; field 1 is the client IP,
    field 3 the region, field 4 the site. Results print highest-count first.
    """
    distinct = lines.filter(lambda line: line.split('\t')[3] != 'beijing') \
                    .map(lambda line: line.split("\t")[1] + '_' + line.split("\t")[4]) \
                    .distinct()
    reduceResult = distinct.map(lambda pair: (pair.split("_")[1], 1)) \
                           .reduceByKey(lambda v1, v2: v1 + v2)
    result = reduceResult.sortBy(lambda tp: tp[1], ascending=False)
    result.foreach(lambda one: showResult(one))

def getCurrentSiteTop2Location(one):
    """Given (site, iterable-of-locations), return (site, top-2 locations).

    Counts occurrences of each location and keeps the two most frequent as
    (location, count) pairs; ties keep first-seen order (the sort is stable).
    Fewer than two distinct locations yields a shorter list.
    """
    site, locations = one[0], one[1]

    counts = {}
    for loc in locations:
        counts[loc] = counts.get(loc, 0) + 1

    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    # Slicing handles both the "fewer than 2" and the general case.
    return site, ranked[:2]

def getTop2Location(lines):
    """For each site, print its top-2 most active regions.

    Fix: the parameter was named `line` but the body read the module-level
    global `lines`, so the function only worked by accident when a global of
    that name existed; it now uses its own argument.

    `lines` is an RDD of tab-separated log records; field 4 is the site and
    field 3 the region. Groups regions by site and ranks them via
    getCurrentSiteTop2Location, printing each (site, top2) on the driver.
    """
    site_locations = lines.map(lambda rec: (rec.split("\t")[4], rec.split("\t")[3])).groupByKey()
    result = site_locations.map(lambda one: getCurrentSiteTop2Location(one)).collect()
    for elem in result:
        print(elem)


def getSiteInfo(one):
    """Given (userid, iterable-of-sites), return [(site, (userid, count)), ...].

    Counts how many times the user visited each site and re-keys the result
    by site so a later groupByKey can collect all users per site.
    """
    userid, sites = one[0], one[1]

    visit_counts = {}
    for site in sites:
        visit_counts[site] = visit_counts.get(site, 0) + 1

    return [(site, (userid, count)) for site, count in visit_counts.items()]

'''
The following function felt wrong to me — this is my own version
'''
def getCurrectSiteTop3User(one):
    """Given (site, iterable of (userid, count)), return (site, top-3 users).

    Maintains a fixed-size leaderboard of the three (userid, count) pairs
    with the highest counts; unfilled slots remain "". Ties keep the earlier
    arrival ahead (a later equal count does not displace it).

    Fixes over the published version (which its author flagged as wrong):
    - lines were garbled by machine translation (`Range`, `BREAK`, `Site`);
    - the inner loop broke out unconditionally after comparing slot i, so a
      candidate that did not beat slot 0 never reached later (possibly empty)
      slots and was silently dropped. A candidate is now inserted at the
      first slot that is empty or holds a smaller count.
    """
    site = one[0]
    uid_count_tuples = one[1]

    top3List = ["", "", ""]
    for uid_count in uid_count_tuples:
        for i in range(len(top3List)):
            # Insert when the slot is free or the candidate's count wins.
            if top3List[i] == "" or uid_count[1] > top3List[i][1]:
                # Shift lower-ranked entries down to make room at slot i.
                for j in range(len(top3List) - 1, i, -1):
                    top3List[j] = top3List[j - 1]
                top3List[i] = uid_count
                break
    return site, top3List
  

'''
My version above felt wrong, so the following is the one the teacher wrote
'''
def getCurSiteTop3User2(one):
    """Instructor's variant of the per-site top-3 leaderboard.

    Given (site, iterable of (userid, count)), return (site, [top3]) where
    unfilled slots remain "". Ties keep the earlier arrival ahead.

    Fixes: the published header lines were garbled (`DEF`, `One`,
    `for userid_count userid_count_Iterable in:`), and the inner loop broke
    out unconditionally after comparing slot i, dropping any candidate that
    did not beat slot 0 even while later slots were still empty. A candidate
    is now placed at the first slot that is empty or holds a smaller count.
    """
    site = one[0]
    userid_count_Iterable = one[1]
    top3List = ["", "", ""]
    for userid_count in userid_count_Iterable:
        for i in range(0, len(top3List)):
            # Insert at the first free slot or the first beaten entry.
            if top3List[i] == "" or userid_count[1] > top3List[i][1]:
                # Shift lower-ranked entries down before overwriting slot i.
                for j in range(len(top3List) - 1, i, -1):
                    top3List[j] = top3List[j - 1]
                top3List[i] = userid_count
                break
    return site, top3List
        
def getTop3User(lines):
    """For each site, print its top-3 most active users.

    `lines` is an RDD of tab-separated log records; field 2 is the userid and
    field 4 the site. Pipeline: group sites per user, expand to per-site
    (userid, count) pairs via getSiteInfo, regroup by site, then rank with
    getCurrectSiteTop3User and print each result on the driver.
    """
    uid_site = lines.map(lambda rec: (rec.split('\t')[2], rec.split("\t")[4]))
    site_uid_count = uid_site.groupByKey().flatMap(lambda grouped: getSiteInfo(grouped))
    ranked = site_uid_count.groupByKey().map(lambda grouped: getCurrectSiteTop3User(grouped))
    for ele in ranked.collect():
        print(ele)
        
if __name__ == '__main__':
    # Spark pipeline kept commented out for local experimentation:
#     conf = SparkConf().setMaster("local").setAppName("test")
#     sc = SparkContext(conf=conf)
#     lines = sc.textFile("./pvuvdata")
# #     pv(lines)
# #     uv(lines)
# #     uvExceptBJ(lines)
# #     getTop2Location(lines)
#
#     getTop3User(lines)
    # Quick local check of both top-3 implementations on the same sample.
    res = getCurrectSiteTop3User(("baidu", [('A', 12), ('B', 5), ('C', 12), ('D', 1), ('E', 21), ('F', 20)]))
    print(res)
    res2 = getCurSiteTop3User2(("baidu", [('A', 12), ('B', 5), ('C', 12), ('D', 1), ('E', 21), ('F', 20)]))
    # Fix: the original printed `res` a second time and never showed `res2`.
    print(res2)
    
    

    
    
    
    
    
    
    
    
    
    
    

  

Switching PyCharm's Anaconda interpreter to Python 3.5

  

 

 

 

 

  

  

 

  

  

 

Guess you like

Origin www.cnblogs.com/xhzd/p/11621172.html