Windows setup
1. Install Anaconda or Python.
2. Configure the SPARK_HOME environment variable: D:\Develop\spark-1.6.0-bin-hadoop2.6
3. Install py4j: C:\Users\Administrator> pip install py4j
   Python (CPython) and Java talk to each other through py4j. (It can be removed again with pip uninstall py4j.)
4. Install pyspark. pip install is not recommended, because the version must match the Spark installation; copy it instead:
   copy py4j-0.9-src and pyspark from D:\Develop\spark-1.6.0-bin-hadoop2.6\python\lib
   to D:\Develop\Python\Anaconda3\Lib\site-packages
Verify: C:\Users\Administrator> python
   >>> import py4j
   >>> import pyspark   ## no error means the installation succeeded
IDEA: download the Python plugin matching your IDEA version.
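Beyond the bare imports, a minimal smoke test can confirm that a local SparkContext actually starts. This is a sketch; it assumes the copy step above is done and a working Java installation is on the PATH:

# coding:utf-8
# Minimal install check: run a tiny job on a local SparkContext.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("install-check")
sc = SparkContext(conf=conf)
print(sc.parallelize([1, 2, 3, 4]).count())  # expect 4 if the install works
sc.stop()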
Eclipse / Scala IDE: install the PyDev plugin.
Python/Spark environment: Python 2.7.9, spark-1.6.0-bin-hadoop2.6.
Installing pyspark (again: pip install is not recommended; the version must match your Spark). Copy py4j-0.9-src and pyspark from
D:\Develop\spark-1.6.0-bin-hadoop2.6\python\lib
to
D:\Develop\Python\Anaconda3\Lib\site-packages
Caution: the extracted archives may contain two nested folders with the same name; strip the outer layer so that the pyspark package itself sits directly under site-packages.
Installing PyDev: the PyCharm configuration works, but code completion does not. The bundled Scala IDE version is too old; downloading the latest from the official site and installing PyDev through the Eclipse Marketplace fails for both the old and the new plugin versions. What finally worked (found via a Bing search for [how to install pydev on eclipse scala ide], http://www.planetofbits.com/python/how-to-install-python-pydev-plugin-in-eclipse/): re-download Eclipse, copy Download\PyDev 5.2.0 into the eclipse\dropins folder, then install Scala from the Eclipse Marketplace. OK.
The Eclipse Python console prints garbage for non-ASCII output, because the console only supports GBK.
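One workaround is to re-encode stdout to match the console. This is a sketch for Python 2; the codec name "gbk" is an assumption and should match whatever encoding your console actually uses:

# coding:utf-8
# Wrap stdout so unicode text is encoded as GBK before it reaches the console.
import sys
import codecs

sys.stdout = codecs.getwriter("gbk")(sys.stdout)  # assumption: the console decodes GBK
print u"中文输出测试"  # unicode literal now prints without mojibake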
# coding:utf-8
'''
Created on 2019-10-03
@author: Administrator
PySpark word count; also shows both Python print forms
'''
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

print "hello"    # Python 2 print statement
print("world")   # parenthesized form, also valid in Python 2

def showResult(one):
    print(one)

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./words")
    words = lines.flatMap(lambda line: line.split(" "))
    pairWords = words.map(lambda word: (word, 1))
    reduceResult = pairWords.reduceByKey(lambda v1, v2: v1 + v2)
    reduceResult.foreach(lambda one: showResult(one))
words (the input file, one pair of words per line):
hello spark
hello hdfs
hello python
hello scala
hello hbase
hello storm
hello python
hello scala
hello hbase
hello storm
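For this input the reduced pairs should come out as follows (the order may vary, since foreach prints on the workers in partition order):

('hello', 10)
('spark', 1)
('hdfs', 1)
('python', 2)
('scala', 2)
('hbase', 2)
('storm', 2)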
Demo2.py

# coding:utf-8
'''
Created on 2019-10-03
@author: Administrator
'''
import sys
import random

if __name__ == '__main__':
    file = sys.argv[0]        # path of this script itself
    outputPath = sys.argv[1]  # the first real argument
    print("%s,%s" % (file, outputPath))
    print(random.randint(0, 255))  # random integer in [0, 255], both ends inclusive
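A run from the console might look like this (the argument D:\output and the random value 137 are only illustrative):

C:\Users\Administrator> python Demo2.py D:\output
Demo2.py,D:\output
137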
pvuvdata (the input file; fields are tab-separated: date, ip, uid, location, site, action):
2019-10-01	192.168.112.101	uid123214	beijing	www.taobao.com	buy
2019-10-02	192.168.112.111	uid123223	beijing	www.jingdong.com	buy
2019-10-03	192.168.112.101	uid123214	beijing	www.tencent.com	login
2019-10-04	192.168.112.101	uid123214	shanghai	www.taobao.com	buy
2019-10-01	192.168.112.101	uid123214	guangdong	www.taobao.com	logout
2019-10-01	192.168.112.101	uid123214	shanghai	www.taobao.com	view
2019-10-02	192.168.112.111	uid123223	beijing	www.jingdong.com	comment
2019-10-03	192.168.112.101	uid123214	shanghai	www.tencent.com	login
2019-10-04	192.168.112.101	uid123214	beijing	www.xiaomi.com	buy
2019-10-01	192.168.112.101	uid123214	shanghai	www.huawei.com	buy
2019-10-03	192.168.112.101	uid123214	beijing	www.tencent.com	login
2019-10-04	192.168.112.101	uid123214	shanghai	www.taobao.com	buy
2019-10-01	192.168.112.101	uid123214	guangdong	www.taobao.com	logout
2019-10-01	192.168.112.101	uid123214	beijing	www.taobao.com	view
2019-10-02	192.168.112.111	uid123223	guangdong	www.jingdong.com	comment
2019-10-03	192.168.112.101	uid123214	beijing	www.tencent.com	login
2019-10-04	192.168.112.101	uid123214	guangdong	www.xiaomi.com	buy
2019-10-01	192.168.112.101	uid123214	beijing	www.huawei.com	buy

pvuv.py

# coding:utf-8
# import sys
# print(sys.getdefaultencoding())   ## ascii
# reload(sys)
# sys.setdefaultencoding("utf-8")   ## Python 2.x only
# print(sys.getdefaultencoding())
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from com.sxt.spark.wordcount import showResult
'''
Created on 2019-10-03
@author: Administrator
'''
'''
6. PySpark PV/UV statistics
1). count PV and UV per site
2). count UV per site, excluding one region
3). top 2 most active regions for each site
4). the most popular action for each site
5). top 3 most active users for each site
'''

def pv(lines):
    pairSite = lines.map(lambda line: (line.split("\t")[4], 1))
    reduceResult = pairSite.reduceByKey(lambda v1, v2: v1 + v2)
    result = reduceResult.sortBy(lambda tp: tp[1], ascending=False)
    result.foreach(lambda one: showResult(one))

def uv(lines):
    distinct = lines.map(lambda line: line.split("\t")[1] + '_' + line.split("\t")[4]).distinct()
    reduceResult = distinct.map(lambda one: (one.split("_")[1], 1)).reduceByKey(lambda v1, v2: v1 + v2)
    result = reduceResult.sortBy(lambda tp: tp[1], ascending=False)
    result.foreach(lambda one: showResult(one))

def uvExceptBJ(lines):
    distinct = lines.filter(lambda line: line.split('\t')[3] != 'beijing') \
                    .map(lambda line: line.split("\t")[1] + '_' + line.split("\t")[4]).distinct()
    reduceResult = distinct.map(lambda one: (one.split("_")[1], 1)).reduceByKey(lambda v1, v2: v1 + v2)
    result = reduceResult.sortBy(lambda tp: tp[1], ascending=False)
    result.foreach(lambda one: showResult(one))

def getCurrentSiteTop2Location(one):
    site = one[0]
    locations = one[1]
    locationDict = {}
    for location in locations:
        if location in locationDict:
            locationDict[location] += 1
        else:
            locationDict[location] = 1
    sortedList = sorted(locationDict.items(), key=lambda kv: kv[1], reverse=True)
    resultList = []
    if len(sortedList) < 2:
        resultList = sortedList
    else:
        for i in range(2):
            resultList.append(sortedList[i])
    return site, resultList

def getTop2Location(lines):
    site_locations = lines.map(lambda line: (line.split("\t")[4], line.split("\t")[3])).groupByKey()
    result = site_locations.map(lambda one: getCurrentSiteTop2Location(one)).collect()
    for elem in result:
        print(elem)

def getSiteInfo(one):
    userid = one[0]
    sites = one[1]
    dic = {}
    for site in sites:
        if site in dic:
            dic[site] += 1
        else:
            dic[site] = 1
    resultList = []
    for site, count in dic.items():
        resultList.append((site, (userid, count)))
    return resultList

'''
The following function may have a bug; this is my version
'''
def getCurrectSiteTop3User(one):
    site = one[0]
    uid_c_tuples = one[1]
    top3List = ["", "", ""]
    for uid_count in uid_c_tuples:
        for i in range(len(top3List)):
            if top3List[i] == "":
                top3List[i] = uid_count
                break
            else:
                if uid_count[1] > top3List[i][1]:  # compare counts in the (uid, count) tuples
                    for j in range(2, i, -1):      # shift lower-ranked entries down one slot
                        top3List[j] = top3List[j-1]
                    top3List[i] = uid_count
                    break
    return site, top3List

'''
The following function may have a bug; this is the teacher's version
'''
def getCurSiteTop3User2(one):
    site = one[0]
    userid_count_Iterable = one[1]
    top3List = ["", "", ""]
    for userid_count in userid_count_Iterable:
        for i in range(0, len(top3List)):
            if top3List[i] == "":
                top3List[i] = userid_count
                break
            else:
                if userid_count[1] > top3List[i][1]:
                    for j in range(2, i, -1):
                        top3List[j] = top3List[j-1]
                    top3List[i] = userid_count
                    break
    return site, top3List

def getTop3User(lines):
    site_uid_count = lines.map(lambda line: (line.split('\t')[2], line.split("\t")[4])).groupByKey() \
                          .flatMap(lambda one: getSiteInfo(one))
    result = site_uid_count.groupByKey().map(lambda one: getCurrectSiteTop3User(one)).collect()
    for ele in result:
        print(ele)

if __name__ == '__main__':
    # conf = SparkConf().setMaster("local").setAppName("test")
    # sc = SparkContext(conf=conf)
    # lines = sc.textFile("./pvuvdata")
    # pv(lines)
    # uv(lines)
    # uvExceptBJ(lines)
    # getTop2Location(lines)
    # getTop3User(lines)
    res = getCurrectSiteTop3User(("baidu", [('A', 12), ('B', 5), ('C', 12), ('D', 1), ('E', 21), ('F', 20)]))
    print(res)
    res2 = getCurSiteTop3User2(("baidu", [('A', 12), ('B', 5), ('C', 12), ('D', 1), ('E', 21), ('F', 20)]))
    print(res2)
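Tracing the local test by hand, both calls should print the same ranking:

('baidu', [('E', 21), ('F', 20), ('A', 12)])

Note that ('C', 12) ties with ('A', 12) but does not displace it, because the comparison is strictly greater-than, so the earlier entry keeps its slot.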
PyCharm/Anaconda: later switched the Python version to 3.5.
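If the code above is run under 3.5, a few Python 2 idioms it relies on need changes. A sketch of the equivalents (not a complete migration list):

# Python 3 equivalents of the Python 2 constructs used in this section:
print("hello")   # print is a function; the statement form 'print "hello"' is a SyntaxError
a != b           # the '<>' inequality operator was removed in Python 3
# reload(sys) and sys.setdefaultencoding() no longer exist; source files default to UTF-8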