版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_14959801/article/details/77728721
hive> create table pi_cust_item_recommend2(
> cust_id string,
> item_id bigint,
> advise_level double
> )
> partitioned by(
> ymday string)
> ;
OK
Time taken: 0.204 seconds
hive> describe pi_cust_item_recommend2;
OK
cust_id string
item_id bigint
advise_level double
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.065 seconds, Fetched: 9 row(s)
hive> select * from pi_cust_item_recommend2 limit 31;
OK
110101100985 90190202 25.33671962331747 20170830
110101100985 34030326 23.906456902069216 20170830
110101100985 90020726 16.010945324507635 20170830
110101100985 90190101 15.628847628582498 20170830
110101100985 90020727 15.442605374580097 20170830
110101100985 90020214 13.276358249567217 20170830
110101100985 90160248 12.218660323168212 20170830
110101100985 90020222 11.55113640113565 20170830
110101100985 90160257 11.001857833289318 20170830
110101100985 53020411 10.808402849652841 20170830
110101100985 90160255 10.55634487493214 20170830
110101100985 43010101 10.293531517670374 20170830
110101100985 53020502 9.967549321098497 20170830
110101100985 34020101 9.52875123200268 20170830
110101100985 34030228 9.479434291239455 20170830
110101100985 37020308 9.472155882738065 20170830
110101100985 90020927 9.112810136181182 20170830
110101100985 90020934 9.112810136181182 20170830
110101100985 90021202 9.112810136181182 20170830
110101100985 33010104 9.041770412446649 20170830
110101100985 90021408 8.76375384049508 20170830
110101100985 41020211 8.753379761624661 20170830
110101100985 53020413 8.720947985984836 20170830
110101100985 32010101 8.66504292797535 20170830
110101100985 13070505 8.536145306189216 20170830
110101100985 53090115 8.490782608501519 20170830
110101100985 22240103 8.456414691761115 20170830
110101100985 53010210 8.330416055770293 20170830
110101100985 12010504 8.319379734896234 20170830
110101100985 35260104 8.090941846795186 20170830
110101101012 34030316 45.461241571600425 20170830
Time taken: 0.127 seconds, Fetched: 31 row(s)
hive> select count(distinct cust_id) from pi_cust_item_recommend2 limit 3;
Query ID = hive_20170831001737_95d891d6-319e-483e-897f-8287b5452dfd
Total jobs = 1
Launching Job 1 out of 1
Tez session was closed. Reopening...
Session re-established.
Status: Running (Executing on YARN cluster with App id application_1503306179155_0057)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 4 4 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 3 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 03/03 [==========================>>] 100% ELAPSED TIME: 8.81 s
--------------------------------------------------------------------------------
OK
33695
Time taken: 15.046 seconds, Fetched: 1 row(s)
hive> select * from pi_cust_item_recommend limit 3;
OK
Time taken: 0.13 seconds
hive> describe pi_cust_item_recommend;
OK
cust_id string
item_id bigint
advise_level decimal(10,0)
date int
cust_code varchar(30)
pack_bar varchar(30)
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.066 seconds, Fetched: 12 row(s)
hive> ALTER TABLE pi_cust_item_recommend REPLACE COLUMNS ( cust_code string,pack_bar string )
> ;
OK
Time taken: 0.087 seconds
hive> describe pi_cust_item_recommend;
OK
cust_code string
pack_bar string
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.081 seconds, Fetched: 8 row(s)
hive> select * from pi_cust_item_recommend2 limit 3;
OK
110101100985 90190202 25.33671962331747 20170830
110101100985 34030326 23.906456902069216 20170830
110101100985 90020726 16.010945324507635 20170830
Time taken: 0.308 seconds, Fetched: 3 row(s)
hive>
>
> describe pi_cust_item_recommend2;
OK
cust_id string
item_id bigint
advise_level double
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.113 seconds, Fetched: 9 row(s)
hive> select * from pi_cust_item_recommend2 limit 3;
OK
110101100985 90190202 25.33671962331747 20170830
110101100985 34030326 23.906456902069216 20170830
110101100985 90020726 16.010945324507635 20170830
Time taken: 0.129 seconds, Fetched: 3 row(s)
hive> describe pi_cust_item_recommend2
> ;
OK
cust_id string
item_id bigint
advise_level double
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.068 seconds, Fetched: 9 row(s)
hive> describe pi_cust_item_recommend;
OK
cust_code string
pack_bar string
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.059 seconds, Fetched: 8 row(s)
hive> create table pi_cust_item_recommend3(
> cust_id string,
> cust_code string)
> ;
OK
Time taken: 0.073 seconds
hive> select * from pi_cust_item_recommend3 limit 3;
OK
110105106838 110105106838
110108207746 110108207746
110228100250 110228100250
Time taken: 0.1 seconds, Fetched: 3 row(s)
hive> select count(distinct cust_id) from pi_cust_item_recommend3;
Query ID = hive_20170831042612_238eca7f-54d8-49ea-b368-fbf82a088e2b
Total jobs = 1
Launching Job 1 out of 1
Tez session was closed. Reopening...
Session re-established.
Status: Running (Executing on YARN cluster with App id application_1503306179155_0058)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 1 1 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 3 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 03/03 [==========================>>] 100% ELAPSED TIME: 5.69 s
--------------------------------------------------------------------------------
OK
33695
Time taken: 10.932 seconds, Fetched: 1 row(s)
hive> create table pi_cust_item_recommend4(
> item_id bigint,
> pack_bar string
> );
OK
Time taken: 0.063 seconds
hive> select * from pi_cust_item_recommend4 limit 3;
OK
51520615 6901028032957
90020219 4893225033276
34030316 6901028208550
Time taken: 1.01 seconds, Fetched: 3 row(s)
hive> select count(distinct item_id) from pi_cust_item_recommend4;
Query ID = hive_20170831043216_7de9a54d-53c9-443b-846c-296ce2f76584
Total jobs = 1
Launching Job 1 out of 1
Status: Running (Executing on YARN cluster with App id application_1503306179155_0058)
--------------------------------------------------------------------------------
VERTICES STATUS TOTAL COMPLETED RUNNING PENDING FAILED KILLED
--------------------------------------------------------------------------------
Map 1 .......... SUCCEEDED 3 3 0 0 0 0
Reducer 2 ...... SUCCEEDED 1 1 0 0 0 0
Reducer 3 ...... SUCCEEDED 1 1 0 0 0 0
--------------------------------------------------------------------------------
VERTICES: 03/03 [==========================>>] 100% ELAPSED TIME: 5.07 s
--------------------------------------------------------------------------------
OK
289
Time taken: 5.524 seconds, Fetched: 1 row(s)
hive> describe pi_cust_item_recommend;
OK
cust_code string
pack_bar string
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.289 seconds, Fetched: 8 row(s)
hive> describe pi_cust_item_recommend2;
OK
cust_id string
item_id bigint
advise_level double
ymday string
# Partition Information
# col_name data_type comment
ymday string
Time taken: 0.151 seconds, Fetched: 9 row(s)
hive> describe pi_cust_item_recommend3;
OK
cust_id string
cust_code string
Time taken: 0.184 seconds, Fetched: 2 row(s)
hive> describe pi_cust_item_recommend4;
OK
item_id bigint
pack_bar string
Time taken: 0.072 seconds, Fetched: 2 row(s)
[hdfs@bdddev-agent-205 bin]$ ./pyspark
Python 2.7.5 (default, Nov 6 2016, 00:28:07)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version 2.1.0
/_/
Using Python version 2.7.5 (default, Nov 6 2016 00:28:07)
SparkSession available as 'spark'.
>>> from __future__ import division
>>> from pyspark.mllib.recommendation import ALS
>>> from pyspark.sql import HiveContext
>>> from pyspark.sql import SparkSession
>>> from collections import namedtuple
>>> import math
>>> import datetime
>>> spark = SparkSession.builder.appName("bjrecommender").enableHiveSupport().getOrCreate()
>>> sc = spark.sparkContext
>>> hiveCtx = HiveContext(sc)
>>> Rating = namedtuple("Rating", ["user", "product", "rating"])
>>> tid = namedtuple('tid',['id','cust_id'])
>>> now = datetime.datetime.now()
>>> begin_date = (now-datetime.timedelta(days=150)).strftime('%Y%m%d')
>>> begin_date = (now-datetime.timedelta(days=200)).strftime('%Y%m%d')
>>> end_date = now.strftime('%Y%m%d')
>>> sql="select dense_rank() over(order by cust_id) id,cust_id,item_id,need_score+sold_score score from (select cust_id,item_id,qty_need,qty_sold,ntile(5) over(partition by cust_id order by qty_need) need_score,ntile(5) over(partition by cust_id order by qty_sold) sold_score from (select cust_id,item_id,sum(qty_need) qty_need,sum(qty_sold) qty_sold from yxpt.pi_cust_item_day where date1>=" +begin_date + " and date1<="+end_date+ " group by cust_id,item_id) a1) b1"
>>> total = hiveCtx.sql(sql)
>>> id_custid=total.rdd.map(lambda x : tid(str(x[0]),x[1])).distinct()
17/08/30 09:18:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
>>> id_custid.toDF().registerTempTable("id_cid")
>>> ratings=total.rdd.map(lambda x : Rating(str(x[0]),int(x[2]),float(x[3])))
>>> ratings.toDF().show(3)
17/08/30 09:29:31 ERROR Utils: Uncaught exception in thread stdout writer for python2.7
java.net.SocketException: Socket is closed
at java.net.Socket.shutdownOutput(Socket.java:1551)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$3.apply$mcV$sp(PythonRDD.scala:336)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$3.apply(PythonRDD.scala:336)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3$$anonfun$apply$3.apply(PythonRDD.scala:336)
at org.apache.spark.util.Utils$.tryLog(Utils.scala:1964)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:336)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1951)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)
+----+--------+------+
|user| product|rating|
+----+--------+------+
| 1|42010319| 2.0|
| 1|31010401| 2.0|
| 1|22240114| 2.0|
+----+--------+------+
only showing top 3 rows
>>>
>>> model = ALS.train(ratings, rank=15, iterations=10,seed=0,lambda_=0.001)
17/08/30 09:34:59 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
17/08/30 09:34:59 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
17/08/30 09:34:59 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
17/08/30 09:34:59 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4644:
[rdd_3272_0]
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4645:
[rdd_3273_0]
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4646:
[rdd_3272_0]
17/08/30 09:35:03 WARN Executor: 1 block locks were not released by TID = 4647:
[rdd_3273_0]
>>> all_rating=model.recommendProductsForUsers(30).map(lambda x:x[1]).collect()
>>> len(rating)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
NameError: name 'rating' is not defined
>>> len(all_rating)
33695
>>> len(all_rating[0])
30
>>> userProducts = ratings.map(lambda rating:(rating.user,rating.product))
>>> predictions = model.predictAll(userProducts).map(lambda rating:((rating.user,rating.product),rating.rating))
>>> type(predictions)
<class 'pyspark.rdd.PipelinedRDD'>
>>> predictions.toDF().show(3)
[Stage 258:=====================> (3 + 5) / 8]17/08/30 09:44:26 WARN Executor: Managed memory leak detected; size = 15977666 bytes, TID = 4787
17/08/30 09:44:27 WARN Executor: Managed memory leak detected; size = 15977666 bytes, TID = 4788
+---------------+-----------------+
| _1| _2|
+---------------+-----------------+
|[4904,37020312]|4.338272324285362|
|[4904,32010112]|2.763827789148973|
|[4904,12010504]|6.962521675730641|
+---------------+-----------------+
only showing top 3 rows
>>> ratingsAndPredictions = ratings.map(lambda rating:((int(rating.user),rating.product),rating.rating)).join(predictions)
>>> ratingsAndPredictions.toDF().show(3)
+----------------+--------------------+
| _1| _2|
+----------------+--------------------+
| [4075,53100103]|[4.0,3.1492042815...|
|[20152,13070515]|[3.0,4.0453910858...|
| [1335,34030227]|[4.0,3.9336990908...|
+----------------+--------------------+
only showing top 3 rows
>>> MSE = ratingsAndPredictions.map(lambda ((x,y),(m,n)):math.pow(m-n,2)).reduce(lambda x,y:x+y)/ratingsAndPredictions.count()
>>> print "***************" +str(math.sqrt(MSE)) + "*****************"
***************1.39966771197*****************
>>> k=[]
>>> for row in all_rating:
... k.extend(row)
...
>>> all_rating_rdd = sc.parallelize(k)
>>> all_rating_rdd.toDF().registerTempTable("all_score")
17/08/30 10:04:58 WARN TaskSetManager: Stage 415 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
>>> hiveCtx.sql("select * from all_score limit 5").show(3)
17/08/30 10:08:19 WARN TaskSetManager: Stage 416 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+-----+--------+------------------+
| user| product| rating|
+-----+--------+------------------+
|27456|51520615| 35.52432167919441|
|27456|90020219|29.566229211420946|
|27456|34030316| 28.08260143903327|
+-----+--------+------------------+
only showing top 3 rows
>>> hiveCtx.sql("select a2.cust_id,a1.product,rating," + end_date +" date "+ " from all_score a1,id_cid a2 " + "where a1.user=a2.id").show(5)
17/08/30 10:18:22 WARN TaskSetManager: Stage 417 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+------------+--------+------------------+--------+
| cust_id| product| rating| date|
+------------+--------+------------------+--------+
|110101100985|90190202| 25.33671962331747|20170830|
|110101100985|34030326|23.906456902069216|20170830|
|110101100985|90020726|16.010945324507635|20170830|
|110101100985|90190101|15.628847628582498|20170830|
|110101100985|90020727|15.442605374580097|20170830|
+------------+--------+------------------+--------+
only showing top 5 rows
>>>
>>> hiveCtx.sql("SELECT * from id_cid limit 5").show(5)
+-----+------------+
| id| cust_id|
+-----+------------+
| 4549|110105106838|
|12992|110108207746|
|30968|110228100250|
|22213|110114100048|
|19728|110113101105|
+-----+------------+
>>> hiveCtx.sql("select B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID limit 3").show(3)
+------------+
| CUST_CODE|
+------------+
|110105106838|
|110108207746|
|110228100250|
+------------+
>>> hiveCtx.sql("select B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID limit 3 ").show(3)
+------------+
| CUST_CODE|
+------------+
|110105106838|
|110108207746|
|110228100250|
+------------+
>>> hiveCtx.sql("select C.PACK_BAR "+" from yxpt.PLM_ITEM C,all_score D "+" where C.ITEM_ID=D.product limit 3").show(3)
17/08/30 15:08:24 WARN TaskSetManager: Stage 448 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+-------------+
| PACK_BAR|
+-------------+
|6901028032957|
|4893225033276|
|6901028208550|
+-------------+
>>>
>>> hiveCtx.sql("select * from all_score limit 5").show(3)
17/08/30 16:09:04 WARN TaskSetManager: Stage 465 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+-----+--------+------------------+
| user| product| rating|
+-----+--------+------------------+
|27456|51520615| 35.52432167919441|
|27456|90020219|29.566229211420946|
|27456|34030316| 28.08260143903327|
+-----+--------+------------------+
only showing top 3 rows
>>>
>>> hiveCtx.sql("select CO_CUST_T.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST CO_CUST_T,id_cid id_cid_t "+" where CO_CUST_T.CUST_ID=id_cid_t.CUST_ID limit 3").show(3)
+------------+
| CUST_CODE|
+------------+
|110105106838|
|110108207746|
|110228100250|
+------------+
>>> hiveCtx.sql("select PLM_ITEM_T.PACK_BAR "+" from yxpt.PLM_ITEM PLM_ITEM_T,all_score all_score_t "+" where PLM_ITEM_T.ITEM_ID=all_score_t.product limit 3").show(3)
17/08/30 18:32:13 WARN TaskSetManager: Stage 481 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+-------------+
| PACK_BAR|
+-------------+
|6901028032957|
|4893225033276|
|6901028208550|
+-------------+
>>> hiveCtx.sql("select * from all_score limit 5").show(3)
17/08/30 23:53:47 WARN TaskSetManager: Stage 482 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+-----+--------+------------------+
| user| product| rating|
+-----+--------+------------------+
|27456|51520615| 35.52432167919441|
|27456|90020219|29.566229211420946|
|27456|34030316| 28.08260143903327|
+-----+--------+------------------+
only showing top 3 rows
>>> hiveCtx.sql("insert into table yxpt.pi_cust_item_recommend2 PARTITION (ymday='"+end_date +"') select a2.cust_id,a1.product,rating "+ " from all_score a1,id_cid a2 " + "where a1.user=a2.id")
17/08/31 00:13:05 WARN TaskSetManager: Stage 493 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
17/08/31 00:13:12 ERROR KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] to create a keyProvider !!
DataFrame[]
>>> len(all_rating)
33695
>>> hiveCtx.sql("select CO_CUST_T.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST CO_CUST_T,id_cid id_cid_t "+" where CO_CUST_T.CUST_ID=id_cid_t.CUST_ID limit 3").show(3)
+------------+
| CUST_CODE|
+------------+
|110105106838|
|110108207746|
|110228100250|
+------------+
>>> len(all_rating)
33695
>>> hiveCtx.sql("select B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID limit 3").show(3)
+------------+
| CUST_CODE|
+------------+
|110105106838|
|110108207746|
|110228100250|
+------------+
>>> hiveCtx.sql("select A.CUST_ID,B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID limit 3").show(3)
+------------+------------+
| CUST_ID| CUST_CODE|
+------------+------------+
|110105106838|110105106838|
|110108207746|110108207746|
|110228100250|110228100250|
+------------+------------+
>>> hiveCtx.sql("select D.product,C.PACK_BAR "+" from yxpt.PLM_ITEM C,all_score D "+" where C.ITEM_ID=D.product limit 3").show(3)
17/08/31 04:02:23 WARN TaskSetManager: Stage 514 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+--------+-------------+
| product| PACK_BAR|
+--------+-------------+
|51520615|6901028032957|
|90020219|4893225033276|
|34030316|6901028208550|
+--------+-------------+
>>> hiveCtx.sql("select D.product,C.PACK_BAR "+" from yxpt.PLM_ITEM C,all_score D "+" where C.ITEM_ID=D.product limit 5").show(5)
17/08/31 04:02:50 WARN TaskSetManager: Stage 516 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.
+--------+-------------+
| product| PACK_BAR|
+--------+-------------+
|51520615|6901028032957|
|90020219|4893225033276|
|34030316|6901028208550|
|90020923|4893225020443|
|51520635|6901028085724|
+--------+-------------+
>>> hiveCtx.sql("insert into table yxpt.pi_cust_item_recommend3 select A.CUST_ID,B.NATION_CUST_CODE CUST_CODE "+" from yxpt.CO_CUST B,id_cid A "+" where B.CUST_ID=A.CUST_ID ")
DataFrame[]
>>>
>>> hiveCtx.sql("insert into table yxpt.pi_cust_item_recommend4 select D.product,C.PACK_BAR "+" from yxpt.PLM_ITEM C,all_score D "+" where C.ITEM_ID=D.product ")
17/08/31 04:31:08 WARN TaskSetManager: Stage 524 contains a task of very large size (3118 KB). The maximum recommended task size is 100 KB.