"""Demo: column summary statistics with ``pyspark.mllib.stat.Statistics``.

Builds small RDDs of dense vectors and computes per-column statistics
with ``Statistics.colStats``. Works both inside the pyspark shell (where
``sc`` already exists) and as a standalone script.
"""
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

if __name__ == '__main__':
    # The original relied on the shell-provided global ``sc`` and raised
    # NameError when run standalone. getOrCreate() returns the already
    # running context if there is one, otherwise it creates a new one.
    sc = SparkContext.getOrCreate()

    for i in range(4):
        print(i)

    test_li = [1, 2, 3, 45, 6, 7]
    test_li1 = [1, 3, 4, 56, 26, 17]

    # parallelize() is given a *list of vectors*: every list element
    # becomes one RDD record, i.e. one row for colStats.
    rdd2 = sc.parallelize([Vectors.dense(test_li), Vectors.dense(test_li1)])
    rdd1 = sc.parallelize([Vectors.dense(test_li)])

    # Column-wise statistics over the two rows of rdd2; print the means.
    result = Statistics.colStats(rdd2)
    print(result.mean())

    rdd = sc.parallelize([
        Vectors.dense([2, 0, 0, -2]),
        Vectors.dense([4, 5, 0, 3]),
        Vectors.dense([6, 7, 0, 8]),
    ])
    cStats = Statistics.colStats(rdd)
注意点:以上代码运行于 pyspark 的 Python shell 环境(shell 会预先创建 SparkContext 实例 sc);若要作为独立脚本运行,需先自行创建 SparkContext 实例 sc。
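1. 使用 rdd = sc.parallelize([...]) 构建 RDD 时,传入的参数必须是由向量组成的列表(即用 [] 包裹,例如 [Vectors.dense(test_li)]);如果直接传入单个向量,向量中的每个元素(numpy.float64)会被当作一条记录,调用 colStats 时就会抛出如下异常: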
File "/opt/modules/spark-2.2.0/python/lib/pyspark.zip/pyspark/mllib/linalg/__init__.py", line 83, in _convert_to_vector
raise TypeError("Cannot convert type %s into Vector" % type(l))
TypeError: Cannot convert type <class 'numpy.float64'> into Vector
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)