SQL
Apache Hive
Loading and using data
# Import Spark SQL
from pyspark.sql import HiveContext, Row
# Or, if you can't include the Hive requirements
from pyspark.sql import SQLContext, Row

hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT name, age FROM users")  # rows is a SchemaRDD
firstRow = rows.first()
print(firstRow.name)

"""tweets.json
{"user": {"name": "Holden", "location": "San Francisco"}, "text": "Nice day out today"}
{"user": {"name": "Matei", "location": "Berkeley"}, "text": "Even nicer here :)"}
"""
tweets = hiveCtx.jsonFile("tweets.json")
tweets.registerTempTable("tweets")  # register as a temporary table so it can be queried with SQL
results = hiveCtx.sql("SELECT user.name, text FROM tweets")
results.show()         # display the rows
results.printSchema()  # print the schema
results.count()        # number of rows
Custom schema
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])
import pyspark.sql.types as typ

fraud = sc.textFile('ccFraud.csv.gz')
# extract the header
header = fraud.first()
# parse the data, skipping the header row
fraud = fraud \
    .filter(lambda row: row != header) \
    .map(lambda row: [int(elem) for elem in row.split(',')])

# build the schema with typ
fields = [
    typ.StructField(h[1:-1], typ.IntegerType(), True)
    for h in header.split(',')
]
schema = typ.StructType(fields)

# create the DataFrame
fraud_df = spark.createDataFrame(fraud, schema)
fraud_df.printSchema()  # print the schema
Mongo
- Loading and using data
# load data from MongoDB
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *

sc = SparkContext()
ctx = SQLContext(sc)
test_collection = ctx.read.format("com.mongodb.spark.sql") \
    .options(uri="mongodb://192.168.0.1:27017", database="test_db", collection="test_collection") \
    .load()

# loading without a schema is slow (types must be inferred), so define the types ourselves
fields_list = "name age sex grade exp"
fields = [StructField(field_name, StringType(), True) for field_name in fields_list.split()]
schema = StructType(fields)
test_collection = ctx.read.schema(schema).format("com.mongodb.spark.sql") \
    .options(uri="mongodb://192.168.0.1:27017", database="test_db", collection="test_collection") \
    .load()

# register as a temporary table
test_collection.registerTempTable("Account")
sql = "select * from Account where age > '18'"
result = ctx.sql(sql)

# if result is an RDD it must first be converted to a DataFrame
fields_list = "name age sex grade exp"
fields = [StructField(field_name, StringType(), True) for field_name in fields_list.split()]
schema = StructType(fields)
df = ctx.createDataFrame(result, schema=schema)
df.write.format("com.mongodb.spark.sql").mode("overwrite") \
    .options(uri="mongodb://192.168.0.1:27017", database="test_db", collection="test_collection_out") \
    .save()

Write modes: `overwrite` drops the target collection in MongoDB first and then writes the data into it; `ignore` writes nothing (and raises no error) if the collection already exists; `errorifexists` raises an error if the collection already exists and writes normally otherwise; `append` writes the data whether or not the collection exists.
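As a hedged illustration of these modes, only the string passed to `.mode()` changes between writes; the connection options below are the same placeholder values used above:

# sketch: the save mode is the only thing that changes between these calls
opts = {"uri": "mongodb://192.168.0.1:27017",
        "database": "test_db",
        "collection": "test_collection_out"}

# append rows whether or not the collection exists
df.write.format("com.mongodb.spark.sql").mode("append").options(**opts).save()

# skip the write silently if the collection already exists
df.write.format("com.mongodb.spark.sql").mode("ignore").options(**opts).save()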
DataFrame
Construction
df = spark.createDataFrame([
    (1, 144.5, 5.9, 33, 'M'),
    (2, 167.2, 5.4, 45, 'M'),
    (3, 124.1, 5.2, 23, 'F'),
    (4, 144.5, 5.9, 33, 'M'),
    (5, 133.2, 5.7, 54, 'F'),
    (3, 124.1, 5.2, 23, 'F'),
    (5, 129.2, 5.3, 42, 'M'),
], ['id', 'weight', 'height', 'age', 'gender'])
Usage
count
print('Count of rows: {0}'.format(df.count()))
# 7
print('Count of distinct rows: {0}'.format(df.distinct().count()))
# 6
show
df.show()
df.where('id == 3').show()  # select the rows where id == 3
spark.sql("select count(1) from swimmers").show()
filter
# join the boolean outlier flags onto the data, then keep only the flagged rows
df_outliers = df.join(outliers, on='id')
df_outliers.filter('weight_o').select('id', 'weight').show()
df_outliers.filter('age_o').select('id', 'age').show()
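The snippet above assumes an `outliers` DataFrame whose boolean columns (`weight_o`, `age_o`) flag outlying rows. A minimal sketch of how such flags could be built with `approxQuantile`; the 1.5×IQR rule and the choice of columns are assumptions, not part of the original:

import pyspark.sql.functions as fn

cols = ['weight', 'age']
bounds = {}
for col in cols:
    # approximate 1st and 3rd quartiles
    q1, q3 = df.approxQuantile(col, [0.25, 0.75], 0.05)
    iqr = q3 - q1
    bounds[col] = [q1 - 1.5 * iqr, q3 + 1.5 * iqr]

# one boolean flag column per feature, e.g. weight_o, age_o
outliers = df.select('id', *[
    ((df[c] < bounds[c][0]) | (df[c] > bounds[c][1])).alias(c + '_o')
    for c in cols
])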
select
swimmers.select("name", "eyeColor").filter("eyeColor like 'b%'").show()  # filter with a SQL-like expression

# the output will no longer contain the income column
df_miss_no_income = df_miss.select([
    c for c in df_miss.columns if c != 'income'
])
df_miss_no_income.show()
dropDuplicates
# drop rows that are exact duplicates
df = df.dropDuplicates()
# drop rows that are duplicates on every column except id
df = df.dropDuplicates(subset=[
    c for c in df.columns if c != 'id'
])
dropna
# drop a row if it has fewer than 3 non-null values
df_miss_no_income.dropna(thresh=3).show()
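For reference, a hedged sketch of the other dropna arguments; the column names are assumptions following the df_miss example:

# drop a row if any of the listed columns is null (how='all' would require all of them to be null)
df_miss_no_income.dropna(how='any', subset=['weight', 'height']).show()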
agg
# aggregation; alias() names the output column
import pyspark.sql.functions as fn

df.agg(
    fn.count('id').alias('count'),
    fn.countDistinct('id').alias('distinct')
).show()

# the same pattern over many columns with a comprehension:
# the fraction of missing values in each column
df_miss.agg(*[
    (1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
    for c in df_miss.columns
]).show()
map
# count how many values are None in each row
df_miss.rdd.map(
    lambda row: (row['id'], sum([c is None for c in row]))
).collect()
withColumn
# add a new column
df.withColumn('new_id', fn.monotonically_increasing_id()).show()
fillna
# impute missing values, here with the column mean
# gender is not numeric, so it is skipped
# to_dict('records') returns a list of dicts; take the first (and only) record
means = df_miss_no_income.agg(
    *[fn.mean(c).alias(c) for c in df_miss_no_income.columns if c != 'gender']
).toPandas().to_dict('records')[0]

# fill in a value for gender as well
means['gender'] = 'missing'

# replace the nulls with the computed values
df_miss_no_income.fillna(means).show()
SchemaRDDs
- Can be operated on just like RDDs
- Can also be queried with SQL (both styles are sketched below the snippet)
## rdd --> schemardd
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable("happy_people")
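A minimal sketch of both styles on the table registered above; the `name` column of happy_people is an assumption for the example:

# SQL-style: query the registered temporary table
smiling = hiveCtx.sql("SELECT name FROM happy_people")

# RDD-style: the result also supports normal RDD transformations
names = smiling.map(lambda row: row.name)  # on the newer DataFrame API this would be smiling.rdd.map(...)
print(names.take(5))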
Types stored
| Spark SQL / HiveQL type | Python type |
| --- | --- |
| TINYINT | int/long (in range of -128 to 127) |
| SMALLINT | int/long (in range of -32768 to 32767) |
| INT | int or long |
| BIGINT | long |
| FLOAT | float |
| DOUBLE | float |
| DECIMAL | decimal.Decimal |
| STRING | string |
| BINARY | bytearray |
| BOOLEAN | bool |
| TIMESTAMP | datetime.datetime |
| ARRAY | list, tuple, or array |
| MAP | dict |
| STRUCT | Row |

UDFs (user-defined functions)
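As a hedged illustration of a few of these mappings (ARRAY/MAP/TIMESTAMP/STRUCT); the field names and values are made up for the example, and `hiveCtx` is the context created earlier:

import datetime
from pyspark.sql.types import (StructType, StructField, StringType,
                               TimestampType, ArrayType, MapType, IntegerType)

# ARRAY <-> list, MAP <-> dict, TIMESTAMP <-> datetime.datetime
schema = StructType([
    StructField("name", StringType(), True),
    StructField("created", TimestampType(), True),
    StructField("tags", ArrayType(StringType()), True),
    StructField("scores", MapType(StringType(), IntegerType()), True),
])

rows = [("Holden",
         datetime.datetime(2018, 9, 11, 12, 0, 0),
         ["spark", "sql"],
         {"math": 90})]
df = hiveCtx.createDataFrame(rows, schema)
df.printSchema()
df.first()  # returns a Row (STRUCT maps back to Row on the Python side)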
hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
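For the DataFrame API the equivalent is `pyspark.sql.functions.udf`; a minimal sketch, assuming a `tweets` DataFrame with a `text` column as above:

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# wrap a plain Python function as a column expression
str_len = udf(lambda x: len(x) if x is not None else 0, IntegerType())
tweets.select(str_len(tweets['text']).alias('text_len')).show()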
Performance Tuning Options
- spark.sql.codegen
- When true, Spark SQL compiles each query to Java bytecode on the fly. This can improve the performance of large queries, but codegen can slow down very short queries. (Setting these options is sketched after this list.)
- spark.sql.inMemoryColumnarStorage.compressed
- Automatically compresses the in-memory columnar data.
- spark.sql.inMemoryColumnarStorage.batchSize
- The batch size for columnar caching; larger values use more memory, so consider the available memory capacity.
- spark.sql.parquet.compression.codec
- The compression codec to use, one of uncompressed, snappy, gzip, or lzo.
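A minimal sketch of setting these options; the values shown are only examples, and `hiveCtx` is the HiveContext created earlier (on Spark 2.x you would use `spark.conf.set` instead):

# tune Spark SQL via configuration properties; the values here are only examples
hiveCtx.setConf("spark.sql.codegen", "false")
hiveCtx.setConf("spark.sql.inMemoryColumnarStorage.compressed", "true")
hiveCtx.setConf("spark.sql.inMemoryColumnarStorage.batchSize", "1000")
hiveCtx.setConf("spark.sql.parquet.compression.codec", "snappy")

# the same options can also be set from SQL
hiveCtx.sql("SET spark.sql.codegen=true")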