PySpark: reading and writing DataFrames
- 1. Connect to Spark
- 2. Create a DataFrame
- 2.1. Create from variables (explicit schema)
- 2.2. Create from variables (inferred schema)
- 2.3. Read JSON
- 2.4. Read CSV
- 2.5. Read from MySQL
- 2.6. Create from a pandas DataFrame
- 2.7. Read from columnar parquet
- 2.8. Read from Hive
- 3. Save data
- 3.1. Write to CSV
- 3.2. Save to parquet
- 3.3. Write to Hive
- 3.4. Write to HDFS
- 3.5. Write to MySQL
1. Connect to Spark

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .getOrCreate()
2. Create a DataFrame
2.1. Create from variables (explicit schema)

# Build an RDD of rows (tuples)
stringCSVRDD = spark.sparkContext.parallelize([
    (123, "Katie", 19, "brown"),
    (234, "Michael", 22, "green"),
    (345, "Simone", 23, "blue")
])

# Define the schema with StructField(name, dataType, nullable), where:
#   name:     the field name
#   dataType: the field's data type
#   nullable: whether the field may contain null values
from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

# Apply the schema to the RDD and create the DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)

# Register the DataFrame as a temporary view
# (registerTempTable is deprecated; createOrReplaceTempView is the preferred call in Spark 2.x)
swimmers.registerTempTable("swimmers")

# Count the rows of the DataFrame
swimmers.count()
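
Since the DataFrame is registered as a temporary view, it can also be queried with Spark SQL. A minimal sketch (not part of the original example):

# Query the temporary view registered above
spark.sql("select id, name, age from swimmers where age >= 20").show()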
2.2. Create from variables (inferred schema)

# Create the dataframe with automatic type inference
data = [(123, "Katie", 19, "brown"),
        (234, "Michael", 22, "green"),
        (345, "Simone", 23, "blue")]
df = spark.createDataFrame(data, schema=['id', 'name', 'age', 'eyecolor'])
df.show()
df.count()
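
To check which types Spark inferred for each column, printSchema() can be called on the result:

# Inspect the inferred schema (the integer columns become long, the strings become string)
df.printSchema()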
2.3. Read JSON

# Read the sample data shipped with Spark
file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\people.json"
df = spark.read.json(file)
df.show()
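
spark.read.json expects one JSON object per line by default. If the file is a single pretty-printed JSON document, the multiLine option (added in Spark 2.2) can be used; the path below is only a placeholder:

# Read a multi-line (pretty-printed) JSON file; placeholder path
df = spark.read.json(r"D:\data\people_multiline.json", multiLine=True)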
2.4. Read CSV

# First create a csv file
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.rand(5, 5), columns=['a', 'b', 'c', 'd', 'e']).\
    applymap(lambda x: int(x * 10))
file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\random.csv"
df.to_csv(file, index=False)

# Then read the csv file back
monthlySales = spark.read.csv(file, header=True, inferSchema=True)
monthlySales.show()
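
inferSchema makes an extra pass over the file to work out the column types. For large files an explicit schema can be passed instead; a sketch using the columns written above (csv_schema is a name introduced here):

from pyspark.sql.types import StructType, StructField, IntegerType

# All five columns written above hold small integers
csv_schema = StructType([StructField(c, IntegerType(), True)
                         for c in ['a', 'b', 'c', 'd', 'e']])
monthlySales = spark.read.csv(file, header=True, schema=csv_schema)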
2.5. Read from MySQL

# The MySQL JDBC driver jar has to be placed under spark-2.2.0-bin-hadoop2.7\jars first.
# This works in a single-machine setup but not on a cluster, where the driver must be
# available on every node. Then run:
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable='mysql.db',
    user='root',
    password='123456'
).load()
df.show()

# A SQL statement can also be passed in as a subquery
sql = "(select * from mysql.db where db='wp230') t"
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable=sql,
    user='root',
    password='123456'
).load()
df.show()
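
The JDBC source also accepts a 'driver' option to pin the driver class, plus partitionColumn/lowerBound/upperBound/numPartitions to read the table over several parallel connections. A sketch only; the partition column and bounds are illustrative and must refer to a numeric column of the actual table:

df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable='mysql.db',
    user='root',
    password='123456',
    driver='com.mysql.jdbc.Driver',   # JDBC driver class
    partitionColumn='id',             # illustrative numeric column
    lowerBound='1',
    upperBound='100000',
    numPartitions='10'
).load()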
2.6. Create from a pandas DataFrame

# If no schema is given, the pandas column names are used
df = pd.DataFrame(np.random.random((4, 4)))
spark_df = spark.createDataFrame(df, schema=['a', 'b', 'c', 'd'])
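
For large pandas DataFrames the conversion can be sped up with Apache Arrow, available from Spark 2.3 (newer than the 2.1/2.2 builds used in this post) and requiring pyarrow to be installed:

# Enable Arrow-based pandas -> Spark conversion (Spark 2.3+, needs pyarrow)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark_df = spark.createDataFrame(df)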
2.7. Read from columnar parquet

# Read the parquet file under the examples directory
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\users.parquet"
df = spark.read.parquet(file)
df.show()
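
Because parquet is columnar, selecting only some columns means only those columns are read from disk. The sample users.parquet file contains name, favorite_color and favorite_numbers columns:

# Only the two selected columns are read from the parquet file
spark.read.parquet(file).select("name", "favorite_color").show()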
2.8. Read from Hive

# If Spark is already configured to connect to Hive, Hive tables can be queried directly
spark = SparkSession \
    .builder \
    .enableHiveSupport() \
    .master("spark://172.31.100.170:7077") \
    .appName("my_first_app_name") \
    .getOrCreate()

df = spark.sql("select * from hive_tb_name")
df.show()
3. Save data
3.1. Write to CSV

# Create a dataframe
import numpy as np

df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)

# Write it out as csv
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.csv"
spark_df.write.csv(path=file, header=True, sep=",", mode='overwrite')
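
Note that Spark writes a directory of part-* files, one per partition. To end up with a single csv file (fine for small data), the DataFrame can be coalesced to one partition before writing:

# Collapse to a single partition so only one part file is written
spark_df.coalesce(1).write.csv(path=file, header=True, sep=",", mode='overwrite')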
3.2. Save to parquet

# Create a dataframe
import numpy as np

df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)

# Write it out as parquet
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.parquet"
spark_df.write.parquet(path=file, mode='overwrite')
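
partitionBy writes one sub-directory per distinct value of the given column(s), which later reads can use to skip data. A sketch only; partitioning by the near-unique random column 'a' is purely illustrative, in practice a low-cardinality column should be chosen:

# Each distinct value of 'a' becomes its own sub-directory (illustration only)
spark_df.write.partitionBy('a').parquet(path=file, mode='overwrite')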
3.3. Write to Hive

# Enable dynamic partitioning
spark.sql("set hive.exec.dynamic.partition.mode = nonstrict")
spark.sql("set hive.exec.dynamic.partition=true")

# Write into the partitioned table with ordinary hive-sql
spark.sql("""
    insert overwrite table ai.da_aipurchase_dailysale_hive
    partition (saledate)
    select productid, propertyid, processcenterid, saleplatform, sku, poa, salecount, saledate
    from szy_aipurchase_tmp_szy_dailysale distribute by saledate
""")

# Or rebuild / append to the partitioned table directly from the DataFrame
# (some Spark 2.x versions reject combining partitionBy() with insertInto();
#  if so, drop the partitionBy() call and rely on the table's partition spec)
jdbcDF.write.mode("overwrite").partitionBy("saledate").insertInto("ai.da_aipurchase_dailysale_hive")
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_hive", None, "append", partitionBy='saledate')

# Without partitions, simply save the DataFrame into a hive table
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_for_ema_predict", None, "overwrite", None)
3.4. Write to HDFS

# Write the data to hdfs in csv format
jdbcDF.write.mode("overwrite").options(header="true").csv("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.csv")
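
The same writer API works with an explicit hdfs:// URI and with other formats; the namenode address and path below are placeholders:

# Save as parquet on HDFS; host, port and path are placeholders
jdbcDF.write.mode("overwrite").parquet("hdfs://namenode:9000/user/ai/da_aipurchase_dailysale_for_ema_predict")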
3.5. Write to MySQL

# Columns are aligned automatically by name, i.e. spark_df does not have to contain
# every column of the MySQL table.

# overwrite: replace the table contents, then load the data
spark_df.write.mode("overwrite").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    batchsize="1000",
).save()

# append: add the rows to the existing data
spark_df.write.mode("append").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    batchsize="1000",
).save()
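
Two more JDBC writer options worth knowing (a sketch, not part of the original post): 'truncate' keeps the existing table definition when overwriting (TRUNCATE TABLE instead of DROP/CREATE), and 'driver' pins the JDBC driver class:

spark_df.write.mode("overwrite").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    truncate="true",                   # keep the table, truncate instead of dropping it
    driver="com.mysql.jdbc.Driver",    # JDBC driver class
).save()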