PySpark series: reading and writing DataFrames

Reading and writing DataFrames with PySpark

  • 1. Connect to Spark
  • 2. Create a DataFrame
    • 2.1. From variables (explicit schema)
    • 2.2. From variables (inferred schema)
    • 2.3. Read JSON
    • 2.4. Read CSV
    • 2.5. Read from MySQL
    • 2.6. From a pandas DataFrame
    • 2.7. Read columnar Parquet
    • 2.8. Read from Hive
  • 3. Save data
    • 3.1. Write to CSV
    • 3.2. Write to Parquet
    • 3.3. Write to Hive
    • 3.4. Write to HDFS
    • 3.5. Write to MySQL

1. Connect to Spark

    from pyspark.sql import SparkSession

    spark = SparkSession \
        .builder \
        .appName('my_first_app_name') \
        .getOrCreate()
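
In practice the builder usually also sets a master URL and any extra Spark configuration. A minimal sketch (the master URL and config value below are illustrative placeholders, not from the original post):

    from pyspark.sql import SparkSession

    # Run locally on all cores and set one Spark SQL option
    spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("my_first_app_name") \
        .config("spark.sql.shuffle.partitions", "8") \
        .getOrCreate()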

2. Create a DataFrame

2.1. Create from variables (explicit schema)

    # Build the sample rows as an RDD of tuples
    stringCSVRDD = spark.sparkContext.parallelize([
        (123, "Katie", 19, "brown"),
        (234, "Michael", 22, "green"),
        (345, "Simone", 23, "blue")
    ])

    # Define the schema with StructField(name, dataType, nullable), where:
    #   name:     the field name
    #   dataType: the field's data type
    #   nullable: whether the field may contain null values
    from pyspark.sql.types import StructType, StructField, LongType, StringType

    schema = StructType([
        StructField("id", LongType(), True),
        StructField("name", StringType(), True),
        StructField("age", LongType(), True),
        StructField("eyeColor", StringType(), True)
    ])

    # Apply the schema to the RDD and create the DataFrame
    swimmers = spark.createDataFrame(stringCSVRDD, schema)

    # Register the DataFrame as a temporary view
    swimmers.registerTempTable("swimmers")

    # Count the rows in the DataFrame
    swimmers.count()
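
Once the view is registered, it can be queried with Spark SQL; as a quick check (this query is illustrative, not from the original post):

    spark.sql("select name, age from swimmers where age >= 20").show()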

2.2. Create from variables (inferred schema)

    # Create the DataFrame using automatic type inference
    data = [(123, "Katie", 19, "brown"),
            (234, "Michael", 22, "green"),
            (345, "Simone", 23, "blue")]
    df = spark.createDataFrame(data, schema=['id', 'name', 'age', 'eyeColor'])
    df.show()
    df.count()
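
An equivalent way to get named columns without a separate column list is to build the rows as Row objects, which PySpark also type-infers. A minimal sketch (not from the original post; the resulting column order may differ from the order written here):

    from pyspark.sql import Row

    rows = [Row(id=123, name="Katie", age=19, eyeColor="brown"),
            Row(id=234, name="Michael", age=22, eyeColor="green")]
    df = spark.createDataFrame(rows)
    df.printSchema()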

2.3. Read JSON

    # Read the sample data shipped with Spark
    file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\people.json"
    df = spark.read.json(file)
    df.show()
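
Schema inference on JSON costs an extra pass over the data; when the structure is known, a schema can be supplied explicitly. A sketch using the field names from the people.json sample:

    from pyspark.sql.types import StructType, StructField, StringType, LongType

    people_schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", LongType(), True)
    ])
    df = spark.read.json(file, schema=people_schema)
    df.printSchema()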

2.4. Read CSV

    # First create a CSV file with pandas
    import pandas as pd
    import numpy as np

    df = pd.DataFrame(np.random.rand(5, 5), columns=['a', 'b', 'c', 'd', 'e']).\
        applymap(lambda x: int(x * 10))
    file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\random.csv"
    df.to_csv(file, index=False)

    # Then read the CSV file with Spark
    monthlySales = spark.read.csv(file, header=True, inferSchema=True)
    monthlySales.show()
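
inferSchema likewise triggers an extra pass over the file; for larger CSVs the schema can be passed explicitly, together with options such as the separator. A sketch reusing the random.csv file created above:

    from pyspark.sql.types import StructType, StructField, IntegerType

    csv_schema = StructType([StructField(c, IntegerType(), True)
                             for c in ['a', 'b', 'c', 'd', 'e']])
    monthlySales = spark.read.csv(file, header=True, schema=csv_schema, sep=",")
    monthlySales.printSchema()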

2.5. Read from MySQL

    # The MySQL connector jar must be placed under spark-2.2.0-bin-hadoop2.7\jars
    # (this works on a single machine but not on a cluster); restart Spark and run:
    df = spark.read.format('jdbc').options(
        url='jdbc:mysql://127.0.0.1',
        dbtable='mysql.db',
        user='root',
        password='123456'
    ).load()
    df.show()

    # A SQL query can also be passed in place of a table name
    sql = "(select * from mysql.db where db='wp230') t"
    df = spark.read.format('jdbc').options(
        url='jdbc:mysql://127.0.0.1',
        dbtable=sql,
        user='root',
        password='123456'
    ).load()
    df.show()
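
On a cluster, instead of copying the jar into every installation, the connector can be shipped at submit time and the JDBC driver class named explicitly. A sketch (the jar path and port are placeholders):

    # spark-submit --jars /path/to/mysql-connector-java.jar my_app.py
    df = spark.read.format('jdbc').options(
        url='jdbc:mysql://127.0.0.1:3306',
        driver='com.mysql.jdbc.Driver',   # driver class of the MySQL connector
        dbtable='mysql.db',
        user='root',
        password='123456'
    ).load()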

2.6. Create from a pandas DataFrame

    # If no schema is given, the pandas column names are used
    df = pd.DataFrame(np.random.random((4, 4)))
    spark_df = spark.createDataFrame(df, schema=['a', 'b', 'c', 'd'])
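
The conversion also works in the other direction: toPandas() collects a Spark DataFrame back into a pandas DataFrame on the driver, which is only sensible for small results:

    pdf = spark_df.toPandas()
    print(pdf.head())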

2.7. Read columnar Parquet

    # Read the Parquet file from the examples directory
    file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\users.parquet"
    df = spark.read.parquet(file)
    df.show()
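
Because Parquet is columnar, selecting only the required columns avoids reading the rest of the file. For the users.parquet sample this might look like (column names assumed from that sample file, not stated in the original post):

    df = spark.read.parquet(file).select("name", "favorite_color")
    df.show()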

2.8. Read from Hive

    # If Spark is already configured to connect to Hive, Hive tables can be queried directly
    spark = SparkSession \
        .builder \
        .enableHiveSupport() \
        .master("spark://172.31.100.170:7077") \
        .appName("my_first_app_name") \
        .getOrCreate()

    df = spark.sql("select * from hive_tb_name")
    df.show()
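
For a whole table, spark.table is a shorthand for the SELECT above:

    df = spark.table("hive_tb_name")
    df.show()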

3. Save data

3.1. Write to CSV

    # Create a DataFrame
    import numpy as np

    df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
    spark_df = spark.createDataFrame(df)

    # Write it out as CSV
    file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.csv"
    spark_df.write.csv(path=file, header=True, sep=",", mode='overwrite')
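
Note that Spark writes the output as a directory of part files rather than a single CSV file; coalescing to one partition before writing produces a single part file, which is only appropriate for small data:

    spark_df.coalesce(1).write.csv(path=file, header=True, sep=",", mode='overwrite')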

3.2. Write to Parquet

    # Create a DataFrame
    import numpy as np

    df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
    spark_df = spark.createDataFrame(df)

    # Write it out as Parquet
    file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.parquet"
    spark_df.write.parquet(path=file, mode='overwrite')
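
Parquet output can also be partitioned by a column, creating one subdirectory per value. An illustrative sketch (not from the original post) that partitions the swimmers DataFrame from section 2.1 by eye color:

    swimmers.write.partitionBy("eyeColor").parquet(path=file, mode='overwrite')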

3.3. Write to Hive

    # Enable dynamic partitioning
    spark.sql("set hive.exec.dynamic.partition.mode = nonstrict")
    spark.sql("set hive.exec.dynamic.partition=true")

    # Write into the partitioned table with ordinary Hive SQL
    spark.sql("""
        insert overwrite table ai.da_aipurchase_dailysale_hive
        partition (saledate)
        select productid, propertyid, processcenterid, saleplatform, sku, poa, salecount, saledate
        from szy_aipurchase_tmp_szy_dailysale distribute by saledate
        """)

    # Or write through the DataFrame API: insertInto reuses the partitioning
    # already defined on the target table, while saveAsTable rebuilds or appends to it
    jdbcDF.write.mode("overwrite").insertInto("ai.da_aipurchase_dailysale_hive")
    jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_hive", None, "append", partitionBy='saledate')

    # Without partitioning, simply save the DataFrame into a Hive table
    jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_for_ema_predict", None, "overwrite", None)
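
The INSERT statement above assumes the source table szy_aipurchase_tmp_szy_dailysale is visible to Spark SQL; when the data starts out in a DataFrame, it can be exposed as a temporary view first, for example:

    jdbcDF.createOrReplaceTempView("szy_aipurchase_tmp_szy_dailysale")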

3.4. Write to HDFS

    # Write the data to HDFS in CSV format
    jdbcDF.write.mode("overwrite").options(header="true").csv("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.csv")
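
Reading the data back from HDFS works the same way as reading any other CSV path:

    df = spark.read.csv("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.csv", header=True)
    df.show()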

3.5. Write to MySQL

    # Columns are matched to the MySQL table automatically, i.e. spark_df does not
    # have to contain every column of the target table.
    # "overwrite" empties the table before loading the data
    spark_df.write.mode("overwrite").format("jdbc").options(
        url='jdbc:mysql://127.0.0.1',
        user='root',
        password='123456',
        dbtable="test.test",
        batchsize="1000",
    ).save()

    # "append" adds the new rows to the existing data
    spark_df.write.mode("append").format("jdbc").options(
        url='jdbc:mysql://127.0.0.1',
        user='root',
        password='123456',
        dbtable="test.test",
        batchsize="1000",
    ).save()
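
As with reading from MySQL, the JDBC driver class can be named explicitly, and with overwrite mode the truncate option keeps the existing table definition instead of dropping and recreating it. A minimal sketch using standard Spark JDBC options:

    spark_df.write.mode("overwrite").format("jdbc").options(
        url='jdbc:mysql://127.0.0.1',
        driver='com.mysql.jdbc.Driver',
        dbtable="test.test",
        user='root',
        password='123456',
        truncate="true",     # truncate instead of drop-and-recreate on overwrite
        batchsize="1000",
    ).save()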

Reprinted from blog.csdn.net/u013129109/article/details/80928525