# Read the stock CSV into a DataFrame and demonstrate filter() with a
# SQL-style string condition ("the first way").
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ops').getOrCreate()
df = spark.read.csv('appe_stock.csv', inferSchema=True, header=True)

df.printSchema()
df.show()

# filter() accepts a SQL WHERE-clause string; the result is a new
# DataFrame, so it can be reused for the follow-up selections.
low_close = df.filter("Close < 500")
low_close.show()
low_close.select('Open').show()
low_close.select(['Open', 'Close']).show()
# filter — second approach (Column expressions)
# Same setup again, this time demonstrating filter() with Column
# expressions ("the second way").
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('ops').getOrCreate()
# NOTE(review): filename looks like a typo for 'appl_stock.csv' — confirm.
df = spark.read.csv('appe_stock.csv', inferSchema=True, header=True)
df.printSchema()
df.show()

# A boolean Column expression works directly as the filter condition.
df.filter(df['Close'] < 500).select('Volume').show()

# WRONG: Python's `and` cannot combine Column expressions, and the original
# line (`df['Close']<200and df['Open']>200`) was a SyntaxError that stopped
# this whole file from parsing. Kept as a comment for the teaching point:
# df.filter(df['Close'] < 200 and df['Open'] > 200).show()  # raises/invalid

# RIGHT: combine conditions with the bitwise `&` operator, with each
# comparison parenthesized (`&` binds tighter than `<` / `>`).
df.filter((df['Close'] < 200) & (df['Open'] > 200)).show()
# Condition operators
# Negation of a Column condition uses the `~` operator (not `not`).
df.filter((df['Close'] < 200) & ~(df['Open'] > 200)).show()

# Equality comparison on a Column uses `==`.
df.filter(df['Low'] == 197.16).show()
# Retrieving results
# To keep the filtered rows instead of just printing them, use collect(),
# which returns the matching rows as a list of Row objects.
matching = df.filter(df['Low'] == 197.16).collect()

# A Row converts to a plain dict, from which individual fields can be read.
first_row = matching[0].asDict()
first_row['Volume']