PySpark series -- collection operations

Collection functions

1. Creating a map

# Creates a new map column.
from pyspark.sql.functions import create_map

df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
df.show()

df.select(create_map([df.name, df.age]).alias("map")).show()
# +-------------------+
# |                map|
# +-------------------+
# |Map(John Doe -> 21)|
# +-------------------+
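
Values can be read back out of a map column by key, e.g. with Column.getItem. A minimal sketch reusing the df above (the alias "map" is just a column name chosen here):

from pyspark.sql.functions import col

# Look up the value stored under the key "John Doe".
df.select(create_map(df.name, df.age).alias("map")) \
  .select(col("map").getItem("John Doe").alias("age")).show()
# +---+
# |age|
# +---+
# | 21|
# +---+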

2. Creating an array

# Creates a new array column.
from pyspark.sql.functions import array

df.select(array('age', 'age').alias("arr")).show()
# +--------+
# |     arr|
# +--------+
# |[21, 21]|
# +--------+

3. Testing element membership

Roughly equivalent to pandas' isin (pandas has no notin; a "not in" test is the negation -- see the sketch after the example below).

from pyspark.sql.functions import array_contains

df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])

df.select(array_contains(df.data, "a")).show()
# +-----------------------+
# |array_contains(data, a)|
# +-----------------------+
# |                   true|
# |                  false|
# +-----------------------+
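
array_contains covers the isin case; for "not in" there is no separate function, so negate the boolean column with ~. A minimal sketch reusing the same df:

# Keep only rows whose array does NOT contain "a".
df.filter(~array_contains(df.data, "a")).show()
# +----+
# |data|
# +----+
# |  []|
# +----+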

4. Flattening data (explode)

"Flattening" is a name I made up; roughly, if a column's value is compound data such as an array, each element is given its own row.
Returns a new row for each element in the given array or map.

from pyspark.sql import Row
from pyspark.sql.functions import explode

eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
eDF.show()
# +---+---------+-----------+
# |  a|  intlist|   mapfield|
# +---+---------+-----------+
# |  1|[1, 2, 3]|Map(a -> b)|
# +---+---------+-----------+

eDF.select(explode('intlist').alias("anInt")).show()
# +-----+
# |anInt|
# +-----+
# |    1|
# |    2|
# |    3|
# +-----+

eDF.select(explode('mapfield').alias("key", "value")).show()
# +---+-----+
# |key|value|
# +---+-----+
# |  a|    b|
# +---+-----+

5. posexplode

# Returns a new row for each element with position in the given array or map.
from pyspark.sql import Row
from pyspark.sql.functions import posexplode

eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
eDF.show()
# +---+---------+-----------+
# |  a|  intlist|   mapfield|
# +---+---------+-----------+
# |  1|[1, 2, 3]|Map(a -> b)|
# +---+---------+-----------+

eDF.select(posexplode('intlist')).show()
# +---+---+
# |pos|col|
# +---+---+
# |  0|  1|
# |  1|  2|
# |  2|  3|
# +---+---+

6. JSON operations

6.1. get_json_object
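
get_json_object extracts a value from a JSON string column via a JSONPath expression, returning null when the path is missing. A sketch adapted from the official docstring example:

from pyspark.sql.functions import get_json_object

data = [("1", '{"f1": "value1", "f2": "value2"}'),
        ("2", '{"f1": "value12"}')]
df = spark.createDataFrame(data, ("key", "jstring"))

df.select(df.key,
          get_json_object(df.jstring, '$.f1').alias("f1"),
          get_json_object(df.jstring, '$.f2').alias("f2")).show()
# +---+-------+------+
# |key|     f1|    f2|
# +---+-------+------+
# |  1| value1|value2|
# |  2|value12|  null|
# +---+-------+------+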

6.2. json_tuple
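
json_tuple extracts several top-level fields at once, producing one column per requested field (named c0, c1, ... by default). A sketch using the same df as in 6.1:

from pyspark.sql.functions import json_tuple

df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).show()
# +---+-------+------+
# |key|     c0|    c1|
# +---+-------+------+
# |  1| value1|value2|
# |  2|value12|  null|
# +---+-------+------+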

6.3. from_json
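
from_json parses a JSON string column into a struct (or map/array) according to an explicit schema; unparseable records become null. A minimal sketch adapted from the docstring example:

from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, IntegerType

schema = StructType([StructField("a", IntegerType())])
df = spark.createDataFrame([(1, '{"a": 1}')], ("key", "value"))

df.select(from_json(df.value, schema).alias("json")).collect()
# [Row(json=Row(a=1))]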

6.4. to_json
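
to_json goes the other way, serializing a struct (or array/map) column into a JSON string. A minimal sketch using struct to build the input column:

from pyspark.sql.functions import to_json, struct

df = spark.createDataFrame([("Alice", 2)], ("name", "age"))

df.select(to_json(struct("name", "age")).alias("json")).show(truncate=False)
# +------------------------+
# |json                    |
# +------------------------+
# |{"name":"Alice","age":2}|
# +------------------------+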

7. Sorting arrays

# Collection function: sorts the input array in ascending or descending order according
# to the natural ordering of the array elements.
from pyspark.sql.functions import sort_array

df = spark.createDataFrame([([2, 1, 3],), ([1],), ([],)], ['data'])

df.select(sort_array(df.data).alias('r')).show()
# +---------+
# |        r|
# +---------+
# |[1, 2, 3]|
# |      [1]|
# |       []|
# +---------+

df.select(sort_array(df.data, asc=False).alias('r')).show()
# +---------+
# |        r|
# +---------+
# |[3, 2, 1]|
# |      [1]|
# |       []|
# +---------+

Reposted from blog.csdn.net/suzyu12345/article/details/79673586