直接上代码:
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession
# Sample records: one dict per row, each holding a single integer field `cli`.
lis = [
    {'cli': value}
    for value in (
        11, 0, 0, 3, 100, 76, 23, 2,
        800, 9, 10, 100, 339, 678, 297, 10,
    )
]
# Obtain the active SparkSession (or start one). The script previously used
# `ss` without defining it, which raises NameError when run standalone;
# getOrCreate() reuses an already-running session if there is one.
ss = SparkSession.builder.getOrCreate()

# Build a single-column DataFrame from the sample records and display it.
dlis = ss.createDataFrame(lis)
dlis.show()

# Configure quantile-based binning: read `cli`, write the bucket index to
# `cli_but`, using 3 buckets.
dqis = QuantileDiscretizer(
    numBuckets=3,
    inputCol='cli',
    outputCol='cli_but',
    relativeError=0.001,
)

# fit() derives the bucket boundaries from the data; transform() then appends
# the `cli_but` column. `dlis` is rebound to the transformed DataFrame.
model = dqis.fit(dlis)
dlis = model.transform(dlis)
dlis.show()
# Two extra records, used as input by the commented-out reload example below.
ltmp = [{'cli': value} for value in (7, 70)]
tmplis = ss.createDataFrame(ltmp)
# Save the fitted binning rules (e.g. to HDFS) and reload them later.
# NOTE(review): per the PySpark docs, QuantileDiscretizer.fit() returns a
# Bucketizer, so the saved model presumably has to be reloaded with
# Bucketizer.load rather than QuantileDiscretizer.load — confirm against the
# Spark version in use.
# from pyspark.ml.feature import Bucketizer
# model.write().overwrite().save('XXX.model')
# mmmodel = Bucketizer.load('XXX.model')
# df1 = mmmodel.transform(tmplis)
# df1.show()
'''
+---+
|cli|
+---+
| 11|
| 0|
| 0|
| 3|
|100|
| 76|
| 23|
| 2|
|800|
| 9|
| 10|
|100|
|339|
|678|
|297|
| 10|
+---+
+---+-------+
|cli|cli_but|
+---+-------+
| 11| 1.0|
| 0| 0.0|
| 0| 0.0|
| 3| 0.0|
|100| 2.0|
| 76| 1.0|
| 23| 1.0|
| 2| 0.0|
|800| 2.0|
| 9| 0.0|
| 10| 1.0|
|100| 2.0|
|339| 2.0|
|678| 2.0|
|297| 2.0|
| 10| 1.0|
+---+-------+
'''
官方API:
其它例子: