rdd

import os
from pyspark.sql import Row

# Read the raw CSV, register it as a temp view, then rebuild each row through
# the RDD API to add a derived big_hy_code column, and write the result out.
# get_big_hy_code is a helper defined elsewhere in the original project.
dfh1 = spark.read.csv(myfeature_path + 'base_info_name.csv', header=True)
dfh1.show()
dfh1.createOrReplaceTempView('h1')
dfh2 = dfh1.rdd.map(lambda r: Row(name=r['name'], province=r['province'], hydm=r['hydm'], clrq=r['clrq'],
                                  big_hy_code=get_big_hy_code(r['hydm'])))
dfh2.repartition(1).toDF().write.csv(os.path.join(myfeature_path, "base_bighycode.csv"),
                                     mode='overwrite', header=True)
spark.stop()
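The snippet assumes a helper get_big_hy_code that collapses the detailed industry code (hydm) into a broad industry category; it is not shown in the post. A minimal hypothetical sketch, assuming the broad category is just the leading character of the detailed code, might look like this:

# Hypothetical sketch only -- the original get_big_hy_code is not shown in the post.
# Assumes the broad industry code is the leading character of the detailed code
# (e.g. 'C1391' -> 'C'), with missing values mapped to a placeholder.
def get_big_hy_code(hydm):
    if not hydm:
        return 'UNKNOWN'
    return hydm.strip()[0].upper()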

# Same pattern: rebuild each row, mapping column _11 through the_second_hy_map
# (a helper defined elsewhere), then coalesce to one partition and write a CSV.
dfkk2.rdd.map(lambda r: Row(r['_1'], r['_2'], r['_3'], r['_4'], r['_5'], r['_6'],
                            r['_7'], r['_8'], r['_9'], r['_10'],
                            the_second_hy_map(r['_11']))) \
    .repartition(1).toDF() \
    .write.csv(os.path.join(hdfsSaveStats, "the_last_second_update_bighyname.csv"),
               mode='overwrite', header=True)
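For a single-column transformation like this, a DataFrame-native alternative is to wrap the mapping helper in a UDF and use withColumn, which avoids the round trip through the RDD API. A sketch under that assumption (the_second_hy_map and the column name '_11' come from the snippet above; dfkk3 is an illustrative name):

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Sketch: add/replace the derived column directly instead of rebuilding Rows by hand.
second_hy_udf = udf(the_second_hy_map, StringType())
dfkk3 = dfkk2.withColumn('_11', second_hy_udf(dfkk2['_11']))
dfkk3.repartition(1).write.csv(os.path.join(hdfsSaveStats, "the_last_second_update_bighyname.csv"),
                               mode='overwrite', header=True)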


Reposted from blog.csdn.net/sinat_26566137/article/details/80344794