基于apache drill 的HDFS查询json 单机实例

参考 https://drill.apache.org/docs/json-data-model/

假设有原始数据在hdfs上:
hdfs://dc1:8020/wx/mytest/ia/2017/0208/details/part-00000
文件中包含多条数据,每行是一个完整的json对象,实际上是由spark的saveAsTextFile方法生成的。
单条记录的格式如下(为便于阅读已格式化为多行,并删除了部分数据)

{
    "afterOpenDay": 9,
    "basic": {
        "availableMoney": 24063.51344060898,
        "closeReason": 0,
        "cutEarning": 0,
        "end_date": "20170222",
        "ism_id": "170208206199185",
        "losePercentage": 0,
        "profitPercentage": 0,
        "start_date": "20170209",
        "tm_close": -1,
        "totalMoney": 23600,
        "user_id": "8888"
    },
    "closeDay": true,
    "dailySummary": [
        {
            "TN": 0,
            "annualProfitRate": 0,
            "asset": 23515.820100307465,
            "commission": 75.1798996925354,
            "cost": 21560.179899692535,
            "day": "20170209",
            "floatProfit": -84.1798996925354,
            "freeMoney": 2039.8201003074646,
            "maketValue": 21476,
            "profitRate": -0.0035669449022260762
        },
        {
            "TN": 1,
            "annualProfitRate": 0,
            "asset": 23585.904140472412,
            "commission": 81.09585952758789,
            "cost": 20668.095859527588,
            "day": "20170210",
            "floatProfit": -14.09585952758789,
            "freeMoney": 2931.904140472412,
            "maketValue": 20654,
            "profitRate": -0.0005972821833723683
        },
        {
            "TN": 2,
            "annualProfitRate": 0,
            "asset": 23830.72134065628,
            "commission": 88.27865934371948,
            "cost": 18535.27865934372,
            "day": "20170213",
            "floatProfit": 230.72134065628052,
            "freeMoney": 5064.7213406562805,
            "maketValue": 18766,
            "profitRate": 0.009776327993910192
        },
        {
            "TN": 3,
            "annualProfitRate": 0,
            "asset": 23887.72134065628,
            "commission": 88.27865934371948,
            "cost": 18535.27865934372,
            "day": "20170214",
            "floatProfit": 287.7213406562805,
            "freeMoney": 5064.7213406562805,
            "maketValue": 18823,
            "profitRate": 0.012191582231198327
        },
        {
            "TN": 4,
            "annualProfitRate": 0,
            "asset": 23652.72134065628,
            "commission": 88.27865934371948,
            "cost": 18535.27865934372,
            "day": "20170215",
            "floatProfit": 52.72134065628052,
            "freeMoney": 5064.7213406562805,
            "maketValue": 18588,
            "profitRate": 0.002233955112554259
        },
        {
            "TN": 5,
            "annualProfitRate": 0,
            "asset": 23716.917340755463,
            "commission": 94.08265924453735,
            "cost": 17737.082659244537,
            "day": "20170216",
            "floatProfit": 116.91734075546265,
            "freeMoney": 5862.917340755463,
            "maketValue": 17854,
            "profitRate": 0.004954124608282316
        },
        {
            "TN": 6,
            "annualProfitRate": 0,
            "asset": 23595.554340839386,
            "commission": 100.44565916061401,
            "cost": 16380.445659160614,
            "day": "20170217",
            "floatProfit": -4.445659160614014,
            "freeMoney": 7219.554340839386,
            "maketValue": 16376,
            "profitRate": -0.00018837538816161075
        },
        {
            "TN": 7,
            "annualProfitRate": 0,
            "asset": 23780.802600860596,
            "commission": 106.1973991394043,
            "cost": 15649.197399139404,
            "day": "20170220",
            "floatProfit": 180.8026008605957,
            "freeMoney": 7950.802600860596,
            "maketValue": 15830,
            "profitRate": 0.007661127155109988
        },
        {
            "TN": 8,
            "annualProfitRate": 0,
            "asset": 24011.805600643158,
            "commission": 113.19439888000488,
            "cost": 13659.194399356842,
            "day": "20170221",
            "floatProfit": 411.80560064315796,
            "freeMoney": 9940.805600643158,
            "maketValue": 14071,
            "profitRate": 0.01744938985776093
        }
    ]
}


(1)下载并启动apache drill
bin/drill-embedded
(2)配置storage plugin(本文修改的是名为dfs的插件),dc1是机器hostname
http://dc1:8047/storage

{
  "type": "file",
  "enabled": true,
  "connection": "hdfs://dc1:8020",
  "config": null,
  "workspaces": {
    "root": {
      "location": "/",
      "writable": false,
      "defaultInputFormat": null
    },
    "tmp": {
      "location": "/tmp",
      "writable": true,
      "defaultInputFormat": null
    },
    "ism": {
      "location": "/wx/mytest/ia/2017",
      "writable": true,
      "defaultInputFormat": "json"
    }
  },
  "formats": {
    "psv": {
      "type": "text",
      "extensions": [
        "tbl"
      ],
      "delimiter": "|"
    },
    "csv": {
      "type": "text",
      "extensions": [
        "csv"
      ],
      "delimiter": ","
    },
    "tsv": {
      "type": "text",
      "extensions": [
        "tsv"
      ],
      "delimiter": "\t"
    },
    "httpd": {
      "type": "httpd",
      "logFormat": "%h %t \"%r\" %>s %b \"%{Referer}i\"",
      "timestampFormat": null
    },
    "parquet": {
      "type": "parquet"
    },
    "json": {
      "type": "json",
      "extensions": [
        "json"
      ]
    },
    "avro": {
      "type": "avro"
    },
    "sequencefile": {
      "type": "sequencefile",
      "extensions": [
        "seq"
      ]
    },
    "csvh": {
      "type": "text",
      "extensions": [
        "csvh"
      ],
      "extractHeader": true,
      "delimiter": ","
    }
  }
}

(3)修改配置
http://dc1:8047/options
将store.json.read_numbers_as_double 改为true。这是因为我这边的json数据中,有的浮点数被输出成了整数(如5.0直接输出为5),同一字段既出现整数又出现浮点数,会导致错误“DATA_READ ERROR: You tried to write a Float8 type when you are using a ValueWriter of type ...”

(4)执行sql语句,这里dfs.ism表示使用名为dfs的storage plugin中配置的工作空间(workspace)ism


a.basic.ism_id,表示使用json文件中的basic字段(basic是个OBJECT类型)里面的ism_id字段

0: jdbc:drill:zk=local> select a.basic.ism_id as ism_id,a.dailySummary.asset as asset from dfs.ism.`0208/details/part-00000` a limit 10;
+------------------+---------------------+
|      ism_id      |        asset        |
+------------------+---------------------+
| 170208206199185  | 23515.820100307465  |
| 170208206199187  | 23585.904140472412  |
| 170208206199188  | 23830.72134065628   |
| 170208206199189  | 23887.72134065628   |
| 170208206199191  | 23652.72134065628   |
| 170208206199196  | 23716.917340755463  |
| 170208206199199  | 23595.554340839386  |
| 170208206199201  | 23780.802600860596  |
| 170208206199206  | 24011.805600643158  |
| 170208206199209  | 24063.51344060898   |
+------------------+---------------------+
10 rows selected (0.898 seconds)



[思考问题]上述字段中,如果遇到数组应该如何处理?
比如,要查询dailySummary 中的每日资产asset?
参考Drill官方文档,使用子查询(nested query)和FLATTEN函数,
FLATTEN用于将数组扁平化,即1行拆分成多行数据。

0: jdbc:drill:zk=local> select b.ism_id,b.daily.asset as asset from (select a.basic.ism_id as ism_id,FLATTEN(a.dailySummary) as daily from dfs.ism.`0208/details/part-00000` a ) b limit 20;

+------------------+---------------------+
|      ism_id      |        asset        |
+------------------+---------------------+
| 170208206199185  | 23515.820100307465  |
| 170208206199185  | 23585.904140472412  |
| 170208206199185  | 23830.72134065628   |
| 170208206199185  | 23887.72134065628   |
| 170208206199185  | 23652.72134065628   |
| 170208206199185  | 23716.917340755463  |
| 170208206199185  | 23595.554340839386  |
| 170208206199185  | 23780.802600860596  |
| 170208206199185  | 24011.805600643158  |
| 170208206199185  | 24063.51344060898   |
| 170208206199187  | 20130.834299087524  |
| 170208206199187  | 19987.834299087524  |
| 170208206199187  | 20333.938299179077  |
| 170208206199187  | 20277.938299179077  |
| 170208206199187  | 20153.938299179077  |
| 170208206199187  | 20321.938299179077  |
| 170208206199187  | 20165.137598991394  |
| 170208206199187  | 20376.137598991394  |
| 170208206199187  | 20496.137598991394  |
| 170208206199187  | 20428.81975889206   |
+------------------+---------------------+
20 rows selected (0.978 seconds)


上述查询也可以通过web方式
http://dc1:8047/query
查询获得。

猜你喜欢

转载自lvdccyb.iteye.com/blog/2366116