Learning the Pig language

Pig is a parallel data-flow language that runs on Hadoop.

Pig: input and output

Step 1: Load the data with the load statement

        Once the data flow has been defined, you can either write the result to storage with store or print it to the screen with dump

 

Pig: Relational Operations

  foreach: For each record, you can select the corresponding field and pass it to the next operator (equivalent to selecting the required column in SQL, you can perform count and sum operations)

  filter: filter (equivalent to where in SQL)

    group: groups records by one or more fields (the grouping fields are listed after by)

  Order: sorts records (the sort fields are listed after by)

  Distinct: removes duplicates. It only deduplicates whole records; it cannot deduplicate on a single field

  Join: joins two loaded datasets; a join field must be given for each of them. Note that a relation cannot be joined with itself.

(It is best if the two join fields do not share the same name; you can use aliases to rename them.)

  Limit: Limit the amount of data.

  Count: when using Pig to count the number of rows, count a column that is never null, because COUNT does not count null values.

  Flatten: un-nests the collection produced by grouping, splitting it back into separate fields.

  

 

Pig: some basic concepts and how they relate: relation, bag (can be seen as a database table), tuple (can be seen as a row in a table), field, and data

      A relation is a bag, a bag consists of one or more tuples, and a tuple consists of multiple fields

      Note: The number of fields in each tuple can vary

<workflow-app xmlns="uri:oozie:workflow:0.2" name="dashboard-money"
              half="3000">
  <params>
    <param key="yestoday">${yestoday}</param>
    <param key="targetDir">/user/cmo_ipc/dashboard/store/repertoryExamination/app_dashboard_store_sales_d/${yestoday}</param>
    <param key="mysql_jdbc_1" import="/export/App/etl.sone.jd.local/WEB-INF/classes/conf/jdbc.properties"></param>
    <param key="mysql_jdbc_2" import="/export/App/etl.sone.jd.local/WEB-INF/classes/conf/important.properties"></param>
  </params>
  <start to="genBrand" />
  <action name="genBrand">
    <delete path="${targetDir}" />
    <pig>
      <script>yuanshi_fact_table = load '/user/cmo_ipc/app/dashboard/app_dashboard_store_sales_d/tx_dt=${yestoday}/*.lzo' as 
      ( data_date,
    data_year,
    data_month,
    data_week ,
    data_day,
    data_type,
    brand_code,
    item_sku_id,
    sku_status_cd,
    shelves_tm,
    otc_tm,
    utc_tm,
    purchaser_erp_acct,
    purchaser_name,
    saler_erp_acct,
    sale_staf_name,
    item_first_cate_cd,
    item_second_cate_cd,
    item_third_cate_cd,
    dept_id_3,
    band,
    free_goods_flag,
    delv_center_num,
    major_supp_brevity_code,
    ky_stock,
    num_order_booking,
    num_app_booking,
    num_stock,
    num_purchase_plan,
    num_order_transfer,
    num_zt_stock,
    num_transfer_plan_in,
    num_transfer_plan_out,
    basestock,
    loweststock,
    num_nosale,
    target_num_stock,
    health_num_stock,
    health_je_stock,
    mkt_prc,
    jd_prc ,
    stk_prc,
    wh_qtn,
           into_wh_qtty ,
           status ,
           sys_reple_qty,
           reple_qty,
           sales_qtty_1,
           sales_mount_1,
           sales_qtty_7,
 sales_mount_7,
           sales_qtty_14,
           sales_mount_14,
           sales_qtty_28,
           sales_mount_28,
           sales_qtty_60 ,
           sales_mount_60,
           sales_qtty_90 ,
           sales_mount_90,
           is_xhkc_flag  ,
           is_sg_flag    ,
           is_xg_flag    ,
           is_bh_flag    ,
           zzts_sl       ,
           zzts_je       ,
           nosale_days   ,
           xiagui_kc_sl  ,
           xiagui_kc_je  ,
           xiagui_sc    ,
           is_dh_flag    ,
           is_qh_flag    ,
           qh_sl         ,
           qh_je         ,
           is_zx_flag    ,
           zx_sl         ,
           zx_je         ,
           zt_sl         ,
    kc_je         ,
           pv_xhl        ,
           is_bdx_flag   ,
           bdx_sl        ,
           bdx_je        ,
           qh_ss_je );
      </script>
      
    
  <script>temp11= filter yuanshi_fact_table  by  item_first_cate_cd=='737' ;</script>
      
      <script>
      
      temp1 = FOREACH temp11 GENERATE data_date,
    data_year,
    data_month,
    data_week ,
    data_day,
    data_type,
    brand_code,
    item_sku_id as sku_id,
    sku_status_cd,
    shelves_tm,
    otc_tm,
    utc_tm,
    purchaser_erp_acct,
    purchaser_name,
    saler_erp_acct,
    sale_staf_name,
    item_first_cate_cd,
    item_second_cate_cd,
    item_third_cate_cd,
    dept_id_3,
    band,
    free_goods_flag,
    delv_center_num,
    major_supp_brevity_code,
    ky_stock,
    num_order_booking,
    num_app_booking,
    num_stock,
    num_purchase_plan,
    num_order_transfer,
    num_zt_stock,
    num_transfer_plan_in,
    num_transfer_plan_out,
    basestock,
    loweststock,
    num_nosale,
    target_num_stock,
    health_num_stock,
    health_je_stock,
    mkt_prc,
    jd_prc ,
    stk_prc,
    wh_qtn,
           into_wh_qtty ,
           status ,
           sys_reple_qty,
           reple_qty,
           sales_qtty_1,
           sales_mount_1,
           sales_qtty_7,
 sales_mount_7,
           sales_qtty_14,
           sales_mount_14,
           sales_qtty_28,
           sales_mount_28,
           sales_qtty_60 ,
           sales_mount_60,
           sales_qtty_90 ,
           sales_mount_90,
           is_xhkc_flag  ,
           is_sg_flag    ,
           is_xg_flag    ,
           is_bh_flag    ,
           zzts_sl       ,
           zzts_je       ,
           nosale_days   ,
           xiagui_kc_sl  ,
           xiagui_kc_je  ,
           xiagui_sc    ,
           is_dh_flag    ,
           is_qh_flag    ,
           qh_sl         ,
           qh_je         ,
           is_zx_flag    ,
           zx_sl         ,
           zx_je         ,
           zt_sl         ,
    kc_je         ,
           pv_xhl        ,
           is_bdx_flag   ,
           bdx_sl        ,
           bdx_je        ,
           qh_ss_je;
      </script>


      <script>temp2=load '/user/cmo_ipc/app/dashboard/app_dashboard_store_score_d/tx_dt=${yestoday}/*.lzo' as
      (data_date,data_year,data_month,data_week,data_day,item_sku_id,delv_center_num,score_total,score_qh,score_zx,score_bdx,score_xg,gjdj_total,gjdj_qh,gjdj_zx,gjdj_bdx,gjdj_xg); 
     </script>


<script>
t2 = FOREACH temp2 GENERATE item_sku_id,score_total,score_qh,score_zx,score_bdx,score_xg;
</script>

<script>
     t3 = JOIN temp1 BY sku_id, t2 BY item_sku_id;
</script>



      <script>
      t1= FOREACH t3 GENERATE data_date,data_type,item_third_cate_cd,brand_code,item_sku_id,band,dept_id_3,purchaser_erp_acct,
     major_supp_brevity_code,delv_center_num,CONCAT(CONCAT(item_sku_id,'_'),delv_center_num),num_stock,ky_stock,sales_qtty_1,sales_mount_1,
     sales_qtty_7,sales_mount_7,sales_qtty_14,sales_mount_14,sales_qtty_28,sales_mount_28,sales_qtty_60,sales_mount_60,sales_qtty_90,
     sales_mount_90,zt_sl,num_zt_stock,num_order_booking,pv_xhl,qh_ss_je,qh_sl,is_qh_flag,zzts_sl,zzts_je,zx_sl,is_zx_flag,nosale_days,
     bdx_sl,is_bdx_flag,xiagui_kc_sl,xiagui_sc,is_xg_flag,is_xhkc_flag,is_sg_flag,health_num_stock,score_total,score_qh,score_zx,
      score_bdx,score_xg,qh_je,xiagui_kc_je,bdx_je,zx_je,health_je_stock; 
      </script>
      
      <script>store t1 into '${targetDir}' ;</script>
    </pig>
    <ok to="end" />
    <error to="fail" />
  </action>
  <kill name="fail" />
  <end name="end" />
</workflow-app>

Guess you like

Origin http://10.200.1.11:23101/article/api/json?id=326654674&siteId=291194637
pig
pig