Proyecto de análisis del comportamiento del consumidor de comercio electrónico de Hive

Descripción de los datos

Los datos recopilados por las tiendas de una empresa minorista en el último año.

customer_details.csv: información de clientes
transaction_details.csv: información de transacciones
store_details.csv: información de tiendas
store_review.csv: información de reseñas

inserte la descripción de la imagen aquí

Preparación del entorno

Máquina virtual CentOS 7 con Hadoop, Hive y Zeppelin.
Inicie Hadoop, Hive y Zeppelin:

# Start the Hadoop daemons with the bundled start-all.sh script.
./hadoop/sbin/start-all.sh
# Launch HiveServer2 in the background; nohup keeps it alive after the shell exits.
nohup hive --service hiveserver2 &
# Start the Zeppelin daemon.
./zeppelin09/bin/zeppelin-daemon.sh start

Abra la página de Zeppelin (hive es el nombre de la máquina virtual, también puede completar la IP de la máquina virtual)

http://hive:8000/

inserte la descripción de la imagen aquí

Código del proyecto

Cargue archivos de datos y cree tablas de datos

Ingrese al directorio de la máquina virtual donde se almacenan los datos y vea la información del archivo

%sh
# Inspect the raw CSV files: line counts first, then the first two lines of each.
# Stop if the data directory is missing so the checks never run somewhere else.
cd /workspace/hive/store/ || exit 1

for csv in customer_details.csv store_details.csv store_review.csv transaction_details.csv; do
    wc -l "$csv"
done

for csv in customer_details.csv store_details.csv store_review.csv transaction_details.csv; do
    head -2 "$csv"
done

inserte la descripción de la imagen aquí
Subir datos al directorio hdfs

%sh
# Rebuild /data/shopping on HDFS from the local CSV files, one sub-directory per file.
# Stop if the local data directory is missing: otherwise the HDFS tree below would
# still be deleted and every upload would then fail.
cd /workspace/hive/store/ || exit 1

# Remove any previous upload; -skipTrash deletes it without moving it to the trash.
hdfs dfs -rm -r -f -skipTrash /data/shopping/

# upload <hdfs sub-directory> <local csv>: create the target directory and copy the file into it.
upload() {
    hdfs dfs -mkdir -p "/data/shopping/$1/"
    hdfs dfs -put "$2" "/data/shopping/$1/"
}

upload customer    customer_details.csv
upload transaction transaction_details.csv
upload store       store_details.csv
upload review      store_review.csv

# Show the resulting directory tree.
hdfs dfs -ls -R /data/shopping

inserte la descripción de la imagen aquí
Crear las tablas de datos de Hive y cargar los datos

%hive
-- Create the project database and the external table over the raw customer CSV.
create database if not exists shopping;
use shopping;

-- Raw customer records, read in place from /data/shopping/customer.
-- The first line of each file is skipped as a header.
create external table if not exists ext_customer_details (
    customer_id string,
    first_name  string,
    last_name   string,
    email       string,
    gender      string,
    address     string,
    country     string,
    language    string,
    job         string,
    credit_type string,
    credit_no   string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/customer'
tblproperties ('skip.header.line.count' = '1');
%hive
use shopping;

-- Raw transaction records, read in place from /data/shopping/transaction.
-- The first line of each file is skipped as a header.
-- Every column is declared as string because OpenCSVSerde exposes all columns
-- as strings whatever type the DDL declares; declaring decimal/date here would
-- only be misleading. price and purchase_date receive their real types
-- (decimal(8,2) and date) in the managed transaction_details table.
create external table if not exists ext_transaction_details (
    transaction_id string,
    customer_id    string,
    store_id       string,
    price          string,
    product        string,
    purchase_date  string,
    purchase_time  string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/transaction'
tblproperties ("skip.header.line.count"="1");
%hive
use shopping;

-- Raw store records, read in place from /data/shopping/store.
-- The first line of each file is skipped as a header.
create external table if not exists ext_store_details (
    store_id        string,
    store_name      string,
    employee_number string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/store'
tblproperties ('skip.header.line.count' = '1');
%hive
use shopping;

-- Raw review records, read in place from /data/shopping/review.
-- The first line of each file is skipped as a header.
create external table if not exists ext_store_review (
    transaction_id string,
    store_id       string,
    review_score   string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
location '/data/shopping/review'
tblproperties ('skip.header.line.count' = '1');

Limpieza de datos

%hive
-- Customer view used by the analysis queries.
-- last_name, email, address and credit_no are passed through unbase64();
-- credit_no is decoded, suffixed with 'seed' and decoded again.
-- NOTE(review): presumably this is meant to mask sensitive fields - confirm the intent.
-- The language column of the source table is not exposed by this view.
create view if not exists vw_customer_details as
select
    customer_id,
    first_name,
    unbase64(last_name) as last_name,
    unbase64(email) as email,
    gender,
    unbase64(address) as address,
    country,
    job,
    credit_type,
    unbase64(concat(unbase64(credit_no), 'seed')) as credit_no
from ext_customer_details;
%hive
-- Managed, typed copy of the transactions, partitioned by purchase month.
create table if not exists transaction_details (
    transaction_id string,
    customer_id    string,
    store_id       string,
    price          decimal(8,2),
    product        string,
    purchase_date  date,
    purchase_time  string
)
partitioned by (purchase_month string);
%hive
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- The two settings above allow the purchase_month partition to be fully dynamic.
-- The mode value is spelled 'nonstrict', the documented value.
--
-- Load the raw transactions into transaction_details:
--   * purchase_month (yyyy-MM) is derived from purchase_date and used as the partition;
--   * rows whose customer_id equals the literal header text are dropped;
--   * rn numbers the rows sharing a transaction_id (ordered by store_id); the first
--     keeps its id, later ones get a '_fix<rn>' suffix so ids become unique.
with base as (
    select
        transaction_id,
        customer_id,
        store_id,
        price,
        product,
        purchase_date,
        purchase_time,
        from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'), 'yyyy-MM') as purchase_month,
        row_number() over (partition by transaction_id order by store_id) as rn
    from ext_transaction_details
    where customer_id <> 'customer_id'
)
from base
insert overwrite table transaction_details partition (purchase_month)
select
    if(rn = 1, transaction_id, concat(transaction_id, '_fix', rn)) as transaction_id,
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month;

-- Check: list the rows whose transaction_id was rewritten.
select
    transaction_id,
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month
from transaction_details
where transaction_id like '%fix%';
%hive
-- Number of scored reviews that match a raw transaction on both
-- transaction_id and store_id.
select count(*)
from ext_store_review as r
inner join ext_transaction_details as t
    on r.transaction_id = t.transaction_id
    and r.store_id = t.store_id
where review_score <> '';
%hive
-- Number of reviews that carry a non-empty score.
select count(*)
from ext_store_review
where review_score <> '';
%hive
-- Review view restricted to rows with a non-empty score.
create view if not exists vw_store_review as
select
    transaction_id,
    review_score
from ext_store_review
where review_score <> '';

Visualización de datos

Análisis de clientes

Tarjetas de crédito más populares

%hive
-- Most popular credit card types: number of distinct card numbers per type.
-- Grouping is by credit_type only, so each card type yields exactly one row;
-- grouping additionally by country without selecting it would return several
-- indistinguishable rows for the same type.
select
    credit_type,
    count(distinct credit_no) as credit_cnt
from vw_customer_details
group by credit_type
order by credit_cnt desc;

inserte la descripción de la imagen aquí
Las 5 principales ocupaciones de los clientes

%hive
-- The five jobs held by the most customers.
select
    job,
    count(*) as pn
from vw_customer_details
group by job
order by pn desc
limit 5;

inserte la descripción de la imagen aquí
Las 3 principales tarjetas de crédito de clientes mujeres en los EE. UU.

%hive
-- The three credit card types held by the most female customers in the United States.
select
    credit_type,
    count(*) as ct
from vw_customer_details
where gender = 'Female'
  and country = 'United States'
group by credit_type
order by ct desc
limit 3;

inserte la descripción de la imagen aquí
Estadísticas de clientes por país y género

%hive
-- Number of customers for each country / gender combination.
select
    count(*),
    country,
    gender
from vw_customer_details
group by
    country,
    gender;

inserte la descripción de la imagen aquí

Análisis de transacciones

Ingreso total por mes

%hive
-- Total revenue for each purchase month.
select
    sum(price) as revenue_mom,
    purchase_month
from transaction_details
group by purchase_month;

inserte la descripción de la imagen aquí
Ingresos totales por trimestre

%hive
-- Total revenue per quarter, keyed as '<year>-<quarter number>'.
with quarterly as (
    select
        price,
        concat_ws(
            '-',
            substr(purchase_date, 1, 4),
            cast(ceil(month(purchase_date) / 3.0) as string)
        ) as year_quarter
    from transaction_details
)
select
    sum(price) as revenue_qoq,
    year_quarter
from quarterly
group by year_quarter;

inserte la descripción de la imagen aquí
Ingresos totales por año

%hive
-- Total revenue per year (first four characters of purchase_date).
-- The measure is named revenue_yoy to match this yearly grouping; the sibling
-- queries use revenue_mom (month), revenue_qoq (quarter) and revenue_wow (weekday).
select
    sum(price) as revenue_yoy,
    substr(purchase_date, 1, 4) as year
from transaction_details
group by substr(purchase_date, 1, 4);

inserte la descripción de la imagen aquí
Estadísticas de los ingresos totales de cada día de la semana

%hive
-- Total revenue per day of the week; weekday is date_format pattern 'u'.
select
    sum(price) as revenue_wow,
    date_format(purchase_date, 'u') as weekday
from transaction_details
group by date_format(purchase_date, 'u');

inserte la descripción de la imagen aquí
Gasto medio e ingresos totales por franja horaria

%hive
-- Average spend and total revenue per time-of-day bucket. Revenue is divided
-- by 1000 so the chart is easier to read.
--   normalised: purchase_time as 'hour:minute' text on a 24-hour clock; values
--               ending in 'PM' get 12 added to the parsed hour.
--   in_hours:   that text converted to fractional hours.
--   bucketed:   each row labelled with a bucket; rows matching none of the
--               listed ranges are labelled 'night'.
with normalised as (
    select
        price,
        purchase_time,
        case
            when purchase_time like '%PM' then
                concat_ws(
                    ':',
                    cast(hour(from_unixtime(unix_timestamp(purchase_time, 'hh:mm'))) + 12 as string),
                    cast(minute(from_unixtime(unix_timestamp(purchase_time, 'hh:mm'))) as string)
                )
            else from_unixtime(unix_timestamp(purchase_time, 'hh:mm'), 'HH:mm')
        end as time_format
    from transaction_details
),
in_hours as (
    select
        purchase_time,
        price,
        (
            cast(split(time_format, ':')[0] as decimal(4,2))
            + cast(split(time_format, ':')[1] as decimal(4,2)) / 60
        ) as purchase_time_in_hrs
    from normalised
),
bucketed as (
    select
        price,
        purchase_time,
        purchase_time_in_hrs,
        case
            when purchase_time_in_hrs > 5 and purchase_time_in_hrs <= 8 then 'early morning'
            when purchase_time_in_hrs > 8 and purchase_time_in_hrs <= 11 then 'morning'
            when purchase_time_in_hrs > 11 and purchase_time_in_hrs <= 13 then 'noon'
            when purchase_time_in_hrs > 13 and purchase_time_in_hrs <= 18 then 'afternoon'
            when purchase_time_in_hrs > 18 and purchase_time_in_hrs <= 22 then 'evening'
            else 'night'
        end as time_bucket
    from in_hours
)
select
    time_bucket,
    avg(price) as avg_spend,
    sum(price) / 1000 as revenue_k
from bucketed
group by time_bucket;

inserte la descripción de la imagen aquí
Calcular el ingreso promedio de cada día de la semana

%hive
-- Average transaction price per day of the week, skipping rows whose
-- weekday cannot be derived.
select
    avg(price) as avg_price,
    date_format(purchase_date, 'u') as weekday
from transaction_details
where date_format(purchase_date, 'u') is not null
group by date_format(purchase_date, 'u');

inserte la descripción de la imagen aquí
Volumen total de transacciones año-mes estadístico

%hive
-- Number of distinct transactions per weekday, month, quarter and year.
with dated as (
    select
        transaction_id,
        date_format(purchase_date, 'u') as weekday,
        purchase_month,
        concat_ws(
            '-',
            substr(purchase_date, 1, 4),
            cast(ceil(month(purchase_date) / 3.0) as string)
        ) as year_quarter,
        substr(purchase_date, 1, 4) as year
    from transaction_details
    where purchase_month is not null
)
select
    count(distinct transaction_id) as total,
    weekday,
    purchase_month,
    year_quarter,
    year
from dated
group by
    weekday,
    purchase_month,
    year_quarter,
    year
order by
    year,
    purchase_month;

inserte la descripción de la imagen aquí
Los 10 clientes con mayor número de compras

%hive
-- The ten customers with the most distinct transactions, shown by first name.
with per_customer as (
    select
        customer_id,
        count(distinct transaction_id) as trans_cnt,
        sum(price) as spend_total
    from transaction_details
    where purchase_month is not null
    group by customer_id
),
named as (
    select
        pc.customer_id,
        pc.trans_cnt,
        pc.spend_total,
        cd.first_name as cust_name
    from per_customer as pc
    inner join vw_customer_details as cd
        on pc.customer_id = cd.customer_id
)
select
    trans_cnt,
    cust_name as top10_trans_cust
from named
order by trans_cnt desc
limit 10;

inserte la descripción de la imagen aquí
Estadísticas de los 10 mejores clientes en gasto

%hive
-- The ten customers with the highest total spend, shown by first name.
with per_customer as (
    select
        customer_id,
        count(distinct transaction_id) as trans_cnt,
        sum(price) as spend_total
    from transaction_details
    where purchase_month is not null
    group by customer_id
),
named as (
    select
        pc.customer_id,
        pc.trans_cnt,
        pc.spend_total,
        cd.first_name as cust_name
    from per_customer as pc
    inner join vw_customer_details as cd
        on pc.customer_id = cd.customer_id
)
select
    spend_total,
    cust_name as top10_trans_cust
from named
order by spend_total desc
limit 10;

inserte la descripción de la imagen aquí
El cliente con el menor número de compras en el periodo estadístico

%hive
-- The ten customers with the fewest distinct transactions.
with per_customer as (
    select
        customer_id,
        count(distinct transaction_id) as trans_cnt
    from transaction_details
    where purchase_month is not null
    group by customer_id
)
select
    customer_id,
    trans_cnt
from per_customer
order by trans_cnt
limit 10;

inserte la descripción de la imagen aquí
Estadística anual - trimestral número total de clientes

%hive
-- Number of distinct customers per quarter and year.
with dated as (
    select
        customer_id,
        concat_ws(
            '-',
            substr(purchase_date, 1, 4),
            cast(ceil(month(purchase_date) / 3.0) as string)
        ) as year_quarter,
        substr(purchase_date, 1, 4) as year
    from transaction_details
    where purchase_month is not null
)
select
    count(distinct customer_id) as total,
    year_quarter,
    year
from dated
group by
    year_quarter,
    year
order by year_quarter;

inserte la descripción de la imagen aquí
Calcular el mayor gasto medio por cliente

%hive
-- Highest per-customer average transaction price.
with per_customer as (
    select
        customer_id,
        avg(price) as price_avg,
        max(price) as price_max
    from transaction_details
    where purchase_month is not null
    group by customer_id
)
select max(price_avg)
from per_customer;

inserte la descripción de la imagen aquí
Estadísticas de los mayores consumos mensuales y los clientes más frecuentes

%hive
-- For every month, the customer(s) ranked first by total spend ('spend') and
-- by number of transactions ('visit'). rank() keeps ties, so a month can
-- return several customers for the same measure.
with monthly as (
    select
        customer_id,
        purchase_month,
        sum(price) as price_sum,
        count(transaction_id) as trans_cnt
    from transaction_details
    where purchase_month is not null
    group by
        purchase_month,
        customer_id
),
ranked as (
    select
        rank() over (partition by purchase_month order by price_sum desc) as rn_sum,
        rank() over (partition by purchase_month order by trans_cnt desc) as rn_cnt,
        purchase_month,
        price_sum,
        trans_cnt,
        customer_id
    from monthly
)
select
    purchase_month,
    'spend' as measure_name,
    price_sum as measure_value,
    customer_id
from ranked
where rn_sum = 1
union all
select
    purchase_month,
    'visit' as measure_name,
    trans_cnt as measure_value,
    customer_id
from ranked
where rn_cnt = 1
order by
    measure_name,
    purchase_month;

inserte la descripción de la imagen aquí
Los 5 productos más populares según el gasto total

%hive
-- The five products with the highest total revenue.
select
    product,
    sum(price) as price_sum
from transaction_details
where purchase_month is not null
group by product
order by price_sum desc
limit 5;

inserte la descripción de la imagen aquí
Cuente y verifique los 5 productos más populares según la frecuencia de compra

%hive
-- The five products bought most often (number of transactions).
select
    product,
    count(transaction_id) as freq_buy
from transaction_details
where purchase_month is not null
group by product
order by freq_buy desc
limit 5;

inserte la descripción de la imagen aquí
Cuente y verifique los 5 productos más populares según la cantidad de clientes

%hive
-- The five products bought by the most customers.
-- count(distinct customer_id) counts each customer once per product; a plain
-- count(customer_id) would count purchase rows and repeat the
-- purchase-frequency ranking instead.
select
    product,
    count(distinct customer_id) as freq_cust
from transaction_details
where purchase_month is not null
group by product
order by freq_cust desc
limit 5;

inserte la descripción de la imagen aquí

Análisis de la tienda

Tiendas más populares por tráfico

%hive
-- The five stores with the most distinct customers.
select
    sd.store_name,
    count(distinct td.customer_id) as unique_visit
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by sd.store_name
order by unique_visit desc
limit 5;

inserte la descripción de la imagen aquí
Estadísticas de las tiendas más populares por gasto de los clientes

%hive
-- The five stores with the highest total revenue.
select
    sd.store_name,
    sum(td.price) as total_revnue
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by sd.store_name
order by total_revnue desc
limit 5;

inserte la descripción de la imagen aquí
Estadísticas de las tiendas más populares por frecuencia de transacciones

%hive
-- The five stores with the most transactions.
select
    sd.store_name,
    count(td.transaction_id) as unique_purchase
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
group by sd.store_name
order by unique_purchase desc
limit 5;

inserte la descripción de la imagen aquí
Estadísticas de los productos más populares en cada tienda por tráfico de clientes

%hive
-- For each store, the product(s) bought by the most distinct customers.
-- rank() keeps ties, so a store can return more than one product.
with per_store_product as (
    select
        store_id,
        product,
        count(distinct customer_id) as freq_cust
    from transaction_details
    where purchase_month is not null
    group by
        store_id,
        product
),
ranked as (
    select
        store_id,
        product,
        freq_cust,
        rank() over (partition by store_id order by freq_cust desc) as rn
    from per_store_product
)
select
    sd.store_name,
    rk.product,
    rk.freq_cust
from ranked as rk
inner join ext_store_details as sd
    on rk.store_id = sd.store_id
where rk.rn = 1;

inserte la descripción de la imagen aquí
Calcule la proporción de tráfico de clientes a empleados para cada tienda

%hive
-- Customer visits per employee for each store. A visit is one distinct
-- (customer_id, purchase_date) pair.
with visits as (
    select
        store_id,
        count(distinct customer_id, purchase_date) as cust_visit
    from transaction_details
    where purchase_month is not null
    group by store_id
)
select
    sd.store_name,
    v.cust_visit,
    sd.employee_number,
    round(v.cust_visit / sd.employee_number, 2) as cust_per_employee_within_period
from visits as v
inner join ext_store_details as sd
    on v.store_id = sd.store_id;

inserte la descripción de la imagen aquí
Calcular los ingresos de cada tienda por año-mes

%hive
-- Revenue of each store per purchase month.
select
    sd.store_name,
    td.purchase_month,
    sum(td.price) as revenue
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
where td.purchase_month is not null
group by
    sd.store_name,
    td.purchase_month;

inserte la descripción de la imagen aquí
Hacer un gráfico de ingresos totales por tienda

%hive
-- Total revenue of each store.
select
    sd.store_name,
    sum(td.price) as revenue
from transaction_details as td
inner join ext_store_details as sd
    on td.store_id = sd.store_id
where td.purchase_month is not null
group by sd.store_name;

inserte la descripción de la imagen aquí
Cuente el período de tiempo más ocupado de cada tienda

%hive
-- Number of transactions per store and time-of-day bucket, busiest bucket
-- first within each store.
--   normalised: purchase_time as 'hour:minute' text on a 24-hour clock; values
--               ending in 'PM' get 12 added to the parsed hour.
--   in_hours:   that text converted to fractional hours.
--   bucketed:   each row labelled with a bucket; rows matching none of the
--               listed ranges are labelled 'night'.
with normalised as (
    select
        transaction_id,
        purchase_time,
        case
            when purchase_time like '%PM' then
                concat_ws(
                    ':',
                    cast(hour(from_unixtime(unix_timestamp(purchase_time, 'hh:mm'))) + 12 as string),
                    cast(minute(from_unixtime(unix_timestamp(purchase_time, 'hh:mm'))) as string)
                )
            else from_unixtime(unix_timestamp(purchase_time, 'hh:mm'), 'HH:mm')
        end as time_format,
        store_id
    from transaction_details
    where purchase_month is not null
),
in_hours as (
    select
        purchase_time,
        transaction_id,
        (
            cast(split(time_format, ':')[0] as decimal(4,2))
            + cast(split(time_format, ':')[1] as decimal(4,2)) / 60
        ) as purchase_time_in_hrs,
        store_id
    from normalised
),
bucketed as (
    select
        transaction_id,
        purchase_time,
        purchase_time_in_hrs,
        store_id,
        case
            when purchase_time_in_hrs > 5 and purchase_time_in_hrs <= 8 then 'early morning'
            when purchase_time_in_hrs > 8 and purchase_time_in_hrs <= 11 then 'morning'
            when purchase_time_in_hrs > 11 and purchase_time_in_hrs <= 13 then 'noon'
            when purchase_time_in_hrs > 13 and purchase_time_in_hrs <= 18 then 'afternoon'
            when purchase_time_in_hrs > 18 and purchase_time_in_hrs <= 22 then 'evening'
            else 'night'
        end as time_bucket
    from in_hours
)
select
    sd.store_name,
    count(b.transaction_id) as tran_cnt,
    b.time_bucket
from bucketed as b
inner join ext_store_details as sd
    on b.store_id = sd.store_id
group by
    sd.store_name,
    b.time_bucket
order by
    sd.store_name,
    tran_cnt desc;

inserte la descripción de la imagen aquí
Determinar la tienda estrella en función de los ingresos máximos de cada empleado

%hive
-- Query 1: the top five customers of each store by total purchases.
-- rank() keeps ties, so a store can return more than five rows.
with per_store_customer as (
    select
        sd.store_name,
        td.customer_id,
        sum(td.price) as total_cust_purphase
    from transaction_details as td
    inner join ext_store_details as sd
        on td.store_id = sd.store_id
    where td.purchase_month is not null
    group by
        sd.store_name,
        td.customer_id
),
rk_cust as (
    select
        store_name,
        customer_id,
        total_cust_purphase,
        rank() over (partition by store_name order by total_cust_purphase desc) as rn
    from per_store_customer
)
select
    store_name,
    customer_id,
    total_cust_purphase,
    rn
from rk_cust
where rn <= 5;

-- Query 2: revenue per employee for each store.
with store_revenue as (
    select
        store_id,
        sum(price) as revenue
    from transaction_details
    where purchase_month is not null
    group by store_id
)
select
    sd.store_name,
    sr.revenue,
    sd.employee_number,
    round(sr.revenue / sd.employee_number, 2) as revenue_per_employee_within_period
from store_revenue as sr
inner join ext_store_details as sd
    on sr.store_id = sd.store_id;

inserte la descripción de la imagen aquí

Análisis de reseñas

Descubra la cobertura de testimonios de clientes

%hive
-- Review coverage: how many transactions have a scored review and how many do not.
-- The score filter sits in the ON clause of the left join, so review rows with
-- an empty review_score are treated as "no review" rather than counted as
-- existing reviews, while transactions without any review row are still kept.
select
    count(td.transaction_id) as total_trans,
    sum(if(sr.transaction_id is null, 1, 0)) as total_review_missed,
    sum(if(sr.transaction_id is not null, 1, 0)) as total_review_exist
from transaction_details as td
left join ext_store_review as sr
    on td.transaction_id = sr.transaction_id
    and sr.review_score <> ''
where td.purchase_month is not null;

inserte la descripción de la imagen aquí
Comprender el número de clientes y transacciones en función de las calificaciones

%hive
-- Number of distinct customers and number of reviews for each review score,
-- ignoring reviews with an empty score.
select
    sr.review_score,
    count(distinct td.customer_id) as num_customer,
    count(*) as num_reviews
from transaction_details as td
inner join ext_store_review as sr
    on td.transaction_id = sr.transaction_id
where td.purchase_month is not null
  and sr.review_score <> ''
group by sr.review_score;

inserte la descripción de la imagen aquí
¿Los clientes siempre le dan a la misma tienda las mejores calificaciones?

%hive
-- Number of reviews scored '5' per customer and store, most frequent first.
select
    count(*) as visit_cnt,
    td.customer_id,
    td.store_id
from transaction_details as td
inner join ext_store_review as sr
    on td.transaction_id = sr.transaction_id
where td.purchase_month is not null
  and sr.review_score = '5'
group by
    td.customer_id,
    td.store_id
order by visit_cnt desc;

inserte la descripción de la imagen aquí
La creación no es fácil, por favor dale me gusta y apoya.

Supongo que te gusta

Origin blog.csdn.net/weixin_46322367/article/details/128359177
Recomendado
Clasificación