需求描述

针对销售数据,完成统计:

  1. 按年统计销售额

  2. 销售金额在 10W 以上的订单

  3. 每年销售额的差值

  4. 年度订单金额前10位(年度、订单号、订单金额、排名)

  5. 季度订单金额前10位(年度、季度、订单id、订单金额、排名)

  6. 求所有交易日中订单金额最高的前10位 7. 每年度销售额最大的交易日

  7. 年度最畅销的商品(即每年销售金额最大的商品)

数据说明

日期表(dimdate) 类型 备注
dt date 日期
yearmonth int 年月
year smallint
month tinyint
day tinyint
week tinyint 周几
weeks tinyint 第几周
quat tinyint 季度
tendays tinyint
halfmonth tinyint 半月
订单表(sale) 类型 备注
orderid string 订单号
locationid string 交易位置
dt date 交易日期
订单销售明细表(saledetail) 类型 备注
orderid string 订单号
rownum int 行号
itemid string 货品
num int 数量
price double 单价
amount double 金额

实现

创建表

进入 Linux123 ,/root/hadoop/script下创建createtable.hql文件,输入以下建表内容

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
drop database sale cascade;

create database if not exists sale;

-- 创建txt源表
create table sale.dimdate_ori(
dt date,
yearmonth int,
year smallint,
month tinyint,
day tinyint,
week tinyint,
weeks tinyint,
quat tinyint,
tendays tinyint,
halfmonth tinyint
)
row format delimited fields terminated by ",";

create table sale.sale_ori(
orderid string,
locationid string,
dt date
)
row format delimited fields terminated by ",";

create table sale.saledetail_ori(
orderid string,
rownum int,
goods string,
num int,
price double,
amount double
)
row format delimited fields terminated by ",";

-- 创建orc表
create table sale.dimdate(
dt date,
yearmonth int,
year smallint,
month tinyint,
day tinyint,
week tinyint,
weeks tinyint,
quat tinyint,
tendays tinyint,
halfmonth tinyint
)
stored as orc;

create table sale.sale(
orderid string,
locationid string,
dt date
)
stored as orc;

create table sale.saledetail(
orderid string,
rownum int,
goods string,
num int,
price double,
amount double
)
stored as orc;

在当前目录下执行createtable.hql文件

1
hive -f createtable.hql

导入数据

上传数据文件到Linux123的/root/hadoop/data中


进入 Linux123 ,/root/hadoop/script下创建loaddata.hql文件,输入以下新增数据内容

1
2
3
4
5
6
7
8
9
10
11
-- 加载数据
use sale;

load data local inpath "/root/hadoop/data/tbDate.txt" overwrite into table dimdate_ori;
load data local inpath "/root/hadoop/data/tbSale.txt" overwrite into table sale_ori;
load data local inpath "/root/hadoop/data/tbSaleDetail.txt" overwrite into table saledetail_ori;

-- 导入数据
insert into table dimdate select * from dimdate_ori;
insert into table sale select * from sale_ori;
insert into table saledetail select * from saledetail_ori;

在当前目录下执行loaddata.hql文件

1
hive -f loaddata.hql

SQL实现

  • 1、按年统计销售额
1
2
3
4
SELECT year(B.dt) year, round(sum(A.amount)/10000, 2) amount
FROM saledetail A
join sale B on A.orderid=B.orderid
group by year(B.dt);
  • 2、销售金额在 10W 以上的订单
1
2
3
4
SELECT orderid, round(sum(amount), 2) amount
FROM saledetail
group by orderid
having sum(amount) > 100000
  • 3、每年销售额的差值
1
2
3
4
5
6
7
8
9
SELECT year, round(amount, 2) amount,
round(lag(amount) over (ORDER BY year), 2) prioramount ,
round(amount - lag(amount) over (ORDER BY year), 2) diff
from (
SELECT year(B.dt) year, sum(A.amount) amount
from saledetail A
join sale B on A.orderid=B.orderid
group by year(B.dt)
) tmp;
  • 4、年度订单金额前10位(年度、订单号、订单金额、排名)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
-- 方法一
SELECT dt, orderid, amount, rank
from (
SELECT dt, orderid, amount,
dense_rank() over(PARTITION BY dt ORDER BY amount desc) rank
from (
SELECT year(B.dt) dt, A.orderid, sum(A.amount) amount
from saledetail A
join sale B on A.orderid=B.orderid
GROUP BY year(B.dt), A.orderid
) tmp1
) tmp2
where rank <= 10;

-- 方法二
with tmp as (
SELECT year(B.dt) dt, A.orderid, sum(A.amount) amount
from saledetail A
join sale B on A.orderid=B.orderid
GROUP BY year(B.dt), A.orderid
)
SELECT dt, orderid, amount, rank
from (
SELECT dt, orderid, amount,
dense_rank() over(PARTITION BY dt ORDER BY amount desc) rank
from tmp
) tmp2
where rank <= 10;
  • 5、季度订单金额前10位(年度、季度、订单id、订单金额、排名)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
-- 方法一
with tmp as (
select C.year, C.quat, A.orderid, round(sum(B.amount), 2) amount
from sale A
join saledetail B on A.orderid=B.orderid
join dimdate C on A.dt=C.dt
group by C.year, C.quat, A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order by amount desc) rank
from tmp
) tmp1
where rank <= 10;

-- 方法二
with tmp as(
select
year(A.dt) year,
case when month(A.dt) <= 3 then 1
when month(A.dt) <= 6 then 2
when month(A.dt) <= 9 then 3
else 4
end quat, A.orderid, round(sum(B.amount), 2) amount
from sale A
join saledetail B on A.orderid = B.orderid
group by year(A.dt),
case when month(A.dt) <= 3 then 1
when month(A.dt) <= 6 then 2
when month(A.dt) <= 9 then 3
else 4
end, A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order by amount desc) rank from tmp
) tmp1
where rank <= 10;

-- 方法三。求季度
select floor(month(dt/3.1)) + 1;

with tmp as (
select year(A.dt) year, floor(month(A.dt)/3.1) + 1 quat, A.orderid, round(sum(B.amount), 2) amount
from sale A
join saledetail B on A.orderid=B.orderid
group by year(A.dt), floor(month(A.dt)/3.1) + 1, A.orderid
)
select year, quat, orderid, amount, rank
from (
select year, quat, orderid, amount,
dense_rank() over (partition by year, quat order by amount desc) rank
from tmp
) tmp1
where rank <= 10;
  • 6、求所有交易日中订单金额最高的前10位
1
2
3
4
5
6
7
8
9
10
11
12
13
-- topN问题: 1、基础数据 2、上排名函数 3、解决N的问题

with tmp as (
select A.dt, A.orderid, round(sum(B.amount), 2) amount
from sale A
join saledetail B on A.orderid=B.orderid
group by A.dt, A.orderid
)
select dt, orderid, amount,
rank from (select dt, orderid, amount, dense_rank() over(order by amount desc) rank
from tmp
) tmp1
where rank <= 10;
  • 7、每年度销售额最大的交易日
1
2
3
4
5
6
7
8
9
with tmp as (
select A.dt, round(sum(B.amount), 2) amount
from sale A
join saledetail B on A.orderid=B.orderid
group by A.dt
)
select year(dt) year, max(amount) dayamount
from tmp
group by year(dt);

备注:以上求解忽略了交易日,以下SQL更符合题意

1
2
3
4
5
6
7
8
9
10
11
12
13
with tmp as (
select dt, amount,
dense_rank() over (partition by year(dt) order by amount desc) as rank
from (
select A.dt, round(sum(B.amount), 2) amount
from sale A
join saledetail B on A.orderid=B.orderid
group by A.dt
) tab1
)
select year(dt) as year, dt, amount
from tmp
where rank=1;
  • 8、年度最畅销的商品(即每年销售金额最大的商品)
1
2
3
4
5
6
7
8
9
10
11
12
13
with tmp as (
select year(B.dt) year, goods, round(sum(amount),2) amount
from saledetail A
join sale B on A.orderid=B.orderid
group by year(B.dt), goods
)
select year, goods, amount
from (
select year, goods, amount,
dense_rank() over (partition by year order by amount desc) rank
from tmp
) tmp1
where rank = 1;