1
2020-08-02 18:19:32.966 [main] INFO com.lagou.ecommerce.AppStart - {"app_active": {"name":"app_active","json": {"entry":"1","action":"1","error_code":"0"},"time":159630958586 1},"attr":{"area":"绍 兴","uid":"2F10092A10","app_v":"1.1.16","event_type":"common"," device_id":"1FB872- 9A10010","os_type":"3.0","channel":"ML","language":"chinese","b rand":"Huawei-2"}}

主要任务:ODS(包含json串) => DWD

json数据解析,丢弃无用数据(数据清洗),保留有效信息,并将数据展开,形成每日启动明细表。

创建DWD层表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
use DWD;

drop table if exists dwd.dwd_start_log;

CREATE TABLE dwd.dwd_start_log(
`device_id` string,
`area` string,
`uid` string,
`app_v` string,
`event_type` string,
`os_type` string,
`channel` string,
`language` string,
`brand` string,
`entry` string,
`action` string,
`error_code` string
)
PARTITIONED BY (dt string)
STORED AS parquet;

表的格式:parquet、分区表

加载DWD层数据

创建script/member_active/dwd_load_start.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/bin/bash
# 可以输入日期;如果未输入日期取昨天的时间
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi

# 定义要执行的SQL
sql="
with tmp as(
select split(str, ' ')[7] line
from ods.ods_start_log
where dt='$do_date'
)
insert overwrite table dwd.dwd_start_log partition(dt='$do_date')
select get_json_object(line, '$.attr.device_id'),
get_json_object(line, '$.attr.area'),
get_json_object(line, '$.attr.uid'),
get_json_object(line, '$.attr.app_v'),
get_json_object(line, '$.attr.event_type'),
get_json_object(line, '$.attr.os_type'),
get_json_object(line, '$.attr.channel'),
get_json_object(line, '$.attr.language'),
get_json_object(line, '$.attr.brand'),
get_json_object(line, '$.app_active.json.entry'),
get_json_object(line, '$.app_active.json.action'),
get_json_object(line, '$.app_active.json.error_code')
from tmp;
"

hive -e "$sql"

日志文件 =》 Flume =》 HDFS =》 ODS =》 DWD

ODS =》 DWD;json数据的解析;数据清洗

下一步任务:DWD(会员的每日启动信息明细) => DWS(如何建表,如何加载数据)

活跃会员 ===> 新增会员 ===> 会员留存