DataX Data Export

  • Create the corresponding table in MySQL

    drop table if exists dwads.ads_ad_show_place;
    create table dwads.ads_ad_show_place(
        ad_action tinyint,
        hour varchar(2),
        place varchar(20),
        product_id int,
        cnt int,
        dt varchar(10)
    );
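
    The structure can be verified before wiring up DataX, for example:

    desc dwads.ads_ad_show_place;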
  • Create the configuration file (JSON)

    Create /data/lagoudw/script/advertisement/ads_ad_show_place.json

    {
        "job": {
            "setting": {
                "speed": {
                    "channel": 1
                }
            },
            "content": [
                {
                    "reader": {
                        "name": "hdfsreader",
                        "parameter": {
                            "path": "/user/hive/warehouse/ads.db/ads_ad_show_place/dt=$do_date/*",
                            "defaultFS": "hdfs://192.168.91.121:9000",
                            "column": [
                                { "index": 0, "type": "string" },
                                { "index": 1, "type": "string" },
                                { "index": 2, "type": "string" },
                                { "index": 3, "type": "string" },
                                { "index": 4, "type": "string" },
                                { "type": "string", "value": "$do_date" }
                            ],
                            "fileType": "text",
                            "encoding": "UTF-8",
                            "fieldDelimiter": ","
                        }
                    },
                    "writer": {
                        "name": "mysqlwriter",
                        "parameter": {
                            "writeMode": "insert",
                            "username": "hive",
                            "password": "12345678",
                            "column": [
                                "ad_action",
                                "hour",
                                "place",
                                "product_id",
                                "cnt",
                                "dt"
                            ],
                            "preSql": [
                                "delete from ads_ad_show_place where dt='$do_date'"
                            ],
                            "connection": [
                                {
                                    "jdbcUrl": "jdbc:mysql://192.168.91.123:3306/dwads?useUnicode=true&characterEncoding=utf-8",
                                    "table": [
                                        "ads_ad_show_place"
                                    ]
                                }
                            ]
                        }
                    }
                }
            ]
        }
    }
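
    The file is plain JSON (the $do_date placeholders sit inside string values), so it can be syntax-checked before handing it to DataX, for example:

    python -m json.tool < /data/lagoudw/script/advertisement/ads_ad_show_place.json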
  • Run the command with the JSON config file to test it

    python /data/modules/datax/bin/datax.py -p "-Ddo_date=2020-07-20" /data/lagoudw/script/advertisement/ads_ad_show_place.json
  • Write the execution script (shell)

    Create /data/lagoudw/script/advertisement/ads_ad_show_place.sh

    #!/bin/bash
    source /etc/profile
    JSON=/data/lagoudw/script

    # Use the given date if provided; otherwise default to yesterday
    if [ -n "$1" ]
    then
        do_date=$1
    else
        do_date=`date -d "-1 day" +%F`
    fi

    python $DATAX_HOME/bin/datax.py -p "-Ddo_date=$do_date" $JSON/advertisement/ads_ad_show_place.json
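
    The script assumes DATAX_HOME is exported by /etc/profile. A minimal sketch of that entry, inferred from the install path used in the test command above (adjust if DataX lives elsewhere):

    # hypothetical /etc/profile entry; path taken from the earlier test command
    export DATAX_HOME=/data/modules/datax
    export PATH=$PATH:$DATAX_HOME/bin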
  • Test the shell script

    sh /data/lagoudw/script/advertisement/ads_ad_show_place.sh 2020-07-20
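
    After the job finishes, a quick sanity check in MySQL (a sketch; the expected counts depend on your data):

    -- verify that the partition for 2020-07-20 landed in MySQL
    select dt, count(*) as cnt
    from dwads.ads_ad_show_place
    where dt = '2020-07-20'
    group by dt;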

High-Fidelity Simulated Log Data Test

  • Data collection

    Roughly 10 million daily active users

    At about 30 log entries per user per day, that is 300 million event logs in total

    Each log entry is roughly 650 bytes

    Total data volume is therefore about 180 GB (300,000,000 × 650 bytes ≈ 195 × 10^9 bytes ≈ 181 GiB)

    Collecting the data takes about 2.5 hours

  • 1. Clean up the environment

  • 2. Start Flume

    nohup flume-ng agent --conf /opt/apps/flume-1.9/conf --conf-file /data/lagoudw/conf/flume-log2hdfs4.conf --name a1 -Dflume.root.logger=INFO,console &

    Because the log files are large, the HDFS file roll size can be set to 10 GB or even larger; a sketch of the relevant sink settings follows.
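
    A minimal sketch, assuming the agent and sink in flume-log2hdfs4.conf are named a1 and k1 (the actual names in that file may differ):

    # roll HDFS files by size only: one file per 10 GB (names a1/k1 are assumptions)
    a1.sinks.k1.hdfs.rollSize = 10737418240
    a1.sinks.k1.hdfs.rollCount = 0
    a1.sinks.k1.hdfs.rollInterval = 0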

  • 3. Generate the logs

    cd /data/lagoudw/jars
    nohup java -cp data-generator-1.1-SNAPSHOT-jar-with-dependencies.jar com.lagou.ecommerce.AppEvent 300000000 2020-08-03 > /data/lagoudw/logs/event/eventlog0803.log &
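
    Generating 300 million entries takes a while; the file can be watched while it grows (a sketch, using the output path above):

    # file size and line count approximate how many events have been written
    ls -lh /data/lagoudw/logs/event/eventlog0803.log
    wc -l /data/lagoudw/logs/event/eventlog0803.log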
  • 4. Run the scripts (a combined driver sketch follows the list)

    sh ods_load_event_log.sh 2020-07-20

    sh dwd_load_event_log.sh 2020-07-20
    sh dwd_load_ad_log.sh 2020-07-20

    sh ads_load_ad_show.sh 2020-07-20
    sh ads_load_ad_show_rate.sh 2020-07-20
    sh ads_load_ad_show_page.sh 2020-07-20
    sh ads_load_ad_show_page_window.sh 2020-07-20
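
    Since these steps always run in the same order, they can be wrapped in one driver script (a sketch; it assumes all scripts sit in the current directory and take the date as their first argument):

    #!/bin/bash
    # hypothetical driver: run the ODS -> DWD -> ADS chain for one date
    do_date=${1:-`date -d "-1 day" +%F`}

    sh ods_load_event_log.sh $do_date

    sh dwd_load_event_log.sh $do_date
    sh dwd_load_ad_log.sh $do_date

    sh ads_load_ad_show.sh $do_date
    sh ads_load_ad_show_rate.sh $do_date
    sh ads_load_ad_show_page.sh $do_date
    sh ads_load_ad_show_page_window.sh $do_date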