1、缘由
本文主要涉及的是建表,不是计算指标。
2、mysql表
假设mysql中存在下面的八张表:
3、ods
ods层和源数据最好保持一致:
//创建用户表drop table if exists ods_user_info;create table ods_user_info( `id` string COMMENT '用户id', `name` string COMMENT '姓名', `birthday` string COMMENT '生日' , `gender` string COMMENT '性别', `email` string COMMENT '邮箱', `user_level` string COMMENT '用户等级', `create_time` string COMMENT '创建时间') COMMENT '用户信息'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_user_info/'tblproperties ("parquet.compression"="snappy")//创建订单表drop table if exists ods_order_info;create table ods_order_info ( `id` string COMMENT '订单编号', `total_amount` decimal(10,2) COMMENT '订单金额', `order_status` string COMMENT '订单状态', `user_id` string COMMENT '用户id' , `payment_way` string COMMENT '支付方式', `out_trade_no` string COMMENT '支付流水号', `create_time` string COMMENT '创建时间', `operate_time` string COMMENT '操作时间' ) COMMENT '订单表'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_order_info/'tblproperties ("parquet.compression"="snappy");//创建订单详情表drop table if exists ods_order_detail;create table ods_order_detail( `id` string COMMENT '订单编号', `order_id` string COMMENT '订单号', `user_id` string COMMENT '用户id' , `sku_id` string COMMENT '商品id', `sku_name` string COMMENT '商品名称', `order_price` string COMMENT '下单价格', `sku_num` string COMMENT '商品数量', `create_time` string COMMENT '创建时间') COMMENT '订单明细表'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_order_detail/'tblproperties ("parquet.compression"="snappy")//创建支付流水表drop table if exists `ods_payment_info`;create table `ods_payment_info`( `id` bigint COMMENT '编号', `out_trade_no` string COMMENT '对外业务编号', `order_id` string COMMENT '订单编号', `user_id` string COMMENT '用户编号', `alipay_trade_no` string COMMENT '支付宝交易流水编号', `total_amount` decimal(16,2) COMMENT '支付金额', `subject` string COMMENT '交易内容', `payment_type` string COMMENT '支付类型', `payment_time` string COMMENT '支付时间' ) COMMENT '支付流水表'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_payment_info/'tblproperties ("parquet.compression"="snappy");//创建商品一级、二级、三级分类表drop table if exists ods_base_category1;create table ods_base_category1( `id` string COMMENT 'id', `name` string COMMENT '名称') COMMENT '商品一级分类'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_base_category1/'tblproperties ("parquet.compression"="snappy");drop table if exists ods_base_category2;create external table ods_base_category2( `id` string COMMENT ' id', `name` string COMMENT '名称', category1_id string COMMENT '一级品类id') COMMENT '商品二级分类'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_base_category2/'tblproperties ("parquet.compression"="snappy");drop table if exists ods_base_category3;create table ods_base_category3( `id` string COMMENT ' id', `name` string COMMENT '名称', category2_id string COMMENT '二级品类id') COMMENT '商品三级分类'PARTITIONED BY ( `dt` string)row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_base_category3/'tblproperties ("parquet.compression"="snappy");
4、dwd
①将文件的存储格式改为parquet
②对三级商品进行降维
注:从下面的建表sql中可以看出,只有商品表的创建有变化
③当然在导数据的过程中,也可以进行ETL(去空,去脏数据等)
//创建用户表drop table if exists dwd_user_info;create external table dwd_user_info( .....) COMMENT ''PARTITIONED BY ( `dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_user_info/'tblproperties ("parquet.compression"="snappy")//创建订单表drop table if exists dwd_order_info;create external table dwd_order_info ( .....) COMMENT ''PARTITIONED BY ( `dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_order_info/'tblproperties ("parquet.compression"="snappy");//创建订单详情表drop table if exists dwd_order_detail;create external table dwd_order_detail( .....) COMMENT ''PARTITIONED BY ( `dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_order_detail/'tblproperties ("parquet.compression"="snappy"); //创建支付流水表drop table if exists `dwd_payment_info`;create external table `dwd_payment_info`( ..... ) COMMENT ''PARTITIONED BY ( `dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_payment_info/'tblproperties ("parquet.compression"="snappy")//创建商品表drop table if exists dwd_sku_info;create external table dwd_sku_info( `id` string COMMENT 'skuId', `spu_id` string COMMENT 'spuid', `price` decimal(10,2) COMMENT '' , `sku_name` string COMMENT '', `sku_desc` string COMMENT '', `weight` string COMMENT '', `tm_id` string COMMENT 'id', `category3_id` string COMMENT '1id', `category2_id` string COMMENT '2id', `category1_id` string COMMENT '3id', `category3_name` string COMMENT '3', `category2_name` string COMMENT '2', `category1_name` string COMMENT '1', `create_time` string COMMENT '') COMMENT ''PARTITIONED BY ( `dt` string)stored as parquetlocation '/warehouse/gmall/dwd/dwd_sku_info/'tblproperties ("parquet.compression"="snappy")
5、DWS
以每日为粒度,对用户这个实体进行轻度汇总,建立每日用户行为宽表
drop table if exists dws_user_action;
create external table dws_user_action (
user_id string comment '用户 id',
order_count bigint comment '下单次数 ',
order_amount decimal(16,2) comment '下单金额 ',
payment_count bigint comment '支付次数',
payment_amount decimal(16,2) comment '支付金额 ',
comment_count bigint comment '评论次数'
) COMMENT '每日用户行为宽表'
PARTITIONED BY ( `dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action/' tblproperties ("parquet.compression"="snappy");
6、ads层
①求GVM
定义:一定时间段内的成交总额。即用户下单金额:包括未付款、已付款。因为只要下单了,就说明已经有足够的吸引力了。
需求:单日GMV,单日订单数量、单日收入等思路:从需求可以看出:因为每天都要统计,所以在dm层(ads层)还需要一张表:
create table ads_gmv_sum_day( 'dt' string comment '统计日期', 'gmv_count' long comment '当日gmv个数', 'gmv_amount' decimal(16,2) comment '当日gmv订单总金额', 'gmv_payment' decimal(16,2) comment '当日支付金额' )comment '用户行为宽表' stored as parquet //存储格式 location '/warehouse/online_trade/ads/ads_gmv_sum_day' tblproperties("parquet.compression"="snappy") //压缩算法
②求转化率
新访问用户转化率 = 单日新访问设备数/日活数
新注册用户转化率 = 单日新注册用户数/日活数
新付费用户转化率 = 单日新付费用户数/日活数
付费用户转化率 = 单日消费用户数/单日日活数量
思路:需要在ads中建立一张表:
drop table if exists ads_user_convert_day;create table ads_user_convert_day('dt' string comment '统计日期','uv_count' bigint comment '当日日活','new_visitor_count' bigint comment '当日新增访问设备数量','new_regi_u_count' bigint comment '当日新增用户注册数量', //从user_info中的create_date ='2019-02-21''new_consume_u_count' bigint comment '当日新增用户消费数量''new_consume_count' bigint comment '当日消费数量')comment '每日活跃用户转化情况'partitioned by('dt' string) stored as parquet //存储格式location '/warehouse/online_trade/ads/ads_user_convert_day' tblproperties("parquet.compression"="snappy") //压缩算法
③用户行为漏斗分析
create table ads_event_convert_day( 'dt' string comment '统计日期', 'total_visitor_mcount' bigint comment '点击商品人数', 'visit2Item_convert_ratio' decimal(10,2) comment '首页到点击商品转化率', 'addcart_u_count' bigint comment '添加购物车人数', ........ comment '点击商品到添加购物车转化率', ........ comment '下单人数', ........ comment '购物车到下单转化率', ........ comment '支付人数', ........ comment '下单到支付的转化率' )comment '每日活跃用户转化情况' partitioned by('dt' string) stored as parquet //存储格式 location '/warehouse/online_trade/ads/ads_event_convert_day' tblproperties("parquet.compression"="snappy") //压缩算法