阿里妹导读
文章导读
一、场景描述
二、常见的优化方法
2.1.Mapjoin
SELECT /*+MAPJOIN(dim)*/ *
FROM (SELECT * FROM dwd_tbl) base
LEFT OUTER JOIN (SELECT * FROM dim_tbl) dim
ON base.dim_key = dim.dim_key
2.2.特殊值/空值打散
SELECT *
FROM (SELECT * FROM dwd_tbl) base
LEFT OUTER JOIN (SELECT * FROM dim_tbl) dim
ON IF(COALESCE(base.dim_key,'')='',CONCAT('HIVE_',RAND()),base.dim_key) = dim.dim_key
2.3.热点值打散,副表呈倍数扩散
SELECT *
FROM (
SELECT *,CAST(RAND()*10 AS BIGINT) AS ext_a
FROM dwd_tbl
) base
LEFT OUTER JOIN (
SELECT *
FROM dim_tbl
LATERAL VIEW EXPLODE(SPLIT('0;1;2;3;4;5;6;7;8;9',';')) tt AS ext_b
-- 或者Join一个用于倍数膨胀的小表
) dim
ON base.dim_key = dim.dim_key
AND base.ext_a = dim.ext_b
2.4.热点数据单独处理/SkewJoin
-- Step01:热点数据记录提取
INSERT OVERWRITE TABLE tmp_hot_list PARTITION (dt = '${bizdate}')
SELECT dim_shop_id AS hot_id
FROM main_table
WHERE dt = '${bizdate}'
GROUP BY dim_shop_id
HAVING COUNT(1) > 10000
;
INSERT OVERWRITE TABLE final_result_table PARTITION (dt = '${bizdate}')
-- Step02:热点数据处理,使用MapJoin完成处理
SELECT /*+MAPJOIN(a2,a3)*/
a1.trade_no AS trade_no
,a1.dim_shop_id AS shop_id
,a3.shop_name AS shop_name
,a3.shop_type AS shop_type
FROM (SELECT * FROM main_table WHERE dt = '${bizdate}') a1
-- Step02-1:主表用JOIN关联热点表进行热点记录筛选
JOIN (SELECT * FROM tmp_hot_list WHERE dt = '${bizdate}') a2 -- 热点数据清单
ON a1.dim_shop_id = a2.dim_shop_id
-- Step02-2:热点维度数据处理
LEFT OUTER JOIN (
SELECT /*+MAPJOIN(b2)*/ b1.*
FROM (SELECT * FROM dim_table_info WHERE dt = '${bizdate}') b1
JOIN (SELECT * FROM tmp_hot_list WHERE dt = '${bizdate}') b2 -- 热点数据清单
ON b1.dim_shop_id = b2.dim_shop_id
) a3
ON a1.dim_shop_id = a3.dim_shop_id
UNION ALL
-- Step03:非热点数据处理,使用普通Join完成处理,两张表均需要进行Shuffle
SELECT /*+MAPJOIN(a12)*/
a11.trade_no AS trade_no
,a11.dim_shop_id AS shop_id
,a13.shop_name AS shop_name
,a13.shop_type AS shop_type
FROM (SELECT * FROM main_table WHERE dt = '${bizdate}') a11
-- Step03-1:主表用ANTI JOIN关联热点表进行剔除
LEFT ANTI JOIN (SELECT * FROM tmp_hot_list WHERE dt = '${bizdate}') a12
ON a11.dim_shop_id = a12.dim_shop_id
-- Step03-2:非热点维度数据处理
LEFT OUTER JOIN (
SELECT /*+MAPJOIN(b12)*/ b11.*
FROM (SELECT * FROM dim_table_info WHERE dt = '${bizdate}') b11
LEFT ANTI JOIN (SELECT * FROM tmp_hot_list WHERE dt = '${bizdate}') b12
ON b11.dim_shop_id = b12.dim_shop_id
) a13
ON a11.dim_shop_id = a13.dim_shop_id
;
INSERT OVERWRITE TABLE final_result_table PARTITION (dt = '${bizdate}')
SELECT /*+SKEWJOIN(a1)*/
a1.trade_no AS trade_no
,a1.dim_shop_id AS shop_id
,a2.shop_name AS shop_name
,a2.shop_type AS shop_type
FROM (SELECT * FROM main_table WHERE dt = '${bizdate}') a1
LEFT JOIN (SELECT * FROM dim_table_info WHERE dt = '${bizdate}') a2
ON a1.dim_shop_id = a2.dim_shop_id
;
2.5.方案总结
三、一种新的思路 WithDistmapjoin~
3.1.核心思路
3.2.代码实现
WITH
-- STEP01:热点Key采集
tmp_hot_pid AS (
SELECT dim_shop_id,'Y' AS is_hot
FROM main_table_detail
WHERE dt = '${bizdate}'
GROUP BY dim_shop_id
HAVING COUNT(1) > 100000
)
-- STEP02:维表热点数据打标
,tmp_dim_tbl AS (
SELECT /*+MAPJOIN(hot)*/
dim.*
,COALESCE(hot.is_hot,'N') AS is_hot
FROM (
SELECT *
FROM dim_table_info
WHERE dt = '${bizdate}'
) dim
LEFT OUTER JOIN tmp_hot_pid hot
ON dim.dim_shop_id = hot.dim_shop_id
)
-- STEP03:明细热点数据打标
,tmp_dwd_tbl AS (
SELECT /*+MAPJOIN(hot)*/
base.*
,COALESCE(hot.is_hot,'N') AS is_hot
FROM (
SELECT *
FROM main_table_detail
WHERE dt = '${bizdate}'
) base
LEFT OUTER JOIN tmp_hot_pid hot
ON base.dim_shop_id = hot.dim_shop_id
)
-- STEP04:数据合并处理,热点数据用Mapjoin,非热点数据用DISTMAPJOIN
INSERT OVERWRITE TABLE final_result_table PARTITION (dt = '${bizdate}')
SELECT *
FROM (
-- STEP04-1:非热点数据用DISTMAPJOIN
SELECT /*+ DISTMAPJOIN(dim(shard_count=77)) */
dwd_tbl.trade_no AS trade_no
,dwd_tbl.trade_date AS trade_date
,dwd_tbl.shop_id AS shop_id
,dim.shop_name AS shop_name
,dim.shop_type AS shop_type
FROM (SELECT * FROM tmp_dwd_tbl WHERE is_hot = 'N') dwd_tbl
LEFT OUTER JOIN (SELECT * FROM tmp_dim_tbl WHERE is_hot = 'N') dim
ON dwd_tbl.partner_id = dim.partner_id
UNION ALL
-- STEP04-1:热点数据用Mapjoin
SELECT /*+MAPJOIN(dim)*/
dwd_tbl.trade_no AS trade_no
,dwd_tbl.trade_date AS trade_date
,dwd_tbl.shop_id AS shop_id
,dim.shop_name AS shop_name
,dim.shop_type AS shop_type
FROM (SELECT *FROM tmp_dwd_tbl WHERE is_hot = 'Y') dwd_tbl
LEFT OUTER JOIN (SELECT *FROM tmp_dim_tbl WHERE is_hot = 'Y') dim
ON dwd_tbl.partner_id = dim.partner_id
) base
;
3.3.真实效果
四、方案总结