欢迎投稿

今日深度:

检查数据倾斜分布,数据倾斜分布

检查数据倾斜分布,数据倾斜分布



从传统数据库迁移到GP时,一个重要且经常被开发人员忽略的概念是数据分布:表的分布键如果设计不当,会导致严重的性能问题。以下函数可供开发人员及DBA检测一张表的数据倾斜情况。
-- Function: gpmg.data_skew(character varying)
 
-- DROP FUNCTION gpmg.data_skew(character varying);
 
CREATE OR REPLACE FUNCTION gpmg.data_skew(tablename character varying)
  RETURNS text AS
$BODY$
-- 2014-05-26, Gtlions: collect and report data-skew statistics for one table.
--
-- Parameter:
--   tablename  name (optionally schema-qualified) of the table to inspect.
-- Returns:
--   a one-line text report: number of primary segment instances, the ideal
--   per-segment share (1/segments), the table's total row count, and the
--   host name, row count and row share of the segments holding the largest
--   and the smallest share of rows.
-- Side effects:
--   draws one value from gpmg.commonseq and uses gpmg.commontab as a scratch
--   table; the scratch rows written under that sequence value are deleted
--   before returning.
-- Raises:
--   an error if tablename does not resolve to an existing relation.
--
-- NOTE(review): segments that hold zero rows of the table produce no
-- GROUP BY row, so they are not considered for the "minskew" figure. Confirm
-- whether such segments should be reported as the minimum.
declare
  v_id integer;               -- scratch-row key taken from gpmg.commonseq
  v_relname text;             -- validated, properly quoted relation name
  v_segs integer := 0;        -- number of primary segment instances
  v_totalnums bigint := 0;    -- total rows in the inspected table
  v_maxskew numeric := 0.0;   -- largest per-segment share of rows
  v_minskew numeric := 0.0;   -- smallest per-segment share of rows
  v_maxseg_id bigint;         -- segment id holding the largest share
  v_minseg_id bigint;         -- segment id holding the smallest share
  v_maxskew_seg text;         -- host name of v_maxseg_id
  v_minskew_seg text;         -- host name of v_minseg_id
  v_maxrows bigint := 0;      -- row count on v_maxseg_id
  v_minrows bigint := 0;      -- row count on v_minseg_id
  v_result text;
begin
  -- Resolve the name through regclass so that only an existing relation,
  -- correctly quoted, is ever spliced into the dynamic statement below.
  v_relname := $1::text::regclass;
  v_id := nextval('gpmg.commonseq');

  select into v_segs count(*) from gp_segment_configuration where role='p' and content<>-1;
  v_result := 'GP hava '||v_segs||' instances, Standard skew is '||1.0/v_segs||'. ';

  -- bg1 = segment id, bg2 = row count on that segment. ROLLUP adds one
  -- grand-total row whose bg1 is NULL.
  execute 'insert into gpmg.commontab(seq,tabname,bg1,bg2) select '
       || v_id || ',' || quote_literal($1)
       || ',gp_segment_id,count(*) from ' || v_relname
       || ' group by rollup((gp_segment_id))';

  -- The grand-total row always carries the largest count, so max(bg2) is the
  -- table's total row count.
  select into v_totalnums max(bg2) from gpmg.commontab where seq=v_id and tabname=$1;

  -- An empty table has no skew to report and would otherwise divide by zero.
  if coalesce(v_totalnums,0) = 0 then
    delete from gpmg.commontab where seq=v_id and tabname=$1;
    return v_result||'You Table ['||$1||'] skew info: [table_totalrows:0]';
  end if;

  -- nm1 = ideal share, nm2 = this segment's share, nm3 = |ideal - actual|.
  update gpmg.commontab
     set nm1=1::numeric/v_segs,
         nm2=bg2::numeric/v_totalnums,
         nm3=abs(1::numeric/v_segs-bg2::numeric/v_totalnums)
   where seq=v_id and tabname=$1;

  -- Pick the extreme segments in a single pass each; the bg1 tie-breaker
  -- keeps segment id, row count and share consistent with one another.
  select into v_maxseg_id,v_maxrows,v_maxskew bg1,bg2,nm2
    from gpmg.commontab
   where seq=v_id and tabname=$1 and bg1 is not null
   order by nm2 desc, bg1
   limit 1;
  select into v_minseg_id,v_minrows,v_minskew bg1,bg2,nm2
    from gpmg.commontab
   where seq=v_id and tabname=$1 and bg1 is not null
   order by nm2 asc, bg1
   limit 1;

  -- Translate the segment ids into the host names of their primaries.
  select into v_maxskew_seg hostname from gp_segment_configuration where role='p' and content=v_maxseg_id;
  select into v_minskew_seg hostname from gp_segment_configuration where role='p' and content=v_minseg_id;

  v_result := v_result||'You Table ['||$1||'] skew info: [table_totalrows:'||v_totalnums
           ||', maxskew:seg-'||v_maxskew_seg||', rows-'||v_maxrows||' '||v_maxskew
           ||', minskew:seg-'||v_minskew_seg||', rows-'||v_minrows||' '||v_minskew||']';

  -- Remove this call's scratch rows.
  delete from gpmg.commontab where seq=v_id and tabname=$1;
  return v_result;
end;
$BODY$
  LANGUAGE plpgsql VOLATILE;
-- Ownership and privileges: gpadmin owns the function; EXECUTE is granted to
-- PUBLIC and to gpadmin.
ALTER FUNCTION gpmg.data_skew(character varying) OWNER TO gpadmin;
GRANT EXECUTE ON FUNCTION gpmg.data_skew(character varying) TO public, gpadmin;

bigdatagp=# select gpmg.data_skew('gpmg.manager_table');
                                                                                                            data_skew                                                  
                                                           
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------
 GP hava 64 instances, Standard skew is 0.01562500000000000000. You Table [gpmg.manager_table] skew info: [table_totalrows:83, maxskew:seg-sdw16, rows-3 0.036144578313
25301205, minskew:seg-sdw2, rows-1 0.01204819277108433735]
(1 row)
 
bigdatagp=# select gpmg.data_skew('gpmg.func_log');
                                                                                                             data_skew                                                 
                                                             
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------
 GP hava 64 instances, Standard skew is 0.01562500000000000000. You Table [gpmg.func_log] skew info: [table_totalrows:53708, maxskew:seg-sdw10, rows-907 0.016887614508
08073285, minskew:seg-sdw7, rows-773 0.01439264169211290683]
(1 row)
2014-10-14 09:53:00


-EOF-

excel 数据倾斜显示

你只需要将该单元格内的文字设为斜体,不要让文本框倾斜。
 

GP4.2中怎样检查数据是否均匀分布

使用 SELECT gp_segment_id, count(*) FROM tab GROUP BY gp_segment_id 可以快速、简单地了解数据是否存放均匀。但是这种方法的缺陷在于:1、只能判断存储是否均匀,不能判断数据处理是否均匀;2、判断的标准不是很直观,还需要用户进行一些额外计算才能进行准确判断。从GP4.2开始,GP提供如下两个视图,可以帮助客户更方便、准确地判断数据分布键选择是否合理。

gp_toolkit.gp_skew_coefficients -- 它通过提供每个segment存储表数据的变异系数,帮助客户判断表数据在各个segment上的存储是否均匀。
列 / 描述:
skcoid:表的对象ID。
skcnamespace:表的命名空间(The namespace where the table is defined)。
skcrelname:表名。
skccoeff:变异系数,即各segment存储记录数的标准差除以均值。这个值越低说明数据存放越均匀,反之说明数据存储分布不均匀,要考虑分布键选择是否合理。

gp_toolkit.gp_skew_idle_fractions -- 通过计算表扫描过程中系统闲置的百分比,帮助用户快速判断是否存在分布键选择不合理、导致数据处理倾斜的问题。
列名 / 描述:
sifoid:表对象ID。
sifnamespace:表定义的命名空间。
sifrelname:表名。
siffraction:表扫描过程中系统闲置的百分比。比如0.1表示10%的倾斜。
 

www.htsjk.Com true http://www.htsjk.com/shujukunews/3859.html NewsArticle 检查数据倾斜分布,数据倾斜分布 从传统数据库迁移到GP中一个重要的且经常被开发人员忽略的概念是数据分布,没有良好的设计表的分布键会导致严重的性能问题,以下函数将给开发...
相关文章
    暂无相关文章
评论暂时关闭