split_size    = max[${mapred.min.split.size},
                    min(${dfs.block.size}, ${mapred.max.split.size})]
num_Map_tasks = ${input.size} / ${split_size}
mapred.min.split.size is the minimum size of an input split.
mapred.max.split.size is the maximum size of an input split.
dfs.block.size is the HDFS block size.
hive> set dfs.block.size;
dfs.block.size is undefined
hive> set mapred.min.split.size;
mapred.min.split.size=1
hive> set mapred.max.split.size;
mapred.max.split.size=256000000
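As a concrete illustration, assume the common HDFS default block size of 128 MB (134217728 bytes) and a 10 GB input; both values are assumptions, not taken from the session above:

split_size    = max[1, min(134217728, 256000000)] = 134217728  (128 MB)
num_Map_tasks ≈ 10240 MB / 128 MB = 80

Lowering mapred.max.split.size below the block size produces more, smaller map tasks; raising mapred.min.split.size above it produces fewer, larger ones.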
num_Reduce_tasks = min[${hive.exec.reducers.max},
                       (${input.size} / ${hive.exec.reducers.bytes.per.reducer})]
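For a similar back-of-the-envelope estimate, assume an input of 10,000,000,000 bytes (~10 GB) and the defaults commonly cited for Hive of this era, hive.exec.reducers.bytes.per.reducer = 1000000000 and hive.exec.reducers.max = 999 (assumptions; check your own cluster):

num_Reduce_tasks = min[999, 10000000000 / 1000000000] = min[999, 10] = 10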
create table rc_file_test (col int) stored as rcfile;
set hive.exec.compress.output = true;
insert overwrite table rc_file_test
select * from source_table;
set hive.default.fileformat = SequenceFile;
set hive.exec.compress.output = true;
/* SequenceFile supports two compression types, record and block; block compression achieves a higher compression ratio */
set mapred.output.compression.type = BLOCK;
create table seq_file_test
as select * from source_table;
public synchronized boolean next(K key, V value) throws IOException
{
    Text tKey = (Text) key;
    Text tValue = (Text) value;
    if (!super.next(innerKey, innerValue))
        return false;
    Text inner_key = (Text) innerKey;     // created with createKey() in the constructor
    Text inner_value = (Text) innerValue; // created with createValue() in the constructor
    tKey.set(inner_key);
    tValue.set(inner_key.toString() + "\t" + inner_value.toString()); // note: choose your own delimiter
    return true;
}
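A hedged usage sketch: assuming the next() method above belongs to a custom RecordReader returned by a wrapper InputFormat (the class name com.example.TextKeySequenceFileInputFormat and the jar path below are hypothetical), the table can be declared so that the SequenceFile key surfaces as the first tab-delimited column:

add jar /path/to/custom-inputformat.jar;
create table seq_with_key_test (k string, v string)
row format delimited fields terminated by '\t'
stored as
  inputformat 'com.example.TextKeySequenceFileInputFormat'
  outputformat 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat';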
set mapred.job.reuse.jvm.num.tasks = 5;
/* Create an index on the id column of index_test_table */
create index idx on table index_test_table(id)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' with deferred rebuild;
alter index idx on index_test_table rebuild;
/* Prune the index: take the index table built above and filter it with the predicate your final query will use. */
/* Unlike an RDBMS, you cannot use the index right after building it; doing so fails with an error outright, which is part of what makes this feature cumbersome. */
create table my_index
as select _bucketname, `_offsets`
from default__index_test_table_idx__ where id = 10;
/* Now the index can be used; make sure the final query predicate matches the pruning predicate above */
set hive.index.compact.file = /user/hive/warehouse/my_index;
set hive.input.format = org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;
select count(*) from index_test_table where id = 10;
Replication join: copy one of the tables to every node, so that each node's split of the other table can be joined against the complete copy;
Repartition join: hash-redistribute both inputs by the join key, so that each node processes the join keys with the same hash value, i.e., performs a local join on its partition.
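A minimal illustration in Hive terms (fact_table and dim_table are hypothetical names): an ordinary join runs as a repartition join, while the mapjoin hint (or hive.auto.convert.join) asks Hive to replicate the small table into an in-memory hashtable on every mapper:

/* repartition join: both inputs are shuffled by dim_id */
select f.* from fact_table f join dim_table d on f.dim_id = d.dim_id;
/* replication (map) join: dim_table is broadcast to every map task */
select /*+ mapjoin(d) */ f.*
from fact_table f join dim_table d on f.dim_id = d.dim_id;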
create table map_join_test (id int)
clustered by (id) sorted by (id) into 32 buckets
stored as textfile;
set hive.enforce.bucketing = true;
insert overwrite table map_join_test
select * from map_join_source_data;
select /*+ mapjoin(a) */ count(*)
from map_join_test a
join map_join_test b on a.id = b.id;
2013-08-31 09:08:43 Starting to launch local task to process Map join; maximum memory = 1004929024
2013-08-31 09:08:45 Processing rows: 200000 Hashtable size: 199999 Memory usage: 38823016 rate: 0.039
2013-08-31 09:08:46 Processing rows: 300000 Hashtable size: 299999 Memory usage: 56166968 rate: 0.056
……
2013-08-31 09:12:39 Processing rows: 4900000 Hashtable size: 4899999 Memory usage: 896968104 rate: 0.893
2013-08-31 09:12:47 Processing rows: 5000000 Hashtable size: 4999999 Memory usage: 922733048 rate: 0.918
Execution failed with exit status: 2
Obtaining error information
Task failed!
Task ID:
Stage-4
set hive.optimize.bucketmapjoin = true;
2013-08-31 09:20:39 Starting to launch local task to process Map join; maximum memory = 1004929024
2013-08-31 09:20:41 Processing rows: 200000 Hashtable size: 199999 Memory usage: 38844832 rate: 0.039
2013-08-31 09:20:42 Processing rows: 275567 Hashtable size: 275567 Memory usage: 51873632 rate: 0.052
2013-08-31 09:20:42 Dump the hashtable into file: file:/tmp/hadoop/hive_2013-08-31_21-20-37_444_1135806892100127714/-local-10003/HashTable-Stage-1/MapJoin-a-10-000000_0.hashtable
2013-08-31 09:20:46 Upload 1 File to: file:/tmp/hadoop/hive_2013-08-31_21-20-37_444_1135806892100127714/-local-10003/HashTable-Stage-1/MapJoin-a-10-000000_0.hashtable File size: 11022975
2013-08-31 09:20:47 Processing rows: 300000 Hashtable size: 24432 Memory usage: 8470976 rate: 0.008
2013-08-31 09:20:47 Processing rows: 400000 Hashtable size: 124432 Memory usage: 25368080 rate: 0.025
2013-08-31 09:20:48 Processing rows: 500000 Hashtable size: 224432 Memory usage: 42968080 rate: 0.043
2013-08-31 09:20:49 Processing rows: 551527 Hashtable size: 275960 Memory usage: 52022488 rate: 0.052
2013-08-31 09:20:49 Dump the hashtable into file: file:/tmp/hadoop/hive_2013-08-31_21-20-37_444_1135806892100127714/-local-10003/HashTable-Stage-1/MapJoin-a-10-000001_0.hashtable
……
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
/* Before rewrite */
select a, count(distinct b) as c from tbl group by a;
/* After rewrite */
select a, count(*) as c
from (select distinct a, b from tbl) t group by a;
select a.* from logs a join users b on a.user_id = b.user_id;
set hive.optimize.skewjoin = true;
select t.* from
(
select a.*
from (select * from logs where user_id = 0) a
join (select * from users where user_id = 0) b
on a.user_id = b.user_id
union all
select a.*
from logs a join users b
on a.user_id <> 0 and a.user_id = b.user_id
) t;
select * from
(
select count(*) from logs
where log_date = 20130801 and item_id = 1
union all
select count(*) from logs
where log_date = 20130802 and item_id = 2
union all
select count(*) from logs
where log_date = 20130803 and item_id = 3
) t;
select count(*)
from
(select distinct user_id
from logs where page_name = 'a') a
join
(select distinct user_id
from logs where blog_owner = 'b') b
on a.user_id = b.user_id;
select count(*)
from (
  select user_id from logs
  group by user_id
  having count(case when page_name = 'a' then 1 end) > 0
     and count(case when blog_owner = 'b' then 1 end) > 0
) t;