HBase 的热点问题及 rowkey 的散列原则
这是我直接根据 0-9 进行 partition 预分区,然后以 1-10000 为 rowkey 进行插入。我想象中的是:起始位分布都差不多,按高位字典序排序,按道理来说是很完美的。但结果也"确实很完美"——完美地造成了热点,还是最热的那种,心里一万个不愿意,但是没办法。这说明如果要按照数字进行预分区,应该分得细粒度一点,比如按 1-1000、1000-2000……9000-10000 这样来分。但这种只适合测试,真实数据跑起来是不行的,还是得靠 hash、md5 等散列函数来实现真正的分布均匀。
看下图,照样是我用 1-10000 为 rowkey 进行数据 put,结果也是很完美的——这次是确实完美。不过,如果数据量真的非常非常大,region 进行了多次 split,那么它还是会在不久之后出现热点问题;应付一般的数据量是没问题的。所以应该提前考虑到数据的增量问题,对分区进行合理的设计。
1、进行建表的时候,对表进行预分区
package make.hbase.com;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MD5Hash;
/**
 * Computes pre-split keys for an HBase table by sampling hashed rowkeys.
 *
 * <p>It generates {@code baseRecord} random hashed rowkeys, sorts them, and
 * picks evenly spaced samples as region boundaries, so the table starts out
 * with {@code prepareRegions} regions and sequential writes do not hotspot
 * a single region.
 */
public class hash_example {

    /** Number of random sample keys used to estimate the split points. */
    private int baseRecord;
    /** Generator producing hashed rowkeys for sampling. */
    private RowKeyGenerator rkGen;
    /** Sample stride between split keys: baseRecord / prepareRegions. */
    private int splitKeysBase;
    /** Number of split keys (region count minus one). */
    private int splitKeysNumber;
    /** Split keys computed from the sample. */
    private byte[][] splitKeys;

    /**
     * @param baseRecord     how many sample keys to generate
     * @param prepareRegions how many regions the table should be pre-split into
     */
    public hash_example(int baseRecord, int prepareRegions) {
        this.baseRecord = baseRecord;
        rkGen = new HashRowKeyGenerator();
        // N regions require N-1 boundary keys.
        splitKeysNumber = prepareRegions - 1;
        splitKeysBase = baseRecord / prepareRegions;
    }

    /**
     * Samples {@code baseRecord} hashed rowkeys and picks every
     * {@code splitKeysBase}-th key (in sorted order) as a region boundary.
     *
     * @return the split keys, suitable for {@code createTable(desc, splitKeys)}
     */
    public byte[][] calcSplitKeys() {
        splitKeys = new byte[splitKeysNumber][];
        // TreeSet with BYTES_COMPARATOR keeps the sampled keys sorted
        // the same way HBase orders rowkeys.
        TreeSet<byte[]> rows = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
        for (int i = 0; i < baseRecord; i++) {
            rows.add(rkGen.nextId());
        }
        int pointer = 0;
        int index = 0;
        Iterator<byte[]> rowKeyIter = rows.iterator();
        while (rowKeyIter.hasNext()) {
            byte[] tempRow = rowKeyIter.next();
            // Release entries as we go to keep memory bounded for large samples.
            rowKeyIter.remove();
            if (pointer != 0 && pointer % splitKeysBase == 0 && index < splitKeysNumber) {
                splitKeys[index] = tempRow;
                index++;
            }
            pointer++;
        }
        rows.clear();
        rows = null;
        return splitKeys;
    }

    /** Strategy interface for rowkey generation. */
    public interface RowKeyGenerator {
        byte[] nextId();
    }

    /**
     * Generates rowkeys of the form: first 8 hex chars of
     * MD5(low-4-bytes(id) + low-4-bytes(pseudo-random timestamp)), followed by
     * the full 8-byte sequential id. The hash prefix spreads keys across
     * regions while the id suffix keeps every key unique.
     *
     * <p>Declared static: it reads no state of the enclosing instance, so it
     * should not hold a hidden reference to it.
     */
    public static class HashRowKeyGenerator implements RowKeyGenerator {
        private long currentId = 1;
        private long currentTime = System.currentTimeMillis();
        private Random random = new Random();

        @Override
        public byte[] nextId() {
            try {
                // Random-walk the timestamp so successive keys differ even
                // when generated within the same millisecond.
                currentTime += random.nextInt(1000);
                byte[] lowT = Bytes.copy(Bytes.toBytes(currentTime), 4, 4);
                byte[] lowU = Bytes.copy(Bytes.toBytes(currentId), 4, 4);
                return Bytes.add(MD5Hash.getMD5AsHex(Bytes.add(lowU, lowT))
                        .substring(0, 8).getBytes(), Bytes.toBytes(currentId));
            } finally {
                // Always advance the id, even if key construction throws.
                currentId++;
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // 1,000,000 samples, pre-split into 20 regions.
        hash_example worker = new hash_example(1000000, 20);
        byte[][] splitKeys = worker.calcSplitKeys();
        String tbname = "hbase_hashrowkey2";
        String colfamily = "info";
        // Load cluster configuration.
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "make.hadoop.com");
        HBaseAdmin admin = new HBaseAdmin(conf);
        try {
            TableName tableName = TableName.valueOf(tbname);
            // Drop a stale copy of the table so createTable below succeeds.
            if (admin.tableExists(tableName)) {
                try {
                    admin.disableTable(tableName);
                } catch (Exception ignored) {
                    // Best effort: the table may already be disabled.
                }
                admin.deleteTable(tableName);
            }
            HTableDescriptor tableDesc = new HTableDescriptor(tableName);
            HColumnDescriptor columnDesc = new HColumnDescriptor(
                    Bytes.toBytes(colfamily));
            columnDesc.setMaxVersions(1);
            tableDesc.addFamily(columnDesc);
            // Create the table pre-split at the sampled boundaries.
            admin.createTable(tableDesc, splitKeys);
        } finally {
            // Release the admin connection even if table creation fails.
            admin.close();
        }
    }
}
2、对顺序id进行插入测试
/**
 * Inserts 9999 rows (i = 1..9999) whose rowkeys carry an MD5-derived prefix,
 * to verify that writes spread evenly across the pre-split regions.
 *
 * <p>NOTE(review): {@code GetTable_test}, {@code colfamily} and {@code column}
 * are defined elsewhere in the enclosing test class.
 */
@Test
public void put_hash() throws MasterNotRunningException,
        ZooKeeperConnectionException, IOException {
    HTable table = GetTable_test("rwokry_partition");
    long currentTime = System.currentTimeMillis();
    Random random = new Random();
    try {
        // Simulate inserting the sequential ids 1..9999.
        for (int i = 1; i < 10000; i++) {
            // Random-walk the timestamp, matching the split-key generator.
            currentTime += random.nextInt(1000);
            byte[] lowT = Bytes.copy(Bytes.toBytes(currentTime), 4, 4);
            byte[] lowI = Bytes.copy(Bytes.toBytes((long) i), 4, 4);
            // Rowkey = 8 hex chars of MD5(id-low-bytes + time-low-bytes) + raw id.
            byte[] rowkey = Bytes.add(
                    MD5Hash.getMD5AsHex(Bytes.add(lowI, lowT)).substring(0, 8)
                            .getBytes(),
                    Bytes.toBytes(i));
            Put put = new Put(rowkey);
            put.add(Bytes.toBytes(colfamily), Bytes.toBytes(column),
                    Bytes.toBytes("make"));
            table.put(put);
        }
    } finally {
        // Close the table even if a put fails, so the connection is not leaked.
        table.close();
    }
    System.err.println("数据插入成功");
}我们上UI看请求结果,确实做到了负载均衡,防止了热点的问题
其中我们查看一下rowkey,也是符合长度原则的,这里rowkey达到了12个字节,应该是正常的,其实实现这个的代码也是挺简单的,参考:
本站文章为和通数据库网友分享或者投稿,欢迎任何形式的转载,但请务必注明出处.
同时文章内容如有侵犯了您的权益,请联系QQ:970679559,我们会在尽快处理。