HIVE的学习笔记,
hive原来是Hadoop开源项目中的一个子项目,后来独立出来成为Apache的顶级项目,不过安装时还是要指定hadoop路径:export HADOOP_HOME=<%YOUR HADOOP DIR%>,看看都有些什么命令吧:
hive> CREATE TABLE pokes (foo INT, bar STRING);//建表
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);//创建可分区表
hive> SHOW TABLES;//查有哪些表
hive> SHOW TABLES '.*s';//查以's'结尾的表
hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);//添加列(注意是列不是行)
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');//添加带注释的列(COMMENT用于给该列附加一段说明文字)
hive> ALTER TABLE events RENAME TO 3koobecaf;//改表名
hive> DROP TABLE pokes;//删除表
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;//给目标表加载数据
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
hive> LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');//将数据导入到表的分区中
hive> SET mapred.job.tracker=myhost.mycompany.com:50030
hive> SET -v //列出当前全部属性设置(上一条SET用于设置单个属性)
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='';
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;//将查询结果导出到HDFS目录或本地目录
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;//连接操作
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(1) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(1) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;//group by
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;//单表复合操作
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';//流式操作:TRANSFORM把每行数据交给USING指定的外部脚本处理(这里用/bin/cat原样输出)
接下来研究一下源代码了。