Hive学习笔记(16)
1 项目思路
* 数据存储格式 orcfile /parquet
* 数据压缩
* map output 数据压缩 snappy
* 外部表
* 分区表
2 实战
-- Raw web access log table. Every field is kept as a string and is extracted
-- from each log line by RegexSerDe, one capture group per column in column
-- order (11 groups for the 11 columns below).
-- Fix: the DROP used to target the misspelled database name defalut, so the
-- table in the default database was never dropped and the CREATE ... IF NOT
-- EXISTS that follows would silently keep a stale definition.
DROP TABLE IF EXISTS default.web_log_src;
CREATE TABLE IF NOT EXISTS default.web_log_src (
    remote_addr          STRING,
    remote_user          STRING,
    time_local           STRING,
    request              STRING,
    status               STRING,
    body_bytes_sent      STRING,
    request_body         STRING,
    http_referer         STRING,
    http_user_agent      STRING,
    http_x_forwarded_for STRING,
    host                 STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;
- 加载数据
- https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-ApacheWeblogData
- https://c.runoob.com/front-end/854
- https://issues.apache.org/jira/browse/HIVE-662
- https://www.cnblogs.com/cxchanpin/p/6911286.html
正则网站
https://www.regexpal.com/
http://www.txt2re.com/
-- Load the access log from the local filesystem (LOCAL) into the raw source
-- table. OVERWRITE is not specified, so this is an append-style load.
LOAD DATA LOCAL INPATH '/home/hadoop/tempdata/webfb.access.log'
INTO TABLE default.web_log_src;
- 创建子表
-- Narrow sub-table holding four columns of the raw log, stored as ORC with
-- Snappy compression.
-- NOTE(review): the delimited row format is kept exactly as in the original,
-- but it presumably has no effect on ORC storage - confirm before removing.
DROP TABLE IF EXISTS default.web_log_comm;
CREATE TABLE IF NOT EXISTS default.web_log_comm (
    remote_addr  STRING,
    time_local   STRING,
    request      STRING,
    http_referer STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS ORC
TBLPROPERTIES ("orc.compress" = "SNAPPY");
- 为子表导入数据
-- Populate the ORC sub-table with the four columns it keeps from the raw log.
-- INSERT INTO appends to the target table rather than replacing its contents.
INSERT INTO TABLE default.web_log_comm
SELECT
    remote_addr,
    time_local,
    request,
    http_referer
FROM default.web_log_src;
本站文章为和通数据库网友分享或者投稿,欢迎任何形式的转载,但请务必注明出处.
同时文章内容如有侵犯了您的权益,请联系QQ:970679559,我们会尽快处理。