Hadoop中的MR的demo(Hadoop MR demo)
1、 jar包: 官网的share/hadoop下面的所有目录中的jar包
2、 目录结构 (没有添加配置文件)
3 、代码如下
package com.download;
import java.io.IOException;
import java.net.URLDecoder;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Word-count MapReduce demo that is submitted to a YARN cluster through {@link ToolRunner}.
 *
 * <p>The job reads text from {@code /hadoop_input}, counts whitespace-separated tokens and
 * writes the totals to {@code /hadoop_output}. An existing output directory is deleted
 * before the job is submitted.
 */
public class WordCount extends Configured implements Tool {

    /**
     * Emits {@code (token, 1)} for every whitespace-separated token of each input line.
     *
     * <p>Note from the original author: the Mapper interface in the {@code mapred} package is
     * the old (pre-0.20) API and is no longer used; this class uses the {@code mapreduce} API.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        // Reused for every token to avoid allocating a new Text per output record.
        private final Text word = new Text();

        /**
         * Splits the line into tokens and writes each one with a count of 1.
         *
         * @param key input key supplied by the input format (not used)
         * @param value one line of input text
         * @param context sink for the {@code (token, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, ONE);
            }
        }
    }

    /**
     * Sums all counts received for a word and emits {@code (word, total)}.
     *
     * <p>Note from the original author: the Reducer interface in the {@code mapred} package is
     * the old (pre-0.20) API and is no longer used; this class uses the {@code mapreduce} API.
     */
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * @param text the word
         * @param iterable the counts collected for that word
         * @param context sink for the {@code (word, total)} pair
         */
        @Override
        protected void reduce(Text text, Iterable<IntWritable> iterable,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : iterable) {
                sum += value.get();
            }
            context.write(text, new IntWritable(sum));
        }
    }

    /**
     * Combiner registered on the job; performs the same summation as {@link Reduce} and
     * emits one partial total per key it is invoked with.
     */
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        /**
         * @param key the word
         * @param values the counts collected for that word
         * @param context sink for the {@code (word, partialTotal)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    /** Assigns a key to a partition based on its hash code. */
    public static class MyPartitioner extends Partitioner<Text, IntWritable> {
        /**
         * @param key the word being partitioned
         * @param value the count attached to the word (not used)
         * @param numPartition total number of partitions
         * @return a partition index in the range {@code [0, numPartition)}
         */
        @Override
        public int getPartition(Text key, IntWritable value, int numPartition) {
            // Clear the sign bit so the hash is non-negative, then take the remainder.
            // The original used '&' instead of '%', which can return numPartition itself
            // (out of range) and leaves most partitions unused.
            return (key.hashCode() & Integer.MAX_VALUE) % numPartition;
        }
    }

    /** Path filter that accepts every path. */
    public static class MyPathFilter implements PathFilter {
        @Override
        public boolean accept(Path arg0) {
            return true;
        }
    }

    /**
     * Configures the cluster connection, builds the word-count job and waits for it to finish.
     *
     * @param args command-line arguments (not used)
     * @return 0 when the job succeeds, 1 otherwise
     * @throws IllegalStateException if the classpath root cannot be located
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Start from the configuration injected into this tool so externally supplied
        // settings are kept; fall back to a fresh one when run() is called directly.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }

        // Class.getResource("/") resolves against the classpath root. The original called
        // ClassLoader.getResource("/"), which returns null on the standard application
        // class loader and therefore failed with a NullPointerException.
        java.net.URL classpathRoot = WordCount.class.getResource("/");
        if (classpathRoot == null) {
            throw new IllegalStateException("Cannot locate the classpath root that should contain hadoop.jar");
        }
        String path = URLDecoder.decode(classpathRoot.getPath(), "UTF-8");

        // If no configuration files are placed in the project, the settings can be made
        // directly on the Configuration (the original author notes this was not tested).
        conf.setBoolean("mapreduce.app-submission.cross-platform", true); // enable cross-platform job submission
        conf.set("fs.defaultFS", "hdfs://namenode:8020"); // namenode
        conf.set("mapreduce.framework.name", "yarn"); // run on the YARN framework
        conf.set("yarn.resourcemanager.address", "resourcenode:8032"); // resource manager
        conf.set("yarn.resourcemanager.scheduler.address", "schedulerNode:8030"); // scheduler
        conf.set("mapreduce.jobhistory.address", "jobhistorynode:10020"); // job history server

        // To export the job and run it manually on Linux, remove the next statement
        // (otherwise the jar is reported as not found) and call job.setJarByClass() instead.
        // After the MR code is written, export it as a jar and place it in the root directory.
        conf.set("mapred.jar", path + "/hadoop.jar");

        Job job = Job.getInstance(conf); // Job is the current API; JobClient belongs to the old one
        // job.setJarByClass(WordCount.class); // reported by the author to have no effect in this setup
        // job.setJar(path + "/hadoop.jar");
        job.setJobName("My_WordCount");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(Map.class);
        job.setPartitionerClass(MyPartitioner.class);
        job.setCombinerClass(MyCombiner.class);
        job.setReducerClass(Reduce.class);
        // The mapred package also has TextInputFormat/TextOutputFormat; use the mapreduce ones here.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // MultipleInputs.addInputPath(job, new Path(""), TextInputFormat.class, Map.class); // multiple inputs

        Path outputPath = new Path("/hadoop_output");
        FileInputFormat.setInputPaths(job, new Path("/hadoop_input"));
        FileOutputFormat.setOutputPath(job, outputPath);

        // The output directory must not exist when the job starts, so remove a stale one.
        FileSystem fs = outputPath.getFileSystem(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    /**
     * Entry point: runs the tool through {@link ToolRunner} and exits with its status code.
     *
     * <p>Preparation noted by the original author:
     * <ol>
     *   <li>With Hadoop 2.7, the {@code vargs} and {@code environment} variables in the
     *       {@code YARNRunner} class (around lines 445 and 552) need to be modified.</li>
     *   <li>The configuration files (core-site.xml, hdfs-site.xml, mapred-site.xml,
     *       yarn-site.xml) need to be placed in the src directory, or the equivalent
     *       settings made on the Configuration.</li>
     * </ol>
     *
     * @param args command-line arguments forwarded to {@link ToolRunner}
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args) throws Exception {
        // Intended to make the job run with the permissions of the "root" Hadoop user.
        System.setProperty("HADOOP_USER_NAME", "root");
        int run = ToolRunner.run(new WordCount(), args);
        System.exit(run);
    }
}
4、 运行后的日志文件:
2.7之后的日志文件都在 hdfs的tmp目录下面,不在userlogs目录下了
查看mr运行的日志: 通过jobhistory实现查看job的运行
启动jobhistoryserver: $HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
参考
本站文章为和通数据库网友分享或者投稿,欢迎任何形式的转载,但请务必注明出处.
同时文章内容如有侵犯了您的权益,请联系QQ:970679559,我们会在尽快处理。