Hadoop 中的 MapReduce（MR）示例：WordCount

和通数据库htsjk.Com2019-07-24 22:00 来源:未知阅读:18774 评论 215 热度3
标签：
Hadoop 中的 MapReduce（MR）示例：WordCount

1、 jar包：官网的share/hadoop下面的所有目录中的jar包
2、目录结构 (没有添加配置文件)

3 、代码如下
package com.download;

import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool{

    /**
     * Tokenizing mapper: emits {@code (token, 1)} for every token found in an input line.
     *
     * <p>Built on the {@code org.apache.hadoop.mapreduce} Mapper API rather than the older
     * {@code org.apache.hadoop.mapred} interface.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count of 1 written alongside each token. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key holder, overwritten with each token before it is written. */
        private final Text token = new Text();

        /**
         * Splits one input line using {@link StringTokenizer}'s default delimiters and writes each
         * resulting token with a count of 1.
         *
         * @param key     input key; not read by this method
         * @param value   the line of text to tokenize
         * @param context sink that receives the {@code (token, 1)} pairs
         * @throws IOException          propagated from {@code context.write}
         * @throws InterruptedException propagated from {@code context.write}
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (StringTokenizer tokens = new StringTokenizer(value.toString()); tokens.hasMoreTokens();) {
                token.set(tokens.nextToken());
                context.write(token, ONE);
            }
        }
    }
    /**
     * Summing reducer: writes each key once, paired with the total of all values received for it.
     *
     * <p>Built on the {@code org.apache.hadoop.mapreduce} Reducer API rather than the older
     * {@code org.apache.hadoop.mapred} interface.
     */
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Adds up the values grouped under {@code text} and writes the sum.
         *
         * @param text     the key whose values are being combined
         * @param iterable the counts received for that key
         * @param context  sink that receives the {@code (key, sum)} pair
         * @throws IOException          propagated from {@code context.write}
         * @throws InterruptedException propagated from {@code context.write}
         */
        @Override
        protected void reduce(Text text, Iterable<IntWritable> iterable, Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (final IntWritable partial : iterable) {
                total = total + partial.get();
            }
            final IntWritable result = new IntWritable(total);
            context.write(text, result);
        }
    }

    /**
     * Combiner that collapses the counts seen for a key into a single partial sum.
     *
     * <p>The arithmetic is the same summation performed by {@code Reduce}: the values are added
     * together and written once under the unchanged key.
     */
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Adds up {@code values} and writes the partial sum for {@code key}.
         *
         * @param key     the key whose values are being combined
         * @param values  the counts received for that key
         * @param context sink that receives the {@code (key, partialSum)} pair
         * @throws IOException          propagated from {@code context.write}
         * @throws InterruptedException propagated from {@code context.write}
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int partialSum = 0;
            for (final IntWritable each : values) {
                partialSum += each.get();
            }
            final IntWritable combined = new IntWritable(partialSum);
            context.write(key, combined);
        }
    }

    /**
     * Hash partitioner that spreads keys over the available partitions by key hash code.
     */
    public static class MyPartitioner extends Partitioner<Text, IntWritable> {

        /**
         * Chooses the partition for a record from the hash code of its key.
         *
         * @param key          the record key; its {@code hashCode()} drives the choice
         * @param value        the record value; not read by this method
         * @param numPartition the number of partitions to choose from
         * @return an index in the range {@code [0, numPartition)}
         */
        @Override
        public int getPartition(Text key, IntWritable value, int numPartition) {
            // Clear the sign bit so the dividend is non-negative, then take the remainder.
            // The remainder (not a bitwise AND with numPartition) is what keeps the result strictly
            // below numPartition and lets every partition index be reached.
            return (key.hashCode() & Integer.MAX_VALUE) % numPartition;
        }
    }

    /**
     * Path filter that lets every path through.
     */
    public static class MyPathFilter implements PathFilter {

        /**
         * Accepts the given path unconditionally.
         *
         * @param path the candidate path; not inspected
         * @return always {@code true}
         */
        @Override
        public boolean accept(final Path path) {
            return true;
        }
    }

    /**
     * Configures the word-count job, submits it, and blocks until it finishes.
     *
     * <p>Cluster endpoints are set in code on a copy of the configuration returned by
     * {@link #getConf()} (or on a fresh {@link Configuration} when none was injected), so the job can
     * be submitted without *-site.xml files on the classpath. Input is read from {@code /hadoop_input}
     * and written to {@code /hadoop_output}; an existing output directory is deleted first.
     *
     * <p>Job jar selection: when the classpath root can be resolved as a directory URL, the job jar is
     * taken to be {@code hadoop.jar} inside that directory (export the compiled job there before
     * submitting). When the root cannot be resolved, the jar containing this class is used instead.
     *
     * @param args command-line arguments; not read by this method
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if configuring, submitting, or waiting for the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Build on the injected configuration (e.g. from ToolRunner) instead of discarding it;
        // copy it so the settings below do not leak back into the caller's instance.
        Configuration injected = getConf();
        Configuration conf = injected == null ? new Configuration() : new Configuration(injected);

        conf.setBoolean("mapreduce.app-submission.cross-platform", true); // cross-platform job submission
        conf.set("fs.defaultFS", "hdfs://namenode:8020"); // namenode
        conf.set("mapreduce.framework.name", "yarn"); // run on YARN
        conf.set("yarn.resourcemanager.address", "resourcenode:8032"); // resource manager
        conf.set("yarn.resourcemanager.scheduler.address", "schedulerNode:8030"); // scheduler
        conf.set("mapreduce.jobhistory.address", "jobhistorynode:10020"); // job history server

        // ClassLoader.getResource("/") yields null on the standard class loaders; the empty name is
        // what addresses the classpath root. It can still be null, so guard it.
        URL classpathRoot = WordCount.class.getClassLoader().getResource("");
        if (classpathRoot != null) {
            String path = URLDecoder.decode(classpathRoot.getPath(), "UTF-8");
            conf.set("mapred.jar", path + "/hadoop.jar");
        }

        // Job.getInstance is the current API (JobClient belongs to the older one); all conf
        // settings above must be in place before this call.
        Job job = Job.getInstance(conf);
        if (classpathRoot == null) {
            // No classpath directory to look in: ship the jar this class was loaded from.
            job.setJarByClass(WordCount.class);
        }
        job.setJobName("My_WordCount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setPartitionerClass(MyPartitioner.class);
        job.setCombinerClass(MyCombiner.class);
        job.setReducerClass(Reduce.class);

        // Both formats come from the mapreduce.lib packages; same-named classes also exist under mapred.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // For several inputs with different formats/mappers, MultipleInputs.addInputPath can be used instead.
        Path inputDir = new Path("/hadoop_input");
        Path outputDir = new Path("/hadoop_output");
        FileInputFormat.setInputPaths(job, inputDir);
        FileOutputFormat.setOutputPath(job, outputDir);

        // The output directory must not exist when the job starts, so remove any previous output.
        FileSystem fs = outputDir.getFileSystem(job.getConfiguration());
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);
        }

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point: runs {@link WordCount} through {@link ToolRunner} and exits the JVM
     * with the status the tool returns.
     *
     * @param args command-line arguments handed to {@link ToolRunner}
     * @throws Exception if the tool run fails
     */
    public static void main(String[] args) throws Exception {
        // Preparation before running (original author's notes):
        // 1. On Hadoop 2.7 the values of the vargs and environment variables in the YARNRunner
        //    class have to be modified (lines 445 and 552).
        // 2. Place core-site.xml, hdfs-site.xml, mapred-site.xml and yarn-site.xml in the src
        //    directory, or set the equivalent values on the Configuration in code.

        // Set the user name used for permissions.
        System.setProperty("HADOOP_USER_NAME", "root");
        final Tool wordCount = new WordCount();
        System.exit(ToolRunner.run(wordCount, args));
    }
}
4、运行后的日志文件：
2.7之后的日志文件都在 hdfs的tmp目录下面，不在userlogs目录下了
查看mr运行的日志：通过jobhistory实现查看job的运行
启动jobhistoryserver： $HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
参考