
Big data processing approach --- Java (MapReduce WordCount)

package org;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

public class WordCountApp {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        // Each occurrence of a word contributes a count of 1
        LongWritable one = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Read one line of input
            String lines = value.toString();
            // Split the line into words
            String[] words = lines.split(" ");
            // Emit (word, 1) for every word in the line
            for (String word : words) {
                context.write(new Text(word), one);
            }
        }
    }
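    // Illustration (hypothetical sample input, not from the original article):
    // for the line "hello world hello", the mapper above emits (hello,1), (world,1),
    // (hello,1); the framework then groups these pairs by key for the reducer.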

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Accumulate the counts for this word
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            // Emit the final (word, total) pair
            context.write(key, new LongWritable(sum));
        }
    }
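    // Illustration (continuing the hypothetical sample): the reducer receives
    // (hello,[1,1]) and (world,[1]) and writes (hello,2) and (world,1).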
    /**
     * Main method.
     * Driver: encapsulates all of the MapReduce job's information.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Create the configuration
        Configuration configuration = new Configuration();

        // Usage: hadoop jar <jar> <main class>  input path = args[0], output path = args[1]
        // Preparation: clean up an output directory that already exists
        Path outputPath = new Path(args[1]);
        // Connect to HDFS
        //FileSystem.get(new URI("192.168.1.1"),configuration,"hadoop");
        FileSystem fileSystem = FileSystem.get(configuration);
        // 0. If the output directory exists, delete it (recursively)
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("The output directory already existed and has been deleted.");
        }
        // 1. Create a job
        Job job = Job.getInstance(configuration, "wordcount");
        // 2. Tell the job which class contains the driver
        job.setJarByClass(WordCountApp.class);
        // 3. Path of the data the job should process
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // 4. Map-related settings: the mapper class, its output key type and output value type
        job.setMapperClass(MyMapper.class);

        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
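        // Note: if the map output types match the job's final output types, Hadoop
        // falls back to setOutputKeyClass/setOutputValueClass, so these two calls
        // could be omitted; setting them explicitly just makes the types obvious.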

        // 5. Reduce-related settings
        job.setReducerClass(MyReducer.class);
        // Final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 6. Set a combiner; summing is associative and commutative, so the reducer class can be reused
        job.setCombinerClass(MyReducer.class);
        // 7. Path where the job's output is written
        FileOutputFormat.setOutputPath(job, outputPath);
        // 8. Exit with the job's completion status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
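Once packaged into a jar, the job is submitted with the hadoop jar command from the comment above, for example: hadoop jar wordcount.jar org.WordCountApp /input /output (the jar name and HDFS paths here are only placeholders). For a quick check without a cluster, a minimal sketch like the following can drive the same class against the local file system; the paths are hypothetical, and Hadoop falls back to its local job runner when no cluster configuration is on the classpath.

package org;

public class WordCountLocalTest {

    public static void main(String[] args) throws Exception {
        // Hypothetical local paths: a text file to count and an output directory
        // (the driver deletes the output directory first if it already exists).
        WordCountApp.main(new String[]{"input/words.txt", "output"});
    }
}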
