
Big data processing approach --- Java (MapReduce WordCount)

package org;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

public class WordCountApp {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        // Each occurrence of a word contributes a count of 1
        LongWritable one = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Read one line of input
            String lines = value.toString();
            // Split the line into words
            String[] words = lines.split(" ");
            // Emit (word, 1) for every word in the line
            for (String word : words) {
                context.write(new Text(word), one);
            }
        }
    }
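    // Illustration (hypothetical sample input, not from the original article):
    // for the line "hello world hello", the mapper above emits (hello,1), (world,1),
    // (hello,1); the framework then groups these pairs by key for the reducer.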

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Accumulate the counts for this word
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            // Emit the final (word, total) pair
            context.write(key, new LongWritable(sum));
        }
    }
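    // Illustration (continuing the hypothetical sample): the reducer receives
    // (hello,[1,1]) and (world,[1]) and writes (hello,2) and (world,1).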
    /**
     * Main method.
     * Driver: encapsulates all of the MapReduce job's information.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Create the configuration
        Configuration configuration = new Configuration();

        // Usage: hadoop jar <jar> <main class>  input path = args[0], output path = args[1]
        // Preparation: clean up an output directory that already exists
        Path outputPath = new Path(args[1]);
        // Connect to HDFS
        //FileSystem.get(new URI("192.168.1.1"),configuration,"hadoop");
        FileSystem fileSystem = FileSystem.get(configuration);
        // 0. If the output directory exists, delete it (recursively)
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("The output directory already existed and has been deleted.");
        }
        // 1. Create a job
        Job job = Job.getInstance(configuration, "wordcount");
        // 2. Tell the job which class contains the driver
        job.setJarByClass(WordCountApp.class);
        // 3. Path of the data the job should process
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // 4. Map-related settings: the mapper class, its output key type and output value type
        job.setMapperClass(MyMapper.class);

        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
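        // Note: if the map output types match the job's final output types, Hadoop
        // falls back to setOutputKeyClass/setOutputValueClass, so these two calls
        // could be omitted; setting them explicitly just makes the types obvious.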

        // 5. Reduce-related settings
        job.setReducerClass(MyReducer.class);
        // Final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 6. Set a combiner; summing is associative and commutative, so the reducer class can be reused
        job.setCombinerClass(MyReducer.class);
        // 7. Path where the job's output is written
        FileOutputFormat.setOutputPath(job, outputPath);
        // 8. Exit with the job's completion status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
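Once packaged into a jar, the job is submitted with the hadoop jar command from the comment above, for example: hadoop jar wordcount.jar org.WordCountApp /input /output (the jar name and HDFS paths here are only placeholders). For a quick check without a cluster, a minimal sketch like the following can drive the same class against the local file system; the paths are hypothetical, and Hadoop falls back to its local job runner when no cluster configuration is on the classpath.

package org;

public class WordCountLocalTest {

    public static void main(String[] args) throws Exception {
        // Hypothetical local paths: a text file to count and an output directory
        // (the driver deletes the output directory first if it already exists).
        WordCountApp.main(new String[]{"input/words.txt", "output"});
    }
}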
