用IDEA编写一个wordcount

创建一个maven项目：

在pom.xml中插入以下代码，导入对应的依赖包。这里要注意 maven-jar-plugin 中的 <mainClass>cn.itcast.hadoop.wordcountdrive</mainClass> 配置：如果不在这里指定主类的全限定名（且命令行中也没有给出主类名），执行 hadoop jar ***.jar 命令时将无法找到要执行的主类。

<dependencies>
        <!-- Hadoop 2.7.1 libraries needed to compile the MapReduce job -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <!-- Resolved relative to the JDK that runs Maven instead of a
                 machine-specific absolute path, so the build is portable.
                 On JDK 8, java.home points at <jdk>/jre, hence the "..". -->
            <systemPath>${java.home}/../lib/tools.jar</systemPath>
        </dependency>
    </dependencies>
    <build>
        <!-- Lock down plugin versions to avoid using Maven defaults
             (may be moved to a parent pom). -->
        <pluginManagement>
            <plugins>
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
                <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.7.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.20.1</version>
                    <!-- Disabled settings kept from the original. NOTE(review):
                         source/target are maven-compiler-plugin parameters, so
                         they likely belong on that plugin if re-enabled. -->
                    <!--<configuration>-->
                    <!--<source>1.8</source>-->
                    <!--<target>1.8</target>-->
                    <!--<encoding>UTF-8</encoding>-->
                    <!--</configuration>-->
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                    <configuration>
                        <archive>
                            <manifest>
                                <!-- Write Class-Path and Main-Class into the jar
                                     manifest so "hadoop jar xxx.jar" can locate
                                     the driver class without naming it. -->
                                <addClasspath>true</addClasspath>
                                <classpathPrefix>lib</classpathPrefix>
                                <mainClass>cn.itcast.hadoop.wordcountdrive</mainClass>
                            </manifest>
                        </archive>
                    </configuration>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>

在java源码目录下创建cn.itcast.hadoop包，并在该包中创建wordcountmapper、wordcountreduce、wordcountdrive和PPartitioner类：

wordcountdrive：这个类是mr程序运行时的主类。本类中组装了程序运行时所需的信息，比如使用哪个mapper类和reduce类、输入数据在哪里、输出数据存放在哪里。

/**
 * Driver (main class) of the word-count MapReduce job.
 *
 * <p>It assembles everything the job needs at run time: which mapper,
 * reducer and partitioner classes to use, the key/value types, and where
 * the input is read from and the output is written to.
 */
public class wordcountdrive {

    /**
     * Configures the job, submits it and exits with 0 on success, 1 on failure.
     *
     * @param args command-line arguments (not used)
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // The Job object carries all settings of this MapReduce run.
        Job wordCountJob = Job.getInstance(new Configuration());

        // The jar containing this class is the one shipped with the job.
        wordCountJob.setJarByClass(wordcountdrive.class);

        // Mapper and reducer implementations.
        wordCountJob.setMapperClass(wordcountmapper.class);
        wordCountJob.setReducerClass(wordcountreduce.class);

        // Custom partitioner together with the number of reduce tasks it feeds.
        wordCountJob.setPartitionerClass(PPartitioner.class);
        wordCountJob.setNumReduceTasks(3);

        // Key/value types emitted by the map stage.
        wordCountJob.setMapOutputKeyClass(Text.class);
        wordCountJob.setMapOutputValueClass(IntWritable.class);

        // Key/value types of the final job output.
        wordCountJob.setOutputKeyClass(Text.class);
        wordCountJob.setOutputValueClass(IntWritable.class);

        // Input location and output location of the job.
        FileInputFormat.setInputPaths(wordCountJob, "/wordcount/input");
        FileOutputFormat.setOutputPath(wordCountJob, new Path("/wordcount/output"));

        // Submit, then wait while progress is printed (verbose = true).
        boolean succeeded = wordCountJob.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }

}

wordcountmapper：这里是map阶段具体的业务逻辑实现。map方法是否被调用，取决于读取数据的组件有没有给mr程序传入数据：每传入一个kv对，该方法就会被调用一次。

/**
 * Map stage of the word-count MapReduce job.
 *
 * <p>Type parameters {@code <KEYIN, VALUEIN, KEYOUT, VALUEOUT>}:
 * <ul>
 *   <li>KEYIN - with the default input format the data is read line by line
 *       and the key is the starting offset of each line, i.e. a long;</li>
 *   <li>VALUEIN - the content of the line that was read, i.e. a string;</li>
 *   <li>KEYOUT - the key emitted by the mapper; here it is a word, i.e. a
 *       string;</li>
 *   <li>VALUEOUT - the value emitted by the mapper; here it is a count, i.e.
 *       an integer.</li>
 * </ul>
 * The JDK types serialize inefficiently, so Hadoop's own wrappers
 * ({@code LongWritable}, {@code Text}, {@code IntWritable}) are used instead.
 */
public class wordcountmapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** The count emitted for every single word occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key, reused for every record instead of allocating a new one. */
    private final Text outputWord = new Text();

    /**
     * Splits one input line into words and emits {@code <word, 1>} for each.
     * Called once for every key/value pair handed over by the input reader.
     *
     * <p>Empty tokens (blank lines, leading or repeated spaces) are skipped:
     * they are not words, and an empty key would break a partitioner that
     * inspects the first character of the key.
     *
     * @param key     starting offset of the line (not used)
     * @param value   content of the line
     * @param context job context used to emit the map output
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value,
                  Mapper<LongWritable, Text, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Convert the incoming line to a String and cut it on the space separator.
        String line = value.toString();
        for (String token : line.split(" ")) {
            if (token.isEmpty()) {
                // Produced by consecutive/leading spaces or an empty line.
                continue;
            }
            // Hand <word, 1> to the framework as input for the reduce stage.
            outputWord.set(token);
            context.write(outputWord, ONE);
        }
    }

    public static void main(String[] args) {
        // Intentionally empty: the job is started from the driver class.
    }

}

wordcountreduce：reduce端接收到数据后，先按照key的字典序进行排序，再把key相同的kv对作为一组去调用reduce方法。reduce方法的参数key就是这一组kv对共同的key，这一组中所有的value则作为一个迭代器传入reduce方法。

/**
 * Reduce stage of the word-count MapReduce job: adds up the counts emitted
 * for each word.
 */
public class wordcountreduce extends
        Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums all counts of one word and emits {@code <word, total>}.
     *
     * <p>The method is invoked once per group of pairs sharing the same key;
     * {@code key} is that shared key and {@code values} iterates over every
     * value of the group.
     *
     * @param key     the word shared by the group
     * @param values  all counts recorded for that word
     * @param context job context used to emit the final result
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
                          Reducer<Text, IntWritable, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Accumulate the individual counts into the total for this word.
        int total = 0;
        for (IntWritable partialCount : values) {
            total = total + partialCount.get();
        }

        // Emit the final result for this word.
        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }

    public static void main(String[] args) {
        // Intentionally empty: the job is started from the driver class.
    }

}

PPartitioner ：按照首字母ASCII值进行分区，因为每个reduce会进行自排序，所以我们只用分好区就好了，排序交给框架。

package cn.itcast.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitions map output by the first character of the word so that each
 * reducer receives a contiguous range of keys:
 * <ul>
 *   <li>{@code 'a'..'f'} goes to partition 0,</li>
 *   <li>{@code 'g'..'m'} goes to partition 1,</li>
 *   <li>everything else (including an empty key) goes to partition 2.</li>
 * </ul>
 * Sorting inside each partition is left to the framework.
 */
public class PPartitioner extends Partitioner<Text, IntWritable> {

    /**
     * Chooses the partition for one map output record.
     *
     * @param key           the word emitted by the mapper
     * @param value         the count emitted by the mapper (not used)
     * @param numPartitions number of partitions configured for the job
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String word = key.toString();

        int bucket;
        if (word.isEmpty()) {
            // No first character to inspect; use the catch-all bucket
            // instead of failing in charAt(0).
            bucket = 2;
        } else {
            char first = word.charAt(0);
            if ('a' <= first && first <= 'f') {
                bucket = 0;
            } else if ('f' < first && first <= 'm') {
                bucket = 1;
            } else {
                bucket = 2;
            }
        }

        // Never return an index the job does not have. Clamping (rather than
        // taking a modulo) keeps the key ranges of the partitions in order
        // when fewer than three partitions are configured.
        return Math.min(bucket, numPartitions - 1);
    }
}

将项目打成jar包，打包成功后会在target下看到打好的jar包：

将这个jar包放在集群里执行

成功。

用IDEA编写一个wordcount

猜你喜欢