MapReduce之wordcount的java与python实现

Java代码:

  • 重写Mapper类:
/**
 * Word-count mapper: for every input line, emits a {@code (word, 1)} pair for
 * each whitespace-separated word.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	/** Count emitted for every single word occurrence. */
	private static final IntWritable ONE = new IntWritable(1);

	/**
	 * Output key reused for every record to avoid allocating one object per word.
	 * NOTE(review): this relies on the framework serializing the pair inside
	 * {@code context.write}, as in the stock Hadoop WordCount example.
	 */
	private final Text outKey = new Text();

	/**
	 * Splits one line of input into words and writes {@code (word, 1)} for each.
	 *
	 * @param key     input key supplied by the framework (not used here)
	 * @param value   the text of one input line
	 * @param context sink for the emitted {@code (word, 1)} pairs
	 * @throws IOException          propagated from {@code context.write}
	 * @throws InterruptedException propagated from {@code context.write}
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {

		String line = value.toString();

		// String.split requires a regex argument; "\\s+" splits on runs of whitespace.
		String[] words = line.split("\\s+");

		for (String word : words) {
			// A blank line or leading whitespace yields an empty token; never count it.
			if (word.isEmpty()) {
				continue;
			}
			outKey.set(word);
			context.write(outKey, ONE);
		}

	}

}
  • 重写Reducer类:
/**
 * Word-count reducer: adds up all the counts received for a word and writes
 * the word together with its total.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	/**
	 * Sums the counts grouped under {@code word} and emits {@code (word, total)}.
	 *
	 * @param word    the word whose counts are being combined
	 * @param value   the counts collected for {@code word}
	 * @param context sink for the resulting {@code (word, total)} pair
	 * @throws IOException          propagated from {@code context.write}
	 * @throws InterruptedException propagated from {@code context.write}
	 */
	@Override
	protected void reduce(Text word, Iterable<IntWritable> value, Context context)
			throws IOException, InterruptedException {
		int total = 0;
		for (IntWritable count : value) {
			total += count.get();
		}

		IntWritable result = new IntWritable(total);
		context.write(word, result);
	}
}
  • 定义主类:
/**
 * Entry point that configures and submits the word-count job built from
 * {@link WCMapper} and {@link WCReducer}.
 */
public class WCDriver {

	/**
	 * Configures the job, runs it, and exits with a status reflecting the outcome.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
	 * @throws IOException            propagated from job setup or execution
	 * @throws ClassNotFoundException propagated from {@code waitForCompletion}
	 * @throws InterruptedException   propagated from {@code waitForCompletion}
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		// Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
		if (args.length < 2) {
			System.err.println("Usage: WCDriver <input path> <output path>");
			System.exit(2);
		}

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(WCDriver.class);

		// Key/value types of the map output and of the final job output.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		job.setMapperClass(WCMapper.class);
		job.setReducerClass(WCReducer.class);

		// The reducer doubles as the combiner; its input and output types are identical.
		job.setCombinerClass(WCReducer.class);

		job.setNumReduceTasks(2);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Exit status 0 when the job reports success, 1 otherwise.
		boolean bool = job.waitForCompletion(true);
		System.exit(bool?0:1);

	}

}

python代码:

  • mapper
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys


def map():
    """Print a ``word<TAB>1`` line for every whitespace-separated word on stdin."""
    # Lazy generator: lines are still consumed one at a time.
    tokens = (token for raw in sys.stdin for token in raw.strip().split())
    for token in tokens:
        print('{}\t{}'.format(token, 1))


if __name__ == '__main__':
    # Run the mapper only when executed as a script.
    map()
  • reducer
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from itertools import groupby


def from_stdin():
    """Yield ``(word, count)`` string pairs parsed from tab-separated stdin lines.

    Blank lines are skipped. Each remaining line is split on its LAST tab, so
    a word that itself contains a tab is kept intact. ``count`` is yielded as
    a string; the caller converts it.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    for line in sys.stdin:
        record = line.strip()
        if not record:
            # Nothing to parse; unpacking would fail on an empty line.
            continue
        word, count = record.rsplit('\t', 1)
        yield (word, count)


def reduce():
    """Print ``word<TAB>total`` for each run of consecutive pairs sharing a word.

    Pairs come from ``from_stdin()``. ``groupby`` only merges adjacent items,
    so equal words must arrive next to each other to be summed together.
    """
    first_field = lambda pair: pair[0]
    for word, run in groupby(from_stdin(), key=first_field):
        total = 0
        for pair in run:
            total += int(pair[1])
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    # Run the reducer only when executed as a script.
    reduce()
发布了9 篇原创文章 · 获赞 4 · 访问量 2820

猜你喜欢

转载自blog.csdn.net/weixin_44129672/article/details/88719941
今日推荐