MapReduce custom data types, Top-N, secondary sort, and multi-table join examples (Qianfeng video notes)

MapReduce custom data types


At this point the custom type is done; the full code is below.

package qf.com.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

/*
 * Class notes:
 * 1. A custom data type must implement Writable (not sortable) or WritableComparable (sortable).
 * 2. Implement the required methods: write, readFields, compareTo.
 * 3. write and readFields must handle the same fields, in the same order, with matching types.
 * 4. toString, equals, hashCode, etc. can be overridden as needed.
 *
 * Top-N
 * Sample data:
 *   hello qianfeng hello qianfeng qianfeng is best qianfeng is better
 *   hadoop is good
 *   spark is nice
 *
 * Top three after counting:
 *   qianfeng 4
 *   is 3
 *   hello 2
 */
public class TopNWritable implements WritableComparable<TopNWritable>{
	public String word;
	public int counter;
	
	
	public TopNWritable() {
		
	}
	
	public TopNWritable(String word, int counter) {
		super();
		this.word = word;
		this.counter = counter;
	}

	/**
	 * Serialization: write the fields in a fixed order
	 */
	public void write(DataOutput out) throws IOException {
		out.writeUTF(this.word);
		out.writeInt(this.counter);
		
	}
	/**
	 * Deserialization: read the fields back in the same order as write()
	 */
	public void readFields(DataInput in) throws IOException {
		this.word = in.readUTF();
		this.counter = in.readInt();
	}

	public int compareTo(TopNWritable o) {
		return o.counter - this.counter; // descending: larger counters sort first
	}

	public String getWord() {
		return word;
	}

	public void setWord(String word) {
		this.word = word;
	}

	public int getCounter() {
		return counter;
	}

	public void setCounter(int counter) {
		this.counter = counter;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + counter;
		result = prime * result + ((word == null) ? 0 : word.hashCode());
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		TopNWritable other = (TopNWritable) obj;
		if (counter != other.counter)
			return false;
		if (word == null) {
			if (other.word != null)
				return false;
		} else if (!word.equals(other.word))
			return false;
		return true;
	}

	@Override
	public String toString() {
		return "[" + word + "\t" +  counter + "]";
	}
	

}
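As a quick sanity check outside MapReduce, the ordering behaviour of this type can be exercised directly in a TreeSet. A minimal sketch (the class name TopNWritableCheck and the sample words and counts are made up for illustration):

package qf.com.mr;

import java.util.TreeSet;

public class TopNWritableCheck {
	public static void main(String[] args) {
		// TreeSet orders elements by compareTo: larger counters come first (descending).
		TreeSet<TopNWritable> ts = new TreeSet<TopNWritable>();
		ts.add(new TopNWritable("qianfeng", 4));
		ts.add(new TopNWritable("is", 3));
		ts.add(new TopNWritable("hello", 2));
		ts.add(new TopNWritable("hadoop", 1));

		// Keep only the top 3 by repeatedly dropping the last (smallest) element.
		while (ts.size() > 3) {
			ts.remove(ts.last());
		}
		for (TopNWritable tn : ts) {
			System.out.println(tn); // prints qianfeng 4, is 3, hello 2 in that order
		}

		// Caveat: compareTo only looks at the counter, so a TreeSet treats two
		// different words with the same count as duplicates and keeps just one.
		ts.add(new TopNWritable("best", 3)); // rejected: compares equal to ("is", 3)
		System.out.println(ts.size());       // still 3
	}
}

The same caveat applies to the TreeSet used in the reducer below, so two words with identical counts can crowd each other out of the Top-N.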

MapReduce Top-N

After the counts are computed, the custom type we just wrote is used to keep only the top few results.


Export the jar, put it under the home directory, and then create the test data.

At this point the Top-N job is done; the full code is below.

package qf.com.mr;

import java.io.IOException;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/*
 * Class notes: Top-N
 * i.e. find the top few words by their counts.
 */
public class TopNDemo implements Tool {

	/**
	 * map phase
	 * 
	 * @author HP
	 *
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			String words[] = line.split(" ");
			for (String s : words) {
				context.write(new Text(s), new Text(1 + ""));
				
			}
			
		}

	}

	/**
	 * reduce phase
	 */
	public static class MyReducer extends Reducer<Text, Text, TopNWritable, NullWritable> {
		
		public static final int TOP_N = 3;
		// TreeSet holding the current Top-N candidates (ordered by compareTo: largest counter first)
		TreeSet<TopNWritable> ts = new TreeSet<TopNWritable>();
		
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			int counter = 0;
			for (Text t : values) {
				counter += Integer.parseInt(t.toString());
			}
			// build the output record for this word and its total count
			TopNWritable tn = new TopNWritable(key.toString(), counter);
			// add it to the TreeSet
			ts.add(tn);
			
			// once the set holds more than TOP_N entries, drop the last one
			// (the largest counts sort first, so last() is the smallest)
			if(ts.size() > TOP_N) {
				ts.remove(ts.last());
			}
			
			//context.write(tn, NullWritable.get());
		}

		@Override
		protected void cleanup(Context context)
				throws IOException, InterruptedException {
			// emit the retained Top-N records once all keys have been processed
			for (TopNWritable tn : ts) {
				context.write(tn, NullWritable.get());
			}
		}
		
	}

	public void setConf(Configuration conf) {
		// HDFS HA settings for the qf nameservice
		conf.set("fs.defaultFS", "hdfs://qf");
		conf.set("dfs.nameservices", "qf");
		conf.set("dfs.ha.namenodes.qf", "nn1, nn2");
		conf.set("dfs.namenode.rpc-address.qf.nn1", "hadoop01:9000");
		conf.set("dfs.namenode.rpc-address.qf.nn2", "hadoop02:9000");
		conf.set("dfs.client.failover.proxy.provider.qf",
				"org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
	}

	public Configuration getConf() {
		return new Configuration();
	}

	public int run(String[] args) throws Exception {
		// 1. Get the configuration object
		Configuration conf = new Configuration();
		// 2. Set additional conf properties if needed
		// 3. Create the Job (mind which package Job is imported from)
		Job job = Job.getInstance(conf, "job");
		// 4. Set the job's main class
		job.setJarByClass(TopNDemo.class);

		// set inputpath and outputpath
		setInputAndOutput(job, conf, args);

		// System.out.println("jiazai finished");
		// 5. Configure the map phase
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		// System.out.println("map finished");
		// 6. Configure the reduce phase
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(TopNWritable.class);
		job.setOutputValueClass(NullWritable.class);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	// main method
	public static void main(String[] args) throws Exception {
		int isok = ToolRunner.run(new Configuration(), new TopNDemo(), args);
		System.out.println(isok);
	}

	/**
	 * Handle the input/output path arguments
	 * 
	 * @param job
	 * @param conf
	 * @param args
	 */

	private void setInputAndOutput(Job job, Configuration conf, String[] args) {
		if (args.length != 2) {
			System.out.println("usage:yarn jar /*.jar package.classname /* /*");
			return;
		}
		// normal case: wire up the input and output paths
		try {
			FileInputFormat.addInputPath(job, new Path(args[0]));

			FileSystem fs = FileSystem.get(conf);
			Path outputpath = new Path(args[1]);
			if (fs.exists(outputpath)) {
				fs.delete(outputpath, true);
			}
			FileOutputFormat.setOutputPath(job, outputpath);
		} catch (Exception e) {
			e.printStackTrace();
		}

	}

}
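One detail worth noting about the driver: run() builds a brand-new Configuration, so the HA properties assigned in setConf() never reach the Job (ToolRunner calls setConf() on the Configuration passed to ToolRunner.run(), but run() ignores that object). On a cluster this still works because new Configuration() picks up the *-site.xml files from the classpath. A minimal sketch of the more conventional Tool wiring, assuming the same HA settings (the class name TopNDemoSketch is illustrative only, not from the video):

package qf.com.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TopNDemoSketch extends Configured implements Tool {

	public int run(String[] args) throws Exception {
		// reuse the Configuration that ToolRunner handed in via setConf()
		Configuration conf = getConf();
		Job job = Job.getInstance(conf, "topN");
		job.setJarByClass(TopNDemoSketch.class);
		// ... same mapper/reducer/path setup as in TopNDemo above ...
		return job.waitForCompletion(true) ? 0 : 1;
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// HA settings as in setConf() above, e.g.:
		conf.set("fs.defaultFS", "hdfs://qf");
		int isok = ToolRunner.run(conf, new TopNDemoSketch(), args);
		System.exit(isok);
	}
}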

MapReduce secondary sort


At this point the custom data type is ready.

Then write a demo driver (the SecondaryDemo class below).

That is: first write the custom composite type, then write a MapReduce job whose map output key is that custom type.

Now export the jar, copy it to the home directory, and upload the input data to the root directory.

Create the test data.

At this point the secondary sort is done; the full code is below.

package qf.com.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


import org.apache.hadoop.io.WritableComparable;

/*
 * Class notes: custom composite key type (first, second).
 */
public class SecondaryWritable implements WritableComparable<SecondaryWritable>{

	public int first;
	public int second;	
	public SecondaryWritable(int first, int second) {
		super();
		this.first = first;
		this.second = second;
	}
	public SecondaryWritable() {
		
	}
	public int getFirst() {
		return first;
	}

	public void setFirst(int first) {
		this.first = first;
	}

	public int getSecond() {
		return second;
	}

	public void setSecond(int second) {
		this.second = second;
	}

	public void write(DataOutput out) throws IOException {
		out.writeInt(this.first);
		out.writeInt(this.second);
	}

	public void readFields(DataInput in) throws IOException {
		this.first = in.readInt();
		this.second = in.readInt();
	}

	public int compareTo(SecondaryWritable o) {
		int tmp = 0;
		tmp = this.first - o.first; // ascending by first
		if(tmp != 0) {
			return tmp;
		}
		return this.second - o.second; // then ascending by second
	}

	@Override
	public String toString() {
		return "SecondaryWritable [first=" + first + ", second=" + second + "]";
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + first;
		result = prime * result + second;
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		SecondaryWritable other = (SecondaryWritable) obj;
		if (first != other.first)
			return false;
		if (second != other.second)
			return false;
		return true;
	}
	

}
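A quick illustration of the order this compareTo produces (the class name SecondaryWritableCheck and the numbers are made up):

package qf.com.mr;

import java.util.Arrays;

public class SecondaryWritableCheck {
	public static void main(String[] args) {
		SecondaryWritable[] keys = {
			new SecondaryWritable(2, 1),
			new SecondaryWritable(1, 5),
			new SecondaryWritable(1, 2)
		};
		// sorts ascending by first, then ascending by second: (1,2), (1,5), (2,1)
		Arrays.sort(keys);
		for (SecondaryWritable k : keys) {
			System.out.println(k);
		}
	}
}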

package qf.com.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/*
 * Class notes: secondary-sort driver.
 */
public class SecondaryDemo implements Tool{
	
	
	/**
	 * map phase
	 *
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, SecondaryWritable, Text>{

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String words [] = line.split(" ");
			SecondaryWritable sw = new SecondaryWritable(Integer.parseInt(words[0]),Integer.parseInt(words[1]));
			context.write(sw, new Text(words[1]));
		
		}
		
	}
	/**
	 * reduce phase
	 */
	public static class MyReducer extends Reducer<SecondaryWritable, Text, SecondaryWritable, Text>{

		@Override
		protected void reduce(SecondaryWritable key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// emit a separator line before each key
			context.write(new SecondaryWritable(), new Text("----------"));
			// then emit the composite key itself
			context.write(key, new Text(""));
		}
		
	}
	
	public void setConf(Configuration conf) {
		// HDFS HA settings for the qf nameservice
		conf.set("fs.defaultFS", "hdfs://qf");
		conf.set("dfs.nameservices", "qf");
		conf.set("dfs.ha.namenodes.qf", "nn1, nn2");
		conf.set("dfs.namenode.rpc-address.qf.nn1", "hadoop01:9000");
		conf.set("dfs.namenode.rpc-address.qf.nn2", "hadoop02:9000");
		conf.set("dfs.client.failover.proxy.provider.qf", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
	}

	public Configuration getConf() {
		return new Configuration();
	}

	public int run(String[] args) throws Exception {
		// 1. Get the configuration object
		Configuration conf = new Configuration();
		// 2. Set additional conf properties if needed
		// 3. Create the Job (mind which package Job is imported from)
		Job job = Job.getInstance(conf, "job");
		// 4. Set the job's main class
		job.setJarByClass(SecondaryDemo.class);

		// set inputpath and outputpath
		setInputAndOutput(job, conf, args);

		// System.out.println("jiazai finished");
		// 5. Configure the map phase
		// (the map output key is the custom composite type, not Text)
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(SecondaryWritable.class);
		job.setMapOutputValueClass(Text.class);

		// System.out.println("map finished");
		// 6. Configure the reduce phase
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(SecondaryWritable.class);
		job.setOutputValueClass(Text.class);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	// main method
	public static void main(String[] args) throws Exception {
		int isok = ToolRunner.run(new Configuration(), new SecondaryDemo(), args);
		System.out.println(isok);
	}

	/**
	 * Handle the input/output path arguments
	 * @param job
	 * @param conf
	 * @param args
	 */
	private void setInputAndOutput(Job job, Configuration conf, String[] args) {
		if (args.length != 2) {
			System.out.println("usage: yarn jar /*.jar package.classname <input> <output>");
			return;
		}
		// normal case: wire up the input and output paths
		try {
			FileInputFormat.addInputPath(job, new Path(args[0]));

			FileSystem fs = FileSystem.get(conf);
			Path outputpath = new Path(args[1]);
			if (fs.exists(outputpath)) {
				fs.delete(outputpath, true);
			}
			FileOutputFormat.setOutputPath(job, outputpath);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}
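As written, every (first, second) pair forms its own reduce group, which is why the reducer prints a separator line for each record. A full secondary sort usually also partitions and groups on the first field alone, so that one reduce() call sees all the second values for a given first, already in ascending order. A minimal sketch of those two pieces (the class names are illustrative, not from the video):

package qf.com.mr;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Group reduce input by the first field only.
public class SecondaryGroupingComparator extends WritableComparator {
	protected SecondaryGroupingComparator() {
		super(SecondaryWritable.class, true); // true: instantiate keys for comparison
	}

	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		SecondaryWritable x = (SecondaryWritable) a;
		SecondaryWritable y = (SecondaryWritable) b;
		return Integer.compare(x.first, y.first);
	}
}

// Send every key with the same first field to the same reducer.
class SecondaryPartitioner extends Partitioner<SecondaryWritable, Text> {
	@Override
	public int getPartition(SecondaryWritable key, Text value, int numPartitions) {
		return (key.first & Integer.MAX_VALUE) % numPartitions;
	}
}

These would be registered in run() via job.setPartitionerClass(SecondaryPartitioner.class) and job.setGroupingComparatorClass(SecondaryGroupingComparator.class); the reducer would then iterate its values to emit the sorted second fields for each first.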

MapReduce multi-table join


Add a setup() method; setup() runs once before the map() calls.
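Judging from the parsing in setup() and map() below, the job expects three tab-separated inputs: a cached file named sex that maps a sex id to its label, a cached file named user that maps a uid to a user name, and the main input whose records begin with a uid and a sex id, followed by a third field that is carried through unchanged. A made-up illustration of the layouts (every value here is hypothetical):

sex file (id \t label):            1	male
user file (uid \t name):           10001	zhangsan
main input (uid \t sexid \t ...):  10001	1	2020-02-07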


Then export the jar and copy it to the home directory.

At this point the multi-table join is done; the full code is below.

package qf.com.mr;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/*
 * Class notes: multi-table join (a map-side join using the distributed cache).
 */
public class MultiTableJoinDemo implements Tool{
	
	
	/**
	 * map phase
	 *
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{
		public Map<String, String> sexMap = new ConcurrentHashMap<String, String>();
		public Map<String, String> userMap = new ConcurrentHashMap<String, String>();
		
		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			// first, get the local paths of the cached files
			Path [] paths = context.getLocalCacheFiles();
			for (Path path : paths) {
				// get the file name
				String fileName = path.getName();
				BufferedReader br = null;
				String str = null;
				if(fileName.equals("sex")) {//文件名为sex
					br = new BufferedReader(new FileReader(new File(path.toString())));
					while((str = br.readLine()) != null) {
						String strs[] = str.split("\t");
						sexMap.put(strs[0], strs[1]);
					}
					// close the stream
					br.close();
					
				}else if(fileName.equals("user")){
					br = new BufferedReader(new FileReader(new File(path.toString())));
					while((str = br.readLine()) != null) {
						String strs[] = str.split("\t");
						userMap.put(strs[0], strs[1]);
					}
					br.close();
				}
			}
		}

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			String fields [] = line.split("\t");
			
			// fields[0] is the uid
			if(userMap.containsKey(fields[0])) {
				
				context.write(new Text(fields[0] + "\t" 
			+ sexMap.get(fields[1]) + "\t" 
						+ userMap.get(fields[0]) + "\t" 
			+ fields[2]),
						new Text(""));
				
			}
		}
		
	}
	/**
	 * reduce phase (unused: this is a map-side join, so the reducer below stays commented out)
	 */
	/*public static class MyReducer extends Reducer<Text, Text, Text, Text>{

		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
		}
		
	}*/
	
	public void setConf(Configuration conf) {
		// HDFS HA settings for the qf nameservice
		conf.set("fs.defaultFS", "hdfs://qf");
		conf.set("dfs.nameservices", "qf");
		conf.set("dfs.ha.namenodes.qf", "nn1, nn2");
		conf.set("dfs.namenode.rpc-address.qf.nn1", "hadoop01:9000");
		conf.set("dfs.namenode.rpc-address.qf.nn2", "hadoop02:9000");
		conf.set("dfs.client.failover.proxy.provider.qf", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
	}

	public Configuration getConf() {
		return new Configuration();
	}

	public int run(String[] args) throws Exception {
		// 1. Get the configuration object
		Configuration conf = new Configuration();
		// 2. Set additional conf properties if needed
		// 3. Create the Job (mind which package Job is imported from)
		Job job = Job.getInstance(conf, "job");
		// 4. Set the job's main class
		job.setJarByClass(MultiTableJoinDemo.class);

		// set inputpath and outputpath
		setInputAndOutput(job, conf, args);

		// System.out.println("jiazai finished");
		// 5. Configure the map phase
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		// register the cached lookup files (the sex and user tables)
		job.addCacheFile(new URI(args[2]));
		job.addCacheFile(new URI(args[3]));

		// System.out.println("map finished");
		// 6. Reduce phase: not needed for this map-side join, so it stays commented out;
		// the default identity reducer simply passes the map output through.
		/* job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class); */

		return job.waitForCompletion(true) ? 0 : 1;
	}

	// main method
	public static void main(String[] args) throws Exception {
		int isok = ToolRunner.run(new Configuration(), new MultiTableJoinDemo(), args);
		System.out.println(isok);
	}

	/**
	 * Handle the input/output path arguments
	 * @param job
	 * @param conf
	 * @param args
	 */
	private void setInputAndOutput(Job job, Configuration conf, String[] args) {
		if (args.length != 4) {
			System.out.println("usage: yarn jar /*.jar package.classname <input> <output> <sex file> <user file>");
			return;
		}
		// normal case: wire up the input and output paths
		try {
			FileInputFormat.addInputPath(job, new Path(args[0]));

			FileSystem fs = FileSystem.get(conf);
			Path outputpath = new Path(args[1]);
			if (fs.exists(outputpath)) {
				fs.delete(outputpath, true);
			}
			FileOutputFormat.setOutputPath(job, outputpath);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}
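For reference, the lookup tables could also be loaded without going through the local cache paths, by opening the cached files directly from HDFS. A sketch of that setup() variant, assuming the URIs added via job.addCacheFile() point at files on HDFS:

		// Alternative setup() sketch: read the cache files via their original URIs.
		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			URI[] uris = context.getCacheFiles();
			for (URI uri : uris) {
				Path p = new Path(uri.getPath());
				String fileName = p.getName();
				FileSystem fs = FileSystem.get(uri, context.getConfiguration());
				BufferedReader br = new BufferedReader(new java.io.InputStreamReader(fs.open(p)));
				String str;
				while ((str = br.readLine()) != null) {
					String[] strs = str.split("\t");
					if (fileName.equals("sex")) {
						sexMap.put(strs[0], strs[1]);
					} else if (fileName.equals("user")) {
						userMap.put(strs[0], strs[1]);
					}
				}
				br.close();
			}
		}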



Reposted from blog.csdn.net/penghao_1/article/details/104223796