Single-table join:
Given a child-parent table,
produce the grandchild-grandparent table.
Input:
Data in file p:
Tom,Lucy
Tom,Jack
Jone,Lucy
Jone,Jack
Lucy,Mary
Lucy,Ben
Jack,Alice
Jack,Jesse
Terry,Alice
Terry,Jesse
Philip,Terry
Philip,Alma
Mark,Terry
Mark,Alma
Output:
Tom,Alice
Tom,Jesse
Jone,Alice
Jone,Jesse
Tom,Mary
Tom,Ben
Jone,Mary
Jone,Ben
Philip,Alice
Philip,Jesse
Mark,Alice
Mark,Jesse
Approach:
1. Split each line on the comma and write it out twice: once with the child as the key and once with the parent as the key, tagging each copy so the reducer can tell the two sides apart (illustrated below).
2. From there the method is the same as the multi-table join below.
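For example, the record Tom,Lucy is emitted twice by the map, once under each person:
Tom 2,Tom,Lucy
Lucy 1,Tom,Lucy
After the shuffle, the reducer for key Lucy receives 1,Tom,Lucy and 1,Jone,Lucy (Lucy as the parent, so Tom and Jone are grandchild candidates) together with 2,Lucy,Mary and 2,Lucy,Ben (Lucy as the child, so Mary and Ben are the grandparents). The cross product of the two lists yields Tom,Mary Tom,Ben Jone,Mary Jone,Ben.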
Implementation code:
package One_File_Relation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class One_file_test {
    static String INPUT_PATH = "hdfs://master:9000/input/p";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<Object, Object, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        protected void map(Object key, Object value, Context context) throws IOException, InterruptedException {
            String[] tokens = value.toString().split(",");
            if (tokens != null && tokens.length == 2) {
                // Emit once keyed on the child, tagged 2: this record carries the key's parent.
                output_key.set(tokens[0].trim());
                output_value.set(2 + "," + value);
                context.write(output_key, output_value);
                // Emit once keyed on the parent, tagged 1: this record carries the key's child.
                output_value.set(1 + "," + value);
                output_key.set(tokens[1].trim());
                context.write(output_key, output_value);
                System.out.println(tokens[0] + " - " + tokens[1]);
            }
        }
    }
    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> childs = new ArrayList<String>();
            List<String> grands = new ArrayList<String>();
            for (Text line : values) {
                String[] tokens = line.toString().split(",");
                if (tokens[0].equals("1")) {
                    // Tag 1: tokens[1] is a child of the current key.
                    childs.add(tokens[1]);
                    System.out.println(1 + "==" + tokens[1]);
                } else if (tokens[0].equals("2")) {
                    // Tag 2: tokens[2] is a parent of the current key, i.e. a grandparent.
                    grands.add(tokens[2]);
                    System.out.println(2 + "==" + tokens[2]);
                }
            }
            // Cross product: pair every grandchild with every grandparent.
            for (String c : childs)
                for (String g : grands) {
                    output_key.set(c);
                    output_value.set(g);
                    context.write(output_key, output_value);
                }
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Path outputpath = new Path(OUTPUT_PATH);
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://master:9000/");
        FileSystem fs = outputpath.getFileSystem(conf);
        if (fs.exists(outputpath)) {
            // Delete any previous output so the job can be rerun.
            fs.delete(outputpath, true);
        }
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.waitForCompletion(true);
    }
}
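To run the job, package the class into a jar and submit it with hadoop jar (the jar name below is just a placeholder), then inspect the result; with the default single reducer the output lands in part-r-00000:
xm@master:~$ hadoop jar one_file_test.jar One_File_Relation.One_file_test
xm@master:~$ hadoop fs -text /output/part-r-00000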
Multi-table join:
Input:
File num1:
xm@master:~$ hadoop fs -text /b/num1
1,Beijing
2,Guangzhou
3,Shenzhen
4,Xian
File num2:
xm@master:~$ hadoop fs -text /b/num2
Beijing Red Star,1
Shenzhen Thunder,3
Guangzhou Honda,2
Beijing Rising,1
Guangzhou Development Bank,2
Tencent,3
Back of Beijing,1
Output:
Back of Beijing Beijing
Beijing Rising Beijing
Beijing Red Star Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Tencent Shenzhen
Shenzhen Thunder Shenzhen
Approach:
1. In the mapper's setup method, obtain the name of the file the current split comes from (num1 or num2).
2. Map processing.
Records from file num1 are emitted in the following form:
1 2,1,Beijing (key=1, value=2,1,Beijing)
Records from file num2 are emitted in the following form:
1 1,Beijing Red Star,1 (key=1, value=1,Beijing Red Star,1)
Then context.write(key, value);
so records from both files end up grouped under the same key:
1 2,1,Beijing
1 1,Beijing Red Star,1
3. Reduce processing.
The reducer first checks whether the first field of each value is 1 or 2.
If it is 1, the second field becomes the output key (key=Beijing Red Star).
If it is 2, the third field becomes the output value (value=Beijing).
Then context.write(key, value);
so the output file takes the following form:
Beijing Red Star Beijing
Done! A worked example follows.
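For instance, with the sample data above, the reducer for key 1 receives the values:
2,1,Beijing
1,Beijing Red Star,1
1,Beijing Rising,1
1,Back of Beijing,1
The single tag-2 value contributes the city Beijing, the tag-1 values contribute the three company names, and the cross product produces the three Beijing lines shown in the output above.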
Implementation code:
package Sum_File_Relation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Sum_File_Relation {
    static String INPUT_PATH = "hdfs://master:9000/b";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<Object, Object, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();
        String k = "";

        // Obtain the name of the input file (num1 or num2) this split comes from.
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit fs = (FileSplit) context.getInputSplit();
            k = fs.getPath().getName();
            System.out.println(k);
        }

        protected void map(Object key, Object value, Context context) throws IOException, InterruptedException {
            String[] tokens = value.toString().split(",");
            if (tokens != null && tokens.length == 2) {
                if (k.equals("num1")) {
                    // num1 record "id,city": key on the id, tag 2 marks the city side.
                    output_key.set(tokens[0].trim());
                    output_value.set(2 + "," + value);
                    context.write(output_key, output_value);
                } else if (k.equals("num2")) {
                    // num2 record "company,id": key on the id, tag 1 marks the company side.
                    output_value.set(1 + "," + value);
                    output_key.set(tokens[1].trim());
                    context.write(output_key, output_value);
                }
                System.out.println(tokens[0] + " - " + tokens[1]);
            }
        }
    }
    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // "childs" collects company names (tag 1); "grands" collects city names (tag 2).
            List<String> childs = new ArrayList<String>();
            List<String> grands = new ArrayList<String>();
            for (Text line : values) {
                String[] tokens = line.toString().split(",");
                if (tokens[0].equals("1")) {
                    childs.add(tokens[1]);
                    System.out.println(1 + "==" + tokens[1]);
                } else if (tokens[0].equals("2")) {
                    grands.add(tokens[2]);
                    System.out.println(2 + "==" + tokens[2]);
                }
            }
            // Emit the cross product: each key pairs every company with its city.
            for (String c : childs)
                for (String g : grands) {
                    output_key.set(c);
                    output_value.set(g);
                    context.write(output_key, output_value);
                }
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Path outputpath = new Path(OUTPUT_PATH);
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://master:9000/");
        FileSystem fs = outputpath.getFileSystem(conf);
        if (fs.exists(outputpath)) {
            // Delete any previous output so the job can be rerun.
            fs.delete(outputpath, true);
        }
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.waitForCompletion(true);
    }
}
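As before, the result can be checked with hadoop fs -text /output/part-r-00000 (assuming the default single reducer). Note that both jobs are reduce-side joins: the reducer buffers every value for a key in two in-memory lists, which is fine for small inputs like these but would not scale to keys with very large value sets.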