@[TOC] Hadoop: a problem encountered while simulating data fetching

Create a new OrderBean for the Hadoop data-fetching example

package com.cevent.hadoop.mapreduce.order;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
/**

Order bean, inherited serialization comparison
@author cevent
@date 2020年4月7日
*/
public class OrderBean implements WritableComparable{

private String orderID; // Order id
private Double price; // Product price
// Mandatory empty parameter, parameterized, getter / setter
public OrderBean () {
super ();
}

public OrderBean(String orderID, Double price) {
super();
this.orderID = orderID;
this.price = price;
}

public String getOrderID() {
return orderID;
}

public void setOrderID(String orderID) {
this.orderID = orderID;
}

public Double getPrice() {
return price;
}

public void setPrice(Double price) {
this.price = price;
}

@Override
public String toString() {
return orderID + “\t” + price;
}

@Override
public void write(DataOutput output) throws IOException {
// 1.写入类型设置（序列化）
output.writeUTF(orderID);
output.writeDouble(price);

}
// Override method
@Override
public void readFields (DataInput input) throws IOException {
// 2. Set the read type (deserialization)
this.orderID = input.readUTF ();
this.price = input.readDouble () ;
}

@Override
public int compareTo (OrderBean o) {
// twice sort
//1.ID sort, return 0 is equal, return -1 is less than, return + 1–> positive order
int compareResult = this.orderID.compareTo (o. getOrderID ());
if (compareResult == 0) {
// 2. Price ordering-> Reverse order
compareResult = this.price> o.getPrice ()?-1: 1;
}
```
 return compareResult;
```
}

}

orderMapper

package com.cevent.hadoop.mapreduce.order;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**

Mapper cuts data by rows
@author cevent
@date 2020年4月7日
*/
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{

OrderBean orderBean = new OrderBean ();

@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
// 1.读取数据
String mapLine=value.toString();
// 2.切割数据
String[] fields=mapLine.split("\t");
```
 //3.封装bean对象:Order_0000001	Pdt_01	222.8  下标=1的产品id跳过
 orderBean.setOrderID(fields[0]);
 orderBean.setPrice(Double.parseDouble(fields[2]));
 //4.写出数据
 context.write(orderBean, NullWritable.get());
```
}

}

orderPartitioner

package com.cevent.hadoop.mapreduce.order;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitioner that routes records by order id, so all records of one order
 * are assigned the same partition number.
 *
 * <p>Created on 2020-04-07.
 *
 * @author cevent
 */
public class OrderPartitioner extends Partitioner<OrderBean, NullWritable> {

    /**
     * Computes the partition from the hash code of the order id.
     *
     * @param key           the map output key
     * @param value         the map output value (unused)
     * @param numPartitions the total number of partitions
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
        // Same remainder scheme the original comment attributes to HashPartitioner, but
        // hashing only the order id. Masking with Integer.MAX_VALUE clears the sign bit
        // so the remainder is never negative.
        return (key.getOrderID().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }

}

orderGroupingComparator

package com.cevent.hadoop.mapreduce.order;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator that treats two {@link OrderBean} keys as belonging to the
 * same group when their order ids compare equal; the price is ignored.
 */
public class OrderGroupingComparator extends WritableComparator {

    /**
     * No-argument constructor (the original author notes it is required).
     * Registers {@link OrderBean} as the key class; the {@code true} flag is passed to
     * the base class — presumably "create key instances", confirm against the
     * WritableComparator documentation.
     */
    public OrderGroupingComparator() {
        super(OrderBean.class, true);
    }

    /**
     * Compares two keys by order id only.
     *
     * @param a the first key, expected to be an {@link OrderBean}
     * @param b the second key, expected to be an {@link OrderBean}
     * @return the result of comparing the two order ids as strings
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final OrderBean left = (OrderBean) a;
        final OrderBean right = (OrderBean) b;

        // Equal order ids (result 0) mean both keys fall into the same group.
        final String leftId = left.getOrderID();
        final String rightId = right.getOrderID();
        return leftId.compareTo(rightId);
    }

}

orderReducer

package com.cevent.hadoop.mapreduce.order;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
/**

Reducer receives mapper data
@author cevent
@date 2020 年 4 月 7 日
* /
public class OrderReducer extends Reduce <OrderBean, NullWritable, OrderBean, NullWritable> {

@Override
protected void reduce(OrderBean bean, Iterable values,Context context)
throws IOException, InterruptedException {
// 写出数据
context.write(bean,NullWritable.get());
}
}

orderDriver

Link: link .

package com.cevent.hadoop.mapreduce.order;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the order MapReduce job.
 *
 * <p>Usage: {@code OrderDriver <input path> <output path>}
 *
 * <p>Created on 2020-04-07.
 *
 * @author cevent
 */
public class OrderDriver {

    /**
     * Builds the job, wires in the mapper, reducer, partitioner and grouping
     * comparator, runs it and exits with status 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of ArrayIndexOutOfBoundsException
        // when the paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: OrderDriver <input path> <output path>");
            System.exit(2);
        }

        // 1. Obtain the configuration and create the job.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2. Tell Hadoop which jar to ship, located via this class.
        job.setJarByClass(OrderDriver.class);

        // 3. Set the mapper and reducer classes.
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);

        // 4. Set the map output key and value types.
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5. Set the final output key and value types.
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6. Set the input and output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Register the grouping comparator (groups keys by order id).
        job.setGroupingComparatorClass(OrderGroupingComparator.class);

        // 7. Set the partitioner.
        job.setPartitionerClass(OrderPartitioner.class);

        // 8. Set the number of reduce tasks.
        job.setNumReduceTasks(3);

        // 9. Submit the job and wait for it to finish.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

But I ran into an unexpected and frustrating result ...
Insert picture description here

cevent

Published 5 original articles · praised 4 · visits 61

Private letter concerns

hadoop to simulate the problem of fetching data

Create a new orderBean about using Hadoop application data capture function

orderMapper

orderPartitioner

orderGroupingComparator

orderReducer

orderDriver

Guess you like