Article Directory
MapReduce communication data cleaning processing
A copy of the communication data of related personnel is now obtained, and the data is simply preprocessed and integrated, and finally output.
demand:
- Replace each phone number with its owner's name
- Convert the timestamps of making and receiving calls into dates
- Find out the call time, in seconds
- Replace the province code with the province name
Data sets:
phone
caller's mobile phone number, recipient's mobile phone number, start timestamp, acceptance timestamp, caller address province code, recipient address province code
city
: address id, province code, province name
person
: phone ID, phone number, name
Example of final data:
任宗阳,邓二,1970年01月18日 06:41:23,1970年01月18日 06:41:23,186秒,浙江省,新疆维吾尔自治区
1. Encapsulate Bean class
- Create Bean class:Package data output content
- Override toString(): Define the output format
- Inherited from WritableComparable: Realize serialization and deserialization methods
- Define setter and getter methods
- Custom set() method: Used for assignment
package 通信数据处理;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class BeanTest implements WritableComparable<BeanTest> {
    // Encapsulated output fields. All are serialized with writeUTF(), so every field
    // must be non-null before write() is called — writeUTF(null) throws NullPointerException.
    private String sender;   // caller's name
    private String receiver; // callee's name
    private String start;    // formatted call start time
    private String end;      // formatted call end time
    private String interval; // call duration, e.g. "186秒"
    private String p;        // caller's province name
    private String c;        // callee's province name

    /** No-arg constructor required by Hadoop for reflective deserialization. */
    public BeanTest() {
    }

    /** Output format: sender,receiver,start,end,interval,callerProvince,calleeProvince. */
    @Override
    public String toString() {
        StringBuilder buffer = new StringBuilder();
        buffer.append(sender).append(",");
        buffer.append(receiver).append(",");
        buffer.append(start).append(",");
        buffer.append(end).append(",");
        buffer.append(interval).append(",");
        buffer.append(p).append(",");
        buffer.append(c);
        return buffer.toString();
    }

    /**
     * Orders records by their full textual form. The original implementation
     * always returned 0, which made every record compare equal: all rows then
     * collapsed into a single reduce group and the output sort order was
     * undefined. Comparing the textual form gives a stable, deterministic order
     * and keeps distinct records in distinct groups.
     */
    @Override
    public int compareTo(BeanTest o) {
        return this.toString().compareTo(o.toString());
    }

    /** Hadoop serialization: field order must match readFields() exactly. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(sender);
        dataOutput.writeUTF(receiver);
        dataOutput.writeUTF(start);
        dataOutput.writeUTF(end);
        dataOutput.writeUTF(interval);
        dataOutput.writeUTF(p);
        dataOutput.writeUTF(c);
    }

    /** Hadoop deserialization: reads fields in the same order write() emitted them. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        sender = dataInput.readUTF();
        receiver = dataInput.readUTF();
        start = dataInput.readUTF();
        end = dataInput.readUTF();
        interval = dataInput.readUTF();
        p = dataInput.readUTF();
        c = dataInput.readUTF();
    }

    /** Convenience setter used by the mapper to populate all fields in one call. */
    public void set(String sender, String receiver, String start, String end, String interval, String p, String c) {
        this.sender = sender;
        this.receiver = receiver;
        this.start = start;
        this.end = end;
        this.interval = interval;
        this.p = p;
        this.c = c;
    }

    public String getSender() {
        return sender;
    }

    public void setSender(String sender) {
        this.sender = sender;
    }

    public String getReceiver() {
        return receiver;
    }

    public void setReceiver(String receiver) {
        this.receiver = receiver;
    }

    public String getStart() {
        return start;
    }

    public void setStart(String start) {
        this.start = start;
    }

    public String getEnd() {
        return end;
    }

    public void setEnd(String end) {
        this.end = end;
    }

    public String getInterval() {
        return interval;
    }

    public void setInterval(String interval) {
        this.interval = interval;
    }

    public String getP() {
        return p;
    }

    public void setP(String p) {
        this.p = p;
    }

    public String getC() {
        return c;
    }

    public void setC(String c) {
        this.c = c;
    }
}
2. MapperTest class for data conversion
- Here we use small files as a distributed cache: person.txt and city.txt contain relatively little data, so they are registered as cache files. - First, call
context.getCacheFiles()
to obtain the array of cache files. Taking the person data set as an example: each record has three fields, namely 电话ID,
电话号码, and
姓名
. Our goal is to replace the phone numbers in the first two columns of phone.txt
with the owners' names, so the phone-ID column of the person data is not needed and is ignored during processing. Likewise, the first column of the city data set can be ignored. - We cache the
归属人 (owner)
and 归属地 (home location)
information in two HashMaps keyed as 电话->人名
and 地址ID->省份
. The advantage of this is that, when scanning the original data later, a simple
get() on the key
returns the replacement value, completing the substitution. - For converting the time data, use the SimpleDateFormat class (specifying the output pattern). Since the raw value is a
String
, it must first be parsed into a Long
timestamp (Long.parseLong(原始String数据)
). The call duration is simply the end time minus the start time.
package 通信数据处理;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.HashMap;
public class MapperTest extends Mapper<LongWritable, Text, BeanTest, NullWritable> {
    String sender;   // caller's name
    String receiver; // callee's name
    String p;        // caller's home province
    String c;        // callee's home province
    String start;    // formatted call start time
    String end;      // formatted call end time
    String interval; // call duration
    // Lookup tables built from the distributed-cache files in setup().
    HashMap<String, String> people = new HashMap<>();    // phone number -> person name
    HashMap<String, String> provience = new HashMap<>(); // province code -> province name
    // Reused output key: Hadoop serializes it on each write, so reuse is safe.
    BeanTest k = new BeanTest();
    // SimpleDateFormat is not thread-safe, but a Mapper instance is single-threaded;
    // hoisted here so it is not re-created for every input record.
    private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");

    /**
     * Loads both small lookup files from the distributed cache:
     * index 0 is the person file, index 1 is the city/province file.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] cacheFiles = context.getCacheFiles();
        // person file, e.g. "7,18000696806,赵贺彪" -> stores 18000696806 -> 赵贺彪
        loadTable(cacheFiles[0].getPath(), people);
        // city file, e.g. "1,110000,北京市" -> stores 110000 -> 北京市
        loadTable(cacheFiles[1].getPath(), provience);
    }

    /**
     * Reads a comma-separated cache file and stores column 1 -> column 2.
     * The first column (a row id) is not needed. Blank lines are skipped;
     * the original loop stopped at the first empty line, silently dropping
     * any rows after it. try-with-resources guarantees the reader is closed.
     */
    private void loadTable(String path, HashMap<String, String> table) throws IOException {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                String[] fields = line.split(",");
                table.put(fields[1], fields[2]);
            }
        }
    }

    /**
     * Transforms one raw call record into a human-readable BeanTest record:
     * numbers become names, codes become province names, epoch timestamps
     * become formatted dates, and the duration is computed in seconds.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // One input line: 18620192711,15733218050,1506628174,1506628265,650000,810000
        String[] fields = value.toString().split(",");
        // Replace numbers/codes via the cached lookup tables.
        // NOTE(review): get() returns null for unknown numbers/codes — confirm the
        // cache files cover every value in phone.txt, or add a fallback.
        sender = people.get(fields[0]);
        receiver = people.get(fields[1]);
        p = provience.get(fields[4]);
        c = provience.get(fields[5]);
        // The timestamps are Unix epoch SECONDS, but SimpleDateFormat expects
        // milliseconds. The original formatted the raw seconds value, which is
        // why its sample output showed 1970 dates and identical start/end times.
        long startSec = Long.parseLong(fields[2]);
        long endSec = Long.parseLong(fields[3]);
        start = sdf.format(startSec * 1000L);
        end = sdf.format(endSec * 1000L);
        // Duration in seconds = end - start (both already in seconds).
        interval = (endSec - startSec) + "秒";
        k.set(sender, receiver, start, end, interval, p, c);
        context.write(k, NullWritable.get());
    }
}
3.Reducer class output data
package 通信数据处理;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class ReducerTest extends Reducer<BeanTest, NullWritable, BeanTest, NullWritable> {
    /**
     * Pass-through reducer: writes the key once per grouped value, so
     * duplicate call records survive into the final output unchanged.
     */
    @Override
    protected void reduce(BeanTest key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        java.util.Iterator<NullWritable> each = values.iterator();
        while (each.hasNext()) {
            each.next(); // advance the grouped-value cursor; the value itself carries no data
            context.write(key, NullWritable.get());
        }
    }
}
4. DriverTest class configuration job
package 通信数据处理;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;
public class DriverTest {
    /**
     * Configures and submits the communication-data-cleaning MapReduce job.
     *
     * Usage: DriverTest [inputPath outputPath [personCacheURI cityCacheURI]]
     * When no arguments are supplied, the original hard-coded development
     * paths are used, so existing invocations keep working unchanged.
     * Exits 0 on success and 1 on job failure or any configuration error
     * (the original fell through the catch block and exited 0 on error).
     */
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Basic job wiring
            job.setJarByClass(DriverTest.class);
            job.setMapperClass(MapperTest.class);
            job.setReducerClass(ReducerTest.class);
            job.setMapOutputKeyClass(BeanTest.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(BeanTest.class);
            job.setOutputValueClass(NullWritable.class);
            // Distributed-cache files: small lookup tables read by every mapper in setup().
            // Index 0 must be the person file, index 1 the city file (MapperTest relies on this order).
            URI[] uris = new URI[2];
            uris[0] = new URI(args.length >= 4 ? args[2]
                    : "file:///G:/Projects/IdeaProject-C/MapReduce/src/main/java/通信数据处理/cache/person.txt");
            uris[1] = new URI(args.length >= 4 ? args[3]
                    : "file:///G:/Projects/IdeaProject-C/MapReduce/src/main/java/通信数据处理/cache/city.txt");
            job.setCacheFiles(uris);
            // Input and output locations (CLI-overridable).
            String input = args.length >= 2 ? args[0]
                    : "G:\\Projects\\IdeaProject-C\\MapReduce\\src\\main\\java\\通信数据处理\\data\\phone.txt";
            String output = args.length >= 2 ? args[1]
                    : "G:\\Projects\\IdeaProject-C\\MapReduce\\src\\main\\java\\通信数据处理\\output";
            FileInputFormat.setInputPaths(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1); // fail loudly instead of silently exiting with status 0
        }
    }
}