UUID

1. Today I was computing PV/UV over 30 million records. On a whim I made a UUID column the primary key and used six threads to process six files. The tests showed:
(1) With the UUID key, batch inserts were committed every 10,000 rows (a 100,000-row batch had caused an out-of-memory error); a JDBC sketch of this kind of insert follows the table below.
    The first six minutes ran acceptably, inserting about 2.5 million rows, but after that the inserts slowed to a crawl. I couldn't stand it and killed the run.
(2) Aggregating PV/UV with select url,sum(nums) from puv group by url was also painfully slow: 2.2 minutes.
Test results (rows inserted / elapsed minutes; the 2.2 min is the GROUP BY time):
  3.20M   31   2.2 min
  2.70M   12
  2.54M    8
  2.50M    6
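For reference, here is a minimal sketch of the kind of UUID-keyed batch insert this test measures, assuming a JDBC connection and a cs1 table whose id column is a varchar(36) primary key. The UrlCount type, the insertWithUuid method and the connection URL are illustrative, not from the original code (the actual listing further down builds one multi-row INSERT string per 10,000 rows instead of using a prepared-statement batch).

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;
import java.util.UUID;

public class UuidInsertSketch {
    /** One parsed log record: a url plus its hit count (illustrative type, not from the post). */
    static class UrlCount {
        final String url;
        final int nums;
        UrlCount(String url, int nums) { this.url = url; this.nums = nums; }
    }

    /** Variant with a UUID primary key, committing every 10,000 rows as in the test above. */
    static void insertWithUuid(String jdbcUrl, List<UrlCount> rows) throws Exception {
        try (Connection conn = DriverManager.getConnection(jdbcUrl);
             PreparedStatement ps = conn.prepareStatement(
                     "insert into cs1(id, url, nums) values (?, ?, ?)")) {
            conn.setAutoCommit(false);
            int n = 0;
            for (UrlCount row : rows) {
                ps.setString(1, UUID.randomUUID().toString()); // random key: every insert lands on a random index page
                ps.setString(2, row.url);
                ps.setInt(3, row.nums);
                ps.addBatch();
                if (++n % 10000 == 0) { // commit every 10,000 rows
                    ps.executeBatch();
                    conn.commit();
                }
            }
            ps.executeBatch(); // flush the final partial batch
            conn.commit();
        }
    }
}

Dropping the id parameter (and the id column) gives the non-UUID variant used in the second test.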
-------------------
After dropping the UUID column, batch inserts ran at a steady pace and the CPU load stayed even.
(1) Aggregating with select url,sum(nums) from puv group by url was still slow, but now took 92 seconds (a JDBC timing sketch follows the table).
Statistics from that run (rows inserted / elapsed minutes; the 92 s is the GROUP BY time):
  32,811,948   38   92 s
  30.90M       36
  27.61M       32
  25.68M       29
  20.56M       24
  17.87M       21
  11.23M       13
  10.29M       12
   9.13M       11
   8.23M       10
   7.08M        8
   6.04M        7
   4.70M        6
   3.30M        4
   1.30M        2
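As a reference for how these query timings can be reproduced, here is a minimal JDBC sketch, assuming the same puv table; the class name, connection URL and credentials are placeholders, not from the original post.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class PuvQueryTiming {
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        // Placeholder connection settings; point these at your own database.
        try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "root");
             Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery("select url, sum(nums) from puv group by url")) {
            long groups = 0;
            while (rs.next()) {
                groups++; // drain the result set so the server finishes the aggregation
            }
            System.out.println(groups + " groups in " + (System.currentTimeMillis() - start) / 1000 + " s");
        }
    }
}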
Summary: once the table reaches roughly two million rows, the UUID key actually degrades both batch-insert and query performance, most likely because random UUID values scatter every insert across the primary-key index (page splits, poor cache locality) and the 36-character key fattens every row the GROUP BY has to scan.
-----------------------------------
When I processed the same data with Hadoop, it took only 3.6 minutes. My guess is that Spark would need around 1.5 minutes.
Code listing:
package com.sho9wbox.utils;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.recommend.JDBIBase;

public class AppRank {
    private static long totalTimes = 0L;
    private static Date beginTime1 = null;

    /**
     * Starts six worker threads, one per input file (E:/1.txt ... E:/6.txt).
     * Note that totalTimes is updated from several threads without synchronization,
     * so the printed total is only approximate.
     */
    public static void main(String[] args) {
        beginTime1 = new Date();
        for (int i = 1; i <= 6; i++) {
            final String fileNo = String.valueOf(i);
            Thread t = new Thread(new Runnable() {
                @Override
                public void run() {
                    new AppRank().dealData(fileNo);
                }
            });
            t.start();
            System.out.println("t" + fileNo + " begin...........");
        }
    }
   
    // Matches host names like xxx.com.cn / xxx.com / xxx.cn (the dots are left unescaped, as in the original)
    private static String comile = "(.)\\w+(.)com(.)cn|(.)\\w+(\\.)com|(.)\\w+(.)cn";
    // Matches host:port after "http://", e.g. http://\d+.\d+.\d+.\d+:\d+/
    private static String compile1 = "(?<=http://)(\\w+[\\.|/])+\\w+:\\d+/";
    private static Pattern pattern = Pattern.compile(comile);
    private static Pattern pattern1 = Pattern.compile(compile1);

    /**
     * Reads E:/<s>.txt line by line, extracts (url, nums) pairs and writes them
     * to table cs1 with one multi-row INSERT per 10,000 lines.
     */
    public void dealData(String s) {
        long beginTime = System.currentTimeMillis();
        List<Map<String, Object>> list = new ArrayList<>();
        String path = "E://" + s + ".txt";
        FileReader file = null;
        try {
            file = new FileReader(path);
            BufferedReader buffer = new BufferedReader(file);
            String str;
            StringBuilder sb = new StringBuilder();
            sb.append("insert into cs1(url,nums) values");
            while ((str = buffer.readLine()) != null) {
                System.out.println(s + "------" + new Date() + "      " + beginTime1);
                Map<String, Object> map = getData(str);
                list.add(map);
                if (list.size() % 10000 == 0) {
                    for (Map<String, Object> mapc : list) {
                        for (Entry<String, Object> entry : mapc.entrySet()) {
                            String key = entry.getKey().replace("\n", "").trim();
                            String val = entry.getValue().toString().replace("\n", "").trim();
                            int values = Integer.parseInt(val);
                            // UUID variant used in the first test:
                            //sb.append("('"+UUID.randomUUID()+"','"+key+"',"+values+"),");
                            sb.append("('" + key + "'," + values + "),");
                        }
                    }
                    String s1 = sb.toString();
                    s1 = s1.substring(0, s1.lastIndexOf(","));
                    JDBIBase.execute(s1);
                    list.clear();
                    sb.delete(0, sb.length());
                    sb.append("insert into cs1(url,nums) values");
                }
            }
            // Flush whatever is left over after the last full batch of 10,000.
            if (!list.isEmpty()) {
                for (Map<String, Object> mapc : list) {
                    for (Entry<String, Object> entry : mapc.entrySet()) {
                        String key = entry.getKey().replace("\n", "").trim();
                        String val = entry.getValue().toString().replace("\n", "").trim();
                        int values = Integer.parseInt(val);
                        //sb.append("('"+UUID.randomUUID()+"','"+key+"',"+values+"),");
                        sb.append("('" + key + "'," + values + "),");
                    }
                }
                String s1 = sb.toString();
                s1 = s1.substring(0, s1.lastIndexOf(","));
                JDBIBase.execute(s1);
                list.clear();
            }
            long end = System.currentTimeMillis();
            totalTimes += end - beginTime;
            System.out.println("File processed, " + totalTimes / 1000 + " seconds total");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Exception occurred......");
        } finally {
            try {
                if (file != null) {
                    file.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses one log line of the form "<url> <count>" and returns a single-entry
     * map keyed by the normalized host, skipping localhost/127.0.0.1 entries.
     */
    public Map<String, Object> getData(String s) throws Exception {
        Map<String, Object> map = new ConcurrentHashMap<String, Object>();
        String[] arr = s.split(" ");
        Matcher m = pattern.matcher(arr[0]);
        String rs = null;
        if (m.find()) {
            rs = m.group().replace("/", ".");
            rs = "http://www" + rs;
        } else {
            Matcher m1 = pattern1.matcher(arr[0]);
            while (m1.find()) {
                rs = m1.group();
            }
            if (null == rs) {
                rs = arr[0];
            }
        }
        // Keep the entry only if it is not a localhost/127.0.0.1 URL
        // (the original "indexOf(...) <= 0 ||" test let everything through).
        if (rs.indexOf("localhost") < 0 && rs.indexOf("127.0.0.1") < 0) {
            rs = rs.replace("/", "."); // keep the replaced result; it was previously discarded
            map.put(rs, arr[1]);
        }
        return map;
    }
}
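JDBIBase (from com.recommend) is not shown in the post. To make the listing above compilable, here is a minimal stand-in, assuming a plain JDBC connection with placeholder credentials; the original presumably reuses a pooled connection rather than opening one per statement.

package com.recommend;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

/** Not part of the original post: a minimal stand-in for the JDBIBase helper used above. */
public class JDBIBase {
    // Placeholder connection settings; point these at your own database.
    private static final String URL = "jdbc:mysql://localhost:3306/test";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    /** Executes one SQL statement, here the multi-row INSERTs built by AppRank. */
    public static void execute(String sql) {
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             Statement st = conn.createStatement()) {
            st.execute(sql);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}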
Hadoop code:
package com;

import java.io.IOException;
import java.net.URI;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class RankAppSort extends Configured implements Tool {

    static String INPUT_PATH = "hdfs://192.168.1.230:9000/input";
    static String OUT_PATH = "hdfs://192.168.1.230:9000/output/";

    @Override
    public int run(String[] arg0) throws Exception {
        /*INPUT_PATH = arg0[0];
        OUT_PATH = arg0[1];*/
        long beginTime = System.currentTimeMillis();
        System.out.println(System.currentTimeMillis());
        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        final Path outPath = new Path(OUT_PATH);
        // Delete the output directory from a previous run, otherwise the job fails to start.
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        final Job job = new Job(conf, RankAppSort.class.getSimpleName());
        job.setJarByClass(RankAppSort.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // 1.2 the custom map class
        job.setMapperClass(MyMapper.class);
        // Map output <k,v> types; can be omitted when <k3,v3> has the same types as <k2,v2>
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 1.3 partitioning
        //job.setPartitionerClass(HashPartitioner.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.3 where to write the output
        FileOutputFormat.setOutputPath(job, outPath);
        // output format class
        //job.setOutputFormatClass(TextOutputFormat.class);

        // Submit the job to the JobTracker and wait for it to finish
        job.waitForCompletion(true);
        long end = System.currentTimeMillis();
        System.out.println((end - beginTime) / 1000 + " seconds");
        return 0;
    }

    // Same patterns as in AppRank: host names like xxx.com.cn / xxx.com / xxx.cn,
    // or host:port after "http://", e.g. http://\d+.\d+.\d+.\d+:\d+/
    private static String comile = "(.)\\w+(.)com(.)cn|(.)\\w+(\\.)com|(.)\\w+(.)cn";
    private static String compile1 = "(?<=http://)(\\w+[\\.|/])+\\w+:\\d+/";
    private static Pattern pattern = Pattern.compile(comile);
    private static Pattern pattern1 = Pattern.compile(compile1);

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new RankAppSort(), args);
    }

    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            final String[] arr = v1.toString().split(" ");
            // Skip lines with an empty url field instead of dereferencing a null matcher.
            if (null == arr[0] || "".equals(arr[0])) {
                return;
            }
            Matcher m = pattern.matcher(arr[0]);
            String rs = null;
            if (m.find()) {
                rs = m.group().replace("/", ".");
                rs = "http://www" + rs;
            } else {
                Matcher m1 = pattern1.matcher(arr[0]);
                while (m1.find()) {
                    rs = m1.group();
                }
                if (null == rs) {
                    rs = arr[0];
                }
            }
            // Exclude localhost and 127.0.0.1 entries.
            if (rs.indexOf("localhost") > -1 || rs.indexOf("127.0.0.1") > -1) {
                return;
            }
            rs = rs.replace("/", "."); // keep the replaced result; it was previously discarded
            context.write(new Text(rs), new LongWritable(Long.parseLong(arr[1])));
        }
    }


    static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        protected void reduce(Text k2, Iterable<LongWritable> v2s, Context ctx) throws IOException, InterruptedException {
            // Sum all counts for one url and emit "url<TAB>total"
            // (the original wrapped the pair in an undefined IntPair class and unwrapped it immediately).
            long times = 0;
            for (LongWritable count : v2s) {
                times += count.get();
            }
            ctx.write(k2, new LongWritable(times));
        }
    }


}
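One optional tweak, not in the original job: since MyReducer only sums longs, it can also be registered as a combiner in run() to pre-aggregate on the map side and cut shuffle volume.

job.setCombinerClass(MyReducer.class); // pre-aggregate per-url counts before the shuffle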

The Spark code listing will follow when I have time.

Reposted from hadasione.iteye.com/blog/2205197