一、对文件分区
为了充分利用多线程读取,就需要把文件划分成多个区域,供每个线程读取。那么就需要有一个算法来计算出每个线程读取的开始位置和结束位置。那么首先根据配置的线程数和文件的总长度,计算出每个线程平均分配的读取长度。但是有一点,由于文件是纯文本文件,必须按行来处理,如果分割点在某一行中间,那么这一行数据就会被分成两部分,分别由两个线程同时处理,这种情况是不能出现的。所以各个区域的结束点上的字符必须是换行符。第一个区域的开始位置是0,结束位置首先设为(文件长度/线程数),如果结束点位置不是换行符,就只能加1,直到是换行符位置。第一个区域的结束位置有了,自然我们就能求出第二个区域的开始位置了,同理根据上边算法求出第二个区域的结束位置,然后依次类推第三个、第四个......
上边的算法中,第一个区域的结束位置定了,才能有第二个区域的开始位置,第二个区域的结束位置定了,才能有第三个区域的开始位置,依次这么下去。照这种规律,自然地想到的是用递归来解决。(详情看源码)
二、内存文件映射
简单说一下内存文件映射:
内存文件映射,简单地说就是将文件映射到内存的某个地址上。
要理解内存文件映射,首先得明白普通方式读取文件的流程:
首先内存空间分为内核空间和用户空间,在应用程序读取文件时,底层会发起系统调用,由系统调用将数据先读入到内核空间,然后再将数据拷贝到应用程序的用户空间供应用程序使用。这个过程多了一个从内核空间到用户空间拷贝的过程。
如果使用内存文件映射,文件会被映射到物理内存的某个地址上(不是数据加载到内存),此时应用程序读取文件的地址就是一个内存地址,而这个内存地址会被映射到前面说到的物理内存的地址上。应用程序发起读之后,如果数据没有加载,系统调用就会负责把数据从文件加载到这块物理地址。应用程序便可以读取到文件的数据。省去了数据从内核空间到用户空间的拷贝过程。所以速度上也会有所提高。
在我的读取大文件的实现中,就是用了Java的内存映射API,这样我们就可以在要读取某个地址时再将内容加载到内存。不需要一下子全部将内容加载进来。
package cn.dyz.tools.file; | |
import java.io.ByteArrayOutputStream; | |
import java.io.File; | |
import java.io.FileNotFoundException; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.io.UnsupportedEncodingException; | |
import java.nio.MappedByteBuffer; | |
import java.nio.channels.FileChannel.MapMode; | |
import java.util.HashSet; | |
import java.util.Set; | |
import java.util.concurrent.CyclicBarrier; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
import java.util.concurrent.atomic.AtomicLong; | |
public class BigFileReader { | |
private int threadSize; | |
private String charset; | |
private int bufferSize; | |
private IHandle handle; | |
private ExecutorService executorService; | |
private long fileLength; | |
private RandomAccessFile rAccessFile; | |
private Set<StartEndPair> startEndPairs; | |
private CyclicBarrier cyclicBarrier; | |
private AtomicLong counter = new AtomicLong(0); | |
private BigFileReader(File file,IHandle handle,String charset,int bufferSize,int threadSize){ | |
this.fileLength = file.length(); | |
this.handle = handle; | |
this.charset = charset; | |
this.bufferSize = bufferSize; | |
this.threadSize = threadSize; | |
try { | |
this.rAccessFile = new RandomAccessFile(file,"r"); | |
} catch (FileNotFoundException e) { | |
e.printStackTrace(); | |
} | |
this.executorService = Executors.newFixedThreadPool(threadSize); | |
startEndPairs = new HashSet<BigFileReader.StartEndPair>(); | |
} | |
public void start(){ | |
long everySize = this.fileLength/this.threadSize; | |
try { | |
calculateStartEnd(0, everySize); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
return; | |
} | |
final long startTime = System.currentTimeMillis(); | |
cyclicBarrier = new CyclicBarrier(startEndPairs.size(),new Runnable() { | |
@Override | |
public void run() { | |
System.out.println("use time: "+(System.currentTimeMillis()-startTime)); | |
System.out.println("all line: "+counter.get()); | |
} | |
}); | |
for(StartEndPair pair:startEndPairs){ | |
System.out.println("分配分片:"+pair); | |
this.executorService.execute(new SliceReaderTask(pair)); | |
} | |
} | |
private void calculateStartEnd(long start,long size) throws IOException{ | |
if(start>fileLength-1){ | |
return; | |
} | |
StartEndPair pair = new StartEndPair(); | |
pair.start=start; | |
long endPosition = start+size-1; | |
if(endPosition>=fileLength-1){ | |
pair.end=fileLength-1; | |
startEndPairs.add(pair); | |
return; | |
} | |
rAccessFile.seek(endPosition); | |
byte tmp =(byte) rAccessFile.read(); | |
while(tmp!='\n' && tmp!='\r'){ | |
endPosition++; | |
if(endPosition>=fileLength-1){ | |
endPosition=fileLength-1; | |
break; | |
} | |
rAccessFile.seek(endPosition); | |
tmp =(byte) rAccessFile.read(); | |
} | |
pair.end=endPosition; | |
startEndPairs.add(pair); | |
calculateStartEnd(endPosition+1, size); | |
} | |
public void shutdown(){ | |
try { | |
this.rAccessFile.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
this.executorService.shutdown(); | |
} | |
private void handle(byte[] bytes) throws UnsupportedEncodingException{ | |
String line = null; | |
if(this.charset==null){ | |
line = new String(bytes); | |
}else{ | |
line = new String(bytes,charset); | |
} | |
if(line!=null && !"".equals(line)){ | |
this.handle.handle(line); | |
counter.incrementAndGet(); | |
} | |
} | |
private static class StartEndPair{ | |
public long start; | |
public long end; | |
@Override | |
public String toString() { | |
return "star="+start+";end="+end; | |
} | |
@Override | |
public int hashCode() { | |
final int prime = 31; | |
int result = 1; | |
result = prime * result + (int) (end ^ (end >>> 32)); | |
result = prime * result + (int) (start ^ (start >>> 32)); | |
return result; | |
} | |
@Override | |
public boolean equals(Object obj) { | |
if (this == obj) | |
return true; | |
if (obj == null) | |
return false; | |
if (getClass() != obj.getClass()) | |
return false; | |
StartEndPair other = (StartEndPair) obj; | |
if (end != other.end) | |
return false; | |
if (start != other.start) | |
return false; | |
return true; | |
} | |
} | |
private class SliceReaderTask implements Runnable{ | |
private long start; | |
private long sliceSize; | |
private byte[] readBuff; | |
/** | |
* @param start read position (include) | |
* @param end the position read to(include) | |
*/ | |
public SliceReaderTask(StartEndPair pair) { | |
this.start = pair.start; | |
this.sliceSize = pair.end-pair.start+1; | |
this.readBuff = new byte[bufferSize]; | |
} | |
@Override | |
public void run() { | |
try { | |
MappedByteBuffer mapBuffer = rAccessFile.getChannel().map(MapMode.READ_ONLY,start, this.sliceSize); | |
ByteArrayOutputStream bos = new ByteArrayOutputStream(); | |
for(int offset=0;offset<sliceSize;offset+=bufferSize){ | |
int readLength; | |
if(offset+bufferSize<=sliceSize){ | |
readLength = bufferSize; | |
}else{ | |
readLength = (int) (sliceSize-offset); | |
} | |
mapBuffer.get(readBuff, 0, readLength); | |
for(int i=0;i<readLength;i++){ | |
byte tmp = readBuff[i]; | |
if(tmp=='\n' || tmp=='\r'){ | |
handle(bos.toByteArray()); | |
bos.reset(); | |
}else{ | |
bos.write(tmp); | |
} | |
} | |
} | |
if(bos.size()>0){ | |
handle(bos.toByteArray()); | |
} | |
cyclicBarrier.await();//测试性能用 | |
}catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
public static class Builder{ | |
private int threadSize=1; | |
private String charset=null; | |
private int bufferSize=1024*1024; | |
private IHandle handle; | |
private File file; | |
public Builder(String file,IHandle handle){ | |
this.file = new File(file); | |
if(!this.file.exists()) | |
throw new IllegalArgumentException("文件不存在!"); | |
this.handle = handle; | |
} | |
public Builder withTreahdSize(int size){ | |
this.threadSize = size; | |
return this; | |
} | |
public Builder withCharset(String charset){ | |
this.charset= charset; | |
return this; | |
} | |
public Builder withBufferSize(int bufferSize){ | |
this.bufferSize = bufferSize; | |
return this; | |
} | |
public BigFileReader build(){ | |
return new BigFileReader(this.file,this.handle,this.charset,this.bufferSize,this.threadSize); | |
} | |
} | |
} |
package cn.dyz.tools.file; | |
public class Main { | |
public static void main(String[] args) { | |
BigFileReader.Builder builder = new BigFileReader.Builder("d:/reliability.txt",new IHandle() { | |
@Override | |
public void handle(String line) { | |
//System.out.println(line); | |
//increat(); | |
} | |
}); | |
builder.withTreahdSize(10) | |
.withCharset("gbk") | |
.withBufferSize(1024*1024); | |
BigFileReader bigFileReader = builder.build(); | |
bigFileReader.start(); | |
} | |
} |