Count the number of times each word occurs in an article larger than 10 GB. The basic approach is shown in the code below (the detailed reasoning behind it is analyzed after the listing).
(The implementation still has some known issues and will be refined later.)
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class CountWordsOfArticle {
    // Delimiters: punctuation and whitespace; the "+" collapses runs so that
    // consecutive delimiters do not produce empty tokens.
    private static final Pattern DELIMITER = Pattern.compile("[,.;!?\\s]+");

    public void countWordsOfArticle(String fileName, int arraySize) throws IOException {
        File file = new File(fileName);
        if (!file.exists()) {
            System.out.println("File does not exist");
            return;
        }
        Map<String, Integer> map = new ConcurrentHashMap<>();
        MappedBiggerFileReader reader = new MappedBiggerFileReader(fileName, arraySize);
        while (reader.read() != -1) {
            // Count the chunk that was just read into the shared map. Note: a word
            // straddling a chunk boundary is split in two -- one of the known issues.
            wordCount(new String(reader.getArray(), StandardCharsets.UTF_8), map);
        }
        reader.close();
        printSortedResult(map);
    }

    private static void wordCount(String request, Map<String, Integer> map) {
        for (String s : DELIMITER.split(request)) {
            if (s.isEmpty()) {
                continue; // split() can yield one empty leading token
            }
            map.merge(s.toLowerCase(), 1, Integer::sum); // insert 1 or increment the count
        }
    }

    private static void printSortedResult(Map<String, Integer> map) {
        List<Map.Entry<String, Integer>> result = map.entrySet().stream()
                .sorted(new Comparator<Map.Entry<String, Integer>>() {
                    @Override
                    public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                        return o2.getValue().compareTo(o1.getValue()); // descending by count
                    }
                }).collect(Collectors.toList());
        result.forEach(item -> System.out.println(item.getKey() + " " + item.getValue()));
    }

    // Chunk reader over a memory-mapped file. Note: read() returns the number of bytes
    // copied into the internal array (or -1 at end of file), so this is deliberately NOT
    // a java.io.InputStream -- its read() does not return a byte value.
    public static class MappedBiggerFileReader {
        private MappedByteBuffer[] mappedBufArray;
        private int count = 0;
        private int number;
        private FileInputStream fileIn;
        private long fileLength;
        private int arraySize;
        private byte[] array;

        public MappedBiggerFileReader(String fileName, int arraySize) throws IOException {
            this.fileIn = new FileInputStream(fileName);
            FileChannel fileChannel = fileIn.getChannel();
            this.fileLength = fileChannel.size();
            // A single MappedByteBuffer is capped at Integer.MAX_VALUE bytes,
            // so the file is mapped in pieces.
            this.number = (int) Math.ceil((double) fileLength / (double) Integer.MAX_VALUE);
            this.mappedBufArray = new MappedByteBuffer[number]; // array of memory-mapped regions
            long preLength = 0;
            long regionSize = (long) Integer.MAX_VALUE; // size of each mapped region
            for (int i = 0; i < number; i++) { // map consecutive regions of the file
                if (fileLength - preLength < (long) Integer.MAX_VALUE) {
                    regionSize = fileLength - preLength; // size of the last region
                }
                mappedBufArray[i] = fileChannel.map(FileChannel.MapMode.READ_ONLY, preLength, regionSize);
                preLength += regionSize; // start of the next region
            }
            this.arraySize = arraySize;
        }

        public int read() throws IOException {
            if (count >= number) {
                return -1;
            }
            int limit = mappedBufArray[count].limit();
            int position = mappedBufArray[count].position();
            if (limit - position > arraySize) {
                array = new byte[arraySize];
                mappedBufArray[count].get(array);
                return arraySize;
            } else { // last read from the current mapped region
                array = new byte[limit - position];
                mappedBufArray[count].get(array);
                count++; // move on to the next mapped region
                return limit - position;
            }
        }

        public void close() throws IOException {
            fileIn.close();
            array = null;
        }

        public byte[] getArray() {
            return array;
        }

        public long getFileLength() {
            return fileLength;
        }
    }
}
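A minimal sketch of how the class might be invoked; the path and the 64 KB chunk size below are placeholder values, not part of the original listing:

import java.io.IOException;

public class CountWordsDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical input path; any large text file works. 65536 is the chunk size in bytes.
        new CountWordsOfArticle().countWordsOfArticle("/home/zfh/article.txt", 65536);
    }
}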
- Question 1: How can a file larger than 10 GB be read efficiently? What methods are there, and which one is best?
- Question 2: How can the number of occurrences of each word in an article be counted efficiently?
- Question 3: Combining questions 1 and 2, how can the whole task be implemented? ----- see the code above

Question 1: How can a file larger than 10 GB be read efficiently? What methods are there, and which one is best?
I. Problems when reading a large file the conventional way
The usual Java approach is to read the entire file into memory and then operate on the data:
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
public class SmallFileTest {
    public static void main(String[] args) throws IOException {
        Path path = Paths.get("file path");
        byte[] data = Files.readAllBytes(path);
        // ... (omitted)
    }
}
This is fine for small files, but for a somewhat larger file it throws an exception:
Exception in thread "main" java.lang.OutOfMemoryError: Required array size too large
	at java.nio.file.Files.readAllBytes(Files.java:3156)
Analysis: as the stack trace shows, Files.readAllBytes supports files of at most Integer.MAX_VALUE - 8 bytes, i.e. at most about 2 GB. Once a file exceeds that limit, this built-in one-shot method can no longer be used directly.
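Instead, the file has to be consumed incrementally. A minimal sketch of the simplest workaround, reading line by line so that only one line is held in memory at a time (the path is a placeholder, as above):

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class LargeFileLineDemo {
    public static void main(String[] args) throws IOException {
        // Stream the file line by line instead of loading it all at once.
        try (BufferedReader reader = Files.newBufferedReader(Paths.get("file path"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // process one line at a time
            }
        }
    }
}

The rest of this section compares three lower-level ways to do such incremental reading faster.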
II. Three ways to read a large file, and how they compare
- Buffered byte stream: open a java.io.BufferedInputStream on the file; each call to read() copies the next arraySize bytes of file data into array. This works, but it is not very efficient.
- File channel: open a java.nio.channels.FileChannel on the file; each call to read() first reads file data into a java.nio.ByteBuffer of capacity arraySize and then copies what was read into array. This NIO-channel approach is somewhat faster than the traditional byte stream.
- Memory-mapped file: map the contents of the file into a region of the process's virtual memory, so the data can be operated on directly in memory. A conventional read is a round trip from the JVM (user mode) into the operating-system kernel, which reads the file and copies the data back to user mode; memory mapping avoids that per-read round trip and can therefore speed up work on large files considerably. A single mapping (one MappedByteBuffer) is limited to Integer.MAX_VALUE bytes, so when one mapping is not enough, several are used.
1. Buffered byte stream ---- test code:
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

public class StreamFileReader {
    private BufferedInputStream fileIn;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    public StreamFileReader(String fileName, int arraySize) throws IOException {
        this.fileIn = new BufferedInputStream(new FileInputStream(fileName), arraySize);
        // File.length() rather than InputStream.available(): available() returns an
        // int estimate and cannot represent files larger than 2 GB.
        this.fileLength = new File(fileName).length();
        this.arraySize = arraySize;
    }

    public int read() throws IOException {
        byte[] tmpArray = new byte[arraySize];
        int bytes = fileIn.read(tmpArray); // stage the data in a temporary byte array
        if (bytes != -1) {
            array = new byte[bytes]; // the array is exactly as long as what was read
            System.arraycopy(tmpArray, 0, array, 0, bytes); // copy the bytes actually read
            return bytes;
        }
        return -1;
    }

    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    public byte[] getArray() {
        return array;
    }

    public long getFileLength() {
        return fileLength;
    }

    public static void main(String[] args) throws IOException {
        StreamFileReader reader = new StreamFileReader("/home/zfh/movie.mkv", 65536);
        long start = System.nanoTime();
        while (reader.read() != -1) ;
        long end = System.nanoTime();
        reader.close();
        System.out.println("StreamFileReader: " + (end - start));
    }
}
2. File channel ---- test code:
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

public class ChannelFileReader {
    private FileInputStream fileIn;
    private ByteBuffer byteBuf;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    public ChannelFileReader(String fileName, int arraySize) throws IOException {
        this.fileIn = new FileInputStream(fileName);
        this.fileLength = fileIn.getChannel().size();
        this.arraySize = arraySize;
        this.byteBuf = ByteBuffer.allocate(arraySize);
    }

    public int read() throws IOException {
        FileChannel fileChannel = fileIn.getChannel();
        int bytes = fileChannel.read(byteBuf); // read file data into the ByteBuffer
        if (bytes != -1) {
            array = new byte[bytes]; // the array is exactly as long as what was read
            byteBuf.flip();
            byteBuf.get(array); // copy the bytes out of the ByteBuffer
            byteBuf.clear();
            return bytes;
        }
        return -1;
    }

    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    public byte[] getArray() {
        return array;
    }

    public long getFileLength() {
        return fileLength;
    }

    public static void main(String[] args) throws IOException {
        ChannelFileReader reader = new ChannelFileReader("/home/zfh/movie.mkv", 65536);
        long start = System.nanoTime();
        while (reader.read() != -1) ;
        long end = System.nanoTime();
        reader.close();
        System.out.println("ChannelFileReader: " + (end - start));
    }
}
3. Memory-mapped file ---- test code:
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;

// Chunk reader over a memory-mapped file; read() returns the number of bytes
// copied into the internal array, or -1 at end of file.
public class MappedBiggerFileReader {
    private MappedByteBuffer[] mappedBufArray;
    private int count = 0;
    private int number;
    private FileInputStream fileIn;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    public MappedBiggerFileReader(String fileName, int arraySize) throws IOException {
        this.fileIn = new FileInputStream(fileName);
        FileChannel fileChannel = fileIn.getChannel();
        this.fileLength = fileChannel.size();
        // A single MappedByteBuffer is capped at Integer.MAX_VALUE bytes,
        // so the file is mapped in pieces.
        this.number = (int) Math.ceil((double) fileLength / (double) Integer.MAX_VALUE);
        this.mappedBufArray = new MappedByteBuffer[number]; // array of memory-mapped regions
        long preLength = 0;
        long regionSize = (long) Integer.MAX_VALUE; // size of each mapped region
        for (int i = 0; i < number; i++) { // map consecutive regions of the file
            if (fileLength - preLength < (long) Integer.MAX_VALUE) {
                regionSize = fileLength - preLength; // size of the last region
            }
            mappedBufArray[i] = fileChannel.map(FileChannel.MapMode.READ_ONLY, preLength, regionSize);
            preLength += regionSize; // start of the next region
        }
        this.arraySize = arraySize;
    }

    public int read() throws IOException {
        if (count >= number) {
            return -1;
        }
        int limit = mappedBufArray[count].limit();
        int position = mappedBufArray[count].position();
        if (limit - position > arraySize) {
            array = new byte[arraySize];
            mappedBufArray[count].get(array);
            return arraySize;
        } else { // last read from the current mapped region
            array = new byte[limit - position];
            mappedBufArray[count].get(array);
            count++; // move on to the next mapped region
            return limit - position;
        }
    }

    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    public byte[] getArray() {
        return array;
    }

    public long getFileLength() {
        return fileLength;
    }

    public static void main(String[] args) throws IOException {
        MappedBiggerFileReader reader = new MappedBiggerFileReader("/home/zfh/movie.mkv", 65536);
        long start = System.nanoTime();
        while (reader.read() != -1) ;
        long end = System.nanoTime();
        reader.close();
        System.out.println("MappedBiggerFileReader: " + (end - start));
    }
}
4. Comparing the results
Reading a 1 GB file with the three readers above gives the following timings (differences of System.nanoTime(), i.e. nanoseconds):
StreamFileReader: 11494900386
ChannelFileReader: 11329346316
MappedFileReader: 11169097480
Reading a 10 GB file:
StreamFileReader: 194579779394
ChannelFileReader: 190430242497
MappedBiggerFileReader: 186923035795
In both runs the memory-mapped reader is the fastest, though only by a few percent for a purely sequential scan; it is the reader the combined solution above builds on.
Question 2: How can the number of occurrences of each word in an article be counted efficiently?
Scan the article, split out individual words with a regular expression, and put each word into a Map<String, Integer> as the key with an initial value of 1. Every time a word is scanned afterwards, check whether the map already contains it; if it does, add 1 to its value and store the updated count back in the map.
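The check-then-increment step described above can also be written more compactly with Map.merge(), which inserts 1 for a new key and otherwise applies Integer::sum to the existing value. A minimal self-contained sketch (the sample input is made up for illustration):

import java.util.HashMap;
import java.util.Map;

public class MergeCountDemo {
    public static void main(String[] args) {
        Map<String, Integer> counts = new HashMap<>();
        String text = "To be, or not to be: that is the question."; // sample input
        for (String word : text.toLowerCase().split("\\W+")) {
            if (!word.isEmpty()) { // split() can yield an empty leading token
                counts.merge(word, 1, Integer::sum); // insert 1, or add 1 to the existing count
            }
        }
        counts.forEach((w, n) -> System.out.println(w + " " + n));
    }
}

The full program below reads a file and applies the same idea with an explicit contains() check: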
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Set;
public class splitWords {
    public static void main(String[] args) throws FileNotFoundException {
        File file = new File("C:\\Users\\Administrator\\Desktop\\English.txt");
        if (!file.exists()) {
            System.out.println("File does not exist");
            return;
        }
        Scanner scanner = new Scanner(file);
        // map from word to number of occurrences
        HashMap<String, Integer> hashMap = new HashMap<String, Integer>();
        System.out.println("Article -----------------------------------");
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            System.out.println(line);
            // \w+ matches word characters, \W+ matches non-word characters,
            // so splitting on \W+ yields the individual words
            String[] lineWords = line.split("\\W+");
            Set<String> wordSet = hashMap.keySet();
            for (int i = 0; i < lineWords.length; i++) {
                if (lineWords[i].isEmpty()) {
                    continue; // split() yields an empty first token when the line starts with a delimiter
                }
                if (wordSet.contains(lineWords[i])) {
                    // the word has been seen before: increment its count
                    Integer number = hashMap.get(lineWords[i]);
                    number++;
                    hashMap.put(lineWords[i], number);
                } else {
                    hashMap.put(lineWords[i], 1);
                }
            }
        }
        System.out.println("Word counts: ------------------------------");
        Iterator<String> iterator = hashMap.keySet().iterator();
        while (iterator.hasNext()) {
            String word = iterator.next();
            System.out.printf("word: %-12s count: %d\n", word, hashMap.get(word));
        }
        System.out.println("Done --------------------------------------");
    }
}
When multiple threads are used: the map must be a ConcurrentHashMap shared by all threads, and the check-then-put update must be replaced by the atomic merge(), since containsKey() followed by put() can lose increments under concurrency.
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class CountWordsOfArticle {
    private static final Pattern DELIMITER = Pattern.compile("[,.;!?\\s]+");

    // The map is passed in and shared by all worker threads; merge() is atomic
    // on a ConcurrentHashMap, so concurrent increments are not lost.
    static void wordCount(String request, Map<String, Integer> map) {
        for (String s : DELIMITER.split(request)) {
            if (!s.isEmpty()) {
                map.merge(s.toLowerCase(), 1, Integer::sum);
            }
        }
    }

    // Called once, after all worker threads have finished.
    static void printSortedResult(Map<String, Integer> map) {
        List<Map.Entry<String, Integer>> result = map.entrySet().stream()
                .sorted(new Comparator<Map.Entry<String, Integer>>() {
                    @Override
                    public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                        return o2.getValue().compareTo(o1.getValue()); // descending by count
                    }
                }).collect(Collectors.toList());
        result.forEach(item -> System.out.println(item.getKey() + " " + item.getValue()));
    }
}
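The listing above supplies only the thread-safe counting and printing steps. A minimal sketch of how file chunks might be fanned out to a thread pool, assuming the class above and the MappedBiggerFileReader from section 3 are compiled in the same package; the file path is a placeholder:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ConcurrentCountDemo {
    public static void main(String[] args) throws IOException, InterruptedException {
        Map<String, Integer> map = new ConcurrentHashMap<>();
        ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
        MappedBiggerFileReader reader = new MappedBiggerFileReader("/home/zfh/article.txt", 65536);
        while (reader.read() != -1) {
            // Decode the chunk before submitting it: getArray() is replaced on the next read().
            // As before, a word straddling a chunk boundary is split in two.
            String chunk = new String(reader.getArray(), StandardCharsets.UTF_8);
            pool.submit(() -> CountWordsOfArticle.wordCount(chunk, map));
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
        reader.close();
        CountWordsOfArticle.printSortedResult(map);
    }
}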