关于XML文件的导入实在是一个郁闷的问题。之前使用DOM来解析XML文件,但是发现当XML文件超过20M的时候,DOM就爆出了内存不足的问题。
现在设计的思想如下:
1.按照DOM的load时间和爆出的异常来判断,DOM应该是把整个XML文件全部读取以后再进行解析处理。
这样的缺点就是需要预先分配足够的内存来存放读取的整个XML文件,容易爆出内存不足的异常。
2.实际的解析方法可以每次读取1024个字节(下面的代码中实际使用的缓冲区大小为100 * 1024个字符),然后对这1024个字节进行解析,如果发现解析到了一个数据,则存入数据list后继续解析剩余字节的内容。
如果发现已经到1024字节末尾,则保存当前解析的状态,以及未判断出数据类型的字节首地址。
3.继续读取1024个字节内容,然后将步骤2中保存的"未判断出数据类型的字节首地址"到末尾的字节与新读取的1024个字节拼接成新的解析数据,再次进行解析。
反复执行2、3步,即能完成该数据解析。
熟悉XML解析的同学可能会提出使用SAX即可解决这类问题。参看了SAX的使用方法后,个人觉得SAX内部的实现应该和2、3步差不多。
但是由于我在数据库中的XML格式已经定义好,实际上需要解析的内容只有DATA和DATA TYPE这个Attr。所以如果只解析这几个属性和数据,应该会比使用
SAX速度快。本地测试了一下10万条数据的解析速度,貌似比SAX快了0.2秒。
相关代码:
/**
 * Loads all records of the data class {@code className} from the file
 * {@code root + className + XML_FILE_TAG} using a hand-written, chunked state machine
 * instead of a DOM/SAX parser.
 *
 * <p>The file is read in chunks of {@code READ_LENGTH} chars. Whenever the scanner reaches the
 * end of the current buffer in the middle of a token, the unparsed tail is carried over and
 * concatenated with the next chunk before scanning resumes.
 *
 * <p>For the first record the quoted attribute value of every field element is converted with
 * {@code PraseParamUtil.PraseObjectType} and remembered in {@code typeList}; later records
 * reuse those types and only extract the text that precedes each {@code "</"}. Values are
 * assigned to the declared fields of the class in declaration order via reflection.
 *
 * <p>NOTE(review): the freshly parsed list is not stored into {@code mDataListMap} here, so the
 * cache lookup at the top only hits if some other code populates the map - confirm.
 * NOTE(review): the file is decoded with the platform default charset ({@code FileReader}) -
 * confirm this matches the encoding the XML files are written in.
 *
 * @param className fully qualified name of the data class; it must declare exactly one
 *                  constructor and that constructor must take no arguments
 * @return the cached list for {@code className} if {@code mDataListMap} has one; {@code null}
 *         if the class cannot be found or its constructors do not meet the requirement above;
 *         otherwise the list of parsed instances (possibly partial or empty if an exception
 *         occurred while reading/parsing - the exception is printed, not rethrown)
 */
public ArrayList<Object> LoadXml_Ex(String className) {
    long current = System.currentTimeMillis();
    LogUtil.d(TAG, "LoadXml_Ex start at " + current);
    ArrayList<Object> list = mDataListMap.get(className);
    if (list != null) {
        return list;
    }
    Class<?> clazz;
    try {
        clazz = Class.forName(className);
    } catch (ClassNotFoundException e1) {
        return null;
    }
    Constructor<?>[] constructorList = clazz.getDeclaredConstructors();
    // Exactly one constructor is required. Interfaces, primitives and array types report
    // zero constructors, which previously caused an ArrayIndexOutOfBoundsException below.
    if (constructorList.length != 1) {
        return null;
    }
    // The data class must be instantiable through a no-arg constructor.
    Constructor<?> constructor = constructorList[0];
    if (constructor.getParameterTypes().length > 0) {
        return null;
    }
    constructor.setAccessible(true);
    Field[] field_list = clazz.getDeclaredFields();
    File file = new File(root + className + XML_FILE_TAG);
    list = new ArrayList<Object>();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(file));
        long end = file.length();
        System.out.println("end is " + end);
        int READ_LENGTH = 100 * 1024;
        char buffer[] = new char[READ_LENGTH];
        boolean isFinish = false;
        // Number of records seen so far (1 while the first record is being parsed).
        int findDataNum = 0;
        // Per-field type codes learned from the first record.
        int[] typeList = null;
        int fieldLength = field_list.length;
        // Index of the field that receives the next parsed value.
        int anylizeMemberIndex = 0;
        // Index of the next slot of typeList to fill while parsing the first record.
        int anylizeTypeIndex = 0;
        // Scan position inside buffer.
        int anylizeIndex = 0;
        int currentStatus = STATUS_FIND_TAG_IDLE;
        // Positions of the opening/closing quote of the attribute being read (0 = not found).
        int attrStartIndex = 0;
        int attrEndIndex = 0;
        int readSize = 0;
        System.out.println("start at " + System.currentTimeMillis());
        Object membet = null;
        Field[] fieldlist = null;
        while (!isFinish) {
            if (anylizeIndex != 0 && anylizeIndex < buffer.length) {
                // Part of the buffer is still unparsed: carry it over and append a new chunk.
                int carryLength = buffer.length - anylizeIndex;
                char[] tempBuffer = new char[READ_LENGTH];
                readSize = reader.read(tempBuffer);
                char[] finalBuffer = new char[carryLength + READ_LENGTH];
                System.arraycopy(buffer, anylizeIndex, finalBuffer, 0, carryLength);
                // The new chunk starts right after the carried tail. The previous offset of
                // carryLength - 1 overwrote the last carried character.
                System.arraycopy(tempBuffer, 0, finalBuffer, carryLength, READ_LENGTH);
                buffer = finalBuffer;
                if (currentStatus != STATUS_FIND_TAG_ATTR_START) {
                    attrStartIndex = 0;
                    attrEndIndex = 0;
                    anylizeIndex = 0;
                } else {
                    if (attrStartIndex != 0) {
                        // An opening quote was already found: rebase it onto the new buffer
                        // and resume scanning right after it.
                        attrStartIndex = attrStartIndex - anylizeIndex;
                        anylizeIndex = attrStartIndex + 1;
                    } else {
                        anylizeIndex = 0;
                    }
                }
            } else {
                readSize = reader.read(buffer);
                // The buffer is reused here: blank everything behind the freshly read chars so
                // that stale, already parsed content is never scanned a second time.
                java.util.Arrays.fill(buffer, Math.max(readSize, 0), buffer.length, '\0');
                anylizeIndex = 0;
                attrStartIndex = 0;
                attrEndIndex = 0;
            }
            if (readSize == -1) {
                if (anylizeIndex == buffer.length) {
                    break;
                }
                // End of file: scan what is left in the buffer one last time, then stop.
                isFinish = true;
            }
            boolean isAnylizeFinish = false;
            while (!isAnylizeFinish) {
                if (membet == null) {
                    // Create the first record instance once. Later instances are created
                    // when a record is completed, so no per-iteration reflection is needed.
                    membet = constructor.newInstance();
                    fieldlist = membet.getClass().getDeclaredFields();
                    if (typeList == null) {
                        typeList = new int[fieldlist.length];
                    }
                }
                if (anylizeIndex >= buffer.length - 1) {
                    break;
                }
                if (buffer[anylizeIndex] == ' ') {
                    anylizeIndex++;
                    continue;
                }
                int restAnylize = buffer.length - anylizeIndex;
                switch (currentStatus) {
                case STATUS_FIND_TAG_IDLE:
                    if (buffer[anylizeIndex] == XML_DATA_BEGIN_CHAR_ARRAY[0]) {
                        if (anylizeIndex + XML_DATA_START_LENGTH > buffer.length) {
                            // The tag may straddle the buffer end: stop and wait for more
                            // input. Without the flag this spun forever, because break only
                            // leaves the switch.
                            isAnylizeFinish = true;
                            break;
                        }
                        // Only the first and last characters of the start tag are compared.
                        if (buffer[anylizeIndex + XML_DATA_START_LENGTH - 1] == XML_DATA_BEGIN_CHAR_ARRAY[XML_DATA_START_LENGTH - 1]) {
                            currentStatus = STATUS_FIND_TAG_BEIGN_START;
                            anylizeIndex = anylizeIndex + XML_DATA_START_LENGTH;
                            findDataNum++;
                        }
                    }
                    anylizeIndex++;
                    break;
                case STATUS_FIND_TAG_BEIGN_START:
                    if (anylizeIndex + XML_DATA_START_LENGTH > buffer.length) {
                        // Same situation as above: need more input before skipping ahead.
                        isAnylizeFinish = true;
                        break;
                    }
                    anylizeIndex = anylizeIndex + XML_DATA_START_LENGTH;
                    currentStatus = STATUS_FIND_TAG_ATTR_START;
                    break;
                case STATUS_FIND_TAG_ATTR_START:
                    if (findDataNum == 1) {
                        // First record: read the quoted attribute value to learn the type
                        // of the current field.
                        int findAttrIndex = restAnylize;
                        int backAnylizeIndex = anylizeIndex;
                        for (; findAttrIndex > 0; findAttrIndex--, backAnylizeIndex++) {
                            if (buffer[backAnylizeIndex] == '"'
                                    && attrStartIndex == 0) {
                                attrStartIndex = backAnylizeIndex;
                                attrEndIndex = 0;
                            } else if (buffer[backAnylizeIndex] == '"'
                                    && attrEndIndex == 0) {
                                attrEndIndex = backAnylizeIndex;
                                String type = String.valueOf(buffer,
                                        attrStartIndex + 1,
                                        attrEndIndex - attrStartIndex - 1);
                                attrStartIndex = 0;
                                currentStatus = STATUS_FIND_DATA_START;
                                if (anylizeTypeIndex < fieldLength) {
                                    typeList[anylizeTypeIndex] = PraseParamUtil
                                            .PraseObjectType(type);
                                    anylizeTypeIndex++;
                                }
                                break;
                            }
                        }
                        if (findAttrIndex == 0) {
                            // Closing quote not in this buffer: fetch more input.
                            isAnylizeFinish = true;
                        } else {
                            anylizeIndex = backAnylizeIndex + 1;
                        }
                    } else {
                        // Later records: types are known, just skip past the attribute to
                        // the '>' that follows a quote.
                        int findAttrIndex = restAnylize;
                        int backAnylizeIndex = anylizeIndex;
                        boolean maybeAttr = false;
                        for (; findAttrIndex > 0; findAttrIndex--, backAnylizeIndex++) {
                            if (buffer[backAnylizeIndex] == '"') {
                                maybeAttr = true;
                            } else if (maybeAttr
                                    && buffer[backAnylizeIndex] == '>') {
                                break;
                            }
                        }
                        if (findAttrIndex == 0) {
                            isAnylizeFinish = true;
                        } else {
                            anylizeIndex = backAnylizeIndex;
                            currentStatus = STATUS_FIND_DATA_START;
                        }
                    }
                    break;
                case STATUS_FIND_DATA_START: {
                    int findDataIndex = restAnylize;
                    int backAnylizeIndex = anylizeIndex;
                    boolean mayBeData = false;
                    for (; findDataIndex > 0; findDataIndex--, backAnylizeIndex++) {
                        if (buffer[backAnylizeIndex] == '<') {
                            mayBeData = true;
                        } else if (mayBeData
                                && buffer[backAnylizeIndex] == '/') {
                            // Known limitation kept from the original: any '<' followed
                            // later by '/' ends the value, so the data itself must not
                            // contain such a sequence.
                            // The value lies between anylizeIndex and the '<' that
                            // precedes this '/', hence the "- 2".
                            String data = String.valueOf(buffer,
                                    anylizeIndex + 1,
                                    backAnylizeIndex - anylizeIndex - 2);
                            Field field = fieldlist[anylizeMemberIndex];
                            field.setAccessible(true);
                            switch (typeList[anylizeMemberIndex]) {
                            case PraseParamUtil.PRASE_TYPE_INT:
                                field.setInt(membet, Integer.parseInt(data));
                                break;
                            case PraseParamUtil.PRASE_TYPE_BOOLEAN:
                                field.setBoolean(membet, Boolean.parseBoolean(data));
                                break;
                            case PraseParamUtil.PRASE_TYPE_FLOAT:
                                field.setFloat(membet, Float.parseFloat(data));
                                break;
                            case PraseParamUtil.PRASE_TYPE_LONG:
                                field.setLong(membet, Long.parseLong(data));
                                break;
                            case PraseParamUtil.PRASE_TYPE_STRING:
                                field.set(membet, data);
                                break;
                            }
                            anylizeMemberIndex++;
                            if (anylizeMemberIndex == fieldlist.length) {
                                // All fields of this record are filled: store it and
                                // start a fresh instance for the next record.
                                findDataNum++;
                                list.add(membet);
                                anylizeMemberIndex = 0;
                                membet = constructor.newInstance();
                                fieldlist = membet.getClass().getDeclaredFields();
                            }
                            currentStatus = STATUS_FIND_TAG_ATTR_START;
                            break;
                        }
                    }
                    if (findDataIndex == 0) {
                        // "</" not in this buffer: fetch more input.
                        isAnylizeFinish = true;
                    } else {
                        anylizeIndex = backAnylizeIndex + 1;
                    }
                    break;
                }
                }
            }
        }
    } catch (Exception e) {
        // Best effort, as before: report the problem and return what was parsed so far.
        e.printStackTrace();
    } finally {
        // Always release the file handle; the reader used to be leaked.
        if (reader != null) {
            try {
                reader.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    LogUtil.d(TAG, "LoadXml_Ex end at " + System.currentTimeMillis());
    return list;
}
此外,为了加快搜索的速度,在数据库中还需要加入红黑树。这部分内容准备下周继续介绍。