PDF交叉引用表是PDF文件的重要组成部分。该表保存了所有间接对象在PDF文件中的物理偏移地址。一个文件中可以只有一个交叉引用表,也可以有多个。多个交叉引用表通常出现在两种情况下:一、增量保存;二、线性化。
通常,PDF交叉引用表将具有以下形式:
交叉引用表以单词“xref”开头。图中,“0 4”表示从对象号0开始的连续4个对象,其中对象号为0和3的间接对象不存在,对象号为1和2的对象正在使用中。对象号为1的间接对象的起始位置是17,生成号(generation number)是0。
一个文件中出现多个交叉引用表时,同一个间接对象可能出现在不同的交叉引用表中,这时要以出现在文件最后位置的那个为准,前面的忽略。这种情况通常是由于修改了PDF文件,导致其中的一个或多个对象发生了变化,PDF生成器根据输出要求进行增量输出:只输出修改过的对象,然后在文件末尾加上更新的交叉引用表。
那么交叉引用表的起始位置在什么地方呢?其实很简单,在文件的末尾保存有以下内容:
startxref
217929
%%EOF
“217929”就是交叉引用表的偏移位置。
当存在多个交叉引用表时,通常在交叉引用表之后的“trailer”字典中会保存“/Prev”,该key对应的值就是上一个交叉引用表的位置。
交叉引用表读取算法如下:(pdfbox,详细的代码请参考“/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java”)
1、交叉引用表读取入口
/**
* The initial parse will first parse only the trailer, the startxref and all xref tables to have a pointer (offset)
* to all the pdf's objects. It can handle linearized pdfs, which will have an xref at the end pointing to an xref
* at the beginning of the file. Last the root object is parsed.
*
* <p>Note: this is an abridged excerpt; the "......" lines stand for code that was left out.
*
* @throws IOException If something went wrong.
*/
private void initialParse() throws IOException
{
......
try
{
// locate the 'startxref' keyword near the end of the file
long startXRefOffset = getStartxrefOffset();
// only a positive offset is handed to the xref parser
if (startXRefOffset > 0)
{
// parse the xref chain and keep the trailer dictionary it returns
trailer = parseXref(startXRefOffset);
}
}
catch (IOException exception)
{
......
}
// other operations (omitted in this excerpt)
}
2、从文件末尾查找“startxref”
/**
 * Finds the position of the last <code>startxref</code> keyword in the file.
 *
 * <p>The final bytes of the file (at most <code>readTrailBytes</code>, or the whole file if it is shorter) are
 * loaded into a buffer. Within that buffer the last '%%EOF' marker is searched first, then the last
 * <code>startxref</code> that precedes it. In lenient mode a missing '%%EOF' marker is tolerated and the whole
 * buffer is searched instead.
 *
 * @return the absolute file offset of the <code>startxref</code> keyword
 * @throws IOException if the trailing bytes cannot be read, if the '%%EOF' marker is missing in non-lenient
 *         mode, or if no <code>startxref</code> keyword is found
 */
protected final long getStartxrefOffset() throws IOException
{
    byte[] tail;
    long tailStart;
    try
    {
        // never try to read more than the file actually contains
        final int tailLength;
        if (fileLen < readTrailBytes)
        {
            tailLength = (int) fileLen;
        }
        else
        {
            tailLength = readTrailBytes;
        }
        tail = new byte[tailLength];
        tailStart = fileLen - tailLength;
        source.seek(tailStart);

        // a single read may deliver fewer bytes than requested, so keep reading until the buffer is full
        for (int filled = 0; filled < tailLength;)
        {
            final int remaining = tailLength - filled;
            final int count = source.read(tail, filled, remaining);
            // guard against an endless loop if the source stops delivering data (this should never happen)
            if (count < 1)
            {
                throw new IOException(
                        "No more bytes to read for trailing buffer, but expected: "
                                + remaining);
            }
            filled += count;
        }
    }
    finally
    {
        // always rewind the source, whether or not reading succeeded
        source.seek(0);
    }

    // position of the last '%%EOF' inside the buffer
    int eofIndex = lastIndexOf(EOF_MARKER, tail, tail.length);
    if (eofIndex < 0)
    {
        if (!isLenient)
        {
            throw new IOException("Missing end of file marker '" + new String(EOF_MARKER) + "'");
        }
        // in lenient mode the '%%EOF' isn't needed: search the complete buffer
        eofIndex = tail.length;
        LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'");
    }

    // position of the last 'startxref' in front of the EOF marker
    final int keywordIndex = lastIndexOf(STARTXREF, tail, eofIndex);
    if (keywordIndex < 0)
    {
        throw new IOException("Missing 'startxref' marker.");
    }
    // translate the buffer-relative index into an absolute file offset
    return tailStart + keywordIndex;
}
3、解析“xref”
/**
 * Parses the whole chain of cross reference tables and/or cross reference streams.
 *
 * <p>The source is positioned at <code>startXRefOffset</code> and the offset of the first xref section is read
 * with <code>parseStartXref()</code>. From there every section is parsed and the chain is followed through the
 * /Prev entries until no positive /Prev offset is left. Each offset is passed through
 * <code>checkXRefOffset</code> and replaced by the corrected value when one is reported. Finally the resolved
 * trailer and xref table are stored in the document.
 *
 * @param startXRefOffset offset to seek to before the startxref value is read (the caller passes the position
 *        returned by <code>getStartxrefOffset()</code>)
 * @return the trailer dictionary resolved from the xref chain
 * @throws IOException if a trailer is missing after an xref table, if the /Prev chain contains a loop, or if
 *         something else went wrong while parsing
 */
protected COSDictionary parseXref(long startXRefOffset) throws IOException
{
    source.seek(startXRefOffset);
    long firstXrefOffset = Math.max(0, parseStartXref());
    // check the startxref offset
    long fixedOffset = checkXRefOffset(firstXrefOffset);
    if (fixedOffset > -1)
    {
        firstXrefOffset = fixedOffset;
    }
    document.setStartXref(firstXrefOffset);
    long prev = firstXrefOffset;
    // ---- parse whole chain of xref tables/object streams using PREV reference
    // offsets already queued for parsing, used to detect a cyclic /Prev chain
    Set<Long> prevSet = new HashSet<Long>();
    while (prev > 0)
    {
        // seek to xref table
        source.seek(prev);
        // skip white spaces
        skipSpaces();
        // -- parse xref
        if (source.peek() == X)
        {
            // xref table and trailer
            // use existing parser to parse xref table
            parseXrefTable(prev);
            if (!parseTrailer())
            {
                throw new IOException("Expected trailer object at position: "
                        + source.getPosition());
            }
            COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
            // check for a XRef stream, it may contain some object ids of compressed objects
            if (trailer.containsKey(COSName.XREF_STM))
            {
                parseHybridXrefStream(trailer, prev);
            }
            prev = trailer.getLong(COSName.PREV);
            if (prev > 0)
            {
                // check the xref table reference
                fixedOffset = checkXRefOffset(prev);
                if (fixedOffset > -1 && fixedOffset != prev)
                {
                    prev = fixedOffset;
                    trailer.setLong(COSName.PREV, prev);
                }
            }
        }
        else
        {
            // parse xref stream
            prev = parseXrefObjStream(prev, true);
            if (prev > 0)
            {
                // check the xref table reference
                fixedOffset = checkXRefOffset(prev);
                if (fixedOffset > -1 && fixedOffset != prev)
                {
                    prev = fixedOffset;
                    COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
                    trailer.setLong(COSName.PREV, prev);
                }
            }
        }
        // a /Prev offset that was seen before means the chain is cyclic
        if (prevSet.contains(prev))
        {
            throw new IOException("/Prev loop at offset " + prev);
        }
        prevSet.add(prev);
    }
    // ---- build valid xrefs out of the xref chain
    xrefTrailerResolver.setStartxref(firstXrefOffset);
    COSDictionary trailer = xrefTrailerResolver.getTrailer();
    document.setTrailer(trailer);
    document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
    // check the offsets of all referenced objects
    checkXrefOffsets();
    // copy xref table
    document.addXRefTable(xrefTrailerResolver.getXrefTable());
    return trailer;
}

/**
 * Parses the xref stream referenced by the /XRefStm entry of a trailer.
 *
 * <p>The offset is handled as a <code>long</code> throughout; a 32 bit <code>int</code> would truncate offsets
 * beyond <code>Integer.MAX_VALUE</code> in very large files.
 *
 * @param trailer the trailer dictionary containing the /XRefStm entry; the entry is updated if the offset
 *        had to be corrected
 * @param tableOffset offset of the xref table the trailer belongs to
 * @throws IOException in non-lenient mode, if the offset is not positive or the stream cannot be parsed
 */
private void parseHybridXrefStream(COSDictionary trailer, long tableOffset) throws IOException
{
    long streamOffset = trailer.getLong(COSName.XREF_STM);
    // check the xref stream reference
    long fixedOffset = checkXRefOffset(streamOffset);
    if (fixedOffset > -1 && fixedOffset != streamOffset)
    {
        LOG.warn("/XRefStm offset " + streamOffset + " is incorrect, corrected to " + fixedOffset);
        streamOffset = fixedOffset;
        trailer.setLong(COSName.XREF_STM, streamOffset);
    }
    if (streamOffset > 0)
    {
        source.seek(streamOffset);
        skipSpaces();
        try
        {
            parseXrefObjStream(tableOffset, false);
        }
        catch (IOException ex)
        {
            // lenient mode only logs the failure and carries on with the remaining chain
            if (isLenient)
            {
                LOG.error("Failed to parse /XRefStm at offset " + streamOffset, ex);
            }
            else
            {
                throw ex;
            }
        }
    }
    else
    {
        if (isLenient)
        {
            LOG.error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
        }
        else
        {
            throw new IOException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
        }
    }
}