PDF格式分析（十二）Cross-Reference Table 交叉引用表

PDF交叉引用表是PDF文件的重要组成部分。该表保存了所有间接对象在PDF文件中的物理偏移地址。一个文件中可以只有一个交叉引用表，也可以有多个。多个交叉引用表通常出现在两种情况下：一、增量保存；二、线性化。

通常，PDF交叉引用表将具有以下形式：

交叉引用表以关键字“xref”开头。图中，“0 4”表示从对象号0开始的连续4个对象：对象号为0和3的间接对象不存在，对象号为1和2的间接对象正在使用中。对象号为1的间接对象的起始位置（文件内偏移）是17，版本号（即生成号）是0。

当一个文件中存在多个交叉引用表时，同一个间接对象可能出现在不同的交叉引用表中。这时要以出现在文件最后位置的那个为准，前面的忽略。这种情况通常是由于修改了PDF文件，导致其中的一个或多个对象发生了变化：PDF生成器根据输出要求进行增量输出，只输出修改过的对象，然后在文件末尾追加更新后的交叉引用表。

那么交叉引用表的起始位置在什么地方呢？其实很简单，在文件的末尾保存有以下内容：

startxref
217929
%%EOF

“217929”就是交叉引用表的偏移位置。

当存在多个交叉引用表时，通常在交叉引用表之后的“trailer”字典中会保存“/Prev”，该key对应的值就是上一个交叉引用表的位置。

交叉引用表读取算法如下（以pdfbox为例，详细的代码请参考“/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/FDFParser.java”；其中 getStartxrefOffset 和 parseXref 定义在其父类“COSParser.java”中）：

1、交叉引用表读取入口

    /**
     * The initial parse will first parse only the trailer, the startxref and all xref tables to have a pointer (offset)
     * to all the pdf's objects. It can handle linearized pdfs, which will have an xref at the end pointing to an xref
     * at the beginning of the file. Last the root object is parsed.
     * <p>
     * NOTE: this listing is an abridged excerpt; the <code>......</code> lines stand for code that was left out,
     * including the declaration of <code>trailer</code>.
     * 
     * @throws IOException If something went wrong.
     */
    private void initialParse() throws IOException
    {
        ......
        try
        {
            // parse startxref: look up the position of the 'startxref' marker near the end of the file
            long startXRefOffset = getStartxrefOffset();
            if (startXRefOffset > 0)
            {
                // parse the chain of cross reference tables from there and keep the returned trailer dictionary
                trailer = parseXref(startXRefOffset);
            }

        }
        catch (IOException exception)
        {
              ......
        }
        // other operations (omitted in this excerpt)
    }

2、从文件末尾查找“startxref”

    /**
     * Locates the <code>startxref</code> marker near the end of the file. The tail of the file (at most
     * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes, or the range set via {@link #setEOFLookupRange(int)}) is loaded
     * into memory, the last '%%EOF' marker is searched in it and then the last <code>startxref</code>
     * preceding that marker.
     * <p>
     * The source is always repositioned to offset 0 once the tail has been read, even if reading failed.
     * 
     * @return the absolute file offset of the <code>startxref</code> marker
     * @throws IOException if the tail can't be read, if '%%EOF' is missing in non lenient mode or if no
     * <code>startxref</code> marker is found.
     */
    protected final long getStartxrefOffset() throws IOException
    {
        byte[] tail;
        long tailStart;
        try
        {
            // never try to load more than the file actually holds
            final int tailLength;
            if (fileLen < readTrailBytes)
            {
                tailLength = (int) fileLen;
            }
            else
            {
                tailLength = readTrailBytes;
            }
            tail = new byte[tailLength];
            tailStart = fileLen - tailLength;
            source.seek(tailStart);

            // a single read may deliver fewer bytes than requested, so keep reading until the buffer is full
            int filled = 0;
            while (filled < tailLength)
            {
                final int remaining = tailLength - filled;
                final int count = source.read(tail, filled, remaining);
                // guard against an endless loop if the source stops delivering data (this should never happen)
                if (count < 1)
                {
                    throw new IOException(
                            "No more bytes to read for trailing buffer, but expected: "
                                    + remaining);
                }
                filled += count;
            }
        }
        finally
        {
            source.seek(0);
        }

        // the search for startxref ends at the last '%%EOF'
        int searchEnd = lastIndexOf(EOF_MARKER, tail, tail.length);
        if (searchEnd < 0)
        {
            final String missingEof = "Missing end of file marker '" + new String(EOF_MARKER) + "'";
            if (!isLenient)
            {
                throw new IOException(missingEof);
            }
            // in lenient mode the '%%EOF' isn't needed, search the whole tail instead
            searchEnd = tail.length;
            LOG.debug(missingEof);
        }

        // last startxref preceding the EOF marker
        final int markerIndex = lastIndexOf(STARTXREF, tail, searchEnd);
        if (markerIndex < 0)
        {
            throw new IOException("Missing 'startxref' marker.");
        }
        // translate the index within the tail buffer into an absolute file offset
        return tailStart + markerIndex;
    }

3、解析“xref”

    /**
     * Parses the whole chain of cross reference sections (classic xref tables as well as xref streams).
     * The source is positioned at the given offset, the offset of the first section is read from there via
     * <code>parseStartXref()</code> and every further section is reached through the /Prev entry of the
     * preceding one. Finally the resolved trailer and xref table are handed over to the document.
     * 
     * @param startXRefOffset offset at which the startxref entry is read
     * @return the trailer dictionary
     * @throws IOException if something went wrong, e.g. a missing trailer or a loop in the /Prev chain
     */
    protected COSDictionary parseXref(long startXRefOffset) throws IOException
    {
        source.seek(startXRefOffset);
        long firstSectionOffset = Math.max(0, parseStartXref());
        // the offset check may come up with a replacement value, -1 means there is none
        final long checkedStart = checkXRefOffset(firstSectionOffset);
        if (checkedStart > -1)
        {
            firstSectionOffset = checkedStart;
        }
        document.setStartXref(firstSectionOffset);

        // ---- walk through the chain of xref tables/xref streams using the PREV reference
        // offsets seen so far, used to detect a loop within the chain
        final Set<Long> visited = new HashSet<Long>();
        long sectionOffset = firstSectionOffset;
        while (sectionOffset > 0)
        {
            // go to the section and skip leading white spaces
            source.seek(sectionOffset);
            skipSpaces();

            if (source.peek() != X)
            {
                // not a classic table: parse it as xref stream, the result is the next offset to follow
                sectionOffset = parseXrefObjStream(sectionOffset, true);
                if (sectionOffset > 0)
                {
                    // check the reference to the previous section
                    final long checkedPrev = checkXRefOffset(sectionOffset);
                    if (checkedPrev > -1 && checkedPrev != sectionOffset)
                    {
                        sectionOffset = checkedPrev;
                        COSDictionary streamTrailer = xrefTrailerResolver.getCurrentTrailer();
                        streamTrailer.setLong(COSName.PREV, sectionOffset);
                    }
                }
            }
            else
            {
                // classic xref table followed by its trailer
                parseXrefTable(sectionOffset);
                if (!parseTrailer())
                {
                    throw new IOException("Expected trailer object at position: "
                            + source.getPosition());
                }
                final COSDictionary tableTrailer = xrefTrailerResolver.getCurrentTrailer();

                // the trailer may point to an additional XRef stream,
                // it may contain some object ids of compressed objects
                if (tableTrailer.containsKey(COSName.XREF_STM))
                {
                    int stmOffset = tableTrailer.getInt(COSName.XREF_STM);
                    // check the xref stream reference
                    final long checkedStm = checkXRefOffset(stmOffset);
                    if (checkedStm > -1 && checkedStm != stmOffset)
                    {
                        LOG.warn("/XRefStm offset " + stmOffset + " is incorrect, corrected to " + checkedStm);
                        stmOffset = (int) checkedStm;
                        tableTrailer.setInt(COSName.XREF_STM, stmOffset);
                    }

                    if (stmOffset <= 0)
                    {
                        // unusable offset: only tolerated in lenient mode
                        final String skipped = "Skipped XRef stream due to a corrupt offset:" + stmOffset;
                        if (!isLenient)
                        {
                            throw new IOException(skipped);
                        }
                        LOG.error(skipped);
                    }
                    else
                    {
                        source.seek(stmOffset);
                        skipSpaces();
                        try
                        {
                            parseXrefObjStream(sectionOffset, false);
                        }
                        catch (IOException ex)
                        {
                            // a broken /XRefStm is only tolerated in lenient mode
                            if (!isLenient)
                            {
                                throw ex;
                            }
                            LOG.error("Failed to parse /XRefStm at offset " + stmOffset, ex);
                        }
                    }
                }

                // continue with the section referenced by /Prev
                sectionOffset = tableTrailer.getLong(COSName.PREV);
                if (sectionOffset > 0)
                {
                    // check the reference to the previous section
                    final long checkedPrev = checkXRefOffset(sectionOffset);
                    if (checkedPrev > -1 && checkedPrev != sectionOffset)
                    {
                        sectionOffset = checkedPrev;
                        tableTrailer.setLong(COSName.PREV, sectionOffset);
                    }
                }
            }

            // add() returns false if the offset has been recorded before, i.e. the chain runs in a circle
            if (!visited.add(sectionOffset))
            {
                throw new IOException("/Prev loop at offset " + sectionOffset);
            }
        }

        // ---- build valid xrefs out of the xref chain
        xrefTrailerResolver.setStartxref(firstSectionOffset);
        final COSDictionary resolvedTrailer = xrefTrailerResolver.getTrailer();
        document.setTrailer(resolvedTrailer);
        document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
        // check the offsets of all referenced objects
        checkXrefOffsets();
        // copy xref table
        document.addXRefTable(xrefTrailerResolver.getXrefTable());
        return resolvedTrailer;
    }

PDF格式分析（十二）Cross-Reference Table 交叉引用表

猜你喜欢