pdf content difference comparison

Scene: Prevent plagiarism! Word comparison is very easy, so here is the pdf content comparison technology.

1. Environment

  • win10
  • python3.9.6

2. Ideas

pdf to picture jpg, picture to text, text for comparison.

  1. Convert each page of a PDF file to an image
  2. The image is converted to text, the text is compared, and a difference map is obtained.
  3. Stitch all generated diff images into one PDF file

3. Realize the effect

How to use: python diff-pdf.py test1.pdf test2.pdf
The results are as follows:
insert image description here

4. Part of the program

Mainly used modules: pdfminer, StringIO in io.

4.1 Read pdf

    def read_pdf(self, file_name):
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        fp = open(file_name, 'rb')
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = {
    
    }
        for page in PDFPage.create_pages(document):
            dictionary = {
    
    }
            dictionary['textbox'] = []
            dictionary['textline'] = []

            interpreter.process_page(page)
            layout = device.get_result()
            for item in layout:
                if isinstance(item, LTTextBox):
                    dictionary['textbox'].append(item)
                    for child in item:
                        if isinstance(child, LTTextLine):
                            dictionary['textline'].append(child)
            pages[layout.pageid] = dictionary
        return pages

4.2 pdf to text

    def convert_pdf_to_txt(self, path, page_no=-1):
        rsrcmgr = PDFResourceManager()

        retstr = StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        i = 0
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                      check_extractable=True):
            i += 1
            if i != page_no:
                continue
            interpreter.process_page(page)

        fp.close()
        device.close()
        str = retstr.getvalue()
        retstr.close()

        return str

4.3 pdf comparison

    def compare_pdf(self, file1, file2, header_text, x_margin=10, compare_margin=0.2):
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        fp = open(file1, 'rb')
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        out = StringIO()

        layoutmode = 'normal'
        scale = 1.3
        fontscale = 1

        html_coverter = HTMLPrivateConverter(rsrcmgr, out, scale=scale,
                                             layoutmode=layoutmode, laparams=laparams, fontscale=fontscale,
                                             imagewriter=None, header_text=header_text, x_margin=x_margin)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        testpages = PDFPage.create_pages(document)

        file_dict = self.read_pdf(file2)

        for page in testpages:
            interpreter.process_page(page)
            layout = device.get_result()

            html_coverter.page_begin(layout)
            if file_dict.get(layout.pageid) == None:
                break
            compare_page = file_dict[layout.pageid]

            for item in layout:
                if isinstance(item, LTTextBox):
                    html_coverter.begin_div('textbox', 1, item.x0 + html_coverter.x_margin, item.y1, item.width,
                                            item.height,
                                            item.get_writing_mode())

                    for child in item:
                        if isinstance(child, LTTextLine):
                            self.compare_textline(child, compare_page, html_coverter, compare_margin)
                            html_coverter.put_newline()
                    html_coverter.end_div()
            html_coverter.page_end()

        fp.close()
        device.close()
        retstr.close()

        return out.getvalue()

4.4 Complete program

Click: full source code

Guess you like

Origin blog.csdn.net/weixin_46211269/article/details/124480259