Python- Use python to implement pdf2txt

The method in this article mainly realizes batch processing pdf2txt. Force push method two! ! !

Method 1: Use pdfminer3k

Refer to the code from GitHub.

######################################
# tesseract OCR

from PIL import Image
import pytesseract

def img_to_str_tesseract(image_path, lang='chi_sim'):
    return pytesseract.image_to_string(Image.open(image_path), lang)
  
######################################
# 百度 OCR

from aip import AipOcr

config = {
    
    
    'appId': '',
    'apiKey': '',
    'secretKey': ''
}
client = AipOcr(**config)

def img_to_str_baidu(image_path):
    with open(image_path, 'rb') as fp:
        image = fp.read()
        result = client.basicGeneral(image)
        if 'words_result' in result:
            return '\n'.join([w['words'] for w in result['words_result']])
    return ""

######################################
# 解析PDF文件

from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_FLATE_DECODE
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTChar, LTTextLine
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os
import sys
import numpy as np
import importlib
importlib.reload(sys)

TMPDIR = 'tmp/'
PARSEIMG = True
OCR_ONLINE = False

# 保存图片
def write_image(image, outdir):
    stream = image.stream
    filters = stream.get_filters()
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(outdir, name)
    fp = open(path, 'wb')
    fp.write(data)
    fp.close()
    return path, len(data)

# 写入文件
def write_file(path, text, ftype, debug=False):
    with open(path, ftype) as f:
        if debug:
            print("write", len(text))
        f.write(text)

# 去掉文中多余的回车
def adjust(inpath, outpath):
    f = open(inpath)
    lines = f.readlines()
    arr = [len(line) for line in lines]
    length = np.median(arr) # 行字符数中值
    
    string = ""
    for line in lines:
        if len(line) >= length and line[-1]=='\n':
            string += line[:-1] # 去掉句尾的回车
        elif line == '-----------\n':
            pass
        else:
            string += line
    write_file(outpath, string, 'w')
    return

# 解析每个数据块
def parse_section(layout, outpath, debug = False):
    for x in layout:
        if (isinstance(x, LTTextBoxHorizontal)): # 文本
            write_file(outpath, x.get_text(), 'a')
        elif (isinstance(x, LTFigure)):
            parse_section(x, outpath)
        elif (isinstance(x, LTImage)) and PARSEIMG: # 图片
            path,length = write_image(x, TMPDIR)
            if length > 0:
                if OCR_ONLINE:
                    write_file(outpath, img_to_str_baidu(path), 'a')
                else:
                    write_file(outpath, img_to_str_tesseract(path), 'a')
                write_file(outpath, '\n' + '-----------' + '\n', 'a')

# 删除文件  
def remove(path):
    if not os.path.exists(path):
        return
    if os.path.isfile(path):
        os.remove(path)
        return
    dirs = os.listdir(path)
    for f in dirs:
        file_name = os.path.join(path, f)
        if os.path.isfile(file_name):
            os.remove(file_name)
        else:
            remove(file_name)
    os.rmdir(path)

# 解析PDF文件
def parse(inpath, outpath):
    remove(TMPDIR) # 清除临时目录 
    os.mkdir(TMPDIR)
    remove(outpath) # 清除输出文件
    fp = open(inpath, 'rb')
    praser = PDFParser(fp) # pdf文档分析器
    doc = PDFDocument(praser)# 创建一个PDF文档
    praser.set_document(doc) # 连接分析器与文档对象
    doc.set_parser(praser)
    doc.initialize("")
    if not doc.is_extractable: # 是否提供txt转换
        raise PDFTextExtractionNotAllowed
    else:
        rsrcmgr = PDFResourceManager() # 创建PDF资源管理器
        laparams = LAParams() 
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device) # 创建PDF解释器对象
                
        for idx,page in enumerate(doc.get_pages()): # 获取page列表
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)

if __name__ == '__main__':
    #批处理
    target_path = sys.argv[1]
    for base_path,folder_list,file_list in os.walk(target_path):
        for file_name in file_list:
            file_path = file_name
            if file_path[-3:] != 'pdf':
                # 不是pdf文件
                continue
            try:
                pdffile = file_path
                tmpfile = pdffile.replace('pdf','tmp')
                txtfile = pdffile.replace('pdf','txt')
                parse(pdffile, tmpfile)
                adjust(tmpfile, txtfile)
                remove(tmpfile) #删除tmp
            except Exception as e:
                print(file_name," error!")



The author has made adjustments according to his needs.

! pit

Be sure not to install pdfminer, it's pdfminer3k, big brother. If unfortunately installed, you can uninstall pdfminer, pdfminer3k, then install pdfminer3k, delete all existing packages according to the prompts, and then install pdfminer3k

Guiqiu the great god to answer why pdfminer and pdfminer3k will guide the package confusion

Method 2: With the help of xpdf

Refer to self-knowledge, optimize according to your own needs and pdfminer3k code:


import numpy as np
import os 
import subprocess
from os.path import isfile,join

ef = r'./xpdf/pdftotext.exe'
cfg = r'./xpdf/xpdfrc'

def convert(file_name_pdf):
    file_name_pdf = join(r'./resourses',file_name_pdf)
    bo = subprocess.check_output([ef,'-f','1','-l','1000','-cfg',cfg,'-raw',file_name_pdf,'-']) #这个命令中的所有调用文件参数必须使用full path.否则调用出错。
    return bo.decode('utf-8')

def write_file(bo,file_name,method="wb"):
    file_name = join(r'./results/',file_name)
    with open(file_name,method) as f:
        f.write(bo)

# 去除换行
def adjust(inpath, outpath):
    inpath = join(r'./results/',inpath)
    f = open(inpath,encoding='utf-8')
    lines = f.readlines()
    arr = [len(line) for line in lines]
    length = np.median(arr) # 行字符数中值
    string = ""
    for line in lines:
        if len(line) >= length and line[-1]=='\n':
            string += line[:-1] # 去掉句尾的回车
        elif line == '-----------\n':
            pass
        else:
            string += line
    string=string.encode('utf-8')
    write_file(string, outpath)
     
def rm(inpath):
    inpath = join(r'./results/',inpath)
    os.remove(inpath)
    
if __name__ == '__main__':
    #批处理
    su_count = 0
    er_count = 0
    count = 0
    target_path = r'./resourses'
    for base_path,folder_list,file_list in os.walk(target_path):
        for file_name in file_list:
            if file_name[-3:] != 'pdf':
                # 不是pdf文件
                continue
            try:
                pdffile = file_name
                tmpfile = pdffile.replace('pdf','tmp')
                txtfile = pdffile.replace('pdf','txt')
                bo = convert(pdffile).encode('utf-8')
                write_file(bo,tmpfile)
                adjust(tmpfile, txtfile)
                rm(tmpfile)
                su_count += 1
                count += 1
                print(count,"-->",file_name," success!\n ")
            except Exception as e:
                er_count += 1
                count += 1
                print(count,"-->",file_name," error!\n ")
                
    print("\ncount: ",count,"\n","success: ",su_count,"\n","error: ",er_count)

Thanks to the great gods who are willing to share on the Internet, I finally don’t need to post one by one!

The link is here:
Link: https://pan.baidu.com/s/1QW3XMAvf8qJlaLHxmUBEXg
Extraction code: tw95

Those who use it, remember to read README.md in advance, you must not step on the pits I have stepped on anymore, tears~~

Guess you like

Origin blog.csdn.net/MaoziYa/article/details/105361981