# Read text from docx and pdf files

import docx
import os

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
#from pdfminer.pdfpage import PDFPage

def preprocess_text(text):
    """Normalize whitespace in *text* and strip the unwanted characters.

    Every run of whitespace is first collapsed to a single space (leading and
    trailing whitespace is dropped), then the result is handed to
    ``join_name_tag`` for character removal.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    collapsed = ' '.join(text.split())
    return join_name_tag(collapsed)


def join_name_tag(text):
    """Return *text* with a fixed set of characters deleted.

    The characters removed are U+2003 (em space), the literal space entry
    listed below, U+E5E5, U+00AE, U+FB01, U+2022 and U+2212.

    Args:
        text: The string to filter.

    Returns:
        The string with every occurrence of those characters removed.
    """
    unwanted = ('\u2003', ' ', '\ue5e5', '\xae', '\ufb01', '\u2022', '\u2212')
    for char in unwanted:
        text = text.replace(char, '')
    return text

# Read paragraphs and tables
def docx_to_text(file_path):
    """Extract cleaned text from the paragraphs and tables of a .docx file.

    Paragraph texts come first, in the order ``doc.paragraphs`` yields them,
    followed by one string per table row built from that row's cell texts.
    Every string is passed through ``preprocess_text``; strings that are
    empty before or after that step are omitted.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        A list of the non-empty cleaned strings.
    """
    doc = docx.Document(file_path)
    result = []

    for paragraph in doc.paragraphs:
        txt = paragraph.text.strip()
        if not txt:
            continue
        txt = preprocess_text(txt)
        # preprocess_text deletes some characters, so the text can become
        # empty only after cleaning; skip it like pdf_to_text does.
        if txt:
            result.append(txt)

    for table in doc.tables:
        column_count = len(table.columns)
        for row in table.rows:
            try:
                # Look the row's cells up once instead of on every column.
                cells = row.cells[:column_count]
            except Exception:
                # Best effort: a row whose cells cannot be read is skipped.
                continue

            txt = ''
            for cell in cells:
                try:
                    cell_text = cell.text
                except Exception:
                    # Best effort: an unreadable cell does not drop the row.
                    continue
                # Substring check: a cell whose text already appears in the
                # row string is skipped (presumably to avoid repeating
                # merged cells -- confirm against real documents).
                if cell_text != '' and cell_text not in txt:
                    txt = txt + cell_text + ' '

            if txt != '':
                txt = preprocess_text(txt)
                if txt:
                    result.append(txt)

    return result

# Read the text of a pdf
def pdf_to_text(fname, pages=None):
    """Extract the cleaned, non-empty text lines of a PDF file.

    Args:
        fname: Path of the PDF file to open.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``.  When falsy, an empty set is passed
            (pdfminer applies no page filter in that case -- see its docs).

    Returns:
        A list with one ``preprocess_text``-cleaned string per non-empty
        line of the extracted text.
    """
    # The module-level import of PDFPage is commented out, so bring the name
    # into scope here; without it the loop below raises NameError.
    from pdfminer.pdfpage import PDFPage

    pagenums = set(pages) if pages else set()

    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The context manager closes the file even if a page fails.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer after the converter is closed, before the
        # StringIO itself is closed.
        text = output.getvalue()

    result = []
    for line in text.split('\n'):
        cleaned = preprocess_text(line.strip())
        if cleaned != '':
            result.append(cleaned)
    return result

def read_pdf_and_docx(file_path):
    """Extract text from a single .docx or .pdf file.

    The file type is chosen from the (case-insensitive) file extension, and a
    progress message is printed before extraction starts.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text lines, or ``None`` when the extension is neither
        ``.docx`` nor ``.pdf`` or when nothing was extracted.
    """
    lowered = file_path.lower()
    if lowered.endswith('.docx'):
        print('extracting text from docx: ', file_path)
        extracted = docx_to_text(file_path)
    elif lowered.endswith('.pdf'):
        print('extracting text from pdf: ', file_path)
        extracted = pdf_to_text(file_path)
    else:
        return None

    if extracted is None:
        return None
    if len(extracted) > 0:
        return extracted
    return None
'''
#读取文件夹内的所有文件
def read_pdf_and_docx(dir_path, collected=None, command_logging=False, callback=None):
    if collected is None:
        collected = dict()
    for f in os.listdir(dir_path):
        file_path = os.path.join(dir_path, f)
        if os.path.isfile(file_path):
            txt = None
            if f.lower().endswith('.docx'):
                if command_logging:
                    print('extracting text from docx: ', file_path)
                txt = docx_to_text(file_path)
            elif f.lower().endswith('.pdf'):
                if command_logging:
                    print('extracting text from pdf: ', file_path)
                txt = pdf_to_text(file_path)
            if txt is not None and len(txt) > 1:
                if callback is not None:
                    callback(len(collected), file_path, txt)
                collected[file_path] = txt
        elif os.path.isdir(file_path):
            read_pdf_and_docx(file_path, collected, command_logging, callback)

    return collected
'''

if __name__ == '__main__':
    # Manual check: print the extracted text of every .docx file found
    # directly inside the sample directory.
    data_dir_path = './test_read_docx'
    for entry in os.listdir(data_dir_path):
        data_file_path = os.path.join(data_dir_path, entry)
        if not os.path.isfile(data_file_path):
            continue
        if not entry.lower().endswith('.docx'):
            continue
        extracted = read_pdf_and_docx(data_file_path)
        print(extracted)
# (web-page residue, not part of the script: "猜你喜欢" / "You may also like")