在线分析 office（xls、xlsx、doc、docx）文档内容，可扩展为在线预览

最近在做一个文档分析系统，需求是在上传的两千份左右的 office 文件中，筛选出包含指定关键词（共一千个左右）的文件。我以前主要做 PHP 开发，但 PHP 处理这类任务有些“力不从心”，正好研究了小半年的 Python，于是打算用 Python 和 PHP 混合开发，简单架构如下：

(为啥有Node，因为后边调用了 textract 来处理某些文件)

前台 PHP 上传部分不做赘述，就是常规的 CRUD 和 Upload；重点是后面 Python + Node 处理这一块，下面贴代码。

环境：CentOS 6.9、Python 3.6、Node 10.16、LibreOffice

框架：Thinkphp5.1 Layui

依赖都在代码声明里了。

# -*- coding: UTF-8 -*-
import sys
import getopt
import pymysql
import os
import re
import subprocess
import time

# Main processing script. Basic idea: convert every file to XHTML with
# LibreOffice; files LibreOffice cannot convert are handed to textract as a
# second line of defence; anything that still cannot be processed is reported
# so the user can fix the file (such files are very rare).
# init
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts positional
# arguments in connect().
# NOTE(review): credentials are hard-coded; consider loading them from config.
db = pymysql.connect(host="127.0.0.1", user="root", password="root", database="document")
# Create a cursor object with cursor()
cursor = db.cursor()


# 接受一个参数, 文件id
def main(argv):
    """Parse the command line of the conversion script.

    Args:
        argv: Argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        A dict ``{'type': int, 'args': str}`` where ``type`` is 1 when files
        are selected by id (``-i`` / ``--id``), 2 when selected by path
        (``-p`` / ``--path``), and 0 when neither option was given.

    Exits with status 3 on unknown options, and with status 0 after printing
    the help text for ``-h`` / ``--help``.
    """
    try:
        opts, _ = getopt.getopt(argv, "i:p:h", ["id=", "path=", "help"])
    except getopt.GetoptError:
        print('Wrong opts! you can use do.py -h to get help')
        sys.exit(3)

    params = {'type': 0, 'args': ''}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('if you want to deal files by id, this order will according to these id')
            print('main.py -i <ex:1,2,3,4,5,6,7,8>')
            print('if you want to deal files by path, this order will according to this path , get all files to deal')
            print('main.py -p <ex: ./pdf/>')
            sys.exit()
        # getopt reports long options without the trailing '=' used in the
        # declaration, so compare against '--id' / '--path'.
        elif opt in ('-i', '--id'):
            params = {'type': 1, 'args': arg}
        elif opt in ('-p', '--path'):
            params = {'type': 2, 'args': arg}
    return params


def getFile(params):
    """Look up the selected files in ``doc_file`` and process each one.

    Args:
        params: Dict produced by ``main``. Only ``type == 1`` (select by id)
            is handled here; ``args`` is either the literal ``'all'`` (every
            row whose ``delete_time`` is null) or a comma-separated list of
            integer ids.

    Returns:
        None. Each matching ``(id, path)`` row is passed to ``do``.

    Raises:
        ValueError: If ``args`` contains an entry that is not an integer.
    """
    # Path mode (type 2) is parsed by main() but has no handler here.
    if 1 != params['type']:
        return

    if 'all' == params['args']:
        cursor.execute("SELECT id, path from doc_file where delete_time is null")
    else:
        # Convert to int and bind as parameters instead of concatenating the
        # raw command-line text into the SQL statement.
        ids = [int(part) for part in params['args'].split(',') if part.strip()]
        if not ids:
            # "IN ()" is not valid SQL; nothing was selected.
            return
        placeholders = ', '.join(['%s'] * len(ids))
        cursor.execute(
            "SELECT id, path from doc_file where id IN (" + placeholders + ")",
            ids,
        )

    for file_id, file_path in cursor.fetchall():
        do(file_id, file_path)


def do(id, path):
    """Convert one file to XHTML and record it in ``doc_content``.

    The file is first converted through ``dealFormat``. If the expected XHTML
    file exists afterwards, a ``doc_content`` row is inserted. Otherwise the
    file is passed to the ``textract`` command, its output is written to the
    XHTML path, a ``doc_content`` row is inserted, and the id is appended to
    ``bad.txt``.

    Args:
        id: Primary key of the ``doc_file`` row.
        path: File path as stored in ``doc_file.path``.
    """
    x_path = dealFormat(path)

    # dealFormat() returns this message instead of a path when the cached
    # XHTML already exists. Treating it as a path would send an already
    # converted file through the textract fallback and write a file literally
    # named after the message, so skip such files.
    if x_path == 'This file has exists, pass':
        return

    insert_sql = "INSERT INTO doc_content(doc_id) VALUES (%s)"

    if os.path.exists(x_path):
        cursor.execute(insert_sql, (id,))
        db.commit()
        return

    # Fallback: try to read the file with textract.
    # NOTE(review): textract receives `path` as given, while dealFormat()
    # prefixes it with r_path -- confirm the working directory makes both
    # resolve to the same file.
    print('textract Waiting...')
    sub = subprocess.Popen(["textract", path], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() drains stdout while waiting; polling poll() without
    # reading can block forever once the pipe buffer fills up.
    raw, _ = sub.communicate()
    print(raw)
    content = raw.decode('utf-8', 'ignore')

    # Write as UTF-8 explicitly so the result does not depend on the locale.
    with open(x_path, 'w', encoding='utf-8') as out:
        out.write(content)

    cursor.execute(insert_sql, (id,))
    db.commit()

    # Every file that needed the fallback is logged for manual review.
    with open('bad.txt', 'a') as bad:
        bad.write(str(id) + '|')

    print('This file is bad:', id)


def dealFormat(path):
    """Convert a file to XHTML inside the ``cache3/`` directory.

    ``path`` is resolved relative to the module-level ``r_path``. The target
    name is the source base name with its extension replaced by ``.xhtml``.
    ``.html`` sources are copied with ``cp``; everything else is converted by
    LibreOffice (``soffice --convert-to xhtml``).

    Args:
        path: File path relative to ``r_path``.

    Returns:
        The target XHTML path after the child process has finished, or the
        message ``'This file has exists, pass'`` when the target already
        existed and nothing was done.
    """
    source = r_path + path
    stem, extension = os.path.splitext(source)
    tempfilename = os.path.basename(stem + '.xhtml')
    x_path = r_path + r'cache3/' + tempfilename

    # Already converted: report it and hand back the message.
    if os.path.exists(x_path):
        print('This file has exists, pass')
        return 'This file has exists, pass'

    print('Now:', source)
    if extension == '.html':
        command = [r"cp", source, r_path + 'cache3/' + tempfilename]
    else:
        command = [
            r"/opt/libreoffice6.1/program/soffice",
            "--convert-to", "xhtml",
            "--outdir", r_path + r"cache3/",
            source,
        ]
    sub = subprocess.Popen(command)

    # Poll until the child process has exited.
    while sub.poll() is None:
        print('soffice Waiting...')
        time.sleep(0.1)
    return x_path


# 读html
def read(path=''):
    """Read a file and return its raw bytes.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as ``bytes``; ``False`` when ``path`` is empty;
        ``None`` (after printing ``'error'``) when the file cannot be read.
    """
    try:
        if 0 == len(path):
            return False
        # The context manager closes the handle even if read() fails.
        with open(path, "rb") as fp:
            return fp.read()
    except (OSError, TypeError, ValueError):
        # Unreadable file or unusable path argument; other exceptions are
        # programming errors and are allowed to propagate.
        print('error')
        return None


if __name__ == "__main__":
    # Root directory of the web site; read as a module-level global by dealFormat().
    r_path = r"/www_2/wwwroot/document/public/"
    # print(sys.argv[1:])
    # Parse the command line, then process the selected files.
    params = main(sys.argv[1:])

    getFile(params)

# -*- coding: UTF-8 -*-

# 文件获取两个参数 -i [文件id] -c [公司id]
import sys
import getopt
import pymysql
import os

# Keyword-search script: reads the XHTML files produced by the conversion
# step directly and uses Python's string handling to find keywords, then
# prints the matches in the required format.
# init
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts positional
# arguments in connect().
# NOTE(review): credentials are hard-coded; consider loading them from config.
db = pymysql.connect(host="127.0.0.1", user="root", password="root", database="document")
# Create a cursor object with cursor()
cursor = db.cursor()


# 接受两个参数 文件id  公司id
def main(argv):
    """Parse the command line of the keyword-search script.

    Args:
        argv: Argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        A dict that contains ``'id'`` (value of ``-i`` / ``--id``) and/or
        ``'cid'`` (value of ``-c`` / ``--cid``) for the options that were
        given; empty when neither was given.

    Exits with status 3 on unknown options, and with status 0 after printing
    the help text for ``-h`` / ``--help``.
    """
    try:
        opts, _ = getopt.getopt(argv, "i:c:h", ["id=", "cid=", "help"])
    except getopt.GetoptError:
        print('Wrong opts! you can use do.py -h to get help')
        sys.exit(3)

    params = {}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('if you want to deal files by id, this order will according to these id')
            print('main.py -i <ex:1,2,3,4,5,6,7,8> -c <ex:1,2,3,4,5,6,7,8>')
            sys.exit()
        # getopt reports long options without the trailing '=' used in the
        # declaration, so compare against '--id' / '--cid'.
        elif opt in ('-i', '--id'):
            params['id'] = arg
        elif opt in ('-c', '--cid'):
            params['cid'] = arg
    return params


def got(params):
    """Print which keywords occur in which converted documents.

    For every requested file the cached XHTML (under ``cache3/`` in the
    module-level ``r_path``) is read, and every keyword of the requested
    companies is searched in it with a plain substring test.

    Args:
        params: Dict with ``'id'`` (comma-separated ``doc_file`` ids) and
            ``'cid'`` (comma-separated company ids).

    Returns:
        None. Prints a list of ``[file id, company id, keyword]`` entries.

    Raises:
        KeyError: If ``'id'`` or ``'cid'`` is missing from ``params``.
        ValueError: If an id entry is not an integer.
        OSError: If a cached XHTML file cannot be opened.
    """
    # Convert to int and bind as parameters instead of concatenating the raw
    # command-line text into the SQL statements.
    file_ids = [int(part) for part in params['id'].split(',') if part.strip()]
    company_ids = [int(part) for part in params['cid'].split(',') if part.strip()]

    has_key = []
    if not file_ids or not company_ids:
        # "IN ()" is not valid SQL; with nothing selected there are no matches.
        print(has_key)
        return

    # Map every requested file id to the XHTML file generated for it.
    placeholders = ', '.join(['%s'] * len(file_ids))
    cursor.execute(
        "SELECT  path, id FROM doc_file WHERE id IN (" + placeholders + ")",
        file_ids,
    )
    xhtml_file = []
    for doc_path, doc_id in cursor.fetchall():
        stem, _ = os.path.splitext(doc_path)
        cached_name = os.path.basename(stem + '.xhtml')
        xhtml_file.append([os.path.join(r_path, 'cache3', cached_name), doc_id])

    # Load the keywords of the requested companies as [keyword, company id].
    placeholders = ', '.join(['%s'] * len(company_ids))
    cursor.execute(
        "SELECT company_id, name FROM doc_keyword WHERE delete_time is null "
        "AND company_id IN (" + placeholders + ")",
        company_ids,
    )
    keys = [[name, company_id] for company_id, name in cursor.fetchall()]

    # Check every file for every keyword.
    for xhtml_path, doc_id in xhtml_file:
        with open(xhtml_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        for name, company_id in keys:
            if name in text:
                has_key.append([doc_id, company_id, name])
    print(has_key)





if __name__ == "__main__":
    # Root directory of the web site; read as a module-level global by got().
    r_path = r"/www_2/wwwroot/document/public/"
    # print(sys.argv[1:])
    # Parse the command line, then run the keyword search.
    params = main(sys.argv[1:])
    got(params)

至于调用，在 PHP 中用 exec 调用即可，不过要注意 exec 的安全问题（例如对传入的参数做转义和校验）。

待鸣

发布了31 篇原创文章 · 获赞 8 · 访问量 2万+

私信关注

在线分析office(xls, xlsx, doc, docx)文档内容 扩展可在线预览

猜你喜欢

在线分析office(xls, xlsx, doc, docx)文档内容扩展可在线预览