文档转图片实践总结

需求： ppt，pptx，word，pdf转化为图片，不限于png，jpg

1 初级方案

方案：linux+poi+icepdf

缺点：office文档转化图片不清晰，smart图形无法渲染，效率不高

转化方案需要针对不同的文档类型分开处理。office文档早期采用poi的方案处理，优点是简单，但带来的问题是转化的图片不清晰，文档中的图片严重失真；pdf最终采用icepdf进行转化，但是需要处理中文乱码和水印的问题。期间调研了几种pdf转图片的方案：PDFRenderer、Pdfbox、Icepdf、JPedal、jacob调用adobe等，其中：

PDFRenderer：确实效率最高，但是缺少字体支持，对大多数中文pdf处理不了。

pdfbox：字体基本都可以转换，但容易内存溢出。

JPedal是一个商业的处理PDF软件，但是JPedal有一个裁切版，裁切版JPedal使用LGPL协议进行开源，可免费使用，但是不能满足需求，只能作罢，但是通过测试效果确实不错，由此可见花钱的就是好啊。

poi-ppt转图片代码

 @SuppressWarnings("resource")
    public static Map<String, Object> converPPTtoImage(String orignalPPTFileName, String targetImageFileDir,
                                                       String imageFormatNameString) {
        Map<String, Object> map = new HashMap<String, Object>();
        boolean converReturnResult = true;//是否全部转成功
        List<String> imgNamesList = new ArrayList<String>();//PPT转成图片后所有名称集合
        FileInputStream orignalPPTFileInputStream = null;
        FileOutputStream orignalPPTFileOutStream = null;
        HSLFSlideShow oneHSLFSlideShow = null;
        try {
            try {
                orignalPPTFileInputStream = new FileInputStream(orignalPPTFileName);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                converReturnResult = false;
                map.put("converReturnResult", converReturnResult);
                return map;
            }

            try {
                oneHSLFSlideShow = new HSLFSlideShow(orignalPPTFileInputStream);
            } catch (IOException e) {
                e.printStackTrace();
                converReturnResult = false;
                map.put("converReturnResult", converReturnResult);
                return map;
            }
            //获取PPT每页的大小（宽和高度）
            Dimension onePPTPageSize = oneHSLFSlideShow.getPageSize();

            //获得PPT文件中的所有的PPT页面（获得每一张幻灯片）,并转为一张张的播放片
            List<HSLFSlide> pptPageSlideList = oneHSLFSlideShow.getSlides();
            //下面循环的主要功能是实现对PPT文件中的每一张幻灯片进行转换和操作
            for (int i = 0; i < pptPageSlideList.size(); i++) {
                //这几个循环只要是设置字体为宋体，防止中文乱码，
                List<List<HSLFTextParagraph>> oneTextParagraphs = pptPageSlideList.get(i).getTextParagraphs();
                for (List<HSLFTextParagraph> list : oneTextParagraphs) {
                    for (HSLFTextParagraph hslfTextParagraph : list) {
                        List<HSLFTextRun> HSLFTextRunList = hslfTextParagraph.getTextRuns();
                        for (int j = 0; j < HSLFTextRunList.size(); j++) {

                            /*
                             * 如果PPT在WPS中保存过，则 HSLFTextRunList.get(j).getFontSize();的值为0或者26040，
                             * 因此首先识别当前文本框内的字体尺寸是否为0或者大于26040，则设置默认的字体尺寸。
                             *
                             */
                            //设置字体大小
                            Double size = HSLFTextRunList.get(j).getFontSize();
                            if ((size <= 0) || (size >= 26040)) {
                                HSLFTextRunList.get(j).setFontSize(20.0);
                            }
                            //设置字体样式为宋体
                            HSLFTextRunList.get(j).setFontFamily("宋体");
                        }
                    }
                }
                /**
                 * 创建BufferedImage对象，图像的尺寸为原来的每页的尺寸
                 */
                BufferedImage oneBufferedImage = new BufferedImage(onePPTPageSize.width, onePPTPageSize.height, BufferedImage.TYPE_INT_RGB);
                Graphics2D oneGraphics2D = oneBufferedImage.createGraphics();
                /**
                 * 设置转换后的图片背景色为白色
                 *
                 */
                oneGraphics2D.setPaint(Color.white);
                oneGraphics2D.fill(new Rectangle2D.Float(0, 0, onePPTPageSize.width, onePPTPageSize.height));
                pptPageSlideList.get(i).draw(oneGraphics2D);
                /**
                 * 设置图片的存放路径和图片格式，注意生成的图片路径为绝对路径，最终获得各个图像文件所对应的输出流对象
                 */
                try {
                    String imgName = (i + 1) + "." + imageFormatNameString;
                    imgNamesList.add(imgName);//将图片名称添加的集合中
                    orignalPPTFileOutStream = new FileOutputStream(targetImageFileDir + imgName);
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                    converReturnResult = false;
                    map.put("converReturnResult", converReturnResult);
                    return map;
                }

                /**
                 * 转换后的图片文件保存的指定的目录中
                 */
                try {
                    ImageIO.write(oneBufferedImage, imageFormatNameString, orignalPPTFileOutStream);
                } catch (IOException e) {
                    e.printStackTrace();
                    converReturnResult = false;
                    map.put("converReturnResult", converReturnResult);
                    return map;
                }
            }
        } finally {
            try {
                if (orignalPPTFileInputStream != null) {
                    orignalPPTFileInputStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }

            try {
                if (orignalPPTFileOutStream != null) {
                    orignalPPTFileOutStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            map.put("converReturnResult", converReturnResult);
            map.put("imgNames", imgNamesList);
        }
        return map;
    }

pptx转图片代码

 @SuppressWarnings("resource")
    public static Map<String, Object> converPPTXtoImage(String orignalPPTFileName, String targetImageFileDir,
                                                        String imageFormatNameString) {
        Map<String, Object> map = new HashMap<String, Object>();
        boolean converReturnResult = true;//是否全部转成功
        List<String> imgNamesList = new ArrayList<String>();//PPT转成图片后所有名称集合
        FileInputStream orignalPPTFileInputStream = null;
        FileOutputStream orignalPPTFileOutStream = null;
        XMLSlideShow oneSlideShow = null;
        try {
            try {
                orignalPPTFileInputStream = new FileInputStream(orignalPPTFileName);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                converReturnResult = false;
                map.put("converReturnResult", converReturnResult);
                return map;
            }

            try {
                oneSlideShow = new XMLSlideShow(orignalPPTFileInputStream);
            } catch (IOException e) {
                e.printStackTrace();
                converReturnResult = false;
                map.put("converReturnResult", converReturnResult);
                return map;
            }
            //获取PPT每页的尺寸大小（宽和高度）
            Dimension onePPTPageSize = oneSlideShow.getPageSize();
            //获取PPT文件中的所有PPT页面，并转换为一张张播放片
            List<XSLFSlide> pptPageXSLFSLiseList = oneSlideShow.getSlides();

            String xmlFontFormat = "<xml-fragment xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\" xmlns:p=\"http://schemas.openxmlformats.org/presentationml/2006/main\">" +
                    "<a:rPr lang=\"zh-CN\" altLang=\"en-US\" dirty=\"0\" smtClean=\"0\"> " +
                    "<a:latin typeface=\"+mj-ea\"/> " +
                    "</a:rPr>" +
                    "</xml-fragment>";

            for (int i = 0; i < pptPageXSLFSLiseList.size(); i++) {
                /**
                 * 设置中文为宋体，解决中文乱码问题
                 */
                CTSlide oneCTSlide = pptPageXSLFSLiseList.get(i).getXmlObject();
                CTGroupShape oneCTGroupShape = oneCTSlide.getCSld().getSpTree();
                List<CTShape> oneCTShapeList = oneCTGroupShape.getSpList();
                for (CTShape ctShape : oneCTShapeList) {
                    CTTextBody oneCTTextBody = ctShape.getTxBody();

                    if (null == oneCTTextBody) {
                        continue;
                    }
                    CTTextParagraph[] oneCTTextParagraph = oneCTTextBody.getPArray();
                    CTTextFont oneCTTextFont = null;
                    try {
                        oneCTTextFont = CTTextFont.Factory.parse(xmlFontFormat);
                    } catch (XmlException e) {
                        e.printStackTrace();
                    }

                    for (CTTextParagraph ctTextParagraph : oneCTTextParagraph) {
                        CTRegularTextRun[] onrCTRegularTextRunArray = ctTextParagraph.getRArray();
                        for (CTRegularTextRun ctRegularTextRun : onrCTRegularTextRunArray) {
                            CTTextCharacterProperties oneCTTextCharacterProperties = ctRegularTextRun.getRPr();
                            oneCTTextCharacterProperties.setLatin(oneCTTextFont);
                        }
                    }
                }

                //创建BufferedImage 对象，图像尺寸为原来的PPT的每页尺寸

                BufferedImage oneBufferedImage = new BufferedImage(onePPTPageSize.width, onePPTPageSize.height, BufferedImage.TYPE_INT_RGB);
                Graphics2D oneGraphics2D = oneBufferedImage.createGraphics();
                //将PPT文件中的每个页面中的相关内容画到转换后的图片中
                pptPageXSLFSLiseList.get(i).draw(oneGraphics2D);

                /**
                 * 设置图片的存放路径和图片格式，注意生成的文件路径为绝对路径，最终获得各个图像文件所对应的输出流的对象
                 */
                try {
                    String imgName = (i + 1) + "." + imageFormatNameString;
                    imgNamesList.add(imgName);//将图片名称添加的集合中
                    orignalPPTFileOutStream = new FileOutputStream(targetImageFileDir + imgName);
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                    converReturnResult = false;
                    map.put("converReturnResult", converReturnResult);
                    return map;
                }

                //将转换后的各个图片文件保存带指定的目录中
                try {
                    ImageIO.write(oneBufferedImage, imageFormatNameString, orignalPPTFileOutStream);
                } catch (IOException e) {
                    e.printStackTrace();
                    converReturnResult = false;
                    map.put("converReturnResult", converReturnResult);
                    return map;
                }
            }
        } finally {
            try {
                if (orignalPPTFileInputStream != null) {
                    orignalPPTFileInputStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }

            try {
                if (orignalPPTFileOutStream != null) {
                    orignalPPTFileOutStream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            map.put("converReturnResult", converReturnResult);
            map.put("imgNames", imgNamesList);
        }
        return map;
    }

pdf转图片

public static int convert(String filePath,String targetFilePath) {
        Document document = new Document();
        try {
            document.setFile(filePath);
        } catch (Exception e1) {
            e1.printStackTrace();
        }
        float scale = 2.5f;// 缩放比例
        float rotation = 0f;// 旋转角度
        int pageSize = document.getNumberOfPages();
        for (int i = 0; i < pageSize; i++) {
            BufferedImage image = null;
            try {
                image = (BufferedImage) document.getPageImage(i, GraphicsRenderingHints.SCREEN,
                        org.icepdf.core.pobjects.Page.BOUNDARY_CROPBOX, rotation, scale);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            RenderedImage rendImage = image;
            try {
                File file = new File(targetFilePath + i + ".jpg");
                ImageIO.write(rendImage, "jpg", file);
            } catch (IOException e) {
                e.printStackTrace();
            }
            image.flush();
        }
        document.dispose();
        return pageSize;
    }

文档格式判断代码

  /**
     * 判断是否是2003版本
     * @param filePath
     * @return
     */
    public static boolean isExcel2003(String filePath) {
        return filePath.matches("^.+\\.(?i)(ppt)$");
    }

    /**
     * 判断是否是2007版本
     * @param filePath
     * @return
     */
    public static boolean isExcel2007(String filePath) {
        return filePath.matches("^.+\\.(?i)(pptx)$");
    }

    /**
     * 判断是否是pdf
     * @param filePath
     * @return
     */
    public static boolean isPDF(String filePath) {
        return filePath.matches("^.+\\.(?i)(pdf)$");
    }

（备注：为了提高效率，采用了队列+多线程的方式，并且是两层多线程：外层负责文件级别的并发，内层负责单个文件按页并发处理。为了避免过度占用资源，控制了并发的个数。内层虽然是多线程，但是可能会出现图像渲染阻塞的问题，需要额外加锁处理）

// Page-level fan-out: one task per PDF page is handed to the shared thread pool.
// The latch starts at the page count; presumably each ExecutorConvertPDFToImage counts it
// down when its page is finished -- confirm in that class.
CountDownLatch countDownLatch = new CountDownLatch(pageSize);
            for (int i = 0; i < pageSize; i++) {
                // Output files are numbered from 1, while the page index handed to the task is 0-based.
                String imgName = targetFilePath+(i+1) + "."+imageFormatNameString;
                ExecutorConvertPDFToImage executorConvertPDFToImage = new ExecutorConvertPDFToImage(
                        i,
                        imgName,
                        document,
                        countDownLatch);
                ThreadPool.submitCallBack(executorConvertPDFToImage);
                // Back-pressure: pause submitting while the pool reports 200 or more queued tasks.
                while(ThreadPool.getQueueSize()>=200) {
                    Thread.sleep(100);
                }
            }
      // Block until the latch reaches zero.
      countDownLatch.await();

icepdf中文乱码处理

升级到最新的6.2.2版本，源码中已经带了字体引擎，但是为了保险起见，还是在linux服务器下安装了相应的中文字体，相关命令为 mkfontscale、mkfontdir、fc-cache -fv、fc-list :lang=zh。

icepdf水印处理

复写icepdf对应的源码，只需要在项目中建立相同的文件路径和类名，覆写相应的方法即可

productInfo.java中覆写getVersion() ，让它返回空字符串即可。

Padding.java中主要是绘制文字，让其绘制的内容为空即可。padding1={},padding3={}.

(不同版本可能有所不同)

图片本来按照原始大小，考虑到客户端的需求，做了图片等比缩放，但是有可能会不清晰（大分辨率转成小分辨率）

            double imgScale = sz.getHeight()/sz.getWidth();
            double newWith = width;     //2018 
            double newHeight = imgScale*newWith;   //等比缩放
            newImage = image.getScaledInstance((int)newWith, (int)newHeight, image.SCALE_SMOOTH);
            AffineTransformOp ato = new AffineTransformOp(AffineTransform.getScaleInstance(newWith/pageWidth, newHeight/pageHeight), null);
            newImage = ato.filter(image, null);
            ImageIO.write((BufferedImage)newImage, "jpg", file);

2 过渡方案

方案：linux+libreoffice +jodconverter+icepdf

缺点：office中数学公式错位，smart图形转化稍有改善

在初级方案中最不能容忍的是office转化为图片的清晰度问题，于是采用过渡方案。过渡方案在架构上重新做了调整，从原来的同步方式变成了异步方式：当用户上传完文件后，把文件相关的信息插入到任务队列，消费线程监听任务队列，当发现有任务的时候，根据文件类型采用不同的转化策略；当为office文档类型时，先转化为pdf，然后统一采用icepdf进行转化。通过初级方案中的调试和升级icepdf版本，转化图片的质量还是让人非常满意的。

由于中间多了一步，先让office转化为pdf，通过调研有如下几种方案

poi +itext 样式设计比较繁琐，并且样式效果也不是太好，由于初级方案中图片问题，对poi没有好感pass.

openoffice/libreoffice linux转化服务，libreoffice比openoffice有更好的兼容性，最终选用了libreoffice，但是数学公式转化为pdf时有些错位，更不用说pdf转图片了。

jacob 终极方案，暂不叙述。

其中为了保证服务的稳定性和提高效率，做了2个架构上的优化。

优化1 ：libreoffice以服务的方式启动，nohup /usr/bin/libreoffice6.0 --headless --accept="socket,host=127.0.0.1,port=8100;urp;" --nofirststartwizard & ，并且通过crontab定时启动脚本监测服务，* * * * * /bin/bash /etc/checking.sh 。由于linux下crontab的定时粒度只到分钟，所以为了保证服务异常后被迅速拉起，脚本内部每2秒监测一次。（注意：下面的checking.sh中使用的是host=0.0.0.0、port=8888，实际部署时启动命令与监测脚本中的地址和端口需要保持一致。）

checking.sh 如下：

#!/bin/bash
#
# Watchdog for the headless LibreOffice conversion service listening on port 8888.
# Meant to be started by cron once a minute; since cron cannot schedule below one
# minute, the script itself polls every 2 seconds (29 polls, about 58 seconds) and
# restarts LibreOffice whenever no matching process is found.
#
# NOTE(review): the service is bound to 0.0.0.0, i.e. reachable from other hosts;
# confirm this is intended and firewalled.

LOG_FILE=/home/liuzhaoming/info.log

# Prints the number of oosplash processes started for port 8888.
count_libreoffice() {
    ps -ef | grep /opt/libreoffice6.0/program/oosplash | grep port=8888 | wc -l
}

libpids=$(count_libreoffice)
echo "时间:$(date +"%Y-%m-%d %H:%M:%S"),信息:libreoffice进程数为###### $libpids" >> "$LOG_FILE"

for ((i = 1; i < 30; i++)); do
    libpid=$(count_libreoffice)
    # Act on the count immediately, numerically, so padded "wc -l" output still compares equal.
    if [ "$libpid" -eq 0 ]; then
        echo "时间:$(date +"%Y-%m-%d %H:%M:%S"),信息:没有libreoffice进程执行,执行启动命令，启动进程" >> "$LOG_FILE"
        nohup /usr/bin/libreoffice6.0 --headless --accept="socket,host=0.0.0.0,port=8888;urp;" --nofirststartwizard >> "$LOG_FILE" 2>&1 &
        echo "时间:$(date +"%Y-%m-%d %H:%M:%S"),信息:启动完成" >> "$LOG_FILE"
    fi
    # Sleep AFTER the check/restart: a freshly launched instance gets 2 seconds to appear
    # in the process list before the next poll, so it is not started twice.
    sleep 2
done

优化2：由于中间必须转化为pdf，所以把服务拆分成2层：第一层服务实现office转化为pdf，代码层采用了jodconverter来调用服务，当然也可以通过直接调用脚本的方式；第二层服务实现pdf转化为图片。两层服务都支持横向扩展，分别部署了集群，并且实现了简单的负载均衡算法。消费队列由原来的1个拆分成了4个（2个节点为一个集群），分别对应两层服务；两层服务每秒进行上报，采用本机ip+时间戳作为队列标识。由于项目是基于docker运行，中间牵扯到容器如何访问宿主机服务的问题，最终结合docker的版本，采用了hosts的方式解决，也就是启动的时候把宿主机的ip写入容器的hosts。

3 终极方案

方案：windows+jacob+icepdf (可选)

缺点：需要单独维护windows服务，转化速度慢（windows+office2010）

由于过渡方案中出现的数学公式错位（样式问题），并且通过验证发现，利用wps也会发生错位，这可能就是office文件的兼容性问题。如果想要原汁原味地还原文档，只能用微软自己的办公软件打开并且渲染，于是有了终极方案jacob，原生调用ms office。

原理：java通过jni调用jacob.dll文件，利用程序模拟人为操作，利用office办公软件打开文档，调用另存为pdf,或者转存为图片，实现转化。（虽然利用wps也可以，但是尽量用office,因为之前说过如果文档是office,用wps渲染会有公式错位的问题，但是用office打开的话，兼容性会好很多）

ppt转化为pdf（也可以直接转图片，只需要调用SaveAs时传入不同的数字即可：17为jpg，16为gif，18为png，19为bmp等等，经过测试位图相对清晰）

 public void ppt2pdf(String srcFilePath, String pdfFilePath) throws Exception {
        ActiveXComponent app = null;
        Dispatch ppt = null;
        try {
            ComThread.InitSTA();
            app = new ActiveXComponent("PowerPoint.Application");
            Dispatch ppts = app.getProperty("Presentations").toDispatch();

            /*
             * call 
             * param 4: ReadOnly
             * param 5: Untitled指定文件是否有标题
             * param 6: WithWindow指定文件是否可见
             * */
            ppt = Dispatch.call(ppts, "Open", srcFilePath, false,false, false).toDispatch();
            Dispatch.call(ppt, "SaveAs", pdfFilePath, PPT_TO_PDF_OPERAND); // ppSaveAsPDF为特定值32  

        } catch (Exception e) {
            e.printStackTrace();
            throw e;
        } finally {
            if (ppt != null) {
                Dispatch.call(ppt, "Close");
            }
            if (app != null) {
                app.invoke("Quit");
            }
            ComThread.Release();
        }
    }

doc转pdf

public void doc2pdf(String srcFilePath, String pdfFilePath) throws Exception {  
        ActiveXComponent app = null;  
        Dispatch doc = null;  
        try {  
            ComThread.InitSTA();  
            app = new ActiveXComponent("Word.Application");  
            app.setProperty("Visible", false);  
            Dispatch docs = app.getProperty("Documents").toDispatch(); 
            Object[] obj = new Object[]{
                    srcFilePath, 
                    new Variant(false),  
                    new Variant(false),//是否只读  
                    new Variant(false),   
                    new Variant("pwd")
            };
            doc = Dispatch.invoke(docs, "Open", Dispatch.Method, obj, new int[1]).toDispatch();  
//          Dispatch.put(doc, "Compatibility", false);  //兼容性检查,为特定值false不正确  
            Dispatch.put(doc, "RemovePersonalInformation", false);  
            Dispatch.call(doc, "ExportAsFixedFormat", pdfFilePath, WORD_TO_PDF_OPERAND); // word保存为pdf格式宏，值为17  
  
        }catch (Exception e) {  
            e.printStackTrace();
            throw e;
        } finally {  
            if (doc != null) {  
                Dispatch.call(doc, "Close", false);  
            }  
            if (app != null) {  
                app.invoke("Quit", 0);  
            }  
            ComThread.Release();  
        }  
    }

备注：请确保dll和jar放到相应的位置；另外安装office的时候，尽量选择完整安装，jni调用office需要.NET Framework组件的支持；使用完后一定要释放资源。另外在测试中发现如果直接转图片不太清晰，可以采用先转pdf，再调用其他服务转图片的策略。

4 花钱方案

花钱的方案就是好，省心省力效果好，推荐aspose

文档转图片实践总结

猜你喜欢