HTML文件转换PDF文件

闲话少说，先上代码：

package com.xxxxx.util.file;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.Locale;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.tidy.Tidy;
import org.xhtmlrenderer.pdf.ITextFontResolver;
import org.xhtmlrenderer.pdf.ITextRenderer;

import com.itextpdf.text.pdf.BaseFont;
import com.lowagie.text.DocumentException;
import com.xxxxx.entity.SystemParam;
import com.xxxxx.service.SystemParamService;
import com.xxxxx.util.ApplicationUtil;
import com.xxxxx.util.Const;
import com.xxxxx.util.StringUtil;

/**
 * Converts an HTML file to a PDF file.
 *
 * <p>The HTML text is read from disk, cleaned into XHTML with JTidy
 * ({@link Tidy}) and rendered with {@link ITextRenderer}. If any step throws,
 * the conversion is delegated to {@code MsToPdf.officeToPdf} as a last resort.
 *
 * @author sxgkwei
 */
public class HtmlToPdf {
	private static final Log log = LogFactory.getLog(HtmlToPdf.class);

	/**
	 * Empty entry point; it performs no work.
	 *
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) throws IOException, Exception {

	}

	/**
	 * Generates a PDF for the given HTML source file. The output path is
	 * obtained from {@code com.xxxxx.util.FileUtils.getPdfByPath(sourceFile)}.
	 *
	 * @param sourceFile
	 *            path of the HTML source file
	 * @return {@code -1} when no content could be read from the HTML file;
	 *         {@code 0} when the PDF was created; when the HTML pipeline
	 *         throws, the value returned by {@code MsToPdf.officeToPdf}
	 *         (presumably the same -1/0 convention; confirm against that class)
	 */
	public static int htmlToPdf(String sourceFile) {
		String outPath = com.xxxxx.util.FileUtils.getPdfByPath(sourceFile);
		try {
			String txt = getStringFromHtml(sourceFile);
			if (StringUtils.isBlank(txt)) {
				log.error("html转换PDF失败，html中未读取当任何内容。path=" + sourceFile);
				return -1;
			}
			toPdf(formatHtml(txt), outPath);
		} catch (Exception e) {
			log.error("html转换PDF失败:path=" + sourceFile, e);
			// The HTML pipeline failed; fall back to the MS Office based
			// converter so that a PDF is still produced whenever possible.
			return MsToPdf.officeToPdf(sourceFile, outPath);
		}

		return 0;
	}

	/**
	 * Renders an XHTML string to a PDF file.
	 *
	 * @param html
	 *            XHTML markup to render
	 * @param savePath
	 *            path of the PDF file to write
	 * @throws Exception
	 *             if font registration, layout, or PDF creation fails
	 */
	private static void toPdf(String html, String savePath) throws Exception {
		ITextRenderer renderer = new ITextRenderer();
		// Register every font file under <root>/css/fonts so that Chinese text can be rendered.
		addFontDirectory(renderer.getFontResolver(), ApplicationUtil.getRoot() + "/css/fonts", BaseFont.NOT_EMBEDDED);
		renderer.setDocumentFromString(html);
		renderer.layout();
		try (OutputStream os = FileUtils.openOutputStream(new File(savePath))) {
			renderer.createPDF(os);
			os.flush();
		}
	}

	/**
	 * Registers every {@code .otf}, {@code .ttf} and {@code .ttc} file found
	 * directly inside {@code dir} with the resolver, using the
	 * {@code IDENTITY_H} encoding. Does nothing when {@code dir} is not a
	 * directory.
	 *
	 * @param resolver
	 *            font resolver of the renderer
	 * @param dir
	 *            directory containing the font files
	 * @param embedded
	 *            passed through to {@code ITextFontResolver.addFont}
	 * @throws DocumentException
	 *             if a font cannot be registered
	 * @throws IOException
	 *             if a font file cannot be read
	 */
	private static void addFontDirectory(ITextFontResolver resolver, String dir, boolean embedded) throws DocumentException, IOException {
		File fontDir = new File(dir);
		if (!fontDir.isDirectory()) {
			return;
		}
		File[] fontFiles = fontDir.listFiles((d, name) -> {
			// Locale.ROOT keeps the extension check independent of the JVM default locale.
			String lower = name.toLowerCase(Locale.ROOT);
			return lower.endsWith(".otf") || lower.endsWith(".ttf") || lower.endsWith(".ttc");
		});
		if (fontFiles == null) {
			// listFiles returns null when the directory cannot be listed.
			return;
		}
		for (File fontFile : fontFiles) {
			resolver.addFont(fontFile.getAbsolutePath(), BaseFont.IDENTITY_H, embedded);
		}
	}

	/**
	 * Prepends a {@code <style>} element to the HTML and cleans the result
	 * into XHTML with JTidy. The style element holds the CSS stored under
	 * {@code SystemParam.KEY_3664} plus a {@code SimSun} fallback font for
	 * {@code body}.
	 *
	 * @param html
	 *            raw HTML text
	 * @return the tidied XHTML, decoded with {@code Const.WEB_CHARSET}
	 * @throws IOException
	 *             if closing the in-memory streams fails
	 */
	private static String formatHtml(String html) throws IOException {

		SystemParamService service = ApplicationUtil.getBean(SystemParamService.class);
		String pageCss = service.queryValueByKey(SystemParam.KEY_3664);
		if (pageCss == null) {
			// Concatenating null would produce the selector "null body", which
			// matches nothing and silently disables the fallback font rule.
			pageCss = "";
		}
		html = "<style>" + pageCss + " body{font-family: SimSun;}</style>" + html;

		Tidy tidy = new Tidy();
		tidy.setQuiet(true);// do not print Tidy's descriptive chatter to the console
		tidy.setShowErrors(-1);// suppress per-line / unknown-tag error reports

		// Author's note: the wrap length must be set, otherwise otherwise-valid
		// markup can be broken when Tidy wraps lines inside tags.
		tidy.setWraplen(Integer.MAX_VALUE);
		// Author's note: clean-up of HTML exported by MS Office.
		// NOTE(review): JTidy also has a separate setWord2000 option; confirm which one is intended.
		tidy.setMakeClean(true);
		tidy.setXHTML(true); // output XHTML (plain XML output is also possible)
		tidy.setTidyMark(false); // otherwise Tidy adds a generator <meta> to the output
		tidy.setXmlPi(true); // emit the <?xml version="1.0"?> declaration
		tidy.setInputEncoding(Const.WEB_CHARSET);// input character set
		tidy.setOutputEncoding(Const.WEB_CHARSET);// output character set
		tidy.setForceOutput(true);// emit output even when errors remain; otherwise Tidy outputs nothing on error

		try (ByteArrayOutputStream os2 = new ByteArrayOutputStream(); Reader reader = new StringReader(html)) {
			tidy.parse(reader, os2);
			return os2.toString(Const.WEB_CHARSET);
		}
	}

	/**
	 * Reads an HTML file as text. The file is first decoded with
	 * {@code Const.WEB_CHARSET}; if the HTML declares a different charset that
	 * this JVM supports, the file is read again with that charset. A missing
	 * or empty file yields an empty string.
	 *
	 * @param path
	 *            path of the HTML file
	 * @return the file content, or {@code ""} when the file is missing or empty
	 * @throws IOException
	 *             if the file cannot be read
	 */
	private static String getStringFromHtml(String path) throws IOException {
		File file = new File(path);
		if (!file.exists() || file.length() == 0) {
			return "";
		}
		String txt = FileUtils.readFileToString(file, Const.WEB_CHARSET);
		String charset = StringUtil.getHtmlCharset(txt);
		// Only re-read when the declared charset is usable. A bogus declaration
		// must not discard the text already read and abort the whole conversion.
		if (!Const.WEB_CHARSET.equalsIgnoreCase(charset) && isSupportedCharset(charset)) {
			txt = FileUtils.readFileToString(file, charset);
		}
		return txt;
	}

	/**
	 * Tells whether {@code name} is a charset name this JVM can decode.
	 *
	 * @param name
	 *            charset name, may be {@code null}
	 * @return {@code true} only for a legal, supported charset name
	 */
	private static boolean isSupportedCharset(String name) {
		if (name == null) {
			return false;
		}
		try {
			return Charset.isSupported(name);
		} catch (IllegalArgumentException e) {
			// IllegalCharsetNameException (a subclass) is thrown for syntactically illegal names.
			return false;
		}
	}

}

来做一下代码解释：

1，在 formatHtml 方法中，所有html代码开头插入的语句，实际上是：

<style>@page{size:210mm 297mm;} body{font-family: SimSun;}</style>

这一句非常重要，其中：

@page 是指令，意在指定PDF文件生成的页面大小，而 210*297 正好是 A4 纸张的大小。如果要横向PDF，则交换这两个数字即可。扩大数字，可以使PDF页面变得更大。html 纵向输出时，可能页面很长，可以考虑把竖向值扩大成 500mm

body 对字体的指定，是为了对页面字体方案兜底，万一页面没有设置任何字体方案时，可以直接使用默认的宋体字。这种情况下，只要把宋体字体文件 SIMSUN.TTC 放在项目 /css/fonts/ 目录下即可。

2，html 文件依照正确编码进行读取，以防止乱码的情况，主要在 getStringFromHtml 方法中进行。其中，系统默认读取编码是：

Const.WEB_CHARSET="UTF-8"

这儿涉及到一个先读取出来，再判断当前 html 编码的方法。具体方法代码如下：

/**
 * Matches a charset declaration such as {@code charset=utf-8} or
 * {@code charset = "GBK"}; group 1 is the charset name. Compiled once because
 * {@link Pattern} instances are immutable and reusable.
 */
private static final Pattern HTML_CHARSET_PATTERN = Pattern.compile(
		"charset[\\s]*=[\\s]*['\"]?[\\s]*([a-zA-Z0-9_\\-]+)[\\s]*['\"]?", Pattern.CASE_INSENSITIVE);

/**
 * Finds the charset declared inside an HTML document.
 *
 * <p>The first {@code charset=...} occurrence wins. The keyword is matched
 * case-insensitively, and the captured name may contain only letters, digits,
 * {@code '_'} and {@code '-'}.
 *
 * @param html
 *            HTML text to inspect, may be {@code null}
 * @return the declared charset name, or {@code Const.WEB_CHARSET} when
 *         {@code html} is {@code null} or declares no charset
 */
public static String getHtmlCharset(String html) {
	if (html == null) {
		return Const.WEB_CHARSET;
	}
	Matcher m = HTML_CHARSET_PATTERN.matcher(html);
	return m.find() ? m.group(1) : Const.WEB_CHARSET;
}

实际上，就是从一般 html 页面对 charset 属性的描述中去读取，未找到时默认返回 UTF-8。如上正则表达式所依赖的常规页面编码描述，有如下两种：

<meta charset='utf-8'>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

当然，这2种主要写法，还有很多变体，比如主体部分里面有些位置有空格，单引号和双引号的不同之类的，都是需要正则表达式能考虑在内的。

如此，先用默认 UTF-8 读取一次 HTML 文件，然后分析其自身的编码。如果自身编码与预读取使用的编码不一致，则使用自身编码再读取一次。

3，中文字体文件夹动态支持。这么设计是为了客户在使用系统过程中，发现转换出的有些中文文字空白时，能随时通过向文件夹添加字体文件的方式重新转换，而不需要重启应用服务器，主要为 addFontDirectory 方法。

这种方法，实际是复制的 ITextFontResolver 类的 addFontDirectory 方法的源代码，只是稍作改动，加入了对 ttc 后缀文件的支持。将原方法的字体编码从 CP1252 改成了 IDENTITY_H 。

4，JTidy 的使用。JTidy 是用来格式化 HTML 代码的，这个在转换中是核心，将重点阐述。

a，JTidy 目前有缺陷：因为其最后更新时间是 2009 年，故而缺失对 HTML5 标签的支持，遇到时只能直接做删除处理。

b，测试过 org.jsoup.Jsoup ，其对 html 文本的处理，更偏向于格式化，对于 html 本身内部的错误清洗能力不强，故而在后续交给 itext 转换时，会报各种格式错误。

c，测试过 htmlcleaner ，其相比于 JTidy 而言，未对 js，css 代码块做 CDATA 标签包裹，导致处理出的 HTML 代码，交给后续转换处理时，经常因为 js 代码里面的字符而报错。

综上：在我尝试的方案内，只能使用 JTidy 来处理这个 html 清洗问题，以使其达到传入后续转换 itext 内时不报错。

那么，重点来了，JTidy 的各种设置项简直多到人眼花，而度娘能搜到的设置项，在有些情况下总不尽如人意。经过我逐项参看源代码，目前就设置项如下：

// Configure a JTidy instance for cleaning HTML before it is handed to the PDF renderer.
Tidy tidy = new Tidy();
tidy.setQuiet(true);// do not print Tidy's descriptive chatter to the console
tidy.setShowErrors(-1);// suppress per-line / unknown-tag error reports
tidy.setWraplen(Integer.MAX_VALUE);// author's note: the wrap length must be set, otherwise valid markup can be broken when Tidy wraps lines inside tags
tidy.setMakeClean(true);// author's note: clean-up of HTML exported by MS Office. NOTE(review): JTidy also has a separate setWord2000 option; confirm which one is intended
tidy.setXHTML(true); // output XHTML (plain XML output is also possible)
tidy.setTidyMark(false); // otherwise Tidy adds a generator <meta> to the output
tidy.setXmlPi(true); // emit the <?xml version="1.0"?> declaration
tidy.setInputEncoding(Const.WEB_CHARSET);// input character set
tidy.setOutputEncoding(Const.WEB_CHARSET);// output character set
tidy.setForceOutput(true);// emit output even when errors remain; otherwise Tidy outputs nothing on error

不用怀疑，以上每一行都是经典。在度娘能搜到的配置项中，增加的关于控制台输出的，关于行宽的，关于强制输出的，关于MS office处理的，虽然一起只有短短的几句代码，却是我花了2天时间逐个看源码梳理出来的；且行且珍惜。

5，关于 htmlToPdf 方法的最终兜底，调用了 MsToPdf.officeToPdf。为什么这样写？

实际上，我们系统之前是有转换方法的，是通过 jacob 调用 ms office 软件，来实现转换的，但是，转换效果非常不好，各种表格线/样式等丢失。这才促使我们需要实现一种新的更好的策略来实现 HTML 到 PDF的转换。

不可否认的是，MS office 转换有个替代不了的好处：它总是不会出错，并且能给你转换出一个PDF文件——哪怕这个PDF文件内容排版很丑或者稍有错乱。这才有了上面的兜底策略：我们希望得到更好的，但如果没有更好的，也多少给我一个还能看的，至少比啥都没有强。

写在最后：

在编程领域，技术无处不在。技术不是最新的版本，不是分布式微服务云计算大数据，它是每一个深思熟虑的想法，每一个场景下最合适的处理决策，每一个对外接口所处理的事务边界的思考，是对细节孜孜不倦的追求。不要迷恋新技术新版本，真正的技术，是你对事物原理的掌握。

HTML文件转换PDF文件

猜你喜欢