收藏一些开发过程中常用的代码段。下面的代码演示如何在 C# 中使用 iTextSharp 将 PDF 转换为文本。
using System;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public class ParsingPDF {
// Path of the input PDF; Main assigns it from the first command-line argument.
static string PDF;
// Path of the output text file; Main takes it from the second argument or
// derives it from the PDF file name when only one argument is given.
static string TEXT2;
/// <summary>
/// Reads the content stream of every page of a PDF and writes each string
/// token found in it to a text file, one token per line.
/// </summary>
/// <param name="src">Path of the PDF file to read.</param>
/// <param name="dest">
/// Path of the text file to write. It is opened with FileMode.Create, so an
/// existing file at that path is overwritten.
/// </param>
public void parsePdf(String src, String dest)
{
    PdfReader reader = new PdfReader(src);
    try
    {
        // The using block flushes and closes the writer (and the FileStream it
        // wraps) even when tokenizing throws, so the output file is never left open.
        using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
        {
            int pageCount = reader.NumberOfPages;

            // Pages are addressed from 1 through pageCount inclusive.
            for (int pg = 1; pg <= pageCount; pg++)
            {
                byte[] streamBytes = reader.GetPageContent(pg);
                PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                while (tokenizer.NextToken())
                {
                    // Only string tokens are emitted; every other token type is skipped.
                    if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                    {
                        output.WriteLine(tokenizer.StringValue);
                    }
                }
            }
        }
    }
    finally
    {
        // Release the reader on both the success and the failure path.
        reader.Close();
    }
}
/// <summary>
/// Command-line entry point: converts the PDF named by the first argument to
/// a text file and reports the elapsed time, or prints an error message if
/// the conversion throws.
/// </summary>
/// <param name="args">
/// One or two arguments: the input PDF path and, optionally, the output text
/// path. With a single argument the output name is the PDF's file name
/// (directory and extension removed) followed by ".txt".
/// </param>
static void Main(string[] args)
{
    if (args.Length < 1 || args.Length > 2)
    {
        Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
        return;
    }

    PDF = args[0];
    TEXT2 = args.Length == 2
        ? args[1]
        : Path.GetFileNameWithoutExtension(PDF) + ".txt";

    try
    {
        // Stopwatch is monotonic, so the measurement is not affected by
        // system clock adjustments the way DateTime.Now subtraction is.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        ParsingPDF example = new ParsingPDF();
        example.parsePdf(PDF, TEXT2);
        timer.Stop();
        Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
    }
    catch (Exception ex)
    {
        // Top-level handler: report the failure instead of crashing the console app.
        Console.WriteLine("ERROR: " + ex.Message);
    }
}
public class MyTextRenderListener : IRenderListener
{
// Writer that receives everything this listener emits.
protected StreamWriter output;
/// <summary>
/// Creates a listener that writes to the supplied writer.
/// </summary>
/// <param name="output">Destination writer; it is only stored here, not opened or closed.</param>
public MyTextRenderListener(StreamWriter output)
{
this.output = output;
}
/// <summary>
/// Writes an opening angle bracket to the output writer.
/// NOTE(review): presumably invoked through IRenderListener at the start of a text block - confirm.
/// </summary>
public void BeginTextBlock()
{
    this.output.Write("<");
}
/// <summary>
/// Writes a closing angle bracket followed by a line terminator to the output writer.
/// NOTE(review): presumably invoked through IRenderListener at the end of a text block - confirm.
/// </summary>
public void EndTextBlock()
{
    this.output.WriteLine(">");
}
/// <summary>
/// Intentionally does nothing: this listener produces no output for images.
/// </summary>
/// <param name="renderInfo">Image render information; unused.</param>
public void RenderImage(ImageRenderInfo renderInfo)
{
}
/// <summary>
/// Writes the text of one render operation to the output writer, wrapped in
/// angle brackets.
/// </summary>
/// <param name="renderInfo">Render information whose GetText() result is written.</param>
public void RenderText(TextRenderInfo renderInfo)
{
    StreamWriter sink = this.output;

    // The opening bracket is written before the text is fetched, matching the
    // order in which output appears if GetText() were to fail.
    sink.Write("<");
    string text = renderInfo.GetText();
    sink.Write(text);
    sink.Write(">");
}