首页 > 代码库 > 文库关键技术之PDF转换为HTML
文库关键技术之PDF转换为HTML
Aspose.PDF的使用方法,官网上介绍的都有,但是都比较简单,主要是起示例作用。
结合自己实际工作,我对相关方法进行了整理,自认为比较关键的地方,会作简要说明。
在调用 Pdf2Html_NoSplit 将 PDF 转换成 HTML 之前,必须先为 filepath(PDF 源文件的完整路径)和 folder(输出目录)赋值,切记切记。
/// <summary>
/// Helpers built on Aspose.PDF: converts a PDF into a single HTML page, renders a
/// thumbnail of the first page, and extracts the plain text of a document.
/// </summary>
/// <remarks>
/// <see cref="filepath"/> and <see cref="folder"/> must both be set (via the
/// two-argument constructor or the property setters) before calling
/// <see cref="Pdf2Html_NoSplit"/>.
/// </remarks>
class PdfTools
{
    /// <summary>
    /// Full path of the source PDF file.
    /// </summary>
    public string filepath { get; set; }

    /// <summary>
    /// Name of the source file without directory and without extension. It is used
    /// as the base name of the generated .html and .css files.
    /// </summary>
    private string filename
    {
        // GetFileNameWithoutExtension copes with names that contain no dot (the old
        // Substring(0, IndexOf('.')) threw) and keeps inner dots in "a.b.pdf".
        get { return Path.GetFileNameWithoutExtension(filepath); }
    }

    /// <summary>
    /// Destination directory. Every generated file (HTML, CSS, fonts, images) is
    /// written directly into this directory.
    /// </summary>
    public string folder { get; set; }

    /// <summary>
    /// Creates an instance whose <see cref="filepath"/> and <see cref="folder"/>
    /// still have to be assigned by the caller.
    /// </summary>
    public PdfTools()
    {
    }

    /// <summary>
    /// Creates an instance for the given source PDF and destination directory.
    /// </summary>
    /// <param name="_filepath">Full path of the source PDF file.</param>
    /// <param name="_folder">
    /// Destination directory; forward slashes are converted to backslashes and a
    /// trailing backslash is appended when missing.
    /// </param>
    public PdfTools(string _filepath, string _folder)
    {
        filepath = _filepath;
        folder = NormalizeFolder(_folder);
    }

    /// <summary>
    /// Converts the PDF at <see cref="filepath"/> into one complete HTML file named
    /// after the source file and stored in <see cref="folder"/>.
    /// </summary>
    /// <remarks>
    /// If <see cref="folder"/> already exists it is deleted recursively and
    /// recreated before the PDF is loaded, so the source PDF must not be located
    /// inside it.
    /// </remarks>
    /// <returns>
    /// <c>true</c> when the HTML was saved; <c>false</c> when <see cref="folder"/>
    /// is not set, the source file does not exist, or saving failed (in which case
    /// the destination directory is removed again).
    /// </returns>
    public bool Pdf2Html_NoSplit()
    {
        // An unset folder would otherwise fail with a NullReferenceException, and an
        // empty one would normalize to "\" and be deleted recursively below.
        if (string.IsNullOrEmpty(folder) || !File.Exists(filepath))
        {
            return false;
        }

        folder = NormalizeFolder(folder);

        // Start from an empty destination directory.
        if (Directory.Exists(folder))
        {
            Directory.Delete(folder, true);
        }
        Directory.CreateDirectory(folder);

        string outpath = folder + filename + ".html";

        using (Document doc = new Document(filepath))
        {
            HtmlSaveOptions hso = new HtmlSaveOptions();
            hso.SplitIntoPages = false;
            hso.FixedLayout = true;
            hso.CompressSvgGraphicsIfAny = true;

            // All resources are kept in one flat directory, so the three saving
            // strategies below are overridden.
            hso.CustomResourceSavingStrategy = new HtmlSaveOptions.ResourceSavingStrategy(ResourceStrategy);
            hso.CustomCssSavingStrategy = new HtmlSaveOptions.CssSavingStrategy(CssStrategy);
            hso.CustomStrategyOfCssUrlCreation = new HtmlSaveOptions.CssUrlMakingStrategy(CssUrlStrategy);
            hso.FontSavingMode = HtmlSaveOptions.FontSavingModes.AlwaysSaveAsTTF;

            try
            {
                doc.Save(outpath, hso);
                return true;
            }
            catch (Exception)
            {
                // Conversion is best effort: report failure through the return value
                // and do not leave a half-written output directory behind.
                TryDeleteFolder(folder);
                return false;
            }
        }
    }

    /// <summary>
    /// Renders the first page of a PDF as a PNG thumbnail.
    /// </summary>
    /// <param name="_input">Full path of the source PDF file.</param>
    /// <param name="_output">Path of the image file to write; an existing file is replaced.</param>
    public void CreateThumb(string _input, string _output)
    {
        // Load the document before touching the output so that an unreadable PDF
        // does not destroy an existing thumbnail.
        using (Document doc = new Document(_input))
        {
            if (File.Exists(_output))
            {
                File.Delete(_output);
            }

            using (FileStream imgstream = new FileStream(_output, FileMode.Create))
            {
                // Rendering resolution of the thumbnail.
                Aspose.Pdf.Devices.Resolution resolution = new Aspose.Pdf.Devices.Resolution(100);
                Aspose.Pdf.Devices.PngDevice device = new Aspose.Pdf.Devices.PngDevice(resolution);

                // Page indexes start at 1 in this API (see the loop in GetTxtFromPDF).
                device.Process(doc.Pages[1], imgstream);
            }
        }
    }

    /// <summary>
    /// Extracts the text of every page of a PDF as a single line of text.
    /// </summary>
    /// <param name="_input">Full path of the source PDF file.</param>
    /// <returns>
    /// The text of all pages with CRLF line breaks replaced by spaces and runs of
    /// spaces collapsed to one space, or <c>null</c> when a page could not be read.
    /// </returns>
    public string GetTxtFromPDF(string _input)
    {
        StringBuilder sb = new StringBuilder();

        using (Document doc = new Document(_input))
        {
            for (int i = 1; i <= doc.Pages.Count; i++)
            {
                Aspose.Pdf.Text.TextAbsorber ab = new Aspose.Pdf.Text.TextAbsorber();
                ab.TextSearchOptions.LimitToPageBounds = true;
                try
                {
                    doc.Pages[i].Accept(ab);
                    sb.Append(ab.Text);
                }
                catch (Exception)
                {
                    // Callers treat null as "extraction failed".
                    return null;
                }
            }
        }

        string output = sb.Replace("\r\n", " ").ToString();

        // Collapse every run of two or more spaces in one pass. The previous loop
        // replaced a space with a space and therefore could never terminate.
        return System.Text.RegularExpressions.Regex.Replace(output, " {2,}", " ");
    }

    /// <summary>
    /// Saves a font or image resource into <see cref="folder"/> and returns the
    /// name under which the generated HTML should reference it. Images are given
    /// a fresh GUID-based name; other resources keep their suggested name.
    /// </summary>
    /// <param name="resource">Resource description and content supplied by Aspose.PDF.</param>
    /// <returns>File name of the saved resource, relative to <see cref="folder"/>.</returns>
    private string ResourceStrategy(SaveOptions.ResourceSavingInfo resource)
    {
        string resourcename;
        if (resource.ResourceType == SaveOptions.NodeLevelResourceType.Image)
        {
            resourcename = Guid.NewGuid().ToString() + Path.GetExtension(resource.SupposedFileName);
        }
        else
        {
            resourcename = resource.SupposedFileName;
        }

        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(folder);

        string outfile = folder + resourcename;

        // Check the path that is actually written; checking the bare resource name
        // looked in the process working directory instead of the output folder.
        if (File.Exists(outfile))
        {
            return resourcename;
        }

        WriteStreamToFile(resource.ContentStream, outfile);
        return resourcename;
    }

    /// <summary>
    /// Writes the stylesheet of the generated page to
    /// <see cref="folder"/>\&lt;filename&gt;.css.
    /// </summary>
    /// <param name="resource">CSS content supplied by Aspose.PDF.</param>
    private void CssStrategy(HtmlSaveOptions.CssSavingInfo resource)
    {
        string path = folder + filename + @".css";
        WriteStreamToFile(resource.ContentStream, path);
    }

    /// <summary>
    /// Returns the URL under which the generated HTML references its stylesheet.
    /// </summary>
    /// <param name="resource">Request information supplied by Aspose.PDF (unused).</param>
    /// <returns>The stylesheet file name, relative to the HTML file.</returns>
    private string CssUrlStrategy(HtmlSaveOptions.CssUrlRequestInfo resource)
    {
        return filename + ".css";
    }

    /// <summary>
    /// Converts forward slashes to backslashes and makes sure the path ends with a
    /// backslash. Null or empty input is returned unchanged.
    /// </summary>
    private static string NormalizeFolder(string path)
    {
        if (string.IsNullOrEmpty(path))
        {
            return path;
        }

        string normalized = path.Replace("/", @"\");
        if (!normalized.EndsWith(@"\", StringComparison.Ordinal))
        {
            normalized = normalized + @"\";
        }
        return normalized;
    }

    /// <summary>
    /// Copies the remaining content of <paramref name="content"/> into a new file
    /// at <paramref name="path"/>, replacing any existing file. The source stream
    /// is left open for its owner.
    /// </summary>
    private static void WriteStreamToFile(Stream content, string path)
    {
        using (FileStream target = new FileStream(path, FileMode.Create, FileAccess.Write))
        {
            content.CopyTo(target);
        }
    }

    /// <summary>
    /// Deletes a directory tree, ignoring I/O and permission failures so that
    /// cleanup can never mask the original error.
    /// </summary>
    private static void TryDeleteFolder(string path)
    {
        try
        {
            if (Directory.Exists(path))
            {
                Directory.Delete(path, true);
            }
        }
        catch (IOException)
        {
            // Best-effort cleanup only.
        }
        catch (UnauthorizedAccessException)
        {
            // Best-effort cleanup only.
        }
    }
}
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。