首页 > 代码库 > Python 将pdf转换成txt(不处理图片)

Python 将pdf转换成txt(不处理图片)



  PDFMiner 简介:PDFMiner is a tool for extracting information from PDF documents. Unlike other PDF-related tools, it focuses entirely on getting and analyzing text data. 有兴趣的同学请到官网查看详细说明。通过 PDFMiner 自带的小工具 pdf2txt.py,便能将 pdf 转换成 txt,而且仍保留 pdf 中的格式,超赞!


# -*- coding: utf-8 -*-
# -----------------------------------------------------
#   Purpose:  convert a PDF to plain text (images are not processed)
#   Author:   chenbjin
#   Date:     2014-07-11
#   Language: Python 2.7.6
#   Platform: linux (ubuntu)
#             PDFMiner20140328 (must be installed)
#   Usage:    python pdf2txt.py file.pdf
# -----------------------------------------------------
import sys

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def main(argv):
    """Extract the text of the PDF named by ``argv[1]`` into ``argv[1] + '.txt'``.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the path of the
            PDF to convert. Only this single document is processed.

    Returns:
        None on success, or 2 when no input file was given.
    """
    if len(argv) < 2:
        # Report a usage error instead of failing with a bare IndexError.
        sys.stderr.write('usage: python pdf2txt.py file.pdf\n')
        return 2

    # Output file name; only a single document is handled, so only
    # argv[1] is used.
    outfile = argv[1] + '.txt'
    args = [argv[1]]

    debug = 0
    pagenos = set()
    password = ''
    maxpages = 0
    rotation = 0
    codec = 'utf-8'  # output encoding
    caching = True
    imagewriter = None
    laparams = LAParams()

    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)

    # `with` / `finally` close the output file, the converter and every
    # input file even when extraction raises part-way through.
    with open(outfile, 'w') as outfp:
        # PDF conversion device
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    return None


if __name__ == '__main__':
    sys.exit(main(sys.argv))

  下一步将尝试将pdf中的图片进行转换,可以通过http://denis.papathanasiou.org/2010/08/04/extracting-text-images-from-pdf-files/ 进行了解。

