首页 > 代码库 > python_读取 doc,docx,pdf
python_读取 doc,docx,pdf
#!/usr/bin/env python # -*- coding: utf-8 -*- import docx from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import TextConverter from pdfminer.layout import LAParams from pdfminer.pdfpage import PDFPage from cStringIO import StringIO from win32com import client import sys reload(sys) sys.setdefaultencoding(‘gb2312‘) def readDocx(docxPath): fullText = [] doc = docx.Document(docxPath) paras = doc.paragraphs for p in paras: fullText.append(p.text.strip()) return ‘\n‘.join(fullText) def readPdf(pdfPath): rsrcmgr = PDFResourceManager() retstr = StringIO() codec = ‘utf-8‘ laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) fp = file(pdfPath, ‘rb‘) interpreter = PDFPageInterpreter(rsrcmgr, device) password = "" maxpages = 0 caching = True pagenos=set() for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True): interpreter.process_page(page) fp.close() device.close() str = retstr.getvalue() retstr.close() return str def readDoc(docPath): fullText = [] word = client.Dispatch(‘Word.Application‘) # 打开一个已存在的文件 doc = word.Documents.Open(docPath) #print doc.Content #print text doc.SaveAs(‘c:/temp.txt‘, 2) # 关闭 doc.Close() word.Quit() f=open(r‘c:/temp.txt‘,‘r‘) for line in f.readlines(): #f len(line)!=line.count(‘\n‘): fullText.append(line.decode(‘gbk‘).strip()) f.close() return ‘\n‘.join(fullText) if __name__ == ‘__main__‘: #docxValue=http://www.mamicode.com/readDocx(‘d:/1.docx‘) #print docxValue #pdfValue = http://www.mamicode.com/readPdf(‘d:/3.pdf‘) #print pdfValue docValue = http://www.mamicode.com/readDoc(‘d:/2.doc‘) print docValue
python_读取 doc,docx,pdf
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉(投诉/举报),工作人员会在5个工作日内联系你;一经查实,本站将立刻删除涉嫌侵权内容。