首页 > 代码库 > 代码片段,使用TIKA来解析PDF,WORD和EMAIL
代码片段,使用TIKA来解析PDF,WORD和EMAIL
/** * com.jiaoyiping.pdstest.TestTika.java * Copyright (c) 2009 Hewlett-Packard Development Company, L.P. * All rights reserved. */package com.jiaoyiping.pdstest;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.InputStream;import java.io.OutputStream;import org.apache.tika.metadata.Metadata;import org.apache.tika.parser.ParseContext;import org.apache.tika.parser.Parser;import org.apache.tika.parser.mail.RFC822Parser;import org.apache.tika.parser.microsoft.OfficeParser;import org.apache.tika.parser.pdf.PDFParser;import org.apache.tika.sax.BodyContentHandler;import org.junit.Test;import org.xml.sax.ContentHandler;/** * <pre> * Desc: * @author 焦一平 * @refactor 焦一平 * @date 2014年12月4日 下午1:31:09 * @version 1.0 * @see * REVISIONS: * Version Date Author Description * ------------------------------------------------------------------- * 1.0 2014年12月4日 焦一平 1. Created this class. * </pre> */public class TestTika { //解析PDF @Test public void testPdf() throws Exception{ Long start = System.currentTimeMillis(); Parser parser = new PDFParser(); InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\\我的微盘\\文档\\参考文档\\Linux Shell脚本攻略.pdf"))); OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt"))); Metadata meta = new Metadata(); meta.add(Metadata.CONTENT_ENCODING, "utf-8"); ContentHandler iHandler = new BodyContentHandler(os); parser.parse(is, iHandler, meta, new ParseContext()); Long end = System.currentTimeMillis(); Long used = (end-start)/1000; System.out.println("耗时: "+used+"秒"); } //解析Word @Test public void testWrod() throws Exception{ Long start = System.currentTimeMillis(); Parser parser = new OfficeParser(); InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\\我的微盘\\文档\\参考文档\\jBPM5_用户指南中文版.doc"))); OutputStream os = new BufferedOutputStream(new 
FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt"))); Metadata meta = new Metadata(); meta.add(Metadata.CONTENT_ENCODING, "utf-8"); ContentHandler iHandler = new BodyContentHandler(os); parser.parse(is, iHandler, meta, new ParseContext()); Long end = System.currentTimeMillis(); Long used = (end-start)/1000; System.out.println("耗时:"+used+"秒"); } //解析EMAIL(只能解析标准的eml格式的,不能解析微软的msg格式) //使用commons-email来进行解析的可以得到收件人、发件人、主题、内容等元数据,TIkA是否支持未尝试 @Test public void testEmail() throws Exception{ Long start = System.currentTimeMillis(); Parser parser = new RFC822Parser(); InputStream is = new BufferedInputStream(new FileInputStream(new File("C:\\Users\\Administrator\\Downloads\\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml"))); OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt"))); Metadata meta = new Metadata(); meta.add(Metadata.CONTENT_ENCODING, "utf-8"); ContentHandler iHandler = new BodyContentHandler(os); parser.parse(is, iHandler, meta, new ParseContext()); Long end = System.currentTimeMillis(); Long used = (end-start)/1000; System.out.println("耗时:"+used+"秒"); }}
代码片段,使用TIKA来解析PDF,WORD和EMAIL
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉(投诉/举报),工作人员会在5个工作日内联系你;一经查实,本站将立刻删除涉嫌侵权内容。