首页 > 代码库 > 代码片段,使用TIKA来解析PDF,WORD和EMAIL

代码片段,使用TIKA来解析PDF,WORD和EMAIL

/**
 * com.jiaoyiping.pdstest.TestTika.java
 * Copyright (c) 2009 Hewlett-Packard Development Company, L.P.
 * All rights reserved.
 */
package com.jiaoyiping.pdstest;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mail.RFC822Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * <pre>
 * Desc: Sample JUnit tests that run a PDF file, a Word file and an .eml file
 *       through the matching Apache Tika parser and write the body text
 *       emitted by the parser to a result file.
 * @author Jiao Yiping
 * @refactor Jiao Yiping
 * @date   2014-12-04 13:31:09
 * @version 1.0
 * @see
 * REVISIONS:
 * Version    Date          Author         Description
 * -------------------------------------------------------------------
 * 1.0        2014-12-04    Jiao Yiping    1. Created this class.
 * </pre>
 */
public class TestTika {

    /** Output file shared by all three tests; each run overwrites it. */
    private static final String RESULT_FILE = "C:\\Users\\Administrator\\Desktop\\result.txt";

    /**
     * Parses a PDF file and prints the elapsed time in whole seconds.
     *
     * @throws Exception if a file cannot be opened or the parser fails
     */
    @Test
    public void testPdf() throws Exception {
        long start = System.currentTimeMillis();
        parseToFile(new PDFParser(), "D:\\我的微盘\\文档\\参考文档\\Linux Shell脚本攻略.pdf", RESULT_FILE);
        long used = (System.currentTimeMillis() - start) / 1000;
        System.out.println("耗时: " + used + "秒");
    }

    /**
     * Parses a Word (.doc) file and prints the elapsed time in whole seconds.
     *
     * @throws Exception if a file cannot be opened or the parser fails
     */
    @Test
    public void testWrod() throws Exception {
        long start = System.currentTimeMillis();
        parseToFile(new OfficeParser(), "D:\\我的微盘\\文档\\参考文档\\jBPM5_用户指南中文版.doc", RESULT_FILE);
        long used = (System.currentTimeMillis() - start) / 1000;
        System.out.println("耗时:" + used + "秒");
    }

    /**
     * Parses an email file and prints the elapsed time in whole seconds.
     *
     * <p>Original author's notes: only the standard .eml format can be parsed,
     * not Microsoft's .msg format. Parsing with commons-email yields metadata
     * such as recipients, sender, subject and content; whether Tika supports
     * that has not been tried.
     *
     * @throws Exception if a file cannot be opened or the parser fails
     */
    @Test
    public void testEmail() throws Exception {
        long start = System.currentTimeMillis();
        parseToFile(new RFC822Parser(),
                "C:\\Users\\Administrator\\Downloads\\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml",
                RESULT_FILE);
        long used = (System.currentTimeMillis() - start) / 1000;
        System.out.println("耗时:" + used + "秒");
    }

    /**
     * Feeds the file at {@code sourcePath} to {@code parser} and streams the
     * content passed to a {@link BodyContentHandler} into {@code targetPath}.
     *
     * <p>Both streams are closed in {@code finally} blocks so the file handles
     * are released even when parsing throws.
     *
     * @param parser     the Tika parser to run
     * @param sourcePath path of the document to parse
     * @param targetPath path of the file that receives the handler output
     * @throws Exception if a file cannot be opened or the parser fails
     */
    private static void parseToFile(Parser parser, String sourcePath, String targetPath) throws Exception {
        // Open the input first so a missing source file fails before the target is created/truncated.
        InputStream in = new BufferedInputStream(new FileInputStream(new File(sourcePath)));
        try {
            OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(targetPath)));
            try {
                Metadata meta = new Metadata();
                meta.add(Metadata.CONTENT_ENCODING, "utf-8");
                ContentHandler handler = new BodyContentHandler(out);
                parser.parse(in, handler, meta, new ParseContext());
            } finally {
                // close() also flushes anything still buffered for the result file.
                out.close();
            }
        } finally {
            in.close();
        }
    }
}

  

 

代码片段,使用TIKA来解析PDF,WORD和EMAIL