首页 > 代码库 > 如何读取Hadoop中压缩的文件

如何读取Hadoop中压缩的文件

最近在处理离线数据导入 HBase 的问题,其中涉及从 HDFS 中读取 gz 压缩文件,在此把思路记录下来,以备后用。具体代码如下:

package org.dba.util;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.io.PrintStream;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.compress.CompressionCodec;import org.apache.hadoop.io.compress.CompressionCodecFactory;import org.apache.hadoop.io.compress.CompressionInputStream;public class ReadHdfs {    public static void ReadFile(String fileName) throws IOException{        Configuration conf = new Configuration();        Path file = new Path(fileName);        FileSystem fs = FileSystem.get(conf);        FSDataInputStream hdfsInstream = fs.open(file);        CompressionCodecFactory factory = new CompressionCodecFactory(conf);        CompressionCodec codec = factory.getCodec(file);        BufferedReader reader = null;        try{            if(codec == null){                reader = new BufferedReader(new InputStreamReader(hdfsInstream));            }else{                CompressionInputStream comInStream = codec.createInputStream(hdfsInstream);                reader = new BufferedReader(new InputStreamReader(comInStream));                System.out.println(reader.readLine().substring(0, 100));            }        }catch(Exception e){            e.printStackTrace();        }    }    public static void main(String[] args) throws IOException{        ReadFile(args[0]);    }}

 

如何读取Hadoop中压缩的文件