Hadoop can handle not only text but also binary data, and SequenceFile is one such binary format. It has the following characteristics:
- Splittable
- Supports compression
- Stores data as key-value pairs, one per record
- Sync points can be written into the file
A SequenceFile consists of a header followed by one or more records. The first three bytes are the magic bytes SEQ, followed by one byte for the version number; the header also contains the key class name, the value class name, compression details, user metadata, and a sync marker. Sync markers make it possible to start reading from an arbitrary position in the file.
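This layout is easy to verify with plain Java I/O. Below is a minimal sketch (the method name is made up; it assumes the 1.seq file written by the first example below already exists) that prints the magic bytes and the version byte:

```java
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;

@Test
public void peekHeader() throws Exception {
    // assumes 1.seq has already been written by the example below
    try (DataInputStream in =
             new DataInputStream(new FileInputStream("/home/hadoop/seq/1.seq"))) {
        byte[] magic = new byte[3];
        in.readFully(magic);            // bytes 0-2: the characters 'S', 'E', 'Q'
        byte version = in.readByte();   // byte 3: the format version number
        System.out.println(new String(magic, StandardCharsets.US_ASCII)
                + ", version " + version);
    }
}
```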
The following method generates a SequenceFile whose keys are years and whose values are temperatures.
```java
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.junit.Test;

@Test
public void testWrite() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");   // write to the local file system
    Path p = new Path("/home/hadoop/seq/1.seq");
    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(p),                       // output path
            SequenceFile.Writer.keyClass(IntWritable.class),   // key type
            SequenceFile.Writer.valueClass(IntWritable.class)  // value type
    );
    for (int i = 0; i < 8000; i++) {
        IntWritable year = new IntWritable(1970 + new Random().nextInt(100));      // year
        IntWritable temperature = new IntWritable(new Random().nextInt(100) - 30); // temperature
        writer.append(year, temperature);
    }
    writer.close();
}
```

View the contents of the generated file with the following command:
```
hdfs dfs -text 1.seq
```

To generate a compressed file while writing the SequenceFile:
```java
import org.apache.hadoop.io.compress.DefaultCodec;

/**
 * Write a compressed SequenceFile.
 */
@Test
public void testWriteZipSeqFile() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/2.seq");
    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(p),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(IntWritable.class),
            // block compression with the DefaultCodec (DEFLATE)
            SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DefaultCodec())
    );
    for (int i = 0; i < 8000; i++) {
        IntWritable year = new IntWritable(1970 + new Random().nextInt(100));      // year
        IntWritable temperature = new IntWritable(new Random().nextInt(100) - 30); // temperature
        writer.append(year, temperature);
        if (i % 400 == 0) {
            writer.sync();   // add a sync point every 400 records
        }
    }
    writer.close();
}
```
There are three compression types: block compression (BLOCK), record compression (RECORD, which compresses only the values), and no compression (NONE). Available codecs include DeflateCodec, GzipCodec, BZip2Codec, Lz4Codec, LZO, and SnappyCodec; LZO and Snappy require the corresponding native libraries to be installed on the machine.
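For comparison, switching to record compression only changes the compression option passed to the writer. A minimal sketch, assuming the same test class as above (the 3.seq output path and the method name are made up for illustration):

```java
import org.apache.hadoop.io.compress.BZip2Codec;

@Test
public void testWriteRecordCompressedSeqFile() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/3.seq");   // hypothetical output path
    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(p),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(IntWritable.class),
            // RECORD compression: each value is compressed individually; keys stay uncompressed
            SequenceFile.Writer.compression(SequenceFile.CompressionType.RECORD, new BZip2Codec())
    );
    for (int i = 0; i < 8000; i++) {
        writer.append(new IntWritable(1970 + new Random().nextInt(100)),
                      new IntWritable(new Random().nextInt(100) - 30));
    }
    writer.close();
}
```

BLOCK compression usually achieves a better ratio than RECORD, because it buffers many records and compresses their keys and values together.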
Read operation: reading all the records. Compressed and uncompressed files alike can be read this way, because the reader picks up the compression details from the header.
```java
/**
 * Read operation: loop over all key-value pairs.
 */
@Test
public void readSeqFile() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/1.seq");
    SequenceFile.Reader reader = new SequenceFile.Reader(conf,
            SequenceFile.Reader.file(p)   // input path
    );
    IntWritable key = new IntWritable();
    IntWritable value = new IntWritable();
    while (reader.next(key, value)) {
        System.out.println(key.get() + " : " + value.get());
    }
    reader.close();
}
```

Locating a byte offset in the SequenceFile and reading from there:
```java
/**
 * Read operation: seek to a byte offset, then loop over the remaining key-value pairs.
 */
@Test
public void readSeqFileFromSeek() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/2.seq");
    SequenceFile.Reader reader = new SequenceFile.Reader(conf,
            SequenceFile.Reader.file(p)   // input path
    );
    IntWritable key = new IntWritable();
    IntWritable value = new IntWritable();
    // seek() jumps to an exact byte offset, which must be a record (or sync) boundary;
    // seeking to an arbitrary offset makes the next read fail
    reader.seek(25130);
    while (reader.next(key, value)) {
        System.out.println(reader.getPosition() + " " + key.get() + " : " + value.get());
    }
    reader.close();
}
```
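Since seek() demands an exact record boundary, one way to obtain valid offsets is a first pass that records getPosition() before each next() call. A minimal sketch, assuming the 1.seq file from the first example (this works for files that are not block-compressed; with BLOCK compression getPosition() only advances at block boundaries, so per-record offsets cannot be collected this way):

```java
import java.util.ArrayList;
import java.util.List;

@Test
public void seekToRecordedPosition() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/1.seq");
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(p));
    IntWritable key = new IntWritable();
    IntWritable value = new IntWritable();
    List<Long> offsets = new ArrayList<>();
    long pos = reader.getPosition();   // offset of the first record
    while (reader.next(key, value)) {
        offsets.add(pos);              // a valid seek target for this record
        pos = reader.getPosition();    // offset of the next record
    }
    reader.seek(offsets.get(100));     // jump straight back to the 101st record
    reader.next(key, value);
    System.out.println(key.get() + " : " + value.get());
    reader.close();
}
```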
```java
/**
 * Read operation: advance to the next sync point after a given offset,
 * then loop over the remaining key-value pairs.
 */
@Test
public void readSeqFileFromSync() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/2.seq");
    SequenceFile.Reader reader = new SequenceFile.Reader(conf,
            SequenceFile.Reader.file(p)   // input path
    );
    IntWritable key = new IntWritable();
    IntWritable value = new IntWritable();
    reader.sync(23810);   // start reading at the first sync point after this offset
    while (reader.next(key, value)) {
        System.out.println(reader.getPosition() + " " + key.get() + " : " + value.get());
    }
    reader.close();
}
```
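To see where the sync points added by testWriteZipSeqFile actually landed, SequenceFile.Reader.syncSeen() reports whether the previous next() call passed a sync marker. A minimal sketch (the method name is made up):

```java
@Test
public void listSyncPoints() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");
    Path p = new Path("/home/hadoop/seq/2.seq");
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(p));
    IntWritable key = new IntWritable();
    IntWritable value = new IntWritable();
    while (reader.next(key, value)) {
        if (reader.syncSeen()) {   // the last next() crossed a sync marker
            System.out.println("sync marker passed just before position " + reader.getPosition());
        }
    }
    reader.close();
}
```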