天天看點

Hadoop 檔案格式

檔案格式:SequenceFile

------------------

    1.SequenceFile

        Key-Value對方式。

    2.不是文本檔案,是二進制檔案。

    3.可切割

        因為有同步點。

        reader.sync(pos);    //定位到pos之後的第一個同步點。

        writer.sync();        //寫入同步點

    4.壓縮方式

        不壓縮

        record壓縮            //只壓縮value

        塊壓縮                //按照多個record形成一個block.

package com.it18zhang.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.junit.Test;

import java.io.IOException;

/**
 * JUnit exercises for Hadoop {@link SequenceFile}: writing (plain and
 * block-compressed), sequential reading, seeking and sync-point positioning.
 *
 * <p>All tests operate on the local file system ({@code file:///}) and share
 * the same file, {@code d:/seq/1.seq}; the read tests expect one of the write
 * tests to have produced it beforehand.
 */
public class TestSeqFile {

    /** Location of the sequence file shared by every test in this class. */
    private static final String SEQ_FILE = "d:/seq/1.seq";

    /**
     * Builds a configuration whose default file system is the local one.
     */
    private static Configuration newLocalConf() {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS","file:///");
        return conf;
    }

    /**
     * Appends the sample data used by both write tests: ten records
     * ({@code i -> "tom" + i}) each followed by a sync point, then the same
     * ten records again with a sync point only after the even keys.
     */
    private static void appendSampleRecords(SequenceFile.Writer writer) throws IOException {
        for (int i = 0; i < 10; i++) {
            writer.append(new IntWritable(i), new Text("tom" + i));
            // Add a sync point; sync markers let a reader that starts at an
            // arbitrary offset recognise record boundaries.
            writer.sync();
        }
        for (int i = 0; i < 10; i++) {
            writer.append(new IntWritable(i), new Text("tom" + i));
            if (i % 2 == 0) {
                writer.sync();
            }
        }
    }

    /**
     * Write test: creates an uncompressed sequence file with IntWritable keys
     * and Text values.
     */
    @Test
    public void save() throws Exception {
        Configuration conf = newLocalConf();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(SEQ_FILE);
        // try-with-resources guarantees the writer is closed even if an
        // append or sync fails part-way through.
        try (SequenceFile.Writer writer =
                     SequenceFile.createWriter(fs, conf, p, IntWritable.class, Text.class)) {
            appendSampleRecords(writer);
        }
    }

    /**
     * Write test with compression: same data as {@link #save()}, but written
     * with BLOCK compression using the gzip codec.
     */
    @Test
    public void zipGzip() throws Exception {
        Configuration conf = newLocalConf();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(SEQ_FILE);
        try (SequenceFile.Writer writer = SequenceFile.createWriter(fs,
                conf,
                p,
                IntWritable.class,
                Text.class,
                SequenceFile.CompressionType.BLOCK,
                new GzipCodec())) {
            appendSampleRecords(writer);
        }
    }

    /**
     * Read test: iterates over the file and prints every key-value pair.
     */
    @Test
    public void read() throws Exception {
        Configuration conf = newLocalConf();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(SEQ_FILE);
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf)) {
            IntWritable key = new IntWritable();
            Text value = new Text();
            while (reader.next(key, value)) {
                System.out.println(key.get() + " : " + value.toString());
            }
        }
    }

    /**
     * Read test: advances key by key and fetches the current value
     * separately via {@code getCurrentValue}, printing only the values.
     */
    @Test
    public void read2() throws Exception {
        Configuration conf = newLocalConf();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(SEQ_FILE);
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf)) {
            IntWritable key = new IntWritable();
            Text value = new Text();
            while (reader.next(key)) {
                reader.getCurrentValue(value);
                System.out.println(value.toString());
            }
        }
    }

    /**
     * Read test: seeks to a fixed byte offset and prints the value of the
     * record read from there.
     */
    @Test
    public void read3() throws Exception {
        Configuration conf = newLocalConf();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(SEQ_FILE);
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf)) {
            IntWritable key = new IntWritable();
            Text value = new Text();
            // NOTE(review): 288 is a hard-coded offset; presumably it is a
            // record boundary in the file produced by save() - confirm.
            reader.seek(288);

            reader.next(key, value);
            System.out.println(value.toString());
        }
    }

    /**
     * Sync-point test: positions the reader via {@code sync(648)} (the first
     * sync point after that offset) and prints every remaining record along
     * with the reader position after each read.
     */
    @Test
    public void read4() throws Exception {
        Configuration conf = newLocalConf();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(SEQ_FILE);
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf)) {
            IntWritable key = new IntWritable();
            Text value = new Text();

            reader.sync(648);
            while (reader.next(key, value)) {
                System.out.println(reader.getPosition() + "   " + key.get() + "-" + value.toString());
            }
        }
    }
}
           

檔案格式:MapFile

--------------------

    1.Key-value

    2.key按升序寫入(可重複)。

    3.mapFile對應一個目錄,目錄下有index和data檔案,都是序列檔案。

    4.index檔案劃分key區間,用於快速定位。

繼續閱讀