package org.apache.hadoop.mapred;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * A {@link RecordReader} for {@link SequenceFile}s.
 *
 * <p>Reads the segment of a file described by a {@link FileSplit} (the split's
 * start offset and length define the segment). An {@code InputFormat}'s
 * {@code getRecordReader(InputSplit, JobConf, Reporter)} can return an instance
 * of this class to process a {@code FileSplit}, producing the key/value pairs
 * that are fed to the {@code map} function.
 *
 * @param <K> key type stored in the sequence file
 * @param <V> value type stored in the sequence file
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class SequenceFileRecordReader<K, V> implements RecordReader<K, V> {

  /** Underlying reader over the whole sequence file. */
  private SequenceFile.Reader in;
  /** First byte position of this split within the file (after syncing). */
  private long start;
  /** Byte position at which this split ends (split start + split length). */
  private long end;
  /** False once the reader has consumed past the end of the split. */
  private boolean more = true;
  protected Configuration conf;

  /**
   * Opens the split's file and positions the reader at the first sync point
   * at or after the split's start offset.
   *
   * @param conf  job configuration, also used to instantiate key/value objects
   * @param split the byte range of the file this reader is responsible for
   * @throws IOException if the file cannot be opened or seeked
   */
  public SequenceFileRecordReader(Configuration conf, FileSplit split)
      throws IOException {
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = split.getStart() + split.getLength();
    this.conf = conf;

    if (split.getStart() > in.getPosition()) {
      in.sync(split.getStart());                  // sync to start
    }

    // Record the actual start position: the sync above may have advanced the
    // reader past split.getStart() to the next record boundary.
    this.start = in.getPosition();
    more = start < end;
  }

  /** The class of key that must be passed to {@link #next(Object, Object)}. */
  public Class getKeyClass() { return in.getKeyClass(); }

  /** The class of value that must be passed to {@link #next(Object, Object)}. */
  public Class getValueClass() { return in.getValueClass(); }

  /**
   * Creates a key instance of the file's key class, for callers to pass into
   * {@link #next(Object, Object)} (which fills the caller-supplied objects).
   */
  @SuppressWarnings("unchecked")
  public K createKey() {
    return (K) ReflectionUtils.newInstance(getKeyClass(), conf);
  }

  /**
   * Creates a value instance of the file's value class, for callers to pass
   * into {@link #next(Object, Object)}.
   */
  @SuppressWarnings("unchecked")
  public V createValue() {
    return (V) ReflectionUtils.newInstance(getValueClass(), conf);
  }

  /**
   * Reads the next key/value pair into the supplied objects.
   *
   * @return true if a record was read and more may follow; false at end of split
   * @throws IOException on read failure
   */
  public synchronized boolean next(K key, V value) throws IOException {
    if (!more) {
      return false;
    }
    long pos = in.getPosition();
    boolean remaining = (in.next(key) != null);
    if (remaining) {
      getCurrentValue(value);
    }
    // A reader may legitimately run past `end` until it crosses a sync marker;
    // stop only once we are past the split end AND a sync point has been seen.
    if (pos >= end && in.syncSeen()) {
      more = false;
    } else {
      more = remaining;
    }
    return more;
  }

  /**
   * Reads only the next key (skipping its value), used by subclasses that
   * want to skip records.
   */
  protected synchronized boolean next(K key) throws IOException {
    if (!more) {
      return false;
    }
    long pos = in.getPosition();
    boolean remaining = (in.next(key) != null);
    if (pos >= end && in.syncSeen()) {
      more = false;
    } else {
      more = remaining;
    }
    return more;
  }

  /** Deserializes the value of the record most recently read by {@code next}. */
  protected synchronized void getCurrentValue(V value) throws IOException {
    in.getCurrentValue(value);
  }

  /**
   * Return the progress within the input split.
   *
   * @return 0.0 to 1.0 of the input byte range
   */
  public float getProgress() throws IOException {
    if (end == start) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (in.getPosition() - start) / (float) (end - start));
    }
  }

  /** @return the reader's current byte position in the file */
  public synchronized long getPos() throws IOException {
    return in.getPosition();
  }

  /** Repositions the underlying reader to an absolute byte offset. */
  protected synchronized void seek(long pos) throws IOException {
    in.seek(pos);
  }

  /** Closes the underlying {@link SequenceFile.Reader}. */
  public synchronized void close() throws IOException {
    in.close();
  }
}