2021SC@SDUSC
上篇,我们分析了NonSpillableDataBag,本篇,我们将分析三种"spillable"DataBag
Data目录下包含bag的文件列表如下:
回忆一下bag的类图
上篇提到,创建bag的方法有两种:一种是调用BagFactory的工厂方法,创建的是三种spillable DataBag;另一种是直接调用NonSpillableDataBag的构造方法。
先去看看三种DataBag,三者继承于DefaultAbstractBag,DefaultAbstractBag实现DataBag接口,上篇已经对接口类进行了分析,总结如下
下面分析DefaultAbstractBag源码
/**
 * Default implementation of DataBag. This is an abstract class used as the
 * parent for all three spillable bag types (default, sorted, distinct).
 *
 * <p>It provides the shared machinery: size bookkeeping, tuple-size sampling
 * for memory estimation, registration with the SpillableMemoryManager, spill
 * file management, and the comparison/serialization contract. Subclasses
 * instantiate {@code mContents} and implement {@code isSorted()},
 * {@code isDistinct()} and {@code iterator()}.
 */
@SuppressWarnings("serial")
public abstract class DefaultAbstractBag implements DataBag {

    // NOTE(review): logs under DataBag.class (not DefaultAbstractBag.class);
    // kept as-is since callers/operators may filter on that logger name.
    private static final Log log = LogFactory.getLog(DataBag.class);

    // If we grow past 100K, it may be worthwhile to register as spillable.
    private static final int SPILL_REGISTER_THRESHOLD = 100 * 1024;

    private static PigLogger pigLogger;

    private static InterSedes sedes = InterSedesFactory.getInterSedesInstance();

    // Container that holds the tuples. The actual object is instantiated by
    // subclasses (list / sorted / distinct variants).
    protected Collection<Tuple> mContents;

    // Spill files we have created. These need to be removed on finalization.
    protected FileList mSpillFiles;

    // Total size, including tuples on disk. Stored here so we do not have to
    // run through the disk when people ask.
    protected long mSize = 0;

    // Number of tuples to sample per bag (and sampling frequency) to get an
    // estimate of the average tuple size.
    private static final int SPILL_SAMPLE_SIZE = 100;
    private static final int SPILL_SAMPLE_FREQUENCY = 10;

    // Aggregate memory size of the tuples sampled so far, and their count.
    long aggSampleTupleSize = 0;
    int sampled = 0;

    // Whether this bag has already been registered with SpillableMemoryManager.
    private boolean spillableRegistered = false;

    /**
     * Get the number of elements in the bag, both in memory and on disk.
     */
    @Override
    public long size() {
        return mSize;
    }

    /**
     * Sample every SPILL_SAMPLE_FREQUENCY'th tuple, up to a maximum of
     * SPILL_SAMPLE_SIZE samples, to get an estimate of the tuple size.
     */
    protected void sampleContents() {
        synchronized (mContents) {
            Iterator<Tuple> iter = mContents.iterator();
            // Skip the prefix that was already covered by earlier sampling
            // passes, so the same tuples are not sampled twice.
            for (int i = 0; i < sampled * SPILL_SAMPLE_FREQUENCY && iter.hasNext(); i++) {
                iter.next();
            }
            // Sample every SPILL_SAMPLE_FREQUENCY'th remaining tuple until the
            // sample cap is reached or the bag is exhausted.
            for (int i = sampled; iter.hasNext() && sampled < SPILL_SAMPLE_SIZE; i++) {
                Tuple t = iter.next();
                if (t != null && i % SPILL_SAMPLE_FREQUENCY == 0) {
                    aggSampleTupleSize += t.getMemorySize();
                    sampled += 1;
                }
            }
        }
    }

    /**
     * Add a tuple to the bag.
     *
     * @param t tuple to add.
     */
    @Override
    public void add(Tuple t) {
        synchronized (mContents) {
            mSize++;
            mContents.add(t);
        }
        markSpillableIfNecessary();
    }

    /**
     * All bag implementations should call this method after every addition,
     * so the bag gets registered with the SpillableMemoryManager once its
     * estimated footprint crosses SPILL_REGISTER_THRESHOLD.
     */
    protected void markSpillableIfNecessary() {
        if (!spillableRegistered) {
            long estimate = getMemorySize();
            if (estimate >= SPILL_REGISTER_THRESHOLD) {
                SpillableMemoryManager.getInstance().registerSpillable(this);
                spillableRegistered = true;
            }
        }
    }

    @Override
    public void addAll(DataBag b) {
        addAll((Iterable<Tuple>) b);
    }

    public void addAll(Collection<Tuple> c) {
        addAll((Iterable<Tuple>) c);
    }

    /**
     * Add the contents of an iterable (a Collection or a DataBag).
     *
     * @param iterable a Collection or DataBag to add contents of
     */
    public void addAll(Iterable<Tuple> iterable) {
        synchronized (mContents) {
            for (Tuple t : iterable) {
                add(t);
            }
        }
    }

    /**
     * Return an estimate of the in-memory size of this bag, in bytes.
     */
    @Override
    public long getMemorySize() {
        int numInMem = 0;
        synchronized (mContents) {
            numInMem = mContents.size();
            // If we have an estimate already and the number of tuples has not
            // changed (or both old and new counts exceed the sample size), we
            // can produce a new estimate without sampling the tuples again.
            if (sampled != 0 && (sampled == numInMem ||
                    sampled > SPILL_SAMPLE_SIZE && numInMem > SPILL_SAMPLE_SIZE)) {
                return totalSizeFromAvgTupleSize(aggSampleTupleSize / sampled, numInMem);
            }
            sampleContents();
            int avgTupleSize;
            if (sampled != 0) {
                avgTupleSize = (int) (aggSampleTupleSize / sampled);
            } else {
                avgTupleSize = 0;
            }
            return totalSizeFromAvgTupleSize(avgTupleSize, numInMem);
        }
    }

    /**
     * Extrapolate the bag's total memory footprint from the average sampled
     * tuple size plus fixed per-object and per-spill-file overheads.
     */
    private long totalSizeFromAvgTupleSize(long avgTupleSize, int numInMem) {
        long used = avgTupleSize * numInMem;
        long mFields_size = roundToEight(4 + numInMem * 4); /* mContents fixed + per entry */
        // In a Java HotSpot 32-bit VM there seems to be a minimum bag size of
        // 188 bytes; some of the extra bytes likely come from the minimum size
        // of the backing array list.
        mFields_size = Math.max(40, mFields_size);
        // Fixed overhead of this object and its fields = 84 bytes:
        //  8     - object header
        //  4+8+8 - sampled + aggSampleTupleSize + mSize
        //  8+8   - mContents ref + mSpillFiles ref
        //  4     - spillableRegistered (+4 instead of 1 to round to eight)
        //  36    - mContents fixed
        used += 84 + mFields_size;
        // Add overhead of the mSpillFiles ArrayList, the Object[] inside it,
        // its object fields, and the references to the spill files.
        if (mSpillFiles != null) {
            used += roundToEight(36 /* mSpillFiles fixed overhead */ + mSpillFiles.size() * 4);
            if (mSpillFiles.size() > 0) {
                // Rough estimate assuming every auto-generated file-name entry
                // uses about the same amount of memory as the first one.
                long approx_per_entry_size =
                        roundToEight(mSpillFiles.get(0).toString().length() * 2 + 38);
                used += mSpillFiles.size() * approx_per_entry_size;
            }
        }
        return used;
    }

    /**
     * Round a memory size up to a multiple of 8 bytes.
     *
     * @param i size to round
     * @return i rounded to an equal or higher multiple of 8
     */
    private long roundToEight(long i) {
        return 8 * ((i + 7) / 8); // integer division rounds the result down
    }

    /**
     * Clear out the contents of the bag, both on disk and in memory.
     * Any attempt to read after calling this function will produce
     * undefined results.
     */
    @Override
    public void clear() {
        synchronized (mContents) {
            mContents.clear();
            if (mSpillFiles != null) {
                for (int i = 0; i < mSpillFiles.size(); i++) {
                    boolean res = mSpillFiles.get(i).delete();
                    if (!res)
                        warn("DefaultAbstractBag.clear: failed to delete " + mSpillFiles.get(i), PigWarning.DELETE_FAILED, null);
                }
                mSpillFiles.clear();
            }
            mSize = 0;
            aggSampleTupleSize = 0;
            sampled = 0;
            // not changing spillableRegistered -- clear doesn't change that.
        }
    }

    /**
     * This method is potentially very expensive since it may require a sort
     * of the bag; don't call it unless you have to.
     */
    @Override
    @SuppressWarnings("unchecked")
    public int compareTo(Object other) {
        if (this == other)
            return 0;
        if (other instanceof DataBag) {
            DataBag bOther = (DataBag) other;
            if (this.size() != bOther.size()) {
                if (this.size() > bOther.size()) return 1;
                else return -1;
            }
            // Ugly, but we have to know whether the two bags contain the same
            // tuples regardless of order. Hopefully the size check above
            // avoids this in most cases. If either bag isn't already sorted,
            // copy it into a sorted bag so order is guaranteed.
            DataBag thisClone;
            DataBag otherClone;
            BagFactory factory = BagFactory.getInstance();
            if (this.isSorted() || this.isDistinct()) {
                thisClone = this;
            } else {
                thisClone = factory.newSortedBag(null);
                Iterator<Tuple> i = iterator();
                while (i.hasNext()) thisClone.add(i.next());
            }
            if (((DataBag) other).isSorted() || ((DataBag) other).isDistinct()) {
                otherClone = bOther;
            } else {
                otherClone = factory.newSortedBag(null);
                Iterator<Tuple> i = bOther.iterator();
                while (i.hasNext()) otherClone.add(i.next());
            }
            // Both iterations are now in sorted order; compare element-wise.
            Iterator<Tuple> thisIt = thisClone.iterator();
            Iterator<Tuple> otherIt = otherClone.iterator();
            while (thisIt.hasNext() && otherIt.hasNext()) {
                Tuple thisT = thisIt.next();
                Tuple otherT = otherIt.next();
                int c = thisT.compareTo(otherT);
                if (c != 0) return c;
            }
            return 0; // if we got this far, they must be equal
        } else {
            // Not a bag: fall back to Pig's cross-type ordering.
            return DataType.compare(this, other);
        }
    }

    @Override
    public boolean equals(Object other) {
        if (other == null) {
            return false;
        }
        return compareTo(other) == 0;
    }

    /**
     * Write the bag's contents to disk.
     *
     * @param out DataOutput to write data to.
     * @throws IOException (passes it on from underlying calls).
     */
    @Override
    public void write(DataOutput out) throws IOException {
        sedes.writeDatum(out, this);
    }

    /**
     * Read a bag from disk.
     *
     * @param in DataInput to read data from.
     * @throws IOException (passes it on from underlying calls).
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        long size = in.readLong();
        for (long i = 0; i < size; i++) {
            // readDatum may throw ExecException (an IOException subtype);
            // it simply propagates to the caller, so no catch/rethrow needed.
            Object o = sedes.readDatum(in);
            add((Tuple) o);
        }
    }

    /**
     * This is used by FuncEvalSpec.FakeDataBag; intentionally a no-op here.
     *
     * @param stale Set stale state.
     */
    @Override
    public void markStale(boolean stale)
    {
    }

    /**
     * Write the bag into a string. */
    @Override
    public String toString() {
        return BagFormat.format(this);
    }

    @Override
    public int hashCode() {
        // Order-independent sum so equal bags (same tuples, any order) hash
        // the same, consistent with compareTo/equals.
        int hash = 0;
        Iterator<Tuple> i = iterator();
        while (i.hasNext()) {
            hash += i.next().hashCode();
        }
        return hash;
    }

    /**
     * Get a file to spill contents to. The file will be registered in the
     * mSpillFiles array.
     *
     * @return stream to write tuples to.
     */
    protected DataOutputStream getSpillFile() throws IOException {
        if (mSpillFiles == null) {
            // We want to keep the list as small as possible.
            mSpillFiles = new FileList(1);
        }

        String tmpDirName = System.getProperties().getProperty("java.io.tmpdir");
        File tmpDir = new File(tmpDirName);

        // Create the directory if it doesn't exist.
        if (!tmpDir.exists()) {
            log.info("Temporary directory doesn't exists. Trying to create: " + tmpDir.getAbsolutePath());
            // Attempt to create the directory and check whether it succeeded.
            if (tmpDir.mkdir()) {
                log.info("Successfully created temporary directory: " + tmpDir.getAbsolutePath());
            } else {
                // If execution reaches here, we needed to create the directory
                // but creation did not succeed.
                //
                // If the directory was created concurrently in the meantime,
                // we can simply skip creation. This works around a rare race
                // seen on clusters, even though spill() calls getSpillFile()
                // inside a synchronized block.
                if (tmpDir.exists()) {
                    log.info("Temporary directory already exists: " + tmpDir.getAbsolutePath());
                } else {
                    int errCode = 2111;
                    String msg = "Unable to create temporary directory: " + tmpDir.getAbsolutePath();
                    throw new ExecException(msg, errCode, PigException.BUG);
                }
            }
        }

        // createTempFile uses java.io.tmpdir by default, i.e. the directory
        // whose existence was just ensured above.
        File f = File.createTempFile("pigbag", null);
        f.deleteOnExit();
        mSpillFiles.add(f);
        return new DataOutputStream(new BufferedOutputStream(
                new FileOutputStream(f)));
    }

    /**
     * Report progress to HDFS.
     */
    protected void reportProgress() {
        if (PhysicalOperator.getReporter() != null) {
            PhysicalOperator.getReporter().progress();
        }
    }

    @SuppressWarnings("rawtypes")
    protected void warn(String msg, Enum warningEnum, Throwable e) {
        pigLogger = PhysicalOperator.getPigLogger();
        if (pigLogger != null) {
            pigLogger.warn(this, msg, warningEnum);
        } else {
            log.warn(msg, e);
        }
    }

    @SuppressWarnings("rawtypes")
    protected void incSpillCount(Enum counter) {
        incSpillCount(counter, 1);
    }

    @SuppressWarnings("rawtypes")
    protected void incSpillCount(Enum counter, long numRecsSpilled) {
        PigStatusReporter reporter = PigStatusReporter.getInstance();
        if (reporter != null && reporter.getCounter(counter) != null) {
            reporter.getCounter(counter).increment(numRecsSpilled);
        } else {
            PigHadoopLogger.getInstance().warn(mContents, "Spill counter incremented", counter);
        }
    }

    // Marker tuples used to delimit nested bags in spill streams.
    public static abstract class BagDelimiterTuple extends DefaultTuple {}

    public static class StartBag extends BagDelimiterTuple {
        private static final long serialVersionUID = 1L;
    }

    public static class EndBag extends BagDelimiterTuple {
        private static final long serialVersionUID = 1L;
    }

    public static final Tuple startBag = new StartBag();
    public static final Tuple endBag = new EndBag();

    protected static final int MAX_SPILL_FILES = 100;
}
类图总览
总体而言,没有特别要注意的地方,但整理时发现以下三个方法在本类中未实现(抽象方法):
+ isSorted(): boolean
+ isDistinct(): boolean
+ iterator(): Iterator&lt;Tuple&gt;
它们应是在三个具体子类中分别实现的。