2021SC@SDUSC
本篇继续介绍其他包的变种
上篇我们已经讲过的包变种有InternalCachedBag、InternalDistinctBag、SelfSpillBag、SortedSpillBag
其中InternalCachedBag和SortedSpillBag继承自SelfSpillBag,InternalDistinctBag继承自SortedSpillBag
InternalSortedBag
继承关系,完全不出意外
public class InternalSortedBag extends SortedSpillBag
一些注释
/**
* 一个有序的元组集合(可能包含重复的元组)。数据进入时不排序直接存储,只有在需要转储到文件或首次请求迭代器时才进行排序。
* 实验发现,这比一开始就按有序方式存储要快。
*
* 我们允许使用用户定义的比较器,在用户未指定比较器时则提供默认比较器。
*
* 这个包不会注册到 SpillableMemoryManager,而是自行计算可保留在内存中的元组数量,超出后主动溢出到文件。
*/
UML图
构造函数
/**
 * Creates a bag without a user-supplied comparator by delegating to
 * {@link #InternalSortedBag(Comparator)} with {@code null}.
 */
public InternalSortedBag() {
this(null);
}
/**
 * Creates a bag with the given comparator, delegating to
 * {@link #InternalSortedBag(int, Comparator)} with a bag count of 1.
 *
 * @param comp comparator to pass along; may be {@code null} (the no-arg
 *             constructor passes {@code null} here)
 */
public InternalSortedBag(Comparator<Tuple> comp) {
this(1, comp);
}
/**
 * Creates a bag for the given bag count and comparator, delegating to
 * {@link #InternalSortedBag(int, float, Comparator)} with a percent of
 * {@code -1.0f}.
 *
 * @param bagCount value forwarded unchanged as the bag count
 * @param comp     comparator to pass along; may be {@code null}
 */
public InternalSortedBag(int bagCount, Comparator<Tuple> comp) {
// NOTE(review): -1.0f is presumably a sentinel meaning "use the default
// memory percentage" -- confirm against the superclass constructor.
this(bagCount, -1.0f, comp);
}
/**
 * Primary constructor that all other overloads funnel into. Passes
 * {@code bagCount} and {@code percent} to the superclass constructor and
 * then calls {@code init} with all three arguments.
 *
 * @param bagCount value forwarded to the superclass and to {@code init}
 * @param percent  value forwarded to the superclass and to {@code init};
 *                 presumably a memory-usage fraction -- TODO confirm
 * @param comp     comparator handed to {@code init}; may be {@code null}
 */
public InternalSortedBag(int bagCount, float percent, Comparator<Tuple> comp) {
super(bagCount, percent);
init(bagCount, percent, comp);
}
总觉得特别眼熟,好像这几个包基本都是这么构造的..SortedSpillBag的构造在上一篇,接下来看看测试函数
/**
 * Exercises {@code InternalSortedBag}: size and flag accessors, equality
 * between bags filled in different orders, sorted iteration (in memory,
 * with a 0.0f memory percentage, and after explicit spills), and equality
 * with a {@code SortedDataBag} holding the same tuples.
 *
 * <p>Iteration-order checks use {@code assertEquals} so that a wrongly
 * ordered bag actually fails the test; calling {@code equals} and
 * discarding the result verifies nothing.
 */
@Test
public void testInternalSortedBag() throws Exception {
    // Expected tuples, in the ascending order the sorted bag should yield them.
    Tuple ab = Util.createTuple(new String[] { "a", "b" });
    Tuple cd = Util.createTuple(new String[] { "c", "d" });
    Tuple ef = Util.createTuple(new String[] { "e", "f" });

    // check adding empty tuple
    DataBag bg0 = new InternalSortedBag();
    bg0.add(TupleFactory.getInstance().newTuple());
    bg0.add(TupleFactory.getInstance().newTuple());
    // The bag is not distinct, so both empty tuples are kept.
    assertEquals(2, bg0.size());

    // check equal of bags
    DataBag bg1 = new InternalSortedBag();
    assertEquals(0, bg1.size());

    String[][] tupleContents = new String[][] {
        { "e", "f" }, { "a", "b" }, { "c", "d" } };
    for (int i = 0; i < tupleContents.length; i++) {
        bg1.add(Util.createTuple(tupleContents[i]));
    }

    // check size, and isSorted(), isDistinct()
    assertEquals(3, bg1.size());
    assertTrue(bg1.isSorted());
    assertFalse(bg1.isDistinct());

    tupleContents = new String[][] {
        { "c", "d" }, { "a", "b" }, { "e", "f" } };
    DataBag bg2 = new InternalSortedBag();
    for (int i = 0; i < tupleContents.length; i++) {
        bg2.add(Util.createTuple(tupleContents[i]));
    }
    // Bags filled with the same tuples in a different insertion order are equal.
    assertEquals(bg1, bg2);

    // bg1 must iterate as {a,b}, {c,d}, {e,f} regardless of insertion order.
    Iterator<Tuple> iter = bg1.iterator();
    assertEquals(ab, iter.next());
    assertEquals(cd, iter.next());
    assertEquals(ef, iter.next());

    // check bag with data written to disk
    DataBag bg3 = new InternalSortedBag(1, 0.0f, null);
    tupleContents = new String[][] {
        { "e", "f" }, { "c", "d" }, { "a", "b" } };
    for (int i = 0; i < tupleContents.length; i++) {
        bg3.add(Util.createTuple(tupleContents[i]));
    }
    assertEquals(bg1, bg3);

    iter = bg3.iterator();
    assertEquals(ab, iter.next());
    assertEquals(cd, iter.next());
    assertEquals(ef, iter.next());

    // call iterator methods with irregular order
    iter = bg3.iterator();
    assertTrue(iter.hasNext());
    assertTrue(iter.hasNext());
    DataBag bg4 = new InternalSortedBag(1, 0.0f, null);
    bg4.add(iter.next());
    bg4.add(iter.next());
    assertTrue(iter.hasNext());
    bg4.add(iter.next());
    assertFalse(iter.hasNext());
    assertFalse(iter.hasNext());
    assertEquals(bg3, bg4);

    // check clear
    bg3.clear();
    assertEquals(0, bg3.size());

    // test with all data spill out
    DataBag bg5 = new InternalSortedBag();
    for (int j = 0; j < 3; j++) {
        for (int i = 0; i < tupleContents.length; i++) {
            bg5.add(Util.createTuple(tupleContents[i]));
        }
        bg5.spill();
    }
    assertEquals(9, bg5.size());

    // The bag sorts its contents, so iteration order differs from add order:
    // three copies of each tuple, grouped in ascending order.
    iter = bg5.iterator();
    for (int i = 0; i < 3; i++) {
        assertEquals(ab, iter.next());
    }
    for (int i = 0; i < 3; i++) {
        assertEquals(cd, iter.next());
    }
    for (int i = 0; i < 3; i++) {
        assertEquals(ef, iter.next());
    }

    // test with most data spill out, with some data in memory
    // and merge of spill files
    DataBag bg6 = new InternalSortedBag();
    for (int j = 0; j < 104; j++) {
        for (int i = 0; i < tupleContents.length; i++) {
            bg6.add(Util.createTuple(tupleContents[i]));
        }
        // Leave the last batch unspilled so some data stays in memory.
        if (j != 103) {
            bg6.spill();
        }
    }
    assertEquals(104 * 3, bg6.size());

    iter = bg6.iterator();
    for (int i = 0; i < 104; i++) {
        assertEquals(ab, iter.next());
    }
    for (int i = 0; i < 104; i++) {
        assertEquals(cd, iter.next());
    }
    for (int i = 0; i < 104; i++) {
        assertEquals(ef, iter.next());
    }

    // check two implementation of sorted bag can compare correctly
    DataBag bg7 = new SortedDataBag(null);
    for (int j = 0; j < 104; j++) {
        for (int i = 0; i < tupleContents.length; i++) {
            bg7.add(Util.createTuple(tupleContents[i]));
        }
        if (j != 103) {
            bg7.spill();
        }
    }
    assertEquals(bg6, bg7);
}
可以看出来,由于会排序,所以元组输出的顺序和加入的顺序没有关系,只和元组的字母排序有关
LimitedSortedDataBag
继承关系
public class LimitedSortedDataBag implements DataBag
顺带一提,前面的SelfSpillBag继承自DefaultAbstractBag,而DefaultAbstractBag实现了DataBag接口
一些注释
/** * 一个有序的元组集合(可能包含重复的元组)。数据进入时存储在优先级队列中,只有在请求迭代器时才进行排序 * * LimitedSortedDataBag 不可溢出。 * * 我们允许使用用户定义的比较器,在用户未指定比较器时则提供默认比较器 */
UML图
构造函数
/**
* @param comp 用于进行排序的比较器。
* 如果为 null,则将使用 DefaultComparator。
*/
public LimitedSortedDataBag(Comparator<Tuple> comp, long limit) {
this.mComp = comp == null ? new DefaultComparator() : comp;
this.limit = limit;
this.priorityQ = new PriorityQueue<Tuple>(
(int)limit, getReversedComparator(mComp));
}
终于不是内部包那一套的模板了,参数要求是个比较器,也就是说可以自定义排序方式,感觉还挺有用的,接下来老规矩,看测试函数
/**
 * Exercises {@code LimitedSortedDataBag} with a limit of 2: adds three
 * single-field tuples and checks size, the sorted/distinct flags,
 * iteration order, {@code addAll}, {@code compareTo} and {@code clear}.
 *
 * <p>Assertions follow JUnit's {@code (expected, actual)} argument order
 * so that failure messages report the values the right way round.
 */
@Test
public void testLimitedSortedBag() throws ExecException {
    DataBag bag = new LimitedSortedDataBag(null, 2);
    Tuple t;
    t = TupleFactory.getInstance().newTuple(1);
    t.set(0, 2); // set field 0 to the value 2
    bag.add(t);
    t = TupleFactory.getInstance().newTuple(1);
    t.set(0, 0);
    bag.add(t);
    t = TupleFactory.getInstance().newTuple(1);
    t.set(0, 1);
    bag.add(t);

    // test size()
    assertEquals(2, bag.size());

    // test isSorted()
    assertTrue(bag.isSorted());

    // test isDistinct()
    assertFalse(bag.isDistinct());

    // test iterator()
    Iterator<Tuple> it = bag.iterator();
    assertEquals(0, it.next().get(0));
    // Only 0 and 1 remain: the tuple holding 2 was dropped by the limit.
    assertEquals(1, it.next().get(0));
    assertFalse(it.hasNext());

    // test addAll()
    DataBag bag1 = new LimitedSortedDataBag(null, 1);
    bag1.addAll(bag);
    assertEquals(1, bag1.size());

    // test compareTo()
    assertEquals(0, bag.compareTo(bag));
    assertEquals(1, bag.compareTo(bag1));

    // test clear()
    bag1.clear();
    assertEquals(0, bag1.size());
}
从上面的测试可以看出,LimitedSortedDataBag是不可溢出的(容量满后再继续加入数据,多出的数据会被丢弃),
并且是排序的
本篇博客内容就到这里,下一篇将讲解其他元组的变种