Performance example source code from Inside the C++ Object Model

编译环境：gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
例子按3.5节所写：对象成员的效率（object member efficiency）


// Number of iterations every benchmark loop below executes.
constexpr int ITERATION_COUNT = 10000001;
/**
 * Baseline benchmark: both "points" are six independent local floats.
 *
 * Runs the three chained float updates ITERATION_COUNT times between
 * START_TIMING(0) and END_TIMING(0), then prints the final z value after
 * the "-----plain:" label.
 */
void test_plain_float()
{
	// Source point A (only read inside the loop).
	float ax = 1.725f;
	float ay = 0.875f;
	float az = 0.478f;
	// Destination point B (rewritten on every iteration).
	float bx = 0.315f;
	float by = 0.317f;
	float bz = 0.838f;

	START_TIMING(0);
	// Count down instead of up; the body still runs ITERATION_COUNT times.
	for (int iters = ITERATION_COUNT; iters > 0; --iters)
	{
		// Each statement consumes the previous result, so they stay in order.
		bx = ax - bz;
		by = ay + bx;
		bz = az + by;
	}
	END_TIMING(0);

	cout << "-----plain:" << bz << endl;
}
/**
 * Benchmark: both points are local float[3] arrays indexed by enumerators.
 *
 * Performs the same chained updates as the other benchmarks
 * ITERATION_COUNT times inside the START_TIMING(0)/END_TIMING(0) pair and
 * prints the final z component after the "-----array:" label.
 */
void test_plain_array()
{
	// Enumerators double as array indices; z + 1 is the element count.
	enum Axis
	{
		x,
		y,
		z
	};
	float pA[z + 1] = {1.725f, 0.875f, 0.478f};
	float pB[z + 1] = {0.315f, 0.317f, 0.838f};

	START_TIMING(0);
	for (int i = 0; i < ITERATION_COUNT; ++i)
	{
		pB[x] = pA[x] - pB[z];
		pB[y] = pA[y] + pB[x];
		pB[z] = pA[z] + pB[y];
	}
	END_TIMING(0);

	cout << "-----array:" << pB[z] << endl;
}

// Plain aggregate holding three float coordinates; it has no constructors
// and no virtual functions, and serves as the "C struct" case in the
// benchmarks.
struct Point3d
{
	float _x, _y, _z; // coordinates, declared in x, y, z order
};

void test_c_struct()
{
    
    
	Point3d pA, pB;
	pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
	pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;

	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		pB._x = pA._x - pB._z;
		pB._y = pA._y + pB._x;
		pB._z = pA._z + pB._y;
	}
	END_TIMING(0);
	cout << "-----struct:" << pB._z << endl;
}
void test_c_struct_pointer()
{
    
    
	Point3d *pA = new Point3d;
	Point3d *pB = new Point3d;
	pA->_x = 1.725f, pA->_y = 0.875f, pA->_z = 0.478f;
	pB->_x = 0.315f, pB->_y = 0.317f, pB->_z = 0.838f;

	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		pB->_x = pA->_x - pB->_z;
		pB->_y = pA->_y + pB->_x;
		pB->_z = pA->_z + pB->_y;
	}
	END_TIMING(0);
	cout << "-----struct pointer:" << pB->_z << endl;
}
void test_c_struct_pointer_volatile()
{
    
    
	volatile Point3d *pA = new Point3d;
	volatile Point3d *pB = new Point3d;
	pA->_x = 1.725f, pA->_y = 0.875f, pA->_z = 0.478f;
	pB->_x = 0.315f, pB->_y = 0.317f, pB->_z = 0.838f;

	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		pB->_x = pA->_x - pB->_z;
		pB->_y = pA->_y + pB->_x;
		pB->_z = pA->_z + pB->_y;
	}
	END_TIMING(0);
	cout << "-----struct pointer volatile:" << pB->_z << endl;
}
void test_c_struct_member_pointer()
{
    
    
	Point3d pA, pB;
	pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
	pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;
	float *ax = &pA._x;
	float *ay = &pA._y;
	float *az = &pA._z;
	float *bx = &pB._x;
	float *by = &pB._y;
	float *bz = &pB._z;
	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		*bx = *ax - *bz;
		*by = *ay + *bx;
		*bz = *az + *by;
	}
	END_TIMING(0);
	cout << "-----struct_member_pointer:" << pB._z << endl;
}
/**
 * 3D point whose coordinates are exposed through virtual accessors.
 *
 * Used by the benchmarks to compare virtual calls (through a pointer)
 * with calls on a local object and with direct member access. The data
 * members are public on purpose so the benchmarks can also touch them
 * directly.
 */
class Point3d_Virtual
{
public:
	// All coordinates default to zero.
	Point3d_Virtual(float xx = 0.0f, float yy = 0.0f, float zz = 0.0f)
		: _x(xx), _y(yy), _z(zz) {}

	// The class is polymorphic, so give it a virtual destructor: deleting a
	// derived object through a Point3d_Virtual* is then well defined.
	virtual ~Point3d_Virtual() = default;

	// Accessors returning a mutable reference to the coordinate.
	// In-class definitions are implicitly inline, so the keyword is omitted.
	// In the author's -O2 measurements the virtual call was about 3 times
	// slower than the non-virtual (inlined) invocation.
	virtual float &x() { return _x; }
	virtual float &y() { return _y; }
	virtual float &z() { return _z; }

	// Setters that overwrite the coordinate with the given value.
	virtual void x(float x) { _x = x; }
	virtual void y(float y) { _y = y; }
	virtual void z(float z) { _z = z; }

	float _x, _y, _z;
};

void test_access_by_virtual_function()
{
    
    
	Point3d_Virtual*pA = new Point3d_Virtual, *pB = new Point3d_Virtual;
	pA->_x = 1.725f, pA->_y = 0.875f, pA->_z = 0.478f;
	pB->_x = 0.315f, pB->_y = 0.317f, pB->_z = 0.838f;

	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		pB->x() = pA->x() - pB->z();
		pB->y() = pA->y() + pB->x();
		pB->z() = pA->z() + pB->y();
	}
	END_TIMING(0);
	cout << "-----virtual func:" << pB->z() << endl;
}
void test_access_by_inline_function()
{
    
    
	Point3d_Virtual pA, pB;
	pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
	pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;

	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		pB.x()=(pA.x() - pB.z());
		pB.y()=(pA.y() + pB.x());
		pB.z()=(pA.z() + pB.y());
	}
	END_TIMING(0);
	cout << "-----inline func:" << pB.z() << endl;
}
void test_class_member_pointer()
{
    
    
	Point3d_Virtual pA, pB;
	pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
	pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;
	float *ax = &pA._x;
	float *ay = &pA._y;
	float *az = &pA._z;
	float *bx = &pB._x;
	float *by = &pB._y;
	float *bz = &pB._z;
	START_TIMING(0);
	for (int iters = 0; iters < ITERATION_COUNT; iters++)
	{
    
    
		*bx = *ax - *bz;
		*by = *ay + *bx;
		*bz = *az + *by;
	}
	END_TIMING(0);
	cout << "-----class_member_pointer:" << pB._z << endl;
}
/**
 * Runs every member-access benchmark once, in a fixed order.
 *
 * The order matches the order of the result lines each benchmark prints.
 */
void test_inside_cplusplus_object_model_performance()
{
	// Table of benchmarks, listed in execution order.
	void (*const benchmarks[])() = {
		test_plain_float,
		test_plain_array,
		test_c_struct,
		test_c_struct_pointer,
		test_c_struct_pointer_volatile,
		test_c_struct_member_pointer,
		test_access_by_virtual_function,
		test_access_by_inline_function,
		test_class_member_pointer,
	};

	for (void (*run)() : benchmarks)
	{
		run();
	}
}

-O2编译，输出结果如下：

0.122494-----plain:2.24
0.123217-----array:2.24
0.122705-----struct:2.24
0.130809-----struct pointer:2.24
0.367345-----struct pointer volatile:2.24
0.122326-----struct_member_pointer:2.24
0.368905-----virtual func:2.24
0.123141-----inline func:2.24
0.122887-----class_member_pointer:2.24

-O0编译，输出结果如下：

0.365971-----plain:2.24
0.366328-----array:2.24
0.366668-----struct:2.24
0.366613-----struct pointer:2.24
0.369769-----struct pointer volatile:2.24
0.371624-----struct_member_pointer:2.24
0.957507-----virtual func:2.24
0.815733-----inline func:2.24
0.373669-----class_member_pointer:2.24

对比结果，结论：
1.-O2优化的情况，虚函数开销是inline函数的3倍
2.-O2优化的情况，内联函数与直接存取成员变量效率一致；-O0不优化情况，内联函数（内联不起作用）存取成员变量相比直接存取，时间加倍
3.通过成员变量指针访问，与直接访问成员变量，效率一样
4.某次开机后的测试出现了如下结果：第一个运行的函数耗时约为其余函数的两倍，而它们循环部分的汇编代码完全一样（见下），起初不理解。
原因应该是数据加载问题：测试发现，哪种方法第一个运行，哪种方法的耗时就长。打印地址可知，这6个变量的地址均是连续的，不存在内存访问上的差异；clock_gettime调用的影响也可忽略。机器负载越高，数据预先加载的影响越小。

   0.061806-----plain:2.24
   0.031168-----array:2.24
   0.031260-----struct:2.24
   .....

test_plain_float（）循环部分的汇编代码：

 cc0:       e8 eb fd ff ff          callq  ab0 <clock_gettime@plt>
 cc5:       83 f8 ff                cmp    $0xffffffff,%eax
 cc8:       0f 84 db 01 00 00       je     ea9 <_Z16test_plain_floatv+0x209>
 cce:       b8 81 96 98 00          mov    $0x989681,%eax
    float pB_x = 0.315f, pB_y = 0.317f, pB_z = 0.838f;
 cd3:       f3 0f 10 0d 99 10 00    movss  0x1099(%rip),%xmm1        # 1d74 <_IO_stdin_used+0xc4>
 cda:       00 
 cdb:       f3 0f 10 1d 95 10 00    movss  0x1095(%rip),%xmm3        # 1d78 <_IO_stdin_used+0xc8>
 ce2:       00 
 ce3:       f3 0f 10 15 91 10 00    movss  0x1091(%rip),%xmm2        # 1d7c <_IO_stdin_used+0xcc>
 cea:       00 
 ceb:       f3 0f 10 05 8d 10 00    movss  0x108d(%rip),%xmm0        # 1d80 <_IO_stdin_used+0xd0>
 cf2:       00 
 cf3:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
            pB_x = pA_x - pB_z;
 cf8:       0f 28 e3                movaps %xmm3,%xmm4
    for (int i = 0; i < ITERATION_COUNT; i++)
 cfb:       83 e8 01                sub    $0x1,%eax
            pB_x = pA_x - pB_z;
 cfe:       f3 0f 5c e1             subss  %xmm1,%xmm4
 d02:       0f 28 cc                movaps %xmm4,%xmm1
            pB_y = pA_y + pB_x;
 d05:       f3 0f 58 ca             addss  %xmm2,%xmm1
            pB_z = pA_z + pB_y;
 d09:       f3 0f 58 c8             addss  %xmm0,%xmm1
    for (int i = 0; i < ITERATION_COUNT; i++)
 d0d:       75 e9                   jne    cf8 <_Z16test_plain_floatv+0x58>
 d0f:       48 8d 74 24 20          lea    0x20(%rsp),%rsi
 d14:       bf 01 00 00 00          mov    $0x1,%edi
 d19:       f3 0f 11 4c 24 0c       movss  %xmm1,0xc(%rsp)
 d1f:       e8 8c fd ff ff          callq  ab0 <clock_gettime@plt>

test_plain_array（）循环部分的汇编代码：

 ee0:       e8 cb fb ff ff          callq  ab0 <clock_gettime@plt>
 ee5:       83 f8 ff                cmp    $0xffffffff,%eax
 ee8:       0f 84 db 01 00 00       je     10c9 <_Z16test_plain_arrayv+0x209>
 eee:       b8 81 96 98 00          mov    $0x989681,%eax
    pB[x] = 0.315f, pB[y] = 0.317f, pB[z] = 0.838f;
 ef3:       f3 0f 10 0d 79 0e 00    movss  0xe79(%rip),%xmm1        # 1d74 <_IO_stdin_used+0xc4>
 efa:       00 
 efb:       f3 0f 10 1d 75 0e 00    movss  0xe75(%rip),%xmm3        # 1d78 <_IO_stdin_used+0xc8>
 f02:       00 
 f03:       f3 0f 10 15 71 0e 00    movss  0xe71(%rip),%xmm2        # 1d7c <_IO_stdin_used+0xcc>
 f0a:       00 
 f0b:       f3 0f 10 05 6d 0e 00    movss  0xe6d(%rip),%xmm0        # 1d80 <_IO_stdin_used+0xd0>
 f12:       00 
 f13:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
            pB[x] = pA[x] - pB[z];
 f18:       0f 28 e3                movaps %xmm3,%xmm4
    for (int iters = 0; iters < ITERATION_COUNT; iters++)
 f1b:       83 e8 01                sub    $0x1,%eax
            pB[x] = pA[x] - pB[z];
 f1e:       f3 0f 5c e1             subss  %xmm1,%xmm4
 f22:       0f 28 cc                movaps %xmm4,%xmm1
            pB[y] = pA[y] + pB[x];
 f25:       f3 0f 58 ca             addss  %xmm2,%xmm1
            pB[z] = pA[z] + pB[y];
 f29:       f3 0f 58 c8             addss  %xmm0,%xmm1
    for (int iters = 0; iters < ITERATION_COUNT; iters++)
 f2d:       75 e9                   jne    f18 <_Z16test_plain_arrayv+0x58>
 f2f:       48 8d 74 24 20          lea    0x20(%rsp),%rsi
 f34:       bf 01 00 00 00          mov    $0x1,%edi
 f39:       f3 0f 11 4c 24 0c       movss  %xmm1,0xc(%rsp)
 f3f:       e8 6c fb ff ff          callq  ab0 <clock_gettime@plt>

Performance example source code from Inside the C++ Object Model

猜你喜欢