Lambda efficiency problem in C++

Recently, I used a lambda to replace some existing logic, without thinking about whether using a lambda differs in cost from writing the logic directly. Without optimization, a lambda is of course compiled as a separate anonymous function, but is that still the case when O2 optimization is enabled? Will the lambda be inlined as well?

All assembly listings in this article were generated with VS2019 with O2 optimization turned on.

The simplest lambda

First, let's take a look at the simplest lambda.

// main.cpp

#include <iostream>

// Builds a captureless lambda that prints "hi" for every even i in [0, 10)
// and then calls it once.
int main()
{
	auto testf = []() {
		for (int i = 0; i < 10; ++i)
		{
			if (i % 2 == 0)
				std::cout << "hi";
		}
	};
	testf();
}

Looking at the disassembly, here is the code generated for testf():

00007FF6DB7E1246  xor         ebx,ebx  
00007FF6DB7E1248  test        bl,1  
00007FF6DB7E124B  jne         main+20h (07FF6DB7E1260h)  
00007FF6DB7E124D  mov         rcx,qword ptr [__imp_std::cout (07FF6DB7E20B0h)]  
00007FF6DB7E1254  lea         rdx,[string "hi" (07FF6DB7E2284h)]  
00007FF6DB7E125B  call        std::operator<<<std::char_traits<char> > (07FF6DB7E1000h)  
00007FF6DB7E1260  inc         ebx  
00007FF6DB7E1262  cmp         ebx,0Ah  
00007FF6DB7E1265  jl          main+8h (07FF6DB7E1248h)

We can see that with O2 optimization the lambda is inlined directly, and no function call is generated for it. The only call, on line 6 of the listing, is the call to operator<< that writes to cout.

lambdas in classes

Declaration and definition of class A

// A.h
#pragma once
#include <iostream>
#include <string>
#include <utility>
//#define VIRTUAL
//#define NOINLINE
#ifdef VIRTUAL
// Polymorphic base class, compiled only when VIRTUAL is defined.
// f() prints "Base" to std::cout.
class Base {
public:
	virtual void f() {
		std::cout << "Base";
	}
	// Virtual destructor so that destroying a derived object through a
	// Base pointer is well-defined.
	virtual ~Base() = default;
};
#endif 
// Test class for the inlining experiments.
// - When VIRTUAL is defined, A derives from Base and f() overrides Base::f().
// - When NOINLINE is defined, f() is declared with __declspec(noinline).
class A
#ifdef VIRTUAL
	: public Base
#endif
{
public:
	// Takes b by value and moves it into parm2, so the argument is not
	// copied a second time.
	A(int a, std::string b) : parm1(a), parm2(std::move(b)) {}
	// Sets parm1 only; parm2 is default-constructed (empty).
	A(int a) : parm1(a) {}
#ifdef NOINLINE
	__declspec(noinline)
#endif
#ifdef VIRTUAL
	void f() override;
#else
	void f();
#endif
	int parm1;
	std::string parm2;
};

Definition of f function

// A.cpp
#include"A.h"
// Runs a local lambda that prints "hi" for every even i in [0, 10), adds
// parm2.size() to parm1 and, when VIRTUAL is defined, calls Base::f().
void A::f()
{
	// Captured by reference below but never read inside the lambda.
	int s_t = 1;
	auto t = [&s_t, this]() {
		for (int i = 0; i < 10; ++i)
		{
			if (i % 2 == 0)
				std::cout << "hi";
		}
		// size() is unsigned; the sum is implicitly converted back to int.
		parm1 += parm2.size();
#ifdef VIRTUAL
		Base::f();
#else
		// Calling f() here would re-enter A::f() and recurse without end,
		// so the call stays disabled.
		//f();
#endif 
	};
	t();
}

main file

//main.cpp
#include "A.h"
extern class Base;
extern class A;
int main()
{
    
    
	using namespace std;
	
	A a(1, "hi");
#ifdef VIRTUAL
	a.f();
	Base* t = &a;
	t->f();
	std::cout << a.parm1 << a.parm2;
#else
	a.f();
#endif // VURTUAL
}

The VIRTUAL and NOINLINE macros control whether f() is a virtual function and whether the compiler is forbidden from inlining it.
First, with neither NOINLINE nor VIRTUAL defined, let's take a look at the disassembly of a.f():

00007FF7283E1557  test        bl,1  
00007FF7283E155A  jne         main+178h (07FF7283E1568h)  
00007FF7283E155C  mov         rcx,qword ptr [__imp_std::cout (07FF7283E3080h)]  
00007FF7283E1563  call        std::operator<<<std::char_traits<char> > (07FF7283E1000h)  
00007FF7283E1568  inc         ebx  
00007FF7283E156A  cmp         ebx,0Ah  
00007FF7283E156D  jl          main+167h (07FF7283E1557h)  

We can see that f() itself has been inlined into main, and no function call to it remains. That way we cannot tell how a lambda inside a member function is treated, so I defined the NOINLINE macro to force the compiler not to inline f().
Let's look at the disassembly with NOINLINE defined.

00007FF7B39C1599  lea         rcx,[rbp-38h]  
00007FF7B39C159D  call        A::f (07FF7B39C1000h)  
00007FF7B39C15A2  nop  

We can see that f() is no longer inlined. Let's take a look inside the function:

void A::f()
{
    
    
00007FF7B39C1000  mov         qword ptr [rsp+8],rbx  
00007FF7B39C1005  push        rdi  
00007FF7B39C1006  sub         rsp,20h  
00007FF7B39C100A  mov         rdi,rcx  
	int s_t = 1;
	auto t = [&s_t, this]() {
    
    
		for (int i = 0; i < 10; ++i)
		{
    
    
			if (i % 2 == 0)
				std::cout << "hi";
		}
		parm1 += parm2.size();
#ifdef VIRTUAL
		Base::f();
#else
		//f();
#endif 
	};
00007FF7B39C100D  xor         ebx,ebx  
00007FF7B39C100F  nop  
	t();
00007FF7B39C1010  test        bl,1  
00007FF7B39C1013  jne         A::f+21h (07FF7B39C1021h)  
00007FF7B39C1015  mov         rcx,qword ptr [__imp_std::cout (07FF7B39C3080h)]  
00007FF7B39C101C  call        std::operator<<<std::char_traits<char> > (07FF7B39C1040h)  
00007FF7B39C1021  inc         ebx  
00007FF7B39C1023  cmp         ebx,0Ah  
00007FF7B39C1026  jl          A::f+10h (07FF7B39C1010h)  
00007FF7B39C1028  mov         eax,dword ptr [rdi+18h]  
00007FF7B39C102B  add         dword ptr [rdi],eax  
}

We can see that the lambda has once again been inlined.

lambdas in virtual functions

Next, we define the VIRTUAL macro but not the NOINLINE macro, and take a look at the disassembly:

#ifdef VIRTUAL
	a.f();
00007FF6C5EB15D4  test        bl,1  
00007FF6C5EB15D7  jne         main+18Ch (07FF6C5EB15ECh)  
00007FF6C5EB15D9  lea         rdx,[string "hi" (07FF6C5EB3358h)]  
00007FF6C5EB15E0  mov         rcx,qword ptr [__imp_std::cout (07FF6C5EB3088h)]  
00007FF6C5EB15E7  call        std::operator<<<std::char_traits<char> > (07FF6C5EB1060h)  
00007FF6C5EB15EC  inc         ebx  
00007FF6C5EB15EE  cmp         ebx,0Ah  
00007FF6C5EB15F1  jl          main+174h (07FF6C5EB15D4h)  
00007FF6C5EB15F3  mov         eax,dword ptr [rbp-18h]  
00007FF6C5EB15F6  add         dword ptr [rbp-30h],eax  
00007FF6C5EB15F9  lea         rdx,[string "Base" (07FF6C5EB3350h)]  
00007FF6C5EB1600  mov         rcx,qword ptr [__imp_std::cout (07FF6C5EB3088h)]  
00007FF6C5EB1607  call        std::operator<<<std::char_traits<char> > (07FF6C5EB1060h)  
	Base* t = &a;
	t->f();
00007FF6C5EB160C  lea         rcx,[rbp-38h]  
00007FF6C5EB1610  mov         rax,qword ptr [rbp-38h]  
00007FF6C5EB1614  call        qword ptr [rax]  

We can see that the direct call a.f() is inlined, but the call through the base-class pointer, t->f(), cannot be inlined and is made indirectly through the virtual table. Let's step into the function and see whether the lambda is still inlined.

void A::f()
{
    
    
00007FF6C5EB1000  mov         qword ptr [rsp+8],rbx  
00007FF6C5EB1005  push        rdi  
00007FF6C5EB1006  sub         rsp,20h  
00007FF6C5EB100A  mov         rdi,rcx  
	int s_t = 1;
	auto t = [&s_t, this]() {
    
    
		for (int i = 0; i < 10; ++i)
		{
    
    
			if (i % 2 == 0)
				std::cout << "hi";
		}
		parm1 += parm2.size();
#ifdef VIRTUAL
		Base::f();
#else
		//f();
#endif 
	};
00007FF6C5EB100D  xor         ebx,ebx  
00007FF6C5EB100F  nop  
	t();
00007FF6C5EB1010  test        bl,1  
00007FF6C5EB1013  jne         A::f+28h (07FF6C5EB1028h)  
00007FF6C5EB1015  mov         rcx,qword ptr [__imp_std::cout (07FF6C5EB3088h)]  
00007FF6C5EB101C  lea         rdx,[string "hi" (07FF6C5EB3358h)]  
00007FF6C5EB1023  call        std::operator<<<std::char_traits<char> > (07FF6C5EB1060h)  
00007FF6C5EB1028  inc         ebx  
00007FF6C5EB102A  cmp         ebx,0Ah  
00007FF6C5EB102D  jl          A::f+10h (07FF6C5EB1010h)  
00007FF6C5EB102F  mov         eax,dword ptr [rdi+20h]  
00007FF6C5EB1032  lea         rdx,[string "Base" (07FF6C5EB3350h)]  
00007FF6C5EB1039  add         dword ptr [rdi+8],eax  
00007FF6C5EB103C  mov         rcx,qword ptr [__imp_std::cout (07FF6C5EB3088h)]  
}

We can see that not only is the lambda inlined, but the call to the parent class function Base::f() is inlined as well.

Summary

When polymorphism is not involved, the compiler can determine at compile time exactly which function is being called, so that function may be inlined. Lambdas are no exception.

So in general, with optimization enabled, the overhead of using a lambda is close to zero, because the compiler inlines it.

Guess you like

Origin blog.csdn.net/ninesnow_c/article/details/121947046