일반적으로 (A)와 같이 작성하는 것이 좋지만, (B)처럼 작성했을 때 성능 차이는 어떨지 궁금해졌습니다. next를 5번 연속으로 따라가는 포인터 역참조가 foo(), bar(), baz() 호출마다 각각 이루어지는지, 아니면 한 번만 이루어지는지…

(A)
next = node->next->next->next->next->next;
next->foo();
next->bar();
next->baz();

(B)
node->next->next->next->next->next->foo();
node->next->next->next->next->next->bar();
node->next->next->next->next->next->baz();

다음과 같은 코드를 컴파일한 뒤 disassemble해 보았습니다. 컴파일러는 “g++ (Debian 8.2.0-20) 8.2.0”입니다.

#include <iostream>

using namespace std;

struct Node {  // Singly linked list node used as the test subject.
  Node* next;  // Following node; makeNodeList() leaves nullptr in the last node.
  int n;  // Counter modified by foo()/bar()/baz().

  void foo();  // Adds 1 to n.
  void bar();  // Adds 2 to n.
  void baz();  // Adds 3 to n.
};

void Node::foo() { n += 1; }  // Increments this node's n by 1.
void Node::bar() { n += 2; }  // Increments this node's n by 2.
void Node::baz() { n += 3; }  // Increments this node's n by 3.

Node* makeNodeList(int c) {  // Builds a list of c nodes, each with n = 0, and returns its head.
  if (c <= 0) return nullptr;  // Empty list; `<=` also stops a negative c from recursing without bound.
  Node* node = new Node;  // Raw allocation: ownership of every node passes to the caller.
  node->next = makeNodeList(c - 1);  // Build the remaining c - 1 nodes and link them behind this one.
  node->n = 0;
  return node;
}

int main() {  // Uses form (B): every statement below repeats the whole next-chain from the head.
  Node* node = makeNodeList(10);  // 10 nodes, so following next five times is within the list.
  node->next->next->next->next->next->foo();  // Target is the node five links past the head.
  node->next->next->next->next->next->bar();
  node->next->next->next->next->next->baz();
  cout << node->next->next->next->next->next->n << endl;  // Prints 6 (0 + 1 + 2 + 3).
  return 0;  // The nodes allocated by makeNodeList() are never deleted in this program.
}

main 함수 disassembly는 다음과 같습니다. foo(), bar(), baz() 호출과 출력문마다 next-> 연산(메모리 로드) 5번이 매번 반복되는 것을 확인할 수 있습니다.

27 [1]	  Node* node = makeNodeList(10);
          53                    push   rbx
<+    1>        bf 0a 00 00 00        mov    edi,0xa
<+    6>        e8 c5 01 00 00        call   0x565331d1e260 <makeNodeList(int)>
<+   11>        48 89 c3              mov    rbx,rax
28 [1]	  node->next->next->next->next->next->foo();
<+   14>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   17>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   20>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   23>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   26>        48 8b 38              mov    rdi,QWORD PTR [rax]
<+   29>        e8 7e 01 00 00        call   0x565331d1e230 <Node::foo()>
29 [1]	  node->next->next->next->next->next->bar();
<+   34>        48 8b 03              mov    rax,QWORD PTR [rbx]
<+   37>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   40>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   43>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   46>        48 8b 38              mov    rdi,QWORD PTR [rax]
<+   49>        e8 7a 01 00 00        call   0x565331d1e240 <Node::bar()>
30 [1]	  node->next->next->next->next->next->baz();
<+   54>        48 8b 03              mov    rax,QWORD PTR [rbx]
<+   57>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   60>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   63>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   66>        48 8b 38              mov    rdi,QWORD PTR [rax]
<+   69>        e8 76 01 00 00        call   0x565331d1e250 <Node::baz()>
31 [1]	  cout << node->next->next->next->next->next->n << endl;
<+   74>        48 8b 03              mov    rax,QWORD PTR [rbx]
<+   77>        48 8b 3d fc 2e 00 00  mov    rdi,QWORD PTR [rip+0x2efc]        # 0x565331d20fe0
<+   84>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   87>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   90>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   93>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   96>        8b 70 08              mov    esi,DWORD PTR [rax+0x8]
<+   99>        e8 48 ff ff ff        call   0x565331d1e040 <_ZNSolsEi@plt>
108 [1]	      operator<<(__ostream_type& (*__pf)(__ostream_type&))
<+  104>        48 89 c7              mov    rdi,rax
<+  107>        e8 60 ff ff ff        call   0x565331d1e060 <_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_@plt>
32 [1]	  return 0;
<+  112>        31 c0                 xor    eax,eax
<+  114>        5b                    pop    rbx
<+  115>        c3                    ret

상기 C++ 코드에서 foo(), bar(), baz() 함수를 inline으로 선언하는 경우 main 함수 disassembly는 다음과 같습니다. n이라는 변수에 1, 2, 3을 따로 더하는 대신 한꺼번에 6을 더하도록(lea esi,[rdx+0x6]) 최적화해 버리고, next-> 5번 연산도 한 번만 수행하네요.

27 [1]	  Node* node = makeNodeList(10);
          48 83 ec 08           sub    rsp,0x8
<+    4>        bf 0a 00 00 00        mov    edi,0xa
<+    9>        e8 62 01 00 00        call   0x562937aaa200 <makeNodeList(int)>
28 [1]	  node->next->next->next->next->next->foo();
<+   14>        48 8b 3d 3b 2f 00 00  mov    rdi,QWORD PTR [rip+0x2f3b]        # 0x562937aacfe0
<+   21>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   24>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   27>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   30>        48 8b 00              mov    rax,QWORD PTR [rax]
<+   33>        48 8b 00              mov    rax,QWORD PTR [rax]
30 [1]	  node->next->next->next->next->next->baz();
<+   36>        8b 50 08              mov    edx,DWORD PTR [rax+0x8]
<+   39>        8d 72 06              lea    esi,[rdx+0x6]
<+   42>        89 70 08              mov    DWORD PTR [rax+0x8],esi
31 [1]	  cout << node->next->next->next->next->next->n << endl;
<+   45>        e8 7e ff ff ff        call   0x562937aaa040 <_ZNSolsEi@plt>
108 [1]	      operator<<(__ostream_type& (*__pf)(__ostream_type&))
<+   50>        48 89 c7              mov    rdi,rax
<+   53>        e8 96 ff ff ff        call   0x562937aaa060 <_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_@plt>
32 [1]	  return 0;
<+   58>        31 c0                 xor    eax,eax
<+   60>        48 83 c4 08           add    rsp,0x8
<+   64>        c3                    ret

Node 구조체의 n 변수에 volatile을 붙여 봅니다. foo(), bar(), baz()의 덧셈은 인라인된 상태로 각각 순서대로 수행되지만 next-> 5번 연산은 한 번만 이루어지는 것을 볼 수 있습니다.

volatile int n;

disassembly는 다음과 같습니다.

        27 [1]	  Node* node = makeNodeList(10);
                  48 83 ec 08           sub    rsp,0x8
  <+    4>        bf 0a 00 00 00        mov    edi,0xa
  <+    9>        e8 72 01 00 00        call   0x5600ab2c9210 <makeNodeList(int)>
        28 [1]	  node->next->next->next->next->next->foo();
  <+   14>        48 8b 3d 3b 2f 00 00  mov    rdi,QWORD PTR [rip+0x2f3b]        # 0x5600ab2cbfe0
  <+   21>        48 8b 00              mov    rax,QWORD PTR [rax]
  <+   24>        48 8b 00              mov    rax,QWORD PTR [rax]
  <+   27>        48 8b 00              mov    rax,QWORD PTR [rax]
  <+   30>        48 8b 00              mov    rax,QWORD PTR [rax]
  <+   33>        48 8b 00              mov    rax,QWORD PTR [rax]
        14 [1]	void Node::foo() { n += 1; }
  <+   36>        8b 50 08              mov    edx,DWORD PTR [rax+0x8]
  <+   39>        83 c2 01              add    edx,0x1
  <+   42>        89 50 08              mov    DWORD PTR [rax+0x8],edx
        29 [1]	  node->next->next->next->next->next->bar();
  <+   45>        8b 50 08              mov    edx,DWORD PTR [rax+0x8]
  <+   48>        83 c2 02              add    edx,0x2
  <+   51>        89 50 08              mov    DWORD PTR [rax+0x8],edx
        30 [1]	  node->next->next->next->next->next->baz();
  <+   54>        8b 50 08              mov    edx,DWORD PTR [rax+0x8]
  <+   57>        83 c2 03              add    edx,0x3
  <+   60>        89 50 08              mov    DWORD PTR [rax+0x8],edx
        31 [1]	  cout << node->next->next->next->next->next->n << endl;
  <+   63>        8b 70 08              mov    esi,DWORD PTR [rax+0x8]
  <+   66>        e8 69 ff ff ff        call   0x5600ab2c9040 <_ZNSolsEi@plt>
        108 [1]	      operator<<(__ostream_type& (*__pf)(__ostream_type&))
  <+   71>        48 89 c7              mov    rdi,rax
  <+   74>        e8 81 ff ff ff        call   0x5600ab2c9060 <_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_@plt>
        32 [1]	  return 0;
  <+   79>        31 c0                 xor    eax,eax
  <+   81>        48 83 c4 08           add    rsp,0x8
  <+   85>        c3                    ret